示例#1
0
def clean_english_articles_with_spanish_parallels(en_category_name,
                                                  es_category_name):
    """
    Iterates on english articles and for each english article,
    checks if there exists a spanish article which is the direct link from the english article.
    Removes those english articles..

    :param en_category_name: name of the english category to check
    :param es_category_name: name of the respective category in spanish

    """
    en_folder = get_category_folder('en', en_category_name)
    es_folder = get_category_folder('es', es_category_name)

    # collect original spanish urls from spanish category
    es_urls = set()
    for fname in os.listdir(es_folder):
        p = os.path.join(es_folder, fname)
        with open(p) as f:
            d = simplejson.loads(f.read())
            es_urls.add(d['original_url'])

    to_delete = []
    for fname in os.listdir(en_folder):
        p = os.path.join(en_folder, fname)
        with open(p) as f:
            d = simplejson.loads(f.read())
            if d['spanish_url'] in es_urls:
                to_delete.append(p)

    # delete them!
    for path in to_delete:
        print 'removing', path
        os.remove(path)
示例#2
0
def clean_english_articles_with_spanish_parallels(en_category_name, es_category_name):
    """
    Iterates on english articles and for each english article,
    checks if there exists a spanish article which is the direct link from the english article.
    Removes those english articles..

    :param en_category_name: name of the english category to check
    :param es_category_name: name of the respective category in spanish

    """
    print 'cleaning...'
    en_folder = get_category_folder('en', en_category_name)
    es_folder = get_category_folder('es', es_category_name)

    # collect original spanish urls from spanish category
    es_urls = set()
    for fname in os.listdir(es_folder):
        p = os.path.join(es_folder, fname)
        with open(p) as f:
            d = ujson.loads(f.read())
            es_urls.add(d['original_url'])

    to_delete = []
    for fname in os.listdir(en_folder):
        p = os.path.join(en_folder, fname)
        with open(p) as f:
            d = ujson.loads(f.read())
            if d['spanish_url'] in es_urls:
                to_delete.append(p)

    # delete them!
    for path in to_delete:
        print 'removing', path
        os.remove(path)
示例#3
0
def clean_untranslated_articels(language, category_name):
    """ Clean articles that couldn't be translated """

    category_path = get_category_folder(language, category_name)
    other_lang = 'es' if language == 'en' else 'en'
    # get original language files
    orig = set(filter(lambda x: os.path.splitext(x)[-1]=='.txt', os.listdir(category_path)))
    trans = set(filter(lambda x: os.path.splitext(x)[-1]=='.txt', os.listdir(os.path.join(category_path, other_lang))))

    untranslated = orig - trans
    # delete untranslated
    for fname in untranslated:
        path = os.path.join(category_path, fname)
        print 'deleting', path
        os.remove(path)
示例#4
0
def clean_untranslated_articels(language, category_name):
    """ Clean articles that couldn't be translated """

    category_path = get_category_folder(language, category_name)
    other_lang = 'es' if language == 'en' else 'en'
    # get original language files
    orig = set(
        filter(lambda x: os.path.splitext(x)[-1] == '.txt',
               os.listdir(category_path)))
    trans = set(
        filter(lambda x: os.path.splitext(x)[-1] == '.txt',
               os.listdir(os.path.join(category_path, other_lang))))

    untranslated = orig - trans
    # delete untranslated
    for fname in untranslated:
        path = os.path.join(category_path, fname)
        print 'deleting', path
        os.remove(path)