import pandas as pd
from pathlib import Path

from wb_cleaning.dir_manager import get_data_dir


def load_country_groups_map():
    # Each column of the `groups_iso3c` sheet is a country group; the
    # non-null index entries under a column are that group's ISO3 codes.
    return pd.read_excel(
        get_data_dir("whitelists", "countries", "codelist.xlsx"),
        sheet_name="groups_iso3c",
        header=1,
        index_col=1).drop("country.name.en", axis=1).apply(
            lambda col_ser: col_ser.dropna().index.dropna().tolist(),
            axis=0).to_dict()


def get_standardized_regions(iso_code="iso3c"):
    assert iso_code in ["iso2c", "iso3c", "full"]

    codelist_path = Path(
        get_data_dir("whitelists", "countries", "codelist.xlsx"))

    standardized_regions_path = codelist_path.parent / "standardized_regions.xlsx"

    if not standardized_regions_path.exists():

        codelist = pd.read_excel(codelist_path)
        iso_region = codelist[["country.name.en", "iso2c", "iso3c", "region"]]
        standardized_regions = iso_region.dropna(
            subset=["iso2c", "region"]).set_index("iso2c")

        standardized_regions.to_excel(standardized_regions_path)
    else:
        # index_col=0 restores the iso2c index so both branches
        # return the same shape.
        standardized_regions = pd.read_excel(
            standardized_regions_path, index_col=0)

    if iso_code != "full":
        standardized_regions = standardized_regions.reset_index().set_index(
            iso_code)["region"].to_dict()

    return standardized_regions
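
A usage sketch for the two loaders above (the group codes and region values shown are illustrative, not taken from the workbook):

country_groups = load_country_groups_map()
# e.g. {"EMU": ["AUT", "BEL", ...], ...}  group code -> member ISO3 codes

regions = get_standardized_regions(iso_code="iso3c")
# e.g. {"PHL": "East Asia & Pacific", ...}; iso_code="full" returns the
# underlying DataFrame instead of a dict.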
Example #3

import os

import redis
from joblib import Memory

from wb_cleaning import dir_manager
# NOTE: the import path for `language` is an assumption; only the name
# `language.Language` appears in the snippet below.
from wb_cleaning.translate import language

USE_JOBLIB_MEMORY = False

if not USE_JOBLIB_MEMORY:  # assumed guard; the source snippet is truncated here
    try:
        from wb_cleaning.ops.cache_utils import redis_cacher
        cache_decorator = redis_cacher

    except redis.ConnectionError as error:
        args = error.args
        print(args[0])
        print("Redis not available, falling back to joblib cache...")
        USE_JOBLIB_MEMORY = True

if USE_JOBLIB_MEMORY:
    RESPELLER_CACHE_LOCATION = "/dev/shm/respeller-cachedir"

    try:
        respeller_cache = Memory(RESPELLER_CACHE_LOCATION, verbose=0)
    except PermissionError:
        RESPELLER_CACHE_LOCATION = dir_manager.get_data_dir(
            'shm', 'respeller-cachedir')

        if not os.path.isdir(RESPELLER_CACHE_LOCATION):
            os.makedirs(RESPELLER_CACHE_LOCATION)

        respeller_cache = Memory(RESPELLER_CACHE_LOCATION, verbose=0)

    cache_decorator = respeller_cache.cache

# # Identity decorator: returns the function unchanged (no caching).
# cache_decorator = lambda f: f

# en_dict = language.Language().get_en_dict()
en_lang = language.Language()
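
Downstream code applies `cache_decorator` like any memoizing decorator. A minimal usage sketch (`normalize_token` is a hypothetical function, not part of the package):

@cache_decorator
def normalize_token(word):
    # Stand-in for an expensive computation worth caching.
    return word.lower()

normalize_token("Wrld")  # computed and cached
normalize_token("Wrld")  # served from Redis or the joblib cache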
Example #4
    def __init__(self, tag="en_US", pwl=dir_manager.get_data_dir(
            "whitelists", "whitelists", "doc-freq-wiki-wordlist.txt")):
        self.tag = tag
        self.pwl = pwl
        self.init_en_dict()
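
The `tag`/`pwl` pair mirrors pyenchant's personal-word-list API, so the enclosing class plausibly looks like the sketch below; the class name, the `init_en_dict` body, and the `check` helper are assumptions, not the package's actual code:

import enchant

from wb_cleaning import dir_manager


class Language:
    def __init__(self, tag="en_US", pwl=dir_manager.get_data_dir(
            "whitelists", "whitelists", "doc-freq-wiki-wordlist.txt")):
        self.tag = tag
        self.pwl = pwl
        self.init_en_dict()

    def init_en_dict(self):
        # DictWithPWL combines the base "en_US" dictionary with the
        # custom word list at `pwl`.
        self.en_dict = enchant.DictWithPWL(self.tag, self.pwl)

    def check(self, word):
        return self.en_dict.check(word)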
Example #5
# -*- coding: utf-8 -*-
import os
from wb_cleaning.dir_manager import get_data_dir

WHITELIST_PATH = get_data_dir('whitelists', 'extraction')


def get_country_csv():
    return os.path.join(WHITELIST_PATH, 'whitelist_countries_multilingual.csv')


def get_wb_presidents_csv():
    return os.path.join(WHITELIST_PATH, 'wb_presidents.csv')
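
These helpers only build paths; loading is left to the caller. A minimal sketch (column names depend on the CSV contents and are not shown):

import pandas as pd

countries = pd.read_csv(get_country_csv())
presidents = pd.read_csv(get_wb_presidents_csv())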
Example #6
import glob
import functools
from pathlib import Path

from flashtext import KeywordProcessor

from wb_cleaning import dir_manager
# export DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES=210
# export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=60
# export DASK_DISTRIBUTED__COMM__RETRY__COUNT=20

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

keyword_processor = KeywordProcessor()
keyword_processor.set_non_word_boundaries(keyword_processor.non_word_boundaries
                                          | ACCENTED_CHARS)

with open(dir_manager.get_data_dir('whitelists', 'whitelists',
                                   'phrases.txt')) as phrase_file:
    # Build the {clean_name: [keyword, ...]} mapping that flashtext
    # expects: each underscored phrase maps to its spaced variant.
    phrases_map = {
        line.strip(): [line.strip().replace('_', ' ')]
        for line in phrase_file if line.strip()
    }
    keyword_processor.add_keywords_from_dict(phrases_map)


@functools.lru_cache(maxsize=None)
def cached_load_file(fname: Path, split: bool = True):
    # `load_file` is defined elsewhere in the source module; this
    # wrapper memoizes its results per (fname, split) pair.
    return load_file(fname=fname, split=split)


def replace_phrases(txt):
    return keyword_processor.replace_keywords(txt)
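
Since `add_keywords_from_dict` maps each underscored name to its spaced variant, `replace_keywords` rewrites spaced phrases back to the underscored form. Assuming `phrases.txt` contains a line `climate_change`:

replace_phrases("impacts of climate change on migration")
# -> "impacts of climate_change on migration"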
Example #7

# NOTE: the top of this snippet is truncated in the source; `pd`,
# `get_data_dir`, `inflect_engine`, `jdc_tags_processor`, and `Counter`
# come from the missing portion.
                    dest=dest,
                    remove_accents=False)).map(
                        lambda x: x +
                        [inflect_engine.plural(i) for i in x if "_" not in i])

        for dest in translate_to:
            tags_mapping = tags_mapping + lang_map[dest]

    # Clean up the keywords to remove duplicates.
    tags_mapping = tags_mapping.map(
        lambda x: sorted(set(filter(lambda i: i, x))))

    return tags_mapping


tags_sheet = pd.read_excel(get_data_dir("whitelists", "jdc",
                                        "List_filtering_keywords.xlsx"),
                           header=None,
                           index_col=0).rename(columns={1: "tag_keyword"})

tags_mapping = get_keywords_mapping(tags_sheet=tags_sheet)
if "Kakuma (Kenya)" in tags_mapping:
    tags_mapping.pop("Kakuma (Kenya)")

jdc_tags_processor.add_keywords_from_dict(tags_mapping)


def get_jdc_tag_counts(txt):
    data = []

    for tag, value in Counter(
            jdc_tags_processor.extract_keywords(txt)).most_common():
        # Assumed completion: the loop body is cut off in the source.
        data.append({"tag": tag, "count": value})

    return data
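
A usage sketch matching the assumed completion above (the tag names depend on the keyword sheet and are illustrative):

get_jdc_tag_counts("refugees and asylum seekers in protracted displacement")
# -> e.g. [{"tag": "refugee", "count": 2}, ...]
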
def load_country_groups_names():
    return pd.read_excel(get_data_dir("whitelists", "countries",
                                      "codelist.xlsx"),
                         sheet_name="groups_names",
                         header=0,
                         index_col=0).to_dict()["Full name"]


def load_iso3166_3_country_info():
    return pd.read_json(get_data_dir("maps",
                                     "iso3166-3-country-info.json")).to_dict()
Example #10
import json
from functools import lru_cache
from pathlib import Path

from unidecode import unidecode

import googletrans
from googletrans import client as gt
import requests

from wb_cleaning import dir_manager
assert googletrans.__version__ == "3.1.0-alpha"

_trans = gt.Translator()

language_codes_path = Path(
    dir_manager.get_data_dir("whitelists", "language", "language_codes.json"))

if not language_codes_path.exists():
    result = requests.get(
        "https://translate.shell.com/api/Translate/GetLanguages")
    result = result.json()

    language_codes = [{"name": o["item1"], "code": o["item2"]} for o in result]

    with open(language_codes_path, "w") as open_file:
        json.dump(language_codes, open_file)

else:
    with open(language_codes_path) as open_file:
        language_codes = json.load(open_file)
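
A usage sketch for the translator set up above; in googletrans 3.x, `translate` returns an object exposing `.src` and `.text` (requires network access, and the sample strings are illustrative):

result = _trans.translate("Bonjour le monde", dest="en")
print(result.src, "->", result.text)  # e.g. "fr -> Hello world"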