def load_country_groups_map():
    return pd.read_excel(
        get_data_dir("whitelists", "countries", "codelist.xlsx"),
        sheet_name="groups_iso3c", header=1,
        index_col=1).drop("country.name.en", axis=1).apply(
            lambda col_ser: col_ser.dropna().index.dropna().tolist(),
            axis=0).to_dict()
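
# Illustrative sketch (not part of the original module): assuming the
# "groups_iso3c" sheet has country-group codes as columns and ISO3 codes in the
# index, the returned value maps each group code to its list of member ISO3
# codes. The group code "EAP" below is an assumption for demonstration.
def _example_country_groups_usage():
    country_groups = load_country_groups_map()
    # Each key is a country-group code; each value is a list of ISO3 codes.
    return country_groups.get("EAP", [])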
def get_standardized_regions(iso_code="iso3c"):
    assert iso_code in ["iso2c", "iso3c", "full"]

    codelist_path = Path(
        get_data_dir("whitelists", "countries", "codelist.xlsx"))
    standardized_regions_path = codelist_path.parent / "standardized_regions.xlsx"

    if not standardized_regions_path.exists():
        codelist = pd.read_excel(codelist_path)
        iso_region = codelist[["country.name.en", "iso2c", "iso3c", "region"]]
        standardized_regions = iso_region.dropna(
            subset=["iso2c", "region"]).set_index("iso2c")
        standardized_regions.to_excel(standardized_regions_path)
    else:
        standardized_regions = pd.read_excel(standardized_regions_path)

    if iso_code != "full":
        standardized_regions = standardized_regions.reset_index().set_index(
            iso_code)["region"].to_dict()

    return standardized_regions
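
# Illustrative sketch (not part of the original module): for iso_code="iso3c"
# the function returns a plain dict from ISO3 country codes to region names.
# The code "PHL" and the region label are assumptions for demonstration; the
# actual values depend on the contents of codelist.xlsx.
def _example_regions_usage():
    regions = get_standardized_regions(iso_code="iso3c")
    # e.g. regions.get("PHL") might return a label such as "East Asia & Pacific".
    return regions.get("PHL")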
try:
    from wb_cleaning.ops.cache_utils import redis_cacher
    cache_decorator = redis_cacher
except redis.ConnectionError as error:
    args = error.args
    print(args[0])
    print("Redis not available, falling back to joblib cache...")
    USE_JOBLIB_MEMORY = True

if USE_JOBLIB_MEMORY:
    RESPELLER_CACHE_LOCATION = "/dev/shm/respeller-cachedir"

    try:
        respeller_cache = Memory(RESPELLER_CACHE_LOCATION, verbose=0)
    except PermissionError:
        RESPELLER_CACHE_LOCATION = dir_manager.get_data_dir(
            'shm', 'respeller-cachedir')

        if not os.path.isdir(RESPELLER_CACHE_LOCATION):
            os.makedirs(RESPELLER_CACHE_LOCATION)

        respeller_cache = Memory(RESPELLER_CACHE_LOCATION, verbose=0)

    cache_decorator = respeller_cache.cache

# # Returns self without any form of caching.
# cache_decorator = lambda f: f

# # en_dict = language.Language().get_en_dict()
en_lang = language.Language()
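
# Illustrative sketch (not part of the original module): whichever backend is
# selected above (Redis or joblib Memory), cache_decorator is applied the same
# way. The function name and body below are assumptions for demonstration.
def _example_cached_usage():
    @cache_decorator
    def _expensive_identity(word):
        # Stand-in for an expensive respelling computation; repeated calls
        # with the same argument are served from the cache.
        return word

    return _expensive_identity("example")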
def __init__(self, tag="en_US", pwl=dir_manager.get_data_dir( "whitelists", "whitelists", "doc-freq-wiki-wordlist.txt") ): self.tag = tag self.pwl = pwl self.init_en_dict()
# -*- coding: utf-8 -*-
import os

from wb_cleaning.dir_manager import get_data_dir

WHITELIST_PATH = get_data_dir('whitelists', 'extraction')


def get_country_csv():
    return os.path.join(WHITELIST_PATH, 'whitelist_countries_multilingual.csv')


def get_wb_presidents_csv():
    return os.path.join(WHITELIST_PATH, 'wb_presidents.csv')
import glob
import functools
from pathlib import Path

from flashtext import KeywordProcessor

from wb_cleaning import dir_manager

# export DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES=210
# export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=60
# export DASK_DISTRIBUTED__COMM__RETRY__COUNT=20

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

keyword_processor = KeywordProcessor()
keyword_processor.set_non_word_boundaries(
    keyword_processor.non_word_boundaries | ACCENTED_CHARS)

with open(dir_manager.get_data_dir('whitelists', 'whitelists', 'phrases.txt')) as phrase_file:
    # Use flashtext format
    phrases_map = {
        l.strip(): [l.strip().replace('_', ' ')]
        for l in phrase_file if l.strip()
    }

keyword_processor.add_keywords_from_dict(phrases_map)


@functools.lru_cache(maxsize=None)
def cached_load_file(fname: Path, split: bool = True):
    return load_file(fname=fname, split=split)


def replace_phrases(txt):
    return keyword_processor.replace_keywords(txt)
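
# Illustrative sketch (not part of the original module): replace_phrases maps
# multi-word surface forms listed in phrases.txt back to their underscore-joined
# tokens. The phrase "world_bank" is an assumption for demonstration; actual
# behavior depends on the contents of phrases.txt.
def _example_replace_phrases():
    # If "world_bank" is a line in phrases.txt, the keyword processor replaces
    # the surface form "world bank" in the text with the token "world_bank".
    return replace_phrases("the world bank provides financing")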
            dest=dest, remove_accents=False)).map(
        lambda x: x + [inflect_engine.plural(i) for i in x if "_" not in i])

    for dest in translate_to:
        tags_mapping = tags_mapping + lang_map[dest]

    # Clean up the keywords to remove duplicates.
    tags_mapping = tags_mapping.map(
        lambda x: sorted(set(filter(lambda i: i, x))))

    return tags_mapping


tags_sheet = pd.read_excel(
    get_data_dir("whitelists", "jdc", "List_filtering_keywords.xlsx"),
    header=None, index_col=0).rename(columns={1: "tag_keyword"})

tags_mapping = get_keywords_mapping(tags_sheet=tags_sheet)

if "Kakuma (Kenya)" in tags_mapping:
    tags_mapping.pop("Kakuma (Kenya)")

jdc_tags_processor.add_keywords_from_dict(tags_mapping)


def get_jdc_tag_counts(txt):
    data = []

    for tag, value in Counter(
            jdc_tags_processor.extract_keywords(txt)).most_common():
def load_country_groups_names():
    return pd.read_excel(
        get_data_dir("whitelists", "countries", "codelist.xlsx"),
        sheet_name="groups_names", header=0,
        index_col=0).to_dict()["Full name"]
def load_iso3166_3_country_info():
    return pd.read_json(
        get_data_dir("maps", "iso3166-3-country-info.json")).to_dict()
import json
from functools import lru_cache
from pathlib import Path

from unidecode import unidecode

import googletrans
from googletrans import client as gt
import requests

from wb_cleaning import dir_manager

assert googletrans.__version__ == "3.1.0-alpha"

_trans = gt.Translator()

language_codes_path = Path(
    dir_manager.get_data_dir("whitelists", "language", "language_codes.json"))

if not language_codes_path.exists():
    result = requests.get(
        "https://translate.shell.com/api/Translate/GetLanguages")
    result = result.json()

    language_codes = [{"name": o["item1"], "code": o["item2"]} for o in result]

    with open(language_codes_path, "w") as open_file:
        json.dump(language_codes, open_file)
else:
    with open(language_codes_path) as open_file:
        language_codes = json.load(open_file)
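
# Illustrative sketch (not part of the original module): the shared Translator
# instance above can detect the source language and translate text to English.
# The sample sentence is an assumption for demonstration.
def _example_translate():
    result = _trans.translate("Bonjour le monde", dest="en")
    # result.src holds the detected source language code,
    # result.text holds the translated string.
    return result.src, result.text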