from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor
import inflect

from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.translate import translation

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

jdc_tags_processor = KeywordProcessor()
jdc_tags_processor.set_non_word_boundaries(
    jdc_tags_processor.non_word_boundaries | ACCENTED_CHARS)

inflect_engine = inflect.engine()

# Input schema
# -> tag_value
# -> tag_prototypes

# Definition of data input:
# The input to the tag extractor is an Excel or CSV file.
# The first column of the data must be the intended tag keyword.
# To remove ambiguity, a header named "tag_keyword" must be present.
# Additionally, all non-empty values in the columns to the right of the
# tag keyword are considered prototypes. Occurrences of these prototypes
# will be mapped to the tag keyword.


def get_keywords_mapping(tags_sheet, src="en", translate_to=None):
    # translate_to = ["fr", "es"]
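    # NOTE: the original body is truncated in this excerpt. The sketch
    # below reconstructs it from the schema comments above; the exact
    # logic in the source may differ.
    if str(tags_sheet).endswith(".csv"):
        tags_df = pd.read_csv(tags_sheet)
    else:
        tags_df = pd.read_excel(tags_sheet)

    assert tags_df.columns[0] == "tag_keyword", (
        "The first column must be named `tag_keyword`.")

    keywords_mapping = {}
    for _, row in tags_df.iterrows():
        tag_keyword = row["tag_keyword"]
        # All non-empty values to the right of the tag keyword are
        # prototypes mapped back to the tag keyword.
        prototypes = [
            str(p).strip() for p in row.iloc[1:].dropna() if str(p).strip()]

        if translate_to:
            # The source presumably expands the prototypes with
            # translations (e.g., src="en" -> ["fr", "es"]) via
            # `wb_cleaning.translate.translation`; that API is not
            # visible in this excerpt, so the step is omitted here.
            pass

        keywords_mapping[tag_keyword] = prototypes

    return keywords_mapping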
# ---------------------------------------------------------------------
# (separate module)
# ---------------------------------------------------------------------
import os
from pathlib import Path
import glob
import functools

from flashtext import KeywordProcessor

from wb_cleaning import dir_manager

# export DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES=210
# export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=60
# export DASK_DISTRIBUTED__COMM__RETRY__COUNT=20

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

keyword_processor = KeywordProcessor()
keyword_processor.set_non_word_boundaries(
    keyword_processor.non_word_boundaries | ACCENTED_CHARS)

with open(dir_manager.get_data_dir('whitelists', 'whitelists', 'phrases.txt')) as phrase_file:
    # Use the flashtext dict format: {clean_name: [phrases to detect]}.
    # Each underscored whitelist entry is detected via its spaced form.
    phrases_map = {
        l.strip(): [l.strip().replace('_', ' ')]
        for l in phrase_file if l.strip()
    }

keyword_processor.add_keywords_from_dict(phrases_map)


@functools.lru_cache(maxsize=None)
def cached_load_file(fname: Path, split: bool = True):
    # Memoized wrapper around `load_file` (not shown in this excerpt),
    # so repeated reads of the same file hit the cache.
    return load_file(fname=fname, split=split)
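if __name__ == "__main__":
    # Illustrative usage (a sketch, not part of the original module):
    # flashtext replaces each detected phrase with its "clean name", so
    # the spaced form of a whitelist entry is rewritten back into the
    # underscored form. `climate_change` is a hypothetical entry, not
    # necessarily present in phrases.txt.
    demo_processor = KeywordProcessor()
    demo_processor.add_keywords_from_dict(
        {"climate_change": ["climate change"]})
    print(demo_processor.replace_keywords(
        "Impact of climate change on migration"))
    # -> Impact of climate_change on migration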
# ---------------------------------------------------------------------
# (separate module)
# ---------------------------------------------------------------------
from pathlib import Path
import re
from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor

from wb_cleaning.extraction.whitelist import mappings
from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.types.metadata_enums import RegionTypes

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

country_code_processor = KeywordProcessor()
country_code_processor.set_non_word_boundaries(
    country_code_processor.non_word_boundaries | ACCENTED_CHARS)

country_group_processor = KeywordProcessor()
country_group_processor.set_non_word_boundaries(
    country_group_processor.non_word_boundaries | ACCENTED_CHARS)


def get_standardized_regions(iso_code="iso3c"):
    assert iso_code in ["iso2c", "iso3c", "full"]

    codelist_path = Path(
        get_data_dir("whitelists", "countries", "codelist.xlsx"))
    standardized_regions_path = codelist_path.parent / "standardized_regions.xlsx"

    if not standardized_regions_path.exists():
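        # NOTE: the excerpt is truncated at this `if`. The body below is
        # a hypothetical sketch: it derives standardized_regions.xlsx
        # from codelist.xlsx once, then reuses the cached sheet. The
        # column names used (`country.name.en`, `iso2c`, `iso3c`,
        # `region`) are assumptions, not confirmed by the source.
        codelist = pd.read_excel(codelist_path)
        standardized_regions = codelist[
            ["country.name.en", "iso2c", "iso3c", "region"]
        ].dropna(subset=["region"])
        standardized_regions.to_excel(standardized_regions_path, index=False)

    standardized_regions = pd.read_excel(standardized_regions_path)

    if iso_code == "full":
        return standardized_regions

    # Map the chosen ISO code column to its standardized region name.
    return dict(
        zip(standardized_regions[iso_code], standardized_regions["region"]))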