Example #1
0
from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor
import inflect
from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.translate import translation

# Accented Latin characters that should count as parts of words, so that
# flashtext does not treat them as keyword boundaries.
ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

# Keyword processor for JDC tag extraction; widen its non-word-boundary
# set with the accented characters (set union is order-independent).
jdc_tags_processor = KeywordProcessor()
jdc_tags_processor.set_non_word_boundaries(
    ACCENTED_CHARS | jdc_tags_processor.non_word_boundaries)

# inflect engine — presumably used for singular/plural handling of tag
# keywords further down; confirm against the rest of the module.
inflect_engine = inflect.engine()

# input schema
# -> tag_value
# -> tag_prototypes

# Definition of data input:
# The input to the tag extractor is an excel or csv file.
# The first column of the data must be the intended tag keyword.
# To remove ambiguity, a header with name "tag_keyword" must be present.
# Additionally, all non-empty values in the columns to the right of the tag keyword are considered as prototypes.
# Occurences of these prototypes will be mapped to the tag keyword.


def get_keywords_mapping(tags_sheet, src="en", translate_to=None):
    # translate_to = ["fr", "es"]
Example #2
0
'''
import os
from pathlib import Path
import glob
import functools
from flashtext import KeywordProcessor
from wb_cleaning import dir_manager
# export DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES=210
# export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=60
# export DASK_DISTRIBUTED__COMM__RETRY__COUNT=20

# Accented Latin characters that should count as parts of words, so that
# flashtext does not treat them as keyword boundaries.
ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

keyword_processor = KeywordProcessor()
keyword_processor.set_non_word_boundaries(
    ACCENTED_CHARS | keyword_processor.non_word_boundaries)

# Load the phrase whitelist. Each non-empty line is an underscore-joined
# phrase; in flashtext's {clean_name: [keywords]} format, occurrences of
# the spaced form are mapped back to the underscored form.
with open(dir_manager.get_data_dir('whitelists', 'whitelists',
                                   'phrases.txt')) as phrase_file:
    phrases_map = {}
    for raw_line in phrase_file:
        phrase = raw_line.strip()
        if phrase:
            phrases_map[phrase] = [phrase.replace('_', ' ')]
    keyword_processor.add_keywords_from_dict(phrases_map)


@functools.lru_cache(maxsize=None)
def cached_load_file(fname: Path, split: bool = True):
    """Memoized wrapper around ``load_file``.

    Results are cached per ``(fname, split)`` pair for the lifetime of the
    process (the cache is unbounded — ``maxsize=None``). ``fname`` must be
    hashable, which ``Path`` is.

    NOTE(review): delegates entirely to ``load_file``, which is defined
    elsewhere in this module and not visible here — presumably it reads the
    file and optionally splits its contents; confirm against its definition.
    """
    return load_file(fname=fname, split=split)
Example #3
0
from pathlib import Path
import re
from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor

from wb_cleaning.extraction.whitelist import mappings
from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.types.metadata_enums import RegionTypes

# Accented Latin characters that should count as parts of words, so that
# flashtext does not treat them as keyword boundaries.
ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

# Two independent processors — one for country codes, one for country
# groups — each configured identically to treat accented characters as
# word characters.
country_code_processor = KeywordProcessor()
country_group_processor = KeywordProcessor()
for _processor in (country_code_processor, country_group_processor):
    _processor.set_non_word_boundaries(
        _processor.non_word_boundaries | ACCENTED_CHARS)


def get_standardized_regions(iso_code="iso3c"):
    assert iso_code in ["iso2c", "iso3c", "full"]

    codelist_path = Path(
        get_data_dir("whitelists", "countries", "codelist.xlsx"))

    standardized_regions_path = codelist_path.parent / "standardized_regions.xlsx"

    if not standardized_regions_path.exists():