from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor
import inflect

from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.translate import translation

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

jdc_tags_processor = KeywordProcessor()
jdc_tags_processor.set_non_word_boundaries(
    jdc_tags_processor.non_word_boundaries | ACCENTED_CHARS)

inflect_engine = inflect.engine()

# Input schema
# -> tag_value
# -> tag_prototypes

# Definition of data input:
# The input to the tag extractor is an Excel or CSV file.
# The first column of the data must be the intended tag keyword.
# To remove ambiguity, a header named "tag_keyword" must be present.
# Additionally, all non-empty values in the columns to the right of the
# tag keyword are considered prototypes. Occurrences of these prototypes
# will be mapped to the tag keyword.


def get_keywords_mapping(tags_sheet, src="en", translate_to=None):
    # translate_to = ["fr", "es"]
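    # NOTE: the original body is truncated in this excerpt. The sketch
    # below reconstructs it from the schema comments above; the exact
    # logic in the source may differ.
    if str(tags_sheet).endswith(".csv"):
        tags_df = pd.read_csv(tags_sheet)
    else:
        tags_df = pd.read_excel(tags_sheet)

    assert tags_df.columns[0] == "tag_keyword", (
        "The first column must be named `tag_keyword`.")

    keywords_mapping = {}
    for _, row in tags_df.iterrows():
        tag_keyword = row["tag_keyword"]
        # All non-empty values to the right of the tag keyword are
        # prototypes mapped back to the tag keyword.
        prototypes = [
            str(p).strip() for p in row.iloc[1:].dropna() if str(p).strip()]

        if translate_to:
            # The source presumably expands the prototypes with
            # translations (e.g., src="en" -> ["fr", "es"]) via
            # `wb_cleaning.translate.translation`; that API is not
            # visible in this excerpt, so the step is omitted here.
            pass

        keywords_mapping[tag_keyword] = prototypes

    return keywords_mapping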
# ---------------------------------------------------------------------
# (separate module)
# ---------------------------------------------------------------------
import os
from pathlib import Path
import glob
import functools

from flashtext import KeywordProcessor

from wb_cleaning import dir_manager

# export DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES=210
# export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=60
# export DASK_DISTRIBUTED__COMM__RETRY__COUNT=20

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

keyword_processor = KeywordProcessor()
keyword_processor.set_non_word_boundaries(
    keyword_processor.non_word_boundaries | ACCENTED_CHARS)

with open(dir_manager.get_data_dir('whitelists', 'whitelists', 'phrases.txt')) as phrase_file:
    # Use the flashtext dict format: {clean_name: [phrases to detect]}.
    # Each underscored whitelist entry is detected via its spaced form.
    phrases_map = {
        l.strip(): [l.strip().replace('_', ' ')]
        for l in phrase_file if l.strip()
    }

keyword_processor.add_keywords_from_dict(phrases_map)


@functools.lru_cache(maxsize=None)
def cached_load_file(fname: Path, split: bool = True):
    # Memoized wrapper around `load_file` (not shown in this excerpt),
    # so repeated reads of the same file hit the cache.
    return load_file(fname=fname, split=split)
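if __name__ == "__main__":
    # Illustrative usage (a sketch, not part of the original module):
    # flashtext replaces each detected phrase with its "clean name", so
    # the spaced form of a whitelist entry is rewritten back into the
    # underscored form. `climate_change` is a hypothetical entry, not
    # necessarily present in phrases.txt.
    demo_processor = KeywordProcessor()
    demo_processor.add_keywords_from_dict(
        {"climate_change": ["climate change"]})
    print(demo_processor.replace_keywords(
        "Impact of climate change on migration"))
    # -> Impact of climate_change on migration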
# ---------------------------------------------------------------------
# (separate module)
# ---------------------------------------------------------------------
from pathlib import Path
import re
from collections import Counter

import pandas as pd
from flashtext import KeywordProcessor

from wb_cleaning.extraction.whitelist import mappings
from wb_cleaning.dir_manager import get_data_dir
from wb_cleaning.types.metadata_enums import RegionTypes

ACCENTED_CHARS = set(
    "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")

country_code_processor = KeywordProcessor()
country_code_processor.set_non_word_boundaries(
    country_code_processor.non_word_boundaries | ACCENTED_CHARS)

country_group_processor = KeywordProcessor()
country_group_processor.set_non_word_boundaries(
    country_group_processor.non_word_boundaries | ACCENTED_CHARS)


def get_standardized_regions(iso_code="iso3c"):
    assert iso_code in ["iso2c", "iso3c", "full"]

    codelist_path = Path(
        get_data_dir("whitelists", "countries", "codelist.xlsx"))
    standardized_regions_path = codelist_path.parent / "standardized_regions.xlsx"

    if not standardized_regions_path.exists():
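        # NOTE: the excerpt is truncated at this `if`. The body below is
        # a hypothetical sketch: it derives standardized_regions.xlsx
        # from codelist.xlsx once, then reuses the cached sheet. The
        # column names used (`country.name.en`, `iso2c`, `iso3c`,
        # `region`) are assumptions, not confirmed by the source.
        codelist = pd.read_excel(codelist_path)
        standardized_regions = codelist[
            ["country.name.en", "iso2c", "iso3c", "region"]
        ].dropna(subset=["region"])
        standardized_regions.to_excel(standardized_regions_path, index=False)

    standardized_regions = pd.read_excel(standardized_regions_path)

    if iso_code == "full":
        return standardized_regions

    # Map the chosen ISO code column to its standardized region name.
    return dict(
        zip(standardized_regions[iso_code], standardized_regions["region"]))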