Example #1
def processing(i, file_list):
    tk = RegexpTokenizer(r'\w\S+\w')

    # create English stop words list
    en_stop = get_stop_words('en')

    stopword_processor = KeywordProcessor()
    for w in en_stop:
        stopword_processor.add_keyword(w, ' ')

    with open('stopword_processor.pkl', 'wb') as f:
        pickle.dump(stopword_processor, f)

    p_stemmer = PorterStemmer()

    with codecs.open('whole_dialogs_stem_%d' % i, 'w', 'utf-8') as out:
        for fi in tqdm(file_list):
            with codecs.open(fi, 'r', 'utf-8') as f:
                sentences = [
                    stopword_processor.replace_keywords(
                        line.strip().split('\t')[-1].lower()) for line in f
                ]
                words = functools.reduce(lambda x, y: x + y,
                                         map(tk.tokenize, sentences))
                words = map(p_stemmer.stem, words)
                out.write(' '.join(words) + '\n')
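
Example #1 leans on flashtext for stopword removal: every stopword is registered as a keyword whose clean name is a single space, so one replace_keywords() pass blanks them all out before tokenizing and stemming. A minimal stand-alone sketch of that pattern, with a made-up stopword list and sentence:

from flashtext import KeywordProcessor

# Illustrative stopword list; the example above gets the real one from get_stop_words('en').
stopwords = ['the', 'is', 'a', 'of']

stopword_processor = KeywordProcessor()
for w in stopwords:
    # Map each stopword to a single space so replace_keywords() erases it
    # (matching is case-insensitive by default).
    stopword_processor.add_keyword(w, ' ')

print(stopword_processor.replace_keywords('The fox is a friend of the hound'))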
Example #2
class Tokenizer(object):
    def __init__(self):
        log.info("Tokenizer initialization")

        from .lookup_dic import _PhraseDictionary as __PD
        log.debug("Tokenizer: calls lookup_dic.read_phrases")
        self.lookup_dic = __PD()

        log.debug("Instanciate flashtext.KeyworkProcessor")
        self.__keyword_processor = KeywordProcessor()
        log.debug("Insert data into flashtext.KeyworkProcessor instance.")
        self.__keyword_processor.add_keywords_from_dict(
            self.lookup_dic.lookup_dic_CODE)
        log.info("Tokenizer initialization successful")

    def tokenize(self, text):
        log.debug(f"Tokenizer called on {text}")

        log.debug("Phase I: Replacing phrases.")
        text = self.__keyword_processor.replace_keywords(text)

        log.debug("Phase II: Split by space.")
        tokens_list = text.split()

        log.debug("Phase III: Replace back token id to its original form.")
        tokens_list = [
            self.lookup_dic.reverse_replace(token)
            if token in self.lookup_dic.lookup_dic_CODE else token
            for token in tokens_list
        ]

        return tokens_list

    def __call__(self, text):
        return self.tokenize(text)
Example #3
    def test_replace_keywords(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Replace keywords and check if they match the expected result for the test case.

        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_replacer = KeywordProcessor()
            # To handle issue like https://github.com/vi3k6i5/flashtext/issues/8
            # clean names are replaced with "_" in place of white space.
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    keyword_replacer.add_keyword(value, key.replace(" ", "_"))
            new_sentence = keyword_replacer.replace_keywords(test_case['sentence'])

            replaced_sentence = test_case['sentence']
            keyword_mapping = {}
            for val in test_case['keyword_dict']:
                for value in test_case['keyword_dict'][val]:
                    keyword_mapping[value] = val.replace(" ", "_")
            for key in sorted(keyword_mapping, key=len, reverse=True):
                lowercase = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(key)))
                replaced_sentence = lowercase.sub(keyword_mapping[key], replaced_sentence)

            self.assertEqual(new_sentence, replaced_sentence,
                             "new_sentence don't match the expected results for test case: {}".format(test_id))
Example #4
def remove_stopwords(text: str,
                     lang: str,
                     custom_stopwords: list = None,
                     ignored_stopwords: list = None) -> str:
    """
    Given ``text`` str, remove classic stopwords for a given language and
    custom stopwords given as a list. Words and groups of words from
    ignored_stopwords list are ignored during stopwords removal.

    Parameters
    ----------
    text : string
    lang : string
    custom_stopwords : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    string

    Raises
    -------
    ValueError
        if ``custom_stopwords`` and ``ignored_stopwords`` have common elements.
    """
    if custom_stopwords and ignored_stopwords:
        common_elements = set(custom_stopwords).intersection(
            set(ignored_stopwords))
        if common_elements:
            raise ValueError(
                "Found common words in custom_stopwords and ignored_stopwords: "
                f"{common_elements}. Please remove duplicated values.")
    stopwords = get_stopwords(lang)
    if ignored_stopwords:
        keyword_processor = KeywordProcessor()
        singletons_to_keep = [
            x for x in ignored_stopwords if len(x.split()) == 1
        ]
        for group_of_words in ignored_stopwords:
            keyword_processor.add_keyword(group_of_words,
                                          remove_whitespace(group_of_words))
        text = keyword_processor.replace_keywords(text)
    else:
        singletons_to_keep = []
    if custom_stopwords:
        stopwords += custom_stopwords
    if lang in ["fr", "en"]:
        lang_module = {"fr": "fr_spacy", "en": "en_spacy"}[lang]
        tokens = tokenize(text, lang_module)
    else:
        tokens = text.split()
    tokens = [
        t for t in tokens if (t not in stopwords or t in singletons_to_keep)
    ]
    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
    return ' '.join(tokens)
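
The ignored_stopwords handling above works by a small trick: each ignored word group is added as a keyword whose clean name is the same text with its whitespace removed (remove_whitespace), so the group survives tokenization as a single token and is split back later by ungroup_ignored_stopwords. A minimal sketch of just that trick, with an invented phrase and sentence:

from flashtext import KeywordProcessor

phrase = 'new york'
kp = KeywordProcessor()
kp.add_keyword(phrase, phrase.replace(' ', ''))  # 'new york' -> 'newyork'

protected = kp.replace_keywords('i love new york in the winter')
print(protected)          # 'i love newyork in the winter'
print(protected.split())  # the protected phrase is now a single token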
Example #5
 def test_empty_string(self):
     keyword_processor = KeywordProcessor()
     keyword_dict = {
         "java": "java_2e",
         "product management": "product manager"
     }
     self.assertEqual(keyword_processor.extract_keywords(""), [],
                      "new_sentence don't match the expected result")
     self.assertEqual(keyword_processor.replace_keywords(""), "",
                      "new_sentence don't match the expected result")
Example #6
 def test_file_format_two(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #7
def replace(text):
    kwp = KeywordProcessor()
    kwp.non_word_boundaries = set()
    kwp.add_keywords_from_dict(
        {" {} ".format(v): [k]
         for k, v in UNICODE_EMOJI.items()})

    clean_text = kwp.replace_keywords(text).strip()

    return clean_text
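
The key line in Example #7 is kwp.non_word_boundaries = set(): by default KeywordProcessor only matches keywords delimited by non-word characters, and clearing the boundary set lets it match emoji that sit directly between letters. A small sketch of the difference, using an invented emoji-to-name mapping rather than UNICODE_EMOJI:

from flashtext import KeywordProcessor

# With the default word boundaries, an emoji glued to letters is not matched.
kwp = KeywordProcessor()
kwp.add_keyword('😀', ' grinning_face ')
print(kwp.replace_keywords('hello😀world'))   # left unchanged

# With an empty boundary set, every position counts as a boundary and the emoji is replaced.
kwp = KeywordProcessor()
kwp.non_word_boundaries = set()
kwp.add_keyword('😀', ' grinning_face ')
print(kwp.replace_keywords('hello😀world'))   # 'hello grinning_face world'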
Example #8
 def test_file_format_one_first_occ(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_one.txt')
     sentence = 'I know java_2e and product management techniques'
     keywords_extracted = keyword_processor.extract_keywords(
         sentence, stop_at_first_occ=True)
     self.assertEqual(keywords_extracted, ['java'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #9
 def test_list_loading(self):
     keyword_processor = KeywordProcessor()
     keyword_list = ["java", "product management"]
     keyword_processor.add_keywords_from_list(keyword_list)
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #10
def fix_segment(segmented_text):
    keyword_processor = KeywordProcessor()
    fix_word = pd.read_csv('./data/baidu_wrong_segment.csv')
    fix_word = fix_word.fillna('')
    wrong = fix_word['wrong'].tolist()
    correct = fix_word['correct'].tolist()
    for i in range(fix_word.shape[0]):
        keyword_processor.add_keyword(wrong[i], correct[i])
    segmented_text = [
        keyword_processor.replace_keywords(sent) for sent in segmented_text
    ]
    return segmented_text
Example #11
    def fix_mispelled_dict(self, text):
        kp = KeywordProcessor(case_sensitive=True)
        mix_mispelled_dict = {}
        for k, v in self.mispelled_dict.items():
            mix_mispelled_dict[k] = v
            mix_mispelled_dict[k.lower()] = v.lower()
            mix_mispelled_dict[k.upper()] = v.upper()
            mix_mispelled_dict[k.capitalize()] = v.capitalize()

        for k, v in mix_mispelled_dict.items():
            kp.add_keyword(k, v)
        return kp.replace_keywords(text)
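
Because the processor above is built with case_sensitive=True, a misspelling is only corrected when its exact casing was registered, which is why the method adds lower-case, upper-case and capitalized variants of every entry. A small sketch of that behaviour with an invented misspelling map:

from flashtext import KeywordProcessor

kp = KeywordProcessor(case_sensitive=True)
for wrong, right in {'teh': 'the', 'Teh': 'The', 'TEH': 'THE'}.items():
    kp.add_keyword(wrong, right)

print(kp.replace_keywords('Teh cat sat on teh mat'))  # 'The cat sat on the mat'
print(kp.replace_keywords('tEh mat'))                 # unchanged: that casing was never added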
Example #12
class KeyProc:

    with open('./data/cleaned_name.pickle', 'rb') as handle:
        dict_cleaned_name = pickle.load(handle)

    def __init__(self, keyword_dict=dict_cleaned_name):
        # self.text = text
        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_dict(keyword_dict)

    def extractKeywords(self, text):
        return self.kp.extract_keywords(text)

    def replaceKeywords(self, text):
        return self.kp.replace_keywords(text)
Example #13
 def change_file_str(self, currentStr, newStr):
     keyword = KeywordProcessor(case_sensitive=True)
     keyword.add_keyword(currentStr, newStr)
     for fi in self._allLuaFiles:
         if os.path.exists(fi):
             with open(fi, "r+") as f:
                 file_str = f.read()
                 keywords_found = keyword.extract_keywords(file_str)
                 if len(keywords_found) > 0:
                     print("will replace file:", fi)
                     newfileStr = keyword.replace_keywords(file_str)
                     f.seek(0, 0)
                     f.write(newfileStr)
                     f.truncate()  # drop leftover bytes if the new content is shorter
         else:
             print("File does not exist:", fi)
Example #14
 def _replace_execute(self) -> Union[str, List[str]]:
     """
     替换内容解析
     :return:
     """
     # TODO:可能可以用flashtext
     replace_a, to_b = self.script.split(TRANSFORM_TO)
     # 1. Initialize the keyword processor
     keyword_processor = KeywordProcessor(case_sensitive=True)
     # 2. Add the keyword
     keyword_processor.add_keyword(replace_a, to_b)
     try:
         if isinstance(self.target, str):
             # 3. Replace keywords
             return keyword_processor.replace_keywords(self.target)
         elif isinstance(self.target, List):
             return [
                 keyword_processor.replace_keywords(s) for s in self.target
             ]
         else:
             raise Exception()
     except Exception:
         raise RegexException("core.model.Operation._replace_execute",
                              self.script)
Example #15
    def test_dictionary_loading(self):
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": ["java_2e", "java programing"],
            "product management": ["product management techniques", "product management"]
        }
        keyword_processor.add_keywords_from_dict(keyword_dict)

        sentence = 'I know java_2e and product management techniques'
        keywords_extracted = keyword_processor.extract_keywords(sentence)
        self.assertEqual(keywords_extracted, ['java', 'product management'],
                         "Failed file format one test")
        sentence_new = keyword_processor.replace_keywords(sentence)
        self.assertEqual(sentence_new, "I know java and product management",
                         "Failed file format one test")
Example #16
class KeyProc:
    def __init__(self, main_path=None):
        """
        Parameters
        ----------
            main_path : string
                path to project
        """

        with open(f'{main_path}/data/processed/cleaned_name.pickle',
                  'rb') as handle:
            self.dict_cleaned_name = pickle.load(handle)

        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_dict(self.dict_cleaned_name)

    def extractKeywords(self, text):
        return self.kp.extract_keywords(text)

    def replaceKeywords(self, text):
        return self.kp.replace_keywords(text)
Example #17
def multipleReplace(text, wordDict):
    kp_replace = KeywordProcessor()
    for key, replacement in wordDict.items():
        kp_replace.add_keyword(key, replacement)
    # Replace once, after all keywords have been registered.
    mod_text = kp_replace.replace_keywords(text)
    return mod_text
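
A quick usage sketch of the helper above (the dictionary values are invented for illustration):

print(multipleReplace('I drink java every morning',
                      {'java': 'coffee', 'every morning': 'daily'}))
# 'I drink coffee daily'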
Example #18
    f = open(os.path.join(rdf_path, rdf_file), 'r')
    rdf_text = f.read()

    # Get the URIs enclosed in angle brackets <> and encode them
    # TODO: should be put in a utils cleaner
    matches = re.findall(r'\<(.*?)\>', rdf_text)
    matches = list(
        map((lambda x: {
            x: urllib.parse.quote(x, safe='http://')
        }), matches))

    for index in range(len(matches)):
        for key in matches[index]:
            keyword_processor.add_keyword(key, matches[index][key])

    rdf_updated = keyword_processor.replace_keywords(rdf_text)

    print('End encoding of ' + rdf_file + '.')
    print("--- %s seconds ---" % (time.time() - start_enc_time))
    '''
    TODO: This can be useful in case cleaning is needed

    # Get the URIs enclosed in angle brackets <> and replace white spaces and vertical lines
    matches = re.findall(r'\<(.*?)\>', rdf_text)
    matches = list(
        map((lambda x: {x: x.replace(' ', 'WSP').replace('|', 'VRT')}), matches))

    for index in range(len(matches)):
        for key in matches[index]:
            keyword_processor.add_keyword(key, matches[index][key])
Example #19
    print('\n    * The number of attributes is equal to: '
          + str(len(st[0]['attributes'])))

    # ------ Extract data from complete rdf files
    complete = open(rdf_path + name + '/complete.nt').read()
    # Get the URIs enclosed in angle brackets <> and encode them
    # TODO: should be put in a utils cleaner
    matches = re.findall(r'\<(.*?)\>', complete)
    matches = list(
        map((lambda x: {x: urllib.parse.quote(x, safe='http://')}), matches))

    for index in range(len(matches)):
        for key in matches[index]:
            keyword_processor.add_keyword(key, matches[index][key])

    complete_updated = keyword_processor.replace_keywords(complete)

    g = rdflib.Graph()
    graph = g.parse(data=complete_updated, format='turtle')

    res = g.query(
        """
            SELECT (COUNT (DISTINCT ?s) AS ?num_entities)
            WHERE {
                ?s ?p ?o
            }
            """)

    for row in res:
        print("    * Number of entities in complete: %s" % row)
Example #20
from flashtext import KeywordProcessor

# determine the positions of 'New York' in the text
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('New York', 'New York')
keywords_found = keyword_processor.extract_keywords(
    'New York City (NYC), often called the City of New York or simply New York (NY), is the most populous city in the United States. With an estimated 2018 population of 8,398,748 distributed over about 302.6 square miles (784 km2), New York is also the most densely populated major city in the United States.[10] Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.[11] With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the worlds most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce,[12] entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations,[13] New York is an important center for international diplomacy.[14][15]',
    span_info=True)
print(keywords_found)
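# Note: with span_info=True each hit is returned as a (clean_name, start, end) tuple,
# e.g. the first match above comes back as ('New York', 0, 8).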

# change the keyword 'New York City' to 'NYC' and 'New York' to 'NY'
keyword_processor.add_keyword('New York City', 'NYC')
keyword_processor.add_keyword('New York', 'NY')
new_sentence = keyword_processor.replace_keywords(
    'New York City, often called the City of New York or simply New York (NY), is the most populous city in the United States. With an estimated 2018 population of 8,398,748 distributed over about 302.6 square miles (784 km2), New York is also the most densely populated major city in the United States.[10] Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.[11] With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the worlds most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce,[12] entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations,[13] New York is an important center for international diplomacy.[14][15]'
)
print(new_sentence)
Example #21
list_removals = list(set(list_removals))  # unique sets

list_clients = df[df.columns[0]].tolist()
list_industries = df[df.columns[1]].tolist()

str_clients = ", ".join(list_clients)

keyword_processor = KeywordProcessor()

keyword_names = list_removals
clean_names = [' '] * len(keyword_names)

for keyword_name, clean_name in zip(keyword_names, clean_names):
    keyword_processor.add_keyword(keyword_name, clean_name)

clean_str_clients = keyword_processor.replace_keywords(str_clients)
clean_str_clients = re.sub(" +", " ", clean_str_clients)

print("Client name cleaning finished...", "\n")

list_clients = clean_str_clients.split(", ")

cleaned_list_clients = [
    ' '.join(
        set([
            lemmatizer.lemmatize(word) for word in sentence.split(" ")
            if len(lemmatizer.lemmatize(word)) > 2 or word in list_reinclude
        ])) for sentence in list_clients
]

list_tk_clients = []
Example #22
# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# Keyword search
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)
# ['北京', '广东省']

# Keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)
# Out[40]: '邂逅a卡与b卡哪个卡好'


# Extract keywords, case-sensitive
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)
# ['b卡']

# Extract keywords and output their positions
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
Example #23
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('New Delhi', 'NCR region')
new_sentence = keyword_processor.replace_keywords('I love nlp and new delhi.')
print(new_sentence)
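# expected: 'I love nlp and NCR region.' (matching is case-insensitive by default)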
Example #24
from flashtext import KeywordProcessor

string = 'hello from Jamaica! I need grapes'

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Jamaica', 'J Town')

new_sentence = keyword_processor.replace_keywords(string)
print(new_sentence)
# should replace Jamaica with J Town, mon.
Example #25
    def handle(self, *args, **options):
        proto_files = {}
        # app_list = list(apps.all_models)

        # collect path of proto files
        self.stderr.write("Collect proto files...")
        protos_path = os.path.join(BASE_DIR, "microservice", "protos",
                                   "microservice")
        if os.path.exists(protos_path):
            for proto in os.listdir(protos_path):
                proto_files[protos_path] = os.path.join(protos_path, proto)
                self.stderr.write("\t✓ {} found.".format(proto))

        # create python path for messages
        python_out_path = os.path.join(BASE_DIR, "microservice", "message")
        if not os.path.exists(python_out_path):
            os.makedirs(python_out_path)
        else:
            shutil.rmtree(python_out_path)
            os.makedirs(python_out_path)

        # create grpc path for rpc
        grpc_path_out = os.path.join(BASE_DIR, "microservice", "rpc")
        if not os.path.exists(grpc_path_out):
            os.makedirs(grpc_path_out)
        else:
            shutil.rmtree(grpc_path_out)
            os.makedirs(grpc_path_out)

        # compile proto files
        self.stderr.write("Generate...")
        for proto_path, proto_file in proto_files.items():
            command = [
                "grpc_tools.protoc",
                "-I={}".format(proto_path),
                "--python_out={}".format(python_out_path),
                "--grpc_python_out={}".format(grpc_path_out),
            ] + [proto_file]
            if protoc.main(command) != 0:
                self.stderr.write("Failed to generate {}".format(proto_file))
            else:
                self.stderr.write("\t✓ compiled {}".format(proto_file))

        self.stderr.write("✓ Generate completed")

        # correct "import" message in rpc files
        rpc_paths = {}
        for rpc in os.listdir(grpc_path_out):
            rpc_paths[rpc] = os.path.join(grpc_path_out, rpc)

        for rpc_name, rpc_file in rpc_paths.items():
            app_name = "_".join(rpc_name.split('_')[:-2])

            keyword_processor = KeywordProcessor()
            # keyword_processor.add_keyword(<unclean name>, <standardised name>)
            keyword_processor.add_keyword(
                'import {} as {}'.format(app_name + "_pb2",
                                         app_name.replace("_", "__") +
                                         "__pb2"),
                'from microservice.message import {} as {}'.format(
                    app_name + "_pb2",
                    app_name.replace("_", "__") + "__pb2"))

            with open(rpc_file, 'r') as old_file:
                new_code = keyword_processor.replace_keywords(old_file.read())
            with open(rpc_file, 'w') as new_file:
                new_file.write(new_code)
Example #26
def generate_listing(settings):
    """
    Generate glossary listing

    :param settings: settings dict
    :return: html content
    """

    glossary_registry = load_glossary_registry(source=settings['data-source'])

    if glossary_registry and 'glossary' in glossary_registry and glossary_registry[
            'glossary']:
        if 'sets' in glossary_registry and settings[
                'set'] in glossary_registry['sets']:
            glossary = glossary_registry['sets'][settings['set']]

        else:
            glossary = glossary_registry['glossary']

        if settings['sort']:
            glossary = collections.OrderedDict(sorted(glossary.items()))

        logger.warn('`pelican-bglossary` has [{term_count}] terms'.format(
            term_count=len(glossary)))

        from flashtext import KeywordProcessor
        keyword_processor = KeywordProcessor()

        from textblob import TextBlob

        intra_links = {}
        for item_key, item in glossary.items():
            term = item.get('term', '')
            anchor = term.lower().replace(' ', '-').replace('.', '').replace(
                '(', '').replace(')', '').replace('/', '')
            anchor = unicodedata.normalize('NFKD', anchor).encode(
                'ascii', 'ignore').decode('utf-8')

            for t in term.split(','):
                t = t.strip()

                tt = TextBlob(t)
                if tt.tags[-1][1] == 'NN':
                    if 'abbreviation' in item:
                        anchor += '-' + item.get('abbreviation', '').replace(
                            ' ', '-').lower()

                    words = list(tt.words)
                    words[-1] = tt.words[-1].pluralize()
                    t_plural = " ".join(words)

                    words = list(tt.words)
                    words[-1] = tt.words[-1].singularize()
                    t_singular = " ".join(words)

                    keyword_processor.add_keyword(
                        t_singular.lower(),
                        u'<a href="#{anchor}">{term}</a>'.format(
                            anchor=anchor, term=t_singular.lower()))
                    keyword_processor.add_keyword(
                        t_plural.lower(),
                        u'<a href="#{anchor}">{term}</a>'.format(
                            anchor=anchor, term=t_plural.lower()))

                if len(t.split(' ')) > 1:
                    intra_links[term.lower()] = anchor

                    if 'abbreviation' in item:
                        intra_links[item.get('abbreviation',
                                             '').lower()] = anchor

                        keyword_processor.add_keyword(
                            t.lower(),
                            u'<a href="#{anchor}">{term}</a>'.format(
                                anchor=anchor, term=item.get('abbreviation')))

                    keyword_processor.add_keyword(
                        t.lower(),
                        u'<a href="#{anchor}">{term}</a>'.format(anchor=anchor,
                                                                 term=t))

        for item_key, item in glossary.items():
            term = item.get('term', '')
            anchor = term.lower().replace(' ', '-').replace('.', '').replace(
                '(', '').replace(')', '').replace('/', '')
            anchor = unicodedata.normalize('NFKD', anchor).encode(
                'ascii', 'ignore').decode('utf-8')

            for t in term.split(','):
                if len(t.split(' ')) == 1:
                    if 'abbreviation' in item:
                        anchor += '-' + item.get('abbreviation', '').replace(
                            ' ', '-').lower()

                    intra_links[term.lower()] = anchor

                    if 'abbreviation' in item:
                        intra_links[item.get('abbreviation',
                                             '').lower()] = anchor

                        keyword_processor.add_keyword(
                            t.lower(),
                            u'<a href="#{anchor}">{term}</a>'.format(
                                anchor=anchor, term=item.get('abbreviation')))

                    keyword_processor.add_keyword(
                        t.lower(),
                        u'<a href="#{anchor}">{term}</a>'.format(anchor=anchor,
                                                                 term=t))

        html = "\n"
        alphabet_header_template = Template(settings['alphabet-template'][
            settings['mode']].strip('\t\r\n').replace('&gt;', '>').replace(
                '&lt;', '<'))

        current_letter = None
        term_count = 0
        translation_counts = {}
        active_translations = []
        for field in settings['fields']:
            if len(field) == 2:
                active_translations.append(field)
        for glossary_key, glossary_item in glossary.items():
            if current_letter != glossary_item['term'][0].upper():
                if settings['show-dividers']:
                    html += alphabet_header_template.render(
                        alphabet=glossary_item['term'][0].upper())

                current_letter = glossary_item['term'][0].upper()

            if 'definition' in glossary_item:

                glossary_item[
                    'definition'] = keyword_processor.replace_keywords(
                        glossary_item['definition'])

            if 'intra_link' in glossary_item:
                glossary_item[
                    'intra_link'] = keyword_processor.replace_keywords(
                        glossary_item['intra_link'])

            html += generate_listing_item(
                glossary_item=glossary_item,
                settings=settings,
            ) + "\n"

            term_count += 1

            for lang in active_translations:
                if lang in glossary_item and glossary_item[lang]:
                    if lang not in translation_counts:
                        translation_counts[lang] = 0
                    translation_counts[lang] += 1

        html += "\n"

        template = Template(
            settings['template'][settings['mode']].strip('\t\r\n').replace(
                '&gt;', '>').replace('&lt;', '<'))

        return BeautifulSoup(
            template.render(
                list=html,
                header=settings.get('header'),
                site_url=settings.get('site-url'),
                panel_color=settings.get('panel-color'),
                show_stats=settings.get('show-stats'),
                show_search=settings.get('show-search'),
                show_lang_selector=settings.get('show-lang-selector'),
                latest_update=glossary_registry['modification_date'],
                term_count=term_count,
                translation_counts=translation_counts), "html.parser")

    else:
        return ''
Example #27
import os
# import flashtext
from flashtext import KeywordProcessor
import fileinput

kp = KeywordProcessor()

kp.add_keyword('另外', '现在')
kp.add_keyword("第二部分", "第三部分")
# print(os.path.abspath('.')
# print(os.path.abspath(os.path.dirname(__file__)))
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

print(BASE_DIR)
# for i in os.listdir(BASE_DIR):
#     print(i)
#     if i == "06.txt":
# j = kp.replace_keywords(i)
i = BASE_DIR + '/06.txt'
for line in fileinput.input(i):
    j = kp.replace_keywords(line)
    print(j)
Example #28
# https://github.com/vi3k6i5/flashtext

# Keyword search
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)
# ['北京', '广东省']

# Keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)
# Out[40]: '邂逅a卡与b卡哪个卡好'

# Extract keywords, case-sensitive
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)
# ['b卡']

# Extract keywords and output their positions
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
Example #29
class FromDatasetKeyTermsPredictor(Predictor):
    """ Predicts key terms of a text by looking up terms in a dataset."""

    location_string = None
    dataset = pd.DataFrame(columns=["term", "parent_terms"], dtype=str)
    flashtext = None
    key_terms_marked_for_removal = []

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        self.load_dataset(predictor_config["location"])

    @property
    def config_validation_schema_custom_part(self):
        return yaml.load(
            """
            location:
                type: string
                regex: "^.+?:.+"
                required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_dataset(self, location_string):
        # update location_string
        self.location_string = location_string
        # load data
        self.dataset = DatasetManager.load_dataset_from_location_string(
            location_string, {
                "term": str,
                "parent_terms": str
            })[0]
        # setup flashtext for later string replacements
        temp_replace_against_dataset = self.dataset.copy()
        temp_replace_against_dataset["replace"] = temp_replace_against_dataset[
            "term"]
        temp_replace_against_dataset["against"] = temp_replace_against_dataset[
            "replace"]
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] != "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``PK``" +
                          temp_replace_against_dataset["parent_terms"] + "`´")
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] == "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``SK`´")
        temp_replace_against_dataset = temp_replace_against_dataset[[
            "replace", "against"
        ]]
        temp_replace_against_dataset_as_dict = {
            row["against"]: [row["replace"]]
            for index, row in temp_replace_against_dataset.iterrows()
        }
        self.flashtext = KeywordProcessor()
        self.flashtext.add_keywords_from_dict(
            temp_replace_against_dataset_as_dict)

    def add_key_term_to_dataset(self, key_term, parent_terms):
        new_row = pd.DataFrame({
            "term": [key_term],
            "parent_terms": [parent_terms]
        })
        self.dataset = self.dataset.append(new_row)
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict(
                {"`{}``PK``{}`´".format(key_term, parent_terms): [key_term]})
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SK`´".format(key_term): [key_term]})

    def remove_key_term_from_dataset(self, key_term):
        self.dataset = self.dataset[self.dataset.term != key_term]
        self.flashtext.remove_keyword(key_term)

    def save_dataset(self, location_string):
        # sort the key terms dataset for convenience
        self.dataset["sort"] = self.dataset["term"].str.lower()
        self.dataset = self.dataset.sort_values(by=["sort"])
        del self.dataset["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(self.dataset,
                                                       location_string)

    def mark_key_term_for_removal(self, key_term):
        if key_term not in self.key_terms_marked_for_removal:
            self.key_terms_marked_for_removal.append(key_term)

    def reset_key_terms_marked_for_removal(self):
        self.key_terms_marked_for_removal = []

    def learn_from_annotated_text(self, annotated_text, language):
        # get terms to add/update
        key_terms_to_add = {}
        parented_terms_to_update = []
        existing_terms_list = list(self.dataset["term"])
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=["standalone_key_term", "parented_key_term"],
        ):
            if annotation["term"] not in existing_terms_list:
                # term does not exist yet
                key_terms_to_add = merge_dict(
                    key_terms_to_add,
                    {
                        annotation["term"]:
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
            else:
                # term exists but may need update due to different parent terms
                if "parent_terms" in annotation:
                    currently_stored_parent_terms = list(
                        self.dataset[self.dataset["term"] ==
                                     annotation["term"]]["parent_terms"])[0]
                    if currently_stored_parent_terms != annotation[
                            "parent_terms"]:
                        # needs update
                        key_terms_to_add = merge_dict(
                            key_terms_to_add,
                            {
                                annotation["term"]:
                                annotation["parent_terms"]
                                if "parent_terms" in annotation else ""
                            },
                        )
                        parented_terms_to_update.append(annotation["term"])

        # get total terms to remove
        key_terms_to_remove = [
            key_term for key_term in self.key_terms_marked_for_removal
            if key_term not in key_terms_to_add
        ]
        key_terms_to_remove.extend(parented_terms_to_update)

        # update key terms dataset (incl. flashtext)
        # remove
        if key_terms_to_remove:
            for key_term in key_terms_to_remove:
                self.remove_key_term_from_dataset(key_term)
        # add
        if key_terms_to_add:
            for key_term in key_terms_to_add:
                self.add_key_term_to_dataset(key_term,
                                             key_terms_to_add[key_term])
        # save
        self.save_dataset(self.location_string)

    def predict_inline_annotations(self, text, language="en-US"):
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
Example #30
class FromDatasetsNamedEntitiesPredictor(Predictor):
    """ Predicts named entities of a text by looking up terms in a dataset."""

    location_strings = {}
    dataset = pd.DataFrame(columns=["term", "entity_code", "parent_terms"],
                           dtype=str)
    flashtext = None
    marked_for_removal = []

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        self.load_datasets(predictor_config["datasets"])

    @property
    def config_validation_schema_custom_part(self):
        return yaml.load(
            """
            datasets:
                type: list
                schema:
                    type: dict
                    schema:
                        code:
                            type: string
                            required: True
                        location:
                            type: string
                            regex: "^.+?:.+"
                            required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_datasets(self, entity_code_location_string_dict):
        for entity_code_location_string in entity_code_location_string_dict:
            entity_code = entity_code_location_string["code"]
            location_string = entity_code_location_string["location"]
            # remember location string
            self.location_strings[entity_code] = location_string
            # load entities into dataset
            new_data = DatasetManager.load_dataset_from_location_string(
                location_string, {
                    "term": str,
                    "entity_code": str,
                    "parent_terms": str
                })[0]
            self.dataset = self.dataset.append(new_data)
            # update flashtext
            self.flashtext = KeywordProcessor()
            data_for_flashtext = pd.DataFrame({
                "against": [
                    "`{}``SN``{}`´".format(row["term"], row["entity_code"])
                    if not row["parent_terms"] else "`{}``PN``{}``{}`´".format(
                        row["term"], row["entity_code"], row["parent_terms"])
                    for index, row in self.dataset.iterrows()
                ],
                "replace":
                self.dataset["term"],
            })
            dict_for_flashtext = data_for_flashtext.set_index(
                "against").T.to_dict("list")
            self.flashtext.add_keywords_from_dict(dict_for_flashtext)

    def add_named_entity_term_to_dataset(self, term, entity_code,
                                         parent_terms):
        new_row = pd.DataFrame({
            "term": [term],
            "entity_code": [entity_code],
            "parent_terms": [parent_terms],
        })
        self.dataset = self.dataset.append(new_row)
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict({
                "`{}``PN``{}``{}`´".format(term, entity_code, parent_terms):
                [term]
            })
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SN``{}`´".format(term, entity_code): [term]})

    def remove_named_entity_term_from_dataset(self, term, entity_code):
        self.dataset = self.dataset[~(
            (self.dataset["term"] == term)
            & (self.dataset["entity_code"] == entity_code))]
        self.flashtext.remove_keyword(term)

    def save_dataset(self, location_string, entity_code):
        # get the named entities with the specified entity code
        filtered_named_entities = self.dataset[self.dataset["entity_code"] ==
                                               entity_code].copy()
        # sort the filtered named entities for convenience
        filtered_named_entities["sort"] = filtered_named_entities[
            "term"].str.lower()
        filtered_named_entities = filtered_named_entities.sort_values(
            by=["sort"])
        del filtered_named_entities["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(filtered_named_entities,
                                                       location_string)

    def mark_named_entity_term_for_removal(self, term, entity_code):
        if (term, entity_code) not in self.marked_for_removal:
            self.marked_for_removal.append((term, entity_code))

    def reset_marked_for_removal(self):
        self.marked_for_removal = []

    def get_parent_terms_for_named_entity(self, term, entity_code):
        # check if we have corresponding parent terms in the named entities dataset
        dataset_query_result = list(
            self.dataset[(self.dataset["entity_code"] == entity_code)
                         & (self.dataset["term"] == term)]["parent_terms"])
        if len(dataset_query_result) > 0:
            # we got a row back
            # return either the parent terms or None depending on parent_terms value in dataset
            dataset_query_result = dataset_query_result[0]
            return (None if dataset_query_result is None or
                    pd.isnull(dataset_query_result) else dataset_query_result)
        else:
            # no, no parent terms found in dataset
            return None

    def learn_from_annotated_text(self, annotated_text, language):
        # note: the definition of a "term" within this function is a tuple of term and entity code
        # get terms to add/update
        terms_to_add = {}
        parented_terms_to_update = []
        affected_entity_codes = []
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=[
                    "standalone_named_entity", "parented_named_entity"
                ],
        ):
            if (len(self.dataset[(self.dataset["term"] == annotation["term"])
                                 & (self.dataset["entity_code"] ==
                                    annotation["entity_code"])]) == 0):
                # term does not exist yet
                terms_to_add = merge_dict(
                    terms_to_add,
                    {
                        (annotation["term"], annotation["entity_code"]):
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
                affected_entity_codes.append(annotation["entity_code"])
            else:
                # term exists but may need update due to different parent terms
                if "parent_terms" in annotation:
                    currently_stored_parent_terms = list(self.dataset[
                        (self.dataset["term"] == annotation["term"])
                        & (self.dataset["entity_code"] ==
                           annotation["entity_code"])]["parent_terms"])[0]
                    if currently_stored_parent_terms != annotation[
                            "parent_terms"]:
                        # needs update
                        terms_to_add = merge_dict(
                            terms_to_add,
                            {
                                (
                                    annotation["term"],
                                    annotation["entity_code"],
                                ):
                                annotation["parent_terms"]
                                if "parent_terms" in annotation else ""
                            },
                        )
                        parented_terms_to_update.append(
                            (annotation["term"], annotation["entity_code"]))
                        affected_entity_codes.append(annotation["entity_code"])

        # get total terms to remove
        terms_to_remove = []
        for term in self.marked_for_removal:
            if term in terms_to_add:
                continue
            terms_to_remove.append(term)
            affected_entity_codes.append(term[1])
        terms_to_remove.extend(parented_terms_to_update)

        # update key terms dataset (incl. flashtext)
        # remove
        if terms_to_remove:
            for term in terms_to_remove:
                self.remove_named_entity_term_from_dataset(term[0], term[1])
        # add
        if terms_to_add:
            for term in terms_to_add:
                self.add_named_entity_term_to_dataset(term[0], term[1],
                                                      terms_to_add[term])
        # save
        for affected_entity_code in affected_entity_codes:
            if affected_entity_code in self.location_strings:
                self.save_dataset(self.location_strings[affected_entity_code],
                                  affected_entity_code)

    def predict_inline_annotations(self, text, language="en-US"):
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
Example #31
from flashtext import KeywordProcessor

doc = """
    Hello, there. I'm the better python programmer in this world.
    Anything more, you can send me an email.
"""

p = KeywordProcessor()
p.add_keyword('email')
found = p.extract_keywords(doc)
print("Result: ", found)

p = KeywordProcessor(case_sensitive=False)
p.add_keyword('email', 'message')
found = p.replace_keywords(doc)
print(found)