Example No. 1
def find_keyword(keyword_dict, text_to_test):

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)

    return result
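
A minimal usage sketch for the helper above; the sample dictionary and sentence are assumed and not part of the original snippet. Each dictionary key is the clean name returned for any of its listed surface forms, and matching is case-insensitive by default.

sample_dict = {"python": ["python3", "cpython"], "java": ["jvm", "openjdk"]}
# expected: ['python', 'java']
print(find_keyword(sample_dict, "We run CPython jobs next to a JVM service"))
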
Example No. 2
    def __init__(self, f_dict=None, prefix=''):
        '''
        Initialize the parser.

        Args:
            f_dict: filename, location of the replacement dictionary.
            prefix: string, text to prefix each replacement.
        '''
        self.logger = logging.getLogger(__name__)

        if f_dict is None:
            local_path = os.path.dirname(__file__)
            f_dict = os.path.join(local_path, self.f_MeSH)
            self.logger.debug('Using default dictionary: %s' % f_dict)

        if not os.path.exists(f_dict):
            msg = "Can't find dictionary {}".format(f_dict)
            self.logger.error(msg)
            raise IOError(msg)

        self.prefix = prefix
        terms = collections.defaultdict(list)

        with open(f_dict) as FIN:
            csvfile = csv.DictReader(FIN)
            for row in csvfile:
                terms[row["replacement"]].append(row['term'])

        self.FT = KeywordProcessor()
        self.FT.add_keywords_from_dict(terms)
Example No. 3
    def load_datasets(self, entity_code_location_string_dict):
        for entity_code_location_string in entity_code_location_string_dict:
            entity_code = entity_code_location_string["code"]
            location_string = entity_code_location_string["location"]
            # remember location string
            self.location_strings[entity_code] = location_string
            # load entities into dataset
            new_data = DatasetManager.load_dataset_from_location_string(
                location_string, {
                    "term": str,
                    "entity_code": str,
                    "parent_terms": str
                })[0]
            self.dataset = self.dataset.append(new_data)
            # update flashtext
            self.flashtext = KeywordProcessor()
            data_for_flashtext = pd.DataFrame({
                "against": [
                    "`{}``SN``{}`´".format(row["term"], row["entity_code"])
                    if not row["parent_terms"] else "`{}``PN``{}``{}`´".format(
                        row["term"], row["entity_code"], row["parent_terms"])
                    for index, row in self.dataset.iterrows()
                ],
                "replace":
                self.dataset["term"],
            })
            dict_for_flashtext = data_for_flashtext.set_index(
                "against").T.to_dict("list")
            self.flashtext.add_keywords_from_dict(dict_for_flashtext)
Example No. 4
    def test_remove_keywords_dictionary_len(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Remove the keywords in remove_keyword_dict
        Extract keywords and check if they match the expected result for the test case.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keyword_processor.remove_keywords_from_dict(
                test_case['remove_keyword_dict'])

            kp_len = len(keyword_processor)

            new_dictionary = defaultdict(list)
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    if not (key in test_case['remove_keyword_dict'] and value
                            in test_case['remove_keyword_dict'][key]):
                        new_dictionary[key].append(value)

            keyword_processor_two = KeywordProcessor()
            keyword_processor_two.add_keywords_from_dict(new_dictionary)
            kp_len_two = len(keyword_processor_two)
            self.assertEqual(
                kp_len, kp_len_two,
                "keyword processor length doesn't match for Text ID {}".format(
                    test_id))
Example No. 5
def _create_flashtext_object():
    """
    Instantiates a Flashtext object.
    Separators are specified to not be considered as word boundaries
    """
    keyword_processor = KeywordProcessor()
    # special characters are added because the flashtext library does not natively treat them as word characters
    for separator in [
            "-",
            "_",
            "/",
            "é",
            "è",
            "ê",
            "â",
            "ô",
            "ö",
            "ü",
            "û",
            "ù",
            "ï",
            "î",
            "æ",
    ]:
        keyword_processor.add_non_word_boundary(separator)
    return keyword_processor
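
A short, hedged demonstration of why the non-word boundaries matter; the keyword and sentences are assumed for illustration. With "-" registered as part of a word, "e-mail" is one contiguous token, so the bare keyword "mail" no longer fires inside it.

keyword_processor = _create_flashtext_object()
keyword_processor.add_keyword("mail")
print(keyword_processor.extract_keywords("please check the e-mail"))  # expected: []
print(keyword_processor.extract_keywords("please check the mail"))    # expected: ['mail']
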
Example No. 6
    def __call__(self, from_file=False, file_path=None, stdout=False):
        watchlist = self.open_watchlist()

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(watchlist)

        if from_file:
            egg_path = _ask_spy_file()
        elif file_path:
            egg_path = file_path
        else:
            raise ValueError("Expected egg, from_file or file_path argument.")

        gui = Scanner(egg_path)

        for line, reaction in enumerate(open(egg_path).readlines(), start=1):
            if reaction and not reaction.startswith('#'):
                match = keyword_processor.extract_keywords(reaction)
                if match:
                    matches = ', '.join(match)
                    reaction = reaction.strip()
                    gui.insert_data(line, matches, reaction)

                    if stdout:
                        log.info(
                            f"Line {line}: {', '.join(match)} \n{reaction}")
Example No. 7
class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    '''
    This class lets you use a topic model that has multi-token entries (i.e., topic terms that contain spaces).
    It requires Flashtext to be installed.
    '''
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        from flashtext import KeywordProcessor
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model
        for keyphrase in reduce(lambda x, y: set(x) | set(y), topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)


    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(doc))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
Example No. 8
def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Write this in a for loop to keep track of the progress
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))

        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # save the results for evaluation
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
Example No. 9
class FeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model.copy()
        if keyword_processor_args.get('case_sensitive', None) is False:
            for k, v in self._topic_model.items():
                self._topic_model[k] = [e.lower() for e in v]
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                self._topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(str(doc)))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
Example No. 10
def processing(i, file_list):
    tk = RegexpTokenizer(r'\w\S+\w')

    # create English stop words list
    en_stop = get_stop_words('en')

    stopword_processor = KeywordProcessor()
    for w in en_stop:
        stopword_processor.add_keyword(w, ' ')

    with open('stopword_processor.pkl', 'wb') as f:
        pickle.dump(stopword_processor, f)

    p_stemmer = PorterStemmer()

    with codecs.open('whole_dialogs_stem_%d' % i, 'w', 'utf-8') as out:
        for fi in tqdm(file_list):
            with codecs.open(fi, 'r', 'utf-8') as f:
                sentences = [
                    stopword_processor.replace_keywords(
                        line.strip().split('\t')[-1].lower()) for line in f
                ]
                words = functools.reduce(lambda x, y: x + y,
                                         map(tk.tokenize, sentences))
                words = map(p_stemmer.stem, words)
                out.write(' '.join(words) + '\n')
Example No. 11
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information about the FlashText algorithm.
    """
    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()

        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text,
                                                           span_info=True)

        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
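
A hedged sketch of how a recognizer might be registered with this backend. The FlashtextRecognizer base class is not shown in the snippet, so the hypothetical CityRecognizer below only carries the attributes the backend actually reads (TAG, SCORE, keywords), and run() still relies on the snippet's NamedEntity class.

class CityRecognizer:
    TAG = "CITY"
    SCORE = 0.9
    keywords = ["berlin", "paris", "new york"]

backend = FlashtextBackend()
backend.register_recognizer(CityRecognizer)
# each match becomes a NamedEntity carrying the tag, score and recognizer name
print(backend.run("She moved from Paris to New York"))
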
Example No. 12
async def main(job_title: str, sh_links_we_already_have: list[str],
               skills: dict[str, list[str]]):
    # Import this function to collect vacancies for a given job title.
    fake_agent = get_user_agent()
    async with ClientSession(headers={
            "user-agent": fake_agent,
            "Connection": "close"
    }) as session:
        all_links = await scan_all_search_results(job_title, session)
        for _ in range(10):
            try:
                vacancies_without_skills = await fetch_all_vacancy_pages(
                    all_links, sh_links_we_already_have, session)
                keyword_processor = KeywordProcessor()
                keyword_processor.add_keywords_from_dict(skills)
                collected_jobs = (
                    process_vacancy_content(vacancy_without_skills,
                                            keyword_processor)
                    for vacancy_without_skills in vacancies_without_skills
                    if vacancy_without_skills is not None)
                await asyncio.sleep(60)
                return collected_jobs
            except OSError:
                logger.warning(f"🚨 OSError occured for {job_title}.")
        # If couldn't recover after errors, then return an empty list.
        await asyncio.sleep(60)
        return []
Example No. 13
def get_DL(s):
    dl = False
    dl_valid = False
    dl_State = ""
    arr = ['driver', 'license']
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('california')
    if any(re.findall('|'.join(arr), s)):
        dl = True
    if dl:
        sentences = sent_tokenize(s)
        selected_sentence = [
            sent for sent in sentences if "driver" in word_tokenize(sent)
        ]
        if (len(selected_sentence) > 0):
            words = selected_sentence[0].split()
            selected_word = [word for word in words if "valid" in word]
            if len(selected_word) > 0:
                dl_valid = True
        for i in range(len(selected_sentence)):
            keywords_found = keyword_processor.extract_keywords(
                selected_sentence[i])
            for keyword_found in keywords_found:
                if keyword_found == 'california':
                    dl_State = "CA"

    if dl_valid:
        dl_valid = "R"
    else:
        dl_valid = "P"
    return dl_valid, dl_State
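
A hedged usage sketch for the function above; the sentence is assumed. With the keyword 'california' registered, the function returns "R" when the text mentions a valid license and "CA" when California is named.

print(get_DL("Applicants must have a valid California driver license."))
# expected: ('R', 'CA')
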
Example No. 14
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    #synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
Example No. 15
def rm_main():
    ScienceEvent = {
        "scienza", "universo", "geologia", "biologia", "scientifico",
        "scientifica", "scienziato", "scienzata"
    }
    VisualArtEvent = {
        "pittura", "scultura", "artista", "artisti", "art", "opere", "opera",
        "part"
    }
    keyPArt = KeywordProcessor()
    keyPScience = KeywordProcessor()
    for element in ScienceEvent:
        keyPScience.add_keyword(element)
    for element in VisualArtEvent:
        keyPArt.add_keyword(element)

    url = 'https://raw.githubusercontent.com/andreamatt/KDI/master/dataset/muse.json'
    obj = json.loads(requests.get(url).text)
    eventsArray = obj["events"]
    while {} in eventsArray:
        eventsArray.remove({})
    for event in eventsArray:
        for k, v in event.items():
            event[k] = v.replace('\n', '; ')
        artList = keyPArt.extract_keywords(event["description"])
        scienceList = keyPScience.extract_keywords(event["description"])
        if len(artList) > len(scienceList):
            event.update({'Subcategory': visual})
        elif len(artList) <= len(scienceList):
            event.update({'Subcategory': science})

    obj["events"] = eventsArray
    return json.dumps(obj)
Example No. 16
    def load_dataset(self, location_string):
        # update location_string
        self.location_string = location_string
        # load data
        self.dataset = DatasetManager.load_dataset_from_location_string(
            location_string, {
                "term": str,
                "parent_terms": str
            })[0]
        # setup flashtext for later string replacements
        temp_replace_against_dataset = self.dataset.copy()
        temp_replace_against_dataset["replace"] = temp_replace_against_dataset[
            "term"]
        temp_replace_against_dataset["against"] = temp_replace_against_dataset[
            "replace"]
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] != "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``PK``" +
                          temp_replace_against_dataset["parent_terms"] + "`´")
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] == "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``SK`´")
        temp_replace_against_dataset = temp_replace_against_dataset[[
            "replace", "against"
        ]]
        temp_replace_against_dataset_as_dict = {
            row["against"]: [row["replace"]]
            for index, row in temp_replace_against_dataset.iterrows()
        }
        self.flashtext = KeywordProcessor()
        self.flashtext.add_keywords_from_dict(
            temp_replace_against_dataset_as_dict)
Example No. 17
class Tokenizer(object):
    def __init__(self):
        log.info("Tokenizer initialization")

        from .lookup_dic import _PhraseDictionary as __PD
        log.debug("Tokenizer: calls lookup_dic.read_phrases")
        self.lookup_dic = __PD()

        log.debug("Instanciate flashtext.KeyworkProcessor")
        self.__keyword_processor = KeywordProcessor()
        log.debug("Insert data into flashtext.KeyworkProcessor instance.")
        self.__keyword_processor.add_keywords_from_dict(
            self.lookup_dic.lookup_dic_CODE)
        log.info("Tokenizer initialization successful")

    def tokenize(self, text):
        log.debug(f"Tokenizer called on {text}")

        log.debug("Phase I: Replacing phrases.")
        text = self.__keyword_processor.replace_keywords(text)

        log.debug("Phase II: Split by space.")
        tokens_list = text.split()

        log.debug("Phase III: Replace back token id to its original form.")
        tokens_list = [
            self.lookup_dic.reverse_replace(token)
            if token in self.lookup_dic.lookup_dic_CODE else token
            for token in tokens_list
        ]

        return tokens_list

    def __call__(self, text):
        return self.tokenize(text)
Example No. 18
def load_gbif_categories(
        gbif_extract_file="../data/gbif_extract_categories.csv"):
    """ load gbif categies dataset """
    """ input: csv file name """
    """ ourput: fleshtext keyword_processor """

    df_canonicalName = pd.read_csv(gbif_extract_file, sep=";")
    print(
        gbif_extract_file,
        ", loaded",
        df_canonicalName.shape,
        list(df_canonicalName.columns),
    )
    kwords = list(df_canonicalName["name"].values)
    keyword_processor = KeywordProcessor()
    for k in kwords:
        tab = k.split(" ")
        for y in tab:
            if len(y) > 0:
                keyword_processor.add_keyword(y)
    print("len(keyword_processor):", len(keyword_processor))

    # ['family' 'genus' 'species' 'subspecies']
    print(df_canonicalName["rank"].unique())
    return keyword_processor
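
A hedged usage sketch: the CSV path below is the snippet's default and is assumed to exist locally; extract_keywords then flags every word of the text that appears among the GBIF names.

keyword_processor = load_gbif_categories("../data/gbif_extract_categories.csv")
print(keyword_processor.extract_keywords("Leaves of Quercus robur were sampled"))
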
Example No. 19
    def __init__(self, **kwargs):
        self.label = kwargs.get('label', True)
        filename_to_load = kwargs.get('filename', None)
        self.skip_misses = kwargs.get('skip_misses', True)
        self.include_hashtags = kwargs.get('include_hashtags', True)

        # Load the keywords
        '''keywords = []
        with open(filename_to_load) as my_input:
            for line in my_input:
                phrase = line.strip().lower()
                keywords.append(phrase)

                if self.include_hashtags and not phrase.startswith('#'):
                    phrase = '#' + phrase
                    keywords.append(phrase)

        self.trie = Trie(keywords)'''

        # flashtext implementation
        self.trie = KeywordProcessor()
        with open(filename_to_load) as my_input:
            for line in my_input:
                phrase = line.strip().lower()
                self.trie.add_keyword(phrase)
                if self.include_hashtags and not phrase.startswith('#'):
                    phrase = '#' + phrase
                    self.trie.add_keyword(phrase)
Example No. 20
def get_sentences_for_keyword(keywords, sentences):
    """
    
        For each keyword, find the sentence(s) that correspond to that keyword
    
    """

    keyword_processor = KeywordProcessor(
    )  # use this implementation as fast alternative to keyword matching
    keyword_sentences = {}

    # loop through all keywords
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    # loop through each sentence and keyword
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    return keyword_sentences
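
A small usage sketch with assumed sample data: every keyword maps to the sentences that mention it, longest sentence first.

sentences = [
    "FlashText replaces keywords in one pass.",
    "Regular expressions can be slow when there are many keywords.",
]
print(get_sentences_for_keyword(["keywords", "FlashText"], sentences))
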
Example No. 21
    def test_replace_keywords(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Replace keywords and check if they match the expected result for the test case.

        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_replacer = KeywordProcessor()
            # To handle issue like https://github.com/vi3k6i5/flashtext/issues/8
            # clean names are replaced with "_" in place of white space.
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    keyword_replacer.add_keyword(value, key.replace(" ", "_"))
            new_sentence = keyword_replacer.replace_keywords(test_case['sentence'])

            replaced_sentence = test_case['sentence']
            keyword_mapping = {}
            for val in test_case['keyword_dict']:
                for value in test_case['keyword_dict'][val]:
                    keyword_mapping[value] = val.replace(" ", "_")
            for key in sorted(keyword_mapping, key=len, reverse=True):
                lowercase = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(key)))
                replaced_sentence = lowercase.sub(keyword_mapping[key], replaced_sentence)

            self.assertEqual(new_sentence, replaced_sentence,
                             "new_sentence don't match the expected results for test case: {}".format(test_id))
Example No. 22
    def __init__(self):
        self.req_params = None
        self.keywords = []
        self.islands_visited = {}
        self.keyword_processor = KeywordProcessor()
        self.price_threshold = 0
        self.response = None
Example No. 23
 def __init__(self, crawl_type="", keywords=[]):
     super(StdOutListener, self).__init__()
     self.crawl_type = crawl_type
     self.keywords = keywords
     self.tweet_dump = []
     self.keyword_processor = KeywordProcessor()
     self.keyword_processor.add_keywords_from_list(self.keywords)
Example No. 24
def make_ne_founder(DICT_FILE):
    keyword_processor = KeywordProcessor(case_sensitive=False)
    with open(DICT_FILE, 'r') as r:
        vocab_list = [vocab for vocab in r]
    for vocab in vocab_list:
        keyword_processor.add_keyword(vocab.strip('\n'))
    return keyword_processor
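
A minimal, hypothetical usage of the factory above; "ne_dict.txt" is an assumed file with one named-entity surface form per line (e.g. person or place names).

ne_founder = make_ne_founder("ne_dict.txt")
print(ne_founder.extract_keywords("Barack Obama visited Paris last week"))
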
Example No. 25
    def __init__(self, document_id, ip_document, dictionary_terms):
        self.document_id = document_id
        self.ip_document = ip_document
        self.dictionary_terms = dictionary_terms
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
        self.result = []
Example No. 26
def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]

    return keyword_sentences
Example No. 27
    def extract_summary(self):
        result = {}
        result['Development Languages'] = []

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(self.skills_list)

        for keyword in self.candidate_keywords:
            matched_keyword = keyword_processor.extract_keywords(
                self.stemmer.stem(keyword))

            if not matched_keyword:
                # print('"{}" is not matched in our skill database.'.format(keyword))
                continue

            for i in range(len(matched_keyword)):
                category, name = str(matched_keyword[i]).split(": ", 1)

                if category not in result:
                    result[category] = {}

                if name not in result[category]:
                    result[category][name] = []

                result[category][name].append(keyword)

                if category == 'Tools & Technologies':
                    result['Development Languages'].append(keyword)

        return result
Example No. 28
    def test_remove_keywords_dictionary_compare(self):
        """For each test case, initialize a new KeywordProcessor.
        Add the keywords from the test case to the KeywordProcessor.
        Remove the keywords in remove_keyword_dict.
        Extract keywords and check if they match the expected result for the test case.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keyword_processor.remove_keywords_from_dict(
                test_case['remove_keyword_dict'])
            keyword_trie_dict = keyword_processor.keyword_trie_dict

            new_dictionary = defaultdict(list)
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    if not (key in test_case['remove_keyword_dict'] and value
                            in test_case['remove_keyword_dict'][key]):
                        new_dictionary[key].append(value)

            keyword_processor_two = KeywordProcessor()
            keyword_processor_two.add_keywords_from_dict(new_dictionary)
            keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
            self.assertTrue(
                keyword_trie_dict == keyword_trie_dict_two,
                "keywords_extracted don't match the expected results for test case: {}"
                .format(test_id))
Example No. 29
    def __init__(self, type_link, isLower, iscase_sensitive, currFlag):
        self.keyword_processor = KeywordProcessor(case_sensitive=iscase_sensitive)
        if type_link is None:
            return
        curr_dict = {"USD": ["$","$", "dollars", "U.S. Dollar", "USD", "US$", "United States dollar"]}
        # Temp logic for hierarchy identification -- remove
        # asset_dict ={"Assets": ["assets","total assets"]}
        # curAss_Dict={"Current Assets": ["current assets","total current assets"]}
        # nonCurAss_Dict={"NonCurrent Assets": ["longterm assets","long-term assets","non current assets","non-current assets","total noncurrent assets","total non-current assets"]}
        # liab_Dict={"Liabilities": ["liabilities","liabilities and common shareholders equity","liabilities and shareholders equity","total liabilities","total liabilities and equity","total liabilities and shareholders equity","total liabilities and stockholders equity","total liabilities and common shareholders equity"]}
        # curLiab_Dict ={"Current Liabilities" : ["current liabilities","total current liabilities"]}
        # nonCurrLiab_Dict = {"NonCurrent Liabilities" : ["non current liabilities","non-current liabilities","longterm liabilities","long-term liabilities","total noncurrent liabilities","total non-current liabilities"]}
        # shrEq_Dict = {"Shareholders Equity" : ["shareholders equity","equity","stockholders equity","total equity","total stockholders equity","total common shareholders equity","total wells fargo stockholders equity"]}


        with open(type_link,encoding='utf-8',errors='ignore') as inf:
            lines = inf.readlines()

        for line in lines:
            if len(line.strip()) > 0:
                if isLower:
                    self.keyword_processor.add_keyword(line.strip().lower())
                else:
                    self.keyword_processor.add_keyword(line.strip())
        if currFlag:
            self.keyword_processor.add_keywords_from_dict(curr_dict)
Example No. 30
    def cleanDomain(self):
        L = []
        black = []
        keyword_processor = KeywordProcessor()

        f = open(self.whitePath, "r")
        for i in f.readlines():
            L.append(i.strip('\r\n'))

        keyword_processor.add_keywords_from_list(L)
        try:
            for i in self.param:  # domains to be filtered

                hostname, ip, ipBelong, firstVisitTime, lastestVisitTime, userSet, visitNum, similarityValue, imitate = \
                    i.strip('\r\n').split(";")

                hostname = hostname.strip('\r\n')

                try:
                    # if re.search("\."+j.strip('\r\n')+'$',hostname):
                    if len(keyword_processor.extract_keywords(hostname)) == 0:
                        black.append(i)
                except:
                    traceback.print_exc()

        except:
            traceback.print_exc()
            print("匹配错误")
        finally:
            f.close()
            return black
Example No. 31
#!/usr/bin/python3
# coding: utf-8

# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# keyword search
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)
# ['北京', '广东省']

# keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)
# '邂逅a卡与b卡哪个卡好'


# extract keywords with case sensitivity
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)
# ['b卡']
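
Not shown in the original walk-through: extract_keywords can also report character offsets when called with span_info=True, which is what the FlashtextBackend example above relies on.

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_with_spans = keyword_processor.extract_keywords('北京与广东都是中国的', span_info=True)
print(keywords_with_spans)
# e.g. [('北京', 0, 2), ('广东省', 3, 5)]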