Example #1
class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity is None:
                # the match does not align with token boundaries, so skip it
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc)) for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
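
A minimal registration sketch for the component above, assuming spaCy v2.x (where pipeline components are plain callables and Span.merge() still exists); the keywords and label below are illustrative only:

import spacy

nlp = spacy.load('en_core_web_sm')
entity_component = Entity(nlp, keywords_list=['Big Apple', 'Bay Area'], label='GPE')
nlp.add_pipe(entity_component, last=True)
doc = nlp('I love the Big Apple.')
print(doc._.has_entities)   # True
print(doc._.entities)       # matched tokens with their positions and descriptions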
Example #2
    def extract_summary(self):
        result = {}
        result['Development Languages'] = []

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(self.skills_list)

        for keyword in self.candidate_keywords:
            matched_keyword = keyword_processor.extract_keywords(
                self.stemmer.stem(keyword))

            if not matched_keyword:
                # print('"{}" is not matched in our skill database.'.format(keyword))
                continue

            for match in matched_keyword:
                category, name = match.split(": ", 1)

                if category not in result:
                    result[category] = {}

                if name not in result[category]:
                    result[category][name] = []

                result[category][name].append(keyword)

                if category == 'Tools & Technologies':
                    result['Development Languages'].append(keyword)

        return result
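
The split(": ", 1) above implies that the clean names in self.skills_list encode both a category and a skill name. A hypothetical shape for that dictionary, shown only to illustrate the assumption:

skills_list = {
    "Tools & Technologies: Python": ["python", "python3"],
    "Development Languages: Java": ["java", "jvm"],
}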
Example #3
def find_keyword(keyword_dict, text_to_test):

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)

    return result
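
A quick usage sketch for the helper above, using FlashText's clean-name/synonym dictionary format:

keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
print(find_keyword(keyword_dict, "Looking for a product manager who knows java_2e"))
# ['product management', 'java']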
Example #4
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information about the FlashText algorithm.
    """
    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()

        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text,
                                                           span_info=True)

        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
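
A hypothetical recognizer wired into the backend above; it assumes only what register_recognizer already uses, namely that a FlashtextRecognizer subclass exposes TAG, SCORE and keywords:

class CityRecognizer(FlashtextRecognizer):
    TAG = "CITY"
    SCORE = 0.9
    keywords = ["Big Apple", "Bay Area"]

backend = FlashtextBackend()
backend.register_recognizer(CityRecognizer)
print(backend.run("I love the Big Apple."))
# one NamedEntity spanning "Big Apple", tagged CITY with score 0.9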
Example #5
    def test_remove_keywords_dictionary_len(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Remove the keywords in remove_keyword_dict
        Extract keywords and check if they match the expected result for the test case.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keyword_processor.remove_keywords_from_dict(
                test_case['remove_keyword_dict'])

            kp_len = len(keyword_processor)

            new_dictionary = defaultdict(list)
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    if not (key in test_case['remove_keyword_dict'] and value
                            in test_case['remove_keyword_dict'][key]):
                        new_dictionary[key].append(value)

            keyword_processor_two = KeywordProcessor()
            keyword_processor_two.add_keywords_from_dict(new_dictionary)
            kp_len_two = len(keyword_processor_two)
            self.assertEqual(
                kp_len, kp_len_two,
                "keyword processor length doesn't match for Text ID {}".format(
                    test_id))
Example #6
async def main(job_title: str, sh_links_we_already_have: list[str],
               skills: dict[str, list[str]]):
    # Import this function to collect vacancies for a given job title.
    fake_agent = get_user_agent()
    async with ClientSession(headers={
            "user-agent": fake_agent,
            "Connection": "close"
    }) as session:
        all_links = await scan_all_search_results(job_title, session)
        for _ in range(10):
            try:
                vacancies_without_skills = await fetch_all_vacancy_pages(
                    all_links, sh_links_we_already_have, session)
                keyword_processor = KeywordProcessor()
                keyword_processor.add_keywords_from_dict(skills)
                collected_jobs = (
                    process_vacancy_content(vacancy_without_skills,
                                            keyword_processor)
                    for vacancy_without_skills in vacancies_without_skills
                    if vacancy_without_skills is not None)
                await asyncio.sleep(60)
                return collected_jobs
            except OSError:
                logger.warning(f"🚨 OSError occurred for {job_title}.")
        # If we couldn't recover after the errors, return an empty list.
        await asyncio.sleep(60)
        return []
Example #7
class Tokenizer(object):
    def __init__(self):
        log.info("Tokenizer initialization")

        from .lookup_dic import _PhraseDictionary as __PD
        log.debug("Tokenizer: calls lookup_dic.read_phrases")
        self.lookup_dic = __PD()

        log.debug("Instanciate flashtext.KeyworkProcessor")
        self.__keyword_processor = KeywordProcessor()
        log.debug("Insert data into flashtext.KeyworkProcessor instance.")
        self.__keyword_processor.add_keywords_from_dict(
            self.lookup_dic.lookup_dic_CODE)
        log.info("Tokenizer initialization successful")

    def tokenize(self, text):
        log.debug(f"Tokenizer called on {text}")

        log.debug("Phase I: Replacing phrases.")
        text = self.__keyword_processor.replace_keywords(text)

        log.debug("Phase II: Split by space.")
        tokens_list = text.split()

        log.debug("Phase III: Replace back token id to its original form.")
        tokens_list = [
            self.lookup_dic.reverse_replace(token)
            if token in self.lookup_dic.lookup_dic_CODE else token
            for token in tokens_list
        ]

        return tokens_list

    def __call__(self, text):
        return self.tokenize(text)
Example #8
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    #synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
Example #9
    def test_remove_keywords_len(self):
        """For each test case, initialize a new KeywordProcessor.
        Add the keywords from the test case to the KeywordProcessor.
        Remove the keywords in remove_keyword_dict.
        Check that the processor's length decreases by the number of removed keywords.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            # check length
            kp_len = len(keyword_processor)
            kp_len_expected = sum([
                len(values)
                for key, values in test_case['keyword_dict'].items()
            ])
            self.assertEqual(
                kp_len, kp_len_expected,
                "keyword processor length doesn't match for Text ID {}".format(
                    test_id))
            keyword_processor.remove_keywords_from_dict(
                test_case['remove_keyword_dict'])
            # check length
            kp_len = len(keyword_processor)
            kp_len_decreased = sum([
                len(values)
                for key, values in test_case['remove_keyword_dict'].items()
            ])
            self.assertEqual(
                kp_len, kp_len_expected - kp_len_decreased,
                "keyword processor length doesn't match for Text ID {}".format(
                    test_id))

    def test_remove_keywords_dictionary_compare(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Remove the keywords in remove_keyword_dict
        Extract keywords and check if they match the expected result for the test case.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keyword_processor.remove_keywords_from_dict(
                test_case['remove_keyword_dict'])
            keyword_trie_dict = keyword_processor.keyword_trie_dict

            new_dictionary = defaultdict(list)
            for key, values in test_case['keyword_dict'].items():
                for value in values:
                    if not (key in test_case['remove_keyword_dict'] and value
                            in test_case['remove_keyword_dict'][key]):
                        new_dictionary[key].append(value)

            keyword_processor_two = KeywordProcessor()
            keyword_processor_two.add_keywords_from_dict(new_dictionary)
            keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
            self.assertTrue(
                keyword_trie_dict == keyword_trie_dict_two,
                "keywords_extracted don't match the expected results for test case: {}"
                .format(test_id))
Example #11
    def test_add_keyword_file_missing(self):
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": "java_2e",
            "product management": "product manager"
        }
        with pytest.raises(AttributeError):
            keyword_processor.add_keywords_from_dict(keyword_dict)
Example #12
class PhraseSubPopulation(SubPopulation):
    r"""
    Filter samples based on a group of phrases

    for example, with phrase_name = 'question'::

        sample 1: "Who is Jack.", score: 1
        sample 2: "I am Jack.", score: 0
    """
    def __init__(self, phrase_name='negation'):
        super().__init__()
        self.phrase_name = phrase_name
        if self.phrase_name == 'negation':
            self.phrases = NEGATION
        elif self.phrase_name == 'question':
            self.phrases = QUESTION

        self.phrase_processor = KeywordProcessor(case_sensitive=True)
        self.phrase_processor.add_keywords_from_dict(
            {self.phrase_name: self.phrases})

    def __repr__(self):
        return "PhraseSubPopulation" + "-" + self.phrase_name

    def phrase_match(self, text):
        match = False
        # Search for phrases
        result = self.phrase_processor.extract_keywords(text)
        if result:
            match = True
        return match

    def _score(self, sample, fields, **kwargs):
        """
        1 or 0 indicates whether sample fields match phrase groups

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample

        """

        text = ' '.join([sample.get_text(field) for field in fields])
        match = self.phrase_match(text)

        return match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups

        """
        sub_samples = []
        for i, sample in enumerate(dataset):
            if scores[i]:
                sub_samples.append(sample)
        return sub_samples
Example #13
def init():
    global __keyword_processor
    global __reverse_lookup_dict
    global lookup_dic_CODE
    global pattern

    lookup_dic.read_phrases()
    __keyword_processor = KeywordProcessor()
    __keyword_processor.add_keywords_from_dict(lookup_dic.lookup_dic_CODE)
Example #14
def replace(text):
    kwp = KeywordProcessor()
    kwp.non_word_boundaries = set()
    kwp.add_keywords_from_dict(
        {" {} ".format(v): [k]
         for k, v in UNICODE_EMOJI.items()})

    clean_text = kwp.replace_keywords(text).strip()

    return clean_text
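
A hedged usage note: clearing non_word_boundaries lets emoji match even when attached to surrounding text, and the replacement wraps each alias in spaces. Assuming UNICODE_EMOJI maps emoji characters to their :alias: names (as in older releases of the emoji package), the effect is roughly:

# replace("Deploy went well🎉")  ->  "Deploy went well :party_popper:"
# (the exact alias depends on the emoji package version)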
Example #15
def prepare_keyword_processor(synonyms: Dict[str, list]) -> KeywordProcessor:
    """
    28x faster than a compiled regexp for 1,000 keywords
    https://github.com/vi3k6i5/flashtext

    :param synonyms: dict of entity synonyms
    :return:
    """
    kp = KeywordProcessor(case_sensitive=True)
    kp.add_keywords_from_dict(synonyms)
    return kp
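
A usage sketch for prepare_keyword_processor; because the processor is built with case_sensitive=True, synonyms must appear exactly as written:

kp = prepare_keyword_processor({"New York": ["NYC", "Big Apple"]})
print(kp.extract_keywords("Flights from NYC to Boston"))   # ['New York']
print(kp.extract_keywords("flights from nyc to boston"))   # [] because matching is case sensitive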
Example #16
def keyword_processor_generation():
    keyword_processor = KeywordProcessor()
    with open('merge_syno.txt', 'r', encoding='utf-8') as f:
        data = f.readlines()
    word = []
    sys_word = []
    for line in data:
        # first field is the head word; the rest of the line holds its space-separated synonyms
        word_item, sys_word_item = line.strip('\n').split(' ', 1)
        word.append(word_item)
        sys_word.append(sys_word_item.split())
    keyword_dict = dict(zip(word, sys_word))
    keyword_processor.add_keywords_from_dict(keyword_dict)
    return keyword_processor
Example #17
def count_corpus_mentions_cooccurs(mentions: Dict[str, List[str]]) -> Any:
    mention2idx = get_mention2idx_dict(mentions)

    def count_cooccur(
        bag: Iterable[int],
        cooccur: Dict[int, Dict[int,
                                int]] = defaultdict(lambda: defaultdict(int))
    ) -> Dict[int, Dict[int, int]]:
        for a in bag:
            for b in bag:
                cooccur[a][b] += 1
        return cooccur

    nlp = spacy.load('en_core_web_sm')
    processor = KeywordProcessor()
    processor.add_keywords_from_dict(mentions)

    sent_cooccur, atk_cooccur = defaultdict(
        lambda: defaultdict(int)), defaultdict(lambda: defaultdict(int))
    atk_bags = []
    for i, hit in enumerate(es.scan_scraper_page("*cnbc*")):
        if i % 100 == 0:
            print(i)
        if i > 100:
            break
        sents = []
        try:
            sents.append(hit.article_title)
        except Exception:
            pass
        try:
            sents += [sent.text for sent in nlp(hit.article_text).sents]
        except Exception:
            pass

        atk_bag = set()
        for sent in sents:
            kws = processor.extract_keywords(sent, span_info=False)
            sent_bag = [mention2idx[kw] for kw in kws]
            sent_cooccur = count_cooccur(sent_bag, sent_cooccur)
            atk_bag |= set(sent_bag)
        atk_cooccur = count_cooccur(atk_bag, atk_cooccur)
        atk_bags.append(atk_bag)

    return (
        # number of times two entities co-occur within one sentence: {a: {a: 4, b: 2, c: 1}, b: {...}}
        dict(sent_cooccur),
        # number of times two entities co-occur within one article: {a: {a: 4, b: 2, c: 1}, b: {...}}
        dict(atk_cooccur),
        # the entities appearing in each article, collected into a bag: [{a, b, c}, {a, c, f}, ...]
        set(map(frozenset, atk_bags)),
    )
Example #18
def main(job_title: str, indeed_links_we_already_have: list[str],
         skills: dict[str, list[str]]):
    # Main flow of parsing the vacancies and counting the relevant skills.
    all_links = scan_all_search_results(job_title)
    vacancies_without_skills = fetch_all_vacancy_pages(
        all_links, indeed_links_we_already_have)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(skills)
    collected_jobs = (process_vacancy_content(vacancy_without_skills,
                                              keyword_processor)
                      for vacancy_without_skills in vacancies_without_skills
                      if vacancy_without_skills is not None)
    return collected_jobs
Example #19
    def test_remove_keywords(self):
        """For each test case, initialize a new KeywordProcessor.
        Add the keywords from the test case to the KeywordProcessor.
        Remove the keywords in remove_keyword_dict.
        Extract keywords and check if they match the expected result for the test case.
        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keyword_processor.remove_keywords_from_dict(test_case['remove_keyword_dict'])
            keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'])
            self.assertEqual(keywords_extracted, test_case['keywords'],
                             "keywords_extracted don't match the expected results for test case: {}".format(test_id))
Example #20
    def test_dictionary_loading(self):
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": ["java_2e", "java programing"],
            "product management": ["product management techniques", "product management"]
        }
        keyword_processor.add_keywords_from_dict(keyword_dict)

        sentence = 'I know java_2e and product management techniques'
        keywords_extracted = keyword_processor.extract_keywords(sentence)
        self.assertEqual(keywords_extracted, ['java', 'product management'],
                         "Failed file format one test")
        sentence_new = keyword_processor.replace_keywords(sentence)
        self.assertEqual(sentence_new, "I know java and product management",
                         "Failed file format one test")
Example #21
async def extract_keywords(article: ArticleContent):
    try:
        keyword_processor = KeywordProcessor()

        keyword_dict = {}
        for keyword in article.keywords:
            keyword_dict[keyword.name] = keyword.keywords

        keyword_processor.add_keywords_from_dict(keyword_dict)

        found_keywords = keyword_processor.extract_keywords(article.text)

        return found_keywords
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
Example #22
class KeyProc:

    with open('./data/cleaned_name.pickle', 'rb') as handle:
        dict_cleaned_name = pickle.load(handle)

    def __init__(self, keyword_dict=dict_cleaned_name):
        #self.text = text
        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_dict(keyword_dict)

    def extractKeywords(self, text):
        return self.kp.extract_keywords(text)

    def replaceKeywords(self, text):
        return self.kp.replace_keywords(text)
Example #23
class Entity(object):

    name = 'entity'

    def __init__(self,
                 nlp,
                 keywords_list=[],
                 keywords_dict={},
                 keywords_file=None,
                 label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc',
                        'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text,
                                                          span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Extend doc.ents with the new entity – be careful not to replace it!
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
Example #24
def find_skills_in_resume(text_from_resume: str) -> set[str]:
    # Extract from the resume the set of unique skills to be matched against the vacancies.
    skills_from_db = cache.get("skills_from_db")
    # Cache skills for 12 hours – an extra check on top of the custom cron command that warms up the cache.
    if skills_from_db is None:
        skills_from_db = list(Skill.objects.all())
        cache.set("skills_from_db", skills_from_db, 12 * 60 * 60)
    skills = {
        skill.clean_name: ast.literal_eval(skill.unclean_names)
        for skill in skills_from_db
    }
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(skills)
    skills_from_resume = set(
        keyword_processor.extract_keywords(text_from_resume))
    return skills_from_resume
Example #25
class Keywords_Classifier:
    def __init__(self, dic_dir):
        self.dic_dir = Path(dic_dir)
        self.keywords_dict = {}
        self.processor = None
        self.label_dict = defaultdict(set)
        self.label_dict['other'] = set()
        type_of_dic = [i.stem for i in self.dic_dir.glob('*.dic')]
        for label in type_of_dic:
            self.label_dict[label] = set()

    def build_keyword_dict(self):
        '''Read the .dic text files, each containing a dictionary
        with one word or phrase per line.'''

        dic_files = self.dic_dir.glob('**/*.dic')
        pbar = tqdm(dic_files)
        for dic_file in pbar:
            with dic_file.open() as f:
                dic_name = dic_file.stem
                dic_value = [word.strip() for word in f.readlines()]
                self.keywords_dict[dic_name] = dic_value
            pbar.set_description(f"Loaded the {dic_name} dictionary")
        self.processor = KeywordProcessor(case_sensitive=False)
        self.processor.add_keywords_from_dict(self.keywords_dict)

    def query(self, phrase):
        '''Query keyword to get related category'''

        cat = self.processor.extract_keywords(phrase)
        return cat

    def classifier(self, keywords):
        '''Take a list of keywords and return a dict view mapping each
        label to the set of keywords that fall under it.'''

        for keyword in keywords:
            result = self.query(keyword)
            if result:
                for each_label in result:
                    self.label_dict[each_label].add(keyword)
            else:
                self.label_dict['other'].add(keyword)

        return self.label_dict.items()
Example #26
    def process(self, message, **kwargs):
        """Retrieve the text message, pass it to the classifier
            and append the prediction results to the message class."""

        # Extract the app entity
        app_keyword_processor = KeywordProcessor()
        query = InFo.select(InFo.content).where(InFo.name == "app_kw_dict")
        app_dict = ast.literal_eval(query[0].content)
        app_keyword_processor.add_keywords_from_dict(app_dict)
        # print(f'query is: {message.text}')

        try:
            app_name = app_keyword_processor.extract_keywords(message.text)[0]
            # print(f'app_name slot: {app_name}')
            entity = self.save_to_app(app_name)
            message.set("entities", [entity], add_to_output=True)
        except Exception as ex:
            print(ex)
Example #27
def custom_entity_extraction(uuid, date, scenario_id, text, dic_ref):
    """
    Goes through the news for the
    Custom Entities that we defined
    """

    columns = [
        "story_uuid", "text", "label", "salience", "published_date",
        "scenario_id", "wiki", "mentions"
    ]

    processor = KeywordProcessor()
    processor.add_keywords_from_dict(dic_ref)
    found = processor.extract_keywords(
        text,
        span_info=True,
    )
    dic = defaultdict(list)

    # dic is a defaultdict(list), so matched text can be appended directly per label
    for label, start, end in found:
        dic[label].append(text[start:end])

    final = pd.DataFrame()
    for key in dic.keys():
        df = pd.DataFrame({'label': key, 'text': dic[key], 'mentions': 1})
        final = pd.concat([final, df])

    logging.info("{} Custom Entities found.".format(final.shape[0]))
    if final.shape[0] == 0:
        return []

    final = final.groupby(['label', 'text'],
                          as_index=False)['mentions'].agg('sum')
    final['story_uuid'] = uuid
    final['published_date'] = date
    final['scenario_id'] = scenario_id
    final['salience'] = 0
    final['wiki'] = "custom"

    return final[columns].values.tolist()
Example #28
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json') # this line is to check whether this new algorithm differ from the previous algo
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    leaders = json.load(open(f'keywords/target/leader.json'))
    parties = json.load(open(f'keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + combined[key]["alias_en"]

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = " + str(len(all_posts) - len(purified)), 1)

    save_posts(purified, f'analysis/_2_remove_unrelated_data/english.json')
Example #29
class gazetteer():
    def __init__(self, type_link, isLower, iscase_sensitive, currFlag):
        self.keyword_processor = KeywordProcessor(case_sensitive=iscase_sensitive)
        if type_link is None:
            return
        curr_dict = {"USD": ["$","$", "dollars", "U.S. Dollar", "USD", "US$", "United States dollar"]}
        # Temp logic for hierarchy identification -- remove
        # asset_dict ={"Assets": ["assets","total assets"]}
        # curAss_Dict={"Current Assets": ["current assets","total current assets"]}
        # nonCurAss_Dict={"NonCurrent Assets": ["longterm assets","long-term assets","non current assets","non-current assets","total noncurrent assets","total non-current assets"]}
        # liab_Dict={"Liabilities": ["liabilities","liabilities and common shareholders equity","liabilities and shareholders equity","total liabilities","total liabilities and equity","total liabilities and shareholders equity","total liabilities and stockholders equity","total liabilities and common shareholders equity"]}
        # curLiab_Dict ={"Current Liabilities" : ["current liabilities","total current liabilities"]}
        # nonCurrLiab_Dict = {"NonCurrent Liabilities" : ["non current liabilities","non-current liabilities","longterm liabilities","long-term liabilities","total noncurrent liabilities","total non-current liabilities"]}
        # shrEq_Dict = {"Shareholders Equity" : ["shareholders equity","equity","stockholders equity","total equity","total stockholders equity","total common shareholders equity","total wells fargo stockholders equity"]}


        with open(type_link,encoding='utf-8',errors='ignore') as inf:
            lines = inf.readlines()

        for line in lines:
            if len(line.strip()) > 0:
                if isLower:
                    self.keyword_processor.add_keyword(line.strip().lower())
                else:
                    self.keyword_processor.add_keyword(line.strip())
        if currFlag:
            self.keyword_processor.add_keywords_from_dict(curr_dict)
    #     temp logic for hierarchy identification -- remove
    #     self.keyword_processor.add_keywords_from_dict(asset_dict)
    #     self.keyword_processor.add_keywords_from_dict(curAss_Dict)
    #     self.keyword_processor.add_keywords_from_dict(nonCurAss_Dict)
    #     self.keyword_processor.add_keywords_from_dict(liab_Dict)
    #     self.keyword_processor.add_keywords_from_dict(curLiab_Dict)
    #     self.keyword_processor.add_keywords_from_dict(nonCurrLiab_Dict)
    #     self.keyword_processor.add_keywords_from_dict(shrEq_Dict)

    def findPhrases(self, text):
        list_phrases = list()
        flash_text = self.keyword_processor.extract_keywords(text, span_info=True)
        for flash in flash_text:
            list_phrases.append(flash[0])
        return set(list_phrases)
Example #30
def build_flashtext_trie(file, limit=numpy.Inf):
    id2phrases = {}

    def process_line(line):
        line = line.replace('\n', '')
        if '\t' in line:
            id, phrase = line.split('\t')
            if id in id2phrases:
                id2phrases[id].append(phrase)
            else:
                id2phrases[id] = [phrase]
        else:
            phrase = line
            id2phrases[len(id2phrases) + 1] = [phrase]

    for line in data_io.read_lines(file, limit=limit):
        process_line(line)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(id2phrases)
    # print(keyword_processor.extract_keywords('gesellschaftervertrag'))
    return keyword_processor
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)
# ['Big Apple', 'Bay Area']

# Add multiple keywords at once
keyword_processor = KeywordProcessor()
# Keywords given as a dict: each key is the clean name returned for its matches; the key itself is not part of what is searched for
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']

# Remove keywords
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
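
A plausible continuation of the snippet above (not part of the original): after remove_keyword('java_2e') only the remaining synonyms still resolve, and remove_keywords_from_dict removes synonyms in bulk.

print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management']  ('java_2e' no longer maps to 'java')
keyword_processor.remove_keywords_from_dict({"product management": ["product manager"]})
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output []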