import json

import requests
from flashtext import KeywordProcessor


def rm_main():
    ScienceEvent = {
        "scienza", "universo", "geologia", "biologia", "scientifico",
        "scientifica", "scienziato", "scienzata"
    }
    VisualArtEvent = {
        "pittura", "scultura", "artista", "artisti", "art", "opere", "opera",
        "part"
    }
    keyPArt = KeywordProcessor()
    keyPScience = KeywordProcessor()
    for element in ScienceEvent:
        keyPScience.add_keyword(element)
    for element in VisualArtEvent:
        keyPArt.add_keyword(element)

    url = 'https://raw.githubusercontent.com/andreamatt/KDI/master/dataset/muse.json'
    obj = json.loads(requests.get(url).text)
    eventsArray = obj["events"]
    # Drop empty event records.
    eventsArray = [event for event in eventsArray if event]
    for event in eventsArray:
        for k, v in event.items():
            event[k] = v.replace('\n', '; ')
        artList = keyPArt.extract_keywords(event["description"])
        scienceList = keyPScience.extract_keywords(event["description"])
        # Tag the event with whichever keyword set matched more often;
        # ties fall through to 'science', as in the original ordering.
        if len(artList) > len(scienceList):
            event.update({'Subcategory': 'visual art'})
        else:
            event.update({'Subcategory': 'science'})

    obj["events"] = eventsArray
    return json.dumps(obj)
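The classification step above is easy to exercise on its own. A minimal, hedged sketch on a single description string, using the same keyword-count comparison (the labels and sample text are illustrative):

from flashtext import KeywordProcessor

art = KeywordProcessor()
for kw in ("pittura", "scultura", "opera"):
    art.add_keyword(kw)
science = KeywordProcessor()
for kw in ("scienza", "biologia", "universo"):
    science.add_keyword(kw)

desc = "Una mostra di pittura e scultura, con un'opera dedicata all'universo."
# Three art hits vs. one science hit, so this event would be tagged 'visual art'.
label = "visual art" if len(art.extract_keywords(desc)) > len(science.extract_keywords(desc)) else "science"
print(label)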
Example No. 2
from typing import Iterable, List, Union

import pandas as pd
from flashtext import KeywordProcessor


def filter_keywords(data: pd.DataFrame,
                    keywords: Iterable[str],
                    source: Union[str, List[str]] = "text",
                    case_sensitive: bool = True) -> pd.DataFrame:
    """Filters out rows that do not have any keywords in any source column(s)

    Args:
        data: dataframe containing [source] column(s) of type str
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        The dataframe restricted to rows that contain at least one keyword
    """
    # Get keyword processor and add keywords. flashtext requires an actual
    # list, so materialize the iterable first.
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(list(keywords))
    # If single source column, only need to check one element in each row, otherwise, apply any(..)
    # to check all source columns iteratively through each row
    if isinstance(source, str):
        mask = data[source].apply(
            lambda sent: bool(proc.extract_keywords(sent)))
    else:
        mask = data[source].apply(lambda sents: any(
            bool(proc.extract_keywords(sent)) for sent in sents))
    output = data[mask]  # Use mask to filter out rows without any keywords
    return output
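A hedged usage sketch for filter_keywords; it assumes pandas is installed and relies on the list materialization noted above:

import pandas as pd

df = pd.DataFrame({"text": ["flashtext is fast", "nothing relevant here"]})
kept = filter_keywords(df, keywords=["flashtext"], source="text")
print(kept)  # only the first row passes the keyword mask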
Example No. 3
def get_eduYrs(s):
    keyword_processor = KeywordProcessor()
    education_yrs = 0.0
    keyword_processor.add_keyword('four-year')
    keyword_processor.add_keyword('four years')
    sentences = sent_tokenize(s)
    # Keep sentences that mention a degree or graduation.
    selected_sentences = [
        sent for sent in sentences
        if "degree" in word_tokenize(sent) or "Graduation" in word_tokenize(sent)
    ]

    for sentence in selected_sentences:
        if keyword_processor.extract_keywords(sentence):
            education_yrs = 4.0

    return education_yrs
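A hedged usage sketch; it assumes nltk's punkt tokenizer data is available and that the module imports sent_tokenize, word_tokenize, and KeywordProcessor as the function expects:

from flashtext import KeywordProcessor
from nltk.tokenize import sent_tokenize, word_tokenize

print(get_eduYrs("A four-year degree is required."))  # expected: 4.0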
Example No. 4
    def find(model, keyword, types):
        """
        Easily find a metabolite or reaction by a keyword.
        
            Parameters:
                model (obj) : model
                keyword (str) : keyword to search for
                types (str|tuple) : 'met' or 'reac'

            Returns:
                matches (list) : list of matches
        """
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keyword(keyword)

        matches = []

        if "met" in types:
            for met in model.smx.rnames:
                match = keyword_processor.extract_keywords(met)
                if match: matches.append(met)

        if "reac" in types:
            for reac in model.smx.cnames:
                match = keyword_processor.extract_keywords(reac)
                if match:
                    matches.append(reac)

        return matches
Example No. 5
import pytesseract
from flashtext import KeywordProcessor
from PIL import Image
from tkinter import filedialog


def UploadAction(event=None):
    filename = filedialog.askopenfilename()
    # Point pytesseract at the local Tesseract install (Windows path).
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

    img = Image.open(filename)
    text = pytesseract.image_to_string(img)
    print(text)

    kp = KeywordProcessor()
    kp.add_keyword('invoice', ('Monument', 'Taj Mahal'))
    kp.add_keyword('bill', ('Location', 'Delhi'))
    print(kp.extract_keywords(text))
Example No. 6
import time

from flashtext import KeywordProcessor


def find_needles():
    needles = []

    start = time.time()
    db = Database(None)  # Database comes from the surrounding project
    print("Connected to DB in", time.time() - start, "secs")

    start = time.time()
    db.execute("SELECT STR FROM `Final_consolidated`")
    print("Retrieved info from DB in", time.time() - start, "secs")

    start = time.time()
    for ele in db.cursor:
        needles.append(ele[0].lower())
    print("Listed needles in", time.time() - start, "secs")

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")

    start = time.time()
    with open("hello.txt", "r") as file:
        for haystack in file.readlines():
            print("Before hyphen removal:")
            # span_info is omitted so results are plain strings, matching the
            # string handling below.
            found = list(set(processor.extract_keywords(haystack.lower())))
            string = ""
            for ele in found:
                ele = ele.replace('"', '\\"')
                string += ',"' + ele + '"'
            print(string, "\n\n")

            haystack = haystack.replace("-", " ")
            print("After hyphen removal:")
            found = list(set(processor.extract_keywords(haystack.lower())))
            string = ""
            for ele in found:
                ele = ele.replace('"', '\\"')
                string += ',"' + ele + '"'
            print(string, "\n\n\n")
    # haystacks = []
    # db.execute("SELECT id, detailed_description FROM `sherlock_ct_new`.`trials`")
    # for row in db.cursor:
    #     if row[1]:
    #         haystack = row[1].lower()
    #         found = list(set(processor.extract_keywords(haystack.lower())))
    #         file.write(str(row[0]) + "," + json.dumps(found))
    print("Found needles within haystack in", time.time() - start, "secs")
Example No. 7
def extract_brands(
    processor: KeywordProcessor,
    text: str,
    data_source_name: str,
    automatic_processing: bool,
) -> List[Prediction]:
    predictions = []

    for (brand_tag, brand), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        predictions.append(
            Prediction(
                type=PredictionType.brand,
                value=brand,
                value_tag=brand_tag,
                automatic_processing=automatic_processing,
                predictor=data_source_name,
                data={
                    "text": match_str,
                    "notify": False
                },
            ))

    return predictions
Example No. 8
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information about the FlashText algorithm.
    """
    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()

        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text,
                                                           span_info=True)

        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
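The backend round-trips recognizer metadata through the keyword's clean name. The same trick works with plain flashtext; a hedged standalone sketch (the tag, score, and recognizer name are made up for illustration):

from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keywords_from_dict({"CITY:0.9:CityRecognizer": ["berlin", "paris"]})
text = "I flew from Berlin to Paris."
for key, start, end in kp.extract_keywords(text, span_info=True):
    tag, score, name = key.split(":")
    print(tag, float(score), text[start:end])  # CITY 0.9 Berlin / CITY 0.9 Paris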
Example No. 9
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    #synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
Example No. 10
def get_DL(s):
    dl = False
    dl_valid = False
    dl_State = ""
    arr = ['driver', 'license']
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('california')
    # Flag any mention of a driver's license in the input text.
    if re.search('|'.join(arr), s):
        dl = True
    if dl:
        sentences = sent_tokenize(s)
        selected_sentence = [
            sent for sent in sentences if "driver" in word_tokenize(sent)
        ]
        if selected_sentence:
            words = selected_sentence[0].split()
            # Look for "valid" (or words containing it) near the mention.
            selected_word = [word for word in words if "valid" in word]
            if selected_word:
                dl_valid = True
        for sentence in selected_sentence:
            keywords_found = keyword_processor.extract_keywords(sentence)
            if 'california' in keywords_found:
                dl_State = "CA"

    dl_valid = "R" if dl_valid else "P"
    return dl_valid, dl_State
Example No. 11
def get_expYrs(s):
    keyword_processor = KeywordProcessor()
    exp_yrs = 0.0
    # Map each phrase to the number of years it implies.
    years_by_phrase = {
        'one year': 1.0,
        'two years': 2.0,
        'three years': 3.0,
        'four years': 4.0,
        'four-year': 4.0,
        'six years': 6.0,
    }
    for phrase in years_by_phrase:
        keyword_processor.add_keyword(phrase)
    sentences = sent_tokenize(s)
    selected_sentences = [
        sent for sent in sentences if "experience" in word_tokenize(sent)
    ]

    for sentence in selected_sentences:
        for keyword in keyword_processor.extract_keywords(sentence):
            exp_yrs = years_by_phrase[keyword]

    return exp_yrs
Example No. 12
def get_college(s):
    college = ""
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('college or university')
    keyword_processor.add_keyword('college')
    keyword_processor.add_keyword('university')
    keyword_processor.add_keyword('high school')
    sentences = sent_tokenize(s)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if keywords_found:
            # The processor only knows the four phrases added above;
            # keep the first one found in the sentence.
            college = keywords_found[0]

    return college
Example No. 13
def extract_frequencies(sentence: Sentence, words: KeywordProcessor,
                        frequencies: MutableMapping) -> MutableMapping:
    """Extracts statistical information for each word in words and mwes and stores it in w_stats.

    Args:
        sentence: A Sentence object.
        words: KeywordProcessor containing the list of words to count (see _get_wlist()).
        frequencies: A dictionary with the frequencies to update. Keys are lemma, values are WordFrequency objects.

    Returns:
        Reference to the updated dictionary.
    """

    if not frequencies:
        frequencies['last_id'] = 0
    all_lemmas = ' '.join([word.lemma for word in sentence])
    matched_indexes = [
        match[1]
        for match in words.extract_keywords(all_lemmas, span_info=True)
    ]
    matched_lemmas = [w for w in sentence if w.lemma_i in matched_indexes]
    for word in matched_lemmas:
        lemma = word.lemma
        if lemma not in frequencies:
            frequencies['last_id'] += 1
            frequencies[lemma] = WordFrequencies(lemma, frequencies['last_id'])
        norm_token = word.token
        # Only the normalized form of proper nouns is left capitalized.
        if not word.token.istitle():
            norm_token = norm_token.lower()
        frequencies[lemma].freq += 1
        frequencies[lemma].norm[norm_token] += 1
        frequencies[lemma].cap[_cap_type(word.token)] += 1
        frequencies[lemma].pos_dep[word.pos + "_" + word.dep] += 1
    return frequencies
Example No. 14
class Entity(object):

    def __init__(self, nlp, keywords_list=None, keywords_dict=None,
                 keywords_file=None, label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list or [])
        self.keyword_processor.add_keywords_from_dict(keywords_dict or {})
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc)) for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
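A hedged usage sketch for the Entity component; it targets the spaCy v2-era API the class is written against (span.merge and add_pipe with a component instance) and assumes the small English model is installed:

import spacy

nlp = spacy.load('en_core_web_sm')
entity = Entity(nlp, keywords_list=['python', 'java'], label='TECH')
nlp.add_pipe(entity, last=True)

doc = nlp("I like Python and Java.")
print(doc._.entities)  # (text, index, description) tuples for flagged tokens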
Example No. 15
    def __call__(self, from_file=False, file_path=None, stdout=False):
        watchlist = self.open_watchlist()

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(watchlist)

        if from_file:
            egg_path = _ask_spy_file()
        elif file_path:
            egg_path = file_path
        else:
            raise ValueError("Expected egg, from_file or file_path argument.")

        gui = Scanner(egg_path)

        with open(egg_path) as egg:
            for line, reaction in enumerate(egg.readlines(), start=1):
                if reaction and not reaction.startswith('#'):
                    match = keyword_processor.extract_keywords(reaction)
                    if match:
                        matches = ', '.join(match)
                        reaction = reaction.strip()
                        gui.insert_data(line, matches, reaction)

                        if stdout:
                            log.info(f"Line {line}: {matches} \n{reaction}")
Example No. 16
def prediction(text):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('school')
    keyword_processor.add_keyword('bus')
    keyword_processor.add_keyword('!@#$%')
    keywords_found = keyword_processor.extract_keywords(text, span_info=True)
    return keywords_found
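With span_info=True, extract_keywords returns (keyword, start, end) tuples, so the function above reports character offsets alongside each hit. A hedged sketch of the call:

found = prediction("The school bus is yellow.")
print(found)  # e.g. [('school', 4, 10), ('bus', 11, 14)]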
Example No. 17
def find_keyword(keyword_dict, text_to_test):

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)

    return result
Example No. 18
    def cleanDomain(self):
        whitelist = []
        black = []
        keyword_processor = KeywordProcessor()

        with open(self.whitePath, "r") as f:
            for line in f.readlines():
                whitelist.append(line.strip('\r\n'))

        keyword_processor.add_keywords_from_list(whitelist)
        try:
            for i in self.param:  # domains waiting to be filtered

                hostname, ip, ipBelong, firstVisitTime, lastestVisitTime, userSet, visitNum, similarityValue, imitate = \
                    i.strip('\r\n').split(";")

                hostname = hostname.strip('\r\n')

                try:
                    # if re.search("\." + j.strip('\r\n') + '$', hostname):
                    if len(keyword_processor.extract_keywords(hostname)) == 0:
                        black.append(i)
                except Exception:
                    traceback.print_exc()

        except Exception:
            traceback.print_exc()
            print("match error")
        finally:
            return black
Example No. 19
def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]

    return keyword_sentences
Example No. 20
class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    '''
    This class allows you to make use of a topic model which has multi-token entries (i.e., terms in topics which
    have spaces in them.)
    It requires Flashtext to be installed.
    '''
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        from flashtext import KeywordProcessor
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model
        for keyphrase in reduce(lambda x, y: set(x) | set(y), topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)


    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(doc))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
Example No. 21
class FeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model.copy()
        if keyword_processor_args.get('case_sensitive', None) is False:
            for k, v in self._topic_model.items():
                self._topic_model[k] = [e.lower() for e in v]
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                self._topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(str(doc)))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
Example No. 22
def get_sentences_for_keyword(keywords, sentences):
    """For each keyword, find the sentence(s) that correspond to that keyword."""

    # flashtext is used as a fast alternative to regex-based keyword matching
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}

    # loop through all keywords
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    # loop through each sentence and keyword
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    return keyword_sentences
Example No. 23
def test_flashtext():
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword("Big Apple", "New York")
    keyword_processor.add_keyword("Bay Area")
    keywords_found = keyword_processor.extract_keywords(
        "I love big Apple and Bay Area.", span_info=True)
    print(keywords_found)
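For reference, matching is case-insensitive by default and 'Big Apple' is reported under its clean name, so the call above should print something close to the following (a hedged expectation, not captured output):

# [('New York', 7, 16), ('Bay Area', 21, 29)]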
Example No. 24
    def extract_summary(self):
        result = {}
        result['Development Languages'] = []

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(self.skills_list)

        for keyword in self.candidate_keywords:
            matched_keyword = keyword_processor.extract_keywords(
                self.stemmer.stem(keyword))

            if not matched_keyword:
                # print('"{}" is not matched in our skill database.'.format(keyword))
                continue

            for matched in matched_keyword:
                category, name = str(matched).split(": ", 1)

                if category not in result:
                    result[category] = {}

                if name not in result[category]:
                    result[category][name] = []

                result[category][name].append(keyword)

                if category == 'Tools & Technologies':
                    result['Development Languages'].append(keyword)

        return result
Example No. 25
def searchkeywords(key, values, words):
    keyword_processor = KeywordProcessor()
    for term in values:
        keyword_processor.add_keyword(term)
    # Extract once, after all keywords have been registered.
    keywords_found = keyword_processor.extract_keywords(str(words))

    # print("Set corresponding to " + key + str(set(keywords_found)))
    return list(set(keywords_found)), key
Example No. 26
class PhraseSubPopulation(SubPopulation):
    r"""
    Filter samples based on a group of phrases

    for example, with phrase_name = 'question'::

        sample 1: "Who is Jack.", score: 1
        sample 2: "I am Jack.", score: 0
    """
    def __init__(self, phrase_name='negation'):
        super().__init__()
        self.phrase_name = phrase_name
        if self.phrase_name == 'negation':
            self.phrases = NEGATION
        elif self.phrase_name == 'question':
            self.phrases = QUESTION
        else:
            raise ValueError(f"unsupported phrase_name: {phrase_name!r}")

        self.phrase_processor = KeywordProcessor(case_sensitive=True)
        self.phrase_processor.add_keywords_from_dict(
            {self.phrase_name: self.phrases})

    def __repr__(self):
        return "PhraseSubPopulation" + "-" + self.phrase_name

    def phrase_match(self, text):
        match = False
        # Search for phrases
        result = self.phrase_processor.extract_keywords(text)
        if result:
            match = True
        return match

    def _score(self, sample, fields, **kwargs):
        """
        1 or 0 indicates whether sample fields match phrase groups

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample

        """

        text = ' '.join([sample.get_text(field) for field in fields])
        match = self.phrase_match(text)

        return match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups

        """
        sub_samples = []
        for i, sample in enumerate(dataset):
            if scores[i]:
                sub_samples.append(sample)
        return sub_samples
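The phrase-group test reduces to a small flashtext membership check. A hedged standalone sketch, with a toy phrase list standing in for the package's QUESTION constant:

from flashtext import KeywordProcessor

qp = KeywordProcessor(case_sensitive=True)
qp.add_keywords_from_dict({"question": ["Who", "What", "Why"]})
print(bool(qp.extract_keywords("Who is Jack.")))  # True
print(bool(qp.extract_keywords("I am Jack.")))    # False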
Example No. 27
def flashmatch():
    lentities = entities['name'].tolist()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lentities)
    # Column-wise apply avoids chained-assignment writes on the DataFrame.
    keywords['entity'] = keywords['keywords'].apply(processor.extract_keywords)
    nermatch()
    return keywords
Example No. 28
def flash_text():
    # haystack = "Narendra Narendra Modi is the Prime Minister of India. He was a Chief Minister of Gujarat. Gujarat is a nice place"
    haystack = "sof/vel is a short form for sofosbuvir/velpatasvir"
    # needles = ["Narendra Modi", "Prime Minister", "Gujarat", "Modi", "Narendra"]
    needles = ["sof/vel", "sofosbuvir/velpatasvir"]
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    found = processor.extract_keywords(haystack)
    print(found)
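A related behavior worth noting: flashtext prefers the longest match starting at a position, so overlapping needles are not double-reported. A hedged sketch:

from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keywords_from_list(["sof", "sof/vel"])
print(kp.extract_keywords("sof/vel is a short form"))  # ['sof/vel'], not ['sof', ...]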
Example No. 29
def extract_image_flag_flashtext(processor: KeywordProcessor, text: str):
    # Only the first match is reported; the function returns None otherwise.
    for (_, key), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        return {
            "text": match_str,
            "type": "text",
            "label": key,
        }
Example No. 30
class _AnalyzeDocument(object):

    def __init__(self, document_id, ip_document, dictionary_terms):
        self.document_id = document_id
        self.ip_document = ip_document
        self.dictionary_terms = dictionary_terms
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
        self.result = []

    def analyze(self, doc_location):
        for ip_paragraph in self.ip_document.get_paragraphs():
            if ip_paragraph.get_paragraph_index() < doc_location.get_paragraph_index():
                continue

            text = ip_paragraph.get_text()
            match_results = self.keyword_processor.extract_keywords(text, span_info=True)
            if not match_results:
                continue

            for match, start, end in match_results:
                if doc_location.get_paragraph_index() == ip_paragraph.get_paragraph_index():
                    if start < doc_location.get_start_offset():
                        continue
                sbh_uri = self.dictionary_terms[match]
                analyze_result = AnalyzeResult(ip_paragraph.get_paragraph_index(),
                                               match,
                                               sbh_uri,
                                               start,
                                               end-1)
                self.result.append(analyze_result)

    def remove_first_occurrence(self, paragraph_index, matching_term, sbh_uri, start_offset, end_offset):
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            # If users want to manually enter an sbh_uri, allow them to remove the
            # current result as long as the term and its position in the document match.
            if (analyze_result.get_paragraph_index() == paragraph_index
                    and analyze_result.get_matching_term() == matching_term
                    and analyze_result.get_start_offset() == start_offset
                    and analyze_result.get_end_offset() == end_offset):
                self.result.pop(index)
                return True
        return False

    def remove_all(self, term):
        removed_item = []
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            if analyze_result.get_matching_term() == term:
                self.result.pop(index)
                removed_item.append(analyze_result)
        return removed_item

    def get_result(self):
        return self.result
Example No. 31
#!/usr/bin/python3
# coding: utf-8

# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# Keyword search
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)
# ['北京', '广东省']

# Keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)
# '邂逅a卡与b卡哪个卡好'

# Keyword extraction with case sensitivity enabled
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)
# ['b卡']