class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens)
                if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
def __call__(self, from_file=False, file_path=None, stdout=False):
    watchlist = self.open_watchlist()
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(watchlist)

    if from_file:
        egg_path = _ask_spy_file()
    elif file_path:
        egg_path = file_path
    else:
        raise ValueError("Expected egg, from_file or file_path argument.")

    gui = Scanner(egg_path)
    for line, reaction in enumerate(open(egg_path).readlines(), start=1):
        if reaction and not reaction.startswith('#'):
            match = keyword_processor.extract_keywords(reaction)
            if match:
                matches = ', '.join(match)
                reaction = reaction.strip()
                gui.insert_data(line, matches, reaction)
                if stdout:
                    log.info(f"Line {line}: {matches}\n{reaction}")
def filter_keywords(data: pd.DataFrame,
                    keywords: Iterable[str],
                    source: Union[str, List[str]] = "text",
                    case_sensitive: bool = True) -> pd.DataFrame:
    """Filters out rows that do not have any keywords in any source column(s)

    Args:
        data: dataframe containing [source] column(s) of type str
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        Original dataframe with rows filtered out
    """
    # Get keyword processor and add keywords
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)

    # If single source column, only need to check one element in each row;
    # otherwise, apply any(..) to check all source columns iteratively through each row
    if isinstance(source, str):
        mask = data[source].apply(lambda sent: bool(proc.extract_keywords(sent)))
    else:
        mask = data[source].apply(
            lambda sents: any(bool(proc.extract_keywords(sent)) for sent in sents))

    output = data[mask]  # Use mask to filter out rows without any keywords
    return output
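# A minimal usage sketch for filter_keywords above; the DataFrame contents and keyword
# list are invented for illustration and are not taken from the original project.
import pandas as pd

df = pd.DataFrame({"text": ["I know java", "I like tea", "we do product management"]})
kept = filter_keywords(df, ["java", "product management"], source="text")
print(kept)  # only the rows mentioning at least one keyword remain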
def cleanDomain(self):
    L = []
    black = []
    keyword_processor = KeywordProcessor()
    f = open(self.whitePath, "r")
    for i in f.readlines():
        L.append(i.strip('\r\n'))
    keyword_processor.add_keywords_from_list(L)
    try:
        for i in self.param:  # domains to be filtered
            hostname, ip, ipBelong, firstVisitTime, lastestVisitTime, userSet, visitNum, similarityValue, imitate = \
                i.strip('\r\n').split(";")
            hostname = hostname.strip('\r\n')
            try:
                # if re.search("\." + j.strip('\r\n') + '$', hostname):
                if len(keyword_processor.extract_keywords(hostname)) == 0:
                    black.append(i)
            except:
                traceback.print_exc()
    except:
        traceback.print_exc()
        print("matching error")
    finally:
        f.close()
    return black
def keywords(all_needles, case_flag=False):
    all_needles = all_needles[['source_id', 'needle']].drop_duplicates()
    all_needles['needle'] = all_needles['needle'].str.strip()
    gen_needles = all_needles['needle'].to_list()
    keyword_processor = KeywordProcessor(case_sensitive=case_flag)
    keyword_processor.add_keywords_from_list(gen_needles)
    return keyword_processor
def flash_text():
    # haystack = "Narendra Narendra Modi is the Prime Minister of India. He was a Chief Minister of Gujarat. Gujarat is a nice place"
    haystack = "sof/vel is a short form for sofosbuvir/velpatasvir"
    # needles = ["Narendra Modi", "Prime Minister", "Gujarat", "Modi", "Narendra"]
    needles = ["sof/vel", "sofosbuvir/velpatasvir"]
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    found = processor.extract_keywords(haystack)
    print(found)
def flashmatch():
    lentities = entities['name'].tolist()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lentities)
    keywords['entity'] = ''
    for i in range(len(keywords)):
        # .at avoids pandas chained-assignment pitfalls when writing back per row
        keywords.at[i, 'entity'] = processor.extract_keywords(keywords['keywords'][i])
    nermatch()
    return keywords
def load_data():
    df = pd.read_csv("data/supplements.csv")
    title_processor = KeywordProcessor()
    e_title_processor = KeywordProcessor()
    title_processor.add_keywords_from_list(list(df["title"].values))
    e_title_processor.add_keywords_from_list(list(df["numeric_title"].values))
    return df, title_processor, e_title_processor
def load_names_processor():
    # :: Load vocabulary for is_name features ::
    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor
    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded french proper names...")
    return KEYWORD_PROCESSOR
class _AnalyzeDocument(object):

    def __init__(self, document_id, ip_document, dictionary_terms):
        self.document_id = document_id
        self.ip_document = ip_document
        self.dictionary_terms = dictionary_terms
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
        self.result = []

    def analyze(self, doc_location):
        for ip_paragraph in self.ip_document.get_paragraphs():
            if ip_paragraph.get_paragraph_index() < doc_location.get_paragraph_index():
                continue
            text = ip_paragraph.get_text()
            match_results = self.keyword_processor.extract_keywords(text, span_info=True)
            if not match_results:
                continue
            for match, start, end in match_results:
                if doc_location.get_paragraph_index() == ip_paragraph.get_paragraph_index():
                    if start < doc_location.get_start_offset():
                        continue
                sbh_uri = self.dictionary_terms[match]
                analyze_result = AnalyzeResult(ip_paragraph.get_paragraph_index(),
                                               match, sbh_uri, start, end - 1)
                self.result.append(analyze_result)

    def remove_first_occurrence(self, paragraph_index, matching_term, sbh_uri, start_offset, end_offset):
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            # if users want to manually enter in a sbh_uri then allow users to remove current result
            # as long as the term and position where the term occurs in the document matches.
            if (analyze_result.get_paragraph_index() == paragraph_index
                    and analyze_result.get_matching_term() == matching_term
                    and analyze_result.get_start_offset() == start_offset
                    and analyze_result.get_end_offset() == end_offset):
                self.result.pop(index)
                return True
        return False

    def remove_all(self, term):
        removed_item = []
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            if analyze_result.get_matching_term() == term:
                self.result.pop(index)
                removed_item.append(analyze_result)
        return removed_item

    def get_result(self):
        return self.result
def degree_list(self):
    degree_dict = {}
    with open(self.degree_file_path) as fp:
        lines = fp.read().splitlines()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lines)
    found = processor.extract_keywords(self.document, span_info=True)
    for count, value in enumerate(found):
        line_number = self.line_count(value[1])
        degree_dict[value[0]] = line_number
    return degree_dict
def degree_list(document, line_index_dict):
    degree_dict = {}
    with open("doc_qualification/highest_qualification.txt") as fp:
        lines = fp.read().splitlines()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lines)
    found = processor.extract_keywords(document, span_info=True)
    for count, value in enumerate(found):
        line_number = line_count(value[1], line_index_dict)
        degree_dict[value[0]] = line_number
    return degree_dict
def test_list_loading(self):
    keyword_processor = KeywordProcessor()
    keyword_list = ["java", "product management"]
    keyword_processor.add_keywords_from_list(keyword_list)
    sentence = 'I know java and product management'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed file format one test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed file format one test")
def tag(self, file, news):
    with open(file, 'rb') as fb:
        keywords = pickle.load(fb)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords)
    keywords_found = keyword_processor.extract_keywords(news)
    sets = set()
    for k in keywords_found:
        sets.add(k)
    return sets
def in_key_words_list(item, key_words_list):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(key_words_list)
    found = keyword_processor.extract_keywords(item)
    if found:
        return found, 'Yes'
    else:
        return "No"
def look_for_imp_messages(message_string, keyword_sets):
    """
    :param message_string: Line of text where we will search for keywords
    :param keyword_sets: list of list of keywords
    :return: keywords_extracted (list(str)): List of terms/keywords found in sentence that match our corpus
    """
    kp = KeywordProcessor()
    for keyword_set in keyword_sets:
        kp.add_keywords_from_list(keyword_set)
    return kp.extract_keywords(message_string)
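# A small hypothetical call to look_for_imp_messages above; the message and keyword
# sets are invented for illustration only.
alerts = look_for_imp_messages(
    "server down in eu-west, please escalate to on-call",
    [["server down", "outage"], ["escalate", "page"]])
print(alerts)  # ['server down', 'escalate']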
class ExactEntityLinking:

    def __init__(self, entities, case_sensitive=False):
        self.linker = KeywordProcessor(case_sensitive=case_sensitive)
        self.n_no_spans = 0
        self.n_overlapping_spans = 0
        self.n_multiple_ents = 0
        logger.info("Case sensitive entities: {}".format(case_sensitive))
        logger.info(
            "Building data structure with flashText for exact match entity linking (|E|={}) ..."
            .format(len(entities)))
        t = time.time()
        self.linker.add_keywords_from_list(list(set(entities)))
        t = (time.time() - t) // 60
        logger.info("Took %d mins" % t)

    def link(self, text):
        spans = sorted(
            [(start_span, end_span)
             for _, start_span, end_span in self.linker.extract_keywords(text, span_info=True)],
            key=lambda span: span[0])
        if not spans:
            self.n_no_spans += 1
            return

        # Remove overlapping matches, if any
        filtered_spans = list()
        for i in range(1, len(spans)):
            span_prev, span_next = spans[i - 1], spans[i]
            if span_prev[1] < span_next[0]:
                filtered_spans.append(spans[i])
            else:
                self.n_overlapping_spans += 1
        spans = filtered_spans[:]

        matches_texts = [text[s:e] for s, e in spans]

        # Check if any entity is present more than once, drop this sentence
        counts = collections.Counter(matches_texts)
        skip = False
        for _, count in counts.items():
            if count > 1:
                skip = True
                self.n_multiple_ents += 1
                break
        if skip:
            return

        text2span = {matches_texts[i]: spans[i] for i in range(len(spans))}
        return text2span
def addIsNameInformation(sentences, keyword_processor=None):
    """Adds information on whether a word is included or not in word dictionary"""
    if keyword_processor is None:
        from flashtext import KeywordProcessor
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(
            list(load_names(FR_NAMES_PATH).keys()))

    for sentenceIdx in range(len(sentences)):
        sentences[sentenceIdx]['is_name'] = []
        for tokenIdx in range(len(sentences[sentenceIdx]['tokens'])):
            token = sentences[sentenceIdx]['tokens'][tokenIdx]
            sentences[sentenceIdx]['is_name'].append(
                getIsName(keyword_processor, token))
def test_extract_keywords_case_sensitive(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Extract keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor(case_sensitive=True)
        for key in test_case['keyword_dict']:
            keyword_processor.add_keywords_from_list(test_case['keyword_dict'][key])
        keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'], span_info=True)
        for kwd in keywords_extracted:
            # returned keyword should match the span from the sentence
            self.assertEqual(
                kwd[0], test_case['sentence'][kwd[1]:kwd[2]],
                "keyword spans don't match the expected results for test case: {}".format(test_id))
def find_needles():
    needles = []
    start = time.time()
    db = Database(None)
    print("Connected to DB in", time.time() - start, "secs")

    start = time.time()
    db.execute("SELECT STR FROM `Final_consolidated`")
    print("Retrieved info from DB in", time.time() - start, "secs")

    start = time.time()
    for ele in db.cursor:
        needles.append(ele[0].lower())
    print("Listed needles in", time.time() - start, "secs")

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")

    start = time.time()
    file = open("hello.txt", "r")
    for haystack in file.readlines():
        print("Before hyphen removal:")
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n")

        haystack = haystack.replace("-", " ")
        print("After hyphen removal:")
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n\n")

    # haystacks = []
    # db.execute("SELECT id, detailed_description FROM `sherlock_ct_new`.`trials`")
    # for row in db.cursor:
    #     if row[1]:
    #         haystack = row[1].lower()
    #         found = list(set(processor.extract_keywords(haystack.lower())))
    #         file.write(str(row[0]) + "," + json.dumps(found))

    file.close()
    print("Found needles within haystack in", time.time() - start, "secs")
class Entity(object):
    name = 'entity'

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Extend doc.ents with the new entity – be careful not to replace it!
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens)
                if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
def keyword_bool_proc(keywords, case_sensitive):
    """Creates a process that returns a boolean value that corresponds to whether or not
    the passed in text contains any of the provided keywords (respective to case sensitivity)

    Args:
        keywords: Iterable of strings to search for in text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        Callable(str) -> bool that returns True if the string contains a keyword and False otherwise
    """
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)

    def bool_proc(text):
        return bool(proc.extract_keywords(text))

    return bool_proc
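# A minimal usage sketch for keyword_bool_proc above; the keyword list and test strings
# are invented for illustration.
has_fruit = keyword_bool_proc(["apple", "banana"], case_sensitive=False)
print(has_fruit("I ate an Apple today"))   # True (case-insensitive match)
print(has_fruit("I ate a pear today"))     # False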
def mp_extract_keywords(keywords, sentences, case_sensitive=False):
    corpus = dict()
    kp = KeywordProcessor(case_sensitive=case_sensitive)
    kp.add_keywords_from_list(keywords)
    for sentence in sentences:
        if isinstance(sentence, list):
            sentence = ' '.join(sentence)
        keywords_found = kp.extract_keywords(sentence)
        for keyword in keywords_found:
            corpus.setdefault(keyword, set())
            corpus[keyword].add(sentence)
    return corpus
def flash_match(text, frequencies: FreqDist, filename='all_linked_skills.txt') -> SkillSet:
    keyword_processor = KeywordProcessor()
    # with open('list.pkl', 'rb') as fs:
    #     my_list = pickle.load(fs)
    my_list = []
    with open(filename, 'r', encoding='utf-8') as fs:
        for line in fs.readlines():
            skill = line.strip("\n").lower()
            my_list.append(skill)
    keyword_processor.add_keywords_from_list(my_list)
    matches = keyword_processor.extract_keywords(text)
    skill_dictionary = {}
    for match in matches:
        if frequencies[match] > 0:
            skill_dictionary[match] = frequencies[match]
    return SkillSet(skill_dictionary)
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']
        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
def load_thesaurus(thesaurus_file):
    df = pd.read_csv(thesaurus_file)
    df.fillna('', inplace=True)
    thesaurus = KeywordProcessor()
    thesaurus.add_keywords_from_list(list(df['name'].values))

    def use(term):
        u = df[df.name == term]['USE']
        if len(u) == 0 or u.values[0] == '':
            return term
        else:
            return u.values[0]

    def transform(txt):
        terms = thesaurus.extract_keywords(txt)
        terms = [use(t) for t in terms]
        return terms

    thesaurus.transform = transform
    return thesaurus
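# A minimal sketch of how load_thesaurus above might be used; the two-row CSV is
# hypothetical and only assumes the 'name' and 'USE' columns read by the function.
import io

csv_text = "name,USE\nautomobile,car\ncar,\n"
thesaurus = load_thesaurus(io.StringIO(csv_text))
print(thesaurus.transform("I bought an automobile and a car"))  # ['car', 'car']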
class trie(Datasource):
    """
    Generates a list of strings that match any of a set of provided substrings

    :Parameters:
        substrings : `list` ( `str` )
            A list of substrings to find in the text
        text_datasource : :class:`revscoring.Datasource`
            A datasource that returns a `str` or a `list` of `str`
        name : `str`
            A name for the new datasource
    """

    def __init__(self, substrings, text_datasource=None, case_sensitive=False,
                 exclusions=None, name=None):
        self.word_processor = KeywordProcessor(case_sensitive=case_sensitive)
        if exclusions is not None:
            substrings = list(set(substrings).difference(set(exclusions)))
        self.word_processor.add_keywords_from_list(substrings)
        name = self._format_name(name, [substrings, text_datasource])
        super().__init__(name, self.process, depends_on=[text_datasource])

    def process(self, text_or_texts):
        if text_or_texts is None:
            return []
        elif isinstance(text_or_texts, str):
            text = text_or_texts
            return self.word_processor.extract_keywords(text)
        else:
            texts = text_or_texts
            return [
                substring
                for text in texts
                for substring in self.word_processor.extract_keywords(text)
            ]
def apply_keywords_tags(  # pylint: disable=R0913
        df_or_file: Union[str, pd.DataFrame],
        keywords: Iterable[str],
        source: Union[str, List[str]] = "text",
        case_sensitive: bool = True,
        span: bool = False,
        tag_suffix: str = TASK_SETTINGS["keywords"]["suffix"],
        file: Optional[str] = None,
) -> pd.DataFrame:
    """Inserts keyword column to given dataframe which is of type: List[str] of the
    keywords found in the [source] column

    Args:
        df_or_file: Pandas dataframe or path to csv file with data
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity
        span: If true, will also return the spans the keywords were found in the source
        tag_suffix: Name of new column will be formatted as [source][tag_suffix]
        file: If provided, the output will be saved to a given file path as csv

    Returns:
        Pandas dataframe of data with attached keywords column
    """
    data, source = _handle_data_and_source(df_or_file, source)

    # Get keyword processor and add keywords
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)

    # Extract keywords from all elements of each source column
    for col in source:
        data[f"{col}{tag_suffix}"] = data[col].progress_apply(
            lambda sent: proc.extract_keywords(sent, span_info=span))

    # If file is provided, save to file as well
    if file is not None:
        data.to_csv(file)

    return data
def find_needles():
    needles = [
        "epclusa", "hcv", "chronic hepatitis c", "sofosbuvir+velpatasvir"
    ]

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")

    start = time.time()
    file = open("hello.txt", "r")
    for haystack in file.readlines():
        found = list(
            set(processor.extract_keywords(haystack.lower(), span_info=True)))
        print(found)
        # string = ""
        # for ele in found:
        #     ele = ele.replace('"', '\\"')
        #     string += ',"' + ele + '"'
        # print(string, "\n\n\n")

    file.close()
    print("Found needles within haystack in", time.time() - start, "secs")
# input_vectorizer = CountVectorizer(input='content')
# input_vectorizer.fit(input_documents(['data/file1.txt', 'data/file2.txt']))

google_api_key = 'AIzaSyDzwz905JyFQmmpVlF6JkujslnjrId0J1M'
swj_key = 'AIzaSyCCGOwk_0HBW5uL91yno5jF-jODjcCB3Jg'
my_cse_id = "008517825388850444903:eyrvyy-n0i4"
# URL = "https://www.googleapis.com/customsearch/v1"
URL = 'https://www.googleapis.com/customsearch/v1/siterestrict?'

ENTITY_KEYWORD_IN_SUMMARY = [
    'district', 'District', 'province', 'Province', 'City', 'city',
    'village', 'Village', 'Country', 'country'
]
keyword_processor_summary = KeywordProcessor()
keyword_processor_summary.add_keywords_from_list(ENTITY_KEYWORD_IN_SUMMARY)

ENTITY_KEYWORD_IN_FREEBASE = [
    'location.', 'person.', 'organization.', ',government.', 'people'
]  # , 'people', 'asteroid'
keyword_processor_freebase = KeywordProcessor()
keyword_processor_freebase.add_keywords_from_list(ENTITY_KEYWORD_IN_FREEBASE)

langs = ['en']


def url2text(url):
    en_entity = None
    en_soup = None
    text = None
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)  # ['Big Apple', 'Bay Area']

# Add several keywords at once
keyword_processor = KeywordProcessor()
# For dict-style keywords, the key is the clean name returned on a match,
# but the key itself is not searched for
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}  # {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']

# Remove keywords
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# ['product management']
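# A short continuation sketch (not part of the original snippet): replacing keywords
# and requesting span information with flashtext; the example sentence is invented.
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')   # unclean name -> clean name
keyword_processor.add_keywords_from_list(['Bay Area'])

print(keyword_processor.replace_keywords('I love Big Apple and Bay Area.'))
# 'I love New York and Bay Area.'

print(keyword_processor.extract_keywords('I love Big Apple and Bay Area.', span_info=True))
# [('New York', 7, 16), ('Bay Area', 21, 29)]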