def test_readme_example(self):
    """As used in the project's README.

    If you have to change this test case, please update the README
    accordingly.
    """
    tree = KeywordTree(case_insensitive=True)
    for keyword in ('malaga', 'lacrosse', 'mallorca',
                    'mallorca bella', 'orca'):
        tree.add(keyword)
    tree.finalize()

    self.assertEqual(
        ('malaga', 24),
        tree.search('My favorite islands are malaga and sylt.'))
    self.assertEqual(
        ('lacrosse', 29),
        tree.search('idontlikewhitespaceswhereismalacrossequestionmark'))

    hits = tree.search_all('malheur on mallorca bellacrosse')
    self.assertIsNotNone(hits)
    for expected in (('mallorca', 11), ('orca', 15),
                     ('mallorca bella', 11), ('lacrosse', 23)):
        self.assertEqual(expected, next(hits))
    with self.assertRaises(StopIteration):
        next(hits)
def _search_for_key_words(self, kw_tree: KeywordTree, tokens: List[str],
                          resulted_tokens: List[List[int]],
                          unique_indices: Set[int]):
    """Search for terms in the text.

    :param kw_tree: prefix (Aho-Corasick) tree holding the terms
    :param tokens: list of tokens to search the terms in
    :param resulted_tokens: output list; each entry is the list of token
        indices covered by one found term
        (annotation fixed: lists of indices are appended, not ints)
    :param unique_indices: token indices already claimed by a found term
        (used to keep accepted terms from overlapping)
    """
    for matched_tokens, start_index in kw_tree.search_all(tokens):
        # Indices of all tokens covered by this match.
        indexes = [start_index + k for k in range(len(matched_tokens))]
        # Reject any match that overlaps an already accepted term.
        if any(i in unique_indices for i in indexes):
            continue
        resulted_tokens.append(indexes)
        unique_indices.update(indexes)
def create_keywordtree(lst, s):
    """Build a case-insensitive Aho-Corasick tree from *lst* and scan *s*.

    Returns an iterator of (keyword, position) tuples.
    """
    kwtree = KeywordTree(case_insensitive=True)
    for word in lst:
        kwtree.add(word)
    kwtree.finalize()
    return kwtree.search_all(s)
def test_search_all_issue_1_similar(self):
    """Regression test: a shorter keyword must still be found when longer
    keywords share its prefix/suffix."""
    haystack = '/foo/bar'
    tree = KeywordTree(case_insensitive=True)
    for needle in ('/bara', '/foo/barb', 'bar'):
        tree.add(needle)
    tree.finalize()
    hits = tree.search_all(haystack)
    self.assertEqual(('bar', 5), next(hits))
def search(patterns, content):
    """Return the keywords from *patterns* found in *content*, in match order."""
    kwtree = KeywordTree(case_insensitive=True)
    for pattern in patterns:
        kwtree.add(pattern)
    kwtree.finalize()
    return [match[0] for match in kwtree.search_all(content)]
def ahocorasick_all_match(text, keywords):
    """Count how many distinct *keywords* occur in *text* (case-insensitive).

    :param text: string to scan
    :param keywords: iterable of keywords to look for
    :return: number of unique keywords that matched at least once
    """
    kwtree_all = KeywordTree(case_insensitive=True)
    for key in keywords:
        kwtree_all.add(key)
    kwtree_all.finalize()
    # A set deduplicates in O(1) per match; the original used an O(n)
    # list-membership test per match. Only the count is returned, so
    # insertion order does not matter.
    matched = {result[0] for result in kwtree_all.search_all(text)}
    return len(matched)
def test_many_keywords(self):
    """Load a large keyword list and verify a single match in a text blob."""
    kwtree = KeywordTree(case_insensitive=True)
    with open('tests/data/names.txt') as keyword_file:
        for line in keyword_file:
            kwtree.add(line.strip())
    kwtree.finalize()

    with open('tests/data/textblob.txt') as keyword_file:
        textblob = keyword_file.read()

    self.assertEqual(('Dawn Higgins', 34153), kwtree.search(textblob))

    results = kwtree.search_all(textblob)
    self.assertIsNotNone(results)
    self.assertEqual(('Dawn Higgins', 34153), next(results))
    with self.assertRaises(StopIteration):
        next(results)
class OWMCitySlot:
    def __init__(
        self, path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Initialize a trie for finding city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list
            of cities, e.g. ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", ..., "Gerton"];
            the list was created from the source file:
            https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)
        self.kwtree = KeywordTree(case_insensitive=True)
        # Pad each name with spaces so only whole-word matches are found.
        for geo in self.geonames:
            self.kwtree.add(f" {geo} ")
        self.kwtree.finalize()

    def _load_from_json(self, path_to_geo_entities: str) -> List[str]:
        """Load a deduplicated list of city names from a JSON file.

        :param path_to_geo_entities: filepath to a JSON file
        :type path_to_geo_entities: str
        :return: a list containing city names
        :rtype: List[str]
        """
        with open(path_to_geo_entities, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        # set() removes duplicate city names.
        return list(set(json_data))

    def find_geo_names_in_utterance(self, utterance: str) -> str:
        """Search for the best-matching location name in an utterance.

        :param utterance: human utterance
        :type utterance: str
        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        # Replace all punctuation with spaces in one pass, then collapse
        # repeated whitespace.
        punct_to_space = str.maketrans(
            string.punctuation, " " * len(string.punctuation))
        utterance = utterance.translate(punct_to_space)
        utterance = re.sub(r"\s{2,}", " ", utterance.lower()).strip()
        results = list(self.kwtree.search_all(" %s " % utterance))
        # TODO the method could be improved if we search all geo names and
        # then filter the most precise geo entity. A user may write
        # "Massachusetts Boston" -> it has 2 entities, and Boston is preferred
        # because it is the more precise location.
        return self.get_best_match(results)

    def get_best_match(self, results: Iterable[Tuple[str, int]]) -> str:
        """Select, among the matches with the lowest index, the longest one.

        Usually the earliest entity is the most precise. For example for the
        utterance "west valley city utah" we receive
        [(' West ', 0), (' West Valley ', 0), (' Valley ', 5),
         (' West Valley City ', 0), (' Valley City ', 5), (' Utah ', 17)]
        and should select "West Valley City".

        :param results: pairs of (<location_name>, <index>)
        :type results: Iterable[Tuple[str, int]]
        :return: the best match or an empty string if results are empty
        :rtype: str
        """
        candidates = list(results)
        if not candidates:
            return ""
        # min() with this key is equivalent to sorting ascending by index,
        # descending by stripped length, and taking the first element.
        best = min(
            candidates,
            key=lambda entity: (entity[1], -len(entity[0].strip())))
        return best[0].strip()

    def __call__(self, *args, **kwargs) -> str:
        """Find the best match in the trie.

        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        return self.find_geo_names_in_utterance(*args, **kwargs)
# Add keywords to the trie for word in keywords: kwtree.add(word[0]) clean_keywords.append(word[0]) kwtree.finalize() # Run a search on every tweet and add to dataframe for i in range(len(tweets)): matches = [] tmp = [] # remove all special charactets for k in tweets[i].split("\n"): tweet = re.sub(r"[^a-zA-Z0-9]+", ' ', k).lower() results = kwtree.search_all(tweet) # match results for result in results: matches.append(result) # grab proper nouns from tweet to raise relevance score tagged_sent = pos_tag(tweets[i].split()) propernouns = [ re.sub('[^A-Za-z0-9]+', '', word).lower() for word, pos in tagged_sent if pos == 'NNP' ] propers = list(set(propernouns) & set(clean_keywords)) # relevance score : matches(+1) + propernoun mathches(+1.5) / 20 (max 8 + 12)
# Approximate motif search: split the motif U into chunks of size `erreur`
# and locate each chunk in the text V with Aho-Corasick.
U = "CTAGTTAG"
V = "bvbccvCTAnGTTAGvfqvsdqvqCTAGTTAcGvfdCTACGATAGvvfGTTGTTfdvCTAtggAGsfsdfdCTAdddddddddddAGvbcvbcvb"
print("\n", "Text : ", V)
print("Motif :", U)
erreur = input("erreur : ")
# Pi=textwrap.wrap(U, int(erreur))  # previous approach, kept for reference
Pi = [U[i:i + int(erreur)] for i in range(0, len(U), int(erreur))]
print(Pi)
# Aho-Corasick search over all motif chunks.
kwtree = KeywordTree(case_insensitive=True)
for i in range(0, len(Pi)):
    kwtree.add(Pi[i])
kwtree.finalize()
results = kwtree.search_all(V)
# Display every occurrence as {position: chunk}.
Vals = []
Keyz = []
for result in results:
    # print(result)
    Vals.append(result[0])
    Keyz.append(result[1])
dictionary = dict(zip(Keyz, Vals))
print(dictionary, "\n")


def AlignS(s, ran):
    # NOTE(review): this function body continues beyond the visible excerpt.
    l = []
    ff = ""
# NOTE(review): this fragment starts mid-script — `submission`, `kwtree`,
# and `time` come from code outside this excerpt.
print(submission.title)
indivList = []
for top_level_comment in submission.comments:
    brandCount = 0
    repeatSet = set()
    # If the post has been deleted, it won't store the author name.
    # Later, rows w/o author names get dropped.
    if (top_level_comment.author):
        indivList.append(top_level_comment.author.name)
    commentDate = time.strptime(time.ctime(top_level_comment.created_utc))
    indivList.append(commentDate.tm_year)
    indivList.append(commentDate.tm_mon)
    indivList.append(commentDate.tm_mday)
    indivList.append(top_level_comment.score)
    results = kwtree.search_all(top_level_comment.body)
    # For this analysis, an individual wearing more than one item of the
    # same brand is not repeated for that post.
    for result in results:
        if result[0] not in repeatSet:
            repeatSet.add(result[0])
            indivList.append(result[0])
            brandCount += 1
    # Pad to exactly 7 brand columns with '0' placeholders.
    while brandCount < 7:
        indivList.append('0')
        brandCount += 1
    indivList.append(top_level_comment.permalink)
    # NOTE(review): the body of this `if` continues beyond the excerpt.
    if (len(indivList) == 13):