Example #1
    def test_readme_example(self):
        '''
        As used in the projects README. If you have to change this test case,
        please update the README accordingly.
        '''
        kwtree = KeywordTree(case_insensitive=True)
        kwtree.add('malaga')
        kwtree.add('lacrosse')
        kwtree.add('mallorca')
        kwtree.add('mallorca bella')
        kwtree.add('orca')
        kwtree.finalize()

        result = kwtree.search('My favorite islands are malaga and sylt.')
        self.assertEqual(('malaga', 24), result)

        result = kwtree.search(
            'idontlikewhitespaceswhereismalacrossequestionmark')
        self.assertEqual(('lacrosse', 29), result)

        results = kwtree.search_all('malheur on mallorca bellacrosse')
        self.assertIsNotNone(results)
        self.assertEqual(('mallorca', 11), next(results))
        self.assertEqual(('orca', 15), next(results))
        self.assertEqual(('mallorca bella', 11), next(results))
        self.assertEqual(('lacrosse', 23), next(results))
        with self.assertRaises(StopIteration):
            next(results)
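All of the examples on this page use the same small API (a KeywordTree with add/finalize/search/search_all); a minimal standalone sketch, assuming the class comes from the ahocorapy package as this README test suggests:

from ahocorapy.keywordtree import KeywordTree  # assumed import path

kwtree = KeywordTree(case_insensitive=True)
for keyword in ('malaga', 'lacrosse', 'orca'):
    kwtree.add(keyword)
kwtree.finalize()  # the tree must be finalized before it can be searched

# search() returns the first match, search_all() a generator over all matches
print(kwtree.search('My favorite islands are malaga and sylt.'))  # ('malaga', 24)
for keyword, position in kwtree.search_all('mallorca bellacrosse'):
    print(keyword, position)  # orca 4, then lacrosse 12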
Example #2
    def _search_for_key_words(self, kw_tree: KeywordTree, tokens: List[str],
                              resulted_tokens: List[List[int]],
                              unique_indices: Set[int]):
        """Search for terms in the text.

        :param kw_tree: the keyword (prefix) tree
        :param tokens: the list of tokens to search for terms in
        :param resulted_tokens: accumulates the index groups of tokens that form terms
        :param unique_indices: unique indices of tokens already claimed by a term
            (needed to avoid overlapping terms)
        """
        for result in kw_tree.search_all(tokens):
            matched_tokens, start_index = result
            # Indices of every token covered by this match.
            indexes = [start_index + k for k in range(len(matched_tokens))]
            # Keep the match only if none of its tokens already belongs to
            # an accepted term.
            if not any(i in unique_indices for i in indexes):
                resulted_tokens.append(indexes)
            # Mark all covered tokens as claimed either way.
            unique_indices.update(indexes)
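A hypothetical rehearsal of the overlap filter above, with the search_all() results mocked so the sketch stays self-contained (data invented for illustration):

# mocked search_all() output: (matched token sequence, start index) pairs
matches = [(('new', 'york'), 2), (('york',), 3)]
resulted_tokens, unique_indices = [], set()
for matched, start in matches:
    indexes = list(range(start, start + len(matched)))
    # a match is kept only if none of its tokens is already claimed
    if not any(i in unique_indices for i in indexes):
        resulted_tokens.append(indexes)
    unique_indices.update(indexes)
print(resulted_tokens)  # [[2, 3]] -- the overlapping ('york',) match is dropped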
Example #3
def create_keywordtree(lst, s):
    kwtree = KeywordTree(case_insensitive=True)
    for w in lst:
        kwtree.add(w)
    kwtree.finalize()
    # return a generator of (keyword, position) tuples
    res = kwtree.search_all(s)
    return res
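Note that search_all() yields matches lazily, so a caller would typically materialize the generator; a hypothetical use:

res = create_keywordtree(['cat', 'dog'], 'The catalog mentions dogs.')
print(list(res))  # expected: [('cat', 4), ('dog', 21)]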
Example #4
    def test_search_all_issue_1_similar(self):
        text = '/foo/bar'
        words = ['/bara', '/foo/barb', 'bar']
        tree = KeywordTree(case_insensitive=True)
        for word in words:
            tree.add(word)
        tree.finalize()

        results = tree.search_all(text)

        self.assertEqual(('bar', 5), next(results))
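The regression being tested: while reading '/foo/bar' the automaton walks down the '/foo/barb' branch, and 'bar' must still be reported through the fail/suffix links even though the longer keyword never completes. The same behavior in a smaller, hypothetical form:

tree = KeywordTree()
for word in ('abcd', 'bc'):
    tree.add(word)
tree.finalize()
# 'abc' follows the 'abcd' branch; 'bc' is still found via suffix links
print(list(tree.search_all('abc')))  # expected: [('bc', 1)]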
Example #5
def search(patterns, content):
    kwtree = KeywordTree(case_insensitive=True)
    for p in patterns:
        kwtree.add(p)

    kwtree.finalize()
    results = kwtree.search_all(content)
    result_list = []
    for result in results:
        result_list.append(result[0])
    return result_list
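A hypothetical call; note that every occurrence contributes one entry, so duplicates are preserved:

print(search(['an', 'banana'], 'banana'))  # expected: ['an', 'an', 'banana']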
Example #6
def ahocorasick_all_match(text, keywords):
    kwtree_all = KeywordTree(case_insensitive=True)
    for key in keywords:
        kwtree_all.add(key)
    kwtree_all.finalize()

    all_match = []
    results = kwtree_all.search_all(text)
    for result in results:
        # collect each distinct matched keyword once
        if result[0] not in all_match:
            all_match.append(result[0])

    return len(all_match)
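Unlike search() above, this helper counts each distinct keyword once, however often it occurs; a hypothetical call:

print(ahocorasick_all_match('banana band', ['an', 'banana', 'xyz']))  # expected: 2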
Example #7
    def test_many_keywords(self):
        kwtree = KeywordTree(case_insensitive=True)
        with open('tests/data/names.txt') as keyword_file:
            keyword_list = list(map(str.strip, keyword_file.readlines()))

        for kw in keyword_list:
            kwtree.add(kw)

        kwtree.finalize()
        with open('tests/data/textblob.txt') as text_file:
            textblob = text_file.read()

        result = kwtree.search(textblob)
        self.assertEqual(('Dawn Higgins', 34153), result)

        results = kwtree.search_all(textblob)
        self.assertIsNotNone(results)
        self.assertEqual(('Dawn Higgins', 34153), next(results))
        with self.assertRaises(StopIteration):
            next(results)
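The test depends on two local fixture files; a self-contained variant with inline stand-in data (names invented, not the real fixtures) exercises the same calls:

kwtree = KeywordTree(case_insensitive=True)
for name in ('Dawn Higgins', 'John Doe'):  # stand-in for tests/data/names.txt
    kwtree.add(name)
kwtree.finalize()

textblob = 'lorem ipsum ' * 1000 + 'Dawn Higgins was here'  # stand-in blob
print(kwtree.search(textblob))  # expected: ('Dawn Higgins', 12000)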
Example #8
class OWMCitySlot:
    def __init__(
        self,
        path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Initialize a trie for finding city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list of cities
            file format: ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", "Taglag", ..... , "Gerton"]
            this list was created using the source file: https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)
        self.kwtree = KeywordTree(case_insensitive=True)
        for geo in self.geonames:
            self.kwtree.add(f" {geo} ")
        self.kwtree.finalize()

    def _load_from_json(self, path_to_geo_entities: str) -> List[str]:
        """Load a list with city names from a JSON file.

        :param path_to_geo_entities: filepath to a JSON file
        :type path_to_geo_entities: str
        :return: a list containing city names
        :rtype: List[str]
        """
        with open(path_to_geo_entities, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        # deduplicate the city names
        return list(set(json_data))

    def find_geo_names_in_utterance(self, utterance: str) -> str:
        """Search the first occurrence of the location name in utterance.

        :param utterance: human utterance
        :type utterance: str
        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        # replace punctuation with spaces
        for p in string.punctuation:
            utterance = utterance.replace(p, " ")
        # delete excessive spaces
        utterance = re.sub(r"\s{2,}", " ", utterance.lower()).strip()
        results = list(self.kwtree.search_all(" %s " % utterance))
        # TODO: the method could be improved by finding all geo names and then
        # keeping the most precise entity. A user may write "Massachusetts Boston":
        # that yields 2 entities, and Boston is preferred as the more precise location.
        return self.get_best_match(results)

    def get_best_match(self, results: Iterable[Tuple[str, int]]) -> str:
        """Select from the objects with the lowest index the object with the longest length.

         Usually the earliest entity is the most precise.
         For example for the utterance: "west valley city utah", we receive:
         [(' West ', 0), (' West Valley ', 0), (' Valley ', 5), (' West Valley City ', 0),
         (' Valley City ', 5), (' Utah ', 17)], we should select "West Valley City".

        :param results: a sequence with the following pairs (<location_name>, <index>)
        :type results: Iterable[Sequence[str, int]]
        :return: the best match or an empty string if the results are empty
        :rtype: str
        """
        best_match = ""
        if results:
            results = sorted(results,
                             key=lambda entity:
                             (entity[1], -len(entity[0].strip())))
            best_match = results[0][0].strip()
        return best_match

    def __call__(self, *args, **kwargs) -> str:
        """Find the best match in the trie.

        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        return self.find_geo_names_in_utterance(*args, **kwargs)
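A hypothetical end-to-end use of the class; the JSON path and the three-city list are invented for illustration:

import json

with open('/tmp/cities.json', 'w', encoding='utf-8') as f:  # hypothetical fixture
    json.dump(['Boston', 'West Valley City', 'Utah'], f)

slot = OWMCitySlot(path_to_geo_entities='/tmp/cities.json')
print(slot('What is the weather in west valley city, Utah?'))
# expected: 'West Valley City' -- earliest and longest match wins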
Example #9
# Add keywords to the trie
for word in keywords:
    kwtree.add(word[0])
    clean_keywords.append(word[0])
kwtree.finalize()

# Run a search on every tweet and add to dataframe

for i in range(len(tweets)):
    matches = []
    tmp = []

    # remove all special characters (newlines included) and lowercase
    tweet = re.sub(r"[^a-zA-Z0-9]+", ' ', tweets[i]).lower()
    results = kwtree.search_all(tweet)

    # match results
    for result in results:
        matches.append(result)

    # grab proper nouns from tweet to raise relevance score
    tagged_sent = pos_tag(tweets[i].split())
    propernouns = [
        re.sub('[^A-Za-z0-9]+', '', word).lower() for word, pos in tagged_sent
        if pos == 'NNP'
    ]

    propers = list(set(propernouns) & set(clean_keywords))

    # relevance score: +1 per keyword match, +1.5 per proper-noun match, divided by 20 (max 8 + 12)
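The truncated comment above implies a scoring step along these lines (a hedged sketch; the weights and caps are read off the comment, not recovered from the original code):

# hypothetical: +1 per keyword match (max 8 points), +1.5 per proper-noun
# match (max 12 points), normalized by 20 so the score stays in [0, 1]
relevance = (min(len(matches), 8) + min(1.5 * len(propers), 12)) / 20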
Example #10
U = "CTAGTTAG"
V = "bvbccvCTAnGTTAGvfqvsdqvqCTAGTTAcGvfdCTACGATAGvvfGTTGTTfdvCTAtggAGsfsdfdCTAdddddddddddAGvbcvbcvb"

print("\n", "Text : ", V)
print("Motif :", U)

erreur = input("errors: ")
#Pi=textwrap.wrap(U, int(erreur))
Pi = [U[i:i + int(erreur)] for i in range(0, len(U), int(erreur))]
print(Pi)
# Aho-Corasick search
kwtree = KeywordTree(case_insensitive=True)
for piece in Pi:
    kwtree.add(piece)
kwtree.finalize()
results = kwtree.search_all(V)
# print all the occurrences
Vals = []
Keyz = []
for result in results:
    #print(result)
    Vals.append(result[0])
    Keyz.append(result[1])

dictionary = dict(zip(Keyz, Vals))
print(dictionary, "\n")
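A hedged follow-up to the chunking above (my reading of the intent, not code from the original): each exact piece hit can be mapped back to a candidate start of the whole motif by subtracting the piece's offset inside U, and starts supported by several pieces are likely approximate occurrences.

from collections import Counter

chunk = int(erreur)
# offsets of each (possibly repeated) piece inside U
offsets = {}
for k, piece in enumerate(Pi):
    offsets.setdefault(piece, []).append(k * chunk)

# vote for candidate motif start positions in V
candidates = Counter()
for pos, piece in dictionary.items():
    for off in offsets[piece]:
        candidates[pos - off] += 1
print(candidates.most_common())  # starts supported by the most exact pieces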


def AlignS(s, ran):
    l = []
    ff = ""
Example #11
	print(submission.title)
	indivList = []
	for top_level_comment in submission.comments:
		brandCount = 0
		repeatSet = set()

		# If the post has been deleted, it won't store the author name; rows without author names get dropped later.
		if (top_level_comment.author):
			indivList.append(top_level_comment.author.name)

			commentDate = time.strptime(time.ctime(top_level_comment.created_utc))
			indivList.append(commentDate.tm_year)
			indivList.append(commentDate.tm_mon)
			indivList.append(commentDate.tm_mday)
			indivList.append(top_level_comment.score)
			results = kwtree.search_all(top_level_comment.body)

			# For my analysis, a brand is counted at most once per comment, even if the individual wears several items of that brand.
			for result in results:
				if result[0] not in repeatSet:
					repeatSet.add(result[0])
					indivList.append(result[0])
					brandCount += 1

		while brandCount < 7:
			indivList.append('0')
			brandCount += 1

		indivList.append(top_level_comment.permalink)

		if (len(indivList) == 13):