예제 #1
0
    def append_data(self, bigrams, unigrams):
        """Rebuild the anagram hash map and the anagram alphabet, then save.

        Every key of ``bigrams`` and ``unigrams`` is grouped under its
        anagram hash and stored in ``self.anagram_hashmap``. The alphabet
        stored in ``self.anagram_alphabet`` is built from the keys of
        ``unigrams`` only. ``self.save()`` is called once both are set.

        Parameters:
            bigrams (:func:`dict`): Mapping whose keys are indexed in the
                hash map
            unigrams (:func:`dict`): Mapping whose keys are indexed in the
                hash map and used to build the alphabet
        """
        # Walk both mappings in turn rather than concatenating their keys:
        # on Python 3 ``keys()`` returns a view that does not support ``+``.
        # This also hashes each word only once.
        anaghash_map = {}
        for source in (bigrams, unigrams):
            for word in source:
                anaghash_map.setdefault(anagram_hash(word), set()).add(word)

        self.anagram_hashmap = anaghash_map

        # Only symbols made of letters, spaces, quotes and dashes are kept
        clean_word = re.compile(r"^[a-zA-Z '-]+$")
        alphabet = set()

        for word in unigrams:
            word = " " + word + " "  # Pad so word boundaries form bigrams too
            chars = list(word)  # Getting letters from the word
            chars += map(add, chars[:-1],
                         chars[1:])  # Adding bigrams to the list

            # Grow the set in place instead of rebuilding it for every word
            alphabet.update(
                anagram_hash(char) for char in set(chars)
                if clean_word.match(char) is not None
            )

        alphabet.add(0)

        self.anagram_alphabet = alphabet
        self.save()
예제 #2
0
def select_anagrams(token, structures):
    """Collect scored anagram candidates for a token

    Candidate hashes are obtained by adding one hash from
    ``structures["alphabet"]`` to the token hash and subtracting one hash of
    the focus alphabet. Words stored under a candidate hash are kept when
    their edit distance to the token is at most 3 and their score is
    strictly positive.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file (the keys
            "alphabet", "anagrams" and "occurence_map" are read)

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    # NOTE(review): only ``token[1]`` feeds the focus alphabet while the whole
    # token is hashed below. For a str token that is its second character,
    # and a one-character token raises IndexError -- confirm this is intended.
    focus_hashes = generate_alphabet_from_word(token[1])
    base_hash = anagram_hash(token)

    # How many (added, removed) combinations lead to each candidate hash
    retrievals = Counter(
        base_hash + added - removed
        for added in structures["alphabet"]
        for removed in focus_hashes
    )

    known = structures["anagrams"]
    shared_hashes = set(retrievals.keys()).intersection(set(known.keys()))

    scored = {}
    for hash_value in shared_hashes:
        hits = retrievals[hash_value]
        close_enough = [
            word for word in known[hash_value]
            if edit_distance(word, token) <= 3
        ]

        for word in close_enough:
            score = rate_anagram(structures["occurence_map"], token, word, hits)
            if score > 0:
                scored[word] = score

    return scored
예제 #3
0
def generate_alphabet_from_word(word):
    """Hash every character and adjacent character pair of a word

    The word is padded with one space on each side before being split, and
    the value 0 is always part of the result.

    Parameters:
        word (:func:`str`): Word to generate hashes from
    Returns:
        set - Set of hashes
    """
    padded = " " + word + " "

    # Single characters first, then every pair of neighbouring characters
    symbols = set(padded)
    symbols.update(map(add, padded[:-1], padded[1:]))

    hashes = {0}
    hashes.update(anagram_hash(symbol) for symbol in symbols)
    return hashes
예제 #4
0
def generate_alphabet_from_word(word):
    """Build the set of anagram hashes found in a word

    The word is surrounded by single spaces, then each of its characters and
    each pair of consecutive characters is hashed. 0 is always included.

    Parameters:
        word (:func:`str`): Word to generate hashes from
    Returns:
        set - Set of hashes
    """
    padded = " " + word + " "
    letters = list(padded)
    pairs = [add(left, right) for left, right in zip(letters, letters[1:])]

    hashes = set([0])
    for symbol in set(letters + pairs):
        hashes.add(anagram_hash(symbol))

    return hashes
예제 #5
0
    def append_data(self, bigrams, unigrams):
        """Rebuild the anagram hash map and the anagram alphabet, then save.

        Every key of ``bigrams`` and ``unigrams`` is grouped under its
        anagram hash and stored in ``self.anagram_hashmap``. The alphabet
        stored in ``self.anagram_alphabet`` is built from the keys of
        ``unigrams`` only. ``self.save()`` is called once both are set.

        Parameters:
            bigrams (:func:`dict`): Mapping whose keys are indexed in the
                hash map
            unigrams (:func:`dict`): Mapping whose keys are indexed in the
                hash map and used to build the alphabet
        """
        # ``bigrams.keys() + unigrams.keys()`` fails on Python 3, where
        # ``keys()`` returns a view without ``+``; looping over each mapping
        # works on both versions and hashes each word only once.
        anaghash_map = {}
        for source in (bigrams, unigrams):
            for word in source:
                anaghash_map.setdefault(anagram_hash(word), set()).add(word)

        self.anagram_hashmap = anaghash_map

        # Only symbols made of letters, spaces, quotes and dashes are kept
        clean_word = re.compile(r"^[a-zA-Z '-]+$")
        alphabet = set()

        for word in unigrams:
            word = " " + word + " "  # Pad so word boundaries form bigrams too
            chars = list(word)  # Getting letters from the word
            chars += map(add, chars[:-1], chars[1:])  # Adding bigrams to the list

            # Grow the set in place instead of rebuilding it for every word
            alphabet.update(anagram_hash(char) for char in set(chars)
                            if clean_word.match(char) is not None)

        alphabet.add(0)

        self.anagram_alphabet = alphabet
        self.save()
예제 #6
0
def select_anagrams(token, structures):
    """Find and score the anagram candidates of a token

    Each hash of ``structures["alphabet"]`` is added to the token hash and
    each hash of the focus alphabet is subtracted from it. The resulting
    values that are keys of ``structures["anagrams"]`` give the candidate
    words. A candidate is returned when its edit distance to the token is
    at most 3 and its score is strictly positive.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file (the keys
            "alphabet", "anagrams" and "occurence_map" are read)

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    # NOTE(review): the focus alphabet is derived from ``token[1]`` only,
    # i.e. the second character of a str token, whereas the full token is
    # hashed on the next line. A one-character token raises IndexError.
    # Confirm this is intended.
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    # Counting retrieval occurrence of every reachable hash
    occurrences = Counter()
    for gained in structures["alphabet"]:
        for lost in focus_alphabet:
            occurrences[token_hash + gained - lost] += 1

    anagram_index = structures["anagrams"]
    reachable = set(occurrences.keys())
    indexed = set(anagram_index.keys())

    results = {}
    for key in reachable.intersection(indexed):
        hits = occurrences[key]
        nearby = [
            candidate for candidate in anagram_index[key]
            if edit_distance(candidate, token) <= 3
        ]

        for candidate in nearby:
            rating = rate_anagram(structures["occurence_map"], token,
                                  candidate, hits)

            if rating > 0:
                results[candidate] = rating

    return results