def append_data(self, bigrams, unigrams):
    """Build and store the anagram lookup structures, then persist them.

    Parameters:
        bigrams (dict): Word-bigram occurrence map (keys are the bigram strings).
        unigrams (dict): Unigram occurrence map (keys are the word strings).

    Side effects:
        Sets ``self.anagram_hashmap`` (hash -> set of words sharing that
        anagram hash) and ``self.anagram_alphabet`` (set of hashes of valid
        single chars and char-bigrams), then calls ``self.save()``.
    """
    # BUG FIX: in Python 3, dict.keys() returns a view which does not
    # support `+`. Materialize the combined key list once and reuse it.
    all_words = list(bigrams) + list(unigrams)
    anaghash_map = {anagram_hash(word): set() for word in all_words}
    for word in all_words:
        anaghash_map[anagram_hash(word)].add(word)
    self.anagram_hashmap = anaghash_map

    # Only chars/bigrams made of letters, space, apostrophe or hyphen
    # contribute to the alphabet.
    clean_word = re.compile(r"^[a-zA-Z '-]+$")
    alphabet = set()
    for word in unigrams:
        word = " " + word + " "  # pad so edge chars form bigrams with a space
        chars = list(word)  # single characters of the word
        chars += map(add, chars[:-1], chars[1:])  # adjacent char-bigrams
        alphabet.update(
            anagram_hash(char)
            for char in set(chars)
            if clean_word.match(char) is not None
        )
    alphabet.add(0)  # 0 = the "no change" element of the alphabet
    self.anagram_alphabet = alphabet
    self.save()
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    candidates = {}
    # NOTE(review): token[1] assumes the token has at least 2 chars — confirm upstream.
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    # Every hash reachable by adding one alphabet element and removing one
    # focus element; duplicates are counted as retrieval occurrences.
    hash_counter = Counter(
        token_hash + c - f
        for c in structures["alphabet"]
        for f in focus_alphabet
    )

    known = set(hash_counter) & set(structures["anagrams"])
    for h in known:
        count = hash_counter[h]
        for anag in structures["anagrams"][h]:
            # Keep only candidates within edit distance 3 of the token.
            if edit_distance(anag, token) > 3:
                continue
            score = rate_anagram(structures["occurence_map"], token, anag, count)
            if score > 0:
                candidates[anag] = score
    return candidates
def generate_alphabet_from_word(word):
    """Generate anagram hash for all chars in a word

    Parameters:
        word (:func:`str`): Word to generate hash

    Returns:
        set - Set of hashes
    """
    padded = " " + word + " "  # pad so edge chars form bigrams with a space
    units = list(padded)  # single characters
    units.extend(a + b for a, b in zip(padded, padded[1:]))  # char-bigrams
    # Hash each distinct unit; 0 is always a member of the alphabet.
    hashes = {anagram_hash(u) for u in set(units)}
    hashes.add(0)
    return hashes
def generate_alphabet_from_word(word):
    """Generate anagram hash for all chars in a word

    Parameters:
        word (:func:`str`): Word to generate hash

    Returns:
        set - Set of hashes
    """
    word = " " + word + " "  # pad so edge chars form bigrams with a space
    singles = [c for c in word]
    # Adjacent two-char substrings (char-bigrams).
    pairs = [word[i] + word[i + 1] for i in range(len(word) - 1)]
    # 0 is always present; add the hash of each distinct unit.
    result = {0}
    for unit in set(singles + pairs):
        result.add(anagram_hash(unit))
    return result
def append_data(self, bigrams, unigrams):
    """Build the anagram hash map and alphabet from the corpora and save.

    Parameters:
        bigrams (dict): Word-bigram occurrence map (keys are the bigram strings).
        unigrams (dict): Unigram occurrence map (keys are the word strings).

    Side effects:
        Sets ``self.anagram_hashmap`` (hash -> set of words with that hash)
        and ``self.anagram_alphabet`` (hashes of valid chars/char-bigrams,
        plus 0), then calls ``self.save()``.
    """
    # BUG FIX: `dict.keys() + dict.keys()` is Python-2-only; in Python 3
    # keys() is a view and `+` raises TypeError. Build the list once.
    words = list(bigrams) + list(unigrams)
    anaghash_map = {anagram_hash(w): set() for w in words}
    for w in words:
        anaghash_map[anagram_hash(w)].add(w)
    self.anagram_hashmap = anaghash_map

    # Accept only letters, space, apostrophe and hyphen.
    clean_word = re.compile(r"^[a-zA-Z '-]+$")
    alphabet = set()
    for word in unigrams:
        word = " " + word + " "  # pad so edge chars pair with a space
        chars = [char for char in word]  # single characters
        chars += map(add, chars[:-1], chars[1:])  # adjacent char-bigrams
        alphabet = alphabet.union(
            anagram_hash(char)
            for char in set(chars)
            if clean_word.match(char) is not None
        )
    alphabet.add(0)  # 0 = identity element of the alphabet
    self.anagram_alphabet = alphabet
    self.save()
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    result = {}
    token_hash = anagram_hash(token)
    # NOTE(review): token[1] assumes len(token) >= 2 — confirm callers guarantee it.
    focus_alphabet = generate_alphabet_from_word(token[1])

    hash_list = []
    for alpha_hash in structures["alphabet"]:
        shifted = token_hash + alpha_hash
        for focus_hash in focus_alphabet:
            hash_list.append(shifted - focus_hash)
    occurrences = Counter(hash_list)  # how often each candidate hash was reached

    for h in set(occurrences).intersection(structures["anagrams"]):
        count = occurrences[h]
        # Only keep stored words within edit distance 3 of the token.
        close_words = [
            w for w in structures["anagrams"][h] if edit_distance(w, token) <= 3
        ]
        for word in close_words:
            score = rate_anagram(structures["occurence_map"], token, word, count)
            if score > 0:
                result[word] = score
    return result