def soundex(self, string):
    """Replace each syllable of ``string`` with its phonetic match from the word bank.

    The input is split with ``self.split_syllables``. Tokens that are one of
    the known punctuation/space symbols are passed through unchanged. Every
    other token is compared with the entries of ``word/word_bank.txt``: the
    first entry whose ``soundex.encode_word`` code and ``self.getVocal``
    value both equal the token's replaces it. Tokens without a match are
    kept verbatim and collected; the collected list is handed to
    ``self.show_unknown`` once all tokens have been processed.

    :param string: text to normalise.
    :return: the resulting tokens joined by single spaces.
    """
    syllables = self.split_syllables(string)
    with open("word/word_bank.txt", "r") as f:
        bank_words = [line.strip() for line in f]

    # Encode the word bank once up front instead of once per syllable.
    bank = [
        (entry, soundex.encode_word(entry), self.getVocal(entry))
        for entry in bank_words
    ]

    symbols = (",", " ", ".", "!", "?", "-")
    sentence_to_say = []
    undefined_word = []

    for word in syllables:
        # Symbols never need a lookup; handling them first also keeps them
        # in the output when the word bank is empty.
        if word in symbols:
            sentence_to_say.append(word)
            continue

        word_code = soundex.encode_word(word)
        word_vocal = self.getVocal(word)

        # Compare by value: ``is`` only tests object identity, and two equal
        # strings are not guaranteed to be the same object.
        match = next(
            (
                entry
                for entry, code, vocal in bank
                if code == word_code and vocal == word_vocal
            ),
            None,
        )

        if match is None:
            undefined_word.append(word)
            sentence_to_say.append(word)
        else:
            sentence_to_say.append(match)

    self.show_unknown(undefined_word)
    return " ".join(sentence_to_say)
def recommend(self, word: str) -> Dict[str, float]:
    """Suggest indexed words that share the soundex code of ``word``.

    The code produced by ``soundex.encode_word`` is looked up in
    ``self.phonetic_index``. Every candidate stored under that code gets a
    weight of ``1.0`` and the resulting mapping is passed to
    ``self.recommendations_post_processing``, whose result is returned.

    ``None`` and the empty string short-circuit to an empty dict without
    going through post-processing.
    """
    if word is None or word == "":
        return {}

    phonetic_code = soundex.encode_word(word)
    if phonetic_code in self.phonetic_index.keys():
        candidates = self.phonetic_index[phonetic_code]
    else:
        candidates = []

    # Every phonetic match is considered equally likely.
    weighted = {}
    for candidate in candidates:
        weighted[candidate] = 1.0

    return self.recommendations_post_processing(weighted)
def search_model(input_value, **kwargs):
    """Return the ``model`` instance whose ``field`` best matches ``input_value``.

    Each candidate item is scored; lower is better:

    * ``0`` - its field value has the same soundex code as ``input_value``;
    * ``1`` - the input, reduced to lower-cased alphanumerics, is contained
      in the equally reduced field value;
    * otherwise the result of ``distance(input_value, field_value)``, kept
      only when it is below 10.

    The best-scoring item is returned when its score is below 5.

    :param input_value: text to look for.
    :param kwargs: ``model`` (class exposing an ``objects`` manager),
        ``field`` (name of the attribute to compare) and optionally
        ``filter`` (dict of keyword arguments for ``model.objects.filter``).
    :return: the best matching item, or ``None`` when ``input_value`` is not
        a string, ``model``/``field`` are missing, or nothing is close enough.
    """
    model = kwargs.get("model", False)
    field = kwargs.get("field", False)
    filter_args = kwargs.get("filter", False)

    # ``unicode`` only exists on Python 2; fall back to ``str`` alone elsewhere.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    if not (isinstance(input_value, text_types) and model and field):
        return None

    text_soundex = soundex.encode_word(input_value)
    # Only [A-Za-z0-9] survives the substitution, so no further stripping
    # of whitespace is needed.
    simple_text = re.sub("[^A-Za-z0-9]", "", input_value).lower()

    if filter_args:
        model_items = model.objects.filter(**filter_args)
    else:
        model_items = model.objects.all()

    matches = []
    for item in model_items:
        field_value = getattr(item, field, "")
        field_soundex = soundex.encode_word(field_value)
        simple_field_text = re.sub("[^A-Za-z0-9]", "", field_value).lower()

        if text_soundex == field_soundex:
            score = 0
        # An empty string is a substring of everything; require real content
        # so punctuation-only input does not match every item.
        elif simple_text and simple_text in simple_field_text:
            score = 1
        else:
            score = distance(input_value, field_value)

        if score < 10:
            matches.append({"distance": score, "item": item})

    if not matches:
        return None

    # ``min`` returns the first of several equally good matches, which is
    # the same item a stable sort would put first.
    best = min(matches, key=itemgetter("distance"))
    if best["distance"] < 5:
        return best["item"]
    return None
def build_index(vocabulary: set) -> Dict[str, Set[str]]:
    """Group the words of ``vocabulary`` by their soundex code.

    :param vocabulary: words to index.
    :return: mapping from each ``soundex.encode_word`` code to the set of
        vocabulary words that produce that code.
    """
    index = {}
    for entry in vocabulary:
        # Create the bucket on first sight of a code, then file the word.
        index.setdefault(soundex.encode_word(entry), set()).add(entry)
    return index
def test_encode(self):
    """Two spellings that sound alike must produce the same soundex code."""
    reference_code = soundex.encode_word("Example")
    variant_code = soundex.encode_word("Ekzampul")
    assert reference_code == variant_code
def parse_text(request):
    """
    Match the interpreted text of a request against the enabled language model.

    Every sample of every enabled intent is scored against the text; lower
    is better:

    * ``0`` - sample and text have the same soundex code;
    * ``1`` - the text, with spaces removed and lower-cased, is contained in
      the equally simplified sample;
    * otherwise the result of ``distance(text, sample)``, kept only when it
      is below 10.

    The intent of the best-scoring sample is reported when that score is
    below 5. Missing or blank text is never matched and yields no intent.

    :param request: incoming request; ``request_to_dict`` turns it into a
        dict whose ``"text"`` entry is the text to interpret.
    :return: a ``Response`` with HTTP 200 and ``NO_CACHE_HEADERS`` carrying
        ``_text`` and, when an enabled language model exists, ``intent``
        (the matched intent name or ``None``).
    """
    data = request_to_dict(request)
    text = data.get("text")

    try:
        interaction_model = LanguageModel.objects.get(enabled=True)
    except LanguageModel.DoesNotExist:
        resp = {"_text": text}
    else:
        # ``text`` may be absent from the request; treat that like blank text
        # instead of failing on ``None.strip()``.
        simple_text = text.strip().replace(" ", "").lower() if text else ""

        matches = []
        # Blank text would be "contained" in every sample (the empty string
        # is a substring of anything), so skip matching altogether rather
        # than report an arbitrary intent.
        if simple_text:
            text_soundex = soundex.encode_word(text)
            for intent in interaction_model.intents.filter(enabled=True):
                for sample in intent.samples:
                    sample_soundex = soundex.encode_word(sample)
                    simple_sample = sample.strip().replace(" ", "").lower()

                    if text_soundex == sample_soundex:
                        score = 0
                    elif simple_text in simple_sample:
                        score = 1
                    else:
                        score = distance(text, sample)

                    if score < 10:
                        matches.append(
                            {
                                "distance": score,
                                "intent": intent.name,
                                "sample": sample,
                            }
                        )

        matches = sorted(matches, key=itemgetter("distance"))
        if matches and matches[0].get("distance") < 5:
            closest_match = matches[0]
        else:
            closest_match = None

        resp = {
            "_text": text,
            "intent": closest_match.get("intent") if closest_match else None,
        }

    return Response(resp, status=status.HTTP_200_OK, headers=NO_CACHE_HEADERS)