def phono_edit_distance_wrapper(w1, w2, sequence_type, features, max_distance):
    """Return the phonological edit distance of two words if it is small enough.

    Calls ``phono_edit_distance(w1, w2, sequence_type=..., features=...)``
    and compares the result against ``max_distance``.

    Args:
        w1: First word, forwarded unchanged to ``phono_edit_distance``.
        w2: Second word, forwarded unchanged to ``phono_edit_distance``.
        sequence_type: Forwarded as the ``sequence_type`` keyword argument.
        features: Forwarded as the ``features`` keyword argument.
        max_distance: Inclusive upper bound on the accepted distance.

    Returns:
        The computed distance when it is ``<= max_distance``, otherwise
        ``None``.
    """
    distance = phono_edit_distance(
        w1,
        w2,
        sequence_type=sequence_type,
        features=features,
    )
    # Keep the "<=" comparison: anything that does not compare as within the
    # bound (including incomparable values such as NaN) maps to None.
    return distance if distance <= max_distance else None
def _is_phono_edit_distance_neighbor(w, query, sequence_type, specifier, max_distance):
    """Tell whether ``w`` lies within ``max_distance`` of ``query``.

    Args:
        w: Candidate word, passed as the first argument of
            ``phono_edit_distance``.
        query: Reference word, passed as the second argument.
        sequence_type: Passed positionally as the third argument.
        specifier: Passed positionally as the fourth argument.
        max_distance: Inclusive upper bound on the distance.

    Returns:
        The result of comparing the computed distance with ``max_distance``
        using ``<=``.
    """
    distance = phono_edit_distance(w, query, sequence_type, specifier)
    within_bound = distance <= max_distance
    return within_bound
def phono_edit_distance_wrapper(w1, w2, sequence_type, features, max_distance):
    """Compute a phonological edit distance and discard it if it is too large.

    The distance is obtained from ``phono_edit_distance`` with
    ``sequence_type`` and ``features`` passed by keyword.

    Args:
        w1: First word to compare.
        w2: Second word to compare.
        sequence_type: Value for the ``sequence_type`` keyword argument.
        features: Value for the ``features`` keyword argument.
        max_distance: Largest distance (inclusive) that is still returned.

    Returns:
        The distance if it is ``<= max_distance``; ``None`` otherwise.
    """
    options = {"sequence_type": sequence_type, "features": features}
    result = phono_edit_distance(w1, w2, **options)

    # Negating the original "<=" test (rather than using ">") keeps the same
    # outcome for values that are not totally ordered.
    if not (result <= max_distance):
        return None
    return result
def find_distances(corpus, phono_dict, features, model, use_stoplist=False, n=1000):
    """Tabulate phonological edit distances between corpus words and their
    most similar words according to ``model``.

    For every unique corpus token that is known to both ``model`` and
    ``phono_dict``, the ``n`` most similar words are requested from the
    model. Each similar word that is also known to both resources, and that
    differs from the token when compared case-insensitively, yields one row
    holding the phonological edit distance of the pair.

    Args:
        corpus: Text handed to ``Sentence`` to obtain the token lists.
        phono_dict: Mapping from word to the entry passed to
            ``phono_edit_distance.phono_edit_distance``. Words are matched
            case-insensitively.
        features: Forwarded unchanged as the last argument of
            ``phono_edit_distance.phono_edit_distance``.
        model: Object supporting ``word in model`` and
            ``model.most_similar(word, topn=n)``; each returned item must
            have the word at index 0. (Presumably a gensim word-vector
            model -- confirm against the caller.)
        use_stoplist: If true, use ``tokens_cased_without_stop`` of the
            ``Sentence`` object instead of ``tokens_cased``.
        n: Number of similar words requested per token.

    Returns:
        A ``pandas.DataFrame`` with a single column ``'Phono Edit Distance'``
        and a two-level index named
        ``['Corpus Word', 'Similar Word from Model']``.
    """
    class_text = Sentence(corpus)

    # Case-insensitive lookup keyed by upper-cased word. A dict gives O(1)
    # membership tests (the previous list was scanned linearly for every
    # token and every neighbour) and also makes the transcription lookup
    # work when the dictionary keys are not already upper case.
    phono_lookup = {word.upper(): entry for word, entry in phono_dict.items()}
    # Keys that are already upper case are applied last so that they win
    # over differently-cased duplicates, matching what
    # ``phono_dict.get(word.upper())`` used to return for them.
    phono_lookup.update(
        (word, entry) for word, entry in phono_dict.items() if word == word.upper()
    )

    # Create set of all unique tokens in the corpus
    if use_stoplist:
        set_of_unique_tokens = set(class_text.tokens_cased_without_stop)
    else:
        set_of_unique_tokens = set(class_text.tokens_cased)

    # Keep only tokens present in both the model and the phonological corpus
    set_of_unique_tokens = {
        token
        for token in set_of_unique_tokens
        if token in model and token.upper() in phono_lookup
    }

    # Word tuples (index) and distances (column) of the resulting DataFrame
    distances = []
    word_tuples = []

    widgets = [
        progressbar.Percentage(),
        ' ',
        progressbar.Bar(marker='#', left='[', right=']'),
        ' ',
        progressbar.ETA(),
        ' ',
        progressbar.Counter(format='Completed %(value)d/%(max_value)d'),
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(set_of_unique_tokens))
    pbar.start()

    # Count from 1 so the bar reports the number of tokens actually
    # completed rather than lagging one behind.
    for completed, token in enumerate(set_of_unique_tokens, start=1):
        token_upper = token.upper()
        token_entry = phono_lookup[token_upper]

        for item in model.most_similar(token, topn=n):
            # Only the word is needed from each returned item.
            sim_word = item[0]
            if sim_word not in model:
                continue
            sim_upper = sim_word.upper()
            # Skip words without a phonological entry and the token itself.
            if sim_upper not in phono_lookup or sim_upper == token_upper:
                continue

            distance = phono_edit_distance.phono_edit_distance(
                token_entry,
                phono_lookup[sim_upper],
                'transcription',
                features,
            )
            word_tuples.append((token, sim_word))
            distances.append(distance)

        pbar.update(completed)

    pbar.finish()

    index = pd.MultiIndex.from_tuples(
        tuples=word_tuples,
        names=['Corpus Word', 'Similar Word from Model'],
    )
    columns = ['Phono Edit Distance']
    dist_frame = pd.DataFrame(distances, index=index, columns=columns)

    return dist_frame
def _is_phono_edit_distance_neighbor(w, query, sequence_type, specifier, max_distance):
    """Check whether the phonological edit distance between ``w`` and
    ``query`` does not exceed ``max_distance``.

    Args:
        w: Candidate word (first argument of ``phono_edit_distance``).
        query: Word to compare against (second argument).
        sequence_type: Third positional argument of ``phono_edit_distance``.
        specifier: Fourth positional argument of ``phono_edit_distance``.
        max_distance: Inclusive distance threshold.

    Returns:
        Whatever ``distance <= max_distance`` evaluates to for the computed
        distance.
    """
    call_args = (w, query, sequence_type, specifier)
    return phono_edit_distance(*call_args) <= max_distance