def run(dataset_root_path: str = None):
    """Run epitope/paratope prediction experiments over every dataset.

    For each dataset, prediction target and comparison function the
    experiment is executed twice: once on the original data and once with
    the target column randomly permuted, as a randomized baseline.

    :param dataset_root_path: root directory containing the dataset files,
        forwarded to `prepare_dataset_paths`.
    """
    split_count = 10
    prediction_targets = ["epitope", "paratope"]
    training_percentage = 0.8
    result_path = "./results/"
    prior_strength = 0.001

    dataset_paths = prepare_dataset_paths(dataset_root_path)

    # Sequence-comparison functions: exact mismatch flag and normalized
    # Levenshtein distance.
    comparison_fns = [
        {"name": "exact_match", "fn": lambda x, y: x != y},
        {"name": "LD", "fn": lambda x, y: edit_distance(x, y) / max(len(x), len(y))},
    ]

    for dataset_path in dataset_paths:
        for target in prediction_targets:
            # The non-target column is used as the predictor; with two
            # entries, `not index` flips 0 <-> 1.
            predictor = prediction_targets[not prediction_targets.index(target)]
            for comparison_fn in comparison_fns:
                # Re-read per setting so the permuted column from the
                # previous run never leaks into this one.
                dataset = pd.read_csv(dataset_path, sep="\t").dropna(axis=0)

                # Original data.
                run_for_setting(dataset, target, predictor, comparison_fn,
                                prepare_result_path(result_path, dataset_path),
                                training_percentage, split_count, prior_strength)

                # Randomized baseline: shuffle the target column in place.
                dataset[target] = np.random.permutation(dataset[target].values)
                run_for_setting(dataset, target, predictor, comparison_fn,
                                prepare_result_path(result_path, dataset_path, True),
                                training_percentage, split_count, prior_strength)
def _get_candidates_exhaustive(self, token: str, max_dist: int):
    """Return vocabulary words at most `max_dist` edits away from `token`.

    Private, kept for benchmarking purposes: it brute-forces the edit
    distance between the (lower-cased) token and *every* word in the
    vocabulary, then filters candidates by `max_dist`.

    :param token: word to be corrected.
    :param max_dist: maximum edit distance allowed for candidates.
    :return: when `self.sort_candidates` is set, a list of
        ``(distance, word)`` tuples sorted ascending; otherwise an
        unsorted list of words. If no word is within `max_dist`, a
        single-element list holding the lower-cased token itself.
    """
    from editdistance import eval as edit_distance

    token = token.lower()
    distances = {word: edit_distance(word, token) for word in self.vocabulary}

    # Nothing close enough: fall back to the token itself.
    if min(distances.values()) > max_dist:
        return [token]

    within = [(dist, word) for word, dist in distances.items() if dist <= max_dist]
    if self.sort_candidates:
        # NOTE(review): this branch yields (distance, word) tuples while the
        # unsorted branch yields bare words — confirm callers expect the
        # mixed return shape before unifying it.
        return sorted(within)
    return [word for _, word in within]
def process(command):
    """Run the known shell command whose name is closest to `command`.

    Picks the entry of the module-level `cmd_strings` table whose key has
    the smallest edit distance to the user input, prints the mapped
    command, executes it through the shell and prints its stdout.

    :param command: the (possibly misspelt) command typed by the user.
    :raises ValueError: if `cmd_strings` is empty (the original code would
        have failed with an obscure ``cmd_strings[None]`` lookup instead).
    """
    if not cmd_strings:
        raise ValueError("no known commands to match against")

    # min() with a key replaces the hand-rolled loop that tracked the
    # lowest distance with an `== None` sentinel; tie-breaking is the
    # same (first minimal entry wins).
    best_match = min(cmd_strings, key=lambda known: edit_distance(command, known))

    print(cmd_strings[best_match])
    # NOTE(review): shell=True runs the mapped string through the shell;
    # acceptable only because the command text comes from the fixed
    # `cmd_strings` table, never from the raw user input.
    completed_process = run(cmd_strings[best_match],
                            universal_newlines=True, shell=True)
    print(completed_process.stdout)
def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
    """Decide whether a reference sequence matches an original one.

    A match requires the same chain, matching V and J genes (per
    `self.matches_gene`) and a Levenshtein distance between the sequences
    no greater than `max_distance`.

    :param original_sequence: ReceptorSequence to check.
    :param reference_sequence: ReceptorSequence to compare against.
    :param max_distance: max allowed Levenshtein distance between the two
        sequences for them to be considered a match.
    :return: True when every criterion holds, False otherwise.
    """
    original_meta = original_sequence.metadata
    reference_meta = reference_sequence.metadata

    # Guard clauses preserve the original short-circuit order:
    # chain, then V gene, then J gene, then sequence distance.
    if reference_meta.chain != original_meta.chain:
        return False
    if not self.matches_gene(reference_meta.v_gene, original_meta.v_gene):
        return False
    if not self.matches_gene(reference_meta.j_gene, original_meta.j_gene):
        return False

    distance = edit_distance(original_sequence.get_sequence(),
                             reference_sequence.get_sequence())
    return distance <= max_distance
def _misspelt(spell, spells):
    """Check a spell name against the list of known spells.

    Matching is phonetic: the input and each known spell are reduced to
    the first five characters of their metaphone encoding, and an edit
    distance of at most 2 between encodings counts as a near-match.

    :param spell: spell name to check (must be non-empty).
    :param spells: collection of known spell names.
    :return: ``(spell, SPELL_OK)`` on an exact match,
        ``(existing_spell, SPELL_KO)`` on a phonetic near-match (the
        correctly spelt name is returned), or ``(spell, SPELL_MISSING)``
        when nothing matches.
    """
    from phonetics import metaphone
    from editdistance import eval as edit_distance

    assert spell

    # Exact name: nothing to correct.
    if spell in spells:
        return (spell, SPELL_OK)

    phonetic_spell = metaphone(spell)[:5]
    for known_spell in spells:
        if edit_distance(metaphone(known_spell)[:5], phonetic_spell) <= 2:
            # Log message is a runtime string, kept verbatim (Italian:
            # "incorrect spell: %r instead of %r").
            log.warning("Incantesimo scorretto: %r invece di %r",
                        spell, known_spell)
            return (known_spell, SPELL_KO)

    return (spell, SPELL_MISSING)