def run(dataset_root_path: str = None):
    """Run every experiment setting on each dataset, on real and then shuffled targets.

    For each dataset path returned by ``prepare_dataset_paths``, each prediction
    target and each comparison function, the dataset is loaded from a
    tab-separated file (rows containing missing values are dropped) and handed
    to ``run_for_setting`` twice: first as loaded, then with the target column
    randomly permuted.

    :param dataset_root_path: forwarded unchanged to ``prepare_dataset_paths``.
    """
    # Fixed experiment configuration.
    result_path = "./results/"
    split_count = 10
    training_percentage = 0.8
    prior_strength = 0.001
    prediction_targets = ["epitope", "paratope"]

    dataset_paths = prepare_dataset_paths(dataset_root_path)

    comparison_fns = [
        # Evaluates to True when the two values differ, False when they are equal.
        {"name": "exact_match", "fn": lambda x, y: x != y},
        # Edit distance divided by the length of the longer of the two inputs.
        {"name": "LD", "fn": lambda x, y: edit_distance(x, y) / max(len(x), len(y))},
    ]

    for dataset_path in dataset_paths:
        for target_position, target in enumerate(prediction_targets):
            # With exactly two targets, the complementary one sits at the opposite index.
            other_target = prediction_targets[1 - target_position]

            for comparison_fn in comparison_fns:
                # Real data: reload for every setting, because the randomized pass
                # below overwrites the target column in place.
                dataset = pd.read_csv(dataset_path, sep="\t").dropna(axis=0)
                run_for_setting(
                    dataset, target, other_target, comparison_fn,
                    prepare_result_path(result_path, dataset_path),
                    training_percentage, split_count, prior_strength,
                )

                # Randomized control: shuffle the target column, then rerun the same setting.
                dataset[target] = np.random.permutation(dataset[target].values)
                run_for_setting(
                    dataset, target, other_target, comparison_fn,
                    prepare_result_path(result_path, dataset_path, True),
                    training_percentage, split_count, prior_strength,
                )
예제 #2
0
    def _get_candidates_exhaustive(self, token: str, max_dist: int):
        """Return a list of candidate words from the vocabulary at most `max_dist` away from the input token.
        This version of the function is private and kept for benchmarking pourposes. This function computes the
        edit distance between the input token and all words in the vocabulary. Then it filters candidates by
        the edit distance.
        :param token : word to be corrected.
        :param max_dist : maximum distance allowed for candidates.
        """
        from editdistance import eval as edit_distance

        token = token.lower()
        distance_token_to_words = {
            word: edit_distance(word, token)
            for word in self.vocabulary
        }
        min_dist = min(distance_token_to_words.values())
        if min_dist <= max_dist:

            if self.sort_candidates:
                result = sorted([
                    (distance, word)
                    for word, distance in distance_token_to_words.items()
                    if distance <= max_dist
                ])
            else:
                result = [
                    word for word, distance in distance_token_to_words.items()
                    if distance <= max_dist
                ]

            return result
        else:
            return [token]
예제 #3
0
def process(command):
    """Run the shell command whose key in ``cmd_strings`` is closest to ``command``.

    The key with the smallest ``edit_distance`` to ``command`` is selected (the
    first such key wins on ties). The command line stored under that key is
    printed, executed through the shell, and its captured standard output is
    printed afterwards.

    :param command: text to match against the keys of ``cmd_strings``.
    :raises ValueError: if ``cmd_strings`` is empty, since there is nothing to match.
    """
    # NOTE(review): assumes `run` is `subprocess.run` (it is called with
    # `shell=` / `universal_newlines=` and its result exposes `.stdout`) — confirm.
    from subprocess import PIPE

    closest_key = min(
        cmd_strings,
        key=lambda cmd_string: edit_distance(command, cmd_string),
    )
    command_line = cmd_strings[closest_key]
    print(command_line)

    # stdout must be captured explicitly; otherwise `completed_process.stdout`
    # is None and the print below would only ever show "None".
    completed_process = run(command_line,
                            stdout=PIPE,
                            universal_newlines=True,
                            shell=True)
    print(completed_process.stdout)
예제 #4
0
 def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
     """
     Check whether a sequence matches a reference sequence.

     The checks are evaluated lazily in this order: equal chain, matching v_gene, matching j_gene
     (both via ``self.matches_gene``), and finally the edit distance between the two sequences.

     :param original_sequence: ReceptorSequence being tested
     :param reference_sequence: ReceptorSequence it is compared against
     :param max_distance: max allowed Levenshtein distance between two sequences to be considered a match
     :return: True if chain, v_gene and j_gene are the same and sequences are within given Levenshtein distance
     """
     reference_metadata = reference_sequence.metadata
     original_metadata = original_sequence.metadata

     return (
         reference_metadata.chain == original_metadata.chain
         and self.matches_gene(reference_metadata.v_gene, original_metadata.v_gene)
         and self.matches_gene(reference_metadata.j_gene, original_metadata.j_gene)
         # The distance is only computed once all metadata checks have passed.
         and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance
     )
예제 #5
0
def _misspelt(spell, spells):
    """
    Classify a spell name against the known spells.

    The result is always a 2-tuple ``(name, status)``:

    * ``(spell, SPELL_OK)`` when ``spell`` is contained in ``spells``;
    * ``(existing_spell, SPELL_KO)`` for the first entry of ``spells`` whose
      metaphone code (truncated to 5 characters) is within edit distance 2 of
      the truncated code of ``spell``; a warning is logged in that case;
    * ``(spell, SPELL_MISSING)`` when no entry is close enough.

    :param spell: the spell name to check; must be truthy.
    :param spells: the collection of known spell names.
    :return: a ``(name, status)`` tuple as described above.
    """
    from phonetics import metaphone
    from editdistance import eval as edit_distance

    assert spell
    if spell in spells:
        return (spell, SPELL_OK)

    # Compare phonetic codes rather than raw spellings; only the first 5 characters are used.
    phonetic_spell = metaphone(spell)[:5]
    for existing_spell in spells:
        phonetic_existing = metaphone(existing_spell)[:5]
        if edit_distance(phonetic_existing, phonetic_spell) > 2:
            continue
        # The message is Italian for "Incorrect spell: %r instead of %r".
        log.warning("Incantesimo scorretto: %r invece di %r", spell,
                    existing_spell)
        return (existing_spell, SPELL_KO)

    return (spell, SPELL_MISSING)