Example #1
def A_star_search(tree, source_word, goal_word):
    # The set of nodes already evaluated
    closedSet = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    openSet = set()
    openSet.add(source_word)

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, cameFrom will eventually contain the
    # most efficient previous step.
    cameFrom = {}

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    gScore = {source_word: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    fScore = {source_word: editdistance.distance(source_word, goal_word)}

    while openSet:
        current = find_min(fScore, openSet)  # Pick the word in openSet with the lowest fScore

        # If this word is the one we are looking for then return the reconstructed path
        if current == goal_word:
            return reconstruct_path(cameFrom, current)

        # Update the open/closed sets
        openSet.remove(current)
        closedSet.add(current)

        for new_node in bk_search(tree, current, 1):
            neighbour = new_node[1]
            if neighbour in closedSet:  # Ignore the neighbor which is already evaluated.
                continue
            # The distance from start to a neighbor
            tentative_gScore = gScore.get(current, sys.maxsize) + editdistance.distance(current, neighbour)

            if neighbour not in openSet:  # Discover a new node
                openSet.add(neighbour)
            elif tentative_gScore >= gScore.get(neighbour, sys.maxsize):  # Not a better path
                continue

            # This path is the best until now. Record it!
            cameFrom[neighbour] = current
            gScore[neighbour] = tentative_gScore
            fScore[neighbour] = gScore.get(neighbour) + editdistance.distance(neighbour, goal_word)

    # If we can't find a path between these two words, return a path that contains only the source word
    return reconstruct_path(cameFrom, source_word)
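The snippet above assumes three helpers that are not shown: find_min, reconstruct_path, and bk_search (a BK-tree lookup; see Example #23). A minimal sketch of the first two, with names and behaviour inferred from how they are called rather than taken from the original code, might look like this:

def find_min(fScore, openSet):
    # Return the open word with the lowest known fScore (unknown words rank last).
    return min(openSet, key=lambda word: fScore.get(word, sys.maxsize))

def reconstruct_path(cameFrom, current):
    # Walk the cameFrom links back to the source and return the path in order.
    path = [current]
    while current in cameFrom:
        current = cameFrom[current]
        path.append(current)
    return list(reversed(path))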
Example #2
def get_bot_accuracies(bot, scored_qa_pairs=None, min_qa_bot_confidence=.2):
    """ Compare answers from bot to answers in test set

    >>> from qary.skills import glossary_bots
    >>> bot = glossary_bots.Bot()
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='Root Mean Square Error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]['bot_accuracy']
    1.0
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='root-mean-sqr-error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]
    {'question': 'What is RMSE?',
     'answer': 'root-mean-sqr-error',
     'score': 0.9,
     'topic': 'ds',
     'bot_answer': 'Root Mean Square Error',
     'bot_w2v_similarity': 0.64...,
     'bot_ed_distance': 0.52...,
     'bot_ed_distance_low': 0.31...,
     'bot_ed_distance_folded': 0.15...,
     'bot_accuracy': 0.65...}
    """
    if scored_qa_pairs is None:
        scored_qa_pairs = load_qa_dataset()
    if isinstance(scored_qa_pairs, str):
        scored_qa_pairs = load_qa_dataset(scored_qa_pairs)
    validated_qa_pairs = []
    for truth in scored_qa_pairs:
        texts = scrape_wikipedia.find_document_texts(topic=truth['topic'],
                                                     max_results=10)
        replies = []  # fallback in case no documents are found for this topic
        for context in texts:
            bot.reset_context(context)
            replies = sorted(bot.reply(truth['question']))
            if len(replies) and sorted(replies)[-1][0] > min_qa_bot_confidence:
                break
        replies = replies or [(0, "Sorry, I don't know.")]
        truth['bot_answer'] = replies[-1][1]
        truth['bot_w2v_similarity'] = nlp(truth['bot_answer']).similarity(
            nlp(truth['answer']))
        truth['bot_ed_distance'] = distance(
            truth['answer'], truth['bot_answer']) / len(truth['answer'])
        truth['bot_ed_distance_low'] = distance(
            truth['answer'].lower().strip(),
            truth['bot_answer'].lower().strip()) / len(truth['answer'].strip())
        truth['bot_ed_distance_folded'] = distance(
            fold_characters(truth['answer']),
            fold_characters(truth['bot_answer'])) / len(
                truth['answer'].strip())
        truth['bot_accuracy'] = .5 * truth['bot_w2v_similarity'] + .5 * (
            1 - (truth['bot_ed_distance'] + truth['bot_ed_distance_low'] +
                 truth['bot_ed_distance_folded']) / 3)
        validated_qa_pairs.append(dict(truth))

    return validated_qa_pairs
Example #3
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate, defined as the edit distance.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    _pred, _true, = _pred.replace(" ", ""), _true.replace(" ", "")
    if norm:
        l = len(_true) if len(_true) > 0 else 1
        return float(editdistance.distance(_pred, _true)) / l
    else:
        return float(editdistance.distance(_pred, _true))
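As a quick illustration of cer (spaces are stripped before comparing, so the edit distance here is 1 over a ground-truth length of 10):

>>> cer("helo world", "hello world")
0.1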
def main():
    with open(json_file) as f:
        label_dict = json.load(f)
    label_dict = {int(key): val for key, val in label_dict.items()}
    actual_df = pd.DataFrame.from_dict(label_dict, orient='index')
    actual_df.reset_index(level=0, inplace=True)
    actual_df.columns = ['label', 'actual_word']

    predicted_df = pd.read_csv(predicted_csv_file)
    predicted_df = predicted_df.loc[predicted_df.groupby('predicted_label')
                                    ['confidence'].idxmax()]
    predicted_df = predicted_df.rename(columns={
        'word': 'predicted_word',
        'predicted_label': 'label'
    })

    combined_df = actual_df.merge(predicted_df, on='label')
    combined_df = combined_df.sort_values(by=['label'])
    combined_df.actual_word

    combined_df['edit_distance'] = combined_df.apply(
        lambda row: ed.distance(row.actual_word, row.predicted_word), axis=1)

    print("Number of total rows :", combined_df.shape[0])
    print("Number of rows where editdistance is zero:",
          (combined_df.edit_distance == 0).count())

    combined_df.to_csv("comparison.csv")
Example #5
    def calc_pattern_reliability(self):
        """ calculate the average Edist-distance between the spatial pattern of each core assembly and each of its significant patterns"""

        nCores = self.get_ncores()
        ed_cores_mats = [[]] * nCores
        ed_cores_means = [[]] * nCores
        core_pidx = self.get_core_PatchIdx()

        for c in np.arange(nCores):
            raster = self.get_patterns_raster()[c]
            nPatterns = raster.shape[1]
            ed_mat = np.zeros(nPatterns)
            ed_mat[:] = np.nan

            core_binary = np.zeros(raster.shape[0])
            core_binary[core_pidx[c][0][0]] = 1

            for i in np.arange(nPatterns):
                ed_mat[i] = ed.distance(core_binary.tolist(),
                                        raster[:, i].tolist())

            ed_cores_mats[c] = ed_mat
            ed_cores_means[c] = np.nanmean(ed_mat)

        ed_grandmean = np.nanmean(ed_cores_means)

        return ed_cores_mats, ed_cores_means, ed_grandmean
Example #6
    def evaluate_image(self, gt, pred):
        correct_num = 0
        pred_sum_num = 0
        gt_sum_num = 0

        for gt_text, pred_text in zip(gt, pred):
            pred_num = len(pred_text)
            gt_num = len(gt_text)
            dist = distance(pred_text, gt_text)
            correct_num += max(pred_num, gt_num) - dist
            pred_sum_num += pred_num
            gt_sum_num += gt_num

        precision = correct_num / pred_sum_num
        recall = correct_num / gt_sum_num
        hmean = 0 if (precision + recall) == 0 else 2.0 * \
                precision * recall / (precision + recall)
        per_sample_metric = {
            'pred_num': pred_sum_num,
            'gt_num': gt_sum_num,
            'correct_num': correct_num,
            'precision': precision,
            'recall': recall,
            'hmean': hmean
        }
        return per_sample_metric
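As a worked example of the metric above: with gt = ['hello'] and pred = ['helo'], the edit distance is 1, so correct_num = max(4, 5) - 1 = 4, precision = 4/4 = 1.0, recall = 4/5 = 0.8, and hmean = 2 * 1.0 * 0.8 / 1.8 ≈ 0.889.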
Example #7
 def update(self, preds, targets):
     for i in range(len(preds)):
         targets_list, pred_list = list(
             filter(lambda a: a != 0, targets[i].tolist())), preds[i]
         self.edit_distance += editdistance.distance(
             targets_list, pred_list)
         self.target_length += len(targets_list)
Example #8
def compute_cer(predicts: List[Char], targets: List[Char],
                indistinguish: bool) -> Tuple[torch.Tensor, torch.Tensor]:
    '''
    Calculate CER distance between two strings or two lists of strings

    Params:
    -------
    - predicts: List of predicted characters
    - targets: List of target characters
    - indistinguish: set to True for a case-insensitive comparison, or False for case-sensitive

    Returns:
    --------
    - distances: tensor of edit distances
    - num_references: tensor containing the number of characters in each target
    '''
    assert type(predicts) == type(
        targets), 'predicts and targets must be the same type'
    assert len(predicts) == len(
        targets), 'predicts and targets must have the same length'

    if indistinguish:
        predicts = [list(map(str.lower, predict)) for predict in predicts]
        targets = [list(map(str.lower, target)) for target in targets]

    distances = torch.tensor([
        ed.distance(predict, target)
        for predict, target in zip(predicts, targets)
    ])
    num_references = torch.tensor(list(map(len, targets)))
    return distances, num_references
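A minimal usage sketch (assuming torch and editdistance as ed are imported as in the original module; the inputs below are made up for illustration):

predicts = [list("Hello"), list("world")]
targets = [list("hello"), list("word")]
distances, num_references = compute_cer(predicts, targets, indistinguish=True)
# distances -> tensor([0, 1]); num_references -> tensor([5, 4])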
Example #9
def error_rate(hyps, targets):
    verbose = 0

    assert len(hyps) == len(targets)
    tot_edits = 0.0
    tot_len = 0.0
    idx = 0
    for h, t in zip(hyps, targets):
        distance = editdistance.distance(np.array(h), np.array(t))

        if verbose > 0:
            # If necessary, get 'alphabet' as argument after which you can compare strings.
            # CHECK: Make sure no blanks/ class #0 in here
            print("error_rate() [" + str(idx) + "] hyps:    " +
                  str(tensorList2list(h)))
            print("error_rate() [" + str(idx) + "] targets: " +
                  str(tensorList2list(t)))
            print("error_rate() [" + str(idx) + "] distance: " + str(distance))

        tot_edits += distance
        tot_len += len(t)
        idx += 1
    # end for

    # Compute character error rate (CER) == label error rate (LER)
    cer = (tot_edits * 100.0) / tot_len

    return cer
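A quick sanity check (assuming numpy and editdistance are imported as in the original module): with hyps = [[1, 2, 3]] and targets = [[1, 2, 4, 5]], the edit distance is 2 over a reference length of 4, so error_rate returns 2 * 100.0 / 4 = 50.0.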
Example #10
    def _get_most_similar_entity(self, text, entities, keywords, unknown_dist, unknown):
        """
        Every entity is paired with its list of keywords.
        entities = [...]
        keywords = [[], [], ...]
        """
        text  = text.translate(str.maketrans('', '', string.punctuation))
        words = text.split(" ")
        words = [self._morph.parse(word)[0].normal_form for word in words]
        min_dist = 10000000
        min_entity = None
        for keywords, entity in zip(keywords, entities):
            # Skip empty entities
            if len(keywords) == 0:
                continue
            keywords = [self._morph.parse(keyword)[0].normal_form for keyword in keywords]
            entity_distance = 0
            for word in words:
                word = self._morph.parse(word)[0].normal_form
                entity_distance = min([editdistance.distance(word, keyword) for keyword in keywords])

                if entity_distance < min_dist:
                    min_entity = entity
                    min_dist = entity_distance

        if min_dist < unknown_dist:
            return min_entity
        else:
            return unknown
Example #11
def replace_suspect_word_to_sentence(word, sent, dis=1):
    sent_pinyin = pypinyin.pinyin(sent, style=pypinyin.TONE3)
    sent_pinyin = [
        i[0][:-1] if i[0][-1] in tone else i[0] for i in sent_pinyin
    ]
    sent_chars = list(sent)
    word_pinyin = pypinyin.pinyin(word, style=pypinyin.TONE3)
    word_pinyin = [
        i[0][:-1] if i[0][-1] in tone else i[0] for i in word_pinyin
    ]
    word_len = len(word_pinyin)
    sent_len = len(sent_pinyin)
    replace_pos = []
    for i in range(sent_len - word_len + 1):
        sent_word = sent_pinyin[i:i + word_len]
        for s in sent_word:
            if len(s) == 0:
                break
            if len(s) == 1:
                if s not in letters:
                    break
        sent_word_edit = ''.join(sent_word)
        word_edit = ''.join(word_pinyin)
        if editdistance.distance(sent_word_edit, word_edit) <= dis:
            replace_pos.append(i)
    for pos in replace_pos:
        sent_chars[pos:pos + word_len] = word
    res = ''.join(sent_chars)
    return res
Example #12
 def compute_distance(self, predict: str, target: str) -> float:
     """
     Compute edit distance between two strings
     """
     distance = ed.distance(predict, target)
     distance = float(distance) / len(target)
     return distance
Example #13
def clean_up(df, threshold, field):
    '''
        Homogenises a field of the data frame by replacing similar entries with a canonical form,
        defined as the most frequent variant within that group of similar entries.

        Parameters:
            df              The bottin pandas dataframe to be cleaned.
            threshold       A float in (0, 1] giving the maximum relative distance for two strings to be considered similar
            field           A string containing the name of the field to be cleaned up. The field must be string-valued

        Returns:
            clean_df        The cleaned-up dataframe
    '''
    clean_df = df.copy()
    series = df[field]
    counts = df[field].value_counts(sort=True)
    unique = series.unique()

    for i, str1 in enumerate(unique):
        for str2 in unique[i+1:]:
            dist = 2*distance(str1, str2)/(len(str1) + len(str2))
            if (dist > 0) and (dist <= threshold):
                if counts[str2] < counts[str1]:
                    canon = str1
                    abr = str2
                else:
                    canon = str2
                    abr = str1
                series.replace(to_replace=abr, value=canon, inplace=True)

    clean_df[field] = series
    return clean_df
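A minimal usage sketch (the DataFrame and column below are made up for illustration; distance is the editdistance function assumed to be imported in the original module):

import pandas as pd

df = pd.DataFrame({'name': ['Smith & Co', 'Smith & Co', 'Smith and Co', 'Jones Ltd']})
cleaned = clean_up(df, threshold=0.4, field='name')
# 'Smith and Co' (1 occurrence) is folded into the more frequent 'Smith & Co';
# 'Jones Ltd' is too far from either spelling and stays unchanged.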
Example #14
    def _min_distance_between_texts(self, src: str, target: str) -> int:
        src_norms, _     = self._text_to_normal_forms(src)
        tar_norms, _     = self._text_to_normal_forms(target)
        target_text      = " ".join(tar_norms)
        cur_min_distance = sys.maxsize

        # Linear time (of normal forms)
        for ind, word in enumerate(src_norms):
            cur_text = word
            cur_min_distance = min(editdistance.distance(cur_text, target_text), cur_min_distance)
            for word1 in src_norms[ind+1:ind+1+self._max_window_size]:
                cur_text = cur_text + " " + word1
                cur_min_distance = min(editdistance.distance(cur_text, target_text), cur_min_distance)

        return cur_min_distance
Example #15
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     N = preds.shape[0]
     for ind in range(N):
         pred = [_ for _ in preds[ind].tolist() if _ not in self.ignore_tokens]
         target = [_ for _ in targets[ind].tolist() if _ not in self.ignore_tokens]
         distance = editdistance.distance(pred, target)
         error = distance / max(len(pred), len(target))
         self.error = self.error + error
     self.total = self.total + N
Example #16
 def compute_distance(self, predict: str, target: str) -> float:
     """
     Compute edit distance between two strings
     """
     predict = "".join(predict).split(" ")
     target = "".join(target).split(" ")
     distance = ed.distance(predict, target)
     distance = float(distance) / len(target)
     return distance
Example #17
def check_strict(gt, pred):
    if abs(len(gt) - len(pred)) >= 2:
        return 0
    dis = editdistance.distance(gt, pred)
    if dis == 0:
        return 2
    elif dis < max(len(gt), len(pred)) * 0.3:
        return 1
    else:
        return 0
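A quick sanity check of the three return values of check_strict:

>>> check_strict("hello", "hello")   # exact match
2
>>> check_strict("hello", "helo")    # 1 edit, below 0.3 * max length
1
>>> check_strict("hello", "help!")   # 2 edits, not below 0.3 * max length
0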
Example #18
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     for ind in range(preds.shape[0]):
         pred = [
             _ for _ in preds[ind].tolist() if _ not in self.ignore_tokens
         ]
         target = [
             _ for _ in targets[ind].tolist() if _ not in self.ignore_tokens
         ]
         self.dist_leven += editdistance.distance(pred, target)
         self.len_total += max(len(pred), len(target))
Example #19
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate using the `editdistance` library.

    Parameters
    ----------
    _pred : str
        space-separated sentence (prediction)
    _true : str
        space-separated sentence (ground truth)
    norm : bool
        divide by length of ground truth
    """
    _pred, _true, = _pred.replace(" ", ""), _true.replace(" ", "")
    if norm:
        l = len(_true) if len(_true) > 0 else 1
        return float(editdistance.distance(_pred, _true)) / l
    else:
        return float(editdistance.distance(_pred, _true))
Example #20
 def _levenshtein_candidates(self, predict_word):
     candidates = list()
     dist = dict()
     for word in self.dict_words:
         dist.update({word: ed.distance(predict_word, word)})
     min_dist = min(dist.items(), key=lambda x: x[1])[1]
     for key, value in dist.items():
         if value == min_dist:
             candidates.append(key)
     return candidates
Example #21
def findWord(parola, insieme):
    """Return (0, parola) if the word is already in the set; otherwise return the
    minimum edit distance and the nearest word (parola = word, insieme = set of candidate words)."""
    if parola in insieme:
        return 0, parola
    minDistance = sys.maxsize
    nearWord = None
    for word in insieme:
        d = distance(parola, word)
        if minDistance > d:
            minDistance = d
            nearWord = word
    return minDistance, nearWord
Example #22
def test(model,
         pe,
         TestSet,
         max_len,
         make_mask,
         class_num,
         converter,
         error_analysis=True):
    model.eval()
    loader = DataLoader(TestSet,
                        batch_size=250,
                        shuffle=False,
                        num_workers=8,
                        pin_memory=True,
                        drop_last=False)
    CCR = 0.0
    n_correct = 0
    start = time.time()
    for images, labels in loader:
        if error_analysis:
            images_norm = []
            for item in images:
                images_norm.append(norm(item))
            images_norm = torch.stack(images_norm, dim=0)
        else:
            images_norm = images
        texts = predict(model, pe, images_norm, max_len, make_mask, class_num,
                        converter)
        for i, (text, label) in enumerate(zip(texts, labels)):
            text = text[:text.find('[s]')]
            if text != label:
                n_correct += 1  # counts mismatched predictions; WCR below is (total - mismatches) / total
                if error_analysis:
                    try:
                        name = label + " " + text + ".jpg"
                        ToPIL(
                            images[i]).save("./error_analysis_TwoDAttention/" +
                                            name)
                    except:
                        pass
            try:
                NED = 1 - editdistance.distance(text, label) / max(
                    len(text), len(label))
                CCR += NED
            except:
                pass
    CCR /= TestSet.len
    CCR = round(100 * CCR, 2)
    WCR = (TestSet.len - n_correct) / TestSet.len
    WCR = round(100 * WCR, 2)
    end = time.time()
    print(f"CCR:{CCR}%;WCR:{WCR}%;time consumed:{time_interval(end-start)}")
    torch.cuda.empty_cache()
    return CCR
Example #23
def tree_insertion(tree, word):
    """Insert `word` into a BK-tree: `tree` is a one-element list holding the root,
    and each node is a pair [node_word, {edit_distance: child_node}]."""
    if len(tree) == 0:
        tree.append([word, {}])
        return
    node = tree[0]
    while node is not None:
        node_word = node[0]
        distance = editdistance.distance(word, node_word)
        parent = node
        node = node[1].get(distance)
        if not node:
            parent[1][distance] = [word, {}]
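Example #1 searches a tree built this way through a bk_search(tree, word, max_dist) helper that is not shown there. A minimal sketch that matches the node layout above (the name and the (distance, word) result format are inferred from the call site, not taken from the original code):

def bk_search(tree, word, max_dist):
    # Return (distance, word) pairs for every stored word within max_dist of `word`.
    if len(tree) == 0:
        return []
    results = []
    stack = [tree[0]]
    while stack:
        node_word, children = stack.pop()
        d = editdistance.distance(word, node_word)
        if d <= max_dist:
            results.append((d, node_word))
        # Only subtrees whose edge label lies in [d - max_dist, d + max_dist] can contain matches.
        for edge, child in children.items():
            if d - max_dist <= edge <= d + max_dist:
                stack.append(child)
    return results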
Example #24
def compute_norm_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> float
    pred_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(pred_tokens, split_token)
    ]
    tgt_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(tgt_tokens, split_token)
    ]
    wer = ed.distance(pred_words, tgt_words) / len(tgt_words)
    return wer
Example #25
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     N = preds.shape[0]
     for ind in range(N):
         pred = [
             _ for _ in preds[ind].tolist() if _ not in self.ignore_tokens
         ]  # x = torch([0,2,1]) -> [2]
         target = [
             _ for _ in targets[ind].tolist() if _ not in self.ignore_tokens
         ]
         distance = editdistance.distance(
             pred, target)  # number of insertions/edits/deletions needed to turn one string into the other
         error = distance / max(len(pred), len(target))
         self.error = self.error + error  # is self.error initialized to 0?
     self.total = self.total + N
Example #26
    def test_swap(self):
        typos_trans.mode = 'swap'
        x = typos_trans.transform(data_sample, n=1)
        self.assertTrue(1 == len(x))

        for sample in x:
            self.assertTrue(sample.get_text('x') != data_sample.get_text('x'))
            self.assertTrue(editdistance.distance(sample.get_text('x'),
                                                data_sample.get_text('x')) <= 4)

        special_sample = SASample({'x': '', 'y': "negative"})
        self.assertEqual([], typos_trans.transform(special_sample))
        special_sample = SASample({'x': '~!@#$%^7890"\'', 'y': "negative"})
        self.assertEqual(1, len(typos_trans.transform(special_sample)))
Example #27
    def dist(self, o):
        import editdistance

        # replace NaN with 0
        # sd = np.where(self.data == self.nanValue, 0, self.data)
        # od = np.where(o.data == self.nanValue, 0, o.data)

        # neutralize NaNs
        sd = self.data.copy()
        od = o.data.copy()
        sd[sd == self.nanValue] = od[sd == self.nanValue]
        od[od == self.nanValue] = sd[od == self.nanValue]

        return editdistance.distance(str(sd), str(od))
Example #28
def find_match(source_word):
	"""Finds the best match for a source word"""

	min_dist = 100
	# min_dist = len(source_word) * 2
	optimal_words = []

	target_file = open('common_words.txt', 'r')

	# FIXME: Runtime of this is O(n^2). Can we improve this?
	for line in target_file:
		target_word = line.rstrip()

		if distance(source_word, target_word) == min_dist:
			# Add this word to the list
			optimal_words.append(target_word)

		if distance(source_word, target_word) < min_dist:
			min_dist = distance(source_word, target_word)
			# re-initialize the list, with only this word as a possible correction
			optimal_words = [target_word]

	return choice(optimal_words)
Example #29
def compute_global_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> Tuple[int, int]
    pred_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(pred_tokens, split_token)
    ]
    tgt_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(tgt_tokens, split_token)
    ]

    dist = ed.distance(pred_words, tgt_words)
    num_refs = len(tgt_words)
    return dist, num_refs
Example #30
def find_string_distances(embeddings, words, phoc_levels):
    '''
    Given `embeddings` (a 2-D array of shape m x n, where m is the number of words and n is
    the embedding size), the array of `words`, and the list of PHOC levels, map each row of
    `embeddings` back to a word (the word length is taken from the corresponding entry of
    `words`), compute the edit distance between each mapped word and the actual word, and
    return the list of distances.
    '''
    mapped_words = []
    for i in range(embeddings.shape[0]):
        mapped_word = predict_word_from_embedding(embeddings[i], phoc_levels,
                                                  len(words[i]))
        mapped_words.append(mapped_word)

    distance_array = [
        ed.distance(words[i], mapped_words[i]) for i in range(len(words))
    ]

    return distance_array