Example No. 1
import re
import editdistance

def maximizeProbabilities(originalWord, words, paperText):
    bestOccurrence = 0
    result = ''
    for word in words:
        # count how often this candidate occurs in the paper text
        # (re.escape guards against regex metacharacters in the candidate)
        occurrences = len(re.findall(re.escape(word), paperText))
        if occurrences > bestOccurrence:
            result = word
            bestOccurrence = occurrences
        elif occurrences == bestOccurrence:
            # tie: prefer the candidate closer to the original word
            if editdistance.eval(originalWord, result) > editdistance.eval(originalWord, word):
                result = word
    return result
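A minimal usage sketch; the sample text and candidate list are hypothetical:

paper = "the quick brown fox jumps over the lazy dog"
print(maximizeProbabilities("teh", ["teh", "the", "thee"], paper))
# -> 'the': it occurs most often in the text; ties are broken by edit distance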
Example No. 2
def parse_grades(text_path):
    rows = []
    
    with open(text_path) as f:
        text = f.read()
        
        pages = [page
                 for page in text.split(PAGE_SEPARATOR)
                 if len(page.strip()) > 0]
        
        for page_index in range(len(pages)):
            page = pages[page_index]
            
            lines = page.splitlines()
            lines = [line.strip() for line in lines]
            lines = [line
                     for line in lines
                     if len(line) > 0]

            year = term = UNDEFINED
            for line in lines:
                tokens = line.split()
                if len(tokens) < 2:
                    continue
                
                match = False
                t0, t1 = tokens[0], tokens[1]
                if editdistance.eval(t0, SPRING) <= 1:
                    term = SPRING
                    match = True
                elif editdistance.eval(t0, FALL) <= 1:
                    term = FALL
                    match = True
                elif editdistance.eval(t0, SUMMER) <= 1:
                    term = SUMMER
                    match = True

                if match:
                    year = parse_int(t1)
                    break

            tallies = [UNDEFINED] * 22
            for i in range(len(lines)):
                if editdistance.eval(lines[i], GRADE_TITLE) <= 2:
                    high_grade_line = lines[i+2]
                    low_grade_line = lines[i+5]
                    tallies = parse_tally_line(high_grade_line) + parse_tally_line(low_grade_line)

            rows.append([page_index + 1, year, term] + tallies)
            
    return rows
Example No. 3
    def findMatches(self, record):
        bkv = record["bkv"]

        results = self.querier.query(bkv=bkv)
        matchingRecords = [record]

        for r in results:
            if int(editdistance.eval(r["surname"], record["surname"])) < 2 \
               and int(editdistance.eval(r["forename"], record["forename"])) < 2 \
               and int(editdistance.eval(r["title"], record["title"])) < 2 \
               and int(editdistance.eval(r["occupation"], record["occupation"])) < 2 \
               and int(editdistance.eval(r["address"], record["address"])) < 2:
                matchingRecords.append(r)

        return matchingRecords
Example No. 4
    def annotate(self, tokens):
        X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
        X_context = self.pretrainer.transform(tokens=tokens)
        
        # get predictions:
        new_in = {}
        if self.include_token:
            new_in['focus_in'] = X_focus
        if self.include_context:
            new_in['context_in'] = X_context
        preds = self.model.predict(new_in)

        if isinstance(preds, np.ndarray):
            preds = [preds]
        
        annotation_dict = {'tokens': tokens}
        if self.include_lemma:
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
            annotation_dict['lemmas'] = pred_lemmas
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                            key=lambda x: editdistance.eval(x, pred_lemmas[i]))
                annotation_dict['postcorrect_lemmas'] = pred_lemmas

        if self.include_pos:
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
            annotation_dict['pos'] = pred_pos
        
        if self.include_morph:
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
            annotation_dict['morph'] = pred_morph

        return annotation_dict
Example No. 5
    def calculateRatioNGram(self, itemChildrenTextFile, path, htmlFileBackgroundKnowledge, nodeBackgroundKnowledge):
        '''
        
        N-Gram distance measure - to be developed 
        n-gram overlap
        https://pythonhosted.org/ngram/index.html
        http://odur.let.rug.nl/~vannoord/TextCat/textcat.pdf
        
        '''
        ratioList = []
    
        if len(itemChildrenTextFile) > 0:

            sumBkn = sum( [ htmlFileBackgroundKnowledge[key]['extractCount'] for key in htmlFileBackgroundKnowledge.keys()])
            extractCount = htmlFileBackgroundKnowledge[path]['extractCount']

            for itemChild in itemChildrenTextFile:
                ratio = []
                for itemBack in nodeBackgroundKnowledge:
                    ratio.append(editdistance.eval(itemChild, itemBack) * 2 / (len(itemChild)+len(itemBack)))
                
                ratioList.append(median(ratio) / len(itemChildrenTextFile))
                
        
        else:
            ratioList.append(0)

        return mean(ratioList)    
Example No. 6
def dist(v1, v2):
    import editdistance
    import math  # math.fabs is used in L1dis below
    def L1dis(l1, l2):
        dic = {}
        for k in l1:
            dic[k[0]] = k[1]
        for k in l2:
            if k[0] in dic:
                dic[k[0]] = math.fabs(dic[k[0]] - k[1])
            else:
                dic[k[0]] = k[1]
        res = 0
        for k in dic:
            res += dic[k]
        return res
    def tokennum(v):
        res = 0
        for l in v:
            res += l[1]
        return res
        
    timedf = (v1[2] - v2[2]) / 86400
    if v1[4] == v2[4]:
        audf = 0
    else:
        audf = 1
    fndf = int(editdistance.eval(v1[3], v2[3]))
    comdf = L1dis(v1[1], v2[1])
    codedf = L1dis(v1[5], v2[5])
    return (str(v1[0]), str(v2[0]), timedf, audf, fndf, int(comdf), int(codedf), tokennum(v1[5]), tokennum(v2[5]))
Example No. 7
    def match_states(self):
        for pref_d, vp_aspth_dict_d in self.trace_state.iteritems():
            print pref_d
            # if pref_d not in self.pref_d_c:
            #     self.pref_d_c[pref_d] = {}
            if pref_d not in self.pref_c_d:
                self.pref_c_d[pref_d] = {}
            best_match_val = sys.maxint
            best_match_lst = []
            # raw_input('...')
            for vp_d, aspath_d in vp_aspth_dict_d.iteritems():
                # if vp_d not in self.pref_d_c[pref_d]:
                #     self.pref_d_c[pref_d][vp_d] = {}

                vp_aspth_dict_c = self.bgp_state[pref_d]
                for vp_c, aspath_c in vp_aspth_dict_c.iteritems():
                    dist = editdistance.eval(aspath_c.split(), aspath_d.split())

                    # self.pref_d_c[pref_d][vp_d][vp_c] = [dist, max(len(aspath_d.split()), len(aspath_c.split()))]

                    if vp_c not in self.pref_c_d[pref_d]:
                        self.pref_c_d[pref_d][vp_c] = {}
                    # if vp_d not in self.pref_c_d[pref_d][vp_c]:
                    #     self.pref_c_d[pref_d][vp_c][vp_d] = {}
                    self.pref_c_d[pref_d][vp_c][vp_d] = [dist, max(len(aspath_d.split()), len(aspath_c.split()))]

                    if dist < best_match_val:
                        best_match_val = dist
                        best_match_lst = [vp_c]
                    elif dist == best_match_val:
                        best_match_lst.append(vp_c)
                    # print vp_d, '%', vp_c, '=>', editdistance.eval(aspath_c.split(), aspath_d.split())
                # print vp_d, '%', best_match_val, '=>', best_match_lst
                """ NetworkX """
Example No. 8
def is_warning(line):
    line = re.sub(r'[,.]$', '', line)
    for base in WARNINGS:
        d = editdistance.eval(base, line)
        if 2 * d < len(base):
            return True
    return False
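A sketch of its behavior with a hypothetical WARNINGS list (a line counts as a warning when its edit distance to some base string is less than half that string's length):

import re
import editdistance

WARNINGS = ["low battery", "disk almost full"]

print(is_warning("lwo batery."))  # True: close to "low battery" after stripping punctuation
print(is_warning("all good"))     # False: no base string is close enough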
Example No. 9
def dist(v1, v2):
    import editdistance
    import math  # math.fabs is used in L1dis below
    def L1dis(l1, l2):
        dic = {}
        for k in l1:
            dic[k[0]] = k[1]
        for k in l2:
            if k[0] in dic:
                dic[k[0]] = math.fabs(dic[k[0]] - k[1])
            else:
                dic[k[0]] = k[1]
        res = 0
        for k in dic:
            res += dic[k]
        return res

    timedf = (v1[2] - v2[2]) / 86400
    if v1[4] == v2[4]:
        audf = 0
    else:
        audf = 1
    fndf = int(editdistance.eval(v1[3], v2[3]))
    comdf = L1dis(v1[1], v2[1])
    codedf = L1dis(v1[5], v2[5])
    return timedf + audf + fndf + comdf + codedf
Example No. 10
def average_similar(vocab, embs, dist):
    print "Starting average of similar words"
    dim = len(embs.values()[0])
    added = 0
    for i,w1 in enumerate(vocab):
        similar = []
        if i%10==0:
            print "\rVocab word", i,
            sys.stdout.flush()
        if w1 in embs:
            continue
        for w2 in embs:
            if w1[:3] == w2[:3] and editdistance.eval(w1, w2) <= dist:
                #same language
                similar.append(w2)
        if len(similar) > 0:
            added += 1
            print "\r{}".format(added),
            sys.stdout.flush()
            v = np.zeros(dim)
            for w2 in similar:
                if len(v) != len(embs[w2]):
                    print "Mismatched dimensions"
                    ipy.embed()
                v += embs[w2]
            v /= len(similar)
            embs[w1] = v
    return embs
Example No. 11
    def analyzing_sender_profile(self):
        len_ordering = {}
        count = 1
        for sender in self.sender_profile.keys():
            len_ordering[sender] = {}
            for ordering in self.sender_profile[sender]:
                num = len(ordering.split(" "))
                if num not in len_ordering[sender].keys():
                    len_ordering[sender][num] = 1
                else:
                    len_ordering[sender][num] += 1
                count += 1
        # key: diff val: number of times I've seen
        count_diff = {}
        for sender, ordering in self.sender_profile.items():
            if len(ordering) > 1:
                one = list(ordering)[0].split(" ")
                two = list(ordering)[1].split(" ")
                count_diff = self.add_dict(count_diff, int(editdistance.eval(one, two)))
        #pprint.pprint(len_ordering)
        pprint.pprint(count_diff)

        for sender, ordering in self.sender_profile.items():
            if len(ordering) > 4:
                print(len(ordering))
                print(self.sender_to_newformat[sender])
                print("===up===")
        for sender, smt in self.sender_to_newformat.items():
            if smt > 1:
                print(smt)
                print(len(self.sender_profile[sender]))
                print("===uppp===")
Example No. 12
def base32_distances(base32_nmers, metric='levenshtein'):
    """
    Get pairwise distances (different metrics)
    This takes a little while
    """
    N = len(base32_nmers)
    total = N*(N-1.0)/2
    print 'Calculating', N*(N-1)/2, 'pairwise distances.'
    d = np.empty(shape=(N, N), dtype=np.float)
    n = 0
    for i in xrange(N):
        for j in xrange(i, N):
            n += 1
            if n%500000 == 0:
                sys.stdout.write('\r'+'%.4f' % (float(n*100)/total)+'%')
                sys.stdout.flush()
            if metric == 'levenshtein':
                dij = editdistance.eval(base32_nmers[i], base32_nmers[j])
            elif metric == 'bespoke':
                dij = bespoke_distance(base32_nmers[i], base32_nmers[j])
            else:
                raise NotImplementedError
            d[i, j] = dij
            d[j, i] = dij
    print ''
    return d
Example No. 13
    def filter_hits(self, hits):
        """
        Solves nasty edge case.

        'Nes 33 1012KC Amsterdam' -> 'Nes 33-H'

        NOT Aert van Nesstraat 33

        """
        new_hits = []

        for hit in hits:
            hit_straat = hit['straatnaam'].lower()
            distance = editdistance.eval(hit_straat, self.straatnaam)

            if not distance:
                # we have an (almost) exact street name match.
                # trumps other matches.
                new_hits.append(hit)

        # no exactly matching street name..
        if not new_hits:
            return hits

        return new_hits
Example No. 14
def getLabels(ontology_file, outputFile):
    # In[29]:

    # ontology_file = 'https://github.com/DataONEorg/sem-prov-ontologies/blob/master/observation/d1-ECSO.owl'

    ontology = ConjunctiveGraph()
    ontology.parse(open(ontology_file), format="nt")

    classes = [ontology.resource(c) for c in ontology[: RDF.type : OWL.Class]]
    classes = [
        c for c in classes if c.identifier.startswith(oboe) or c.identifier.startswith("http://purl.dataone.org/odo/")
    ]

    output = ConjunctiveGraph()

    summary = pd.DataFrame(
        columns=["uri", "label", "resource", "dbpl", "score", "combined", "editdist", "altLabel", "definition"]
    )
    i = 0
    for c in classes:
        i += 1
        label = c.label()
        mentions = extract_mentions(label)
        if len(mentions) == 0:
            continue
        g = get([x for x, score in mentions])
        for uri, score in mentions:
            dbpl = g.label(uri)
            if dbpl is None:
                continue
            editdist = editdistance.eval(label.value.replace("_", " "), dbpl)
            if editdist < 4:
                labels = list(g.objects(uri, skos.altLabel))
                for l in labels:
                    output.add((uri, skos.altLabel, l))

                defn = g.value(uri, RDFS.comment)
                if defn is not None:
                    output.add((uri, skos.definition, defn))
                for label in labels:
                    summary = summary.append(
                        dict(
                            uri=c,
                            label=label,
                            resource=uri,
                            dbpl=dbpl,
                            score=score,
                            combined=score / (0.1 + editdist),
                            editdist=editdist,
                            definition=defn,
                            altLabel=label,
                        ),
                        ignore_index=True,
                    )
                break

    f = open(outputFile + ".nt", "w")
    f.write(output.serialize(format="ntriples"))
    f.close()
    summary.sort("combined", ascending=False).to_csv(outputFile + ".csv", encoding="utf-8")
Example No. 15
    def closest_by_edit_distance(self, x):
        if x in self:
            # Optimization: if x is in multiset, then closest
            # edit dist = 0. Nothing can be any closer.
            return (x, 0)

        # Optimization: If we've looked up this value before, 
        # return previously computed answer.
        cached_answer = self.cache1.get(x)
        if cached_answer:
            return cached_answer
        cached_answer = self.cache2.get(x)
        if cached_answer:
            return cached_answer

        closest = None
        closest_dist = None
        for y,_ in self.most_common():
            d = editdistance.eval(x, y)
            if not closest_dist or d < closest_dist:
                closest = y
                closest_dist = d
                if d == 1:
                    # Optimization: nothing can be any closer, as
                    # we know there's nothing at edit distance 0 (x is not
                    # in the multiset).
                    self.cache1.put(x, (closest, closest_dist))
                    return (closest, closest_dist)

        self.cache2.put(x, (closest, closest_dist))
        return (closest, closest_dist)
Example No. 16
def beng_word(word):
	word=word.lower()
	bengdict = open("./beng_words.txt",'r')
	line = bengdict.readline()
	line = line.split(",")
	for dict_word in line:
		dict_word=dict_word.strip()
		'''
		if(editdistance.eval(phone[i].lower(),word.lower())<=1 or editdistance.eval(key[i].lower(),word.lower())<=1):
			print("1"+key[i]+" "+phone[i])
			bengdict.close()
			return 1
		'''
		if(editdistance.eval(dict_word.lower(),word.lower())<=1):
			bengdict.close()
			return 1

	bengdict.close()
	beng_suff = open("./beng_suffix.csv",'r')
	suff_list = beng_suff.readline().split(",")
	for suff in suff_list:
		suff=suff.strip(" ")
		if ((word.find(suff,2)==(len(word)-len(suff))) and len(word)>len(suff)):
			beng_suff.close()
			return 1
	beng_suff.close()
	return 0
Example No. 17
def phones_for_closest_match(word):
    """Brute force. Look for lowest distance between all words that are in
    the CMU dictionary.

    """
    by_distance = []
    for possibility in pronouncing.pronunciations:

        # levenstein
        distance = editdistance.eval(possibility, word)

        # give a bonus for same first letter / last letter
        if possibility.startswith(word[0]):
            distance -= 1
        if possibility.endswith(word[-1]):
            distance -= 1 

        # break ties with difference in length
        character_difference = abs(len(possibility) - len(word))
        by_distance.append((distance, character_difference, possibility))

    # find the lowest (final tie breaker is alphabetical, oh well)
    d_edit, d_length, suggestion = min(by_distance)

    # return the suggestion and the phones for the suggestion
    return suggestion, pronouncing.phones_for_word(suggestion)
Example No. 18
    def edit_dist(self, anotherVector):
#         intersect_features = set(self.features.keys()) & set(anotherVector.features.keys())
        intersect_features = self.featureSet.intersection(anotherVector.featureSet)                
        intersect_features = [feature for feature in intersect_features if feature not in self.na_metadata ]
        
        file_edit_distance = 0.0
        count = 0
        for feature in intersect_features:
            file1_feature_value = self.featuresText[feature]
            file2_feature_value = anotherVector.featuresText[feature]
            
            divider = (len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
            
            if divider == 0:
                continue
            
            feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/ divider
            file_edit_distance += feature_distance
            count += 1
        
        if count == 0:
            return file_edit_distance
        
        file_edit_distance /= count
        return file_edit_distance
Example No. 19
    def _finalize_cache(self):
        keys = list(self.cache.keys())
        for key in keys:
            orig_password_list = list(self.cache[key])
            del self.cache[key]
            if len(orig_password_list) > 1:
                shp = list(find_shortest_hamiltonian_path_in_complete_graph(orig_password_list, False))
                if len(shp) == 0:
                    continue  # shortest_hamiltonian_path did not return well.

                edit_distances = []
                for a, b in zip(shp, shp[1:]):
                    ed = editdistance.eval(a, b)
                    edit_distances.append(ed)
                    if ed not in self.cache_key_edit_distance_list:
                        self.cache_key_edit_distance_list[ed] = []
                    self.cache_key_edit_distance_list[ed].append((a, b))

                self.cache[key] = {}
                self.cache[key]['password'] = shp
                self.cache[key]['edit_distance'] = [0] + edit_distances
                mean_edit_distance_key = float('{0:.2f}'.format(np.mean(edit_distances)))
                if mean_edit_distance_key not in self.cache_key_edit_distance_keep_user_struct:
                    self.cache_key_edit_distance_keep_user_struct[mean_edit_distance_key] = []
                new_elt = {'password': self.cache[key]['password'],
                           'edit_distance': self.cache[key]['edit_distance'],
                           'email': key}
                self.cache_key_edit_distance_keep_user_struct[mean_edit_distance_key].append(new_elt)
Example No. 20
def fix_ambiguous(ambiguous_sbi):
    """
    For each ambiguous sbi code find to most likely candidate

     0	     vs.id,
     1	     vs.naam,
     2	     codes.hr_code,
     3	     codes.alt_code,
     4	     codes.title,
     5	     codes.alt_title,
     6	     codes.sub_cat,
     7	     codes.alt_sub_cat,
     8	     codes.mks_title

    """
    original_count = 0
    suggestion_count = 0

    for row in ambiguous_sbi:

        normalcode = row[2]
        zerocode = row[3]

        desc1 = row[4]
        desc2 = row[5]
        original = row[8]

        distance_desc1 = editdistance.eval(desc1, original)
        distance_desc2 = editdistance.eval(desc2, original)

        if distance_desc1 > distance_desc2:
            # the alternative match with 0 is better
            suggestion_count += 1
            ves = hrmodels.Vestiging.objects.get(id=row[0])
            invalid_activiteit = ves.activiteiten.get(sbi_code=normalcode)
            # fix the code
            invalid_activiteit.sbi_code = zerocode
            # save the corrected sbi code
            invalid_activiteit.save()
            # now save updated code
        else:
            # do nothing default is fine
            original_count += 1

        log.debug(f'{normalcode}, {zerocode}, {desc1[:18]}, {desc2[:18]}, {original[:18]}, {distance_desc1}, {distance_desc2}')  # noqa

    log.debug("%s-%s = Original-Suggestion", original_count, suggestion_count)
Example No. 21
def compute_score_withjson(json_input_file, outCSV, acceptTypes, allKeys):
    f = open(os.getcwd()+"\\test.json","w")
    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])
        metadata_dict={}
        with open(json_input_file) as inputfile:
            parsedData = json.load(inputfile)
            parsedData = parsedData.get("response").get("docs")
        f.write(str(parsedData))
        for doc in parsedData:
            metadata_dict[doc["id"]]=doc
        files_tuple = itertools.combinations(metadata_dict.keys(), 2)
        for file1, file2 in files_tuple:
            try:
                row_edit_distance = [file1, file2]

                file1_metadata = metadata_dict[file1]
                file2_metadata = metadata_dict[file2]

                intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())

                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(file1_metadata[feature])
                    file2_feature_value = stringify(file2_metadata[feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))

                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    file_edit_distance += len(file1_only_features) + len(file2_only_features)       # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))    #average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)

            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
    return
Example No. 22
    def _edit_distance(self,
                       entity: str,
                       entity_text: List[Token],
                       token: Token,
                       token_index: int,
                       tokens: List[Token]) -> float:
        edit_distance = float(editdistance.eval(' '.join(e.text for e in entity_text), token.text))
        return 1.0 - edit_distance / len(token.text)
Example No. 23
def get_normalized_editscore_words(sent1, sent2):
    dist = editdistance.eval(sent1, sent2)
    # normalize by the longer of the two sequences
    l = max(len(sent1), len(sent2))
    return 1 - dist / (1.0 * l)
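For example, on whitespace-tokenized sentences (a sketch; editdistance here compares the token lists):

print(get_normalized_editscore_words("the cat sat".split(), "the cat sits".split()))
# -> 0.666...: one substituted token out of three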
Example No. 24
def string_sim(n1, n2):
    """Applies Levenshtein distance between strings."""
    if (not n1) or (not n2):
        return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
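A couple of hedged examples (Python 3 true division assumed):

import editdistance

print(string_sim("kitten", "sitting"))  # 1 - 3/7 ≈ 0.571
print(string_sim("", "abc"))            # 0: empty inputs short-circuit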
Example No. 25
def accept_pair(premise, hypothesis):
    # d is presumably a spell-checker dictionary (e.g. enchant.Dict) defined elsewhere
    if not d.check(premise) or not str.isalnum(premise):
        return False
    threshold = min(len(premise), len(hypothesis), 4)
    edit_distance = editdistance.eval(premise, hypothesis)
    if edit_distance < threshold:
        return False
    return True
Example No. 26
def similar_strings(s1, s2, threshold):
    new_s1 = s1
    new_s2 = s2
    # pad the shorter string with spaces so the hamming distance is defined
    if len(s1) < len(s2):
        new_s1 = s1 + ' '.ljust(len(s2) - len(s1))
    elif len(s1) > len(s2):
        new_s2 = s2 + ' '.ljust(len(s1) - len(s2))
    length = len(new_s1)
    hamming = distance.hamming(new_s1, new_s2, normalized=True)
    print "hamming %f, threshold: %f" % (hamming, threshold)
    #if hamming >= threshold:
    #	return True

    print "calculating levenshtein ... length: %d vs %d" % (len(s1), len(s2))
    integer_threshold = 0
    if min(len(s1), len(s2)) > 15000:
        # for very long strings, compare token lists split on ';' and ','
        # instead of raw characters; this is much cheaper
        s1_arr = re.split('[;,]', s1)
        s2_arr = re.split('[;,]', s2)
        print "using fast levenshtein algorithm: s1-len:%d s2-len:%d" \
            % (len(s1_arr), len(s2_arr))
        levenshtein = editdistance.eval(s1_arr, s2_arr)
        integer_threshold = min(len(s1_arr), len(s2_arr)) * (1 - threshold)
    else:
        print "using standard levenshtein algorithm"
        levenshtein = editdistance.eval(s1, s2)
        integer_threshold = min(len(s1), len(s2)) * (1 - threshold)
    print "result levenshtein %d vs threshold %d" % (levenshtein, integer_threshold)
    if levenshtein <= integer_threshold:
        return True
    else:
        return False

    # NOTE: everything below is unreachable; kept from the original source
    print "Done ", str(levenshtein)
    #jaccard = distance.jaccard(s1,s2)
    #print str(jaccard)

    '''
    levenshtein = distance.levenshtein(s1,s2)
    print "levenshtein %d, threshold:%d" % (levenshtein, int(length * threshold))
    if levenshtein < length - length * threshold:
        return True
    '''
    return False
Example No. 27
    def most_similar(self, chat_line):
        closest = (None, None, float("inf"))
        for archive_date, lines in self._by_date.items():
            for line in lines:
                distance = editdistance.eval(line.text, chat_line.text)
                if distance < closest[2]:
                    closest = (archive_date, line, distance)
        return closest
Example No. 28
def edit_dist_less_two(words1, words2):
    sum = 0
    for word1 in words1:
        for word2 in words2:
            if len(word1) > 5 and len(word2) > 5 and editdistance.eval(word1, word2) <= 2 and word1[0] == word2[0]:
                sum += 1
                break
    return sum
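A small sketch of its behavior with hypothetical word lists:

import editdistance

words1 = ["recieve", "short", "protocol"]
words2 = ["receive", "protocols"]
print(edit_dist_less_two(words1, words2))
# -> 2: "recieve"/"receive" and "protocol"/"protocols" match; "short" fails the length check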
Example No. 29
def computeScores(inputDir, outCSV, acceptTypes, allKeys):

    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        filename_list = []

        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        filename_list = [filename for filename in filename_list if parser.from_file(filename)]
        if acceptTypes:
            filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print "Accepting all MIME Types....."

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:

            row_edit_distance = [file1, file2]            

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)
    
            intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())                
            intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

            file_edit_distance = 0.0
            for feature in intersect_features:

                file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                file2_feature_value = stringify(file2_parsedData["metadata"][feature])

                feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                    
                file_edit_distance += feature_distance

            
            if allKeys:
                file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                file_edit_distance += len(file1_only_features) + len(file2_only_features)
                file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

            else:
                file_edit_distance /= float(len(intersect_features))    #average edit distance

            row_edit_distance.append(1-file_edit_distance)
            a.writerow(row_edit_distance)
Example No. 30
def calculate_edit(allKeys):

    with open("csvOutput", "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])
        na_metadata = []
        solr_handle = solr.Solr("http://localhost:8984/solr/polrsolr")
        select = solr.SearchHandler(solr_handle, "/select")
        row_count = 1000
        response = select.__call__(q="*", rows = row_count)
        file_metadata_list = {}
        files_list = []
        for file_data in response.results:
            file_metadata_list[file_data['id']] = file_data
            files_list.append(file_data['id'])

        files_tuple = itertools.combinations(files_list,2)
        for file_1, file_2 in files_tuple:
            try:
                row_edit_distance = [file_1, file_2]
                file1_data = file_metadata_list[file_1]
                file2_data = file_metadata_list[file_2]

                intersect_features = set(file1_data.keys()) & set(file2_data.keys())

                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(str(file1_data[feature]))
                    file2_feature_value = stringify(str(file2_data[feature]))

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))

                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_data.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_data.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    file_edit_distance += len(file1_only_features) + len(file2_only_features)       # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))    #average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)

            except KeyError:
                continue
Example No. 31
def computer_cer(preds, labels):
    dist = sum(editdistance.eval(label, pred) for label, pred in zip(labels, preds))
    total = sum(len(l) for l in labels)
    return dist, total
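A usage sketch for computing a character error rate from the returned pair; the strings are hypothetical:

import editdistance

preds = ["hello wrld", "good morning"]
labels = ["hello world", "good morning"]
dist, total = computer_cer(preds, labels)
print(dist / total)  # 1 edit over 23 reference characters ≈ 0.043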
Example No. 32
def infer():
    tensor_global_step = tf.train.get_or_create_global_step()

    model_infer = args.Model(tensor_global_step,
                             encoder=args.model.encoder.type,
                             decoder=args.model.decoder.type,
                             training=False,
                             args=args)

    dataset_dev = args.dataset_test if args.dataset_test else args.dataset_dev

    saver = tf.train.Saver(max_to_keep=40)
    size_variables()

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    config.log_device_placement = False
    with tf.train.MonitoredTrainingSession(config=config) as sess:
        checkpoint = tf.train.latest_checkpoint(args.dirs.checkpoint_init)
        saver.restore(sess, checkpoint)

        total_cer_dist = 0
        total_cer_len = 0
        total_wer_dist = 0
        total_wer_len = 0
        with open(args.dir_model.name + '_decode.txt', 'w') as fw:
            for i, sample in enumerate(dataset_dev):
                if not sample:
                    continue
                dict_feed = {
                    model_infer.list_pl[0]:
                    np.expand_dims(sample['feature'], axis=0),
                    model_infer.list_pl[1]:
                    np.array([len(sample['feature'])])
                }
                sample_id, shape_batch, _ = sess.run(model_infer.list_run,
                                                     feed_dict=dict_feed)
                # decoded, sample_id, decoded_sparse = sess.run(model_infer.list_run, feed_dict=dict_feed)
                res_txt = array2text(sample_id[0], args.data.unit,
                                     args.idx2token, args.token2idx)
                ref_txt = array2text(sample['label'], args.data.unit,
                                     args.idx2token, args.token2idx)

                list_res_char = list(res_txt)
                list_ref_char = list(ref_txt)
                list_res_word = res_txt.split()
                list_ref_word = ref_txt.split()
                cer_dist = ed.eval(list_res_char, list_ref_char)
                cer_len = len(list_ref_char)
                wer_dist = ed.eval(list_res_word, list_ref_word)
                wer_len = len(list_ref_word)
                total_cer_dist += cer_dist
                total_cer_len += cer_len
                total_wer_dist += wer_dist
                total_wer_len += wer_len
                if cer_len == 0:
                    cer_len = 1000
                    wer_len = 1000
                if wer_dist / wer_len > 0:
                    fw.write('id:\t{} \nres:\t{}\nref:\t{}\n\n'.format(
                        sample['id'], res_txt, ref_txt))
                sys.stdout.write(
                    '\rcurrent cer: {:.3f}, wer: {:.3f};\tall cer {:.3f}, wer: {:.3f} {}/{} {:.2f}%'
                    .format(cer_dist / cer_len, wer_dist / wer_len,
                            total_cer_dist / total_cer_len,
                            total_wer_dist / total_wer_len, i,
                            len(dataset_dev), i / len(dataset_dev) * 100))
                sys.stdout.flush()
        logging.info('dev CER {:.3f}:  WER: {:.3f}'.format(
            total_cer_dist / total_cer_len, total_wer_dist / total_wer_len))
Example No. 33
    data = json.loads(response.read())

    responsep = urllib.urlopen(urlp)
    # note: eval() on a raw HTTP response is unsafe; json.loads would be safer
    rsp = eval(responsep.read())
    print "QTime=", rsp['responseHeader']['QTime']
    print "number of matches=", rsp['response']['numFound']

    #print out the name field for each returned document
    sumx = 0
    mindist = 255 + len(userin)
    pick = ''

    #note: was referencing doc['title'][0]

    for doc in rsp['response']['docs']:
        dist = editdistance.eval(userin, doc['title'])
        if (dist < mindist):
            mindist = dist
            pick = doc['title']

    print "mindist=", mindist
    for doc in rsp['response']['docs']:
        dist = editdistance.eval(userin, doc['title'])
        print 'title field =', doc['title'], " score=", doc[
            'score'], " dist=", dist
        if (dist == mindist): sumx += doc['score']
    # do a roulette wheel selection based on the sum
    rndpoint = random.uniform(0, sumx)

    sumy = 0
    for doc in rsp['response']['docs']:
Example No. 34
def computeScores(inputDir, outCSV, acceptTypes, allKeys):

    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        filename_list = []

        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        if acceptTypes:
            filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print "Accepting all MIME Types....."

        files_tuple = itertools.combinations(filename_list, 2)
        num = 0
        for file1, file2 in files_tuple:
            try:           
                row_edit_distance = [file1, file2]            
                print num 
                fp = open(file1, "r")
                fp2 = open(file2, "r")
                file1_parsedData = {}
                file1_parsedData["metadata"] = json.load(fp)
                file2_parsedData = {}
                file2_parsedData["metadata"] = json.load(fp2)
                
                intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys()) 
                            
                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                    file2_feature_value = stringify(file2_parsedData["metadata"][feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                    
                    file_edit_distance += feature_distance

            
                if allKeys:
                    file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    file_edit_distance += len(file1_only_features) + len(file2_only_features)       # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))    #average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)
                num += 1
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
Example No. 35
def calculate_edit_distance(test_name, train_names_features):
    global edit_distance
    edit_distance[test_name] = {}
    for name in train_names_features:
        edit_distance[test_name][name] = editdistance.eval(test_name, name)
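A sketch, assuming editdistance is imported and the module-level cache dict exists:

edit_distance = {}
calculate_edit_distance("alice", ["alicia", "bob"])
print(edit_distance["alice"])  # {'alicia': 2, 'bob': 5}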
Example No. 36
    def forward(self, model, sample, reduction="sum", log_probs=True):
        """Computes the cross entropy with accuracy metric for the given sample.

        This is similar to CrossEntropyCriterion in fairseq, but also
        computes accuracy metrics as part of logging

        Args:
            logprobs (Torch.tensor) of shape N, T, D i.e.
                batchsize, timesteps, dimensions
            targets (Torch.tensor) of shape N, T  i.e batchsize, timesteps

        Returns:
        tuple: With three elements:
            1) the loss
            2) the sample size, which is used as the denominator for the gradient
            3) logging outputs to display while training

        TODO:
            * Currently this Criterion will only work with LSTMEncoderModels or
            FairseqModels which have decoder, or Models which return TorchTensor
            as net_output.
            We need to make a change to support all FairseqEncoder models.
        """
        net_output = model(**sample["net_input"])
        num_output = net_output["num_output"].int()

        if model.training:
            lprobs, qua_loss, ce_loss = self.compute_loss(
                model, net_output, sample, reduction, log_probs
            )

            nsentences = sample["target"].size(0) + 1.0
            ntokens = sample["ntokens"]
            loss = self.args.lambda_qua * qua_loss * ntokens / nsentences + ce_loss

            sample_size, logging_output = self.get_logging_output(
                sample, lprobs, loss, qua_loss, ce_loss
            )
        else:
            import editdistance

            loss = qua_loss = sample_size = 0.0
            logging_output = {
                "ntokens": sample["ntokens"],
                "nsentences": sample["target"].size(0),
                "sample_size": sample_size
            }
            c_err = 0
            c_len = 0
            with torch.no_grad():
                for logits, l, t in zip(net_output['logits'], num_output, sample["target"]):
                    decoded = logits.argmax(dim=-1)[:l]
                    p = (t != self.task.target_dictionary.pad()) & (
                        t != self.task.target_dictionary.eos()
                    )
                    targ = t[p]
                    targ_units_arr = targ.tolist()
                    pred_units_arr = decoded.tolist()
                    # targ_units_arr = targ.unique_consecutive().tolist()
                    # pred_units_arr = decoded.unique_consecutive().tolist()
                    c_err += editdistance.eval(pred_units_arr, targ_units_arr)
                    c_len += len(targ_units_arr)
                logging_output["c_errors"] = c_err
                logging_output["c_total"] = c_len

        return loss, sample_size, logging_output
Example No. 37
def dist3(word):
    # collect dictionary words at exactly edit distance 3 from the input
    matches = []
    for candidate in Dictionary:
        if editdistance.eval(word, candidate) == 3:
            matches.append(candidate)
    return matches
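A sketch with a hypothetical Dictionary:

import editdistance

Dictionary = ["dog", "cot", "cart"]
print(dist3("cat"))  # ['dog']: exactly three single-character edits away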
Example No. 38
def main():
    if len(sys.argv) != 2:
        print('wrong number of arguments')
        exit(1)

    folder_path = sys.argv[1]
    dirs = [
        dir for dir in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, dir))
    ]

    compiled_set = set()

    for dir in dirs:
        full_dir_path = os.path.join(folder_path, dir)

        print(f'checking: {dir}')

        cpp_list = [x for x in os.listdir(full_dir_path) if '.cpp' in x]

        if not os.path.isfile(os.path.join(full_dir_path, 'test')):
            stdout, stderr, code = compile_file(' '.join(cpp_list),
                                                full_dir_path)

            if code != 0 or len(stdout) != 0 or len(stderr) != 0:
                print('error on compiling!')
                print(stdout, stderr)
            else:
                print('compiled!')
                compiled_set.add(dir)
        else:
            compiled_set.add(dir)
            print('already compiled')

    print('-----check code similarity------')

    similarity = np.zeros((len(dirs), len(dirs)))
    for i in range(len(dirs) - 1):
        for j in range(i + 1, len(dirs)):
            i_dir = dirs[i]
            j_dir = dirs[j]
            i_full_path = os.path.join(folder_path, i_dir)
            j_full_path = os.path.join(folder_path, j_dir)

            cpp_list = [x for x in os.listdir(i_full_path) if '.cpp' in x]
            sum = 0
            for cpp in cpp_list:
                try:
                    i_file = open(os.path.join(i_full_path, cpp), 'r')
                    j_file = open(os.path.join(j_full_path, cpp), 'r')

                    i_content = i_file.read()
                    j_content = j_file.read()

                    sim = editdistance.eval(i_content, j_content)
                    sum = sum + sim

                    i_file.close()
                    j_file.close()
                except:
                    print(
                        'invalid file encoding found, not valid ASCII characters in:'
                    )
                    sum = np.Inf
                    try:

                        i_file = open(os.path.join(i_full_path, cpp), 'r')
                        i_file.read()
                        i_file.close()
                    except:
                        print(
                            f'invalid characters found in file {i_full_path}')

                    try:

                        j_file = open(os.path.join(j_full_path, cpp), 'r')
                        j_file.read()
                        j_file.close()
                    except:
                        print(
                            f'invalid characters found in file {j_full_path}')
                    i_file.close()
                    j_file.close()

            similarity[i][j] = sum
            similarity[j][i] = sum

    for i in range(len(dirs)):
        similarity[i][i] = np.Inf
    print(similarity)

    print('-----done code similarity check------')
    print('-----similarity report-----')
    print('min is:')
    print(np.min(similarity))
    min_arg = np.argmin(similarity)
    x, y = np.unravel_index(min_arg, similarity.shape)
    print(f'index at: ({x}, {y})')
    print(f'student: {dirs[x]}, {dirs[y]}')
    print('second lowest:')
    similarity[x][y] = np.Inf
    similarity[y][x] = np.Inf
    print(np.min(similarity))
    min_arg = np.argmin(similarity)
    x, y = np.unravel_index(min_arg, similarity.shape)
    print(f'index at: ({x}, {y})')
    print(f'student: {dirs[x]}, {dirs[y]}')
    print('-----end of similarity report-----')

    for username in dirs:
        exec_path = os.path.join(folder_path, username, 'test')
        try:
            if test(exec_test_case, exec_path):
                print(f'{username} successfully pass all the test case')
            else:
                print(f'{username} failed the test case')
        except:
            print(
                f'{username} has invalid characters in the program, please double-check'
            )
Example No. 39
def levenshtein_avg(weights, seq1, seq2):
    norm = .5 * (len(seq1) + len(seq2))
    return 1 - (editdistance.eval(seq1, seq2) / norm)
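For instance (note that the weights argument is unused in this snippet):

import editdistance

print(levenshtein_avg(None, "kitten", "sitting"))
# norm = 0.5 * (6 + 7) = 6.5, so the score is 1 - 3/6.5 ≈ 0.538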
Example No. 40
    for test in test_set:
        #bow     = vectorizer.transform(test)
        smatrix = vectorizer.transform(test)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(smatrix)

        for item in items:
            term_match = 0
            matched_on = []
            specimen = utils.init_fields(item['data'])

            for term in vectorizer.vocabulary_:
                for key, value in specimen.iteritems():
                    clean_term = utils.normalize(term)
                    clean_value = utils.normalize(value)
                    dist = editdistance.eval(clean_term, clean_value)

                    if term == value or dist < 5:
                        term_match += 1
                        if key not in matched_on:
                            matched_on.append(key)

        if term_match >= 10:
            print "specimen match"
            print ", ".join(matched_on)

            result = db.epandda_match.insert_one({
                "oid": record['oid'],
                "uuid": item['uuid']
            })
Example No. 41
def calc_str_distance(command, candidate):
    """ Calculates the distance between two strings
        ref: Levenshtein Distance
    """
    return editdistance.eval(command, candidate)
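For example:

import editdistance

print(calc_str_distance("git stauts", "git status"))  # -> 2: two substitutions ("ut" vs "tu")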
Example No. 42
    def forward(self, model, sample, reduction="sum", log_probs=True):
        """Computes the cross entropy with accuracy metric for the given sample.

        This is similar to CrossEntropyCriterion in fairseq, but also
        computes accuracy metrics as part of logging

        Args:
            logprobs (Torch.tensor) of shape N, T, D i.e.
                batchsize, timesteps, dimensions
            targets (Torch.tensor) of shape N, T  i.e batchsize, timesteps

        Returns:
        tuple: With three elements:
            1) the loss
            2) the sample size, which is used as the denominator for the gradient
            3) logging outputs to display while training

        TODO:
            * Currently this Criterion will only work with LSTMEncoderModels or
            FairseqModels which have decoder, or Models which return TorchTensor
            as net_output.
            We need to make a change to support all FairseqEncoder models.
        """
        nsentences = sample["target"].size(0)
        ntokens = sample["ntokens"]

        if model.training:
            net_output = model(**sample["net_input"])
            num_output = torch.round(net_output["num_output"]).int()
            gold_rate = net_output["gold_rate"] if "gold_rate" in net_output else 0.0
            lprobs, ctc_loss, qua_loss, ce_loss = self.compute_loss(
                model, net_output, sample, reduction, log_probs
            )

            e_len = int(sum(abs(sample["target_lengths"].data - num_output.data)))
            loss = ce_loss + \
                   self.args.lambda_qua * qua_loss * ntokens / nsentences + \
                   self.args.lambda_ctc * ctc_loss

            sample_size, logging_output = self.get_logging_output(
                sample, lprobs, e_len, loss, ctc_loss, qua_loss, ce_loss, gold_rate
            )
        else:
            import editdistance

            net_output = model(**sample["net_input"])
            num_output = torch.round(net_output["num_output"]).int()

            loss = sample_size = 0.0
            logging_output = {
                "ntokens": ntokens,
                "nsentences": nsentences,
                "sample_size": sample_size
            }
            c_err = 0
            c_len = 0
            e_len = 0
            with torch.no_grad():
                for i, logits, l, t in zip(range(9999), net_output['logits'], num_output, sample["target"]):
                    # decoded = logits.argmax(dim=-1)[:l]
                    p = t != self.task.target_dictionary.pad()
                    decoded = logits.argmax(dim=-1)[:l]
                    targ = t[p]
                    targ_units_arr = targ.tolist()
                    pred_units_arr = decoded.tolist()
                    c_err += editdistance.eval(pred_units_arr, targ_units_arr)
                    c_len += len(targ_units_arr)
                    e_len += abs(len(targ_units_arr) - len(pred_units_arr)) * 1.0
                logits2sent(pred_units_arr, targ_units_arr, model.tgt_dict, rate=0.03)
                logging_output["c_errors"] = c_err
                logging_output["c_total"] = c_len
                logging_output["e_len"] = e_len

        return loss, sample_size, logging_output
Example No. 43
list_files = os.listdir(parent)

list_files.sort()

division = len(list_files) / 8
for i in xrange(division * 1, division * 2):
    curr_ref = getRef(list_files[i])
    citis.write("********" + list_files[i] + "********\n")
    if curr_ref is None:
        continue
    for each_wiki in list_files:
        if each_wiki is list_files[i]:
            continue
        ref_each_wiki = getRef(each_wiki)
        if ref_each_wiki is None:
            continue
        print_citis = list()
        for each_curr_ref in curr_ref:
            for each_ref_each_wiki in ref_each_wiki:
                if each_curr_ref['year'] == each_ref_each_wiki['year']:
                    edit_dis_authors = editdistance.eval(
                        each_curr_ref['authors'],
                        each_ref_each_wiki['authors'])
                    edit_dis_ref = editdistance.eval(each_curr_ref['ref'],
                                                     each_ref_each_wiki['ref'])
                else:
                    continue
                if edit_dis_authors < 6 or edit_dis_ref < 6:
                    print_citis.append(each_wiki)
                    # less6.write(str(each_curr_ref)+"\n"+str(each_ref_each_wiki)+"\n"+"{"+str(int(edit_dis_authors))+","+str(int(edit_dis_ref))+"}\n")
    citis.write(str(set(print_citis)) + "\n")
Example No. 44
    def __eq__(self, other):
        # two instances compare equal when their strings are within edit distance 5
        return isinstance(other, stringa) and editdistance.eval(self.stringa, other.stringa) < 5
Example No. 45
def get_max_edit_dist(target):
    # sample edit distances between the target and 1000 random sequences of the
    # same length (rand_seq is assumed to be defined elsewhere)
    dists = [
        editdistance.eval(target, rand_seq(len(target))) for _ in xrange(1000)
    ]
    # note: np.percentile(dists, 0.5) is the 0.5th percentile, not the median
    return min(10, np.percentile(dists, 0.5))
Example No. 46
def levenshtein_max(weights, seq1, seq2):
    norm = 1.0 * max(len(seq1), len(seq2))
    return 1 - (editdistance.eval(seq1, seq2) / norm)
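Compared with the averaged variant above, dividing by the longer sequence keeps the score in [0, 1], whereas the averaged norm can go negative for very unequal lengths (e.g. against an empty string). A quick sketch:

import editdistance

print(levenshtein_max(None, "kitten", "sitting"))
# norm = max(6, 7) = 7, so the score is 1 - 3/7 ≈ 0.571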
Example No. 47
			print(f"Exception: {e.get_code()}")
			print(src +'\t'+ trg)
			continue
		try:
			lang2 = detect(trg)
		except Exception as e:
			print(f"Exception: {e.get_code()}")
			print(src +'\t'+ trg)
			continue
		if lang1 != 'en' or lang2 != 'en':
			print("NOT ENGLISH:\t" + src +'\t'+ trg)
			continue
		zh_tmp = re.sub(r"[\s,\.]", "", src)
		en_tmp = re.sub(r"[\s,\.]", "", trg)
		min_len = min(len(zh_tmp), len(en_tmp))
		dist = editdistance.eval(zh_tmp.lower(), en_tmp.lower())
		ratio = dist * 1.0 / min_len
		if ratio < 0.4:
			print(f"OVERLAP {ratio}:\t" + src +'\t'+ trg)
			continue
		if len(src) > len(trg):
			long_txt = src
			shrt_txt = trg
			len_short = len(trg)
		else:
			long_txt = trg
			shrt_txt = src
			len_short = len(src)
		segment1 = long_txt[:len_short]
		segment2 = long_txt[-len_short:]
		dist1 = editdistance.eval(segment1.lower(), shrt_txt.lower())
Exemplo n.º 48
0
    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Args:
            xs_pad (torch.Tensor): batch of padded input sequences (B, Tmax, idim)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

        Returns:
            loss (torch.Tensor): transducer loss value

        """
        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
            hs_pad, hlens = self.feature_transform(hs_pad, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. encoder
        hs_pad, hlens, _ = self.enc(hs_pad, hlens)

        # 2. decoder
        loss = self.dec(hs_pad, hlens, ys_pad)

        # 3. compute cer/wer
        # note: only recommended for debugging right now,
        # as it heavily increases training time.
        if self.training or not (self.report_cer or self.report_wer):
            cer, wer = 0.0, 0.0
        else:
            word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []

            batchsize = int(hs_pad.size(0))
            batch_nbest = []

            for b in six.moves.range(batchsize):
                if self.beam_size == 1:
                    nbest_hyps = self.dec.recognize(hs_pad[b], self.recog_args)
                else:
                    nbest_hyps = self.dec.recognize_beam(hs_pad[b], self.recog_args)

                batch_nbest.append(nbest_hyps)

            y_hats = [nbest_hyp[0]['yseq'][1:] for nbest_hyp in batch_nbest]

            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                seq_hat = [self.char_list[int(idx)] for idx in y_hat]
                seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
                seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, ' ')
                seq_true_text = "".join(seq_true).replace(self.recog_args.space, ' ')

                hyp_words = seq_hat_text.split()
                ref_words = seq_true_text.split()
                word_eds.append(editdistance.eval(hyp_words, ref_words))
                word_ref_lens.append(len(ref_words))

                hyp_chars = seq_hat_text.replace(' ', '')
                ref_chars = seq_true_text.replace(' ', '')
                char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))
            wer = 0.0 if not self.report_wer else float(sum(word_eds)) / sum(word_ref_lens)
            cer = 0.0 if not self.report_cer else float(sum(char_eds)) / sum(char_ref_lens)

        self.loss = loss
        loss_data = float(self.loss)

        if not math.isnan(loss_data):
            self.reporter.report(loss_data, cer, wer)
        else:
            logging.warning('loss (=%f) is not correct', loss_data)

        return self.loss
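
The CER/WER bookkeeping above can be exercised in isolation; a minimal sketch with hard-coded hypothesis/reference texts standing in for the decoder output:

import editdistance

hyps = ["a cat sat on the mat", "hello wrld"]
refs = ["the cat sat on the mat", "hello world"]

word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []
for hyp_text, ref_text in zip(hyps, refs):
    word_eds.append(editdistance.eval(hyp_text.split(), ref_text.split()))
    word_ref_lens.append(len(ref_text.split()))
    char_eds.append(editdistance.eval(hyp_text.replace(' ', ''), ref_text.replace(' ', '')))
    char_ref_lens.append(len(ref_text.replace(' ', '')))

wer = float(sum(word_eds)) / sum(word_ref_lens)
cer = float(sum(char_eds)) / sum(char_ref_lens)
print(wer, cer)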
Exemplo n.º 49
0
def score(true, pred, iou_threshold=0.5, similarity_threshold=0.5, translator=None):
    """
    Args:
        true: The ground truth boxes provided as a dictionary of {image_id: annotations}
            mappings. `annotations` should be lists of dicts with a `text` and `vertices` key.
            `vertices` should be a list of (x, y) coordinates. Optionally, an "ignore" key can be
            added to indicate that detecting an annotation should neither count as a false positive
            nor should failure to detect it count as a false negative.
        pred: The predicted boxes in the same format as `true`.
        iou_threshold: The minimum IoU to qualify a box as a match.
        similarity_threshold: The minimum text similarity required to qualify
            a text string as a match.
        translator: A translator acceptable by `str.translate`. Used to
            modify ground truth / predicted strings. For example,
            `str.maketrans(string.ascii_uppercase, string.ascii_lowercase,
            string.punctuation)` would yield a translator that changes all
            strings to lowercase and removes punctuation.

    Returns:
        A results dictionary reporting false positives, false negatives, true positives
        and near matches (IoU > iou_threshold but similarity < similarity_threshold) along
        with the computed precision and recall.
    """
    true_ids = sorted(true)
    pred_ids = sorted(pred)
    assert all(
        true_id == pred_id for true_id, pred_id in zip(true_ids, pred_ids)
    ), "true and pred dictionaries must have the same keys"
    results: typing.Dict[str, typing.List[dict]] = {
        "true_positives": [],
        "false_positives": [],
        "near_true_positives": [],
        "false_negatives": [],
    }
    for image_id in true_ids:
        true_anns = true[image_id]
        pred_anns = copy.deepcopy(pred[image_id])
        pred_matched = set()
        for true_index, true_ann in enumerate(true_anns):
            match = None
            for pred_index, pred_ann in enumerate(pred_anns):
                iou = iou_score(true_ann["vertices"], pred_ann["vertices"])
                if iou >= iou_threshold:
                    match = {
                        "true_idx": true_index,
                        "pred_idx": pred_index,
                        "image_id": image_id,
                    }
                    pred_matched.add(pred_index)
                    true_text = true_ann["text"]
                    pred_text = pred_ann["text"]
                    if true_ann.get("ignore", False):
                        # We recorded that this prediction matched something,
                        # so it won't be a false positive. But we're also ignoring
                        # this ground truth label so we won't count it as a true
                        # positive or a near true positive.
                        continue
                    if translator is not None:
                        true_text = true_text.translate(translator)
                        pred_text = pred_text.translate(translator)
                    edit_distance_norm = max(len(true_text), len(pred_text))
                    if edit_distance_norm == 0:
                        similarity = 1
                    else:
                        similarity = 1 - (
                            editdistance.eval(true_text, pred_text)
                            / max(len(true_text), len(pred_text))
                        )
                    if similarity >= similarity_threshold:
                        results["true_positives"].append(match)
                    else:
                        results["near_true_positives"].append(match)
            if match is None and not true_ann.get("ignore", False):
                results["false_negatives"].append(
                    {"image_id": image_id, "true_idx": true_index}
                )
        results["false_positives"].extend(
            {"pred_index": pred_index, "image_id": image_id}
            for pred_index, _ in enumerate(pred_anns)
            if pred_index not in pred_matched
        )
    fns = len(results["false_negatives"])
    fps = len(results["false_positives"])
    tps = len(
        set(
            (true_positive["image_id"], true_positive["true_idx"])
            for true_positive in results["true_positives"]
        )
    )
    precision = tps / (tps + fps)
    recall = tps / (tps + fns)
    return results, (precision, recall)
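
A toy usage sketch, assuming the surrounding module also provides the `iou_score` helper used above:

true = {
    "img1": [{"text": "stop", "vertices": [(0, 0), (10, 0), (10, 10), (0, 10)]}],
}
pred = {
    "img1": [{"text": "stop", "vertices": [(0, 0), (10, 0), (10, 10), (0, 10)]}],
}
results, (precision, recall) = score(true, pred)
print(precision, recall)  # 1.0 1.0 for this perfect match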
Exemplo n.º 50
0
 def cer(predict, truth):
     # mean character error rate: edit distance normalized by reference length
     cer = [1.0*editdistance.eval(p[0], p[1])/len(p[1]) for p in zip(predict, truth)]
     return np.array(cer).mean()
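
A quick usage sketch, treating cer as a standalone function (assuming `editdistance` and `numpy` are imported):

preds = ["helo world", "tset"]
refs = ["hello world", "test"]
print(cer(preds, refs))  # (1/11 + 2/4) / 2 ≈ 0.295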
                 
Exemplo n.º 51
0
def edit_cer_from_list(truth, pred):
    # total (unnormalized) edit distance over aligned truth/pred pairs
    edit = 0
    for t, p in zip(truth, pred):
        edit += editdistance.eval(t, p)
    return edit
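
The same function as a one-liner, for comparison:

def edit_cer_from_list(truth, pred):
    return sum(editdistance.eval(t, p) for t, p in zip(truth, pred))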
Exemplo n.º 52
0
    obo2dist = {}

    allDiseaseWords = actDisease.split(' ')

    for oboname in oboname2node:

        # require at least one longer (> 4 chars) disease word to occur in the
        # OBO name before paying for a full edit-distance computation
        foundOne = False
        for word in allDiseaseWords:
            if len(word) > 4 and word in oboname:
                foundOne = True
                break

        if not foundOne:
            continue

        dist = editdistance.eval(oboname, actDisease)
        obo2dist[oboname] = dist

    if len(obo2dist) == 0:
        continue

    bestDistances = sorted(obo2dist.items(), key=lambda x: x[1])

    print(disease, bestDistances[0])

    if len(disease) < 8 and actDisease not in bestDistances[0][0]:
        continue

    # add the OBO node with the best (lowest) distance
    disease2obo[disease].add(oboname2node[bestDistances[0][0]].id)
Exemplo n.º 53
0
    index2 = blend2.find('\t')
    blend3 = blend2[index2+1:len(blend2)]
    blend2 = blend2[0:index2]
    #print(blend1, blend2, blend3)
    for candi in candis:
        candi = candi.strip()
        if blend1 == candi:
            recall_list[i] = blend1
            i += 1
    # split the blend at its first vowel and compare each half against the
    # corresponding part of the source word
    pos = find_vowel(blend1)
    fpc = first_part(blend1, pos)
    fpd = front_part(blend2, pos)
    spc = second_part(blend1, pos)
    epd = end_part(blend2, len(blend2) - len(blend1) + pos)
    ed_list1[order] = editdistance.eval(fpc, fpd)
    ged_list1[order] = ged(fpc, fpd)
    ngarm_list1[order] = NGram(fpc, fpd)
    ed_list2[order] = editdistance.eval(spc, epd)
    #ged_list2[order] = ged(spc, epd)
    ngarm_list2[order] = NGram(spc, epd)
    order += 1
    
    

print("for ed_list1")  
analyze(ed_list1)
print("for ed_list2")  
analyze(ed_list2)
print("for ged_list1")  
analyze(ged_list1)
def main():
    start = time.time()
    tweets = list(get_tweets())  # get tweets

    movies = json.load(codecs.open('./data/imdb_titles.json', 'r', encoding='utf-8'))  # get movie titles
    movie_titles = list(movies.keys())

    # comment out to use entire IMDB
    # movies = list(get_imdb_movies())
    # movie_titles = list(map(lambda _m: _m['originalTitle'], movies))

    # movie titles without the episode part (text before the first '-')
    movie_titles_without_eps = filter(lambda _title: '-' in _title, movie_titles)
    movie_titles_without_eps = map(lambda _title: _title.split('-')[0].strip(), movie_titles_without_eps)
    movie_titles_without_eps = list(movie_titles_without_eps)

    # short title
    short_movie_titles = filter(lambda _title: ':' in _title, movie_titles)
    short_movie_titles = map(lambda _title: _title.split(':')[0].strip(), short_movie_titles)
    short_movie_titles = list(short_movie_titles)

    movies = list(set(movie_titles + movie_titles_without_eps + short_movie_titles))

    result = {}

    # for each tweet, find closest movie title
    for tweet in tweets:
        tweet_clean_text = clean_text(tweet['text'])

        if not tweet_clean_text.strip():
            continue

        closest = tweet_clean_text
        closest_dist = sys.maxsize
        closest_dist_word = sys.maxsize

        for movie in movies:
            movie_title = clean_text(movie)

            # calculate edit distance

            dist = editdistance.eval(tweet_clean_text, movie_title)
            # split(' ') to measure edit distance using words instead of characters
            dist_word = editdistance.eval(tweet_clean_text.split(' '), movie_title.split(' '))

            # keep the candidate with the fewest word edits, breaking ties by
            # character edit distance
            if dist_word < closest_dist_word or (
                    dist_word == closest_dist_word and dist < closest_dist):
                closest_dist = dist
                closest_dist_word = dist_word
                closest = movie_title

        # at most 3 words difference and diff must be less than # of words in the tweet
        if closest_dist_word <= 3 and closest_dist_word < len(tweet_clean_text.split(' ')):
            print(tweet_clean_text, '\t', closest, '\t', closest_dist)
            result[tweet['id']] = {
                'tweet': tweet['text'],
                'tweet_clean': tweet_clean_text,
                'matched_title': closest,
                'dist_char': closest_dist,
                'dist_word': closest_dist_word
            }

    end = time.time()

    json.dump(result, codecs.open("./data/mapped_titles.json", "w", encoding="utf-8"), indent=4)

    print(end - start)
Exemplo n.º 55
0
    def features_for_rank(self, proc, results):
        """Compute features for ranking results from ES/geonames


        Parameters
        ----------
        proc : dict
            One dictionary from the list that comes back from geoparse or from make_country_features (doesn't matter)
        results : dict
            the response from a geonames query

        Returns
        --------
        X : numpy matrix
            holding the computed features

        meta: list of dicts
            including feature information
        """
        feature_list = []
        meta = []
        results = results['hits']['hits']
        search_name = proc['word']
        code_mention = proc['features']['code_mention']
        class_mention = proc['features']['class_mention']

        for rank, entry in enumerate(results):
            # go through the results and calculate some features
            # get population number and exists
            try:
                pop = int(entry['population'])
                has_pop = 1
            except Exception:
                pop = 0
                has_pop = 0
            if pop > 0:
                logp = np.log(pop)
            else:
                logp = 0
            ### the order in which the results came back
            adj_rank = 1 / np.log(rank + 2)
            # alternative names (log1p avoids -inf when the list is empty)
            len_alt = len(entry['alternativenames'])
            adj_alt = np.log1p(len_alt)
            ### feature class (just boost the good ones)
            if entry['feature_class'] == "A" or entry['feature_class'] == "P":
                good_type = 1
            else:
                good_type = 0
            ### feature class/code matching
            if entry['feature_class'] == class_mention:
                good_class_mention = 1
            else:
                good_class_mention = 0
            if entry['feature_code'] == code_mention:
                good_code_mention = 1
            else:
                good_code_mention = 0
            ### edit distance between the query name and the candidate's name
            ed = editdistance.eval(search_name, entry['name'])
            # maybe also get min edit distance to alternative names...

            features = [
                has_pop, pop, logp, adj_rank, len_alt, adj_alt, good_type,
                good_class_mention, good_code_mention, ed
            ]
            m = self.format_geonames(entry)

            feature_list.append(features)
            meta.append(m)

        X = np.asmatrix(feature_list)
        return (X, meta)
Exemplo n.º 56
0
def normalized_levenshtein(str_a, str_b):
    '''
    Edit distance normalized to [0, 1] by the length of str_b;
    the 1e-16 term guards against division by zero.
    '''
    return min(editdistance.eval(str_a, str_b) / (len(str_b) + 1e-16), 1.0)
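
A quick usage sketch; note the normalization is by len(str_b) only, so the function is not symmetric in its arguments:

import editdistance

print(normalized_levenshtein("kitten", "sitting"))  # 3 / 7 ≈ 0.429
print(normalized_levenshtein("abc", ""))            # clamped to 1.0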
Exemplo n.º 57
0
# birkbeck_predict_abs_path = os.path.join(script_dir, birkbeck_predict_rel_path)

# read dict
with open(dict_abs_path, "r") as f:
    dictionary_list = f.read().splitlines()

# birkbeck
with open(birkbeck_misspell_abs_path, "r") as fr_misspell:
    with open(birkbeck_correct_abs_path, "r") as fr_correct:
        birkbeck_misspell = fr_misspell.read().splitlines()
        birkbeck_correct = fr_correct.read().splitlines()
        for i in range(0, len(birkbeck_misspell)):
            birkbeck_predict = []
            dist = math.inf
            for each in dictionary_list:
                temp = editdistance.eval(birkbeck_misspell[i], each)
                if temp < dist:
                    # strictly closer: restart the candidate list
                    dist = temp
                    birkbeck_predict = [each]
                elif temp == dist:
                    # tie: collect every word at the same distance
                    if each not in birkbeck_predict:
                        birkbeck_predict.append(each)
            birkbeck_predict_new = []
            dist_ngram = -math.inf
            for predict in birkbeck_predict:
                temp_ngram = ngram.NGram.compare(predict,
                                                 birkbeck_misspell[i],
                                                 N=2)
                if temp_ngram > dist_ngram:
                    dist_ngram = temp_ngram
                    birkbeck_predict_new = [predict]
Exemplo n.º 58
0
def compute_score2(json_input_list, outCSV, acceptTypes, allKeys):
    na_metadata = ["resourceName"]
    with open(outCSV, "w", newline="") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        json_list = []
        for each in json_input_list:
            with open(each) as json_input_file:
                json_list.extend(json.load(json_input_file))
        # each object in json_list contains a key as file name and a value: as metadata JSON object

        metadata_dict = {}
        for entry in json_list:
            key = next(iter(entry))  # entry.keys()[0] works only in Python 2
            metadata_dict[key] = entry[key]

        files_tuple = itertools.combinations(metadata_dict.keys(), 2)
        for file1, file2 in files_tuple:
            try:
                row_edit_distance = [file1, file2]

                file1_metadata = metadata_dict[file1]
                file2_metadata = metadata_dict[file2]

                intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())

                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(file1_metadata[feature])
                    file2_feature_value = stringify(file2_metadata[feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        # edit distance normalized by the longer of the two values
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / \
                            max(len(file1_feature_value), len(file2_feature_value))

                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    # each feature present in only one file contributes the
                    # maximal per-feature distance of 1
                    file_edit_distance += len(file1_only_features) + len(file2_only_features)
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))  # average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)

            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
    return
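
A hedged usage sketch; the file names are hypothetical, each JSON file is expected to hold a list of {filename: metadata} objects, and `acceptTypes` is unused in the body shown:

compute_score2(["meta_part1.json", "meta_part2.json"],
               "similarity_scores.csv",
               acceptTypes=None,
               allKeys=True)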
Exemplo n.º 59
0
    pickle.dump(plots_tasks_year_after,
                handle,
                protocol=pickle.HIGHEST_PROTOCOL)


def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3


intersecting_tasks = intersection(list(plots_tasks_year_after.keys()),
                                  list(plots_tasks_year_before.keys()))

import editdistance, pickle

similar_tasks = {}
for elem1 in intersecting_tasks:
    elems = []
    for elem2 in intersecting_tasks:
        # a normalized edit distance below 0.3 counts as "similar"
        if (editdistance.eval(elem1, elem2) / max(len(elem1), len(elem2)) <
                0.3):
            if (elem1 != elem2):
                elems.append(elem2)
    if (len(elems) > 0):
        similar_tasks[elem1] = elems

with open(
        '/mnt/c/Users/D.MONDAL/Downloads/tdm-kg-new/tdm-kg/ACLAnthTill2015_Prediction/similar_tasks.pickle',
        'wb') as handle:
    pickle.dump(similar_tasks, handle, protocol=pickle.HIGHEST_PROTOCOL)
Exemplo n.º 60
0
 def wer(predict, truth):
     # word error rate: split on spaces, then edit distance over word lists,
     # normalized by the reference word count
     word_pairs = [(p[0].split(' '), p[1].split(' ')) for p in zip(predict, truth)]
     wer = [1.0*editdistance.eval(p[0], p[1])/len(p[1]) for p in word_pairs]
     return np.array(wer).mean()
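
A usage sketch contrasting word-level WER with the character-level CER above, treating wer as a standalone function (assuming `editdistance` and `numpy` are imported):

preds = ["the cat sat", "hello there"]
refs = ["the cat sat", "hello world"]
print(wer(preds, refs))  # (0/3 + 1/2) / 2 = 0.25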