Example #1
def changes(text):
    stop_words = get_stop_words('russian')
    # split the text into phrases, treating stop words as separators;
    # phrases are space-joined to match the normalized candidates below
    phrase = []
    text_lst = []
    for w in text.lower().split():
        if w not in stop_words:
            phrase.append(w)
        elif phrase:
            text_lst.append(' '.join(phrase))
            phrase = []
    if phrase:
        text_lst.append(' '.join(phrase))

    # for each phrase, keep the closest candidate in rc.class3 by Hamming distance
    result_word = []
    for elem in text_lst:
        word = ''
        min_dist = float('inf')
        for s in rc.class3:
            s = ' '.join([w for w in s.lower().split() if w not in stop_words])

            dist3 = textdistance.hamming(s, elem)
            if dist3 < min_dist:
                min_dist = dist3
                word = s
        result_word.append(word)

    return result_word
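A minimal, self-contained sketch of the matching step above. `get_stop_words` and `rc.class3` come from the surrounding project, so the candidate list here is hypothetical:

import textdistance

# hypothetical stand-in for rc.class3
candidates = ['red square', 'river port', 'central park']

def nearest_by_hamming(phrase, candidates):
    # pick the candidate with the fewest mismatched positions
    return min(candidates, key=lambda c: textdistance.hamming(c, phrase))

print(nearest_by_hamming('red sqare', candidates))  # -> 'red square'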
Example #2
def execute(stemmed_text, lemmatized_text):
    try:

        lemmatized_words = word_tokenize(lemmatized_text)
        stemmed_words = word_tokenize(stemmed_text)

        final_text = []

        # iterate over all stems
        for stem in stemmed_words:

            # cap: a stem further than 10 from every lemma keeps current_word = ''
            min_distance = 10
            current_word = ''

            # for each stem, calculate the distance to every lemma
            for lemma in lemmatized_words:
                distance = textdistance.hamming(stem, lemma)

                # only keep the lemma with the minimum distance
                if distance < min_distance:
                    min_distance = distance
                    current_word = lemma

            final_text.append(current_word)

        result = ' '.join(final_text)

        return result
    except Exception as error:
        raise Exception('[text_distance] - error {0}'.format(error))
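A rough usage sketch of the same nearest-lemma step, reduced to `str.split` so it runs without NLTK's tokenizer data:

import textdistance

stems = ['text', 'runn']
lemmas = ['test', 'running', 'taxi']

for stem in stems:
    # pick the lemma with the smallest Hamming distance to this stem
    best = min(lemmas, key=lambda lemma: textdistance.hamming(stem, lemma))
    print(stem, '->', best)  # text -> test, runn -> running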
Example #3
def checkImageHashes():
    # gets the keys and values from the hashes dictionary, in order
    items = sorted(hashes.items())
    # print("Results:") - this is for checking the images
    # iterates over each pair of consecutive entries
    for cur, nxt in zip(items, items[1:]):

        text1 = cur[1]  # the hash string of the current image
        text2 = nxt[1]  # the hash string of the next image
        # finds the variation between the image hashes using the Hamming algorithm
        answer = textdistance.hamming(text1, text2)

        # if the Hamming difference between the 32-character strings is at most
        # 30, treat the image as a duplicate and delete the current image,
        # keeping the subsequent image

        if 0 <= answer <= 30:
            # log the files that have been deleted
            # (the with block closes the file automatically)
            with open("duplicates.txt", "a") as f:
                f.write("File: " + str(cur[0]) + ", Deviation: " +
                        str(answer) + "\n")
            # create filepath and delete the file permanently
            # (os.remove bypasses the recycle bin)
            toDelete = os.path.join(directory, cur[0])
            os.remove(toDelete)
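`hashes` and `directory` are module-level globals in the original. The comparison itself, on two made-up 32-character hash strings:

import textdistance

# hypothetical hash strings; the originals are 32 characters long
hash_a = 'ffd8a1c0' * 4
hash_b = 'ffd8a1c0' * 3 + 'ffd8a1c1'

deviation = textdistance.hamming(hash_a, hash_b)
print('duplicate' if deviation <= 30 else 'distinct', deviation)  # duplicate 1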
Example #4
def suggest_symbols(game_id: int, user_id: int, text: str, buy_or_sell: str):
    if buy_or_sell == "buy":
        to_match = f"{text.upper()}%"
        symbol_suggestions = query_to_dict(
            """
                SELECT * FROM symbols
                WHERE symbol LIKE %s OR name LIKE %s;""", to_match, to_match)

    elif buy_or_sell == "sell":
        balances = get_active_balances(game_id, user_id)
        symbols = list(balances["symbol"].unique())
        to_match = f"{text.upper()}%"
        params_list = [to_match] * 2 + symbols
        symbol_suggestions = query_to_dict(
            f"""
            SELECT * FROM symbols
            WHERE (symbol LIKE %s OR name LIKE %s) AND symbol IN ({','.join(['%s'] * len(symbols))});""",
            params_list)
    else:
        # guard against an unbound symbol_suggestions below
        raise ValueError(f"buy_or_sell must be 'buy' or 'sell', got {buy_or_sell!r}")

    suggestions = [{
        "symbol": entry["symbol"],
        "label": f"{entry['symbol']} ({entry['name']})",
        "dist": hamming(text, entry['symbol'])
    } for entry in symbol_suggestions]
    # sort suggestions by hamming distance between text and ticker entry
    return sorted(suggestions, key=lambda i: i["dist"])
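The ranking step in isolation, with hypothetical rows in place of the query results:

from textdistance import hamming

text = 'APL'
entries = [{'symbol': 'AAPL'}, {'symbol': 'APA'}, {'symbol': 'APLE'}]
# smaller Hamming distance to the typed text ranks first
ranked = sorted(entries, key=lambda e: hamming(text, e['symbol']))
print([e['symbol'] for e in ranked])  # ['APA', 'APLE', 'AAPL']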
Example #5
def check_dist(filename, result):
    """
    :param filename: (str) path of the query file
    :param result: (list) documents from the corpus that are similar to the query file
    :return: lists of the Jaccard, edit, Hamming, and cosine distances between
             the query file and each document in result
    """
    hamming_dist = []
    jaccard_dist = []
    edit_dist = []
    cosine_dist = []
    file, content = utils.tokenize_file(filename)
    query_cont = set(content)

    for each_result in result:
        file1, content1 = utils.tokenize_file(each_result)
        doc_cont = set(content1)
        jac = nltk.jaccard_distance(query_cont, doc_cont)
        edit_dis = nltk.edit_distance(content, content1)
        hamming = textdistance.hamming(content, content1)
        cos = textdistance.cosine(content, content1)
        cosine_dist.append(cos)
        jaccard_dist.append(jac)
        edit_dist.append(edit_dis)
        hamming_dist.append(hamming)

    return jaccard_dist, edit_dist, hamming_dist, cosine_dist
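The same metrics on one pair of token lists, without the file I/O (`utils.tokenize_file` belongs to the surrounding project; this sketch uses textdistance for all four):

import textdistance

query = ['the', 'quick', 'brown', 'fox']
doc = ['the', 'quick', 'red', 'fox']

print(textdistance.hamming(query, doc))      # 1 mismatched position
print(textdistance.levenshtein(query, doc))  # 1 edit
print(textdistance.jaccard(query, doc))      # overlap of the token multisets
print(textdistance.cosine(query, doc))       # cosine over token counts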
Example #6
def hamming_distance(x, y):
    """Calculate the hamming distance (number of bits different) between the
    two integers given.

    >>> [hamming_distance(x, 15) for x in [0, 8, 10, 12, 14, 15]]
    [4, 3, 2, 2, 1, 0]
    """
    # textdistance compares sequences, not ints, so compare the integers as
    # equal-width binary strings
    width = max(x.bit_length(), y.bit_length(), 1)
    return textdistance.hamming(format(x, '0%db' % width),
                                format(y, '0%db' % width))
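A quick check of the docstring example (assuming the bit-width conversion above); the doctest can also be run with `python -m doctest`:

assert [hamming_distance(x, 15) for x in [0, 8, 10, 12, 14, 15]] == [4, 3, 2, 2, 1, 0]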
Example #7
def helpful_substation_lookup(self, substation: str):
    if substation in self.graph.nodes:
        return substation, self.graph.nodes(data=True)[substation]
    else:
        close_matches = [
            name for name in list(self.graph.nodes)
            if textdistance.hamming(substation, name) <= 1
        ]
        helper_message = " Did you mean: {}?".format(
            close_matches) if len(close_matches) > 0 else ""
        raise KeyError("Did not find substation '{}'.{}".format(
            substation, helper_message))
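The suggestion logic on its own, with a throwaway node list in place of the graph:

import textdistance

nodes = ['SUB_A1', 'SUB_B2', 'SUB_C3']
query = 'SUB_A2'
# names within one mismatched character of the query
close_matches = [n for n in nodes if textdistance.hamming(query, n) <= 1]
print(close_matches)  # ['SUB_A1', 'SUB_B2']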
Example #8
def define_station(text):
    stop_words = get_stop_words('russian')
    # normalize the query: lowercase and drop stop words
    text = ' '.join([w for w in text.lower().split() if w not in stop_words])
    min_dist = float('inf')
    word = ''
    for s in rc.class3:
        # normalize each candidate the same way before comparing
        s = ' '.join([w for w in s.lower().split() if w not in stop_words])

        dist3 = textdistance.hamming(s, text)
        if dist3 < min_dist:
            min_dist = dist3
            word = s
    return word
Example #9
def compute_differences(f1_n, f2_n, numerical=False):
    with open(f1_n, "r") as f1, open(f2_n, "r") as f2:
        lines_1 = f1.readlines()
        lines_2 = f2.readlines()
        if len(lines_1) != len(lines_2):
            return {"lines": len(lines_1) - len(lines_2), "n_line": len(lines_1)}
        d = 0
        val = []
        for l1, l2 in zip(lines_1, lines_2):
            if l1 != l2:
                if not numerical:
                    d += textdistance.hamming(l1, l2)
                else:
                    if l1.startswith(">"):
                        d += textdistance.hamming(l1, l2)
                    else:
                        p1 = np.array(list(map(float, l1.strip().split())))
                        p2 = np.array(list(map(float, l2.strip().split())))
                        print(f"percent b string 1 {np.nanmean(p1):.2f}, percent b string 2 {np.nanmean(p2):.2f}, size {len(p1)}")
                        val.append(np.abs(np.nanmean(p1) - np.nanmean(p2)))

    return {"letters": d, "n_line": len(lines_1), "val": val}
Example #10
    def compare(self, str1, str2):

        if self.debug:
            self.log("hamming comparison")

        self.start_time()

        self.result.distance = hamming(str1, str2)

        self.end_time()

        self.result.nos = max(len(str1), len(str2))
        self.result.threshold = 90
        self.result.similarity = (100.0 / float(self.result.nos)) * (
            self.result.nos - self.result.distance)

        return self.result
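The similarity formula above, worked by hand: with 'test' vs 'text', nos = 4 and distance = 1, so similarity = (100 / 4) * 3 = 75.

from textdistance import hamming

str1, str2 = 'test', 'text'
distance = hamming(str1, str2)   # 1 mismatched position
nos = max(len(str1), len(str2))  # 4
print((100.0 / nos) * (nos - distance))  # 75.0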
Example #11
def load_lat_long_from_csv(df):
    """

    dataset: https://github.com/datosgobar/georef-ar-api/blob/master/config/georef.example.cfg
    """
    # all_localidades = df.to_dict(orient="records")
    data = pd.DataFrame(columns=["lat", "lon"])

    registros = load_localidades()

    localidades = df.Localidad.str.upper()
    provincias = df.Provincia

    # bar = st.progress(0.0)
    # step = 1 / localidades.shape[0]
    # progress = 0

    for l, p in zip(localidades, provincias):
        aux = registros[(registros.nombre == l)]
        # progress += step
        # bar.progress(progress)

        if aux.shape[0] == 1:
            # DataFrame.append was removed in pandas 2.0; concat is equivalent here
            data = pd.concat([data, aux[["lat", "lon"]]])
        elif aux.shape[0] > 1:
            # positional index of the province name closest to p by Hamming distance
            province_id_min = aux.provincia.apply(
                lambda prov: td.hamming(prov, p)).argmin()
            data = pd.concat(
                [data, aux.iloc[[province_id_min]].loc[:, ["lat", "lon"]]])
        else:
            data = pd.concat([
                data,
                pd.DataFrame.from_records([{
                    "lat": None,
                    "lon": None
                }])
            ])

    return data.set_index(df.index)
Example #12
def sequenceDistance(dfEnsp, ref_dic, newcolresult, hamming, hammingNorm,
                     levenshtein, levenshteinNorm):
    res = []
    ham = []
    hamnorm = []
    lev = []
    levnorm = []
    serSeq = dfEnsp['proSequence'].copy()
    serID = dfEnsp['stableID_key'].copy()
    for inx, val in serSeq.items():
        pep = str(val)
        p = pep.strip()
        idd = str(serID[inx])
        # look up the reference peptide sequence for this ID
        mypep = str(ref_dic[idd])
        if mypep == p:
            # identical to canonical
            res.append('True')
            ham.append('identical')
            hamnorm.append('identical')
            lev.append('identical')
            levnorm.append('identical')
        else:
            # not identical to canonical
            res.append('False')
            # Hamming distance is edit-based and penalizes positional differences
            ham.append(textdistance.hamming(mypep, p))
            # normalized Hamming = mismatched positions / length of the longer sequence
            hamnorm.append(textdistance.hamming.normalized_distance(mypep, p))
            # Levenshtein is edit-based but does not penalize position:
            # an insertion at position 1 is just 1 edit
            lev.append(textdistance.levenshtein(mypep, p))
            levnorm.append(
                textdistance.levenshtein.normalized_distance(mypep, p))
    dfEnsp.loc[:, newcolresult] = res
    dfEnsp.loc[:, hamming] = ham
    dfEnsp.loc[:, hammingNorm] = hamnorm
    dfEnsp.loc[:, levenshtein] = lev
    dfEnsp.loc[:, levenshteinNorm] = levnorm
    return dfEnsp
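Why the function records both metrics: Hamming keeps penalizing every position after an insertion shifts the sequence, while Levenshtein counts it as a single edit. With hypothetical peptide strings:

import textdistance

canonical = 'MKTAYIAK'
variant = 'MAKTAYIAK'  # one residue inserted at position 2

print(textdistance.hamming(canonical, variant))      # 8: nearly every position shifted
print(textdistance.levenshtein(canonical, variant))  # 1: a single insertion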
Example #13
def TextMatch(str1, str2):
    """Return True if there is a close match."""

    # Exact match
    if str1 == str2:
        return True

    # Match text (simple), ignoring case
    if str1.lower() == str2.lower():
        return True

    # Use Python 3 casefold to (aggressively) ignore case
    if str1.casefold() == str2.casefold():
        return True

    # compare the hamming distance to tolerate a close match
    if hamming(str1, str2) < 2:
        return True

    # no match found
    return False
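A few calls showing each tier of matching (assuming the function above and `from textdistance import hamming` are in scope):

print(TextMatch('Hello', 'hello'))  # True: case-insensitive match
print(TextMatch('test', 'text'))    # True: Hamming distance 1 < 2
print(TextMatch('cat', 'dog'))      # False: Hamming distance 3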
Example #14
def valid_word_permutations(
    word_map: WordMap, word: Optional[str]
) -> List[PermutationOption]:
    if word is None:
        return []
    options = []
    answers = word_map.correct_answers(word)
    if len(answers) == 0:
        return []
    for s in permutations(word):
        candidate = "".join(s)
        if not word_map.is_word(candidate):
            options.append(
                PermutationOption(
                    word=candidate,
                    minimum_distance=min(
                        map(lambda word: textdistance.hamming(candidate, word), answers)
                    ),
                )
            )
        if len(options) > 1000:
            break
    options.sort(key=lambda opt: -opt.minimum_distance)
    return options
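A self-contained sketch of the core loop, with `WordMap` replaced by a plain answer list:

import textdistance
from itertools import permutations

word = 'stop'
answers = ['post', 'spot', 'tops']  # hypothetical correct answers

options = []
for s in permutations(word):
    candidate = ''.join(s)
    if candidate not in answers:
        # distance to the closest correct answer
        options.append((candidate,
                        min(textdistance.hamming(candidate, a) for a in answers)))

options.sort(key=lambda opt: -opt[1])  # most distant candidates first
print(options[0])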
Example #15
def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    #--------------------
    # Edit-based.
    if True:
        print("textdistance.hamming({}, {}) = {}.".format(
            str1, str2, textdistance.hamming(str1, str2)))
        print("textdistance.hamming.distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.distance(str1, str2)))
        print("textdistance.hamming.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.similarity(str1, str2)))
        print("textdistance.hamming.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.normalized_distance(str1, str2)))
        print(
            "textdistance.hamming.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.hamming.normalized_similarity(str1, str2)))
        print(
            "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Hamming(qval=qval,
                                     test_func=None,
                                     truncate=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.mlipns({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns(str1, str2)))
        print("textdistance.mlipns.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.distance(str1, str2)))
        print("textdistance.mlipns.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.similarity(str1, str2)))
        print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_distance(str1, str2)))
        print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_similarity(str1, str2)))
        print(
            "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MLIPNS(threshold=0.25,
                                    maxmismatches=2,
                                    qval=qval,
                                    external=True).distance(str1, str2)))

        print("textdistance.levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein(str1, str2)))
        print("textdistance.levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.distance(str1, str2)))
        print("textdistance.levenshtein.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.similarity(str1, str2)))
        print("textdistance.levenshtein.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.levenshtein.normalized_distance(str1, str2)))
        print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.levenshtein.normalized_similarity(str1, str2)))
        print(
            "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Levenshtein(qval=qval,
                                         test_func=None,
                                         external=True).distance(str1, str2)))

        print("textdistance.damerau_levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein(str1, str2)))
        print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein.distance(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.damerau_levenshtein.similarity(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.DamerauLevenshtein(qval=qval,
                                                test_func=None,
                                                external=True).distance(
                                                    str1, str2)))

        print("textdistance.jaro({}, {}) = {}.".format(
            str1, str2, textdistance.jaro(str1, str2)))
        print("textdistance.jaro.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.distance(str1, str2)))
        print("textdistance.jaro.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.similarity(str1, str2)))
        print("textdistance.jaro.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_distance(str1, str2)))
        print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaro(long_tolerance=False,
                                  qval=qval,
                                  external=True).distance(str1, str2)))

        print("textdistance.jaro_winkler({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler(str1, str2)))
        print("textdistance.jaro_winkler.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.distance(str1, str2)))
        print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.similarity(str1, str2)))
        print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.jaro_winkler.normalized_distance(str1,
                                                                   str2)))
        print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.jaro_winkler.normalized_similarity(str1, str2)))
        print(
            "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.JaroWinkler(long_tolerance=False,
                                         winklerize=True,
                                         qval=qval,
                                         external=True).distance(str1, str2)))

        print("textdistance.strcmp95({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95(str1, str2)))
        print("textdistance.strcmp95.distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.distance(str1, str2)))
        print("textdistance.strcmp95.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.similarity(str1, str2)))
        print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.normalized_distance(str1, str2)))
        print(
            "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.strcmp95.normalized_similarity(str1, str2)))
        print(
            "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.StrCmp95(long_strings=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.needleman_wunsch({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch(str1, str2)))
        print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.distance(str1, str2)))
        print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.similarity(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_distance(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.NeedlemanWunsch(gap_cost=1.0,
                                             sim_func=None,
                                             qval=qval,
                                             external=True).distance(
                                                 str1, str2)))

        print("textdistance.gotoh({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh(str1, str2)))
        print("textdistance.gotoh.distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.distance(str1, str2)))
        print("textdistance.gotoh.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.similarity(str1, str2)))
        print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_distance(str1, str2)))
        print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_similarity(str1, str2)))
        print(
            "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Gotoh(gap_open=1,
                                   gap_ext=0.4,
                                   sim_func=None,
                                   qval=qval,
                                   external=True).distance(str1, str2)))

        print("textdistance.smith_waterman({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman(str1, str2)))
        print("textdistance.smith_waterman.distance({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.distance(str1, str2)))
        print("textdistance.smith_waterman.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.similarity(str1, str2)))
        print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.smith_waterman.normalized_distance(str1, str2)))
        print(
            "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.smith_waterman.normalized_similarity(str1, str2)))
        print(
            "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.SmithWaterman(gap_cost=1.0,
                                           sim_func=None,
                                           qval=qval,
                                           external=True).distance(str1,
                                                                   str2)))

    #--------------------
    # Token-based.
    if False:
        print("textdistance.jaccard({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard(str1, str2)))
        print("textdistance.jaccard.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.distance(str1, str2)))
        print("textdistance.jaccard.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.similarity(str1, str2)))
        print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.normalized_distance(str1, str2)))
        print(
            "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.jaccard.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaccard(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.sorensen({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen(str1, str2)))
        print("textdistance.sorensen.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.distance(str1, str2)))
        print("textdistance.sorensen.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.similarity(str1, str2)))
        print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.normalized_distance(str1, str2)))
        print(
            "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sorensen.normalized_similarity(str1, str2)))
        print(
            "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Sorensen(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.sorensen_dice({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice(str1, str2)))
        print("textdistance.sorensen_dice.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.distance(str1, str2)))
        print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.similarity(str1, str2)))
        print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_distance(str1, str2)))
        print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_similarity(str1,
                                                                   str2)))
        #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2)))

        print("textdistance.tversky({}, {}) = {}.".format(
            str1, str2, textdistance.tversky(str1, str2)))
        print("textdistance.tversky.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.distance(str1, str2)))
        print("textdistance.tversky.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.similarity(str1, str2)))
        print("textdistance.tversky.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.normalized_distance(str1, str2)))
        print(
            "textdistance.tversky.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tversky.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tversky(qval=qval,
                                     ks=None,
                                     bias=None,
                                     as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.overlap({}, {}) = {}.".format(
            str1, str2, textdistance.overlap(str1, str2)))
        print("textdistance.overlap.distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.distance(str1, str2)))
        print("textdistance.overlap.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.similarity(str1, str2)))
        print("textdistance.overlap.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.normalized_distance(str1, str2)))
        print(
            "textdistance.overlap.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.overlap.normalized_similarity(str1, str2)))
        print(
            "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Overlap(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1.
        print("textdistance.tanimoto({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto(str1, str2)))
        print("textdistance.tanimoto.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.distance(str1, str2)))
        print("textdistance.tanimoto.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.similarity(str1, str2)))
        print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.normalized_distance(str1, str2)))
        print(
            "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tanimoto.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tanimoto(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.cosine({}, {}) = {}.".format(
            str1, str2, textdistance.cosine(str1, str2)))
        print("textdistance.cosine.distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.distance(str1, str2)))
        print("textdistance.cosine.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.similarity(str1, str2)))
        print("textdistance.cosine.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_distance(str1, str2)))
        print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_similarity(str1, str2)))
        print(
            "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Cosine(qval=qval, as_set=False,
                                    external=True).distance(str1, str2)))

        print("textdistance.monge_elkan({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan(str1, str2)))
        print("textdistance.monge_elkan.distance({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.distance(str1, str2)))
        print("textdistance.monge_elkan.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.similarity(str1, str2)))
        print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.monge_elkan.normalized_distance(str1, str2)))
        print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.monge_elkan.normalized_similarity(str1, str2)))
        print(
            "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MongeElkan(
                    algorithm=textdistance.DamerauLevenshtein(),
                    symmetric=False,
                    qval=qval,
                    external=True).distance(str1, str2)))

        print("textdistance.bag({}, {}) = {}.".format(
            str1, str2, textdistance.bag(str1, str2)))
        print("textdistance.bag.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.distance(str1, str2)))
        print("textdistance.bag.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.similarity(str1, str2)))
        print("textdistance.bag.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_distance(str1, str2)))
        print("textdistance.bag.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_similarity(str1, str2)))
        print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.Bag(qval=qval).distance(str1, str2)))

    #--------------------
    # Sequence-based.
    if False:
        print("textdistance.lcsseq({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq(str1, str2)))
        print("textdistance.lcsseq.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.distance(str1, str2)))
        print("textdistance.lcsseq.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.similarity(str1, str2)))
        print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_distance(str1, str2)))
        print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2)))
        #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2)))
        print("textdistance.LCSSeq().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LCSSeq().distance(str1, str2)))

        print("textdistance.lcsstr({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr(str1, str2)))
        print("textdistance.lcsstr.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.distance(str1, str2)))
        print("textdistance.lcsstr.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.similarity(str1, str2)))
        print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_distance(str1, str2)))
        print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2)))
        print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.LCSStr(qval=qval).distance(str1, str2)))

        print("textdistance.ratcliff_obershelp({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp(str1, str2)))
        print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.ratcliff_obershelp.similarity(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_similarity(
                    str1, str2)))
        print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RatcliffObershelp().distance(str1, str2)))

    #--------------------
    # Compression-based.
    if False:
        print("textdistance.arith_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd(str1, str2)))
        print("textdistance.arith_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.distance(str1, str2)))
        print("textdistance.arith_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.similarity(str1, str2)))
        print(
            "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.arith_ncd.normalized_distance(str1, str2)))
        print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.arith_ncd.normalized_similarity(str1, str2)))
        #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2)))
        print("textdistance.ArithNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ArithNCD().distance(str1, str2)))

        print("textdistance.rle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd(str1, str2)))
        print("textdistance.rle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.distance(str1, str2)))
        print("textdistance.rle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.similarity(str1, str2)))
        print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.rle_ncd.normalized_similarity(str1, str2)))
        print("textdistance.RLENCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RLENCD().distance(str1, str2)))

        print("textdistance.bwtrle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd(str1, str2)))
        print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.distance(str1, str2)))
        print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.similarity(str1, str2)))
        print(
            "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.bwtrle_ncd.normalized_distance(str1, str2)))
        print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.bwtrle_ncd.normalized_similarity(str1,
                                                                   str2)))
        print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.BWTRLENCD(terminator='\0').distance(str1,
                                                                   str2)))

        print("textdistance.sqrt_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd(str1, str2)))
        print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.distance(str1, str2)))
        print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.similarity(str1, str2)))
        print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sqrt_ncd.normalized_similarity(str1, str2)))
        print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.SqrtNCD(qval=qval).distance(str1, str2)))

        print("textdistance.entropy_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd(str1, str2)))
        print("textdistance.entropy_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.distance(str1, str2)))
        print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.similarity(str1, str2)))
        print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.entropy_ncd.normalized_distance(str1, str2)))
        print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.entropy_ncd.normalized_similarity(str1, str2)))
        print(
            "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.EntropyNCD(qval=qval, coef=1,
                                        base=2).distance(str1, str2)))

        print("textdistance.bz2_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd(str1, str2)))
        print("textdistance.bz2_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.distance(str1, str2)))
        print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.similarity(str1, str2)))
        print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.bz2_ncd.normalized_similarity(str1, str2)))
        print("textdistance.BZ2NCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.BZ2NCD().distance(str1, str2)))

        print("textdistance.lzma_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd(str1, str2)))
        print("textdistance.lzma_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.distance(str1, str2)))
        print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.similarity(str1, str2)))
        print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.lzma_ncd.normalized_similarity(str1, str2)))
        print("textdistance.LZMANCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LZMANCD().distance(str1, str2)))

        print("textdistance.zlib_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd(str1, str2)))
        print("textdistance.zlib_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.distance(str1, str2)))
        print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.similarity(str1, str2)))
        print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.zlib_ncd.normalized_similarity(str1, str2)))
        print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ZLIBNCD().distance(str1, str2)))

    #--------------------
    # Phonetic.
    if False:
        print("textdistance.mra({}, {}) = {}.".format(
            str1, str2, textdistance.mra(str1, str2)))
        print("textdistance.mra.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.distance(str1, str2)))
        print("textdistance.mra.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.similarity(str1, str2)))
        print("textdistance.mra.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_distance(str1, str2)))
        print("textdistance.mra.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_similarity(str1, str2)))
        print("textdistance.MRA().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.MRA().distance(str1, str2)))

        print("textdistance.editex({}, {}) = {}.".format(
            str1, str2, textdistance.editex(str1, str2)))
        print("textdistance.editex.distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.distance(str1, str2)))
        print("textdistance.editex.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.similarity(str1, str2)))
        print("textdistance.editex.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_distance(str1, str2)))
        print("textdistance.editex.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_similarity(str1, str2)))
        print(
            "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Editex(local=False,
                                    match_cost=0,
                                    group_cost=1,
                                    mismatch_cost=2,
                                    groups=None,
                                    ungrouped=None,
                                    external=True).distance(str1, str2)))

    #--------------------
    # Simple.
    if False:
        print("textdistance.prefix({}, {}) = {}.".format(
            str1, str2, textdistance.prefix(str1, str2)))
        print("textdistance.prefix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.distance(str1, str2)))
        print("textdistance.prefix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.similarity(str1, str2)))
        print("textdistance.prefix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_distance(str1, str2)))
        print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Prefix(qval=qval,
                                    sim_test=None).distance(str1, str2)))

        print("textdistance.postfix({}, {}) = {}.".format(
            str1, str2, textdistance.postfix(str1, str2)))
        print("textdistance.postfix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.distance(str1, str2)))
        print("textdistance.postfix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.similarity(str1, str2)))
        print("textdistance.postfix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.normalized_distance(str1, str2)))
        print(
            "textdistance.postfix.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.postfix.normalized_similarity(str1, str2)))
        #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2)))
        print("textdistance.Postfix().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Postfix().distance(str1, str2)))

        print("textdistance.length({}, {}) = {}.".format(
            str1, str2, textdistance.length(str1, str2)))
        print("textdistance.length.distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.distance(str1, str2)))
        print("textdistance.length.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.similarity(str1, str2)))
        print("textdistance.length.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_distance(str1, str2)))
        print("textdistance.length.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_similarity(str1, str2)))
        print("textdistance.Length().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Length().distance(str1, str2)))

        print("textdistance.identity({}, {}) = {}.".format(
            str1, str2, textdistance.identity(str1, str2)))
        print("textdistance.identity.distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.distance(str1, str2)))
        print("textdistance.identity.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.identity.similarity(str1, str2)))
        print("textdistance.identity.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.normalized_distance(str1, str2)))
        print(
            "textdistance.identity.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.identity.normalized_similarity(str1, str2)))
        print("textdistance.Identity().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Identity().distance(str1, str2)))

        print("textdistance.matrix({}, {}) = {}.".format(
            str1, str2, textdistance.matrix(str1, str2)))
        print("textdistance.matrix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.distance(str1, str2)))
        print("textdistance.matrix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.similarity(str1, str2)))
        print("textdistance.matrix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_distance(str1, str2)))
        print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Matrix(mat=None,
                                    mismatch_cost=0,
                                    match_cost=1,
                                    symmetric=True,
                                    external=True).distance(str1, str2)))
Example #16
import textdistance
from fuzzywuzzy import fuzz

if __name__ == "__main__":
	string_a = 'test'
	string_b = 'text'
	fuzz_result = fuzz.ratio(string_a, string_b)
	hamming_result = textdistance.hamming(string_a, string_b)
	print("Fuzzy Result: ", fuzz_result)
	print("Hamming Result: ", hamming_result)
	# print every textdistance algorithm that returns a numeric score
	for function_name in dir(textdistance):
		try:
			function = getattr(textdistance, function_name)

			result = function(string_a, string_b)
			if isinstance(result, (int, float)):
				print("{}\t{}".format(function_name, result))
		except Exception:
			pass
Example #17
def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    # compare method names with ==, not 'is': 'is' checks object identity,
    # which is unreliable for strings and a SyntaxWarning in recent Python
    if method == "hamming":
        return [
            textdistance.hamming(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif method == "levenshtein":
        return [
            textdistance.levenshtein(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "damerau_lev":
        return [
            textdistance.damerau_levenshtein(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "j-winkler":
        return [
            textdistance.jaro_winkler(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "smith-waterman":
        return [
            textdistance.smith_waterman(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "jaccard":
        return [
            textdistance.jaccard(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif method == "sorensen-dice":
        return [
            textdistance.sorensen_dice(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "tversky":
        return [
            textdistance.tversky(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif method == "tanimoto":
        return [
            textdistance.tanimoto(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif method == "cosine":
        return [
            textdistance.cosine(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif method == "ratcliff":
        return [
            textdistance.ratcliff_obershelp(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif method == "bwt":
        return [
            textdistance.bwtrle_ncd(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    else:
        raise ValueError("unknown method: {}".format(method))
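Hypothetical usage of the dispatcher:

queries = ['test', 'toast', 'text']
print(Seq_StringDistance(queries, 'test', method='hamming'))      # [0, 4, 1]
print(Seq_StringDistance(queries, 'test', method='levenshtein'))  # [0, 2, 1]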
Example #18
# ## Example Method

# # Hamming
# 
# https://github.com/life4/textdistance

# In[2]:


import textdistance


# In[3]:


textdistance.hamming('test', 'text')


# In[4]:


textdistance.hamming.distance('test', 'text')


# In[5]:


textdistance.hamming.similarity('test', 'text')


# In[6]:
Example #19
def getEntityLocation(location_string):
    census_hi_file = root + "/Location/census_hindi_sd.csv"
    cdf = pandas.read_csv(census_hi_file)
    cdf['name_hi'] = cdf['name_hi'].str.strip()

    ############ SEGREGATING ALL THE STATE, DISTRICTS, SUB-DISTRICTs, PANCHAYATS/TOWNS, M. CORP. INTO DIFFERENT DATAFRAMES################

    states = cdf[['state_code', 'name_en', 'name_hi'
                  ]][(cdf.district_code == 0) & (cdf.subdistrict_code == 0) &
                     (cdf.panchayat_town_code == 0) & (cdf.state_code != 0)]
    state = states.set_index('state_code')
    districts = cdf[[
        'district_code', 'name_en', 'name_hi'
    ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0) &
       (cdf.state_code != 0) & (cdf.district_code != 0)]
    district = districts.set_index('district_code')
    sub_districts = cdf[[
        'subdistrict_code', 'name_en', 'name_hi'
    ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
       (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
    sub_district = sub_districts.set_index('subdistrict_code')
    panchayats_towns = cdf[[
        'panchayat_town_code', 'name_en', 'name_hi'
    ]][(cdf.state_code != 0) & (cdf.district_code != 0) &
       (cdf.subdistrict_code != 0) & (cdf.panchayat_town_code != 0)]

    ################################################# MAIN LOOP STARTS ##################################################################
    flag_perfectmatch = False  # flag tracking whether a perfect match was found
    S = []
    D = []
    SD = []
    PT = ""
    Loc_hd = ""
    list_output = []
    locations = location_string

    if locations == 'n':  #location contains NaN and can't be processed
        # print("can't understand")
        return (-1, S, D, SD, PT)
    location_entity = list(dict.fromkeys(
        locations.split(',')))  #separating the entities
    # print("Entities : ",location_entity)
    for location in location_entity:
        #### for a single entity in a loop
        #### State Direct Match Code
        for loc in list(states["name_hi"]):
            if " " + loc + " " in location:
                S.append(loc)
                location = location.replace(loc, '')
                alphastate = 0
                flag_perfectmatch = True
                break
        #### District Direct Match Code
        if len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                districtsinState = inState[[
                    'district_code', 'name_en', 'name_hi'
                ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0)
                   & (cdf.state_code != 0) & (cdf.district_code != 0)]
            possibledistricts = districtsinState
            # print(possibledistricts)
        else:
            possibledistricts = districts

        for loc in list(possibledistricts["name_hi"]):
            if " " + loc + " " in location:
                D.append(loc)
                location = location.replace(loc, '')
                alphadistrict = 0
                flag_perfectmatch = True
                break
        #### Subdistrict Direct Match Code
        if len(D) != 0:
            for d in D:
                indexnum = cdf[cdf['name_hi'] == d].index.values.astype(int)[0]
                districtnum = cdf["district_code"][indexnum]
                inDistrict = find_between(cdf, 'district_code', districtnum,
                                          districtnum + 1)
                subdistrictsinstate = inDistrict[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
            possiblesubdistricts = subdistrictsinstate
        elif len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                subdistrictsinstate = inState[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
            possiblesubdistricts = subdistrictsinstate
        else:
            possiblesubdistricts = sub_districts
        for loc in list(possiblesubdistricts["name_hi"]):
            if " " + loc + " " in location:
                SD.append(loc)
                location = location.replace(loc, '')
                alphasubdistrict = 0
                flag_perfectmatch = True
                break
        #### Backpropagate States, Districts
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding District,State
                    # print("District Code: ",cdf.at[ll,'district_code'],", District: ",district.at[cdf.at[ll,'district_code'],'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])

        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding State
                    # print("State Code: ",cdf.at[ll,'state_code'],", State: ",state.at[cdf.at[ll,'state_code'],'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        #### Approximate Matching
        if len(S) == 0:
            min_d_state = 10
            min_s_state = ""
            for loc in list(states["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc) / len(
                        ng)  #Hamming textdistance algo
                    if (d < min_d_state):
                        min_s_state = loc
                        min_d_state = d
        if len(D) == 0:
            min_d_district = 10
            min_s_district = ""
            for loc in list(possibledistricts["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc) / len(
                        ng)  #Hamming textdistance algo
                    if (d < min_d_district):
                        min_s_district = loc
                        min_d_district = d
        alpha = 10
        if len(S) == 0 and len(D) == 0:
            flag_perfectmatch = False
            if min_s_state != "" and min_s_district != "":
                if min_d_district < min_d_state:
                    D.append(min_s_district)
                    alpha = min_d_district
                else:
                    S.append(min_s_state)
                    alpha = min_d_state
            elif min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district
            elif min_s_state != "":
                S.append(min_s_state)
                alpha = min_d_state
        elif len(D) == 0:
            if min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district

        #### Backpropagate States, Districts
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding District,State
                    # print("District Code: ",cdf.at[ll,'district_code'],", District: ",district.at[cdf.at[ll,'district_code'],'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])

        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding State
                    # print("State Code: ",cdf.at[ll,'state_code'],", State: ",state.at[cdf.at[ll,'state_code'],'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        list_output.append(locations)

        # print(S, D, SD, PT)
        # NOTE: these returns sit inside the entity loop, so only the first
        # entity in location_entity is ever fully processed.
        if alpha == 10:
            list_output.append("Yes")
            return (0, S, D, SD, PT)
        else:
            list_output.append("No")
            return (alpha, S, D, SD, PT)
    return
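A minimal sketch (invented tokens) of the sliding n-gram construction used in the approximate-matching blocks above:

tokens = "a b c d".split()
n = 2
ngrams = [' '.join(g) for g in zip(*[tokens[i:] for i in range(n)])]
print(ngrams)  # ['a b', 'b c', 'c d']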
Example #20
    rank = 1
    for i in range(len(items)):  # walk the search results in rank order
        formattedUrl = items[i].get("formattedUrl", "")  # "" keeps the substring checks below safe
        if "/showcase/" in formattedUrl or "/in/" in formattedUrl or "/.../" in formattedUrl:
            # print ("\tContinuing on ", formattedUrl)
            continue
        else:
            # print("\tProcessing ", formattedUrl)
            pass

        snippet = items[i].get("snippet", None)
        title = items[i].get("title",
                             None)  # formatted "company name | LinkedIn"
        li_firm = title.split('|')[0].rstrip()

        li_dict[li_firm]["jaro"] = textdistance.hamming(firm_clnd, li_firm)
        snippet_clnd = re.sub("[,.]", "", snippet)
        emps = re.findall(p, snippet_clnd)
        if emps:
            li_dict[li_firm]["emps"] = int(emps[0])
        else:
            li_dict[li_firm]["emps"] = 0
        print(li_firm + ': ' + str(li_dict[li_firm]["emps"]))

        li_dict[li_firm]["formattedUrl"] = formattedUrl
        li_dict[li_firm]["rank"] = rank
        rank = rank + 1

    srtd_keys = sorted(li_dict,
                       key=lambda x: (li_dict[x]['emps']),
                       reverse=True)
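A toy illustration (made-up firms) of the sort above, ordering keys by their nested "emps" count, largest first:

li_dict = {"Acme": {"emps": 120}, "Globex": {"emps": 950}, "Initech": {"emps": 40}}
srtd_keys = sorted(li_dict, key=lambda x: li_dict[x]["emps"], reverse=True)
print(srtd_keys)  # ['Globex', 'Acme', 'Initech']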
Example #21
def modify_file_content(file_content, file_path):
    global count 

    file_content = HTMLEntitiesToUnicode(file_content)
    
    new_file_lines = []
    for line in file_content.split('\n'):
        new_line = line

        if 'title: Main Page' in line:
            new_file_lines.append('title: Wiki Main Page\n')
            continue

        if 'permalink' in line or '#drawio' in line:
            continue

        if '[Category:Projects]' in line:
            new_line = new_line[new_line.index(')')+2:]

        if '[:Category:Projects]' in line:
            new_file_lines.append("- [Category: Projects]({{ site.baseurl }}/wiki/categories/wikiprojects)\n")
            continue

        if 'date: ' in line:
            print(f'Appending layout line')
            new_file_lines.append('layout: wiki_post\n')
            new_file_lines.append('base: Wiki\n')
            new_file_lines.append('base_url: /wiki\n')
            if 'tensorboard' in file_path:
                new_file_lines.append('categories:\n  - wikitools\n')
            elif any([p.lower().replace(' ','_') in file_path for p in ['cifar10_classifier', 'examples', 'iris_classifier', 'Male or Female classifier', 'MNIST classifier', 'Word Embeddings']]):
                new_file_lines.append('categories:\n  - wikiprojects\n')
            else:
                new_file_lines.append('categories:\n  - wikimisc\n')
            continue

        empty_link_regex = r'(\[\]\(([^\)]*)\))'
        empty_link_matches = re.findall(empty_link_regex, new_line)
        for empty_link_match in empty_link_matches:
            new_link = f'[{empty_link_match[1]}]({empty_link_match[1]})'
            # rewrite in place: appending here and falling through would emit
            # both the fixed and the original line at the end of the loop body
            new_line = new_line.replace(empty_link_match[0], new_link)
            print(f'Empty link found, replacing with {new_link}')

        img_src_regex = r'((\W|^)\[([^\[\]]*)(\[\d*\])?\]\(([^\ \[\]#]*)[^\)\[\]]*\))'
        img_src_regex_matches = re.findall(img_src_regex, new_line, flags=re.IGNORECASE|re.MULTILINE)
        for img_src_regex_match in img_src_regex_matches:
            # don't mess with normal links
            if 'http' in img_src_regex_match[2]:
                continue

            src = str(img_src_regex_match[4])
            src = src.replace('/File:', '/assets/img/wiki/')

            # make sure we only do this to images
            if src.split('.')[-1] in ['jpeg','jpg','png','gif','svg']:
                caption = str(img_src_regex_match[2])
                caption = caption.replace('thumb|','')

                # new_line = new_line.replace(img_src_regex_match.group(0), f"![{caption}]({src})")
                new_image_s = '{%% include figure_caption.html url="%s" description="%s" %%}' % (src, caption.replace('|', ','))
                new_line = new_line.replace(img_src_regex_match[0], new_image_s)
                print(f'Found image tag with src {img_src_regex_match[2]}, replacing with {new_line}')
            elif 'wikilink' in str(img_src_regex_match[0]).lower() and 'file:' not in str(img_src_regex_match[0]).lower():
                title = img_src_regex_match[4]
                title = title.replace('/','').lower()

                found_files = []
                for f in files:
                    if title in f:
                        found_files.append(f)

                if len(found_files) > 0:
                    distances = np.zeros(len(found_files))
                    for i in range(len(distances)):
                        distances[i] = textdistance.hamming(title, found_files[i])

                    closest_file = found_files[np.argmin(distances)]

                    post_link = "{% post_url /wiki/" + closest_file.split('/')[-1].replace('.md','') + "%}"
                    new_link_s = f'[{img_src_regex_match[2]}]({post_link})'
                    if 'file:' not in new_link_s:
                        new_line = new_line.replace(img_src_regex_match[0], new_link_s)
                        print(f'Found link tag with src {img_src_regex_match[3]}, replacing "{img_src_regex_match[0]}" -> {new_link_s}')
                else:                
                    print(f'Warning: no file matching {title} found among {len(files)} candidates')

        reference_regex = r'(\[(\d*)\]\W?<([^>]*)>)'
        reference_regex_matches = re.findall(reference_regex, new_line)
        for reference_regex_match in reference_regex_matches:
            reference = reference_regex_match[1] + f'. [{reference_regex_match[2]}]({reference_regex_match[2]})'
            new_line = new_line.replace(reference_regex_match[0], reference)
            print(f'Replaced reference {reference_regex_match[0]} with {reference}')

        reference_number_regex = r'(\[(\d*)\])'
        reference_number_matches = re.findall(reference_number_regex, new_line)
        for reference_number_match in reference_number_matches:
            # scroll_to_bottom = f'<a href="javascript: document.body.scrollIntoView(false);">{reference_number_match[1]}</a>'
            reference_nr = f'<sup>{reference_number_match[1]}</sup>'
            new_line = new_line.replace(reference_number_match[0], reference_nr)
            
        # latex_regex = r'(\$[^\$]*\$)'
        # latex_regex_matches = re.findall(latex_regex, new_line)
        # for latex_regex_match in latex_regex_matches:
        #     new_latex = '$' + latex_regex_match[0].replace("\\","\\") + '$'
        #     new_line = new_line.replace(latex_regex_match[0], new_latex)

        # remove all html tags except those in github links
        if '<http' not in new_line:
            new_line = re.sub(r'<\/?[^>]*>', '', new_line)

        new_line += '\n'
        new_file_lines.append(new_line)

    return new_file_lines
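A quick check (hypothetical input) of the empty-link rewrite performed above:

import re

line = 'see [](https://example.com) for details'
matches = re.findall(r'(\[\]\(([^\)]*)\))', line)
# matches -> [('[](https://example.com)', 'https://example.com')]
fixed = line.replace(matches[0][0], f'[{matches[0][1]}]({matches[0][1]})')
print(fixed)  # see [https://example.com](https://example.com) for details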
Example #22
def best_match(m):
    hs = [
        hamming(f['Facility Name'].lower(),
                x['Data']['commonName'].lower()) for x in m
    ]
    return hs.index(min(hs))
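# Hedged usage sketch: best_match closes over a record f and a bare hamming
# from textdistance, both defined in elided context; the stand-ins below are
# invented so the call can actually run.
from textdistance import hamming

f = {'Facility Name': 'St Mary Hospital'}
candidates = [
    {'Data': {'commonName': 'County General'}},
    {'Data': {'commonName': 'St Mary Hospital'}},
]
print(best_match(candidates))  # -> 1, index of the exact common-name match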
def fitness(self, word):
    return textdistance.hamming(word, self.constants.solution)
def main(argv):
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_MAHARASHTRA_2020.csv'
    if not os.path.isfile(file_path):
        return
    all_villages = {
        'PUNE': {},
        'SOLAPUR': {}
    }
    with open(file_path, 'r') as original:
        lines = csv.reader(original, delimiter=',')
        skip_first = True
        for line in lines:
            if skip_first is True:
                skip_first = False
                continue
            district = line[3].strip().upper()
            if district != 'PUNE' and district != 'SOLAPUR':
                continue
            block_name = line[5].strip().upper()
            panchayat_name = line[7].strip().upper()
            panchayat_id = line[9].strip().upper()
            block_villages = all_villages[district].get(block_name)
            if block_villages is None:
                all_villages[district][block_name] = []
            all_villages[district][block_name].append({
                'name': panchayat_name,
                'id': panchayat_id,
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/sarpanch.csv'
    if not os.path.isfile(file_path):
        return

    result_lines = []
    with open(file_path, 'r') as original:
        lines = csv.reader(original, delimiter=',')
        skip_first = True
        for line in lines:
            if skip_first is True:
                skip_first = False
                continue
            village_id = line[0]
            village_name = line[5].upper().replace('GRAMPANCHAYAT', '').replace('GRAMPANCHAYT', '')\
                .replace(', AKKALKOT', '').replace('(BHOINJE)', '').replace('SAPATNE(BHO)', 'SAPATNE (BHOSE)')\
                .replace('GRAMPANCHYAT', '').replace('GRAMPANACHAYAT', '').replace('GRAM PANCHAYT', '')\
                .replace('GRAMAPANCHAYAT', '').strip()

            block_name = line[7]
            if block_name == '1':
                block_name = 'MADHA'
            elif block_name == '2':
                block_name = 'AKKALKOT'
            elif block_name == '3':
                block_name = 'SOUTH SOLAPUR'
            elif block_name == '4':
                block_name = 'PANDHARPUR'
            elif block_name == '5':
                block_name = 'MOHOL'
            elif block_name == '6':
                block_name = 'BHOR'
            elif block_name == '7':
                block_name = 'BARAMATI'
            elif block_name == '8':
                block_name = 'DAUND'
            elif block_name == '9':
                block_name = 'MULSHI'
            elif block_name == '10':
                block_name = 'KHED'
            else:
                raise Exception(f'No block_name found for {line}')

            district_name = line[6]
            if district_name == '1':
                district_name = 'SOLAPUR'
                if block_name not in ['MADHA', 'AKKALKOT', 'SOUTH SOLAPUR', 'PANDHARPUR', 'MOHOL']:
                    print(f'District and block mismatch for {line}')
                    continue
            elif district_name == '2':
                district_name = 'PUNE'
                if block_name not in ['BHOR', 'BARAMATI', 'DAUND', 'MULSHI', 'KHED']:
                    print(f'District and block mismatch for {line}')
                    continue
            else:
                print(f'No district found for {line}')
                continue

            cmp_results = []
            for village in all_villages[district_name][block_name]:
                cmp_results.append({
                    'score': textdistance.hamming(village_name, village['name']),
                    'match': village['name'],
                    'id': village['id'],
                    'line': village['line']
                })
            cmp_results.sort(key=lambda v: v['score'])
            print(f'{district_name} - {block_name} - {village_name} vs {cmp_results[0]["match"]} = {cmp_results[0]["score"]}')

            line = cmp_results[0]['line']
            if cmp_results[0]['score'] > 10:
                for idx, cmp_result in enumerate(cmp_results[0:4]):
                    print('{:>2} {}'.format(cmp_result['score'], cmp_result['match']))
                print()
                selected_row = read_user_input() - 1
                if selected_row < 4:
                    line = cmp_results[selected_row]['line']
                elif selected_row == 4:
                    line = []
            new_line = [village_id, village_name] + line
            result_lines.append(new_line)

        new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch.csv'
        CsvWriter.write(new_file_path, result_lines)
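A condensed sketch (invented names) of the match-or-ask pattern above: auto-accept the closest Hamming match when the score is under the threshold, otherwise show the top candidates and let the operator pick.

import textdistance

def pick_match(name, candidates, threshold=10):
    # score every candidate and sort closest-first
    scored = sorted(candidates, key=lambda c: textdistance.hamming(name, c))
    if textdistance.hamming(name, scored[0]) <= threshold:
        return scored[0]
    for i, c in enumerate(scored[:4], start=1):
        print(f'{i}: {c}')
    choice = int(input('pick 1-4, or 5 for no match: '))
    return scored[choice - 1] if choice <= 4 else None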
Example #25
def main(argv):
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_HARYANA_2020.csv'
    if not os.path.isfile(file_path):
        return
    all_villages = {}
    chosen_matches = {}
    with open(file_path, 'r') as original:
        lines = csv.reader(original, delimiter=',')
        skip_first = True
        for line in lines:
            if skip_first is True:
                skip_first = False
                continue
            district = line[3].strip().upper()
            block_name = line[5].strip().upper().replace(' (PART)', '')
            panchayat_name = line[7].strip().upper()
            panchayat_id = line[9].strip().upper()
            if all_villages.get(district) is None:
                all_villages[district] = {}
            block_villages = all_villages[district].get(block_name)
            if block_villages is None:
                all_villages[district][block_name] = []
            all_villages[district][block_name].append({
                'name': panchayat_name,
                'id': panchayat_id,
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/Haryana_new_incomplete.csv'
    if not os.path.isfile(file_path):
        return

    result_lines = []
    with open(file_path, 'r') as original:
        lines = csv.reader(original, delimiter=',')
        skip_first = 1
        for line in lines:
            if skip_first > 0:
                skip_first -= 1
                continue
            village_id = line[0]
            village_name = line[1].upper().strip()
            district_name = line[12].upper().strip()
            block_name = line[13].replace(' 1', '-I').replace(' 2', '-II').replace('Bhattu', 'Bhattu Kalan')\
                .replace('Ballabhgarh', 'Ballabgarh').replace('Nissing', 'Nissing At Chirao')\
                .replace('Meham', 'Maham').replace('Lakhan', 'Lakhan Majra') \
                .replace('GHARAUNDA (PART)', 'GHARAUNDA')\
                .replace('Block Saha', 'Saha').replace('Block Naraingarh', 'Naraingarh')\
                .replace('Block Shahzadpur', 'Shahzadpur').replace('Block Barara', 'Barara')\
                .upper().strip().replace('BLOCK ', f'{district_name}-')
            if all_villages.get(district_name) is None:
                raise Exception(f'Invalid district {district_name} for {line}')
            if all_villages[district_name].get(block_name) is None:
                raise Exception(f'Invalid block {block_name} for {line}')

            cmp_results = []
            for village in all_villages[district_name][block_name]:
                cmp_results.append({
                    'score':
                    textdistance.hamming(village_name, village['name']),
                    'match':
                    village['name'],
                    'id':
                    village['id'],
                    'line':
                    village['line']
                })
            cmp_results.sort(key=lambda v: v['score'])
            print(
                f'{district_name} - {block_name} - {village_name} vs {cmp_results[0]["match"]} = {cmp_results[0]["score"]}'
            )

            line = cmp_results[0]['line']
            if cmp_results[0]['score'] > 10:
                selected_row = chosen_matches.get(
                    f'{district_name} - {block_name} - {village_name}')
                if selected_row is None:
                    for idx, cmp_result in enumerate(cmp_results[0:4]):
                        print('{:>2} {}'.format(cmp_result['score'],
                                                cmp_result['match']))
                    print()
                    selected_row = read_user_input() - 1
                    chosen_matches[
                        f'{district_name} - {block_name} - {village_name}'] = selected_row
                if selected_row < 4:
                    line = cmp_results[selected_row]['line']
                elif selected_row == 4:
                    line = []
            new_line = [village_id, village_name] + line
            result_lines.append(new_line)

        new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch_haryana.csv'
        CsvWriter.write(new_file_path, result_lines)
Example #26
def run(experiment):
    save_path = "checkpoints/" + experiment.name 
    log_path = "tensorboard/train/" + experiment.name
    # create or clean directory
    for path in [save_path, log_path]:
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)           
            os.makedirs(path)
    save_path += "/dev"

    # log git commit hash
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    file = open(log_path + "/git_commit_" + sha, 'w')
    file.close()

    (epochs, input_batch_size, rnn_size, num_layers, encoding_embedding_size,
     decoding_embedding_size, learning_rate, keep_probability, num_samples,
     reward) = map(experiment.hyperparams.get,
                   ('epochs', 'input_batch_size', 'rnn_size', 'num_layers',
                    'encoding_embedding_size', 'decoding_embedding_size',
                    'learning_rate', 'keep_probability', 'num_samples',
                    'reward'))
    
    ### prepare data ###
    (train_source_int_text, train_target_int_text), (valid_source_int_text, valid_target_int_text), (
            source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = data_preprocessing.get_data(experiment.data["dataset"], experiment.data["folder"], experiment.data["train_source_file"], experiment.data["train_target_file"], experiment.data["dev_source_file"], experiment.data["dev_target_file"], experiment.tokenization)

    max_source_sentence_length = max([len(sentence) for sentence in train_source_int_text])

    train_source = train_source_int_text
    train_target = train_target_int_text
    
    valid_source = valid_source_int_text
    valid_target = valid_target_int_text

    # shuffle
    rnd = random.Random(1234)
    train_combined = list(zip(train_source, train_target))
    rnd.shuffle(train_combined)
    train_source, train_target = zip(*train_combined)

    valid_combined = list(zip(valid_source, valid_target))
    rnd.shuffle(valid_combined)
    valid_source, valid_target = zip(*valid_combined)

    # set reward function
    if reward == "levenshtein":
        reward_func = lambda ref_hyp: - textdistance.levenshtein(ref_hyp[0], ref_hyp[1])   
    elif reward == "jaro-winkler":
        reward_func = lambda ref_hyp: textdistance.JaroWinkler()(ref_hyp[0], ref_hyp[1]) 
    elif reward == "hamming":
        reward_func = lambda ref_hyp: - textdistance.hamming(ref_hyp[0], ref_hyp[1])

    if experiment.train_method == 'MLE':
        graph_batch_size = input_batch_size
    elif experiment.train_method == 'reinforce' or experiment.train_method == 'reinforce_test':
        graph_batch_size = num_samples

    ### prepare model ###
    tf.reset_default_graph()  # start from a fresh default graph before building the model
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        model = rnn_model.RNN(graph_batch_size, max_source_sentence_length, source_vocab_to_int, target_vocab_to_int, encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers)
    
    eval_batch_size = 128
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        eval_model = rnn_model.RNN(eval_batch_size, max_source_sentence_length, source_vocab_to_int, target_vocab_to_int, encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, False)


    early_stopping = True

    ### train model ###
    if experiment.train_method == 'reinforce_test':
        train.reinforce_test(model, experiment.start_checkpoint, source_vocab_to_int, learning_rate, keep_probability, graph_batch_size, target_int_to_vocab, source_int_to_vocab, valid_source, valid_target)
    else:
        train.train(experiment.name, experiment.train_method, model, epochs, input_batch_size, train_source, train_target, valid_source, valid_target, learning_rate, keep_probability, save_path, experiment.start_checkpoint, target_int_to_vocab, source_int_to_vocab, source_vocab_to_int, log_path, graph_batch_size, experiment.max_hours, eval_model, eval_batch_size, reward_func, early_stopping)
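A hedged illustration of the reward lambdas above on a toy (ref, hyp) pair; distances are negated so a closer hypothesis earns a higher reward.

import textdistance

ref_hyp = ("karolin", "kathrin")
print(-textdistance.levenshtein(*ref_hyp))   # -3
print(textdistance.JaroWinkler()(*ref_hyp))  # similarity in [0, 1], higher is closer
print(-textdistance.hamming(*ref_hyp))       # -3 for these equal-length strings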
Example #27
        print(item + ' --vs-- ' + contentTwo[count])
        posOne = pos_tagging(item)
        posTwo = pos_tagging(contentTwo[count])
        for x, y in zip(posOne, posTwo):
            sentOne = []
            sentTwo = []
            for pair in x:
                sentOne.append(str(pair[1]))
            for pair in y:
                sentTwo.append(str(pair[1]))
            # skip fragments whose POS sequence is just a single bare noun
            if sentOne == ['NN']:
                continue

            print(str(sentOne), str(sentTwo))
            print("hamming: ")
            ham = textdistance.hamming(str(sentOne), str(sentTwo))
            totalHamming += ham
            print(ham)
            print("cosine:  ")
            cos = textdistance.cosine(str(sentOne), str(sentTwo))
            totalCosine += cos
            print(cos)
            print("gotoh:  ")
            got = textdistance.gotoh(str(sentOne), str(sentTwo))
            totalGotoh += got
            print(got)
            print("levenshtein: ")
            lev = textdistance.levenshtein(str(sentOne), str(sentTwo))
            totalLev += lev
            print(lev)
            print('\n')
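A hedged side note on the str() comparisons above: str(sentOne) compares the printed list, brackets and quotes included, which can shift the alignment and inflate distances; joining the tags compares only the tags themselves.

import textdistance

tags_a, tags_b = ['NN', 'VB'], ['NNS', 'VB']
print(textdistance.hamming(str(tags_a), str(tags_b)))            # 9: punctuation shifts everything after 'NN'
print(textdistance.hamming(' '.join(tags_a), ' '.join(tags_b)))  # 4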