Example #1
def gen_corrections(only_last, uncorrected_string):
    global transcript_corrections, expected_word, current_word_number_temporary_offset
    if not only_last:
        current_word_number_temporary_offset = -1  #-1?
    words = uncorrected_string.split(' ')
    if only_last:
        words_to_check = [len(words) - 1]
    else:
        words_to_check = range(len(words))

    for i in words_to_check:
        set_expected_word()
        #current_word_number_temporary_offset +=1
        found_correction = False  # initial value
        # Perform a phonetic comparison
        if jellyfish.match_rating_comparison(words[i], expected_word):
            found_correction = True  # if it's close, record the correction to be made

        if found_correction:
            # Create a correction object describing the replacement
            o = correction()
            o.expected_index = i
            o.old_string = words[i]
            o.new_string = expected_word
            # Add it to the list of corrections
            transcript_corrections.append(o)
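The correction type isn't defined in this snippet; a minimal sketch of what it could look like, with attribute names inferred from the usage above (hypothetical, not the project's actual class):

import dataclasses

@dataclasses.dataclass
class correction:
    # Hypothetical container; fields match the attributes assigned above.
    expected_index: int = 0
    old_string: str = ''
    new_string: str = ''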
Example #2
def simple_example():
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
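Note that jaro_distance/jaro_winkler and jaro_similarity/jaro_winkler_similarity appear side by side above: the *_similarity names are the newer spellings, and the older names were kept for a while as deprecated aliases before being removed. A quick sanity check, assuming a jellyfish version that still ships both spellings:

import jellyfish

s1, s2 = u'jellyfish', u'smellyfish'
# Both spellings should agree on versions where the deprecated aliases still exist.
assert jellyfish.jaro_distance(s1, s2) == jellyfish.jaro_similarity(s1, s2)
assert jellyfish.jaro_winkler(s1, s2) == jellyfish.jaro_winkler_similarity(s1, s2)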
Example #3
File: NoiseMaker.py Project: ufal/npfl087
    def noisify_word(self, word):
        # normalize the word
        w1 = word.strip().strip(".").strip().strip(",").strip().strip(
            "\"").strip().strip(",").strip()

        # TODO
        # we are doing naive substitution;
        # representing words in terms of features would be much better.
        # two alternative improvement options:
        #   v1: create & store metaphone/soundex codes and compare with a suitable algorithm (match_rating_comparison)
        #   v2: create & store features which represent sounds... find in space using kNN / cos_sim ...
        if w1 not in self.word_wID:
            return word

        # do the substitution with the given probability & thresholds
        else:
            # first obtain all similar words
            wID = self.word_wID[w1]
            lst_of_similar_words = []
            lst_setID = self.wID_setID_lst[
                wID]  # a word can be part of multiple sets (homonymy is not transitive)
            for setID in lst_setID:
                wID_list = self.setID_wID_lst[setID]
                for wid in wID_list:
                    w = self.wID_word[wid]
                    lst_of_similar_words.append(w)

            # next calculate some suitable score with all of them
            similar = []
            for w2 in lst_of_similar_words:
                if w1 == w2:  # skip identical words [just a safety check ... should not happen]
                    continue
                elif jellyfish.match_rating_comparison(
                        w1, w2):  # must be phonetically similar
                    score = jellyfish.jaro_winkler(w1, w2)
                    if score > self.threshold:  # and score must be higher than threshold
                        similar.append((w2, score))

            if len(similar) == 0:
                return word

            # and sort them by this score
            similar = sorted(similar, key=lambda tup: tup[1], reverse=True)

            # sample those within the threshold & top_k range
            # ... todo

            # finally, with given probability
            if random.uniform(0, 1) >= 1 - self.prob:
                idx = random.randint(0, len(similar) - 1)
                w2, _ = similar[idx]
                print("returning w2")
                return w2

        # return the original word if no substitution was made ...
        print("returning original")
        return word
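The TODO's "v1" idea (precompute phonetic codes instead of comparing naively) can be sketched roughly as follows; the function names and the plain vocabulary list are assumptions for illustration, not part of the project:

import collections

import jellyfish

def build_phonetic_index(vocabulary):
    # Bucket words by their metaphone code so candidate lookup is a dict hit
    # instead of a scan over the whole vocabulary.
    index = collections.defaultdict(list)
    for w in vocabulary:
        index[jellyfish.metaphone(w)].append(w)
    return index

def phonetic_candidates(word, index):
    # Words sharing a metaphone code are re-checked with the stricter
    # match_rating_comparison, mirroring the filter used above.
    return [w for w in index.get(jellyfish.metaphone(word), [])
            if w != word and jellyfish.match_rating_comparison(word, w)]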
Example #4
    def test_match_rating_comparison(self):
        cases = [("Bryne", "Boern", True),
                 ("Smith", "Smyth", True),
                 ("Catherine", "Kathryn", True),
                 ("Michael", "Mike", False),
                 ]

        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value)
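match_rating_comparison works on the two strings' Match Rating codexes, so printing the codexes next to the boolean makes the test cases easier to follow; a small illustrative loop:

import jellyfish

for s1, s2 in [("Bryne", "Boern"), ("Smith", "Smyth"),
               ("Catherine", "Kathryn"), ("Michael", "Mike")]:
    print(s1, s2,
          jellyfish.match_rating_codex(s1),
          jellyfish.match_rating_codex(s2),
          jellyfish.match_rating_comparison(s1, s2))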
Example #5
    def get_name_similarity(self, candidate):
        import jellyfish

        return {
            # Phonetic distance
            'mra': jellyfish.match_rating_comparison(self.app_name.replace(' ', ''),
                                                     candidate.app_name.replace(' ', '')),
            # String distance
            'jaro': jellyfish.jaro_winkler(self.app_name, candidate.app_name),
        }
Example #6
def determine_matching_stats(string1: str, string2: str) -> MatchingStats:
    """Determines the different distances between two strings."""
    return MatchingStats(
        string1=string1,
        string2=string2,
        damerau_levenshtein_distance=jellyfish.damerau_levenshtein_distance(
            string1, string2),
        jaro_winkler_distance=jellyfish.jaro_winkler(string1, string2),
        match_rating_approach_comparison=jellyfish.match_rating_comparison(
            string1, string2),
        exact_match=string1.strip().lower() == string2.strip().lower(),
    )
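MatchingStats itself isn't shown in this snippet; a minimal sketch consistent with the fields used above (the field types are assumptions):

from typing import NamedTuple, Optional

class MatchingStats(NamedTuple):
    string1: str
    string2: str
    damerau_levenshtein_distance: int
    jaro_winkler_distance: float
    match_rating_approach_comparison: Optional[bool]  # None when MRA is undefined
    exact_match: bool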
Example #7
def mra_1_to_all(word, all_words, threshold):
    similar_list = []
    for j, w2 in enumerate(all_words):
        if word == w2:  # skip -- same word
            continue

        # Must be similar according to Match Rating Comparison (similarity on MRA codexes)
        if jellyfish.match_rating_comparison(word, w2):
            # And the Jaro-Winkler score must also meet the threshold
            if jellyfish.jaro_winkler_similarity(word, w2) >= threshold:
                similar_list.append(w2)

    return similar_list
Example #8
def getSimilarity(str1, str2):
    distance = {}
    if distance_metric1 == "JaroWinkler":
        distance[distance_metric1] = jellyfish.jaro_winkler(str1, str2)
    if distance_metric2 == "Jaro":
        distance[distance_metric2] = jellyfish.jaro_distance(str1, str2)
    if distance_metric3 == "MatchRating":
        distance[distance_metric3] = jellyfish.match_rating_comparison(
            str1, str2)
    if distance_metric4 == "Levenshtein":
        distance[distance_metric4] = jellyfish.levenshtein_distance(str1, str2)
    if distance_metric5 == "Hamming":
        distance[distance_metric5] = jellyfish.hamming_distance(str1, str2)
    return distance
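The distance_metric* names are module-level globals defined elsewhere in that project; a hedged usage sketch with assumed values:

distance_metric1 = "JaroWinkler"
distance_metric2 = "Jaro"
distance_metric3 = "MatchRating"
distance_metric4 = "Levenshtein"
distance_metric5 = "Hamming"

print(getSimilarity(u'jellyfish', u'smellyfish'))
# -> a dict keyed by the metric names above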
Example #9
def mrc():
    # english  -----------------------------
    tokens = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp', 'pmp'
    ]

    print('Running Match Rating Codex (EN)...')

    # print tokens
    print('Tokens: ', end='')
    for i in tokens:
        print(i, ' | ', end='')

    # print codes
    print('\n', end="")
    print('Codes: ', end='')
    for i in tokens:
        print(jellyfish.match_rating_codex(i), ' | ', end='')

    # print string match comparisons
    print('\n', end="")
    print('Comparisons: ', end='')
    print('Ball Bearing, bll brng: ',
          jellyfish.match_rating_comparison('Ball Bearing', 'bll brng'))
    print('Centrifugal, centrifigal: ',
          jellyfish.match_rating_comparison('Centrifugal', 'centrifigal'))
    print('PUmp, pmp: ', jellyfish.match_rating_comparison('PUmp', 'pmp'))

    # german  -----------------------------
    tokens = [
        'Kugellager', 'kugelagr', 'Zentrifugal', 'zentrifkl', 'PUmpe', 'pmp'
    ]

    print('\n\nRunning Match Rating Codex Comparison (DE)...')

    # print tokens
    print('Tokens: ', end='')
    for i in tokens:
        print(i, ' | ', end='')

    # print codes
    print('\n', end="")
    print('Codes: ', end='')
    for i in tokens:
        print(jellyfish.match_rating_codex(i), ' | ', end='')

    # print string match comparisons
    print('\n', end="")
    print('Comparisons: ', end='')
    print('Kugellager,  kugelagr: ',
          jellyfish.match_rating_comparison('Kugellager', 'kugelagr'))
    print('Zentrifugal, zentrifkl: ',
          jellyfish.match_rating_comparison('Zentrifugal', 'zentrifkl'))
    print('PUmpe, pmp: ', jellyfish.match_rating_comparison('PUmpe', 'pmp'))
Example #10
File: views.py Project: dbarlett/namespect
def distance(string_1, string_2):
    """Compute the edit distance between two strings.
    """
    return jsonify({
        "levenshtein": jellyfish.levenshtein_distance(string_1, string_2),
        "damerau-levenshtein": jellyfish.damerau_levenshtein_distance(
            string_1,
            string_2
        ),
        "jaro": jellyfish.jaro_distance(string_1, string_2),
        "jaro-winkler": jellyfish.jaro_winkler(string_1, string_2),
        "match_rating_codex": jellyfish.match_rating_comparison(
            string_1,
            string_2
        ),
        "sift3": pymailcheck.sift3_distance(string_1, string_2),
    })
Example #11
def _word_similarity_score(a, b):
    if a == b:
        return 1.

    # Case- and whitespace-insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95

    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
        (not _isspace(a) and _isspace(b))):
        return 0

    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Both punctuation: near match.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    # Penalize punctuation matching to non-punctuation.
    if ((_ispunc(a) and not _ispunc(b)) or
        (not _ispunc(a) and _ispunc(b))):
        return 0

    # Problems with phonetic match functions segfaulting on
    # empty strings. Also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([ c for c in a if c.isalpha() ])
    b_alpha = u''.join([ c for c in b if c.isalpha() ])
    if a_alpha == '' and b_alpha == '':
        return 0.85

    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9

    # Fall back to scaled Jaro-Winkler similarity.
    return jf.jaro_winkler(a, b)
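The underscore-prefixed helpers aren't shown in this snippet; plausible minimal versions consistent with how they're used above (behavior assumed, not the author's code):

def _isspace(s):
    # True for empty or all-whitespace tokens.
    return s.strip() == ''

def _ispunc(s):
    # True when the token is non-empty and contains no alphanumeric characters.
    return s != '' and not any(c.isalnum() for c in s)

def _match_ampersand(a, b):
    # Treat '&' and 'and' as near-equivalent tokens.
    return {a.lower().strip(), b.lower().strip()} == {'&', 'and'}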
Example #12
def gen_corrections(only_last, uncorrected_string):
    global transcript_corrections, expected_word, current_word_number_temporary_offset, transcript_variations, transcript_variations_temporary
    #if not only_last:
    #current_word_number_temporary_offset = -1 #-1?
    words = uncorrected_string.split(' ')
    words = [w for w in words if w != '']

    print("Gen corrections: {}".format(words))

    if only_last:
        words_to_check = [-1]
    else:
        words_to_check = range(len(words))

    for i in words_to_check:
        set_expected_word(current_word_number + i)
        #current_word_number_temporary_offset +=1
        found_correction = False  # initial value
        found_variation = False  # initial value
        # Check if it's the same as the expected word
        if words[i] == expected_word:
            pass
        elif (jellyfish.match_rating_comparison(words[i], expected_word)):
            found_correction = True  # if it's close, record the correction to be made
        else:
            found_variation = True

        if found_correction or found_variation:
            o = correction()
            o.expected_index = current_word_number + i
            o.old_string = words[i]
            o.new_string = expected_word

            if found_correction:
                transcript_corrections.append(o)

            if found_variation:
                if only_last:
                    transcript_variations_temporary.append(o)
                else:
                    transcript_variations.append(o)
Example #13
 def string_comparison(self, text1, text2, choice='levenshtein_distance'):
     '''
     text1: String Input 1
     text2: String Input 2
     choice: 'levenshtein_distance' or 'damerau_levenshtein_distance' or 'hamming_distance' or 'jaro_distance' or 'jaro_winkler' or 'match_rating_comparison'
     '''
     # https://jellyfish.readthedocs.io/en/latest/comparison.html
     if choice == 'levenshtein_distance':
         return jellyfish.levenshtein_distance(text1, text2)
     elif choice == 'damerau_levenshtein_distance':
         return jellyfish.damerau_levenshtein_distance(text1, text2)
     elif choice == 'hamming_distance':
         return jellyfish.hamming_distance(text1, text2)
     elif choice == 'jaro_distance':
         return jellyfish.jaro_distance(text1, text2)
     elif choice == 'jaro_winkler':
         return jellyfish.jaro_winkler(text1, text2)
     elif choice == 'match_rating_comparison':
         return jellyfish.match_rating_comparison(text1, text2)
     else:
         print("Wrong Choice")
Example #14
File: homz.py Project: juckylv/homz
def generate_data(words):
    word_dict = {}
    for i, w1 in enumerate(words):
        matching_words = []
        for j, w2 in enumerate(words):
            if w1 == w2:
                continue
            else:
                is_match_rating_true = jellyfish.match_rating_comparison(
                    w1, w2)
                if is_match_rating_true:
                    jaro_winkler_score = jellyfish.jaro_winkler(w1, w2)
                    if jaro_winkler_score > 0.0:
                        word_score = (w2, jaro_winkler_score)
                        matching_words.append(word_score)
                if j == len(words) - 1:
                    sorted_matching_words = sorted(matching_words,
                                                   key=lambda tup: tup[1],
                                                   reverse=True)
                    word_dict[w1] = sorted_matching_words
    return word_dict
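Note a subtlety above: when w1 is the last element, the continue on w1 == w2 fires on the final inner iteration, so the j == len(words) - 1 branch never runs and the last word gets no entry in word_dict. Hoisting the sort out of the inner loop avoids this; an equivalent sketch (dropping the effectively always-true score > 0.0 guard):

import jellyfish

def generate_data(words):
    word_dict = {}
    for w1 in words:
        matching_words = [
            (w2, jellyfish.jaro_winkler(w1, w2))
            for w2 in words
            if w1 != w2 and jellyfish.match_rating_comparison(w1, w2)
        ]
        word_dict[w1] = sorted(matching_words, key=lambda tup: tup[1],
                               reverse=True)
    return word_dict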
Example #15
def calcula_similaridade(documents):
    """
    documents = ["The legal system is made up of civil courts, criminal courts and specialty courts such as family law courts and bankruptcy court. Each court has its own jurisdiction, which refers to the cases that the court is allowed to hear. In some instances, a case can only be heard in one type of court. For example, a bankruptcy case must be heard in a bankruptcy court. In other instances, there may be several potential courts with jurisdiction. For example, a federal criminal court and a state criminal court would each have jurisdiction over a crime that is a federal drug offense but that is also an offense on the state level.",
      "The legal system is comprised of criminal and civil courts and specialty courts like bankruptcy and family law courts. Every one of the courts is vested with its own jurisdiction. Jurisdiction means the types of cases each court is permitted to rule on. Sometimes, only one type of court can hear a particular case. For instance, bankruptcy cases an be ruled on only in bankruptcy court. In other situations, it is possible for more than one court to have jurisdiction. For instance, both a state and federal criminal court could have authority over a criminal case that is illegal under federal and state drug laws.",
      "In many jurisdictions the judicial branch has the power to change laws through the process of judicial review. Courts with judicial review power may annul the laws and rules of the state when it finds them incompatible with a higher norm, such as primary legislation, the provisions of the constitution or international law. Judges constitute a critical force for interpretation and implementation of a constitution, thus de facto in common law countries creating the body of constitutional law."]
    """  
    shingles = []
    # handle documents one by one
    # builds a list of sets, each comprising k-word shingle strings
    for doc in documents:
        # makes a set of tokens
        # sh = set([' ', ..., ' '])
        sh = make_a_set_of_tokens(doc)

        # shingles : list of sets (sh)
        shingles.append(sh)
        
    # print("shingles=%s") %(shingles)
    
    combinations = list( itertools.combinations([x for x in range(len(shingles))], 2) )
    #print("combinations=",combinations)

    # compare each pair in combinations tuple of shingles
    for c in combinations:
        i1 = c[0]
        i2 = c[1]
        jac = jaccard_set(shingles[i1], shingles[i2])
        #print(c,": jaccard=", jac)

    # Comparison of the whole document (without tokenizing)
    N = len(documents)

    mtx_lv=numpy.empty((N,N,))
    mtx_lv[:]=numpy.nan
    mtx_jd=numpy.empty((N,N,))
    mtx_jd[:]=numpy.nan
    mtx_dlv=numpy.empty((N,N,))
    mtx_dlv[:]=numpy.nan
    mtx_jw=numpy.empty((N,N,))
    mtx_jw[:]=numpy.nan
    mtx_hd=numpy.empty((N,N,))
    mtx_hd[:]=numpy.nan
    mtx_mr=numpy.empty((N,N,))
    mtx_mr[:]=numpy.nan
    mtx_fuz=numpy.empty((N,N,))
    mtx_fuz[:]=numpy.nan
    

    comb = list(itertools.combinations([x for x in range(len(documents))],2))
    #print("comb=", comb)
    for d in comb:
        i1 = d[0]
        i2 = d[1]
        #lv = jellyfish.levenshtein_distance(documents[i1],documents[i2])
        mtx_lv[d[0]][d[1]]=jellyfish.levenshtein_distance(documents[i1],documents[i2])
        mtx_jd[d[0]][d[1]]=jellyfish.jaro_distance(documents[i1],documents[i2]) 
        mtx_dlv[d[0]][d[1]]=jellyfish.damerau_levenshtein_distance(documents[i1],documents[i2])
        mtx_jw[d[0]][d[1]]=jellyfish.jaro_winkler(documents[i1],documents[i2]) 
        mtx_hd[d[0]][d[1]]=jellyfish.hamming_distance(documents[i1],documents[i2]) 
        mtx_mr[d[0]][d[1]]=jellyfish.match_rating_comparison(documents[i1],documents[i2]) 
        mtx_fuz[d[0]][d[1]]= fuzz.ratio(documents[i1],documents[i2])
        """
        print("\n\nlv dist",d,":\t\t",lv )
        jd = jellyfish.jaro_distance(documents[i1],documents[i2])
        print("jaro dist",d,":\t\t",jd )
        dlv = jellyfish.damerau_levenshtein_distance(documents[i1],documents[i2])
        print("damerau dist",d,":\t\t",dlv )
        jw = jellyfish.jaro_winkler(documents[i1],documents[i2]) 
        print("jaro_wink dist",d,":\t\t",jw )
        hd = jellyfish.hamming_distance(documents[i1],documents[i2]) 
        print("hamming dist",d,":\t\t",hd )
        mr = jellyfish.match_rating_comparison(documents[i1],documents[i2]) 
        print("match_rat dist",d,":\t\t",mr )
        fuz = fuzz.ratio(documents[i1],documents[i2])
        print("fuzzy dist",d,":\t\t",fuz)
        """
    print("levenshtein_distance\n",mtx_lv)
    print("\n\njaro distance\n",mtx_jd)
    print("\n\ndemerau levenshtein distance\n",mtx_dlv)
    print("\n\njaro winkler\n",mtx_jw)
    print("\n\nhamming distance\n",mtx_hd)
    print("\n\nmatch_rating\n",mtx_mr)
    print("\n\nfuzz.ratio\n",mtx_fuz)
    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt
    
    mtx_lv=numpy.tril(mtx_lv.T,1) # transforms the upper-triangular matrix into a lower-triangular one for plotting

    sns.set(style="white")

    mask = numpy.zeros_like(mtx_lv, dtype=bool)
    mask[numpy.triu_indices_from(mask)] = True

    fig,ax = plt.subplots(figsize=(10,10))
    ax.set_title("levenshtein_distance")


    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(mtx_lv, mask=mask, cmap=cmap, vmax=500, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.show()         
Example #16
def match_rating_comparison(s1, s2):
    return jellyfish.match_rating_comparison(s1, s2)  # returns True/False, or None when not comparable
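For reference, match_rating_comparison returns a single boolean, and None when the two strings aren't comparable under the Match Rating Approach (roughly, when their codexes are too different in length); a quick check:

import jellyfish

print(jellyfish.match_rating_comparison(u'Smith', u'Smyth'))   # True
print(jellyfish.match_rating_comparison(u'a', u'abcdefghij'))  # None: not comparable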
Example #17
 def match_rating_comparison(x, y):
     """The Match Rating comparison score of the
     Jellyfish package
     """
     return 100 if jf.match_rating_comparison(x, y) else 0
Example #18
sim_arry4 = [
    1.0 -
    jellyfish.damerau_levenshtein_distance(string[0], s) /
    ((len(string[0]) + len(s)) / 2.0) for s in string
]
print('damerau', sim_arry4)
sim_arry5 = [
    jellyfish.jaro_distance(string[0], s) for s in string
]
print('jaro', sim_arry5)
sim_arry6 = [
    jellyfish.jaro_winkler(string[0], s) for s in string
]
print('jaro winkler', sim_arry6)
sim_arry7 = [
    jellyfish.match_rating_comparison(string[0], s)
    for s in string
]
print('match rating comparison', sim_arry7)
# tokens = word_tokenize([string])
# print(string_token)
# print tfidf_matrix

# print(y.toarray()
ngram_array = [word_grams(s.split(' ')) for s in string]
# print ngram_array
n = NGram()
# print list(n.split(string[0]))
ngram_array = [list(n.split(s)) for s in string]
# print ngram_array
sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string]
Example #19
def main():
    # declare test strings
    # note: jellyfish operates on Unicode strings (hence the u prefix used in its docs)
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'
    
    
    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')
    
    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)
    
    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)
    
    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)

    # Match Rating Codex    
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)
    
    
    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)
    
    
    # test String Comparison
    print('\nString Comparisons ---------------------------')
    
    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)

    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)
    
    # Hamming Distance
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)

    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)

    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)
    
    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)
     
        
    # end program
    print('Done.')
Example #20
 def test_match_rating_comparison_segfault(self):
     import hashlib
     sha1s = [hashlib.sha1(str(v).encode()).hexdigest() for v in range(100)]
     # this segfaulted on 0.1.2
     r = [[jellyfish.match_rating_comparison(h1, h2) for h1 in sha1s]
          for h2 in sha1s]
Example #21
def match_rating_comparison(d1, d2):
    return jellyfish.match_rating_comparison(d1, d2)
Example #22
import pandas as pd
import jellyfish

importer_list = pd.read_csv(
    r'C:\Users\S\PycharmProjects\CompanyNames\HMRC\importsNames.csv')

importer_names = importer_list[['NAME']].drop_duplicates()

# sample_df = pd.read_csv(r'C:\Users\S\PycharmProjects\CompanyNames\data\raw\company_names.csv')
#
#
# x= pd.merge(sample_df,importer_names,how='inner',left_on = ['CompanyName'],right_on=['NAME'])
# x=x[['NAME']].sample(100)
# x.to_csv('matched.csv',index=None )

x = pd.read_csv(r'./HMRC/matched.csv')
y = x['NAME'][0]

z = [[i, jellyfish.jaro_similarity(i, y)] for i in x['NAME'] if y != i]
z3 = [[i, jellyfish.match_rating_comparison(i, y)] for i in x['NAME']
      if y != i]
z2 = pd.DataFrame(z)
Example #23
def match_rating_comparison(s1, s2):
    return None if s1 is None or s2 is None else J.match_rating_comparison(
        s1, s2)
Example #24
def match_rating(query, template):
    return jellyfish.match_rating_comparison(query, template)
Example #25
def getwikidatacity(_step, list_wikidataid, ne_fid, ne_xid, ne_lon, ne_lat, ne_wikidataid, ne_name ,ne_namealt ,ne_adm0name,ne_adm1name,ne_ls_name,ne_geonameid, ne_scalerank,ne_labelrank,ne_natscale):

    query_template="""
        PREFIX geo: <http://www.opengis.net/ont/geosparql#>
        SELECT
            ?place
            ?placeLabel
            ?placeDescription
            (group_concat(distinct  ?pLabel       ; separator = "#")        as ?type_grp)
            (group_concat(distinct  ?placeLabelru ; separator = "#")        as ?placeLabelru)               
            (group_concat(distinct  ?sitelink_en  ; separator = "#")        as ?sitelink_en)
            (group_concat(distinct  ?sitelink_es  ; separator = "#")        as ?sitelink_es)  
            (group_concat(distinct  ?sitelink_ru  ; separator = "#")        as ?sitelink_ru)   
            (group_concat(distinct  ?sitelink_zh  ; separator = "#")        as ?sitelink_zh)                                    
            (group_concat(distinct  ?sitelink_ceb ; separator = "#")        as ?sitelink_ceb)
            (group_concat(distinct  ?countryLabelx; separator = "#")        as ?countryLabel)
            (SAMPLE(?sistercity)                                            as ?sistercity_sample)
            (AVG(?distance)                                                 as ?distance   )
            (MAX(?population)                                               as ?max_population )
            (group_concat(distinct ?place_alternative ; separator = "#")    as ?place_alternative_grp)
            (group_concat(distinct ?GeoNames_ID       ; separator = "#")    as ?GeoNames_ID_grp)
        WITH {
            SELECT DISTINCT ?place ?distance {

                    #S1#     ?place p:P31/ps:P31  wd:Q515.

                    #S2#     ?place p:P31/ps:P31  wd:Q3957.

                    #S3#           {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q532.     }
                    #S3#     UNION {?place  p:P31/ps:P31              wd:Q532.     }
                    #S3#     UNION {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q15078955.}
                    #S3#     UNION {?place  p:P31/ps:P31              wd:Q15078955.}
                    #S3#     UNION {
                    #S3#      ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }
                    #S3#     UNION {
                    #S3#      ?place p:P31/ps:P31  wd:Q486972.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }
                    #S3#     UNION {
                    #S3#      ?place p:P31/ps:P31/wdt:P279*  wd:Q486972.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }

                    #S4#            {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q2039348. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q2039348. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1867183. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1867183. }
                    #S4#     UNION  {?place wdt:P1376     ?admin_ara.               }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1637706. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1637706. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q16861602.}
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q16861602.}
                    #S4#     UNION  {?place p:P31/ps:P31  wd:Q188509.  ?place p:P17/ps:P17  wd:Q408. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1070990. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1070990. }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q748149.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q748149.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q735428.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q735428.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q318727.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q318727.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q532.     }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q15078955.}
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q498162.  }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3389680. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1639634. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1639634. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2112349. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q749622.  }

                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q11618417.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q11618417. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q640364.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q640364. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2555896.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q2555896. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q109108.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q109108. }



                    #S5#            {?place p:P31/ps:P31/wdt:P279*     wd:Q1763214.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1763214. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1840161.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1840161. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q4249901.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q4249901. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685463.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3685463. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q12081657.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q12081657. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q27676416.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q27676416. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3076994.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3076994. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3360771.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3360771. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685463.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3685463. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q605291.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q605291. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1539014.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1539014. }


  

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q7830262.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q7830262. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3327862.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3327862. }


                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q956318.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q956318. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q155239.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q155239. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q27676428.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q27676428. }

                    #S5#     UNION  {?place p:P31/ps:P31  wd:Q5084.  ?place p:P17/ps:P17  wd:Q16. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q17305746.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q17305746. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q14762300.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q14762300. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q17366755.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q17366755. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3327873.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3327873. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3788231.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3788231. }

            # --- S6 -------------------

                    #S6#            {?place p:P31/ps:P31/wdt:P279*     wd:Q6609799.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q6609799. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685430.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q3685430. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2679157.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2679157. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2989470.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2989470. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q6593035.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q6593035. }


                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q43742.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q43742. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q83020.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q83020. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2706302.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2706302. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q482821.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q482821. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2225003.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2225003. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q133442.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q133442. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1500350.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1500350. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q16725943. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q16725943. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q9316670. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q9316670. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1065118. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1065118. }
                     
                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1289426. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1289426. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1336099. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1336099. }
                                   
                    #S6#     {
                    #S6#      ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#     # FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }
                    #S6#     UNION {
                    #S6#      ?place p:P31/ps:P31  wd:Q486972.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#      #FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }
                    #S6#     UNION {
                    #S6#      ?place p:P31/ps:P31/wdt:P279*  wd:Q486972.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#      #FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }

                    #S7#     FILTER EXISTS { ?place wdt:P190 ?sistercity_x.}

                    #S8#     VALUES ?GeoNames_ID {"3383494"}
                    #S8#     ?place wdt:P1566 ?GeoNames_ID.

                    #S9#      VALUES ?searchnames {"#ne_name#"@en "#ne_name#"@es "#ne_name#"@sv 
                    #S9#                           "#ne_name#"@de "#ne_name#"@fr "#ne_name#"@pt 
                    #S9#                           "#ne_name#"@it "#ne_name#"@da "#ne_name#"@pl
                    #S9#                           "#ne_name#"@cz "#ne_name#"@sk "#ne_name#"@hu
                    #S9#                           "#ne_name#"@lt "#ne_name#"@et "#ne_name#"@lv                    
                    #S9#                           "#ne_name#"@no "#ne_name#"@nl "#ne_name#"@fi  }  
                    #S9#      ?place rdfs:label ?searchnames .

                    SERVICE wikibase:around {     # "#ne_name#" , "#ne_adm0name#"
                        ?place wdt:P625 ?location.
                        bd:serviceParam wikibase:center "Point(16.373064 48.20833)"^^geo:wktLiteral.
                        bd:serviceParam wikibase:radius "#distance#".
                        bd:serviceParam wikibase:distance ?distance.
                    }
                }
            } AS %places
            WHERE {
            INCLUDE %places .
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
            OPTIONAL {?place rdfs:label ?placeLabelru FILTER (lang(?placeLabelru)="ru").}
            OPTIONAL {?place wdt:P31 ?property. ?property rdfs:label ?pLabel FILTER (lang(?pLabel)="en").}
            OPTIONAL {?place wdt:P17 ?country. ?country rdfs:label ?countryLabelx FILTER (lang(?countryLabelx)="en").}
            OPTIONAL {?place wdt:P17       ?country.}
            OPTIONAL {?place wdt:P1566     ?GeoNames_ID.}
            OPTIONAL {?place wdt:P190      ?sistercity.}
            OPTIONAL {?place wdt:P1082     ?population.}
            OPTIONAL {?sitelink_en  schema:about ?place . ?sitelink_en schema:isPartOf  <https://en.wikipedia.org/>.}
            OPTIONAL {?sitelink_es  schema:about ?place . ?sitelink_es schema:isPartOf  <https://es.wikipedia.org/>.}  
            OPTIONAL {?sitelink_ru  schema:about ?place . ?sitelink_ru schema:isPartOf  <https://ru.wikipedia.org/>.}   
            OPTIONAL {?sitelink_zh  schema:about ?place . ?sitelink_zh schema:isPartOf  <https://zh.wikipedia.org/>.}                                  
            OPTIONAL {?sitelink_ceb schema:about ?place . ?sitelink_ceb schema:isPartOf <https://ceb.wikipedia.org/>.}
            OPTIONAL {?place skos:altLabel ?place_alternative   FILTER((LANG(?place_alternative)) = "en").}
        }
        GROUP BY ?place ?placeLabel   ?placeDescription
        ORDER BY ?distance
    """

    q=query_template.replace('16.373064',ne_lon).replace('48.20833',ne_lat)
    q=q.replace('#ne_name#',ne_name).replace('#ne_adm0name#',ne_adm0name)
    q=q.replace('"3383494"','"'+ne_geonameid+'"')

    if 1 <= _step <= 9:
        q=q.replace('#S{}#'.format(_step),'')
    else:
        print("Internal error, _step: ", _step )
        sys.exit(1)



    # search radius per step; the tighter radii apply roughly to Europe (lon -10..60, lat > 30)
    if  ( -10 <=  float(ne_lon) <= 60)  and  (  float(ne_lat) >30  ):
        step_distances={1:50, 2:50, 3:50, 4:50, 5:50, 6:50, 7:50, 8:1200, 9:100}
    else:
        step_distances={1:150, 2:150, 3:120, 4:100, 5:100, 6:100, 7:100, 8:1200, 9:100}
    search_distance=step_distances.get(_step, 0)


    print("_step:",_step , "    search_distance=", search_distance)


    # remove double spaces
    while '  ' in q:
        q = q.replace('  ', ' ')

    # remove comments
    qs=''
    for line in q.splitlines():
        if len(line)>0 and line[:2] != ' #'  and  line[:2] != '#S' :
            qs+=line+'\n'
    q=qs

    ts = datetime.datetime.now()

    max_score=-1000

    results = None
    retries = 0
    max_retries=14
    while results is None and retries < max_retries:
        try:

            results = None

            sleeptime= retries*10 + 5

            qs=q.replace('#distance#', str(search_distance) )
            print("distance-ok")
            if retries > 0:
                print("Try - retries:",retries,"   Distance:",search_distance," Sleeptime:",sleeptime)
            if args.filter_name!='':
                print(qs)
            sparql.setQuery(qs)
            sparql.setTimeout(2000)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()

        except SPARQLExceptions.EndPointNotFound as e:
            print("ERRwikidata-SPARQLExceptions-EndPointNotFound:  Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except SPARQLExceptions.EndPointInternalError as e:
            print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            # Decrease search distance
            if retries > 3:
                search_distance=int( search_distance*0.9)
            continue

        except TimeoutError:
            print("ERRwikidata-SPARQLExceptions  TimeOut : Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except SPARQLExceptions.QueryBadFormed as e:
            print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check!  "  ,  flush=True )
            return "error"

        except HTTPError as e:
            print("ERRwikidata: Got an HTTPError while querying. Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except Exception:
            print("ERRwikidata: other error. Retrying in (seconds) : ",sleeptime,  flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue


    if results is None and retries >= max_retries:
        print("Wikidata request failed; system stopped!")
        sys.exit(1)

    _runtime=   (datetime.datetime.now() - ts).total_seconds()



    rc_list_wikidataid=[]
#TODO empty answer ..

    for result in results['results']['bindings']:

        _score=0

        wd_id = result['place']['value'].split('/')[4]


        wd_distance = float( result['distance']['value'] )

        if 'placeLabel' in result:
            wd_label = result['placeLabel']['value']
        else:
            wd_label = ''



        # Skip places that were already queried
        if wd_id in list_wikidataid:
            print("Already exist:", wd_id, wd_label)
            continue
        else:
            rc_list_wikidataid.append(wd_id)

        if 'placeLabelru' in result:
            wd_label_ru = result['placeLabelru']['value']
        else:
            wd_label_ru = ''


        if 'placeDescription' in result:
            wd_description = result['placeDescription']['value']
        else:
            wd_description = ''

        if 'type_grp' in result:
            wd_type = "#"+result['type_grp']['value']+"#"
        else:
            wd_type = ''

        if 'countryLabel' in result:
            wd_countrylabel = result['countryLabel']['value']

            cldiff=  - ( 20 -  ( 20 * Levenshtein.jaro_winkler( unidecode.unidecode(ne_adm0name) ,  unidecode.unidecode(wd_countrylabel) )   ) )
            #print( cldiff, ne_adm0name, wd_countrylabel )
            _score+= cldiff

        else:
            wd_countrylabel =''


        if 'sitelink_en' in result:
            wd_sitelink_en = result['sitelink_en']['value']
        else:
            wd_sitelink_en=''


        if wd_sitelink_en != '':
            _score+=   40
        else:
            _score+=  -120
            

        if 'sitelink_es' in result:
            wd_sitelink_es = result['sitelink_es']['value']
        else:
            wd_sitelink_es=''

        if 'sitelink_ru' in result:
            wd_sitelink_ru = result['sitelink_ru']['value']
        else:
            wd_sitelink_ru=''

        if 'sitelink_zh' in result:
            wd_sitelink_zh = result['sitelink_zh']['value']
        else:
            wd_sitelink_zh=''

        if 'sitelink_ceb' in result:
            wd_sitelink_ceb = result['sitelink_ceb']['value']
        else:
            wd_sitelink_ceb=''

        if wd_sitelink_en == '':
            if wd_sitelink_es != '':
                _score+= 100
            elif wd_sitelink_ru != '':
                _score+= 80
            elif wd_sitelink_zh != '':
                _score+= 60
            elif wd_sitelink_ceb != '':
                _score+=  -1000        # penalty for a ceb-only import

        if 'GeoNames_ID_grp' in result:
            wd_geonames_id_grp="#"+result['GeoNames_ID_grp']['value']+"#"
        else:
            wd_geonames_id_grp=''

        if 'max_population' in result:
            wd_max_population = result['max_population']['value']
            if wd_max_population!='':
                _score+=8
        else:
            wd_max_population=''

        if 'place_alternative_grp' in result:
            wd_place_alternative_grp="#"+result['place_alternative_grp']['value']+"#"
        else:
            wd_place_alternative_grp=''


        if ('#'+ne_name+'#' in wd_place_alternative_grp)  :
            _in_altnames='Y'
            _score+=72
        elif ('#'+unidecode.unidecode(ne_name)+'#' in unidecode.unidecode(wd_place_alternative_grp))  :
            _in_altnames='Y'
            _score+=58
        else:
            _in_altnames='N'

        wd_has_sistercity=""
        if ('sistercity_sample' in result):
            if result['sistercity_sample']['value'] !=  '':
                wd_has_sistercity="Y"
                _score+=15

        uni_ne_name=unidecode.unidecode(ne_name)
        uni_ne_ls_name=unidecode.unidecode(ne_ls_name)
        uni_ne_namealt=unidecode.unidecode(ne_namealt)
        uni_ne_adm0name=unidecode.unidecode(ne_adm0name)
        uni_ne_adm1name=unidecode.unidecode(ne_adm1name)

        uni_wd_name=unidecode.unidecode(wd_label)

        if wd_label==wd_id and wd_label_ru != '':    
            _lev_jaro_winkler_ru = Levenshtein.jaro_winkler( uni_ne_name, unidecode.unidecode(wd_label_ru))
        else:
            _lev_jaro_winkler_ru =  0

        _lev_ratio        = Levenshtein.ratio(uni_ne_name, uni_wd_name)
        _lev_distance     = Levenshtein.distance(uni_ne_name, uni_wd_name)
        _lev_jaro         = Levenshtein.jaro(uni_ne_name, uni_wd_name)

        _lev_jaro_winkler       = Levenshtein.jaro_winkler(uni_ne_name, uni_wd_name)
        _lev_jaro_winkler_ls    = Levenshtein.jaro_winkler(uni_ne_ls_name, uni_wd_name)
        _lev_jaro_winkler_alt   = Levenshtein.jaro_winkler(uni_ne_namealt, uni_wd_name)

        _lev_jaro_winkler_adm0  = Levenshtein.jaro_winkler(uni_ne_name+','+uni_ne_adm0name, uni_wd_name )
        _lev_jaro_winkler_adm1  = Levenshtein.jaro_winkler(uni_ne_name+','+uni_ne_adm1name, uni_wd_name )

        _max_lev_jaro_winkler = max(_lev_jaro_winkler,_lev_jaro_winkler_ls,_lev_jaro_winkler_alt,_lev_jaro_winkler_adm0,_lev_jaro_winkler_adm1, _lev_jaro_winkler_ru)

        _match_rating_comparison     = jellyfish.match_rating_comparison(uni_ne_name, uni_wd_name)
        _damerau_levenshtein_distance= jellyfish.damerau_levenshtein_distance(uni_ne_name, uni_wd_name)
        _hamming_distance            = jellyfish.hamming_distance(uni_ne_name, uni_wd_name)

        _score+= _max_lev_jaro_winkler*10

        if ne_name == wd_label:
            _name_status='R01-Equal'
            _score+=100
        elif ne_name.lower()==wd_label.lower():
            _name_status='R12-Lowcase_equal'
            _score+=99
        elif uni_ne_name==uni_wd_name:
            _name_status='R13-Unidecode_equal'
            _score+=90
        elif uni_ne_ls_name==uni_wd_name:
            _name_status='R31-ls_name eq'
            _score+=60
        elif uni_ne_namealt==uni_wd_name:
            _name_status='R32-namealt eq'
            _score+=60
        elif _max_lev_jaro_winkler == 1.0 :
            _name_status='R41- max(jaro_winkler)=1'
            _score+=50
        elif _max_lev_jaro_winkler >= 0.9 :
            _name_status='R42- max(jaro_winkler) 0.9-1.0'
            _score+=40
        elif _max_lev_jaro_winkler >= 0.8 :
            _name_status='R43- max(jaro_winkler) 0.8-0.9'
            _score+=30
        else:
            _name_status=''


        if wd_distance < 5:
            _score += 10
        elif wd_distance < 10:
            _score += 5
        elif wd_distance > 60:
            _score +=  -30
        elif wd_distance > 30:
            _score +=  -15
        elif wd_distance > 15:
            _score +=  -5

        if ne_geonameid != '' and ('#'+ne_geonameid+'#' in wd_geonames_id_grp)  :
            _geonames_status='EQ'
            _score+=40
        elif ne_geonameid != '' and ne_geonameid != '-1' and wd_geonames_id_grp!='##' and ('#'+ne_geonameid+'#' not in wd_geonames_id_grp)  :
            _geonames_status='NE'
            _score+=0
        else:
            _geonames_status='Na'


        if (ne_wikidataid != '' ) and (wd_id !='' ) and (ne_wikidataid==wd_id):
            _wikidata_status='EQ'
            _score+=15
        elif (ne_wikidataid != '' ) and (wd_id !='' ):
            _wikidata_status='NE'

            # smaller wikidataid is sometimes better
            if float(  ne_wikidataid[1:]) > float(wd_id[1:]):
                _score+=  3
            else:
                _score+= -3

        else:
            _wikidata_status='Na'

        if _score > max_score:
            max_score=_score

        if _score > 140:
            print("@@_score>140:" , ne_name , " :: ",  wd_id, wd_label, wd_description, wd_type )


        c.execute("INSERT INTO wd VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (

                        ne_fid,
                        ne_wikidataid,
                        wd_id,
                        ne_name,
                        wd_label,
                        ne_adm0name,
                        wd_countrylabel,
                        ne_adm1name,
                        ne_ls_name,
                        ne_namealt,
                        wd_description,
                        wd_type,
                        ne_geonameid,
                        wd_geonames_id_grp,
                        _geonames_status,
                        wd_place_alternative_grp,
                        wd_sitelink_en,
                        wd_sitelink_es,   
                        wd_sitelink_ru,  
                        wd_sitelink_zh,                                                                          
                        wd_sitelink_ceb,
                        wd_label_ru,
                        wd_has_sistercity,
                        wd_max_population,
                        wd_distance,
                        _step,
                        _score,
                        _name_status,
                        _wikidata_status,
                        _in_altnames,
                        _lev_ratio,
                        _lev_distance,
                        _lev_jaro,
                        _lev_jaro_winkler,
                        ne_scalerank,
                        ne_labelrank,
                        ne_natscale,
                        ne_xid,
                        ts,
                        search_distance,
                        retries,
                        _runtime
            ))

    conn.commit()
    sys.stdout.flush()
    if max_score <= 30:
        print(" Low score .. stop ", max_score)



    return  list_wikidataid + rc_list_wikidataid , max_score
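The retry loop above (linearly growing sleeps, and a shrinking search radius after repeated server errors) can be factored into a reusable helper; a simplified sketch under those assumptions, not the author's actual code:

import time

def query_with_retries(run_query, max_retries=14, base_sleep=5, step=10):
    # Retry run_query() with linearly increasing sleeps, mirroring the loop above.
    # run_query receives the attempt number so the caller can shrink its radius.
    last_error = None
    for attempt in range(max_retries):
        try:
            return run_query(attempt)
        except Exception as e:
            last_error = e
            sleeptime = attempt * step + base_sleep
            print("query failed ({}); retrying in {}s".format(e, sleeptime), flush=True)
            time.sleep(sleeptime)
    raise RuntimeError("query failed after {} retries".format(max_retries)) from last_error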