def compare_cpe_packages_fuzzy(CVE_SEARCH_JSON_OUTPUT_PATH, deps_path):
    vendorproductdict = obtain_vendor_product_dict(CVE_SEARCH_JSON_OUTPUT_PATH)
    packageappdict = obtain_package_app_dict(deps_path)
    for vendor, products in vendorproductdict.items():
        for package, apps in packageappdict.items():
            for product in products:
                for app in apps:
                    logger.debug('comparing vendor %s with package %s',
                                 vendor, package)
                    ratio = fuzz.token_sort_ratio(package, vendor)
                    if ratio > THRESHOLD:
                        logger.info('package %s and vendor %s has ratio %s',
                                    package, vendor, ratio)
                    logger.debug('comparing vendor %s with app %s', vendor,
                                 app)
                    ratio = fuzz.token_sort_ratio(app, vendor)
                    if ratio > THRESHOLD:
                        logger.info('app %s and vendor %s has ratio %s', app,
                                    vendor, ratio)
                    logger.debug('comparing product %s with package %s',
                                 product, package)
                    ratio = fuzz.token_sort_ratio(package, product)
                    if ratio > THRESHOLD:
                        logger.info('package %s and product %s has ratio %s',
                                    package, product, ratio)
                    logger.debug('comparing product %s with app %s', product,
                                 app)
                    ratio = fuzz.token_sort_ratio(app, product)
                    if ratio > THRESHOLD:
                        logger.info('app %s and product %s has ratio %s', app,
                                    product, ratio)
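
For reference, the behavior every snippet on this page leans on: token_sort_ratio tokenizes both strings, sorts the tokens alphabetically, and compares the rejoined results, so word order is ignored. A minimal standalone check, assuming only that fuzzywuzzy is installed:

from fuzzywuzzy import fuzz

# Same tokens in a different order score a perfect 100.
print(fuzz.token_sort_ratio("new york mets vs braves", "braves vs new york mets"))
# The plain ratio is order-sensitive and scores lower on the same pair.
print(fuzz.ratio("new york mets vs braves", "braves vs new york mets"))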
Example #2
 def getId(self, title):
     apiArgs = {'api_key' : self.api_key, 'query' : title}
     query = API_URL + self.api_search + "?" + urlencode(apiArgs)
     apiRequest = Request(query, headers=HEADERS)
     result = urlopen(apiRequest).read()
     data = json.loads(result)
             
     movieId = None
     found = {}
     alt = {}
     
     for i in data['results']:
         if i is None:
             continue
         
         if fuzz.token_sort_ratio(title, i[self.title]) == 100:
             movieId = str(i['id'])
             found[movieId] = {'title' : i[self.title], 'date' : i[self.date]}
         elif fuzz.token_sort_ratio(title, i[self.title]) > 85 and fuzz.partial_ratio(title, i[self.title]) > 90:
             altId = str(i['id'])
             alt[altId] = {'title' : i[self.title], 'date' : i[self.date]}
     
     if len(found) == 1:
         return movieId
     elif len(found) > 1:
         print "DUPLICATES FOUND, ENTER THE ID OR -1 TO SKIP"
         movieId = self.movieSelect(found)
     elif len(alt) > 0:
         print "ALTERNATES FOUND, ENTER THE ID OR -1 TO SKIP"
         movieId = self.movieSelect(alt)
     
     return movieId
Example #3
def get_CUL_score(record_elems, resp_elems):
    if record_elems is None or resp_elems is None:
        return None
    if isinstance(record_elems, str) and isinstance(resp_elems, str):
        return str(fuzz.token_sort_ratio(record_elems, resp_elems))
    # Keep scores numeric until after max(): max() over stringified scores
    # compares lexicographically (e.g. '9' > '100').
    if isinstance(record_elems, str):
        scores = [fuzz.token_sort_ratio(record_elems, elem) for elem in resp_elems]
    elif isinstance(resp_elems, str):
        scores = [fuzz.token_sort_ratio(elem, resp_elems) for elem in record_elems]
    else:
        scores = [fuzz.token_sort_ratio(r, s) for r in record_elems for s in resp_elems]
    return str(max(scores)) if scores else None
Example #4
def compare_strings(string_one, string_two):
    # Return the highest of the three fuzzywuzzy scores.
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
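
A quick usage sketch for compare_strings; the reordered pair is the classic case where token_sort_ratio outscores the plain ratio:

# Returns 100: the sorted tokens are identical, and compare_strings
# keeps the best of the three scores.
print(compare_strings("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))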
Example #5
def compare_to_queue(queue, head, ratio, arguments):
    """Compare current title to all in queue."""
    for item in queue:
        # Compute the score once instead of three times per hit.
        score = fuzz.token_sort_ratio(item, head.title)
        if score > ratio:
            if arguments.verbose:
                print_time_message(arguments, "### Old title: " + item)
                print_time_message(arguments, "### New: " + head.feed_title + ": " + head.title)
                print_time_message(arguments, "### Ratio: " + str(score))
            return score
    return 0
Example #6
 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Compare two strings and return the Token Sort Ratio score; when
     normalize_value is True, the score is normalized to the 0-1 range.
     """
     if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
             (isinstance(string_a, str) and isinstance(string_b, str))):
         if normalize_value:
             return self.__normalized_value(fuzz.token_sort_ratio(string_a, string_b))
         else:
             return fuzz.token_sort_ratio(string_a, string_b)
     else:
         raise TypeError
Example #7
def process_group(
        data, group, toc, toc_table, page_num, section,
        sectionid, html):
    """Retreives a group from the full data, and creates toc stuff

    Args:
        data (List): Full set of data containing all hosts
        group (String): String representing group to process
        toc (String): HTML for Table of Contents
        toc_table (String): HTML for Table in ToC
        page_num (int): Page number we're on in the report
        section (String): Display name of the group
        sectionid (String): Unique ID for ToC navigation
        html (String): HTML for current page of report

    Returns:
        List: Elements for category sorted and grouped
        String: HTML representing ToC
        String: HTML representing ToC Table
        String: HTML representing current report page
    """
    group_data = sorted([x for x in data if x.category == group],
                        key=lambda k: k.page_title)

    grouped_elements = []
    if len(group_data) == 0:
        return grouped_elements, toc, toc_table, html
    if page_num == 0:
        toc += ("<li><a href=\"report.html#{0}\">{1} (Page 1)</a></li>").format(
            sectionid, section)
    else:
        toc += ("<li><a href=\"report_page{0}.html#{1}\">{2} (Page {0})</a></li>").format(
            str(page_num+1), sectionid, section)

    html += "<h2 id=\"{0}\">{1}</h2>".format(sectionid, section)
    unknowns = [x for x in group_data if x.page_title == 'Unknown']
    group_data = [x for x in group_data if x.page_title != 'Unknown']
    while len(group_data) > 0:
        test_element = group_data.pop(0)
        temp = [x for x in group_data if fuzz.token_sort_ratio(
            test_element.page_title, x.page_title) >= 70]
        temp.append(test_element)
        temp = sorted(temp, key=lambda k: k.page_title)
        grouped_elements.extend(temp)
        group_data = [x for x in group_data if fuzz.token_sort_ratio(
            test_element.page_title, x.page_title) < 70]

    grouped_elements.extend(unknowns)
    toc_table += ("<tr><td>{0}</td><td>{1}</td>").format(section,
                                                         str(len(grouped_elements)))
    return grouped_elements, toc, toc_table, html
Example #8
def response_correct(response, answer):
    filtered_response = filter_words(response)
    filtered_answer = filter_words(answer)
    bracketless = strip_brackets(filtered_answer)
    no_whitespace_response = filtered_response.replace(" ", "")
    no_whitespace_answer = filtered_answer.replace(" ", "")
    no_whitespace_bracketless = bracketless.replace(" ", "")
    score = max(
        fuzz.token_sort_ratio(filtered_response, filtered_answer),
        fuzz.token_sort_ratio(filtered_response, bracketless),
        fuzz.ratio(no_whitespace_response, no_whitespace_answer),
        fuzz.ratio(no_whitespace_response, no_whitespace_bracketless)
    )
    return score > 70
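
filter_words and strip_brackets are this snippet's own helpers and are not shown; a hedged sketch with trivial stand-ins illustrates how the scoring line behaves:

from fuzzywuzzy import fuzz

# Hypothetical stand-ins for the module's real helpers.
def filter_words(s):
    return s.lower()

def strip_brackets(s):
    return s.split("(")[0].strip()

# True: token_sort_ratio against the bracket-stripped answer is 100 > 70.
print(response_correct("The Great Gatsby", "the great gatsby (novel)"))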
Example #9
 def search_OCLC(self):
     # since this API does not need a query_type, this variable was used to pass the title to the object
     # dictionary for storing scores for this API
     self.scores = {}
     #import your OCLC developer key
     wskey = keys['OCLC-wskey'][0]
     # The API call url
     OCLC = "http://www.worldcat.org/webservices/catalog/search/worldcat/opensearch?q=" + self.query_type + "&wskey=%s" %(wskey)
     try:
         OCLC_result = requests.get(OCLC).text
         # having issues reading the response object. Work around: Write to a file and then read -- the file will be deleted at the end of the process
         with open("temp-file.xml", "w") as file:
             file.write(OCLC_result)
             file.close()
         file = ETree.parse("temp-file.xml")
         root = file.getroot()
         # iterate over the root element and get "title", "author" (name), and "id" (worldcat ID) for each entry 
         for i in root.iter('{http://www.w3.org/2005/Atom}entry'):
             author = i.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text
             title = i.find('{http://www.w3.org/2005/Atom}title').text
             id = i.find('{http://www.w3.org/2005/Atom}id').text
             # if the title is a match (>95%) and the author also matches, then start the process for getting the work_id
             scoreTitle = fuzz.token_sort_ratio(title, self.query_type)
             if scoreTitle > self.th:
                 scoreOCLC = fuzz.token_sort_ratio(author, self.name)
                 if scoreOCLC > self.th:
                     # use this score as the average score
                     score = (scoreTitle + scoreOCLC)/2
                     work_id = ''
                     # get the worldcat ID
                     wid = id.replace('http://worldcat.org/oclc/', '')
                     # store the worldcat ID in the dict -- this ID is not used for enrichment at this point
                     self.scores['OCLC'] = {}
                     self.scores['OCLC']['oclcid'] = {}
                     self.scores['OCLC']['oclcid'][wid] = [self.query_type, scoreTitle]
                     # create the query url and send it to worldcat to get back the JSON-LD
                     workid = 'http://experiment.worldcat.org/oclc/' + wid + '.jsonld'
                     # decode the JSON
                     OCLC_res = requests.get(workid).json()
                     # iterate over the JSON graph and find work_id
                     for i in OCLC_res['@graph']:
                         if 'exampleOfWork' in i.keys():
                             work_id = i['exampleOfWork']
                     if work_id != '':
                         self.scores['OCLC']['work_id'] = {}
                         self.scores['OCLC']['work_id'][work_id] = [self.query_type, score]
     except:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return (self.scores)
Example #10
def fw_token_sort_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        # Named for the function actually used (the original called this
        # variable partial_ratio, which was misleading).
        sort_ratio = fuzz.token_sort_ratio(str(q1), str(q2)) / 100
        fuzzy.append([sort_ratio])
    print("Created fuzz token_sort_ratio feature")
    return np.array(fuzzy)
Example #11
    def fuzzyMatcher(self):
        """ Fuzzy matching logic, returns two files with results """

        selectSize = len(self.selection)
        for s in self.selection:
            #incCounter()
            sRow, sList, sCode = s
            for c in self.comparison:
                #incCounter()
                cRow, cList, cCode = c
                scoreValue = fuzz.token_sort_ratio(sList, cList)
                dataSet = [sRow, sList, sCode, cRow, cList, cCode, scoreValue]
                if scoreValue >= self.ratio:
                    #print('Hit: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
                    self.match_exact.append(dataSet)

                if scoreValue < self.ratio and scoreValue > self.min_ratio:
                    #print('Fuzzy: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
                    self.match_fuzzy.append(dataSet)

                """ Don't use this unless you want a result set equal to selection * comparison!!! """
                #if scoreValue < self.min_ratio:
                    #print('No Match: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
                #    self.match_none.append(dataSet)

            status = round( ((sRow / selectSize) * 100), 0)
            print('Row %s of %s - Percentage complete - %s' %(sRow, selectSize, status) + '%')
        
        self.csv_writer()
        return self.match_exact, self.match_fuzzy ##, self.match_none
Example #12
def sentenceCorrector(sentence):
    '''Function to correct English text using fuzzy logic
        Return Value = String (Corrected sentence)
    '''
    sentence = "".join(sentence)
    #print "SENTENCE: %s" % type(sentence)
    myWord = sentence.split(",")
    #print myWord
    
    for i in range(0, len(myWord)):
      
        check = wordCheck(myWord[i])

        if check == True:
            pass
        else :
            myArray = myDict.suggest(myWord[i])
            #print myArray
            
            tokenSortRatioArray = []
            maxProb = 0
            index = 0
            for j in range(0, len(myArray)):
                tokenSortRatioArray.append(fuzz.token_sort_ratio(myWord[i], myArray[j]))
                if (maxProb < tokenSortRatioArray[j]):
                    maxProb = tokenSortRatioArray[j]
                    index = j

            #print "Index:" + str(index) + "maxProb:" + str(maxProb) + "index:" + str(index) + "i:" + str(i)
            if tokenSortRatioArray:
                myWord[i] = myArray[index]

    #print myWord
    #print " ".join(myWord)
    return " ".join(myWord)        
Example #13
def fuzzer(localstring, dbpstring):
	lwl = localstring.replace('-','').replace(',.', '').split()
	lfwl = [w for w in lwl if w not in stopwords.words('english')]
	dwl = dbpstring.replace('-','').split()
	dfwl = [w for w in dwl if w not in stopwords.words('english')]
	ratio = fuzz.token_sort_ratio(str(lfwl), str(dfwl))
	return ratio
Example #14
def getRatio(var1, var2, alg):

    r1test = 40
    r2test = 100
    r3test = 100
    r4test = 90 # 85 is probably too low --- too many FP
    
    # alg is kept as a dummy parameter; it is currently unused
    # result quality improves when two or more ratios are correlated:
    # r1's threshold can be lowered as long as r4 stays high
    
    r1 = fuzz.ratio(var1,var2)
    r2 = fuzz.partial_ratio(var1,var2)
    r3 = fuzz.token_sort_ratio(var1,var2)
    r4 = fuzz.token_set_ratio(var1,var2)

    if r1 >= r1test:
        if r4 >= r4test:
            ratio = 100
            #reportRatio(var1, var2)
        else:
            ratio = 0
    else:
        ratio = 0

    return(ratio)
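
A usage sketch of the gate above; both thresholds must pass for a non-zero result, and alg is unused so any value works:

# token_set_ratio("Jon Smith", "Smith, Jon") is 100 and the plain ratio
# clears 40, so both gates pass and getRatio collapses the result to 100.
print(getRatio("Jon Smith", "Smith, Jon", alg=None))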
Example #15
def fuzz_comparisons(x):
    out = {}
    out['fuzz_partial_ratio'] = fuzz.partial_ratio(*x)
    out['fuzz_ratio'] = fuzz.ratio(*x)
    out['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(*x)
    out['fuzz_token_set_ratio'] = fuzz.token_set_ratio(*x)
    return pd.Series(out)
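
A usage sketch with pandas; each two-column row unpacks into the *x call (the column names here are assumptions):

import pandas as pd

pairs = pd.DataFrame({"q1": ["new york jets"], "q2": ["jets new york"]})
# Row-wise apply yields one column per fuzz score.
print(pairs.apply(fuzz_comparisons, axis=1))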
Example #16
    def compare_to(self, name):
        """
        Compares the name object given to itself.

        :param name: The name object to be compared.
        :return: The ratio between them from 0-100.
        """
        return fuzz.token_sort_ratio(name.get_full_name(0), self.get_full_name(0))
Example #17
def fuzzy_token_sort_ratio_check(full_name_check_value, name_one, name_two):
    """
    Runs a fuzzy token sort ratio check if the record hasn't passed either a full name or name with initial check
    """
    if full_name_check_value == 0:
        return fuzz.token_sort_ratio(name_one, name_two)
    
    return 0
Example #18
 def check_translation(self, translation):
     """
     Takes a `translation` and returns `True` if it's correct.
     """
     return any(
         fuzz.token_sort_ratio(translation, t.translation) >= 90
         for t in self.translations
     )
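
The same any()-over-generator check, sketched outside the class with a hypothetical list of accepted translations:

from fuzzywuzzy import fuzz

accepted = ["the red house", "a red house"]
user_input = "red house the"
# True: the sorted tokens of the first accepted translation match exactly.
print(any(fuzz.token_sort_ratio(user_input, t) >= 90 for t in accepted))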
Example #19
def GetMatchingFunds(in_fund,df):
   is2fuzzy = 0
   funds = []
   if len(in_fund.split()) != 2:
      funds = df[df['company_name'].apply( lambda x: (in_fund.lower() in x.lower()) )]['company_name'].tolist()
   if funds == [] and len(in_fund.split())==2:
      in_fund = in_fund.split()
      funds = df[df['company_name'].apply( lambda x: (in_fund[0].lower() in x.lower() and in_fund[1].lower() in x.lower()) )]['company_name'].tolist()
   if funds and len(funds) < 6:
      return is2fuzzy, funds
   from fuzzywuzzy import fuzz
   from fuzzywuzzy import process
   funds = df[df['company_name'].apply( lambda x: ( fuzz.token_sort_ratio(x, in_fund) > 50 and fuzz.token_set_ratio(x, in_fund) > 65 ) )]['company_name'].tolist()
   if not funds or len(funds) >= 10:
      is2fuzzy = 1
      funds = df[df['company_name'].apply( lambda x: ( fuzz.token_sort_ratio(x, in_fund) > 40 and fuzz.token_set_ratio(x, in_fund) > 49 ) )]['company_name'].tolist()
   return is2fuzzy, funds
Example #20
 def compare_hunks(left, right):
     # This case happens for example, if both hunks remove empty newlines
     # This check is _required_ as fuzzywuzzy currently contains a bug that
     # does misevaluations in case of equivalence. See
     # https://github.com/seatgeek/fuzzywuzzy/issues/196
     if left == right:
         return 100
     return fuzz.token_sort_ratio(left, right)
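
A quick check of the shortcut: identical hunks short-circuit to 100 without touching fuzzywuzzy, sidestepping the misevaluation described in the linked issue:

print(compare_hunks("-foo\n-bar\n", "-foo\n-bar\n"))  # 100 via the equality shortcut
print(compare_hunks("foo bar", "bar foo"))            # 100 via token_sort_ratio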
Example #21
def getShowTime(showName):
    
    for item in showTimeDictRev.keys():
        if (fuzz.partial_ratio(showName, item) == 100) or (fuzz.token_sort_ratio(showName, item) == 100) or (fuzz.token_set_ratio(showName, item) == 100):
            print (showTimeDictRev[item])
            print (channelsDictSky[item])
            print (channelsDictSkyRev[(channelsDictSky[item])])
            print (allChannelsRev[channelsDictSky[item]])
Example #22
def is_same_entity(bentley_term, lc_term, type_):
    if "geogname" in type_:
        similarity = fuzz.token_sort_ratio(bentley_term, lc_term)
        return similarity > 95

    elif "corpname" in type_:
        bentley_term = bentley_term.replace("U.S.", "United States")
        lc_term = lc_term.replace("U.S.", "United States")

        bentley_term = bentley_term.replace("N.Y.", "New York")
        lc_term = lc_term.replace("N.Y.", "New York")

        if "." in bentley_term.strip("."):
            similarity = fuzz.ratio(bentley_term, lc_term)
        else:
            similarity = fuzz.WRatio(bentley_term, lc_term)

        # print("{0}: {1} <--> {2}".format(similarity, original_term, returned_term))
        return similarity >= 90

    elif "persname" in type_:
        bias = 0

        date_regex = r"(\d{4})\-((?:\d{4})?)"
        bentley_dates = re.findall(date_regex, bentley_term)
        lc_dates = re.findall(date_regex, lc_term)

        if len(bentley_dates) > 0 and len(lc_dates) > 0:
            birthdate_bentley, deathdate_bentley = bentley_dates[0]
            birthdate_lc, deathdate_lc = lc_dates[0]

            if birthdate_bentley != birthdate_lc:
                bias -= 100

            if birthdate_bentley == birthdate_lc and deathdate_bentley == deathdate_lc:
                bias += 100

            if birthdate_bentley == birthdate_lc and deathdate_lc and not deathdate_bentley:
                lc_term = lc_term.replace(deathdate_lc, "")
                bias += 25

        similarity = fuzz.token_sort_ratio(bentley_term, lc_term) + bias

        # print("{0}: {1} <--> {2}".format(similarity, bentley_term, lc_term))
        return similarity >= 95
Example #23
def preevaluate_filenames(thresholds, right_files, left_file):
    # We won't enter preevaluate_filenames if tf >= 1.0
    candidates = []
    for right_file in right_files:
        sim = fuzz.token_sort_ratio(left_file, right_file) / 100
        if sim < thresholds.filename:
            continue
        candidates.append(right_file)
    return left_file, candidates
Example #24
def similarity(s, t, method):
    if method == "partial":
        return fuzz.partial_ratio(s, t)
    elif method == "token_sort":
        return fuzz.token_sort_ratio(s, t)
    elif method == "token_set":
        return fuzz.token_set_ratio(s, t)
    else:
        return fuzz.ratio(s, t)
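
A usage sketch comparing all four dispatch methods on one pair:

for method in ("partial", "token_sort", "token_set", "ratio"):
    print(method, similarity("new york mets", "mets new york", method))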
Example #25
def fuzzy_search(song):
    tracks = client.get('tracks', q=song)
    maxi = 0
    best = ''
    for track in tracks:
        match = fuzz.token_sort_ratio(track.title, song)
        if match > maxi:
            best = track.permalink_url
            maxi = match
    return best
Example #26
def process_cv(extracted_resumes, key_multiplier, job_description):
  result_list = []

  for resume in extracted_resumes:
    key_checked = []
    title_count, skill_count, generic_count = 0, 0, 0
    for multiplier in key_multiplier.keys():
      # Matching job title
      if header_title in resume and multiplier == header_title:

        # matching first level title
        if fuzz.partial_ratio(job_description[header_title], resume[header_title]) > 80:
          title_count += key_multiplier[multiplier]

        # recurse in experience
        if header_experience in resume:
          for experience in resume[header_experience]:
            if header_title in experience:
              if fuzz.partial_ratio(job_description[header_title], experience[header_title]) > 80:
                title_count += key_multiplier[multiplier]

      # Matching skills
      elif header_skill in resume and multiplier == header_skill:
        skill_count += recurse_obj(resume[multiplier], job_description[multiplier], multiplier) * key_multiplier[multiplier]

      elif multiplier in resume and multiplier not in key_checked:

        key_checked.append(multiplier)
        if isinstance(resume[multiplier], list) and isinstance(resume[multiplier][0], str):
          generic_count += recurse_obj(resume[multiplier], job_description[multiplier], multiplier) * key_multiplier[multiplier]
        elif isinstance(resume[multiplier], str):
          if fuzz.token_sort_ratio(resume[multiplier], job_description[multiplier]) > 90:
            generic_count += key_multiplier[multiplier]
          elif fuzz.token_sort_ratio(resume[multiplier], job_description[multiplier]) > 60:
            generic_count += key_multiplier[multiplier] * 0.5

    score = title_count + skill_count + generic_count
    result_list.append({'Name': resume['Name'], 'Score': round(score, 2)})

  # Sort by score
  result_list = sorted(result_list, key=lambda t: t.get('Score', 0), reverse=True)
  return result_list
Example #27
def UpdateCursor3(infile,test):
    # Update the spatial join shapefile for the different match cases.
    with arcpy.da.UpdateCursor(infile,test) as cursor:
        #Update cursor.
        for row in cursor:
            #row[2]=row[0].area
            #row[3]=row[0].length
            if row[8]!=-1:
                if row[9] !=-1:
                    row[10]=1
            denominatorArea=(row[4]+row[5])/2
            #print denominatorArea
            row[6]=row[2]/((row[4]+row[5])/float(2))
            row[7]=row[2]/((row[5]+row[4])/float(2))
            row[13]=row[3]/((row[11]+row[12])/float(2))
            row[14]=row[3]/((row[11]+row[12])/float(2))         
            #print("{0}, {1}, {2},{3}".format(row[6], row[7], row[13],row[14]))
            if infile==DissolveUnion:
                
                if len(row[15]) > len(row[16]):
                    # Swap so row[15] is the shorter string (the original
                    # self-assignment here was a no-op).
                    row[15], row[16] = row[16], row[15]
                distances = range(len(row[15]) + 1)
                for index2,char2 in enumerate(row[16]):
                    newDistances= [index2+1]
                    for index1,char1 in enumerate(row[15]):
                        if char1 == char2:
                            newDistances.append(distances[index1])
                        else:
                            newDistances.append(1 + min((distances[index1],
                                                         distances[index1+1],
                                                         newDistances[-1])))
                    distances = newDistances
                    #print distances                            
                Ldistance=distances[-1]
                s= SequenceMatcher(None,row[15],row[16])
                stringRatio=s.ratio()
                #print stringRatio
                row[17]=Ldistance
                row[18]=stringRatio
                StringRat=fuzz.ratio(row[15], row[16])
                row[19]=StringRat/float(100)
                stringPartialRatio=fuzz.partial_ratio(row[15], row[16])
                row[20]=stringPartialRatio/float(100)
                StringTokenSort=fuzz.token_sort_ratio(row[15], row[16])
                row[21]=StringTokenSort/float(100)
                stringTokenSet=fuzz.token_set_ratio(row[15], row[16])
                row[22]=stringTokenSet/float(100)
                Average=(StringRat+stringPartialRatio+StringTokenSort+stringTokenSet)/4
                row[23]=Average/float(100)
                WeightedAverage=((.10*StringRat)+(.30*stringPartialRatio)+(.30*StringTokenSort)+(.30*stringTokenSet))/4
                row[24]= WeightedAverage/float(100)
                #print Average,WeightedAverage
                #print ("{0},{1},{2}".format(row[15],row[16],row[18]))
            cursor.updateRow(row)
Example #28
    def test_service_metadata(self):
        self.maxDiff = None
        response = self.client.get('/api/1.0/refine/reconcile', {'callback': 'jsonp123'})

        self.assertEqual(200, response.status_code)
        self.assertEqual(100,
            fuzz.token_sort_ratio(
                'jsonp123({"name": "Influence Explorer Reconciliation3", "identifierSpace": "http://staging.influenceexplorer.com/ns/entities", "schemaspace": "http://staging.influenceexplorer.com/ns/entity.object.id", "view": { "url": "http://staging.influenceexplorer.com/entity/{{id}}" }, "preview": { "url": "http://staging.influenceexplorer.com/entity/{{id}}", "width": 430, "height": 300 }, "defaultTypes": []})',
                response.content
            )
        )
Example #29
def match(song, gdic):
    ftype = song[song.rfind('.'):].lower()
    try:
        if ftype == ".mp3":
            smp = MP3(song)
        elif ftype == ".wma":
            print("wma")
            return "False"
        elif ftype == ".flac":
            smp = FLAC(song)
        elif ftype == ".ogg":
            print("ogg")
            return "False"
        elif ftype in (".mp4", ".m4a"):
            smp = MP4(song)
        else:
            return False
    except IOError:
        return "delete"
    if ftype == ".flac":
        name = smp['title'][0]
        artist = smp['artist'][0]
        album = smp['album'][0]
    elif ftype == ".m4a":
        name = smp['\xa9nam'][0]
        artist = smp['\xa9ART'][0]
        album = smp['\xa9alb'][0] 
    else:
        try:
            name = smp["TIT2"].pprint()[5:].replace('[','(').replace(']',')')
            artist = smp["TPE1"].pprint()[5:].replace("Feat", "Featuring").replace("Andre 3000", "OutKast").replace("Big Boi", "OutKast")
            album = smp["TALB"].pprint()[5:]
        except KeyError:
            return False
    pmatch = [i for i in gdic if fuzz.token_set_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(artist, i['artist']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(album, i['album']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #pmatch = [i for i in pmatch if ((('(' not in name) and ('(' not in i['title'])) or ((('(' in name) and ('(' in i['title'])) and (name[name.rindex("(") + 1:name.rindex(")")].lower() == i['title'][i['title'].rindex("(") + 1:i['title'].rindex(")")].lower())))]
    pmatch = [i for i in gdic if fuzz.token_sort_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #print ([(i['title'], i['artist'], i['album'], i['durationMillis']) for i in pmatch])
    pmatch = [i for i in pmatch if abs(smp.info.length * 1000 - int(i['durationMillis'].encode('utf-8'))) < 1000]
    if len(pmatch) == 1:
        return pmatch[0]
    else:
        #print(name, artist, album, smp.info.length * 1000)
        return False
Example #30
def fuzz_method(linkedin_words, twitter_words):
	counter = 0
	temp_arr = []
	for t in twitter_words:
		for l in linkedin_words:
			# Compute the score once per pair instead of re-running
			# token_sort_ratio in every comparison below.
			score = fuzz.token_sort_ratio(t, l)
			temp_arr.append(score)
			if score > 70 and score <= 80:
				counter += 215
			if score > 60 and score <= 70:
				counter += 125
			if score > 50 and score <= 60:
				counter += 80
			if score > 40 and score <= 50:
				counter += 30
			#if fuzz.token_sort_ratio(fullName,query) > 80:
				#counter += 700
	return counter
Example #31
    def token_sort_ratio(plans: list, post, threshold=50):
        """
        Match plans based on hardcoded plan topics, using fuzzywuzzy's token_sort_ratio for fuzzy matching
        """

        match_confidence = 0
        match = None

        for plan in plans:
            plan_match_confidence = fuzz.token_sort_ratio(
                post.text.lower(), plan["topic"].lower()
            )

            if plan_match_confidence > match_confidence:
                # Update match
                match_confidence = plan_match_confidence
                match = plan

        return {
            "match": match["id"] if match_confidence > threshold else None,
            "confidence": match_confidence,
            "plan": match,
        }
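
A hedged usage sketch, calling the method as a plain function; the plans list and the post object with a .text attribute are assumptions about the caller's data:

from types import SimpleNamespace

plans = [{"id": 1, "topic": "Medicare for All"},
         {"id": 2, "topic": "Green New Deal"}]
post = SimpleNamespace(text="What about Medicare for All?")
# Likely matches plan 1: its topic shares most of its tokens with the post.
print(token_sort_ratio(plans, post))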
Example #32
async def answer(message: Message):
    user_answer = message.text[1:]
    answer = await db.check_answer()
    user_id = message.from_user.id
    chat_id = message.chat.id
    text1 = "Вы уже отвечали правильно."
    text2 = f" Правильный ответ: {answer}.\nВаш баланс не изменился."
    text3 = "Неверно"
    is_answered = await db.check_points(False)
    await bot.delete_message(chat_id, message.message_id)
    username = message.from_user.username
    full_name = message.from_user.full_name
    name = username
    if username is None:
        name = full_name
    if is_answered:
        await bot.send_message(user_id, text1 + text2)
    elif (fuzz.ratio(user_answer.lower(), answer) >= 80
          or fuzz.token_sort_ratio(user_answer.lower(), answer) >= 80):
        await db.add_points(1)
        await bot.send_message(chat_id, f"@{name}, вы ответили верно")
    else:
        await bot.send_message(user_id, text3)
Example #33
def extract_xslx_data(active_cells):
    for row in active_cells:
        for cell in row:
            #### Searching for Revenue(int or float) and Checks number(int or float)
            if isinstance(cell.value, int) or isinstance(cell.value, float):
                for apteka in apteki.keys():
                    if 'revenue_coordinate' in apteki[apteka]:
                        if cell.coordinate[1:] == apteki[apteka][
                                'revenue_coordinate'][1:]:
                            apteki[apteka]['revenue'] = cell.value
                    if 'checks_coordinate' in apteki[apteka]:
                        if cell.coordinate[1:] == apteki[apteka][
                                'checks_coordinate'][1:]:
                            apteki[apteka]['number_of_checks'] = cell.value

            ##### Searching for date str
            if isinstance(cell.value, str):
                if fuzz.token_sort_ratio('00.00.2000', cell.value) >= 45:
                    try:
                        if cell.value[2] == '.' and cell.value[5] == '.':
                            the_date.append(cell.value)
                    except IndexError:
                        pass
Example #34
def get_playlist(album):
    album_string = album.artist + " " + album.name
    playlist_search_request = requests.get(
        "https://www.googleapis.com/youtube/v3/search?part=snippet&type=playlist&q="
        + album_string + "&key=" + youtube_key)
    playlist_search_dict = json.loads(playlist_search_request.text)
    playlists_list = []
    for item in playlist_search_dict['items']:
        id = item['id']['playlistId']
        title = item['snippet']['title']
        if fuzz.token_sort_ratio(album_string, title) > 80:
            playlists_list.append([id, title])

    skip_playlist = False

    try:
        playlist_vids_list = list_playlist_vids(playlists_list[0][0])
    except IndexError:
        skip_playlist = True

    album_tracklist = Song.objects.filter(album=album)
    if not skip_playlist:
        for vid in playlist_vids_list:
            vid_num = vid[0]
            for song in album_tracklist:
                track_num = song.track_num
                if vid_num == track_num:
                    if compare_song_vid(song, vid):
                        song.save()

    still_no_vid = album_tracklist.filter(youtube_link=0)

    for song in still_no_vid:
        song.youtube_link = keyword_search(song)
        song.save()

    return playlists_list
Example #35
def weightedAverage(anime1, anime2):

    totWeight = (5 + 5 + 1 + 1 + 3 + 3)
    acmSim = 0

    if (re.search(anime1.name, anime2.name, re.IGNORECASE)
            or re.search(anime2.name, anime1.name, re.IGNORECASE)):
        acmSim = 100
    else:
        acmSim += fuzz.token_sort_ratio(anime1.name, anime2.name) * 5

    acmSim += fuzz.token_sort_ratio(anime1.genre, anime2.genre) * 5
    acmSim += fuzz.token_sort_ratio(anime1.media, anime2.media) * 1
    acmSim += fuzz.token_sort_ratio(anime1.episodes, anime2.episodes) * 1
    acmSim += fuzz.token_sort_ratio(anime1.rating, anime2.rating) * 3
    acmSim += fuzz.token_sort_ratio(anime1.views, anime2.views) * 3

    return (acmSim / totWeight * 1.0)
Example #36
def search(query,type):
    """
    Do a fuzzy match on Journals in MongoDB journals collection, returning results in
    Refine reconciliation API format. 
    
    The type parameter determines if the match is on main_title or abbreviation 
    
    For now, only exact matches are automatically matched, but this can be adjusted.
"""
    out = []
    query = query.lower()
    for item in journal_data:
        id_journal = item.get("id_journal","no_id")
        titleOrAbbrev = str(item.get(type,"nothing_found"))
        if titleOrAbbrev.lower() == query:
            match = True
        else:
            match = False

        #Construct a score using FuzzyWuzzy's token sort ratio.
        #https://github.com/seatgeek/fuzzywuzzy
        score = fuzz.token_sort_ratio(query, titleOrAbbrev)
        out.append({
            "id": id_journal,
            "name": titleOrAbbrev,
            "score": score,
            "match": match,
            "type": [
                {
                    "id": "http://purl.org/ontology/bibo/Periodical",
                    "name": "bibo:Periodical",
                }
            ]
        })
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    return sorted_out[:10]
Example #37
def run(query): 
    for a in scan(client, index=config['symbology']['index'], query=query): 
        res = client.search(index=config['suspension']['index'], body={
            "_source" : ["company", "date", "link"],
            "query" : {
                "match" : {
                    "company" : a['_source']['name']
                    }
                }
            })

        # Initialize so the yield below always has a value, even when the
        # suspension index returns no hits.
        halts = {"match_attempted" : False}
        if res['hits']['total'] > 0:
            mtc       = res['hits']['hits'][0]['_source']
            sym_name  = a['_source']['name'].lower()
            halt_name = mtc['company'].lower()
            x         = fuzz.token_sort_ratio(sym_name, halt_name)
            y         = fuzz.ratio(sym_name, halt_name)
            halts     = {"match_attempted" : True}
            if res['hits']['hits'][0]['_score'] >= 1 and x >= 90:
                halts.update(mtc)
                halts.update({
                    "fuzz_ratio"            : y,
                    "fuzz_token_sort_ratio" : x, 
                    "match_score"           : a['_score']
                })

        yield {
            "_id"      : a['_id'],
            "_type"    : config['symbology']['_type'],
            "_index"   : config['symbology']['index'],
            "_op_type" : "update",
            "doc" : {
                "__meta__" : {
                    "halts" : halts
                }
            }
        }
Example #38
def extract_features(dfx):
    # preprocessing each question
    dfx["question1"] = dfx["question1"].fillna("").apply(preprocess)
    dfx["question2"] = dfx["question2"].fillna("").apply(preprocess)

    print("token features...")
    
    # Merging Features with dataset
    
    token_features = dfx.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    
    dfx["cwc_min"]       = list(map(lambda x: x[0], token_features))
    dfx["cwc_max"]       = list(map(lambda x: x[1], token_features))
    dfx["csc_min"]       = list(map(lambda x: x[2], token_features))
    dfx["csc_max"]       = list(map(lambda x: x[3], token_features))
    dfx["ctc_min"]       = list(map(lambda x: x[4], token_features))
    dfx["ctc_max"]       = list(map(lambda x: x[5], token_features))
    dfx["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    dfx["first_word_eq"] = list(map(lambda x: x[7], token_features))
    dfx["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    dfx["mean_len"]      = list(map(lambda x: x[9], token_features))
   
    #Computing Fuzzy Features and Merging with Dataset
    
    # do read this blog: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    # https://github.com/seatgeek/fuzzywuzzy
    print("fuzzy features..")

    dfx["token_set_ratio"]       = dfx.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    # The token sort approach involves tokenizing the string in question, sorting the tokens alphabetically, and
    # then joining them back into a string. We then compare the transformed strings with a simple ratio().
    dfx["token_sort_ratio"]      = dfx.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    dfx["fuzz_ratio"]            = dfx.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    dfx["fuzz_partial_ratio"]    = dfx.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    dfx["longest_substr_ratio"]  = dfx.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return dfx
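
The comment above describes token_sort_ratio's pipeline; for plain lowercase alphanumeric input it can be reproduced by hand (fuzzywuzzy additionally lowercases and strips punctuation internally):

from fuzzywuzzy import fuzz

q1, q2 = "how do i learn python fast", "fast python learn how do i"
manual = fuzz.ratio(" ".join(sorted(q1.split())), " ".join(sorted(q2.split())))
print(manual, fuzz.token_sort_ratio(q1, q2))  # both print 100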
Example #39
    def _calculate_ratios(
        self,
        message: Message,
        member: Member,
        guild: Guild,
    ) -> None:
        """
        Calculates a message's relation to other messages
        """
        for message_obj in member.messages:
            # This calculates the relation to each other
            if message == message_obj:
                raise DuplicateObject

            elif (self.options(guild).per_channel_spam
                  and message.channel_id != message_obj.channel_id):
                # This user's spam should only be counted per channel
                # and these messages are in different channel
                continue

            elif (fuzz.token_sort_ratio(message.content, message_obj.content)
                  >= self.options(guild).message_duplicate_accuracy):
                """
                The handler works off an internal message duplicate counter
                so just increment that and then let our logic process it later
                """
                self._increment_duplicate_count(member,
                                                guild,
                                                channel_id=message.channel_id)
                message.is_duplicate = True
                message_obj.is_duplicate = True

                if (self._get_duplicate_count(
                        member, channel_id=message.channel_id, guild=guild) >=
                        self.options(guild).message_duplicate_count):
                    break
Example #40
def calculate_distance(str1, str2):
    uni1 = re.sub(r'[^\x00-\x7F]+', ' ', str1).decode("utf-8", "ignore")
    uni2 = re.sub(r'[^\x00-\x7F]+', ' ', str2).decode("utf-8", "ignore")

    sm_score = SequenceMatcher(None, str1.lower(), str2.lower()).ratio()
    jar_score = jellyfish.jaro_distance(uni1, uni2)

    lev_score = jellyfish.levenshtein_distance(uni1, uni2)
    dl_score = jellyfish.damerau_levenshtein_distance(uni1, uni2)

    fr_score = fuzz.ratio(str1, str2)
    fpr_score = fuzz.partial_ratio(str1, str2)
    ftsortr_score = fuzz.token_sort_ratio(str1, str2)
    ftsetr_score = fuzz.token_set_ratio(str1, str2)

    fmean_score = (fr_score + fpr_score + ftsortr_score +
                   ftsetr_score) / 400.00
    jar_sm_score = (sm_score + jar_score) / 2

    #print(str(fmean_score) + "\t" + str(jar_sm_score))

    mean = (fmean_score + jar_sm_score) / 2

    return mean
Example #41
    def get_similarity(self, sentence):
        """
        Takes in a sentence and returns how similar it is to another sentence
        :param sentence: element of self.sentences
        :type sentence: str
        :return: score of sentence
        :rtype: float
        """
        index = self.sentences.index(sentence)

        if index != len(self.sentences) - 1:
            scores = [
                fuzz.token_sort_ratio(sim, sentence)
                for sim in self.sentences[index + 1:]
            ]
            adjusted_score = -max(scores) / 100
            if adjusted_score < -0.85:
                return 2 * adjusted_score  # heavily decrement highly similar sentences
            elif adjusted_score < -0.6:
                return adjusted_score  # decrement the score of somewhat similar sentences
            else:
                return 0  # ignore low levels of similarity
        else:
            return 0
Example #42
 def search_api_LCS(self):
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     suggest = "http://id.loc.gov" + self.query_type + '/suggest/?q=' + urllib.parse.quote(self.name.encode('utf8'))
     try:
         # decode the JSON
         suggest_result = requests.get(suggest).json()
         # iterate over all results
         for n in range(len(suggest_result[1])):
             # get the "name"
             candidateS = suggest_result[1][n]
             # get the URI (LC ID)
             uriS = suggest_result[3][n].replace('http://id.loc.gov/authorities/names/', '')
             self.scoreSU = fuzz.token_sort_ratio(candidateS, self.name)
             # if the similarity score is greater than the cut-off, add the "name", LC ID and similarity score to the dict
             if self.scoreSU > self.th:
                 self.scores['LCS'] = {}
                 self.scores['LCS']["lcid"] = {}
                 self.scores['LCS']["lcid"][uriS] = [candidateS, self.scoreSU]
     except:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
Example #43
def dsmatch(name, dataset, fn):
    """
    Fuzzy search best matching object for string name in dataset.

    Args:
        name (str): String to look for
        dataset (list): List of objects to search for
        fn (function): Function to obtain a string from a element of the dataset

    Returns:
        First element with the maximum fuzzy ratio.
    """
    max_ratio = 0
    matching = None
    for e in dataset:
        if fuzz and name:
            ratio = fuzz.token_sort_ratio(normalize(name), normalize(fn(e)))
            if ratio > max_ratio:
                max_ratio = ratio
                matching = e
        elif normalize(name) == normalize(fn(e)):
            matching = e
            break
    return matching
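
A usage sketch with a tiny in-memory dataset; normalize is the snippet's own helper, stubbed here as simple lowercasing:

from fuzzywuzzy import fuzz

def normalize(s):
    # Stand-in for the module's real normalize helper.
    return s.lower().strip()

people = [{"name": "Jon Smith"}, {"name": "Jane Doe"}]
# Prints {'name': 'Jon Smith'}: reordered tokens still score 100.
print(dsmatch("smith jon", people, fn=lambda e: e["name"]))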
Example #44
def recommend(project_id: str, model_id: str, text: str) -> List[str]:
    """
    Recommend which intents have similar templates to [text]
    :param text: text to search for similar templates
    :return: list with intents
    """
    intents = defaultdict(lambda: defaultdict(int))

    templates = read_templates(project_id, model_id)

    for example in templates:
        intent_name = read_name_by_id(project_id, example["intent_id"]["$oid"])
        ratio = fuzz.token_sort_ratio(text, example["name"])
        if ratio > 70:
            intents[intent_name]["times"] += 1
            intents[intent_name]["ratio"] += ratio

        if intent_name in intents:
            intents[intent_name]["mean_ratio"] = intents[intent_name].get(
                "ratio") / intents[intent_name].get("times")

    return sorted(intents,
                  key=lambda x: intents.get(x).get("mean_ratio"),
                  reverse=True)
Example #45
    def _fuzzy_match(self, t_index, max_t):
        # generic fuzzy matching
        m_stats = {}  # matching statistics
        content = self._content
        found = [] if not self._pot_acts else [th[0] for th in self._pot_acts]

        for tn in self._mind.get_timeline()[::-1]:
            if tn in found: continue
            thot = self._mind.get_thots([tn])[tn]
            if not thot._body: continue
            thot = thot._head
            m_stats[thot] = {}
            m_stats[thot]['ratio'] = fuzz.ratio(content, thot)
            m_stats[thot]['partial_ratio'] = fuzz.partial_ratio(content, thot)
            m_stats[thot]['token_sort_ratio'] = fuzz.token_sort_ratio(
                content, thot)
            m_stats[thot]['token_set_ratio'] = fuzz.token_set_ratio(
                content, thot)
        m_stats['top%s' % max_t] = process.extract(content,
                                                   t_index,
                                                   limit=max_t)
        self._match_stats = m_stats
        self._pot_acts = m_stats['top%s' % max_t]
        return m_stats
Example #46
def test_thing_wikidata_query_strict_False():
    """Thing - wikidata - strict=True, check_type=False - : Should pass"""
    thing = Thing(label=u"혁kστ혁ηjh혁kي혁ةsjdジアh", query_language=Lang.DEFAULT)
    thing.add_query_endpoint(Endpoint.wikidata)
    thing.query(strict_mode=True, check_type=False)

    assert thing.endpoints == set([Endpoint.wikidata])
    assert thing.has_label == u'혁kστ혁ηjh혁kي혁ةsjdジアh'
    assert thing.query_language == Lang.English

    expected_query = u'''
   PREFIX wdt: <http://www.wikidata.org/prop/direct/>
   PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
   SELECT DISTINCT ?Thing ?pred ?obj WHERE
   { SERVICE wikibase:label
    { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
     ?Thing ?pred ?obj .
     { ?Thing rdfs:label "혁kστ혁ηjh혁kي혁ةsjdジアh"@en  }
     UNION { ?Thing wdt:P1813 "혁kστ혁ηjh혁kي혁ةsjdジアh"@en  } .
       } LIMIT 1500'''

    ratio = fuzz.token_sort_ratio(
        thing.query_builder.queries[Endpoint.wikidata], expected_query)
    assert ratio > 90
Example #47
def fuzzy_search(query, products_list):

    # split the query into a list of tokens
    all_tokens = query.split(' ')
    main_token = all_tokens[-1]

    # remove the main_token from the list of tokens
    all_tokens.pop()

    prefix_tokens = all_tokens

    main_token_matches = []

    for product in products_list:
        product_string = product.name + ' ' + product.brand
        if fuzz.token_set_ratio(main_token, product_string) == 100:
            main_token_matches.append(product)

    matching_products = []

    for product in main_token_matches:
        product_string = product.name + ' ' + product.brand
        match = fuzz.token_set_ratio(prefix_tokens, product_string)
        score = fuzz.token_sort_ratio(prefix_tokens, product_string)
        if match == 100:
            product.set_score(score)
            matching_products.append(product)

    sorted_matching_products = sorted(matching_products,
                                      key=lambda x: x.score,
                                      reverse=True)

    if len(sorted_matching_products) > 10:
        sorted_matching_products = sorted_matching_products[:10]

    return sorted_matching_products
Example #48
    def get_file_url(self, file_string, line):
        matching_paths = []
        file_to_search = file_string
        user_match = self.file_pattern.search(file_string)
        if user_match:
            file_to_search = user_match.group(1)
        for path in self.current_paths:
            current_path = path
            path_match = self.file_pattern.search(path)
            if path_match:
                current_path = path_match.group(1)
            if fuzz.token_sort_ratio(file_to_search, current_path) >= 75:
                matching_paths.append(path)

        result = process.extractOne(file_string,
                                    matching_paths,
                                    scorer=fuzz.token_set_ratio)
        if result:
            return "https://github.com/{own}/{repo}/blob/master/{p}{l}".format(
                own=self.owner,
                repo=self.repo,
                p=result[0],
                l=line if line else '')
        return None
Example #49
def anotherSearch(field, data, results_number):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    aux = {}
    result = []
    mR = 4
    mP = 2
    mS = 4
    for k in data:
        value_ratio = fuzz.ratio(field, data[k])
        value_part = fuzz.partial_ratio(field, data[k])
        value_sort = fuzz.token_sort_ratio(field, data[k])
        #aux[k]=str(int((value_part* + value_ratio + value_sort) / 3))+":"+str(value_ratio)
        aux[k] = int((value_part * mP + value_ratio * mR + value_sort * mS) /
                     (mR + mP + mS))
    sorted_results = sorted(aux.items(),
                            key=lambda kv: kv[1],
                            reverse=True)
    results_number = results_number if len(
        sorted_results) >= results_number else len(sorted_results)
    for x, y in sorted_results[:results_number]:  # By value
        result.append((y, x))

    return result
Example #50
def _is_rdns_match(rdns_list_a: List[x509.RelativeDistinguishedName], rdns_list_b: List[x509.RelativeDistinguishedName]) \
        -> Tuple[bool, List[x509.NameAttribute]]:
    """
    Performs fuzzy search to check if two RDNS records are the same. Only checks if the record type exists in both
    :param rdns_list_a:
    :param rdns_list_b:
    :return:
    """
    retval = True  # default to assuming same
    diff_list = list()
    for rdns_a in rdns_list_a:
        for name_attr_a in rdns_a:
            for rdns_b in rdns_list_b:
                for name_attr_b in rdns_b:
                    assert isinstance(name_attr_a, x509.NameAttribute)
                    assert isinstance(name_attr_b, x509.NameAttribute)
                    # if OID matches, compare their values
                    if name_attr_a.oid == name_attr_b.oid and name_attr_a.value != name_attr_b.value:
                        # does fuzzy search to check if there is < 80% match b/w values
                        if fuzz.token_sort_ratio(name_attr_a.value, name_attr_b.value) < 80 and not \
                                _special_case_nameattr_equivalence(name_attr_a, name_attr_b):
                            retval = False
                            diff_list.append((name_attr_a, name_attr_b))
    return retval, diff_list
Example #51
def fastExact_function(search_subject):
    fast_url = api_base_url + '?&query=' + search_subject
    fast_url += '&queryIndex=suggestall&queryReturn=suggestall,idroot,auth,tag,raw&suggest=autoSubject&rows=5&wt=json'
    try:
        data = requests.get(fast_url).json()
        for item in data:
            if item == 'response':
                response = data.get(item)
                if response.get('numFound') > 0:
                    for metadata in response:
                        if metadata == 'docs':
                            keyInfo = response.get(metadata)
                            for info in keyInfo:
                                auth_name = info.get('auth')
                                fast_id = info.get('idroot')
                                ratio = fuzz.token_sort_ratio(auth_name, search_subject)
                                if auth_name == search_subject or ratio >= 95:
                                    result_dict['auth_name'] = auth_name
                                    result_dict['fast_id'] = fast_id
                                    break
                                else:
                                    pass
    except ValueError:
        pass
Example #52
def fungsi():
    while True:
        menu()
        pil = input("Pilihan anda : ")
        if pil == "1":
            kata1 = input("Masukan kata 1 : ")
            kata2 = input("Masukan kata 2 : ")
            print("Nilai : ", fuzz.ratio(kata1,kata2))
        elif pil == "2":
            kata1 = input("Masukan kata 1 : ")
            kata2 = input("Masukan kata 2 : ")
            print("Nilai : ", fuzz.partial_ratio(kata1, kata2))
        elif pil == "3":
            kata1 = input("Masukan kata 1 : ")
            kata2 = input("Masukan kata 2 : ")
            print("Nilai : ", fuzz.token_sort_ratio(kata1, kata2))
        elif pil == "4":
            kata1 = input("Masukan kata 1 : ")
            kata2 = input("Masukan kata 2 : ")
            print("Nilai : ", fuzz.token_set_ratio(kata1, kata2))
        elif pil == "5":
            exit()
        else:
            print("Pilihan anda salah!!")
Example #53
def print_sim_compare(t1, t2, stopwords):
    """
    Debug/testing function. Prints out similarity scores.

    :param t1: str, text to compare
    :param t2: str, text to compare against

    """
    t1 = get_nouns(tokenize_and_stem(t1.strip(), stopwords))
    t2 = get_nouns(tokenize_and_stem(t2.strip(), stopwords))
    print('Jaccard:', jaccard_similarity(t1, t2))
    print('Ratio:', fuzz.ratio(" ".join(t1), " ".join(t2)) / 100)
    print('Partial Ratio:',
          fuzz.partial_ratio(" ".join(t1), " ".join(t2)) / 100)
    print('Token Set Ratio:',
          fuzz.token_set_ratio(" ".join(t1), " ".join(t2)) / 100)
    print('Token Sort Ratio:',
          fuzz.token_sort_ratio(" ".join(t1), " ".join(t2)) / 100)

    # Calculate similarity score
    print("noun tokens", t1, t2)
    score = calc_similarity_score(t1, t2)

    print("Score would be:", score)
Example #54
def matching(value1, value2, factor=None):
	"""

	:rtype : object
	"""

	fuzzy = []
	fuzzy.append(fuzz.ratio(value1.lower(), value2.lower()))
	fuzzy.append(fuzz.partial_ratio(value1.lower(), value2.lower()))
	fuzzy.append(fuzz.token_set_ratio(value1.lower(), value2.lower()))
	fuzzy.append(fuzz.token_sort_ratio(value1.lower(), value2.lower()))

	log.debug("=" * 50)
	log.debug('Fuzzy Compare: {} - {}'.format(value1.lower(), value2.lower()))
	log.debug("-" * 50)
	log.debug('{}: Simple Ratio'.format(fuzzy[0]))
	log.debug('{}: Partial Ratio'.format(fuzzy[1]))
	log.debug('{}: Token Set Ratio'.format(fuzzy[2]))
	log.debug('{}: Token Sort Ratio'.format(fuzzy[3]))

	if factor:      # Will return True or False
		log.debug('Return with Factor - {}: {}'.format(factor, any([fr >= factor for fr in fuzzy])))
		return any([fr >= factor for fr in fuzzy])

	score = 0
	entries = 0
	for fr in fuzzy:
		score += fr
		if fr > 0: entries += 1

	if entries > 0: score = score/entries
	else: score = 0

	log.debug('Return without Factor - Score: {}'.format(score))

	return score
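
With factor set, matching() behaves as a boolean gate; without it, the averaged score of the non-zero ratios is returned. A short usage sketch (the titles are invented, and the module-level log must be configured):

# True if any of the four ratios reaches 90.
if matching("The Matrix", "Matrix, The", factor=90):
    print("close enough")

# Averaged 0-100 score of the non-zero ratios.
print(matching("The Matrix", "The Matrix Reloaded"))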
Exemplo n.º 55
 def update(self, events) -> None:
     screen.blit(self.image, self.rect)
     if not self.done:
         if self.rect.collidepoint(pygame.mouse.get_pos()) and \
                 pygame.mouse.get_pressed()[0] and not self.asking:
             self.asking = True
             door_open.play()
         else:
             if self.asking and not self.rect.collidepoint(
                 pygame.mouse.get_pos()) and \
                     pygame.mouse.get_pressed()[0]:
                 self.asking = False
                 door_close.play()
         if self.asking:
             self.input.update(events)
             inp = self.input.get_surface()
             inp_rect = inp.get_rect()
             inp_rect.centerx = WIDTH // 2
             inp_rect.y = self.rect.y - 40
             screen.blit(inp, inp_rect)
             q = question_font.render(self.question, True, BLACK)
             screen.blit(q,
                         (WIDTH // 2 - q.get_width() // 2, 50))
             for event in events:
                 if event.type == KEYDOWN:
                     if event.key == K_RETURN:
                         if fuzz.token_sort_ratio(self.answer,
                                                  self.input.get_text()
                                                  ) > 85:
                             self.asking = False
                             self.image = self.door_done
                             self.done = True
                             door_close.play()
                             player.pass_door(self.part_name, self.question)
                         else:
                             wrong_sound.play()
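
The answer check at the core of this sprite needs no pygame to try out; token_sort_ratio lets a typed answer survive word-order changes and small typos. A standalone sketch with invented strings and the same 85 threshold:

from fuzzywuzzy import fuzz

answer = "isaac newton"
for typed in ("Newton Isaac", "isac newton", "albert einstein"):
    accepted = fuzz.token_sort_ratio(answer, typed) > 85
    print(typed, "->", "accepted" if accepted else "rejected")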
Exemplo n.º 56
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df
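
preprocess, get_token_features and get_longest_substr_ratio are defined elsewhere in this example, but the fuzzy block can be tried on its own with a toy frame. A minimal sketch, assuming pandas and fuzzywuzzy:

import pandas as pd
from fuzzywuzzy import fuzz

df = pd.DataFrame({
    "question1": ["how do i learn python", "what is ai"],
    "question2": ["how can i learn python fast", "what is machine learning"],
})
df["token_set_ratio"] = df.apply(
    lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
df["token_sort_ratio"] = df.apply(
    lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
print(df[["token_set_ratio", "token_sort_ratio"]])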
Exemplo n.º 57
def build_handcraft_text_feats(tmp_df):
    df = tmp_df.copy()
    df['len_title_1'] = df.title_1.apply(lambda x: len(str(x)))
    df['len_title_2'] = df.title_2.apply(lambda x: len(str(x)))
    df['diff_len'] = df.len_title_1 - df.len_title_2
    df['abs_diff_len'] = abs(df.len_title_1 - df.len_title_2)
    df['len_char_title_1'] = df.title_1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    df['len_char_title_2'] = df.title_2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    df['len_word_title_1'] = df.title_1.apply(lambda x: len(str(x).split()))
    df['len_word_title_2'] = df.title_2.apply(lambda x: len(str(x).split()))
    df['common_words'] = df.apply(
        lambda x: len(set(str(x['title_1']).lower().split()).intersection(set(str(x['title_2']).lower().split()))),
        axis=1)
    df['fuzz_qratio'] = df.apply(lambda x: fuzz.QRatio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['title_1']), str(x['title_2'])), axis=1)
    df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['title_1']), str(x['title_2'])),
                                           axis=1)
    df['txt_hamming'] = df.apply(
        lambda x: textdistance.hamming.normalized_similarity(str(x['title_1']), str(x['title_2'])), axis=1)
    df['txt_damerau_levenshtein'] = df.apply(
        lambda x: textdistance.damerau_levenshtein.normalized_similarity(str(x['title_1']), str(x['title_2'])), axis=1)
    df['txt_jaro_winkler'] = df.apply(
        lambda x: textdistance.jaro_winkler.normalized_similarity(str(x['title_1']), str(x['title_2'])), axis=1)
    df['txt_overlap'] = df.apply(
        lambda x: textdistance.overlap.normalized_similarity(str(x['title_1']), str(x['title_2'])), axis=1)
    df['txt_mra'] = df.apply(lambda x: textdistance.mra.normalized_similarity(str(x['title_1']), str(x['title_2'])),
                             axis=1)
    df.drop(columns=['title_1', 'title_2'], inplace=True)

    return df
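
Given pandas, fuzzywuzzy and textdistance, build_handcraft_text_feats is self-contained; only title_1 and title_2 columns are needed. An illustrative call on made-up product titles:

import pandas as pd

pairs = pd.DataFrame({
    "title_1": ["apple iphone 12 64gb", "samsung galaxy s21"],
    "title_2": ["iphone 12 apple 64 gb", "galaxy s21 ultra samsung"],
})
feats = build_handcraft_text_feats(pairs)
print(feats.filter(like="fuzz_"))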
Exemplo n.º 58
 def calculateMetric(self, strA, strB, metric):
     if metric == 'ratio':
         return fuzz.ratio(strA, strB)
     if metric == 'partial_ratio':
         return fuzz.partial_ratio(strA, strB)
     if metric == 'token_sort_ratio':
         return fuzz.token_sort_ratio(strA, strB)
     if metric == 'token_set_ratio':
         return fuzz.token_set_ratio(strA, strB)
     if metric == 'distance':
         return Levenshtein.distance(strA, strB)
     if metric == 'l_ratio':
         return Levenshtein.ratio(strA, strB)
     if metric == 'jaro':
         return Levenshtein.jaro(strA, strB)
     if metric == 'jaro_winkler':
         return Levenshtein.jaro_winkler(strA, strB)
     if metric == 'setratio':
         return Levenshtein.setratio(strA, strB)
     if metric == 'seqratio':
         return Levenshtein.seqratio(strA, strB)
     if metric == 'longestnumericseq':
         return longestNumericSubstringMetric(strA, strB)
     return None
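
One way to exercise the dispatcher across the string metrics (longestnumericseq is skipped because its helper is defined elsewhere, and `matcher` stands in for an instance of the surrounding class):

metrics = ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio',
           'distance', 'l_ratio', 'jaro', 'jaro_winkler']
for m in metrics:
    print(m, matcher.calculateMetric("jon smith", "john smith", m))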
Exemplo n.º 59
def get_largest_match(title, skills, ngram):
    # best fuzzy match per skill, collected as {skill: score}
    dic = {}

    # the n-grams of the title are the same for every skill,
    # so compute them once up front
    n_gram_list = get_ngrams(title, ngram)

    # try every skill in the skill list
    for sk in skills:
        match_degree = 0
        match_gram = ''

        # try every n-gram of the title against this skill
        for gram in n_gram_list:
            # similarity between the skill and this slice of the title
            similarity_degree = fuzz.token_sort_ratio(sk, gram)

            # keep the best score seen so far for this skill
            if similarity_degree > match_degree:
                match_degree = similarity_degree
                match_gram = gram

        # key by the current skill so a zero score cannot
        # overwrite the entry of a previously matched skill
        dic[sk] = match_degree

    # return (skill, score) pairs, best match first
    return sorted(dic.items(), key=lambda x: x[1], reverse=True)
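
get_ngrams is defined elsewhere in this example; with a simple stand-in for it, a call might look like this (skills and title invented):

def get_ngrams(text, n):
    # Stand-in for the elided helper: contiguous word n-grams.
    words = text.split()
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

skills = ["machine learning", "data analysis", "project management"]
print(get_largest_match("senior machine learning engineer", skills, 2))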
Exemplo n.º 60
 def search_api_LC(self):
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     dym = "http://id.loc.gov" + self.query_type + "/didyoumean/?label=" + urllib.parse.quote(self.name.encode('utf8'))
     try:
         dym_result = requests.get(dym)
         # get the results in the form of an XML tree
         dym_results = ETree.fromstring(dym_result.content)
         for result in dym_results.iter('{http://id.loc.gov/ns/id_service#}term'):
             # get the "name"
             candidateD = result.text
             # get the URI (LC ID)
             uriD = result.get('uri')
             scoreD = fuzz.token_sort_ratio(candidateD, self.name)
             # if the similarity score is greater than the cut-off, add the
             # "name", LC ID and similarity score to the dict; setdefault
             # preserves matches found earlier in the loop
             if scoreD > self.th:
                 self.scores.setdefault('LC', {}).setdefault('lcid', {})
                 self.scores['LC']['lcid'][uriD] = [candidateD, scoreD]
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
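
The did-you-mean endpoint can also be probed outside the class; the URL shape and XML namespace below are taken from the method above, while the query type and label are only examples:

import urllib.parse
import xml.etree.ElementTree as ETree

import requests
from fuzzywuzzy import fuzz

name = "Mark Twain"                    # example label
query_type = "/authorities/names"      # example LC dataset
url = ("http://id.loc.gov" + query_type + "/didyoumean/?label="
       + urllib.parse.quote(name.encode("utf8")))
tree = ETree.fromstring(requests.get(url).content)
for term in tree.iter('{http://id.loc.gov/ns/id_service#}term'):
    print(term.get('uri'), term.text, fuzz.token_sort_ratio(term.text, name))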