def find_mapped_forbes_2000(): forbes_2000_file=open('/home/matt/Desktop/Industry_Ratings/data/Forbes_Global_2000_2013.csv','rb') mapped_entities_file=open('/home/matt/Desktop/Industry_Ratings/data/entities.csv','rb') matches_file=open('/home/matt/Desktop/Industry_Ratings/data/matches_name.csv','w') count = 0 #lines = sum(1 for line in forbes_2000_file) #print "number of forbes lines: ", lines #forbes_2000_file.seek(0) for line in forbes_2000_file: #print "in outer loop" forbes_2000_company = line.decode('utf-8').split(",") mapped_entities_file.seek(0) matched_entity = "--,--" for item in mapped_entities_file: #print "in inner loop" mapped_entity = item.decode('utf-8').split(",") forbes_processed = utils.full_process(forbes_2000_company[0]) entity_processed = utils.full_process(mapped_entity[1]) if (fuzz.ratio(forbes_processed,entity_processed) > 85): #print forbes_processed, entity_processed matched_entity = mapped_entity[0] + "," + string_processing.StringProcessor.strip(mapped_entity[1]) count +=1 break match = forbes_2000_company[0] + "," + forbes_2000_company[2] + "," + matched_entity + "," + forbes_2000_company[8] print match.encode('utf-8') matches_file.write(match.encode('utf-8')) forbes_2000_file.close() mapped_entities_file.close() matches_file.close() print "total matches: ", count
def find_mapped_S_P_500(): s_p_500_file=open('/home/matt/Desktop/Industry_Ratings/data/S_P_500_2013.csv','rb') mapped_entities_file=open('/home/matt/Desktop/Industry_Ratings/data/entities.csv','rb') matches_file=open('/home/matt/Desktop/Industry_Ratings/data/s_p_500_matches_name.csv','w') count = 0 #lines = sum(1 for line in forbes_2000_file) #print "number of forbes lines: ", lines #forbes_2000_file.seek(0) for line in s_p_500_file: #print "in outer loop" print line mapped_entities_file.seek(0) matched_entity = "--,--," for item in mapped_entities_file: #print "in inner loop" mapped_entity = item.decode('utf-8').split(",") s_p_500_processed = utils.full_process(line) entity_processed = utils.full_process(mapped_entity[1]) if (fuzz.ratio(s_p_500_processed,entity_processed) > 85): #print forbes_processed, entity_processed matched_entity = mapped_entity[0] + "," + string_processing.StringProcessor.strip(mapped_entity[1]) + "," count +=1 break match = matched_entity + line #hereprint match.encode('utf-8') matches_file.write(match.encode('utf-8')) s_p_500_file.close() mapped_entities_file.close() matches_file.close() print "total matches: ", count
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio: fully process both strings, then score them with ratio().

    Returns 0 if either string is empty/invalid after processing.
    """
    processed1 = utils.full_process(s1, force_ascii=force_ascii)
    processed2 = utils.full_process(s2, force_ascii=force_ascii)
    # Bail out early when either side carries no usable content.
    if not utils.validate_string(processed1):
        return 0
    if not utils.validate_string(processed2):
        return 0
    return ratio(processed1, processed2)
def _find_name_or_alias_match(self, objects, query): """ Performs name and alias matches on a list of objects. Returns the best match, or ``None`` if nothing was found. :param iterable objects: A list of ``BaseObject`` sub-class instances to attempt to match to. :param str query: The string to match against. :rtype: BaseObject :returns: The best match object for the given query. """ if not objects: return None for obj in objects: # Start by checking all objects for an alias match. aliases = [alias.lower() for alias in obj.aliases] if query.lower() in aliases: # If a match is found, return immediately on said match. return obj processor = lambda x: fuzz_utils.full_process(x) for choice in objects: processed = processor(choice.name) if query in processed: return choice return None
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Find best matches in a list of choices, return a list of tuples
       containing the match and its score.

    Arguments:
        query -- an object representing the thing we want to find
        choices -- a list of objects we are attempting to extract values from
        scorer -- f(OBJ, QUERY) --> INT.
            We will return the objects with the highest score
            by default, we use score.WRatio() and both OBJ and QUERY should be strings
        processor -- f(OBJ_A) --> OBJ_B, where the output is an input to scorer
            for example, "processor = lambda x: x[0]"
            would return the first element in a collection x (of, say, strings)
            this would then be used in the scoring collection
            by default, we use utils.full_process()
        limit -- maximum number of (choice, score) tuples returned
    """
    if choices is None or len(choices) == 0:
        return []

    # default, turn whatever the choice is into a workable string
    if processor is None:
        processor = lambda x: utils.full_process(x)

    # default: wratio
    if scorer is None:
        scorer = WRatio

    # Score every choice; don't shadow the builtin `tuple` as a local name.
    scored = [(choice, scorer(query, processor(choice))) for choice in choices]
    # Stable sort keeps the original order among equal scores.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:limit]
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Find all alphanumeric tokens in each string...
        - treat them as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those two strings
        - controls for unordered partial matches"""

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens -- p1/p2 are already fully processed; the original ran
    # full_process on them a second time, which is redundant because
    # full_process is idempotent.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    combined_1to2 = sorted_sect + " " + sorted_1to2
    combined_2to1 = sorted_sect + " " + sorted_2to1

    # strip
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()

    if partial:
        ratio_func = partial_ratio
    else:
        ratio_func = ratio

    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1)
    ]
    return max(pairwise)
def _process_and_sort(s, force_ascii):
    """Return a cleaned string with its tokens sorted alphabetically."""
    cleaned = utils.full_process(s, force_ascii=force_ascii)
    # Tokenize, sort, and rejoin with single spaces.
    return u" ".join(sorted(cleaned.split())).strip()
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    Combines the plain ratio with (partial) token-sort and token-set
    ratios, weighting the partial variants by how different the two
    string lengths are.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    unbase_scale = .95
    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # Strings of similar length: partial matching adds nothing, use the
    # full token-sort / token-set scores.
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    # Very different lengths: trust partial matches much less.
    partial_scale = .6 if len_ratio > 8 else .90
    partial = partial_ratio(p1, p2) * partial_scale
    ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale
    ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale
    return utils.intr(max(base, partial, ptsor, ptser))
def text_normalize(raw):
    """
    Borrow normalization from fuzzywuzzy. This uses ascii; should be replaced.
    """
    # Force ascii first, then strip/lowercase via full_process.
    ascii_text = fuzzutils.asciidammit(raw)
    processed = fuzzutils.full_process(ascii_text)
    # Collapse any run of whitespace to a single space.
    return ' '.join(processed.split())
def _link_plenary_deputies(self): queryset = Deputy.objects.all() # items = AgendaItem.objects.all() # Start from 2010 for now as we don't have old deputies anyway items = AgendaItem.objects.filter(plenary__date__gt='2010-01-01', speaker_id=None) count = -1 tot = len(items) # str -> Deputy cache = {} for a in items: count += 1 stripped = self._strip_speaker_name(a.speaker) if stripped is None: #print "IGNORE %s (%d / %d)" % (a.speaker, count, tot) continue if cache.get(stripped) is not None: deputy = cache[stripped] else: match = process.extractOne(stripped, queryset, score_cutoff=FUZZY_THRESHOLD, processor=lambda x: utils.full_process(x.full_name)) if match is None: deputy = None else: deputy = match[0] cache[stripped] = deputy if deputy is not None: print "MATCH %s with %s (%d / %d)" % (a.speaker, deputy.full_name, count, tot) else: print "FAILED %s (%d / %d)" % (a.speaker, count, tot) a.speaker_id = deputy a.save()
def test_fullProcess(self):
    # Smoke test: full_process must accept every mixed string without raising.
    for sample in self.mixed_strings:
        utils.full_process(sample)
# NOTE(review): this long interactive function matches an author's full name
# against candidate rows ("roll"), scoring non-initial first names with
# fuzz.ratio, offering the user a numbered pick via raw_input, and falling
# back to creating a new vcard individual. The source here is collapsed and
# garbled by extraction: a string literal ("...exist in the database. " /
# "Please choose a ") is split across the two physical lines below, so the
# code is preserved byte-for-byte rather than reformatted. The local `exit`
# flag is assigned but never used, and `exit` shadows the builtin — clean up
# when the original formatting is recovered.
def name_selecter(roll, full_name, g, first_name, surname, pub_uri, matchlist, rank=None): # if none of the possibilities are foaf, just make a new vcard # code if len(roll) > 0: # the api found matching last names exit = False foaf = False scoredlist = [] for idx, val in enumerate(roll): if roll[int(idx)][4]: (author_uri, uritype) = roll[int(idx)][4], 'foaf' foaf = True else: (author_uri, uritype) = roll[int(idx)][3], 'vcard' ''' # Map to foaf object if it exists, otherwise vcard individual if roll[int(idx)][4]: (author_uri,uritype) = roll[int(idx)][4],'foaf' else: (author_uri,uritype) = roll[int(idx)][3],'vcard' ''' author_uri = author_uri.replace(D, '') rollname = (roll[idx][0] + ' ' + roll[idx][1] + ' ' + roll[idx][2] if roll[idx][1] else roll[idx][0] + ' ' + roll[idx][2]) try: # Weird character encoding things going on hur full_name.decode('ascii') fuzzy_name = None except UnicodeEncodeError: fuzzy_name = utils.full_process(full_name, force_ascii=True) if len(roll[idx][0]) > 2: # Don't bother scoring against initials fuzznum = (fuzz.ratio(rollname, fuzzy_name) if fuzzy_name else fuzz.ratio(rollname, full_name)) # raw_input(rollname+' vs. '+full_name+str(fuzznum)) if fuzznum == 100: matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist scoredlist.append([ roll[idx][0], roll[idx][1], roll[idx][2], author_uri, uritype, fuzznum ]) else: scoredlist.append([ roll[idx][0], roll[idx][1], roll[idx][2], author_uri, uritype, None ]) if foaf is True: scoredlist = sorted(scoredlist, key=itemgetter(5), reverse=True) for idx, val in enumerate(scoredlist): # Add a handy index number for display scoredlist[idx].insert(0, idx) if scoredlist[idx][6] is None: scoredlist[idx][6] = '-' # Add a hash for prettiness print( tabulate(scoredlist, headers=[ 'num', 'first', 'middle', 'last', 'uri', 'type', 'score' ])) if fuzzy_name: pick = raw_input("\nAuthor " + fuzzy_name + " may already " "exist in the database. 
Please choose a " "number or press Enter for none ") else: pick = raw_input("\nAuthor " + full_name + " may already exist " "in the database. Please choose a number or " "press Enter for none ") while True: if pick == '': # None of the above # Create a new vcard individual author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) break elif pick == 'RDF': print(g.serialize(format='turtle')) raw_input('\nYou found the RDF easter egg, look at you! ' 'Press Enter to continue\n') # Temporary testing shortcut print( tabulate(scoredlist, headers=[ 'num', 'first', 'middle', 'last', 'uri', 'score' ])) pick = raw_input("\nAuthor " + full_name + " may already " "exist in the database. Please choose a " "number or press Enter for none ") elif pick.isdigit(): if int(pick) < len(roll): # Make sure the number is valid author_uri = scoredlist[int(pick)][4] matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) break else: # Number out of range pick = raw_input('invalid input, try again ') else: # Either not a number or not empty pick = raw_input('invalid input, try again ') return matchlist else: # No matches, make new uri author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist else: # No matches, make new uri author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist
# NOTE(review): this appears to be a duplicate definition of name_selecter
# (differing only in formatting) — in a single module the later definition
# silently shadows the earlier one; confirm whether both copies are needed.
# The source is collapsed and garbled by extraction: a string literal
# ("...exist in the database. " / "Please choose a ") is split across the two
# physical lines below, so the code is preserved byte-for-byte rather than
# reformatted.
def name_selecter(roll, full_name, g, first_name, surname, pub_uri, matchlist, rank=None): # if none of the possibilities are foaf, just make a new vcard # code if len(roll) > 0: # the api found matching last names exit = False foaf = False scoredlist = [] for idx, val in enumerate(roll): if roll[int(idx)][4]: (author_uri, uritype) = roll[int(idx)][4], 'foaf' foaf = True else: (author_uri, uritype) = roll[int(idx)][3], 'vcard' ''' # Map to foaf object if it exists, otherwise vcard individual if roll[int(idx)][4]: (author_uri,uritype) = roll[int(idx)][4],'foaf' else: (author_uri,uritype) = roll[int(idx)][3],'vcard' ''' author_uri = author_uri.replace(D, '') rollname = (roll[idx][0] + ' ' + roll[idx][1] + ' ' + roll[idx][2] if roll[idx][1] else roll[idx][0] + ' ' + roll[idx][2]) try: # Weird character encoding things going on hur full_name.decode('ascii') fuzzy_name = None except UnicodeEncodeError: fuzzy_name = utils.full_process(full_name, force_ascii=True) if len(roll[idx][0]) > 2: # Don't bother scoring against initials fuzznum = (fuzz.ratio(rollname, fuzzy_name) if fuzzy_name else fuzz.ratio(rollname, full_name)) # raw_input(rollname+' vs. '+full_name+str(fuzznum)) if fuzznum == 100: matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist scoredlist.append([roll[idx][0], roll[idx][1], roll[idx][2], author_uri, uritype, fuzznum]) else: scoredlist.append([roll[idx][0], roll[idx][1], roll[idx][2], author_uri, uritype, None]) if foaf is True: scoredlist = sorted(scoredlist, key=itemgetter(5), reverse=True) for idx, val in enumerate(scoredlist): # Add a handy index number for display scoredlist[idx].insert(0, idx) if scoredlist[idx][6] is None: scoredlist[idx][6] = '-' # Add a hash for prettiness print(tabulate(scoredlist, headers=['num', 'first', 'middle', 'last', 'uri', 'type', 'score'])) if fuzzy_name: pick = raw_input("\nAuthor " + fuzzy_name + " may already " "exist in the database. 
Please choose a " "number or press Enter for none ") else: pick = raw_input("\nAuthor " + full_name+" may already exist " "in the database. Please choose a number or " "press Enter for none ") while True: if pick == '': # None of the above # Create a new vcard individual author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) break elif pick == 'RDF': print(g.serialize(format='turtle')) raw_input('\nYou found the RDF easter egg, look at you! ' 'Press Enter to continue\n') # Temporary testing shortcut print(tabulate(scoredlist, headers=['num', 'first', 'middle', 'last', 'uri', 'score'])) pick = raw_input("\nAuthor " + full_name + " may already " "exist in the database. Please choose a " "number or press Enter for none ") elif pick.isdigit(): if int(pick) < len(roll): # Make sure the number is valid author_uri = scoredlist[int(pick)][4] matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) break else: # Number out of range pick = raw_input('invalid input, try again ') else: # Either not a number or not empty pick = raw_input('invalid input, try again ') return matchlist else: # No matches, make new uri author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist else: # No matches, make new uri author_uri = new_vcard(first_name, surname, full_name, g) matchlist = assign_authorship(author_uri, g, pub_uri, full_name, matchlist, rank) return matchlist
def lines_processor(item):
    """Return fuzzy-processed text for *item*.

    Strings are processed as-is; any other object is represented by
    concatenating its ``name`` and ``tags`` attributes.
    """
    # Replaces the original `if isinstance(...): pass / else:` anti-pattern
    # with the direct negated condition; behavior is unchanged.
    if not isinstance(item, str):
        item = item.name + ' ' + item.tags
    return fuzzy_utils.full_process(item)
def laaz_process(s): return fuzzyutils.full_process(re.sub(ur"[\":]", "", s), False)
def testCaseInsensitive(self):
    # The raw strings differ (e.g. in case), so a plain ratio is not 100;
    # after full_process normalization they must compare identical.
    self.assertNotEqual(fuzz.ratio(self.s1, self.s2), 100)
    processed1 = utils.full_process(self.s1)
    processed2 = utils.full_process(self.s2)
    self.assertEqual(fuzz.ratio(processed1, processed2), 100)
def testCaseInsensitive(self):
    # Unprocessed inputs must not score a perfect match...
    self.assertNotEqual(fuzz.ratio(self.s1, self.s2), 100)
    # ...but after normalization via full_process they must.
    normalized = [utils.full_process(s) for s in (self.s1, self.s2)]
    self.assertEqual(fuzz.ratio(normalized[0], normalized[1]), 100)
def test_fullProcessForceAscii(self):
    # Smoke test: force_ascii=True must handle every mixed string
    # without raising.
    for sample in self.mixed_strings:
        utils.full_process(sample, force_ascii=True)
def laaz_process(s): return fuzzyutils.full_process(re.sub(ur'[\":]','', s), False)
def laaz_process(s):
    """Remove '"' and ':' characters from *s*, then fuzzy-normalize it
    without forcing ascii."""
    cleaned = re.sub(r'[":]', '', s)
    return fuzzyutils.full_process(cleaned, False)