示例#1
0
def find_mapped_forbes_2000():
    forbes_2000_file=open('/home/matt/Desktop/Industry_Ratings/data/Forbes_Global_2000_2013.csv','rb')
    mapped_entities_file=open('/home/matt/Desktop/Industry_Ratings/data/entities.csv','rb')
    matches_file=open('/home/matt/Desktop/Industry_Ratings/data/matches_name.csv','w')

    count = 0
    #lines = sum(1 for line in forbes_2000_file)
    #print "number of forbes lines: ", lines
    #forbes_2000_file.seek(0)
    for line in forbes_2000_file:
        #print "in outer loop"
        forbes_2000_company = line.decode('utf-8').split(",")
        mapped_entities_file.seek(0)
        matched_entity = "--,--"
        for item in mapped_entities_file:
            #print "in inner loop"
            mapped_entity = item.decode('utf-8').split(",")
            forbes_processed = utils.full_process(forbes_2000_company[0])
            entity_processed = utils.full_process(mapped_entity[1])
            if (fuzz.ratio(forbes_processed,entity_processed) > 85):
                #print forbes_processed, entity_processed
                matched_entity = mapped_entity[0] + "," + string_processing.StringProcessor.strip(mapped_entity[1])
                count +=1
                break
        match = forbes_2000_company[0] + "," + forbes_2000_company[2] + "," + matched_entity + "," + forbes_2000_company[8]
        print match.encode('utf-8')
        matches_file.write(match.encode('utf-8'))
    forbes_2000_file.close()
    mapped_entities_file.close()
    matches_file.close()
    print "total matches: ", count
示例#2
0
def find_mapped_S_P_500():
    s_p_500_file=open('/home/matt/Desktop/Industry_Ratings/data/S_P_500_2013.csv','rb')
    mapped_entities_file=open('/home/matt/Desktop/Industry_Ratings/data/entities.csv','rb')
    matches_file=open('/home/matt/Desktop/Industry_Ratings/data/s_p_500_matches_name.csv','w')

    count = 0
    #lines = sum(1 for line in forbes_2000_file)
    #print "number of forbes lines: ", lines
    #forbes_2000_file.seek(0)
    for line in s_p_500_file:
        #print "in outer loop"
        print line
        mapped_entities_file.seek(0)
        matched_entity = "--,--,"
        for item in mapped_entities_file:
            #print "in inner loop"
            mapped_entity = item.decode('utf-8').split(",")
            s_p_500_processed = utils.full_process(line)
            entity_processed = utils.full_process(mapped_entity[1])
            if (fuzz.ratio(s_p_500_processed,entity_processed) > 85):
                #print forbes_processed, entity_processed
                matched_entity = mapped_entity[0] + "," + string_processing.StringProcessor.strip(mapped_entity[1]) + ","
                count +=1
                break
        match =   matched_entity + line
        #hereprint match.encode('utf-8')
        matches_file.write(match.encode('utf-8'))
    s_p_500_file.close()
    mapped_entities_file.close()
    matches_file.close()
    print "total matches: ", count
示例#3
0
def QRatio(s1, s2, force_ascii=True):
    """Return ``ratio`` of the two strings after ``utils.full_process``.

    Both inputs are processed (passing ``force_ascii`` through) before any
    validation.  If either processed string is rejected by
    ``utils.validate_string`` the result is 0.
    """
    first, second = [
        utils.full_process(raw, force_ascii=force_ascii) for raw in (s1, s2)
    ]

    both_valid = utils.validate_string(first) and utils.validate_string(second)
    if not both_valid:
        return 0

    return ratio(first, second)
示例#4
0
文件: base.py 项目: gtaylor/dott
    def _find_name_or_alias_match(self, objects, query):
        """
        Performs name and alias matches on a list of objects. Returns the
        first match, or ``None`` if nothing was found.

        Matching happens in two passes, both case-insensitive:

        1. Aliases: the first object having an alias equal to the query.
        2. Names: the first object whose processed name contains the query
           as a substring.

        An alias match on any object takes precedence over a name match on
        an earlier object.

        :param iterable objects: A list of ``BaseObject`` sub-class instances
            to attempt to match to.
        :param str query: The string to match against.
        :rtype: BaseObject
        :returns: The first matching object for the given query, or ``None``.
        """

        if not objects:
            return None

        # Lower-case once; it is reused for every alias and name comparison.
        lowered_query = query.lower()

        for obj in objects:
            # Start by checking all objects for an alias match.
            if lowered_query in (alias.lower() for alias in obj.aliases):
                # If a match is found, return immediately on said match.
                return obj

        for choice in objects:
            # Assumes fuzz_utils.full_process lower-cases its input (as
            # fuzzywuzzy's does) -- so the query must be lower-cased too, or
            # a query containing capitals could never match a name.
            if lowered_query in fuzz_utils.full_process(choice.name):
                return choice

        return None
示例#5
0
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Find the best matches for ``query`` in a collection of choices.

    Arguments:
        query       -- an object representing the thing we want to find
        choices     -- a sized collection of objects to search (``len()`` is
                        called on it); ``None`` is treated as empty
        processor   -- f(CHOICE) --> OBJ, where the output is handed to the
                        scorer.  For example, "processor = lambda x: x[0]"
                        would score the first element of each choice.
                        By default, utils.full_process() is used.
        scorer      -- f(QUERY, OBJ) --> score.  It is called with the query
                        first and the processed choice second.
                        By default, WRatio is used.
        limit       -- maximum number of results to return.  It is used as a
                        slice bound, so ``None`` returns every choice.

    Returns:
        A list of ``(choice, score)`` tuples holding the original (not the
        processed) choices, ordered by descending score.  Choices with equal
        scores keep their relative order from ``choices``.  An empty list is
        returned when there are no choices.
    """
    if choices is None or len(choices) == 0:
        return []

    # default, turn whatever the choice is into a workable string
    if processor is None:
        processor = utils.full_process

    # default: wratio
    if scorer is None:
        scorer = WRatio

    scored = [(choice, scorer(query, processor(choice))) for choice in choices]

    # sorted() is stable, so ties stay in input order even with reverse=True.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]
示例#6
0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Score two strings by comparing their token sets.

    Steps:
        - process both strings and split them into sets of tokens
        - build the sorted intersection, and the intersection followed by
          each side's sorted remainder
        - score the three pairings of those strings and return the highest

    ``partial`` selects ``partial_ratio`` as the scoring function; otherwise
    ``ratio`` is used.  Returns 0 if either processed string fails
    ``utils.validate_string``.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    # token sets of each side
    words1 = set(utils.full_process(p1).split())
    words2 = set(utils.full_process(p2).split())

    shared = " ".join(sorted(words1 & words2))
    only_in_1 = " ".join(sorted(words1 - words2))
    only_in_2 = " ".join(sorted(words2 - words1))

    # strip: a side may be empty, leaving a dangling separator
    shared_plus_1 = (shared + " " + only_in_1).strip()
    shared_plus_2 = (shared + " " + only_in_2).strip()
    shared = shared.strip()

    compare = partial_ratio if partial else ratio

    return max(
        compare(shared, shared_plus_1),
        compare(shared, shared_plus_2),
        compare(shared_plus_1, shared_plus_2),
    )
示例#7
0
def _process_and_sort(s, force_ascii):
    """Process ``s`` and return its whitespace-separated tokens, sorted and
    re-joined with single spaces."""
    cleaned = utils.full_process(s, force_ascii=force_ascii)
    ordered_tokens = sorted(cleaned.split())
    return u" ".join(ordered_tokens).strip()
示例#8
0
def WRatio(s1, s2, force_ascii=True):
    """Return a weighted similarity score for two strings.

    Both strings are processed first; 0 is returned if either processed
    string fails ``utils.validate_string``.  Otherwise the plain ``ratio``
    is combined with scaled token-based scores and the maximum is returned
    through ``utils.intr``:

    - length ratio below 1.5: token sort / token set ratios, scaled by 0.95
    - otherwise: partial ratio and partial token ratios, scaled by 0.90
      (0.6 when the length ratio exceeds 8); the token variants are
      additionally scaled by 0.95
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    unbase_scale = .95

    base = ratio(p1, p2)
    shorter, longer = sorted((len(p1), len(p2)))
    len_ratio = float(longer) / shorter

    # similar lengths: partial matching is skipped entirely
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    # one string much, much shorter than the other: trust partials less
    partial_scale = .6 if len_ratio > 8 else .90

    partial = partial_ratio(p1, p2) * partial_scale
    ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale
    ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale

    return utils.intr(max(base, partial, ptsor, ptser))
示例#9
0
def text_normalize(raw):
    """
    Normalize text with helpers borrowed from fuzzywuzzy.

    The input goes through ``asciidammit`` and then ``full_process``; any
    run of whitespace in the result is collapsed to a single space.
    This relies on ASCII conversion and should be replaced.
    """
    processed = fuzzutils.full_process(fuzzutils.asciidammit(raw))
    words = processed.split()
    return ' '.join(words)
示例#10
0
    def _link_plenary_deputies(self):
        """Fuzzy-match agenda item speakers to deputies and save the result.

        For every ``AgendaItem`` dated after 2010-01-01 whose ``speaker_id``
        is unset, the speaker name is cleaned with
        ``self._strip_speaker_name`` and matched against all ``Deputy``
        objects by ``full_name`` using ``process.extractOne`` with
        ``FUZZY_THRESHOLD`` as the score cutoff.  Items whose cleaned name is
        ``None`` are skipped.  For every other item the outcome is printed,
        ``speaker_id`` is set to the matched deputy (or ``None``) and the
        item is saved.

        Lookup results are memoised per cleaned name, including failed
        lookups, so a name that recurs across agenda items is only run
        through the fuzzy matcher once.
        """
        queryset = Deputy.objects.all()

        # Start from 2010 for now as we don't have old deputies anyway
        items = AgendaItem.objects.filter(plenary__date__gt='2010-01-01', speaker_id=None)

        tot = len(items)

        # str -> Deputy, or None when no deputy scored above the cutoff
        cache = {}

        for count, a in enumerate(items):
            stripped = self._strip_speaker_name(a.speaker)
            if stripped is None:
                continue

            if stripped in cache:
                deputy = cache[stripped]
            else:
                match = process.extractOne(stripped, queryset, score_cutoff=FUZZY_THRESHOLD,
                        processor=lambda x: utils.full_process(x.full_name))

                deputy = None if match is None else match[0]
                # Store misses too: otherwise every repeat of an unmatched
                # name would trigger another full scan of the deputies.
                cache[stripped] = deputy

            if deputy is not None:
                print("MATCH %s with %s (%d / %d)" % (a.speaker, deputy.full_name, count, tot))
            else:
                print("FAILED %s (%d / %d)" % (a.speaker, count, tot))

            a.speaker_id = deputy
            a.save()
示例#11
0
 def test_fullProcess(self):
     """Smoke test: ``full_process`` is called on every mixed string; the
     test passes as long as none of the calls raises."""
     for sample in self.mixed_strings:
         utils.full_process(sample)
def name_selecter(roll,
                  full_name,
                  g,
                  first_name,
                  surname,
                  pub_uri,
                  matchlist,
                  rank=None):
    """Attribute a publication to an existing or a new author record.

    ``roll`` holds candidate records.  Each record is indexed as
    ``[first, middle, last, vcard_uri, foaf_uri]``; the foaf URI is preferred
    when it is set.  The module-level prefix ``D`` is removed from the chosen
    URI.

    Resolution order:

    1. Any candidate whose first name is longer than two characters and
       whose full name scores exactly 100 with ``fuzz.ratio`` is accepted
       immediately.
    2. Otherwise, if at least one candidate has a foaf URI, all candidates
       are tabulated (best score first) and the user is asked to pick a row
       number, or to press Enter to create a new vcard.  Entering ``RDF``
       dumps the graph as turtle and asks again.
    3. Otherwise (no candidates, or none with a foaf URI) a new vcard is
       created with ``new_vcard``.

    :param roll: candidate author records, see above.
    :param full_name: author name as it appears on the publication.
    :param g: graph object; passed through to the helpers and serialised by
        the ``RDF`` shortcut.
    :param first_name: passed to ``new_vcard``.
    :param surname: passed to ``new_vcard``.
    :param pub_uri: passed to ``assign_authorship``.
    :param matchlist: passed to ``assign_authorship``.
    :param rank: passed to ``assign_authorship``.
    :returns: the value returned by ``assign_authorship``.
    """
    headers = ['num', 'first', 'middle', 'last', 'uri', 'type', 'score']

    if len(roll) > 0:  # the api found matching last names
        # Weird character encoding things going on here: fall back to an
        # ASCII-folded name when full_name is not plain ASCII.  This depends
        # only on full_name, so it is done once rather than per candidate.
        try:
            full_name.decode('ascii')
            fuzzy_name = None
        except UnicodeEncodeError:
            fuzzy_name = utils.full_process(full_name, force_ascii=True)
        display_name = fuzzy_name if fuzzy_name else full_name

        foaf = False
        scoredlist = []
        for entry in roll:
            # Map to foaf object if it exists, otherwise vcard individual
            if entry[4]:
                author_uri, uritype = entry[4], 'foaf'
                foaf = True
            else:
                author_uri, uritype = entry[3], 'vcard'
            author_uri = author_uri.replace(D, '')

            if entry[1]:
                rollname = entry[0] + ' ' + entry[1] + ' ' + entry[2]
            else:
                rollname = entry[0] + ' ' + entry[2]

            if len(entry[0]) > 2:  # Don't bother scoring against initials
                fuzznum = fuzz.ratio(rollname, display_name)
                if fuzznum == 100:
                    return assign_authorship(author_uri, g, pub_uri,
                                             full_name, matchlist, rank)
            else:
                fuzznum = None
            scoredlist.append(
                [entry[0], entry[1], entry[2], author_uri, uritype, fuzznum])

        if foaf:
            scoredlist = sorted(scoredlist, key=itemgetter(5), reverse=True)
            for idx, row in enumerate(scoredlist):
                # Add a handy index number for display
                row.insert(0, idx)
                if row[6] is None:
                    row[6] = '-'  # Show a dash instead of a missing score

            prompt = ("\nAuthor " + display_name + " may already exist "
                      "in the database. Please choose a number or "
                      "press Enter for none ")

            print(tabulate(scoredlist, headers=headers))
            pick = raw_input(prompt)

            while True:
                if pick == '':  # None of the above
                    # Create a new vcard individual
                    author_uri = new_vcard(first_name, surname, full_name, g)
                    matchlist = assign_authorship(author_uri, g, pub_uri,
                                                  full_name, matchlist, rank)
                    break
                elif pick == 'RDF':
                    # Temporary testing shortcut
                    print(g.serialize(format='turtle'))
                    raw_input('\nYou found the RDF easter egg, look at you! '
                              'Press Enter to continue\n')
                    # Show the same table and prompt again: the rows carry a
                    # 'type' column, so the full header list is required.
                    print(tabulate(scoredlist, headers=headers))
                    pick = raw_input(prompt)
                elif pick.isdigit():
                    if int(pick) < len(scoredlist):  # Number is a valid row
                        # Index 4 is the URI now that 'num' was prepended.
                        author_uri = scoredlist[int(pick)][4]
                        matchlist = assign_authorship(author_uri, g, pub_uri,
                                                      full_name, matchlist,
                                                      rank)
                        break
                    else:  # Number out of range
                        pick = raw_input('invalid input, try again ')
                else:  # Either not a number or not empty
                    pick = raw_input('invalid input, try again ')
            return matchlist

    # No candidates, or none of them is foaf: make new uri
    author_uri = new_vcard(first_name, surname, full_name, g)
    return assign_authorship(author_uri, g, pub_uri, full_name, matchlist,
                             rank)
def name_selecter(roll, full_name, g, first_name, surname, pub_uri, matchlist,
                  rank=None):
    """Attribute a publication to an existing or a new author record.

    ``roll`` holds candidate records.  Each record is indexed as
    ``[first, middle, last, vcard_uri, foaf_uri]``; the foaf URI is preferred
    when it is set.  The module-level prefix ``D`` is removed from the chosen
    URI.

    Resolution order:

    1. Any candidate whose first name is longer than two characters and
       whose full name scores exactly 100 with ``fuzz.ratio`` is accepted
       immediately.
    2. Otherwise, if at least one candidate has a foaf URI, all candidates
       are tabulated (best score first) and the user is asked to pick a row
       number, or to press Enter to create a new vcard.  Entering ``RDF``
       dumps the graph as turtle and asks again.
    3. Otherwise (no candidates, or none with a foaf URI) a new vcard is
       created with ``new_vcard``.

    :param roll: candidate author records, see above.
    :param full_name: author name as it appears on the publication.
    :param g: graph object; passed through to the helpers and serialised by
        the ``RDF`` shortcut.
    :param first_name: passed to ``new_vcard``.
    :param surname: passed to ``new_vcard``.
    :param pub_uri: passed to ``assign_authorship``.
    :param matchlist: passed to ``assign_authorship``.
    :param rank: passed to ``assign_authorship``.
    :returns: the value returned by ``assign_authorship``.
    """
    headers = ['num', 'first', 'middle', 'last', 'uri', 'type', 'score']

    if len(roll) > 0:  # the api found matching last names
        # Weird character encoding things going on here: fall back to an
        # ASCII-folded name when full_name is not plain ASCII.  This depends
        # only on full_name, so it is done once rather than per candidate.
        try:
            full_name.decode('ascii')
            fuzzy_name = None
        except UnicodeEncodeError:
            fuzzy_name = utils.full_process(full_name, force_ascii=True)
        display_name = fuzzy_name if fuzzy_name else full_name

        foaf = False
        scoredlist = []
        for entry in roll:
            # Map to foaf object if it exists, otherwise vcard individual
            if entry[4]:
                author_uri, uritype = entry[4], 'foaf'
                foaf = True
            else:
                author_uri, uritype = entry[3], 'vcard'
            author_uri = author_uri.replace(D, '')

            if entry[1]:
                rollname = entry[0] + ' ' + entry[1] + ' ' + entry[2]
            else:
                rollname = entry[0] + ' ' + entry[2]

            if len(entry[0]) > 2:  # Don't bother scoring against initials
                fuzznum = fuzz.ratio(rollname, display_name)
                if fuzznum == 100:
                    return assign_authorship(author_uri, g, pub_uri,
                                             full_name, matchlist, rank)
            else:
                fuzznum = None
            scoredlist.append(
                [entry[0], entry[1], entry[2], author_uri, uritype, fuzznum])

        if foaf:
            scoredlist = sorted(scoredlist, key=itemgetter(5), reverse=True)
            for idx, row in enumerate(scoredlist):
                # Add a handy index number for display
                row.insert(0, idx)
                if row[6] is None:
                    row[6] = '-'  # Show a dash instead of a missing score

            prompt = ("\nAuthor " + display_name + " may already exist "
                      "in the database. Please choose a number or "
                      "press Enter for none ")

            print(tabulate(scoredlist, headers=headers))
            pick = raw_input(prompt)

            while True:
                if pick == '':  # None of the above
                    # Create a new vcard individual
                    author_uri = new_vcard(first_name, surname, full_name, g)
                    matchlist = assign_authorship(author_uri, g, pub_uri,
                                                  full_name, matchlist, rank)
                    break
                elif pick == 'RDF':
                    # Temporary testing shortcut
                    print(g.serialize(format='turtle'))
                    raw_input('\nYou found the RDF easter egg, look at you! '
                              'Press Enter to continue\n')
                    # Show the same table and prompt again: the rows carry a
                    # 'type' column, so the full header list is required.
                    print(tabulate(scoredlist, headers=headers))
                    pick = raw_input(prompt)
                elif pick.isdigit():
                    if int(pick) < len(scoredlist):  # Number is a valid row
                        # Index 4 is the URI now that 'num' was prepended.
                        author_uri = scoredlist[int(pick)][4]
                        matchlist = assign_authorship(author_uri, g, pub_uri,
                                                      full_name, matchlist,
                                                      rank)
                        break
                    else:  # Number out of range
                        pick = raw_input('invalid input, try again ')
                else:  # Either not a number or not empty
                    pick = raw_input('invalid input, try again ')
            return matchlist

    # No candidates, or none of them is foaf: make new uri
    author_uri = new_vcard(first_name, surname, full_name, g)
    return assign_authorship(author_uri, g, pub_uri, full_name, matchlist,
                             rank)
示例#14
0
def lines_processor(item):
    """Return the processed text for ``item``.

    A ``str`` is processed as-is; any other object is represented by its
    ``name`` and ``tags`` attributes joined with a space.
    """
    text = item if isinstance(item, str) else item.name + ' ' + item.tags
    return fuzzy_utils.full_process(text)
示例#15
0
def laaz_process(s):
    """Remove double quotes and colons from ``s`` and process the result.

    The cleaned string is passed to ``fuzzyutils.full_process`` with
    ``False`` as the second positional argument (presumably ``force_ascii``
    -- confirm against the helper's signature).
    """
    # A plain ``u`` literal is used because the ``ur`` prefix is a syntax
    # error on Python 3.  Inside a character class the quote needs no
    # escaping, so this matches exactly the same characters: '"' and ':'.
    unquoted = re.sub(u'[":]', "", s)
    return fuzzyutils.full_process(unquoted, False)
示例#16
0
 def testCaseInsensitive(self):
     """The raw fixtures must not score 100, but must score exactly 100
     once both have been through ``full_process``."""
     raw_score = fuzz.ratio(self.s1, self.s2)
     self.assertNotEqual(raw_score, 100)

     processed_score = fuzz.ratio(utils.full_process(self.s1),
                                  utils.full_process(self.s2))
     self.assertEqual(processed_score, 100)
示例#17
0
 def testCaseInsensitive(self):
     """The raw fixtures must not score 100, but must score exactly 100
     once both have been through ``full_process``."""
     unprocessed = fuzz.ratio(self.s1, self.s2)
     self.assertNotEqual(unprocessed, 100)

     left = utils.full_process(self.s1)
     right = utils.full_process(self.s2)
     self.assertEqual(fuzz.ratio(left, right), 100)
示例#18
0
 def test_fullProcessForceAscii(self):
     """Smoke test: ``full_process`` with ``force_ascii=True`` is called on
     every mixed string; the test passes as long as no call raises."""
     for sample in self.mixed_strings:
         utils.full_process(sample, force_ascii=True)
示例#19
0
 def test_fullProcess(self):
     """Smoke test: ``full_process`` is called on every mixed string; the
     test passes as long as none of the calls raises."""
     for text in self.mixed_strings:
         utils.full_process(text)
示例#20
0
 def test_fullProcessForceAscii(self):
     """Smoke test: ``full_process`` with ``force_ascii=True`` is called on
     every mixed string; the test passes as long as no call raises."""
     for text in self.mixed_strings:
         utils.full_process(text, force_ascii=True)
def laaz_process(s):
    """Remove double quotes and colons from ``s`` and process the result.

    The cleaned string is passed to ``fuzzyutils.full_process`` with
    ``False`` as the second positional argument (presumably ``force_ascii``
    -- confirm against the helper's signature).
    """
    # A plain ``u`` literal is used because the ``ur`` prefix is a syntax
    # error on Python 3.  Inside a character class the quote needs no
    # escaping, so this matches exactly the same characters: '"' and ':'.
    without_quotes = re.sub(u'[":]', '', s)
    return fuzzyutils.full_process(without_quotes, False)
示例#22
0
def laaz_process(s):
    """Remove double quotes and colons from ``s``, then hand the result to
    ``fuzzyutils.full_process`` with ``False`` as its second argument."""
    stripped_of_punctuation = re.sub(r'[\":]', '', s)
    return fuzzyutils.full_process(stripped_of_punctuation, False)