Example #1
import pandas as pd
from fuzzyset import FuzzySet


def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):

    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()

    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()

    index = FuzzySet(rel_sim_cutoff=0.000001)

    for c in clean_list:
        index.add(c)

    out_list = []
    for f in fuzzy_list:
        try:
            first_word = f.split('_')[0]
            results = index.get(f)
            # keep only candidates whose first token matches (match state at least)
            results = [i for i in results if i[1].split('_')[0] == first_word]
            out_list.append(results[0][1])  # take best result
        except Exception:
            # no same-state candidate (or no match at all): fall back to the best overall match
            results = index.get(f)
            out_list.append(results[0][1])

    return out_list
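A minimal usage sketch for the matcher, assuming the addresses are underscore-joined strings that start with the state code (the sample values below are hypothetical):

import pandas as pd

clean = pd.Series(['TX_Houston_Main_St', 'TX_Austin_1st_Ave', 'CA_Fresno_Oak_St'])
fuzzy = pd.Series(['TX_Huston_Main_St', 'CA_Fresno_Oak_Street'])

# each messy address should map to its closest clean counterpart
print(fuzzy_address_matcher(fuzzy, clean))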
Example #2
    def base_check(self, dict_list, word_list, lem=True):
        """
        Check the probability that each word is based on a dictionary word

        Arguments
        ---------
        dict_list: list
            list containing pre-loaded language dictionary
        word_list: list
            list containing the passwords to check
        lem: Boolean
            Select whether or not to use lemmatization,
            disable this for matching countries

        Returns
        -------
        (word, score, lem_word): tuple
            Generator yielding one tuple per password: the cleaned word,
            the FuzzySet match result, and the lemmatized base word
        """
        fuzz = FuzzySet(dict_list)
        lemm = WordNetLemmatizer()
        for word in word_list:
            word = self.cleaner(word)
            score = fuzz.get(word.lower())  # list of (score, match) tuples, or None
            if lem:
                try:
                    lem_word = lemm.lemmatize(score[0][1])
                except TypeError:
                    # score is None when nothing clears the cutoff
                    lem_word = ""
            else:
                lem_word = ""
            yield (word, score, lem_word)
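The method leans on the shape of FuzzySet.get(), which returns a best-first list of (score, match) tuples, or None when nothing clears the cutoff; a quick illustration (the scores shown are illustrative, not exact):

from fuzzyset import FuzzySet

fuzz = FuzzySet(['password', 'dragon', 'monkey'])
print(fuzz.get('passw0rd1'))   # e.g. [(0.78, 'password')]
print(fuzz.get('zzzzzz'))      # None -- no stored word shares any n-grams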
Example #3
# read through game-play data
seasons = ['2006-2007', '2007-2008', '2008-2009', '2009-2010']
for season in seasons:
    dir_path = './'+season+'.regular_season'
    game_files = [ f for f in listdir(dir_path) if isfile(join(dir_path,f)) ]
    season_makes_file = open(season+'.made_shots.csv','w')
    season_misses_file = open(season+'.missed_shots.csv','w')
    for game in game_files:
        with open(dir_path+"/"+game, 'r') as in_file:
            content = in_file.readlines()
            for play in content[1:]:
                play = play.rstrip().split(',')
                if len(play)<32:
                    continue
                play_type = play[13]
                if play_type == 'shot' and not (play[30]=='' or play[31]==''):
                    # no need to store points: if result == 'made' and
                    # shot_type == '3pt', the shot was worth 3 points
                    player = play[23]
                    result = play[27]
                    shot_type = play[29]
                    x_coord = int(play[30])
                    y_coord = int(play[31])
                    position = player_to_position_map[player_fuzzy_set.get(player.lower())[0][1]]
                    if result == 'made':
                        season_makes_file.write(",".join([player, position, shot_type, str(x_coord), str(y_coord)])+"\n")
                    else:
                        season_misses_file.write(",".join([player, position, shot_type, str(x_coord), str(y_coord)])+"\n")
    season_makes_file.close()
    season_misses_file.close()
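The loop assumes a player_fuzzy_set and a player_to_position_map built earlier in the script; a minimal sketch of that setup (the roster entries below are hypothetical):

from fuzzyset import FuzzySet

# canonical player name -> position, e.g. loaded from a roster file
player_to_position_map = {
    'kobe bryant': 'SG',
    'tim duncan': 'PF',
}

# index the canonical names so slightly mangled play-by-play names still resolve
player_fuzzy_set = FuzzySet(player_to_position_map.keys())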
Example #4
# Python 2 variant (raw_input / print statement); Example #6 below is the Python 3 version
def _interactive_test():
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        f = FuzzySet((line.strip() for line in input_file))
    while True:
        town = raw_input("Enter town name: ")
        print f.get(town)
Example #5
class GridLookup:
    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        self.country_lookup = CountryCodeLookup()

        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()

        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            for name in self.data_dict:
                self.fuzzy_set.add(name)


    def __download_dataset(self):
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            return False
        
    def __extract_zip(self, zip_file):
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        data_dict = dict()
        for _, row in data.iterrows():
            code = self.country_lookup.get_country_code(row.Country)
            data_dict[row.Name] = {
                'Name': row.Name, 
                'Country': row.Country, 
                'Code': code if code is not None else 'undefined'} #TODO: Fix missing country codes (e.g. South Korea)
        return data_dict

    def __save_dict(self, grid_dict):
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)


    def __load_dict(self):
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        result = self.fuzzy_set.get(name)

        if result is None or len(result) == 0: 
            return None

        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        if name is None: return None
        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        return self.data_dict.keys()
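A minimal usage sketch, assuming the GRID CSV can be downloaded (or is already cached under data/) and that download_file and CountryCodeLookup are available in the surrounding package:

lookup = GridLookup(use_fuzzy_matching=True)

# exact dictionary lookup first, fuzzy fallback for slightly misspelled names
record = lookup.get_institution('University of Oxford')
if record is not None:
    print(record['Name'], record['Country'], record['Code'])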
Example #6
def _interactive_test():
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        f = FuzzySet((line.strip() for line in input_file))
    while True:
        town = input("Enter town name: ")
        print(f.get(town))
Example #7
#outfile = open('2007-2017_edu_test.csv', 'w')
for n, orgline in enumerate(infile):
    if n == 0:
        continue
    #if n > 100:
    #    break
    if n % 1000 == 0:
        print n
    line = orgline.strip().split(',')
    fullname = line[-1]
    #fullname = line[1]
    if fullname not in target_edu_dic:
        target_edu_dic[fullname] = 'None'
        result = None
        try:
            #result = process.extractOne(fullname, ceo_list, scorer=fuzz.token_set_ratio)
            result = a.get(clean_sin_charac(fullname))[0]
            #print fullname
            #print result
        except Exception:
            pass
        if result is None:
            continue
        if result[0] > 0.5:
            print fullname
            print result
            count += 1
            target_edu_dic[fullname] = ceo_edu_dic[result[1]]
    print >> outfile, orgline.strip() + ',' + target_edu_dic[fullname]

print "#########################"
print count