Exemplo n.º 1
0
def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):
    """Map each noisy address onto its closest entry in a clean list.

    Every entry of ``clean_list`` is indexed in a ``FuzzySet``. For each
    entry of ``fuzzy_list`` the best-scoring candidate whose text before the
    first ``'_'`` equals the query's is preferred (the original comment calls
    this "match state at least"). When no candidate shares that prefix, the
    overall best candidate is used instead.

    Args:
        fuzzy_list: Iterable of address strings to resolve. A
            ``pandas.Series`` is converted to a list first.
        clean_list: Iterable of reference address strings. A
            ``pandas.Series`` is de-duplicated and converted to a list.
        thresh: Unused; kept so existing callers keep working.

    Returns:
        A list with one item per entry of ``fuzzy_list``: the matched clean
        string, or ``None`` when the index returns no candidate at all.
    """
    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()

    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()

    # NOTE(review): the near-zero relative cutoff presumably keeps weak
    # candidates in the result list so the prefix filter has alternatives to
    # choose from -- confirm against the FuzzySet documentation.
    index = FuzzySet(rel_sim_cutoff=0.000001)
    for clean in clean_list:
        index.add(clean)

    out_list = []
    for query in fuzzy_list:
        # One lookup per query; the result is reused for the fallback.
        candidates = index.get(query)
        if not candidates:
            # Nothing matched: keep the output aligned with the input
            # instead of failing on `None[0]`.
            out_list.append(None)
            continue

        prefix = query.split('_')[0]
        same_prefix = [
            candidate for candidate in candidates
            if candidate[1].split('_')[0] == prefix
        ]
        # Candidates are (score, text) pairs; the first entry is taken as the
        # best one, as the original code did.
        best = same_prefix[0] if same_prefix else candidates[0]
        out_list.append(best[1])

    return out_list
Exemplo n.º 2
0
    def base_check(self, dict_list, word_list, lem=True):
        """
        Look up each word in a fuzzy index built from a dictionary

        Arguments
        ---------
        dict_list: list
            list containing pre-loaded language dictionary; used to
            build the FuzzySet
        word_list: list
            list containing the passwords to check
        lem: Boolean
            Select whether or not to use lemmatization,
            disable this for matching countries

        Yields
        ------
        word, score, lem_word: tuple
            the word after ``self.cleaner``, the raw result of the fuzzy
            lookup on its lower-cased form, and the lemma of the second
            element of the first result (``""`` when lemmatization is
            disabled or the lookup result raises TypeError when indexed)
        """
        matcher = FuzzySet(dict_list)
        lemmatizer = WordNetLemmatizer()
        for raw_word in word_list:
            cleaned = self.cleaner(raw_word)
            matches = matcher.get(cleaned.lower())
            base = ""
            if lem:
                try:
                    base = lemmatizer.lemmatize(matches[0][1])
                except TypeError:
                    # Lookup result was not indexable; keep the empty lemma.
                    pass
            yield (cleaned, matches, base)
Exemplo n.º 3
0
def run_profile():
    """Profile ``profiler(f)`` on a FuzzySet filled from ``cities.gz``.

    Every line of the gzip file one directory above ``here`` is added to the
    set (trailing whitespace removed). The profile is written to
    ``Profile.prof`` and then printed sorted by time.
    """
    # Must stay named `f`: the profiled statement "profiler(f)" resolves the
    # name through the locals() mapping passed to runctx.
    f = FuzzySet()
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as cities:
        for raw_line in cities:
            f.add(raw_line.rstrip())
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")

    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
    def _load(self, parsed_entities):
        entities = []
        for group in parsed_entities.keys():
            for element in parsed_entities[group].keys():
                fuzzy = FuzzySet()
                for x in [element] + parsed_entities[group][element]:
                    fuzzy.add(x)

                entity = {
                    "group": group,
                    "canonical": element,
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities
Exemplo n.º 5
0
def load_dictionary():
    """Load the Dutch word lists into a single ``FuzzySet``.

    Reads ``data/Dutch.dic``, keeping the text before the first ``/`` on each
    line, and ``data/dutch_vocabulary.txt``, one entry per line. The first
    line of each file is skipped.

    Returns:
        FuzzySet containing every word read from both files.
    """
    print("loading dutch dictionary...")
    opentaal_dict_file = "data/Dutch.dic"
    fasttext_vocab_file = "data/dutch_vocabulary.txt"
    words = FuzzySet()

    with open(opentaal_dict_file) as dict_lines:
        for counter, line in tqdm.tqdm(enumerate(dict_lines)):
            if counter == 0:
                continue
            # The add belongs inside the loop: placed after it, only the
            # file's last line would ever reach the set.
            words.add(line.split("/")[0].strip())

    with open(fasttext_vocab_file) as vocab_lines:
        for counter, line in tqdm.tqdm(enumerate(vocab_lines)):
            if counter == 0:
                continue
            words.add(line.strip())
    return words
Exemplo n.º 6
0
class QueryPage(BaseModel):
    # A page record: title, page ids, categories and alternative titles.
    # Input keys are "title", "pageid", "categories" and "redirects" (see the
    # aliases below). Field order matters: the "aliases" validator reads the
    # already-validated title from `values`.

    title: str
    pageid_: set[str] = Field(alias="pageid")
    categories: set[ValidCategory] = Field(alias="categories")
    aliases: set[str] = Field(alias="redirects", default_factory=set)

    # Fuzzy index over the title and the aliases; filled in by __init__.
    _fuzzy: FuzzySet = PrivateAttr(None)

    def __init__(self, **kwargs):
        """Validate the fields, then index the title and every alias."""
        super().__init__(**kwargs)
        names = [self.title]
        names.extend(self.aliases)
        self._fuzzy = FuzzySet(names)

    @validator("title", pre=True, allow_reuse=True)
    def strip_suffixes(cls, title: str):
        """Pass the raw title through ``strip_suffix_from_title``."""
        cleaned = strip_suffix_from_title(title)
        return cleaned

    @validator("pageid_", pre=True, allow_reuse=True)
    def ensure_pageid_set(cls, pageid_: Any):
        """Wrap a single id in a set; turn any other iterable into a set."""
        if isinstance(pageid_, set):
            return pageid_
        if isinstance(pageid_, (str, int)):
            return {pageid_}
        return set(pageid_)

    @validator("categories", pre=True, allow_reuse=True)
    def extract_category(cls, categories: dict[str, str]):
        """Collect the "title" entry of every item in ``categories``."""
        titles = set()
        for category in categories:
            titles.add(category["title"])
        return titles

    @validator("aliases", pre=True, allow_reuse=True)
    def extract_redirect(cls, redirects: dict[str], values):
        """Collect redirect titles, dropping those contained in the page title."""
        page_title = values["title"]
        kept = []
        for redirect in redirects:
            alias = redirect["title"]
            # NOTE(review): `not in` on a str is a substring test, so a
            # redirect that is merely part of the title is dropped as well --
            # confirm this is intended rather than an equality check.
            if alias not in page_title:
                kept.append(alias)
        return kept

    @property
    def pageid(self) -> str:
        """All page ids joined with "|" (in the set's iteration order)."""
        separator = "|"
        return separator.join(self.pageid_)

    def update(self, other: QueryPage) -> None:
        """Merge ids, categories and aliases of ``other`` into this page.

        Raises:
            KeyError: if the two pages have different titles.
        """
        if other.title != self.title:
            raise KeyError("Cannot merge two pages with different titles.")
        self.pageid_.update(other.pageid_)
        self.categories.update(other.categories)
        self.aliases.update(other.aliases)
        # Keep the fuzzy index in step with the merged alias set.
        for new_alias in other.aliases:
            self._fuzzy.add(new_alias)
    def _load(self, parsed_entities):
        entities = []
        for group in parsed_entities:
            group_name = group["name"]
            for group_element in group["subLists"]:
                fuzzy = FuzzySet()
                for x in [group_element["canonicalForm"]
                          ] + group_element["list"]:
                    fuzzy.add(x)

                entity = {
                    "group": group_name,
                    "canonical": group_element["canonicalForm"],
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities
 def _get_entity_groups(self, database_config: Dict[Text, Text],
                        database_queries: Dict[Text, Text]):
     """Fill ``self.ents`` with one ``FuzzySet`` per configured query.

     Args:
         database_config: Connection settings; the keys "host", "user",
             "password" and "database" are read.
         database_queries: Maps an entity key to the SQL statement whose
             single result column supplies that entity's values.

     Raises:
         SyntaxError: if a query returns rows that do not have exactly one
             column.
     """
     db = pymysql.connect(host=database_config["host"],
                          user=database_config["user"],
                          passwd=database_config["password"],
                          db=database_config["database"])
     try:
         cur = db.cursor()
         print(f"Queries are: {database_queries.keys()}")
         for entity_key, query in database_queries.items():
             cur.execute(query)
             current_entity = FuzzySet()
             for row in cur.fetchall():
                 if len(row) != 1:
                     raise SyntaxError(
                         f"{entity_key}: query returned more than one column!")
                 current_entity.add(row[0])
             self.ents[entity_key] = current_entity
     finally:
         # Release the connection even when a query fails or the column
         # check above raises.
         db.close()
Exemplo n.º 9
0
    def __init__(self, use_fuzzy_matching=True):
        """Prepare the GRID data and, optionally, a fuzzy index of its keys.

        Creates ``GRID_DATA_ROOT`` if it is missing, downloads the dataset
        when the CSV file is absent, and either builds and saves the lookup
        dict from the CSV or loads the previously saved one.

        Args:
            use_fuzzy_matching: When true, every key of the lookup dict is
                added to ``self.fuzzy_set``.

        Raises:
            Exception: if the dataset download reports failure.
        """
        self.country_lookup = CountryCodeLookup()

        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()

        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            # Plain loop: the adds are side effects, so no throwaway list.
            for name in self.data_dict:
                self.fuzzy_set.add(name)
Exemplo n.º 10
0
from cfuzzyset import cFuzzySet as FuzzySet
from os import listdir
from os.path import isfile, join

# read in player-position map
# Each line of player_positions.tsv is split on tabs: field 0 is used as the
# player name and field 1 as the position string.
player_to_position_map = {}
with open('./player_positions.tsv','r') as f:
    content = f.readlines()
    for player in content:
        # `player` is rebound from the raw line to its list of tab-separated fields
        player = player.rstrip().split('\t')
        name = player[0]
        position = player[1]
        # keys are lower-cased names; commas in the position string become '/'
        player_to_position_map[name.lower()] = position.replace(',','/')

# use fuzzy set to match player names
# (built from the lower-cased names collected above)
player_fuzzy_set = FuzzySet(player_to_position_map.keys())

# read through game-play data
seasons = ['2006-2007', '2007-2008', '2008-2009', '2009-2010']
for season in seasons:
    dir_path = './'+season+'.regular_season'
    game_files = [ f for f in listdir(dir_path) if isfile(join(dir_path,f)) ]
    season_makes_file = open(season+'.made_shots.csv','w')
    season_misses_file = open(season+'.missed_shots.csv','w')
    for game in game_files:
        with open(dir_path+"/"+game, 'r') as in_file:
            content = in_file.readlines()
            for play in content[1:]:
                play = play.rstrip().split(',')
                if len(play)<32:
                    continue
Exemplo n.º 11
0
def _interactive_test():
    """Prompt for town names forever and print the fuzzy lookup result.

    The index is built from the stripped lines of ``cities.gz`` located one
    directory above ``here``. Python 2 code (``raw_input``); the loop has no
    exit condition of its own.
    """
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as cities:
        city_index = FuzzySet(entry.strip() for entry in cities)
    while True:
        query = raw_input("Enter town name: ")
        print(city_index.get(query))
Exemplo n.º 12
0
class GridLookup:
    """Look up GRID institution records by exact or fuzzy name.

    The dataset is downloaded on first use into ``GRID_DATA_ROOT``, converted
    to a dict keyed by institution name, and cached there as a pickle file.
    """

    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        """Prepare the GRID data and, optionally, a fuzzy index of its keys.

        Args:
            use_fuzzy_matching: When true, every key of the lookup dict is
                added to ``self.fuzzy_set`` and ``get_institution`` falls
                back to fuzzy matching.

        Raises:
            Exception: if the dataset download reports failure.
        """
        self.country_lookup = CountryCodeLookup()

        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()

        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            # Plain loop: the adds are side effects, so no throwaway list.
            for name in self.data_dict:
                self.fuzzy_set.add(name)

    def __download_dataset(self):
        """Download and unpack the dataset zip; return True on success."""
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            # Any ordinary failure is reported as False; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            return False

    def __extract_zip(self, zip_file):
        """Extract every member of ``zip_file`` into ``GRID_DATA_ROOT``."""
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        """Read the CSV at ``path`` with pandas."""
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        """Turn the rows of ``data`` into a dict keyed by the ``Name`` column.

        Each value holds 'Name', 'Country' and 'Code'; 'Code' is 'undefined'
        when the country lookup returns None.
        """
        data_dict = dict()
        for _, row in data.iterrows():
            code = self.country_lookup.get_country_code(row.Country)
            data_dict[row.Name] = {
                'Name': row.Name,
                'Country': row.Country,
                'Code': code if code is not None else 'undefined'}  # TODO: Fix missing country codes (e.g. South Korea)
        return data_dict

    def __save_dict(self, grid_dict):
        """Pickle ``grid_dict`` to the cache file inside ``GRID_DATA_ROOT``."""
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)

    def __load_dict(self):
        """Load the cached dict from the file ``__save_dict`` writes."""
        # Use the same constant as __save_dict so both always agree on the
        # file name.
        # NOTE(review): pickle.load executes whatever the file contains; this
        # is only safe while GRID_DATA_ROOT is writable by trusted users.
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        """Return the best fuzzy match for ``name`` if it scores above 0.90."""
        result = self.fuzzy_set.get(name)

        if not result:
            return None

        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        """Return the record for ``name``, or None when nothing matches.

        An exact key lookup is tried first; the fuzzy index is consulted only
        when that fails and fuzzy matching is enabled.
        """
        if name is None:
            return None
        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        """Return the keys (institution names) of the lookup dict."""
        return self.data_dict.keys()
Exemplo n.º 13
0
                sys.exit(2)
        for opt,arg in opts:
                if opt == "-h":
                        print "homophonic.py -t <text> -l <method>"
                elif opt in ("-t","--text"):
                        text = arg
		elif opt in ("-l","--method"):
			method = arg
                else:
                        sys.exit(2)
	text = ap_encoding.read_file(text)
	if method == "translate":
		_start = time.time()
		results = get_translation(text)
	elif method == "phonics":
		results = get_phonics(text)
	_end = time.time()
	#print _end - _start
	print ' '.join([val for val in results])

# Module start-up: call init_sphinx(), then build the phone -> word table and
# the fuzzy index of phones from `pronunciations`.
init_sphinx()
words = {}
phones = FuzzySet()
for word, phone in pronunciations:
    # Plain dict assignment cannot raise KeyError, so no handler is needed.
    # NOTE(review): a later pronunciation with the same phone overwrites the
    # earlier word; if collecting all words per phone was intended, this
    # needs a real merge -- confirm.
    words[phone] = word
# Second pass kept separate so `pronunciations` is iterated twice, exactly as
# before.
for word, phone in pronunciations:
    phones.add(phone)

if __name__ == "__main__":
        main(sys.argv[1:])
Exemplo n.º 14
0
def _interactive_test():
    """Prompt for town names forever and print the fuzzy lookup result.

    The index is built from the stripped lines of ``cities.gz`` located one
    directory above ``here``. The loop has no exit condition of its own.
    """
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as cities:
        city_index = FuzzySet(entry.strip() for entry in cities)
    while True:
        query = input("Enter town name: ")
        print(city_index.get(query))
Exemplo n.º 15
0
 def __init__(self, **kwargs):
     """Run the parent initialiser, then index the title and every alias."""
     super().__init__(**kwargs)
     names = [self.title]
     names.extend(self.aliases)
     self._fuzzy = FuzzySet(names)
Exemplo n.º 16
0
#!/usr/bin/env python
# coding:utf-8
from cfuzzyset import cFuzzySet as FuzzySet
import re
# Module-level state, all created empty here.
# NOTE(review): neither check_name nor clean_name below touches these; what
# populates and reads them is not shown nearby -- confirm before relying on
# the meaning suggested by the names.
ceo_list = []
ceo_edu_dic = {}
a = FuzzySet()
count = 0


def check_name(name):
    """Return True when ``name`` looks like a usable multi-word name.

    A name is rejected when the raw string is five characters or shorter,
    when it has fewer than two whitespace-separated words (including none at
    all, e.g. a whitespace-only string), or when its first two words are
    identical.
    """
    words = name.split()
    if len(name) <= 5:
        return False
    # `< 2` also covers the empty token list, which would otherwise fail on
    # the index access below.
    if len(words) < 2:
        return False
    return words[0] != words[1]


def clean_name(name):

    todel = [i.start() for i in re.finditer('"', name)]
    if len(todel) == 2:
        myString = name[0:todel[0]] + name[todel[1] + 2:-1]
        #print name + ' --->  ' +  myString
        return myString