Example #1
import pandas as pd
from fuzzyset import FuzzySet


def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):
    # note: thresh is currently unused in the body below

    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()

    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()

    index = FuzzySet(rel_sim_cutoff=0.000001)

    for c in clean_list:
        index.add(c)

    out_list = []
    for f in fuzzy_list:
        try:
            first_word = f.split('_')[0]
            results = index.get(f)
            # keep only candidates whose first token matches (match state at least)
            results = [i for i in results if i[1].split('_')[0] == first_word]
            out_list.append(results[0][1])  # take the best-scoring match
        except Exception:
            # no same-state candidate: fall back to the overall best match
            results = index.get(f)
            out_list.append(results[0][1])

    return out_list
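A hypothetical call, assuming addresses are underscore-delimited with the state as the first token (which is what the first-word filter relies on); the sample strings are invented:

clean = pd.Series(["NY_New_York_123_Main_St", "NJ_Newark_45_Oak_Ave"])
fuzzy = pd.Series(["NY_New_Yrok_123_Main_St"])
# expected: ['NY_New_York_123_Main_St']
print(fuzzy_address_matcher(fuzzy, clean))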
Example #2
import cProfile
import gzip
import os
import pstats

from fuzzyset import FuzzySet

# `here` and `profiler` are assumed to be defined elsewhere in this module.


def run_profile():
    f = FuzzySet()
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        for line in input_file:
            # GzipFile yields bytes; decode so FuzzySet receives str
            f.add(line.rstrip().decode('utf-8'))
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")

    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
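run_profile expects a profiler callable to exist; a minimal sketch of the kind of function cProfile.runctx would invoke (query strings invented):

def profiler(f):
    # exercise the index with a few lookups so the profile has work to measure
    for query in ("san fransico", "nwe york", "los angelos"):
        f.get(query)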
Example #3

    def _load(self, parsed_entities):
        entities = []
        for group, elements in parsed_entities.items():
            for element, synonyms in elements.items():
                # index the canonical form together with all of its synonyms
                fuzzy = FuzzySet()
                for x in [element] + synonyms:
                    fuzzy.add(x)

                entity = {
                    "group": group,
                    "canonical": element,
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities
Example #4
import tqdm

from fuzzyset import FuzzySet


def load_dictionary():
    print("loading dutch dictionary...")
    opentaal_dict_file = "data/Dutch.dic"
    fasttext_vocab_file = "data/dutch_vocabulary.txt"
    words = FuzzySet()

    for counter, line in tqdm.tqdm(enumerate(open(opentaal_dict_file))):
        if counter == 0:  # the first line of a .dic file is the word count
            continue
        words.add(line.split("/")[0].strip())  # drop affix flags after "/"

    for counter, line in tqdm.tqdm(enumerate(open(fasttext_vocab_file))):
        if counter == 0:
            continue
        words.add(line.strip())
    return words
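A minimal usage sketch, assuming the two data files exist; FuzzySet.get returns a best-first list of (score, candidate) pairs, or None when nothing clears the cutoff:

words = load_dictionary()
suggestions = words.get("fietts")  # hypothetical misspelling of "fiets"
if suggestions:
    score, best = suggestions[0]
    print(f"did you mean '{best}'? (score {score:.2f})")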
Example #5
from __future__ import annotations

from typing import Any

from fuzzyset import FuzzySet
from pydantic import BaseModel, Field, PrivateAttr, validator

# ValidCategory and strip_suffix_from_title are assumed to be defined
# elsewhere in this project.


class QueryPage(BaseModel):

    title: str
    pageid_: set[str] = Field(alias="pageid")
    categories: set[ValidCategory] = Field(alias="categories")
    aliases: set[str] = Field(alias="redirects", default_factory=set)

    _fuzzy: FuzzySet = PrivateAttr(None)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._fuzzy = FuzzySet([self.title, *self.aliases])

    @validator("title", pre=True, allow_reuse=True)
    def strip_suffixes(cls, title: str):
        return strip_suffix_from_title(title)

    @validator("pageid_", pre=True, allow_reuse=True)
    def ensure_pageid_set(cls, pageid_: Any):
        if isinstance(pageid_, set):
            return pageid_
        elif isinstance(pageid_, (str, int)):
            return {pageid_}
        else:
            return set(pageid_)

    @validator("categories", pre=True, allow_reuse=True)
    def extract_category(cls, categories: list[dict[str, str]]):
        return {category["title"] for category in categories}

    @validator("aliases", pre=True, allow_reuse=True)
    def extract_redirect(cls, redirects: list[dict[str, str]], values):
        title = values["title"]
        return [redirect["title"] for redirect in redirects if redirect["title"] not in title]

    @property
    def pageid(self) -> str:
        return "|".join(self.pageid_)

    def update(self, other: QueryPage) -> None:
        if self.title != other.title:
            raise KeyError("Cannot merge two pages with different titles.")
        self.pageid_.update(other.pageid_)
        self.categories.update(other.categories)
        self.aliases.update(other.aliases)
        for alias in other.aliases:
            self._fuzzy.add(alias)
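A hypothetical round trip, assuming ValidCategory accepts the category title used here; field names follow the pydantic aliases declared above:

page = QueryPage(
    title="Python (programming language)",
    pageid="23862",
    categories=[{"title": "Category:Programming languages"}],
    redirects=[{"title": "Python language"}],
)
print(page.pageid)                        # "23862"
print(page._fuzzy.get("python langage"))  # fuzzy lookup over title + aliases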
Example #6

    def _load(self, parsed_entities):
        entities = []
        for group in parsed_entities:
            group_name = group["name"]
            for group_element in group["subLists"]:
                fuzzy = FuzzySet()
                # index the canonical form together with every listed variant
                for x in [group_element["canonicalForm"]] + group_element["list"]:
                    fuzzy.add(x)

                entity = {
                    "group": group_name,
                    "canonical": group_element["canonicalForm"],
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities
Example #7

    # requires at module level: import pymysql; from typing import Dict, Text
    def _get_entity_groups(self, database_config: Dict[Text, Text],
                           database_queries: Dict[Text, Text]):
        db = pymysql.connect(host=database_config["host"],
                             user=database_config["user"],
                             passwd=database_config["password"],
                             db=database_config["database"])
        cur = db.cursor()
        print(f"Queries are: {database_queries.keys()}")
        for entity_key, query in database_queries.items():
            cur.execute(query)
            current_entity = FuzzySet()
            for row in cur.fetchall():
                if len(row) != 1:
                    raise ValueError(
                        f"{entity_key}: query must return exactly one column!")
                current_entity.add(row[0])
            self.ents[entity_key] = current_entity
        db.close()
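A hypothetical configuration showing the shapes this method expects (host, credentials, and SQL are invented); each query must select exactly one column:

database_config = {"host": "localhost", "user": "bot",
                   "password": "secret", "database": "entities"}
database_queries = {"city": "SELECT name FROM cities",
                    "airline": "SELECT name FROM airlines"}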
Example #8
import pickle
from os import mkdir, remove
from os.path import dirname, isdir, isfile, join, realpath
from zipfile import ZipFile

import pandas as pd
from fuzzyset import FuzzySet

# CountryCodeLookup and download_file are assumed to be defined elsewhere
# in this project.

class GridLookup:
    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        self.country_lookup = CountryCodeLookup()

        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()

        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            for name in self.data_dict:
                self.fuzzy_set.add(name)


    def __download_dataset(self):
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            return False
        
    def __extract_zip(self, zip_file):
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        data_dict = dict()
        for _, row in data.iterrows():
            code = self.country_lookup.get_country_code(row.Country)
            data_dict[row.Name] = {
                'Name': row.Name, 
                'Country': row.Country, 
                'Code': code if code is not None else 'undefined'}  # TODO: fix missing country codes (e.g. South Korea)
        return data_dict

    def __save_dict(self, grid_dict):
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)


    def __load_dict(self):
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        result = self.fuzzy_set.get(name)

        if result is None or len(result) == 0: 
            return None

        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        if name is None: return None
        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        return self.data_dict.keys()
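A minimal usage sketch (the institution name is an invented typo; the first construction downloads and caches the GRID dataset if data/grid.csv is missing):

lookup = GridLookup(use_fuzzy_matching=True)
inst = lookup.get_institution("Universty of Oxford")  # hypothetical misspelling
if inst is not None:
    print(inst['Name'], inst['Country'], inst['Code'])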
Example #9

import getopt
import sys
import time

from fuzzyset import FuzzySet

# ap_encoding, get_translation, get_phonics, init_sphinx and pronunciations
# are assumed to come from the surrounding project; the opening of main()
# was truncated in the source, so a conventional getopt preamble is
# reconstructed here.


def main(argv):
    try:
        opts, args = getopt.getopt(argv, "ht:l:", ["text=", "method="])
    except getopt.GetoptError:
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print("homophonic.py -t <text> -l <method>")
        elif opt in ("-t", "--text"):
            text = arg
        elif opt in ("-l", "--method"):
            method = arg
        else:
            sys.exit(2)
    text = ap_encoding.read_file(text)
    if method == "translate":
        _start = time.time()
        results = get_translation(text)
    elif method == "phonics":
        results = get_phonics(text)
    _end = time.time()
    # print(_end - _start)
    print(' '.join(results))


init_sphinx()
words = {}
phones = FuzzySet()
for word, phone in pronunciations:
    if phone in words:
        words[phone] += word  # concatenate words that share a pronunciation
    else:
        words[phone] = word
    phones.add(phone)

if __name__ == "__main__":
    main(sys.argv[1:])
Example #10
# (tail of a clean_name helper; the opening of this snippet was truncated)
        if len(item) > 1:
            final_name = final_name + " " + item
    return final_name.strip()


# Import existing info. `a` is a FuzzySet and `ceo_edu_dic` a dict; both,
# like clean_name and check_name, come from the truncated part of this snippet.
infile = open('ceo_crawled_education.csv')
for n, line in enumerate(infile):
    if n == 0:  # skip the header row
        continue
    line = line.strip().split(',')
    name = clean_name(line[0].upper())
    if check_name(name):
        education = line[3]
        #ceo_list.append(name)
        a.add(name)
        ceo_edu_dic[name] = education

infile.close()

target_edu_dic = {}
# Read the target file and fuzzy-match each name against the FuzzySet
infile = open('1996-2006.csv')
outfile = open('1996-2006_edu_test.csv', 'w')
#infile = open('2007-2017.csv')
#outfile = open('2007-2017_edu_test.csv', 'w')
for n, orgline in enumerate(infile):
    if n == 0:
        continue
    #if n > 100:
    #    break