Example #1
def now(config):
    def _get_city_match(name):
        # Renamed cities map directly; everything else goes through the fuzzy index
        if name in replaces:
            return (1, replaces[name])
        match = cities.get(name)[0]
        return (match[0], match[1])

    # Get places ids
    df_places_id = get_places_id.now(config)
    df = get_googledrive_df(os.getenv("INLOCO_CITIES_ID"))

    time.sleep(2)

    # Get states closest matches
    states = fuzzyset.FuzzySet()
    for x in df_places_id["state_name"].unique():
        states.add(x)

    df["state_name"] = df["state_name"].apply(lambda x: states.get(x)[0][1])

    # Get cities closest matches by state+city name
    cities = fuzzyset.FuzzySet()

    df_places_id["state_city"] = df_places_id["state_name"] + df_places_id["city_name"]
    for x in df_places_id["state_city"].drop_duplicates():
        cities.add(x)

    # Cities with changed names
    replaces = {
        v["state_name"] + name: v["state_name"] + v["correct_name"]
        for name, v in config["br"]["inloco"]["replace"].items()
    }

    df["state_city"] = df["state_name"] + df["city_name"]
    df["state_city_match"], df["state_city"] = zip(
        *df["state_city"].apply(lambda x: _get_city_match(x)))

    # Merge to get places ids
    del df["state_name"], df["city_name"]
    df = df.merge(
        df_places_id[[
            "state_city",
            "state_num_id",
            "state_name",
            "health_region_name",
            "health_region_id",
            "city_name",
            "city_id",
        ]].drop_duplicates(),
        on=["state_city"],
        how="left",
    )

    del df["state_city"]
    return df
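
The `zip(*series.apply(...))` move above is the idiom this example leans on to split `(score, name)` pairs into two aligned columns; a minimal self-contained sketch of the same move with toy data (the frame and values are illustrative, not the project's):

import pandas as pd

toy = pd.DataFrame({"state_city": ["SPSao Paulo", "RJRio de Janeiro"]})
# Stand-in for _get_city_match: pretend every name matches itself with score 1.0
pairs = toy["state_city"].apply(lambda name: (1.0, name))
toy["state_city_match"], toy["state_city"] = zip(*pairs)
print(toy)
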
Example #2
 def __init__(self):
     super(ResultsProvider, self).__init__()
     self.cumulative = {}
     self.track_percentiles = [0.0, 50.0, 90.0, 95.0, 99.0, 99.9, 100.0]
     self.listeners = []
     self.buffer_len = 2
     self.min_buffer_len = 2
     self.max_buffer_len = float('inf')
     self.buffer_multiplier = 2
     self.buffer_scale_idx = None
     self.histogram_max = 1.0
     self.known_errors = fuzzyset.FuzzySet(use_levenshtein=True)
     self.max_error_count = 100
     self.known_labels = fuzzyset.FuzzySet(use_levenshtein=True)
     self.generalize_labels = 100
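
`known_errors` and `known_labels` here are fuzzy indexes used to collapse near-duplicate strings. A hypothetical illustration of that pattern (the `canonical` helper and the 0.9 cutoff are mine, not this class's API):

import fuzzyset

labels = fuzzyset.FuzzySet(use_levenshtein=True)

def canonical(label, threshold=0.9):
    # Reuse a previously seen spelling when it is close enough, else register the new one
    match = labels.get(label)
    if match is not None and match[0][0] >= threshold:
        return match[0][1]
    labels.add(label)
    return label
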
Example #3
 def __init__(self, people, text_detection, ocr, pixel_threshold=0.5, link_threshold=0.5):
     self.prev_badge = ''
     self.prev_time = 0
     self.pixel_threshold = pixel_threshold
     self.link_threshold = link_threshold
     self.text_detection = text_detection
     self.ocr = ocr
     charset, _ = read_charset()
     self.chrset_index = charset
     self.names_db = fuzzyset.FuzzySet()
     self.data_db = {}
     for p in people:
         tokens = p.split(' ')
         for t in tokens:
             if len(t) > 1:
                 self.names_db.add(t)
         perm = permutations(tokens)
         for v in list(perm):
             v1 = ' '.join(v)
             v2 = ''.join(v)
             self.names_db.add(v1)
             self.names_db.add(v2)
             self.data_db[v1] = p
             self.data_db[v2] = p
     self.queue = queue.Queue(maxsize=1)
     self.worker = threading.Thread(target=self.run)
     self.worker.start()
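
Indexing every permutation of a name, as the loop above does, grows factorially with the token count (a five-token name already yields 120 variants). A hedged guard against that blow-up (the `max_tokens` cap is an assumption of mine):

from itertools import permutations

def index_name(names_db, data_db, person, max_tokens=4):
    # Cap the permutation blow-up: 4 tokens -> at most 24 variants
    tokens = person.split(' ')[:max_tokens]
    for variant in permutations(tokens):
        joined = ' '.join(variant)
        names_db.add(joined)
        data_db[joined] = person
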
Example #4
def build_structures():
    opts = (
        ('a', 'FuzzySet', fuzzyset.FuzzySet()),
        ('b', 'FuzzySet (no leven)', fuzzyset.FuzzySet(use_levenshtein=False)),
        ('c', 'cFuzzySet', cfuzzyset.cFuzzySet()),
        ('d', 'cFuzzySet (no leven)', cfuzzyset.cFuzzySet(use_levenshtein=False)),
    )
    ref = {}
    input_file = gzip.GzipFile(os.path.join(here, '..', 'cities.gz'))
    for line in input_file:
        line = line.rstrip()
        for _, _, structure in opts:
            structure.add(line)
        ref[line] = line

    return opts + (('ref', 'reference (dict)', ref), )
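
A sketch of how the structures returned above might be timed against each other; `timeit`, the query string, and the iteration count are my choices, not part of the original benchmark:

import timeit

def benchmark(structures, query='san francisco'):
    # Both FuzzySet variants and the reference dict expose .get()
    for _, label, structure in structures:
        elapsed = timeit.timeit(lambda: structure.get(query), number=100)
        print('%s: %.4f s for 100 lookups' % (label, elapsed))

benchmark(build_structures())
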
Example #5
def find_food(food):
    conn = sqlite3.connect('usda.sql3')
    search_clause = '%' + food + '%'
    c = conn.cursor()
    c.execute('SELECT id, long_desc FROM food WHERE long_desc LIKE ?',
              (search_clause, ))
    strmatch = fuzzyset.FuzzySet()
    strmatch.add(food)

    best_score = -1
    best_food = ''
    best_id = -1

    for row in c:
        food_id = row[0]
        food_name = row[1].lower().split(',')[0]
        match = strmatch.get(food_name)
        if match is None:
            continue
        score = match[0][0]

        if score > best_score and food_name.startswith(food):
            best_score = score
            best_food = row[1]
            best_id = food_id
        # if food_name.startswith(food):
        # print(row[1])

    print(str(best_id) + " " + best_food)
    return (best_id, best_food)
Example #6
    def sm_fuzzy_match(street, ed):

        # Coerce numeric EDs to their string form
        if not isinstance(ed, str):
            ed = str(int(ed))

        #Return null if street is blank
        if street == '':
            return ['', '', False]

        #Microdata ED may not be in Steve Morse, if so then add it to problem ED list and return null
        try:
            sm_ed_streets = sm_ed_st_dict[ed]
            sm_ed_streets_fuzzyset = fuzzyset.FuzzySet(sm_ed_streets)
        except:
            problem_EDs.append(ed)
            return ['', '', False]

        #Step 1: Find best match among streets associated with microdata ED
        try:
            best_match_ed = sm_ed_streets_fuzzyset[street][0]
        except:
            return ['', '', False]

        #Step 2: Find best match among all streets
        try:
            best_match_all = sm_all_streets_fuzzyset[street][0]
        except:
            return ['', '', False]

        #Step 3: If both best matches are the same, return as best match
        if (best_match_ed[1] == best_match_all[1]) & (best_match_ed[0] >= 0.5):
            return [best_match_ed[1], best_match_ed[0], True]
        else:
            return ['', '', False]
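
The try/except wrappers in this example (and in #11, #19, #26 and #27 below) rely on a library contract worth making explicit: `.get()` returns `None` on a miss, while indexing raises (a `KeyError` in the reference implementation). A minimal check:

import fuzzyset

fs = fuzzyset.FuzzySet(['main st'])
print(fs.get('qqqq'))           # None: .get degrades gracefully on a miss
try:
    best = fs['qqqq'][0]        # a miss surfaces as an exception, hence the try/except above
except (KeyError, IndexError):
    best = ['', '', False]
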
Example #7
 def __init__(self, documents):
     self.documents = np.array(documents)
     if documents:
         self.tfidf = TfidfVectorizer(tokenizer=self.tokenize_document,
                                      stop_words=stop_word_set)
         self.tfs = self.tfidf.fit_transform(self.documents)
         self.fuzzyset = fuzzyset.FuzzySet(self.tfidf.get_feature_names(),
                                           use_levenshtein=False)
Example #8
def compare(guess, answer):
    a = fuzzyset.FuzzySet()
    a.add(answer)
    metric = a.get(guess)
    tally = 0
    if metric:
        for item in metric:
            tally += item[0]
        average = tally / len(metric)
        if average > 0.3:
            return True
    return False
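
A quick usage sketch for `compare`; the exact scores depend on fuzzyset's gram sizes and Levenshtein pass, so the comments are indicative rather than guaranteed:

print(compare("colour", "color"))       # True: a one-letter variant scores well above 0.3
print(compare("zebra", "spreadsheet"))  # False: no shared grams, so get() returns None
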
Example #9
def getid(msgs):
    with open('data.json') as f:
        data = json.load(f)
    a = fuzzyset.FuzzySet()
    a.add(str(msgs))
    best_score = 0
    id = 0
    # Track the entry whose Name best matches the message
    for dt in data:
        val = a.get(str(dt['Name']))
        if val is not None and val[0][0] > best_score:
            best_score = val[0][0]
            id = dt['ID']
    return str(id)
Example #10
def transformed_mean_score(transformed_set, target_set):
    score = 0
    for index, row in transformed_set.iterrows():
        fs = fuzzyset.FuzzySet()
        fs.add(row[0])
        fuzzyval = fs.get(target_set.iloc[index, 0])
        if fuzzyval is None:
            continue
        curr_score = fuzzyval[0][0]
        #curr_score = fs.get(target_set.iloc[index, 0])[0][0]
        # print("curr_score = ")
        # print(curr_score)
        score += curr_score
        del fs
    return score / transformed_set.shape[0]
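
Examples #8 and #10 (and #16 below) all rebuild a one-element `FuzzySet` just to score a single pair of strings; the same pattern distilled into a small helper, as a sketch:

import fuzzyset

def pair_score(a, b):
    """Similarity of b against a, as fuzzyset reports it; 0.0 when nothing matches."""
    fs = fuzzyset.FuzzySet()
    fs.add(a)
    match = fs.get(b)
    return match[0][0] if match is not None else 0.0
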
Example #11
		def fuzzy_match_function(street, ed, ed_st_dict, all_streets_fuzzyset, check_too_similar=False):

			nomatch = ['', '', False]
			ed = str(ed)

			#Return null if street is blank
			if street == '':
				return nomatch
			#Microdata ED may not be in Steve Morse, if so then add it to problem ED list and return null
			try:
				ed_streets = ed_st_dict[ed]
				ed_streets_fuzzyset = fuzzyset.FuzzySet(ed_streets)
			except:
			#	print("Problem ED:" + str(ed))
				return nomatch

			#Step 1: Find best match among streets associated with microdata ED
			try:
				best_match_ed = ed_streets_fuzzyset[street][0]
			except:
				return nomatch

			#Step 2: Find best match among all streets
			try:
				best_match_all = all_streets_fuzzyset[street][0]
			except:
				return nomatch    
			#Step 3: If both best matches are the same, return as best match

			if (best_match_ed[1] == best_match_all[1]) & (best_match_ed[0] >= 0.5):
				#Check how many other streets in ED differ by one character
				if check_too_similar:
					too_similar = sum([diff_by_one_char(st, best_match_ed[1]) for st in ed_streets])
					if too_similar == 0:
						return [best_match_ed[1], best_match_ed[0], True]
					else:
						return nomatch
				else: 
					return [best_match_ed[1], best_match_ed[0], True]
			#Step 4: If both are not the same, return one with the higher score (to help manual cleaning)
			else:
				if best_match_all[0] < best_match_ed[0]:
					return [best_match_ed[1], best_match_ed[0], False]
				else:
					return [best_match_all[1], best_match_all[0], False]
Example #12
def getDayTimeAlt(AT, show):
	#Get broadcast time of show from MAL
	mal_watchlist = mal.User.getAnimeList(AT, "watching", ['broadcast', 'alternative_titles'])[0]["data"]

	fset = fuzzyset.FuzzySet()
	fset.add(show)
	max_prob, max_prob_idx = 0, 0

	for idx, item in enumerate(mal_watchlist):
		result = fset.get(item['node']['title'])
		current_prob = result[0][0] if result is not None else 0
		if current_prob > max_prob:
			max_prob = current_prob
			max_prob_idx = idx

	if max_prob >= 0.6:
		day = dayMapping[mal_watchlist[max_prob_idx]['node']['broadcast']['day_of_the_week']]
		time = int(mal_watchlist[max_prob_idx]['node']['broadcast']['start_time'].replace(":", ""))

		#Converting day from JST to IST
		if(time <= 330):
			day = (day + 6) % 7

		hr = int(time / 100)
		mi = time % 100

		if mi < 30:
			hr = (hr + 20) % 24
		else:
			hr = (hr + 21) % 24

		mi = (mi + 30) % 60
		time = hr * 100 + mi

		#Get list of alternative titles
		alt_names = mal_watchlist[max_prob_idx]['node']['alternative_titles']['synonyms']
		alt_names.append(mal_watchlist[max_prob_idx]['node']['title'])
		return day, time, alt_names

	else:
		print("\033[91m[-] Anime not found in watchlist! Ignoring.\033[0m")
		return None, None, None
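
The time arithmetic above shifts a JST broadcast slot back 3 hours 30 minutes to IST, rolling the weekday when the slot is at or before 03:30. The same logic pulled out into a standalone function for a sanity check (my rewrite, not the project's helper):

def jst_to_ist(day, time):
    # Times at or before 03:30 JST land on the previous IST weekday
    if time <= 330:
        day = (day + 6) % 7
    hr, mi = divmod(time, 100)
    hr = (hr + 20) % 24 if mi < 30 else (hr + 21) % 24
    mi = (mi + 30) % 60
    return day, hr * 100 + mi

assert jst_to_ist(1, 100) == (0, 2130)   # 01:00 JST -> 21:30 IST on the previous weekday
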
Example #13
def predicttypesusingcolumnnames():
    types = [row.split('.')[1] for row in data]
    list_to_match = [
        "Person name", "Last name", "First name", "Middle name", "Full name",
        "Business name", "Phone Number", "Address", "Street name", "City",
        "Neighborhood", "Latitude Longitude", "Zip", "Borough", "School name",
        "Vehicle Color", "Vehicle Car make", "City agency", "Areas of study",
        "Subjects", "School Levels", "College/University names", "Websites",
        "Building Classification", "Vehicle Type", "Type of location", "dba"
    ]

    fz = fuzzyset.FuzzySet()
    for l in list_to_match:
        fz.add(l.lower())
    count = 0
    for row in types:
        actualdatasetname = data[count]
        lp = fz.get(process(row))
        count = count + 1
        # fz.get returns None when nothing matches; guard before indexing
        if lp is not None:
            predictions.append(lp[0][1].lower())
Example #14
def match_name_list(results_dedupe, df):
    """ Match name list per issue to LOC name list"""
    name_list_all = []
    name_list_highmatch = []
    fz = fuzzyset.FuzzySet()
    terms = df['name'].tolist()
    #Create a list of terms we would like to match against in a fuzzy way
    for l in terms:
        fz.add(l)
    #Now see if our sample term fuzzy matches any of those specified terms
    for name in results_dedupe:
        sample_term = name
        #matches is a list of tuples (prob, name)
        matches = fz.get(sample_term)
        if matches:
            max_match = max(matches, key=lambda x: x[0])
        else:
            max_match = None
        if max_match:
            match_df = df[df['name'].str.match(max_match[1])]
            if len(match_df) >= 1:
                match_df = return_likely_year_match_df(match_df)
                #print(len(match_df))
                name_list_all.append([
                    name, match_df.iloc[0]['name heading'],
                    match_df.iloc[0]['URI'], max_match[0]
                ])
                #select = [each for each in matches if each[0]>0.8]
                #print(select)
                if max_match[0] > 0.85:
                    #if select :
                    name_list_highmatch.append([
                        name, match_df.iloc[0]['name heading'],
                        match_df.iloc[0]['URI'], max_match[0]
                    ])
            else:
                name_list_all.append([name, '', '', ''])
        else:
            name_list_all.append([name, '', '', ''])
    return name_list_all, name_list_highmatch
Example #15
def setCorrectWatchlist(season):
    config = readConfig()
    watchlist = config['watchlist']

    #Add shows from current season to fuzzyset list
    fset = fuzzyset.FuzzySet()
    for show in season:
        fset.add(show)

    #Generating correct watchlist
    fset_watchlist = {}
    for show in watchlist.keys():
        # fset.get returns None for shows with no close match; guard before indexing
        match = fset.get(show)
        if match is not None and match[0][0] >= 0.48:
            fset_watchlist[match[0][1]] = watchlist[show]
        else:
            print(
                "\033[91m[-] {} does not seem to be airing this season! Ignoring..\033[0m"
                .format(show))

    config['watchlist'] = fset_watchlist
    with open('data/config.json', 'w') as f:
        json.dump(config, f, indent=4)
Example #16
def fuzzmatch(source_file, target_file):

    source_key = [column for column in source_file][0]
    target_key = [column for column in target_file][0]
    source_new_df = pd.DataFrame(columns=[column for column in source_file])
    target_new_df = pd.DataFrame(columns=[column for column in target_file])
    # mapping={}
    fuzzy_threshold = 0.125

    for i, row_source in source_file.iterrows():
        fuzzymatch = fuzzyset.FuzzySet()
        fuzzymatch.add(row_source[source_key])
        for j, row_target in target_file.iterrows():
            fuzzyval = fuzzymatch.get(row_target[target_key])
            #print(row_source[col1]) row_target[col1]
            if fuzzyval is None:
                continue
            elif fuzzyval[0][0] > fuzzy_threshold:
                #                print(fuzzyval)
                #add these rows to new dataframes
                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
                source_new_df = pd.concat([source_new_df, row_source.to_frame().T])
                target_new_df = pd.concat([target_new_df, row_target.to_frame().T])
        del fuzzymatch
    return source_new_df, target_new_df
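
The nested `iterrows` loops above build a fresh one-element `FuzzySet` for every source row, costing a full pass over the target frame per row. A hedged alternative that indexes all target keys once (note the semantics shift: it keeps source rows whose best target match clears the threshold, rather than accumulating matched pairs):

import fuzzyset

def fuzzmatch_indexed(source_file, target_file, fuzzy_threshold=0.125):
    source_key = source_file.columns[0]
    target_key = target_file.columns[0]
    # One fuzzy index over every target key, instead of one set per source row
    target_set = fuzzyset.FuzzySet(target_file[target_key].astype(str))
    keep = source_file[source_key].astype(str).apply(
        lambda v: (target_set.get(v) or [(0.0, '')])[0][0] > fuzzy_threshold)
    return source_file[keep]
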
Example #17
    def get_term_vectors_for_articles_fuzzy(self,
                                            tokens,
                                            sim_threshold=0.8,
                                            gram_size=6,
                                            max_len_diff=5,
                                            use_levenshtein=True,
                                            composite_size=2):
        c = self.conn.cursor()
        if self.fuzzyset is None:
            fs_path = pl.Path(self.db_path + "_article_fs.pickle")
            if self.load_cached_fs and fs_path.exists():
                print("Loading Fuzzy Set from disk")
                with fs_path.open("rb") as fs_file:
                    f_s = pickle.load(fs_file)
                    self.fuzzyset = f_s
            else:
                print("Creating Fuzzy Set")
                all_articles_query = """SELECT articles.article
                                        FROM articles
                                        """
                c.execute(all_articles_query)
                f_s = fs.FuzzySet(gram_size_lower=gram_size,
                                  gram_size_upper=gram_size,
                                  use_levenshtein=False)
                i = 0
                for article, in c:
                    f_s.add(article)
                    i += 1
                    # if i % 10000 == 00:
                    #     print("Articles processed: {}".format(i))
                self.fuzzyset = f_s
                with fs_path.open("wb") as fs_file:
                    pickle.dump(f_s, fs_file)
                print("Finished creating Fuzzy Set")
        self.fuzzyset.use_levenshtein = use_levenshtein
        vectors = defaultdict(dict)
        token_article_mapping = {}
        for i in range(composite_size, 0, -1):
            i_length_tokens = [
                " ".join(tokens[i2:i2 + i])
                for i2 in range(0, (len(tokens) + 1 - i))
            ]
            matched_articles = []
            i_m = 1
            for i_t in i_length_tokens:
                i_m += 1
                # print("Processed Token: {}".format(i_m))
                match = self.fuzzyset.get(i_t)
                if match:
                    sim, word = match[0]
                    len_dif = abs(len(word) - len(i_t))
                    word_len = len(word) if len(word) <= 15 else 15
                    length_adjusted_threshold = sim_threshold + 0.15 * (
                        word_len / 15)
                    condition = sim >= length_adjusted_threshold and len_dif <= max_len_diff
                    if condition:
                        token_article_mapping[i_t] = word
                        matched_articles.append(word)

            for token_batch in batch(matched_articles, self.batchsize):
                param_placeholders = ", ".join(
                    ["?" for _ in range(len(token_batch))])
                statement = terms_for_articles_statement.format(
                    param_placeholders)
                c.execute(statement, token_batch)
                for term, article, tf_idf in c:
                    vectors[article][term] = tf_idf

        return vectors, token_article_mapping
Example #18
    'q',
    'r',
    's',
    't',
    'u',
    'v',
    'w',
    'x',
    'y',
    'z',
    "'",
    " ",
    '_'
]

names_db = fuzzyset.FuzzySet()
data_db = {}


def read_charset():
    charset = {}
    inv_charset = {}
    for i, v in enumerate(ENGLISH_CHAR_MAP):
        charset[i] = v
        inv_charset[v] = i

    return charset, inv_charset


chrset_index = {}
Example #19
	def find_fuzzy_matches(df, city, street, all_streets, ed_st_dict, source):

		#Fuzzy matching algorithm
		def fuzzy_match_function(street, ed, ed_st_dict, all_streets_fuzzyset, check_too_similar=False):

			nomatch = ['', '', False]
			ed = str(ed)

			#Return null if street is blank
			if street == '':
				return nomatch
			#Microdata ED may not be in Steve Morse, if so then add it to problem ED list and return null
			try:
				ed_streets = ed_st_dict[ed]
				ed_streets_fuzzyset = fuzzyset.FuzzySet(ed_streets)
			except:
			#	print("Problem ED:" + str(ed))
				return nomatch

			#Step 1: Find best match among streets associated with microdata ED
			try:
				best_match_ed = ed_streets_fuzzyset[street][0]
			except:
				return nomatch

			#Step 2: Find best match among all streets
			try:
				best_match_all = all_streets_fuzzyset[street][0]
			except:
				return nomatch    
			#Step 3: If both best matches are the same, return as best match

			if (best_match_ed[1] == best_match_all[1]) & (best_match_ed[0] >= 0.5):
				#Check how many other streets in ED differ by one character
				if check_too_similar:
					too_similar = sum([diff_by_one_char(st, best_match_ed[1]) for st in ed_streets])
					if too_similar == 0:
						return [best_match_ed[1], best_match_ed[0], True]
					else:
						return nomatch
				else: 
					return [best_match_ed[1], best_match_ed[0], True]
			#Step 4: If both are not the same, return one with the higher score (to help manual cleaning)
			else:
				if best_match_all[0] < best_match_ed[0]:
					return [best_match_ed[1], best_match_ed[0], False]
				else:
					return [best_match_all[1], best_match_all[0], False]

		#Helper function (necessary since dictionary built only for cases without validated exact matches)
		def get_fuzzy_match(exact_match, fuzzy_match_dict, street, ed):
			#Only look at cases without validated exact match
			if not (exact_match):
				#Need to make sure "Unnamed" street doesn't get fuzzy matched
				if 'Unnamed' in street:
					return ['', '', False]
				#Get fuzzy match    
				else:
					return fuzzy_match_dict[street, ed]
			#Return null if exact validated match
			else:
				return ['', '', False]

		#Set var names
		fuzzy_match = 'fuzzy_match_'+source 
		fuzzy_bool = 'fuzzy_match_bool_'+source
		fuzzy_score = 'fuzzy_match_score_'+source

		#Create all street fuzzyset only once
		all_streets_fuzzyset = fuzzyset.FuzzySet(all_streets)

		#Create dictionary based on Street-ED pairs for faster lookup using helper function
		df_no_exact_match = df[~df['current_match_bool']]
		df_grouped = df_no_exact_match.groupby([street, 'ed'])
		fuzzy_match_dict = {}
		for st_ed, _ in df_grouped:
			fuzzy_match_dict[st_ed] = fuzzy_match_function(st_ed[0], st_ed[1], ed_st_dict, all_streets_fuzzyset)

		#Compute current number of residuals
		num_records = len(df)
		num_current_residual_cases = num_records - len(df[df['current_match_bool']])
		#Get fuzzy matches 
		df[fuzzy_match], df[fuzzy_score], df[fuzzy_bool] = zip(*df.apply(lambda x: get_fuzzy_match(x['current_match_bool'], fuzzy_match_dict, x[street], x['ed']), axis=1))
		#Update current match 
		df['current_match'], df['current_match_bool'] = zip(*df.apply(lambda x: update_current_match(x['current_match'], x['current_match_bool'], x[fuzzy_match], x[fuzzy_bool], x[street]),axis=1))

		#Generate dashboard information
		num_fuzzy_matches = np.sum(df[fuzzy_bool])
		prop_fuzzy_matches = float(num_fuzzy_matches)/num_records
		fuzzy_info = [num_fuzzy_matches]

		print("Fuzzy matches (using "+source+"): "+str(num_fuzzy_matches)+" of "+str(num_current_residual_cases)+" unmatched cases ("+str(round(100*float(num_fuzzy_matches)/float(num_current_residual_cases), 1))+"%)")

		return df, fuzzy_info
Example #20
def process():
    names = fuzzyset.FuzzySet()
    names.add('stas khirman')
    names.add('khirman stas')
    names.add('stas')
    names.add('khirman')
    # drv = driver.load_driver('multimodel')
    serving = multimodel.MultiModelDriver(init_hook=face_badge.init_hook, process=face_badge.process_internal)
    kwargs = {'ml-serving-drivers': ['openvino', 'tensorflow', 'tensorflow']}
    serving.load_model(['./vidos/faces/face-detection.xml', './vidos/m1', './vidos/m2'], **kwargs)
    global to_process
    i_name = 1
    while runned:
        lock.acquire(blocking=True)
        frame = to_process
        if frame is None:
            lock.release()
            continue
        print('start frame')
        to_process = None
        results = serving.predict_hooks({
            'pixel_threshold': 0.5,
            'link_threshold': 0.5,
            'image': frame
        })
        frame = results['output']
        table = results['table_output']
        found_name = None
        candidates = []
        for e in table:
            text = e['name']
            if len(text) > 2:
                found = names.get(text)
                if (found is not None) and (len(found) > 0):
                    if found[0][0] > 0.7:
                        text = found[0][1]
                        if ' ' in text:
                            found_name = (found[0][0], text)
                            candidates = []
                            break
                        else:
                            candidates.append(text)
        if (found_name is None) and len(candidates) > 0:
            found_name = choose_one(names, candidates)
        if found_name is not None:
            add_overlays(frame, found_name[0], found_name[1])
            to_save = e['image'][:, :, ::-1]
            if output_dir != '':
                name = found_name[1].replace(" ", "_")
                to_dir = '{}/{}'.format(output_dir, name)
                if not os.path.exists(to_dir):
                    os.mkdir(to_dir)
                fname = '{}/auto_{}_{}.jpg'.format(to_dir, int(time.time()), i_name)
                logging.info('Save new picture: {}'.format(fname))
                cv2.imwrite(fname, to_save)
                global new_count
                new_count = 1
            global result
            result = frame
            i_name += 1
        global last_processed
        last_processed = frame
        lock.release()
        print('stop frame')
Example #21
 def test_type_directInstantiation(self):
     fs = fuzzyset.FuzzySet() 
     self.assertTrue(isinstance(fs, fuzzyset.FuzzySet))
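
Two companion assertions in the same unittest style, pinning down behavior the other examples rely on (hedged: the score of 1 for an exact match comes from fuzzyset's exact-set shortcut, as I understand the implementation):

 def test_get_semantics(self):
     fs = fuzzyset.FuzzySet()
     fs.add('kitten')
     self.assertIsNone(fs.get('qqqqqq'))   # no shared grams -> None
     score, match = fs.get('kitten')[0]
     self.assertEqual(match, 'kitten')
     self.assertEqual(score, 1)            # exact matches short-circuit with score 1
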
Example #22
import fuzzyset

# FuzzySet accepts an iterable, so the whole vocabulary can be indexed at construction
with open(
        "/Users/dongxinyuan/Desktop/Projektpraktikum Information Service Engineering/data/rawdata/split/test.txt",
        "r") as f:
    testlist = f.read().split("\n")

a = fuzzyset.FuzzySet(testlist)
print(a.get("history"))
Example #23
    def compare_st_to_grid(info, year):
        stname, ed_list = info

        # convert ed_list to all strings, make lower to match letters in polygon files
        ed_list = [str(a).lower() for a in ed_list]
        # spatial join the grid with EDs the microdata street appears in
        if year == 1930:
            joined = gpd.sjoin(
                ed_poly_30.loc[ed_poly_30.ED_edit.isin(ed_list)], grid_geo)
        if year == 1940:
            joined = gpd.sjoin(ed_poly_40.loc[ed_poly_40.ED_num.isin(ed_list)],
                               grid_geo)
        # create new df out of unique street names in grid
        df = pd.DataFrame({
            'grid':
            joined.st30.dropna().unique().tolist() +
            joined.st40.dropna().unique().tolist()
        })
        df = df.loc[df.grid != '']
        # drop direction from all grid names
        df['nodir'] = df.grid.apply(drop_dir).apply(
            lambda x: x.decode('utf-8', 'ignore'))
        # separate type from all grid names
        df['notype'] = df.nodir.apply(drop_type).apply(
            lambda x: x.decode('utf-8', 'ignore'))
        #df = df.loc[df.nodir != df.notype]
        df['type'] = df.nodir.apply(st_type)
        df = df.loc[df.type.apply(lambda x: x is not None)]
        df['type'] = df['type'].apply(lambda x: x.group().replace(
            ' ', '')).apply(lambda x: x.decode('utf-8', 'ignore'))

        # create a list of no-direction street names from the entire grid
        all_streets = grid_geo.st30.dropna().unique().tolist(
        ) + grid_geo.st40.dropna().unique().tolist()
        all_streets = map(drop_dir, all_streets)
        all_streets = list(set(all_streets))

        # CHECK 1: if microdata street appears exactly IN ENTIRE GRID, drop from list to check later
        stname_nodir = drop_dir(stname).decode('utf-8', 'ignore')
        if stname_nodir in all_streets:
            return pd.DataFrame({
                'original': [stname],
                'exact': [1],
                'type_fix': [0],
                'fuzzy': [0],
                'check_grid': [0],
                'correct_st': ['']
            })

        # CHECK 2: if microdata name of street appears, and both types are one of St, Ave, Road, accept the grid version as correct
        stname_nodir_notype = drop_type(stname_nodir)
        try:
            stname_type = st_type(stname_nodir).group().replace(' ', '')
        # if code to get street type fails, it should be looked for in grid
        except:
            return pd.DataFrame({
                'original': [stname],
                'exact': [0],
                'type_fix': [0],
                'fuzzy': [0],
                'check_grid': [1],
                'correct_st': ['']
            })
        # check condition
        if stname_nodir_notype in df['notype'].tolist():
            check = df.loc[(df.notype == stname_nodir_notype) & (
                df.type.isin(['St', 'Ave', 'Road']))].reset_index()
            # check that there is only one option to choose from
            if len(check) == 1:
                return pd.DataFrame({
                    'original': [stname],
                    'exact': [0],
                    'type_fix': [1],
                    'fuzzy': [0],
                    'check_grid': [0],
                    'correct_st': [check.iloc[0]['nodir']]
                })
            else:
                return pd.DataFrame({
                    'original': [stname],
                    'exact': [0],
                    'type_fix': [0],
                    'fuzzy': [0],
                    'check_grid': [1],
                    'correct_st': ['']
                })

        # CHECK 3: if microdata street has very close name fuzzy match and exact type match, accept the grid version as correct
        notype_fs = fuzzyset.FuzzySet(df.notype.unique())
        match = notype_fs.get(stname_nodir_notype)
        # if match fails, case must be checked in grid
        if match is None:
            return pd.DataFrame({
                'original': [stname],
                'exact': [0],
                'type_fix': [0],
                'fuzzy': [0],
                'check_grid': [1],
                'correct_st': ['']
            })
        # first, see if best match is high enough score
        if match[0][0] > 0.8:
            # next, see if corresponding grid name has exact same type
            check = df.loc[df.notype == match[0][1]].reset_index()
            # confirm only one matching street
            if len(check) == 1:
                # confirm same type
                if stname_type == check.iloc[0]['type']:
                    return pd.DataFrame({
                        'original': [stname],
                        'exact': [0],
                        'type_fix': [0],
                        'fuzzy': [1],
                        'check_grid': [0],
                        'correct_st': [check.iloc[0]['nodir']]
                    })
                else:
                    return pd.DataFrame({
                        'original': [stname],
                        'exact': [0],
                        'type_fix': [0],
                        'fuzzy': [0],
                        'check_grid': [1],
                        'correct_st': ['']
                    })
            else:
                return pd.DataFrame({
                    'original': [stname],
                    'exact': [0],
                    'type_fix': [0],
                    'fuzzy': [0],
                    'check_grid': [1],
                    'correct_st': ['']
                })

        # if all checks fail, return a row marking that street to be checked, no correct street yet
        else:
            return pd.DataFrame({
                'original': [stname],
                'exact': [0],
                'type_fix': [0],
                'fuzzy': [0],
                'check_grid': [1],
                'correct_st': ['']
            })
Example #24
import pandas as pd
import fuzzyset
import sys

import util.util as util
import util.constants as c

db = pd.read_pickle(f'resources/compare_{c.N_NGRAM}.db')
match_set = fuzzyset.FuzzySet(db.ngram)

def get_matches(seq):
    ngram = midi_to_ngram(seq)
    # match_set.get returns None when the ngram is nothing like the database
    match = match_set.get(ngram)
    if match is None:
        return None, None
    best_match = match[0][1]
    matches = db[db.ngram == best_match]

    # Information comes from *next* ngram
    to_drop = []
    for i, r in matches.iterrows():
        if i+c.N_NGRAM >= len(db):
            to_drop.append(i)
        elif db.iloc[i].track != db.iloc[i+c.N_NGRAM].track:
            to_drop.append(i)
        elif db.iloc[i].track in c.reference_songs:
            to_drop.append(i)
    matches = matches.drop(to_drop)
    next_matches = db.iloc[matches.index+c.N_NGRAM]

    if matches.empty:
        # print('Error: End of song.')
        return None, None
Example #25
def updateList(filename):
	toUpdate = input("[*] Update list on MAL? (y/n): ")
	if toUpdate != 'y':
		print("\n")
		return

	print("[*] Preparing to update list")
	# PARSER HERE - Get Anime name from filename
	# Using regex as temp. solution (works only for file names following HorribleSubs naming format)
	try:
		d = ap.Parse(filename)
		animename = d.getParsedValues()['anime']
		# print(d.finalList)
		# animename = re.split("\]|\)|\[|\(", filename)[2].split('-')[0].strip()
	except:
		print("\033[91m[-] Unsupported filename format/Not an anime file! Skipping.\033[0m")
		return
	# Get loginData from login json file
	with open("data/loginData.json", "r") as f:
		loginData = json.load(f)

	# if loginData is empty or it's been "expires_in" seconds (expiration of the access_token), do a fresh login
	if (not loginData) or ((time.time() - float(loginData['access_token'][1])) > float(loginData['expires_in'])):
		print("[*] Doing a fresh login")
		# Get login credentials from config
		info = credentialCheck()

		# Get loginInfo by logging in and add current timestamp to file
		loginInfo = mal.User.login(info[0], info[1])
		loginInfo['access_token'] = [loginInfo['access_token'], str(time.time())]

		with open("data/loginData.json", "w+") as f:
			json.dump(loginInfo, f, indent=4)
		AT = loginInfo['access_token'][0]

	# else, get the existing access_token from the json file
	else:
		credentialCheck()
		print("[*] Grabbing existing Access Token from file")
		AT = loginData['access_token'][0]

	# Get User's watchlist
	animeInfo = mal.User.getAnimeList(AT, "watching", ["alternative_titles", "num_episodes", "my_list_status"])
	aniList = []

	# Iterate through page-wise responses in animeInfo and form one single list with anime name and id
	for res in animeInfo:
		# print("Response: {}".format(res))
		t_aniList = []
		for i,item in enumerate(res['data']):
			t_aniList.append({'names': None, "id": "", "episodes": "", "status": ""})

			originalTitle = item['node']['title']
			engTitle = item['node']['alternative_titles']['en']
			japTitle = item['node']['alternative_titles']['ja']
			# Adding original, english, japanese and all other alternate_titles to names list
			t_aniList[i]['names'] = [originalTitle, engTitle, japTitle] + [name for name in item['node']['alternative_titles']['synonyms']]
			# Adding id of show
			t_aniList[i]['id'] = str(item['node']['id'])
			t_aniList[i]['episodes'] = str(item['node']['num_episodes'])
			t_aniList[i]['status'] = item['node']['my_list_status']['status']

		aniList += t_aniList

	# probDict = {}
	probValues = []
	fset = fuzzyset.FuzzySet()
	fset.add(animename)

	# print("{}: {}".format(aniList, len(aniList)))
	for show in aniList:
		probList = []
		for name in show['names']:
			# print("Name: {}".format(name))
			# Filter out blank entries from the list
			if name != "":
				fuzzyInfo = fset.get(name)
				#Fuzzy returns None if 2 strings are completely different: filtering out those cases
				if fuzzyInfo is not None:
					# print("Fuzzy: {}".format(fuzzyInfo))
					probList.append(fuzzyInfo[0][0])

		# print("probList: {}".format(probList))
		# Add 0 probability if probList is empty (fuzzyInfo returned None, so show was never added to probList), hence definitely not this show
		if not probList:
			probList.append(0)
		# Add "show: max probability" key-value pair in dictionary
		# Key is first name from list of names of the show (first name is always 'title', the name MAL uses on the website by default)
		# probDict[show['names'][0]] = str(max(probList))
		probValues.append(max(probList))

	# print(probValues)
	if max(probValues) >= 0.5:
		toUpdate_idx = probValues.index(max(probValues))
		toUpdate_name = aniList[toUpdate_idx]['names'][0]
		toUpdate_ID = aniList[toUpdate_idx]['id']
		toUpdate_Eps = int(aniList[toUpdate_idx]['episodes'])
		toUpdate_status = aniList[toUpdate_idx]['status']
		# print("{} -> {}".format(toUpdate_name, toUpdate_ID))

		#Get previously watched episodes from list
		for res in animeInfo:
			for show in res['data']:
				if show['node']['title'] == toUpdate_name:
					oldVal = int(show['node']['my_list_status']['num_episodes_watched'])

		#Update list with previously watched episodes + 1
		mal.User.updateList(AT, toUpdate_ID, {"num_watched_episodes": oldVal + 1})
		print("\033[92m[+]\033[0m \033[93m{}\033[0m \033[92mwas updated!\033[0m \033[96m{} --> {}\033[0m".format(toUpdate_name, oldVal, oldVal + 1))
		if (oldVal + 1) == toUpdate_Eps:
			mal.User.updateList(AT, toUpdate_ID, {"status": "completed"})
			print("\033[92m[+] Anime Completed!\033[0m Status updated: \033[96m{} --> completed\033[0m".format(toUpdate_status))
			try:
				score = int(input("[*] Score? (1-10): "))
				if score >= 1 and score <= 10:
					mal.User.updateList(AT, toUpdate_ID, {"score": score})
					print("\033[92m[*] Score updated.\033[0m")
					print("\033[92m[*] Done.\033[0m")
				else:
					print("\033[91m[*] Skipped.\033[0m")
			except:
				print("\033[91m[-] Invalid input.\033[0m")

	else:
		print("\033[91m[-] This show does not seem to be in your watchlist! Skipping.\033[0m\n")
		return

	print("")
Example #26
def find_fuzzy_matches(df, city, street, sm_all_streets, sm_ed_st_dict, check_too_similar=False):

	try:
		post = '_' + street.split('_')[2].split('HN')[0]
	except:
		post = ''

	num_records = len(df)

	cprint("Fuzzy matching algorithm for %s \n" % (street), attrs=['underline'], file=AnsiToWin32(sys.stdout))

	start = time.time()

	#
	# Find the best matching Steve Morse street name
	#

	#Create a set of all streets for fuzzy matching (create once, call on)
	sm_all_streets_fuzzyset = fuzzyset.FuzzySet(sm_all_streets)

	#Keep track of problem EDs
	problem_EDs = []

	#Function to check if street names differ by one character
	def diff_by_one_char(st1, st2):
		if len(st1) == len(st2):
			st1chars = list(st1)
			st2chars = list(st2)
			#Check how many characters differ, return True if only 1 character difference
			if sum([st1chars[i]!=st2chars[i] for i in range(len(st1chars))]) == 1:
				return True
			else:
				return False
		else:
			return False

	#Fuzzy matching algorithm
	def sm_fuzzy_match(street, ed):

		nomatch = ['', '', False]
	
		#Return null if street is blank
		if street == '':
			return nomatch
		#Microdata ED may not be in Steve Morse, if so then add it to problem ED list and return null
		try:
			sm_ed_streets = sm_ed_st_dict[ed]
			sm_ed_streets_fuzzyset = fuzzyset.FuzzySet(sm_ed_streets)
		except:
			problem_EDs.append(ed)
			return nomatch

		#Step 1: Find best match among streets associated with microdata ED
		try:
			best_match_ed = sm_ed_streets_fuzzyset[street][0]
		except:
			return nomatch
		#Step 2: Find best match among all streets
		try:
			best_match_all = sm_all_streets_fuzzyset[street][0]
		except:
			return nomatch    
		#Step 3: If both best matches are the same, return as best match
		if (best_match_ed[1] == best_match_all[1]) & (best_match_ed[0] >= 0.5):
			#Check how many other streets in ED differ by one character
			if check_too_similar:
				too_similar = sum([diff_by_one_char(st, best_match_ed[1]) for st in sm_ed_streets])
				if too_similar == 0:
					return [best_match_ed[1], best_match_ed[0], True]
				else:
					return nomatch
			else: 
				return [best_match_ed[1], best_match_ed[0], True]
		#Step 4: If both are not the same, return one with the higher score (to help manual cleaning)
		else:
			if best_match_all[0] < best_match_ed[0]:
				return [best_match_ed[1], best_match_ed[0], False]
			else:
				return [best_match_all[1], best_match_all[0], False]

	#Create dictionary based on Street-ED pairs for faster lookup using helper function
	df_no_exact_match = df[~(df['exact_match_bool'+post])]
	df_grouped = df_no_exact_match.groupby([street, 'ed'])
	sm_fuzzy_match_dict = {}
	for st_ed, _ in df_grouped:
		sm_fuzzy_match_dict[st_ed] = sm_fuzzy_match(st_ed[0], st_ed[1])

	#Helper function (necessary since dictionary built only for cases without validated exact matches)
	def get_fuzzy_match(exact_match, street, ed):
		#Only look at cases without validated exact match
		if not (exact_match):
			#Need to make sure "Unnamed" street doesn't get fuzzy matched
			if 'Unnamed' in street:
				return ['', '', False]
			#Get fuzzy match    
			else:
				return sm_fuzzy_match_dict[street, ed]
		#Return null if exact validated match
		else:
			return ['', '', False]

	#Get fuzzy matches 
	df['fuzzy_match_sm'+post], df['fuzzy_match_sm_score'+post], df['fuzzy_match_sm_bool'+post] = zip(*df.apply(lambda x: get_fuzzy_match(x['exact_match_bool'+post], x[street], x['ed']), axis=1))

	#Compute number of cases without exact match
	num_current_residual_cases = num_records - len(df[df['exact_match_bool'+post]])

	#Generate dashboard information
	num_fuzzy_matches = np.sum(df['fuzzy_match_sm_bool'+post])
	prop_sm_fuzzy_matches = float(num_fuzzy_matches)/num_records
	end = time.time()
	fuzzy_matching_time = round(float(end-start)/60, 1)
	fuzzy_info = [num_fuzzy_matches, fuzzy_matching_time, problem_EDs]

	cprint("Fuzzy matches (using microdata ED): "+str(num_fuzzy_matches)+" of "+str(num_current_residual_cases)+" unmatched cases ("+str(round(100*float(num_fuzzy_matches)/float(num_current_residual_cases), 1))+"%)\n", file=AnsiToWin32(sys.stdout))
	cprint("Fuzzy matching for %s took %s\n" % (city, fuzzy_matching_time), 'cyan', attrs=['dark'], file=AnsiToWin32(sys.stdout))

	return df, fuzzy_info
Example #27
def find_fuzzy_matches(df, city, street, sm_all_streets, sm_ed_st_dict):

    num_records = df['st_edit'].notnull().sum()

    #
    # Identify exact matches to exclude from fuzzy match search
    #

    df['st_edit_exact_match'] = df[street].apply(lambda s: s in sm_all_streets)
    print("Exact matches: %s of %s" %
          (str(df['st_edit_exact_match'].sum()), str(num_records)))

    #
    # Find the best matching Steve Morse street name
    #

    #Create a set of all streets for fuzzy matching (create once, call on)
    sm_all_streets_fuzzyset = fuzzyset.FuzzySet(sm_all_streets)

    #Keep track of problem EDs
    problem_EDs = []

    #Fuzzy matching algorithm
    def sm_fuzzy_match(street, ed):

        # Coerce numeric EDs to their string form
        if not isinstance(ed, str):
            ed = str(int(ed))

        #Return null if street is blank
        if street == '':
            return ['', '', False]

        #Microdata ED may not be in Steve Morse, if so then add it to problem ED list and return null
        try:
            sm_ed_streets = sm_ed_st_dict[ed]
            sm_ed_streets_fuzzyset = fuzzyset.FuzzySet(sm_ed_streets)
        except:
            problem_EDs.append(ed)
            return ['', '', False]

        #Step 1: Find best match among streets associated with microdata ED
        try:
            best_match_ed = sm_ed_streets_fuzzyset[street][0]
        except:
            return ['', '', False]

        #Step 2: Find best match among all streets
        try:
            best_match_all = sm_all_streets_fuzzyset[street][0]
        except:
            return ['', '', False]

        #Step 3: If both best matches are the same, return as best match
        if (best_match_ed[1] == best_match_all[1]) & (best_match_ed[0] >= 0.5):
            return [best_match_ed[1], best_match_ed[0], True]
        else:
            return ['', '', False]

    #Create dictionary based on Street-ED pairs for faster lookup using helper function
    df_st_edit = df[~df['st_edit_exact_match']]
    df_grouped = df_st_edit.groupby([street, 'ed'])
    sm_fuzzy_match_dict = {}
    for st_ed, _ in df_grouped:
        sm_fuzzy_match_dict[st_ed] = sm_fuzzy_match(st_ed[0], st_ed[1])

    #Helper function (necessary since dictionary built only for cases without validated exact matches)
    def get_fuzzy_match(exact_match, street, ed):
        #Only look at cases without validated exact match
        if not (exact_match):
            #Need to make sure "Unnamed" street doesn't get fuzzy matched
            if 'Unnamed' in street:
                return ['', '', False]
            #Get fuzzy match
            else:
                return sm_fuzzy_match_dict[street, ed]
        #Return null if exact validated match
        else:
            return ['', '', False]

    #Get fuzzy matches
    df['st_edit_fuzzy_match'], df['st_edit_fuzzy_match_score'], df[
        'st_edit_fuzzy_match_bool'] = zip(*df.apply(lambda x: get_fuzzy_match(
            x['st_edit_exact_match'], x[street], x['ed']),
                                                    axis=1))

    print("Fuzzy matches: %s of %s" %
          (str(df['st_edit_fuzzy_match_bool'].sum()), str(len(df))))
    print("Unmatched cases: %s of %s" % (str(
        len(df) - df['st_edit_fuzzy_match_bool'].sum() -
        df['st_edit_exact_match'].sum()), str(len(df))))

    df['st_edit_matched'] = df['st_edit']
    df.loc[~df['st_edit_exact_match'] & df['st_edit_fuzzy_match_bool'],
           'st_edit_matched'] = df['st_edit_fuzzy_match']

    return df
Example #28
import fuzzyset

fz = fuzzyset.FuzzySet()
#Create a list of terms we would like to match against in a fuzzy way
for l in ["Diane Abbott", "Boris Johnson"]:
    fz.add(l)

#Now see if our sample term fuzzy matches any of those specified terms
sample_term='Boris Johnstone'
fz.get(sample_term)
#   , fz.get('Diana Abbot'), fz.get('Joanna Lumley')
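
For reference, the call above returns a list of `(score, match)` tuples sorted best-first, or `None` when nothing clears the cutoff; the score shown is illustrative:

result = fz.get(sample_term)
if result is not None:
    score, match = result[0]   # e.g. something like (0.87, 'Boris Johnson')
    print(score, match)
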
Example #29
from django.http import HttpResponse, JsonResponse
import os, mimetypes
import fuzzyset

THRESHOLD = 0.3
DIFFERENCE_THRESHOLD = 0

# Index the lines in all our subtitles once upon deploy
directory = os.path.dirname(os.path.realpath(__file__)) + "/../data"
index = 0
sets = {}
for filename in os.listdir(directory):
    if filename[0] == ".":
        continue
    # Index the dialogue lines of each subtitle file; the with-block closes the handle
    with open(directory + "/" + filename) as subfile:
        fset = fuzzyset.FuzzySet()
        count = -1
        for line in subfile:
            count += 1
            line = line.strip()
            if count <= 1:
                continue
            elif line == "":
                count = -1
                continue
            fset.add(line)
            index += 1
    sets[filename] = fset


def subs(request):
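
The view body is truncated above; purely as a hypothetical sketch (the `q` query parameter and the response shape are assumptions of mine, only `sets` and `THRESHOLD` come from the module), a lookup over the per-file indexes might look like:

def subs(request):
    query = request.GET.get("q", "")
    hits = []
    for filename, fset in sets.items():
        match = fset.get(query)
        if match is not None and match[0][0] >= THRESHOLD:
            hits.append({"file": filename, "score": match[0][0], "line": match[0][1]})
    return JsonResponse({"results": hits})
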
Example #30
def process():
    size = 1024
    charset, _ = read_charset()
    global chrset_index
    chrset_index = charset
    names = fuzzyset.FuzzySet()
    names.add('stas khirman')
    names.add('khirman stas')
    names.add('stas')
    names.add('khirman')
    drv1 = driver.load_driver('tensorflow')
    serving1 = drv1()
    serving1.load_model('./m1')
    drv2 = driver.load_driver('tensorflow')
    serving2 = drv2()
    serving2.load_model('./m2')
    global to_process
    i_name = 1
    while runned:
        lock.acquire(blocking=True)
        frame = to_process
        if frame is None:
            lock.release()
            continue
        print('start frame')
        to_process = None
        w = frame.shape[1]
        h = frame.shape[0]
        if w > h:
            if w > size:
                ratio = size / float(w)
                h = int(float(h) * ratio)
                w = size
            else:
                if h > size:
                    ratio = size / float(h)
                    w = int(float(w) * ratio)
                    h = size
        w = fix_length(w,32)
        h = fix_length(h,32)
        original = frame[:, :, ::-1].copy()
        image = cv2.resize(original, (w, h))
        image = image.astype(np.float32) / 255.0
        image = np.expand_dims(image, 0)
        outputs = serving1.predict({'image': image})
        cls = outputs['pixel_pos_scores'][0]
        links = outputs['link_pos_scores'][0]
        mask = decodeImageByJoin(cls, links, 0.5, 0.1)
        bboxes = maskToBoxes(mask, (original.shape[1], original.shape[0]))
        found_name = None
        candidates = []
        for i in range(len(bboxes)):
            box = np.int0(cv2.boxPoints(bboxes[i]))
            maxp = np.max(box, axis=0) + 2
            minp = np.min(box, axis=0) - 2

            y1 = max(0, minp[1])
            y2 = min(original.shape[0], maxp[1])
            x1 = max(0, minp[0])
            x2 = min(original.shape[1], maxp[0])
            text_img = original[y1:y2, x1:x2, :]
            if text_img.shape[0] < 4 or text_img.shape[1] < 4:
                continue
            #if bboxes[i][1][0]>bboxes[i][1][1]:
            #    angle = -1*bboxes[i][2]
            #else:
            #    angle = -1*(90+bboxes[i][2])
            #if angle!=0:
            #    text_img = rotate_bound(text_img,angle)
            text_img = norm_image_for_text_prediction(text_img, 32, 320)
            text_img = np.expand_dims(text_img, 0)
            text = serving2.predict({'images':text_img})
            text = text['output'][0]
            text = get_text(text)
            if len(text)>2:
                print('text: {}'.format(text))
                found = names.get(text)
                if (found is not None) and (len(found)>0):
                    print(found[0])
                    if found[0][0]>0.7:
                        text = found[0][1]
                        if ' ' in text:
                            found_name = (found[0][0],text)
                            candidates = []
                            break
                        else:
                            candidates.append(text)
            if (found_name is None) and len(candidates)>0:
                found_name = choose_one(names,candidates)
        for i in bboxes:
            box = cv2.boxPoints(i)
            box = np.int0(box)
            original = cv2.drawContours(original, [box], 0, (255, 0, 0), 2)
        frame = np.ascontiguousarray(original[:, :, ::-1],np.uint8)
        if found_name is not None:
            add_overlays(frame,found_name[0],found_name[1])
            cv2.imwrite('results/result_{}.jpg'.format(i_name),frame)
            global result
            result = frame
            i_name+=1
        global last_processed
        last_processed = frame
        lock.release()
        print('stop frame')