def fw_partial_token_set_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        partial_ratio = fuzz.partial_token_set_ratio(str(q1), str(q2)) / 100
        fuzzy.append([partial_ratio])
    print("Created fuzz partial_token_set_ratio feature")
    return np.array(fuzzy)
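# A minimal usage sketch for the feature builder above (made-up inputs;
# assumes `fuzz` is fuzzywuzzy's fuzz module and `np` is numpy):
import numpy as np
from fuzzywuzzy import fuzz

q1 = ["How do I learn Python?", "What is machine learning?"]
q2 = ["What is the best way to learn Python?", "Explain machine learning."]
features = fw_partial_token_set_ratio(q1, q2)
print(features.shape)  # (2, 1): one score in [0, 1] per question pair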
def fit(self, X):
    d = defaultdict(list)
    for name in set(X):
        name = name.strip()
        name2 = re.sub(self.ex_lookup[self.stop_words], '', name).strip()
        if len(d) == 0:
            d[' '.join(name2.split()[:2])].append(name)
        else:
            score = 0
            group = 'string'
            for key in d.keys():
                test = fuzz.partial_token_set_ratio(name2, key)
                if test > score:
                    score, group = test, key
            if score > 70:
                d[group].append(name)
            else:
                d[' '.join(name2.split()[:2])].append(name)
    self.MAP = {}
    for k, v in d.items():
        values = []
        for name in v:
            values.append((X.count(name), name))
        # most frequent raw spelling becomes the canonical name for the group
        simp = sorted(values, reverse=True)[0]
        for name in v:
            self.MAP[name] = simp[1]
    return self
def extractFeatures(data, filename, column, bagOfWords):
    print('Extract features for', filename)
    data[column].fillna('NA', inplace=True)
    for word in bagOfWords:
        print(word)
        ratio = [fuzz.partial_token_set_ratio(
                     re.sub(r'[\s"\\]', ' ', cmt).strip().upper(), word)
                 for cmt in data[column]]
        data['is' + word] = np.array([r > 70 for r in ratio]) * 1
    data.to_csv(filename, sep=',', encoding='utf-8')
def extractFeatures(data, name):
    print("Extract features for", name)
    data["SUMMARYOPS"].fillna("NA", inplace=True)
    for word in bagOfWords:
        print(word)
        ratio = [
            fuzz.partial_token_set_ratio(
                re.sub(r'[\s"\\]', " ", cmt).strip().upper(), word)
            for cmt in data["SUMMARYOPS"]
        ]
        data["is" + word] = np.array([r > 70 for r in ratio]) * 1
    data.to_csv(name, sep=",", encoding="utf-8")
def curate(self):
    punc = set(string.punctuation)
    story_count = 0
    for source in self._sources:
        feed = feedparser.parse(source['url'])
        if 'entries' in feed:
            entries = feed['entries']
            for entry in entries:
                story = {}
                story['_source_id'] = source['id']
                story['_source_name'] = source['name']
                story['_source_url'] = source['url']
                story['_path'] = self.generate_path()
                if 'tags' in entry:
                    story['_tags'] = ''.join([tag['term'] for tag in entry['tags']])
                else:
                    story['_tags'] = ''
                media_content_url = ''
                if 'media_content' in entry:
                    for media_content in entry['media_content']:
                        media_content_url = media_content['url']
                        break
                if media_content_url == '':
                    if 'links' in entry:
                        for link in entry['links']:
                            if 'type' in link and 'href' in link and 'image' in link['type']:
                                media_content_url = link['href']
                                break
                story['_media_content'] = media_content_url
                if 'published_parsed' in entry:
                    story['_published_parsed'] = datetime.fromtimestamp(
                        mktime(entry['published_parsed']))
                else:
                    story['_published_parsed'] = datetime.utcnow()
                if 'author' in entry:
                    story['_author'] = entry['author']
                else:
                    story['_author'] = ''
                if 'summary' in entry:
                    story['_summary'] = self.remove_markup(entry['summary'])
                    story['_fuzzy_summary'] = ''.join(sorted(
                        ['%s ' % w.upper() for w in
                         ''.join([c for c in story['_summary'] if c not in punc]).split()
                         if len(w) > 3 and w.isalpha()]))
                else:
                    story['_summary'] = ''
                    story['_fuzzy_summary'] = ''
                if 'id' in entry:
                    story['_id'] = entry['id']
                else:
                    story['_id'] = entry['link']
                story.update(entry)
                id = pdb.add_story(story)
                if id is not None:
                    fuzzy_summaries = pdb.get_fuzzy_summaries(id)
                    for summary in fuzzy_summaries:
                        score = fuzz.partial_token_set_ratio(
                            summary['fuzzy_summary'], story['_fuzzy_summary'])
                        if score >= 50:
                            topic_id = pdb.get_topic_story(summary['id'])
                            if topic_id is not None:
                                pdb.add_topic_story(topic_id, id, score)
                            else:
                                topic_id = pdb.add_topic(summary['s_title'])
                                pdb.add_topic_story(topic_id, summary['id'], score)
                                pdb.add_topic_story(topic_id, id, score)
                            break
                story_count += 1
                if not story_count % 50:
                    print('story_count: %d' % story_count)
    print('story_count: %d' % story_count)
data = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(
    lambda x: len(set(str(x['question1']).lower().split())
                  .intersection(set(str(x['question2']).lower().split()))),
    axis=1)
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(
    lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
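# The wmd/norm_wmd helpers called above aren't defined in this snippet; a
# minimal sketch of how they are typically written (assumes NLTK stopwords
# and gensim's KeyedVectors.wmdistance on the models loaded above):
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def wmd(s1, s2):
    # Word Mover's Distance over the raw word2vec vectors
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(s1, s2)

def norm_wmd(s1, s2):
    # same distance over the L2-normalized vectors (after init_sims)
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return norm_model.wmdistance(s1, s2)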
def fit(self, refdict=None, thresholds=[80, 80], parentdict=None):
    """Construct fuzzy-key lookup dict from all records
    (FieldUniqueCounter dict) as {raw: common}"""
    if parentdict is not None:
        self.parentdict = parentdict
    else:
        pd_tmp = [(pair[1], pair[0],
                   re.sub(self.ex_lookup[self.stop_words], '', pair[0]))
                  for pair in refdict.items()]
        pd_tmp = sorted(pd_tmp, reverse=True)
        # initial condenser, set primary keys
        for trip in pd_tmp:
            if trip[0] > 100:
                self.parentdict[trip].append(trip)
            else:
                score, group = 0, (0, 'a', 'a')
                for key in self.parentdict.keys():
                    test = fuzz.partial_token_set_ratio(trip[2], key[2])  # matcher
                    if test > score:
                        score, group = test, key
                if score >= thresholds[0]:  # score threshold
                    self.parentdict[group].append(trip)
                else:
                    self.parentdict[trip].append(trip)
    # score inter-key matches
    triplist = list(self.parentdict.keys())
    n = len(triplist)
    tripmatch = defaultdict(list)
    for i, trip in enumerate(triplist):
        score, match = 0, (0, 0, 0)
        for j in range(i + 1, n):
            trip2 = triplist[j]
            test = fuzz.partial_token_set_ratio(trip[2], trip2[2])  # matcher
            if test > score:
                score, match = test, trip2
        if match != (0, 0, 0):
            tripmatch[trip] += [score, match]
    # print('tripmatch len = {}'.format(len(tripmatch)))
    # self.tripmatch = tripmatch
    # secondary condenser, aggregate primary key matches
    tickdict = defaultdict(list)
    ticker = 0
    key, match = tripmatch.popitem()
    tickdict[ticker].append(key)
    while len(tripmatch) > 0:  # there is something problematic here, I think...
        if len(match) != 2:
            key, match = tripmatch.popitem()
        if match[0] >= thresholds[0]:
            tickdict[ticker].append(match[1])
            newmatch = tripmatch[match[1]]
            del tripmatch[match[1]]
        else:
            ticker += 1
            tickdict[ticker].append(match[1])
            newmatch = tripmatch[match[1]]
            del tripmatch[match[1]]
        match = newmatch
    # print('tickdict len = {}'.format(len(tickdict)))
    # self.tickdict = tickdict
    # set common key, gather names (as tuples)
    self.finaldict = defaultdict(list)
    for vlist in tickdict.values():
        common = max(vlist)
        for trip in vlist:
            self.finaldict[common] += [x for x in self.parentdict[trip]]
    # construct MAP
    for k, v in self.finaldict.items():
        for trip in v:
            self.MAP[trip[1]] = k[1]
    return self
data['len_word_q2'] = data.question2.apply(
    lambda x: len(str(x).split()))  # word count of question 2
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))),
    axis=1)  # words shared by question 1 and question 2 (via set intersection)

# extract question-pair features with fuzz
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)

model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
# Search in CPE
#if any(w in k.lower() for w in terms):
#    partial_results1_append((k, x, False))

# Is there an acronym?
if any(acronym.search(x) is not None for acronym in acronyms):
    partial_results1_append((k, x, True))

# Apply partial_token_set_ratio
partial_results2 = {}
# k = CPE (str)
# x = CPE description (str)
# is_acronym = Bool
for k, x, is_acronym in partial_results1:
    r = fuzz.partial_token_set_ratio(search_term, x, force_ascii=True)

    # Is it a false positive?
    if any(fil.search(x) is not None for fil in filters):
        continue

    # Give more weight if there is an acronym
    if is_acronym:
        r *= 1.25

    # Cap the value at 100
    r = r if r <= 100 else 100

    partial_results2[k] = int(r)

    if results_number == 1 and r == 100:
        break
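# The `acronyms` and `filters` objects used above are pre-compiled regex
# lists that this fragment doesn't show; a purely hypothetical sketch of
# their shape (names and patterns are illustrative, not from the source):
import re

acronyms = [re.compile(r"\bAPT\b")]       # initialisms that boost a match
filters = [re.compile(r"(?i)apartment")]  # descriptions to discard as false positives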
def fuzz_partial_token_set_ratio(sentences):
    sen = sentences.split("\001")
    return fuzz.partial_token_set_ratio(sen[0], sen[1])
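# Usage sketch for the delimiter-packed variant above: the two sentences
# travel in a single string separated by "\001", a pattern common when
# features are computed over delimiter-joined columns. Inputs are made up:
from fuzzywuzzy import fuzz

pair = "what is machine learning\001explain machine learning to me"
print(fuzz_partial_token_set_ratio(pair))  # integer score in [0, 100]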
df_data = pd.concat([
    df_train[['question1', 'question2']],
    df_test[['question1', 'question2']]
], axis=0)
df_feat['fuzz_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_ratio'] = df_data.apply(
    lambda row: fuzz.partial_ratio(str(row['question1']),
                                   str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['question1']),
                                             str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['question1']),
                                              str(row['question2'])), axis=1)
df_feat['fuzz_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['question1']),
                                     str(row['question2'])), axis=1)
df_feat['fuzz_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.token_sort_ratio(str(row['question1']),
                                      str(row['question2'])), axis=1)
df_feat[:len_train].to_csv('train_feature_fuzz.csv', index=False)
# install fuzzywuzzy
# 2nd set of features
from fuzzywuzzy import fuzz

data['qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['ratio'] = data.apply(
    lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)
# these two use the partial_* variants, so the columns are named to match
data['partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])), axis=1)
data['partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])), axis=1)
data.head(3)
data.to_csv('C:\\Users\\Nitin PC\\Downloads\\preprocessed.tsv',
            sep='\t', encoding='utf-8')
data = pd.read_csv('C:\\Users\\Nitin PC\\Downloads\\preprocessed.tsv', sep='\t')

# more cleaning
# stop words
stop_words = stopwords.words('english')
diff_len = len_1 - len_2
len_char_q1 = len(''.join(set(str(question1).replace(' ', ''))))
len_char_q2 = len(''.join(set(str(question2).replace(' ', ''))))
len_word_q1 = len(str(question1).split())
len_word_q2 = len(str(question2).split())
common_words = len(
    set(str(question1).lower().split()).intersection(
        set(str(question2).lower().split())))

# fuzzy
from fuzzywuzzy import fuzz
fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(
    str(question1), str(question2))
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
    str(question1), str(question2))
fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))

# wmd
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

# sen2vec
import scipy
question1_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
question2_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
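# The snippet stops after allocating the sentence-vector matrices; a common
# way to fill them (hedged sketch, assuming NLTK tokenization and the `model`
# loaded above) is to sum each question's word vectors and L2-normalize:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def sent2vec(s):
    # keep alphabetic, non-stopword tokens that exist in the embedding vocab
    words = [w for w in word_tokenize(str(s).lower())
             if w not in stop_words and w.isalpha()]
    M = [model[w] for w in words if w in model]
    if not M:
        return np.zeros(300)
    v = np.array(M).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())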
def parse_authors(self, register):
    """
    Transforms the raw register author information
    from Web of Science into the CoLav standard.

    Parameters
    ----------
    register : dict
        Register in Web of Science format

    Returns
    -------
    authors : list
        Information of the authors in the CoLav standard format
    """
    authors = []
    if "PT" in register.keys():
        #if register["PT"].rstrip()=="J":
        corresponding_last_name = ""
        orcid_list = []
        researchid_list = []
        if "RI" in register.keys():
            # x == x is False for NaN, so this skips missing values
            if register["RI"] and register["RI"] == register["RI"]:
                ri = register["RI"]
                if ri[-1] == "\n":
                    ri = ri[:-1]
                researchid_list = ri.rstrip().replace("; ", ";").split(";")
        if "OI" in register.keys():
            if register["OI"] and register["OI"] == register["OI"]:
                oi = register["OI"]
                if oi[-1] == "\n":
                    oi = oi[:-1]
                orcid_list = oi.rstrip().replace("; ", ";").split(";")
        if "AF" in register.keys():
            author_list = register["AF"].rstrip().split("\n")
            if "RP" in register.keys():
                if register["RP"]:
                    corresponding_last_name = register["RP"].split(",")[0]
            for au in author_list:
                entry = {}
                entry["first_names"] = ""
                entry["national_id"] = ""
                entry["last_names"] = ""
                entry["initials"] = ""
                entry["full_name"] = ""
                entry["aliases"] = []
                entry["affiliations"] = []
                entry["keywords"] = []
                entry["external_ids"] = []
                entry["corresponding"] = False
                entry["corresponding_address"] = ""
                entry["corresponding_email"] = ""
                raw_name = au.split(", ")
                if len(raw_name) == 1:
                    names = raw_name[0].capitalize()
                    last_names = ""
                elif len(raw_name) > 2:
                    names = " ".join(raw_name[:-1]).rstrip().capitalize()
                    last_names = raw_name[-1].capitalize()
                else:
                    names = raw_name[1].capitalize()
                    last_names = raw_name[0].capitalize()
                entry["full_name"] = names + " " + last_names
                entry["first_names"] = names
                entry["last_names"] = last_names
                entry["initials"] = "".join(
                    [i[0].upper() for i in names.split(" ")])
                # Checking if there is an external id
                entry_ext = []
                for res in researchid_list:
                    if not res:
                        continue
                    try:
                        name, rid = res.split("/")[-2:]
                    except Exception as e:
                        print("Could not split name and id in researchid field on ",
                              register["doi_idx"])
                        print(e)
                        continue  # name/rid are undefined past this point
                    ratio = fuzz.partial_ratio(name, last_names + ", " + names)
                    if ratio > 90:
                        entry_ext.append({"source": "researchid", "value": rid})
                        break
                    elif ratio > 50:
                        ratio = fuzz.token_set_ratio(name, last_names + ", " + names)
                        if ratio > 90:
                            entry_ext.append({"source": "researchid", "value": rid})
                            break
                        elif ratio > 50:
                            ratio = fuzz.partial_token_set_ratio(
                                name, last_names + ", " + names)
                            if ratio > 95:
                                entry_ext.append({"source": "researchid", "value": rid})
                                break
                for res in orcid_list:
                    if not res:
                        continue
                    try:
                        name, oid = res.split("/")[-2:]
                    except Exception as e:
                        print("Could not split name and id in orcid field on ",
                              register["doi_idx"])
                        print(e)
                        continue  # name/oid are undefined past this point
                    ratio = fuzz.partial_ratio(name, last_names + ", " + names)
                    if ratio > 90:
                        entry_ext.append({"source": "orcid", "value": oid})
                        break
                    elif ratio > 50:
                        ratio = fuzz.token_set_ratio(name, last_names + ", " + names)
                        if ratio > 90:
                            entry_ext.append({"source": "orcid", "value": oid})
                            break
                        elif ratio > 50:
                            ratio = fuzz.partial_token_set_ratio(
                                name, last_names + ", " + names)
                            if ratio > 95:
                                entry_ext.append({"source": "orcid", "value": oid})
                                break
                entry["external_ids"] = entry_ext
                # Checking if this is the corresponding author
                if corresponding_last_name:
                    if corresponding_last_name in last_names:
                        entry["corresponding"] = True
                        if "EM" in register.keys():
                            if register["EM"] and register["EM"] == register["EM"]:
                                entry["corresponding_email"] = register["EM"].rstrip()
                authors.append(entry)
    if len(authors) == 1:
        authors[0]["corresponding"] = True
    return authors
def faxFromList(self, args, isPM):
    '''Look up the monster code in the list using fuzzy matching,
    then fax it. (Or, if in quiet mode, display its code)
    '''
    splitArgs = args.split()
    if any(s.strip().lower() == "force" for s in splitArgs):
        return self.fax(splitArgs[0], splitArgs[0], "(forcing) ", isPM,
                        force=True)

    # first, check for exact code/name/alias matches
    matches = [entry.code for entry in self._faxList.values()
               if entry.contains(args)]
    if len(matches) == 1:
        return self.fax(matches[0], self._faxList[matches[0]].name, "", isPM)

    # next, check for "close" matches
    simplify = (lambda x: x.replace("'", "")
                           .replace("_", " ")
                           .replace("-", " ").lower())
    sArgs = simplify(args)
    scoreDiff = 15
    scores = defaultdict(list)

    # make list of all possible names/codes/aliases
    allNames = [name for entry in self._faxList.values()
                for name in entry.nameList]
    for s in allNames:
        score1 = fuzz.partial_token_set_ratio(simplify(s), sArgs)
        scores[score1].append(s)
    allScores = list(scores.keys())  # copy: keys are deleted while iterating
    maxScore = max(allScores)
    for score in allScores:
        if score < maxScore - scoreDiff:
            del scores[score]
    matches = []
    for match in scores.values():
        matches.extend(match)
    fuzzyMatchKeys = set(entry.code for entry in self._faxList.values()
                         for match in matches if entry.contains(match))

    # also check for args as a subset of string or code
    detokenize = lambda x: ''.join(re.split(r"'|_|-| ", x)).lower()
    dArgs = detokenize(args)
    matches = [name for name in allNames if dArgs in detokenize(name)]
    subsetMatchKeys = set(entry.code for entry in self._faxList.values()
                          for match in matches if entry.contains(match))

    ls = len(subsetMatchKeys)
    lf = len(fuzzyMatchKeys)
    matchKeys = subsetMatchKeys | fuzzyMatchKeys
    lm = len(matchKeys)
    if ls == 0 and lf == 1:
        m = matchKeys.pop()
        return self.fax(m, self._faxList[m].name, "(fuzzy match) ", isPM)
    elif lm == 1:
        m = matchKeys.pop()
        return self.fax(m, self._faxList[m].name, "(subset match) ", isPM)
    elif lm > 1 and lm < 6:
        possibleMatchStr = ", ".join(
            ("{} ({})".format(self._faxList[k].name, k)) for k in matchKeys)
        return "Did you mean one of: {}?".format(possibleMatchStr)
    elif lm > 1:
        return ("Matched {} monster names/codes; please be more specific."
                .format(ls + lf))
    return ("No known monster with name/code matching '{0}'. "
            "Use '!fax {0} force' to force, or check the monster list "
            "at {1} .".format(args, self.fax_list_url))
def testPartialTokenSetRatio(self):
    self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)
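# For context on the assertion above: token-set preprocessing compares the
# intersection of the two token sets against each full token list, and the
# partial variant then does substring alignment, so partial_token_set_ratio
# tends to return 100 whenever the two strings share any token. A quick
# illustration with made-up strings:
from fuzzywuzzy import fuzz

print(fuzz.partial_token_set_ratio("new york mets",
                                   "new york mets vs atlanta braves"))  # 100
print(fuzz.partial_token_set_ratio("braves",
                                   "atlanta braves at new york mets"))  # 100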
def fuzz_partial_token_setratio(q1, q2):
    return fuzz.partial_token_set_ratio(q1, q2)