def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that,
    by default, the input strings are padded and then tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q value that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan
    """
    if pd.isnull(input_string):
        return np.nan  # pd.np was removed in pandas >= 2.0; use numpy directly
    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)
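A quick sketch of the padding behaviour the docstring describes; `padding` (along with `prefix_pad` and `suffix_pad`) is a constructor parameter of py_stringmatching's QgramTokenizer, shown here against its defaults.

import py_stringmatching as sm

padded = sm.QgramTokenizer(qval=2)                   # default: pad with '#'/'$'
unpadded = sm.QgramTokenizer(qval=2, padding=False)  # raw q-grams only

padded.tokenize('database')    # ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
unpadded.tokenize('database')  # ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']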
def filter_person(self, df, lim):
    # Extract the distinct person names from each ENHANCEDPERSONS cell.
    A = df.ENHANCEDPERSONS.dropna().apply(
        lambda x: list(set([y.split(',')[0].lower() for y in x.split(';')])))
    # Explode the name lists into one row per (record, name) pair.
    A = A.apply(pd.Series).stack()
    A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
    A = A.reset_index()
    A.columns = ['id', 'name']
    B = self.fetch_person()
    B.name = B.name.str.replace(r'_+', ' ', regex=True).str.lower()
    # Fuzzy-join the two name lists on trigram Jaccard similarity >= lim.
    qg3_tok = sm.QgramTokenizer(qval=3)
    C = ssj.jaccard_join(A, B, 'id', 'id', 'name', 'name', qg3_tok, lim,
                         l_out_attrs=['name'], r_out_attrs=['name'],
                         show_progress=False)
    return set(C.l_id.apply(lambda x: int(x.split("_")[0])))
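A minimal, self-contained sketch of the join shape used above, with hypothetical two-row inputs; the output columns (`_id`, `l_id`, `r_id`, `l_name`, `r_name`, `_sim_score`) follow py_stringsimjoin's defaults.

import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

A = pd.DataFrame({'id': ['0_0', '1_0'], 'name': ['angela merkel', 'barack obama']})
B = pd.DataFrame({'id': [0, 1], 'name': ['angela merkel', 'emmanuel macron']})
qg3_tok = sm.QgramTokenizer(qval=3)
C = ssj.jaccard_join(A, B, 'id', 'id', 'name', 'name', qg3_tok, 0.7,
                     l_out_attrs=['name'], r_out_attrs=['name'],
                     show_progress=False)
# C holds one row per pair with trigram Jaccard >= 0.7:
#   _id, l_id, r_id, l_name, r_name, _sim_score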
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok, 0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
def tok_qgram(s):
    # return NaN/None inputs unchanged rather than tokenizing them
    if pd.isnull(s):
        return s
    s = gh.convert_to_str_unicode(s)
    # q is captured from the enclosing scope
    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(s)
def tok_qgram(s):
    # return NaN/None inputs unchanged rather than tokenizing them
    if pd.isnull(s):
        return s
    # coerce non-string input to str; decode bytes to unicode
    if not (isinstance(s, six.string_types) or isinstance(s, bytes)):
        s = str(s)
    elif isinstance(s, bytes):
        s = s.decode('utf-8')
    # q is captured from the enclosing scope
    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(s)
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok, 0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
    self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
    # affinity='precomputed' expects a *distance* matrix, so get_sim_matrix
    # presumably returns 1 - similarity
    self.aggcl = AgglomerativeClustering(n_clusters=5, affinity='precomputed',
                                         linkage='complete')
    self.labels = self.aggcl.fit_predict(self.sim_matrix)
def jac_q3_sim(str1, str2):
    try:
        # values were already cast to string, lower-cased and stripped
        # before being handed over, so no normalisation is needed here;
        # assign a sim score of -1 when one of them is null or empty
        if str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == '':
            return -1
        q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
        jac = sm.Jaccard()
        return jac.get_raw_score(q3_tok.tokenize(str1), q3_tok.tokenize(str2))
    except Exception:
        logger.warning('Issue with Jaccard_q3_Sim, hence -1 assigned')
        return -1
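Illustrative calls for the helper above; the first score is computed by hand from the padded trigram sets of the two strings.

jac_q3_sim('database', 'databse')  # 7 shared trigrams of 12 total -> 7/12 ≈ 0.58
jac_q3_sim('', 'database')         # -1, the null/empty sentinel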
def get_oov_jaccard_sim(self, s1, s2):
    en_tokens_f = word_tokenize(s1.lower())
    de_tokens_f = word_tokenize(s2.lower())
    # Replace the OOV tokens for which a match has been found
    en_tokens = []
    for token in en_tokens_f:
        if token in self.en_oov:
            en_tokens.extend(self.en_oov[token])
        else:
            en_tokens.append(token)
    de_tokens = []
    for token in de_tokens_f:
        if token in self.de_oov:
            de_tokens.extend(self.de_oov[token])
        else:
            de_tokens.append(token)
    # Keep only the tokens that are still out of vocabulary
    new_en_tokens = [token for token in en_tokens
                     if token not in self.en_dictionary and token not in self.en_oov]
    new_de_tokens = [token for token in de_tokens
                     if token not in self.de_dictionary and token not in self.de_oov]
    new_en_str = " ".join(new_en_tokens)
    new_de_str = " ".join(new_de_tokens)
    if new_en_str == "" or new_de_str == "":
        return 0
    # Get 3-grams
    measure = sm.QgramTokenizer(qval=3)
    en_grams = measure.tokenize(new_en_str)
    de_grams = measure.tokenize(new_de_str)
    # Get the Jaccard similarity of the two 3-gram bags
    measure = sm.Jaccard()
    return measure.get_raw_score(en_grams, de_grams)
def main():
    import pickle
    import py_stringmatching as sm
    from sklearn.feature_extraction.text import TfidfVectorizer

    INSAMPLE_ABS_OUTFILE = '../dataCached/insample_abstracts_outfile'
    OUTSAMPLE_ABS_OUTFILE = '../dataCached/outSample_abstracts_outfile'
    OUTSAMPLE_ABS_REDUCED_OUTFILE = '../dataCached/outSample_abstracts_reduced_outfile'
    a1 = pickle.load(open(INSAMPLE_ABS_OUTFILE, 'rb'))
    a2 = pickle.load(open(OUTSAMPLE_ABS_OUTFILE, 'rb'))
    a3 = pickle.load(open(OUTSAMPLE_ABS_REDUCED_OUTFILE, 'rb'))
    csAbstract = CosSim('Cos Sim Abstract',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
    csSentence = CosSim('Cos Sim Sentence',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
    jacq3 = stringMatchExcerpts('Fuzzy Jaccard', sm.Jaccard(), sm.QgramTokenizer(qval=3))
    components = [csAbstract, csSentence, jacq3]
    a1Features = [c.generateFeatures(a1) for c in components]
    print(len(a1Features))  # print as a function for Python 3
def get_similar_strings(table_path):
    """
    Get the list of strings to be normalized by the value normalizer.

    The current algorithm is as follows:
        1. Deduplicate the given list of strings.
        2. Apply a string similarity join.
        3. Retrieve the top N similar strings returned by the join.

    Arguments:
        table_path: The absolute path of the list of strings.

    Returns:
        similar_strings: Similar strings as returned by the aforementioned
            algorithm.

    Note:
        This logic can be changed to improve the value normalizer part of
        the overall application.
    """
    A = pd.read_csv(table_path)
    B = pd.read_csv(table_path)
    qg3_tok = sm.QgramTokenizer(qval=3)
    output_pairs = ssj.jaccard_join(A, B, 'id', 'id', 'foo', 'foo', qg3_tok, 0.6,
                                    l_out_attrs=['foo'], r_out_attrs=['foo'])
    considered_pairs = []
    similar_strings = []
    for index, row in output_pairs.iterrows():
        # keep near-duplicates but skip exact matches (the self-join scores 1.0)
        if 0.6 < row['_sim_score'] < 1.0:
            if row['l_foo'] not in similar_strings:
                similar_strings.append(row['l_foo'])
            if row['r_foo'] not in similar_strings:
                similar_strings.append(row['r_foo'])
            if len(similar_strings) >= 21:
                break
    similar_strings.sort()
    return similar_strings
import os

import numpy as np


def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3, cosM, cosMq3, LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self, insampleData, outsampleData, dataFolder):
        self.insampleData = insampleData    # pairs, labels, pairedAbstracts, pairedTitles
        self.outsampleData = outsampleData  # pairs, labels, pairedAbstracts, pairedTitles
        self.dataFolder = dataFolder
import pandas as pd
import py_stringmatching as sm

from .util import suffix
from remp import string_matching

tokenizer = sm.QgramTokenizer(qval=2, return_set=True)
jaccard = sm.Jaccard()


def similarity_func_default(string1, string2):
    return jaccard.get_sim_score(tokenizer.tokenize(string1),
                                 tokenizer.tokenize(string2))


def construct_similarity_list(left_triples, right_triples, entity_candidates,
                              aligned_attributes=None, similarity_func=None):
    if aligned_attributes is None:
        shared_attributes = set(left_triples['a'].unique())
        shared_attributes &= set(right_triples['a'].unique())
        shared_attributes = list(shared_attributes)
        aligned_attributes = pd.DataFrame({
            'a1': shared_attributes,
            'a2': shared_attributes
        })
    if 'attr_id' not in aligned_attributes:
        aligned_attributes['attr_id'] = aligned_attributes.index
    paired = pd.merge(entity_candidates, suffix(left_triples, '1'))
    paired = pd.merge(paired, aligned_attributes)
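A quick check of the default similarity function defined above; the score is the Jaccard overlap of padded bigram sets (value computed by hand for these two strings).

similarity_func_default('Jon Smith', 'John Smith')  # 9 shared bigrams of 12 -> 0.75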
def __init__(self):
    self.dice = py_stringmatching.Dice()
    self.tokenizer = py_stringmatching.QgramTokenizer(qval=3)
from typing import Callable, List, Dict, Tuple, Sequence, NewType
from dataclasses import dataclass
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import py_stringsimjoin as ssj
import py_stringmatching as sm

WS = sm.WhitespaceTokenizer(return_set=True)
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)


def simjoin_top_k_pd(routine, params, k_max, thresh=None, suppress=True, early_break=True):
    knn_results = defaultdict(list)
    for k in range(1, k_max + 1):
        ret_avg, ret_count, all_avg, all_count, MRR, retrieved = routine(*params, k=k, thresh=thresh)
        if not suppress:
            print(f"k: {k} \t ret avg: {ret_avg} \t ret count: {ret_count} \t "
                  f"all avg: {all_avg} \t all count: {all_count} \t "
                  f"MRR: {MRR} \t retrieved: {retrieved}")
        knn_results['k'].append(k)
        knn_results['ret_avg'].append(ret_avg)
        knn_results['ret_count'].append(ret_count)
        knn_results['all_avg'].append(all_avg)
        knn_results['all_count'].append(all_count)
        knn_results['MRR'].append(MRR)
        knn_results['retrieved'].append(retrieved)
        if early_break and ret_avg == 1.0 and all_avg == 1.0:
            break
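A hypothetical invocation of the sweep above; `knn_routine`, `queries`, and `candidates` are placeholders for whatever retrieval routine and data are being evaluated.

results = simjoin_top_k_pd(knn_routine, (queries, candidates), k_max=10,
                           thresh=0.3, suppress=False)
results_df = pd.DataFrame(results)  # one row per k with the collected stats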
def tok_qgram(input_string, q):
    if pd.isnull(input_string):
        return np.nan  # np = numpy; pd.np was removed in pandas >= 2.0
    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)
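Expected behaviour of the helper above, assuming pandas and py_stringmatching are imported as pd and sm.

tok_qgram('data', q=2)  # ['#d', 'da', 'at', 'ta', 'a$']
tok_qgram(None, q=2)    # nan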
import pandas as pd
import py_stringmatching as sm
from py_stringmatching import utils
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, classification_report, confusion_matrix)
# Incomplete list of necessary imports; we will import the remaining
# scikit-learn packages when we get there.

# Initialize the q-gram tokenizer
qg3_tok_set = sm.QgramTokenizer(qval=3, return_set=True)

# Initialize similarity score calculators
jac = sm.Jaccard()
oc = sm.OverlapCoefficient()

# Read the CSV into a DataFrame
gold_raw_data = pd.read_csv('gold.csv', low_memory=False)

# Extract the gold labels from the DataFrame.
# This becomes an input into our learning algorithms.
gold_labels = gold_raw_data['Match?']

# This is our feature vector table, another input into our learning
# algorithms. We add feature vectors from within the for loop that
# iterates over the DataFrame.
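The feature-construction loop itself is not shown; a minimal sketch under the stated plan might look like the following, where `ltable_name` and `rtable_name` are hypothetical column names.

feature_vectors = []
for _, row in gold_raw_data.iterrows():
    l_toks = qg3_tok_set.tokenize(str(row['ltable_name']))  # hypothetical column
    r_toks = qg3_tok_set.tokenize(str(row['rtable_name']))  # hypothetical column
    feature_vectors.append([jac.get_sim_score(l_toks, r_toks),
                            oc.get_sim_score(l_toks, r_toks)])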
import py_stringmatching as sm

alnum_tok = sm.AlphanumericTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)
jac = sm.Jaccard()
lev = sm.Levenshtein()


def calcola_similarita(string1, string2):
    a = jac.get_sim_score(alnum_tok.tokenize(string1), alnum_tok.tokenize(string2))
    b = lev.get_sim_score(string1, string2)
    c = jac.get_sim_score(qg3_tok.tokenize(string1), qg3_tok.tokenize(string2))
    return [{"alnum_jac": a}, {"alnum_lev": b}, {"qg3_jac": c}]


def add_features(elem):
    line, count = elem
    title1 = line[4]
    director1 = line[3]
    date1 = line[5]
    title2 = line[7]
    director2 = line[6]
    date2 = line[8]
    return (line + (calcola_similarita(title1, title2),
                    calcola_similarita(director1, director2),
                    calcola_similarita(date1, date2)), count)


def precision(true):
    tp = true.map(lambda row: ("true", row.response))\
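calcola_similarita ("compute similarity") returns one single-entry dict per measure; an illustrative call with hypothetical titles, scores worked out by hand:

calcola_similarita('the godfather', 'the godfather part ii')
# -> [{'alnum_jac': 0.5},        # word sets share 2 of 4 tokens
#     {'alnum_lev': 0.619...},   # 1 - 8 edits / 21 chars
#     {'qg3_jac': 0.5}]          # padded trigram sets share 12 of 24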
SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb'))
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True))
diceq3 = FVC.stringMatchExcerpts('FuzzDice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    'cosine': sm.Cosine,
    'dice': sm.Dice,
    'generalized_jaccard': sm.GeneralizedJaccard,
    'jaccard': sm.Jaccard,
    'overlap_coefficient': sm.OverlapCoefficient,
    'tversky_index': sm.TverskyIndex,
    # Corpus
    'tfidf': sm.TfIdf,
    'soft_tfidf': sm.SoftTfIdf,
}

tokenizer_lookup = {
    # Character q-gram tokenizers
    '1gram': sm.QgramTokenizer(qval=1),
    '1grams': sm.QgramTokenizer(qval=1),
    '2grams': sm.QgramTokenizer(qval=2),
    '3grams': sm.QgramTokenizer(qval=3),
    '4grams': sm.QgramTokenizer(qval=4),
    '5grams': sm.QgramTokenizer(qval=5),
    '6grams': sm.QgramTokenizer(qval=6),
    '7grams': sm.QgramTokenizer(qval=7),
    '8grams': sm.QgramTokenizer(qval=8),
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),
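Assuming lookup tables like the ones above, a caller can resolve a measure and a tokenizer by name; `measure_lookup` is a placeholder for the unnamed first dict, whose values are classes and so must be instantiated.

tok = tokenizer_lookup['3grams_set']
jac = measure_lookup['jaccard']()  # hypothetical name for the measure dict
jac.get_sim_score(tok.tokenize('database'), tok.tokenize('databse'))  # 7/12 ≈ 0.58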