def soft_tfidf(text, hypo):
    global tf
    text = ' '.join(text)
    hypo = ' '.join(hypo)
    text_resp = tf.transform([text])
    hypo_resp = tf.transform([hypo])
    valores_tf_idf_text = []
    valores_tf_idf_hypo = []
    features_names_text = []
    features_names_hypo = []
    feature_names = tf.get_feature_names()
    # Collect the non-zero TF-IDF terms and weights for each side
    for col in text_resp.nonzero()[1]:
        features_names_text.append(feature_names[col])
        valores_tf_idf_text.append(text_resp[0, col])
    for col in hypo_resp.nonzero()[1]:
        features_names_hypo.append(feature_names[col])
        valores_tf_idf_hypo.append(hypo_resp[0, col])
    # Soft TF-IDF: term pairs that are close under Jaro-Winkler contribute
    # their similarity weighted by both TF-IDF scores
    valor = 0
    for i, word_t in enumerate(features_names_text):
        for j, word_h in enumerate(features_names_hypo):
            sim = jaro.jaro_winkler_metric(word_t, word_h)
            if sim > 0.9:
                valor += sim * valores_tf_idf_text[i] * valores_tf_idf_hypo[j]
    return valor
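# A minimal usage sketch for soft_tfidf (the corpus below is illustrative).
# It assumes the module-level `tf` is an already-fitted sklearn
# TfidfVectorizer from a version that still provides get_feature_names(),
# and that both arguments arrive pre-tokenised.
import jaro
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tf.fit(["the cat sat on the mat", "a dog slept on the rug"])

# Token pairs with Jaro-Winkler similarity above 0.9 contribute the
# product of their TF-IDF weights to the soft similarity score.
print(soft_tfidf(["the", "cat", "sat"], ["the", "cats", "sat"]))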
def jaro_winkler(__str1, __str2):
    # Promote byte strings to unicode (Python 2) before scoring
    def to_unicode(__str):
        if isinstance(__str, unicode):
            return __str
        return unicode(__str)

    return jaro.jaro_winkler_metric(to_unicode(__str1), to_unicode(__str2))
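# Usage sketch (Python 2, since the wrapper relies on the built-in
# `unicode` type): byte strings are promoted to unicode before scoring.
import jaro
print jaro_winkler('MARTHA', u'MARHTA')  # ~0.96 on the classic example pair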
def create_similarity_matrix(sentences):
    '''
    Create an N x N similarity matrix, where N is the number of
    sentences and the similarity of each pair of sentences is their
    Jaro-Winkler score.

    params:
        sentences (List -> String): the sentences to build the
            similarity matrix from.

    returns:
        A square numpy matrix.
    '''
    sentence_length = len(sentences)
    sim_mat = np.zeros((sentence_length, sentence_length))
    for i in range(sentence_length):
        for j in range(sentence_length):
            if i != j:
                sim_mat[i][j] = jaro.jaro_winkler_metric(sentences[i], sentences[j])
    return sim_mat
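# Usage sketch, assuming `np` is numpy and `jaro` is the jaro module as in
# the function above. Jaro-Winkler is symmetric, so the matrix is symmetric
# with zeros on the diagonal; it can feed e.g. a graph-based ranker.
import numpy as np
import jaro

sentences = ["the quick brown fox", "the quick brown cat", "lorem ipsum"]
print(create_similarity_matrix(sentences))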
def jwSim(v1, v2):
    # Compare normalised (decoded, stripped, lower-cased) strings;
    # fall back to 0.0 when either value cannot be decoded.
    try:
        return jaro.jaro_winkler_metric(v1.decode('utf-8').strip().lower(),
                                        v2.decode('utf-8').strip().lower())
    except (UnicodeDecodeError, AttributeError):
        return 0.
def jaro_w(word, topSimilarity):
    # Keep every dictionary word whose Jaro-Winkler similarity to `word`
    # exceeds 0.5
    nw_candidates = {}
    for c in freqDict:
        similarity = jaro.jaro_winkler_metric(word, c)
        if similarity > 0.5:
            nw_candidates[c] = similarity
    if len(nw_candidates) == 0:
        return [word]
    sorted_candidates_word_freq = sorted(nw_candidates.items(),
                                         key=lambda x: x[1], reverse=True)
    sorted_candidate_word = [w for w, _ in sorted_candidates_word_freq]
    if topSimilarity:
        # The first candidates_nb words, ranked by similarity alone
        return sorted_candidate_word[0:min(candidates_nb, len(sorted_candidate_word) - 1)]
    # Otherwise re-rank the shortlist by word probability P
    sorted_candidate_word = sorted_candidate_word[0:min(candidates_nb, len(sorted_candidate_word) - 1)]
    words = []
    for i in range(min(len(sorted_candidate_word), candidates_nb)):
        words.append(max(sorted_candidate_word, key=P))
        sorted_candidate_word.remove(words[i])
    return words
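# Usage sketch with hypothetical stand-ins for the globals the function
# expects: `freqDict` maps known words to frequencies, `candidates_nb` caps
# the number of suggestions, and `P` scores a candidate word. None of these
# definitions come from the snippet above.
import jaro

freqDict = {"hello": 120, "help": 80, "hollow": 5}
candidates_nb = 3
P = lambda w: freqDict.get(w, 0)

print(jaro_w("helo", topSimilarity=True))   # ranked by similarity alone
print(jaro_w("helo", topSimilarity=False))  # shortlist re-ranked by P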
def match_by_title(query, result_titles, correct_titles):
    import jaro

    # Lower-case all entries first
    correct_titles = map(unicode.lower, correct_titles)
    matched_ids = []
    for rtitle in result_titles:
        rtitle = rtitle.lower()
        best_jw = 0.0
        best_id = 0
        for i, ctitle in enumerate(correct_titles):
            jw = jaro.jaro_winkler_metric(unicode(rtitle), unicode(ctitle))
            if jw > best_jw:
                best_jw = jw
                best_id = i
        # Accept the best candidate only above a 0.9 threshold
        if best_jw >= 0.9:
            matched_ids.append(str(best_id))
        else:
            matched_ids.append('')
    # Create an incremental list of string ids for the correct
    # titles, since they are not in the database
    correct_ids = map(str, range(len(correct_titles)))
    return correct_ids, matched_ids
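# Usage sketch (Python 2): search-result titles are matched against a gold
# list, and matched_ids holds the index of the matched gold title (as a
# string) or '' when nothing clears the 0.9 threshold.
correct = [u'Attention Is All You Need', u'Deep Residual Learning']
results = [u'attention is all you need.', u'an unrelated paper']
correct_ids, matched_ids = match_by_title(u'attention', results, correct)
print matched_ids  # ['0', ''] -- only the near-duplicate title matches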
def findFamilyOrNot(surnames, row):
    # Score every pair of surnames in the room
    result = []
    for a, b in combinations(range(len(surnames)), 2):
        result.append(jr.jaro_winkler_metric(surnames[a], surnames[b]))
    if len(result) == 0:
        # Only one surname: classify by age alone
        if int(row['age']) >= 18:
            return "single_adult"
        else:
            return "single_child"
    elif max(result) > .86:
        return "family"
    else:
        return "probable family or not"
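# Usage sketch: `row` only needs an 'age' key here, and the snippet assumes
# `jr` is the jaro module and `combinations` comes from itertools.
from itertools import combinations
import jaro as jr

print(findFamilyOrNot(['Smith', 'Smyth', 'Smith'], {'age': '40'}))  # family
print(findFamilyOrNot(['Smith'], {'age': '12'}))  # single_child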
def findParenting(row, uiText=True):
    if not row['surnames']:
        return
    surnames = row['surnames']
    if len(surnames) < 2:
        return
    ages = list(map(int, row["roomies"].split(',')))
    ages_abs = [abs(x) for x in ages]
    ages_under18 = [x for x in ages_abs if x < 18]
    age_surname_dict = dict(zip(ages_abs, surnames))
    parent_type = set()
    if (row['age'] >= 18) and (len(ages_under18) > 0):
        # An adult sharing a sufficiently similar surname with a minor in
        # the room is treated as a likely parent
        for child_age in ages_under18:
            if jr.jaro_winkler_metric(row['surname'], age_surname_dict[child_age]) > .80:
                parent_type.add(parentingType(child_age, uiText))
    if len(parent_type):
        return parent_type.pop() if uiText else "/".join(parent_type)
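# Usage sketch with a hypothetical parentingType helper (not defined in the
# snippet above) that labels a parent from the child's age.
import jaro as jr

def parentingType(child_age, uiText):
    return "parent_of_minor" if uiText else str(child_age)

row = {'surname': 'Garcia', 'surnames': ['Garcia', 'Garcia'],
       'roomies': '35,-7', 'age': 35}
print(findParenting(row))  # parent_of_minor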
def jaro_winkler(s1: str, s2: str):
    return jaro.jaro_winkler_metric(s1, s2)
import jaro

with open("password1_analysis.txt", "w") as writer:
    writer.write("actual, guess, jaro-winkler\n")
    total = 0.0
    number = 0.0
    with open("password1.txt", "r") as f:
        f.readline()  # skip the header line
        for line in f.readlines():
            parts = line.split()
            actual = parts[1]
            guess = parts[2]
            jaro_winkler = jaro.jaro_winkler_metric(unicode(actual), unicode(guess))
            total += jaro_winkler
            number += 1
            writer.write(actual + ", " + guess + ", " + str(jaro_winkler) + "\n")
    writer.write("Average: " + str(total / number))
def recommendPaper(request):
    # arXiv category names, indexed by the user's focus ids (1-based)
    cats = [
        'astro-ph.GA', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.HE',
        'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn', 'cond-mat.mes-hall',
        'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft',
        'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con',
        'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math.MP',
        'nlin.AO', 'nlin.CG', 'nlin.CD', 'nlin.SI', 'nlin.PS',
        'nucl-ex', 'nucl-th', 'physics.acc-ph', 'physics.app-ph',
        'physics.atom-ph', 'physics.atm-clus', 'physics.bio-ph',
        'physics.chem-ph', 'physics.class-ph', 'physics.comp-ph',
        'physics.data-an', 'physics.flu-dyn', 'physics.gen-ph',
        'physics.geo-ph', 'physics.hist-ph', 'physics.ins-det',
        'physics.med-ph', 'physics.optics', 'physics.ed-ph',
        'physics.soc-ph', 'physics.plasm-ph', 'physics.pop-ph',
        'physics.space-ph', 'quant-ph', 'math.AG', 'math.AT', 'math.AP',
        'math.CT', 'math.CA', 'math.CO', 'math.AC', 'math.CV', 'math.DG',
        'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GT', 'math.GR',
        'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MP', 'math.MG',
        'math.NT', 'math.NA', 'math.OA', 'math.OC', 'math.PR', 'math.QA',
        'math.RT', 'math.RA', 'math.SP', 'math.ST', 'math.SG',
        'cs.AI', 'cs.CL', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.GT', 'cs.CV',
        'cs.CY', 'cs.CR', 'cs.DS', 'cs.DB', 'cs.DL', 'cs.DM', 'cs.DC',
        'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.AR', 'cs.HC', 'cs.IR',
        'cs.IT', 'cs.LO', 'cs.LG', 'cs.MS', 'cs.MA', 'cs.MM', 'cs.NI',
        'cs.NE', 'cs.NA', 'cs.OS', 'cs.OH', 'cs.PF', 'cs.PL', 'cs.RO',
        'cs.SI', 'cs.SE', 'cs.SD', 'cs.SC', 'cs.SY',
        'q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC',
        'q-bio.OT', 'q-bio.PE', 'q-bio.QM', 'q-bio.SC', 'q-bio.TO',
        'q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM',
        'q-fin.PR', 'q-fin.RM', 'q-fin.ST', 'q-fin.TR',
        'stat.AP', 'stat.CO', 'stat.ML', 'stat.ME', 'stat.OT', 'stat.TH',
        'eess.AS', 'eess.IV', 'eess.SP', 'eess.SY',
        'econ.EM', 'econ.GN', 'econ.TH',
        'physics', 'astro-ph', 'cond-mat', 'nlin', 'physics', 'math',
        'cs', 'CoRR', 'q-bio', 'q-fin', 'stat', 'eess', 'econ'
    ]
    user = request.GET.get('user')
    try:
        obj = models.UserModel.objects.get(userName=user)
    except models.UserModel.DoesNotExist:
        return HttpResponse("invalid username")
    # Summaries of papers the user has collected (at most 20)
    sums = []
    collectDict = ast.literal_eval(obj.collectDict)
    for (k, v) in collectDict.items():
        sums.append(v['summary'])
    sums = sums[:20]
    # Map the user's focus ids to category names, then pick one at random
    fields = []
    focusList = obj.focusList
    for f in focusList:
        if int(f) < 154:
            fields.append(cats[int(f) - 1])
    random.shuffle(fields)
    fields = fields[:1]
    # Fetch the ten newest submissions in the chosen category
    papers = []
    for cat in fields:
        print(cat)
        url = ("http://export.arxiv.org/api/query?search_query=cat:" + cat +
               "&sortBy=submittedDate&sortOrder=descending&max_results=10")
        res = requests.get(url)
        soup = bs(res.text, 'lxml')
        for entry in soup.find_all('entry'):
            newPaper = Paper()
            newPaper.id = entry.id.string
            newPaper.updatedTime = entry.updated.string
            newPaper.publishedTime = entry.published.string
            newPaper.title = entry.title.string
            newPaper.summary = entry.summary.string
            for author in entry.find_all('author'):
                newPaper.author.append(author.contents[1].string)
            for category in entry.find_all('category'):
                newPaper.category.append(category.get('term'))
            for link in entry.find_all('link'):
                if link.get('title') == 'doi':
                    newPaper.doiLink = link.get('href')
                elif link.get('title') == 'pdf':
                    newPaper.pdfLink = link.get('href')
                else:
                    newPaper.paperLink = link.get('href')
            papers.append(newPaper)
    # Score each candidate against the user's collected summaries
    for paper in papers:
        for summary in sums:
            paper.score += jaro.jaro_winkler_metric(paper.summary, summary)
    # Most similar papers first, capped at ten
    papers = sorted(papers, key=attrgetter('score'), reverse=True)
    if len(papers) > 10:
        papers = papers[:10]
    res = dict()
    res['resCode'] = 200
    res['len'] = len(papers)
    res['papers'] = []
    for paper in papers:
        p = dict()
        p['author'] = ''
        for author in paper.author:
            p['author'] += (author + '/')
        p['id'] = paper.id
        p['updatedTime'] = paper.updatedTime
        p['title'] = paper.title
        p['summary'] = paper.summary
        p['category'] = ''
        for category in paper.category:
            p['category'] += (category + '/')
        p['doiLink'] = paper.doiLink
        p['paperLink'] = paper.paperLink
        p['pdfLink'] = paper.pdfLink
        res['papers'].append(p)
    return HttpResponse(json.dumps(res))
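# A stripped-down sketch of the scoring step above, outside Django: a
# candidate paper's score is the summed Jaro-Winkler similarity between its
# summary and every summary the user has collected. All data here is
# illustrative.
import jaro

saved = ["we study graph neural networks", "a survey of transformers"]
candidates = {"paper A": "graph neural networks for molecules",
              "paper B": "a recipe for sourdough bread"}
scores = {title: sum(jaro.jaro_winkler_metric(summary, s) for s in saved)
          for title, summary in candidates.items()}
print(sorted(scores, key=scores.get, reverse=True))  # best matches first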
def jwm(x, y):
    # Case-insensitive Jaro-Winkler on unicode-promoted strings (Python 2)
    return jaro_winkler_metric(unicode(x.lower()), unicode(y.lower()))
def get(folder, host, user, password, database, incremental_ind):
    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    punctuation = "( + ) [ ? : ! . ; ] * # % ` ' / _ = -".split()
    punctuation.append('"')

    ### SETUP MAJOR VARS
    fdmain = folder + "/location_disambiguation/"  # need to figure out what this is
    # separate first (0) and incremental (1) disambiguations
    incremental = incremental_ind

    # Step 1: dump raw locations and city/state counts per country
    mydb = MySQLdb.connect(host, user, password, database)
    cursor = mydb.cursor()
    if incremental == 0:
        increm = ''
    else:
        increm = ' AND (location_id is NULL or location_id = "")'
    print "Step 1..."
    cursor.execute('select distinct country_transformed from rawlocation where country_transformed is not NULL and country_transformed != "" and country_transformed!="s" and country_transformed!="B." and country_transformed!="omitted" ' + increm)
    countries = [item[0] for item in cursor.fetchall() if item[0] is not None]
    print countries
    os.makedirs(fdmain)
    os.makedirs(fdmain + 'uspto_disamb/')
    os.makedirs(fdmain + 'uspto_disamb_counts/')
    os.makedirs(fdmain + 'uspto_disamb_v2/')
    os.makedirs(fdmain + 'uspto_disamb_loc_latlong/')
    os.makedirs(fdmain + 'uspto_disamb_only_loc/')
    for c in countries:
        print c
        output = open(fdmain + 'uspto_disamb/' + c + '.tsv', 'wb')
        output2 = open(fdmain + 'uspto_disamb_counts/' + c + '.tsv', 'wb')
        outp = csv.writer(output, delimiter='\t')
        outp2 = csv.writer(output2, delimiter='\t')
        # City/state counts, most frequent first
        cursor.execute("select city,state,country_transformed,count(city) from rawlocation where country_transformed = '" + c + "'" + increm + " group by city,state order by count(city) desc")
        outp2.writerows(cursor.fetchall())
        cursor.execute('select distinct state from rawlocation where country_transformed = "' + c + '"' + increm)
        states = [f[0] for f in cursor.fetchall()]
        for s in states:
            if str(s) == 'None' or str(s) == 'NULL':
                cursor.execute('select id,city from rawlocation where country_transformed = "' + c + '" and (state is NULL or state="NULL")' + increm)
                s = ''
            else:
                s = re.sub('[\n\t\f\r]+', '', s.strip())
                cursor.execute('select id,city from rawlocation where country_transformed = "' + c + '" and state ="' + s + '"' + increm)
            locs = [list(f) for f in cursor.fetchall()]
            for l in locs:
                ll = []
                for l1 in l:
                    if l1:
                        ll.append(re.sub('[\n\t\r\f]+', '', l1.strip()))
                    else:
                        ll.append('')
                outp.writerow(ll + [s, c])
        output.close()
        output2.close()
    print "Step 2..."
    # Step 2: build a master lookup of canonical city/state spellings from
    # the most frequent entries in each country's counts file
    fd = fdmain + 'uspto_disamb_counts/'
    diri = os.listdir(fd)
    mastdata = {}
    mastdatum = {}
    # Initialised separately from the loop below; otherwise places that
    # are in the wrong file break it
    for d in diri:
        mastdata[d.replace('.tsv', '')] = {}
        mastdatum[d.replace('.tsv', '')] = {}
    for d in diri:
        input = open(fd + d, 'rb')
        inp = csv.reader(input, delimiter='\t')
        try:
            head = inp.next()
            top = int(head[-1])
        except:
            pass
        num = 1
        for i in inp:
            num += 1
        inp = csv.reader(file(fd + d), delimiter='\t')
        for e, i in enumerate(inp):
            # Only trust rows in the top third of the file whose count is
            # above a fifth of the most frequent spelling
            if e <= int(num/3) and int(i[-1]) > int(top/5):
                city = unidecode(i[0])
                for p in punctuation:
                    city = city.replace(p, '')
                city = re.sub('[0-9]+', '', city)
                city = re.sub('^\s+', '', city)
                city = re.sub('\s+$', '', city)
                city = city.replace(' ', '')
                state = i[1]
                state = re.sub('^\s+', '', state)
                state = re.sub('\s+$', '', state)
                country = i[2]
                key = id_generator(size=12)
                try:
                    gg = mastdata[country][city.lower() + '_' + state.lower()]
                except:
                    mastdata[country][city.lower() + '_' + state.lower()] = [key, i[0].strip(), i[1].strip(), i[2], int(i[3])]
                    mastdatum[country][city.lower()] = [key, i[0], i[1].strip(), i[2].strip(), int(i[3])]
        input.close()

    print "Step 3..."
    # Step 3: match every raw location against the master lookup, falling
    # back through progressively looser heuristics
    fd = fdmain + 'uspto_disamb/'
    diri = os.listdir(fd)
    for d in diri:
        output = open(fdmain + 'uspto_disamb_v2/' + d, 'wb')
        input = open(fd + d, 'rb')
        outp = csv.writer(output, delimiter='\t')
        inp = csv.reader(input, delimiter='\t')
        data = mastdata[d.replace('.tsv', '')]
        datum = mastdatum[d.replace('.tsv', '')]
        secdata = {}
        secdatum = {}
        for i in inp:
            city = unidecode(i[1])
            state = i[2]
            country = i[3]
            for p in punctuation:
                city = city.replace(p, '')
            city = re.sub('[0-9]+', '', city)
            city = re.sub('^\s+', '', city)
            city = re.sub('\s+$', '', city)
            origcity = city
            city = city.replace(' ', '')
            try:
                # Exact city_state match
                gg = data[city.lower() + '_' + state.lower()]
                outp.writerow(i + gg)
            except:
                try:
                    # Text before a comma
                    cit = city.lower().split(",")[0]
                    gg = data[cit.lower() + '_' + state.lower()]
                    outp.writerow(i + gg)
                except:
                    try:
                        # Any slash-separated part
                        cit = city.lower().split("/")
                        for cc in cit:
                            gg = data[cc.lower() + '_' + state.lower()]
                            outp.writerow(i + gg)
                            break
                    except:
                        try:
                            # Any dash-separated part
                            cit = city.lower().split("-")
                            for cc in cit:
                                gg = data[cc.lower() + '_' + state.lower()]
                                outp.writerow(i + gg)
                                break
                        except:
                            try:
                                # Text before an ampersand
                                cit = city.lower().split("&")[0]
                                gg = data[cit.lower() + '_' + state.lower()]
                                outp.writerow(i + gg)
                            except:
                                try:
                                    # City name alone, ignoring state
                                    gg = datum[city.lower()]
                                    outp.writerow(i + gg)
                                except:
                                    try:
                                        # Fuzzy match: prefix containment,
                                        # Jaro-Winkler >= 0.95, or small edit
                                        # distance on a long enough name
                                        howdy = 0
                                        for k, v in data.items():
                                            dist = jaro.jaro_winkler_metric((city.lower() + '_' + state.lower()).decode('utf-8', 'ignore'), k.decode('utf-8', 'ignore'))
                                            edit = nltk.edit_distance(city.lower() + '_' + state.lower(), k)
                                            if (re.search(k.split("_")[0], city.lower()) and k.split("_")[0] != '') or dist >= 0.95 or (edit == 2 and len(city.lower()) > 5):
                                                outp.writerow(i + v)
                                                howdy = 1
                                                break
                                        gg = datum[city]
                                    except:
                                        if howdy == 0:
                                            # Try long words of the original city name
                                            cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                            howdy2 = 0
                                            for cc in cit:
                                                try:
                                                    gg = datum[cc]
                                                    outp.writerow(i + gg)
                                                    howdy2 = 1
                                                    break
                                                except:
                                                    pass
                                            if howdy2 == 0:
                                                # Repeat the lookups against the secondary
                                                # tables built from unmatched rows
                                                try:
                                                    gg = secdata[city.lower() + '_' + state.lower()]
                                                    outp.writerow(i + gg)
                                                except:
                                                    try:
                                                        cit = city.lower().split(",")[0]
                                                        gg = secdata[cit.lower() + '_' + state.lower()]
                                                        outp.writerow(i + gg)
                                                    except:
                                                        try:
                                                            cit = city.lower().split("&")[0]
                                                            gg = secdata[cit.lower() + '_' + state.lower()]
                                                            outp.writerow(i + gg)
                                                        except:
                                                            try:
                                                                gg = secdatum[city.lower()]
                                                                outp.writerow(i + gg)
                                                            except:
                                                                try:
                                                                    howdy = 0
                                                                    gg = datum[city]
                                                                except:
                                                                    if howdy == 0:
                                                                        cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                                                        howdy2 = 0
                                                                        for cc in cit:
                                                                            try:
                                                                                gg = secdatum[cc]
                                                                                outp.writerow(i + gg)
                                                                                howdy2 = 1
                                                                                break
                                                                            except:
                                                                                pass
                                                                        if howdy2 == 0:
                                                                            # Still unmatched: mint a new id and
                                                                            # remember it in the secondary tables
                                                                            key = id_generator(size=12)
                                                                            secdata[city.lower() + '_' + state.lower()] = [key, i[1], i[2], i[3]]
                                                                            secdatum[city.lower()] = [key, i[1], i[2], i[3]]
                                                                            outp.writerow(i + [key, i[1], i[2], i[3]])
        input.close()
        output.close()

    print "Step 4..."
    # Step 4: collapse duplicate disambiguated locations and write out the
    # unique ones
    fd = fdmain + 'uspto_disamb_v2/'
    fd3 = fdmain + 'uspto_disamb_only_loc/'
    diri = os.listdir(fd)
    for d in diri:
        input = open(fd + d, 'rb')
        output = open(fd3 + d, 'wb')
        inp = csv.reader(input, delimiter='\t')
        outp2 = csv.writer(output, delimiter='\t')
        data = {}
        final = {}
        disamb = {}
        for i in inp:
            try:
                gg = data[' '.join(i[5:])]
                final[i[0]] = i[:4] + [gg] + i[5:]
            except:
                try:
                    data[' '.join(i[5:])] = i[4]
                    final[i[0]] = i
                    disamb[i[4]] = i[4:]
                except:
                    print d, i
        input.close()
        for k, v in disamb.items():
            if len(v) == 5:
                v = v[:-1]
            outp2.writerow(v)
        output.close()
    print "Done Steps 1 - 4"
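# A compact Python 3 sketch of the fuzzy fallback used in Step 3 above,
# keeping only its two distance rules: a raw "city_state" key matches a
# canonical key when the Jaro-Winkler score is at least 0.95, or the edit
# distance is exactly 2 on a reasonably long name. Inputs are hypothetical.
import jaro
import nltk

def fuzzy_match(raw_key, canonical_keys):
    for k in canonical_keys:
        if (jaro.jaro_winkler_metric(raw_key, k) >= 0.95 or
                (nltk.edit_distance(raw_key, k) == 2 and len(raw_key) > 5)):
            return k
    return None

print(fuzzy_match("sanfrancsico_ca", ["sanfrancisco_ca", "sandiego_ca"]))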