def triangulate(self, tweet, loc):
    """Try to corroborate *tweet* against three cleaned corpora.

    Compares the tweet's 2-gram cosine profile against (1) cleaned RSS
    headlines, (2) retweets tagged with the same location, and (3) other
    tweets tagged with the same location.  Returns True on the first
    comparison whose similarity exceeds 0.7, False if nothing matches.

    NOTE(review): the PASS print statements in the original were masked by a
    credential scanner ('******') and were a syntax error; reconstructed here
    to mirror the FAIL branch: '[PASS: <score>] <text>'.
    """
    print('Triangulating: ' + tweet)
    cosine = Cosine(2)
    cos_tweet = cosine.get_profile(tweet)

    # 1) RSS headlines -- location-independent comparison.
    with open("clean/clean_rss.txt", "r") as clean_rss:
        for rss in clean_rss:
            rss = rss.split('\n')[0]
            cos_rss = cosine.get_profile(rss)
            cos_result = cosine.similarity_profiles(cos_tweet, cos_rss)
            if cos_result > 0.7:
                print('\t[PASS: ' + str(cos_result) + '] ' + rss)
                return True
            else:
                print('\t[FAIL: ' + str(cos_result) + '] ' + rss)

    # 2) Retweets -- the last space-separated token on each line is taken to
    #    be a location tag; only same-location lines are compared.
    with open("clean/clean_retweet.txt", "r") as clean_rt:
        for rtweet in clean_rt:
            rt = rtweet.rsplit(' ', 1)[0]
            rt_loc = rtweet.split('\n')[0].rsplit(' ', 1)[1]
            cos_rt = cosine.get_profile(rt)
            if loc == rt_loc:
                cos_result = cosine.similarity_profiles(cos_tweet, cos_rt)
                if cos_result > 0.7:
                    print('\t[PASS: ' + str(cos_result) + '] ' + rt)
                    return True
                else:
                    print('\t[FAIL: ' + str(cos_result) + '] ' + rt)

    # 3) Other tweets -- an exact-duplicate score of 1.0 is excluded so the
    #    tweet cannot corroborate itself.
    with open('clean/clean_tweet.txt', 'r') as clean_tweet:
        for ctweet in clean_tweet:
            ct = ctweet.rsplit(' ', 1)[0]
            ct_loc = ctweet.split('\n')[0].rsplit(' ', 1)[1]
            cos_ct = cosine.get_profile(ct)
            if loc == ct_loc:
                cos_result = cosine.similarity_profiles(cos_tweet, cos_ct)
                if cos_result > 0.7 and cos_result != 1.0:
                    print('\t[PASS: ' + str(cos_result) + '] ' + ct)
                    return True
                else:
                    print('\t[FAIL: ' + str(cos_result) + '] ' + ct)

    print('\tNo matching results found...')
    return False
def get_similarity_score():
    """Return the mean 2-gram cosine similarity between corresponding lines
    of 'model_res' and 'real_res'.

    Fixes in this revision: the original opened both files without ever
    closing them (resource leak) and indexed with range(len(...)), which
    raised IndexError when 'real_res' had fewer lines; `with` + `zip`
    handle both.
    """
    cosine = Cosine(2)
    with open('model_res') as model_file, open('real_res') as real_file:
        model_lines = [line.replace("\n", "") for line in model_file]
        real_lines = [line.replace("\n", "") for line in real_file]
    total = sum(
        cosine.similarity_profiles(cosine.get_profile(m), cosine.get_profile(r))
        for m, r in zip(model_lines, real_lines)
    )
    # Average over the model lines, as in the original.
    return total / len(model_lines)
def title_similarity(self, page):
    """Similarity between this object's reference title and *page*'s title.

    Computes cosine similarity on both character trigram profiles and
    unigram profiles of the normalized titles and returns their average.
    Returns 0 when either profile is empty (ZeroDivisionError inside the
    cosine computation).

    BUG FIX: the original computed the trigram similarity and then
    immediately overwrote it with the unigram similarity before dividing by
    2, so the trigram score was silently discarded; the `/ 2` only makes
    sense as the average of the two scores.
    """
    try:
        s1 = normalize(self.reference.title)
        s2 = normalize(page.title)
        n = 3
        p1_trigrams = Counter(nltk.ngrams(s1, n))
        p2_trigrams = Counter(nltk.ngrams(s2, n))
        p1_grams = Counter(nltk.ngrams(s1, 1))
        p2_grams = Counter(nltk.ngrams(s2, 1))
        cosine = Cosine(1)
        trigram_sim = cosine.similarity_profiles(p1_trigrams, p2_trigrams)
        unigram_sim = cosine.similarity_profiles(p1_grams, p2_grams)
        similarity = (trigram_sim + unigram_sim) / 2
    except ZeroDivisionError:
        similarity = 0
    return similarity
def match(num):
    """Return a space-separated string of licence plates from 'lic.csv' whose
    2-gram cosine similarity to *num* is at least 0.65.

    Fixes in this revision: the file handle is now closed via `with`, and the
    query profile `p0` is built once instead of being recomputed on every
    loop iteration (it does not depend on the loop variable).
    """
    from similarity.cosine import Cosine

    # Sentinel values that mark empty/placeholder rows to skip.
    stop_plate = [0, "", "0"]
    suspected = ''
    cosine = Cosine(2)
    p0 = cosine.get_profile(str(num))
    with open('lic.csv') as plates:
        for plate in plates:
            plate = plate.replace("\n", "")
            if plate in stop_plate:
                continue
            print(plate)
            p1 = cosine.get_profile(plate)
            if cosine.similarity_profiles(p0, p1) >= 0.65:
                suspected += plate + " "
    return suspected
def footer(df, kgram=2, TOP_LINES=5):
    """Flag repeated page-bottom lines of *df* as footers.

    For every page after the first, the last TOP_LINES text rows are compared
    against the corresponding bottom rows of the previous page.  A row is
    marked `isFooter` when it is a bare page number (all digits) or when its
    kgram-shingle cosine similarity with the matching row on the previous
    page exceeds 0.9.  Returns *df* with the added boolean 'isFooter' column.
    """
    df['isFooter'] = False
    comparator = Cosine(kgram)
    pages = df['page'].unique()
    for page in pages[1:]:
        rows_prev = df.index[df['page'] == (page - 1)]
        rows_curr = df.index[df['page'] == page]
        for offset in range(TOP_LINES):
            # Walk upward from the bottom of each page.
            row_prev = rows_prev[-(offset + 1)]
            row_curr = rows_curr[-(offset + 1)]
            text_prev = df.loc[row_prev, 'text']
            text_curr = df.loc[row_curr, 'text']
            # Pure page numbers are footers on their own; once either side is
            # a number (or a string too short to shingle) skip the cosine test.
            numeric_hit = False
            if text_prev.isdigit():
                df.loc[row_prev, 'isFooter'] = True
                numeric_hit = True
            if text_curr.isdigit():
                df.loc[row_curr, 'isFooter'] = True
                numeric_hit = True
            if numeric_hit or len(text_prev) < kgram or len(text_curr) < kgram:
                continue
            similarity = comparator.similarity_profiles(
                comparator.get_profile(text_prev),
                comparator.get_profile(text_curr))
            if similarity > 0.9:
                df.loc[row_prev, 'isFooter'] = True
                df.loc[row_curr, 'isFooter'] = True
    return df
def met_cosine(s1, s2, n):
    """Cosine similarity between *s1* and *s2* over n-gram shingle profiles."""
    engine = Cosine(n)
    return engine.similarity_profiles(engine.get_profile(s1),
                                      engine.get_profile(s2))
# Build an n-gram profile for each record's first field (surrounding quotes
# stripped); `data`, `profiles`, `cosine` and `json` come from earlier in the file.
for record in data:
    profiles.append(cosine.get_profile(record[0].strip('"')))
for profile in profiles:
    print(profile)

# For each profile, sum its similarity against every profile (itself included,
# as in the original -- each row therefore carries a constant +1.0 self term).
profile_sim = []
for i, profile in enumerate(profiles):
    sim_score = 0
    for other in profiles:
        sim_score += cosine.similarity_profiles(profile, other)
    profile_sim.append([i, sim_score])
    print(i)

# `with` already closes the file; the original's explicit close() inside the
# block was redundant and has been removed.
with open('cosine_similarity.txt', 'w', encoding="ISO-8859-1") as outfile:
    json.dump(profile_sim, outfile)
# --- Ad-hoc driver: print similarity metrics, then run the prediction pipeline. ---
# NOTE(review): `input_list`, `counter`, `similar_text_score`, `prediction`, the
# metric objects (`lcs`, `qgram`, `cosine`, `jarowinkler`) and the strings
# `final_text_str` / `final_compare_text_str` are all defined elsewhere in this
# file -- confirm before reuse.
print(input_list)
print(counter)
same = similar_text_score()

# Sample reference/prediction texts; they appear unused by the calls below
# (which operate on final_text_str / final_compare_text_str instead).
txt_true = "The term international child abduction is generally synonymous with international parental kidnapping, child snatching, and child stealing.[1] However, the more precise legal usage of international child abduction originates in private international law and refers to the illegal removal of children from their home by an acquaintance or family member to a foreign country. In this context, 'illegal' is normally taken to mean 'in breach of custodial rights' and 'home' is defined as the child's habitual residence"
txt_pred = "What is today called 'parental kidnapping,' 'international child abduction,', 'parental child abduction' and 'parental child trafficking' has existed as long as different legal jurisdictions and international borders have—though often under different names. None of these names achieved the modern day broad acceptance of terms like international child abduction. Lacking a common set of terminology or specifically designed laws to address the, at the time, poorly defined problem, researchers on the history of cross-border child abduction must search for terms like 'custodial interference,' 'contempt of child custody orders,' 'legal kidnapping' or, in cases where children were viewed more as property than as individual subjects of rights, name variations on theft, child-maintenance debt and smuggling, among others."
y_pred2 = "New Delhi: At least 35 people were killed and over 200 injured on Sunday when over dozen coaches of two superfast Express trains got derailed in Uttar Pradesh and Assam, raising concerns once again about the patchy safety record of Indian Railways.Thirty-five have been confirmed dead and over 140 injured as the Howrah-Delhi-Kalka Express got derailed near Fatehpur Malwa in Uttar Pradesh. The incident took place around 12:30 pm. Approximately 1200 people were travelling on board Kalka Mail.The train was travelling from Howrah to New Delhi and was moving at the speed of 108 Km/Hr when the driver used emergency brakes to slow it down, which led to the derailment, sources claimed."

# Print four different string-distance/similarity metrics on the same pair.
print(lcs.distance(final_text_str, final_compare_text_str))
print(qgram.distance('hello', 'world'))
print(
    cosine.similarity_profiles(cosine.get_profile(final_text_str),
                               cosine.get_profile(final_compare_text_str)))
print(jarowinkler.similarity(final_text_str, final_compare_text_str))
same.similarity_score(txt_true=final_text_str, txt_pred=final_compare_text_str)

# Full train-and-evaluate round trip on a synthetic 200-row dataset.
pred = prediction()
pred.generate_a_dataset(qnt_of_rand_num=200, dataset_name='lawers200.csv')
pred.train_model(csv_file='lawers200.csv', save_model_name='model200.pickle')
pred.load_model_and_pred(model_name='model200.pickle', rating=0.95, wins=6, time_diff=196)

import numpy as np
import pandas as pd
def dashboard_carrer():
    """Render the career-recommendation dashboard.

    Reads the RIASEC questionnaire answers from .data/test.csv, scores the
    six Holland categories, derives each respondent's 3-letter code, ranks
    courses from .data/course.csv by 2-gram cosine similarity between the
    code and each course code, and renders the top matches.

    NOTE(review): the spelling "carrer" (career) is kept -- the function
    name and template path depend on it.
    """
    # Detect the CSV's encoding first; the file is not guaranteed UTF-8.
    with open('.data/test.csv', 'rb') as f:
        result = chardet.detect(f.read())
    p_test = pd.read_csv(".data/test.csv", encoding=result['encoding'])
    # Encoding: map Likert answers onto 5..1.
    # NOTE(review): the labels mix "Enjoy" and "Disagree" scales; only these
    # five exact strings are converted -- any other answer text stays as-is.
    p_test = p_test.replace('Enjoy', 5)
    p_test = p_test.replace('Slightly Enjoy', 4)
    p_test = p_test.replace('Neutral', 3)
    p_test = p_test.replace('Slightly Disagree', 2)
    p_test = p_test.replace('Strongly Disagree', 1)
    # Realistic Questions
    realistic = p_test[[
        'I like to work on cars', 'I like to build things',
        'I like to take care of animals',
        'I like putting things together or assembling things',
        'I like to cook', 'I am a practical person', 'I like working outdoors'
    ]]
    # Investigative Questions
    investigative = p_test[[
        'I like to do puzzles', 'I like to do experiments', 'I enjoy science',
        'I enjoy trying to figure out how things work',
        'I like to analyze things (problems/situations)',
        'I like working with numbers or charts', 'I am good at math'
    ]]
    # Artistic Questions
    artistic = p_test[[
        'I am good at working independently',
        'I like to read about art and music', 'I enjoy creative writing',
        'I am a creative person', 'I like to play instruments or sing',
        'I like acting in plays', 'I like to draw'
    ]]
    # Social Questions
    social = p_test[[
        'I like to work in teams', 'I like to teach or train people',
        'I like trying to help people solve their problems',
        'I am interested in healing people',
        'I enjoy learning about other cultures',
        'I like to get into discussions about issues around me',
        'I like helping people'
    ]]
    # Enterprising Questions
    enterprising = p_test[[
        'I am an ambitious person who set goals for myself',
        'I like to try to influence or persuade people',
        'I like selling things', 'I am quick to take on new responsibilities',
        'I would like to start my own business', 'I like to give speeches',
        'I like to lead'
    ]]
    # Conventional Questions
    conventional = p_test[[
        'I like to organize things',
        'I wouldn’t mind working 8 hours per day in an office',
        'I pay attention to details', 'I like to do filing or typing',
        'I am good at keeping records of my work',
        'I would like to work in an office'
    ]]
    # Summing Up -- one total per RIASEC category per respondent.
    # NOTE(review): these write to column-sliced frames and may trigger
    # pandas SettingWithCopyWarning; behaviour kept as-is.
    realistic['R'] = realistic.sum(axis=1)
    investigative['I'] = investigative.sum(axis=1)
    artistic['A'] = artistic.sum(axis=1)
    social['S'] = social.sum(axis=1)
    enterprising['E'] = enterprising.sum(axis=1)
    conventional['C'] = conventional.sum(axis=1)
    # Collect the six totals into one frame, columns R/I/A/S/E/C.
    code = realistic['R']
    code = code.to_frame()
    code['I'] = investigative['I']
    code['A'] = artistic['A']
    code['S'] = social['S']
    code['E'] = enterprising['E']
    code['C'] = conventional['C']
    n = 3
    # Per respondent: take the n highest-scoring category letters.
    # NOTE(review): `it` is presumably operator.itemgetter imported elsewhere
    # (it(0) extracts the column label) -- TODO confirm.  `iteritems()` is
    # removed in pandas >= 2.0; would need `items()` there.
    new_d = [
        list(map(it(0), (row[1:].sort_values(ascending=False)[:n].iteritems())))
        for _, row in code.iterrows()
    ]
    std = pd.DataFrame(new_d)
    std['code'] = std[0] + std[1] + std[2]  # std has the test code
    std = std.drop([0, 1, 2], axis=1)
    # Read the course data
    course = pd.read_csv(".data/course.csv")
    # Cross every student code with every course (cartesian product).
    df = pd.MultiIndex.from_product(
        [std["code"], course["course_code"], course["Course_short"]],
        names=["code", "course_code", "course"]).to_frame(index=False)
    df = df.dropna()
    # Cosine Similarity between student code and course code (2-gram shingles).
    cosine = Cosine(2)
    df["p0"] = df["code"].apply(lambda s: cosine.get_profile(s))
    df["p1"] = df["course_code"].apply(lambda s: cosine.get_profile(s))
    df["cosine_sim"] = [
        cosine.similarity_profiles(p0, p1)
        for p0, p1 in zip(df["p0"], df["p1"])
    ]
    df.drop(["p0", "p1"], axis=1, inplace=True)
    # Sorting the Values -- top 3 courses per student code.
    top_n = df.sort_values(['cosine_sim'],
                           ascending=False).groupby(df['code'].values).head(3)
    options = top_n["course"].to_numpy()
    # selecting rows based on condition
    rec = course.loc[course['Course_short'].isin(options)]
    recommendations = json.loads(rec.to_json(orient='records'))
    return render_template('./dashboard_carrer.html',
                           title='Dashboard - Carrer',
                           std=std,
                           recommendations=recommendations)
def cosine(self, s0, s1):
    """Cosine similarity of *s0* and *s1* over 15-gram shingle profiles."""
    engine = Cosine(15)
    return engine.similarity_profiles(engine.get_profile(s0),
                                      engine.get_profile(s1))