def init_similarities(self):
    self.similarities = {}
    for section in self.conf.sections():
        if section.startswith('similarity_'):
            sim_name = section[11:]  # strip the 'similarity_' prefix
            self.similarities[sim_name] = similarity.get_similarity(self.conf, section)
        elif 'fallback_' in section:
            self.fallback_similarity = similarity.get_similarity(self.conf, section)
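The loop above is driven purely by config section names. A minimal sketch of a config that would exercise both branches, using hypothetical section names and options (the real option keys that similarity.get_similarity reads are not shown in these examples):

# Hypothetical config exercising init_similarities(); section names and
# the 'threshold' option are invented for illustration.
import configparser

conf = configparser.ConfigParser()
conf.read_string("""
[similarity_cosine]
threshold = 0.8

[fallback_levenshtein]
threshold = 0.5
""")

for section in conf.sections():
    if section.startswith('similarity_'):
        print('similarity engine:', section[11:])  # -> cosine
    elif 'fallback_' in section:
        print('fallback engine:', section)         # -> fallback_levenshtein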
def main_summarizer():
    db = MySQLdb.connect("localhost", "root", "", "inswipes")
    cursor = db.cursor()

    # Fetch the most recently inserted article.
    sql1 = ("SELECT * FROM `meta_content` "
            "WHERE post_id=(SELECT MAX(post_id) FROM `meta_content`)")
    cursor.execute(sql1)
    resultset = cursor.fetchall()
    row = resultset[0]
    article_number = row[0]
    text = row[1]
    article_link = row[2]
    article_category = row[3]
    article_title = row[4]
    article_status = 0
    duplicate_id = ""
    duplicate_counter = 0
    print(text + '\n')

    # Fetch existing articles in the same category (parameterized query).
    sql2 = 'SELECT Main_Article, Post_Id FROM `post_management` WHERE Category_Name=%s'
    cursor.execute(sql2, (article_category,))
    resultset2 = cursor.fetchall()

    print('Reading article ' + str(article_number) + '\n')
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)

    # Checks whether articles of the same category are present. If yes, checks
    # whether the new article is a duplicate. If it is, the similarity factor
    # and the id of the matching article are inserted into the table alongside it.
    if resultset2:
        print("Same category articles are present\n")
        for row in resultset2:
            article_check = row[0]
            article_id = row[1]
            duplication_factor = get_similarity(text, article_check)
            print(duplication_factor)
            if duplication_factor > 0.3:
                print("Article is Duplicate")
                duplicate_id += str(article_id) + ' -> ' + str(duplication_factor) + ', '
                duplicate_counter += 1
            else:
                print("Article is not Duplicate")
        writeFiles(summary, keyphrases, article_number, article_title, article_category,
                   text, article_link, article_status, duplicate_id, duplicate_counter)
    else:
        print("no\n")
        writeFiles(summary, keyphrases, article_number, article_title, article_category,
                   text, article_link, article_status, duplicate_id, duplicate_counter)
    db.close()
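None of these examples include get_similarity itself. As a stand-in for the text-versus-text call used above, here is a minimal token-overlap (Jaccard) sketch; it is an assumption for illustration, not the project's actual measure:

def get_similarity(text_a, text_b):
    """Jaccard similarity over lowercased word sets; returns 0.0..1.0.

    A stand-in sketch only: the real get_similarity called above is not
    shown in these examples and may use a different measure entirely.
    """
    tokens_a = set(text_a.lower().split())
    tokens_b = set(text_b.lower().split())
    if not tokens_a or not tokens_b:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

# Values above the 0.3 cutoff used in main_summarizer() flag duplicates.
print(get_similarity("stocks rally on earnings", "stocks rally after earnings"))  # 0.6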
def _find_connections(id1, user1, users):
    connections = {}  # e.g. {'aitor': 10, 'aritz': 30}
    for id2 in users:
        user2 = users[id2]
        if id2 != id1:
            # Skip users who are already a connection.
            if id2 not in user1['connections']:
                S = similarity.get_similarity(user1, user2)
                connections[id2] = S
    return connections
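A hypothetical standalone run of _find_connections, with the similarity module stubbed out as a Jaccard score over per-user tag sets; the stub and the users data are invented for illustration:

class similarity:  # stub standing in for the real similarity module
    @staticmethod
    def get_similarity(u1, u2):
        return len(u1['tags'] & u2['tags']) / len(u1['tags'] | u2['tags'])

users = {
    'aitor': {'connections': ['aritz'], 'tags': {'python', 'nlp'}},
    'aritz': {'connections': ['aitor'], 'tags': {'python', 'go'}},
    'jon':   {'connections': [],        'tags': {'nlp', 'search'}},
}
# Only 'jon' is scored: 'aritz' is already in aitor's connections.
print(_find_connections('aitor', users['aitor'], users))  # {'jon': 0.333...}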
def get_distribution(self):
    n = self.N
    (self.file_index, self.file_sim,
     self.file_coverage, self.file_sim_dict) = similarity.get_similarity(n)
    print(self.file_sim)
    plt.figure(figsize=(30, 10))
    ind = np.arange(len(self.file_sim))
    # plt.bar(ind, self.file_coverage)
    plt.bar(self.file_sim, self.file_coverage)
    plt.xlabel('similarity')
    plt.ylabel('coverage')
def similarity_matcher(final_article, url):
    flag = 0
    if content.count() == 0:  # content is assumed to be a MongoDB collection
        print("hello")
    else:
        for i in content.find():
            db_url = str(i['url'])
            article = str(i['article'])
            post_id = str(i['_id'])
            if db_url != url:
                sim_value = get_similarity(final_article, article)
                print(sim_value)
                if sim_value >= .3:
                    similar_to.append(post_id)
                    flag = 1
            else:
                # The article is already stored under the same URL.
                return 1
def get_main_image_from_urls(urls, title=''):
    valid_urls = []
    if len(urls) == 0:
        return ''
    # First pass: collect valid urls, and return immediately when an image's
    # alt text is similar enough to the article title.
    for u in urls:
        url = util.valid_url(u[0])
        if url != '':
            valid_urls.append(url)
            if similarity.get_similarity(u[1], title) > ALT_SIMILARITY:
                return url
    # Second pass: fetch each candidate and return the first one that is
    # large enough and not too elongated.
    for url in valid_urls:
        try:
            result = urlfetch.fetch(url)
            if result.status_code == 200:
                im = Image.open(io.BytesIO(result.content))
                size = result.headers["Content-Length"]
                width, height = im.size  # PIL's size is (width, height)
                if not size or size == '':
                    size = 0
                if int(size) > IMAGE_SIZE and get_size_ratio(height, width) < IMAGE_SIZE_RATIO:
                    return url
        except Exception as ex:
            logging.error('get_main_image_from_urls: %s' % ex)
            continue
    return ''
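get_size_ratio is referenced above but never defined in these examples. A plausible sketch, assuming it returns the long-edge to short-edge aspect ratio used to reject banner-shaped images:

def get_size_ratio(height, width):
    """Hypothetical helper: long edge divided by short edge.

    The check above rejects images whose ratio exceeds IMAGE_SIZE_RATIO,
    filtering out extreme banner or skyscraper shapes.
    """
    if height <= 0 or width <= 0:
        return float('inf')
    return max(height, width) / float(min(height, width))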
def get_business_score(reviews, id):
    sentences = reviews_to_sentences(reviews)
    final_cat = {"food": 0, "service": 0, "ambiance": 0, "money": 0}
    cat_count = {"food": 1, "service": 1, "ambiance": 1, "money": 1}
    items_food = {}
    items_service = {}
    items_amb = {}
    items_money = {}
    for sent in sentences:
        sentr = senti.get_sentiment(sent)
        if sentr != 0.5:  # ignore neutral sentences
            cat = sim.get_similarity(sent)
            final_cat[cat] += sentr
            cat_count[cat] += 1
            if cat == 'food':
                items_food[sent] = sentr
            elif cat == 'service':
                items_service[sent] = sentr
            elif cat == 'ambiance':
                items_amb[sent] = sentr
            elif cat == 'money':
                items_money[sent] = sentr
    # Average the accumulated sentiment per category.
    for key in final_cat.keys():
        final_cat[key] = final_cat[key] / (1.0 * cat_count[key])
    # Keep the five highest-sentiment sentences per category.
    total_sen = 5
    new_f = dict(sorted(items_food.items(), key=operator.itemgetter(1), reverse=True)[:total_sen])
    new_s = dict(sorted(items_service.items(), key=operator.itemgetter(1), reverse=True)[:total_sen])
    new_a = dict(sorted(items_amb.items(), key=operator.itemgetter(1), reverse=True)[:total_sen])
    new_m = dict(sorted(items_money.items(), key=operator.itemgetter(1), reverse=True)[:total_sen])
    f = " ".join(new_f.keys())
    s = " ".join(new_s.keys())
    a = " ".join(new_a.keys())
    m = " ".join(new_m.keys())
    write_image(f, s, a, m, id)
    return final_cat
print("Collecting text statistics...") ## Collect text stats from Readcalc https://pypi.python.org/pypi/ReadabilityCalculator readability_calc_type = cfg.get('scores', 'type') text_simplicity.get_readability_scores(data, readability_calc_type) print("Beginning to write data to postgres") connector.updated_input_dataframe_to_postgres(data) ## Quality check - writing to csv folder = cfg.get('checkpoint', 'dir') utils.create_date_folder(folder) checkpoint1_name = cfg.get('checkpoint', 'ch1') data.to_csv(checkpoint1_name, sep="\t") ## calculate similarity with checkpoint2_name = cfg.get('checkpoint', 'ch2') document_ids = data['id'].tolist() documents_list = data.processed_value.tolist() vector_type = cfg.get('vector', 'type') output = similarity.get_similarity(vector_type, documents_list, document_ids) #write output to csv file utils.output_to_csv(vector_type, output, document_ids, checkpoint2_name) ## writeback to postgres connector.csv_to_postgres(checkpoint2_name)