Exemplo n.º 1
0
 def init_similarities(self):
     """Register the similarity measures declared in the configuration.

     Every config section named 'similarity_<name>' is loaded and stored
     under <name> in self.similarities; any section whose name contains
     'fallback_' becomes self.fallback_similarity.
     """
     prefix = 'similarity_'
     self.similarities = {}
     for name in self.conf.sections():
         if name.startswith(prefix):
             # Strip the prefix to get the measure's registration key.
             self.similarities[name[len(prefix):]] = similarity.get_similarity(self.conf, name)
         elif 'fallback_' in name:
             self.fallback_similarity = similarity.get_similarity(self.conf, name)
def main_summarizer():
    counter = 0;
    db = MySQLdb.connect("localhost", 'root', '', "inswipes")
    cursor = db.cursor()

    sql1 = "SELECT * FROM `meta_content` WHERE post_id=(SELECT MAX(post_id) from `meta_content`)"
    cursor.execute(sql1)

    resultset = cursor.fetchall()
    row = resultset[0]

    article_number = row[0]

    text = row[1]

    article_link = row[2]
    article_category = row[3]
    article_title = row[4]
    article_status = 0
    duplicate_id = ""
    duplicate_counter = 0
    print text + '\n'

    sql2 = 'SELECT Main_Article,Post_Id FROM `post_management` WHERE Category_Name="%d"' % (article_category)
    cursor.execute(sql2)
    resultset2 = cursor.fetchall()

    print('Reading article ' + str(article_number) + '\n')

    keyphrases = extractKeyphrases(text)

    summary = extractSentences(text)

    #Checks whether there are articles of similar category present. If Yes , then checks whether it is a duplicate article or not.
    #If the article is duplicate then it intserts the similarity factor as well as id of the corresponding article in the table.

    if resultset2:
        print ("Same category articles are present\n")
        for row in resultset2:
            article_check = row[0]
            article_id=row[1]
            duplication_factor = get_similarity(text, article_check)
            print duplication_factor
            if duplication_factor > 0.3:
                print ("Article is Duplicate")
                temp_dup=duplicate_id
                duplicate_id = temp_dup + str(article_id) + ' -> ' + str(duplication_factor) + ', '
                duplicate_counter+=1
            else:
                print ("Article is not Duplicate")

        writeFiles(summary, keyphrases, article_number, article_title, article_category, text, article_link,
                   article_status, duplicate_id, duplicate_counter)

    else:
        print ("no\n")
        writeFiles(summary, keyphrases, article_number, article_title, article_category, text, article_link,
                   article_status, duplicate_id, duplicate_counter)

    db.close()
Exemplo n.º 3
0
def _find_connections(id1, user1, users):
    """Score user1 against every user not already in its connections.

    Returns a dict mapping user id -> similarity score,
    e.g. {'aitor': 10, 'aritz': 30}.
    """
    scores = {}
    for other_id, other in users.items():
        # Skip the user itself and anyone already connected.
        if other_id == id1 or other_id in user1['connections']:
            continue
        scores[other_id] = similarity.get_similarity(user1, other)
    return scores
Exemplo n.º 4
0
    def get_distribution(self):

        n = self.N
        self.file_index, self.file_sim, self.file_coverage, self.file_sim_dict = similarity.get_similarity(
            n)

        print self.file_sim
        plt.figure(figsize=(30, 10))
        ind = np.arange(len(self.file_sim))
        #plt.bar(ind,self.file_coverage)
        plt.bar(self.file_sim, self.file_coverage)
        plt.xlabel('similarity')
        plt.ylabel('coverage')
Exemplo n.º 5
0
def similarity_matcher(final_article, url):
    """Compare final_article against the stored articles, appending the
    ids of sufficiently similar posts (score >= 0.3) to similar_to.

    Returns 1 as soon as a stored record with the same url is seen
    (remaining records are not examined); otherwise returns None after
    scanning the collection.
    """
    flag = 0  # NOTE(review): set on a match but never read or returned
    if content.count() == 0:
        print("hello")
        return
    for record in content.find():
        stored_url = str(record['url'])
        stored_article = str(record['article'])
        post_id = str(record["_id"])

        if stored_url == url:
            # Same source url: treat as already present and stop scanning.
            return 1
        score = get_similarity(final_article, stored_article)
        print(score)
        if score >= .3:
            similar_to.append(post_id)
            flag = 1
Exemplo n.º 6
0
def get_main_image_from_urls(urls, title=''):
    try:
        valid_urls = []
        if len(urls) == 0:
            return ''

        for u in urls:
            url = util.valid_url(u[0])

            if url != '':
                valid_urls.append(url)

                if similarity.get_similarity(u[1], title) > ALT_SIMILARITY:
                    return url
                    break

        for url in valid_urls:

            try:

                if url != '':
                    result = urlfetch.fetch(url)
                    if result.status_code == 200:
                        file = cStringIO.StringIO(result.content)
                        im = Image.open(file)
                        size = result.headers["Content-Length"]
                        height, width = im.size
                        if not size or size == '':
                            size = 0
                        #print get_size_ratio(height, width)
                        #print url
                        if int(size) > IMAGE_SIZE and get_size_ratio(height, width) < IMAGE_SIZE_RATIO:
                            return url
                            break

            except Exception, ex:
                logging.error('get_main_image_from_urls: %s' % ex.message)
                continue

        return ''
def get_business_score(reviews, id):
    """Score a business per category from its review text.

    Splits the reviews into sentences, routes each opinionated sentence
    (sentiment != 0.5) to a category, averages the sentiment per category,
    and writes an image built from the top five sentences of each
    category.  Returns the dict of averaged category scores.
    """
    categories = ("food", "service", "ambiance", "money")
    final_cat = dict.fromkeys(categories, 0)
    # Counts start at 1 so the average below never divides by zero.
    cat_count = dict.fromkeys(categories, 1)
    sentences_by_cat = dict((c, {}) for c in categories)

    for sentence in reviews_to_sentences(reviews):
        score = senti.get_sentiment(sentence)
        if score == 0.5:
            continue  # neutral sentences carry no signal
        category = sim.get_similarity(sentence)
        final_cat[category] = final_cat[category] + score
        cat_count[category] = cat_count[category] + 1
        sentences_by_cat[category][sentence] = score

    for category in final_cat.keys():
        final_cat[category] = final_cat[category] / (1.0 * cat_count[category])

    # Keep only the five highest-scoring sentences of a category,
    # joined into a single string.
    total_sen = 5

    def _top_sentences(scored):
        best = dict(sorted(scored.iteritems(),
                           key=operator.itemgetter(1),
                           reverse=True)[:total_sen])
        return " ".join(best.keys())

    write_image(_top_sentences(sentences_by_cat["food"]),
                _top_sentences(sentences_by_cat["service"]),
                _top_sentences(sentences_by_cat["ambiance"]),
                _top_sentences(sentences_by_cat["money"]),
                id)

    return final_cat
Exemplo n.º 8
0
## ---------------------------------------------------------------------
## Module-level pipeline stage: text statistics -> postgres write ->
## checkpoint CSVs -> document-similarity computation -> postgres write.
## Relies on `cfg`, `data`, `text_simplicity`, `connector`, `utils` and
## `similarity` being defined earlier in the file (not visible here).
## ---------------------------------------------------------------------
print("Collecting text statistics...")

## Collect text stats from Readcalc https://pypi.python.org/pypi/ReadabilityCalculator
readability_calc_type = cfg.get('scores', 'type')
# Presumably annotates `data` in place with readability scores — TODO confirm.
text_simplicity.get_readability_scores(data, readability_calc_type)

print("Beginning to write data to postgres")
connector.updated_input_dataframe_to_postgres(data)

## Quality check - writing to csv
# Checkpoint 1: dump the enriched data to a dated folder as TSV.
folder = cfg.get('checkpoint', 'dir')
utils.create_date_folder(folder)
checkpoint1_name = cfg.get('checkpoint', 'ch1')
data.to_csv(checkpoint1_name, sep="\t")

## calculate similarity with
checkpoint2_name = cfg.get('checkpoint', 'ch2')

# `data` exposes to_csv / column access, so it looks like a pandas
# DataFrame — TODO confirm where it is created.
document_ids = data['id'].tolist()
documents_list = data.processed_value.tolist()

# Pairwise document similarity using the configured vectorizer type.
vector_type = cfg.get('vector', 'type')
output = similarity.get_similarity(vector_type, documents_list, document_ids)

#write output to csv file
utils.output_to_csv(vector_type, output, document_ids, checkpoint2_name)

## writeback to postgres
connector.csv_to_postgres(checkpoint2_name)