Example #1
	def get_news(self, url, flag):
		#given a news site, tries to extract article links
		#definitely works for Hacker News
		
		#returns list of urls for articles as a list of strings (unicode)
		news = n.News(url)
		news.set_flag(flag)
		news.find_links()
		return news.links
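A minimal, self-contained sketch of the contract above (not from the original project): StubNews stands in for the News class, exposing the same set_flag/find_links interface that get_news() relies on.

class StubNews(object):
    # stand-in for the project's News class, same interface as used above
    def __init__(self, url):
        self.url = url
        self.links = []

    def set_flag(self, flag):
        self.flag = flag

    def find_links(self):
        # a real implementation would fetch self.url and extract article links;
        # here the result is canned
        self.links = [u"https://news.ycombinator.com/item?id=1"]


def get_news(url, flag):
    article_source = StubNews(url)
    article_source.set_flag(flag)
    article_source.find_links()
    return article_source.links


print(get_news("https://news.ycombinator.com", flag=True))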
Example #2
def read_log(log_file):
    par_adv = []
    measured = False
    sys.stdout.write("Reading log")
    fo = open(log_file, "r")
    for line in fo:
        #       print line
        tim, linetype, linename, value, unit_id, treatment_id = interpret_log_line(
            line)
        if (linetype == 'meta'):
            if (linename == 'agents'):
                num_agents = int(value)
            elif (linename == 'treatnames'):
                treatnames = re.split("\@\|", value)
#               print "Treatments: ", treatnames
            elif (linename == 'block_id start'):
                sys.stdout.write(".")
                sys.stdout.flush()
                block_id = int(value)
                adv = []
                ints = []
                newsv = []
                for i in range(0, num_agents):
                    adv.append(adVector.AdVector())
                    ints.append(interest.Interests())
                    newsv.append(news.NewsVector())


#               print block_id
            elif (linename == 'assignment'):
                assignment = [int(x) for x in re.split("\@\|", value)]
            elif (linename == 'block_id end'):
                apply_labels_to_vecs(adv, ints, newsv, assignment, num_agents,
                                     len(treatnames))
                par_adv.append({
                    'advector': adv,
                    'newsvector': newsv,
                    'assignment': assignment,
                    'intvector': ints
                })
        elif (linetype == 'treatment'):
            pass
        elif (linetype == 'measurement'):
            if (linename == 'ad'):
                ind_ad = ad.Ad(value, treatment_id)
                adv[int(unit_id)].add(ind_ad)
            if (linename == 'interest'):
                ints[int(unit_id)].set_from_string(value)
            if (linename == 'news'):
                ind_news = news.News(value, treatment_id)
                newsv[int(unit_id)].add(ind_news)
        elif (linetype == 'error'):
            #           print "Error in block", block_id, ": ", line.strip()
            pass
    sys.stdout.write(".Reading complete\n")
    print "Treatments: ", treatnames
    return par_adv, treatnames
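read_log() depends on an interpret_log_line() helper that is not shown; the only visible contract is that it returns a 6-tuple (time, line type, line name, value, unit id, treatment id). A hypothetical stub consistent with that contract, assuming a '||'-separated field layout purely for illustration:

import re

def interpret_log_line(line):
    # hypothetical layout: time||linetype||linename||value||unit_id||treatment_id
    parts = re.split(r"\|\|", line.rstrip("\n"))
    parts += [""] * (6 - len(parts))  # pad short lines so unpacking never fails
    tim, linetype, linename, value, unit_id, treatment_id = parts[:6]
    return tim, linetype, linename, value, unit_id, treatment_id

# example: a 'meta' line declaring the number of agents
print(interpret_log_line("12:00:00||meta||agents||4||||"))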
Example #3
    def test_1_news_init(self):
        print("Test #1 for news.__init__(...) with mock News API replies:")
        news_out = []

        # go through each mock API answer and initialize a News object
        for a in self.mock_api_answer:
            news_out.append(news.News(a["description"], a["source"]))
            # run the check through the news_tester function defined below
            self.news_tester(news_out[-1], a)
        print("Done.")
Example #4
def test():
    print(
        "test1:displays number of features generated from all the documents\n")
    f = open('feature_definition_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of features generated in feature_defintion_file : " +
          str(count))
    f.close()
    print(
        "test2:verified that all the documents are read and parsed from the mininewsgroup directory\n"
    )
    f = open('training_data_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of documents parsed from mininewsgroup : " + str(count))
    print("test3 : Given a filename and filepath parse the document\n")
    fil = open('class_definition_file', "r")
    classes = {}
    r = fil.readline()
    while r:
        p = str(r.strip()).split(" ")
        if p[0] in classes:
            classes[p[0]].append(p[1])
        else:
            classes[p[0]] = [p[1]]
        r = fil.readline()
    fil.close()
    directorypath = input(
        "Enter the filepath (eg:localpath/mini_newsgroups/alt.atheism/51121):\n"
    )
    ngobj = news.News(directorypath, classes)
    print("DOCID : " + ngobj.docID)
    print("Newsgroup : " + ngobj.newsgroup)
    print("Class : " + ngobj.class_label)
    print("Subject : " + ngobj.subject)
    print("Body : " + ngobj.body)
    print("test4\n")
    print(
        "Tokenizing the subject and body of the above given file,removing stop words and stemming: \n"
    )
    print(util.tokenize(ngobj.subject + " " + ngobj.body))
    print("test5 : printing inverted index of the given file\n")
    indexobjtest = InvertedIndex()
    indexobjtest.indexDoc(ngobj)
    for key in indexobjtest.items:
        print(key + " " + str(ngobj.docID) + " " +
              str(indexobjtest.items[key].posting[ngobj.docID].positions))
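The read-and-count loops above can be collapsed into a single expression; a sketch of the same count written with a generator (an alternative, not how the original test is written):

# equivalent line count without an explicit while loop
with open('feature_definition_file', 'r') as f:
    count = sum(1 for _ in f)
print("number of features generated in feature_definition_file : " + str(count))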
Example #5
def againdeal(url_list, output, base_url): 
    #parse each article page, append a News object to store_class, and finally return it
    store_class = news.List_news()

    i = 1
    json_list = []

    topic_split = re.compile('<h1 class=\"headline\">.*</h1>')
    author_split = re.compile('<span class=\"provider org\">.*</span>')
    date_split = re.compile('<abbr title=.*</abbr>')
    text_split = re.compile('<p class=\"first\">.*</p>|<p>.*</p>')        #body paragraphs come in two markups, so match both with an alternation

    for url in url_list:

        nextweb = requests.get(base_url + str(url) + 'html')
        nextweb.encoding = 'utf-8'
        information = nextweb.text

        #skip articles with encoding or parsing problems
        try:
            #uer "str" ,  because list not use 
            topic = str(topic_split.findall(information)).replace('<h1 class=\"headline\">', '').replace('</h1>', '').replace('\\u3000', '', 20).replace('╱', '', 10).replace('[', '', 10).replace(']', '',10)
            author = str(author_split.findall(information)).replace('<span class=\"provider org\">', '').replace('</span>', '').replace('[', '', 10).replace(']', '',10)
            date = str(date_split.findall(information)).replace('>', '<', 10).split('<')[2]      #after the replaces the split gives ['', '<abbr title=...', 'date', '</abbr>', ''], so the date is element [2]
            text = str(text_split.findall(information)).replace('<p class=\"first\">', '').replace('</p>', '', 100).replace(' ', '', 100).replace('<p>', '', 100).replace('[', '', 10).replace(']', '',10).replace('\',\'', '', 10)

            #normalize the date (handle the 上午/下午 a.m./p.m. markers)
            if '下' in date:
                date = date.replace('下午', '')
                try:
                    date = datetime.strptime(date, '%Y年%m月%d日 %H:%M') + timedelta(hours = 12)
                except:
                    date = datetime.strptime(date, '%Y年%m月%d日 %H:%M') + timedelta(days = 1, hours = -12)
            else:
                date = date.replace('上午', '')
                date = datetime.strptime(date, '%Y年%m月%d日 %H:%M') 


            store_class.append(news.News(topic, author, date, text))

            json_list.append(store_class.news[i - 1].toDict())

            print('第', i, '則新聞已擷取完')
            i += 1
        except:
            continue

    output.write(json.dumps(json_list,  ensure_ascii = False))
    print('讀取完畢!')

    return store_class
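The chained .replace() calls above remove markup one token at a time; a more robust alternative (not what the original code does) is a single regular expression that strips any tag:

import re

def strip_tags(html_fragment):
    # drop every <...> tag, then clean the bracket artifacts left by str(list)
    text = re.sub(r'<[^>]+>', '', html_fragment)
    return text.replace('[', '').replace(']', '').strip()

print(strip_tags('<h1 class="headline">Example headline</h1>'))  # Example headline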
Example #6
 def get_news_object(self, art, use_description = True):
     # check the news headline is long enough, otherwise
     # Watson NLU won't be able to analyze it
     if use_description:
         key = "description"
     else:
         key = "title"
         # also try to clean the title
         art[key] = self.clean_news_title(art[key])
     hl = art[key]
     if len(hl.split()) > 3 and len(hl) > 15:
         # it is long enough, use it
         return news.News(art[key], art["source"]["name"])
     else:
         # it is not long enough, pass on this one
         return None
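A stand-alone sketch of the headline-length check used above; art is a hypothetical article payload with the same keys the method reads:

# hypothetical NewsAPI-style article dict
art = {"description": "A headline long enough for Watson NLU to analyze",
       "title": "Short title",
       "source": {"name": "Example Source"}}
hl = art["description"]
usable = len(hl.split()) > 3 and len(hl) > 15
print(usable)  # True: more than 3 words and longer than 15 characters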
Example #7
def news_start_1000():
    my_news = news.News()
    repos = mysql.get_repo_without_news()
    if len(repos) == 0:
        print 'done'
        exit(0)
    for repo in repos:
        search_news, urls = my_news.get_news(repo[2])
        if search_news:
            mysql.insert_news(repo[0], search_news, urls)
            print '\033[1;31;40m'
            print repo, 'news ,done'
            print '\033[0m'
        else:
            mysql.insert_news(repo[0], [''], [''])
            print repo, 'failed'
Example #8
def main():
    # Get API keys
    news_api = news.News()
    news_data = news_api.getNews(datetime.today())
    news_words = news_api.get_words()
    print(news_words)

    # Get spotify playlist
    playlist = spotify.addWords(news_words)

    # Load in UI
    root = Tk()
    root.geometry("800x800+800+800")

    # Configure user interface
    app = UI()
    app.render_news(news_data)
    app.render_playlist(playlist)
    root.mainloop()
Example #9
 def test_set_news(self):
     obj = news.News()
     example = {'id': 1,
                'title': 'record.title',
                'link': 'record.link',
                'date': 'date',
                'description': 'record.description'}
     obj.set_news(example)
     self.assertEqual(obj._news[1], example) # the item was added to the dictionary
     self.assertEqual(len(obj._news), 1) # the dictionary holds exactly 1 item
     obj.set_news(example)
     self.assertEqual(len(obj._list_news), 1) # a duplicate item is not added again
     example = {'id': 2,
                'title': 'record.title',
                'link': 'record.link',
                'date': 'date',
                'description': 'record.description'}
     obj.set_news(example)
     self.assertEqual(obj._news[2], example) # the item was added to the dictionary
     self.assertEqual(len(obj._news), 2) # the dictionary now holds 2 items
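A minimal sketch of a News class that would satisfy the behaviour exercised here and in the next example (de-duplicating on 'id', storing records in a _news dict); the class actually under test is not shown, and the _list_news attribute referenced in one assertion is not modelled:

class News(object):
    # illustrative contract only: set_news() stores a record under its 'id'
    # and ignores duplicates; set_attribute() adds a key to a stored record
    def __init__(self):
        self._news = {}

    def set_news(self, record):
        if record['id'] not in self._news:
            self._news[record['id']] = record

    def set_attribute(self, news_id, key, value):
        self._news[news_id][key] = value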
Example #10
 def test_set_attribute(self):
     obj = news.News()
     example = {'id': 1,
                'title': 'record.title',
                'link': 'record.link',
                'date': 'date',
                'description': 'record.description'}
     obj.set_news(example)
     example = {'id': 2,
                'title': 'record.title',
                'link': 'record.link',
                'date': 'date',
                'description': 'record.description'}
     obj.set_news(example)
     obj.set_attribute(2, 'text', 'text news')
     example = {'id': 2,
                'title': 'record.title',
                'link': 'record.link',
                'date': 'date',
                'description': 'record.description',
                'text': 'text news'}
     self.assertEqual(obj._news[2], example)
Example #11
 def __init__(self, master=None):
     self.padding_for_x = 30
     self.padding_for_y = 30
     self.news_handler = news.News()
     super().__init__(master)
     # lang
     self.languages = ["fi", "en"]
     self.selected = tk.StringVar()
     self.language_label = tk.Label(self)
     self.language_select = tt.Combobox(self,
                                        textvariable=self.selected,
                                        values=self.languages)
     self.build_language_field()
     # hint labels:
     self.remove_hint = tk.Label(self)
     self.add_new_hint = tk.Label(self)
     # lists all current news
     self.list_of_news = tk.Listbox(self)
     self.fill_list_view(self.news_handler.get_news())
     # delete from db button
     self.delete_from_db = tk.Button(self)
     # create new: headline field
     self.headline_label = tk.Label(self)
     self.headline = tk.Text(self)
     # create new: message field
     self.message_label = tk.Label(self)
     self.message = tk.Text(self)
     # create new: date
     self.date_label = tk.Label(self)
     self.news_date = tk.Entry(self)
     # create new: push to db
     self.add_news = tk.Button(self)
     self.master = master
     self.pack()
     self.create_hint_headers()
     self.create_list_view()
     self.create_delete_from_db_btn()
     self.create_new_news_components()
     self.winfo_toplevel().title("Cats opinion admin panel - Edit news")
Example #12
def main():
    E = float(sys.argv[2])

    # glove_file = datapath('glove.twitter.27B/glove.twitter.27B.200d.txt')
    # tmp_file = get_tmpfile("tweets_word2vec.txt")
    # _ = glove2word2vec(glove_file, tmp_file)
    # model = KeyedVectors.load_word2vec_format(tmp_file)
    # model.save("tweets_word2vec.model")

    # print("model completed")

    model = KeyedVectors.load("glove.twitter.27B/tweets_word2vec.model")

    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets()
    articles = news_api.process_news(news_api.retrieve_everything())

    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))

    all_tweets = tweets_api.process_tweets(data)

    all_tokens = []
    copied_tweets = list(all_tweets)

    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # first cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token,
                                          cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue

        clustered = False

        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"],
                                                 token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                    common_hashtag_vector == [] and \
                    common_url_vector == []:
                continue

            # vector = single_cluster.get_vector(True)

            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]

            # TODO: we can check if a word is in the pre-trained model by doing the following
            # for word not in new_token["text"]:
            #     if word in model.wv.vocab: # if word in model.vocab:
            #         print(word)

            similarity = model.wv.n_similarity(new_token["text"],
                                               vector["text"])
            print(similarity)

            if similarity >= E:
                # max_cluster_similarity = similarity
                # max_cluster_index = j
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break

        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token,
                                          cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    F = float(sys.argv[3])
    related_news_clusters = []

    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1

        article = articles[i]
        text = article["title"] + article["description"]
        time = article["publish_time"]

        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]

            # Remove outlier clusters
            # if single_cluster.get_size() <= 10 or single_cluster.is_clustered:
            if single_cluster.get_size() <= 10:
                continue

            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, time, single_cluster, True)

            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break

        # if max_similarity_index == -1:
        #     continue
        # news_cluster_group["article"] = i
        # news_cluster_group["cluster"] = max_similarity_index
        # related_news_clusters.append(news_cluster_group)
        # all_clusters[max_similarity_index].change_clustered()

    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1

    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]
    print("Number of pairs generated in total: %d" %
          (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
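Examples #12–#14 and #21 call an intersection() helper that is not shown; a minimal sketch consistent with how it is used (the elements two sequences have in common, an empty list when they are disjoint):

def intersection(first, second):
    # order-preserving list intersection; an empty result means "no common words"
    second_set = set(second)
    return [item for item in first if item in second_set]

print(intersection(["storm", "rain", "news"], ["rain", "sun"]))  # ['rain']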
Example #13
def main():
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False

    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False

    E = float(sys.argv[2])
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))

    all_tweets = tweets_api.process_tweets(sys.argv[1])
    all_tokens = []
    copied_tweets = list(all_tweets)

    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue 
        all_tokens.append(tokens)

    all_clusters = []
    # we are computing the similarity of one tweet with all clusters
    # exists, not the similarity with other tweets
    cluster_id = 0
    for i in range(len(all_tweets)):
        # first cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        
        clustered = False
        # max_cluster_similarity = 0
        # max_cluster_index = -1
        token = all_tokens[i]
        # print("Tweet after processed: %s" % (token["text"]))
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            if not intersection(vector["text"], token["text"]) and \
                not intersection(vector["hashtag"], token["hashtag"]):
                continue

            # start_pre_similarity = time.time()

            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            # new_token["url"] = token["url"]
            # print("Pre similarity duration: %s" % (time.time() - start_pre_similarity))

            # if all_text_in_cluster(new_token["text"], vector["text"]):
            #     similarity = 1
            # else:
            #     vector = single_cluster.get_vector(True)
            #     similarity = word_processor.new_triple_similarity(new_token, vector)
            try:
                # print("Cluster: %s" % (vector["text"]))
                vector = single_cluster.get_vector(True)
                similarity = word_processor.new_triple_similarity(new_token, vector)
            except:
                continue
                # print(new_token)
                # print(vector)

            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)

            # print("Similarity: %f" % (similarity))
            if similarity >= E:
                tweet = all_tweets[i]
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break

        # if max_cluster_index != -1:
        #     all_clusters[max_cluster_index].push(tweet[0], tweet[1], token)
        #     clustered = True

        if not clustered:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
        
        # print("-----------------------------------------------------------")

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)

    # for j in range(len(cluster_sizes)):
    #     if cluster_sizes[j] == max_cluster_size:
    #         break

    print("The max cluster size is: %d" % (max_cluster_size))
    # print("Number of tweets clustered using hashtag/url: %d" % (word_processor.hashtag_index))
    # print("Number of tweets clustered using text: %d" % (word_processor.text_index))

    # for item in all_clusters[j].get_all_tweets():
    #     print(item)

    # similarity = word_processor.docs_similarity(all_tweets[0][0], all_tweets[0][0])
    # the similarity we get is greater the better, closer to 1 means they are very
    # similar, otherwise very different

    # TODO: after finish clustering, we need to compute the similarity between
    # each cluster and each news we retrieved
    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []

    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1

        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]

        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]

            # Remove outlier clusters
            if single_cluster.get_size() < 10:
                continue

            cluster_vector = single_cluster.get_vector(True)["text"]
            
            if not intersection(cluster_vector, text):
                continue

            similarity = word_processor.docs_similarity(text, cluster_vector)
            # similarity = word_processor.modified_similarity(similarity, publish_time, single_cluster, True)
            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)

            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break
    
    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)

    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " % (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
Example #14
def main():
    E = float(sys.argv[2])
    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets(int(sys.argv[4]))

    articles = news_api.process_news(news_api.retrieve_everything())

    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))

    all_tweets = tweets_api.process_tweets(data)

    all_tokens = []
    copied_tweets = list(all_tweets)

    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    # we are computing the similarity of one tweet with all clusters
    # exists, not the similarity with other tweets
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # first cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token,
                                          cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue

        clustered = False
        # max_cluster_similarity = 0
        # max_cluster_index = -1

        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"],
                                                 token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                common_hashtag_vector == [] and \
                common_url_vector == []:
                continue

            vector = single_cluster.get_vector(True)

            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            new_token["url"] = token["url"]

            similarity = word_processor.new_triple_similarity(
                new_token, vector)
            # print("Tweet %d, Cluster %d" % (i, j))
            # print("Similarity before: %f" % (similarity))
            # # similarity = word_processor.docs_similarity(tweet[0], vector)
            # similarity = word_processor.modified_similarity(
            #         similarity, tweet[1], single_cluster)
            # print("Similarity after: %f" % (similarity))
            # if similarity >= E and similarity > max_cluster_similarity:
            if similarity >= E:
                # max_cluster_similarity = similarity
                # max_cluster_index = j
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                # TODO: we need to consider when one tweet is similar to multiple clusters,
                # which cluster should we push to
                break

        # if max_cluster_index != -1:
        #     all_clusters[max_cluster_index].push(tweet[0], tweet[1], token)
        #     clustered = True

        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token,
                                          cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)

        # print(i)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)

    # for j in range(len(cluster_sizes)):
    #     if cluster_sizes[j] == max_cluster_size:
    #         break

    print("The max cluster size is: %d" % (max_cluster_size))
    # print("Number of tweets clustered using hashtag/url: %d" % (word_processor.hashtag_index))
    # print("Number of tweets clustered using text: %d" % (word_processor.text_index))

    # for item in all_clusters[j].get_all_tweets():
    #     print(item)

    # similarity = word_processor.docs_similarity(all_tweets[0][0], all_tweets[0][0])
    # the similarity we get is greater the better, closer to 1 means they are very
    # similar, otherwise very different

    # TODO: after finish clustering, we need to compute the similarity between
    # each cluster and each news we retrieved
    F = float(sys.argv[3])
    related_news_clusters = []

    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1

        article = articles[i]
        text = article["title"] + article["description"]
        time = article["publish_time"]

        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]

            # Remove outlier clusters
            # if single_cluster.get_size() <= 10 or single_cluster.is_clustered:
            if single_cluster.get_size() <= 10:
                continue

            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, time, single_cluster, True)

            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break

        # if max_similarity_index == -1:
        #     continue
        # news_cluster_group["article"] = i
        # news_cluster_group["cluster"] = max_similarity_index
        # related_news_clusters.append(news_cluster_group)
        # all_clusters[max_similarity_index].change_clustered()

    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1

    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]
    print("Number of pairs generated in total: %d" %
          (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
Example #15
 def set_news(self):
     nss = news.News()
     list_news = nss.get_list_of_sm()
     for x in range(0, 8):
         self.main_widget.ids.newsgrid.add_widget(list_news[x])
Example #16
import weather
import news
import quotes
import time
from datetime import date
from PIL.ImageTk import PhotoImage, Image



weather_obj = weather.Weather()
news_obj = news.News()
months = ["January", "February", "March", "April", "May", "June", "July",\
                "August", "September", "October", "November", "December"]
    
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",\
                "Saturday", "Sunday"]

large_icons = {"cloudy": Image.open("bigcloudy.png"), "clear-night": Image.open("bignight.png"), "foggy": Image.open('bigcloudy.png')\
               , "partly-cloudy-night": Image.open("bignightcloudy.png"), "partly-cloudy-day": Image.open("bigpartlycloudy.png"),\
               "rain": Image.open("bigrainy.png"), "clear-day": Image.open("bigsun.png"), "thunderstorm": Image.open("bigstorm.png")}

small_icons = {"cloudy": Image.open("cloudy.png"), "clear-night": Image.open("night.png"), "foggy": Image.open('cloudy.png')\
               , "partly-cloudy-night": Image.open("nightcloudy.png"), "partly-cloudy-day": Image.open("partlycloudy.png"),\
               "rain": Image.open("rain.png"), "clear-day": Image.open("sun.png"), "thunderstorm": Image.open("storm.png")}

year, month, day = str(date.today()).split('-')
week_index = date(int(year), int(month), int(day)).weekday()

def date_as_str():
    return f'{weekdays[week_index]},  {months[int(month) - 1]} {int(day)}, {year}'  # months is zero-indexed
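A self-contained check of the month indexing used above: calendar months run 1-12 while the months list is zero-indexed, hence the - 1.

from datetime import date

months = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]
today = date.today()
print(months[today.month - 1], today.day, today.year)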
Example #17
 def __init__(self):
     self.app = None
     self.news = news.News()
Example #18
	def post(self):
		pieceOfNews = news.News(title = self.request.get('title'), text = self.request.get('text'))
		pieceOfNews.put()
		self.redirect('/add_news')
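The handler above implies that news.News is a Datastore model with title and text properties (it is constructed with those keyword arguments and then put()). A hypothetical model definition of that shape, assuming the classic App Engine db API:

from google.appengine.ext import db

class News(db.Model):
    # hypothetical schema inferred from the handler above
    title = db.StringProperty()
    text = db.TextProperty()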
Example #19
    def test_append(self):
        self.news.append(
            news.News('a', 'a', datetime(1000, 6, 18, 1, 25, 0), 'a'))

        self.assertEqual(len(self.news), 3)
                            # counting starts from zero

Example #20
# Set up the list of links to the found news pages, the URL for the GET request,
# the name of the file for saving the news that match the search conditions,
# and the list of dicts returned in response to the request.
links = []
url, filename = news.url_filename_composer(tag, offset, start_date, end_date)
items = requests.get(url).json()['items']

# Loop that fills the list of links to news pages satisfying the search conditions.
while(items != []):
    for i in range(len(items)):
        if items[i]['fronturl'][:6] != 'https:':
            links.append('https:' + items[i]['fronturl'])
        else:
            links.append(items[i]['fronturl'])
    offset += 10
    url = news.url_filename_composer(tag, offset, start_date, end_date)[0]
    items = requests.get(url).json()['items']

# Build the list of dicts corresponding to the news whose page links are stored
# in links. Each link is used to create a News object, which is converted to a
# dict via its as_dict() method.
# Convert the list of dicts into a pandas.DataFrame and save it to a csv file.
if links == []:
    print("Новостей не найдено")
else:
    news_list = [news.News(link).as_dict() for link in links]
    news_df = pd.DataFrame(news_list, columns = ['Date', 'Time', 'Header', 'Overview', 'Text'])
    news_df.to_csv(filename, index = False, encoding = 'utf-8', mode = 'w')
    print('Сбор и сохранение новостей завершены.')
Example #21
def main():
    # Settings
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False

    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False

    E = float(sys.argv[2])
    # model = Doc2Vec.load("./enwiki_dbow/doc2vec.bin")
    # print("Starts loading the model.")
    doc2vec_model = Doc2VecModel()
    model = doc2vec_model.get_model()
    # print("Model loaded.")
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))

    # print("Starts loading the dataset")
    all_tweets = tweets_api.process_tweets(sys.argv[1])
    # print("Dataset loaded")

    all_tokens = []
    copied_tweets = list(all_tweets)

    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])

        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    # print("pre-processing completed")

    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        # start_total = time.time()
        # first cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(
                tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue

        clustered = False
        # print("Starts clustering %d" % (i))
        token = all_tokens[i]
        for j in range(len(all_clusters)):
            vector = all_clusters[j].get_vector(False)
            single_cluster = all_clusters[j]
            # no common words between the tweet and the cluster, skip
            if not intersection(vector["text"], token["text"]) and \
                not intersection(vector["hashtag"], token["hashtag"]):
                continue

            # vector = single_cluster.get_vector(True)
            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]

            # cluster_dbow_vector = model.infer_vector(vector["text"])
            # similarity = spatial.distance.cosine(tweet_dbow_vector, cluster_dbow_vector)
            # similarity = 1 - similarity
            # if all_text_in_cluster(new_token["text"], vector["text"]):
            #     similarity = 1
            # else:
            #     tweet_dbow_vector = model.infer_vector(new_token["text"])
            #     similarity = word_processor.doc2vec_double_similarity(new_token, vector, tweet_dbow_vector, all_clusters[j])
            tweet_dbow_vector = model.infer_vector(new_token["text"])
            similarity = word_processor.doc2vec_double_similarity(
                new_token, vector, tweet_dbow_vector, all_clusters[j])

            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)

            if similarity >= E:
                tweet = all_tweets[i]
                all_clusters[j].push(tweet[0], tweet[1], token)
                clustered = True
                break

        if not clustered:
            # start_new_cluster = time.time()
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)
            # print("New cluster duration: %s" %
            #       (time.time() - start_new_cluster))

        # print("Total time: %s" % (time.time() - start_total))
        # print("Clustering completed %d" % (i))

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []

    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1

        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]

        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]

            # Remove outlier clusters
            if single_cluster.get_size() < 10:
                continue
            # print("Article %d, Cluster %d." % (i, j))
            cluster_vector = single_cluster.get_vector(True)["text"]
            if not intersection(cluster_vector, text):
                continue
            similarity = word_processor.docs_similarity(text, cluster_vector)

            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)

            if similarity >= F:
                news_cluster_group = {}
                # find all clusters related to the news
                # if i not in news_cluster_group.keys():
                #     news_cluster_group[i] = []
                # cluster_id = single_cluster.get_id()
                # news_cluster_group[i].append((cluster_id, similarity))
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break

        # if news_cluster_group != {}:
        #     related_news_clusters.append(news_cluster_group)

    print("Number of pairs generated in total: %d" %
          (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)

    # for related_pair in related_news_clusters:
    #     article_id = list(related_pair.keys())[0]
    #     print("News is below")
    #     print(articles[article_id])
    #     print("Tweets are below")
    #     cluster_list = list(related_pair.values())[0]
    #     for cluster_id, similarity in cluster_list:
    #         for k in range(len(all_clusters[cluster_id].get_all_tweets())):
    #             print("[%d]: %s: " %
    #                 (k, all_clusters[cluster_id].get_all_tweets()[k]))
    #         print("---------------------------------------------------")
    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " %
                  (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
Example #22
    def getUserName(self):
        getName = self.cam.readUserName()
        print(getName)
        if getName == "Unknown" or getName == "No User Detected":
            self.tk.after(5000, self.getUserName)
        else:
            jsonfile = getName
            with open('./json/' + getName + '.json') as json_file:
                data = json.load(json_file)
            for p in data[jsonfile]:
                clockCheckbox = (p['clockCheckbox'])
                clockFrame = (p['clockFrame'])
                clockSide = (p['clockSide'])
                weatherCheckbox = (p['weatherCheckbox'])
                weatherFrame = (p['weatherFrame'])
                weatherSide = (p['weatherSide'])
                newsCheckbox = (p['newsCheckbox'])
                newsFrame = (p['newsFrame'])
                newsSide = (p['newsSide'])
                newsCategory = (p['newsCategory'])
                stockCheckbox = (p['stockCheckbox'])
                stockFrame = (p['stockFrame'])
                stockSide = (p['stockSide'])
                stockList = (p['stockList'])
                quoteCheckbox = (p['quoteCheckbox'])
            self.splash.pack_forget()
            self.cam.pack_forget()
            self.instructions1.pack_forget()
            self.instructions2.pack_forget()
            #Quotes
            if quoteCheckbox == 'enable':
                self.quotes = quotes.Quotes(self.bottomFrame)
                self.quotes.pack(anchor=N, padx=100, pady=60)
            #clock
            if clockCheckbox == 'enable':
                self.clock = clock.Clock(getattr(self, clockFrame))
                if clockFrame == 'topFrame':
                    self.clock.pack(side=clockSide,
                                    anchor=N,
                                    padx=100,
                                    pady=60)
                else:
                    self.clock.pack(side=clockSide,
                                    anchor=S,
                                    padx=100,
                                    pady=60)
            #weather
            if weatherCheckbox == 'enable':
                self.weather = weather.Weather(getattr(self, weatherFrame),
                                               weatherSide)
                if weatherFrame == 'topFrame':
                    self.weather.pack(side=weatherSide,
                                      anchor=N,
                                      padx=100,
                                      pady=60)
                else:
                    self.weather.pack(side=weatherSide,
                                      anchor=S,
                                      padx=100,
                                      pady=60)
            #news
            if newsCheckbox == 'enable':
                self.news = news.News(getattr(self, newsFrame), newsSide,
                                      newsCategory)
                if newsFrame == 'topFrame':
                    self.news.pack(side=newsSide, anchor=N, padx=100, pady=60)
                else:
                    self.news.pack(side=newsSide, anchor=S, padx=100, pady=60)

            #stock
            if stockCheckbox == 'enable':
                self.stock = stock.Stock(getattr(self, stockFrame), stockList)
                if stockFrame == 'topFrame':
                    self.stock.pack(side=stockSide,
                                    anchor=N,
                                    padx=100,
                                    pady=60)
                else:
                    self.stock.pack(side=stockSide,
                                    anchor=S,
                                    padx=100,
                                    pady=60)
            self.checkStillViewing()
Example #23
    #user_data is a list of the form: username, city, province, lat, lon, count
    username = user_data[0]
    user_city = user_data[1]
    lat = user_data[3]
    lon = user_data[4]
    count = user_data[5]

    #display intro
    print(("Welcome back, {}").format(username))
    displaydateandtime.show_date()
    displaydateandtime.show_time()

    #fill in user object
    current_user = User(username, user_city, lat, lon, count)
    return current_user


if __name__ == "__main__":
    #check if data file exists
    exists = os.path.isfile('user_data_file.txt')
    if exists:
        current_user = ReturningUser()
    else:
        current_user = NewUser()

    print()
    #show weather
    ShowWeather.show_weather(current_user)
    news.News(current_user)

    print('Thank you for using Starlight - By Raza Abbas')
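The User(...) constructor used above is not shown; judging from the call site it takes a username, city, latitude, longitude and a counter. A minimal stand-in with that signature:

class User(object):
    # hypothetical container matching the positional arguments used above
    def __init__(self, username, city, lat, lon, count):
        self.username = username
        self.city = city
        self.lat = lat
        self.lon = lon
        self.count = count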
Example #24
def get_news():
    return news.News()
Example #25
def read_log(log_file):  # check
    treatnames = []
    fo = open(log_file, "r")
    line = fo.readline()
    chunks = re.split("\|\|", line)
    if (chunks[0] == 'g'):
        old = True
        gmarker = 'g'
        treatments = 2
        treatnames = ['0', '1']
        samples = len(chunks) - 1
    else:
        old = False
        gmarker = 'assign'
        treatments = int(chunks[2])
        samples = int(chunks[1])
        line = fo.readline()
        chunks = re.split("\|\|", line)
        for i in range(1, len(chunks)):
            treatnames.append(chunks[i].strip())
    fo.close()
    assert treatments == len(treatnames)
    for i in range(0, treatments):
        print "Treatment ", i, " = ", treatnames[i]
    adv = []
    ints = []
    newsv = []
    for i in range(0, samples):
        adv.append(adVector.AdVector())
        ints.append(interest.Interests())
        newsv.append(news.NewsVector())
    loadtimes = [timedelta(minutes=0)] * samples
    reloads = [0] * samples
    errors = [0] * samples
    xvfbfails = []
    breakout = False
    par_adv = []
    ass = []

    fo = open(log_file, "r")
    r = 0
    sys.stdout.write("Scanning ads")
    for line in fo:
        chunks = re.split("\|\|", line)
        chunks[len(chunks) - 1] = chunks[len(chunks) - 1].rstrip()
        if (chunks[0] == gmarker and r == 0):
            r += 1
            ass = chunks[2:]
            if (old):
                ass = chunks[1:]
            assert len(ass) == samples
            apply_labels_to_vecs(adv, ints, newsv, ass, samples, treatments)
#print ass
        elif (chunks[0] == gmarker and r > 0):
            r += 1
            par_adv.append({
                'adv': adv,
                'newsv': newsv,
                'ass': ass,
                'xf': xvfbfails,
                'interests': ints,
                'break': breakout,
                'loadtimes': loadtimes,
                'reloads': reloads,
                'errors': errors
            })
            sys.stdout.write(".")
            sys.stdout.flush()
            adv = []
            ints = []
            newsv = []
            for i in range(0, samples):
                adv.append(adVector.AdVector())
                ints.append(interest.Interests())
                newsv.append(news.NewsVector())
            loadtimes = [timedelta(minutes=0)] * samples
            reloads = [0] * samples
            errors = [0] * samples
            xvfbfails = []
            breakout = False
            ass = chunks[2:]
            if (old):
                ass = chunks[1:]
            assert len(ass) == samples
            apply_labels_to_vecs(adv, ints, newsv, ass, samples, treatments)
        elif (chunks[0] == 'Xvfbfailure'):
            xtreat, xid = chunks[1], chunks[2]
            xvfbfails.append(xtreat)
        elif (chunks[1] == 'breakingout'):
            breakout = True
        elif (chunks[1] == 'loadtime'):
            t = (datetime.strptime(chunks[2], "%H:%M:%S.%f"))
            delta = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
            id = int(chunks[3])
            loadtimes[id] += delta
        elif (chunks[1] == 'reload'):
            id = int(chunks[2])
            reloads[id] += 1
        elif (chunks[1] == 'errorcollecting'):
            id = int(chunks[2])
            errors[id] += 1
        elif (chunks[1] == 'prepref'):
            id = int(chunks[4])
            ints[id].remove_interest()
        elif (chunks[1] == 'pref'):
            id = int(chunks[4])
            int_str = chunks[3]
            ints[id].set_from_string(int_str)
        elif (chunks[0] == 'news'):
            ind_news = news.News({
                'Time':
                datetime.strptime(chunks[3], "%Y-%m-%d %H:%M:%S.%f"),
                'Title':
                chunks[4],
                'Agency':
                chunks[5],
                'Ago':
                chunks[6],
                'Body':
                chunks[7].rstrip(),
                'Label':
                chunks[2]
            })
            newsv[int(chunks[1])].add(ind_news)
        elif (chunks[0] == 'ad'):
            ind_ad = ad.Ad({
                'Time':
                datetime.strptime(chunks[3], "%Y-%m-%d %H:%M:%S.%f"),
                'Title':
                chunks[4],
                'URL':
                chunks[5],
                'Body':
                chunks[6].rstrip(),
                'cat':
                "",
                'Label':
                chunks[2]
            })
            adv[int(chunks[1])].add(ind_ad)
        else:  # to analyze old log files
            try:
                ind_ad = ad.Ad({
                    'Time':
                    datetime.strptime(chunks[2], "%Y-%m-%d %H:%M:%S.%f"),
                    'Title':
                    chunks[3],
                    'URL':
                    chunks[4],
                    'Body':
                    chunks[5].rstrip(),
                    'cat':
                    "",
                    'label':
                    chunks[1]
                })
                # 	 			ind_ad = ad.Ad({'Time':datetime.strptime(chunks[1], "%Y-%m-%d %H:%M:%S.%f"), 'Title':chunks[2],
                # 	 					'URL': chunks[3], 'Body': chunks[4].rstrip(), 'cat': "", 'label':""})
                adv[int(chunks[0])].add(ind_ad)
            except:
                pass

    r += 1
    par_adv.append({
        'adv': adv,
        'newsv': newsv,
        'ass': ass,
        'xf': xvfbfails,
        'interests': ints,
        'break': breakout,
        'loadtimes': loadtimes,
        'reloads': reloads,
        'errors': errors
    })
    sys.stdout.write(".Scanning complete\n")
    sys.stdout.flush()
    return par_adv, treatnames
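From the chunk indices parsed above, 'ad' and 'news' records are '||'-separated with a fixed field order. Illustrative lines consistent with that parsing (all field values are made up):

import re
# ad||<unit_id>||<label>||<timestamp>||<title>||<url>||<body>
sample_ad = "ad||0||treated||2015-06-01 12:00:00.000000||Example ad title||http://example.com/ad||Example ad body"
# news||<unit_id>||<label>||<timestamp>||<title>||<agency>||<ago>||<body>
sample_news = "news||0||control||2015-06-01 12:00:01.000000||Example headline||Example Agency||1 hour ago||Example body"
print(re.split(r"\|\|", sample_news))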
Example #26
import utils
import news
import plot
import visualizer

news = news.News()
plot = plot.IndexPlot()
user = visualizer.GithubUser()


class MessageHandler():
    def __init__(self, machine):

        self.machine = machine


class HallHandler(MessageHandler):

    help_message = ("Welcome to the hall, "
                    "here are some command you can use:\n"
                    "\n"
                    "help\n"
                    "pop up the help message\n"
                    "\n"
                    "news\n"
                    "Go to bulletin board for breaking news\n"
                    "\n"
                    "vol\n"
                    "Plot the finance voladility")

    def handle(self, event, command):