Пример #1
0
def connect_to_er(api_key, max_retries=3):
    """Build an Event Registry client, authenticated when a key is given.

    Args:
        api_key: Event Registry API key, or None for an anonymous client.
        max_retries: how many times a failed request is repeated.

    Returns:
        A configured ER.EventRegistry instance.
    """
    client_kwargs = {"repeatFailedRequestCount": max_retries}
    if api_key is not None:
        client_kwargs["apiKey"] = api_key
    return ER.EventRegistry(**client_kwargs)
Пример #2
0
def _dmoz_second_level(analytics, text):
    """Return the 2nd-level component of the first DMOZ category label that
    Event Registry assigns to *text* (e.g. 'Top/Arts/Music' -> 'Music'),
    or None when no usable label comes back.
    """
    # categorize() is deliberately outside the try, matching the original,
    # which only guarded the response parsing.
    cat = analytics.categorize(text)
    try:
        categories = cat.get('categories') or []
        if categories:
            label = categories[0].get('label')
            if label is not None:
                return label.split('/')[2]
    except Exception:
        # Best-effort, mirroring the original bare except: malformed
        # responses are simply skipped.
        pass
    return None


def DMOZ(results):
    """Map every prediction in *results* to a DMOZ category label.

    *results* maps a key to a list whose items are either prediction pairs
    or lists of prediction pairs; element [1] of each pair is the text sent
    to the categorizer.

    Returns:
        dict mapping each key to a list containing one list of labels.
    """
    final_Dmoz = {}
    t0 = time.time()
    # One shared client/Analytics pair for the whole run; the original
    # reconnected for every single prediction, which is needlessly slow.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    for key, value in results.items():
        dmozResults = []
        for j in value:
            # A list item holds several prediction pairs; anything else is
            # a single pair. (The original's `len(v) != ''` check compared
            # an int to a str and was always True; dropped.)
            predictions = j if isinstance(j, list) else [j]
            for prediction in predictions:
                label = _dmoz_second_level(analytics, prediction[1])
                if label is not None:
                    dmozResults.append(label)

        final_Dmoz.setdefault(key, []).append(dmozResults)

    print("### Executed time:", round(time.time() - t0, 3), "s ###")
    return final_Dmoz
Пример #3
0
    def connect_to_service(self):
        """
        Connect to the EventRegistry service using the stored access token.

        Raises an Exception when no access token has been configured;
        otherwise caches and returns the client.
        """
        # Guard clause: fail fast when there is nothing to authenticate with.
        if self._access_token is None:
            raise Exception("[ERROR] No access_token has been specified")
        self._er = ER.EventRegistry(self._access_token)
        return self._er
    def __init__(self, max_repeat_request=-1):
        """Create the event registry collector and its API client.

        Args:
            max_repeat_request (int): Maximum number of times a failed
                request is repeated; -1 means repeat indefinitely
                (Default: -1).

        """
        # The client authenticates with the module-level API_KEY constant.
        self._er = ER.EventRegistry(apiKey=API_KEY,
                                    repeatFailedRequestCount=max_repeat_request)
        # Cap on how many events are requested per query.
        self.MAX_EVENT_REQUESTS = 50
Пример #5
0
def Dmoz(pred):
    """Categorize every prediction in *pred* and append the results to CSVs.

    *pred* maps a key to a list of dicts; each dict value is a sequence whose
    first element is the text sent to Event Registry's categorizer. The
    2nd-level component of each returned DMOZ 'label' is collected per key.

    Side effects: appends one (key, labels) row per key to
    /data/s1931628/latinumbigDatafile.csv, the per-key timings to
    latinumtimeOneParseDmoz.csv and the total runtime to
    latinumtimeOneChunkParseDmozOnly.csv. Returns None.
    """
    timestamps = []
    counter = 1
    t0 = time.time()
    # One client for the whole run; the original rebuilt it per prediction.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    for key, value in pred.items():
        start_time = time.time()
        dmozResults = []
        for j in value:
            for k, v in j.items():
                # Any API/shape error just skips this prediction. BUG FIX:
                # in the original, a mis-indented outer `except: pass`
                # caused the per-key timing and CSV writing below to run
                # only when an exception had occurred; the try/except is
                # now narrowed to the single categorize call.
                try:
                    cat = analytics.categorize(v[0])
                    categories = cat.get('categories') or []
                    if categories:
                        label = categories[0].get('label')
                        if label is not None:
                            dmozResults.append(label.split('/')[2])
                except Exception:
                    pass

        timestamps.append((counter, (time.time() - start_time)))
        counter += 1

        # Append this key's labels immediately so partial runs keep data.
        with open('/data/s1931628/latinumbigDatafile.csv', 'a') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow((key, dmozResults))

    with open('latinumtimeOneParseDmoz.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(timestamps)

    with open('latinumtimeOneChunkParseDmozOnly.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow([(time.time() - t0)])

    print("### Executed time:", round(time.time() - t0, 3), "s ###")
Пример #6
0
    def creating_dataframe(self, dictionary):
        """Build per-user documents from *dictionary* and generate topic labels.

        For every 8-digit id found in the keys of *dictionary*, the raw texts
        are cleaned (tokenize/replace/split/terms_only), accumulated into
        self.userTweets, run through the Models/Labels LDA pipeline, and the
        predicted terms are categorized via Event Registry; the 2nd-level
        DMOZ label of each result is appended to self.userTopicLabels.
        Finally the collected vocabulary is flattened, de-duplicated, indexed
        and saved.

        Args:
            dictionary: mapping whose keys contain 8-digit user ids and whose
                values are iterables of raw text rows (one column).
        """
        final_words = []
        final_words1 = []

        l = []
        z = []
        docs = {}
        keys = dictionary.keys()
        # Collect every 8-digit id embedded in each key (a key may yield
        # zero or more matches).
        for key in keys:
            kk = str(key)
            k = re.findall(r'\d{8}', kk)
            l.append(k)
        # Flatten the per-key match lists into a single id list.
        for i in l:
            for j in i:
                z.append(j)
        for key in z:
            # if key == '19234329':
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            # One-column frame of this user's texts, then a cleaning pipeline.
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
            # NOTE(review): final_words/final_words1/self.userTweets are never
            # reset between iterations, so each user's documents also include
            # every previous user's texts — confirm this is intended.
            [final_words.append("".join(i).strip().split()) for i in df_]
            [final_words1.append(i) for i in final_words if len(i) >= 5]
            [
                self.userTweets.append(re.sub(r' +', " ", (' '.join(i))))
                for i in final_words1
            ]

            if key in docs:
                docs[key].append(self.userTweets)
            else:
                docs[key] = self.userTweets

            print(key, ":", self.userTweets)
            # Per-tweet unique-word lists for this user.
            currentWordsByUser = []
            for i in range(len(self.userTweets)):
                tweetWords = self.userTweets[i].strip("'")
                tweetWords = tweetWords.strip('"')
                tweetWords = tweetWords.strip(",")

                currentWordsByUser.append(list(set(str(tweetWords).split())))

            uniqueWordsByUser = list(
                set(list(itertools.chain.from_iterable(currentWordsByUser))))
            # NOTE(review): the line below was corrupted by source redaction
            # ("******") and is not valid Python as written.
            print("uniqueWordsByUser:"******"len(uniqueWordsByUser):", len(uniqueWordsByUser))
            #append all unique words from each user to global word vector
            self.allWordsFromUsers.append(uniqueWordsByUser)

            ###

            # LDA topic modelling + Wikipedia-based label prediction.
            mm = Models(50, 10, **docs)  #50,10
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            #print(equal_length)
            #print("------")
            #print(frq)

            results = ll.predicting_label(**frq)
            # NOTE(review): `l` is reused here, clobbering the id-match list
            # built above (harmless at this point, but confusing).
            l = []
            for i in range(len(results)):
                # A fresh Event Registry client per prediction.
                er = ER.EventRegistry(
                    apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
                analytics = ER.Analytics(er)
                cat = analytics.categorize(results[i][1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, value in v[0].items():
                            if y == 'label':
                                # Keep the 2nd-level DMOZ category component.
                                l.append(value.split('/')[2])

            self.userTopicLabels.append(l)

        print('########### FINAL FILE EXECUTED ##################')
        self.allWordsFromUsersJoined = list(
            itertools.chain.from_iterable(self.allWordsFromUsers))  #joined
        self.noneDuplicateWordsUsedFromAllUsers = list(
            set(self.allWordsFromUsersJoined))
        self.allUsersIndexing()
        self.savePreprocessedData()
import json
import eventregistry as ER

# Event Registry client used for sentiment analysis (hard-coded key from the
# original example).
er = ER.EventRegistry('569a0bbd-eb92-4249-9434-c401f4d2c4cc')
analytics = ER.Analytics(er)

# Per-file totals of avgSent over Google-related articles.
# NOTE(review): the original module-level name is kept even though it shadows
# the built-in sum().
sum = []

for i in range(30):
    with open('JsonFile{}.json'.format(i), 'r') as fp:
        jsonObj = json.load(fp)
        print(jsonObj)
        # Keep only articles whose title mentions 'Google'.
        # (The original's json.dumps/json.loads round-trip on this list was a
        # no-op and has been removed.)
        newList = []
        for article in jsonObj['articles']:
            if 'Google' in article['title']:
                newList.append(article)
        print(newList)
        sum1 = 0
        for article in newList:
            print(article['description'])
            text = article['description']
            # Articles without a description contribute 0 to the total.
            if text is not None:
                sentiment = analytics.sentiment(text=text)
                sum1 = sum1 + sentiment['avgSent']

        sum.append(sum1)
Пример #8
0
    def creating_dataframe(self, dictionary):
        """Build per-user documents from *dictionary* and print topic labels.

        For every 8-digit id found in the keys of *dictionary*, the raw texts
        are cleaned (tokenize/replace/split/terms_only) into a local document
        list, run through the Models/Labels LDA pipeline, and the predicted
        terms are categorized via Event Registry; each key's 2nd-level DMOZ
        labels are printed (not stored).

        Args:
            dictionary: mapping whose keys contain 8-digit user ids and whose
                values are iterables of raw text rows (one column).
        """
        final_words = []
        final_words1 = []
        documents = []
        l = []
        z = []
        docs = {}
        keys = dictionary.keys()
        # Collect every 8-digit id embedded in each key.
        for key in keys:
            kk = str(key)
            k = re.findall(r'\d{8}', kk)
            l.append(k)
        # Flatten the per-key match lists into a single id list.
        for i in l:
            for j in i:
                z.append(j)
        for key in z:
            # if key == '19234329':
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            # One-column frame of this user's texts, then a cleaning pipeline.
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
            # NOTE(review): final_words/final_words1/documents are never reset
            # between iterations, so each user's documents also include every
            # previous user's texts — confirm this is intended.
            [final_words.append("".join(i).strip().split()) for i in df_]
            [final_words1.append(i) for i in final_words if len(i) >= 5]
            [
                documents.append(re.sub(r' +', " ", (' '.join(i))))
                for i in final_words1
            ]

            if key in docs:
                docs[key].append(documents)
            else:
                docs[key] = documents

            # LDA topic modelling + Wikipedia-based label prediction.
            mm = Models(50, 10, **docs)
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            results = ll.predicting_label(**frq)
            # NOTE(review): `l` is reused here, clobbering the id-match list
            # built above.
            l = []
            for i in range(len(results)):
                # A fresh Event Registry client per prediction.
                er = ER.EventRegistry(
                    apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
                analytics = ER.Analytics(er)
                cat = analytics.categorize(results[i][1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, value in v[0].items():
                            if y == 'label':
                                # Keep the 2nd-level DMOZ category component.
                                l.append(value.split('/')[2])

            print('\n')
            print(key, l)
        print('########### FINAL FILE EXECUTED ##################')
Пример #9
0
import eventregistry as evr
import asyncio
import json
from watson_developer_cloud import ToneAnalyzerV3
from flask import Flask, render_template, send_from_directory
from flask_sockets import Sockets
import threading
import time

# Watson Tone Analyzer client. NOTE(review): the credential literals were
# redacted ("******") in this scraped source; they are placeholders, not
# working values.
tone_analyzer = ToneAnalyzerV3(
    username='******',
    password='******',
    version='2016-05-19')

# Module-level Event Registry client (hard-coded API key) shared by handlers.
er = evr.EventRegistry(apiKey = '3a705c62-c9ae-4c4f-9b94-a0963352b8b3')

def searchTopic(topic):
    async def search(websocket):
        evq = evr.QueryEventsIter(conceptUri=er.getConceptUri(topic))
        evq.addRequestedResult(evr.RequestEventsInfo(sortBy = 'date'))
        for event in evq.execQuery(er):
            evUri = event['uri']
            if event['location'] == None:
                continue
            location = event['location']['label']['eng'] + ',', event['location']['country']['label']['eng']
            articles = 0
            article_content = []
            arq = evr.QueryEventArticlesIter(evUri)
            avg_sentiments = {}
            for article in arq.execQuery(er):
                analysis = tone_analyzer.tone(article['body'], 'emotion')
Пример #10
0
import datetime

import eventregistry

from .ArticleProto_pb2 import ArticleDetail, ArticleList

# Module-level Event Registry client (hard-coded API key from the example).
er = eventregistry.EventRegistry(apiKey='9a66d7d3-b8e3-4fc0-ab52-ed70d71fb121')

# Maps a human-readable publication title to the source URI that
# Event Registry uses to identify that outlet.
source_uri_dict = {
    "National Geographic": "news.nationalgeographic.com",
    "Nature": "nature.com",
    "The Economist": "economist.com",
    "TIME": "time.com",
    "The New York Times": "nytimes.com",
    "Bloomberg Business": "bloomberg.com",
    "CNN": "edition.cnn.com",
    "Fox News": "foxnews.com",
    "Forbes": "forbes.com",
    "Washington Post": "washingtonpost.com",
    "The Guardian": "theguardian.com",
    "The Times": "thetimes.co.uk",
    "Mail Online": "dailymail.co.uk",
    "BBC": "bbc.com",
    "PEOPLE": "people.com",
}


def get_source_uri(source_title):
    if source_title in source_uri_dict:
        return source_uri_dict[source_title]
    else:
Пример #11
0
import eventregistry as ER
import datetime
import pandas as pd
import time
from eventregistry import *

# Event Registry client; alternative API keys for other team members are
# kept commented out below.
er = ER.EventRegistry(apiKey="5ba73408-ea81-459b-abf4-6fedd8cb8ec6")  # dany
#er = ER.EventRegistry(apiKey = "5fed3642-762a-4abc-aabf-ac6213c1bcea")  #philipp
#er = ER.EventRegistry(apiKey = "7571801b-6710-4166-90cc-9c5352ddeedd")  #andi
#er = ER.EventRegistry(apiKey="1b673182-c9e4-4554-90cf-d082a0bd6b53") #  Hendrik?
analytics = ER.Analytics(er)

# DEFINE companies
companies = ['Samsung', 'BASF', 'Apple', 'Tesla', 'Airbus', 'Bayer', 'BMW', 'Telefonica', 'Google', 'Allianz', 'Total']

# DEFINE start and end date
startDate = datetime.date(2018, 7, 18)
endDate = datetime.date(2018, 7, 18)
# Get all Business Days in Period
time_frame = pd.bdate_range(startDate, endDate)


# Set maximum number of articles per day
number_of_articles = 50

# DEFINE df results columns
result = dict()

for company in companies:
    print("- Starting article processing for company :", company)
    # Dictionary
Пример #12
0
 def __init__(self, eventregistry_keys):
     """Store the API key and build the Event Registry client.

     Args:
         eventregistry_keys (dict): must contain a "KEY" entry holding the
             Event Registry API key.
     """
     api_key = eventregistry_keys["KEY"]
     self.KEY = api_key
     self.api = evr.EventRegistry(apiKey=api_key)
Пример #13
0
# -*- coding: utf-8 -*-
from flask import Flask, request
app = Flask(__name__)
import eventregistry
# Module-level Event Registry client (hard-coded API key) shared by the
# request handlers below.
er = eventregistry.EventRegistry(apiKey="23760d8a-beec-49ae-be16-250ff16e2e1f")
#from flask import Flask, request, jsonify
import json
from eventregistry import *

@app.route('/', methods=['GET'])
def search():

    if request.method == 'GET':
        key = request.args.get("keyword")
        lang0 = request.args.get("language")
        typ = request.args.get("type")

    if typ == "event":
        q = QueryEvents(lang=lang0,
                        keywords=key
                        #  sourceLocationUri  != None,
                        #dataType = ["news"])
                        )

        q.setRequestedResult(
            RequestEventsInfo(returnInfo=ReturnInfo(
                #   articleInfo = ArticleInfoFlags(location = True),
                locationInfo=LocationInfoFlags(geoLocation=True))))
        res = er.execQuery(q)
        #        datalist = [[]*3]*100
Пример #14
0
def authenticate():
    """Authenticate an Event Registry session with the key stored on disk.

    Reads the first line of the file at ER_KEY and uses it as the API key.
    """
    with open(ER_KEY, 'r') as key_file:
        token = key_file.read().splitlines()[0]
    return er.EventRegistry(token)
def fetch_event_articles(api_key,
                         min_articles=500,
                         force=False,
                         save_on_api_fail=True,
                         csv_file=None):
    """Download Event Registry events and their articles, caching to CSVs.

    Args:
        api_key: Event Registry API key.
        min_articles: minimum articles per event for the event query.
        force: when True, ignore all on-disk caches and re-download.
        save_on_api_fail: when True, an API failure stops further downloads
            but keeps and saves what was collected; otherwise it is raised.
        csv_file: path for the combined article CSV; defaults to
            'articles-min<min_articles>.csv'.

    Returns:
        (df_events, df_articles) pandas DataFrames.
    """
    event_registry = er.EventRegistry(apiKey=api_key,
                                      repeatFailedRequestCount=2)

    # --- Events: single query, cached as one gzipped CSV. ---
    all_events_gzip_file = op.join('csv',
                                   'events_min%d.csv' % min_articles) + '.gz'
    if not force and op.exists(all_events_gzip_file):
        df_events = pd.read_csv(all_events_gzip_file, compression='gzip')
    else:
        event_data = []
        qei = er.QueryEventsIter(lang='eng',
                                 minArticlesInEvent=min_articles,
                                 maxArticlesInEvent=min_articles * 10)
        for event in qei.execQuery(event_registry, maxItems=1001):
            event_data.append(event)
        df_events = pd.DataFrame(event_data)
        df_events.to_csv(all_events_gzip_file,
                         encoding='utf-8',
                         compression='gzip')
        del event_data

    # --- Articles: per-event download with per-event gzip caches. ---
    # BUG FIX: op.exists(None) raises TypeError, so guard on csv_file first
    # (csv_file=None is the documented default).
    if not force and csv_file and op.exists(csv_file):
        print("Loading articles from disk...")
        df_articles = pd.read_csv(csv_file)
    else:
        event_uris = df_events.uri.tolist()
        event_uris = [ev for ev in event_uris if ev[:3] == 'eng']
        print("Downloading articles for %d events..." % len(event_uris))

        # Request full bodies plus concepts/categories for each article.
        return_info = er.ReturnInfo(articleInfo=er.ArticleInfoFlags(
            bodyLen=-1, concepts=True, categories=True, originalArticle=True))

        all_articles = []
        api_failed = False
        for uri in event_uris:
            print("current uri: ", uri)  # was a Python-2 print statement
            current_event_data = []

            event_gzip_file = op.join('csv', 'event-%s.csv.gz' % uri)
            if not force and op.exists(event_gzip_file):
                tmp_df = pd.read_csv(event_gzip_file, compression='gzip')
            elif api_failed:
                # A previous request failed; don't keep hitting the API.
                print("\tSkipping; API failed.")
                continue
            else:
                # BUG FIX: in the original, this download lived under
                # `elif api_failed:` — articles were fetched only AFTER the
                # API had already failed, and on the normal path tmp_df was
                # left unbound (NameError at the len() check below).
                try:
                    query_iter = er.QueryEventArticlesIter(uri)
                    for article in query_iter.execQuery(
                            event_registry, lang="eng",
                            returnInfo=return_info):
                        current_event_data.append(article)
                except TypeError:
                    # This is how API errors come through.
                    if save_on_api_fail:
                        print("\tWARNING: API failed. Skipping.")
                        api_failed = True  # end loop; we can't continue.
                        continue
                    else:
                        raise

                # Specify columns, so that we skip any empty events.
                tmp_df = pd.DataFrame(current_event_data,
                                      columns=[
                                          'body', 'categories', 'concepts',
                                          'date', 'dateTime', 'eventUri', 'id',
                                          'isDuplicate', 'lang',
                                          'originalArticle', 'sim', 'source',
                                          'time', 'title', 'uri', 'url'
                                      ])
                tmp_df.to_csv(event_gzip_file,
                              encoding='utf-8',
                              compression='gzip')

            if len(tmp_df) == 0:
                print("WARNING: event contains no articles.")
            all_articles.append(tmp_df)

        # Combine all news articles into a single dataframe and persist it.
        df_articles = pd.concat(all_articles)
        csv_file = csv_file or 'articles-min%d.csv' % min_articles
        df_articles.to_csv(csv_file, encoding='utf-8')

    return df_events, df_articles