def _getNextArticleBatch(self):
        """Download the next page of articles for the event named in the query parameters.

        Increments ``self._articlePage``, requests that page of articles for
        ``self.queryParams["eventUri"]`` and appends the returned articles to
        ``self._articleList``. When a previous response already reported the
        total number of pages and the new page number exceeds it, the method
        returns without issuing a request.

        On an error response the error is logged and ``self._totalPages`` is
        left unchanged.
        """
        eventUri = self.queryParams["eventUri"]
        # move to the next page to download
        self._articlePage += 1
        # if we have already obtained all pages, then exit
        if self._totalPages is not None and self._articlePage > self._totalPages:
            return
        if self._er._verboseOutput:
            # pass the values as logging arguments so formatting only happens if the record is emitted
            logger.debug("Downloading article page %d from event %s",
                         self._articlePage, eventUri)

        self.setRequestedResult(
            RequestEventArticles(page=self._articlePage,
                                 sortBy=self._articlesSortBy,
                                 sortByAsc=self._articlesSortByAsc,
                                 returnInfo=self._returnInfo,
                                 **self.queryParams))
        res = self._er.execQuery(self)
        # the articles are nested under the event uri; fall back to empty dicts when a level is missing
        eventArticles = res.get(eventUri, {}).get("articles", {})
        if "error" in res:
            logger.error(res["error"])
        else:
            self._totalPages = eventArticles.get("pages", 0)
        self._articleList.extend(eventArticles.get("results", []))
Exemplo n.º 2
0
 def _getNextEventBatch(self):
     """Download the next page of events and append them to ``self._eventList``.

     Increments ``self._eventPage`` and requests that page with
     ``self._eventBatchSize`` events per page. When a previous response
     already reported the total number of pages and the new page number
     exceeds it, the method returns without issuing a request.

     On an error response the error is logged and ``self._totalPages`` is
     left unchanged.
     """
     self._eventPage += 1
     # if we have already obtained all pages, then exit
     if self._totalPages is not None and self._eventPage > self._totalPages:
         return
     self.setRequestedResult(
         RequestEventsInfo(page=self._eventPage,
                           count=self._eventBatchSize,
                           sortBy=self._sortBy,
                           sortByAsc=self._sortByAsc,
                           returnInfo=self._returnInfo))
     if self._er._verboseOutput:
         logger.debug("Downloading event page %d...", self._eventPage)
     res = self._er.execQuery(self)
     events = res.get("events", {})
     if "error" in res:
         # log the error as an argument: string concatenation would raise TypeError if the error is not a str
         logger.error("Error while obtaining a list of events: %s",
                      res["error"])
     else:
         self._totalPages = events.get("pages", 0)
     self._eventList.extend(events.get("results", []))
Exemplo n.º 3
0
 def _getNextArticleBatch(self):
     """Download the next page of articles and append them to ``self._articleList``.

     Increments ``self._articlePage`` and requests that page of article
     results. When a previous response already reported the total number of
     pages and the new page number exceeds it, the method returns without
     issuing a request.

     On an error response the error is logged and ``self._totalPages`` is
     left unchanged.
     """
     self._articlePage += 1
     # if we have already obtained all pages, then exit
     if self._totalPages is not None and self._articlePage > self._totalPages:
         return
     self.setRequestedResult(
         RequestArticlesInfo(page=self._articlePage,
                             sortBy=self._sortBy,
                             sortByAsc=self._sortByAsc,
                             returnInfo=self._returnInfo))
     if self._er._verboseOutput:
         logger.debug("Downloading article page %d...", self._articlePage)
     res = self._er.execQuery(self)
     articles = res.get("articles", {})
     if "error" in res:
         # log the error as an argument: string concatenation would raise TypeError if the error is not a str
         logger.error("Error while obtaining a list of articles: %s",
                      res["error"])
     else:
         self._totalPages = articles.get("pages", 0)
     self._articleList.extend(articles.get("results", []))
Exemplo n.º 4
0
    def __init__(self,
                 apiKey = None,
                 host = None,
                 hostAnalytics = None,
                 minDelayBetweenRequests = 0.5,
                 repeatFailedRequestCount = -1,
                 allowUseOfArchive = True,
                 verboseOutput = False,
                 settingsFName = None):
        """
        Store the connection settings and resolve the API key and hosts to use.

        Explicitly passed arguments take precedence; missing values are read from the settings file (if it exists)
        and otherwise fall back to the built-in default hosts.

        @param apiKey: API key that should be used to make the requests to the Event Registry. API key is assigned to each user account and can be obtained on
            this page: https://newsapi.ai/dashboard
        @param host: host to use to access the Event Registry backend. Use None to use the default host.
        @param hostAnalytics: the host address to use to perform the analytics api calls
        @param minDelayBetweenRequests: the minimum number of seconds between individual api calls
        @param repeatFailedRequestCount: if a request fails (for example, because ER is down), what is the max number of times the request
            should be repeated (-1 for indefinitely)
        @param allowUseOfArchive: default is True. Determines if the queries made should potentially be executed on the archive data.
            If False, all queries (regardless how the date conditions are set) will be executed on data from the last 31 days.
            Queries executed on the archive are more expensive so set it to False if you are just interested in recent data
        @param verboseOutput: if True, additional info about errors etc will be printed to console
        @param settingsFName: If provided it should be a full path to 'settings.json' file where apiKey and/or host can be loaded from.
            If None, we will look for the settings file in the eventregistry module folder
        """
        self._host = host
        self._hostAnalytics = hostAnalytics
        self._lastException = None
        self._logRequests = False
        self._minDelayBetweenRequests = minDelayBetweenRequests
        self._repeatFailedRequestCount = repeatFailedRequestCount
        self._allowUseOfArchive = allowUseOfArchive
        self._verboseOutput = verboseOutput
        self._lastQueryTime = time.time()
        self._headers = {}
        self._dailyAvailableRequests = -1
        self._remainingAvailableRequests = -1

        # lock for making sure we make one request at a time - requests module otherwise sometimes returns incomplete json objects
        self._lock = threading.Lock()
        self._reqSession = requests.Session()
        self._apiKey = apiKey
        self._extraParams = None

        # if there is a settings.json file in the directory then try using it to load the API key from it
        # and to read the host name from it (if custom host is not specified)
        currPath = os.path.split(os.path.realpath(__file__))[0]
        settFName = settingsFName or os.path.join(currPath, "settings.json")
        if apiKey:
            logger.debug("using user provided API key for making requests")

        if os.path.exists(settFName):
            # read the settings inside a context manager so the file handle is closed right after parsing
            with open(settFName) as settFile:
                settings = json.load(settFile)
            self._host = host or settings.get("host", "http://eventregistry.org")
            self._hostAnalytics = hostAnalytics or settings.get("hostAnalytics", "http://analytics.eventregistry.org")
            # the key from the settings file is used only when the caller did not provide one
            if "apiKey" in settings and not apiKey:
                logger.debug("found apiKey in settings file which will be used for making requests")
                self._apiKey = settings["apiKey"]
        else:
            self._host = host or "http://eventregistry.org"
            self._hostAnalytics = hostAnalytics or "http://analytics.eventregistry.org"

        if self._apiKey is None:
            print("No API key was provided. You will be allowed to perform only a very limited number of requests per day.")
        self._requestLogFName = os.path.join(currPath, "requests_log.txt")

        logger.debug("Event Registry host: %s", self._host)
        logger.debug("Text analytics host: %s", self._hostAnalytics)

        # list of status codes - when we get them as a response from the call, we don't want to repeat the query as the response will likely always be the same
        self._stopStatusCodes = {
            204,        # Information not available. Request succeeded, but the requested information is not available.
            400,        # Bad request. The request was unacceptable, most likely due to invalid or missing parameter.
            401,        # User's limit reached. The user reached the limit of the tokens in his account. The requests are rejected.
            403,        # Invalid account. The user's IP or account is disabled, potentially due to misuse.
        }