def _getMostRecentPages(self, session):
    """Return the most recent pages for the session's domain.

    Dispatches to one of three Elasticsearch query helpers depending on
    which session fields are set:
      * no ``fromDate``            -> most recent documents (optionally filtered)
      * ``fromDate``, no ``filter``-> timestamp range query
      * ``fromDate`` and ``filter``-> multi-field query (text filter + date range)

    :param session: dict with keys 'domainId', 'pagesCap', 'filter',
                    'fromDate', 'toDate'.
    :returns: list of hits, each containing url, x, y, tag, timestamp and text
              fields (per the domain's field mapping).
    """
    es_info = self.esInfo(session['domainId'])

    hits = []
    if session['fromDate'] is None:
        hits = get_most_recent_documents(session['pagesCap'], es_info['mapping'],
                                         ["url", "x", "y", es_info['mapping']["tag"],
                                          es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                                         session['filter'], es_info['activeCrawlerIndex'],
                                         es_info['docType'],
                                         self._es)
    else:
        if(session['filter'] is None):
            # NOTE(review): `range` here is presumably a project query helper that
            # shadows the builtin — confirm against this module's imports.
            hits = range(es_info['mapping']["timestamp"], session['fromDate'], session['toDate'],
                         ['url', "x", "y", es_info['mapping']['tag'],
                          es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                         True, session['pagesCap'],
                         es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        else:
            # BUGFIX: was .replace('"','\"'), which replaces '"' with '"' (a no-op),
            # so embedded double quotes were never escaped for the query string.
            # Escape them as \" so the filter survives inside the parenthesized query.
            s_fields = {
                es_info['mapping']["text"]: "(" + session['filter'].replace('"', '\\"') + ")",
                es_info['mapping']["timestamp"]: "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]"
            }
            hits = multifield_query_search(s_fields, session['pagesCap'],
                                           ["url", "x", "y", es_info['mapping']["tag"],
                                            es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                                           es_info['activeCrawlerIndex'], es_info['docType'],
                                           self._es)
    return hits
def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None): es_info = self.esInfo(session['domainId']) # If ts1 not specified, sets it to -Infinity. if opt_ts1 is None: now = time.localtime(0) opt_ts1 = float(time.mktime(now)) * 1000 else: opt_ts1 = float(opt_ts1) # If ts2 not specified, sets it to now. if opt_ts2 is None: now = time.localtime() opt_ts2 = float(time.mktime(now)) * 1000 else: opt_ts2 = float(opt_ts2) if opt_applyFilter: # TODO(Yamuna): apply filter if it is None. Otherwise, match_all. results = get_most_recent_documents(session['pagesCap'], es_info['mapping'], ["url", es_info['mapping']["tag"]], session['filter'], es_info['activeCrawlerIndex'], es_info['docType'], \ self._es) else: results = \ range(es_info['mapping']["timestamp"], opt_ts1, opt_ts2, ['url',es_info['mapping']['tag']], True, session['pagesCap'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es) relevant = 0 irrelevant = 0 neutral = 0 # TODO(Yamuna): Double check the return values for crawler for res in results: try: tags = res[es_info['mapping']['tag']] if 'Relevant' in res[es_info['mapping']['tag']]: relevant = relevant + 1 elif 'Irrelevant' in res[es_info['mapping']['tag']]: irrelevant = irrelevant + 1 else: # Page has tags, but not Relevant or Irrelevant. neutral = neutral + 1 except KeyError: # Page does not have tags. neutral = neutral + 1 return { \ 'Relevant': relevant, 'Irrelevant': irrelevant, 'Neutral': neutral }
def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    """Summarize tagged pages for the seed crawler as a tag-count breakdown.

    :param opt_ts1: window start (epoch seconds); defaults to the epoch when None.
    :param opt_ts2: window end (epoch seconds); defaults to now (UTC) when None.
    :param opt_applyFilter: when True and session['filter'] is non-empty, query
                            by the filter text instead of by time range.
    :param session: dict with 'domainId', 'pagesCap', 'filter'.
    :returns: dict {'Relevant': int, 'Irrelevant': int, 'Neutral': int}.
    """
    es_info = self.esInfo(session['domainId'])
    tag_field = es_info['mapping']['tag']

    # Default the window to [epoch, now], both as UTC epoch seconds.
    opt_ts1 = float(calendar.timegm(time.gmtime(0))) if opt_ts1 is None else float(opt_ts1)
    opt_ts2 = float(calendar.timegm(time.gmtime())) if opt_ts2 is None else float(opt_ts2)

    if opt_applyFilter and session['filter'] != "":
        # Text-filtered query over the most recent documents.
        results = get_most_recent_documents(session['pagesCap'], es_info['mapping'],
                                            ["url", tag_field],
                                            session['filter'],
                                            es_info['activeCrawlerIndex'], es_info['docType'],
                                            self._es)
    else:
        # Timestamp range query (project helper shadowing the builtin `range`).
        results = range(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                        ['url', tag_field], True, session['pagesCap'],
                        es_index=es_info['activeCrawlerIndex'],
                        es_doc_type=es_info['docType'],
                        es=self._es)

    counts = {'Relevant': 0, 'Irrelevant': 0, 'Neutral': 0}
    for page in results:
        try:
            tags = page[tag_field]
        except KeyError:
            # Untagged page.
            counts['Neutral'] += 1
            continue
        if 'Irrelevant' in tags:
            counts['Irrelevant'] += 1
        elif "" in tags:
            # An empty tag is treated as neutral.
            counts['Neutral'] += 1
        else:
            # Tagged Relevant or with a custom tag.
            counts['Relevant'] += 1

    return counts