def submit_query_terms(self, term_list, max_url_count=15, parallel_cb=None, cached=True):
    # Perform queries to Search Engine APIs.
    # This function only operates when there is no information associated with the terms,
    # usually before running extract_terms().
    #
    # Args:
    #   term_list: list of search terms that are submitted by the user
    # Returns:
    #   urls: list of URLs that are returned by the Search Engine
    print '\n\nsubmit_query_terms\n\n'

    chdir(self.memex_home + '/seed_crawler/seeds_generator')

    # Write the query terms to the configuration file read by the seeds generator.
    query = ' '.join(term_list)
    with open('conf/queries.txt', 'w') as f:
        f.write(query)

    if not cached:
        # Run the Bing search client to fetch fresh results for the query.
        comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar BingSearch -t " + str(max_url_count)
        p = Popen(comm, shell=True, stdout=PIPE)
        output, errors = p.communicate()
        print output
        print errors

        # Reset the html and thumbnail output directories.
        call(["rm", "-rf", "html"])
        call(["mkdir", "-p", "html"])
        call(["rm", "-rf", "thumbnails"])
        call(["mkdir", "-p", "thumbnails"])

        #if sys.platform in ['darwin', 'linux2']:
        if sys.platform in ['darwin']:
            download("results.txt")
        else:
            download("results.txt", True, parallel_cb)

        if exists(self.memex_home + "/seed_crawler/ranking/exclude.txt"):
            call(["rm", self.memex_home + "/seed_crawler/ranking/exclude.txt"])

        with open("results.txt", 'r') as f:
            urls = [self.validate_url(line.strip()) for line in f.readlines()]
    else:
        # Cached path: look up the terms with the local search function instead of
        # calling the Bing client again.
        urls = search('text', term_list)[0:max_url_count]

    for url in urls:
        self.urls_set.add(url)

    self.tfidf = tfidf.tfidf(list(self.urls_set))

    return urls  # Results from Search Engine
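# Hypothetical usage sketch (not part of the original source). It assumes this
# method lives on an initialized crawler/model object exposing memex_home and
# urls_set, and that the module already provides the names used above, e.g.
# something along the lines of:
#   import sys
#   from os import chdir
#   from os.path import exists
#   from subprocess import Popen, PIPE, call
# A cached query could then be issued as:
#   urls = model.submit_query_terms(['ebola', 'outbreak'], max_url_count=10, cached=True)
# Besides returning the URLs, the call accumulates them in model.urls_set and
# rebuilds the tf-idf index over that set.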
def _getMoreLikeIrrelevantPages(self, session):
    # Fetch pages tagged 'irrelevant' for the session's domain, then expand them
    # with similar pages returned by a more-like-this query.
    es_info = self.esInfo(session['domainId'])

    neg_hits = search(es_info['mapping']['tag'], ['irrelevant'], session['pagesCap'],
                      ['url', "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                      es_info['activeCrawlerIndex'], 'page', self._es)

    hits = []
    if len(neg_hits) > 0:
        neg_urls = [field['id'] for field in neg_hits]
        results = get_more_like_this(neg_urls,
                                     ['url', "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                                     session['pagesCap'],
                                     es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        hits = neg_hits[0:self._pagesCapTerms] + results

    return hits
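# Hedged sketch of the session dict that these _get*Pages helpers appear to
# expect, inferred from the keys accessed above (the values are illustrative,
# not taken from the source):
#   session = {
#       'domainId': '<domain id>',       # resolved to an index/mapping via self.esInfo()
#       'pagesCap': 100,                 # maximum number of pages to retrieve
#       'selected_tags': 'relevant',     # only used by _getMoreLikePages below
#   }
#   irrelevant_and_similar = self._getMoreLikeIrrelevantPages(session)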
def _getMoreLikePages(self, session):
    # For each tag selected in the session, fetch the pages carrying that tag and
    # expand them with similar pages returned by a more-like-this query.
    es_info = self.esInfo(session['domainId'])

    hits = []
    tags = session['selected_tags'].split(',')
    for tag in tags:
        tag_hits = search(es_info['mapping']['tag'], [tag], session['pagesCap'],
                          ['url', "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                          es_info['activeCrawlerIndex'], 'page', self._es)
        if len(tag_hits) > 0:
            tag_urls = [field['id'] for field in tag_hits]
            results = get_more_like_this(tag_urls,
                                         ['url', "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                                         session['pagesCap'],
                                         es_info['activeCrawlerIndex'], es_info['docType'], self._es)
            hits.extend(tag_hits[0:self._pagesCapTerms] + results)

    return hits
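# Illustrative post-processing (assumption: each hit behaves like a dict that
# carries an 'id' plus the requested fields such as 'url', as suggested by the
# field['id'] access above):
#   hits = self._getMoreLikePages(session)
#   urls = [hit.get('url') for hit in hits]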
def _getRelevantPages(self, session):
    # Fetch pages tagged 'relevant' for the session's domain, up to the session's page cap.
    es_info = self.esInfo(session['domainId'])

    pos_hits = search(es_info['mapping']['tag'], ['relevant'], session['pagesCap'],
                      ['url', "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                      es_info['activeCrawlerIndex'], 'page', self._es)

    return pos_hits
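# Hypothetical caller (not in the source): the positively and negatively
# labeled pages can be gathered side by side, e.g. as input for term
# extraction or for training a relevance classifier:
#   pos_hits = self._getRelevantPages(session)
#   neg_hits = self._getMoreLikeIrrelevantPages(session)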