Example #1
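These methods are excerpted from a larger class, so the module-level imports are not shown. Below is a minimal sketch of what the excerpts appear to need, inferred only from the names they use; the project-local import paths are assumptions and are left commented out.

# Standard-library imports implied by the code below.
import sys
from os import chdir
from os.path import exists
from subprocess import Popen, PIPE, call

# Project-local helpers also used below; their module paths are not shown
# in the excerpt, so these are placeholders rather than confirmed paths.
# from seeds_generator.download import download
# from elastic.search_documents import search, get_more_like_this
# import tfidf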
    def submit_query_terms(self,
                           term_list,
                           max_url_count=15,
                           parallel_cb=None,
                           cached=True):
        """Perform queries to search engine APIs.

        This function only operates when there is no information associated
        with the terms, usually before running extract_terms().

        Args:
            term_list: list of search terms submitted by the user.
            max_url_count: maximum number of URLs to return.
            parallel_cb: optional callback for the parallel download path.
            cached: if True, reuse previously indexed results instead of
                querying the search engine again.

        Returns:
            urls: list of URLs returned by the search engine.
        """

        print('\n\nsubmit_query_terms\n\n')

        chdir(self.memex_home + '/seed_crawler/seeds_generator')

        query = ' '.join(term_list)
        with open('conf/queries.txt', 'w') as f:
            f.write(query)

        if not cached:
            comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar BingSearch -t " + str(
                max_url_count)
            p = Popen(comm, shell=True, stdout=PIPE)
            output, errors = p.communicate()
            print output
            print errors

            call(["rm", "-rf", "html"])
            call(["mkdir", "-p", "html"])
            call(["rm", "-rf", "thumbnails"])
            call(["mkdir", "-p", "thumbnails"])

            # Sequential download on macOS; parallel download (with the
            # optional progress callback) everywhere else.
            if sys.platform == 'darwin':
                download("results.txt")
            else:
                download("results.txt", True, parallel_cb)

            exclude_file = self.memex_home + "/seed_crawler/ranking/exclude.txt"
            if exists(exclude_file):
                call(["rm", exclude_file])

            with open("results.txt", 'r') as f:
                urls = [
                    self.validate_url(line.strip()) for line in f.readlines()
                ]
        else:
            # Reuse previously indexed results instead of querying the API.
            urls = search('text', term_list)[0:max_url_count]

        for url in urls:
            self.urls_set.add(url)

        self.tfidf = tfidf.tfidf(list(self.urls_set))

        return urls  # Results from the search engine
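As an aside, the shell invocation above can be expressed without shell=True by passing an argument list, so max_url_count can never be misinterpreted by a shell. A minimal sketch, assuming the same jar path and working directory as the code above; run_bing_search is a hypothetical helper name, not part of the project:

from subprocess import Popen, PIPE

def run_bing_search(max_url_count):
    # Same command as above, but as an argument list: no shell involved.
    comm = ["java", "-cp",
            "target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar",
            "BingSearch", "-t", str(max_url_count)]
    p = Popen(comm, stdout=PIPE, stderr=PIPE)
    output, errors = p.communicate()
    return output, errors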
  def _getMoreLikeIrrelevantPages(self, session):
    """Return pages similar to those already tagged 'irrelevant'."""
    es_info = self.esInfo(session['domainId'])

    fields = ['url', "x", "y", es_info['mapping']["tag"],
              es_info['mapping']["timestamp"], es_info['mapping']["text"]]

    neg_hits = search(es_info['mapping']['tag'], ['irrelevant'], session['pagesCap'],
                      fields, es_info['activeCrawlerIndex'], 'page', self._es)

    hits = []
    if len(neg_hits) > 0:
      neg_urls = [field['id'] for field in neg_hits]

      # Query the index for documents similar to the irrelevant ones.
      results = get_more_like_this(neg_urls, fields, session['pagesCap'],
                                   es_info['activeCrawlerIndex'], es_info['docType'], self._es)

      hits = neg_hits[0:self._pagesCapTerms] + results

    return hits
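The get_more_like_this helper presumably wraps an Elasticsearch more_like_this query. Below is a minimal sketch of that kind of lookup with elasticsearch-py, under the assumption that similarity is computed over the text field; the function name, query settings, and version (the "like" document syntax is Elasticsearch 2.x-6.x) are illustrative, not the project's actual code:

from elasticsearch import Elasticsearch

def more_like_this_sketch(es, index, doc_type, doc_ids, fields, cap):
    # Ask Elasticsearch for documents similar to the given document ids.
    body = {
        "query": {
            "more_like_this": {
                "fields": ["text"],
                "like": [{"_index": index, "_type": doc_type, "_id": doc_id}
                         for doc_id in doc_ids],
                "min_term_freq": 1,
                "min_doc_freq": 1,
            }
        },
        "_source": fields,
        "size": cap,
    }
    return es.search(index=index, body=body)["hits"]["hits"]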
Example #3
  def _getMoreLikePages(self, session):
    """Return pages similar to those carrying the session's selected tags."""
    es_info = self.esInfo(session['domainId'])

    fields = ['url', "x", "y", es_info['mapping']["tag"],
              es_info['mapping']["timestamp"], es_info['mapping']["text"]]

    hits = []
    tags = session['selected_tags'].split(',')
    for tag in tags:
      tag_hits = search(es_info['mapping']['tag'], [tag], session['pagesCap'],
                        fields, es_info['activeCrawlerIndex'], 'page', self._es)

      if len(tag_hits) > 0:
        tag_urls = [field['id'] for field in tag_hits]

        # Query the index for documents similar to the tagged ones.
        results = get_more_like_this(tag_urls, fields, session['pagesCap'],
                                     es_info['activeCrawlerIndex'], es_info['docType'], self._es)

        hits.extend(tag_hits[0:self._pagesCapTerms] + results)

    return hits
  def _getRelevantPages(self, session):
    """Return pages already tagged 'relevant' for the session's domain."""
    es_info = self.esInfo(session['domainId'])

    fields = ['url', "x", "y", es_info['mapping']["tag"],
              es_info['mapping']["timestamp"], es_info['mapping']["text"]]

    pos_hits = search(es_info['mapping']['tag'], ['relevant'], session['pagesCap'],
                      fields, es_info['activeCrawlerIndex'], 'page', self._es)

    return pos_hits
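For context, the search(...) helper used throughout these methods appears to filter documents by their tag field and return hits keyed by id. A hedged sketch of that lookup with elasticsearch-py; the function name and query shape are assumptions, and the id-plus-source result shape simply mirrors the field['id'] access seen above:

from elasticsearch import Elasticsearch

def search_by_tag_sketch(es, index, tag_field, tags, fields, cap):
    # Match documents whose tag field contains any of the given tags.
    body = {
        "query": {"terms": {tag_field: tags}},
        "_source": fields,
        "size": cap,
    }
    res = es.search(index=index, body=body)
    hits = []
    for h in res["hits"]["hits"]:
        hit = dict(h["_source"])
        hit["id"] = h["_id"]
        hits.append(hit)
    return hits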