Example #1
  def get_next_level_page(self, soup):
    ''' collect links from a parsed page and enqueue them for the next crawl level '''
    page_links = []

    # resolve every anchor href against the current page URL,
    # then keep only links that pass the validity check
    for link in soup.find_all('a'):
      if not link.get('href'):
        continue
      newlink = urljoin(self.page.url, link.get('href'))
      retlink = vc.various_check(newlink)
      if retlink:
        page_links.append(retlink)

    # drop duplicate links found on the same page
    page_links = list(set(page_links))

    # enqueue each link not yet visited, one level deeper than the current page
    for link in page_links:
      if not self.check_duplication(link):
        page = Page(link, self.page.depth + 1, self.page.score, ref=self.page.url)
        self.queue.en_queue(page)
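
For reference, the method above leans on a Page record and a queue that exposes en_queue. Below is a minimal sketch of what those collaborators might look like, assuming only the fields and methods the snippet actually touches; the project's real Page and queue classes may carry more state.

from collections import deque

# Hypothetical collaborators assumed by get_next_level_page (illustrative only)
class Page:
    def __init__(self, url, depth, score, ref=None):
        self.url = url      # absolute URL of the page
        self.depth = depth  # crawl depth from the seed page
        self.score = score  # relevance score inherited from the parent
        self.ref = ref      # URL of the referring page

class CrawlQueue:
    def __init__(self):
        self._q = deque()

    def en_queue(self, page):
        # FIFO order gives a breadth-first crawl
        self._q.append(page)

    def de_queue(self):
        return self._q.popleft() if self._q else None
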
Example #2
    def query(self):
        ''' send a Bing search query and return the top 10 results as a list of URLs '''

        # in fake mode, skip the network request and return canned results
        if self.fake:
            return ['http://engineering.nyu.edu', 'http://www.nyu.edu']

        my_referer = 'https://www.bing.com/'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'referer': my_referer
        }

        # build the search query string
        query_string = self.make_query_string()
        params = {'q': query_string}

        # make the Bing HTTP request
        bing_url = "http://www.bing.com/search"
        try:
            response = requests.get(bing_url,
                                    params=params,
                                    headers=headers,
                                    timeout=0.5)
        except Exception:
            # network failure or timeout: give up on this query
            return None

        # parse the result page
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')

        ret = []

        # Bing wraps each result title in an <h2>; resolve the anchors to
        # absolute URLs and keep only those that pass the validity check
        for link in soup.find_all('h2'):
            for linka in link.find_all('a'):
                if linka.get('href'):
                    newlink = urljoin(response.url, linka.get('href'))
                    if newlink.startswith('http'):
                        retlink = vc.various_check(newlink)
                        if retlink:
                            ret.append(retlink)

        # return up to 10 unique result URLs
        return list(set(ret))[:10]
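
To see the parsing step in isolation, here is a self-contained sketch that runs the same <h2>/<a> extraction against a made-up HTML snippet instead of a live Bing response; the snippet and base_url are illustrative only, not real Bing markup.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Fabricated stand-in for a search results page
html = '''
<h2><a href="/go?u=1">First result</a></h2>
<h2><a href="http://example.com/page">Second result</a></h2>
<h2>No anchor here</h2>
'''

soup = BeautifulSoup(html, 'html.parser')
base_url = 'http://www.bing.com/search'

links = []
for h2 in soup.find_all('h2'):
    for a in h2.find_all('a'):
        if a.get('href'):
            # relative hrefs are resolved against the response URL
            absolute = urljoin(base_url, a.get('href'))
            if absolute.startswith('http'):
                links.append(absolute)

print(links)
# ['http://www.bing.com/go?u=1', 'http://example.com/page']
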
Example #3
  def query(self):
    ''' send a Google search query and return the top 10 results as a list of URLs '''

    # in fake mode, skip the network request and return canned results
    if self.fake:
      return ['http://engineering.nyu.edu', 'http://www.nyu.edu']

    my_referer = 'http://www.google.com'
    headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
      'referer': my_referer
    }

    # build the search query string
    query_string = self.make_query_string()
    params = {'q': query_string}

    # make the Google HTTP request
    google_url = "http://www.google.com/search"
    try:
      response = requests.get(google_url, params=params, headers=headers, timeout=0.5)
    except Exception:
      # network failure or timeout: flag the error and give up on this query
      self.error = 1
      return None

    # parse the result page
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')

    ret = []

    # Google wraps each result title in an <h3>; skip anchors that carry an
    # inline style attribute, resolve the rest to absolute URLs, and keep
    # only those that pass the validity check
    for link in soup.find_all('h3'):
      for linka in link.find_all('a'):
        if not linka.get('style'):
          if linka.get('href'):
            newlink = urljoin(response.url, linka.get('href'))
            retlink = vc.various_check(newlink)
            if retlink:
              ret.append(retlink)

    # return up to 10 unique result URLs
    return list(set(ret))[:10]
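
Both query variants share the same fake-mode short circuit, which makes the method testable without network access. A minimal stand-in demonstrating just that first branch, assuming a hypothetical Searcher shell (the real class that owns query has more members):

# Hypothetical shell reproducing only the fake-mode branch of query()
class Searcher:
    def __init__(self, fake=False):
        self.fake = fake
        self.error = 0

    def query(self):
        if self.fake:
            # canned seed URLs, exactly as in the methods above
            return ['http://engineering.nyu.edu', 'http://www.nyu.edu']
        ...  # live request path omitted in this sketch

seeds = Searcher(fake=True).query()
print(seeds)  # ['http://engineering.nyu.edu', 'http://www.nyu.edu']
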