Example #1
    def fetch_pages_helper(self, urls, start, step, cache, results):
        """
        Helper function for parallel fetching 
        """
        max_size = 5000000  # cap on response size (about 5 MB)
        pages = []

        for i in range(start, len(urls), step):
            url = urls[i]
            if (i + 1) % 500 == 0:
                print "Fetched", i + 1, "urls"
            page = Page(url)
            try:
                text = ''
                size = 0
                res = requests.get(url,
                                   headers=self.header,
                                   verify=False,
                                   timeout=5,
                                   stream=True)
                # Stream the response in 10 KB chunks so oversized pages can be aborted
                for chunk in res.iter_content(10000):
                    text += chunk
                    size += len(chunk)
                    if size > max_size:
                        print "Size exceeds ", size
                        raise ValueError('response too large')

                if res.status_code == 200:
                    if len(text) < self.max_html_size:
                        page.add_html(text)
                else:
                    print "Failed to fetch", url, res.status_code, start
            except Exception:
                print "Failed to fetch", url
                continue

            # Save to the cache. Note that we always save the fetched page, even if the
            # request failed, since we want to avoid re-fetching these pages in the future.
            if self.caching:
                cache.add(url, page.get_json_obj())
            else:
                page.get_json_obj()  # hack: still run the extraction so the page object is populated

            # Keep only pages with a non-trivial amount of body text
            if page.body and (len(page.get_text('body')) > 100):
                pages.append(page)
        results.put(pages)
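
A natural question is how fetch_pages_helper is driven. Below is a minimal sketch of such a driver, assumed to live on the same class; the method name fetch_pages, the num_workers parameter, and the self.cache attribute are assumptions for illustration and not part of the code above. Each worker thread takes a different start offset and a stride of num_workers, so the url list is partitioned without copying, and each worker puts exactly one list of pages on the shared queue.

    import threading
    from Queue import Queue  # Python 2; the module is named `queue` on Python 3

    def fetch_pages(self, urls, num_workers=8):
        """Hypothetical driver: fan urls out to worker threads and collect the pages."""
        results = Queue()
        cache = self.cache if self.caching else None  # assumed cache object
        workers = []
        for start in range(num_workers):
            # Worker `start` handles urls[start], urls[start + num_workers], ...
            t = threading.Thread(target=self.fetch_pages_helper,
                                 args=(urls, start, num_workers, cache, results))
            t.start()
            workers.append(t)
        for t in workers:
            t.join()

        # Drain one page list per worker from the queue
        pages = []
        for _ in range(num_workers):
            pages.extend(results.get())
        return pages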
Example #2
    def fetch(self, urls, out_file, extraction=True):
        """
        Parameters:
        -----------
        urls: list of url. Each url represents a website

        Returns:
        --------
        list<website>: list of fetched websites
        """
        # Skip urls that were already fetched and written to out_file
        if os.path.exists(out_file):
            fetched_urls = set()
            with open(out_file) as lines:
                for line in lines:
                    try:
                        jsobj = json.loads(line)
                        fetched_urls.add(jsobj['url'])
                    except Exception:
                        traceback.print_exc()
            urls = [url for url in urls if url not in fetched_urls]

        print "Number of urls to fetch: ", len(urls)
        out = open(out_file, 'a+')

        for i, url in enumerate(urls):
            if (i + 1) % 20 == 0:
                print "Fetched", i + 1, "urls"
            try:
                res = requests.get(url,
                                   headers=self.header,
                                   verify=False,
                                   timeout=10)
                if res.status_code == 200:
                    page = Page(url)
                    if len(res.text) < self.max_html_size:
                        page.add_html(res.text)
                        if extraction:
                            jspage = page.get_json_obj()
                        else:
                            jspage = {'url': url, 'html': res.text}
                        out.write(json.dumps(jspage) + '\n')
                else:
                    print res.status_code, url
            except Exception:
                print "Failed to fetch", url
                traceback.print_exc()

        out.close()
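
A short usage sketch follows. The class name Fetcher, its no-argument constructor, and the output path pages.jsonl are assumptions for illustration; only the fetch() call itself comes from the code above.

    fetcher = Fetcher()
    urls = ['http://example.com', 'http://example.org']

    # With extraction=True each fetched page is appended to pages.jsonl as the JSON
    # object produced by Page.get_json_obj(); with extraction=False only
    # {'url': ..., 'html': ...} is stored.
    fetcher.fetch(urls, 'pages.jsonl', extraction=True)

    # Because fetch() first reads out_file and skips urls already recorded there,
    # re-running the same call resumes an interrupted crawl instead of re-fetching.
    fetcher.fetch(urls, 'pages.jsonl', extraction=True)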