Example #1
 def test_fetch_bad_status_code(self):
     '''
     Fetch a URL that returns a bad status code
     '''
     responses.add(responses.GET,
                   'http://example.com',
                   status=404,
                   content_type='text/html')
     with self.assertRaises(HTTPError):
         fetch('http://example.com')
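
Note: the `fetch` helper exercised by Examples #1, #3, and #4 is not shown on this page. A minimal sketch consistent with those tests (and with the keyword call `fetch(URL=...)` in Example #2) could look like the following; the `Fetched` container name is hypothetical:

import requests
from bs4 import BeautifulSoup

class Fetched(object):
    # Hypothetical result holder; the tests only require .title and .description.
    def __init__(self, title, description):
        self.title = title
        self.description = description

def fetch(URL):
    # A bad status code raises requests.exceptions.HTTPError (Example #1).
    response = requests.get(URL)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.string if soup.title and soup.title.string else ''
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta.get('content', '') if meta else ''
    return Fetched(title, description)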
Example #2
 def save(self, *args, **kwargs):
     '''
     Check whether the title or description is empty, and scrape them if necessary.
     '''
     # TODO: make this async
     try:
         f = fetch(URL=self.url)
     except HTTPError:
         return super(Link, self).save(*args, **kwargs)
     if self.title == '':
         self.title = f.title
     if self.description == '':
         self.description = f.description
     return super(Link, self).save(*args, **kwargs)
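
Example #2 overrides Django's Model.save() on a Link model. A sketch of the fields that override assumes (the method itself references url, title, and description; everything else here is an assumption):

from django.db import models

class Link(models.Model):
    url = models.URLField()
    # blank=True so the save() override above can fill these in lazily.
    title = models.CharField(max_length=200, blank=True, default='')
    description = models.TextField(blank=True, default='')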
Example #3
    def test_fetch_without_data(self):
        '''
        Fetch with a domain that doesn't have a title or description
        '''
        responses.add(responses.GET,
                      'http://example.com',
                      body='',
                      status=200,
                      content_type='text/html')

        f = fetch('http://example.com')

        self.assertEqual('', f.title)
        self.assertEqual('', f.description)
Example #4
    def test_fetch_receives_data(self):
        '''
        Fetch with a domain that has both a title and a description
        '''
        body = '''
        <title>Example</title>
        <meta content="An example description" name="description">
        '''
        responses.add(responses.GET,
                      'http://example.com',
                      body=body,
                      status=200,
                      content_type='text/html')

        f = fetch('http://example.com')

        self.assertEqual('Example', f.title)
        self.assertEqual('An example description', f.description)
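
Mocks registered with responses.add() only take effect while the responses mock is active, so in the source test class these methods are presumably decorated with @responses.activate. A self-contained illustration of that pattern:

import requests
import responses

@responses.activate
def demo():
    responses.add(responses.GET, 'http://example.com',
                  body='<title>Example</title>', status=200,
                  content_type='text/html')
    # The mocked response is returned instead of a real network call.
    assert requests.get('http://example.com').status_code == 200

demo()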
Example #5
 def stew(self, _url):
     if _url is not None:
         response = fetcher.fetch(self.addSchema(_url), self.proxy)
         self.htmltext = response.text
         self.content = response.content
         self.status = response.status_code
         self.code_reason = response.reason
         self.time_cost = response.elapsed
         self.url = response.url
         self.ok = response.ok
         self.encoding = response.encoding
         self.cookie = response.cookies
         self.header = response.headers
         self.is_redirect = response.is_redirect
         self.history = response.history
         self.soup = BeautifulSoup(self.content, 'html.parser')
         return self.cleanMe(self.soup)
     return None
Example #6
 def __init__(self, _url=None, proxy=None):
     super(webparser, self).__init__()
     self.url = _url
     self.proxy = proxy
     if _url is None:
         return
     response = fetcher.fetch(self.addSchema(_url), self.proxy)
     self.htmltext = response.text
     self.content = response.content
     self.status = response.status_code
     self.code_reason = response.reason
     self.time_cost = response.elapsed
     self.url = response.url
     self.ok = response.ok
     self.encoding = response.encoding
     self.cookie = response.cookies
     self.header = response.headers
     self.is_redirect = response.is_redirect
     self.history = response.history
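
Examples #5 and #6 belong to the same webparser class: __init__ fetches eagerly when a URL is given, while stew() fetches again and returns text cleaned by the class's cleanMe() helper (not shown). A hypothetical usage sketch, assuming the constructor above:

parser = webparser('example.com', proxy=None)  # fetches immediately
print(parser.status, parser.encoding)
text = parser.stew('example.com')              # fetch again and clean the markup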
Example #7
 def test_fetcher(self, session):
     for i in self.input_data():
         url, content, status = i.split("\t")
         status = int(status)
         dm = DocumentMetadata(url)
         mr = MockResponse(content, status)
         session.return_value = mr
         new_dm = fetcher.fetch({}, dm, 1)
         self.assertEqual(dm.url, url)
         if status == 200:
             self.assertEqual(dm.status, 0)
             self.assertEqual(new_dm.status, 0)
             self.assertEqual(new_dm.response.content, content)
         elif status == 301:
             self.assertEqual(dm.status, fetcher.Status.SkipUrl)
         elif status == 0:
             self.assertEqual(dm.status, fetcher.Status.ConnectionError)
          elif 500 <= status < 510:
             self.assertEqual(dm.status, fetcher.Status.SkipUrl)
         else:
             self.assertEqual(dm.status, fetcher.Status.GenericError)
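
The MockResponse used above is not shown. A minimal stand-in consistent with the test's assertions (only .content is required by the assertions; the status attribute name is an assumption about what fetcher.fetch reads):

class MockResponse(object):
    def __init__(self, content, status_code):
        self.content = content
        self.status_code = status_code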
Example #8
def main():
    query, db = get_arguments()

    if db == "psql":
        conn = psycopg2.connect(
            database=env("DATABASE_NAME"),
            user=env("DATABASE_USER"),
            host=env("DATABASE_URL"),
            password=env("DATABASE_PASSWORD"))
    elif db == "sqlite":
        sqlite_path = base_path + "/" + env("SQLITE_NAME")
        conn = sqlite3.connect(sqlite_path)
    else:
        return

    total = 0
    count_fetched = 0
    total_count_book_inserted = 0
    total_count_book_updated = 0
    total_count_category_inserted = 0

    while count_fetched <= total:
        data = fetcher.fetch(query, count_fetched, env('GOOGLE_BOOKS_API_KEY'))
        books = data.get('items', None)
        if books is None:
            break

        count_book_inserted, count_book_updated, count_category_inserted = save(db, conn, books)
        total_count_book_inserted += count_book_inserted
        total_count_book_updated += count_book_updated
        total_count_category_inserted += count_category_inserted
        total = data.get('totalItems', 0)
        count_fetched += len(books)
        print(f'{count_fetched} / {total}')

    conn.close()
    print(f'{total_count_book_inserted} books inserted!')
    print(f'{total_count_book_updated} books updated!')
    print(f'{total_count_category_inserted} categories inserted!')
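
The env() helper used for configuration above is not shown. A common minimal version reads process environment variables (an assumption; the project may use python-dotenv or django-environ instead):

import os

def env(key, default=''):
    # Hypothetical helper: look a configuration value up in the environment.
    return os.environ.get(key, default)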
Example #9
    def start(self):
        """
        Start the crawling phase.

        The job continues until a SIGTERM is caught.
        """
        # starting urls end up in priority
        self.queue.init_priority_list(self.spider.start_urls)
        self.queue.add_bootstrap_urls(self.spider.urllist)

        while not GracefulKiller.kill_now:
            dmeta = self.queue.pop()
            dmeta.spider = self.spider.name
            if dmeta.url:
                # spider filters may have changed since the url was queued, so recheck
                nurl, toremove = self.spider.check_and_normalize(dmeta.url)
                if toremove:
                    self.queue.remove_seen(dmeta.url)
                    # INFO: if normalization changed the url we still want to
                    #       fetch it, but unchanged urls are discarded.
                    if nurl == dmeta.url:
                        continue
                dmeta.url = nurl
                dmeta.alternatives = [nurl]
                dmeta = fetcher.fetch(self.spider.headers, dmeta)

                if dmeta.status == fetcher.Status.ConnectionError:
                    # CHECK: check if this is still correct
                    self.queue.add_seen_and_reschedule(dmeta)
                elif dmeta.response:
                    r_url = self.spider.normalize_url(dmeta.response.url)
                    dmeta.alternatives.append(r_url)
                    document, dmeta = self.spider.parse(dmeta)
                    self.queue.add_normal_urls(dmeta)
                    # INFO: in case of status != 200 previous data will not be overwritten
                    self.documentStore.store(document)
                    self.queue.add_seen_and_reschedule(dmeta)
            self.sleep.wait(self.spider.delay)
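
The loop above runs until GracefulKiller.kill_now flips to True. A widely used sketch of that pattern installs signal handlers that set a class-level flag (the class and attribute names match the snippet above; the rest is an assumption):

import signal

class GracefulKiller(object):
    kill_now = False

    @classmethod
    def _exit(cls, signum, frame):
        # Flip the flag so the crawl loop finishes its current iteration and stops.
        cls.kill_now = True

signal.signal(signal.SIGINT, GracefulKiller._exit)
signal.signal(signal.SIGTERM, GracefulKiller._exit)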