예제 #1
0
def _csv_int(row, column, default):
    """Return column `column` of CSV row `row` as an int, or `default` when it is absent or empty.

    csv.DictReader yields every cell as a string (or None for cells missing from a short row), so an
    empty cell has to be mapped to the default explicitly instead of being passed to int().
    """
    value = row.get(column)
    if value is None or not value.strip():
        return default
    return int(value)


def _csv_flag(value):
    """Map a CSV cell to the 't' / 'f' flag strings written to the 'extracted' column.

    Cells read from CSV are strings, so a plain truthiness test would treat 'f' or 'false' as true.
    """
    if value is None:
        return 'f'
    if isinstance(value, str):
        return 'f' if value.strip().lower() in ('', '0', 'f', 'false', 'n', 'no') else 't'
    return 't' if value else 'f'


def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    """Import downloads from a CSV file into the 'downloads' table and store their raw content.

    Every CSV row that carries a non-empty '_raw_download_content' column is inserted as a download
    (with the raw content column removed and the numeric / boolean columns cast), and the raw content
    is then passed to the download's handler as a fake response. Rows without raw content are skipped.
    The whole import runs between db.begin() and db.commit().

    :param db: Database handler used to create downloads and passed to the download handler.
    :param csv_file: Path to the UTF-8 encoded CSV file to import; its dialect is sniffed from the first 1024
        characters.
    :raises csv.Error: If the CSV dialect can't be determined or the file can't be parsed.
    :raises ValueError: If one of the integer columns holds a non-numeric value.
    """
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    # newline='' is what the csv module requires so that newlines embedded in quoted fields (e.g. the raw
    # download content) reach the reader untranslated
    with open(csv_file, mode='r', encoding='utf-8', newline='') as f:

        # Guess dialect
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)

        for n, download in enumerate(csv.DictReader(f, dialect=dialect), start=1):
            log.info(f"Importing download {n}...")

            # The raw content is not a column of "downloads", so take it out of the row before inserting
            raw_download_content = download.pop('_raw_download_content', None)
            if not raw_download_content:
                # Nothing to hand to the handler for this row
                continue

            # Cast some columns
            download['feeds_id'] = _csv_int(download, 'feeds_id', None)  # NULL
            download['stories_id'] = _csv_int(download, 'stories_id', None)  # NULL
            download['parent'] = _csv_int(download, 'parent', None)  # NULL
            download['priority'] = _csv_int(download, 'priority', 0)  # NOT NULL
            download['sequence'] = _csv_int(download, 'sequence', 0)  # NOT NULL
            download['extracted'] = _csv_flag(download.get('extracted'))

            # Will be rewritten by handle_download()
            download['path'] = ''

            created_download = db.create(table='downloads', insert_hash=download)

            # Create mock response to import it
            response = FakeResponse(content=raw_download_content)
            handler = handler_for_download(db=db, download=created_download)
            handler.store_response(db=db,
                                   download=created_download,
                                   response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
예제 #2
0
    def test_fetch_handle_download(self):
        """Fetch a Univision feed download, store the response and verify the resulting download rows."""
        credentials = self.univision_credentials()

        medium_row = {
            'name': f"Media for test feed {credentials.url}",
            'url': 'http://www.univision.com/',
        }
        medium = self.db.create(table='media', insert_hash=medium_row)

        feed_row = {
            'name': 'feed',
            'type': 'univision',
            'url': credentials.url,
            'media_id': medium['media_id'],
        }
        feed = self.db.create(table='feeds', insert_hash=feed_row)

        download = create_download_for_feed(db=self.db, feed=feed)

        # The handler chosen for this download has to be the Univision one
        default_handler = handler_for_download(db=self.db, download=download)
        assert isinstance(default_handler, DownloadFeedUnivisionHandler)

        # The actual fetch is done by a handler built with the mock crawler configuration
        handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())

        response = handler.fetch_download(db=self.db, download=download)
        assert response

        handler.store_response(db=self.db, download=download, response=response)

        # Re-read the download to see what storing the response did to it
        download = self.db.find_by_id(table='downloads', object_id=download['downloads_id'])
        assert download
        assert download['state'] == 'success', f"Download's state is not 'success': {download['state']}"
        assert not download['error_message'], f"Download's error_message should be empty: {download['error_message']}"

        if not self.expect_to_find_some_stories():
            return

        # Pending content downloads that belong to the same feed
        pending_story_downloads_sql = """
                SELECT *
                FROM downloads
                WHERE feeds_id = %(feeds_id)s
                  AND type = 'content'
                  AND state = 'pending'
            """
        query_params = {
            'feeds_id': download['feeds_id'],
        }
        story_downloads = self.db.query(pending_story_downloads_sql, query_params).hashes()
        assert story_downloads, 'One or more story downloads were derived from feed'
예제 #3
0
def test_invalid_feed_url():
    """Try fetching a funky URL and expect the fetcher to raise a soft error."""
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    download = db.create(table='downloads', insert_hash={
        'url': 'file:///etc/passwd',
        'host': 'localhost',
        'type': 'feed',
        'state': 'pending',
        'priority': 0,
        'sequence': 1,
        'feeds_id': test_feed['feeds_id'],
    })

    handler = handler_for_download(db=db, download=download)

    # pytest.raises() no longer accepts the "message" argument (deprecated in pytest 4.1, removed in 5.0);
    # the documented replacement is to call pytest.fail() at the end of the "with" body, which is only
    # reached when the expected exception was not raised.
    with pytest.raises(McCrawlerFetcherSoftError):
        handler.fetch_download(db=db, download=download)
        pytest.fail("Invalid URL should be a soft exception")
예제 #4
0
    def _fetch_and_handle_response(
            self,
            path: str,
            downloads_id: Optional[int] = None) -> Dict[str, Any]:
        """Fetch a download and store its response, then return the download as re-read from the database.

        :param path: URL path appended to "http://localhost:<self.port>" when a new download has to be created.
        :param downloads_id: ID of an existing download to reuse; when not set, a new pending feed download is
            created for self.feed.
        :return: The download row looked up again after the response has been stored.
        """
        if not downloads_id:
            # No existing download to reuse, so create a pending feed download for the local test URL
            new_download = {
                'url': f"http://localhost:{self.port}{path}",
                'host': 'localhost',
                'type': 'feed',
                'state': 'pending',
                'priority': 0,
                'sequence': 1,
                'feeds_id': self.feed['feeds_id'],
            }
            download = self.db.create(table='downloads', insert_hash=new_download)
            downloads_id = download['downloads_id']
        else:
            download = self.db.find_by_id(table='downloads', object_id=downloads_id)

        handler = handler_for_download(db=self.db, download=download)

        response = handler.fetch_download(db=self.db, download=download)
        assert response

        handler.store_response(db=self.db, download=download, response=response)

        # Look the download up again so that the caller gets the row as it is after storing the response
        return self.db.find_by_id(table='downloads', object_id=downloads_id)