示例#1
0
    def test_dynamodb_storage(self):
        """Check how ``DynamoDBStorage.dynamo_item`` encodes an item payload.

        The assertions expect:

        * the ``my_hash_key`` attribute to hold ``"<category>-<url>"`` under
          the ``'S'`` type tag,
        * string values to appear unchanged under ``'S'``,
        * dict and list values to appear as JSON text under ``'S'`` whose
          decoded entries match the originals,
        * any other value to appear as ``str(value)`` under ``'N'``.
        """
        config = {
            "type": "DynamoDBStorage",
            "dynamodb_table_name": "test_table",
            "partition_key": "my_hash_key",
            "partition_key_format_string": "{category}-{url}"
        }
        item = {
            "category": "SomeCategory",
            "url": "http://google.com",
            "dict_test": {"corn": "husk"},
            "list_test": [1, 2, "corn"],
            "num": 4.02
        }

        manager = AWSManager()
        dynamostorage = DynamoDBStorage(manager, config)
        dynamo_item = dynamostorage.dynamo_item(Item(item_type="", payload=item))
        print(dynamo_item)
        self.assertEqual(dynamo_item['my_hash_key']['S'],
                         "%s-%s" % (item['category'], item['url']))
        for key, value in item.items():
            if isinstance(value, str):
                self.assertEqual(dynamo_item[key]['S'], value)
            elif isinstance(value, dict):
                decoded = json.loads(dynamo_item[key]['S'])
                # A distinct name for the nested key keeps the outer loop
                # variable intact while walking the nested dict.
                for inner_key, expected in value.items():
                    self.assertEqual(decoded[inner_key], expected)
            elif isinstance(value, list):
                decoded = json.loads(dynamo_item[key]['S'])
                # Index into the decoded list so a too-short result still
                # surfaces as an error rather than being silently skipped.
                for index, expected in enumerate(value):
                    self.assertEqual(decoded[index], expected)
            else:
                self.assertEqual(dynamo_item[key]['N'], str(value))
示例#2
0
 def test_defaults(self):
     """The source's ``item_type`` should equal its ``_defaults`` entry.

     The config used here carries no ``item_type`` key of its own.
     """
     source = StaticFileSource(
         AWSManager(),
         {
             'source_url': 'https://www.gutenberg.org/files/54386/54386-0.txt',
             's3_bucket_name': 'antennatest42',
             'destination_key': 'gutenberg.txt',
         },
     )
     default_type = source._defaults['item_type']
     self.assertEqual(default_type, source.item_type)
示例#3
0
    def test_local_file_source(self):
        """Exercise ``StaticFileSource``'s new-data flag around one full yield.

        The destination object is deleted from the bucket first. The source
        is then expected to report new data, yield exactly one item, and
        report no new data afterwards.
        """
        settings = {
            'source_url': 'https://www.gutenberg.org/files/54386/54386-0.txt',
            's3_bucket_name': 'antennatest42',
            'destination_key': 'gutenberg.txt',
        }
        manager = AWSManager()

        # Remove any object left at the destination key so the run starts
        # without it.
        s3 = manager.get_client('s3')
        s3.delete_object(
            Bucket=settings['s3_bucket_name'],
            Key=settings['destination_key'],
        )

        source = StaticFileSource(manager, settings)
        self.assertTrue(source.has_new_data())

        yielded = list(source.yield_items())
        self.assertEqual(1, len(yielded))
        self.assertFalse(source.has_new_data())
示例#4
0
 def test_newspaper_lib(self):
     """Smoke-run ``NewspaperLibSource`` and print each yielded item's URL.

     No assertions are made; the test only fails if construction,
     iteration, or the payload lookup raises.
     """
     # Unused URL kept from the original (presumably an alternative target):
     # http://spectrum.ieee.org/blog/nanoclast
     source = NewspaperLibSource(
         AWSManager(),
         {
             'url': 'http://futurism.com',
             'output_item_type': 'ScrapedArticle'
         },
     )
     for article in source.yield_items():
         print(article.payload['url'])
示例#5
0
 def test_invalid_config(self):
     """Constructing a ``StaticFileSource`` from an empty config must raise.

     ``assertRaises`` is used rather than a try/except wrapped around a
     failing assertion: that assertion's ``AssertionError`` is itself an
     ``Exception``, so a broad ``except Exception`` swallows it and the
     test can never fail.
     """
     config = {}
     manager = AWSManager()
     with self.assertRaises(Exception):
         StaticFileSource(manager, config)
示例#6
0
    def test_unique_dynamodb_filter(self):
        """``format_key`` should render the item as ``"<category>-<url>"``."""
        item = {"category": "SomeCategory", "url": "http://google.com"}
        config = {
            "dynamodb_table_name": "test_table",
            "primary_key": "my_hash_key",
            "primary_key_format_string": "{category}-{url}"
        }

        ufilter = UniqueDynamoDBFilter(AWSManager(), config)
        actual_key = ufilter.format_key(item)

        expected_key = "%s-%s" % (item['category'], item['url'])
        self.assertEqual(actual_key, expected_key)
示例#7
0
    def test_rss_feed_source(self):
        """Pull the configured RSS feed and sanity-check every yielded item.

        The source must report new data and yield more than three items, and
        each item's payload fields must exceed a minimum length.
        """
        source = RSSFeedSource(AWSManager(), {"rss_feed_url": "https://qz.com/feed/"})
        self.assertTrue(source.has_new_data())

        entries = list(source.yield_items())
        self.assertTrue(len(entries) > 3)

        # (payload field, length its value must exceed), checked in this order.
        length_floors = (
            ('url', 10),
            ('content', 10),
            ('source_url', 4),
            ('title', 10),
        )
        for entry in entries:
            for field, floor in length_floors:
                self.assertTrue(len(entry.payload[field]) > floor)
示例#8
0
 def test_dynamodb_storage(self):
     """Check ``DynamoDBStorage.dynamo_item`` against the fixture item.

     The assertions expect the ``my_hash_key`` attribute to hold
     ``"<category>-<url>"`` under ``'S'``, string values to appear unchanged
     under ``'S'``, and every other value to appear as ``str(value)`` under
     ``'N'``.
     """
     manager = AWSManager()
     dynamostorage = DynamoDBStorage(manager, self.config)
     dynamo_item = dynamostorage.dynamo_item(self.item)
     print(dynamo_item)
     # Every lookup goes through self.item; a bare ``item`` is not defined in
     # this method.
     # NOTE(review): assumes self.item supports key lookup and iteration over
     # its keys, as the existing self.item['url'] access implies — confirm
     # against the fixture set up elsewhere in the class.
     self.assertEqual(dynamo_item['my_hash_key']['S'],
                      "%s-%s" % (self.item['category'], self.item['url']))
     for k in self.item:
         v = self.item[k]
         if isinstance(v, str):
             self.assertEqual(dynamo_item[k]['S'], v)
         else:
             self.assertEqual(dynamo_item[k]['N'], str(v))
示例#9
0
 def test_external_resources(self):
     """A storage built from ``self.config`` should list one external resource."""
     storage = DynamoDBStorage(AWSManager(), self.config)
     resource_count = len(storage.external_resources())
     self.assertEqual(resource_count, 1)