def test_incremental_crawl_failure(self, bucket_mock, conn_mock, crawl_mock):
    def failure_feed(url):
        if '/feed' in url:
            return {'notdata': [{'ooga': 'booga'}]}

    self.facebook_patch = patch(
        'targetshare.integration.facebook.client.urllib2.urlopen',
        crawl_mock(1, 250, failure_feed)
    )
    self.facebook_patch.start()
    the_past = epoch.from_date(timezone.now() - timedelta(days=365))
    # Test runs in under a second typically, so we need to be slightly
    # behind present time, so that we can see fbm.incremental_epoch
    # get updated
    present = epoch.from_date(timezone.now() - timedelta(seconds=30))
    fbm = models.FBSyncMap.items.create(
        fbid_primary=self.fbid, fbid_secondary=self.fbid,
        token=self.token.token, back_filled=False,
        back_fill_epoch=the_past, incremental_epoch=present,
        status=models.FBSyncMap.COMPLETE, bucket='test_bucket_0'
    )
    existing_key = Mock()
    existing_key.get_contents_as_string.return_value = (
        '{"updated": 1, "data": [{"test": "testing"}]}'
    )
    bucket_mock.return_value = existing_key
    conn_mock.return_value = s3_feed.BucketManager()
    tasks.incremental_crawl(fbm.fbid_primary, fbm.fbid_secondary)
    new_fbm = models.FBSyncMap.items.get_item(
        fbid_primary=self.fbid, fbid_secondary=self.fbid)
    self.assertEqual(fbm.status, fbm.COMPLETE)
    self.assertEqual(int(new_fbm.incremental_epoch), present)
    self.assertFalse(existing_key.set_contents_from_string.called)
def test_incremental_crawl(self, bucket_mock, conn_mock):
    the_past = epoch.from_date(timezone.now() - timedelta(days=365))
    # Test runs in under a second typically, so we need to be slightly
    # behind present time, so that we can see fbm.incremental_epoch
    # get updated
    present = epoch.from_date(timezone.now() - timedelta(seconds=30))
    fbm = models.FBSyncMap.items.create(
        fbid_primary=self.fbid, fbid_secondary=self.fbid,
        token=self.token.token, back_filled=False,
        back_fill_epoch=the_past, incremental_epoch=present,
        status=models.FBSyncMap.COMPLETE, bucket='test_bucket_0'
    )
    existing_key = Mock()
    existing_key.data = {"updated": 1, "data": [{"test": "testing"}]}
    bucket_mock.return_value = existing_key
    conn_mock.return_value = s3_feed.BucketManager()
    tasks.incremental_crawl(fbm.fbid_primary, fbm.fbid_secondary)
    new_fbm = models.FBSyncMap.items.get_item(
        fbid_primary=self.fbid, fbid_secondary=self.fbid)
    self.assertEqual(fbm.status, fbm.COMPLETE)
    self.assertGreater(int(new_fbm.incremental_epoch), present)
    self.assertTrue(existing_key.extend_s3_data.called)
    self.assertSequenceEqual(
        existing_key.extend_s3_data.call_args_list[0][0],
        (False,)
    )
def save_to_s3(self):
    """ Commits the current populated FeedKey to S3 """
    self.data["updated"] = epoch.from_date(timezone.now())
    with TemporaryFile() as tmp_file:
        json.dump(self.data, tmp_file)
        tmp_file.seek(0)
        self.set_contents_from_file(tmp_file)
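# A minimal sketch (illustrative only, assuming a FeedKey already populated
# by a crawl) of what save_to_s3() ends up writing to the S3 object:
#
#   key.data = {"data": [{"id": "1_2", "message": "hello"}]}
#   key.save_to_s3()
#   # S3 object body: {"data": [...], "updated": <epoch at save time>}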
def test_crawl_comments_and_likes(self, bucket_mock, conn_mock, fb_mock):
    the_past = epoch.from_date(timezone.now() - timedelta(days=365))
    fbm = models.FBSyncMap.items.create(
        fbid_primary=self.fbid, fbid_secondary=self.fbid,
        token=self.token.token, back_filled=False,
        back_fill_epoch=the_past,
        incremental_epoch=epoch.from_date(timezone.now()),
        status=models.FBSyncMap.COMMENT_CRAWL, bucket='test_bucket_0'
    )
    fb_mock.side_effect = [
        {"data": [
            {
                "id": "10151910724132946_11479371",
                "from": {
                    "name": "Alex Tevlin",
                    "id": "794333711"
                },
                "message": "Should've stayed at Fulham to begin with!",
                "can_remove": False,
                "created_time": "2013-12-20T16:25:26+0000",
                "like_count": 0,
                "user_likes": False
            },
        ]},
        {"data": [
            {
                "id": "100002382106641",
                "name": "Joseph Orozco"
            },
        ]},
    ]
    user_feed = json.loads(
        open(os.path.join(DATA_PATH, 'user_feed.json')).read()
    )
    existing_key = Mock()
    existing_key.data = json.load(
        open(os.path.join(DATA_PATH, 'user_feed.json'))
    )
    bucket_mock.return_value = existing_key
    conn_mock.return_value = s3_feed.BucketManager()
    self.assertEqual(len(user_feed['data'][0]['comments']['data']), 1)
    self.assertEqual(len(user_feed['data'][0]['likes']['data']), 3)
    tasks.crawl_comments_and_likes(fbm.fbid_primary, fbm.fbid_secondary)
    self.assertEqual(len(existing_key.data['data'][0]['comments']['data']), 2)
    self.assertEqual(len(existing_key.data['data'][0]['likes']['data']), 4)
    fbm = models.FBSyncMap.items.get_item(
        fbid_primary=self.fbid, fbid_secondary=self.fbid)
    self.assertEqual(fbm.status, fbm.COMPLETE)
def incremental_crawl(self, primary, secondary):
    sync_map = models.FBSyncMap.items.get_item(
        fbid_primary=primary, fbid_secondary=secondary)
    logger.info('Starting incremental crawl of %s', sync_map.s3_key_name)
    sync_map.save_status(models.FBSyncMap.INCREMENTAL)
    try:
        bucket = S3_CONN.get_or_create_bucket(sync_map.bucket)
        s3_key, created = bucket.get_or_create_key(sync_map.s3_key_name)
        s3_key.retrieve_fb_feed(
            sync_map.fbid_secondary, sync_map.token,
            sync_map.incremental_epoch, epoch.from_date(timezone.now())
        )
    except facebook.client.OAuthException:
        rvn_logger.info('Failed incremental crawl due to expired token for %s',
                        sync_map.s3_key_name)
        return
    except (ValueError, IOError):
        try:
            self.retry()
        except MaxRetriesExceededError:
            # We'll get 'em next time, boss.
            rvn_logger.info('Failed incremental crawl of %s',
                            sync_map.s3_key_name)
    else:
        try:
            s3_key.crawl_pagination()
        except facebook.client.OAuthException:
            rvn_logger.info('Failed incremental crawl due to expired token for %s',
                            sync_map.s3_key_name)
            return

    if 'data' in s3_key.data:
        # If we have data, let's save it. If not, let's kick this guy over
        # to crawl_comments_and_likes. We'll get that incremental data later
        try:
            s3_key.extend_s3_data(False)
        except HTTPException as exc:
            self.retry(exc=exc)

        sync_map.incremental_epoch = epoch.from_date(timezone.now())
        sync_map.save()

    crawl_comments_and_likes.apply_async(
        args=[sync_map.fbid_primary, sync_map.fbid_secondary],
        countdown=DELAY_INCREMENT
    )
    sync_map.save_status(models.FBSyncMap.COMPLETE)
    logger.info('Completed incremental crawl of %s', sync_map.s3_key_name)
def test_back_fill_crawl(self, bucket_mock, conn_mock, crawl_mock):
    the_past = epoch.from_date(timezone.now() - timedelta(days=365))
    fbm = models.FBSyncMap.items.create(
        fbid_primary=self.fbid, fbid_secondary=self.fbid,
        token=self.token.token, back_filled=False,
        back_fill_epoch=the_past,
        incremental_epoch=epoch.from_date(timezone.now()),
        status=models.FBSyncMap.BACK_FILL, bucket='test_bucket_0'
    )
    existing_key = Mock()
    existing_key.data = {"updated": 1, "data": [{"test": "testing"}]}
    bucket_mock.return_value = existing_key
    conn_mock.return_value = s3_feed.BucketManager()
    tasks.back_fill_crawl(fbm.fbid_primary, fbm.fbid_secondary)
    fbm = models.FBSyncMap.items.get_item(
        fbid_primary=self.fbid, fbid_secondary=self.fbid)
    self.assertEqual(fbm.status, fbm.COMMENT_CRAWL)
    assert fbm.back_fill_epoch
    assert fbm.back_filled
    assert fbm.incremental_epoch
    assert crawl_mock.apply_async.called
    self.assertTrue(existing_key.extend_s3_data.called)
def initial_crawl(self, primary, secondary):
    sync_map = models.FBSyncMap.items.get_item(
        fbid_primary=primary, fbid_secondary=secondary)
    logger.info('Starting initial crawl of %s', sync_map.s3_key_name)
    sync_map.save_status(models.FBSyncMap.INITIAL_CRAWL)
    past_epoch = epoch.from_date(timezone.now() - timedelta(days=365))
    now_epoch = epoch.from_date(timezone.now())
    try:
        bucket = S3_CONN.get_or_create_bucket(sync_map.bucket)
        s3_key, _ = bucket.get_or_create_key(sync_map.s3_key_name)
        s3_key.retrieve_fb_feed(
            sync_map.fbid_secondary, sync_map.token, past_epoch, now_epoch
        )
    except facebook.client.OAuthException:
        rvn_logger.info('Failed initial crawl due to expired token for %s',
                        sync_map.s3_key_name)
        return
    except (ValueError, IOError, HTTPException):
        try:
            self.retry()
        except MaxRetriesExceededError:
            sync_map.save_status(models.FBSyncMap.WAITING)
            return

    s3_key.data['updated'] = now_epoch
    try:
        s3_key.save_to_s3()
    except HTTPException as exc:
        self.retry(exc=exc)

    sync_map.back_fill_epoch = past_epoch
    sync_map.incremental_epoch = now_epoch
    sync_map.save_status(models.FBSyncMap.PAGE_LIKES)
    retrieve_page_likes.apply_async(
        args=[sync_map.fbid_primary, sync_map.fbid_secondary],
        countdown=DELAY_INCREMENT
    )
    logger.info('Completed initial crawl of %s', sync_map.s3_key_name)
def extend_s3_data(self, append=True):
    """ Extends the data we have in S3, typically in incremental or
    back_fill jobs. The append flag lets you dictate whether the new data
    ends up in front of or behind the existing data.
    """
    with TemporaryFile() as s3_file, TemporaryFile() as json_file:
        self.get_contents_to_file(s3_file)
        s3_file.seek(0)
        full_data = json.load(s3_file)
        existing_data = full_data.setdefault("data", [])
        if append:
            existing_data.extend(self.data["data"])
            self.data = full_data
        else:
            self.data["data"].extend(existing_data)
        self.data["updated"] = epoch.from_date(timezone.now())
        json.dump(self.data, json_file)
        json_file.seek(0)
        self.set_contents_from_file(json_file)
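# Illustrative sketch (not part of the task flow) of how the append flag
# orders the merged feed, assuming S3 already holds {"data": [old_1, old_2]}
# and self.data holds the newly crawled {"data": [new_1]}:
#
#   key.extend_s3_data(append=True)   # -> {"data": [old_1, old_2, new_1]}
#   key.extend_s3_data(append=False)  # -> {"data": [new_1, old_1, old_2]}
#
# incremental_crawl above passes append=False, so the freshest posts land at
# the front of the stored feed.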
import json
import urllib
from datetime import datetime, timedelta

import mock

from faraday.utils import epoch
from targetshare import models
from targetshare.tasks.integration import facebook

from .. import EdgeFlipTestCase


DEBUG_TOKEN_MOCK = json.dumps({
    'data': {
        'is_valid': True,
        'user_id': 100,
        'expires_at': epoch.from_date(datetime(2013, 5, 15, 12, 1, 1)),
    }
})

EXTEND_TOKEN_MOCK = urllib.urlencode([
    ('access_token', 'tok1'),
    ('expires', str(60 * 60 * 24 * 60)),  # 60 days in seconds
])


class TestStoreOpenAuthToken(EdgeFlipTestCase):

    fixtures = ('test_data',)
    frozen_time = '2013-01-01'
    requests_patch = mock.patch(
        'requests.get',
        **{'return_value.content': 'access_token=TOKZ'}
    )