def harvest(harvester_name, job_created, days_back=1):
    """Run the named harvester and return its documents plus timing metadata.

    Args:
        harvester_name: Key used to look the harvester up in ``registry``.
        job_created: Value recorded under ``harvestTaskCreated``.
        days_back: Forwarded to the harvester's ``harvest`` method.

    Returns:
        A ``(documents, timings)`` tuple. ``documents`` is whatever the
        harvester returned (a list of the harvested RawDocuments) and
        ``timings`` holds the ``harvestFinished``, ``harvestTaskCreated``
        and ``harvestStarted`` entries.
    """
    began_at = timestamp()
    source = registry[harvester_name]

    message = 'Harvester "{}" has begun harvesting'.format(harvester_name)
    logger.info(message)

    # Every RawDocument the harvester collected comes back in this list.
    documents = source.harvest(days_back=days_back)

    # The finish time is taken only after the harvest call has returned.
    timings = {}
    timings['harvestFinished'] = timestamp()
    timings['harvestTaskCreated'] = job_created
    timings['harvestStarted'] = began_at
    return documents, timings
def spawn_tasks(raw, timestamps, harvester_name):
    """Stamp a raw document and queue its normalization and raw processing.

    ``raw`` is modified in place: ``timestamps`` is stored under its
    ``'timestamps'`` key and a ``'normalizeTaskCreated'`` entry is added
    before any task is queued.

    Args:
        raw: The raw document to process; must support item assignment.
        timestamps: Timing metadata to attach to ``raw``.
        harvester_name: Name passed through to the ``normalize`` task.
    """
    raw['timestamps'] = timestamps
    raw['timestamps']['normalizeTaskCreated'] = timestamp()

    # normalize runs first; its output is piped into process_normalized.
    normalize_step = normalize.si(raw, harvester_name)
    process_step = process_normalized.s(raw)
    (normalize_step | process_step).apply_async()

    # The raw document is processed by its own task, outside the chain.
    process_raw.delay(raw)
def spawn_tasks(raw, timestamps, harvester_name):
    """Attach timing metadata to ``raw`` and dispatch its processing tasks.

    The ``timestamps`` mapping is stored on ``raw`` and gains a
    ``"normalizeTaskCreated"`` entry. A normalize -> process_normalized
    chain is then started, followed by a separate ``process_raw`` task.

    Args:
        raw: The raw document; mutated in place.
        timestamps: Timing metadata stored under ``raw["timestamps"]``.
        harvester_name: Name handed to the ``normalize`` task.
    """
    raw["timestamps"] = timestamps
    created_at = timestamp()
    raw["timestamps"]["normalizeTaskCreated"] = created_at

    # Build the two-step pipeline and start it straight away.
    pipeline = normalize.si(raw, harvester_name) | process_normalized.s(raw)
    pipeline.apply_async()

    process_raw.delay(raw)
def harvest(harvester_name, job_created, start_date=None, end_date=None):
    """Harvest documents for a date range and report timing metadata.

    Args:
        harvester_name: Key used to look the harvester up in ``registry``.
        job_created: Value recorded under ``harvestTaskCreated``.
        start_date: Start of the range; when falsy it defaults to today
            minus ``settings.DAYS_BACK`` days.
        end_date: End of the range; when falsy it defaults to today.

    Returns:
        A ``(documents, timings)`` tuple, where ``documents`` is the list of
        RawDocuments returned by the harvester and ``timings`` carries the
        ``harvestFinished``, ``harvestTaskCreated`` and ``harvestStarted``
        entries.
    """
    began_at = timestamp()
    source = registry[harvester_name]

    # Fill in whichever end of the range the caller left out.
    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    logger.info('Harvester "{}" has begun harvesting'.format(harvester_name))
    documents = source.harvest(start_date=start_date, end_date=end_date)

    timings = {
        "harvestFinished": timestamp(),
        "harvestTaskCreated": job_created,
        "harvestStarted": began_at,
    }
    return documents, timings
def run_harvester(harvester_name, days_back=1):
    """Queue a harvest of ``harvester_name`` followed by normalization.

    Args:
        harvester_name: Name of the harvester to run.
        days_back: Forwarded to the ``harvest`` task.
    """
    announcement = 'Running harvester "{}"'.format(harvester_name)
    logger.info(announcement)

    normalize_step = begin_normalization.s(harvester_name)
    # The creation time is captured here and passed as harvest's job_created.
    harvest_step = harvest.si(
        harvester_name,
        timestamp(),
        days_back=days_back,
    )

    # Chain the two steps (harvest, then normalization) and start them.
    pipeline = harvest_step | normalize_step
    pipeline.apply_async()
def harvest(harvester_name, job_created, start_date=None, end_date=None):
    """Run the named harvester over a date range.

    Args:
        harvester_name: Key used to look the harvester up in ``registry``.
        job_created: Value recorded under ``harvestTaskCreated``.
        start_date: Start of the range; a falsy value means today minus
            ``settings.DAYS_BACK`` days.
        end_date: End of the range; a falsy value means today.

    Returns:
        A two-item tuple: the list of RawDocuments the harvester produced,
        and a dict with ``harvestFinished``, ``harvestTaskCreated`` and
        ``harvestStarted`` entries.
    """
    started_at = timestamp()
    chosen = registry[harvester_name]

    # Defaults are only computed when the caller did not supply a value.
    start_date = (
        start_date if start_date
        else date.today() - timedelta(settings.DAYS_BACK)
    )
    end_date = end_date if end_date else date.today()

    logger.info('Harvester "{}" has begun harvesting'.format(harvester_name))

    harvested = chosen.harvest(start_date=start_date, end_date=end_date)
    finished_at = timestamp()

    return harvested, {
        'harvestFinished': finished_at,
        'harvestTaskCreated': job_created,
        'harvestStarted': started_at,
    }
def run_harvester(harvester_name, start_date=None, end_date=None):
    """Queue a date-ranged harvest followed by normalization.

    Args:
        harvester_name: Name of the harvester to run.
        start_date: Start of the range; when falsy it defaults to today
            minus ``settings.DAYS_BACK`` days.
        end_date: End of the range; when falsy it defaults to today.
    """
    logger.info('Running harvester "{}"'.format(harvester_name))

    # Resolve the range here so the queued task receives concrete dates.
    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    normalize_step = begin_normalization.s(harvester_name)
    harvest_step = harvest.si(
        harvester_name,
        timestamp(),
        start_date=start_date,
        end_date=end_date,
    )

    # Chain the two steps (harvest, then normalization) and start them.
    pipeline = harvest_step | normalize_step
    pipeline.apply_async()
def normalize(raw_doc, harvester_name):
    """Normalize one raw document with the named harvester.

    Args:
        raw_doc: The raw document; its ``'docID'`` entry is used in the
            skip message.
        harvester_name: Key used to look the harvester up in ``registry``.

    Returns:
        The single normalized document, with a ``'timestamps'`` entry built
        by ``util.stamp_from_raw``.

    Raises:
        events.Skip: If the harvester returns a falsy result.
    """
    began_at = timestamp()
    document = registry[harvester_name].normalize(raw_doc)

    if not document:
        reason = 'Did not normalize document with id {}'.format(raw_doc['docID'])
        raise events.Skip(reason)

    stamps = util.stamp_from_raw(raw_doc, normalizeStarted=began_at)
    document['timestamps'] = stamps
    return document
def test_timestamp(self):
    """``util.timestamp()`` output should parse into a ``datetime``."""
    stamp = util.timestamp()
    assert isinstance(parse(stamp), datetime.datetime)