Exemplo n.º 1
0
def harvest(harvester_name, job_created, days_back=1):
    """Run the harvester registered under ``harvester_name``.

    The harvester is looked up in ``registry`` and its ``harvest`` method is
    called with ``days_back``.

    Returns a ``(result, timestamps)`` tuple, where ``timestamps`` maps
    ``'harvestFinished'``, ``'harvestTaskCreated'`` (the ``job_created``
    argument) and ``'harvestStarted'`` to their respective values.
    """
    started_at = timestamp()
    selected = registry[harvester_name]

    message = 'Harvester "{}" has begun harvesting'.format(harvester_name)
    logger.info(message)

    # Per the original note: a list of all of the RawDocuments harvested.
    documents = selected.harvest(days_back=days_back)
    finished_at = timestamp()

    timing = {
        'harvestFinished': finished_at,
        'harvestTaskCreated': job_created,
        'harvestStarted': started_at,
    }
    return documents, timing
Exemplo n.º 2
0
def spawn_tasks(raw, timestamps, harvester_name):
    """Stamp ``raw`` and dispatch its normalization and raw-processing tasks.

    ``timestamps`` is stored on ``raw`` under ``'timestamps'`` and a
    ``'normalizeTaskCreated'`` entry is added through ``raw``. A
    ``normalize`` -> ``process_normalized`` chain is then started
    asynchronously, followed by ``process_raw``.
    """
    raw['timestamps'] = timestamps
    # NOTE(review): if ``raw`` keeps ``timestamps`` by reference, the caller's
    # dict gains this key too -- confirm that is intended.
    raw['timestamps']['normalizeTaskCreated'] = timestamp()

    normalize_step = normalize.si(raw, harvester_name)
    process_step = process_normalized.s(raw)
    pipeline = normalize_step | process_step

    pipeline.apply_async()
    process_raw.delay(raw)
Exemplo n.º 3
0
def spawn_tasks(raw, timestamps, harvester_name):
    """Attach timing data to ``raw`` and queue the follow-up tasks.

    Stores ``timestamps`` under ``raw['timestamps']``, records
    ``'normalizeTaskCreated'`` there, starts the ``normalize`` ->
    ``process_normalized`` chain asynchronously and finally queues
    ``process_raw`` for the same document.
    """
    raw['timestamps'] = timestamps
    # NOTE(review): whether this also mutates the ``timestamps`` argument
    # depends on how ``raw`` stores values -- verify against the caller.
    raw['timestamps']['normalizeTaskCreated'] = timestamp()

    first_step = normalize.si(raw, harvester_name)
    second_step = process_normalized.s(raw)
    (first_step | second_step).apply_async()

    process_raw.delay(raw)
Exemplo n.º 4
0
def spawn_tasks(raw, timestamps, harvester_name):
    """Record timestamps on ``raw`` and launch its processing tasks.

    After ``raw["timestamps"]`` is set to ``timestamps`` and extended with
    ``"normalizeTaskCreated"``, the ``normalize`` -> ``process_normalized``
    chain is applied asynchronously and ``process_raw`` is queued.
    """
    raw["timestamps"] = timestamps
    # NOTE(review): the write below goes through ``raw``; it may also alter
    # the ``timestamps`` argument if ``raw`` holds it by reference -- confirm.
    raw["timestamps"]["normalizeTaskCreated"] = timestamp()

    normalize_signature = normalize.si(raw, harvester_name)
    process_signature = process_normalized.s(raw)
    workflow = normalize_signature | process_signature

    workflow.apply_async()
    process_raw.delay(raw)
Exemplo n.º 5
0
def harvest(harvester_name, job_created, start_date=None, end_date=None):
    """Run the harvester registered under ``harvester_name`` for a date range.

    A falsy ``start_date`` is replaced by today minus
    ``settings.DAYS_BACK`` days; a falsy ``end_date`` is replaced by today.

    Returns a ``(result, timestamps)`` tuple, where ``timestamps`` holds the
    ``"harvestFinished"``, ``"harvestTaskCreated"`` (``job_created``) and
    ``"harvestStarted"`` entries.
    """
    started_at = timestamp()
    selected = registry[harvester_name]

    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    message = 'Harvester "{}" has begun harvesting'.format(harvester_name)
    logger.info(message)

    # Per the original note: a list of all of the RawDocuments harvested.
    documents = selected.harvest(start_date=start_date, end_date=end_date)
    finished_at = timestamp()

    timing = {
        "harvestFinished": finished_at,
        "harvestTaskCreated": job_created,
        "harvestStarted": started_at,
    }
    return documents, timing
Exemplo n.º 6
0
def run_harvester(harvester_name, days_back=1):
    """Start a harvest for ``harvester_name`` followed by normalization.

    Builds a ``harvest`` signature (with the current timestamp as the
    job-created value and ``days_back``) and a ``begin_normalization``
    signature, chains them, and applies the chain asynchronously.
    """
    message = 'Running harvester "{}"'.format(harvester_name)
    logger.info(message)

    normalize_step = begin_normalization.s(harvester_name)
    harvest_step = harvest.si(harvester_name, timestamp(), days_back=days_back)

    # Harvest first, then hand over to normalization (a celery chain).
    pipeline = harvest_step | normalize_step
    pipeline.apply_async()
Exemplo n.º 7
0
def harvest(harvester_name, job_created, start_date=None, end_date=None):
    """Harvest documents between two dates with the named harvester.

    ``start_date`` falls back to today minus ``settings.DAYS_BACK`` days and
    ``end_date`` falls back to today whenever the given value is falsy.

    Returns a ``(result, timestamps)`` tuple; ``timestamps`` contains
    ``'harvestFinished'``, ``'harvestTaskCreated'`` (``job_created``) and
    ``'harvestStarted'``.
    """
    began = timestamp()
    chosen_harvester = registry[harvester_name]

    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    logger.info('Harvester "{}" has begun harvesting'.format(harvester_name))

    # Per the original note: a list of all of the RawDocuments harvested.
    harvested = chosen_harvester.harvest(start_date=start_date, end_date=end_date)
    ended = timestamp()

    stamps = {
        'harvestFinished': ended,
        'harvestTaskCreated': job_created,
        'harvestStarted': began,
    }
    return harvested, stamps
Exemplo n.º 8
0
def run_harvester(harvester_name, start_date=None, end_date=None):
    """Start a date-ranged harvest for ``harvester_name``, then normalization.

    A falsy ``start_date`` becomes today minus ``settings.DAYS_BACK`` days
    and a falsy ``end_date`` becomes today. The ``harvest`` signature is
    chained with ``begin_normalization`` and applied asynchronously.
    """
    message = 'Running harvester "{}"'.format(harvester_name)
    logger.info(message)

    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    normalize_step = begin_normalization.s(harvester_name)
    harvest_step = harvest.si(
        harvester_name,
        timestamp(),
        start_date=start_date,
        end_date=end_date,
    )

    # Harvest first, then hand over to normalization (a celery chain).
    pipeline = harvest_step | normalize_step
    pipeline.apply_async()
Exemplo n.º 9
0
def run_harvester(harvester_name, start_date=None, end_date=None):
    """Queue a harvest over a date range and the normalization that follows.

    Missing (falsy) bounds default to ``date.today() -
    timedelta(settings.DAYS_BACK)`` for ``start_date`` and ``date.today()``
    for ``end_date``. The resulting ``harvest`` signature is chained into
    ``begin_normalization`` and started with ``apply_async``.
    """
    logger.info('Running harvester "{}"'.format(harvester_name))

    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    after_harvest = begin_normalization.s(harvester_name)
    harvest_kwargs = {'start_date': start_date, 'end_date': end_date}
    do_harvest = harvest.si(harvester_name, timestamp(), **harvest_kwargs)

    # Form the celery chain and kick it off.
    workflow = do_harvest | after_harvest
    workflow.apply_async()
Exemplo n.º 10
0
def normalize(raw_doc, harvester_name):
    """Normalize ``raw_doc`` with the harvester registered as ``harvester_name``.

    Returns the single normalized document, with its ``'timestamps'`` entry
    set to the value of ``util.stamp_from_raw``.

    Raises ``events.Skip`` when the harvester returns a falsy result.
    """
    started_at = timestamp()
    selected = registry[harvester_name]

    document = selected.normalize(raw_doc)
    if not document:
        reason = 'Did not normalize document with id {}'.format(raw_doc['docID'])
        raise events.Skip(reason)

    stamps = util.stamp_from_raw(raw_doc, normalizeStarted=started_at)
    document['timestamps'] = stamps

    return document
Exemplo n.º 11
0
def normalize(raw_doc, harvester_name):
    """Produce the normalized form of ``raw_doc``.

    The harvester is fetched from ``registry`` by ``harvester_name`` and its
    ``normalize`` method is applied to ``raw_doc``. A falsy result raises
    ``events.Skip``; otherwise the result gets a ``'timestamps'`` entry from
    ``util.stamp_from_raw`` and is returned.
    """
    began = timestamp()
    result = registry[harvester_name].normalize(raw_doc)

    if not result:
        doc_id = raw_doc['docID']
        raise events.Skip('Did not normalize document with id {}'.format(doc_id))

    result['timestamps'] = util.stamp_from_raw(
        raw_doc,
        normalizeStarted=began,
    )

    # A single normalized document.
    return result
Exemplo n.º 12
0
    def test_timestamp(self):
        """``util.timestamp()`` returns a value that ``parse`` turns into a datetime."""
        stamp = util.timestamp()
        as_datetime = parse(stamp)

        assert isinstance(as_datetime, datetime.datetime)