Пример #1
0
def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it and push it to the search backend.

    Args:
        result: Raw document; only ``result.get("doc_id")`` is read here.
        timestamp: Object exposing ``isoformat()``; also forwarded to the
            normalizer and to ``process_docs.process``.
        registry: Mapping keyed by ``manifest['directory']`` whose values
            hold a ``'normalize'`` callable taking ``(result, timestamp)``.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        Whatever ``process_docs.process`` returned. When that is ``None``
        nothing is annotated and ``search.update`` is not called.
    """
    iso_timestamp = timestamp.isoformat()
    normalize = registry[manifest['directory']]['normalize']
    normalized = normalize(result, timestamp)
    logger.info('Document {0} normalized successfully'.format(result.get("doc_id")))

    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # NOTE: this assignment must not end with a comma -- a trailing comma
        # would store a 1-tuple instead of the path string.
        doc.attributes['location'] = "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
            source=manifest['directory'],
            doc_id=doc.get('id').get('service_id'),
            timestamp=doc.get('timestamp'),
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'], result.get("doc_id"))
    return doc
Пример #2
0
def _normalize(result, timestamp, registry, manifest):
    """Run the source's normalizer on ``result``, process it and index it.

    Args:
        result: Raw document; only ``result.get("doc_id")`` is read here.
        timestamp: Object exposing ``isoformat()``; also forwarded to the
            normalizer and to ``process_docs.process``.
        registry: Mapping keyed by ``manifest['directory']`` whose values
            hold a ``'normalize'`` callable taking ``(result, timestamp)``.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        The value returned by ``process_docs.process``; if it is ``None``
        the document is neither annotated nor passed to ``search.update``.
    """
    iso_timestamp = timestamp.isoformat()
    normalized = registry[manifest['directory']]['normalize'](result,
                                                              timestamp)
    logger.info('Document {0} normalized successfully'.format(
        result.get("doc_id")))
    doc = process_docs.process(normalized, timestamp)
    if doc is None:
        return doc

    doc.attributes['source'] = manifest['name']
    # Build the path as a plain string. Ending this statement with a comma
    # would silently wrap the value in a 1-tuple.
    location = "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
        source=manifest['directory'],
        doc_id=doc.get('id').get('service_id'),
        timestamp=doc.get('timestamp'))
    doc.attributes['location'] = location
    doc.attributes['iso_timestamp'] = str(iso_timestamp)
    logger.info('Document {0} processed successfully'.format(
        result.get("doc_id")))
    search.update('scrapi', doc.attributes, manifest['directory'],
                  result.get("doc_id"))
    return doc
Пример #3
0
    def setUp(self):
        """Clear 'test' via ``search.delete_all`` and index one fixture doc.

        The fixture is stored with ``search.update`` under source ``"test"``,
        type ``'article'`` and id ``38``.
        """
        search.delete_all('test')

        index_name = "test"
        article_id = 38
        details = {
            'description': 'science stuff',
            'email': 'email stuff'
        }
        fixture = dict([
            ('title', "TEST PROJECT"),
            ('contributors', ['Me, Myself', 'And I']),
            ('properties', details),
            ('meta', {}),
            ('id', article_id),
            ('source', index_name),
            ('iso_timestamp', datetime.datetime.now().isoformat()),
        ])

        search.update(index_name, fixture, 'article', article_id)
Пример #4
0
    def setUp(self):
        """Prepare the search fixture used by the tests in this class.

        Calls ``search.delete_all('test')`` and then stores a single
        'article' document (id ``38``, source ``"test"``) through
        ``search.update``.
        """
        search.delete_all('test')

        source, doc_id = "test", 38

        # Assemble the fixture field by field; the timestamp is taken at
        # setup time, after the delete call above.
        doc = {}
        doc['title'] = "TEST PROJECT"
        doc['contributors'] = ['Me, Myself', 'And I']
        doc['properties'] = {
            'description': 'science stuff',
            'email': 'email stuff'
        }
        doc['meta'] = {}
        doc['id'] = doc_id
        doc['source'] = source
        doc['iso_timestamp'] = datetime.datetime.now().isoformat()

        search.update(source, doc, 'article', doc_id)
Пример #5
0
def migrate():
    """Re-submit every archived ``normalized.json`` to search under 'scrapi'.

    First calls ``search.delete_all('scrapi')``, then walks ``archive/``.
    For each directory containing a ``normalized.json`` the file is loaded
    as JSON and passed to ``search.update`` together with the first two path
    components below ``archive`` (second and third components of the
    directory path).

    Files that are not valid JSON, directories too shallow to supply both
    path components, and updates raising ``ElasticHttpError`` are logged and
    skipped so that one bad entry does not abort the whole run.
    """
    try:
        search.delete_all('scrapi')
    except ElasticHttpNotFoundError:
        # Deliberately ignored -- presumably there is nothing to delete yet
        # (TODO confirm against the search backend).
        pass

    for dirname, _dirnames, _filenames in os.walk('archive/'):
        path = os.path.join(dirname, 'normalized.json')
        if not os.path.isfile(path):
            continue

        # Expected layout: archive/<component 1>/<component 2>/...
        parts = dirname.split('/')
        if len(parts) < 3:
            # Indexing parts[2] here would raise IndexError and stop the
            # migration; skip the misplaced file instead.
            logger.warning('Skipping %s: unexpected archive depth', path)
            continue

        # Only hold the file open for parsing, not during the index call.
        with open(path) as f:
            try:
                doc = json.load(f)
            except ValueError as e:
                logger.exception(e)
                continue

        try:
            search.update('scrapi', doc, parts[1], parts[2])
        except ElasticHttpError as e:
            logger.exception(e)
            continue