示例#1
0
def process_raw():
    """Process every raw document carried by the current request.

    Parameters are read from the form body for POST requests and from the
    query string for any other method:

    * ``doc`` -- one or more raw documents joined by the ``ASDFJKL`` delimiter.
    * ``doc_id`` -- the matching document ids, joined by the same delimiter.
    * ``source`` / ``filetype`` -- looked up with ``.get`` and shared by every
      document in the request.

    Each document is passed to ``process_docs.process_raw`` and a
    ``source, doc_id, <returned value>`` line is appended to
    ``worker_manager/recent_files.txt``.

    Returns an empty ``Response``.
    """
    # Both transports carry the same fields, so choose the mapping once
    # instead of duplicating the whole body per HTTP method.
    params = request.form if request.method == 'POST' else request.args

    doc_list = params['doc'].split("ASDFJKL")  # TODO Fix this ad-hoc delimiter
    doc_id_list = params['doc_id'].split("ASDFJKL")

    # These do not vary per document, so look them up a single time.
    # NOTE(review): ``source`` is concatenated into the log line below, so a
    # request without it fails at the write -- confirm whether it should be
    # treated as required.
    source = params.get('source')
    filetype = params.get('filetype')

    for index, doc in enumerate(doc_list):
        # Index rather than zip: a request with fewer ids than documents
        # keeps raising IndexError instead of silently dropping documents.
        doc_id = doc_id_list[index]

        timestamp = process_docs.process_raw(doc, source, doc_id, filetype)
        with open('worker_manager/recent_files.txt', 'a') as f:
            f.write(source + ", " + doc_id + ", " + str(timestamp) + "\n")

    return Response()
示例#2
0
文件: main.py 项目: chrisseto/scrapi
def process_raw():
    """Process every raw document carried by the current request.

    Parameters are read from the form body for POST requests and from the
    query string for any other method:

    * ``doc`` -- one or more raw documents joined by the ``ASDFJKL`` delimiter.
    * ``doc_id`` -- the matching document ids, joined by the same delimiter.
    * ``source`` / ``filetype`` -- looked up with ``.get`` and shared by every
      document in the request.

    Each document is passed to ``process_docs.process_raw`` and a
    ``source, doc_id, <returned value>`` line is appended to
    ``worker_manager/recent_files.txt``.

    Returns an empty ``Response``.
    """
    # Both transports carry the same fields, so choose the mapping once
    # instead of duplicating the whole body per HTTP method.
    params = request.form if request.method == 'POST' else request.args

    doc_list = params['doc'].split("ASDFJKL")  # TODO Fix this ad-hoc delimiter
    doc_id_list = params['doc_id'].split("ASDFJKL")

    # These do not vary per document, so look them up a single time.
    # NOTE(review): ``source`` is concatenated into the log line below, so a
    # request without it fails at the write -- confirm whether it should be
    # treated as required.
    source = params.get('source')
    filetype = params.get('filetype')

    for index, doc in enumerate(doc_list):
        # Index rather than zip: a request with fewer ids than documents
        # keeps raising IndexError instead of silently dropping documents.
        doc_id = doc_id_list[index]

        timestamp = process_docs.process_raw(doc, source, doc_id, filetype)
        with open('worker_manager/recent_files.txt', 'a') as f:
            f.write(source + ", " + doc_id + ", " + str(timestamp) + "\n")

    return Response()
示例#3
0
    def test_process_raw(self):
        # Processing a raw JSON document must return a truthy value and leave
        # a raw.json file somewhere under archive/TEST/<doc_id>.
        payload = json.dumps({'Hello':  'world'})
        raw_file = RawDocument({
            'doc': payload,
            'source': "TEST",
            'doc_id': 37,
            'filetype': "json"
        })

        assert process_docs.process_raw(raw_file, 'test-version')

        archive_root = 'archive/TEST/{0}'.format(raw_file.get('doc_id'))
        # Walk the whole tree and keep every directory that holds raw.json;
        # a single hit is enough for the test to pass.
        hits = [
            path
            for path, _subdirs, _files in os.walk(archive_root)
            if os.path.isfile(path + '/raw.json')
        ]
        assert hits
示例#4
0
    def test_process_legal(self):
        # Round trip: hand a raw document to process_raw, check that the
        # directory holding raw.json is named after the returned value, then
        # process a normalized document and expect normalized.json to appear
        # under the same archive root.
        raw_doc = RawDocument({
            'doc': json.dumps({'Hello': 'world'}),
            'source': 'TEST',
            'doc_id': 37,
            'filetype': 'json'
        })
        expected_stamp = str(process_docs.process_raw(raw_doc, 'test-version'))

        archive_root = 'archive/TEST/{0}'.format(raw_doc.get('doc_id'))
        raw_stamps = [
            path.split('/')[-1]
            for path, _subdirs, _files in os.walk(archive_root)
            if os.path.isfile(path + '/raw.json')
        ]
        # The last directory containing raw.json wins; None when none matched.
        timestamp = raw_stamps[-1] if raw_stamps else None
        assert timestamp == expected_stamp

        contributors = [
            {
                'full_name': 'Me, Myself',
                'email': '*****@*****.**'
            },
            {
                'full_name': 'And I',
                'email': '*****@*****.**'
            }
        ]
        doc = NormalizedDocument({
            'title': "TEST PROJECT",
            'contributors': contributors,
            'properties': {
            },
            'meta': {},
            'id': {
                'service_id': raw_doc.get('doc_id'),
                'doi': 'Not available',
                'url': 'fake.stuff.org/{}'.format(raw_doc.get('doc_id')),
            },
            'source': raw_doc.get('source'),
            'timestamp': str(timestamp),
            'tags': ['1', '2', '3'],
            'date_created': str(timestamp),
            'description': 'science stuff',
        })

        assert process_docs.process(doc, timestamp)

        normalized_hits = [
            path
            for path, _subdirs, _files in os.walk(archive_root)
            if os.path.isfile(path + '/normalized.json')
        ]
        assert normalized_hits
示例#5
0
def run_consumer(manifest_file):
    """
        Consume and normalize documents for the service named in a manifest.

        Load the manifest at ``manifest_file`` with ``_load_config``, pass its
        ``directory`` entry to ``_consume``, hand every consumed result to
        ``process_docs.process_raw``, and feed the value it returns into
        ``_normalize`` together with the registry and the manifest.
        Return the list of ``_normalize`` results, in the order consumed.
    """
    manifest = _load_config(manifest_file)
    service = manifest['directory']
    # NOTE: the first message still says 'run_scraper'; the text is kept as-is.
    logger.info('run_scraper executing for service {}'.format(service))
    logger.info('worker_manager.consumers.{0}'.format(service))

    results, registry, consumer_version = _consume(service)

    # For each result, process_raw runs before _normalize.
    return [
        _normalize(
            result,
            process_docs.process_raw(result, consumer_version),
            registry,
            manifest
        )
        for result in results
    ]
示例#6
0
def run_consumer(manifest_file):
    """
        Consume and normalize documents for the service named in a manifest.

        Load the manifest at ``manifest_file`` with ``_load_config``, pass its
        ``directory`` entry to ``_consume``, hand every consumed result to
        ``process_docs.process_raw``, and feed the value it returns into
        ``_normalize`` together with the registry and the manifest.
        Return the list of ``_normalize`` results, in the order consumed.
    """
    manifest = _load_config(manifest_file)
    service = manifest['directory']
    # NOTE: the first message still says 'run_scraper'; the text is kept as-is.
    logger.info('run_scraper executing for service {}'.format(service))
    logger.info('worker_manager.consumers.{0}'.format(service))

    results, registry, consumer_version = _consume(service)

    # For each result, process_raw runs before _normalize.
    return [
        _normalize(
            result,
            process_docs.process_raw(result, consumer_version),
            registry,
            manifest
        )
        for result in results
    ]