Python process_raw示例，api.process_docs.process_raw Python示例

示例#1

0

显示文件

文件： main.py 项目： csheldonhess/scrapi

def process_raw():
    """Process the raw document(s) submitted with the current request.

    Parameters are read from ``request.form`` when the method is POST and
    from ``request.args`` for every other method:

    * ``doc`` -- one or more raw documents joined by the ``ASDFJKL``
      delimiter (required; looked up with ``[]``).
    * ``doc_id`` -- the matching document ids, joined by the same
      delimiter (required; looked up with ``[]``).
    * ``source`` / ``filetype`` -- applied to every document in the
      request (looked up with ``.get``).

    Each document is passed to ``process_docs.process_raw`` and a
    ``source, doc_id, timestamp`` line is appended to
    ``worker_manager/recent_files.txt``.

    Returns ``Response()``.  Raises ``IndexError`` when fewer ids than
    documents were supplied; documents handled before the missing id have
    already been processed and logged at that point.
    """
    # Both containers are used through the same ``[]`` / ``.get`` interface,
    # so the only per-method difference is which one is read.
    params = request.form if request.method == 'POST' else request.args

    doc_list = params['doc'].split("ASDFJKL")  # TODO Fix this (ad-hoc delimiter)
    doc_id_list = params['doc_id'].split("ASDFJKL")

    # These do not vary per document, so read them once up front.
    source = params.get('source')
    filetype = params.get('filetype')

    for position, doc in enumerate(doc_list):
        # Indexed (rather than zipped) on purpose: a missing id must fail
        # loudly instead of silently dropping the trailing documents.
        doc_id = doc_id_list[position]

        timestamp = process_docs.process_raw(doc, source, doc_id, filetype)
        with open('worker_manager/recent_files.txt', 'a') as f:
            f.write(source + ", " + doc_id + ", " + str(timestamp) + "\n")

    return Response()

示例#2

0

显示文件

文件： main.py 项目： chrisseto/scrapi

def process_raw():
    """Process the raw document(s) submitted with the current request.

    Parameters come from ``request.form`` for POST requests and from
    ``request.args`` otherwise:

    * ``doc`` -- raw documents joined by the ``ASDFJKL`` delimiter
      (required; looked up with ``[]``).
    * ``doc_id`` -- document ids joined by the same delimiter, one per
      document (required; looked up with ``[]``).
    * ``source`` / ``filetype`` -- shared by every document in the
      request (looked up with ``.get``).

    Every document is handed to ``process_docs.process_raw``; the returned
    timestamp is appended, together with the source and id, as one line to
    ``worker_manager/recent_files.txt``.

    Returns ``Response()``.  Raises ``IndexError`` if there are fewer ids
    than documents; earlier documents have already been processed and
    logged when that happens.
    """
    # Pick the parameter container once instead of duplicating the whole
    # body per HTTP method.
    if request.method == 'POST':
        params = request.form
    else:
        params = request.args

    doc_list = params['doc'].split("ASDFJKL")  # TODO Fix this (ad-hoc delimiter)
    doc_id_list = params['doc_id'].split("ASDFJKL")

    # Loop-invariant request values.
    source = params.get('source')
    filetype = params.get('filetype')

    for position, doc in enumerate(doc_list):
        # Index lookup keeps the failure on a missing id explicit; zip()
        # would quietly skip the unmatched documents.
        doc_id = doc_id_list[position]

        timestamp = process_docs.process_raw(doc, source, doc_id, filetype)
        with open('worker_manager/recent_files.txt', 'a') as f:
            f.write(source + ", " + doc_id + ", " + str(timestamp) + "\n")

    return Response()

示例#3

0

显示文件

    def test_process_raw(self):
        """process_raw must succeed and leave a raw.json under archive/TEST/<doc_id>."""
        payload = {
            'doc': json.dumps({'Hello':  'world'}),
            'source': "TEST",
            'doc_id': 37,
            'filetype': "json"
        }
        document = RawDocument(payload)

        assert process_docs.process_raw(document, 'test-version')

        # Look through every directory below the document's archive folder
        # for a raw.json file.
        archive_root = 'archive/TEST/{0}'.format(document.get('doc_id'))
        assert any(
            os.path.isfile(folder + '/raw.json')
            for folder, _subdirs, _files in os.walk(archive_root)
        )

示例#4

0

显示文件

    def test_process_legal(self):
        """Store a raw document, then a normalized one, and check both files.

        The name of the directory that holds ``raw.json`` must equal the
        stringified value returned by ``process_docs.process_raw``; that
        directory name is then used as the timestamp when the normalized
        document is processed.
        """
        raw_doc = RawDocument({
            'doc': json.dumps({'Hello': 'world'}),
            'source': 'TEST',
            'doc_id': 37,
            'filetype': 'json'
        })
        expected_stamp = str(process_docs.process_raw(raw_doc, 'test-version'))

        # Collect the last path component of every directory containing a
        # raw.json; when several match, the last one visited is used.
        stamps = [
            folder.split('/')[-1]
            for folder, _subdirs, _files
            in os.walk('archive/TEST/{0}'.format(raw_doc.get('doc_id')))
            if os.path.isfile(folder + '/raw.json')
        ]
        timestamp = stamps[-1] if stamps else None
        assert timestamp == expected_stamp

        contributors = [
            {'full_name': 'Me, Myself', 'email': '*****@*****.**'},
            {'full_name': 'And I', 'email': '*****@*****.**'},
        ]
        identifiers = {
            'service_id': raw_doc.get('doc_id'),
            'doi': 'Not available',
            'url': 'fake.stuff.org/{}'.format(raw_doc.get('doc_id')),
        }
        doc = NormalizedDocument({
            'title': "TEST PROJECT",
            'contributors': contributors,
            'properties': {},
            'meta': {},
            'id': identifiers,
            'source': raw_doc.get('source'),
            'timestamp': str(timestamp),
            'tags': ['1', '2', '3'],
            'date_created': str(timestamp),
            'description': 'science stuff',
        })

        assert process_docs.process(doc, timestamp)

        # A normalized.json must now exist somewhere below the same folder.
        assert any(
            os.path.isfile(folder + '/normalized.json')
            for folder, _subdirs, _files
            in os.walk('archive/TEST/{0}'.format(raw_doc.get('doc_id')))
        )

示例#5

0

显示文件

文件： celerytasks.py 项目： chrisseto/scrapi

def run_consumer(manifest_file):
    """
        Consume and normalize the documents of the service named in a manifest

        Load the manifest at ``manifest_file`` with ``_load_config`` and pass
        its ``directory`` entry to ``_consume``. Each consumed result goes
        through ``process_docs.process_raw``; the timestamp that call returns
        is handed to ``_normalize`` along with the result, the registry and
        the manifest.
        Return the list of values produced by ``_normalize``, in result order.

        NOTE(review): no search-index update happens in this function
        itself -- confirm whether ``_normalize`` or the caller does it.
    """
    config = _load_config(manifest_file)
    service = config['directory']
    logger.info('run_scraper executing for service {}'.format(service))
    logger.info('worker_manager.consumers.{0}'.format(service))

    raw_results, registry, consumer_version = _consume(service)

    # Arguments are evaluated left to right, so every result is run through
    # process_raw before it is normalized.
    return [
        _normalize(
            raw,
            process_docs.process_raw(raw, consumer_version),
            registry,
            config
        )
        for raw in raw_results
    ]

示例#6

0

显示文件

文件： celerytasks.py 项目： csheldonhess/scrapi

def run_consumer(manifest_file):
    """
        Run the consume / process_raw / normalize pipeline for one manifest

        The manifest is read with ``_load_config``; its ``directory`` value
        selects what ``_consume`` runs. For every result returned by
        ``_consume``, ``process_docs.process_raw`` is called first and its
        timestamp is then given to ``_normalize`` together with the result,
        the registry and the manifest.
        Return the ``_normalize`` outputs as a list, one per result.

        NOTE(review): nothing here writes to a search index directly --
        verify whether ``_normalize`` or the caller is responsible for that.
    """
    settings = _load_config(manifest_file)
    directory = settings['directory']
    logger.info('run_scraper executing for service {}'.format(directory))
    logger.info('worker_manager.consumers.{0}'.format(directory))

    consumed, registry, consumer_version = _consume(directory)

    # process_raw is evaluated as an argument, i.e. before the enclosing
    # _normalize call for the same item.
    return [
        _normalize(
            item,
            process_docs.process_raw(item, consumer_version),
            registry,
            settings
        )
        for item in consumed
    ]