def process_raw():
    """Archive every raw document packed into the current request.

    The request supplies two delimiter-joined fields, ``doc`` and ``doc_id``,
    plus optional ``source`` and ``filetype`` fields. For a POST the fields
    are read from ``request.form``; for any other method they are read from
    ``request.args``. Each document is handed to ``process_docs.process_raw``
    together with its positional id, and one line
    ``"<source>, <doc_id>, <timestamp>"`` is appended to
    ``worker_manager/recent_files.txt`` per document.

    ``doc`` and ``doc_id`` are subscript lookups, so a request missing either
    one raises from the mapping lookup. If ``doc_id`` holds fewer entries than
    ``doc``, indexing raises ``IndexError``; surplus ids are ignored.

    Returns an empty ``Response``.
    """
    # Both transports expose the same mapping interface, so choose the
    # mapping once instead of duplicating the whole body per HTTP method.
    # NOTE(review): `request` is presumably Flask's request proxy - confirm.
    params = request.form if request.method == 'POST' else request.args

    # TODO Fix this: documents and ids travel as single fields joined by an
    # ad-hoc delimiter, which breaks if a document contains the delimiter.
    separator = "ASDFJKL"
    doc_list_item = params['doc'].split(separator)
    doc_ids_item = params['doc_id'].split(separator)

    # These do not vary per document, so look them up once.
    source = params.get('source')
    filetype = params.get('filetype')

    for index, doc in enumerate(doc_list_item):
        # Ids are matched to documents by position.
        doc_id = doc_ids_item[index]
        timestamp = process_docs.process_raw(doc, source, doc_id, filetype)
        # Append right after each document so that entries for documents
        # already processed are on disk even if a later one fails.
        with open('worker_manager/recent_files.txt', 'a') as f:
            f.write(source + ", " + doc_id + ", " + str(timestamp) + "\n")

    return Response()
def test_process_raw(self):
    """Check that processing a raw document leaves a ``raw.json`` on disk.

    Builds a ``RawDocument`` for source ``TEST`` with doc id 37, passes it to
    ``process_docs.process_raw`` (asserting a truthy result), then walks
    ``archive/TEST/<doc_id>`` and asserts that at least one visited directory
    contains a ``raw.json`` file.
    """
    payload = {
        'doc': json.dumps({'Hello': 'world'}),
        'source': "TEST",
        'doc_id': 37,
        'filetype': "json",
    }
    raw_file = RawDocument(payload)

    assert process_docs.process_raw(raw_file, 'test-version')

    archive_root = 'archive/TEST/{0}'.format(raw_file.get('doc_id'))
    # True as soon as any directory under the archive root holds raw.json.
    found = any(
        os.path.isfile(dirname + '/raw.json')
        for dirname, _subdirs, _files in os.walk(archive_root)
    )
    assert found
def test_process_legal(self):
    """Check that a normalized document is archived next to its raw document.

    First archives a raw ``TEST`` document (doc id 37) and asserts that the
    last path component of the directory holding ``raw.json`` equals the
    stringified value returned by ``process_docs.process_raw``. Then builds a
    ``NormalizedDocument`` stamped with that directory name, passes it to
    ``process_docs.process`` (asserting a truthy result), and asserts that a
    ``normalized.json`` file exists somewhere under the same archive root.
    """
    raw_doc = RawDocument({
        'doc': json.dumps({'Hello': 'world'}),
        'source': 'TEST',
        'doc_id': 37,
        'filetype': 'json',
    })

    def archived_dirs():
        # The root path is recomputed on every walk, as each walk needs the
        # current doc id from the raw document.
        return os.walk('archive/TEST/{0}'.format(raw_doc.get('doc_id')))

    ts = str(process_docs.process_raw(raw_doc, 'test-version'))

    # Directory names (last path component) of every directory that holds a
    # raw.json; when several match, the last one visited is the one used.
    stamped = [
        dirname.rsplit('/', 1)[-1]
        for dirname, _subdirs, _files in archived_dirs()
        if os.path.isfile(dirname + '/raw.json')
    ]
    timestamp = stamped[-1] if stamped else None
    assert timestamp == ts

    contributors = [
        {'full_name': 'Me, Myself', 'email': '*****@*****.**'},
        {'full_name': 'And I', 'email': '*****@*****.**'},
    ]
    identifiers = {
        'service_id': raw_doc.get('doc_id'),
        'doi': 'Not available',
        'url': 'fake.stuff.org/{}'.format(raw_doc.get('doc_id')),
    }
    doc = NormalizedDocument({
        'title': "TEST PROJECT",
        'contributors': contributors,
        'properties': {},
        'meta': {},
        'id': identifiers,
        'source': raw_doc.get('source'),
        'timestamp': str(timestamp),
        'tags': ['1', '2', '3'],
        'date_created': str(timestamp),
        'description': 'science stuff',
    })

    assert process_docs.process(doc, timestamp)

    found = any(
        os.path.isfile(dirname + '/normalized.json')
        for dirname, _subdirs, _files in archived_dirs()
    )
    assert found
def run_consumer(manifest_file):
    """Consume, archive and normalize documents for one manifest.

    Loads the manifest with ``_load_config``, logs the manifest's
    ``directory`` entry, and calls ``_consume`` with that directory to obtain
    the results, a registry and the consumer version. Every result is passed
    to ``process_docs.process_raw`` to get a timestamp and then to
    ``_normalize`` together with that timestamp, the registry and the
    manifest.

    NOTE(review): any search-index update presumably happens inside
    ``_normalize``; nothing in this function touches an index directly.

    Returns the list of values produced by ``_normalize``, in result order.
    """
    manifest = _load_config(manifest_file)

    logger.info('run_scraper executing for service {}'.format(manifest['directory']))
    logger.info('worker_manager.consumers.{0}'.format(manifest['directory']))

    results, registry, consumer_version = _consume(manifest['directory'])

    # Arguments evaluate left to right, so each result is archived by
    # process_raw before it is normalized.
    return [
        _normalize(
            result,
            process_docs.process_raw(result, consumer_version),
            registry,
            manifest,
        )
        for result in results
    ]
def run_consumer(manifest_file):
    """Run one consumer end to end and return its normalized documents.

    Steps, in order:

    1. Load the manifest via ``_load_config``.
    2. Log two info lines naming the manifest's ``directory`` entry.
    3. Call ``_consume`` with that directory, yielding the results, a
       registry and the consumer version.
    4. For each result, obtain a timestamp from
       ``process_docs.process_raw`` and pass the result, timestamp, registry
       and manifest to ``_normalize``.

    NOTE(review): indexing of the normalized documents is not visible here;
    presumably it is handled by ``_normalize`` - confirm.

    Returns the collected ``_normalize`` outputs as a list.
    """
    manifest = _load_config(manifest_file)
    service_dir = manifest['directory']

    executing_msg = 'run_scraper executing for service {}'.format(service_dir)
    logger.info(executing_msg)
    module_msg = 'worker_manager.consumers.{0}'.format(service_dir)
    logger.info(module_msg)

    consumed = _consume(service_dir)
    results, registry, consumer_version = consumed

    normalized_docs = []
    for raw_result in results:
        # Archive the raw result first; its timestamp feeds normalization.
        archived_at = process_docs.process_raw(raw_result, consumer_version)
        normalized = _normalize(raw_result, archived_at, registry, manifest)
        normalized_docs.append(normalized)
    return normalized_docs