def add_dataset(db, uid, dztotalfilesize, dzfilename, dzfilepath, dzuuid):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        mimetype = m.id_filename(dzfilepath)

    xml_path = None
    if mimetype == "application/pdf":
        xml_path = dzfilepath[:-4] + ".xml"
    elif mimetype in ['text/plain', 'text/html']:
        mimetype = 'text/xml'

    # Create the dataset row with an empty per-stage status
    status = json.dumps({'annotate': None, 'ml': None, 'preview': None, 'download': None})
    dataset = Datasets(uid=uid, name=dzfilename, size=dztotalfilesize,
                       file_path=dzfilepath, upload_mimetype=mimetype,
                       upload_uuid=dzuuid, xml_file_path=xml_path, status=status)

    print_log(app.name, 'Adding dataset: {}'.format(dataset))
    db.session.add(dataset)
    db.session.commit()
    return dataset.id
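# A minimal sketch (not from the original source) of how add_dataset() might be
# called from a Dropzone-style upload view. The route and the way the file is
# persisted are assumptions; the dz* field names, the APP_MEDIA config key and
# the add_dataset() signature come from the surrounding code.
@app.route('/api/dataset/upload', methods=['POST'])
def upload_dataset_example():
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    upload = flask.request.files['file']
    dzfilepath = os.path.join(app.config['APP_MEDIA'], upload.filename)
    upload.save(dzfilepath)
    dsid = add_dataset(db, uid,
                       dztotalfilesize=os.path.getsize(dzfilepath),
                       dzfilename=upload.filename,
                       dzfilepath=dzfilepath,
                       dzuuid=flask.request.form.get('dzuuid'))
    return flask.make_response({'message': 'OK', 'dsid': dsid}, 200)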
def delete_ml(dsid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    local = flask.request.args.get('local', default=None, type=str) == 'True'

    if local:
        try:
            print_log(app.name, 'Deleting local ML files: {}'.format(dataset))
            json_ml_in = '/var/www/elexifier-api/app/media/ML-IN-{}.json'.format(str(dsid))
            json_ml_out = '/var/www/elexifier-api/app/media/ML-OUT-{}.json'.format(str(dsid))
            if dataset.xml_lex != "":
                os.remove(dataset.xml_lex)
            if dataset.xml_ml_out != "":
                os.remove(dataset.xml_ml_out)
            os.remove(json_ml_in)
            os.remove(json_ml_out)
        except OSError:
            # Deletion is best-effort; some of the files may already be gone
            pass
        Datasets.dataset_add_ml_paths(dsid)
    else:
        print_log(app.name, 'Deleting Lexonomy preview file: {}'.format(dataset))
        if dataset.lexonomy_ml_delete is not None:
            requests.post(dataset.lexonomy_ml_delete,
                          headers={"Content-Type": 'application/json',
                                   "Authorization": app.config['LEXONOMY_AUTH_KEY']})
        Datasets.dataset_add_ml_lexonomy_access(db, dsid)

    return flask.make_response({'message': 'OK'}, 200)
def add_user():
    # Check all required fields
    for field in ['email', 'password']:
        if field not in flask.request.json:
            raise InvalidUsage('Field {0} is missing.'.format(field),
                               status_code=422, enum='POST_ERROR')

    email = flask.request.json['email']
    password = flask.request.json['password']

    # Check if the user already exists
    user = User.query.filter_by(email=email).first()
    if user is not None:
        db.session.close()
        raise InvalidUsage('User already exists', status_code=409, enum='USER_EXISTS')

    user = User(email, password)
    db.session.add(user)
    db.session.commit()
    response = {
        'message': 'Registration was successful',
        'username': '',
        'email': user.email,
        'auth_token': user.get_auth_token()
    }
    log.print_log(app.name, 'Registered new user {}'.format(user))
    return flask.make_response(jsonify(response), 200)
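# A minimal client sketch (assumption: add_user() is mounted at
# /api/user/register; adjust to the actual route). Shows the JSON body the view
# requires and the auth_token it returns, which the other endpoints expect in
# the Authorization header.
def register_example(base_url):
    r = requests.post(base_url + '/api/user/register',
                      json={'email': 'user@example.org', 'password': 'secret'})
    r.raise_for_status()
    return r.json()['auth_token']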
def xf_update_transform(xfid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    xfspec = flask.request.json.get('xfspec', None)
    saved = flask.request.json.get('saved', False)
    name = flask.request.json.get('name', None)
    print_log(app.name, 'Update transform {}'.format(xfid))

    if xfspec is None:
        raise InvalidUsage("Invalid API call.", status_code=422, enum="POST_ERROR")

    rv = controllers.update_transform(xfid, xfspec, name, saved)
    return flask.make_response({'updated': rv}, 200)
def ml_run(dsid):
    """
    Dataset should be annotated at Lexonomy so we can download it and start the ML process.
    ML statuses: Starting_ML -> ML_Format -> ML_Annotated -> Lex_Format
    Error statuses: Lex2ML_Error, ML_Error, ML2Lex_Error
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)

    # Get annotations first, so we get the lex_xml path in the db
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    if dataset.status['annotate'] != 'Ready':
        raise InvalidUsage('File is not annotated at Lexonomy.',
                           status_code=409, enum='STATUS_ERROR')
    get_lex_xml(uid, dsid)
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Delete the preview
    dataset.status['preview'] = None
    Datasets.dataset_add_ml_lexonomy_access(dsid)
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})

    if dataset.status['ml'] in ['Starting_ML', 'ML_Format', 'ML_Annotated']:
        raise InvalidUsage('ML is already running.', status_code=409, enum='STATUS_ERROR')

    print_log(app.name, '{} Starting ML'.format(dataset))
    dataset.status['ml'] = 'Starting_ML'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    # Get files ready
    xml_raw = dataset.xml_file_path
    xml_ml_out = dataset.xml_lex[:-4] + '-ML_OUT.xml'
    Datasets.dataset_add_ml_paths(dsid, xml_lex=dataset.xml_lex, xml_ml_out=xml_ml_out)

    # Run ML
    task = run_pdf2lex_ml_scripts.apply_async(
        args=[uid, dsid, xml_raw, dataset.xml_lex, xml_ml_out], countdown=0)
    Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)

    return flask.make_response(
        {'message': 'ok', 'dsid': dsid, 'status': dataset.status['ml']}, 200)
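# A minimal polling sketch (hypothetical helper, not part of the API): walks
# the status machine from the docstring above until a terminal state is
# reached. 'Lex_Format' is the success state; the *_Error states mean the
# Celery task failed.
def wait_for_ml_example(dsid, interval=5):
    import time
    terminal = {'Lex_Format', 'Lex2ML_Error', 'ML_Error', 'ML2Lex_Error'}
    while True:
        ml_status = Datasets.dataset_status(dsid)['ml']
        if ml_status in terminal:
            return ml_status
        time.sleep(interval)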
def prepare_dataset(uid, dsid, xfid, xpath, hw):
    dataset = Datasets.query.filter_by(uid=uid, id=dsid).first()
    print_log(app.name, 'Preparing dataset {}'.format(dataset))
    mimetype, data = dataset.upload_mimetype, dataset.file_path

    for xml in xmls(mimetype, data):
        tree = lxml.etree.parse(xml)
        xpath = xpath.strip()

        # lxml can only evaluate prefixed XPaths against a namespace map;
        # take the first prefixed mapping if there is one
        namespaces = tree.getroot().nsmap
        namespace = ''
        namespace_prefix = False
        for prefix, ns in namespaces.items():
            if prefix:
                namespace_prefix = True
                namespace = {prefix: ns}
                break
            else:
                namespace = ns

        if namespace_prefix:
            nodes = tree.xpath('//' + xpath, namespaces=namespace)
        else:
            nodes = tree.xpath('//' + xpath)

        for entry in nodes:
            headword = entry.findall('.//' + hw)
            text = headword[0].text if headword else ''
            entry_str = lxml.etree.tostring(entry, encoding='unicode',
                                            xml_declaration=False)
            entry_head = clean_tag(entry_str.split('\n', 1)[0])[:10]

            # Create a row per extracted entry
            single_entry = Datasets_single_entry(dsid=dsid, xfid=xfid,
                                                 entry_head=entry_head,
                                                 entry_text=text,
                                                 contents=entry_str)
            db.session.add(single_entry)

    db.session.commit()
    return (True, 'Done')
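# A worked example of the namespace handling above: in lxml, elements in a
# default (unprefixed) namespace are invisible to a bare XPath, while a
# prefixed nsmap entry can be passed straight to xpath(), which is why the
# namespace_prefix branch exists. The document below is illustrative.
def xpath_namespace_example():
    doc = lxml.etree.fromstring(
        '<TEI xmlns="http://www.tei-c.org/ns/1.0"><entry/></TEI>')
    assert doc.xpath('//entry') == []  # default namespace: no match
    assert len(doc.xpath('//t:entry',
                         namespaces={'t': 'http://www.tei-c.org/ns/1.0'})) == 1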
def ds_download2(xfid, dsid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    status = controllers.transformer_download_status(xfid)
    get_status = flask.request.args.get('status', default='false', type=str) == 'true'

    if get_status:
        return flask.make_response({'status': status}, 200)

    elif status is None:
        print_log(app.name,
                  'Transformed dataset download started uid: {0:s}, xfid: {1:s}, dsid: {2:s}'
                  .format(str(uid), str(xfid), str(dsid)))
        strip_ns = flask.request.args.get('strip_ns', default='false', type=str) == 'true'
        strip_header = flask.request.args.get('strip_header', default='false', type=str) == 'true'
        strip_DictScrap = flask.request.args.get('strip_DictScrap', default='false', type=str) == 'true'
        strip_DictScrap = strip_ns  # TODO: remove this when added to FE

        # Check if the transformer exists
        try:
            transform = controllers.list_transforms(dsid, xfid=xfid)
            xf = transform.transform
        except Exception:
            raise InvalidUsage('Transformer does not exist.', status_code=409)

        if xf is None:  # Not sure why this is needed here?
            return flask.make_response(
                {'spec': None, 'entity_xml': None, 'output': None}, 200)
        else:
            # Start the download task
            prepare_download.apply_async(
                args=[uid, xfid, dsid, strip_ns, strip_header, strip_DictScrap],
                countdown=0)
            status = 'Processing'
            controllers.transformer_download_status(xfid, set=True, download_status=status)

    elif status == "Processing":
        return flask.make_response({'message': 'File is still processing'}, 200)

    elif status == "Ready":
        print_log(app.name,
                  'Transformed dataset download finished uid: {0:s}, xfid: {1:s}, dsid: {2:s}'
                  .format(str(uid), str(xfid), str(dsid)))

        # Return the file and delete it afterwards
        dataset = Datasets.list_datasets(uid, dsid=dsid)
        file_name, file_type = dataset.name.rsplit('.', 1)
        target_file_name = file_name + '_' + str(xfid) + '_TEI.' + file_type
        target_path = os.path.join(app.config['APP_MEDIA'], target_file_name)

        controllers.transformer_download_status(xfid, set=True)  # reset status
        transform_name = controllers.list_transforms(dsid, xfid=xfid).name
        out_name = dataset.name[:-4] + '-' + transform_name + '.xml'

        @after_this_request
        def remove_file(response):
            response.headers['x-suggested-filename'] = out_name
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(target_path)
            return response

        return flask.send_file(target_path, attachment_filename=out_name,
                               as_attachment=True)

    return flask.make_response({'message': 'ok', 'status': status}, 200)
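# A minimal client sketch of the download protocol implemented above (the
# route path is an assumption): a first GET enqueues prepare_download,
# '?status=true' polls, and once the status is 'Ready' a plain GET streams
# the file.
def download_transformed_example(base_url, token, xfid, dsid, out_path):
    import time
    headers = {'Authorization': token}
    url = '{0}/api/transform/{1}/download/{2}'.format(base_url, xfid, dsid)
    requests.get(url, headers=headers)  # status is None: starts processing
    while requests.get(url, headers=headers,
                       params={'status': 'true'}).json()['status'] != 'Ready':
        time.sleep(5)
    r = requests.get(url, headers=headers)  # status is 'Ready': returns the file
    with open(out_path, 'wb') as f:
        f.write(r.content)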
def run_pdf2lex_ml_scripts(uid, dsid, xml_raw, xml_lex, xml_out):
    # Create working files
    temp_fname = xml_raw.split('.xml')[0]
    json_ml_in = temp_fname + '-ML-IN.json'
    json_ml_out = temp_fname + '-ML-OUT.json'
    open(json_ml_in, 'a').close()
    open(json_ml_out, 'a').close()
    open(xml_out, 'a').close()

    def clean_files():
        os.remove(json_ml_in)
        os.remove(json_ml_out)

    status = Datasets.dataset_status(dsid)

    # Step 1: Lexonomy xml -> ML input json
    print_log('celery', 'Dictionary: {} @xml2json_ML'.format(dsid))
    try:
        xml2json(xml_raw, xml_lex, json_ml_in)
        status['ml'] = 'ML_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'Lex2ML_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @xml2json_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    # Step 2: train the model and annotate
    print_log('celery', 'Dictionary: {} @train_ML'.format(dsid))
    try:
        _, report = train_ML(json_ml_in, json_ml_out, '')
        ErrorLog.add_error_log(db, dsid, tag='ml_finished', message=report)
        status['ml'] = 'ML_Annotated'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'ML_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @train_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    # Step 3: ML output json -> xml in Lexonomy format
    print_log('celery', 'Dictionary: {} @json2xml_ML'.format(dsid))
    try:
        json2xml(json_ml_out, xml_raw, xml_out)
        status['ml'] = 'Lex_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'ML2Lex_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @json2xml_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
    clean_files()
    return
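# A wiring sketch, assuming run_pdf2lex_ml_scripts is registered as a Celery
# task (ml_run() above calls run_pdf2lex_ml_scripts.apply_async(), which only
# works on a registered task). The broker URL, app name and wrapper task are
# assumptions for illustration; in the real code base the decorator presumably
# sits on run_pdf2lex_ml_scripts itself.
from celery import Celery

celery_app = Celery('elexifier', broker='redis://localhost:6379/0')

@celery_app.task
def run_pdf2lex_ml_scripts_task(uid, dsid, xml_raw, xml_lex, xml_out):
    # Delegates to the function above
    return run_pdf2lex_ml_scripts(uid, dsid, xml_raw, xml_lex, xml_out)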
def ds_send_to_lexonomy(dsid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    user = User.query.filter_by(id=uid).first()
    db.session.close()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    additional_pages = flask.request.args.get('add_pages', default='0', type=str).lower() == '1'
    if additional_pages:
        # Get the file from Lexonomy and save it
        get_lex_xml(uid, dsid)

    # Reset dataset status and delete old files @Lexonomy
    dataset.status['ml'] = None
    dataset.status['preview'] = None
    if dataset.lexonomy_delete is not None:
        requests.post(dataset.lexonomy_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})

    request_data = {
        'xml_file': '/api/lexonomy/{}/download/{}'.format(uid, dsid)
                    + ('?add_pages=True' if additional_pages else ''),
        'email': user.email,
        'filename': dataset.name + ' - annotate',
        'type': 'edit',
        'url': app.config['URL'],
        'ske_user': user.sketch_engine_uid is not None,
        'return_to': ""  # remove if no longer required
    }

    print_log(app.name, 'Starting asynchronous request to Lexonomy {}'.format(dataset))
    make_lexonomy_request.apply_async(args=[dsid, request_data], countdown=0)

    # Update dataset status
    dataset.status['annotate'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    return flask.make_response(
        {'message': 'OK', 'dsid': dsid,
         'status': dataset.status['annotate'],
         'test_request': request_data}, 200)
def prepare_TEI_download(dsid, input_file, output_file, character_map):
    # Load json for the transformation
    json_file = os.path.join(app.config['APP_DIR'],
                             'modules/pdf2lex_ml/lexonomy_to_tei.json')
    with open(json_file, 'r') as file:
        json_data = file.read()
    transformation_json = json.loads(json_data)

    # Clean tokens
    lexonomy_xml = lxml.etree.parse(input_file)
    if character_map is None:
        character_map = dict()
    clean_tokens(lexonomy_xml.getroot(), character_map)
    orig_xml = lxml.etree.tostring(lexonomy_xml)

    # Re-parse with the transformator's element class
    parserLookup = lxml.etree.ElementDefaultClassLookup(element=transformator.TMyElement)
    myParser = lxml.etree.XMLParser()
    myParser.set_element_class_lookup(parserLookup)
    lexonomy_xml = lxml.etree.fromstring(orig_xml, parser=myParser)

    # Init transformator
    mapping = transformator.TMapping(transformation_json)
    mapper = transformator.TMapper()

    # Transform Lexonomy format to TEI format
    metadata = Datasets.dataset_metadata(dsid)
    out_TEI, out_aug = mapper.Transform(
        mapping, [], [lxml.etree.ElementTree(lexonomy_xml)],
        makeAugmentedInputTrees=True,
        stripForValidation=False,
        stripHeader=False,
        # stripDictScrap=True,  # TODO: change when fixed
        stripDictScrap=False,
        headerTitle=False,
        headerPublisher=False,
        headerBibl=False,
        promoteNestedEntries=True,
        metadata=metadata)
    print_log('DEBUG', 'transformed')

    target_xml = '\n' + lxml.etree.tostring(out_TEI, pretty_print=True, encoding='unicode')
    print_log('DEBUG', 'in string')
    target_xml = target_xml.replace(
        '<entry xmlns:m="http://elex.is/wp1/teiLex0Mapper/meta" xmlns:a="http://elex.is/wp1/teiLex0Mapper/legacyAttributes" xmlns="http://www.tei-c.org/ns/1.0">',
        '<entry>')
    print_log('DEBUG', 'entry replaced')

    # Write the transformed xml to file
    print_log('DEBUG', 'writing to file')
    with open(output_file, 'w') as out:
        out.write(target_xml)
    print_log('DEBUG', 'writing finished')

    status = Datasets.dataset_status(dsid)
    status['download'] = 'Ready'
    Datasets.dataset_status(dsid, set=True, status=status)
    return