Exemplo n.º 1
0
def add_dataset(db, uid, dztotalfilesize, dzfilename, dzfilepath, dzuuid):
    """Create a Datasets row for an uploaded file and return its id.

    :param db: SQLAlchemy handle whose session the row is committed to
    :param uid: owning user's id
    :param dztotalfilesize: upload size in bytes (Dropzone field)
    :param dzfilename: original file name (Dropzone field)
    :param dzfilepath: path where the upload was stored on disk
    :param dzuuid: Dropzone upload uuid
    :return: primary key of the newly committed Datasets row
    """
    # Sniff the real mimetype from file contents, not the extension.
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        mimetype = m.id_filename(dzfilepath)

    xml_path = None
    if mimetype == "application/pdf":
        # PDFs get a sibling .xml path for extracted text. rsplit on the
        # last dot instead of slicing off 4 chars, so paths whose extension
        # is not exactly 3 characters long are handled correctly too.
        xml_path = dzfilepath.rsplit('.', 1)[0] + ".xml"
    elif mimetype in ('text/plain', 'text/html'):
        # XML uploads are commonly sniffed as plain text/html; normalize.
        mimetype = 'text/xml'

    # Per-stage pipeline status, all stages initially idle.
    status = json.dumps({
        'annotate': None,
        'ml': None,
        'preview': None,
        'download': None
    })
    dataset = Datasets(uid=uid,
                       name=dzfilename,
                       size=dztotalfilesize,
                       file_path=dzfilepath,
                       upload_mimetype=mimetype,
                       upload_uuid=dzuuid,
                       xml_file_path=xml_path,
                       status=status)
    print_log(app.name, 'Adding dataset: {}'.format(dataset))
    db.session.add(dataset)
    db.session.commit()
    return dataset.id
Exemplo n.º 2
0
def delete_ml(dsid):
    """Delete ML artifacts for a dataset.

    With ``?local=True`` the on-disk ML files are removed; otherwise the
    Lexonomy ML preview is deleted via its delete-callback URL.

    :param dsid: dataset id
    :return: flask 200 response with ``{'message': 'OK'}``
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    local = flask.request.args.get('local', default=None, type=str) == 'True'

    if local:
        print_log(app.name, 'Deleting local ML files: {}'.format(dataset))
        json_ml_in = '/var/www/elexifier-api/app/media/ML-IN-{}.json'.format(
            str(dsid))
        json_ml_out = '/var/www/elexifier-api/app/media/ML-OUT-{}.json'.format(
            str(dsid))
        # Best-effort removal: each file is deleted independently, so one
        # missing/locked file no longer skips the rest (the original wrapped
        # all removals in a single bare try/except). Falsy paths (None, "")
        # are skipped outright.
        for path in (dataset.xml_lex, dataset.xml_ml_out,
                     json_ml_in, json_ml_out):
            if not path:
                continue
            try:
                os.remove(path)
            except OSError:
                pass
        Datasets.dataset_add_ml_paths(dsid)
    else:
        print_log(app.name,
                  'Deleting Lexonomy preview file: {}'.format(dataset))
        if dataset.lexonomy_ml_delete is not None:
            requests.post(dataset.lexonomy_ml_delete,
                          headers={
                              "Content-Type": 'application/json',
                              "Authorization": app.config['LEXONOMY_AUTH_KEY']
                          })
        Datasets.dataset_add_ml_lexonomy_access(db, dsid)

    return flask.make_response({'message': 'OK'}, 200)
Exemplo n.º 3
0
def add_user():
    """Register a new user from the JSON request body.

    Expects ``email`` and ``password`` fields; returns a 200 response with
    an auth token on success.

    :raises InvalidUsage: 422 if a required field is absent,
        409 if a user with that email already exists
    """
    # Check all required fields
    for field in ['email', 'password']:
        if field not in flask.request.json:
            # Bug fix: the original never filled the {0} placeholder
            # (no .format call) and misspelled "missing".
            raise InvalidUsage('Field {0} is missing.'.format(field),
                               status_code=422, enum='POST_ERROR')

    email = flask.request.json['email']
    password = flask.request.json['password']

    # Check if user already exists
    user = User.query.filter_by(email=email).first()
    if user is not None:
        db.session.close()
        raise InvalidUsage('User already exists', status_code=409, enum='USER_EXISTS')

    user = User(email, password)
    db.session.add(user)
    db.session.commit()

    response = {
        'message': 'Registration was successful',
        'username': '',
        'email': user.email,
        'auth_token': user.get_auth_token()
    }
    log.print_log(app.name, 'Registered new user {}'.format(user))
    return flask.make_response(jsonify(response), 200)
Exemplo n.º 4
0
def xf_update_transform(xfid):
    """Update a transform's spec (and optionally name/saved flag).

    :param xfid: transform id
    :raises InvalidUsage: 422 when no ``xfspec`` is supplied
    :return: flask 200 response with ``{'updated': <rv>}``
    """
    token = flask.request.headers.get('Authorization')
    # Renamed from `id`: avoids shadowing the builtin and matches the `uid`
    # naming used by the sibling handlers. (Only used for auth verification.)
    uid = verify_user(token)
    xfspec = flask.request.json.get('xfspec', None)
    saved = flask.request.json.get('saved', False)
    name = flask.request.json.get('name', None)
    print_log(app.name, 'Update transform {}'.format(xfid))
    if xfspec is None:
        raise InvalidUsage("Invalid API call.",
                           status_code=422,
                           enum="POST_ERROR")
    rv = controllers.update_transform(xfid, xfspec, name, saved)
    return flask.make_response({'updated': rv}, 200)
Exemplo n.º 5
0
def ml_run(dsid):
    """
    Dataset should be annotated at Lexonomy so we can download it and start ML process.
    ML statuses: Starting_ML -> ML_Format -> ML_Annotated -> Lex_Format
    Error statuses: Lex2ML_Error, ML_Error, ML2Lex_Error
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    # get annotations first, so we get lex_xml path in db
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    # The annotation stage must have completed before ML can start.
    if dataset.status['annotate'] != 'Ready':
        raise InvalidUsage('File is not annotated at Lexonomy.',
                           status_code=409,
                           enum='STATUS_ERROR')
    get_lex_xml(uid, dsid)
    # Re-fetch: get_lex_xml updated the row (xml_lex path) in the db.
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # deleting preview
    dataset.status['preview'] = None
    # NOTE(review): elsewhere this helper is called with a `db` first
    # argument — confirm which signature is current.
    Datasets.dataset_add_ml_lexonomy_access(dsid)
    if dataset.lexonomy_ml_delete is not None:
        # Ask Lexonomy to drop the old ML preview before re-running.
        requests.post(dataset.lexonomy_ml_delete,
                      headers={
                          "Content-Type": 'application/json',
                          "Authorization": app.config['LEXONOMY_AUTH_KEY']
                      })

    # Reject if a run is already in flight (any non-terminal ML status).
    if dataset.status['ml'] in ['Starting_ML', 'ML_Format', 'ML_Annotated']:
        raise InvalidUsage('ML is already running.',
                           status_code=409,
                           enum='STATUS_ERROR')
    print_log(app.name, '{} Starting ML'.format(dataset))
    dataset.status['ml'] = 'Starting_ML'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    # Get files ready
    xml_raw = dataset.xml_file_path
    # ML output path derived from the annotated-file path ("-ML_OUT" suffix).
    xml_ml_out = dataset.xml_lex[:-4] + '-ML_OUT.xml'
    Datasets.dataset_add_ml_paths(dsid,
                                  xml_lex=dataset.xml_lex,
                                  xml_ml_out=xml_ml_out)
    # Run ml
    # Dispatch the pipeline as an async Celery task and remember its id
    # so the status endpoint / cancellation can find it.
    task = run_pdf2lex_ml_scripts.apply_async(
        args=[uid, dsid, xml_raw, dataset.xml_lex, xml_ml_out], countdown=0)
    Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)
    return flask.make_response(
        {
            'message': 'ok',
            'dsid': dsid,
            'status': dataset.status['ml']
        }, 200)
Exemplo n.º 6
0
def prepare_dataset(uid, dsid, xfid, xpath, hw):
    """Split a dataset into single-entry rows selected by an XPath.

    Parses each XML document of the dataset, selects entry nodes with
    ``//<xpath>`` and stores each as a Datasets_single_entry row, using the
    first ``hw`` descendant's text as the entry headword text.

    :param uid: owning user's id
    :param dsid: dataset id
    :param xfid: transform id the entries belong to
    :param xpath: element name/path selecting entry nodes
    :param hw: element name holding the headword
    :return: (True, 'Done') on completion
    """
    dataset = Datasets.query.filter_by(uid=uid, id=dsid).first()
    print_log(app.name, 'Preparing dataset {}'.format(dataset))
    mimetype, data = dataset.upload_mimetype, dataset.file_path
    # Hoisted out of the loop: the xpath string is invariant across files.
    xpath = xpath.strip()

    for xml in xmls(mimetype, data):
        tree = lxml.etree.parse(xml)

        # If the document declares a prefixed namespace, the xpath query
        # must be evaluated with that prefix mapping; a default (unprefixed)
        # namespace is noted but not passed to xpath.
        namespaces = tree.getroot().nsmap
        namespace = ''
        namespace_prefix = False
        for prefix, ns in namespaces.items():
            if prefix:
                namespace_prefix = True
                namespace = {prefix: ns}
                break
            else:
                namespace = ns

        if namespace_prefix:
            nodes = tree.xpath('//' + xpath, namespaces=namespace)
        else:
            nodes = tree.xpath('//' + xpath)

        for entry in nodes:
            headword = entry.findall('.//' + hw)
            # Guard against an element with no text (element.text is None)
            # so entry_text is always a string.
            text = (headword[0].text or '') if headword else ''
            entry_str = lxml.etree.tostring(entry,
                                            encoding='unicode',
                                            xml_declaration=False)
            # Short preview of the entry's first line, tags stripped.
            entry_head = clean_tag(entry_str.split('\n', 1)[0])[:10]

            # Create
            dataset = Datasets_single_entry(dsid=dsid,
                                            xfid=xfid,
                                            entry_head=entry_head,
                                            entry_text=text,
                                            contents=entry_str)
            db.session.add(dataset)
    # Single commit after all entries are queued.
    db.session.commit()
    return (True, 'Done')
Exemplo n.º 7
0
def ds_download2(xfid, dsid):
    """Download (or poll the status of) a transformed dataset.

    State machine on the stored download status:
      * ``?status=true``      -> return current status only
      * status ``None``       -> kick off async preparation, mark Processing
      * status ``Processing`` -> report still processing
      * status ``Ready``      -> send the file and delete it afterwards

    :param xfid: transform id
    :param dsid: dataset id
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    status = controllers.transformer_download_status(xfid)

    get_status = flask.request.args.get('status', default='false',
                                        type=str) == 'true'

    if get_status:
        return flask.make_response({'status': status}, 200)

    elif status is None:
        print_log(
            app.name,
            'Transformed dataset download started uid: {0:s}, xfid: {1:s} , dsid: {2:s}'
            .format(str(uid), str(xfid), str(dsid)))
        strip_ns = flask.request.args.get('strip_ns',
                                          default='false',
                                          type=str) == 'true'
        strip_header = flask.request.args.get('strip_header',
                                              default='false',
                                              type=str) == 'true'
        strip_DictScrap = flask.request.args.get('strip_DictScrap',
                                                 default='false',
                                                 type=str) == 'true'
        strip_DictScrap = strip_ns  # TODO: remove this, when added to FE

        # Check if transformer exists
        try:
            transform = controllers.list_transforms(dsid, xfid=xfid)
            xf = transform.transform
        except Exception:
            raise InvalidUsage('Transformer does not exist.', status_code=409)

        if xf is None:  # Not sure why this is needed here?
            return flask.make_response(
                {
                    'spec': None,
                    'entity_xml': None,
                    'output': None
                }, 200)
        else:
            # start download task
            prepare_download.apply_async(args=[
                uid, xfid, dsid, strip_ns, strip_header, strip_DictScrap
            ],
                                         countdown=0)
            status = 'Processing'
            controllers.transformer_download_status(xfid,
                                                    set=True,
                                                    download_status=status)

    elif status == "Processing":
        return flask.make_response({'message': 'File is still processing'},
                                   200)

    elif status == "Ready":
        print_log(
            app.name,
            'Transformed dataset download finished uid: {0:s}, xfid: {1:s} , dsid: {2:s}'
            .format(str(uid), str(xfid), str(dsid)))
        # return file and delete afterwards
        dataset = Datasets.list_datasets(uid, dsid=dsid)
        # Bug fix: split on the LAST dot so names containing dots
        # (e.g. "my.dict.xml") don't raise a ValueError on unpacking.
        file_name, file_type = dataset.name.rsplit('.', 1)
        target_file_name = file_name + '_' + str(xfid) + '_TEI.' + file_type
        target_path = os.path.join(app.config['APP_MEDIA'], target_file_name)

        controllers.transformer_download_status(xfid, set=True)  # reset status
        transform_name = controllers.list_transforms(dsid, xfid=xfid).name
        # Defined before the after-request hook that closes over it.
        out_name = dataset.name[:-4] + '-' + transform_name + '.xml'

        @after_this_request
        def remove_file(response):
            # Expose the suggested filename to the browser, then remove the
            # served file from disk.
            response.headers['x-suggested-filename'] = out_name
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(target_path)
            return response

        return flask.send_file(target_path,
                               attachment_filename=out_name,
                               as_attachment=True)

    return flask.make_response({'message': 'ok', 'status': status}, 200)
Exemplo n.º 8
0
def run_pdf2lex_ml_scripts(uid, dsid, xml_raw, xml_lex, xml_out):
    """Run the three-stage pdf2lex ML pipeline for a dataset.

    Stages: xml2json (Lex -> ML input), train_ML, json2xml (ML -> Lex
    output). The dataset's ML status is advanced after each stage
    (ML_Format -> ML_Annotated -> Lex_Format); on failure the matching
    error status is recorded and the task aborts.

    :param uid: owning user's id (unused here but part of the task signature)
    :param dsid: dataset id
    :param xml_raw: path to the raw extracted XML
    :param xml_lex: path to the Lexonomy-annotated XML
    :param xml_out: path where the ML-annotated XML is written
    """
    # Create files so downstream writers/readers find them.
    temp_fname = xml_raw.split('.xml')[0]
    json_ml_in = temp_fname + '-ML-IN.json'
    json_ml_out = temp_fname + '-ML-OUT.json'
    open(json_ml_in, 'a').close()
    open(json_ml_out, 'a').close()
    open(xml_out, 'a').close()

    def clean_files():
        # Intermediate JSON files are only needed while the task runs.
        os.remove(json_ml_in)
        os.remove(json_ml_out)

    status = Datasets.dataset_status(dsid)

    def fail(error_status, step_name):
        # Shared error path (was duplicated in all three stages): record the
        # failing stage, clear the task id, log and persist the traceback.
        status['ml'] = error_status
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery',
                  'Dictionary: {} @{} [ERROR]'.format(dsid, step_name))
        clean_files()
        ErrorLog.add_error_log(db,
                               dsid,
                               tag='ml_error',
                               message=traceback.format_exc())

    print_log('celery', 'Dictionary: {} @xml2json_ML'.format(dsid))  # step 1
    try:
        xml2json(xml_raw, xml_lex, json_ml_in)
        status['ml'] = 'ML_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('Lex2ML_Error', 'xml2json_ML')
        return

    print_log('celery', 'Dictionary: {} @train_ML'.format(dsid))  # step 2
    try:
        _, report = train_ML(json_ml_in, json_ml_out, '')
        ErrorLog.add_error_log(db, dsid, tag='ml_finished', message=report)
        status['ml'] = 'ML_Annotated'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('ML_Error', 'train_ML')
        return

    print_log('celery', 'Dictionary: {} @json2xml_ML'.format(dsid))  # step 3
    try:
        json2xml(json_ml_out, xml_raw, xml_out)
        status['ml'] = 'Lex_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('ML2Lex_Error', 'json2xml_ML')
        return

    # Success: clear the task id and remove intermediates.
    Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
    clean_files()
    return
Exemplo n.º 9
0
def ds_send_to_lexonomy(dsid):
    """Send a dataset to Lexonomy for annotation.

    Resets the ml/preview status, deletes any previously pushed files at
    Lexonomy via their delete-callback URLs, then dispatches the actual
    Lexonomy request asynchronously and marks annotation as 'Starting'.

    :param dsid: dataset id
    :return: flask 200 response echoing the request payload and status
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    user = User.query.filter_by(id=uid).first()
    db.session.close()
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    additional_pages = flask.request.args.get('add_pages',
                                              default='0',
                                              type=str).lower() == '1'

    if additional_pages:
        # get file from lexonomy and save it
        get_lex_xml(uid, dsid)

    # Reset dataset status and delete old files @Lexonomy
    dataset.status['ml'] = None
    dataset.status['preview'] = None
    if dataset.lexonomy_delete is not None:
        requests.post(dataset.lexonomy_delete,
                      headers={
                          "Content-Type": 'application/json',
                          "Authorization": app.config['LEXONOMY_AUTH_KEY']
                      })
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={
                          "Content-Type": 'application/json',
                          "Authorization": app.config['LEXONOMY_AUTH_KEY']
                      })

    # Payload for the async Lexonomy push; xml_file is a callback URL
    # Lexonomy uses to fetch the document from this API.
    request_data = {
        'xml_file':
        '/api/lexonomy/{}/download/{}'.format(uid, dsid) +
        ('?add_pages=True' if additional_pages else ''),
        'email':
        user.email,
        'filename':
        dataset.name + ' - annotate',
        'type':
        'edit',
        'url':
        app.config['URL'],
        # Idiom fix: the comparison already yields a bool.
        'ske_user':
        user.sketch_engine_uid is not None,
        'return_to':
        ""  # remove if no longer required
    }

    print_log(app.name,
              'Starting asynchronous request to Lexonomy {}'.format(dataset))
    make_lexonomy_request.apply_async(args=[dsid, request_data], countdown=0)

    # Update dataset status
    dataset.status['annotate'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    return flask.make_response(
        {
            'message': 'OK',
            'dsid': dsid,
            'status': dataset.status['annotate'],
            'test_request': request_data
        }, 200)
Exemplo n.º 10
0
def prepare_TEI_download(dsid, input_file, output_file, character_map):
    """Transform a Lexonomy-format XML file into TEI and write it to disk.

    Loads the lexonomy->TEI mapping JSON, cleans tokens with the supplied
    character map, runs the transformator mapper and writes the result to
    ``output_file``, finally marking the dataset's download status Ready.

    :param dsid: dataset id (for metadata lookup and status update)
    :param input_file: path to the Lexonomy-format XML
    :param output_file: path the TEI XML is written to
    :param character_map: per-character replacement map, or None for no-op
    """
    # Load json for transformation
    json_file = os.path.join(app.config['APP_DIR'],
                             'modules/pdf2lex_ml/lexonomy_to_tei.json')
    # `with` closes the file; the explicit close() in the original was redundant.
    with open(json_file, 'r') as file:
        json_data = file.read()

    transformation_json = json.loads(json_data)

    # clean tokens
    lexonomy_xml = lxml.etree.parse(input_file)
    if character_map is None:
        character_map = dict()
    clean_tokens(lexonomy_xml.getroot(), character_map)
    orig_xml = lxml.etree.tostring(lexonomy_xml)

    # Re-parse with a parser whose elements are TMyElement instances,
    # as required by the transformator.
    parserLookup = lxml.etree.ElementDefaultClassLookup(
        element=transformator.TMyElement)
    myParser = lxml.etree.XMLParser()
    myParser.set_element_class_lookup(parserLookup)
    lexonomy_xml = lxml.etree.fromstring(orig_xml, parser=myParser)

    # init transformator
    mapping = transformator.TMapping(transformation_json)
    mapper = transformator.TMapper()

    # transform lexonomy format to tei format
    metadata = Datasets.dataset_metadata(dsid)
    out_TEI, out_aug = mapper.Transform(
        mapping,
        [],
        [lxml.etree.ElementTree(lexonomy_xml)],
        makeAugmentedInputTrees=True,
        stripForValidation=False,
        stripHeader=False,
        #stripDictScrap=True, # TODO: change when fixed
        stripDictScrap=False,
        headerTitle=False,
        headerPublisher=False,
        headerBibl=False,
        promoteNestedEntries=True,
        metadata=metadata)
    print_log('DEBUG', 'transformed')
    target_xml = '\n' + lxml.etree.tostring(
        out_TEI, pretty_print=True, encoding='unicode')
    print_log('DEBUG', 'in string')
    # Strip the mapper's namespace declarations from entry tags.
    target_xml = target_xml.replace(
        '<entry xmlns:m="http://elex.is/wp1/teiLex0Mapper/meta" xmlns:a="http://elex.is/wp1/teiLex0Mapper/legacyAttributes" xmlns="http://www.tei-c.org/ns/1.0">',
        '<entry>')
    print_log('DEBUG', 'entry replaced')

    # Write the transformed xml; opening with 'w' creates/truncates the
    # file, so the original's extra append-mode "touch" was unnecessary.
    print_log('DEBUG', 'writing to file')
    with open(output_file, 'w') as out:
        out.write(target_xml)
    print_log('DEBUG', 'writing finished')
    status = Datasets.dataset_status(dsid)
    status['download'] = 'Ready'
    Datasets.dataset_status(dsid, set=True, status=status)
    return