Example #1
def elem_856(symbol):
    # Builds a MARCXML <record> with one 856 datafield per file stored under this
    # document symbol. Assumes xml.etree.ElementTree is imported as ET and that
    # File and Identifier come from dlx.file.
    rc = ET.Element("record")

    for xfile in File.find_by_identifier(Identifier('symbol', symbol)):
        df = ET.SubElement(rc, "datafield")
        df.set("tag", "856")
        df.set("ind1", "4")
        df.set("ind2", "0")

        sf_y = ET.SubElement(df, "subfield")
        sf_y.set("code", 'y')
        sf_y.text = ''.join(xfile.languages)

        sf_9 = ET.SubElement(df, "subfield")
        sf_9.set("code", '9')
        sf_9.text = str(xfile.id)

        sf_s = ET.SubElement(df, "subfield")
        sf_s.set("code", 's')
        sf_s.text = str(xfile.size)

        sf_u = ET.SubElement(df, "subfield")
        sf_u.set("code", 'u')
        sf_u.text = str(xfile.uri)

    #return ET.tostring(rc)
    return rc
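A minimal usage sketch (the document symbol below is hypothetical): the returned element can be serialized with ElementTree.

# Hypothetical symbol; serialize the record built by elem_856 to a MARCXML string.
xml_string = ET.tostring(elem_856('A/RES/1'), encoding='unicode')
print(xml_string)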
Example #2
File: app.py Project: grozdanic/ODSexport
def show_pdf(path):
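    # Flask view: renders a template linking to the latest English file stored for this symbol.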
    xfile = File.latest_by_identifier_language(Identifier('symbol', path),
                                               'EN')
    print(xfile.uri)
    return (render_template('test1.html',
                            symbol=path,
                            uri='http://' + xfile.uri))
Example #3
def db():
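    # Test setup: connects to a mock Mongo database (mongomock) and the mock S3 bucket,
    # then seeds sample auth, bib and file records for the tests to query.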
    from dlx import DB
    from dlx.marc import Bib, Auth
    from dlx.file import S3, File, Identifier
    from tempfile import TemporaryFile

    DB.connect('mongomock://localhost')  # ? does mock connection create a fresh db ?

    DB.bibs.drop()
    DB.auths.drop()
    DB.files.drop()
    DB.handle['dlx_dl_log'].drop()

    Auth().set('100', 'a', 'name_1').commit()
    Auth().set('100', 'a', 'name_2').commit()

    Bib().set('191', 'a', 'TEST/1').set('245', 'a', 'title_1').set('700', 'a', 1).commit()
    Bib().set('245', 'a', 'title_2').set('700', 'a', 2).commit()

    S3.connect(access_key='key', access_key_id='key_id', bucket='mock_bucket')
    S3.client.create_bucket(Bucket=S3.bucket)

    handle = TemporaryFile()
    handle.write(b'some data')
    handle.seek(0)
    File.import_from_handle(handle,
                            filename='',
                            identifiers=[Identifier('symbol', 'TEST/1')],
                            languages=['EN'],
                            mimetype='text/plain',
                            source='test')

    return DB.client
Example #4
    def upload(fh, data):
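        # Assumes re, json, args, encode_fn, File, Identifier and the FileExists*
        # exceptions are available at module level in the enclosing script.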
        symbols = [data['symbol1']]
        
        if data['symbol2'] and not data['symbol2'].isspace():
            symbols.append(data['symbol2'])
    
        if any([re.search(r'JOURNAL', x) for x in symbols]):
            return
        
        identifiers = [Identifier('symbol', x) for x in filter(None, symbols)]
        lang = {'A': 'AR', 'C': 'ZH', 'E': 'EN', 'F': 'FR', 'R': 'RU', 'S': 'ES', 'G': 'DE'}[data['languageId']]
        
        if args.language and lang != args.language.upper():
            return
        
        languages = [lang]
        overwrite = True if args.overwrite else False

        try:
            return File.import_from_handle(
                fh,
                filename=encode_fn(list(filter(None, symbols)), languages[0], 'pdf'),
                identifiers=identifiers,
                languages=languages,
                mimetype='application/pdf',
                source='gdoc-dlx-' + args.station,
                overwrite=overwrite
            )
        except FileExistsConflict as e:
            print(json.dumps({'warning': e.message, 'data': {'symbols': symbols, 'language': languages}}))
        except FileExists:
            print(json.dumps({'info': 'Already in the system', 'data': {'symbols': symbols, 'language': languages}}))
        except Exception as e:
            print(json.dumps({'error': '; '.join(re.split('[\r\n]', str(e))), 'data': {'symbols': symbols, 'languages': languages}}))
            raise e
Example #5
def _fft_from_files(bib):
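    # Attaches an FFT datafield for the latest stored file in each language (at most
    # one per language across the bib's document symbols). Assumes Datafield, File,
    # Identifier, encode_fn and ISO_STR are in scope.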
    symbols = bib.get_values('191', 'a') + bib.get_values('191', 'z')

    seen = []

    for symbol in set(symbols):
        if symbol == '' or symbol == ' ' or symbol == '***':  # note: clean these up in db
            continue

        for lang in ('AR', 'ZH', 'EN', 'FR', 'RU', 'ES', 'DE'):
            xfile = File.latest_by_identifier_language(
                Identifier('symbol', symbol), lang)

            if xfile and lang not in seen:
                field = Datafield(record_type='bib',
                                  tag='FFT',
                                  ind1=' ',
                                  ind2=' ')
                field.set('a', 'https://' + xfile.uri)
                field.set('d', ISO_STR[lang])
                field.set('n', encode_fn(
                    symbols if len(symbols) <= 3 else symbols[0:1], lang, 'pdf'))
                bib.fields.append(field)

                seen.append(lang)

    return bib
Example #6
def test_import_from_binary(db, s3):
    from io import BytesIO
    from dlx import Config, DB
    from dlx.file import File, Identifier, S3
    
    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing
    control = 'eb733a00c0c9d336e65691a37ab54293'
    assert File.import_from_binary(
        b'test data', identifiers=[Identifier('isbn', '1')], filename='fn.ext',
        languages=['EN'], mimetype='application/dlx', source='test') == control
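The control value here appears to be the MD5 digest of the uploaded bytes; a quick standard-library check of that assumption:

import hashlib

# Expected to print eb733a00c0c9d336e65691a37ab54293 if the checksum is a plain MD5 of the payload.
print(hashlib.md5(b'test data').hexdigest())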
Example #7
def test_import_from_url(db, s3):
    import requests
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from io import BytesIO
    from dlx import Config, DB
    from dlx.file import File, Identifier, S3
    
    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing
    # The request is intercepted by the `responses` mock (assumed to be imported and
    # activated at module level); the HTTPServer instance is created but never started.
    server = HTTPServer(('127.0.0.1', 9090), None)
    responses.add(responses.GET, 'http://127.0.0.1:9090', body=BytesIO(b'test data').read())
    control = 'eb733a00c0c9d336e65691a37ab54293'
    assert File.import_from_url(
        url='http://127.0.0.1:9090', identifiers=[Identifier('isbn', '3')],
        filename='test', languages=['EN'], mimetype='test', source=None) == control
Example #8
def test_import_from_path(db, s3):
    from tempfile import NamedTemporaryFile
    from dlx import Config, DB
    from dlx.file import S3, File, Identifier
    
    S3.client.create_bucket(Bucket=S3.bucket) # this should be only necessary for testing
    fh = NamedTemporaryFile()
    fh.write(b'test data')
    fh.seek(0)
    path = fh.name
    control = 'eb733a00c0c9d336e65691a37ab54293'
    assert File.import_from_path(
        path, identifiers=[Identifier('isbn', '1')], filename='fn.ext',
        languages=['EN'], mimetype='application/dlx', source='test') == control
Example #9
def test_import_from_handle(db, s3):    
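    # Assumes pytest is imported at module level (pytest.raises is used below).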
    from tempfile import TemporaryFile
    from dlx import Config, DB
    from dlx.file import S3, File, Identifier, FileExists, FileExistsIdentifierConflict, FileExistsLanguageConflict
    
    S3.client.create_bucket(Bucket=S3.bucket) # this should be only necessary for testing
    handle = TemporaryFile()
    handle.write(b'some data')
    handle.seek(0)
    
    File.import_from_handle(
        handle, identifiers=[Identifier('isbn', '1')], filename='fn.ext',
        languages=['EN'], mimetype='application/dlx', source='test')
    
    results = list(DB.files.find({'identifiers': {'type': 'isbn', 'value': '1'}}))
    assert len(results) == 1
    assert results[0]['filename'] == 'fn.ext'
    assert results[0]['languages'] == ['EN']
    assert results[0]['mimetype'] == 'application/dlx'
    assert results[0]['source'] == 'test'
    assert results[0]['uri'] == '{}.s3.amazonaws.com/{}'.format(S3.bucket, results[0]['_id'])
    
    with TemporaryFile() as fh:
        S3.client.download_fileobj(S3.bucket, results[0]['_id'], fh)
        fh.seek(0)
        assert fh.read() == b'some data'
    
    with pytest.raises(FileExistsIdentifierConflict):
        handle = TemporaryFile()
        handle.write(b'some data')
        handle.seek(0)
        File.import_from_handle(
            handle, identifiers=[Identifier('isbn', '2')], filename='test',
            languages=['FR'], mimetype='test', source=None)
    
    with pytest.raises(FileExistsLanguageConflict):
        handle = TemporaryFile()
        handle.write(b'some data')
        handle.seek(0)
        File.import_from_handle(
            handle, identifiers=[Identifier('isbn', '1')], filename='test',
            languages=['FR'], mimetype='test', source=None)
Example #10
            # (excerpt begins inside a branch that handles comma-separated language
            # codes in row[3]; `lang` is assumed to be defined earlier)
            langs = row[3].split(',')
            for l in langs:
                lang.append(LANGS[l])
        else:
            try:
                lang = [LANGS[row[3]]]
            except KeyError:
                print(
                    f"LanguageError: Unable to determine language for {filename} and {symbol}. This file won't be imported."
                )
                break

        ext = filename.split('.')[-1]
        encoded_filename = encode_fn(symbol, lang, ext)
        identifiers = []
        identifiers.append(Identifier('symbol', symbol))
        #key = "{}/{}/PDF/{}".format(base_path, subfolder, filename)

        print(encoded_filename)
        save_file = "{}/{}".format(tmpdir, filename)

        if args.skipdb:
            key = f"{base_path}/{subfolder}/{filename}"

        else:
            table = dynamodb.Table(args.table)
            # Use the filename to query the DigitizationIndex
            response = table.query(
                IndexName=args.index,
                KeyConditionExpression=Key('filename').eq(filename))
            #print(response)
Example #11
def run():
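    # Assumes DLX, S3, ODS, Bib, Query, Or, Condition, Collation, File, Identifier,
    # the FileExists* exceptions, FileNotFound, LANG, get_args, logging and re are
    # imported at module level.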
    args = get_args()

    DLX.connect(args.dlx_connect)
    S3.connect(bucket=args.s3_bucket)

    symbols = [args.symbol] if args.symbol else [
        re.split('\t', x)[0].strip() for x in open(args.list).readlines()
    ]
    langs = [args.language] if args.language else LANG.keys()

    for sym in symbols:
        bib = Bib.from_query(
            Query(Or(Condition('191', {'a': sym}), Condition('191', {'z': sym}))),
            collation=Collation(locale='en', strength=2))

        if not bib and not args.skip_check:
            logging.warning(f'Bib for document {sym} not found. Skipping.')
            continue
        elif bib and not args.skip_check:
            # capture symbols from the bib record (exclude those beginning with brackets)
            ids = list(filter(
                lambda x: x[0] != '[',
                bib.get_values('191', 'a') + bib.get_values('191', 'z')))
        else:
            logging.warning(
                f'Bib for document {sym} not found with --skip_check enabled. Using {sym} as identifier'
            )
            ids = [sym]  # only the current symbol, as the warning above says

        for lang in langs:
            logging.info(f'Getting {sym} {lang} ...')

            try:
                fh = ODS.download(sym if not args.ods_symbol else args.ods_symbol, lang)
            except FileNotFound:
                logging.warning(f'{sym} {lang} not found in ODS')
                continue
            except Exception as e:
                logging.warning(e)
                continue

            isolang = LANG[lang]

            try:
                result = File.import_from_handle(
                    fh,
                    filename=File.encode_fn(sym, isolang, 'pdf'),
                    identifiers=[Identifier('symbol', s) for s in ids],
                    languages=[isolang],
                    mimetype='application/pdf',
                    source='ods-importx',
                    overwrite=args.overwrite)
                logging.info(f'OK - {result.id}')
            except FileExistsLanguageConflict as e:
                logging.warning(f'{e.message} X {isolang}')
            except FileExistsIdentifierConflict as e:
                logging.warning(f'{e.message} X {ids}')
            except FileExists:
                logging.info('Already in the system')
            except:
                raise