Example #1
def test_from_query(db):
    from dlx.marc import MarcSet, BibSet, AuthSet, QueryDocument, Condition
    
    bibset = BibSet.from_query({'_id': {'$in': [1, 2]}})
    assert isinstance(bibset, (MarcSet, BibSet))
    assert bibset.count == 2
    assert isinstance(bibset.records, map)
    bibset.cache()
    assert isinstance(bibset.records, list)
    
    bibset = BibSet.from_query({}, skip=0, limit=1)
    assert bibset.count == 1
    for bib in bibset:
        assert bib.id == 1
    assert len(list(bibset.records)) == 0
    assert bibset.count == 1
    
    conditions = [
        Condition(tag='150', subfields={'a': 'Header'}),
        Condition(tag='200', modifier='not_exists')
    ]
    authset = AuthSet.from_query(conditions)
    assert isinstance(authset, (MarcSet, AuthSet))
    assert authset.count == 1
    assert isinstance(authset.records, map)
    authset.cache()
    assert isinstance(authset.records, list)
    
    query = QueryDocument(
        Condition('245', modifier='exists')
    )
    bibset = BibSet.from_query(query)
    assert isinstance(bibset, BibSet)
    assert bibset.count == 2
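For reference, a minimal sketch (not from the original tests) of the lazy behaviour asserted above: records starts out as a map that is consumed as it is iterated, and cache() materialises it into a reusable list. It assumes a connection opened with DB.connect and a Config.connect_string, as in Example #23.

from dlx import DB
from dlx.marc import BibSet
from config import Config  # assumed config module, as in Example #23

DB.connect(Config.connect_string)

bibset = BibSet.from_query({'_id': {'$in': [1, 2]}})
print(type(bibset.records))   # <class 'map'>: lazy, consumed on iteration
print(bibset.count)           # number of matching records

bibset.cache()                # materialise the cursor into a list
print(type(bibset.records))   # <class 'list'>: safe to iterate repeatedly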
Example #2
def test_xml_encoding():
    from dlx.marc import BibSet, Bib
    from xmldiff import main
    
    control = '<collection><record><datafield ind1=" " ind2=" " tag="245"><subfield code="a">Title with an é</subfield></datafield></record></collection>'
    bibset = BibSet([Bib().set('245', 'a', 'Title with an é')])
    assert main.diff_texts(bibset.to_xml(), control) == []
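A BibSet built in memory like this can also be serialised to the other formats exercised on this page (to_mrk in Example #19, to_str in Example #5); a small sketch, assuming records constructed directly from Bib objects need no database connection, as in the test above.

from dlx.marc import BibSet, Bib

# build two records in memory, as in the encoding test above
bibset = BibSet([
    Bib().set('245', 'a', 'First title'),
    Bib().set('245', 'a', 'Second title'),
])

print(bibset.to_xml())  # MARCXML, as asserted above
print(bibset.to_mrk())  # MRK text serialisation (see Example #19)
print(bibset.to_str())  # plain-text dump (see Example #5)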
Example #3
def show_xml856(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    ts2 = time.time()
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1
                               })
    # add856: this is where we insert 856 tags for file info
    print(f"time for query is {time.time()-ts2}")
    ts3 = time.time()
    xml = add856(bibset)
    print(f"total time for adding 856 is {time.time()-ts3}")
    #xml=bibset.to_xml()
    # decode to string and remove double spaces from the XML; they create problems with the job number on ODS export
    xml = xml.decode("utf-8").replace("  ", " ")
    return Response(xml, mimetype='text/xml')
Example #4
def test_iterate(db):
    from dlx.marc import Bib, BibSet, Auth, AuthSet
    
    for bib in BibSet.from_query({}):
        assert isinstance(bib, Bib)
        
    for auth in AuthSet.from_query({}):
        assert isinstance(auth, Auth)
Example #5
def test_to_str(db):
    from dlx.marc import BibSet
    
    control = '000\n   leader\n008\n   controlfield\n245\n   a: This\n   b: is the\n   c: title\n520\n   a: Description\n520\n   a: Another description\n   a: Repeated subfield\n650\n   a: Header\n710\n   a: Another header\n\n000\n   leader\n245\n   a: Another\n   b: is the\n   c: title\n650\n   a: Header\n'

    assert BibSet.from_query({}).to_str() == control
    
    
Example #6
def test_from_excel():
    from dlx.marc import BibSet
    
    path = os.path.join(os.path.dirname(__file__), 'marc.xlsx')        
    bibset = BibSet.from_excel(path, date_format='%Y-%m-%d')
        
    for bib in bibset.records:
        assert bib.get_value('246','b')[:8] == 'subtitle'
        assert bib.get_values('269','c')[1] == 'repeated'
Example #7
def test_from_table(db):
    from dlx.marc import BibSet
    from dlx.util import Table
    
    t = Table([
        ['246a',  '1.246$b',  '1.269c',    '2.269c'],
        ['title', 'subtitle', '1999-12-31','repeated'],
        ['title2','subtitle2','2000-01-01','repeated'],
    ])
    
    bibset = BibSet.from_table(t)
    for bib in bibset.records:
        assert bib.get_value('246','b')[:8] == 'subtitle'
        assert bib.get_values('269','c')[1] == 'repeated'
        
    with pytest.raises(Exception):
        bibset = BibSet.from_table(Table([['245a'], ['This']]), field_check='245a')
        
    with pytest.raises(Exception):
        bibset = BibSet.from_table(Table([['650a'], ['Should an int']]), auth_control=True)
        
    with pytest.raises(Exception):
        bibset = BibSet.from_table(Table([['650a'], ['Invalid']]), auth_control=False, auth_flag=True)
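The column headers in the test above appear to follow the pattern <occurrence>.<tag>$<subfield> (or just <tag><subfield> for the first occurrence of a field), so '1.246$b' addresses subfield b of the first 246 field and '2.269c' a second 269 field. A sketch of building such a table by hand under that assumption, with the connection opened as in Example #23:

from dlx import DB
from dlx.util import Table
from dlx.marc import BibSet
from config import Config  # assumed config module, as in Example #23

DB.connect(Config.connect_string)

# header row: tag + subfield, optionally prefixed with an occurrence number
table = Table([
    ['246a',  '1.246$b',  '1.269c',     '2.269c'],
    ['Title', 'subtitle', '2021-01-01', 'repeated'],
])

bibset = BibSet.from_table(table)

for bib in bibset.records:
    print(bib.get_value('246', 'b'))   # 'subtitle'
    print(bib.get_values('269', 'c'))  # ['2021-01-01', 'repeated']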
Example #8
def test_init(bibs, auths):
    from dlx.marc import BibSet, Bib, AuthSet, Auth
    
    records = [Bib(x) for x in bibs]
    bibset = BibSet(records)
    assert isinstance(bibset, BibSet)
    assert len(bibset.records) == 2
    assert bibset.count == 2
    
    records = [Auth(x) for x in auths]
    authset = AuthSet(records)
    assert isinstance(authset, AuthSet)
    assert len(authset.records) == 2
    assert authset.count == 2
Example #9
def xml(date):
    '''
    Outputs records in MARCXML format for the date provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYYMMDD/xml?skip=n&limit=m
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    If the date is in the wrong format, the function returns today's records.
    It uses the DLX bibset.to_xml serialization function to output MARCXML.
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's date if the input is not YYYYMMDD
        str_date = datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))
    print(query.to_json())
    start_time = datetime.now()
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1
                               },
                               skip=skp,
                               limit=limt)
    print(f"duration for 998z was {datetime.now()-start_time}")
    start_time_xml = datetime.now()
    xml = bibset.to_xml()

    # remove double spaces from the XML; they create problems with the job number on ODS export
    xml = xml.replace("  ", " ")
    print(
        f"duration for xml serialization was {datetime.now()-start_time_xml}")
    return Response(xml, mimetype='text/xml')
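The bare try/except blocks around int(request.args.get(...)) recur in most of the route handlers on this page; a small helper (the name int_arg is illustrative, not part of the original code) that any of them could use instead:

from flask import request

def int_arg(name, default):
    '''Return the ?name= query parameter as an int, falling back to
    the default when it is missing or not a valid integer.'''
    try:
        return int(request.args.get(name, default))
    except (TypeError, ValueError):
        return default

# usage inside a route handler:
#   skp = int_arg('skip', 0)
#   limt = int_arg('limit', 50)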
Example #10
def jsonf(date):
    '''
    Outputs records in the native central DB schema JSON format for the date provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYY-MM-DD/json
    e.g. /YYYYMMDD/json?skip=n&limit=m
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    If the date is in the wrong format, the function returns today's records.
    It uses the DLX bib.to_json serialization function to output JSON.
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's date if the input is not YYYYMMDD
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))

    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1,
                                   '998': 1
                               },
                               skip=skp,
                               limit=limt)

    jsonl = []
    for bib in bibset.records:
        jsonl.append(bib.to_json())
    return jsonify(jsonl)
Example #11
def symbols(date):
    '''
    Outputs records in plain text format for the date provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYYMMDD/symbols or /YYYY-MM-DD/symbols?skip=n&limit=m
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    If the date is in the wrong format, the function returns today's records.
    It uses the DLX bib.to_str serialization function to output plain text.
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's date if the input is not YYYYMMDD
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")

    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))

    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '191': 1
                               },
                               skip=skp,
                               limit=limt)

    str_out = ''
    for bib in bibset.records:
        str_out += bib.to_str()
    return Response(str_out, mimetype='text/plain')
Example #12
def show_xml(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '856': 1,
                                   '991': 1
                               })
    xml = bibset.to_xml()
    # remove double spaces from the XML; they create problems with the job number on ODS export
    xml = xml.replace("  ", " ")
    return Response(xml, mimetype='text/xml')
Example #13
def show_symbols(path):
    path = re.escape(path)
    data = ""
    return_data = ""
    query = QueryDocument(
        Condition(
            tag='191',
            subfields={'a': Regex('^' + path)},
        ), )
    print(f" the query is  -- {query.to_json()}")
    bibset = BibSet.from_query(query,
                               projection={'191': True},
                               skip=0,
                               limit=0)
    a_res_en = []
    for bib in bibset.records:
        a_res_en.append(bib.get_value('191', 'a'))
    return_data = sorted([quote(doc) for doc in a_res_en],
                         key=lambda x: int(''.join(c for c in x
                                                   if c.isdigit())))
    #return_data=a_res_en
    return (jsonify(return_data))
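The sort key above concatenates every digit in the symbol and compares the result as a single integer, which is worth keeping in mind when symbols have a different number of digits; a standalone illustration:

symbols = ['A/RES/72/12', 'A/RES/72/2', 'A/RES/72/101']

# same key as in show_symbols: join all digits and compare as one int
key = lambda x: int(''.join(c for c in x if c.isdigit()))

print(sorted(symbols, key=key))
# ['A/RES/72/2', 'A/RES/72/12', 'A/RES/72/101']  (722 < 7212 < 72101)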
Example #14
def show_txt(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    #export_fields={'089':1,'091':1,'191': 1,'239':1,'245':1,'249':1,'260':1,'269':1,'300':1,'500':1,'515':1,'520':1,'596':1,'598':1,'610':1,'611':1,'630':1,'650':1,'651':1,'710':1,'981':1,'989':1,'991':1,'992':1,'993':1,'996':1}
    bibset = BibSet.from_query(query)
    out_list = [('089', 'b'), ('091', 'a'), ('191', 'a'), ('191', 'b'),
                ('191', 'c'), ('191', '9'), ('239', 'a'), ('245', 'a'),
                ('245', 'b'), ('249', 'a'), ('245', 'a'), ('260', 'a'),
                ('260', 'b'), ('260', 'a'), ('260', 'c'), ('269', 'a'),
                ('300', 'a'), ('500', 'a'), ('515', 'a'), ('520', 'a'),
                ('596', 'a'), ('598', 'a'), ('610', 'a'), ('611', 'a'),
                ('630', 'a'), ('650', 'a'), ('651', 'a'), ('710', 'a'),
                ('981', 'a'), ('989', 'a'), ('989', 'b'), ('989', 'c'),
                ('991', 'a'), ('991', 'b'), ('991', 'c'), ('991', 'd'),
                ('992', 'a'), ('993', 'a'), ('996', 'a')]
    #print(f"duration for query was {datetime.now()-start_time_query}")
    jsonl = []

    for bib in bibset.records:
        out_dict = {}
        #start_time_bib=datetime.now()
        for entry in out_list:
            #start_time_field=datetime.now()
            out_dict[entry[0] + '__' + entry[1]] = bib.get_values(
                entry[0], entry[1])
            #print(f"for the field {entry[0]+'__'+entry[1]}")
            #print(f"duration for getting values was {datetime.now()-start_time_field}")
        jsonl.append(out_dict)
        print(f"for the bib {bib.get_values('191','a')}")
        #print(f"duration for getting bib values was {datetime.now()-start_time_bib}")
    #print(f"total duration was {datetime.now()-start_time_all}")
    return jsonify(jsonl)
Example #15
def fetch_bib_data(self, proj_dict):
    query = QueryDocument(
        Or(
            Condition(
                tag='191',
                subfields={'b': self.body + '/', 'c': self.session}),
            Condition(
                tag='791',
                subfields={'b': self.body + '/', 'c': self.session}),
            Condition(
                tag='930',
                subfields={'a': 'ITP' + self.body + self.session})))
    #print(query.to_json())
    bibset = BibSet.from_query(query, projection=proj_dict, skip=0, limit=0)
    #l_temp=bibset.count
    #self.snapshot_len=l_temp
    lbibs = list(bibset.records)
    print(f"bibset length is : {len(lbibs)}")
    return lbibs  #, l_temp
Example #16
def votes(topic):
    '''
    Looks up UNBIS thesaurus labels and returns matching T codes.
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    It uses a DLX authset to output fields 035 and 150.
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    # request.args.get() never raises when a parameter is missing, it returns None;
    # pass the defaults directly so they actually take effect
    yr_from = request.args.get('year_from', "1980")
    yr_to = request.args.get('year_to', '2020')
    cntry = request.args.get('Country', 'CANADA')
    vt = request.args.get('Vote', 'A')

    print(f"skip is {skp} and limit is {limt}")
    print(f"year_from is {yr_from} and year_to is {yr_to}")
    print(f"Country is {cntry}")
    print(f"Vote is {vt}")

    query = QueryDocument(
        Condition(tag='191', subfields={'d': re.compile(str(topic))}),
        Condition(tag='191', subfields={'a': re.compile('^A')}))
    print(query.to_json())
    dict_auth_ids = {}
    authset = AuthSet.from_query(query,
                                 projection={
                                     '001': 1,
                                     '191': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    for auth in authset:
        dict_auth_ids[auth.get_value('191', 'a')] = auth.get_value('001')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    #return jsonify(dict_auth_ids)
    dict_bibs = {}
    str_bibs = ''
    votecountry = ''
    for key, value in dict_auth_ids.items():
        #sample_id=int(dict_auth_ids['A/74/251'])
        print(f"the id of {key} is {value}")
        query_bib = QueryDocument(
            Condition(tag='991', subfields={'d': int(value)}),
            Condition(tag='989',
                      subfields={'a': re.compile(str('Voting Data'))}))

        print(query_bib.to_json())
        bibset = BibSet.from_query(query_bib,
                                   projection={
                                       '001': 1,
                                       '791': 1,
                                       '967': 1
                                   },
                                   skip=skp,
                                   limit=limt)
        for bib in bibset:
            for field in bib.get_fields('967'):
                votecountry = field.get_value("d") + field.get_value("e")
                #print(f'Country+Vote: {votecountry}')
                # keep only entries matching both input query parameters (AND logic)
                if str(votecountry) == str(vt) + str(cntry):
                    dict_bibs[bib.get_value('791', 'a')] = bib.get_value('001')
                    str_bibs = str_bibs + ' OR 791:[' + bib.get_value('791', 'a') + ']'
    print(str_bibs)
    return jsonify(dict_bibs)
Example #17
def jsons(date):
    '''
    Outputs Security Council bib records in a plain, simple JSON format for the date provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYY-MM-DD/xml?skip=n&limit=m
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    If the date is in the wrong format, the function returns today's records.
    It is used to publish S/ records for iSCAD+ as plain JSON.
    22 July: added fields 049:a and 260:a
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    #start_time_all=datetime.now()
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's date if the input is not YYYYMMDD
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    #start_time_query=datetime.now()
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='191', subfields={'b': re.compile('^S/')}))
    export_fields = {
        '089': 1,
        '091': 1,
        '191': 1,
        '239': 1,
        '245': 1,
        '249': 1,
        '260': 1,
        '269': 1,
        '300': 1,
        '500': 1,
        '515': 1,
        '520': 1,
        '596': 1,
        '598': 1,
        '610': 1,
        '611': 1,
        '630': 1,
        '650': 1,
        '651': 1,
        '710': 1,
        '981': 1,
        '989': 1,
        '991': 1,
        '992': 1,
        '993': 1,
        '996': 1
    }
    bibset = BibSet.from_query(query,
                               projection=export_fields,
                               skip=skp,
                               limit=limt)
    out_list = [('089', 'b'), ('091', 'a'), ('191', 'a'), ('191', 'b'),
                ('191', 'c'), ('191', '9'), ('239', 'a'), ('245', 'a'),
                ('245', 'b'), ('249', 'a'), ('245', 'a'), ('260', 'a'),
                ('260', 'b'), ('260', 'a'), ('260', 'c'), ('269', 'a'),
                ('300', 'a'), ('500', 'a'), ('515', 'a'), ('520', 'a'),
                ('596', 'a'), ('598', 'a'), ('610', 'a'), ('611', 'a'),
                ('630', 'a'), ('650', 'a'), ('651', 'a'), ('710', 'a'),
                ('981', 'a'), ('989', 'a'), ('989', 'b'), ('989', 'c'),
                ('991', 'a'), ('991', 'b'), ('991', 'c'), ('991', 'd'),
                ('992', 'a'), ('993', 'a'), ('996', 'a')]
    #print(f"duration for query was {datetime.now()-start_time_query}")
    jsonl = []

    for bib in bibset.records:
        out_dict = {}
        #start_time_bib=datetime.now()
        for entry in out_list:
            #start_time_field=datetime.now()
            out_dict[entry[0] + '__' + entry[1]] = bib.get_values(
                entry[0], entry[1])
            #print(f"for the field {entry[0]+'__'+entry[1]}")
            #print(f"duration for getting values was {datetime.now()-start_time_field}")
        jsonl.append(out_dict)
        #print(f"for the bib {bib.get_values('191','a')}")
        #print(f"duration for getting bib values was {datetime.now()-start_time_bib}")
    #print(f"total duration was {datetime.now()-start_time_all}")
    return jsonify(jsonl)
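The nested loop that builds out_dict can be collapsed into a comprehension; a compact equivalent, assuming the same bibset and out_list of (tag, subfield) pairs as above:

from flask import jsonify

def bibs_to_json(bibset, out_list):
    # one key per (tag, subfield) pair, e.g. '191__a', holding get_values() for that pair
    return jsonify([
        {tag + '__' + code: bib.get_values(tag, code) for tag, code in out_list}
        for bib in bibset.records
    ])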
Example #18
def test_to_mrc(db):
    from dlx.marc import BibSet
    
    control = '00224r|||a2200097|||4500008001300000245002400013520001600037520004300053650001100096710001900107controlfield  aThisbis thectitle  aDescription  aAnother descriptionaRepeated subfield  aHeader  aAnother header00088r|||a2200049|||4500245002700000650001100027  aAnotherbis thectitle  aHeader'
    assert BibSet.from_query({}).to_mrc() == control
Example #19
def test_to_mrk(db):
    from dlx.marc import BibSet
    
    control = '000  leader\n008  controlfield\n245  \\\\$aThis$bis the$ctitle\n520  \\\\$aDescription\n520  \\\\$aAnother description$aRepeated subfield\n650  \\\\$aHeader\n710  \\\\$aAnother header\n\n000  leader\n245  \\\\$aAnother$bis the$ctitle\n650  \\\\$aHeader\n'
    assert BibSet.from_query({}).to_mrk() == control
Example #20
def test_to_xml(db):
    from dlx.marc import BibSet
    from xmldiff import main
    
    control = '<collection><record><controlfield tag="000">leader</controlfield><controlfield tag="008">controlfield</controlfield><datafield ind1=" " ind2=" " tag="245"><subfield code="a">This</subfield><subfield code="b">is the</subfield><subfield code="c">title</subfield></datafield><datafield ind1=" " ind2=" " tag="520"><subfield code="a">Description</subfield></datafield><datafield ind1=" " ind2=" " tag="520"><subfield code="a">Another description</subfield><subfield code="a">Repeated subfield</subfield></datafield><datafield ind1=" " ind2=" " tag="650"><subfield code="a">Header</subfield><subfield code="0">1</subfield></datafield><datafield ind1=" " ind2=" " tag="710"><subfield code="a">Another header</subfield><subfield code="0">2</subfield></datafield></record><record><controlfield tag="000">leader</controlfield><datafield ind1=" " ind2=" " tag="245"><subfield code="a">Another</subfield><subfield code="b">is the</subfield><subfield code="c">title</subfield></datafield><datafield ind1=" " ind2=" " tag="650"><subfield code="a">Header</subfield><subfield code="0">1</subfield></datafield></record></collection>'
    assert main.diff_texts(BibSet.from_query({}).to_xml(), control) == []
Example #21
def xmlupdated(date):
    '''
    Outputs records updated on the given date in MARCXML format; the date is provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYYMMDD/xml?skip=n&limit=m
    The skip=n URL parameter skips n records. Default is 0.
    The limit=m URL parameter limits the number of records returned. Default is 50.
    If the date is in the wrong format, the function returns today's records.
    It uses the DLX bibset.to_xml serialization function to output MARCXML.
    '''
    try:
        skp = int(request.args.get('skip'))
    except:
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except:
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's date if the input is not YYYYMMDD
        date_from = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        str_date = date_from.strftime('%Y%m%d')
    else:
        date_year = str_date[0:4]
        date_month = str_date[4:6]
        date_day = str_date[6:8]
        date_from = datetime.fromisoformat(date_year + "-" + date_month + "-" +
                                           date_day)
    #date_to=date_from+timedelta(days = 2)
    print(f"date_from is {date_from}")
    #print(f"date_to is {date_to}")
    dict_query = {
        "$and": [{
            "updated": {
                "$gte": date_from,
                "$lt": date_from + timedelta(days=1)
            }
        }, {
            "029.subfields.value": "JN"
        }]
    }
    #dict_query= {"updated": {"$gte": date_from, "$lt": date_from+timedelta(days = 1)}}
    #print(query.to_json())
    #print(f"son query is {son_query}")
    print(f"dict query is {dict_query}")
    start_time = datetime.now()
    bibset = BibSet.from_query(dict_query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1
                               },
                               skip=skp,
                               limit=limt)
    xml = bibset.to_xml()
    # remove double spaces from the XML; they create problems with the job number on ODS export
    xml = xml.replace("  ", " ")
    print(f"duration for updated was {datetime.now()-start_time}")
    return Response(xml, mimetype='text/xml')
Example #22
def show_txt(path):
    '''displays the text of the document '''
    data = ""
    return_data = ""
    doc_list = []
    #path=quote(path)
    path = re.escape(path)
    # note: special characters in each URL may need escaping, e.g. with
    # urllib.parse.quote before joining the URL with urllib.parse.urljoin
    print(f" this is compiled path -- {'^' + str(path)+'$'}")
    doc_list = list(
        txts_coll.find({"doc_sym": {
            "$regex": "^" + str(path) + "$"
        }}))
    if len(doc_list) == 0 and path != 'favicon.ico':
        print(f"no exact DS {str(path)} - generating one")
        bib_value = ''
        #doc_list=list(txts_coll.find({"doc_sym":{"$regex":path}}))
        # extract text from the DB; build a list of tuples (stripped doc_sym, url to the pdf in s3)
        query = QueryDocument(
            Condition(tag='191', subfields={'a': Regex('^' + path + '$')}))
        #)
        print(f" the imp query is  -- {query.to_json()}")
        bibset = BibSet.from_query(query, skip=0, limit=3)
        a_res_en = []
        if bibset.count == 1:
            for bib in bibset.records:
                bib_value = bib.get_value('191', 'a')
                a_res_en.append(
                    (bib.get_value('191',
                                   'a'), 'http://' + ''.join(bib.files('EN'))))
                print(a_res_en)
                for url in a_res_en:
                    #txt_name = url.split('/')[-1]
                    #url is a tuple ; url[0] is a DS; url[1] is a s3 link to the pdf
                    txt_name = url[0]  # e.g. ARES721
                    #txt_name = txt_name.split('.')[0] +'.txt'
                    #txt_name = txt_name +'.txt'
                    #txt_loc='\\txts\\'+txt_name
                    if len(url[1]) > 10:
                        print(f" - - the {url[0]} is {url[1]} - -")
                        pdf = PDFExtract(url[1])
                        parsed = parser.from_buffer(
                            pdf.get_txt_from_url(url[1]))
                        print(f"0----PDFExtract----0")
                        txt = Txt(bib.get_value('191', 'a'))
                        print(txt.set_txt(parsed["content"]))
                        txt.title = bib.get_value('245', 'a')
                        #txt.title=bib.get_value('239','a')
                        # load the text into the txts collection
                        if txt.txt is not None:
                            query = {"doc_sym": txt.symbol}
                            txts_coll.replace_one(query,
                                                  txt.to_bson(),
                                                  upsert=True)

    doc_list = []
    doc_list = list(
        txts_coll.find({"doc_sym": {
            "$regex": "^" + str(path) + "$"
        }}))
    print(f" this is compiled path -- {'^' + str(path)+'$'}")
    if len(doc_list) == 1:
        print(f"-- it's a hit- 1")
        if doc_list[0]['doc_sym'][0] != 'S':
            return_data = doc_list[0]['raw_txt']
        else:
            #for SC docs - temporary measure
            doc_1 = doc_list[0].pop('_id')
            return_data = doc_list[0]
    elif len(doc_list) > 1:
        print(f"-- it's a hit- many")
        return_data = sorted([doc['doc_sym'] for doc in doc_list],
                             key=lambda x: int(''.join(c for c in x
                                                       if c.isdigit())))
        #return_data=sorted(["<a href="+doc['doc_sym']+">" for doc in doc_list])
        #return_data=sorted([url_for('/'+doc_list[0]['raw_txt']) for doc in doc_list])

    if return_data == "":
        return jsonify('text with document symbol:%s was not found' % path)
    #return(render_template('ds.html', data=return_data))
    #print(return_data)
    return jsonify(return_data)
Example #23
from bson import Regex
from dlx import DB
from dlx.marc import BibSet, QueryDocument, Condition
from config import Config
DB.connect(Config.connect_string)

query = QueryDocument(Condition(tag='191', modifier='exists'),
                      Condition(tag='269', subfields={'a': Regex('^1975')}))

print(query.to_json())

bibset = BibSet.from_query(query, projection={'191': True}, skip=0, limit=0)
print('There are {} results'.format(bibset.count))

bibset.cache()

for bib in bibset.records:
    print('id: {}, symbol: {}'.format(bib.id, bib.get_value('191', 'a')))

print(bibset.to_xml())
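from_query also accepts a raw MongoDB filter dict in place of a QueryDocument, as Examples #1 and #21 show; a minimal variant of the script above using a plain _id filter:

from dlx import DB
from dlx.marc import BibSet
from config import Config

DB.connect(Config.connect_string)

# a plain MongoDB filter works in place of a QueryDocument
bibset = BibSet.from_query({'_id': {'$in': [1, 2]}}, projection={'191': True})
print('There are {} results'.format(bibset.count))

for bib in bibset.records:
    print('id: {}, symbol: {}'.format(bib.id, bib.get_value('191', 'a')))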
Example #24
def index():
    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)

    date = str(request.args.get('date', yesterday))
    date_obj = datetime.datetime.strptime(date, '%Y-%m-%d')
    duty_station = request.args.get('dutyStation', 'NY')

    g = Gdoc(username=secrets["username"], password=secrets["password"])

    g.set_param('dutyStation', duty_station)
    g.set_param('dateFrom', date)
    g.set_param('dateTo', date)
    g.set_param('includeFiles', 'false')

    next_date = date_obj.date() + datetime.timedelta(days=1)
    if next_date > today:
        next_date = None
    prev_date = date_obj.date() - datetime.timedelta(days=1)

    symbol_objects = {}
    for d in g.data:
        m = MetadataObject(d)
        f = FileObject(d)
        if m.symbol1 not in symbol_objects:
            m.files.append(f)
            symbol_objects[m.symbol1] = m
        else:
            symbol_objects[m.symbol1].files.append(f)

    for s in symbol_objects:
        symbol2 = symbol_objects[s].symbol2
        if len(symbol2) > 0:
            returned_files = DLXFile.find({
                'identifiers': [{
                    'type': 'symbol',
                    'value': s
                }, {
                    'type': 'symbol',
                    'value': symbol2
                }],
                'languages': ['EN']
            })
        else:
            returned_files = DLXFile.find({
                'identifiers': [{
                    'type': 'symbol',
                    'value': s
                }],
                'languages': ['EN']
            })
        for f in returned_files:
            symbol_objects[s].links.append(('PDF', f"https://{f.uri}"))

        query = Query.from_string(f'191__a:{s}')
        res = list(BibSet.from_query(query.compile()))
        try:
            my_s = res[0].get_value('191', 'a')
            symbol_objects[s].links.append(
                ('UNDL', f"{Config.dlx_endpoint}records/bibs/{res[0].id}"))
        except:
            pass

    return render_template('index.html',
                           duty_stations=Config.duty_stations,
                           data=symbol_objects,
                           date=date,
                           duty_station=duty_station,
                           next_date=next_date,
                           prev_date=prev_date)