def categorize(group, output, log):
    """Run one categorisation routine over all records and print the result.

    Args:
        group: Key used to look up the routine in the module-level
            ``categories`` table.
        output: Forwarded as the second argument of ``Categorize``.
        log: Key used to look up the root logger level in ``loggingMap``.
    """
    logging.getLogger().setLevel(loggingMap[log])

    digitool_xml = DigitoolXML(xml_dirname)
    collector = Categorize(digitool_xml, output)
    record_ids = digitool_xml.getList()

    # The selected routine receives the ids, the XML source and the collector.
    categories[group](record_ids, digitool_xml, collector)
    print(collector)
def convert(dspace_admin_passwd, dspace_admin_username, test, run, skip):
    """Convert the first ten downloaded Digitool records, optionally uploading them.

    Args:
        dspace_admin_passwd: Password handed to ``Dspace``.
        dspace_admin_username: User name handed to ``Dspace``.
        test: When truthy, ``convertItem`` asks for interactive confirmation
            and the ids that were not confirmed are printed at the end.
        run: When truthy, every converted record is submitted with
            ``ds.new_item``.
        skip: When truthy, missing XML files / metadata are tolerated
            instead of raising.
    """
    dt = Digitool(digitool_category)
    dt.download_list()

    # NOTE(review): neither object is used below; they are still constructed
    # in case the constructors validate their inputs -- confirm and remove.
    if skip:
        dtx = DigitoolXML(xml_dirname, skip_missing=True)
    else:
        dtx = DigitoolXML(xml_dirname)
    c = MetadataConvertor()

    ds = Dspace(dspace_admin_username, dspace_admin_passwd)
    # Always defined, so the loop cannot hit an unbound name when test is off.
    problems = []
    try:
        for record in dt.list[:10]:
            oai_id = dt.get_oai_id(record)
            result = convertItem(oai_id, test, skip)
            if result is False:
                # convertItem returns a bare False for a record it skipped
                # because it has no metadata; there is nothing to upload.
                continue
            checked, convertedMetadata, attachements = result
            # Outside test mode nothing is ever confirmed, so only collect
            # rejected records when the user was actually asked.
            if test and not checked:
                problems.append(oai_id)
            if run:
                # NOTE(review): collection id 273 and the placeholder
                # attachment are hard-coded; the real attachements are not
                # uploaded here -- confirm this is intended.
                ds.new_item(273, convertedMetadata,
                            [("lorem-ipsum.pdf", "application/pdf", "Dokument")])
        if test:
            click.clear()
            print("problems", problems)
    finally:
        # Close the DSpace session even when a conversion fails part-way.
        ds.logout()
def convertItem(oai_id, test, skip):
    """Convert the metadata of one Digitool record and list its attachments.

    Args:
        oai_id: Identifier of the record to convert.
        test: When truthy, the data is printed and the user is asked to
            confirm the conversion.
        skip: When truthy, missing XML files are tolerated and a record
            without metadata yields ``False`` instead of raising.

    Returns:
        ``False`` when the record has no usable metadata and ``skip`` is
        truthy. Otherwise a tuple ``(checked, converted_metadata,
        attachements)`` where ``checked`` is the user's answer in test mode
        and ``False`` outside it. ``converted_metadata`` is the Dublin Core
        conversion when the record has a ``'dc'`` section, else the
        conversion of its ``'record'`` section.

    Raises:
        Exception: The record has no usable metadata and ``skip`` is falsy.
    """
    dt = Digitool(digitool_category)
    record = dt.get_item(oai_id)
    if skip:
        dtx = DigitoolXML(xml_dirname, skip_missing=True)
    else:
        dtx = DigitoolXML(xml_dirname)
    c = MetadataConvertor()

    originalMetadata = dt.get_metadata(record)
    if originalMetadata is None:
        if skip:
            return False
        raise Exception("No metadata in {}".format(oai_id))

    has_dc = 'dc' in originalMetadata
    has_record = 'record' in originalMetadata
    if not (has_dc or has_record):
        # Previously this fell through to an unbound local at the return
        # statement; treat it like a record without metadata.
        if skip:
            return False
        raise Exception("No metadata in {}".format(oai_id))

    convertedMetadataDC = None
    convertedMetadataRecord = None
    if has_dc:
        # Original note: 3112 (presumably the number of such records).
        convertedMetadataDC = c.convertDC(originalMetadata['dc'], oai_id)
    if has_record:
        # Original note: 358, no intersection (with the 'dc' records).
        convertedMetadataRecord = c.convertRecord(originalMetadata['record'], oai_id)
    # Records carrying only a 'record' section used to crash here because
    # only the DC result was ever returned.
    convertedMetadata = convertedMetadataDC if has_dc else convertedMetadataRecord

    attachements = list(dtx.get_attachements(str(oai_id) + ".xml"))

    if not test:
        return (False, convertedMetadata, attachements)

    click.clear()
    print("converting ", oai_id)
    print("originalMetadata:\n")
    for i in originalMetadata:
        print(i)
    print("convertedMetadata:\n")
    # The header used to be printed with nothing beneath it, leaving the
    # user nothing to judge the conversion by.
    print(convertedMetadata)
    print("attachements:\n")
    print(attachements)
    checked = click.confirm("Is converting OK?", default=True)
    return (checked, convertedMetadata, attachements)
def test_convert():
    """Smoke test: the attachments of every downloaded record can be listed.

    NOTE: the per-record ``convertItem(oai_id, False)`` check is disabled in
    this test; only the attachment listing is exercised.
    """
    record_ids = Digitool(digitool_category).download_list()
    digitool_xml = DigitoolXML(xml_dirname)
    # Built for parity with the other tests; the instance itself is unused.
    _categorizer = Categorize(digitool_xml)

    for record_id in record_ids:
        # Materialise the result so the whole listing is actually produced.
        list(digitool_xml.get_attachements(record_id))
def categorize(group, skip):
    """Run the bug check selected by ``group`` and print the collected result.

    Args:
        group: ``'oai'`` or ``'forgot'``; any other value runs no check and
            only prints the (untouched) collector.
        skip: When truthy, missing XML files are tolerated; also forwarded
            to ``bugs.oai``.
    """
    # TODO: all remaining groups, see the other TODOs
    digitool_xml = (
        DigitoolXML(xml_dirname, skip_missing=True)
        if skip
        else DigitoolXML(xml_dirname)
    )
    collector = Categorize(digitool_xml)

    if group == 'oai':
        digitool = Digitool(digitool_category)
        bugs.oai(digitool, digitool_xml, collector, skip=skip)
    elif group == 'forgot':
        digitool = Digitool(digitool_category)
        bugs.forgot_attachements(
            digitool, digitool_xml, collector, xml_dirname + "/ls_streams.txt"
        )

    collector.print()
def descriptions(skip=False):
    """Print the generated attachment description of every downloaded record.

    Args:
        skip: When truthy, missing XML files are tolerated and records
            without attachments are silently passed over. Defaults to
            ``False``. (The original body read ``skip`` without defining it.)

    Raises:
        Exception: A record has no attachments and ``skip`` is falsy.
    """
    dt = Digitool(digitool_category)
    dt.download_list()
    if skip:
        dtx = DigitoolXML(xml_dirname, skip_missing=True)
    else:
        dtx = DigitoolXML(xml_dirname)
    c = FilenameConvertor()

    for record in dt.list:
        oai_id = dt.get_oai_id(record)
        attachements = list(dtx.get_attachements(oai_id + ".xml", full=True))
        if not attachements:
            if skip:
                continue
            # Fixed: the message was built with ", format(...)" instead of
            # ".format(...)", so the id never made it into the text.
            raise Exception("No attachement in {}.".format(oai_id))

        description = c.generate_description(attachements)
        # Only non-list results are printed.
        # NOTE(review): presumably a list is the regular outcome and anything
        # else deserves a look -- confirm against FilenameConvertor.
        if isinstance(description, list):
            continue
        print(description)
def convert(dspace_admin_passwd, dspace_admin_username, run, archive, catalogue, log):
    """Convert every Digitool record and upload, archive and/or catalogue it.

    Args:
        dspace_admin_passwd: Password handed to ``Dspace``.
        dspace_admin_username: User name handed to ``Dspace``.
        run: When truthy, each record is submitted with ``ds.new_item``.
        archive: When truthy, ``createArchive`` is called for each record.
        catalogue: When truthy, one ``"<oai_id> <collection>"`` line per
            record is written to ``'output/' + server``.
        log: Key used to look up the root logger level in ``loggingMap``;
            the value ``'error'`` additionally silences urllib3 warnings.

    Raises:
        Exception: ``createDC`` returned no collection for a record.
    """
    # TODO: aleph and weird_attachmement should be zero and the others
    # should stay as they are
    logging.getLogger().setLevel(loggingMap[log])
    if log == 'error':
        urllib3.disable_warnings()

    dtx = DigitoolXML(xml_dirname)
    oai_ids = dtx.getList()
    categorizer = Categorize(dtx)

    ds = None
    f = None
    try:
        if run:
            ds = Dspace(server, dspace_admin_username, dspace_admin_passwd,
                        xml_dirname=xml_dirname)
        records = aleph.openAleph("dtl_2006.xml")
        if catalogue:
            f = open('output/' + server, 'w')

        for oai_id in oai_ids:
            digitoolMetadata = dtx.get_metadata(oai_id)['marc']
            # Field '001' of the Digitool MARC data, once normalised, is the
            # key under which the record is stored in the Aleph export.
            aleph_id = aleph.normalise(digitoolMetadata['001'])
            originalMetadata = records[aleph_id]

            metadataTopic = metadataConvertor.convertMarc(
                categorizer, oai_id, originalMetadata)
            convertedMetadata, collection = metadataConvertor.createDC(
                server, categorizer, oai_id, metadataTopic, originalMetadata)

            attachements = list(dtx.get_attachements(oai_id))
            fc = filenameConvertor.FilenameConvertor(categorizer)
            attachementsDescription = fc.generate_description(oai_id, attachements)

            if collection is None:
                raise Exception('Unknown faculty')

            if run:
                ds.new_item(collection, convertedMetadata, attachementsDescription)
            if archive:
                createArchive(oai_id, xml_dirname, convertedMetadata,
                              attachementsDescription)
            if catalogue:
                f.write("{} {}\n".format(oai_id, collection))
    finally:
        # Release both resources even when a record fails to convert; the
        # nested finally keeps the file from leaking if logout itself raises.
        try:
            if ds is not None:
                ds.logout()
        finally:
            if f is not None:
                f.close()
def test_noattachement(): dtx = DigitoolXML(xml_dirname) c = Categorize(dtx) oai_ids = Digitool(digitool_category).download_list() bugs.no_attachements(oai_ids, dtx, c) assert str(c) == '''
def test_forgot(): dtx = DigitoolXML(xml_dirname) c = Categorize(dtx) oai_ids = Digitool(digitool_category).download_list() bugs.forgot_attachements(oai_ids, dtx, c, xml_dirname + "/ls_streams.txt") assert str(c) == '''
def test_oai(): dtx = DigitoolXML(xml_dirname) c = Categorize(dtx) oai_ids = Digitool(digitool_category).download_list() bugs.oai(oai_ids, dtx, c) assert str(c) == '''