def static_file_merging():
    """Run the record merger on a static XML file, bypassing the extraction.

    The file content is parsed with ``libxml2.parseDoc`` and handed straight
    to ``merge_records_xml``; whatever that function returns is returned
    unchanged.
    """
    static_file = "misc/2011ApJ...741...91C.xml"
    # Alternative sample files that can be swapped in for manual testing:
    #static_file = "misc/1999PASP..111..438F.xml"
    #static_file = "misc/1984A&A...130...97L.xml"
    logger.warning(static_file)
    # Read through a context manager so the file handle is always released,
    # even if reading fails.
    with open(static_file, "r") as xml_file:
        xml_content = xml_file.read()
    return merge_records_xml(libxml2.parseDoc(xml_content))
def merge_bibcodes(bibcodes, print_adsxml=False, print_marcxml=False, write_xml_to_disk=False):
    """Extract the given bibcodes from ADS and return their merged records.

    The complete records are exported as ADS XML, converted to MarcXML with
    the XSLT stylesheet and passed to ``merge_records_xml``. Only the first
    element of the merger result (the merged records) is returned; the list
    of bibcodes with problems is discarded.

    bibcodes          -- iterable of bibcodes to extract
    print_adsxml      -- if true, print the serialized ADS XML
    print_marcxml     -- if true, print the serialized MarcXML
    write_xml_to_disk -- if true, save both documents to
                         /tmp/adsxml.xml and /tmp/marcxml.xml
    """
    def _debug_output(xml_doc, show, dump_path):
        # Optionally echo a document and/or save it for later inspection.
        if show:
            print(xml_doc.serialize('UTF-8'))
        if write_xml_to_disk:
            with open(dump_path, 'w') as dump_file:
                dump_file.write(xml_doc.serialize('UTF-8'))

    # Step 1: pull the complete records out of ADS.
    extractor = ADSRecords('full', 'XML')
    for code in bibcodes:
        extractor.addCompleteRecord(code)
    ads_doc = extractor.export()
    _debug_output(ads_doc, print_adsxml, '/tmp/adsxml.xml')

    # Step 2: transform the ADS XML into MarcXML.
    # The stylesheet stays bound to a local name until the function returns.
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile(XSLT))
    marc_doc = stylesheet.applyStylesheet(ads_doc, None)
    _debug_output(marc_doc, print_marcxml, '/tmp/marcxml.xml')

    # Step 3: merge; the merger returns a pair, only the records are returned.
    merged, _problematic = merge_records_xml(marc_doc)
    return merged
def test_01_merge_two_records_one_field(self):
    """PRIORITY: 2 records, 1 field, 2 origins.

    Both input records carry a 300 field, one with origin "A&A" and one
    with origin "NED". The merged record is expected to keep the "A&A"
    value together with the 980 field of the first record.
    """
    # The ampersand of the "A&A" origin must be written as &amp; otherwise
    # the fixture is not well-formed XML and cannot be parsed.
    marcxml = """<collections><collection>
    <record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">10</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record>
    <record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">15</subfield>
            <subfield code="7">NED</subfield>
        </datafield>
    </record>
    </collection></collections>"""
    expected = """<collections><collection><record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">10</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record></collection></collections>"""
    expected_records = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    self.assertEqual(merged_record, expected_records)
def test_04_merge_three_records_two_fields(self):
    """3 records, 6 fields, 6 origins.

    Three input records each carry a 300 and a 773 field, every one with
    a different origin. The merged record is expected to keep the 300
    value from "ADS metadata", the 773 value from "AAS" and the 980 field
    of the first record.
    """
    # The ampersand of the "A&A" origin must be written as &amp; otherwise
    # the fixture is not well-formed XML and cannot be parsed.
    marcxml = """<collections><collection>
    <record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">10</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="773" ind1=" " ind2=" ">
            <subfield code="a">Libération</subfield>
            <subfield code="7">STI</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record>
    <record>
        <datafield tag="773" ind1=" " ind2=" ">
            <subfield code="a">Le Monde</subfield>
            <subfield code="7">AAS</subfield>
        </datafield>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">15</subfield>
            <subfield code="7">NED</subfield>
        </datafield>
    </record>
    <record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">5</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
        <datafield tag="773" ind1=" " ind2=" ">
            <subfield code="a">L'Express</subfield>
            <subfield code="7">OCR</subfield>
        </datafield>
    </record>
    </collection></collections>"""
    expected = """<collections><collection><record>
        <datafield tag="300" ind1=" " ind2=" ">
            <subfield code="a">5</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
        <datafield tag="773" ind1=" " ind2=" ">
            <subfield code="a">Le Monde</subfield>
            <subfield code="7">AAS</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record></collection></collections>"""
    expected_records = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    self.assertEqual(merged_record, expected_records)
def test_02_merge_two_records_additional_subfield(self):
    """AUTHORS: 2 records, 1 additional subfield.

    Both records carry a 100 field; only the "ARXIV" one has a "u"
    subfield. The merged 100 field is expected to keep the "A&A" values
    and to gain the "u" subfield. Only the first 100 field is compared,
    using the non-strict mode of ``b._compare_fields``.
    """
    # The ampersand of the "A&A" origin must be written as &amp; otherwise
    # the fixture is not well-formed XML and cannot be parsed.
    marcxml = """<collections><collection>
    <record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Di Milia, Giovanni</subfield>
            <subfield code="b">Di Milia, G</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record>
    <record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Di Milia, Giancarlo</subfield>
            <subfield code="b">Di Milia, G</subfield>
            <subfield code="u">Center for astrophysics</subfield>
            <subfield code="7">ARXIV</subfield>
        </datafield>
    </record>
    </collection></collections>"""
    expected = """<collections><collection><record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Di Milia, Giovanni</subfield>
            <subfield code="b">Di Milia, G</subfield>
            <subfield code="u">Center for astrophysics</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record></collection></collections>"""
    expected_record = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    fields_match = b._compare_fields(merged_record[0]['100'][0],
                                     expected_record[0]['100'][0],
                                     strict=False)
    self.assertTrue(fields_match)
def extractor_process(q_todo, q_done, q_probl, q_uplfile, lock_stdout, lock_createdfiles, q_life, extraction_directory, extraction_name):
    """Worker function for the extraction of bibcodes from ADS.

    It is defined outside any class because that is simpler to use with
    multiprocessing.

    For every task taken from ``q_todo`` the worker retrieves the records
    of the listed bibcodes, transforms the exported XML, merges the
    records, pickles the merged records to a file, records that file in
    the list of created files and queues it for upload. The bibcodes that
    were processed and the ones that failed are then reported.

    q_todo               -- queue of tasks; a task is indexed as
                            (group name, bibcodes) and a task whose first
                            item is 'STOP' ends the loop
    q_done               -- receives [group name, extracted bibcodes]
    q_probl              -- receives [group name, problematic entries];
                            the entries created here are
                            (bibcode, error message) tuples, followed by
                            the entries reported by the merger
    q_uplfile            -- receives (group name, path of pickled file)
    lock_stdout          -- not used in this part of the function
    lock_createdfiles    -- lock guarding the list of created files
    q_life               -- not used in this part of the function
    extraction_directory -- directory of the extraction, relative to the
                            base output path
    extraction_name      -- name used to build the pickle file names

    Raises GenericError if the extracted XML cannot be transformed.
    """
    process_name = multiprocessing.current_process().name
    logger.warning(process_name + ' (worker) Process started')

    # Each worker logs to its own file through a non-propagating logger.
    fh = logging.FileHandler(os.path.join(pipeline_settings.BASE_OUTPUT_PATH,
                                          extraction_directory,
                                          pipeline_settings.BASE_LOGGING_PATH,
                                          process_name + '_worker.log'))
    fh.setFormatter(logging.Formatter(pipeline_settings.LOGGING_FORMAT))
    local_logger = logging.getLogger(pipeline_settings.LOGGING_WORKER_NAME)
    local_logger.addHandler(fh)
    local_logger.setLevel(logger.level)
    local_logger.propagate = False
    # Same start message for the local logger.
    local_logger.warning(process_name + ' Process started')

    # Do not let the upload queue block the exit of this process.
    q_uplfile.cancel_join_thread()

    # Maximum number of groups this worker may process.
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    # Maximum number of bibcodes that may be skipped in one group: skipping
    # more than this suggests a problem with the access to the data.
    # Floor division keeps the counter integral so it can reach exactly 0.
    skip_limit = max(settings.NUMBER_OF_BIBCODES_PER_GROUP // 10,
                     settings.MAX_SKIPPED_BIBCODES)
    # Tells whether the loop ended because there was nothing left to do
    # rather than because the maximum number of groups was reached.
    # NOTE(review): not read within this block - presumably used by code
    # that follows the loop; confirm against the complete function.
    queue_empty = False

    for _ in range(max_num_groups):
        task_todo = q_todo.get()
        if task_todo[0] == 'STOP':
            queue_empty = True
            break
        group_name = task_todo[0]

        local_logger.warning(process_name + (' starting to process group %s' % group_name))

        # Bibcodes extracted correctly and (bibcode, message) failures.
        bibcodes_ok = []
        bibcodes_probl = []
        recs = ADSRecords('full', 'XML')

        skips_left = skip_limit
        for bibcode in task_todo[1]:
            try:
                recs.addCompleteRecord(bibcode)
            except Exception as error:
                local_logger.error(': problem retrieving the bibcode "%s" in group %s' % (bibcode, group_name))
                # Build "<exception type>\t<message>" without keeping a
                # reference to the traceback.
                exc_name = type(error).__name__
                try:
                    str_error_to_print = exc_name + '\t' + str(error)
                except Exception:
                    # str() can fail on non-ASCII messages: retry as unicode.
                    try:
                        str_error_to_print = u'%s\t%s' % (unicode(exc_name), unicode(error))
                    except Exception:
                        local_logger.error(' Cannot log error for bibcode %s ' % bibcode)
                        str_error_to_print = ''
                bibcodes_probl.append((bibcode, str_error_to_print))
                skips_left -= 1
            else:
                bibcodes_ok.append(bibcode)
            # Too many skipped bibcodes: stop retrieving this group.
            if skips_left == 0:
                break

        # Too many skipped bibcodes: leave the outer loop as well, as if
        # the queue were empty.
        if skips_left == 0:
            local_logger.warning(' Detected possible error with ADS data access: skipped %s bibcodes in one group' % skip_limit)
            queue_empty = True
            break

        xmlobj = recs.export()
        del recs

        try:
            transf = xml_transformer.XmlTransformer(local_logger)
            marcxml = transf.transform(xmlobj)
        except Exception:
            err_msg = ' Impossible to transform the XML!'
            local_logger.critical(err_msg)
            raise GenericError(err_msg)

        if marcxml:
            merged_records, records_with_merging_probl = merger.merge_records_xml(marcxml)
            # Records that failed the merge move from the good bibcodes to
            # the problematic ones.
            for elem in records_with_merging_probl:
                try:
                    bibcodes_ok.remove(elem[0])
                except ValueError:
                    local_logger.warning(' Problems to remove bibcode "%s" in group "%s" from the list of bibcodes extracted after merging' % (elem[0], group_name))
                    # bibcodes_probl holds (bibcode, message) tuples, so the
                    # bibcode has to be compared with the first item of each.
                    if any(elem[0] == entry[0] for entry in bibcodes_probl):
                        local_logger.error(': bibcode "%s" reached the merger but was in problematic bibcodes!' % elem[0])
            bibcodes_probl = bibcodes_probl + records_with_merging_probl

            # Write the merged records to a file.
            filepath = os.path.join(settings.BASE_OUTPUT_PATH,
                                    extraction_directory,
                                    pipeline_settings.BASE_BIBRECORD_FILES_DIR,
                                    pipeline_settings.BIBREC_FILE_BASE_NAME+'_'+extraction_name+'_'+group_name)
            with open(filepath, 'wb') as output:
                pickle.dump(merged_records, output)

            # Record the path for an eventual future recovery. The context
            # managers release the lock and close the file even on error.
            created_list_path = os.path.join(settings.BASE_OUTPUT_PATH,
                                             extraction_directory,
                                             settings.LIST_BIBREC_CREATED)
            with lock_createdfiles:
                with open(created_list_path, 'a') as bibrec_file_obj:
                    bibrec_file_obj.write(filepath + '\n')

            # Finally hand the file over for upload.
            local_logger.info('Insert in queue for upload the file "%s" of the group "%s" ' % (filepath, group_name))
            q_uplfile.put((group_name, filepath))
        else:
            # No XML was generated: every extracted bibcode is problematic.
            bibcodes_probl = bibcodes_probl + [(bib, 'Bibcode extraction ok, but xml generation failed') for bib in bibcodes_ok]
            bibcodes_ok = []

        # Report the outcome of the group.
        q_done.put([group_name, bibcodes_ok])
        q_probl.put([group_name, bibcodes_probl])
        local_logger.warning(process_name + (' finished to process group %s' % group_name))
def test_01_merge_two_records_one_field(self):
    """AUTHORS: 2 records, priority.

    Both records carry one 100 field and two 700 fields, the first with
    origin "A&A" and the second with origin "ARXIV". The merged record is
    expected to keep the "A&A" author fields and the 980 field of the
    first record.
    """
    # The ampersand of the "A&A" origin must be written as &amp; otherwise
    # the fixture is not well-formed XML and cannot be parsed.
    marcxml = """<collections><collection>
    <record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Di Milia, Giovanni</subfield>
            <subfield code="b">Di Milia, G</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Luker, Jay</subfield>
            <subfield code="b">Luker, J</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Henneken, Edwin</subfield>
            <subfield code="b">Henneken, E</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record>
    <record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Dimilia, Giovanni</subfield>
            <subfield code="b">Dimilia, G</subfield>
            <subfield code="7">ARXIV</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Luker, Jay</subfield>
            <subfield code="b">Luker, J</subfield>
            <subfield code="7">ARXIV</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Henneken, Edwin</subfield>
            <subfield code="b">Henneken, E</subfield>
            <subfield code="7">ARXIV</subfield>
        </datafield>
    </record>
    </collection></collections>"""
    expected = """<collections><collection><record>
        <datafield tag="100" ind1=" " ind2=" ">
            <subfield code="a">Di Milia, Giovanni</subfield>
            <subfield code="b">Di Milia, G</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Luker, Jay</subfield>
            <subfield code="b">Luker, J</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="700" ind1=" " ind2=" ">
            <subfield code="a">Henneken, Edwin</subfield>
            <subfield code="b">Henneken, E</subfield>
            <subfield code="7">A&amp;A</subfield>
        </datafield>
        <datafield tag="980" ind1="" ind2="">
            <subfield code="a">ASTRONOMY</subfield>
            <subfield code="7">ADS metadata</subfield>
        </datafield>
    </record></collection></collections>"""
    expected_records = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    self.assertEqual(merged_record, expected_records)