def setUp(self):
    """Prepare the shared fixture for the rabbit regression tests.

    Most rabbit test cases are expected to reuse this setUp: it defines a
    base author name plus slightly and heavily perturbed variants of it
    (and of a list of co-author names) that the tests use to exercise
    name matching at different edit distances.
    """
    self.verbose = 0
    task_set_task_param('verbose', self.verbose)

    # Baseline author name used throughout the tests.
    self.author_name = 'TestSurname, TestName'
    # Two-letter change to the baseline; deliberately non-ASCII.
    self.slightly_modified_author_name = u"TéstSurname, TestName"
    # A rather large change of the baseline name.
    self.heavily_modified_name = 'TestSarname, TostName'

    # Baseline co-author names.
    self.co_authors_names = [
        'Coauthor, SomeCoauthor',
        'SomeCoauthor, DifferentCoauthor',
        'Queen, Elizabeth',
        'SomeBody, John',
    ]
    # Lightly perturbed co-authors: the final letters are Greek rho
    # characters, not Latin letters.
    self.slightly_mod_co_authors_names = [
        u'Coauthoρ, SomeCoauthoρ',
        u'SomeCoauthoρ, DifferentCoauthoρ',
        u'Queen, Elizabeth',
        u'SomeBody, John',
    ]
    # Heavily perturbed co-authors.
    self.heavily_mod_co_authors_names = [
        'Coeuthara, SomeCithor',
        'SomiCythore, Difn',
        'Quiin, d\'Elezebath',
        'Samebedi, Johnathan',
    ]
    self.ext_id = 'FAKE_EXT_ID'
def setUp(self, recid=RECID, arxiv_id=ARXIV_ID):
    """Upload a minimal arXiv test record and stub out OAI harvesting.

    Uploads (mode 'correct') a bare MARCXML record carrying an arXiv
    report number, then monkey-patches ``oai_harvest_daemon.oai_harvest_get``
    and ``oai_harvest_dblayer.get_oai_src`` with local fakes so the test
    never touches the network.  The original callables are stashed on
    ``self`` — presumably restored in tearDown; confirm.
    """
    self.recid = recid
    self.arxiv_id = arxiv_id
    self.arxiv_version = 1
    # Minimal MARCXML: 001 = record id, 037 = arXiv report number.
    # NOTE(review): internal whitespace of this literal was lost in a
    # formatting mangle; the XML content is whitespace-insensitive here.
    self.bibupload_xml = """<record>
    <controlfield tag="001">%s</controlfield>
    <datafield tag="037" ind1=" " ind2=" ">
        <subfield code="a">arXiv:%s</subfield>
        <subfield code="9">arXiv</subfield>
        <subfield code="c">hep-ph</subfield>
    </datafield>
    </record>""" % (recid, arxiv_id)

    bibtask.setup_loggers()
    bibtask.task_set_task_param('verbose', 0)
    recs = bibupload.xml_marc_to_records(self.bibupload_xml)
    status, dummy, err = bibupload.bibupload(recs[0], opt_mode='correct')
    # Fail fast if the fixture record cannot be uploaded.
    assert status == 0, err.strip()
    # The record must now expose exactly one arXiv id in 037__a.
    assert len(get_fieldvalues(recid, '037__a')) == 1

    def mocked_oai_harvest_get(prefix, baseurl, harvestpath, verb,
                               identifier):
        # Fake OAI response written to a temp file; parametrised by the
        # current self.arxiv_version so tests can simulate new versions.
        temp_fd, temp_path = mkstemp()
        os.write(temp_fd, ARXIV_OAI_RESPONSE % self.arxiv_version)
        os.close(temp_fd)
        return [temp_path]

    # Keep a reference to the real implementation before patching.
    self.oai_harvest_get = oai_harvest_daemon.oai_harvest_get
    oai_harvest_daemon.oai_harvest_get = mocked_oai_harvest_get

    def mocked_get_oai_src(params={}):
        # NOTE(review): mutable default argument — harmless here since
        # the stub never mutates it, but worth cleaning up.
        return [{'baseurl': ''}]

    # Same save-then-patch dance for the OAI source lookup.
    self.get_oai_src = oai_harvest_dblayer.get_oai_src
    oai_harvest_dblayer.get_oai_src = mocked_get_oai_src
def setUp(self):
    """Build a small synthetic citation graph for the ranking tests."""
    # Citation map: record id -> set of record ids (apparently the
    # records it cites).
    self.cit = {
        18: set([96]),
        74: set([92]),
        77: set([85, 86]),
        78: set([91, 79]),
        79: set([91]),
        81: set([89, 82, 83, 87]),
        84: set([88, 91, 85]),
        91: set([92]),
        94: set([80]),
        95: set([77, 86]),
    }
    # Record id -> dense index used by the ranking computation.
    self.dict_of_ids = {
        96: 14, 18: 13, 74: 0, 77: 2, 78: 5, 79: 7, 80: 18, 81: 8,
        82: 10, 83: 11, 84: 15, 85: 3, 86: 4, 87: 12, 88: 16, 89: 9,
        91: 6, 92: 1, 94: 17, 95: 19,
    }
    # Per-index reference counts.
    self.ref = [0, 2, 1, 2, 2, 0, 3, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
                1, 0]
    # Per-index publication years.
    self.dates = {
        0: 2001, 1: 2006, 2: 2002, 3: 2003, 4: 2003, 5: 2002, 6: 2007,
        7: 2003, 8: 2002, 9: 2005, 10: 2002, 11: 2003, 12: 2003,
        13: 1984, 14: 2000, 15: 2003, 16: 2003, 17: 1997, 18: 2002,
        19: 1999,
    }
    # PageRank-style tuning knobs.
    self.damping_factor = 0.85
    self.conv_threshold = 0.0001
    self.check_point = 1
    task_set_task_param('verbose', 0)
def parse_option(key, value, opts, args):
    """Elaborate task submission parameter."""
    # Standalone (positional) arguments are not supported by this task.
    if args:
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-i', '--id'):
        # Accumulate record ids into one task-level set, creating and
        # registering it on first use.
        id_pool = task_get_task_param('recids')
        if not id_pool:
            id_pool = set()
            task_set_task_param('recids', id_pool)
        id_pool.update(split_cli_ids_arg(value))
    elif key in ('-a', '--all'):
        task_set_task_param('all', True)

    return True
def task_submit_check_options():
    """Check the task's arguments for validity.

    Runs the general checks (ffmpeg build configuration, mode,
    input/output files) followed by mode-specific checks for the
    'encode', 'extract', 'meta', 'batch' and 'daemon' modes.

    @return: True if all checks pass, False otherwise (after reporting
        the problem via write_message/print).
    """
    #----------------#
    # General Checks #
    #----------------#

    ## FFMPEG CONFIGURATION ##
    ## The status of ffmpeg should be checked before a task is submitted.
    ## There is a minimum configuration that ffmpeg must be compiled with.
    ## See bibencode_utils and bibencode_config.
    config = check_ffmpeg_configuration()
    if config:
        ## Print the missing configuration options, one per line.
        string = ''
        for item in config:
            string += ('\t' + item + '\n')
        write_message(
            "FFmpeg options are missing. Please recompile and add:\n"
            + string)
        return False

    ## MODE ##
    ## Check that a valid mode was given.
    if _topt('mode') is None:
        write_message('You have to specify a mode using \'-m MODE\'')
        return False
    if _topt('mode') not in CFG_BIBENCODE_VALID_MODES:
        write_message('%s is not a valid mode. Use one of %s'
                      % (_topt('mode'), CFG_BIBENCODE_VALID_MODES))
        return False

    ## INPUT ##
    ## Check that the input file is given and that it exists.
    ## You should always use an absolute path to the file.
    if _topt('mode') in ('encode', 'extract', 'meta', 'batch'):
        if _topt('input') is None:
            write_message('You must specify an input file using \'-i FILE\'')
            return False
        else:
            if not os.path.exists(_topt('input')):
                print("The file %s does not exist" % _topt('input'))
                return False

    ## OUTPUT ##
    ## Check that an output file is given.
    ## You should always use an absolute path to the file.
    if _topt('mode') in ('encode', 'extract', 'meta'):
        if _topt('output') is None:
            write_message('No output file is given. Please specify with'
                          ' \'-o NAME\'')
            return False

    #---------------#
    # Encoding Mode #
    #---------------#
    if _topt('mode') == 'encode':

        ## PROFILE ## Check for a valid profile if one is given.
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_encoding_profiles():
                write_message('%s not found in %s'
                              % (_topt('profile_name'),
                                 CFG_BIBENCODE_PROFILES_ENCODING))
                return False
            ## If the profile exists
            else:
                pass

        ## AUDIOCODEC ##
        ## Check that the audiocodec is one of the predefined ones.
        if _topt('acodec') is not None:
            if _topt('acodec') not in CFG_BIBENCODE_FFMPEG_VALID_ACODECS:
                write_message(
                    '%s is not a valid audiocodec.\nAvailable codecs: %s'
                    % (_topt('acodec'), CFG_BIBENCODE_FFMPEG_VALID_ACODECS))
                return False

        ## VIDEOCODEC ##
        ## Check that the videocodec is one of the predefined ones.
        if _topt('vcodec') is not None:
            if _topt('vcodec') not in CFG_BIBENCODE_FFMPEG_VALID_VCODECS:
                write_message(
                    '%s is not a valid videocodec.\nAvailable codecs: %s'
                    % (_topt('vcodec'), CFG_BIBENCODE_FFMPEG_VALID_VCODECS))
                return False

        ## SIZE ##
        ## Check that the size is either WxH or an FFmpeg preset.
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False
        ## A fixed size must not be combined with an explicit
        ## width or height.
        if (_topt('width') or _topt('height')) and _topt('size'):
            write_message('Options \'width\' and \'height\' can not be '
                          'combined with \'resolution\'')
            return False

        ## PASSES ##
        ## If a number of passes is given, it should be either 1 or 2.
        ## You could do an infinite number of passes with ffmpeg,
        ## but it will almost never make a difference above 2 passes,
        ## so we currently only support 2 passes.
        if _topt('passes') is not None:
            if _topt('passes') not in (1, 2):
                write_message('The number of passes must be either 1 or 2')
                return False
        else:
            task_set_option('passes', 1)

        ## BITRATE ##
        ## Check if the given bitrate is either 1000 sth. or 1000k sth.
        if _topt('abitrate') is not None:
            pass
        if _topt('vbitrate') is not None:
            pass

    #-----------------#
    # Extraction Mode #
    #-----------------#
    elif _topt('mode') == 'extract':

        ## PROFILE ##
        ## If a profile is given, check its validity.
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_extract_profiles():
                write_message('%s not found in %s'
                              % (_topt('profile_name'),
                                 CFG_BIBENCODE_PROFILES_EXTRACT))
                return False
            ## If the profile exists
            else:
                pass

        ## You cannot give both a number and specific positions.
        ## !!! Think about allowing both -> First extract by number,
        ## !!! then additionally the specific positions.
        if (((_topt('numberof') is not None) and
             (_topt('positions') is not None))
                or ((_topt('numberof') is None) and
                    (_topt('positions') is None))):
            write_message('Please specify either a number of frames to '
                          'take or specific positions')
            return False

        ## SIZE ##
        ## Check that the size is either WxH or an FFmpeg specific value.
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    ## BUGFIX: restored the missing space before 'WxH' and
                    ## fixed the 'valus' typo, matching the encode-mode
                    ## message above.
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False

    #---------------#
    # Metadata Mode #
    #---------------#
    elif _topt('mode') == 'meta':
        ## Exactly one meta suboption must be given.
        if not _xor(_topt('meta_input'), _topt('meta_dump')):
            write_message("You can either dump or write metadata")
            return False
        ## METADATA INPUT ##
        if _topt('meta_input') is not None:
            ## Check if this is either a filename (that should exist)
            ## or a jsonic metadata notation.
            if os.path.exists(_topt('meta_input')):
                pass
            else:
                try:
                    metadict = json.loads(_topt('meta_input'))
                    task_set_option('meta_input', metadict)
                except ValueError:
                    write_message(
                        'The value %s of the \'--meta\' parameter is '
                        'neither a valid filename nor a jsonic dict'
                        % _topt('meta_input'))
                    return False

    #------------#
    # Batch Mode #
    #------------#
    elif _topt('mode') == 'batch':
        ## 'collection' and 'search' are mutually exclusive selectors.
        if _topt('collection') and _topt('search'):
            write_message('You can either use \'search\' or \'collection\'')
            return False
        elif _topt('collection'):
            template = json_decode_file(_topt('input'))
            print('\n')
            print("#---------------------------------------------#")
            print("# YOU ARE ABOUT TO UPDATE A WHOLE COLLECTION #")
            print("#---------------------------------------------#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')
        elif _topt('search'):
            template = json_decode_file(_topt('input'))
            message = ("# YOU ARE ABOUT TO UPDATE RECORDS MATCHING '%s' #"
                       % _topt('search'))
            print('\n')
            print("#" + "-" * (len(message) - 2) + "#")
            print(message)
            print("#" + "-" * (len(message) - 2) + "#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')

    #-------------#
    # Daemon Mode #
    #-------------#
    elif _topt('mode') == 'daemon':
        task_set_task_param('task_specific_name', 'daemon')
        ## You can either give none or both folders, but not only one.
        if _xor(_topt('new_job_folder'), _topt('old_job_folder')):
            write_message('When specifying folders for the daemon mode, you '
                          'have to specify both the folder for the new jobs '
                          'and the old ones')
            return False

    ## Every check went fine.
    return True
def task_submit_check_options():
    """Check the task's arguments for validity.

    Runs the general checks (ffmpeg build configuration, mode,
    input/output files) followed by mode-specific checks for the
    'encode', 'extract', 'meta', 'batch' and 'daemon' modes.

    @return: True if all checks pass, False otherwise (after reporting
        the problem via write_message/print).
    """
    #----------------#
    # General Checks #
    #----------------#

    ## FFMPEG CONFIGURATION ##
    ## The status of ffmpeg should be checked before a task is submitted.
    ## There is a minimum configuration that ffmpeg must be compiled with.
    ## See bibencode_utils and bibencode_config.
    config = check_ffmpeg_configuration()
    if config:
        ## Print the missing configuration options, one per line.
        string = ''
        for item in config:
            string += ('\t' + item + '\n')
        write_message(
            "FFmpeg options are missing. Please recompile and add:\n"
            + string)
        return False

    ## MODE ##
    ## Check that a valid mode was given.
    if _topt('mode') is None:
        write_message('You have to specify a mode using \'-m MODE\'')
        return False
    if _topt('mode') not in CFG_BIBENCODE_VALID_MODES:
        write_message('%s is not a valid mode. Use one of %s'
                      % (_topt('mode'), CFG_BIBENCODE_VALID_MODES))
        return False

    ## INPUT ##
    ## Check that the input file is given and that it exists.
    ## You should always use an absolute path to the file.
    if _topt('mode') in ('encode', 'extract', 'meta', 'batch'):
        if _topt('input') is None:
            write_message('You must specify an input file using \'-i FILE\'')
            return False
        else:
            if not os.path.exists(_topt('input')):
                print("The file %s does not exist" % _topt('input'))
                return False

    ## OUTPUT ##
    ## Check that an output file is given.
    ## You should always use an absolute path to the file.
    if _topt('mode') in ('encode', 'extract', 'meta'):
        if _topt('output') is None:
            write_message('No output file is given. Please specify with'
                          ' \'-o NAME\'')
            return False

    #---------------#
    # Encoding Mode #
    #---------------#
    if _topt('mode') == 'encode':

        ## PROFILE ## Check for a valid profile if one is given.
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_encoding_profiles():
                write_message('%s not found in %s'
                              % (_topt('profile_name'),
                                 CFG_BIBENCODE_PROFILES_ENCODING))
                return False
            ## If the profile exists
            else:
                pass

        ## AUDIOCODEC ##
        ## Check that the audiocodec is one of the predefined ones.
        if _topt('acodec') is not None:
            if _topt('acodec') not in CFG_BIBENCODE_FFMPEG_VALID_ACODECS:
                write_message(
                    '%s is not a valid audiocodec.\nAvailable codecs: %s'
                    % (_topt('acodec'), CFG_BIBENCODE_FFMPEG_VALID_ACODECS))
                return False

        ## VIDEOCODEC ##
        ## Check that the videocodec is one of the predefined ones.
        if _topt('vcodec') is not None:
            if _topt('vcodec') not in CFG_BIBENCODE_FFMPEG_VALID_VCODECS:
                write_message(
                    '%s is not a valid videocodec.\nAvailable codecs: %s'
                    % (_topt('vcodec'), CFG_BIBENCODE_FFMPEG_VALID_VCODECS))
                return False

        ## SIZE ##
        ## Check that the size is either WxH or an FFmpeg preset.
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False
        ## A fixed size must not be combined with an explicit
        ## width or height.
        if (_topt('width') or _topt('height')) and _topt('size'):
            write_message('Options \'width\' and \'height\' can not be '
                          'combined with \'resolution\'')
            return False

        ## PASSES ##
        ## If a number of passes is given, it should be either 1 or 2.
        ## You could do an infinite number of passes with ffmpeg,
        ## but it will almost never make a difference above 2 passes,
        ## so we currently only support 2 passes.
        if _topt('passes') is not None:
            if _topt('passes') not in (1, 2):
                write_message('The number of passes must be either 1 or 2')
                return False
        else:
            task_set_option('passes', 1)

        ## BITRATE ##
        ## Check if the given bitrate is either 1000 sth. or 1000k sth.
        if _topt('abitrate') is not None:
            pass
        if _topt('vbitrate') is not None:
            pass

    #-----------------#
    # Extraction Mode #
    #-----------------#
    elif _topt('mode') == 'extract':

        ## PROFILE ##
        ## If a profile is given, check its validity.
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_extract_profiles():
                write_message('%s not found in %s'
                              % (_topt('profile_name'),
                                 CFG_BIBENCODE_PROFILES_EXTRACT))
                return False
            ## If the profile exists
            else:
                pass

        ## You cannot give both a number and specific positions.
        ## !!! Think about allowing both -> First extract by number,
        ## !!! then additionally the specific positions.
        if (((_topt('numberof') is not None) and
             (_topt('positions') is not None))
                or ((_topt('numberof') is None) and
                    (_topt('positions') is None))):
            write_message('Please specify either a number of frames to '
                          'take or specific positions')
            return False

        ## SIZE ##
        ## Check that the size is either WxH or an FFmpeg specific value.
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    ## BUGFIX: restored the missing space before 'WxH' and
                    ## fixed the 'valus' typo, matching the encode-mode
                    ## message above.
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False

    #---------------#
    # Metadata Mode #
    #---------------#
    elif _topt('mode') == 'meta':
        ## Exactly one meta suboption must be given.
        if not _xor(_topt('meta_input'), _topt('meta_dump')):
            write_message("You can either dump or write metadata")
            return False
        ## METADATA INPUT ##
        if _topt('meta_input') is not None:
            ## Check if this is either a filename (that should exist)
            ## or a jsonic metadata notation.
            if os.path.exists(_topt('meta_input')):
                pass
            else:
                try:
                    metadict = json.loads(_topt('meta_input'))
                    task_set_option('meta_input', metadict)
                except ValueError:
                    write_message(
                        'The value %s of the \'--meta\' parameter is '
                        'neither a valid filename nor a jsonic dict'
                        % _topt('meta_input'))
                    return False

    #------------#
    # Batch Mode #
    #------------#
    elif _topt('mode') == 'batch':
        ## 'collection' and 'search' are mutually exclusive selectors.
        if _topt('collection') and _topt('search'):
            write_message('You can either use \'search\' or \'collection\'')
            return False
        elif _topt('collection'):
            template = json_decode_file(_topt('input'))
            print('\n')
            print("#---------------------------------------------#")
            print("# YOU ARE ABOUT TO UPDATE A WHOLE COLLECTION #")
            print("#---------------------------------------------#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')
        elif _topt('search'):
            template = json_decode_file(_topt('input'))
            message = ("# YOU ARE ABOUT TO UPDATE RECORDS MATCHING '%s' #"
                       % _topt('search'))
            print('\n')
            print("#" + "-" * (len(message) - 2) + "#")
            print(message)
            print("#" + "-" * (len(message) - 2) + "#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')

    #-------------#
    # Daemon Mode #
    #-------------#
    elif _topt('mode') == 'daemon':
        task_set_task_param('task_specific_name', 'daemon')
        ## You can either give none or both folders, but not only one.
        if _xor(_topt('new_job_folder'), _topt('old_job_folder')):
            write_message('When specifying folders for the daemon mode, you '
                          'have to specify both the folder for the new jobs '
                          'and the old ones')
            return False

    ## Every check went fine.
    return True
def bst_apsharvest(dois="", recids="", query="", records="",
                   new_mode="email", update_mode="email",
                   from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes",
                   match="no", reportonly="no", threshold_date=None,
                   devmode="no", input_file=""):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST
           API.  This means that new records are being looked for at APS
           servers.  Active when from_date and until_date is given, in
           addition when a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records.
           When the records to be processed already exists in the system,
           the task only harvests the fulltext's themselves and attaches
           them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide
    fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new
    records:
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and
    correct:
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download
        fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
        a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext
        for.
    @type query: string

    @param records: get any records modified, created or both since last
        time in the database to download fulltext for, can be either:
            "new" - fetches all new records added
            "modified" - fetches all modified records added
            "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted
        in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from.
        Ex. 2013-01-01.  If the value is "last" it means to get records
        since last harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until.
        Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record metadata be harvested?
        "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached?
        "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached?
        "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done?
        "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then
        exit?  "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since.
        Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode.  Full verbosity and no
        uploads/mails.
    @type devmode: string

    @param input_file: harvests articles with given file containing one
        DOI per line.
    @type input_file: string
    """
    task_update_progress("Parsing input parameters")

    # Validate upload modes (full names or single-letter aliases).
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # Convert the "yes"/"no" string flags into booleans.
    # We hide fulltext by default.
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default.
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default.
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default.
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # Devmode is off by default; when on, crank verbosity to maximum.
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default.
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    # When a DOI input file is requested it must exist on disk.
    if input_file:
        if not os.path.exists(input_file):
            write_message("Input file {0} does not exist!".format(input_file),
                          stream=sys.stderr)
            return False

    # Unify all parameters into a dict using locals.
    # NOTE(review): locals() snapshots every local defined so far
    # (including the 'mode' loop variable), so do not introduce or rename
    # locals above this line without checking get_records_to_harvest().
    parameters = locals()

    # 1: Analyze parameters and fetch all requested records from APS.
    final_record_list, harvest_from_date, new_harvest_date = \
        get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download."
                  % (len(final_record_list),))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not final_record_list:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of records as
    # configured.
    job = APSHarvestJob(CFG_APSHARVEST_DIR,
                        date_started=new_harvest_date,
                        date_harvested_from=harvest_from_date)
    count = process_records(job, parameters, final_record_list)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful;
        # remember the new last-harvested date for the next run.
        store_last_updated(None, new_harvest_date,
                           name="apsharvest_api_download")

    # We are done.
    write_message("Harvested %d records. (%d failed)"
                  % (count, len(job.records_failed)))
def bst_apsharvest(dois="", recids="", query="", records="",
                   new_mode="email", update_mode="email",
                   from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes",
                   match="no", reportonly="no", threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST
           API.  This means that new records are being looked for at APS
           servers.  Active when from_date and until_date is given, in
           addition when a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records.
           When the records to be processed already exists in the system,
           the task only harvests the fulltext's themselves and attaches
           them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide
    fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new
    records:
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and
    correct:
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download
        fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
        a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext
        for.
    @type query: string

    @param records: get any records modified, created or both since last
        time in the database to download fulltext for, can be either:
            "new" - fetches all new records added
            "modified" - fetches all modified records added
            "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted
        in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from.
        Ex. 2013-01-01.  If the value is "last" it means to get records
        since last harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until.
        Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record metadata be harvested?
        "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached?
        "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached?
        "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done?
        "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then
        exit?  "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since.
        Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode.  Full verbosity and no
        uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate upload modes (full names or single-letter aliases).
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # Convert the "yes"/"no" string flags into booleans.
    # We hide fulltext by default.
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default.
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default.
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default.
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # Devmode is off by default; when on, crank verbosity to maximum.
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default.
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate the date format.
        try:
            harvest_from_date = validate_date(threshold_date)
        # 'as e' instead of ', e' is valid from Python 2.6 and keeps the
        # file forward-compatible; behavior is identical.
        except ValueError as e:
            # BUGFIX: the message used to say 'from_date' although the
            # value being parsed here is 'threshold_date'.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s"
                          % (str(e),), stream=sys.stderr)
            return 1
def bst_apsharvest(dois="", recids="", query="", records="",
                   new_mode="email", update_mode="email",
                   from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes",
                   match="no", reportonly="no", threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST
           API.  This means that new records are being looked for at APS
           servers.  Active when from_date and until_date is given, in
           addition when a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records.
           When the records to be processed already exists in the system,
           the task only harvests the fulltext's themselves and attaches
           them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide
    fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new
    records:
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and
    correct:
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download
        fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
        a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext
        for.
    @type query: string

    @param records: get any records modified, created or both since last
        time in the database to download fulltext for, can be either:
            "new" - fetches all new records added
            "modified" - fetches all modified records added
            "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted
        in:
            "email" - does NOT run bibupload and sends an email instead.
                      Default.
            "insert" - inserts the records into the database
            "append" - appends the fulltext to the existing attached files
            "correct" - corrects existing attached fulltext files, or
                        adds new
            "replace" - replaces all attached files with new fulltext file
        The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from.
        Ex. 2013-01-01.  If the value is "last" it means to get records
        since last harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until.
        Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record metadata be harvested?
        "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached?
        "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached?
        "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done?
        "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then
        exit?  "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since.
        Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode.  Full verbosity and no
        uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate upload modes (full names or single-letter aliases).
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # Convert the "yes"/"no" string flags into booleans.
    # We hide fulltext by default.
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default.
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default.
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default.
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # Devmode is off by default; when on, crank verbosity to maximum.
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default.
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate the date format.
        try:
            harvest_from_date = validate_date(threshold_date)
        # 'as e' instead of ', e' is valid from Python 2.6 and keeps the
        # file forward-compatible; behavior is identical.
        except ValueError as e:
            # BUGFIX: the message used to say 'from_date' although the
            # value being parsed here is 'threshold_date'.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s"
                          % (str(e),), stream=sys.stderr)
            return 1
def setUpClass(cls):
    """Build the shared fixtures for the hoover regression tests.

    Runs at most once per process (guarded by ``cls.run_exec``).  It
    inserts a set of fake test papers into the database, runs rabbit
    over them so that person records exist, claims a subset of the
    papers, then runs hoover with the duplicate-paper exception classes
    monkey-patched so the tests can observe duplicate handling.
    """
    # Only build the fixtures the first time this is invoked.
    if cls.run_exec:
        return
    cls.run_exec = True
    cls.verbose = 0
    cls.logger = setup_loggers()
    cls.logger.info('Setting up regression tests...')
    task_set_task_param('verbose', cls.verbose)
    # Nineteen fake authors, each with a distinct name and a fake
    # INSPIRE identifier.  By convention authorN is used on paperN
    # below (with a few deliberate cross-uses to provoke conflicts).
    cls.authors = {
        'author1': {
            'name': 'authoraaaaa authoraaaab',
            'inspireID': 'INSPIRE-FAKE_ID1'
        },
        'author2': {
            'name': 'authorbbbba authorbbbbb',
            'inspireID': 'INSPIRE-FAKE_ID2'
        },
        'author3': {
            'name': 'authorcccca authorccccb',
            'inspireID': 'INSPIRE-FAKE_ID3'
        },
        'author4': {
            'name': 'authordddda authorddddb',
            'inspireID': 'INSPIRE-FAKE_ID4'
        },
        'author5': {
            'name': 'authoreeeea authoreeeeb',
            'inspireID': 'INSPIRE-FAKE_ID5'
        },
        'author6': {
            'name': 'authorffffa authorffffb',
            'inspireID': 'INSPIRE-FAKE_ID6'
        },
        'author7': {
            'name': 'authorgggga authorggggb',
            'inspireID': 'INSPIRE-FAKE_ID7'
        },
        'author8': {
            'name': 'authorhhhha authorhhhhb',
            'inspireID': 'INSPIRE-FAKE_ID8'
        },
        'author9': {
            'name': 'authoriiiia authoriiiib',
            'inspireID': 'INSPIRE-FAKE_ID9'
        },
        'author10': {
            'name': 'authorjjjja authorjjjjb',
            'inspireID': 'INSPIRE-FAKE_ID10'
        },
        'author11': {
            'name': 'authorkkkka authorkkkkb',
            'inspireID': 'INSPIRE-FAKE_ID11'
        },
        'author12': {
            'name': 'authorlllla authorllllb',
            'inspireID': 'INSPIRE-FAKE_ID12'
        },
        'author13': {
            'name': 'authormmmma authormmmmb',
            'inspireID': 'INSPIRE-FAKE_ID13'
        },
        'author14': {
            'name': 'authornnnna authornnnnb',
            'inspireID': 'INSPIRE-FAKE_ID14'
        },
        'author15': {
            'name': 'authorooooa authoroooob',
            'inspireID': 'INSPIRE-FAKE_ID15'
        },
        'author16': {
            'name': 'authorppppa authorppppb',
            'inspireID': 'INSPIRE-FAKE_ID16'
        },
        'author17': {
            'name': 'authorqqqqa authorqqqqb',
            'inspireID': 'INSPIRE-FAKE_ID17'
        },
        'author18': {
            'name': 'authorrrrra authorrrrrb',
            'inspireID': 'INSPIRE-FAKE_ID18'
        },
        'author19': {
            'name': 'authorssssa authorssssb',
            'inspireID': 'INSPIRE-FAKE_ID19'
        }
    }
    # Per-paper / per-author fixture maps, filled by the helpers below.
    cls.marc_xmls = dict()  # paper key -> MARCXML string
    cls.bibrecs = dict()    # paper key -> record id
    cls.pids = dict()       # author key -> person id (or () if absent)
    cls.bibrefs = dict()    # author key -> bibref value

    def set_up_test_hoover_inertia():
        # paper1: plain paper for author1, no INSPIRE id attached.
        cls.marc_xmls['paper1'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author1']['name'],
            limit_to_collections=True)
        cls.bibrecs['paper1'] = get_bibrec_for_record(
            cls.marc_xmls['paper1'], opt_mode='insert')
        cls.marc_xmls['paper1'] = add_001_field(cls.marc_xmls['paper1'],
                                                cls.bibrecs['paper1'])

    def set_up_test_hoover_duplication():
        # paper2: author2 with their own INSPIRE id.
        cls.marc_xmls['paper2'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author2']['name'],
            None, ((cls.authors['author2']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper2'] = get_bibrec_for_record(
            cls.marc_xmls['paper2'], opt_mode='insert')
        cls.marc_xmls['paper2'] = add_001_field(cls.marc_xmls['paper2'],
                                                cls.bibrecs['paper2'])

    def set_up_test_hoover_assign_one_inspire_id_from_an_unclaimed_paper():
        # paper3: author3 with their own INSPIRE id; left unclaimed.
        cls.marc_xmls['paper3'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author3']['name'],
            None, ((cls.authors['author3']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper3'] = get_bibrec_for_record(
            cls.marc_xmls['paper3'], opt_mode='insert')
        cls.marc_xmls['paper3'] = add_001_field(cls.marc_xmls['paper3'],
                                                cls.bibrecs['paper3'])

    def set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper():
        # paper4: author4 with their own INSPIRE id; claimed further down.
        cls.marc_xmls['paper4'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author4']['name'],
            None, ((cls.authors['author4']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper4'] = get_bibrec_for_record(
            cls.marc_xmls['paper4'], opt_mode='insert')
        cls.marc_xmls['paper4'] = add_001_field(cls.marc_xmls['paper4'],
                                                cls.bibrecs['paper4'])

    def set_up_test_hoover_assign_one_inspire_id_from_unclaimed_papers_with_different_inspireID(
    ):
        # papers 5 and 6 share author5's name but carry different
        # INSPIRE ids (ID5 vs ID6) to create a conflict.
        cls.marc_xmls['paper5'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author5']['name'],
            None, ((cls.authors['author5']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper5'] = get_bibrec_for_record(
            cls.marc_xmls['paper5'], opt_mode='insert')
        cls.marc_xmls['paper5'] = add_001_field(cls.marc_xmls['paper5'],
                                                cls.bibrecs['paper5'])
        cls.marc_xmls['paper6'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author5']['name'],
            None, ((cls.authors['author6']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper6'] = get_bibrec_for_record(
            cls.marc_xmls['paper6'], opt_mode='insert')
        cls.marc_xmls['paper6'] = add_001_field(cls.marc_xmls['paper6'],
                                                cls.bibrecs['paper6'])

    def set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper_and_unclaimed_paper_with_different_inspireID(
    ):
        # papers 7 (claimed later) and 8 (unclaimed) share author7's
        # name but carry different INSPIRE ids (ID7 vs ID8).
        cls.marc_xmls['paper7'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author7']['name'],
            None, ((cls.authors['author7']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper7'] = get_bibrec_for_record(
            cls.marc_xmls['paper7'], opt_mode='insert')
        cls.marc_xmls['paper7'] = add_001_field(cls.marc_xmls['paper7'],
                                                cls.bibrecs['paper7'])
        cls.marc_xmls['paper8'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author7']['name'],
            None, ((cls.authors['author8']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper8'] = get_bibrec_for_record(
            cls.marc_xmls['paper8'], opt_mode='insert')
        cls.marc_xmls['paper8'] = add_001_field(cls.marc_xmls['paper8'],
                                                cls.bibrecs['paper8'])

    def set_up_test_hoover_assign_one_inspire_id_from_claimed_papers_with_different_inspireID(
    ):
        # papers 9 and 10 share author9's name with different INSPIRE
        # ids; both are claimed later.
        # NOTE(review): paper9 reuses author2's inspireID rather than
        # author9's — presumably intentional to create a cross-author
        # conflict; TODO confirm.
        cls.marc_xmls['paper9'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author9']['name'],
            None, ((cls.authors['author2']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper9'] = get_bibrec_for_record(
            cls.marc_xmls['paper9'], opt_mode='insert')
        cls.marc_xmls['paper9'] = add_001_field(cls.marc_xmls['paper9'],
                                                cls.bibrecs['paper9'])
        cls.marc_xmls['paper10'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author9']['name'],
            None, ((cls.authors['author10']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper10'] = get_bibrec_for_record(
            cls.marc_xmls['paper10'], opt_mode='insert')
        cls.marc_xmls['paper10'] = add_001_field(cls.marc_xmls['paper10'],
                                                 cls.bibrecs['paper10'])

    def set_up_test_hoover_vacuum_an_unclaimed_paper_with_an_inspire_id_from_a_claimed_paper(
    ):
        # paper11 (author11, claimed later) and paper12 (author12,
        # unclaimed) both carry author11's INSPIRE id.
        cls.marc_xmls['paper11'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author11']['name'],
            None, ((cls.authors['author11']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper11'] = get_bibrec_for_record(
            cls.marc_xmls['paper11'], opt_mode='insert')
        cls.marc_xmls['paper11'] = add_001_field(cls.marc_xmls['paper11'],
                                                 cls.bibrecs['paper11'])
        cls.marc_xmls['paper12'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author12']['name'],
            None, ((cls.authors['author11']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper12'] = get_bibrec_for_record(
            cls.marc_xmls['paper12'], opt_mode='insert')
        cls.marc_xmls['paper12'] = add_001_field(cls.marc_xmls['paper12'],
                                                 cls.bibrecs['paper12'])

    def set_up_test_hoover_vacuum_a_claimed_paper_with_an_inspire_id_from_a_claimed_paper(
    ):
        # papers 13 and 14 (both claimed later) belong to different
        # authors but both carry author13's INSPIRE id.
        cls.marc_xmls['paper13'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author13']['name'],
            None, ((cls.authors['author13']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper13'] = get_bibrec_for_record(
            cls.marc_xmls['paper13'], opt_mode='insert')
        cls.marc_xmls['paper13'] = add_001_field(cls.marc_xmls['paper13'],
                                                 cls.bibrecs['paper13'])
        cls.marc_xmls['paper14'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author14']['name'],
            None, ((cls.authors['author13']['inspireID'], 'i'), ),
            limit_to_collections=True)
        cls.bibrecs['paper14'] = get_bibrec_for_record(
            cls.marc_xmls['paper14'], opt_mode='insert')
        cls.marc_xmls['paper14'] = add_001_field(cls.marc_xmls['paper14'],
                                                 cls.bibrecs['paper14'])

    def set_up_test_hoover_assign_one_inspire_id_from_hepnames_record():
        # paper15: a HepNames record (not a regular paper) for author15.
        cls.marc_xmls['paper15'] = get_new_hepnames_marc_for_test(
            cls.authors['author15']['name'],
            ((cls.authors['author15']['inspireID'], 'i'), ))
        cls.bibrecs['paper15'] = get_bibrec_for_record(
            cls.marc_xmls['paper15'], opt_mode='insert')
        cls.marc_xmls['paper15'] = add_001_field(cls.marc_xmls['paper15'],
                                                 cls.bibrecs['paper15'])

    def set_up_duplicated_unclaimed_signature():
        # paper16: author16 with author17 as coauthor, and author16's
        # INSPIRE id attached twice -> duplicated unclaimed signature.
        cls.marc_xmls['paper16'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author16']['name'],
            (cls.authors['author17']['name'], ),
            ((cls.authors['author16']['inspireID'], 'i'),
             (cls.authors['author16']['inspireID'], 'i')),
            limit_to_collections=True)
        cls.bibrecs['paper16'] = get_bibrec_for_record(
            cls.marc_xmls['paper16'], opt_mode='insert')
        cls.marc_xmls['paper16'] = add_001_field(cls.marc_xmls['paper16'],
                                                 cls.bibrecs['paper16'])

    def set_up_duplicated_claimed_signature():
        # paper18: author18 with author19 as coauthor, author18's
        # INSPIRE id attached twice; claimed later -> duplicated
        # claimed signature.
        cls.marc_xmls['paper18'] = get_new_marc_for_test(
            'Test Paper',
            cls.authors['author18']['name'],
            (cls.authors['author19']['name'], ),
            ((cls.authors['author18']['inspireID'], 'i'),
             (cls.authors['author18']['inspireID'], 'i')),
            limit_to_collections=True)
        cls.bibrecs['paper18'] = get_bibrec_for_record(
            cls.marc_xmls['paper18'], opt_mode='insert')
        cls.marc_xmls['paper18'] = add_001_field(cls.marc_xmls['paper18'],
                                                 cls.bibrecs['paper18'])

    # Insert all fixture records.
    set_up_test_hoover_inertia()
    set_up_test_hoover_duplication()
    set_up_test_hoover_assign_one_inspire_id_from_an_unclaimed_paper()
    set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper()
    set_up_test_hoover_assign_one_inspire_id_from_unclaimed_papers_with_different_inspireID(
    )
    set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper_and_unclaimed_paper_with_different_inspireID(
    )
    set_up_test_hoover_assign_one_inspire_id_from_claimed_papers_with_different_inspireID(
    )
    set_up_test_hoover_vacuum_an_unclaimed_paper_with_an_inspire_id_from_a_claimed_paper(
    )
    set_up_test_hoover_vacuum_a_claimed_paper_with_an_inspire_id_from_a_claimed_paper(
    )
    set_up_test_hoover_assign_one_inspire_id_from_hepnames_record()
    set_up_duplicated_unclaimed_signature()
    set_up_duplicated_claimed_signature()
    # Remember everything we inserted so tearDown can remove it.
    cls.bibrecs_to_clean = [cls.bibrecs[key] for key in cls.bibrecs]
    # Run rabbit over the new records so person ids get created.
    rabbit(sorted([cls.bibrecs[key] for key in cls.bibrecs]),
           verbose=False)
    # Resolve each author's bibref and person id.  Authors without a
    # matching paperN entry (e.g. author17, author19, who only appear
    # as coauthors) raise KeyError, which is printed and skipped.
    for key in cls.authors:
        try:
            temp = set()
            cls.bibrefs[key] = get_bibref_value_for_name(
                cls.authors[key]['name'])
            temp = run_sql(
                "select personid from aidPERSONIDPAPERS where bibref_value=%s and bibrec=%s and name=%s",
                (cls.bibrefs[key], cls.bibrecs[key.replace(
                    'author', 'paper')], cls.authors[key]['name']))
            cls.pids[key] = temp[0][0] if temp else ()
        except KeyError as e:
            print e
    # Claim the papers the claimed-paper scenarios above rely on.
    claim_test_paper(cls.bibrecs['paper4'])
    claim_test_paper(cls.bibrecs['paper7'])
    claim_test_paper(cls.bibrecs['paper9'])
    claim_test_paper(cls.bibrecs['paper10'])
    claim_test_paper(cls.bibrecs['paper11'])
    claim_test_paper(cls.bibrecs['paper13'])
    claim_test_paper(cls.bibrecs['paper14'])
    claim_test_paper(cls.bibrecs['paper18'])
    # Temporarily swap hoover's duplicate-paper exceptions for mock
    # subclasses that record what happened via module-level globals
    # ('dupl' counts claimed duplicates, 'pid' keeps the last
    # unclaimed-duplicate person id), then restore the originals.
    tmp_claimed_exception = invenio.bibauthorid_hoover.DuplicateClaimedPaperException
    tmp_unclaimed_exception = invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException

    class MockClaimedException(
            invenio.bibauthorid_hoover.DuplicateClaimedPaperException):

        def __init__(self, message, pid, signature, present_signatures):
            global dupl
            super(MockClaimedException, self).__init__(message, pid,
                                                       signature,
                                                       present_signatures)
            # Count every claimed-duplicate exception raised by hoover.
            dupl += 1

    class MockUnclaimedException(
            invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException):

        def __init__(self, message, _pid, signature, present_signatures):
            global pid
            super(MockUnclaimedException, self).__init__(message, _pid,
                                                         signature,
                                                         present_signatures)
            # Remember which person id triggered the unclaimed
            # duplicate.
            pid = _pid

    invenio.bibauthorid_hoover.DuplicateClaimedPaperException = MockClaimedException
    invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = MockUnclaimedException
    # Run hoover over every person id we managed to resolve.
    hoover(list(set(cls.pids[key] for key in cls.pids if cls.pids[key])))
    invenio.bibauthorid_hoover.DuplicateClaimedPaperException = tmp_claimed_exception
    invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = tmp_unclaimed_exception
    print "dupl", dupl