def get_defaults(tarball, sdir, refno_url):
    """
    Work out the scratch directory and reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for
        untarring, conversions, and the ultimate destination of the MARCXML;
        None selects CFG_TMPDIR
    @param: refno_url (string): server location on where to look for refno;
        the empty string skips the lookup altogether

    @return sdir, refno (string, string): the per-tarball scratch
        subdirectory returned by make_single_directory, and the reference
        number (the tarball's base name when the lookup is skipped or
        yields None)
    """
    tarball_name = os.path.basename(tarball)

    if sdir is None:
        # Missing sdir: using default directory: CFG_TMPDIR
        sdir = CFG_TMPDIR
    else:
        # NOTE(review): a caller-supplied sdir is replaced by the directory
        # holding the tarball rather than being used as given. Kept as-is
        # because callers may rely on it -- confirm this is intended.
        sdir = os.path.dirname(tarball)

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, tarball_name + '_' + PLOTS_DIR)

    if refno_url != "":
        refno = get_reference_number(tarball, refno_url)
        if refno is None:
            # Fall back to the file name so the caller still gets an id.
            refno = tarball_name
            write_message('Error: can\'t find record id for %s' % (refno,))
    else:
        refno = tarball_name
        write_message("Skipping ref-no check")

    return sdir, refno
def get_defaults(tarball, sdir):
    '''
    Work out the scratch directory and reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for
        untarring, conversions, and the ultimate destination of the MARCXML;
        None selects CFG_TMPDIR

    @return sdir, refno (string, string): the per-tarball scratch
        subdirectory returned by make_single_directory, and whatever
        get_reference_number returned for the tarball
    '''
    # The file name of the tarball doubles as the arXiv id in messages.
    arXiv_id = os.path.basename(tarball)

    if sdir is None:
        write_message('using default directory: ' + CFG_TMPDIR +
                      ' for scratchwork')
        sdir = CFG_TMPDIR
    else:
        # NOTE(review): a caller-supplied sdir is replaced by the directory
        # holding the tarball rather than being used as given. Kept as-is
        # because callers may rely on it -- confirm this is intended.
        sdir = os.path.dirname(tarball)

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, arXiv_id + '_' + PLOTS_DIR)

    refno = get_reference_number(tarball)
    # Getting the tarball path back is treated as "no record id found".
    if refno == tarball:
        write_message('can\'t find record id for ' + arXiv_id)

    return sdir, refno
def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract
    tarball and fulltext pdf for each record.

    A working directory named after active_file plus "_plotextraction" is
    created beside it and handed to harvest_single. The two downloads are
    handled independently: a failure of one still lets the other be
    reported, but any failure sets the exit code to 1.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: tuple (exitcode, err_msg, fulltext_xml, plotextracted_xml)
        exitcode is 0 when every step succeeded and 1 otherwise; err_msg
        holds one newline-terminated line per failed step; fulltext_xml is
        an FFT datafield string pointing at the PDF, or None;
        plotextracted_xml is the content of the MARCXML file produced by
        process_single, or None.
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None

    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            plotsxml_fd = open(plotextracted_xml_path, 'r')
            try:
                plotextracted_xml = plotsxml_fd.read()
            finally:
                # Release the handle even when the read fails.
                plotsxml_fd.close()
        else:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                       (identifier, tarball)
            exitcode = 1
    else:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    if pdf is not None:
        # NOTE(review): pdf is inserted verbatim; a path containing XML
        # special characters would yield malformed markup -- confirm whether
        # harvest_single can return such paths.
        fulltext_xml = '<datafield tag="FFT" ind1=" " ind2=" ">' + \
                       '<subfield code="a">' + pdf + '</subfield>' + \
                       '<subfield code="t"></subfield>' + \
                       '</datafield>'
    else:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    return exitcode, err_msg, fulltext_xml, plotextracted_xml
def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract
    tarball and fulltext pdf for each record.

    harvest_single downloads into a directory created next to active_file
    (its name plus "_plotextraction"). The tarball and the PDF are then
    processed separately, so one can succeed while the other fails; each
    failure appends a line to the error message and sets the exit code
    to 1.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: tuple (exitcode, err_msg, fulltext_xml, plotextracted_xml)
        exitcode: 0 if nothing failed, 1 otherwise
        err_msg: concatenated, newline-terminated error lines ("" if none)
        fulltext_xml: FFT datafield string referencing the PDF, or None
        plotextracted_xml: content read from the MARCXML file whose path
            process_single returned, or None
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None

    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    # --- plots ---
    if tarball is None:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1
    else:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is None:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                       (identifier, tarball)
            exitcode = 1
        else:
            plotsxml_fd = open(plotextracted_xml_path, 'r')
            try:
                plotextracted_xml = plotsxml_fd.read()
            finally:
                # Always close, so a failed read does not leak the handle.
                plotsxml_fd.close()

    # --- fulltext ---
    if pdf is None:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1
    else:
        # NOTE(review): pdf is not XML-escaped before being embedded;
        # verify that harvest_single never returns paths with '&' or '<'.
        fulltext_xml = '<datafield tag="FFT" ind1=" " ind2=" ">' + \
                       '<subfield code="a">' + pdf + '</subfield>' + \
                       '<subfield code="t"></subfield>' + \
                       '</datafield>'

    return exitcode, err_msg, fulltext_xml, plotextracted_xml