def format_element(bfo, separator='<br/>', width="800px", height="480px"):
    """
    Display Flash (swf) panorama attached to this record.
    Consider files attached as .swf file with doctype 'panoaram'.
    @param separator: printed between each panorama
    @param width: width of each panorama
    @param height: height of each panorama
    """
    bibarchive = BibRecDocs(bfo.recID)
    # One embedpano() javascript call per attached .swf panorama file.
    embed_calls = []
    for docfile in bibarchive.list_latest_files(doctype='panorama'):
        if docfile.get_format() != '.swf':
            continue
        embed_calls.append(
            'embedpano({swf:"%(swf_file)s", target:"panoramabox%(pano_index)s",'
            ' width:"%(width)s", height:"%(height)s"});'
            % {'swf_file': docfile.get_url(),
               'pano_index': len(embed_calls),
               'width': width,
               'height': height})
    if not embed_calls:
        return ""
    # One placeholder <div> per panorama, filled in by the javascript above.
    placeholders = ['<div id="panoramabox%i" style="margin:auto"></div>' % index
                    for index in xrange(len(embed_calls))]
    out = separator.join(placeholders)
    out += '<script type="text/javascript" src="/js/swfkrpano.js"></script>'
    out += '<script type="text/javascript">' + ''.join(embed_calls) + '</script>'
    return out
def get_media_from_recid(recid):
    """Describe every latest file attached to a record.

    @param recid: identifier of the record whose files are listed
    @return: list of dicts with keys name/file/type/path/collection/
             size/loaded/selected describing each file
    """
    archive = BibRecDocs(recid)
    medias = []
    for docfile in archive.list_latest_files():
        mime = 'application/%s' % docfile.get_superformat().split(".")[-1]
        collection = docfile.get_type()
        medias.append({
            'name': docfile.get_full_name(),
            'file': '',
            'type': mime,
            'path': docfile.get_full_path(),
            'collection': collection,
            'size': docfile.get_size(),
            'loaded': False,
            # Files from the "Main" collection come pre-selected in the UI.
            'selected': 'checked=yes' if collection == "Main" else '',
        })
    return medias
def get_media_from_recid(recid):
    """Return a description dict for each latest file of record *recid*.

    @param recid: id of the record to inspect
    """
    def _describe(docfile):
        # Build the per-file metadata dict; 'selected' marks Main files.
        entry = {'name': docfile.get_full_name(),
                 'file': '',
                 'type': 'application/%s'
                         % docfile.get_superformat().split(".")[-1],
                 'path': docfile.get_full_path(),
                 'collection': docfile.get_type(),
                 'size': docfile.get_size(),
                 'loaded': False,
                 'selected': ''}
        if entry['collection'] == "Main":
            entry['selected'] = 'checked=yes'
        return entry

    return [_describe(docfile)
            for docfile in BibRecDocs(recid).list_latest_files()]
def record_has_arxiv_pdf(recid=None):
    """Tell whether record *recid* carries an arXiv PDF (.pdf or .pdfa) file."""
    if recid is None:
        return False
    arxiv_files = BibRecDocs(recid).list_latest_files(doctype="arXiv")
    return any(docfile.format.lower() in ('.pdf', '.pdfa')
               for docfile in arxiv_files)
def get_rawtext_from_record(record):
    """Return the extracted fulltext of *record*'s latest PDF, or '' on failure.

    @param record: object exposing a ``record_id`` attribute
    """
    bibrec = BibRecDocs(record.record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        return bibdoc.bibdoc.get_text()
    except Exception:
        # Best-effort: a missing/unextractable text must not break the
        # caller.  Narrowed from a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit.
        return ''
def download_one(recid, version):
    """Download given version of the PDF from arxiv

    Fetches the arXiv PDF for *recid*, compares it against the already
    attached arXiv file (if any) by md5, and attaches it as a new file or
    a new version when it differs.  Raises PdfNotAvailable when arXiv
    serves its "PDF unavailable" page, FoundExistingPdf when the attached
    file is already up to date.
    """
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        # Only the first arXiv id is processed; extra ids are just warned about.
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        # arXiv ids may contain '/', which is not filename-safe.
        filename_arxiv_id = arxiv_id.replace('/', '_')
        # NOTE(review): temp_file must stay referenced until the upload below
        # — NamedTemporaryFile deletes the file when the object is collected.
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # Check if it is not an html not found page
        # (arXiv answers small HTML bodies instead of HTTP errors).
        filesize = os.path.getsize(path)
        if filesize < 25000:
            f = open(path)
            try:
                for line in f:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()
            finally:
                f.close()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        # Compare against the first existing arXiv file, if one is attached.
        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            # NOTE(review): md5 of the *encoded path string* rather than the
            # plain path is passed here — confirm calculate_md5 expects a
            # filename and that .encode('utf-8') is intentional.
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                # A previous arXiv file exists: attach as a new version of it.
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def check_records(records):
    """Rewrite author fields (100/700) of Springer records from attached XML.

    For each record that has a .xml fulltext attached, the author list is
    re-extracted (JATS or NLM DTD) and the 100/700 fields are rebuilt with
    ORCID ($j), affiliation ($v), nationality and e-mail ($m) subfields.
    """
    for record in records:
        if not is_springer(record):
            continue
        rec_doc = BibRecDocs(int(record.record_id))
        for doc in rec_doc.list_latest_files():
            if doc.get_format() != '.xml':
                continue
            # ``with`` guarantees the handle is closed — the original
            # opened the file and never closed it.
            with open(doc.get_full_path()) as xml_file:
                content = xml_file.read()
            try:
                del record['100']
                del record['700']
                record.amended = True
            except Exception:
                # Fields may already be absent; nothing to delete then.
                pass
            first_author = True
            try:
                # JATS and NLM packages need different parsers.
                if "-//NLM//DTD JATS" in content:
                    authors = JATSParser().get_authors(parseString(content))
                else:
                    authors = NLMParser().get_authors(parseString(content))
            except Exception:
                record.warn('Problem with parsing XML.')
                continue
            for author in authors:
                if author.get('surname'):
                    subfields = [('a', '%s, %s' % (author.get('surname'),
                                                   author.get('given_name') or
                                                   author.get('initials', '')))]
                else:
                    subfields = [('a', '%s' % (author.get('name', '')))]
                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))
                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))
                    # Derive nationality subfields from the affiliations.
                    add_nations_field(subfields)
                if author.get('email'):
                    subfields.append(('m', author['email']))
                if first_author:
                    record.add_field('100__', value='', subfields=subfields)
                    first_author = False
                else:
                    record.add_field('700__', value='', subfields=subfields)
def get_filetypes(recid):
    """
    Returns filetypes extensions associated with given record.

    Takes as a parameter the recid of a record.
    @param url_field: recid of a record
    """
    types = []
    for docfile in BibRecDocs(recid).list_latest_files():
        types.append(_get_filetype(docfile.format))
    return types
def get_rawtext_from_record_id(record_id):
    """Return the extracted fulltext of the latest PDF of *record_id*, or ''.

    @param record_id: numeric id of the record to read
    """
    bibrec = BibRecDocs(record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        return bibdoc.bibdoc.get_text()
    except Exception:
        # Best-effort: narrowed from a bare ``except:`` which also
        # swallowed KeyboardInterrupt/SystemExit.
        return ''
def get_pdfa_record(self, path=None):
    """Build MARCXML attaching the PDF/A found in a VTEX package at *path*.

    Matches the article's DOI against existing records; attaches
    'main_a-2b.pdf' as PDF/A (or falls back to plain 'main.pdf'), and
    re-declares the other formats already attached to the matched record.
    @return: MARCXML string for bibupload
    """
    from invenio.search_engine import perform_request_search
    xml_doc = self.get_article(path)
    rec = create_record()
    dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
        dummy, doi = self.get_publication_information(xml_doc)
    # Look up an existing, non-deleted record carrying this DOI.
    recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi, ))
    if recid:
        record_add_field(rec, '001', controlfield_value=recid[0])
    else:
        # No match: still emit the DOI so the record can be created anyway.
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        message = ('Adding PDF/A. No paper with this DOI: '
                   '%s. Trying to add it anyway.') % (doi, )
        self.logger.error(message)
    try:
        if exists(join(path, 'main_a-2b.pdf')):
            # 'main_a-2b.pdf' is the PDF/A variant shipped in the package.
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main_a-2b.pdf')),
                                        ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
        elif exists(join(path, 'main.pdf')):
            # Fall back to the plain PDF when no PDF/A is present.
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main.pdf'))])
            message = 'No PDF/A in VTEX package for record: ' + doi
            self.logger.debug(message)
        else:
            message = "Record %s doesn't contain PDF file." % (doi, )
            raise MissingFFTError(message)
    except MissingFFTError:
        message = "Elsevier paper: %s is missing PDF." % (doi, )
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)
    ## copy other formats to bibupload file
    if recid:
        from invenio.bibdocfile import BibRecDocs
        record = BibRecDocs(recid[0])
        for bibfile in record.list_latest_files():
            if bibfile.get_format() != '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())])
    return record_xml_output(rec)
def get_pdfa_record(self, path=None):
    """Build MARCXML attaching the PDF/A found in a VTEX package at *path*.

    Matches the article's DOI against existing records, attaches the
    package's PDF/A (or plain PDF as fallback) as an FFT, and re-declares
    the other formats already attached to the matched record.
    @return: MARCXML string for bibupload
    """
    from invenio.search_engine import perform_request_search
    xml_doc = self.get_article(path)
    rec = create_record()
    dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
        dummy, doi = self.get_publication_information(xml_doc)
    # Look up an existing, non-deleted record carrying this DOI.
    recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"'
                                   % (doi,))
    if recid:
        record_add_field(rec, '001', controlfield_value=recid[0])
    else:
        # No match: still emit the DOI so the record can be created anyway.
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        message = ('Adding PDF/A. No paper with this DOI: '
                   '%s. Trying to add it anyway.') % (doi,)
        self.logger.error(message)
    try:
        if exists(join(path, 'main_a-2b.pdf')):
            # 'main_a-2b.pdf' is the PDF/A variant shipped in the package.
            record_add_field(
                rec, 'FFT',
                subfields=[('a', join(path, 'main_a-2b.pdf')),
                           ('n', 'main'),
                           ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi,))
        elif exists(join(path, 'main.pdf')):
            # Fall back to the plain PDF when no PDF/A is present.
            record_add_field(
                rec, 'FFT',
                subfields=[('a', join(path, 'main.pdf'))])
            message = 'No PDF/A in VTEX package for record: ' + doi
            self.logger.debug(message)
        else:
            message = "Record %s doesn't contain PDF file." % (doi,)
            raise MissingFFTError(message)
    except MissingFFTError:
        message = "Elsevier paper: %s is missing PDF." % (doi,)
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)
    ## copy other formats to bibupload file
    if recid:
        from invenio.bibdocfile import BibRecDocs
        record = BibRecDocs(recid[0])
        for bibfile in record.list_latest_files():
            if bibfile.get_format() != '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())]
                                 )
    return record_xml_output(rec)
def check_records(records):
    """Rewrite author fields (100/700) of Springer records from attached XML.

    Authors are re-extracted from the attached .xml fulltext (JATS or NLM
    DTD) and written back with ORCID ($j), affiliation ($v), nationality
    and e-mail ($m) subfields.
    """
    for record in records:
        if not is_springer(record):
            continue
        rec_doc = BibRecDocs(int(record.record_id))
        for doc in rec_doc.list_latest_files():
            if doc.get_format() != '.xml':
                continue
            # ``with`` closes the handle — the original leaked the
            # file object it opened.
            with open(doc.get_full_path()) as xml_file:
                content = xml_file.read()
            try:
                del record['100']
                del record['700']
                record.amended = True
            except Exception:
                # Fields may already be absent; nothing to delete then.
                pass
            first_author = True
            try:
                # JATS and NLM packages need different parsers.
                if "-//NLM//DTD JATS" in content:
                    authors = JATSParser().get_authors(parseString(content))
                else:
                    authors = NLMParser().get_authors(parseString(content))
            except Exception:
                record.warn('Problem with parsing XML.')
                continue
            for author in authors:
                if author.get('surname'):
                    subfields = [('a', '%s, %s'
                                  % (author.get('surname'),
                                     author.get('given_name') or
                                     author.get('initials', '')))]
                else:
                    subfields = [('a', '%s' % (author.get('name', '')))]
                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))
                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))
                    # Derive nationality subfields from the affiliations.
                    add_nations_field(subfields)
                if author.get('email'):
                    subfields.append(('m', author['email']))
                if first_author:
                    record.add_field('100__', value='', subfields=subfields)
                    first_author = False
                else:
                    record.add_field('700__', value='', subfields=subfields)
def get_filenames(recid):
    """
    Returns names of the files associated with specific record and their
    derivatives. Takes as a parameter the recid of a record.
    Example:
    input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
    output: ['thesis.ps.gz', 'thesis.ps', 'thesis', 'random.pdf', 'random']

    @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    names = []
    # Flatten the per-file name lists with extend(); unlike the previous
    # ``reduce`` without an initializer, this also works (returning [])
    # for records that have no attached files.
    for docfile in docs.list_latest_files():
        names.extend(_get_filenames(docfile.name + docfile.format))
    return names
def get_filenames(recid):
    """
    Returns names of the files associated with specific record and their
    derivatives. Takes as a parameter the recid of a record.
    Example:
    input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
    output: ['thesis.ps.gz', 'thesis.ps', 'thesis', 'random.pdf', 'random']

    @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    # Flatten the per-file name lists; ``reduce`` without an initializer
    # raised TypeError for records with no attached files.
    names = []
    for docfile in docs.list_latest_files():
        names.extend(_get_filenames(docfile.name + docfile.format))
    return names
def has_or_had_format(recid, format):
    """Report whether *recid* currently has a file in *format*.

    Returns "yes" when the latest files include the format,
    "<b>diff. v.</b>" when only an older version has it, and
    "<b>NO</b>" when no version ever had it.
    """
    doc = BibRecDocs(recid)
    latest_formats = [docfile.format for docfile in doc.list_latest_files()]
    status = 0
    if format in latest_formats:
        status = 1
    else:
        # Not in the latest version: scan every version of every bibdoc.
        for bibdoc in doc.list_bibdocs():
            for docfile in bibdoc.docfiles:
                if format == docfile.format:
                    status = 2
    if status == 0:
        return "<b>NO</b>"
    elif status == 1:
        return "yes"
    elif status == 2:
        return "<b>diff. v.</b>"
def get_files_from_bibdoc(recid):
    """
    Retrieves using BibDoc all the files related with a given record

    @param recid
    @return List of dictionaries containing all the information stored
            inside BibDoc if the current record has files attached, the
            empty list otherwise
    """
    if not recid or recid < 0:
        return []
    from invenio.bibdocfile import BibRecDocs, InvenioBibDocFileError
    try:
        bibrecdocs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        return []
    files = []
    for afile in bibrecdocs.list_latest_files():
        # One metadata dict per latest attached file.
        files.append({
            'comment': afile.get_comment(),
            'description': afile.get_description(),
            'eformat': afile.get_format(),
            'full_name': afile.get_full_name(),
            'full_path': afile.get_full_path(),
            'magic': afile.get_magic(),
            'name': afile.get_name(),
            'path': afile.get_path(),
            'size': afile.get_size(),
            'status': afile.get_status(),
            'subformat': afile.get_subformat(),
            'superformat': afile.get_superformat(),
            'type': afile.get_type(),
            'url': afile.get_url(),
            'version': afile.get_version(),
        })
    return files
def _get_fulltext_args_from_recids(recids, task_info):
    """Get list of fulltext locations for input recids
    @param recids: (list) list of recids
    @return: (list) list of strings of the form 'recid:fulltext dir'
    """
    fulltext_arguments = []
    last_updated = None
    if task_info:
        last_updated = task_info['last_updated']
    if recids:
        if last_updated:
            # NOTE(review): SQL built by string interpolation. The recids are
            # str()-converted ints, but last_updated is spliced in verbatim —
            # confirm it can never carry untrusted content; prefer run_sql
            # parameter binding.
            q_get_outdated = "SELECT id FROM bibrec WHERE id IN (%s) AND " \
                             "modification_date > '%s';" % \
                             (",".join(map(lambda r: str(r), recids)),
                              last_updated)
            ## Get records for reference extraction
            changed_records = run_sql(q_get_outdated)
        else:
            ## Make list of lists of input recids
            # (mirrors the row shape returned by run_sql above)
            changed_records = [[r] for r in recids]
        if changed_records:
            for record_row in changed_records:
                record = record_row[0]
                bibrecdoc = BibRecDocs(record)
                ## Get the latest 'document items' for this record
                bibdocfiles = bibrecdoc.list_latest_files()
                if bibdocfiles:
                    doc_types = {
                        'pdf': [],
                        'pdfa': [],
                        'text': [],
                    }
                    bibdoc = bibrecdoc.list_bibdocs()
                    ## Get the text file for this record
                    # (only the first bibdoc is checked for extracted text)
                    if bibdoc and bibdoc[0].has_text():
                        doc_types['text'].append(bibdoc[0].get_text_path())
                    ## For each file, of a record
                    for doc in bibdocfiles:
                        # Classify the file with the external 'gfile' tool;
                        # single quotes in paths are escaped for the shell.
                        pipe_gfile = \
                            os.popen("%s '%s'" \
                                     % (CFG_PATH_GFILE,
                                        doc.get_full_path().replace("'", "\\'")), "r")
                        res_gfile = pipe_gfile.readline()
                        pipe_gfile.close()
                        ## Look for : 1. Unstamped, original uploaded-by-user, pdf files
                        ## 2. Stamped, processed, pdf files
                        ## 3. Text files
                        if (res_gfile.lower().find('pdfa') != -1):
                            doc_types['pdfa'].append(doc.get_full_path())
                        elif (res_gfile.lower().find('pdf') != -1):
                            doc_types['pdf'].append(doc.get_full_path())
                    ## Choose the type in this order of priority
                    type_of_choice = doc_types['text'] or doc_types[
                        'pdf'] or doc_types['pdfa']
                    if type_of_choice:
                        fulltext_arguments.append(
                            str(record).rstrip(".") + ':' + type_of_choice[0])
                    else:
                        write_message("W: No pdf/text file for recid %s" % \
                                      str(record), stream=sys.stdout, verbose=0)
                else:
                    write_message("W: No files exist for recid %s" % \
                                  str(record), stream=sys.stdout, verbose=0)
    elif task_info:
        ## In the event that no records have been modified since the
        ## last reference extraction
        write_message("No newly modified records for extraction-job '%s'." \
                      % task_info['name'], stream=sys.stdout, verbose=0)
    return fulltext_arguments
def bst_scoap3_importer():
    """Import/refresh SCOAP3 fulltext PDFs into matching INSPIRE records.

    Fetches the FFT CSV feed from repo.scoap3.org, matches each SCOAP3
    entry to exactly one INSPIRE record (by arXiv id and/or DOI), and
    schedules two bibuploads: one appending brand-new files, one
    correcting records whose attached PDF checksum changed.
    """
    task_sleep_now_if_required(can_stop_too=True)
    f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv')
    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"
    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    f.readline()  ## Let's strip the header line
    for d in f:
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, type, doi = [
            x.strip() for x in d.split(',')
        ]
        write_message(d.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        # Match the SCOAP3 entry to exactly one INSPIRE HEP record.
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s"
                                                    % (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid), stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        write_message(
                            "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message(
                            "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            # No attached SCOAP3 pdf at all: append the file.
            write_message("... OK: need to add new file to INSPIRE record %s"
                          % inspire_record)
            action = "APPEND"
        if action:
            if type == '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            record_add_field(rec, '001',
                             controlfield_value=str(inspire_record))
            if action == "UPDATE":
                line_count_update += 1
                print >> out_update, record_xml_output(rec)
            elif action == "APPEND":
                line_count_new += 1
                print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()
    if line_count_new:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s"
                      % (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        # BUGFIX: this log line previously reported name_new, i.e. the
        # --append file, instead of the --correct file actually scheduled.
        write_message("Scheduled bibupload --correct %s with ID #%s"
                      % (name_update, id))
    else:
        remove(name_update)
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
    701/702     $y with True/False if the signature is claimed
                $z with Record ID of institution
                $w with BAI of linked Profile
    371/110     $z with Record ID of institution
    119/502     $z with Record ID of institution
    999C5       $0 with on the fly discovered Record IDs (not for books)
    773         $0 with Record ID of corresponding Book or Proceeding or Report
                $1 with Record ID of corresponding Journal
                $2 with Record ID of corresponding Conference
    693/710     $0 with Record ID of corresponding experiment
    """
    record = bfo.get_record()
    recid = bfo.recID
    # Let's filter hidden fields
    if acc_authorize_action(bfo.user_info, "runbibedit")[0]:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    else:
        # Let's add bibdoc info: expose each latest attached file as an FFT
        # field for authorized users.
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ("a", bibdocfile.fullpath),
                ("d", bibdocfile.description or ""),
                ("f", bibdocfile.format or ""),
                ("n", bibdocfile.name or ""),
                ("r", bibdocfile.status or ""),
                ("s", bibdocfile.cd.strftime("%Y-%m-%d %H:%M:%S")),
                ("t", bibdocfile.get_type()),
                ("v", str(bibdocfile.version)),
                ("z", bibdocfile.comment or ""),
            ]
            for flag in bibdocfile.flags:
                fft.append(("o", flag))
            record_add_field(record, "FFT", subfields=fft)
    is_institution = "INSTITUTION" in [collection.upper()
                                       for collection in bfo.fields("980__a")]
    if "100" in record or "700" in record:
        # Map author-name -> (personid, claim flag) for this paper.
        signatures = dict(
            (name, (personid, flag))
            for name, personid, flag in run_sql(
                "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2",
                (recid,)
            )
        )
        # Let's add signatures
        for field in (
            record_get_field_instances(record, "100")
            + record_get_field_instances(record, "700")
            + record_get_field_instances(record, "701")
            + record_get_field_instances(record, "702")
        ):
            subfields = field_get_subfield_instances(field)
            subfield_dict = dict(subfields)
            if "a" in subfield_dict:
                author_name = subfield_dict["a"]
                if "i" in subfield_dict:
                    # An INSPIRE id is present: resolve HepName directly.
                    inspire_id = subfield_dict["i"]
                    hepname_id = get_hepname_id_from_inspire_id(inspire_id)
                    if hepname_id:
                        subfields.append(("x", "%i" % hepname_id))
                        subfields.append(("y", "1"))
                else:
                    # Otherwise go through the disambiguation tables.
                    personid, flag = signatures.get(author_name, (None, None))
                    bai = get_personid_canonical_id().get(personid)
                    # NOTE(review): nesting of the $y append under ``if bai``
                    # reconstructed from upstream layout — confirm.
                    if bai:
                        subfields.append(("w", bai))
                        hepname_id = get_hepname_id(personid)
                        if hepname_id:
                            subfields.append(("x", "%i" % hepname_id))
                        subfields.append(("y", "%i" % (flag == 2)))
            # And matched affiliations
            if "u" in subfield_dict:
                for code, value in subfields:
                    if code == "u":
                        ids = get_institution_ids(value)
                        # Only annotate unambiguous matches.
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
    # Thesis institution
    for field in record_get_field_instances(record, "502"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "c" in subfield_dict:
            for code, value in subfields:
                if code == "c":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, "119"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "u" in subfield_dict:
            for code, value in subfields:
                if code == "u":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    # Enhance affiliation in HepNames and Jobs and Institutions
    for field in record_get_field_instances(record, "371"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "a" in subfield_dict:
            for code, value in subfields:
                if code == "a":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    for field in record_get_field_instances(record, "110"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if "x" in subfield_dict:
                for code, value in subfields:
                    if code == "x":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
        else:
            # In other collections institution is in a
            if "a" in subfield_dict:
                for code, value in subfields:
                    if code == "a":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
    # Enhance citation
    for field in record_get_field_instances(record, "999", ind1="C", ind2="5"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "0" not in subfield_dict:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(("0", str(matched_id)))
    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, "773"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "w":
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc="Conferences")
                if len(recids) == 1:
                    subfields.append(("2", str(recids.pop())))
                recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS'
                                                % value)
                if recid in recids:
                    # We remove this very record, since it can be a proceedings
                    recids.remove(recid)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "p":
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc="Journals")
                if len(recids) == 1:
                    subfields.append(("1", str(recids.pop())))
            elif code == "z":
                # ISBN
                recids = find_isbn({"ISBN": value})
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "r":
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, "693"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "e":
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, "710"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "g":
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Add Creation date:
    if "961" in record:
        del record["961"]
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid,)
    )[0]
    record_add_field(
        record,
        "961",
        subfields=[("c", creation_date.strftime("%Y-%m-%d")),
                   ("x", modification_date.strftime("%Y-%m-%d"))],
    )
    formatted_record = record_xml_output(record)
    if oai:
        # Re-namespace the XML for OAI-PMH output (marc: prefix + leader).
        formatted_record = formatted_record.replace(
            "<record>",
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace(
            '<record xmlns="http://www.loc.gov/MARC21/slim">',
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []
    list_of_ids = []
    # Parse "1,3,5-7"-style recid specifications into a flat int list.
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                # NOTE(review): range() excludes the upper bound, so "3-6"
                # yields 3,4,5 — confirm whether the high end should be
                # inclusive here.
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            # NOTE(review): same exclusive-upper-bound caveat as above.
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]
    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            # Default mode: collect arXiv ids from 037 fields whose $9 source
            # is 'arXiv'; tarballs are then fetched by arXiv id below.
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(
                            afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(
                            afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            # Filter mode: pick attached files matching the given
            # doctype/docname/docformat.
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_type() == doctype
                ]
            if docname:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_name() == docname
                ]
            if docformat:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_format() == docformat
                ]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])
    if doctype or docname or docformat:
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def get_formats(recid):
    """Return the formats of the latest files attached to record *recid*."""
    return [docfile.format
            for docfile in BibRecDocs(recid).list_latest_files()]
def bst_scoap3_importer(): f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv') fd1, name1 = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR) out_update = fdopen(fd1, 'w') fd2, name2 = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR) out_new = fdopen(fd2, 'w') print >> out_update, "<collection>" print >> out_new, "<collection>" line_count_new = 0 # to avoid empty bibupload line_count_update = 0 # to avoid empty bibupload for d in f: d = [x.strip() for x in d.split(',')] print d if d[0] not in ["recid", ''] and d[4] != "no pdf": inspire_record = perform_request_search(p="037:%s" % (d[1],), cc="HEP") try: if not len(inspire_record): raise IndexError elif len(inspire_record) > 1: raise IndexError else: action = 0 # do nothing rec = {} record = BibRecDocs(inspire_record[0]) for doc in record.list_latest_files(): if doc.format in ('.pdf', '.pdf;pdfa'): if doc.bibdoc.doctype is 'SCOAP3': if doc.checksum is d[3]: print "File alredy attached" else: action = 1 # update else: action = 2 # new if action: if d[5] == '.pdf;pdfa': record_add_field(rec, 'FFT', subfields=[('a', d[4]), ('n', 'scoap3-fulltext'), ('f', '.pdf;pdfa'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')]) else: record_add_field(rec, 'FFT', subfields=[('a', d[4]), ('n', 'scoap3-fulltext'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')]) record_add_field(rec, '001', controlfield_value=inspire_record[0]) if action == 1: line_count_update = line_count_update + 1 print >> out_update, record_xml_output(rec) elif action == 2: line_count_new = line_count_new + 1 print >> out_new, record_xml_output(rec) except IndexError: register_exception(alert_admin=True, prefix="ERROR - PDF import from SCOAP3. 
No record with: %s" % (d[1],)) continue except: register_exception(alert_admin=True, prefix="ERROR - PDF import from SCOAP3.") continue print >> out_update, "</collection>" print >> out_new, "</collection>" out_new.close() out_update.close() if line_count_new: task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-a", name2) if line_count_update: task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-c", name1)
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
    701/702 $y with True/False if the signature is claimed
            $z with Record ID of institution
            $w with BAI of linked Profile
    371/110 $z with Record ID of institution
    119/502 $z with Record ID of institution
    999C5 $0 with on the fly discovered Record IDs (not for books)
    773 $0 with Record ID of corresponding Book or Proceeding or Report
        $1 with Record ID of corresponding Journal
        $2 with Record ID of corresponding Conference
    693/710 $0 with Record ID of corresponding experiment

    @param bfo: BibFormatObject of the record being formatted
    @param oai: if true, rewrite the output with marc: namespace prefixes
        and a leader, for OAI-PMH dissemination
    @return: the enhanced record serialized as MARCXML (a string)
    """
    # Users without 'runbibedit' rights only get the public (filtered) view.
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info, 'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        # Deleted records are resurrected from the history for privileged users.
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()
    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    is_institution = 'INSTITUTION' in [collection.upper()
                                       for collection in bfo.fields('980__a')]
    signatures = {}
    if '100' in record or '700' in record:
        # Map author-name string -> (personid, claim flag) for this record.
        signatures = dict((name, (personid, flag))
                          for name, personid, flag in run_sql(
            "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2", (recid, )))
    # NOTE(review): throughout this function, subfields.append() mutates the
    # list returned by field_get_subfield_instances() while it is being
    # iterated; appended codes never match the tested code, so iteration
    # terminates, and the mutation presumably updates the field inside
    # `record` via the shared list reference -- verify against the
    # bibrecord API.
    # Let's add signatures
    for field in record_get_field_instances(record, '100') + \
            record_get_field_instances(record, '700') + \
            record_get_field_instances(record, '701') + \
            record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
            hepname_id = get_hepname_id(personid)
            if hepname_id:
                subfields.append(('x', '%i' % hepname_id))
            # flag == 2 means the signature is claimed.
            subfields.append(('y', '%i' % (flag == 2)))
        # And matched affiliations
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    # Only annotate on an unambiguous (single) match.
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Related journal
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(record, '371') + \
            record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C', ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))
    # Enhance related records
    for field in (record_get_field_instances(record, '780', ind1='0', ind2='2') +
                  record_get_field_instances(record, '785', ind1='0', ind2='2') +
                  record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'):
            # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):
            # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))
    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(
                        p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid, ))[0]
    record_add_field(record, '961',
                     subfields=[('x', creation_date.strftime('%Y-%m-%d')),
                                ('c', modification_date.strftime('%Y-%m-%d'))])
    formatted_record = record_xml_output(record)
    if oai:
        # Rewrite the plain MARCXML into namespaced marc: elements with a
        # leader, as required for OAI dissemination.
        formatted_record = formatted_record.replace(
            "<record>",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace(
            "<record xmlns=\"http://www.loc.gov/MARC21/slim\">",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param recids: (string) the record id or ids; accepts a single id
        ('12'), a comma-separated list ('1,3,7') and inclusive ranges
        ('4-9'), possibly mixed ('1,4-9,12')
    @param sdir: (string) where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    # Expand the recids specification into a flat list of integer ids.
    list_of_ids = []
    if ',' in recids:
        for recid in recids.split(','):
            if '-' in recid:
                low, high = recid.split('-')
                # BUGFIX: 'low-high' is an inclusive range; the original
                # range(low, high) silently dropped the upper bound.
                list_of_ids.extend(range(int(low), int(high) + 1))
            else:
                list_of_ids.append(int(recid))
    else:
        if '-' in recids:
            low, high = recids.split('-')
            # BUGFIX: include the upper bound (see above).
            list_of_ids = range(int(low), int(high) + 1)
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            # Default mode: collect arXiv ids (037 with $9 == 'arXiv') and
            # resolve them to tarballs at the end.
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            # Filtered mode: select attached files matching the criteria.
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files
                             if docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files
                             if docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files
                             if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def bst_scoap3_importer():
    """Import from SCOAP3.

    Fetch the SCOAP3 FFT CSV feed, match each entry to a single INSPIRE
    ("HEP") record via arXiv id and/or DOI, and schedule bibupload
    corrections that attach or revise the SCOAP3 fulltext.  Rows that do
    not match exactly one record are logged and skipped.
    """
    try:
        request = requests.get(
            'http://repo.scoap3.org/ffts_for_inspire.py/csv')
    except (HTTPError, ConnectionError, Timeout):
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)
    # Two output batches: revised files and brand-new files.
    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"
    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    # We strip the first line (the CSV header).
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in line.split(',')
        ]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s"
                                                    % (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid), stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files('SCOAP3'):
            if doc.format == file_format:
                if doc.checksum == checksum:
                    # Same checksum: nothing to do.
                    write_message(
                        "... OK: file already attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                else:
                    write_message(
                        "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                    action = "UPDATE"
                break
        else:
            # No SCOAP3 file in this format yet.
            write_message("... OK: need to add new file to INSPIRE record %s"
                          % inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            record_add_field(rec, '001',
                             controlfield_value=str(inspire_record))
            if action == "UPDATE":
                line_count_update += 1
                print >> out_update, record_xml_output(rec)
            elif action == "APPEND":
                line_count_new += 1
                print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()
    if line_count_new:
        # We use correct here instead of append to deal with potential sync issues.
        # Basically BibUpload should handle "new" corrections as "append" if it is not there.
        # NOTE: renamed local from 'id' to avoid shadowing the builtin.
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-c", name_new)
        write_message("Scheduled bibupload --correct %s with ID #%s"
                      % (name_new, task_id))
    else:
        remove(name_new)
    if line_count_update:
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s"
                      % (name_update, task_id))
    else:
        remove(name_update)
def bst_scoap3_importer():
    """Import from SCOAP3.

    Fetch the SCOAP3 FFT CSV feed, match each entry to a single INSPIRE
    ("HEP") record via arXiv id and/or DOI, and schedule bibupload jobs
    that append or revise the SCOAP3 fulltext attachment.
    """
    try:
        request = requests.get('http://repo.scoap3.org/ffts_for_inspire.py/csv')
    except (HTTPError, ConnectionError, Timeout):
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)
    fd_update, name_update = mkstemp(
        suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR
    )
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(
        suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR
    )
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"
    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    # We strip the first line (the CSV header).
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        # NOTE: renamed local from 'type' to 'file_format' to avoid
        # shadowing the builtin.
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = \
            [x.strip() for x in line.split(',')]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s"
                                                    % (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message("ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                          % (arxiv_id, doi, recid, list(inspire_record)),
                          stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message("WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                          % (arxiv_id, doi, recid), stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        # Same checksum: nothing to do.
                        write_message("... OK: file already attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                                      % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message("... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                                      % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            # No SCOAP3 PDF attached yet.
            write_message("... OK: need to add new file to INSPIRE record %s"
                          % inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            record_add_field(rec, '001',
                             controlfield_value=str(inspire_record))
            if action == "UPDATE":
                line_count_update += 1
                print >> out_update, record_xml_output(rec)
            elif action == "APPEND":
                line_count_new += 1
                print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()
    if line_count_new:
        # NOTE: renamed local from 'id' to avoid shadowing the builtin.
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s"
                      % (name_new, task_id))
    else:
        remove(name_new)
    if line_count_update:
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-c", name_update)
        # BUGFIX: the original logged name_new here instead of name_update.
        write_message("Scheduled bibupload --correct %s with ID #%s"
                      % (name_update, task_id))
    else:
        remove(name_update)
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
    701/702 $y with True/False if the signature is claimed
            $z with Record ID of institution
            $w with BAI of linked Profile
    371/110 $z with Record ID of institution
    119/502 $z with Record ID of institution
    999C5 $0 with on the fly discovered Record IDs (not for books)
    773 $0 with Record ID of corresponding Book or Proceeding or Report
        $1 with Record ID of corresponding Journal
        $2 with Record ID of corresponding Conference
    693/710 $0 with Record ID of corresponding experiment

    @param bfo: BibFormatObject of the record being formatted
    @param oai: if true, rewrite the output with marc: namespace prefixes
        and a leader, for OAI-PMH dissemination
    @return: the enhanced record serialized as MARCXML (a string)
    """
    # Users without 'runbibedit' rights only get the public (filtered) view.
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info, 'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        # Deleted records are resurrected from the history for privileged users.
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()
    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    is_institution = 'INSTITUTION' in [collection.upper()
                                       for collection in bfo.fields('980__a')]
    signatures = {}
    if '100' in record or '700' in record:
        # Map author-name string -> (personid, claim flag) for this record.
        signatures = dict((name, (personid, flag))
                          for name, personid, flag in run_sql(
            "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2", (recid, )))
    # NOTE(review): throughout this function, subfields.append() mutates the
    # list returned by field_get_subfield_instances() while it is being
    # iterated; appended codes never match the tested code, so iteration
    # terminates, and the mutation presumably updates the field inside
    # `record` via the shared list reference -- verify against the
    # bibrecord API.
    # Let's add signatures
    for field in record_get_field_instances(record, '100') + \
            record_get_field_instances(record, '700') + \
            record_get_field_instances(record, '701') + \
            record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
            hepname_id = get_hepname_id(personid)
            if hepname_id:
                subfields.append(('x', '%i' % hepname_id))
            # flag == 2 means the signature is claimed.
            subfields.append(('y', '%i' % (flag == 2)))
        # And matched affiliations
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    # Only annotate on an unambiguous (single) match.
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Related journal
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(record, '371') + \
            record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C', ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))
    # Enhance related records
    for field in (record_get_field_instances(record, '780', ind1='0', ind2='2') +
                  record_get_field_instances(record, '785', ind1='0', ind2='2') +
                  record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'):
            # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):
            # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))
    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid,))[0]
    record_add_field(record, '961',
                     subfields=[('x', creation_date.strftime('%Y-%m-%d')),
                                ('c', modification_date.strftime('%Y-%m-%d'))])
    formatted_record = record_xml_output(record)
    if oai:
        # Rewrite the plain MARCXML into namespaced marc: elements with a
        # leader, as required for OAI dissemination.
        formatted_record = formatted_record.replace(
            "<record>",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace(
            "<record xmlns=\"http://www.loc.gov/MARC21/slim\">",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
def _get_fulltext_args_from_recids(recids, task_info):
    """Get list of fulltext locations for input recids

    For each (possibly recently-modified) record, pick the best available
    fulltext in priority order: extracted text, then plain PDF, then PDF/A.

    @param recids: (list) list of recids
    @param task_info: (dict or None) extraction-job state; when set, its
        'last_updated' timestamp restricts the work to records modified
        since the previous run, and 'name' labels log messages
    @return: (list) list of strings of the form 'recid:fulltext dir'
    """
    fulltext_arguments = []
    last_updated = None
    if task_info:
        last_updated = task_info['last_updated']
    if recids:
        if last_updated:
            # NOTE(review): query is built by string interpolation; recids
            # are ints but last_updated is embedded verbatim -- presumably
            # trusted task state, confirm it cannot carry user input.
            q_get_outdated = "SELECT id FROM bibrec WHERE id IN (%s) AND " \
                             "modification_date > '%s';" % \
                             (",".join(map(lambda r: str(r), recids)),
                              last_updated)
            ## Get records for reference extraction
            changed_records = run_sql(q_get_outdated)
        else:
            ## Make list of lists of input recids
            # Shaped as rows so the loop below can treat both sources alike.
            changed_records = [[r] for r in recids]
        if changed_records:
            for record_row in changed_records:
                record = record_row[0]
                bibrecdoc = BibRecDocs(record)
                ## Get the latest 'document items' for this record
                bibdocfiles = bibrecdoc.list_latest_files()
                if bibdocfiles:
                    doc_types = {'pdf' : [], 'pdfa' : [], 'text' : [],}
                    bibdoc = bibrecdoc.list_bibdocs()
                    ## Get the text file for this record
                    # Only the first bibdoc is probed for extracted text.
                    if bibdoc and bibdoc[0].has_text():
                        doc_types['text'].append(bibdoc[0].get_text_path())
                    ## For each file, of a record
                    for doc in bibdocfiles:
                        # Classify the file via the external 'gfile' tool;
                        # single quotes in the path are escaped for the shell.
                        pipe_gfile = \
                            os.popen("%s '%s'" \
                                     % (CFG_PATH_GFILE,
                                        doc.get_full_path().replace("'", "\\'")), "r")
                        res_gfile = pipe_gfile.readline()
                        pipe_gfile.close()
                        ## Look for : 1. Unstamped, original uploaded-by-user, pdf files
                        ##            2. Stamped, processed, pdf files
                        ##            3. Text files
                        if (res_gfile.lower().find('pdfa') != -1):
                            doc_types['pdfa'].append(doc.get_full_path())
                        elif (res_gfile.lower().find('pdf') != -1):
                            doc_types['pdf'].append(doc.get_full_path())
                    ## Choose the type in this order of priority
                    type_of_choice = doc_types['text'] or doc_types['pdf'] \
                        or doc_types['pdfa']
                    if type_of_choice:
                        fulltext_arguments.append(
                            str(record).rstrip(".")+':'+type_of_choice[0])
                    else:
                        write_message("W: No pdf/text file for recid %s" % \
                                      str(record), stream=sys.stdout, verbose=0)
                else:
                    write_message("W: No files exist for recid %s" % \
                                  str(record), stream=sys.stdout, verbose=0)
        elif task_info:
            ## In the event that no records have been modified since the
            ## last reference extraction
            write_message("No newly modified records for extraction-job '%s'." \
                          % task_info['name'], stream=sys.stdout, verbose=0)
    return fulltext_arguments
def retrieve_random_sample(possible_ids, directory):
    """Retrieve a sample PDF from a given set of record ids.

    Pops ids off C{possible_ids} (mutating the list, consuming from the
    end) until a record with an attached '.pdf' file is found; that PDF is
    saved as <directory>/<recid>.pdf.

    @param possible_ids: (list) candidate record ids; consumed in place
    @param directory: (string) destination directory for the sample file
    @return: the recid whose PDF was saved, or None if no candidate had one
    """
    while len(possible_ids) > 0:
        recid = possible_ids.pop()
        brd = BibRecDocs(recid)
        # Keep the LAST '.pdf' bibdocfile, mirroring the original
        # reduce()-over-filter() behaviour.
        pdf_bibdocfile = None
        for bdf in brd.list_latest_files():
            if bdf.format == ".pdf":
                pdf_bibdocfile = bdf
        if pdf_bibdocfile:
            file_to_save = os.path.join(directory, "%i.pdf" % (recid,))
            # BUGFIX: write PDF bytes in binary mode ('wb', not 'w') and use
            # a context manager so the handle is closed even on error.
            with open(file_to_save, "wb") as f:
                f.write(pdf_bibdocfile.get_content())
            return recid
    return None  # there were no more samples