def national_authors_list(req, search_country):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=national_authors_list.csv')
    ids = perform_request_search(p="country:'%s'" % (search_country,))
    req.write("#;RECID;Title;Creation date;Publisher;Total # of authors;"
              "Authors name(given country only);Authors country;"
              "Authors affiliations\n")
    for number, recid in enumerate(ids):
        rec = get_record(recid)
        title = record_get_field_value(rec, '245', code="a")
        creation_date = get_creation_date(recid)
        publisher = record_get_field_value(rec, '980', code="b")
        authors = []
        author_count = 0
        for f in ['100', '700']:
            if f in rec:
                for auth in rec[f]:
                    author_count += 1
                    aff = ''
                    name = ''
                    country = ''
                    hit = 0
                    for subfield, value in auth[0]:
                        if subfield == 'a':
                            name = value
                        if subfield in ['v', 'u']:
                            if aff:
                                aff += ', ' + value
                            else:
                                aff = value
                        if subfield == 'w':
                            if country:
                                country += ', ' + value
                            else:
                                country = value
                            if search_country in value:
                                hit = 1
                    if hit:
                        authors.append({'name': name,
                                        'affiliation': aff.replace('\n', ''),
                                        'country': country})
        for i, author in enumerate(authors):
            if i == 0:
                req.write("%s;%s;%s;%s;%s;%s;%s;%s;%s\n"
                          % (number + 1, recid, title.replace('\n', ''),
                             creation_date, publisher, author_count,
                             author['name'], author['country'],
                             author['affiliation']))
            else:
                req.write(";;;;;;%s;%s;%s\n"
                          % (author['name'], author['country'],
                             author['affiliation']))
def index(req):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=scoap3_records_info.csv')
    req.write("SCOAP3 record id; Journal; Creation date; Modification date; "
              "Title; Authors; Publication info\n")
    for key, value in JOURNALS.iteritems():
        recids = perform_request_search(c=value)
        for recid in recids:
            rec = get_record(recid)
            title = rec['245'][0][0][0][1].strip()
            creation_date = get_creation_date(recid)
            modification_date = get_modification_date(recid)
            authors = rec['100'][0][0][0][1]
            if '700' in rec:
                for author in rec['700']:
                    authors += ' / %s' % (author[0][0][1],)
            publication_info = ''
            if '733' in rec:
                publication_info += "%s %s (%s) %s" % (rec['733'][0][0][0][1],
                                                       rec['733'][0][0][1][1],
                                                       rec['733'][0][0][2][1],
                                                       rec['733'][0][0][3][1])
            if '024' in rec:
                publication_info += " %s" % (rec['024'][0][0][0][1],)
            if '037' in rec:
                publication_info += " %s" % (rec['037'][0][0][0][1],)
            req.write("%s; %s; %s; %s; %s; %s; %s\n"
                      % (recid, value, creation_date, modification_date,
                         title, authors, publication_info))
def get_recids_changes(last_recid, max_recs=10000):
    search_op = '>'
    if last_recid == -1:
        l = list(dbquery.run_sql("SELECT id FROM bibrec "
                                 "ORDER BY creation_date ASC LIMIT 1"))
        search_op = '>='
    else:
        # let's make sure we have a valid recid (or get the closest valid one)
        l = list(dbquery.run_sql("SELECT id FROM bibrec WHERE id >= %s LIMIT 1",
                                 (last_recid,)))
    if not len(l):
        return
    last_recid = l[0][0]

    # there is no api to get this (at least i haven't found it)
    mod_date = search_engine.get_modification_date(last_recid,
                                                   fmt="%Y-%m-%d %H:%i:%S")
    if not mod_date:
        return
    modified_records = list(dbquery.run_sql(
        "SELECT id, modification_date, creation_date FROM bibrec "
        "WHERE modification_date " + search_op + " %s LIMIT %s",
        (mod_date, max_recs)))

    out = {'DELETED': [], 'CHANGED': [], 'ADDED': []}
    for recid, mod_date, create_date in modified_records:
        if mod_date == create_date:
            out['ADDED'].append(recid)
        else:
            rec = search_engine.get_record(recid)
            status = bibrecord.record_get_field_value(rec, tag='980', code='c')
            if status == 'DELETED':
                out['DELETED'].append(recid)
            else:
                out['CHANGED'].append(recid)
    return out
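# Usage sketch (not part of the original module): illustrates the shape of
# the dict returned by get_recids_changes. Assumes a configured Invenio
# instance; the checkpoint value -1 means "start from the oldest record".
def _example_iter_changes():
    changes = get_recids_changes(-1, max_recs=100)
    if changes:
        for status in ('ADDED', 'CHANGED', 'DELETED'):
            print "%s: %s record(s)" % (status, len(changes[status]))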
def create_xml(recid=None, osti_id=None, doi=None):
    osti_exists = False
    doi_exists = False
    osti_mismatch = False
    mismatches = []
    osti_subfields = [('9', 'OSTI'), ('a', osti_id)]
    record = get_record(recid)
    record_link = '<a href="http://inspirehep.net/record/%s">%s</a>' \
                  % (str(recid), str(recid))
    append_record = {}
    additions = False
    errors = None
    for item in BibFormatObject(recid).fields('035__'):
        if item.has_key('9') and item.has_key('a'):
            if item['9'] == 'OSTI' and item['a'] == osti_id:
                osti_exists = True
            elif item['9'] == 'OSTI' and item['a'] != osti_id:
                osti_mismatch = True
                mismatches.append(item['a'])
    for item in BibFormatObject(recid).fields('0247_'):
        if item.has_key('2') and item.has_key('a'):
            if item['2'] == 'DOI' and item['a'] == doi:
                doi_exists = True
    if osti_exists is False and osti_mismatch is True:
        print str(recid), "already has a different OSTI ID"
        errors = ("doi %s in record %s should match OSTI ID %s, but the "
                  "record already contains OSTI ID(s) %s<br />"
                  % (doi, record_link, osti_id, ','.join(mismatches)))
        return errors
    if doi_exists is False and osti_exists is True:
        print str(recid), "contains an OSTI ID but no doi"
        no_doi = "%s contains OSTI ID %s but not doi %s<br />" \
                 % (record_link, osti_id, doi)
        return no_doi
    if osti_exists is False and osti_mismatch is False:
        record_add_field(append_record, '001', controlfield_value=str(recid))
        record_add_field(append_record, '035', '', '',
                         subfields=osti_subfields)
        print "%s: added 035__a:%s" % (str(recid), osti_id)
        return print_rec(append_record)
def create_xml(recid):
    correct_record = {}
    tag = '8564_'
    record = get_record(recid)
    flag = None
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    field_instances = record_get_field_instances(record, tag[0:3],
                                                 tag[3], tag[4])
    for field_instance in field_instances:
        correct_subfields = []
        for c, v in field_instance[0]:
            matchObj = re.search(
                r'inspirehep\.net/record/\d+/files/fermilab-thesis-.*?\.pdf',
                v, flags=re.IGNORECASE)
            if matchObj:
                flag = True
                correct_subfields.append(('y', 'Fulltext'))
            correct_subfields.append((c, v))
        record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                         subfields=correct_subfields)
    if flag:
        return print_rec(correct_record)
    else:
        return None
def create_xml(recid): """ Searches for duplicate instances of 773 and keeps the good one. """ tag = '773__' tag_value = tag + 'p' journal = get_fieldvalues(recid, tag_value) if len(journal) == 2 and journal[0] == journal[1]: record = get_record(recid) correct_record = {} record_add_field(correct_record, '001', \ controlfield_value=str(recid)) field_instances = record_get_field_instances(record, \ tag[0:3], tag[3], tag[4]) correct_subfields = [] c_value = False for field_instance in field_instances: for code, value in field_instance[0]: if value == 'To appear in the proceedings of': pass elif (code, value) not in correct_subfields: if code == 'c': if c_value: if len(value) > len(c_value): c_value = value else: c_value = value else: correct_subfields.append((code, value)) if c_value: correct_subfields.append(('c', c_value)) record_add_field(correct_record, tag[0:3], tag[3], tag[4], \ subfields=correct_subfields) return print_rec(correct_record) return None
def create_xml(recid, tags): """Create xml file to replace to 100, 700 block.""" record = get_record(recid) correct_record = {} record_add_field(correct_record, '001', controlfield_value=str(recid)) flag = None for tag in tags: field_instances = record_get_field_instances(record, tag[0:3], \ tag[3], tag[4]) correct_subfields = [] for field_instance in field_instances: correct_subfields = [] for code, value in field_instance[0]: if code == 'v': try: if VERBOSE: print len(AFFILIATIONS_DONE) affiliation_key = re.sub(r'\W+', ' ', value).upper() if not affiliation_key in AFFILIATIONS_DONE: new_values = get_aff(value) AFFILIATIONS_DONE[affiliation_key] = new_values for new_value in AFFILIATIONS_DONE[affiliation_key]: correct_subfields.append(('u', \ new_value.lstrip(' '))) flag = True except TypeError: pass correct_subfields.append((code, value)) record_add_field(correct_record, tag[0:3], tag[3], tag[4], \ subfields=correct_subfields) if flag: return print_rec(correct_record)
def get_list():
    papers = []
    prev_version = perform_request_search()
    for recid in prev_version:
        rec = get_record(recid)
        doi = None
        arxiv_id = None
        try:
            if ('2', 'DOI') in rec['024'][0][0]:
                for t in rec['024'][0][0]:
                    if 'a' in t:
                        doi = t[1]
                if not doi:
                    print "No DOI for record: %i" % (recid,)
            else:
                print "No DOI for record: %i" % (recid,)
        except (KeyError, IndexError):
            print "No DOI for record: %i" % (recid,)
        checksum, url, url_type = get_pdf(recid)
        if '037' in rec:
            if ('9', 'arXiv') in rec.get('037')[0][0]:
                for t in rec.get('037')[0][0]:
                    if 'a' in t:
                        arxiv_id = t[1]
        papers.append((recid, arxiv_id, get_creation_date(recid),
                       checksum, url, url_type, doi))
    return papers
def replace_references(recid): """Replace references for a record The record itself is not updated, the marc xml of the document with updated references is returned Parameters: * recid: the id of the record """ # Parse references references_xml = extract_references_from_record_xml(recid) references = create_record(references_xml.encode("utf-8")) # Record marc xml record = get_record(recid) if references[0]: fields_to_add = record_get_field_instances(references[0], tag="999", ind1="%", ind2="%") # Replace 999 fields record_delete_fields(record, "999") record_add_fields(record, "999", fields_to_add) # Update record references out_xml = record_xml_output(record) else: out_xml = None return out_xml
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags=""):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier
        for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    """
    updated_record = _get_updated_record(record_id, update_commands)
    xml_record = bibrecord.record_xml_output(updated_record)
    old_record = search_engine.get_record(recid=record_id)
    if "hm" == output_format:
        result = "<pre>\n"
        if "All tags" not in outputTags or not outputTags:
            diff_result = _get_record_diff(record_id, old_record,
                                           updated_record)
            for line in diff_result.split('\n')[:-1]:
                for tag in outputTags:
                    if tag in line.split()[1]:
                        result += line.strip() + '\n'
                    elif '<strong' in line:
                        if tag in line.split()[3]:
                            result += line.strip() + '\n'
        else:
            result += _get_record_diff(record_id, old_record, updated_record)
        result += "</pre>"
        return result
    result = bibformat.format_record(recID=None,
                                     of=output_format,
                                     xml_record=xml_record,
                                     ln=language)
    return result
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, "999"):
            raise RecordHasReferences("Record has references and overwrite "
                                      "mode is disabled: %s" % recid)

    if get_fieldvalues(recid, "999C59"):
        raise RecordHasReferences("Record has been curated: %s" % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, "w")
    temp_file.write(references_xml.encode("utf-8"))
    temp_file.close()

    # Update record
    task_low_level_submission("bibupload", "refextract", "-P", "5",
                              "-c", temp_path)
def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Returns a dictionary of keywordToken objects from the marc record.

    Weight is set to (0, 0) if no weight can be found.
    This will load keywords from the field 653 and 695__a (which are
    the old 'DESY' keywords).

    @var record: int or marc record; if int, the marc record is loaded
        from the database. If you pass a record instance, keywords are
        extracted from it
    @return: tuple (found, keywords, marcrec)
        found - int indicating how many main_field keywords were found;
            the other fields are not counted
        keywords - standard dictionary of keywordToken objects
        marcrec - marc record object loaded with data
    """
    keywords = {}
    if isinstance(main_field, basestring):
        main_field = [main_field]
    if isinstance(others, basestring):
        others = [others]

    if isinstance(record, int):
        rec = get_record(record)
    else:
        rec = record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = bibclassify_engine._parse_marc_code(m_field)
        for field in rec.get(tag, []):
            keyword = ""
            weight = 0
            type = ""
            for subfield in field[0]:
                if subfield[0] == "a":
                    keyword = subfield[1]
                elif subfield[0] == "n":
                    weight = int(subfield[1])
                elif subfield[0] == "9":
                    type = subfield[1]
            if keyword:
                found += 1
                keywords[bor.KeywordToken(keyword, type=type)] = \
                    [[(0, 0) for x in range(weight)]]

    if others:
        for field_no in others:
            tag, ind1, ind2 = bibclassify_engine._parse_marc_code(field_no)
            type = "f%s" % field_no
            for field in rec.get(tag, []):
                keyword = ""
                for subfield in field[0]:
                    if subfield[0] == "a":
                        keyword = subfield[1]
                        keywords[bor.KeywordToken(keyword, type=type)] = \
                            [[(0, 0)]]
                        break

    return found, keywords, rec
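# Hedged usage sketch for record_get_keywords: the record argument may be a
# recid (int) or an already-loaded record structure, per the docstring; the
# recid below is hypothetical.
def _example_count_keywords(recid=123):
    found, keywords, rec = record_get_keywords(recid)
    print "%s main-field keyword(s) found" % found
    for token in keywords:
        print token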
def create_xml(recid, IDs, tags):
    """ Replaces specific inspire-ids in records with nothing """
    if VERBOSE:
        print "Working on %s" % recid
    record = get_record(int(recid))
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=recid)
    for tag in tags:
        field_instances = record_get_field_instances(record,
                                                     tag[0:3], tag[3], tag[4])
        for field_instance in field_instances:
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'i' and value in IDs:
                    if VERBOSE:
                        print "Getting rid of %s from %s!" % (value, recid)
                else:
                    correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=correct_subfields)
    return print_rec(correct_record)
def late(req): req.content_type = "text/html" print >> req, pageheaderonly("Late journals", req=req) for journal in CFG_JOURNALS: print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal)) results = get_collection_reclist(journal) print >> req, "<table>" print >> req, "<tr><th>DOI</th><th>Title</th><th>DOI registration</th><th>Arrival in SCOAP3</th></tr>" for recid in results: creation_date = run_sql("SELECT creation_date FROM bibrec WHERE id=%s", (recid, ))[0][0] record = get_record(recid) doi = record_get_field_value(record, '024', '7', code='a') title = record_get_field_value(record, '245', code='a') doi_date = run_sql("SELECT creation_date FROM doi WHERE doi=%s", (doi, )) background = "#eee" if doi_date: doi_date = doi_date[0][0] if (creation_date - doi_date).days < 0: background = "#66FF00" elif (creation_date - doi_date).days < 1: background = "#FF6600" else: background = "#FF0000" else: doi_date = '' print >> req, '<tr style="background-color: %s;"><td><a href="http://dx.doi.org/%s" target="_blank">%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % ( background, escape(doi, True), escape(doi), title, doi_date, creation_date) print >> req, "</table>"
def get_ids_from_recid(recid):
    record = get_record(recid)

    ## Retrieving DOI
    doi = ""
    dois = record_get_field_values(record, "024", "7", code="a")
    dois = [doi for doi in dois if doi.startswith("10.")]
    if len(dois) > 1:
        print >> sys.stderr, \
            "WARNING: record %s has more than one DOI: %s" % (recid, dois)
    elif len(dois) == 1:
        doi = dois[0]

    ## Retrieving arXiv eprint
    eprint = ""
    eprints = record_get_field_values(record, "035", code="a")
    eprints = [an_eprint[len("oai:arXiv.org:"):]
               for an_eprint in eprints
               if an_eprint.lower().startswith("oai:arxiv.org:")]
    if len(eprints) > 1:
        print >> sys.stderr, \
            "WARNING: record %s has more than one arXiv eprint: %s" \
            % (recid, eprints)
    elif len(eprints) == 1:
        eprint = eprints[0]

    ## Retrieving Other service ID
    other_id = ""
    for field in record_get_field_instances(record, "035"):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get("9", "").upper() == CFG_OTHER_SITE.upper() \
                and subfields.get("a"):
            other_id = subfields["a"]
    reportnumbers = record_get_field_values(record, "037", code="a")
    return [str(recid), doi, eprint, other_id] + reportnumbers
def get_record_checks(req, recids):
    if recids == '':
        return ''
    recids = recids.split(',')
    return_val = []
    for rid in recids:
        try:
            recid = int(rid)
            rec = get_record(recid)
            doi = get_doi(rec)
            record_compl = is_complete_record(recid)
            return_val.append("""<tr>
<td><a href="%s">%i</a></td>
<td>%s</td>
<td>%s</td>
<td><a href="http://dx.doi.org/%s">%s</a></td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
</tr>""" % (join(CFG_SITE_URL, 'record', str(recid)),
            recid,
            get_creation_date(recid),
            get_modification_date(recid),
            doi, doi,
            has_or_had_format(recid, '.xml'),
            has_or_had_format(recid, '.pdf'),
            has_or_had_format(recid, '.pdf;pdfa'),
            check_complete_rec(record_compl),
            get_arxiv(rec),
            is_compliant(recid, "authors"),
            is_compliant(recid, "cc"),
            is_compliant(recid, "scoap3"),
            str([rec_key for rec_key, rec_val in record_compl.iteritems()
                 if not rec_val])))
        except:
            # Non-numeric entries are treated as section titles and get a
            # fresh header row.
            recid = rid
            return_val.append("""<tr><th colspan="13" align="left">
<h2>%s</h2></th></tr>""" % (recid,))
            return_val.append("""<tr>
<th>recid</th>
<th>cr. date</th>
<th>mod. date</th>
<th>DOI</th>
<th>XML</th>
<th>PDF</th>
<th>PDF/A</th>
<th>Complete record?</th>
<th>arXiv number</th>
<th>Copyright: authors</th>
<th>CC-BY</th>
<th>Funded by SCOAP3</th>
<th>notes</th>
</tr>""")
    return ''.join(return_val)
def write_csv(req, dictionary, journal_list, f_date, t_date,
              created_or_modified_date):
    return_val = ''

    for key in journal_list:
        val = dictionary[key]
        papers = perform_request_search(p="date%s:%s->%s"
                                        % (created_or_modified_date,
                                           f_date, t_date),
                                        c=val)
        if papers == []:
            continue

        return_val += key + '\n'
        return_val += ';'.join(['recid', 'cr. date', 'mod. date', 'DOI',
                                'XML', 'PDF', 'PDF/A', 'Complete record?',
                                'arXiv number', 'Copyright: authors',
                                'CC-BY', 'Funded by SCOAP3',
                                'arXiv category', 'notes', 'First delivery',
                                'First AB delivery', 'Last modification',
                                'PDF/A upload', 'DOI registration',
                                'Delivery diff', 'PDF/A diff']) + '\n'

        for recid in papers:
            rec = get_record(recid)
            doi = get_doi(rec)
            (first_del, first_ab_del, last_mod,
             doi_reg, pdfa_del) = get_delivery_data(recid, doi)
            record_compl = is_complete_record(recid)
            return_val += ';'.join(str(item) for item in [
                str(recid),
                get_creation_date(recid),
                get_modification_date(recid),
                doi,
                has_or_had_format(recid, '.xml').lstrip('<b>').rstrip('</b>'),
                has_or_had_format(recid, '.pdf').lstrip('<b>').rstrip('</b>'),
                has_or_had_format(recid, '.pdf;pdfa').lstrip('<b>').rstrip('</b>'),
                str(check_complete_rec(record_compl)),
                get_arxiv(rec).lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'authors').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'cc').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'scoap3').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'category').lstrip('<b>').rstrip('</b>'),
                str([rec_key for rec_key, rec_val in record_compl.iteritems()
                     if not rec_val]),
                str(first_del),
                str(first_ab_del),
                str(last_mod),
                str(pdfa_del),
                str(doi_reg),
                check_24h_delivery(first_del, doi_reg),
                check_24h_delivery(pdfa_del, doi_reg)])
            return_val += '\n'

    return return_val
def create_xml(recid):
    # tag, volume_letter and repl_journal are expected to be set at module
    # level by the calling script.
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    field_instances = record_get_field_instances(record,
                                                 tag[0:3], tag[3], tag[4])
    for field_instance in field_instances:
        correct_subfields = []
        for code, value in field_instance[0]:
            if volume_letter:
                if code == 'p':
                    correct_subfields.append(('p', repl_journal))
                elif code == 'v':
                    volume = get_fieldvalues(recid, '773__v')
                    for v in volume:
                        if v[0].isalpha():
                            correct_subfields.append(('v', v))
                        else:
                            new_volume = volume_letter + v
                            correct_subfields.append(('v', new_volume))
                else:
                    correct_subfields.append((code, value))
            else:
                if code == 'p':
                    correct_subfields.append(('p', repl_journal))
                else:
                    correct_subfields.append((code, value))
        record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                         subfields=correct_subfields)
    return print_rec(correct_record)
def test_create_record_and_signatures(self):
    """Test creating record and signatures."""
    import querying
    from invenio.search_engine import get_record

    querying.consecutive_id = count()
    record_id = 123456
    cds_record = get_record(record_id)
    signatures = querying.create_signatures(record_id, cds_record)
    record = querying.create_record(record_id, cds_record)
    record_expected = {
        'publication_id': '123456',
        'title': 'Target mass corrections in QCD',
        'year': '1980',
        'authors': ['Frazer, W R', 'Gunion, J F']}
    signatures_expected = [
        {'publication_id': '123456',
         'signature_id': '123456_Frazer, W R_0',
         'author_affiliation': '',
         'author_name': 'Frazer, W R'},
        {'publication_id': '123456',
         'signature_id': '123456_Gunion, J F_1',
         'author_affiliation': '',
         'author_name': 'Gunion, J F'}]
    self.assertEqual(signatures, signatures_expected)
    self.assertEqual(record, record_expected)
def record_get_keywords(recid, argd):
    """Returns a list of pairs [keyword, weight] contained in the record.
    Weight is set to 0 if no weight can be found."""
    keywords = []
    rec = get_record(recid)
    for field in rec.get('653', []):
        keyword = ''
        weight = 0
        for subfield in field[0]:
            if subfield[0] == 'a':
                keyword = subfield[1]
            elif subfield[0] == 'n':
                weight = int(subfield[1])
        if argd['sort'] == 'related':
            # Number of related documents minus 1 in order to not
            # consider the source document.
            weight = len(perform_request_search(p='"%s"' % keyword,
                                                f='keyword')) - 1
            if weight:
                keywords.append([keyword, weight])
        else:
            keywords.append([keyword, weight])
    return keywords
def load_ticket_templates(recId):
    """
    Loads all enabled ticket plugins and calls them.
    @return dictionary with the following structure:
        key: string: name of queue
        value: dict: a dictionary with 2 keys,
                     the template subject and content of the queue
    @rtype dict
    """
    ticket_templates = {}
    all_plugins, error_messages = load_ticket_plugins()

    if error_messages:
        # We got broken plugins. We alert only for now.
        print >> sys.stderr, "\n".join(error_messages)
    else:
        plugins = all_plugins.get_enabled_plugins()
        record = get_record(recId)
        for name, plugin in plugins.items():
            if plugin:
                queue_data = plugin['get_template_data'](record)
                if queue_data:
                    ticket_templates[queue_data[0]] = {
                        'subject': queue_data[1],
                        'content': queue_data[2]
                    }
            else:
                raise BibEditPluginException("Plugin not valid in %s"
                                             % (name,))

    return ticket_templates
def match_all_subfields_for_tag(recID, field_tag, subfields_required=[]):
    """
    Tests whether the record with recID has at least one field with
    'field_tag' where all of the required subfields in subfields_required
    match a subfield in the given field both in code and value

    @param recID: record ID
    @type recID: int
    @param field_tag: a 3 digit code for the field tag code
    @type field_tag: string
    @param subfields_required: a list of subfield code/value tuples
    @type subfields_required: list of tuples of strings.
        same format as in get_record():
        e.g. [('w', 't'),
              ('4', 'XYZ123')]
    @return: boolean
    """
    rec = get_record(recID)
    for field in rec.get(field_tag, []):
        subfields_present = field[0]
        intersection = set(subfields_present) & set(subfields_required)
        if set(subfields_required) == intersection:
            return True
    return False
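# Usage sketch borrowing the docstring's own example format: checks whether
# a record (hypothetical recid 123) has one 999 field carrying both
# required subfields.
def _example_match_999(recid=123):
    return match_all_subfields_for_tag(recid, '999',
                                       [('w', 't'), ('4', 'XYZ123')])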
def create_xml(recid, tags, experiment):
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    flag = None
    for tag in tags:
        field_instances = record_get_field_instances(record,
                                                     tag[0:3], tag[3], tag[4])
        for field_instance in field_instances:
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'a':
                    search = 'find a ' + value + ' and exp ' + experiment
                    new_value = convert_search_to_inspire_id(search)
                    if new_value[0]:
                        flag = True
                        correct_subfields.append(('i', new_value[0]))
                    if new_value[1]:
                        flag = True
                        orcid_value = 'ORCID:' + new_value[1]
                        correct_subfields.append(('j', orcid_value))
                correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=correct_subfields)
    if flag:
        return print_rec(correct_record)
def perform_request_holdingpen(request_type, recId, changeId=None):
    """
    A method performing the holdingPen ajax request.
    The following types of requests can be made:
       getHoldingPenUpdates - retrieving the holding pen updates pending
                              for a given record
    """
    response = {}
    if request_type == 'getHoldingPenUpdates':
        changeSet = get_related_hp_changesets(recId)
        changes = []
        for change in changeSet:
            changes.append((str(change[0]), str(change[1])))
        response["changes"] = changes
    elif request_type == 'getHoldingPenUpdateDetails':
        # returning the list of changes related to the holding pen update
        # the format based on what the record difference xtool returns
        assert changeId is not None
        hpContent = get_hp_update_xml(changeId)
        holdingPenRecord = create_record(hpContent[0], "xm")[0]
        databaseRecord = get_record(hpContent[1])
        response['record'] = holdingPenRecord
        response['changeset_number'] = changeId
    elif request_type == 'deleteHoldingPenChangeset':
        assert changeId is not None
        delete_hp_change(changeId)
    return response
def get_keywords_body(keywords, req, recid, argd):
    """Returns the body associated with the keywords."""
    body = []
    rec = get_record(recid)
    extend_argd(argd)

    if keywords:
        weights_available = 0 not in zip(*keywords)[1]
    else:
        req.write('There are no keywords associated with this document.<br>'
                  '<form action="" method="get">'
                  '  <input type="hidden" name="generate" value="yes">'
                  '  <input type="submit" value="Generate keywords">'
                  '</form>')
        return

    if argd['type'] == 'tagcloud' and not weights_available:
        # No weight is specified for at least one of the keywords.
        # Display the keywords as a list.
        argd['type'] = 'list'

    if argd['type'] == 'tagcloud':
        body.append('<div style="text-align: center; color: red; '
                    'font-size: 80%; margin-top: 15px">Single keywords in '
                    'grey, composite keywords in blue.</div>')

    if argd['type'] == 'list':
        # Display keywords as a list.
        body.append(_get_keywords_list(keywords, argd))
    elif argd['type'] == 'tagcloud':
        if argd['sort'] == 'related' and not keywords:
            print 'No similar document was found.'

        # Separate single and composite keywords.
        single_keywords, composite_keywords = [], []
        for keyword in keywords:
            if ': ' in keyword[0]:
                composite_keywords.append(keyword)
            else:
                single_keywords.append(keyword)

        # Display keywords as a tag cloud.
        single_levels = _get_font_levels(single_keywords)
        composite_levels = _get_font_levels(composite_keywords)
        body.append(_get_html_tag_cloud(single_levels + composite_levels,
                                        argd))
    elif argd['type'] == 'xml':
        body.append('<pre><code>%s</code></pre>'
                    % escape_html(record_xml_output(rec, ['653'])))
    else:
        body = 'Unknown type: ' + argd['type']

    out = ''
    for element in body:
        out += '<br>' + element.encode('utf-8')
    req.write(out)
    return
def get_bibrecord(recid):
    """Return record in BibRecord wrapping."""
    if record_exists(recid):
        record_revision_ids = get_record_revision_ids(recid)
        if record_revision_ids:
            return create_record(
                get_marcxml_of_revision_id(max(record_revision_ids)))[0]
        else:
            return get_record(recid)
def check_arxiv(recid):
    record = get_record(recid)
    for report_tag in record_get_field_instances(record, "037"):
        for category in field_get_subfield_values(report_tag, 'a'):
            if category.startswith('arXiv'):
                return True
    return False
def create_xml(recid, arxiv_ids):
    old_record = get_record(recid)
    attached_files = record_get_field_instances(old_record,
                                                tag='856', ind1='4')
    fields_to_add = [f for f in attached_files
                     if check_arxiv_url(f, arxiv_ids)]
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    record_add_fields(record, '856', fields_to_add)
    return print_rec(record)
def create_xml(recid, fname=None, oaff=None):
    affs = [a for a in oaff]
    record = get_record(recid)
    auth_location = record_get_field_instances(record, '100', '', '')[0][4]
    record_delete_field(record, '700', '', '')
    for x in affs:
        record_add_subfield_into(record, '100', 'u', x,
                                 field_position_global=auth_location)
    return print_rec(record)
def create_our_record(recid):
    old_record = get_record(recid)
    instances = record_get_field_instances(old_record, '980')
    new_instances = [l.field
                     for l in set(OurInstance(i) for i in instances
                                  if field_get_subfield_instances(i)
                                  != [('a', 'unknown')])]
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    record_add_fields(record, '980', new_instances)
    return print_rec(record)
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def usa_papers_csv(req):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=usa_papers.csv')
    li = "%s; https://repo.scoap3.org/record/%s"

    ## print the list of links to the articles
    for university in CFG_SELECTED_AFF:
        print >> req, university
        search = create_search_from_affiliation(university)
        for collection in CFG_JOURNALS:
            res = perform_request_search(p='(%s)' % (search,), c=collection)
            if len(res):
                print >> req, collection
                for rec_id in res:
                    rec = get_record(rec_id)
                    line = li % (str(rec['245'][0][0][0][1]), str(rec_id))
                    print >> req, line
                print >> req, ""
            print >> req, ""
        print >> req, ""
def tokenize(self, recID):
    phrases = []
    try:
        rec = get_record(recID)
        for rule in self.rules:
            tag_to_index, necessary_tag, necessary_value = rule
            core_tag = tag_to_index[0:3]
            ind = tag_to_index[3:5]
            sub_tag = tag_to_index[5]
            fields = [dict(instance[0]) for instance in
                      record_get_field_instances(rec, core_tag,
                                                 ind[0], ind[1])]
            for field in fields:
                tag_condition = (necessary_tag and
                                 field.has_key(necessary_tag) or
                                 necessary_tag == '')
                value_condition = (necessary_value and
                                   field.get(necessary_tag, '') ==
                                   necessary_value or
                                   necessary_value == '')
                if tag_condition and field.has_key(sub_tag) \
                        and value_condition:
                    phrases.append(field[sub_tag])
        return phrases
    except KeyError:
        return []
def impact_articles(req, year):
    try:
        year = int(year)
        assert 2014 <= year
    except (ValueError, TypeError, AssertionError):
        raise SERVER_RETURN(HTTP_BAD_REQUEST)
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=impact_articles.csv')
    ids = perform_request_search(
        p="datecreated:{year}-01-01->{year}-12-31".format(year=year))
    counter = 0
    print >> req, "#;recid;journal;author;orcid;affiliation;countries"
    for i in ids:
        counter += 1
        try:
            rec = get_record(i)
        except:
            print >> req, "{c},{recid},Can't load metadata".format(c=counter,
                                                                   recid=i)
            continue
        journal = record_get_field_value(rec, tag='773', code='p')
        for field in ['100', '700']:
            if field in rec:
                for author in rec[field]:
                    name = ""
                    orcid = ""
                    aff = ""
                    country = ""
                    for key, val in author[0]:
                        if key == 'a':
                            name = unicode(val, 'UTF-8') \
                                .replace('\n', ' ').strip()
                        if key == 'j':
                            orcid = unicode(val, 'UTF-8') \
                                .replace('\n', ' ').strip()
                        if key in ['v', 'u']:
                            aff += unicode(val, 'UTF-8') \
                                .replace('\n', ' ').strip() + " | "
                        if key == 'w':
                            country += unicode(val, 'UTF-8') \
                                .replace('\n', ' ').strip() + ";"
                    print >> req, ("{c};{recid};{journal};{name};{orcid};"
                                   "{aff};{country}"
                                   .format(c=counter, recid=i,
                                           journal=journal, name=name,
                                           orcid=orcid, aff=aff,
                                           country=country))
def generate_mediaexport_basket(basket_id):
    """
    Exports the content of a basket.

    Takes each record from a basket and calls either
    generate_mediaexport_album or generate_mediaexport.

    :param str basket_id: The basket id.
    """
    records = get_basket_content(basket_id, format='')
    recids = [record[0] for record in records]

    output = {}
    output['entries'] = []
    for record_id in recids:
        # For each record_id return metadata
        record = get_record(record_id)
        if not record:
            # There is no record, for example when the record_id < 0
            # (external resource). Skip it.
            continue
        report_number = record_get_field_value(record, *('037', ' ', ' ', 'a'))
        album_dict = generate_mediaexport_album(record_id, report_number,
                                                False)
        album_entries = album_dict.get('entries', None)
        if album_entries:
            output['entries'].append(album_entries)
        else:
            # If it's not an album, check if it's an image
            is_image = False
            collections = record_get_field_values(record,
                                                  *('980', ' ', ' ', 'a'))
            collections.extend(record_get_field_values(record,
                                                       *('980', ' ', ' ', 'b')))
            for collection in collections:
                if "PHOTO" in collection:
                    is_image = True
                    break
            tirage = report_number.rsplit("-", 1)[-1]
            media_dict = generate_mediaexport(record_id, is_image,
                                              report_number, tirage,
                                              False, False)
            if media_dict:
                output['entries'].append(media_dict)

    return json.dumps(output)
def marcxml_filter_out_tags(recid, fields):
    """
    Returns the fields of record 'recid' that share the same tag and
    indicators as those specified in 'fields', but for which the subfield
    is different. This is nice to emulate a bibupload -c that corrects
    only specific subfields.

    Parameters:
        recid - *int* the id of the record to process

        fields - *list(str)* the list of fields that we want to filter out.
                 Eg ['909COp', '909COo']
    """
    out = ''

    record = get_record(recid)

    # Delete subfields that we want to replace
    for field in fields:
        record_delete_subfield(record,
                               tag=field[0:3],
                               ind1=field[3:4],
                               ind2=field[4:5],
                               subfield_code=field[5:6])

    # Select only datafields that share tag + indicators
    processed_tags_and_ind = []
    for field in fields:
        if not field[0:5] in processed_tags_and_ind:
            # Ensure that we do not process twice the same datafields
            processed_tags_and_ind.append(field[0:5])
            for datafield in record.get(field[0:3], []):
                if datafield[1] == field[3:4].replace('_', ' ') and \
                   datafield[2] == field[4:5].replace('_', ' ') and \
                   datafield[0]:
                    out += field_xml_output(datafield, field[0:3]) + '\n'

    return out
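# Usage sketch (hypothetical recid, field list taken from the docstring's
# own example): emulates the input of a "bibupload -c" correcting only
# 909CO $p and $o.
def _example_filter(recid=1):
    return marcxml_filter_out_tags(recid, ['909COp', '909COo'])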
def late(req): req.content_type = "text/html" print >> req, pageheaderonly("Late journals", req=req) th = ("<tr><th>DOI</th><th>Title</th><th>DOI registration</th>" "<th>Arrival in SCOAP3</th></tr>") tr = ("<tr style='background-color: {0};'><td>" "<a href='http://dx.doi.org/{1}' target='_blank'>{2}</td>" "<td>{3}</td><td>{4}</td><td>{5}</td></tr>") sql_bibrec = "SELECT creation_date FROM bibrec WHERE id=%s" sql_doi = "SELECT creation_date FROM doi WHERE doi=%s" for journal in CFG_JOURNALS: print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal)) results = get_collection_reclist(journal) print >> req, "<table>" print >> req, th for recid in results: creation_date = run_sql(sql_bibrec, (recid, ))[0][0] record = get_record(recid) doi = record_get_field_value(record, '024', '7', code='a') title = record_get_field_value(record, '245', code='a') doi_date = run_sql(sql_doi, (doi, )) background = "#eee" if doi_date: doi_date = doi_date[0][0] if (creation_date - doi_date).days < 0: background = "#66FF00" elif (creation_date - doi_date).days < 1: background = "#FF6600" else: background = "#FF0000" else: doi_date = '' print >> req, tr.format(background, escape(doi, True), escape(doi), title, doi_date, creation_date) print >> req, "</table>"
def index(req):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = (
        'attachment; filename=scoap3_records_info.csv')
    req.write("SCOAP3 record id; Journal; Creation date; Modification date; "
              "Title; Authors; Publication info\n")
    for key, value in JOURNALS.iteritems():
        recids = perform_request_search(c=value)
        for recid in recids:
            rec = get_record(recid)
            if '245' in rec:
                title = rec['245'][0][0][0][1].strip()
            else:
                title = ""
            creation_date = get_creation_date(recid)
            modification_date = get_modification_date(recid)
            if '100' in rec:
                authors = rec['100'][0][0][0][1]
            else:
                authors = ""
            if '700' in rec:
                for author in rec['700']:
                    authors += ' / %s' % (author[0][0][1],)
            publication_info = ''
            if '733' in rec:
                publication_info += "%s %s (%s) %s" % (
                    rec['733'][0][0][0][1], rec['733'][0][0][1][1],
                    rec['733'][0][0][2][1], rec['733'][0][0][3][1])
            if '024' in rec:
                publication_info += " %s" % (rec['024'][0][0][0][1],)
            if '037' in rec:
                publication_info += " %s" % (rec['037'][0][0][0][1],)
            req.write("%s; %s; %s; %s; %s; %s; %s\n"
                      % (recid, value, creation_date, modification_date,
                         title, authors, publication_info))
def create_xml773(recid):
    # old_journal, repl_journal and VERBOSE are module-level globals set by
    # the calling script.
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    field_instances = record_get_field_instances(record, '773', '', '')
    for field_instance in field_instances:
        correct_subfields = []
        for code, value in field_instance[0]:
            if code == 'p' and value == old_journal:
                correct_subfields.append(('p', repl_journal))
                if VERBOSE:
                    print "%s: Replacing 773__p %s with %s" \
                          % (recid, value, repl_journal)
            else:
                correct_subfields.append((code, value))
        record_add_field(correct_record, '773', '', '',
                         subfields=correct_subfields)
    return print_rec(correct_record)
def create_xml(recid, experiment):
    record = get_record(recid)
    correct_record = {}
    common_tags = {}
    experiment_tag = {'693__': [('e', experiment)]}
    tags = ['693__', '710__']
    #for tag in tags:
    #    field_instances = record_get_field_instances(record, tag[0:3],
    #                                                 tag[3], tag[4])
    #    for field_instance in field_instances:
    #        correct_subfields = []
    #        for code, value in field_instance[0]:
    #            correct_subfields.append((code, value))
    #        record_add_field(correct_record, tag[0:3], tag[3], tag[4],
    #                         subfields=correct_subfields)
    record_add_field(correct_record, '693', '_', '_',
                     subfields=experiment_tag['693__'])
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    for key in common_tags:
        tag = key
        record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                         subfields=common_tags[key])
    return print_rec(correct_record)
def create_xml(recid, old_aff=None, new_aff=None, skip_aff=None):
    record = get_record(recid)
    correct_record = {}
    tags = ('100__', '700__')
    record_add_field(correct_record, '001', controlfield_value=recid)
    for tag in tags:
        field_instances = record_get_field_instances(record,
                                                     tag[0:3], tag[3], tag[4])
        for field_instance in field_instances:
            correct_subfields = []
            skip_aff_exists = False
            for aff in skip_aff or []:
                if any(val for code, val in field_instance[0] if aff in val):
                    skip_aff_exists = True
                    if VERBOSE:
                        print "%s exists, deleting %s" % (aff, old_aff)
            if skip_aff_exists:
                for code, value in field_instance[0]:
                    if code == 'u':
                        if value != old_aff:
                            correct_subfields.append((code, value))
                    else:
                        correct_subfields.append((code, value))
            else:
                for code, value in field_instance[0]:
                    if code == 'u':
                        if value == old_aff:
                            correct_subfields.append((code, new_aff))
                            if VERBOSE:
                                print "Changing %s to %s" % (old_aff, new_aff)
                        else:
                            correct_subfields.append((code, value))
                    else:
                        correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=correct_subfields)
    return print_rec(correct_record)
def get_ids_from_recid(recid):
    record = get_record(recid)

    ## Retrieving DOI
    doi = ""
    dois = record_get_field_values(record, '024', '7', code='a')
    dois = [doi for doi in dois if doi.startswith('10.')]
    if len(dois) > 1:
        print >> sys.stderr, \
            "WARNING: record %s has more than one DOI: %s" % (recid, dois)
    elif len(dois) == 1:
        doi = dois[0]

    ## Retrieving arXiv eprint
    eprint = ""
    eprints = record_get_field_values(record, '035', code='a')
    eprints = [an_eprint[len('oai:arXiv.org:'):]
               for an_eprint in eprints
               if an_eprint.lower().startswith('oai:arxiv.org:')]
    if len(eprints) > 1:
        print >> sys.stderr, \
            "WARNING: record %s has more than one arXiv eprint: %s" \
            % (recid, eprints)
    elif len(eprints) == 1:
        eprint = eprints[0]

    ## Retrieving Other service ID
    other_id = ''
    for field in record_get_field_instances(record, '035'):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get('9', '').upper() == CFG_OTHER_SITE.upper() \
                and subfields.get('a'):
            other_id = subfields['a']
    reportnumbers = record_get_field_values(record, '037', code='a')
    return [str(recid), doi, eprint, other_id] + reportnumbers
def usa_papers(req):
    req.content_type = "text/html"
    print >> req, pageheaderonly("USA papers for selected affiliations",
                                 req=req)
    li = "<li><a href='https://repo.scoap3.org/record/{0}'>{1}</a></li>"

    ## print the list of links to the articles
    for university in CFG_SELECTED_AFF:
        print >> req, "<h2>%s</h2>" % (str(university),)
        search = create_search_from_affiliation(university)
        for collection in CFG_JOURNALS:
            res = perform_request_search(p='/%s/' % (search,), c=collection)
            if len(res):
                print >> req, "<h3>%s (%i)</h3>" % (str(collection), len(res))
                print >> req, "<ul>"
                for rec_id in res:
                    rec = get_record(rec_id)
                    line = li.format(str(rec_id), str(rec['245'][0][0][0][1]))
                    print >> req, line
                print >> req, "</ul>"
    req.write(pagefooteronly(req=req))
    return ""
def lazy_parser(collection, left_tags, right_tags, volume_subfield):
    for recid in get_collection_reclist(collection):
        record = get_record(recid)

        for right_tag in right_tags:
            for right_value in record_get_field_values(record,
                                                       right_tag[:3],
                                                       right_tag[3],
                                                       right_tag[4],
                                                       right_tag[5]):
                if not right_value:
                    continue  # Empty metadata
                yield right_value, right_value

                for left_tag in left_tags:
                    for left_field in record_get_field_instances(
                            record, left_tag[:3], left_tag[3], left_tag[4]):
                        left_subfields = dict(
                            field_get_subfield_instances(left_field))
                        if left_tag[5] not in left_subfields:
                            continue  # Empty field
                        if volume_subfield in left_subfields:
                            yield (left_subfields[left_tag[5]],
                                   '%s;%s' % (right_value,
                                              left_subfields[volume_subfield]))
                        else:
                            yield left_subfields[left_tag[5]], right_value
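# lazy_parser is a generator, so it is consumed lazily; a usage sketch with
# hypothetical arguments (6-character tag strings like '773__p', where the
# last character is the subfield code):
def _example_dump_pairs():
    for left, right in lazy_parser('Articles', ['773__p'], ['245__a'], 'v'):
        print '%s -> %s' % (left, right)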
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported"
                  % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s"
                      % (len(should_recids), set_spec,
                         len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s"
                      % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s"
                      % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s"
                      % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported"
                  % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should be updated"
                  % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0

    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records."
                             % (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s"
                      % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these "
                      "oai_sets: %s"
                      % (recid, ", ".join(current_previous_oai_sets)),
                      verbose=3)

        # Get the sets that should be in this record according to settings
        updated_oai_sets = set(_set for _set, _recids
                               in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s"
                      % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these "
                      "oai_sets: %s"
                      % (recid, ", ".join(updated_previous_oai_sets)),
                      verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            # Prepare to save results in a new tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository',
                                      '-c', filename, '-n')
        else:
            os.remove(filename)

    return True
def institutions_list(req, country, year=None):
    from copy import deepcopy

    def find_nations(affiliation):
        NATIONS_DEFAULT_MAP['European Organization for Nuclear Research'] = 'CERN'
        NATIONS_DEFAULT_MAP['Centre Europeen de Recherches Nucleaires'] = 'CERN'
        NATIONS_DEFAULT_MAP['High Energy Accelerator Research Organization'] = 'KEK'
        NATIONS_DEFAULT_MAP['KEK'] = 'KEK'
        NATIONS_DEFAULT_MAP['FNAL'] = 'FNAL'
        NATIONS_DEFAULT_MAP['Fermilab'] = 'FNAL'
        NATIONS_DEFAULT_MAP['Fermi National'] = 'FNAL'
        NATIONS_DEFAULT_MAP['SLAC'] = 'SLAC'
        NATIONS_DEFAULT_MAP['DESY'] = 'DESY'
        NATIONS_DEFAULT_MAP['Deutsches Elektronen-Synchrotron'] = 'DESY'
        NATIONS_DEFAULT_MAP['JINR'] = 'JINR'
        NATIONS_DEFAULT_MAP['JOINT INSTITUTE FOR NUCLEAR RESEARCH'] = 'JINR'

        possible_affs = []

        def _sublistExists(list1, list2):
            return ''.join(map(str, list2)) in ''.join(map(str, list1))

        values = set([y.lower().strip()
                      for y in re.findall(ur"[\w']+",
                                          affiliation.replace('.', '')
                                          .decode("UTF-8"),
                                          re.UNICODE)])
        for key, val in NATIONS_DEFAULT_MAP.iteritems():
            key = unicode(key)
            key_parts = set(key.lower().decode('utf-8').split())
            if key_parts.issubset(values):
                possible_affs.append(val)
                values = values.difference(key_parts)

        if not possible_affs:
            possible_affs = ['HUMAN CHECK']
        if 'CERN' in possible_affs and 'Switzerland' in possible_affs:
            # Don't use remove in case of multiple Switzerlands
            possible_affs = [x for x in possible_affs if x != 'Switzerland']
        if 'KEK' in possible_affs and 'Japan' in possible_affs:
            possible_affs = [x for x in possible_affs if x != 'Japan']
        if 'FNAL' in possible_affs and 'USA' in possible_affs:
            possible_affs = [x for x in possible_affs if x != 'USA']
        if 'SLAC' in possible_affs and 'USA' in possible_affs:
            possible_affs = [x for x in possible_affs if x != 'USA']
        if 'DESY' in possible_affs and 'Germany' in possible_affs:
            possible_affs = [x for x in possible_affs if x != 'Germany']
        if 'JINR' in possible_affs and 'Russia' in possible_affs:
            possible_affs = [x for x in possible_affs if x != 'Russia']

        return sorted(list(set(possible_affs)))[0]

    publisher_dict = {'New J. Phys.': 0,
                      'Acta Physica Polonica B': 0,
                      'Advances in High Energy Physics': 0,
                      'Chinese Phys. C': 0,
                      'EPJC': 0,
                      'JCAP': 0,
                      'JHEP': 0,
                      'Nuclear Physics B': 0,
                      'Physics letters B': 0,
                      'PTEP': 0}

    if year:
        recids = perform_request_search(p='country:"%s" year:%s'
                                        % (country, year))
    else:
        recids = perform_request_search(p='country:"%s"' % (country,))

    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=%s_institutions_list.csv'
                                              % (country,))
    req.write("recid|authors #|affiliation|country|New J. Phys.|"
              "Acta Physica Polonica B|Advances in High Energy Physics|"
              "Chinese Phys. C|EPJC|JCAP|JHEP|Nuclear Physics B|"
              "Physics letters B|PTEP\n")
    for recid in recids:
        rec = get_record(recid)
        global_affs = {}
        author_count = 0
        if '100' in rec:
            author_count += len(rec['100'])
        if '700' in rec:
            author_count += len(rec['700'])
        journal = record_get_field_value(rec, '773', ind1="%", ind2="%",
                                         code='p')

        affs = []
        affs.extend(record_get_field_values(rec, '100', ind1="%", ind2="%",
                                            code='v'))
        affs.extend(record_get_field_values(rec, '700', ind1="%", ind2="%",
                                            code='v'))

        for aff in affs:
            if aff not in global_affs:
                global_affs[aff] = deepcopy(publisher_dict)
            global_affs[aff][journal] += 1

        for aff, j in global_affs.iteritems():
            req.write("%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n"
                      % (recid, author_count,
                         aff.replace('\n', ' ').replace('\r', ''),
                         find_nations(aff),
                         j['New J. Phys.'], j['Acta Physica Polonica B'],
                         j['Advances in High Energy Physics'],
                         j['Chinese Phys. C'], j['EPJC'], j['JCAP'],
                         j['JHEP'], j['Nuclear Physics B'],
                         j['Physics letters B'], j['PTEP']))
def csu(req):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=csu_records_info.csv')
    search_patterns = ["California Polytechnic State University",
                       "Carson",
                       "Dominguez Hills",
                       "Fresno",
                       "California State University Fullerton",
                       "California State University Long Beach",
                       "California State University, Los Angeles",
                       "Northridge",
                       "California State University, Sacramento",
                       "San Diego State University",
                       "sfsu"]

    # NB: special_aff reads `value` from the enclosing loop below
    # (late binding).
    def special_aff(author):
        affs = []
        au = ""
        name = ""
        for i in author:
            if i[0] == 'v' and value in i[1]:
                affs.append(i[1])
            if i[0] == 'a':
                name = i[1]
        if len(affs) > 0:
            au = name + '('
            for aff in affs:
                au += aff + ', '
            au += '), '
        return au

    req.write("SCOAP3 record id; Journal; Creation date; Modification date; "
              "Title; Authors; Publication info\n")
    for value in search_patterns:
        recids = perform_request_search(p="affiliation:'%s'" % (value,))
        # req.write("%s; %s\n" % (value, len(recids)))
        for recid in recids:
            rec = get_record(recid)
            if '245' in rec:
                title = rec['245'][0][0][0][1].strip()
            else:
                title = ""
            creation_date = get_creation_date(recid)
            modification_date = get_modification_date(recid)
            authors = ""
            if '100' in rec:
                authors += special_aff(rec['100'][0][0])
            if '700' in rec:
                for author in rec['700']:
                    authors += special_aff(author[0])
            publication_info = ''
            if '773' in rec:
                for p in rec['773'][0][0]:
                    if p[0] == 'p':
                        publication_info = p[1]
            if '024' in rec:
                publication_info += " %s" % (rec['024'][0][0][0][1],)
            if '037' in rec:
                publication_info += " %s" % (rec['037'][0][0][0][1],)
            req.write("%s; %s; %s; %s; %s; %s; %s\n"
                      % (recid, value, creation_date, modification_date,
                         title, authors, publication_info))
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(afieldinstance,
                                                            '9')[0]:
                        arXiv_id = field_get_subfield_values(afieldinstance,
                                                             'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files
                             if docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files
                             if docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files
                             if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
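# Usage sketch for the recids string syntax handled above: a comma list in
# which each element may be a single id or a low-high range (the upper bound
# is exclusive, since range() is used). Path and ids are hypothetical.
def _example_fetch_tarballs():
    return tarballs_by_recids('12,15-18,21', '/tmp/tarballs')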
def get_field_values_on_condition(bibrecid, get_table="", get_tag="",
                                  condition_tag="", condition_value="",
                                  condition="==", source="MEM"):
    '''
    Method to fetch data from a record in the database.
    It is possible to specify a condition in order to get
    only certain fields if condition holds.

    Examples:

    In [2]: bibauthorid_utils.get_field_values_on_condition
            (742535, [100, 700], 'u', 'a', 'Mathieu, Vincent')
    Out[2]: set(['UMH, Mons'])

    In [3]: bibauthorid_utils.get_field_values_on_condition
            (742535, [100, 700], 'u', 'a')
    Out[3]: set(['LPSC, Grenoble', 'UMH, Mons'])

    In [9]: bibauthorid_utils.get_field_values_on_condition
            (742535, [100, 700], 'a', 'u', 'UMH, Mons')
    Out[9]: set(['Semay, Claude', 'Mathieu, Vincent'])

    In [4]: bibauthorid_utils.get_field_values_on_condition
            (742535, [100, 700], 'u')
    Out[4]: set(['LPSC, Grenoble', 'UMH, Mons'])

    In [5]: bibauthorid_utils.get_field_values_on_condition(742535,
            [100, 700])
    Out[5]:
    {'100': [([('a', 'Mathieu, Vincent'),
               ('u', 'UMH, Mons'),
               ('i', '4286')], ' ', ' ', '', 3)],
     '700': [([('a', 'Semay, Claude'),
               ('u', 'UMH, Mons'),
               ('i', '4286')], ' ', ' ', '', 4),
             ([('a', 'Silvestre-Brac, Bernard'),
               ('u', 'LPSC, Grenoble'),
               ('i', '2240')], ' ', ' ', '', 5)]}

    In [6]: bibauthorid_utils.get_field_values_on_condition(1)
    Out[6]:
    {'001': [([], ' ', ' ', '1', 1)],
     '035': [([('a', 'Groom:0965xu'), ('9', 'SPIRESTeX')], ' ', ' ', '', 13)],
     '037': [([('a', 'CALT-68-62')], ' ', ' ', '', 3)],
     '100': [([('a', 'Groom, Donald E.'), ('u', 'Caltech'),
               ('i', '981')], ' ', ' ', '', 4)],
     '245': [([('a', 'A GENERAL RANGE ENERGY LIGHT OUTPUT PROGRAM FOR HEP')],
              ' ', ' ', '', 5)],
     '260': [([('c', '0965')], ' ', ' ', '', 7)],
     '269': [([('c', '0965-12-01')], ' ', ' ', '', 6)],
     '300': [([('a', '10')], ' ', ' ', '', 8)],
     '690': [([('a', 'Preprint')], 'C', ' ', '', 2)],
     '961': [([('x', '2007-03-02')], ' ', ' ', '', 10),
             ([('c', '2007-03-02')], ' ', ' ', '', 11)],
     '970': [([('9', 'DESY'), ('a', 'DESY-404799')], ' ', ' ', '', 9),
             ([('a', 'SPIRES-7090030')], ' ', ' ', '', 12)],
     '980': [([('a', 'Citeable')], ' ', ' ', '', 14),
             ([('a', 'CORE')], ' ', ' ', '', 15)]}

    @param bibrecid: The id of the record (bibrec) to get
    @type bibrecid: int
    @param get_table: List of one or more tables to look at
    @type get_table: list or string or int or long
    @param get_tag: The value of this tag shall be returned
    @type get_tag: string
    @param condition_tag: First part of the condition. Provide a tag to
        look up
    @type condition_tag: string
    @param condition_value: Second part of the condition. Provide a value
        that has to be matched
    @type condition_value: string
    @param condition: Optional value to describe the condition.
        Defaults to "==" and may be any comparison

    @return: set of found values, empty set if no value found.
    @rtype: set or dictionary
        (if get_tag, condition_tag and condition_value are empty)
    '''
    rec = None

    if source == "MEM":
        rec = dat.RELEVANT_RECORDS.get(bibrecid)
    elif source == "API":
        rec = get_record(bibrecid)

    if condition_value and isinstance(condition_value, str):
        condition_value = condition_value.decode('utf-8')

    returnset = set()

    if not rec:
        return set()

    if get_table:
        if not isinstance(get_table, list):
            if isinstance(get_table, str):
                get_table = [get_table]
            elif isinstance(get_table, int) or isinstance(get_table, long):
                get_table = [str(get_table)]
            else:
                sys.stderr.write('Error: Wrong table for table selection. '
                                 'Allowed are list of strings, string or '
                                 'int/long values\n')

        for table in get_table:
            if str(table) in rec:
                if table in ["cites", "cited-by"]:
                    return rec[str(table)]

                for recordentries in rec[str(table)]:
                    is_condition = True
                    is_skip_entry = False

                    for field in recordentries[0]:
                        if condition_tag and condition_value:
                            if field[0] == condition_tag:
                                condition_holds = False
                                try:
                                    condition_holds = not eval(
                                        ("field[1].decode('utf-8') %s "
                                         "condition_value") % (condition))
                                except (TypeError, NameError, IndexError):
                                    condition_holds = False

                                if condition_holds:
                                    is_skip_entry = True
                                    is_condition = False
                                    break
                        elif get_tag:
                            if get_tag == field[0]:
                                returnset.add(field[1].decode('utf-8'))
                        else:
                            retlist = {}
                            for table in get_table:
                                try:
                                    retlist[str(table)] = rec[str(table)]
                                except KeyError:
                                    pass
                            return retlist

                    if is_condition and not is_skip_entry:
                        for field in recordentries[0]:
                            if field[0] == get_tag:
                                returnset.add(field[1].decode('utf-8'))

        if len(returnset) == 0:
            returnset = set()

        return returnset
    else:
        return rec
def _get_orcid_dictionaries(papers, personid, old_external_ids, orcid):
    """Yield lists of at most ORCID_SINGLE_REQUEST_WORKS dictionaries which
    can be used in the ORCID library, covering the works of the given person.

    @param papers: list of papers' records ids.
    @type papers: list (tuple(int,))
    @param personid: personid of the person who is requesting the orcid
        dictionary of their works
    @type personid: int
    @param old_external_ids: external identifiers of works already in ORCID
    @type old_external_ids: list
    @param orcid: orcid of the author
    @type orcid: string
    """
    orcid_list = []

    for recid in papers:
        work_dict = {'work_title': {}}

        recstruct = get_record(recid)
        url = CFG_SITE_URL + ('/record/%d' % recid)

        try:
            external_ids = _get_external_ids(recid, url, recstruct,
                                             old_external_ids, orcid)
        except OrcidRecordExisting:
            # We will not push this record, skip it.
            continue

        # There always will be some external identifiers.
        work_dict['work_external_identifiers'] = list(external_ids)

        work_dict['work_title']['title'] = encode_for_jinja_and_xml(
            record_get_field_value(recstruct, '245', '', '', 'a'))

        short_descriptions = record_get_field_values(
            recstruct, '520', '', '', 'a')
        if short_descriptions:
            work_dict['short_description'] = encode_for_jinja_and_xml(
                short_descriptions[0])[:MAX_DESCRIPTION_LENGTH].rsplit(' ', 1)[0]

        journal_title = record_get_field_value(recstruct, '773', '', '', 'p')
        if journal_title:
            work_dict['journal-title'] = encode_for_jinja_and_xml(journal_title)

        citation = _get_citation(recid)
        if citation:
            work_dict['work_citation'] = citation

        work_dict['work_type'] = _get_work_type(recstruct)

        publication_date = _get_publication_date(recstruct)
        if publication_date:
            work_dict['publication_date'] = publication_date

        work_dict['url'] = url

        work_contributors = _get_work_contributors(recid, personid)
        if len(work_contributors) > 0:
            work_dict['work_contributors'] = work_contributors

        work_source = record_get_field_value(recstruct, '359', '', '', '9')
        if work_source:
            # initialise the nested dictionary before filling it
            work_dict['work_source'] = {
                'work-source': encode_for_jinja_and_xml(work_source)
            }

        language = record_get_field_value(recstruct, '041', '', '', 'a')
        if language:
            # If we understand the language we map it to ISO 639-2
            language = LANGUAGE_MAP.get(language.lower().strip())
            if language:
                work_dict['language_code'] = encode_for_jinja_and_xml(language)
        else:
            work_dict['language_code'] = 'en'

        work_dict['visibility'] = 'public'
        orcid_list.append(work_dict)
        bibtask.write_message("Pushing " + str(recid))

        if len(orcid_list) == ORCID_SINGLE_REQUEST_WORKS:
            bibtask.write_message("I will push " +
                                  str(ORCID_SINGLE_REQUEST_WORKS) +
                                  " records to ORCID.")
            yield orcid_list
            orcid_list = []

    if len(orcid_list) > 0:
        # empty message might be invalid
        bibtask.write_message("I will push last " + str(len(orcid_list)) +
                              " records to ORCID.")
        yield orcid_list
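# Sketch of how the generator above is meant to be consumed. The
# push_works_to_orcid() call is a hypothetical stand-in for the real ORCID
# client; papers, personid and orcid come from the caller:
def _demo_push_to_orcid(papers, personid, orcid):
    old_external_ids = []
    for chunk in _get_orcid_dictionaries(papers, personid,
                                         old_external_ids, orcid):
        # each chunk holds at most ORCID_SINGLE_REQUEST_WORKS work dicts
        push_works_to_orcid(orcid, chunk)  # hypothetical client call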
def countries_by_publishers(req):
    req.content_type = "text/html"
    print >> req, pageheaderonly("Countries/publishers", req=req)

    ############
    ## PART 1 ##
    # journals = []
    # for pub in CFG_JOURNALS:
    #     ids = perform_request_search(cc=pub)
    #     journals.append((pub, ids))
    # journals.append(("older_than_2014",
    #                  perform_request_search(cc='older_than_2014')))
    # countries = []
    # for country in sorted(set(NATIONS_DEFAULT_MAP.itervalues())):
    #     ids = perform_request_search(p="country:%s" % (country,)) \
    #         + perform_request_search(cc='older_than_2014',
    #                                  p="country:%s" % (country,))
    #     countries.append((country, ids))

    req.write("<h1>Number of articles per country per journal</h1>")
    req.write("<h2>Minimum one author from the country</h2>")
    req.flush()
    req.write("<table>\n")
    req.write("<tr><th rowspan=2>Country</th>"
              "<th colspan=10>Journals</th><th>Other</th></tr>")
    req.write("""<tr>
<td>Acta</td>
<td>Advances in High Energy Physics</td>
<td>Chinese Physics C</td>
<td>European Physical Journal C</td>
<td>Journal of Cosmology and Astroparticle Physics</td>
<td>Journal of High Energy Physics</td>
<td>New Journal of Physics</td>
<td>Nuclear Physics B</td>
<td>Physics Letters B</td>
<td>Progress of Theoretical and Experimental Physics</td>
<td>older_than_2014</td></tr>""")
    for country in sorted(set(NATIONS_DEFAULT_MAP.itervalues())):
        req.write("<tr><td>%s</td>" % (country,))
        for pub in CFG_JOURNALS + ["older_than_2014"]:
            # number of matching records, not the list of record ids
            req.write("<td>%s</td>" % len(perform_request_search(
                p="country:%s" % (country,), cc=pub)))
        req.write("</tr>")
    req.write('</table>')

    ############
    ## PART 2 ##
    hitcount = {}
    for pub in CFG_JOURNALS + ["older_than_2014"]:
        ids = perform_request_search(cc=pub)
        hitcount[pub] = {}
        for country in sorted(set(NATIONS_DEFAULT_MAP.itervalues())):
            hitcount[pub][country] = 0
        for id in ids:
            record = get_record(id)
            countries = set(record_get_field_values(record, '700', '%', '%', 'w')
                            + record_get_field_values(record, '100', '%', '%', 'w'))
            if len(countries) == 1:
                c = countries.pop()
                if c in set(NATIONS_DEFAULT_MAP.itervalues()):
                    hitcount[pub][c] += 1

    req.write("<h1>Number of articles per country per journal</h1>")
    req.write("<h2>All authors from the country</h2>")
    req.flush()
    req.write("<table>\n")
    req.write("<tr><th rowspan=2>Country</th>"
              "<th colspan=10>Journals</th><th>Other</th></tr>")
    req.write("""<tr>
<td>Acta</td>
<td>Advances in High Energy Physics</td>
<td>Chinese Physics C</td>
<td>European Physical Journal C</td>
<td>Journal of Cosmology and Astroparticle Physics</td>
<td>Journal of High Energy Physics</td>
<td>New Journal of Physics</td>
<td>Nuclear Physics B</td>
<td>Physics Letters B</td>
<td>Progress of Theoretical and Experimental Physics</td>
<td>older_than_2014</td></tr>""")
    for country in sorted(set(NATIONS_DEFAULT_MAP.itervalues())):
        req.write("<tr><td>%s</td>" % (country,))
        for pub in CFG_JOURNALS + ["older_than_2014"]:
            req.write("<td>%s</td>" % hitcount[pub][country])
        req.write("</tr>")
    req.write('</table>')

    req.write(pagefooteronly(req=req))
    return ""
def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Returns a dictionary of keywordToken objects from the marc record.
    Weight is set to (0,0) if no weight can be found.

    This will load keywords from the field 653 and 695__a (which are
    the old 'DESY' keywords).

    @param record: int or marc record; if int, the marc record is loaded
        from the database. If you pass a record instance, keywords are
        extracted from it
    @return: tuple (found, keywords, marcrec)
        found - int indicating how many main_field keywords were found;
            the other fields are not counted
        keywords - standard dictionary of keywordToken objects
        marcrec - marc record object loaded with data
    """
    keywords = {}
    if isinstance(main_field, basestring):
        main_field = [main_field]
    if isinstance(others, basestring):
        others = [others]

    if isinstance(record, int):
        rec = get_record(record)
    else:
        rec = record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = bibclassify_engine._parse_marc_code(m_field)
        for field in rec.get(tag, []):
            keyword = ''
            weight = 0
            type = ''
            for subfield in field[0]:
                if subfield[0] == 'a':
                    keyword = subfield[1]
                elif subfield[0] == 'n':
                    weight = int(subfield[1])
                elif subfield[0] == '9':
                    type = subfield[1]
            if keyword:
                found += 1
                keywords[bor.KeywordToken(keyword, type=type)] = \
                    [[(0, 0) for x in range(weight)]]

    if others:
        for field_no in others:
            tag, ind1, ind2 = bibclassify_engine._parse_marc_code(field_no)
            type = 'f%s' % field_no
            for field in rec.get(tag, []):
                keyword = ''
                for subfield in field[0]:
                    if subfield[0] == 'a':
                        keyword = subfield[1]
                        keywords[bor.KeywordToken(keyword, type=type)] = \
                            [[(0, 0)]]
                        break

    return found, keywords, rec
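# Minimal usage sketch (recid 12 is an arbitrary example and a configured
# Invenio database is assumed); `found` counts only main-field keywords:
def _demo_record_get_keywords():
    found, keywords, rec = record_get_keywords(12)
    print "%d main-field keywords found" % found
    for token, weights in keywords.iteritems():
        print token, weights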
def write_csv(req, dictionary, journal_list, f_date, t_date,
              created_or_modified_date):
    return_val = ''

    for key in journal_list:
        val = dictionary[key]
        papers = perform_request_search(
            p="date%s:%s->%s" % (created_or_modified_date, f_date, t_date),
            c=val)
        if not papers:
            continue

        return_val += key + '\n'
        return_val += ';'.join(['recid', 'cr. date', 'mod. date', 'DOI',
                                'XML', 'PDF', 'PDF/A', 'Complete record?',
                                'arXiv number', 'Copyright: authors', 'CC-BY',
                                'Funded by SCOAP3', 'Category', 'notes',
                                'First delivery', 'First AB delivery',
                                'Last modification', 'PDF/A upload',
                                'DOI registration', 'Delivery diff',
                                'PDF/A diff']) + '\n'

        for recid in papers:
            rec = get_record(recid)
            doi = get_doi(rec)
            first_del = None
            first_ab_del = None
            last_mod = None
            doi_reg = None
            pdfa_del = None
            delivery_data = run_sql("SELECT doi.creation_date AS 'doi_reg', "
                                    "package.name AS 'pkg_name', "
                                    "package.delivery_date AS 'pkg_delivery' "
                                    "FROM doi_package "
                                    "LEFT JOIN doi ON doi_package.doi=doi.doi "
                                    "LEFT JOIN package "
                                    "ON package.id=doi_package.package_id "
                                    "WHERE doi_package.doi=%s "
                                    "ORDER BY package.delivery_date ASC",
                                    (doi,), with_dict=True)
            if delivery_data:
                first_del = delivery_data[0]['pkg_delivery']
                first_ab_del = get_delivery_of_firts_ab_package(delivery_data)
                last_mod = delivery_data[-1]['pkg_delivery']
                doi_reg = delivery_data[0]['doi_reg']
                pdfa_del = get_delivery_of_firts_pdfa(delivery_data)

            record_compl = is_complete_record(recid)
            return_val += ';'.join(str(item) for item in [
                recid,
                get_creation_date(recid),
                get_modification_date(recid),
                doi,
                has_or_had_format(recid, '.xml').lstrip('<b>').rstrip('</b>'),
                has_or_had_format(recid, '.pdf').lstrip('<b>').rstrip('</b>'),
                has_or_had_format(recid, '.pdf;pdfa').lstrip('<b>').rstrip('</b>'),
                check_complete_rec(record_compl),
                get_arxiv(rec).lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'authors').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'cc').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'scoap3').lstrip('<b>').rstrip('</b>'),
                is_compliant(recid, 'category').lstrip('<b>').rstrip('</b>'),
                [rec_key for rec_key, rec_val in record_compl.iteritems()
                 if not rec_val],
                first_del,
                first_ab_del,
                last_mod,
                pdfa_del,
                doi_reg,
                check_24h_delivery(first_ab_del, doi_reg),
                check_24h_delivery(pdfa_del, doi_reg),
            ])
            return_val += '\n'

    return return_val
def get_record_checks(req, recids):
    if recids == '':
        return ''

    recids = recids.split(',')
    return_val = []
    for rid in recids:
        try:
            recid = int(rid)
            rec = get_record(recid)
            doi = get_doi(rec)
            delivery_data = run_sql("SELECT doi.creation_date AS 'doi_reg', "
                                    "package.name AS 'pkg_name', "
                                    "package.delivery_date AS 'pkg_delivery' "
                                    "FROM doi_package "
                                    "LEFT JOIN doi ON doi_package.doi=doi.doi "
                                    "LEFT JOIN package "
                                    "ON package.id=doi_package.package_id "
                                    "WHERE doi_package.doi=%s "
                                    "ORDER BY package.delivery_date ASC",
                                    (doi,), with_dict=True)

            first_del = None
            first_ab_del = None
            last_mod = None
            doi_reg = None
            pdfa_del = None
            if delivery_data:
                first_del = delivery_data[0]['pkg_delivery']
                first_ab_del = get_delivery_of_firts_ab_package(delivery_data)
                last_mod = delivery_data[-1]['pkg_delivery']
                doi_reg = delivery_data[0]['doi_reg']
                pdfa_del = get_delivery_of_firts_pdfa(delivery_data)

            record_compl = is_complete_record(recid)
            return_val.append("""<tr>
                <td><a href="%s">%i</a></td>
                <td>%s</td>
                <td>%s</td>
                <td><a href="http://dx.doi.org/%s">%s</a></td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td>%s</td>
                <td %s>%s</td>
                <td %s>%s</td>
                </tr>""" % (join(CFG_SITE_URL, 'record', str(recid)),
                            recid,
                            get_creation_date(recid),
                            get_modification_date(recid),
                            doi, doi,
                            has_or_had_format(recid, '.xml'),
                            has_or_had_format(recid, '.pdf'),
                            has_or_had_format(recid, '.pdf;pdfa'),
                            check_complete_rec(record_compl),
                            get_arxiv(rec),
                            is_compliant(recid, "authors"),
                            is_compliant(recid, "cc"),
                            is_compliant(recid, "scoap3"),
                            is_compliant(recid, "category"),
                            str([rec_key for rec_key, rec_val
                                 in record_compl.iteritems() if not rec_val]),
                            str(first_del),
                            str(first_ab_del),
                            str(last_mod),
                            str(pdfa_del),
                            str(doi_reg),
                            format_24h_delivery(check_24h_delivery(first_del,
                                                                   doi_reg)),
                            check_24h_delivery(first_del, doi_reg),
                            format_24h_delivery(check_24h_delivery(pdfa_del,
                                                                   doi_reg)),
                            check_24h_delivery(pdfa_del, doi_reg)))
        except Exception:
            # rid was not a usable record id (or formatting failed): emit it
            # as a section heading followed by fresh column headers
            register_exception()
            recid = rid
            return_val.append("""<tr><th colspan="21" align="left">
                <h2>%s</h2></th></tr>""" % (recid,))
            return_val.append("""<tr>
                <th>recid</th>
                <th>cr. date</th>
                <th>mod. date</th>
                <th>DOI</th>
                <th>XML</th>
                <th>PDF</th>
                <th>PDF/A</th>
                <th>Complete record?</th>
                <th>arXiv number</th>
                <th>Copyright: authors</th>
                <th>CC-BY</th>
                <th>Funded by SCOAP3</th>
                <th>Category</th>
                <th>notes</th>
                <th>First delivery</th>
                <th>First AB delivery</th>
                <th>Last modification</th>
                <th>PDF/A upload</th>
                <th>DOI registration</th>
                <th>Delivery diff</th>
                <th>PDF/A diff</th>
                </tr>""")
    return ''.join(return_val)
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record

    @param reference_prefix: a prefix displayed before each reference
    @param reference_suffix: a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=1, repeatable_subfields_p=True)
    out = ""
    last_o = ""

    if not references:
        return out

    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')

        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []

        if reference.has_key('o') and not reference['o'][0] == last_o:
            temp_ref = reference['o'][0].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")

        if reference_prefix:
            ref_out.append(reference_prefix)

        if reference.has_key('s'):
            display_journal = reference['s'][0]
            clean_journal = reference['s'][0]
        if reference.has_key('r'):
            if "[" in reference['r'][0] and "]" in reference['r'][0]:
                breaknum = reference['r'][0].find('[')
                newreference = reference['r'][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r'][0]
                clean_report = reference['r'][0]

        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits) != 1:
            hits = search_unit(p=reference['a'][0])
        if reference.has_key('0') and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference['0'][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it
                    # directly to the "hits" variable, so it will be
                    # handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass

        if len(hits) == 1:
            ref_out.append('<small>' +
                           format_record(list(hits)[0], 'hs') + '</small>')
        else:
            if reference.has_key('h'):
                ref_out.append("<small> " + reference['h'][0] + ".</small>")
            if reference.has_key('t'):
                ref_out.append("<small> " + reference['t'][0] + "</small> -")
            if reference.has_key('y'):
                ref_out.append("<small> " + reference['y'][0] + ".</small>")
            if reference.has_key('p'):
                ref_out.append("<small> " + reference['p'][0] + ".</small>")
            if reference.has_key('m'):
                ref_out.append("<small> " +
                               reference['m'][0].replace(']]', ']') +
                               ".</small>")
            if reference.has_key('a'):
                ref_out.append("<small> <a href=\"http://dx.doi.org/" +
                               reference['a'][0] + "\">" +
                               reference['a'][0] + "</a></small>")
            if reference.has_key('u'):
                ref_out.append("<small> <a href=" + reference['u'][0] +
                               ">" + reference['u'][0] + "</a></small>")
            if reference.has_key('i'):
                for r in reference['i']:
                    ref_out.append("<small> <a href=\"/search?ln=en&p=020__a%3A"
                                   + r + "\">" + r + "</a></small>")

            ref_out.append('<small>')
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(' ' + display_report)
            ref_out.append("</small>")

        if reference_suffix:
            ref_out.append(reference_suffix)

        ref_out.append("</td></tr>")
        out += ' '.join(ref_out)

    return out + "</table>"
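# For orientation, each element returned by bfo.fields("999C5", ...) is a
# dict of repeatable subfields. A sketch of the shape this element consumes
# (the concrete values below are invented examples):
#   {'o': ['1'],                           # reference number / label
#    'h': ['Maldacena, J.'],               # authors
#    's': ['Adv.Theor.Math.Phys.,2,231'],  # journal string
#    'r': ['hep-th/9711200'],              # report number
#    'a': ['10.1023/A:1026654312961']}     # DOI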
def main(input_file, dry_run, output_dir):
    # Ensure we have data to update first
    _print_out("--------------- Fetching current data ---------------")
    current_record_ids = perform_request_search(p=SEARCH_TERM)
    _print_out(str(len(current_record_ids)) +
               " records found matching search term \"" + SEARCH_TERM + "\"")
    _print_verbose("Record IDs found: " + str(current_record_ids))

    current_records = {}  # Struct {'recid': (record)}
    bad_record_ids = []
    # We don't need the records for new PDG data, they are appended
    for recid in current_record_ids:
        record = get_record(recid)
        if '084' not in record:
            bad_record_ids.append(str(recid))
            _print_out("WARNING: No 084 in fetched record %s" % (str(recid),))
        else:
            current_records[recid] = record
    if len(bad_record_ids) > 0:
        _print_out("WARNING: Bad record IDs found! Printing to file")
        write_list_to_file(output_dir, "bad_record_ids", bad_record_ids)

    _print_out("--------------- Input Parsing ---------------")
    new_lines = get_lines_from_file(input_file)
    new_pdg_data = {}  # Struct {'recid': [pdg_data]}
    lines_missing = []
    lines_ambiguous = []
    lines_invalid = []
    _print_out("Finding records from input file")
    for i, line in enumerate(new_lines):
        status, r_id, data = parse_pdg_line(line)
        if status is ParseResult.Success:
            new_pdg_data[r_id] = data
            _print_verbose("line #" + str(i) + ": Success! Record ID " +
                           str(r_id) + " found for line " + line)
        elif status is ParseResult.Invalid:
            lines_invalid.append(line)
            _print_verbose("line #" + str(i) + ": Invalid line: " + line)
        elif status is ParseResult.Missing:
            lines_missing.append(line)
            _print_verbose("line #" + str(i) + ": Missing line: " + line)
        elif status is ParseResult.Ambiguous:
            lines_ambiguous.append(line)
            _print_verbose("line #" + str(i) + ": Ambiguous line: " + line)

    _print_out("--------------- Matching records ---------------")
    _print_out("Records matched to PDG data (valid): " + str(len(new_pdg_data)))
    _print_out("Missing records (not found): " + str(len(lines_missing)))
    _print_out("Ambiguous (multiple results): " + str(len(lines_ambiguous)))
    _print_out("Invalid lines (Dodgy data): " + str(len(lines_invalid)))

    if lines_missing:
        write_list_to_file(output_dir, "missing-records.txt", lines_missing)
    if lines_ambiguous:
        write_list_to_file(output_dir, "ambiguous-records.txt", lines_ambiguous)
    if lines_invalid:
        write_list_to_file(output_dir, "invalid-lines.txt", lines_invalid)

    # These lists contain record IDs of records to have PDG data either:
    #  - add, the PDG data should be appended (record was added to PDG)
    #  - compare, the PDG data should be compared for possible correction
    #  - delete, the PDG data should be removed (record was removed from PDG)
    ids_add = list(set(new_pdg_data.keys()) - set(current_record_ids))
    ids_compare = list(set(current_record_ids) & set(new_pdg_data.keys()))
    ids_delete = list(set(current_record_ids) - set(new_pdg_data.keys()))
    # At this point all rec IDs should be valid!

    _print_out("--------------- Update ---------------")
    appends, updates, deletions = None, None, None
    # Now, cycle through the lists
    if len(ids_add) > 0:
        appends = create_new_pdg_fields(ids_add, new_pdg_data)
    else:
        _print_out("No new fields to append.")
    if len(ids_compare) > 0:
        updates = check_existing_pdg_fields(ids_compare, new_pdg_data,
                                            current_records)
    else:
        _print_out("No duplicate records to compare.")
    if len(ids_delete) > 0:
        deletions = remove_pdg_fields(ids_delete, current_records)
    else:
        _print_out("No fields in records to be deleted.")

    _print_out("--------------- Writing Changes ---------------")
    if appends is not None:
        write_records_to_file(output_dir, "append.xml", appends, dry_run)
    else:
        _print_out("No records to append to.")
    if updates is not None:
        write_records_to_file(output_dir, "correct.xml", updates, dry_run)
    else:
        _print_out("No records to correct.")
    if deletions is not None:
        write_records_to_file(output_dir, "delete.xml", deletions, dry_run)
    else:
        _print_out("No records to delete from.")
def task_run_core(name=NAME):
    """
    Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks
    """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(perform_request_search(
            p='-035:spirestex -035:inspiretex', cc='HEP'))

        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s already has texkey %s" %
                              (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message("WARNING: Record %s has no first author or "
                              "collaboration" % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
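# task_run_core is a BibTask entry point. A minimal sketch of the launcher
# that would wire it in (the authorization action name is an assumption, and
# the real module may pass further options to bibtask.task_init):
def launch():
    bibtask.task_init(authorization_action='runtexkey',
                      task_run_fnc=task_run_core)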
        if extract.get('year', False):
            subfields.append(('y', str(extract['year'])))
        if extract.get('page', False):
            subfields.append(('c', str(extract['page'])))
        new_field = create_field(subfields, global_position=field[4])
        record_replace_field(record, '773', new_field, field[4])
        break

    if not recid or recid == -1:
        # Record (probably) does not exist, flag for inserting into database
        # FIXME: Add some automatic deny/accept parameters,
        # perhaps also a bibmatch call
        insert_records.append(record)
    else:
        # Record exists, fetch existing record
        existing_record = get_record(recid)
        if existing_record is None:
            # Did not find existing record in database
            holdingpen_records.append(record)
            continue

        # We remove the 500 field temporary/brief entry from the revision
        # if the record already exists
        fields_500 = record_get_field_instances(record, '500',
                                                ind1="%", ind2="%")
        if fields_500 is not None:
            field_positions = []
            for field in fields_500:
                subfields = field_get_subfield_instances(field)
                for subfield in subfields:
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", checked=True, displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier
        for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page

    @returns: record formatted to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)

    textmarc_options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                        "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                        "text-marc": 1}

    # skip records that are not on the page being displayed
    if displayed_records is not None and record_id not in displayed_records:
        return

    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(
        old_record, sysno="", options=textmarc_options)

    if "hm" == output_format:
        if update_commands and checked:
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(
                updated_record, sysno="", options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc,
                                      outputTags, record_id)
        else:
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or \
                        line.split()[0].replace('_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None,
                                         of=output_format,
                                         xml_record=xml_record,
                                         ln=language)
    return result