def get_all_recids():
    """Return all relevant record IDs.

    The result is the union of the record lists of the collections that
    matter for the current site: the site collection plus "Conferences" on
    an INSPIRE site, the site collection plus the CERN-specific collections
    on a CERN site, and an empty intbitset on any other site.
    """
    if CFG_INSPIRE_SITE:
        collection_names = (CFG_SITE_NAME, "Conferences")
    elif CFG_CERN_SITE:
        collection_names = (
            CFG_SITE_NAME,
            "CERN Articles & Preprints",
            "CERN Series",
            "CERN Departments",
            "CERN Experiments",
            "CERN R&D Projects",
        )
        # We exclude all records that is not relevant for CERN/CDS
        # all_recids = all_recids & search_pattern(p='690c:CERN or 595:cds')
        # We exclude all records with an existing INSPIRE ID.
        # all_recids = all_recids - search_pattern(p='035:INSPIRE')
    else:
        collection_names = ()

    # Build the union on a fresh set so no collection reclist is modified.
    all_recids = intbitset()
    for collection_name in collection_names:
        all_recids = all_recids | get_collection_reclist(collection_name)
    return all_recids
def fetch_concerned_records(name):
    """Return the (record id, date) rows this task run has to process.

    With the 'new' task option set, the rows come from `bibdocfsinfo`
    joined with `bibrec_bibdoc`: every PDF-format row whose `cd` value is
    later than the date returned by fetch_last_updated(name), ordered by
    `cd`.  Otherwise the record ids given through the 'recids' option plus
    the record lists of the 'collections' option are looked up in `bibrec`
    and returned as (id, NULL) rows ordered by id.

    @param name: name passed to fetch_last_updated() to find the last run
    @return: rows as returned by run_sql, or [] when nothing was requested
    """
    task_update_progress("Fetching record ids")
    # Only the date of the last run is used by this variant.
    dummy, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all PDF rows with a `cd` later than the last run
        sql = """SELECT `id_bibrec`, `cd` FROM `bibdocfsinfo`
                 INNER JOIN `bibrec_bibdoc`
                 ON `bibdocfsinfo`.`id_bibdoc` = `bibrec_bibdoc`.`id_bibdoc`
                 WHERE `cd` > %s
                 AND format IN ('.pdf', '.PDF', '.pdf;pdfa', '.PDF;pdfa')
                 ORDER BY `cd`"""
        records = run_sql(sql, [last_date.isoformat()])
    else:
        # Work on a copy so the stored task option is not modified, and
        # tolerate options that were never set.
        given_recids = set(task_get_option('recids') or ())
        # NOTE(review): the entries of 'collections' are passed to
        # get_collection_reclist() exactly as the original code did --
        # confirm that the option really holds collection names.
        for collection in task_get_option('collections') or ():
            # update(), not add(): every record id of the collection must
            # become its own element, otherwise the whole reclist ends up
            # as a single SQL parameter.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql(
                """SELECT `id`, NULL FROM `bibrec`
                   WHERE `id` IN (%s) ORDER BY `id`""" % format_strings,
                list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")
    return records
def tokenize_for_phrases(self, recID):
    """Get the country names and country codes of the institutions
    affiliated with the authors of the publication.

    @param recID: record whose institution tags are read
    @return: list of country tokens without duplicates (order not defined)
    """
    # Get the names of the affiliated institutions, ignoring blank values:
    # a blank name carries no information and would only produce a search
    # with an empty pattern.
    institution_names = []
    for tag in self.institution_tags:
        institution_names += [
            name for name in get_fieldvalues(recID, tag) if name.strip()
        ]
    if not institution_names:
        # Nothing to look up, so there are no tokens either.
        return []

    # Get the hitset of all the institutes
    institution_collection_hitset = intbitset([])
    for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        institution_collection_hitset += get_collection_reclist(collection)

    # Search for the institution name and get a list of institution ids
    institution_ids = intbitset([])
    for name in institution_names:
        result_hitset = search_pattern(p=name, f=self.institution_name_field)
        # Keep only hits that really are institution records.
        institution_hitset = result_hitset & institution_collection_hitset
        institution_ids += list(institution_hitset)

    # Get the country tokens
    tokens = []
    for instID in institution_ids:
        tokens += self._tokenize_from_country_name_tag(instID)
        tokens += self._tokenize_from_country_code_tag(instID)

    # Remove duplicates
    return list(set(tokens))
def create_collection_bibrec(table_name, coll_name, step_size=10000, max_size=-1):
    """Create a bibrec-like table holding the rows of one collection.

    The table is (re)created from the `SHOW CREATE TABLE bibrec` statement
    and filled, `step_size` ids per INSERT, with the bibrec rows of the
    records in `coll_name`, highest record id first.

    @param table_name: name of the table to create; must start with '_'
    @param coll_name: name of the collection whose records are copied
    @param step_size: number of record ids per INSERT statement
    @param max_size: copy at most this many records; values <= 0 mean
        "no limit"
    @raise Exception: if `table_name` does not start with '_'
    """
    if not table_name.startswith('_'):
        raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important")

    escaped_name = dbquery.real_escape_string(table_name)
    create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', escaped_name)
    dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % escaped_name)
    dbquery.run_sql(create_stmt)

    # now retrieve the collection, reverse sorted
    recids = sorted(search_engine.get_collection_reclist(coll_name), reverse=True)
    if not recids:
        sys.stderr.write("The collection %s is empty!\n" % coll_name)

    # Cap the ids up front: the copy can then neither overshoot max_size
    # nor run past the end of the list (which would build an empty IN ()).
    if max_size > 0:
        to_copy = recids[:max_size]
    else:
        to_copy = recids

    sys.stderr.write("Copying bibrec data, patience please...\n")
    for start in range(0, len(to_copy), step_size):
        chunk = to_copy[start:start + step_size]
        dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" %
                        (escaped_name, ','.join(map(str, chunk))))

    sys.stderr.write("Total number of records: %s Copied: %s\n" % (len(recids), len(to_copy)))
def late(req):
    """Write an HTML report comparing DOI registration and arrival dates.

    For every journal in CFG_JOURNALS a table is written to `req` with one
    row per record of the journal collection: DOI (linked to dx.doi.org),
    title, creation date of the DOI in the `doi` table and creation date of
    the record in `bibrec`.  Rows are coloured by the difference in days
    between the two dates: green when negative, orange when below one day,
    red otherwise, grey when the DOI has no entry in the `doi` table.

    @param req: request object the page is written to
    """
    req.content_type = "text/html"
    print >> req, pageheaderonly("Late journals", req=req)
    for journal in CFG_JOURNALS:
        print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal))
        results = get_collection_reclist(journal)
        print >> req, "<table>"
        print >> req, "<tr><th>DOI</th><th>Title</th><th>DOI registration</th><th>Arrival in SCOAP3</th></tr>"
        for recid in results:
            creation_date = run_sql("SELECT creation_date FROM bibrec WHERE id=%s", (recid, ))[0][0]
            record = get_record(recid)
            doi = record_get_field_value(record, '024', '7', code='a')
            # NOTE(review): the title is written unescaped, as before;
            # confirm whether it may contain intentional markup.
            title = record_get_field_value(record, '245', code='a')
            doi_date = run_sql("SELECT creation_date FROM doi WHERE doi=%s", (doi, ))
            background = "#eee"
            if doi_date:
                doi_date = doi_date[0][0]
                delay_days = (creation_date - doi_date).days
                if delay_days < 0:
                    background = "#66FF00"
                elif delay_days < 1:
                    background = "#FF6600"
                else:
                    background = "#FF0000"
            else:
                doi_date = ''
            # The anchor is now closed; it used to swallow the rest of the row.
            print >> req, '<tr style="background-color: %s;"><td><a href="http://dx.doi.org/%s" target="_blank">%s</a></td><td>%s</td><td>%s</td><td>%s</td></tr>' % (
                background, escape(doi, True), escape(doi), title, doi_date, creation_date)
        print >> req, "</table>"
def find_records(collection, subfields):
    """
    Find records with VOLATILE content.

    Every value of every tagiic in `subfields` is searched with an exact
    match and the hits are restricted to the given collection.

    @param collection: collection to be checked
    @type collection: string
    @param subfields: VOLATILE content in tagiic
    @type subfields: dict
    @return: dict {recid: array of tagiic}
    """
    ordered_tags = sorted(subfields)
    recs_collection = get_collection_reclist(collection)
    recs_to_change = {}
    for tagiic in ordered_tags:
        for value in subfields[tagiic]:
            result = search_pattern(p=value, f=tagiic, m='e') & recs_collection
            if not result:
                continue
            write_message('Update %i records with %s:"%s" -- %s'
                          % (len(result), tagiic, value, list(result)))
            for recid in result:
                # One entry per matching value, so a tagiic may repeat.
                recs_to_change.setdefault(recid, []).append(tagiic)
    return recs_to_change
def unlinked(req):
    """
    Return a page listing the "useful" person profiles that are not linked.

    A person id is considered useful when it has any `extid:` tag in
    aidPERSONIDDATA or a row with flag=2 in aidPERSONIDPAPERS.  Person ids
    that already have an `extid:INSPIREID` tag are left out, and so are
    those whose canonical name matches (case-insensitively, ignoring
    surrounding whitespace) a 035__a value of a record in the HepNames
    collection.

    @param req: request object passed on to page()
    @return: the result of page() titled "Unlinked useful BAIs"
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist

    useful_personids1 = intbitset(run_sql("SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"))
    useful_personids2 = intbitset(run_sql("SELECT distinct personid from aidPERSONIDPAPERS where flag=2"))
    linked_personids = intbitset(run_sql("SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'"))
    names = dict(run_sql("SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"))
    # A set keeps the membership test in the loop below constant-time.
    matched_names = set(
        name.lower().strip()
        for name in get_fieldvalues(get_collection_reclist('HepNames'), '035__a'))
    personid_to_match = (useful_personids1 | useful_personids2) - linked_personids

    site_url = escape(CFG_SITE_SECURE_URL, True)
    body = ['<ol>']
    for personid in personid_to_match:
        # Fall back to the numeric id when there is no canonical name.
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append('<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>' % {
            'siteurl': site_url,
            'bai': escape(name, True)})
    body.append('</ol>')
    body = '\n'.join(body)
    return page(req=req, body=body, title="Unlinked useful BAIs")
def fetch_concerned_records(name):
    """Return the (record id, date) rows this task run has to process.

    Depending on the task options the rows are:
      - 'new': bibrec rows created on or after the last run date with an
        id greater than the last processed id, ordered by creation date;
      - 'modified': the same selection on the modification date;
      - otherwise: the ids given through the 'recids' option plus the
        record lists of the 'collections' option, as (id, NULL) rows
        ordered by id.

    @param name: name passed to fetch_last_updated() to find the last run
    @return: rows as returned by run_sql, or [] when nothing was requested
    """
    task_update_progress("Fetching record ids")
    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = ("SELECT `id`, `creation_date` FROM `bibrec` "
               "WHERE `creation_date` >= %s "
               "AND `id` > %s "
               "ORDER BY `creation_date`")
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = ("SELECT `id`, `modification_date` FROM `bibrec` "
               "WHERE `modification_date` >= %s "
               "AND `id` > %s "
               "ORDER BY `modification_date`")
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        # Work on a copy so the stored task option is not modified, and
        # tolerate options that were never set.
        given_recids = set(task_get_option('recids') or ())
        # NOTE(review): the entries of 'collections' are passed to
        # get_collection_reclist() exactly as the original code did --
        # confirm that the option really holds collection names.
        for collection in task_get_option('collections') or ():
            # update(), not add(): every record id of the collection must
            # become its own element, otherwise the whole reclist ends up
            # as a single SQL parameter.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                              "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                              list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")
    return records
def eprints(): total = 0 fermilab = get_collection_reclist('Fermilab') print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%') date_range = ['1904', '1905', '1906'] #date_range = range(1, 20) for yymm in date_range: yymm = str(yymm) if len(yymm) == 1: yymm = '0' + yymm search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \ yymm + '*"' search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"' x = perform_request_search(p=search, cc='HEP') search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"' y = perform_request_search(p=search, cc='HEP') x_f = intbitset(x) & fermilab y_f = intbitset(y) & fermilab length = len(x) + len(y) length_f = len(x_f) + len(y_f) try: ratio = float(length_f) / float(length) * 100.0 except ZeroDivisionError: ratio = 0 print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f, ratio) total += length print "Total =", total
def create_collection_bibrec(table_name, coll_name, step_size=10000, maxsize=None): if table_name[0] != '_': raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important") create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', dbquery.real_escape_string(table_name)) dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % dbquery.real_escape_string(table_name)) dbquery.run_sql(create_stmt) print create_stmt #now retrieve the collection c = search_engine.get_collection_reclist(coll_name) if len(c) < 0: sys.stderr.write("The collection %s is empty!\n" % coll_name) else: print 'collection has x recs:', len(c) c = list(c) l = len(c) i = 0 sys.stderr.write("Copying bibrec data\n") while i < l: dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" % (dbquery.real_escape_string(table_name), ','.join(map(str, c[i:i+step_size])))) i = i + step_size sys.stderr.write("%s\n" % i) if (maxsize and i > maxsize): break sys.stderr.write("Total number of records: %s\n" % l)
def tokenize_for_phrases(self, recID):
    """Return the country tokens of the institutions tied to a record.

    The institution names are read from the record's institution tags,
    each non-blank name is searched in the institution name field, hits
    outside the institution collections are dropped and the country name
    and country code tokens of the remaining institutions are collected.

    @param recID: record whose institution tags are read
    @return: list of tokens without duplicates (order not defined)
    """
    # Names of the affiliated institutions
    names = []
    for institution_tag in self.institution_tags:
        names.extend(get_fieldvalues(recID, institution_tag))

    # Hitset of all the institutes
    all_institutions = intbitset([])
    for collection_name in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        all_institutions += get_collection_reclist(collection_name)

    # Ids of the institutions matching one of the names
    matched_ids = intbitset([])
    for institution_name in names:
        if not institution_name.strip():
            continue
        hits = search_pattern(p=institution_name,
                              f=self.institution_name_field)
        matched_ids += list(hits & all_institutions)

    # Country tokens of every matched institution
    collected = []
    for institution_id in matched_ids:
        collected.extend(self._tokenize_from_country_name_tag(institution_id))
        collected.extend(self._tokenize_from_country_code_tag(institution_id))

    # Remove duplicates
    return list(set(collected))
def eprints(): total = 0 fermilab = get_collection_reclist('Fermilab') print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%') date_range = ['1901', '1902', '1903'] date_range = range(1, 20) for yymm in date_range: yymm = str(yymm) if len(yymm) == 1: yymm = '0' + yymm search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \ yymm + '*"' search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"' x = perform_request_search(p=search, cc='HEP') search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"' y = perform_request_search(p=search, cc='HEP') x_f = intbitset(x) & fermilab y_f = intbitset(y) & fermilab length = len(x) + len(y) length_f = len(x_f) + len(y_f) try: ratio = float(length_f)/float(length)*100.0 except ZeroDivisionError: ratio = 0 print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f, ratio) total += length print "Total =", total
def fetch_records_missing_arxiv_fulltext():
    """
    Return the ids of the records lacking their arXiv fulltext.

    These are the non-deleted records of the HEP collection that carry an
    arXiv identifier (035__9) minus the records reported by
    fetch_records_with_arxiv_fulltext().
    """
    arxiv_recids = search_pattern(p='035__9:"arXiv" - 980:DELETED')
    hep_arxiv_recids = arxiv_recids & get_collection_reclist('HEP')
    return hep_arxiv_recids - fetch_records_with_arxiv_fulltext()
def get_all_public_records(collections):
    """
    Return the records of the given collections with their modification date.

    Every row of bibrec is read and only the rows whose id belongs to one
    of the collections are kept.

    @param collections: names of the accessible collections
    @return: list of (recid, last_modification) tuples
    """
    recids = intbitset()
    for collection in collections:
        recids += get_collection_reclist(collection)

    rows = run_sql('SELECT id, modification_date FROM bibrec')
    public_records = []
    for recid, lastmod in rows:
        if recid in recids:
            public_records.append((recid, lastmod))
    return public_records
def create_update_jobs_by_collection(batch_template_file, collection, job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Create the job description files needed to update a collection.

    The record ids of the collection are handed over to
    create_update_jobs_by_recids().

    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param collection: name of the collection that should be updated
    @type collection: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    @return: whatever create_update_jobs_by_recids() returns
    """
    return create_update_jobs_by_recids(get_collection_reclist(collection),
                                        batch_template_file,
                                        job_directory)
def bst_dump_records():
    """Dump every exported collection to a gzipped XML file with checksum.

    For each collection in CFG_EXPORTED_COLLECTIONS the records are
    formatted with the 'xme' format into
    <CFG_WEBDIR>/dumps/<collection>-records.xml.gz and the value of
    calculate_md5() is written next to it.  All files, including the HTML
    index `inspire-dump.html`, are first written under a dot-prefixed name
    and renamed once complete.  Every file is closed explicitly, also when
    an error interrupts the dump, and in particular before it is renamed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, 'dumps')
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Deliberately ignored (presumably the directory already exists).
        pass

    index_tmp_path = os.path.join(dumps_dir, '.inspire-dump.html')
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>):
%(date)s</li>""" % {
                'prefix': CFG_SITE_URL,
                'collection': collection,
                'date': time.ctime()
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, '.%s-records.xml.gz' % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    with run_ro_on_slave_db():
                        print >> output, format_record(recid, 'xme', user_info={})[0]
                    # Called for every record, not only when it is reported.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s" %
                            (collection, recid, (i + 1) * 100 / tot,
                             time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime(time_estimation))))
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()

            write_message("Computing checksum")
            # Close the checksum file before it is renamed below.
            with open(output_path + '.md5', "w") as md5_file:
                print >> md5_file, calculate_md5(output_path)
            os.rename(
                output_path,
                os.path.join(dumps_dir, '%s-records.xml.gz' % collection))
            os.rename(
                output_path + '.md5',
                os.path.join(dumps_dir, '%s-records.xml.gz.md5' % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()

    os.rename(index_tmp_path, os.path.join(dumps_dir, 'inspire-dump.html'))
def get_compliance_values():
    """Fill compliance_check_values from the 591 fields of SCOAP3 records.

    For every record of the 'SCOAP3 Repository' collection the value of
    the first subfield of each 591 field is split at its first ':' into a
    lower-cased key and an integer value.  The resulting dict (empty when
    the record has no 591 field) is stored in the module-level
    compliance_check_values under the record id.
    """
    reclist = get_collection_reclist('SCOAP3 Repository')
    for recid in reclist:
        tmpdic = {}
        rec = get_record(recid)
        if '591' in rec:
            # Walk the fields that are really there instead of assuming
            # exactly three of them, which failed on shorter records.
            for field in rec['591']:
                str_val = field[0][0][1]
                separator = str_val.find(':')
                key = str_val[:separator].lower()
                val = int(str_val[separator + 1:])
                tmpdic[key] = val
        compliance_check_values[recid] = tmpdic
def get_compliance_values():
    """Fill compliance_check_values from the 591 fields of SCOAP3 records.

    For every record of the 'SCOAP3 Repository' collection the value of
    the first subfield of each 591 field is split at its first ':' into a
    lower-cased key and an integer value.  The resulting dict (empty when
    the record has no 591 field) is stored in the module-level
    compliance_check_values under the record id.
    """
    for recid in get_collection_reclist('SCOAP3 Repository'):
        checks = {}
        record = get_record(recid)
        if '591' in record:
            for compliance_field in record['591']:
                raw = compliance_field[0][0][1]
                colon_at = raw.find(':')
                checks[raw[:colon_at].lower()] = int(raw[colon_at + 1:])
        compliance_check_values[recid] = checks
def unlinked(req, orcidonly=False):
    """
    Return a page listing the "useful" person profiles that are not linked.

    A person id is considered useful when it has any `extid:` tag in
    aidPERSONIDDATA or, unless `orcidonly` is set, a row with flag=2 in
    aidPERSONIDPAPERS.  Person ids that already have an `extid:INSPIREID`
    tag are left out, and so are those whose canonical name matches
    (case-insensitively, ignoring surrounding whitespace) a 035__a value
    of a record in the HepNames collection.

    @param req: request object passed on to page()
    @param orcidonly: when true, ignore aidPERSONIDPAPERS and use the
        "Unlinked BAIs with ORCID" title
    @return: the result of page()
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist

    useful_personids1 = intbitset(
        run_sql(
            "SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"
        ))
    useful_personids2 = intbitset()
    if not orcidonly:
        useful_personids2 = intbitset(
            run_sql(
                "SELECT distinct personid from aidPERSONIDPAPERS where flag=2"))
    linked_personids = intbitset(
        run_sql(
            "SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'"))
    names = dict(
        run_sql(
            "SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"
        ))
    # A set keeps the membership test in the loop below constant-time.
    matched_names = set(
        name.lower().strip()
        for name in get_fieldvalues(
            get_collection_reclist('HepNames'), '035__a'))
    personid_to_match = (useful_personids1 |
                         useful_personids2) - linked_personids

    site_url = escape(CFG_SITE_SECURE_URL, True)
    body = ['<ol>']
    for personid in personid_to_match:
        # Fall back to the numeric id when there is no canonical name.
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append(
            '<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>'
            % {
                'siteurl': site_url,
                'bai': escape(name, True)
            })
    body.append('</ol>')
    body = '\n'.join(body)

    if orcidonly:
        title = "Unlinked BAIs with ORCID"
    else:
        title = "Unlinked useful BAIs"
    return page(req=req, body=body, title=title)
def get_all_recids():
    """Return all relevant record IDs.

    The ids are the union of the record lists of the collections relevant
    for the current site (INSPIRE or CERN); any other site yields an empty
    intbitset.
    """
    if CFG_INSPIRE_SITE:
        relevant = [CFG_SITE_NAME, "Conferences", "For CDS", "CDS Hidden"]
    elif CFG_CERN_SITE:
        relevant = [
            CFG_SITE_NAME,
            "CERN Articles & Preprints",
            "CERN Series",
            "CERN Departments",
            "CERN Experiments",
            "CERN R&D Projects",
        ]
        # We exclude all records that is not relevant for CERN/CDS
        # all_recids = all_recids & search_pattern(p='690c:CERN or 595:cds')
        # We exclude all records with an existing INSPIRE ID.
        # all_recids = all_recids - search_pattern(p='035:INSPIRE')
    else:
        relevant = []

    # Accumulate into a fresh set; the collection reclists stay untouched.
    all_recids = intbitset()
    for name in relevant:
        all_recids |= get_collection_reclist(name)
    return all_recids
def get_all_public_records(collections):
    """
    Return the unrestricted records of the given collections.

    Restricted records are removed from the union of the collections, and
    each modification date is raised to the minimum timestamp when it is
    older than that.

    @param collections: names of the accessible collections
    @return: list of (recid, last_modification) tuples
    """
    restricted = get_all_restricted_recids()
    recids = intbitset()
    oldest_allowed = get_minimum_timestamp()
    for collection in collections:
        recids += get_collection_reclist(collection)
    recids = recids.difference(restricted)

    public_records = []
    for recid, lastmod in run_sql('SELECT id, modification_date FROM bibrec'):
        if recid in recids:
            public_records.append((recid, max(lastmod, oldest_allowed)))
    return public_records
def bst_dump_records():
    """Dump every exported collection to a gzipped XML file with checksum.

    For each collection in CFG_EXPORTED_COLLECTIONS the records are
    formatted with the "xme" format into
    <CFG_WEBDIR>/dumps/<collection>-records.xml.gz and the value of
    calculate_md5() is written next to it.  All files, including the HTML
    index `inspire-dump.html`, are first written under a dot-prefixed name
    and renamed once complete.  Every file is closed explicitly, also when
    an error interrupts the dump, and in particular before it is renamed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, "dumps")
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Deliberately ignored (presumably the directory already exists).
        pass

    index_tmp_path = os.path.join(dumps_dir, ".inspire-dump.html")
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>):
%(date)s</li>""" % {
                "prefix": CFG_SITE_URL,
                "collection": collection,
                "date": time.ctime(),
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, ".%s-records.xml.gz" % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    print >> output, format_record(recid, "xme", user_info={})[0]
                    # Called for every record, not only when it is reported.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s"
                            % (
                                collection,
                                recid,
                                (i + 1) * 100 / tot,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                            )
                        )
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()

            write_message("Computing checksum")
            # Close the checksum file before it is renamed below.
            md5_file = open(output_path + ".md5", "w")
            try:
                print >> md5_file, calculate_md5(output_path)
            finally:
                md5_file.close()
            os.rename(output_path, os.path.join(dumps_dir, "%s-records.xml.gz" % collection))
            os.rename(output_path + ".md5", os.path.join(dumps_dir, "%s-records.xml.gz.md5" % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()

    os.rename(index_tmp_path, os.path.join(dumps_dir, "inspire-dump.html"))
def create_update_jobs_by_collection(
        batch_template_file,
        collection,
        job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Create the job description files needed to update a collection.

    This resolves the collection to its record ids and delegates the
    actual work to create_update_jobs_by_recids().

    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param collection: name of the collection that should be updated
    @type collection: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    @return: whatever create_update_jobs_by_recids() returns
    """
    collection_recids = get_collection_reclist(collection)
    jobs = create_update_jobs_by_recids(collection_recids,
                                        batch_template_file,
                                        job_directory)
    return jobs
def get_all_public_records_modified_last_month(collections):
    """
    Return the unrestricted records modified during the last 31 days.

    Only records belonging to one of the given collections and not part of
    the restricted records are returned.

    @param collections: names of the accessible collections
    @return: list of (recid, last_modification) tuples
    """
    all_restricted_recids = get_all_restricted_recids()
    one_month_ago = datetime.date.today() - datetime.timedelta(days=31)

    recids = intbitset()
    for collection in collections:
        recids += get_collection_reclist(collection)
    recids = recids.difference(all_restricted_recids)

    rows = run_sql(
        'SELECT id, modification_date FROM bibrec WHERE modification_date > %s',
        (one_month_ago,))
    recent_records = []
    for recid, lastmod in rows:
        if recid in recids:
            recent_records.append((recid, lastmod))
    return recent_records
def _append_recid_collection_list(collection, current_recids):
    """Append the record ids of a collection that are not yet in the list.

    The list is extended in place and the order of the ids already present
    is kept.

    @param collection: (string) collection name to use to obtain record ids
    @param current_recids: (list) list of current record ids which have
        already been obtained from previous collection or recid flags
    @return: (list) current record ids with newly appended recids from
        input collection (the same list object that was passed in)
    """
    records = get_collection_reclist(collection)
    # Track the known ids in a set: testing membership on the growing list
    # made the loop quadratic for large collections.
    known_recids = set(current_recids)
    for recid in records:
        if recid not in known_recids:
            known_recids.add(recid)
            current_recids.append(recid)
    return current_recids
def get_all_public_records_modified_last_month(collections):
    """
    Return the unrestricted records modified during the last 31 days.

    Only records belonging to one of the given collections and not part of
    the restricted records are returned.

    @param collections: names of the accessible collections
    @return: list of (recid, last_modification) tuples
    """
    restricted = get_all_restricted_recids()
    today = datetime.date.today()
    cutoff = today - datetime.timedelta(days=31)

    visible = intbitset()
    for collection_name in collections:
        visible += get_collection_reclist(collection_name)
    visible = visible.difference(restricted)

    sql = 'SELECT id, modification_date FROM bibrec WHERE modification_date > %s'
    modified_rows = run_sql(sql, (cutoff, ))
    return [(recid, modified_on)
            for (recid, modified_on) in modified_rows
            if recid in visible]
def main(): for journal in CFG_JOURNALS: name = get_coll_i18nname(journal) reclist = get_collection_reclist(journal) print "<h2>%s</h2>" % escape(name) if not reclist: print "<p>None yet.</p>" continue print "<p><ul>" for recid in reclist: record = get_record(recid) title = remove_html_markup(record_get_field_value(record, '245', code='a'), remove_escaped_chars_p=False).strip() doi = record_get_field_value(record, '024', '7', code='a') print '<li><a href="http://dx.doi.org/%s" target="_blank">%s</a>: %s</li>' % (escape(doi, True), escape(doi), title) print "</ul></p>"
def find_book(citation_element):
    """Find the book record matching a citation element.

    The title of the citation is searched among the records of the 'Books'
    collection.  A single hit is returned directly; otherwise, when the
    citation has a year, the first hit having that year in 269__c is
    returned.

    @param citation_element: dict with a 'title' and optionally a 'year'
    @return: the hits (intbitset) when there is exactly one, a one-element
        list with the record id matched by year, or an empty list
    """
    books_recids = get_collection_reclist('Books')
    title = citation_element['title']
    candidates = intbitset(get_recids_matching_query(title, 'title'))
    candidates &= books_recids

    if len(candidates) == 1:
        return candidates
    if 'year' not in citation_element:
        return []

    wanted_year = citation_element['year']
    for recid in candidates:
        if any(year == wanted_year for year in get_fieldvalues(recid, '269__c')):
            return [recid]
    return []
def get_all_recids():
    """Return the record ids relevant for the current site.

    An INSPIRE site uses the record list of the site collection, a CERN
    site the union of the site collection and the CERN-specific
    collections; any other site yields an empty intbitset.
    """
    if CFG_INSPIRE_SITE:
        return get_collection_reclist(CFG_SITE_NAME)

    if CFG_CERN_SITE:
        cern_recids = get_collection_reclist(CFG_SITE_NAME)
        for extra_collection in ("CERN Articles & Preprints",
                                 "CERN Series",
                                 "CERN Departments",
                                 "CERN Experiments",
                                 "CERN R&D Projects"):
            # `|` builds a new set, so no collection reclist is modified.
            cern_recids = cern_recids | get_collection_reclist(extra_collection)
        return cern_recids

    return intbitset()
def lazy_parser(collection, left_tags, right_tags):
    """Yield (left value, right value) pairs for the records of a collection.

    For every record, each distinct value of the left tags is combined with
    each distinct value of the right tags.  Records for which reading the
    left tags raises IndexError are skipped.

    @param collection: name of the collection to walk through
    @param left_tags: key tags, e.g. for journals database: 711__a
    @param right_tags: value tags, e.g. for journals database: 130__a,
        730__a and 030__a
    """
    for recid in get_collection_reclist(collection):
        try:
            keys = get_tag_values(recid, left_tags)
        except IndexError:
            continue
        # An IndexError raised for the value tags is not caught.
        values = get_tag_values(recid, right_tags)
        distinct_values = set(values)
        for key in set(keys):
            for value in distinct_values:
                yield key, value
def lazy_parser(collection, left_tags, right_tags):
    """Yield (left value, right value) pairs for the records of a collection.

    For every record, each distinct value of the left tags is combined with
    each distinct value of the right tags.  Records for which reading the
    left tags raises IndexError are skipped.

    @param collection: name of the collection to walk through
    @param left_tags: key tags, e.g. for journals database: 130__a, 730__a
        and 030__a
    @param right_tags: value tags, e.g. for journals database: 711__a
    """
    for record_id in get_collection_reclist(collection):
        try:
            left_side = get_tag_values(record_id, left_tags)
        except IndexError:
            continue
        # An IndexError raised for the value tags is not caught.
        right_side = set(get_tag_values(record_id, right_tags))
        for left_item in set(left_side):
            for right_item in right_side:
                yield left_item, right_item
def build_hepnames_knowledge():
    """Collect the identifiers stored in the 035 fields of HepNames records.

    For every record of the 'HepNames' collection a dict is built with the
    key 'recid' and one key per identifier type found in a 035 field
    (subfield 9, upper-cased) mapped to the upper-cased identifier value
    (subfield a).  BAI, INSPIRE, ORCID and KAKEN identifiers are only kept
    when the matching valid_*() check accepts them; for a BAI the value in
    its original case is stored as well under 'ORIGINAL_BAI'.  Problems are
    reported through write_message() on stderr.

    @return: the per-record dicts (the values of the internal mapping)
    """
    recids = get_collection_reclist('HepNames')
    ret = {}
    for recid in recids:
        ids = {'recid': recid}
        record = get_record(recid)
        for field in record_get_field_instances(record, '035'):
            id_type = None
            id_value = None
            for code, value in field_get_subfield_instances(field):
                code = code.strip()
                value = value.strip()
                if code == '9':
                    # A second, different type in the same field is an error.
                    if id_type and id_type != value.upper():
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_type = value.upper()
                if code == 'a':
                    # A second, different value in the same field is an error.
                    if id_value and id_value != value:
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_value = value
            # NOTE(review): the breaks above only leave the subfield loop,
            # so the type/value gathered so far are still processed below
            # -- confirm that this is intended.
            if not id_type or not id_value:
                # Incomplete IDs
                continue
            else:
                # Known identifier types must pass their validity check;
                # any other type is stored without validation.
                if id_type == 'BAI':
                    if not valid_bai(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid BAI: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'INSPIRE':
                    if not valid_inspire(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid INSPIRE: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'ORCID':
                    if not valid_orcid(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid ORCID: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'KAKEN':
                    if not valid_kaken(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid KAKEN: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                ids[id_type] = id_value.upper()
                if id_type == 'BAI':
                    # Keep the BAI in its original case as well.
                    ids['ORIGINAL_BAI'] = id_value
        ret[recid] = ids
    return ret.values()
def task_parse_options(key, value, opts, args):  # pylint: disable-msg=W0613
    """ Parse one command line option; must be defined for bibtask to
    create a task.

    Flags are stored as True, the other options are accumulated into a set
    kept under the option name.

    @param key: the option as given on the command line
    @param value: the value given together with the option
    @return: True
    @raise StandardError: if a standalone argument was given
    """
    def _option_set(option_name):
        """Return the set stored for the option, creating it if needed."""
        stored = task_get_option(option_name)
        if not stored:
            stored = set()
            task_set_option(option_name, stored)
        return stored

    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
    elif key in ('-c', '--collections'):
        # The record ids of the named collections are what gets stored.
        collections = _option_set('collections')
        for collection_name in value.split(","):
            collections.update(get_collection_reclist(collection_name))
    elif key in ('-i', '--recids'):
        _option_set('recids').update(split_ids(value))
    elif key in ('--tickets',):
        tickets = _option_set('tickets')
        for item in value.split(','):
            tickets.add(item.strip())
    elif key in ('--all-tickets',):
        task_set_option('all-tickets', True)
    elif key in ('-q', '--query'):
        _option_set('query').add(value)
    elif key in ('-r', '--reportnumbers'):
        _option_set('reportnumbers').add(value)
    return True
def get_all_recids():
    """Return the record ids relevant for the current site.

    An INSPIRE site uses the record list of the site collection, a CERN
    site the union of the site collection and the CERN-specific
    collections; any other site yields an empty intbitset.
    """
    if not (CFG_INSPIRE_SITE or CFG_CERN_SITE):
        return intbitset()

    site_recids = get_collection_reclist(CFG_SITE_NAME)
    if CFG_INSPIRE_SITE:
        return site_recids

    cern_collections = (
        "CERN Articles & Preprints",
        "CERN Series",
        "CERN Departments",
        "CERN Experiments",
        "CERN R&D Projects",
    )
    combined = site_recids
    for collection_name in cern_collections:
        # `|` builds a new set, so no collection reclist is modified.
        combined = combined | get_collection_reclist(collection_name)
    return combined
def task_parse_options(key, value, opts, args):  # pylint: disable-msg=W0613
    """ Parse one command line option; must be defined for bibtask to
    create a task.

    Flags are stored as True, the other options are accumulated into a set
    kept under the option name.

    @param key: the option as given on the command line
    @param value: the value given together with the option
    @return: True
    @raise StandardError: if a standalone argument was given
    """
    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    # Options that only switch something on.
    flag_options = {
        '-a': 'new', '--new': 'new',
        '-m': 'modified', '--modified': 'modified',
        '--all-tickets': 'all-tickets',
    }
    if key in flag_options:
        task_set_option(flag_options[key], True)
        return True

    # Options whose values are collected in a set.
    set_options = {
        '-c': 'collections', '--collections': 'collections',
        '-i': 'recids', '--recids': 'recids',
        '--tickets': 'tickets',
        '-q': 'query', '--query': 'query',
        '-r': 'reportnumbers', '--reportnumbers': 'reportnumbers',
    }
    if key not in set_options:
        return True

    option_name = set_options[key]
    collected = task_get_option(option_name)
    if not collected:
        collected = set()
        task_set_option(option_name, collected)

    if option_name == 'collections':
        # The record ids of the named collections are what gets stored.
        for collection_name in value.split(","):
            collected.update(get_collection_reclist(collection_name))
    elif option_name == 'recids':
        collected.update(split_ids(value))
    elif option_name == 'tickets':
        for ticket in value.split(','):
            collected.add(ticket.strip())
    else:
        collected.add(value)
    return True
def late(req):
    """Write an HTML report comparing DOI registration and arrival dates.

    For every journal in CFG_JOURNALS a table is written to `req` with one
    row per record of the journal collection: DOI (linked to dx.doi.org),
    title, creation date of the DOI in the `doi` table and creation date of
    the record in `bibrec`.  Rows are coloured by the difference in days
    between the two dates: green when negative, orange when below one day,
    red otherwise, grey when the DOI has no entry in the `doi` table.

    @param req: request object the page is written to
    """
    req.content_type = "text/html"
    print >> req, pageheaderonly("Late journals", req=req)

    th = ("<tr><th>DOI</th><th>Title</th><th>DOI registration</th>"
          "<th>Arrival in SCOAP3</th></tr>")
    # The anchor is now closed; it used to swallow the rest of the row.
    tr = ("<tr style='background-color: {0};'><td>"
          "<a href='http://dx.doi.org/{1}' target='_blank'>{2}</a></td>"
          "<td>{3}</td><td>{4}</td><td>{5}</td></tr>")

    sql_bibrec = "SELECT creation_date FROM bibrec WHERE id=%s"
    sql_doi = "SELECT creation_date FROM doi WHERE doi=%s"

    for journal in CFG_JOURNALS:
        print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal))
        results = get_collection_reclist(journal)
        print >> req, "<table>"
        print >> req, th
        for recid in results:
            creation_date = run_sql(sql_bibrec, (recid, ))[0][0]
            record = get_record(recid)
            doi = record_get_field_value(record, '024', '7', code='a')
            # NOTE(review): the title is written unescaped, as before;
            # confirm whether it may contain intentional markup.
            title = record_get_field_value(record, '245', code='a')
            doi_date = run_sql(sql_doi, (doi, ))
            background = "#eee"
            if doi_date:
                doi_date = doi_date[0][0]
                delay_days = (creation_date - doi_date).days
                if delay_days < 0:
                    background = "#66FF00"
                elif delay_days < 1:
                    background = "#FF6600"
                else:
                    background = "#FF0000"
            else:
                doi_date = ''
            print >> req, tr.format(background, escape(doi, True),
                                    escape(doi), title, doi_date,
                                    creation_date)
        print >> req, "</table>"
def parse_pdg_element(element, hep_collection=get_collection_reclist('HEP')):
    """Check an element of the PDG update file and extract its data.

    The element must have exactly the keys 'inspireId' and 'pdgIdList',
    and its record id must be part of `hep_collection`.

    Params:
        dict element - the element to be parsed
        intbitset hep_collection - all recids in HEP, used for caching
            (the default is evaluated once, when the function is defined)
    Return:
        ParseResult - status code (Invalid, Missing or Success)
        int recid - record ID, None unless the status is Success
        list pdg_values - the PDG values, None unless the status is Success
    """
    expected_keys = set(('inspireId', 'pdgIdList'))
    if set(element.keys()) != expected_keys:
        return ParseResult.Invalid, None, None

    recid = int(element['inspireId'])
    pdg_values = element['pdgIdList']
    if recid in hep_collection:
        return ParseResult.Success, recid, pdg_values
    return ParseResult.Missing, None, None
def bst_cnumcatchup():
    """
    Reformat (HB) every record sharing a conference number with a recently
    modified conference or proceedings record.

    Records modified within the last 3 days are intersected with the
    'Conferences' collection and with the hits of 980__a:Proceedings.  Their
    111__g / 773__w values longer than 3 characters are taken as cnums, each
    cnum is searched with "find cnum", and the resulting records are
    submitted to bibreformat in batches of at most 500 IDs.
    """
    def _submit(recid_chunk):
        # One bibreformat task per batch of record IDs.
        task_low_level_submission('bibreformat',
                                  'bibreformat:bstcnumcatchup',
                                  '-o', 'HB',
                                  '-i', ','.join([str(r) for r in recid_chunk]))

    recent_rows = run_sql('select id from bibrec where '
                          'modification_date >'
                          'DATE_SUB(CURDATE(), INTERVAL 3 DAY)')
    modrecs = intbitset([row[0] for row in recent_rows])

    confupd = intbitset(get_collection_reclist('Conferences')) & modrecs
    procupd = intbitset(perform_request_search(p="980__a:Proceedings")) \
        & modrecs

    # Values of 3 characters or fewer are not kept as cnums.
    cnums = [c for r in confupd
             for c in get_fieldvalues(r, '111__g') if len(c) > 3]
    cnums += [c for r in procupd
              for c in get_fieldvalues(r, '773__w') if len(c) > 3]

    recs = intbitset()
    for cn in cnums:
        recs += intbitset(perform_request_search(p="find cnum %s" % cn))

    # Full batches of 500 first, then whatever is left (if anything).
    while len(recs) > 500:
        nextchunk = recs[:500]
        recs = recs[500:]
        _submit(nextchunk)
    if recs:
        _submit(recs)
def bst_cnumcatchup():
    """
    Queue bibreformat (output format HB) for all records found by
    "find cnum <cnum>", where the cnums come from conference and proceedings
    records modified during the last 3 days.

    Conference records contribute their 111__g values and proceedings
    records their 773__w values; values of 3 characters or fewer are
    ignored.  Record IDs are submitted in batches of at most 500.
    """
    batch_size = 500

    sql_recent = ('select id from bibrec where ' +
                  'modification_date >' +
                  'DATE_SUB(CURDATE(), INTERVAL 3 DAY)')
    modrecs = intbitset([x[0] for x in run_sql(sql_recent)])

    conference_hits = intbitset(get_collection_reclist('Conferences'))
    confupd = conference_hits & modrecs
    proceedings_hits = intbitset(
        perform_request_search(p="980__a:Proceedings"))
    procupd = proceedings_hits & modrecs

    # (recently modified records, tag holding the cnum) in processing order.
    sources = ((confupd, '111__g'), (procupd, '773__w'))
    cnums = []
    for hitset, cnum_tag in sources:
        for recid in hitset:
            for candidate in get_fieldvalues(recid, cnum_tag):
                if len(candidate) > 3:
                    cnums.append(candidate)

    recs = intbitset()
    for cnum in cnums:
        recs += intbitset(perform_request_search(p="find cnum %s" % cnum))

    if not recs:
        return

    while len(recs) > batch_size:
        nextchunk = recs[:batch_size]
        recs = recs[batch_size:]
        task_low_level_submission(
            'bibreformat', 'bibreformat:bstcnumcatchup', '-o', 'HB', '-i',
            ','.join([str(r) for r in nextchunk]))
    if recs:
        task_low_level_submission(
            'bibreformat', 'bibreformat:bstcnumcatchup', '-o', 'HB', '-i',
            ','.join([str(r) for r in recs]))
def bst_autocompletion_cache(collection_list=None):
    """
    Bibtasklet responsible for generating the subject and author lists used
    for the autocompletion suggestions.

    For every collection, the 200 most popular 'exactauthor' values and the
    most popular subject values per language (tags 9051_a / 9061_a / 9071_a
    for en / fr / es) are merged into an AutocompletionCache row.

    @param collection_list: list of collection ids to cache. If None, all
        the collections will be calculated.
    """
    task_update_progress("Started updating autocomplete cache")
    tag_dicc = {'en': '9051_a',
                'fr': '9061_a',
                'es': '9071_a'}

    # Identity test: only an omitted/None argument selects "all collections".
    if collection_list is None:
        res = run_sql("SELECT id FROM collection")
        collection_list = [row[0] for row in res]

    total = len(collection_list)
    task_update_progress("Done %s of %s" % (0, total))

    for done, collection in enumerate(collection_list, 1):
        recids = list(get_collection_reclist(
            get_collection_name_by_id(collection)))

        authors = get_most_popular_field_values(
            recids, get_field_tags('exactauthor'))[0:200]
        authors = [a[0] for a in authors]

        subjects = {}
        for ln in ['en', 'fr', 'es']:
            subject_tag = tag_dicc[ln]
            subjects[ln] = [s[0] for s in
                            get_most_popular_field_values(recids,
                                                          subject_tag)]

        ins = AutocompletionCache(id_collection=collection,
                                  authors=authors,
                                  subjects=subjects)
        db.session.merge(ins)
        db.session.flush()
        task_update_progress("Done %s of %s" % (done, total))

    db.session.close_all()
    task_update_progress("Finished updating autocomplete cache")
def lazy_parser(collection, left_tags, right_tags, volume_subfield):
    """
    Lazily yield (left, right) string pairs built from the records of
    `collection`.

    Tags are 6-character strings: 3-character tag, the two indicators and
    the subfield code.  For every non-empty value of a right tag the pair
    (value, value) is yielded first; then, for every field instance of each
    left tag that carries the left subfield, the pair (left value, right
    value) is yielded.  When that field also has `volume_subfield`, the
    right side becomes "<right value>;<volume>".
    """
    for recid in get_collection_reclist(collection):
        record = get_record(recid)
        for right_tag in right_tags:
            right_values = record_get_field_values(
                record, right_tag[:3], right_tag[3], right_tag[4],
                right_tag[5])
            for right_value in right_values:
                if right_value:
                    yield right_value, right_value
                    for left_tag in left_tags:
                        left_fields = record_get_field_instances(
                            record, left_tag[:3], left_tag[3], left_tag[4])
                        for left_field in left_fields:
                            subfields = dict(
                                field_get_subfield_instances(left_field))
                            if left_tag[5] in subfields:
                                left_value = subfields[left_tag[5]]
                                if volume_subfield in subfields:
                                    yield left_value, '%s;%s' % (
                                        right_value,
                                        subfields[volume_subfield])
                                else:
                                    yield left_value, right_value
                            # else: empty field, nothing to yield
                # else: empty metadata, nothing to yield
from invenio.search_engine import perform_request_search from invenio.search_engine import get_all_field_values from invenio.intbitset import intbitset #from invenio.bibauthorid_dbinterface \ # import _select_from_aidpersoniddata_where from invenio.dbquery import run_sql from hep_convert_email_to_id import find_inspire_id_from_record, \ bad_id_check, \ get_hepnames_anyid_from_recid LETTER = None RECIDS_HEPN = get_collection_reclist('HepNames') RECIDS_INST = get_collection_reclist('Institutions') RECIDS_EXPT = get_collection_reclist('Experiments') GOOD_IDENTIFIERS = set([ x.lower() for x in [ 'ARXIV', 'BAI', 'CERN', 'DESY', 'GoogleScholar', 'INSPIRE', 'JACOW', 'KAKEN', 'ORCID', 'ResearcherID', 'SCOPUS', 'SLAC', 'Wikipedia' ] ]) BAI_URL = 'https://inspirehep.net/author/manage_profile/' def bad_identifiers(): """Looks for bad 035__9 fields"""
""" Bibcheck plugin checking that reference info in 999C50 agrees with citation info in 999C5r and 999C5s """ import re from collections import defaultdict, namedtuple from invenio.intbitset import intbitset from invenio.search_engine import (get_collection_reclist, search_pattern, search_unit) Reftags = namedtuple('Reftags', 'pubnote repno DOI citedrecid curatorflag') FIELDS = Reftags('999C5s', '999C5r', '999C5a', '999C50', '999C59') HEPRECS = get_collection_reclist('HEP') CATEGORY = re.compile(ur'^(.*)\[[^\]]+\]') ARXIVPREFIX = re.compile(ur'^(arXiv:(\s+)?)\D', re.I) class Reference(object): """ container for various ref info """ def __init__(self): self._fields = defaultdict(list) @staticmethod def normalize_repno(repno): """ cast repno into standard form """
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection.

    Three modes, tried in this order:
      * `recids` given: a single entry with those records, `taxonomy` as
        ontology and no collection.
      * `collections` given: one entry per non-empty collection, with
        `taxonomy` as ontology.
      * otherwise: the (ontology, collection) pairs configured in
        collection_clsMETHOD; each collection is restricted to the records
        modified since the ontology's last run, unless it never ran or the
        'force' task option is set.

    Each entry is a dict with the keys 'ontology', 'collection', 'recIDs'.
    """
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql(
        "SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if not records:
            write_message("ERROR: Collection '%s' doesn't contain any record. "
                          "Cannot analyse keywords." % collection,
                          stream=sys.stderr, verbose=0)
            continue

        if not date_last_run or task_get_option('force'):
            # Never analysed, or analysis forced: consider every record.
            if not date_last_run:
                write_message("INFO: Collection %s has not been previously "
                              "analyzed." % collection,
                              stream=sys.stderr, verbose=3)
            else:
                write_message("INFO: Analysis is forced for collection %s." %
                              collection, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        else:
            modified_records = intbitset(
                run_sql(
                    "SELECT id FROM bibrec "
                    "WHERE modification_date >= %s", (date_last_run, )))

        # Build a new set instead of `&=`: the object returned by
        # get_collection_reclist() may be shared with other callers
        # (NOTE(review): presumably the search engine's collection cache),
        # so it must not be modified in place.
        records = records & modified_records

        if records:
            rec_onts.append({
                'ontology': ontology,
                'collection': collection,
                'recIDs': records
            })
        else:
            write_message(
                "WARNING: All records from collection '%s' have "
                "already been analyzed for keywords with ontology '%s' "
                "on %s." % (collection, ontology, date_last_run),
                stream=sys.stderr, verbose=2)

    return rec_onts
""" import re from sys import argv from invenio.search_engine import perform_request_search, get_record, \ search_unit, get_all_field_values from invenio.bibrecord import print_rec, record_get_field_instances, \ record_add_field from invenio.intbitset import intbitset from invenio.bibformat_engine import BibFormatObject from invenio.search_engine import get_collection_reclist from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE HN = get_collection_reclist('HepNames') EMAILS_HEPNAMES = get_all_field_values('371__m') + \ get_all_field_values('371__o') + \ get_all_field_values('595__m') + \ get_all_field_values('595__o') EMAILS_HEP = get_all_field_values('100__m') + get_all_field_values('700__m') COUNTER_MAX = 400 def generate_check_digit(base_digits): ''' Taken from https://github.com/tjwds/generate-orcid-checksum ''' total = 0 for digit in str(base_digits):
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibcheck plugin checking that reference info in 999C50 agrees with citation info in 999C5r and 999C5s """ import re from collections import defaultdict, namedtuple from invenio.intbitset import intbitset from invenio.search_engine import (get_collection_reclist, search_pattern, search_unit) Reftags = namedtuple('Reftags', 'pubnote repno DOI citedrecid curatorflag') FIELDS = Reftags('999C5s', '999C5r', '999C5a', '999C50', '999C59') HEPRECS = get_collection_reclist('HEP') CATEGORY = re.compile(ur'^(.*)\[[^\]]+\]') ARXIVPREFIX = re.compile(ur'^(arXiv:(\s+)?)\D', re.I) class Reference(object): """ container for various ref info """ def __init__(self): self._fields = defaultdict(list) @staticmethod def normalize_repno(repno): """ cast repno into standard form """ # normalize "arXiv:1612.12345 [hep-th]"
import sys import re from invenio.intbitset import intbitset import urlparse import pytz import os from md5 import md5 ## Generate filtered apache logs from OpenAIRE with: ## $ cd /opt/invenio/var/log ## $ cat apache.log apache-ssl.log | grep "GET /record/" | grep 200 | gzip > ~/eu.log.gz ## Generate this locally to OpenAIRE with: from invenio.search_engine import get_collection_reclist, CFG_SITE_NAME eu_recids = get_collection_reclist(CFG_SITE_NAME) #eu_recids = intbitset(eu_recids = [9, 10, 19, 23, 24, 25, 26, 773, 774, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 799, 800, 802, 803, 804, 805, 806, 808, 809, 810, 811, 812, 813, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 828, 829, 830, 832, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 858, 861, 875, 877, 878, 879, 882, 884, 885, 887, 888, 889, 890, 891, 892, 893, 894, 899, 900, 901, 902, 903, 905, 906, 907, 912, 913, 914, 915, 923, 925, 928, 930, 931, 932, 934]) ## This are from CDS ## To obtain the recids from CDS fire up ipython ## from invenio.search_engine import search_pattern ## eu_recids = search_pattern(p='ec_fundedresources', f='0248_p') eu_recids = intbitset([1111467, 1119156, 1119304, 1119305, 1123073, 1131840, 1150815, 1152380, 1153910, 1154457, 1161069, 1165141, 1166365, 1171145, 1171956, 1172330, 1174720, 1174799, 1176934, 1177572, 1178778, 1178965, 1179056, 1179975, 1180629, 1180882, 1181684, 1183307, 1185309, 1186606, 1191601, 1192007, 1194234, 1194627, 1194889, 1194911, 1195998, 1198185, 1198199, 1198803, 1199128, 1201615, 1202603, 1204323, 1204596, 1205042, 1205627, 1206034, 1206388, 1207015, 1207269, 1207509, 1208557, 1209236, 1209302, 1209573, 1210107, 1210369, 1210586, 1210726, 1211321, 1211333, 1212045, 1212628, 1212647, 1212816, 1212901, 1213091, 1213474, 1213664, 1213885, 1213943, 1213965, 1214312, 1214514, 1214626, 1214945, 1215300, 1215671, 1215675, 1216010, 1216172, 1216173, 1216174, 
1216175, 1216216, 1216493, 1216578, 1216643, 1217471, 1217703, 1217803, 1217852, 1220800, 1221038, 1221231, 1221713, 1221914, 1221916, 1221919, 1222486, 1222694, 1222700, 1222838, 1223191, 1223198, 1223541, 1223722, 1223846, 1224489, 1224652, 1224810, 1225128, 1225650, 1225729, 1225965, 1226309, 1226355, 1226551, 1226713, 1226918, 1227094, 1227133, 1227326, 1227792, 1228022, 1228452, 1228934, 1229318, 1229332, 1229356, 1229432, 1229434, 1229530, 1229531, 1229574, 1229575, 1229750, 1229994, 1230077, 1230105, 1230307, 1230376, 1230425, 1230503, 1230736, 1230755, 1230960, 1231305, 1231387, 1231747, 1231901, 1233463, 1233755, 1233863, 1233949, 1234547, 1234835, 1234922, 1234924, 1234925, 1234926, 1234929, 1235127, 1235128, 1235143, 1235144, 1235145, 1235172, 1235198, 1235259, 1235329, 1235339, 1235904, 1236534, 1236897, 1236925, 1236947, 1237190, 1237309, 1237584, 1237832, 1238424, 1238451, 1238573, 1238626, 1239851, 1240666, 1240667, 1240668, 1240816, 1240817, 1240818, 1241005, 1241307, 1241907, 1242058, 1242081, 1242085, 1242526, 1243614, 1243709, 1243710, 1244371, 1244638, 1244718, 1244731, 1246307, 1246557, 1246569, 1246962, 1247395, 1247837, 1248317, 1248563, 1248581, 1248817, 1249009, 1249090, 1249240, 1249428, 1249579, 1249582, 1249711, 1254335, 1254934, 1255033, 1255127, 1255623, 1255958, 1256429, 1256433, 1256515, 1257430, 1257907, 1258002, 1258154, 1259059, 1259461, 1259591, 1260389, 1260500, 1260579, 1260911, 1260933, 1260943, 1260944, 1260959, 1261330, 1262406, 1262655, 1262878, 1262879, 1262925, 1263511, 1263531, 1264059, 1264268, 1264540, 1264877, 1265038, 1265283, 1265490, 1265837, 1266225, 1266302, 1266406, 1266466, 1266467, 1266797, 1266811, 1267065, 1267078, 1267205, 1267609, 1268099, 1268268, 1268371, 1268394, 1268418, 1268609, 1268772, 1268841, 1269002, 1269265, 1269520, 1269604, 1269752, 1270074, 1270216, 1270325, 1270869, 1271225, 1271829, 1272125, 1272396, 1272477, 1272489, 1272590, 1272628, 1273034, 1273170, 1273211, 1273270, 1273946, 1274170, 
1274383, 1274519, 1274659, 1275064, 1275577, 1275587, 1275594, 1275738, 1276020, 1276432, 1276808, 1276861, 1277106, 1277305, 1277454, 1277487, 1277731, 1277830, 1277882, 1277959, 1278031, 1278512, 1279407, 1280616, 1280784, 1280892, 1280951, 1281726, 1281730, 1282194, 1282250, 1282556, 1282605, 1283386, 1283555, 1284223, 1284800, 1285770, 1287375, 1288209, 1288422, 1289343, 1289612, 1289614, 1289851, 1290019, 1290126, 1291833, 1292549, 1292739, 1293002, 1293698, 1293904, 1294205, 1295863, 1296038, 1296499, 1297362, 1297895, 1297976, 1297977, 1298178, 1298497, 1298507, 1299652, 1300674, 1301014, 1301331, 1301701, 1302208, 1303738, 1303855, 1303952, 1304543, 1304871, 1304875, 1306249, 1307104, 1307421, 1307840, 1308076, 1310283, 1310886, 1313619, 1313622, 1313681, 1313970, 1314843, 1316237, 1316543, 1317585, 1317804, 1322393, 1323250, 1323908, 1324061, 1324645, 1325254, 1328761, 1328841, 1330864, 1331909, 1334625, 1335312, 1335824, 1336088, 1337830, 1340534, 1340535, 1341481, 1341768, 1342827, 1342828, 1343468, 1343469, 1343470, 1343471, 1343472, 1343880, 1344476, 1345361, 1347544, 1348674, 1349292, 1350832, 1351200, 1351430, 1351551, 1351789, 1352083, 1352136, 1352694, 1352711, 1352765, 1359203, 1385890, 1405045, 1405438, 1407211, 1423019, 1426293, 1426296, 1428133, 1428524, 1428908, 1428910, 1436135, 1436386, 1439010, 1443465, 1447061, 1448194, 1449781, 1449803, 1449805, 1449806, 1456830, 1456848, 1473443, 1476018, 1476020, 1476023, 1476025]) ## 128.141.95.175 - - [04/May/2012:15:47:35 +0200] "GET /record/878?ln=en HTTP/1.1" 200 5647 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.41 Safari/536.5" RE_PATH = re.compile(r"^/record/(?P<recid>\d+)(/(files/(?P<filename>.+\.\w+))?)?$") _CFG_SALT = None _CFG_SALT_FILE = os.path.join('.', 'salt.txt')
def recids_cache(collections, cache={}):
    """
    Return the union of the record IDs of the given collections, memoized.

    @param collections: comma-separated collection names.
    @param cache: memo store.  The shared mutable default is deliberate: it
        is what keeps results alive between calls.
    @return: intbitset with the records of all the named collections.

    Results are stored per `collections` string, so a call with a different
    collection list no longer gets the result of the first call.  The entry
    is stored only once fully built, so a failure while reading one
    collection does not leave a partial result cached.
    """
    if collections not in cache:
        valid_recids = intbitset()
        for coll in collections.split(','):
            valid_recids += get_collection_reclist(coll)
        cache[collections] = valid_recids
    return cache[collections]
import sys import re from invenio.intbitset import intbitset import urlparse import pytz import os from md5 import md5 ## Generate filtered apache logs from OpenAIRE with: ## $ cd /opt/invenio/var/log ## $ cat apache.log apache-ssl.log | grep "GET /record/" | grep 200 | gzip > ~/eu.log.gz ## Generate this locally to OpenAIRE with: from invenio.search_engine import get_collection_reclist, CFG_SITE_NAME eu_recids = get_collection_reclist(CFG_SITE_NAME) #eu_recids = intbitset(eu_recids = [9, 10, 19, 23, 24, 25, 26, 773, 774, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 799, 800, 802, 803, 804, 805, 806, 808, 809, 810, 811, 812, 813, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 828, 829, 830, 832, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 858, 861, 875, 877, 878, 879, 882, 884, 885, 887, 888, 889, 890, 891, 892, 893, 894, 899, 900, 901, 902, 903, 905, 906, 907, 912, 913, 914, 915, 923, 925, 928, 930, 931, 932, 934]) ## This are from CDS ## To obtain the recids from CDS fire up ipython ## from invenio.search_engine import search_pattern ## eu_recids = search_pattern(p='ec_fundedresources', f='0248_p') eu_recids = intbitset([ 1111467, 1119156, 1119304, 1119305, 1123073, 1131840, 1150815, 1152380, 1153910, 1154457, 1161069, 1165141, 1166365, 1171145, 1171956, 1172330, 1174720, 1174799, 1176934, 1177572, 1178778, 1178965, 1179056, 1179975, 1180629, 1180882, 1181684, 1183307, 1185309, 1186606, 1191601, 1192007, 1194234, 1194627, 1194889, 1194911, 1195998, 1198185, 1198199, 1198803, 1199128, 1201615, 1202603, 1204323, 1204596, 1205042, 1205627, 1206034,
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection.

    When `recids` is given, one entry is returned for them (ontology
    `taxonomy`, collection None).  Else, when `collections` is given, one
    entry per non-empty collection is returned with ontology `taxonomy`.
    Otherwise the ontology/collection pairs stored in collection_clsMETHOD
    are used, keeping for each collection only the records modified since
    the ontology's last_updated date (all records if that date is empty or
    the 'force' task option is set).
    """
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                write_message("INFO: Collection %s has not been previously "
                    "analyzed." % collection, stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif task_get_option('force'):
                write_message("INFO: Analysis is forced for collection %s." %
                    collection, stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = intbitset(run_sql("SELECT id FROM bibrec "
                    "WHERE modification_date >= %s", (date_last_run, )))

            # Non-mutating intersection: `records &= ...` would alter the
            # object handed out by get_collection_reclist(), which may be
            # shared (NOTE(review): presumably a cached hitset -- confirm).
            records = records & modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                write_message("WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr, verbose=2)
        else:
            write_message("ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % collection,
                stream=sys.stderr, verbose=0)

    return rec_onts
def bst_create_icons(recid, icon_sizes, icon_format_mappings=None, collection=None, docnames=None, add_default_icon=0, inherit_moreinfo=0): """BibTasklet for generating missing icons. @param recid: the record on which the action is being performed @type recid: int @param icon_sizes: a comma-separated list of icon sizes, ex 180,640 @type icon_sizes: string @param collection: the collection name on which to run the task; if recid is defined, collection will be ignored @type collection: string @param icon_format_mappings: defines for each "master" format in which format the icons should be created. If the master format is not specified here, then its icons will be created in the same format, if possible (for eg. the icons of a TIFF file would be created as TIFF, while icons of a PDF or DOC file would be created in JPG) and unless a default mapping is not provided in C{CFG_ICON_CREATION_FORMAT_MAPPINGS}. Use syntax masterextension-targetextension1,targetextension2 (eg. "doc->png,jpg" or "png-jpg") Use '*' to target extensions not matched by other rules (if necessary set its value to empty '' in order to override/remove the default star rule set in C{CFG_ICON_CREATION_FORMAT_MAPPINGS}. @type icon_format_mappings: list @param docnames: the list of docnames for which we want to create an icon. If not provided, consider all docnames. Separate docnames using "/" @type docnames: list @param add_default_icon: if a default icon (i.e. 
without icon size suffix, matching CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT) should be added (1) or not (0) @type add_default_icon: int @param inherit_moreinfo: if the added icons should also have their description and comment set to the same value as the "main" bibdoc (1) or not (0) @type inherit_moreinfo: int """ if recid: recids = [int(recid)] elif collection: from invenio.search_engine import get_collection_reclist recids = get_collection_reclist(collection) else: write_message("Error: no recid found.", sys.stderr) return 1 try: add_default_icon = int(add_default_icon) and True or False except: add_default_icon = False try: inherit_moreinfo = int(inherit_moreinfo) and True or False except: inherit_moreinfo = False if icon_format_mappings is None: icon_format_mappings = [] if isinstance(icon_format_mappings, str): icon_format_mappings = [icon_format_mappings] try: icon_format_mappings = dict([map(lambda x: ',' in x and x.split(',') or x, mapping.split("-", 1)) \ for mapping in icon_format_mappings]) except Exception, e: write_message("Error: parameter 'icon_format_mappings' not well-formed:\n%s" % e, sys.stderr) return 0