def book_information_from_MARC(recid):
    """
    Retrieve book's information from MARC.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return tuple with title, year, author, isbn and editor.
    """
    # FIXME do the same as book_title_from_MARC

    book_title = book_title_from_MARC(recid)

    book_year = ''.join(get_fieldvalues(recid, "260__c"))

    author_tags = ['100__a', '700__a', '721__a']
    book_author = '; '.join(value
                            for tag in author_tags
                            for value in get_fieldvalues(recid, tag))

    book_isbn = ', '.join(get_fieldvalues(recid, "020__a"))

    book_editor = ', '.join(get_fieldvalues(recid, "260__a") +
                            get_fieldvalues(recid, "260__b"))

    return (book_title, book_year, book_author, book_isbn, book_editor)
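# Hedged usage sketch (illustrative only, not part of the original module):
# the fake field data below stands in for the real get_fieldvalues lookup
# and shows how book_information_from_MARC joins multiple authors and ISBNs.
def _demo_book_information_joining():
    demo_fields = {
        '100__a': ['Ellis, J'],
        '700__a': ['Ellis, N', 'Smith, A'],
        '020__a': ['9780123456789', '9789876543210'],
    }
    book_author = '; '.join(value
                            for tag in ['100__a', '700__a', '721__a']
                            for value in demo_fields.get(tag, []))
    book_isbn = ', '.join(demo_fields.get('020__a', []))
    assert book_author == 'Ellis, J; Ellis, N; Smith, A'
    assert book_isbn == '9780123456789, 9789876543210'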
def check_fresh_record(user_info, recid):
    """
    Check whether the record was just submitted (has a record id) but is
    not yet fully in the database.

    The check_user_can_view_record function does a similar check, but it
    returns the same error code both when the user lacks the right to view
    the record and when the record is not yet fully submitted.

    @param user_info: the user_info dictionary that describes the user.
    @type user_info: user_info dictionary
    @param recid: the record identifier.
    @type recid: positive integer
    @return: True if the record is fresh, False otherwise
    @rtype: bool
    """
    if isinstance(recid, str):
        recid = int(recid)
    if get_fieldvalues(recid, '8560_f'):
        # The email field is set
        return False
    if get_fieldvalues(recid, '245__a'):
        # It has a title
        return False
    return True
def build_issns_from_local_site():
    """
    Retrieve the ISSNs from the local database and store the
    "journal name -> issn" relation.

    Normalize journal names a little bit:
        - strip whitespace chars (left and right)
        - all lower case
        - remove "[Online]" suffix

    Print the result as a Python dict structure.
    """
    rec_id_list = perform_request_search(cc='Periodicals', of='id')
    built_issns = {}
    #built_issns = issns  # Uncomment this to extend existing issns dict
                          # (e.g. in case of manual addition)

    for rec_id in rec_id_list:
        journal_name_list = get_fieldvalues(rec_id, '210__%')
        issn_list = get_fieldvalues(rec_id, '022__a')
        if issn_list:
            issn = issn_list[0]  # There should be only one ISSN
            for journal_name in journal_name_list:
                # Depending on how journal names are entered into the
                # database, you might want to do some processing
                # before saving:
                journal_name = journal_name.lower().strip()
                if journal_name.endswith("[online]"):
                    journal_name = journal_name[:-8].rstrip()

                built_issns[journal_name] = issn

    prtyp = pprint.PrettyPrinter(indent=4)
    prtyp.pprint(built_issns)
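# Hedged sketch (made-up journal name) of the normalization rules applied
# by build_issns_from_local_site above:
def _demo_normalize_journal_name(journal_name):
    journal_name = journal_name.lower().strip()
    if journal_name.endswith("[online]"):
        journal_name = journal_name[:-8].rstrip()
    return journal_name

assert _demo_normalize_journal_name('  Phys. Rev. D [Online]') == 'phys. rev. d'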
def get_video_thumbnail(recid):
    """Return the URL and ALT text for a video thumbnail of a given record."""
    comments = get_fieldvalues(recid, '8564_z')
    descriptions = get_fieldvalues(recid, '8564_y')
    urls = get_fieldvalues(recid, '8564_u')
    for pos, comment in enumerate(comments):
        if comment in ('SUGGESTIONTHUMB', 'BIGTHUMB', 'THUMB',
                       'SMALLTHUMB', 'POSTER'):
            return (urls[pos], descriptions[pos])
    return ("", "")
def get_authors_from_record(recID, tags):
    """Get all authors for a record.

    We need this function because there are three different types of
    authors, and to fetch each of them we need to look through different
    MARC tags.
    """
    authors_list = chain(
        get_fieldvalues(recID, tags['first_author']),
        get_fieldvalues(recID, tags['additional_author']),
        get_fieldvalues(recID, tags['alternative_author_name']))
    authors = set(hash(author) for author in list(authors_list)[:21])
    return authors
def get_journal_info(recid, tags):
    record_info = []

    # TODO: handle records with multiple journals

    tagsvalues = {}  # we store the tags and their values here
                     # like c->444 y->1999 p->"journal of foo",
                     # v->20
    tmp = get_fieldvalues(recid, tags['publication']['journal'])
    if tmp:
        tagsvalues["p"] = tmp[0]
    tmp = get_fieldvalues(recid, tags['publication']['volume'])
    if tmp:
        tagsvalues["v"] = tmp[0]
    tmp = get_fieldvalues(recid, tags['publication']['year'])
    if tmp:
        tagsvalues["y"] = tmp[0]
    tmp = get_fieldvalues(recid, tags['publication']['pages'])
    if tmp:
        # if the page numbers have "x-y" take just x
        pages = tmp[0]
        hpos = pages.find("-")
        if hpos > 0:
            pages = pages[:hpos]
        tagsvalues["c"] = pages

    # check if we have the required data
    ok = True
    for c in tags['publication_format']:
        if c in ('p', 'v', 'y', 'c'):
            if c not in tagsvalues:
                ok = False

    if ok:
        publ = format_journal(tags['publication_format'], tagsvalues)
        record_info += [publ]

        alt_volume = get_alt_volume(tagsvalues['v'])
        if alt_volume:
            tagsvalues2 = tagsvalues.copy()
            tagsvalues2['v'] = alt_volume
            publ = format_journal(tags['publication_format'], tagsvalues2)
            record_info += [publ]

        # Add codens
        for coden in get_kb_mappings('CODENS', value=tagsvalues['p']):
            tagsvalues2 = tagsvalues.copy()
            tagsvalues2['p'] = coden['key']
            publ = format_journal(tags['publication_format'], tagsvalues2)
            record_info += [publ]

    return record_info
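# Hedged sketch of the intermediate data (hypothetical record): assuming
# tags['publication'] maps to the usual journal subfields and
# tags['publication_format'] is something like 'p v (y) c', a record
# published as "Phys.Rev. D70 (2004) 107301" would yield
# tagsvalues = {'p': 'Phys.Rev.', 'v': 'D70', 'y': '2004', 'c': '107301'}.
# The "x-y" page handling above reduces a range to its first page:
def _demo_first_page(pages):
    hpos = pages.find("-")
    if hpos > 0:
        pages = pages[:hpos]
    return pages

assert _demo_first_page('107301-107305') == '107301'
assert _demo_first_page('107301') == '107301'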
def proceedings_link(record):
    cnum = record['cnum']
    out = ''
    if not cnum:
        return out
    search_result = Query("cnum:%s and 980__a:proceedings" % (cnum,)).\
        search().recids
    if search_result:
        if len(search_result) > 1:
            from invenio.legacy.bibrecord import get_fieldvalues
            proceedings = []
            for i, recid in enumerate(search_result):
                doi = get_fieldvalues(recid, '0247_a')
                if doi:
                    proceedings.append(
                        '<a href="/record/%(ID)s">#%(number)s</a> '
                        '(DOI: <a href="http://dx.doi.org/%(doi)s">%(doi)s</a>)'
                        % {'ID': recid, 'number': i + 1, 'doi': doi[0]})
                else:
                    proceedings.append(
                        '<a href="/record/%(ID)s">#%(number)s</a>'
                        % {'ID': recid, 'number': i + 1})
            out = 'Proceedings: '
            out += ', '.join(proceedings)
        elif len(search_result) == 1:
            out += '<a href="/record/' + str(search_result[0]) + \
                '">Proceedings</a>'
    return out
def update_references(recid, overwrite=True):
    """Update references for a record.

    First, we extract references from the record. Then, instead of updating
    the record directly, we add a bibupload task in -c mode, which takes
    care of updating the record.

    Parameters:
    * recid: the id of the record
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, '999'):
            raise RecordHasReferences('Record has references and overwrite '
                                      'mode is disabled: %s' % recid)

    if get_fieldvalues(recid, '999C59'):
        raise RecordHasReferences('Record has been curated: %s' % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, 'w')
    temp_file.write(references_xml)
    temp_file.close()

    # Update record
    task_low_level_submission('bibupload', 'refextract', '-P', '4',
                              '-c', temp_path)
def get_index_strings_by_control_no(control_no):
    """Extract the index-relevant strings from the authority record
    referenced by the 'control_no' parameter and return them as a list
    of strings.

    @param control_no: an (Invenio) MARC internal control_no
        to an authority record
    @type control_no: string (e.g. 'author:(ABC)1234')

    @return: list of index-relevant strings from the referenced
        authority record
    """
    from invenio.legacy.bibindex.engine import list_union

    # return value
    string_list = []
    # 1. get recID and authority type corresponding to control_no
    rec_IDs = get_low_level_recIDs_from_control_no(control_no)
    # 2. concatenate and return all the info from the interesting fields
    #    for this record
    for rec_id in rec_IDs:  # in case we get multiple authority records
        for tag in CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX.get(
                get_type_from_control_no(control_no)):
            new_strings = get_fieldvalues(rec_id, tag)
            string_list = list_union(new_strings, string_list)
    # return
    return string_list
def is_user_viewer_of_record(user_info, recid):
    """
    Check if the user is allowed to view the record, based on the MARC tags
    listed in CFG_ACC_GRANT_VIEWER_RIGHTS_TO_EMAILS_IN_TAGS, i.e. their
    email is inside the 506__m tag or they belong to an e-group listed in
    the 506__m tag.

    :param user_info: the user_info dictionary that describes the user.
    :type user_info: user_info dictionary
    :param recid: the record identifier.
    :type recid: positive integer
    :return: True if the user is allowed to view the record; False otherwise
    :rtype: bool
    """
    from invenio.legacy.bibrecord import get_fieldvalues

    authorized_emails_or_group = []
    for tag in cfg.get('CFG_ACC_GRANT_VIEWER_RIGHTS_TO_EMAILS_IN_TAGS', []):
        authorized_emails_or_group.extend(get_fieldvalues(recid, tag))
    for email_or_group in authorized_emails_or_group:
        if email_or_group in user_info['group']:
            return True
        email = email_or_group.strip().lower()
        if user_info['email'].strip().lower() == email:
            return True
    return False
def is_user_owner_of_record(user_info, recid):
    """Check if the user is the owner of the record, i.e. they are the
    submitter and/or belong to an owner-like group authorized to 'see'
    the record.

    :param user_info: the user_info dictionary that describes the user.
    :type user_info: user_info dictionary
    :param recid: the record identifier.
    :type recid: positive integer
    :return: True if the user is 'owner' of the record; False otherwise
    """
    from invenio.legacy.bibrecord import get_fieldvalues

    authorized_emails_or_group = []
    for tag in cfg.get('CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS', []):
        authorized_emails_or_group.extend(get_fieldvalues(recid, tag))
    for email_or_group in authorized_emails_or_group:
        if email_or_group in user_info['group']:
            return True
        email = email_or_group.strip().lower()
        if user_info['email'].strip().lower() == email:
            return True
        if cfg['CFG_CERN_SITE']:
            # the e-group might be given as an @cern.ch email address
            if email_or_group.replace('@cern.ch', ' [CERN]') in \
                    user_info['group']:
                return True
    return False
def is_periodical(recid):
    rec_type = get_fieldvalues(recid, "690C_a")
    if len(rec_type) > 0:
        for value in rec_type:
            if value == 'PERI':
                return True
    return False
def get_authors_from_record(recID, tags,
                            use_bibauthorid=CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID):
    """Get all authors for a record.

    We need this function because there are three different types of
    authors, and to fetch each of them we need to look through different
    MARC tags.
    """
    if use_bibauthorid:
        authors = get_personids_from_record(recID)
    else:
        authors_list = chain(
            get_fieldvalues(recID, tags['first_author']),
            get_fieldvalues(recID, tags['additional_author']),
            get_fieldvalues(recID, tags['alternative_author_name']))
        authors = set(hash(author) for author in list(authors_list)[:20])
    return authors
def _record_in_files_p(recid, filenames):
    """Search XML files for given record."""
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            if CFG_BIBEDIT_QUEUE_CHECK_METHOD == 'regexp':
                # check via regexp: this is fast, but may not be precise
                file_content = open(filename).read()
                re_match_001 = re.compile(
                    '<controlfield tag="001">%s</controlfield>' % (recid))
                if re_match_001.search(file_content):
                    return True
                for rec_oaiid in rec_oaiid_tag:
                    re_match_oaiid = re.compile(
                        r'<datafield tag="%s" ind1=" " ind2=" ">'
                        r'(\s*<subfield code="a">\s*|\s*<subfield code="9">'
                        r'\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                        % (OAIID_TAG[0:3], re.escape(rec_oaiid)))
                    if re_match_oaiid.search(file_content):
                        return True
                re_match_sysno = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">'
                    r'(\s*<subfield code="a">\s*|\s*<subfield code="9">'
                    r'\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (SYSNO_TAG[0:3], re.escape(str(rec_sysno))))
                if rec_sysno_tag:
                    if re_match_sysno.search(file_content):
                        return True
            else:
                # by default, check via bibrecord: this is accurate,
                # but may be slow
                file_ = open(filename)
                records = create_records(file_.read(), 0, 0)
                for i in range(0, len(records)):
                    record, all_good = records[i][:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid, rec_oaiid,
                                            rec_sysno):
                            return True
                file_.close()
        except IOError:
            continue
    return False
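# Hedged, self-contained sketch of the fast regexp check above, run against
# a hypothetical MARCXML fragment (the real code reads queued task files):
import re

def _demo_regexp_check(recid, file_content):
    re_match_001 = re.compile(
        '<controlfield tag="001">%s</controlfield>' % (recid,))
    return bool(re_match_001.search(file_content))

_demo_xml = '<record><controlfield tag="001">42</controlfield></record>'
assert _demo_regexp_check(42, _demo_xml)
assert not _demo_regexp_check(43, _demo_xml)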
def deleted(self):
    """Return True if record is marked as deleted."""
    from invenio.legacy.bibrecord import get_fieldvalues
    # record exists; now check whether it isn't marked as deleted:
    dbcollids = get_fieldvalues(self.id, "980__%")
    return ("DELETED" in dbcollids) or \
        (current_app.config.get('CFG_CERN_SITE') and "DUMMY" in dbcollids)
def search_result_info(recid):
    """Return report number of a record or, if it doesn't exist,
    return the recid itself.
    """
    report_numbers = get_fieldvalues(recid, '037__a')
    if len(report_numbers) == 0:
        return "#" + str(recid)
    else:
        return report_numbers[0]
def check_authorized_tags(recid, tags, test_func):
    """Check if any tag value in the record passes a given test."""
    authorized_values = []
    for tag in tags:
        authorized_values.extend(get_fieldvalues(recid, tag))
    for value in authorized_values:
        if test_func(value):
            return True
    return False
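# Hedged usage sketch: check_authorized_tags is driven with a per-value
# predicate; the tag list and field data below are hypothetical.
def _demo_check_authorized_tags():
    demo_fields = {'506__m': ['alice@example.org', 'staff-group']}

    def test_func(value):
        return value == 'alice@example.org'

    return any(test_func(value)
               for tag in ['506__m']
               for value in demo_fields.get(tag, []))

assert _demo_check_authorized_tags()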
def get_video_duration(recid):
    """Return the duration of a video."""
    duration = get_fieldvalues(recid, '950__d')
    if duration:
        duration = duration[0]
        duration = timecode_to_seconds(duration)
        return human_readable_time(duration)
    else:
        return ""
def get_control_nos_from_recID(recID):
    """
    Get a list of control numbers from the record ID.

    @param recID: record ID
    @type recID: int

    @return: list of authority record control numbers
    """
    return get_fieldvalues(recID,
                           CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD,
                           repetitive_values=False)
def check_record_for_refextract(recid):
    if get_fieldvalues(recid, '999C6v'):
        # References extracted by refextract
        if get_fieldvalues(recid, '999C59'):
            # They have been curated
            # To put in the HP and create ticket in the future
            needs_submitting = False
        else:
            # They haven't been curated, we safely extract from the new pdf
            needs_submitting = True
    elif not get_fieldvalues(recid, '999C5_'):
        # No references in the record, we can safely extract
        # new references
        needs_submitting = True
    else:
        # Old record, with either no curated references or references
        # curated by SLAC. We cannot distinguish, so we do nothing
        needs_submitting = False

    return needs_submitting
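# Summary of the decision logic above (a hedged reading of the code, where
# 999C6v marks refextract-extracted references, 999C59 marks curation, and
# 999C5_ matches any reference subfield):
#
#   999C6v set,   999C59 set    -> False (curated refextract references)
#   999C6v set,   999C59 empty  -> True  (safe to re-extract)
#   999C6v empty, 999C5_ empty  -> True  (no references at all)
#   999C6v empty, 999C5_ set    -> False (old/ambiguous references)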
def _next_merged_recid(recid):
    """Return the ID of the record merged with the record with ID = recid."""
    from invenio.legacy.bibrecord import get_fieldvalues
    merged_recid = None
    for val in get_fieldvalues(recid, "970__d"):
        try:
            merged_recid = int(val)
            break
        except ValueError:
            pass

    if not merged_recid:
        return None
    else:
        return merged_recid
def get_merged_recid(recID):
    """
    Return the record ID of the record with which the given record
    has been merged.

    @param recID: deleted record recID
    @type recID: int
    @return: merged record recID
    @rtype: int or None
    """
    merged_recid = None
    for val in get_fieldvalues(recID, "970__d"):
        try:
            merged_recid = int(val)
            break
        except ValueError:
            pass
    return merged_recid
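# Hedged sketch (hypothetical field values) of the 970__d parsing shared by
# _next_merged_recid and get_merged_recid above: the first value that parses
# as an integer wins.
def _demo_merged_recid(values):
    for val in values:
        try:
            return int(val)
        except ValueError:
            pass
    return None

assert _demo_merged_recid(['not-a-recid', '123']) == 123
assert _demo_merged_recid([]) is None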
def record_exists(recID):
    """Return 1 if record RECID exists.

    Return 0 if it doesn't exist.
    Return -1 if it exists but is marked as deleted.
    Copied from search_engine.
    """
    out = 0
    # use a parameterized query so recID is properly escaped
    res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1)
    if res:
        # record exists; now check whether it isn't marked as deleted:
        dbcollids = get_fieldvalues(recID, "980__%")
        if ("DELETED" in dbcollids) or \
           (CFG_CERN_SITE and "DUMMY" in dbcollids):
            out = -1  # exists, but marked as deleted
        else:
            out = 1  # exists fine
    return out
def guess_primary_collection_of_a_record(recID):
    """Return the primary collection name a record recid belongs to,
    by testing the 980 identifier.

    May lead to bad guesses when a collection is defined dynamically via
    dbquery. In that case, return 'CFG_SITE_NAME'.
    """
    out = CFG_SITE_NAME
    dbcollids = get_fieldvalues(recID, "980__a")
    for dbcollid in dbcollids:
        variants = ("collection:" + dbcollid,
                    'collection:"' + dbcollid + '"',
                    "980__a:" + dbcollid,
                    '980__a:"' + dbcollid + '"',
                    '980:' + dbcollid,
                    '980:"' + dbcollid + '"')
        res = run_sql(
            "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)",
            variants)
        if res:
            out = res[0][0]
            break
    if CFG_CERN_SITE:
        recID = int(recID)
        # dirty hack for ATLAS collections at CERN:
        if out in ('ATLAS Communications', 'ATLAS Internal Notes'):
            for alternative_collection in (
                    'ATLAS Communications Physics',
                    'ATLAS Communications General',
                    'ATLAS Internal Notes Physics',
                    'ATLAS Internal Notes General',
            ):
                if recID in get_collection_reclist(alternative_collection):
                    return alternative_collection
        # dirty hack for FP
        FP_collections = {
            'DO': ['Current Price Enquiries', 'Archived Price Enquiries'],
            'IT': ['Current Invitation for Tenders',
                   'Archived Invitation for Tenders'],
            'MS': ['Current Market Surveys', 'Archived Market Surveys'],
        }
        fp_coll_ids = [coll for coll in dbcollids if coll in FP_collections]
        for coll in fp_coll_ids:
            for coll_name in FP_collections[coll]:
                if recID in get_collection_reclist(coll_name):
                    return coll_name

    return out
def get_field_count(recID, tags):
    """
    Return the number of field instances having TAGS in record RECID.

    @param recID: record ID
    @type recID: int
    @param tags: list of tags to count, e.g. ['100__a', '700__a']
    @type tags: list
    @return: number of tags present in record
    @rtype: int
    @note: Works internally via getting field values, which may not be
        very efficient. Could use counts only, or else retrieve stored
        recstruct format of the record and walk through it.
    """
    out = 0
    for tag in tags:
        out += len(get_fieldvalues(recID, tag))
    return out
def guess_authority_types(recID):
    """
    Guess the type(s) (e.g. AUTHOR, INSTITUTE, etc.) of an authority record
    (should only have one value).

    @param recID: the record ID of the authority record
    @type recID: int

    @return: list of strings
    """
    types = get_fieldvalues(recID,
                            '980__a',
                            repetitive_values=False)  # remove possible duplicates!
    # filter out unwanted information
    while CFG_BIBAUTHORITY_AUTHORITY_COLLECTION_IDENTIFIER in types:
        types.remove(CFG_BIBAUTHORITY_AUTHORITY_COLLECTION_IDENTIFIER)
    types = [_type for _type in types if _type.isalpha()]

    return types
def guess_main_name_from_authority_recID(recID):
    """
    Get the main name of the authority record.

    @param recID: the record ID of the authority record
    @type recID: int

    @return: the main name of this authority record (string)
    """
    # tags where the main authority record name can be found
    main_name_tags = ['100__a', '110__a', '130__a', '150__a']
    main_name = ''
    # look for first match only
    for tag in main_name_tags:
        fieldvalues = get_fieldvalues(recID, tag, repetitive_values=False)
        if len(fieldvalues):
            main_name = fieldvalues[0]
            break
    # return first match, if found
    return main_name
def guess_primary_collection_of_a_record(recID):
    """Return the primary collection name a record recid belongs to,
    by testing the 980 identifier.

    May lead to bad guesses when a collection is defined dynamically via
    dbquery. In that case, return 'CFG_SITE_NAME'.
    """
    out = CFG_SITE_NAME
    dbcollids = get_fieldvalues(recID, "980__a")
    for dbcollid in dbcollids:
        variants = ("collection:" + dbcollid,
                    'collection:"' + dbcollid + '"',
                    "980__a:" + dbcollid,
                    '980__a:"' + dbcollid + '"',
                    '980:' + dbcollid,
                    '980:"' + dbcollid + '"')
        res = run_sql(
            "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)",
            variants)
        if res:
            out = res[0][0]
            break
    return out
def record_exists(recID):
    """Return 1 if record RECID exists.

    Return 0 if it doesn't exist.
    Return -1 if it exists but is marked as deleted.
    """
    from invenio.config import CFG_CERN_SITE
    try:
        # if recid is '123foo', mysql will return id=123, and we don't want that
        recID = int(recID)
    except (ValueError, TypeError):
        return 0

    out = 0
    res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1)
    if res:
        # record exists; now check whether it isn't marked as deleted:
        dbcollids = get_fieldvalues(recID, "980__%")
        if ("DELETED" in dbcollids) or \
           (CFG_CERN_SITE and "DUMMY" in dbcollids):
            out = -1  # exists, but marked as deleted
        else:
            out = 1  # exists fine
    return out
def book_title_from_MARC(recid):
    """
    Retrieve book's title from MARC.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return book's title
    """
    title_tags = get_field_tags('title')

    book_title = ''
    i = 0
    while book_title == '' and i < len(title_tags):
        tag_values = get_fieldvalues(recid, title_tags[i])
        for candidate in tag_values:
            book_title = book_title + candidate + ': '
        i += 1

    book_title = book_title[:-2]

    return book_title
def get_item_info_for_search_result(recid):
    """
    Get the item's info from MARC in order to create a search result
    with more details.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return book's information (author, editor and number of copies)
    """
    book_author = ' '.join(get_fieldvalues(recid, "100__a") +
                           get_fieldvalues(recid, "100__u"))

    book_editor = ' , '.join(get_fieldvalues(recid, "260__a") +
                             get_fieldvalues(recid, "260__b") +
                             get_fieldvalues(recid, "260__c"))

    book_copies = ' '.join(get_fieldvalues(recid, "964__a"))

    book_infos = (book_author, book_editor, book_copies)

    return book_infos
def check_arxiv(recid):
    """Return True for arXiv papers."""
    for report_number in get_fieldvalues(recid, '037__9'):
        if report_number == 'arXiv':
            return True
    return False
def get_most_popular_field_values(recids, tags, exclude_values=None,
                                  count_repetitive_values=True, split_by=0):
    """Analyze RECIDS and look for TAGS and return most popular values.

    Optionally return the frequency with which they occur, sorted according
    to descending frequency.

    If a value is found in EXCLUDE_VALUES, then do not count it.

    If COUNT_REPETITIVE_VALUES is True, then we count every occurrence of
    value in the tags. If False, then we count the value only once
    regardless of the number of times it may appear in a record. (But if
    the same value occurs in another record, we count it, of course.)

    Example:

    .. code-block:: python

        >>> get_most_popular_field_values(range(11,20), '980__a')
        [('PREPRINT', 10), ('THESIS', 7), ...]
        >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'))
        [('Ellis, J', 10), ('Ellis, N', 7), ...]
        >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'),
        ...                               ('Ellis, J'))
        [('Ellis, N', 7), ...]

    :return: list of tuples containing value and its frequency
    """
    from invenio.legacy.bibrecord import get_fieldvalues

    valuefreqdict = {}
    # sanity check:
    if not exclude_values:
        exclude_values = []
    if isinstance(tags, string_types):
        tags = (tags,)
    # find values to count:
    vals_to_count = []
    displaytmp = {}
    if count_repetitive_values:
        # counting technique A: can look up many records at once: (very fast)
        for tag in tags:
            vals_to_count.extend(get_fieldvalues(recids, tag, sort=False,
                                                 split_by=split_by))
    else:
        # counting technique B: must count record-by-record: (slow)
        for recid in recids:
            vals_in_rec = []
            for tag in tags:
                for val in get_fieldvalues(recid, tag, False):
                    vals_in_rec.append(val)
            # do not count repetitive values within this record
            # (even across various tags, so need to unify again):
            dtmp = {}
            for val in vals_in_rec:
                dtmp[val.lower()] = 1
                displaytmp[val.lower()] = val
            vals_in_rec = dtmp.keys()
            vals_to_count.extend(vals_in_rec)
    # are we to exclude some of found values?
    for val in vals_to_count:
        if val not in exclude_values:
            if val in valuefreqdict:
                valuefreqdict[val] += 1
            else:
                valuefreqdict[val] = 1
    # sort by descending frequency of values:
    f = []   # frequencies
    n = []   # original names
    ln = []  # lowercased names
    # build lists within one iteration
    for (val, freq) in iteritems(valuefreqdict):
        f.append(-1 * freq)
        if val in displaytmp:
            n.append(displaytmp[val])
        else:
            n.append(val)
        ln.append(val.lower())
    # sort by frequency (desc) and then by lowercased name.
    return [(n[i], -1 * f[i]) for i in numpy.lexsort([ln, f])]
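# Hedged, self-contained sketch of counting technique B above: values are
# deduplicated within one record (case-insensitively) but still counted once
# per record in which they occur. The records below are hypothetical.
def _demo_count_once_per_record(records, exclude_values=()):
    freq = {}
    for vals_in_rec in records:
        seen = set(val.lower() for val in vals_in_rec)
        for val in seen:
            if val not in exclude_values:
                freq[val] = freq.get(val, 0) + 1
    return freq

_demo_records = [['THESIS', 'Thesis'], ['THESIS'], ['PREPRINT']]
assert _demo_count_once_per_record(_demo_records) == {'thesis': 2,
                                                      'preprint': 1}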