def check_pubdate_without_month_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Check that a pubdate without a month was not selected while other
    dates carrying a month were available.

    Dates are in "YYYY-MM-DD" format; a month of "00" counts as missing.
    Returns False as soon as a selected field has a valid month; otherwise
    logs a check error (via manage_check_error) when a non-selected field
    has one. Returns None in all other cases.
    """
    logger.info(" running check_pubdate_without_month_selected")

    # dates in format "YYYY-MM-DD"
    def has_valid_month(date_str):
        # Chars 5-6 must parse to a nonzero integer to count as a month.
        try:
            return int(date_str[5:7]) != 0
        except (ValueError, TypeError):
            # Malformed or too-short date strings count as "no month".
            # (The original bare `except:` also swallowed things like
            # KeyboardInterrupt — narrowed to what int() can raise.)
            return False

    field_with_month = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        if field not in final_result_fields:
            # Non-selected field: count every date that does have a month.
            for subfield in field:
                if subfield[0] in subfield_list and has_valid_month(subfield[1]):
                    field_with_month += 1
        else:
            # Selected field: a date with a valid month means the check passes.
            for subfield in field:
                if subfield[0] in subfield_list and has_valid_month(subfield[1]):
                    return False
    if field_with_month > 0:
        manage_check_error(
            'Date without month selected while other one with month is present in field "%s"!' % tag,
            type_check, logger
        )
    return None
def check_longer_string_not_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Check that the longest available string for the given subfields was
    selected; log a check error if a non-selected field holds a strictly
    longer string than any selected one. Returns None.
    """
    logger.info(" running check_longer_string_not_selected")
    cur_max_len = 0
    max_len_field_not_sel = False
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        if field not in final_result_fields:
            # Non-selected field: remember if it holds the longest string so far.
            for subfield in field:
                if subfield[0] in subfield_list and len(subfield[1]) > cur_max_len:
                    cur_max_len = len(subfield[1])
                    max_len_field_not_sel = True
        else:
            # Selected field: ">=" (not ">") so a selected string of EQUAL
            # length clears the flag regardless of iteration order. The
            # original used ">", which wrongly reported an error whenever a
            # non-selected string of the same length happened to be seen first.
            for subfield in field:
                if subfield[0] in subfield_list and len(subfield[1]) >= cur_max_len:
                    cur_max_len = len(subfield[1])
                    max_len_field_not_sel = False
    if max_len_field_not_sel:
        manage_check_error('Longer field "%s" not selected!' % tag, type_check, logger)
    return None
def check_uppercase_string_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Check whether an all-upper-case string was selected while a
    lower/mixed-case alternative exists among the non-selected fields.

    Returns False as soon as a selected string is not fully upper case;
    otherwise logs a check error when at least one non-selected string is
    not fully upper case. Returns None in all other cases.
    """
    logger.info(" running check_uppercase_string_selected")
    notsel_field_lower = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        if field not in final_result_fields:
            # Non-selected field: count strings that are not fully upper case.
            for subfield in field:
                if subfield[0] in subfield_list and not subfield[1].isupper():
                    notsel_field_lower += 1
        else:
            # Selected field: a non-upper-case string means the check passes.
            for subfield in field:
                if subfield[0] in subfield_list and not subfield[1].isupper():
                    return False
    if notsel_field_lower > 0:
        manage_check_error(
            'Upper case string selected instead of a lower case one in field "%s"!' % tag,
            type_check, logger
        )
    return None
def check_author_from_shorter_list(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Simply checks that the return list of authors is the longest possible.
    This check relies on the fact that we don't merge authors from different
    origins, but we simply add subfields for the ones we selected.
    """
    logger.info(" running check_author_from_shorter_list")
    # len(fields1) equals len([field[0] for field in fields1]) — no need to
    # materialize throwaway lists just to count them.
    longer_list = fields1 if len(fields1) >= len(fields2) else fields2
    # If the merged result is shorter than the longest input list, we lost authors.
    if len(final_result) < len(longer_list):
        manage_check_error('Longer list of authors not selected in field "%s"!' % tag, type_check, logger)
    return None
def check_collections_existence(merged_record, type_check):
    """Function that checks if there is at least one collection.

    A missing 'collection' key and an empty collection list are the same
    defect, so both paths report the same error.
    """
    logger.info(' running check_collections_existence')
    try:
        collections_fields = merged_record[FIELD_TO_MARC['collection']]
    except KeyError:
        collections_fields = []
    if not collections_fields:
        manage_check_error('No Collection field!', type_check, logger)
    return None
def check_one_date_per_type(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Function to check if there are multiple dates of the same type"""
    logger.info(" running check_one_date_per_type")
    # Subfield codes: subfield_list[0][1] holds the date type,
    # subfield_list[0][0] holds the date value.
    type_code = subfield_list[0][1]
    value_code = subfield_list[0][0]
    # Group every date value of the merged result under its date type.
    dates_by_type = {}
    for merged_field in final_result:
        date_type = bibrecord.field_get_subfield_values(merged_field, type_code)[0]
        date_value = bibrecord.field_get_subfield_values(merged_field, value_code)[0]
        dates_by_type.setdefault(date_type, []).append(date_value)
    # More than one distinct date for the same type is an inconsistency.
    for date_type in dates_by_type:
        if len(set(dates_by_type[date_type])) > 1:
            manage_check_error('Multiple dates for type "%s" in field "%s".' % (date_type, tag), type_check, logger)
    return None
def check_duplicate_normalized_author_names(fields1, fields2, final_result, type_check, subfield_list, tag):
    """ Checks if there are authors with the same normalized name.
    This will prevent the correct matching of authors from one author list
    to the other. """
    logger.info(" running check_duplicate_normalized_author_names")
    seen_names = set()
    for author_field in final_result:
        norm_name = bibrecord.field_get_subfield_values(author_field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if norm_name not in seen_names:
            seen_names.add(norm_name)
        else:
            # A duplicated normalized name is only logged, not raised:
            # the trusted list is simply returned as-is.
            manage_check_error(
                'Duplicated normalized author name for "%s" in field "%s".' % (norm_name, tag),
                type_check, logger
            )
    return None
def check_pub_year_consistency(merged_record, type_check):
    """Function that checks if the publication year is consistent with the year at the beginning of the bibcode"""
    logger.info(' running check_pub_year_consistency')
    # Preprint dates are exempt from this consistency check.
    dates_to_skip_from_check = ['date-preprint']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        pub_dates_fields = merged_record[FIELD_TO_MARC['publication date']]
    except KeyError:
        manage_check_error('No Publication Date field!', type_check, logger)
        return None
    # The system number must be unique; with more than one the check
    # cannot proceed at all.
    if len(system_number_fields) > 1:
        manage_check_error('There are more than one System Numbers!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    num_dates_checked = 0
    for date_type_string in PUBL_DATE_TYPE_VAL_SUBFIELD:
        if date_type_string in dates_to_skip_from_check:
            continue
        # Pick the first date of this type (different types can coexist
        # across the publication-date fields); '' means none found.
        pubdate = next(
            (bibrecord.field_get_subfield_values(field, PUBL_DATE_SUBFIELD)[0]
             for field in pub_dates_fields
             if bibrecord.field_get_subfield_values(field, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type_string),
            '')
        if not pubdate:
            continue
        num_dates_checked += 1
        # The year is the first four characters of both date and bibcode.
        if pubdate[0:4] != system_number[0:4]:
            manage_check_error('Year of "%s" not consistent with the main bibcode "%s"!' % (date_type_string, system_number), type_check, logger)
    if num_dates_checked == 0:
        manage_check_error('No dates available for this record!', type_check, logger)
    return None
def check_different_keywords_for_same_type(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Check that keyword groups coming from the same origin (institution)
    in the two input sets carry the same keywords.

    If the second set has a keyword for an institution already present in
    the first set but the keyword itself is missing there, a check error is
    logged (once) and the scan stops. Returns None.
    """
    logger.info(" running check_different_keywords_for_same_type")

    def _origin_and_keyword(field):
        # Extract the (origin institution, keyword string) pair from one field.
        origin = " "
        keyword_string = ""
        for subfield in field:
            if subfield[0] == KEYWORD_ORIGIN_SUBFIELD:
                origin = subfield[1]
            if subfield[0] == KEYWORD_STRING_SUBFIELD:
                keyword_string = subfield[1]
        return origin, keyword_string

    # Group the keywords of the first set by their origin institution.
    # (Original variable was misspelled "kewords_per_institution".)
    keywords_per_institution = {}
    for field in [field[0] for field in fields1]:
        institution, keyword_string = _origin_and_keyword(field)
        keywords_per_institution.setdefault(institution, set()).add(keyword_string)
    # For each keyword of the other set: if the institution is already known,
    # the keyword must already be present — otherwise the two groups diverge.
    for field in [field[0] for field in fields2]:
        institution, keyword_string = _origin_and_keyword(field)
        if institution in keywords_per_institution and \
                keyword_string not in keywords_per_institution[institution]:
            manage_check_error(
                'Different groups with same keyword system don\'t have the same list of keywords (field "%s")!' % tag,
                type_check,
                logger,
            )
            break
    return None
def check_string_with_unicode_not_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """ Function that checks if a string without unicode has been selected
    instead of one containing unicode. If multiple strings have been
    selected, then only an unicode one is enough to return false. """

    def is_unicode(s):
        # True when the string contains non-ASCII characters.
        try:
            s.decode()  # Python 2 byte string: default (ascii) decode
            return False
        except UnicodeDecodeError:
            return True
        except AttributeError:
            # Python 3 str has no .decode(); the original code would crash
            # here with an uncaught AttributeError. Mirror the py2 semantics
            # by testing ASCII-encodability instead.
            try:
                s.encode('ascii')
                return False
            except UnicodeEncodeError:
                return True

    logger.info(" running check_string_with_unicode_not_selected")
    notsel_field_unicode = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        if field not in final_result_fields:
            # Non-selected field: count strings containing unicode.
            for subfield in field:
                if subfield[0] in subfield_list and is_unicode(subfield[1]):
                    notsel_field_unicode += 1
        else:
            # Selected field: one unicode string is enough to pass the check.
            for subfield in field:
                if subfield[0] in subfield_list and is_unicode(subfield[1]):
                    return False
    if notsel_field_unicode > 0:
        manage_check_error(
            'Field "%s" with unicode string not selected (all the selected fields are not unicode)!' % tag,
            type_check,
            logger,
        )
    return None
def first_author_bibcode_consistency(merged_record, type_check):
    """Function that checks if the last letter of the main bibcode is consistent with the first letter of the first author"""
    logger.info(' running first_author_bibcode_consistency')
    # Bibcodes whose bibstem (the part starting at char 4) is listed here
    # are exempt from the check.
    bibstems_to_skip_from_check = ['QB']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        first_author_fields = merged_record[FIELD_TO_MARC['first author']]
    except KeyError:
        manage_check_error('No First Author field!', type_check, logger)
        return None
    # Both the system number and the first author must be unique;
    # otherwise the check cannot proceed.
    if len(system_number_fields) > 1:
        manage_check_error('There are more than one System Numbers!', type_check, logger)
        return None
    if len(first_author_fields) > 1:
        manage_check_error('There are more than one First Author!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    first_author = bibrecord.field_get_subfield_values(first_author_fields[0], AUTHOR_NAME_SUBFIELD)[0]
    # Exempt bibcodes whose bibstem matches one of the skip entries.
    if any(system_number[4:4 + len(stem)] == stem for stem in bibstems_to_skip_from_check):
        return None
    if first_author[0].lower() != system_number[-1].lower():
        if system_number[-1] == '.':
            # A trailing dot means the bibcode carries no author initial at all,
            # which deserves a dedicated message.
            manage_check_error('The main bibcode "%s" doesn\'t have an initial even if there is a First Author "%s"!' % (system_number, first_author), type_check, logger)
        else:
            manage_check_error('First Author "%s" not consistent with the main bibcode "%s"!' % (first_author, system_number), type_check, logger)
    return None