Exemplo n.º 1
0
def get_trusted_and_untrusted_fields(fields1, fields2, tag):
    """
    Selects the most trusted fields.

    Only the origin lookup for ``fields1`` is present in this copy of the
    function: the origin is resolved with ``get_origin`` and its importance
    for ``tag`` is looked up with ``get_origin_importance``. Nothing is
    returned.

    NOTE(review): ``fields2`` is never used and the body looks cut short; a
    longer definition with the same name appears later in this file --
    confirm which of the two is meant to be kept.

    Raises:
        OriginValueNotFound: logged at critical level and re-raised when it
            is raised while resolving the origin or its importance.
    """
    try:
        origin1 = get_origin(fields1)
        origin_val1 = get_origin_importance(tag, origin1)
    except OriginValueNotFound as error:
        # "except ... as ..." is valid on Python 2.6+ and on Python 3,
        # unlike the older "except X, error" spelling.
        logger.critical(error)
        raise
def merge_creation_modification_dates(merged_record):
    """Merge the creation and modification dates of a merged record.

    Only the creation/modification date fields whose origin is also used by
    at least one other field of the record are taken into account. Among
    those, the smallest creation date and the largest modification date are
    kept, and the origin with the highest importance (as returned by
    ``get_origin_importance``) is used as origin of the resulting field.

    The input record is never modified: a deep copy is updated and returned.

    Returns:
        The copy of ``merged_record`` with a single creation/modification
        date field. The copy is returned unchanged when the record has no
        creation/modification date field, or when none of those fields has
        an origin that is used by the other fields.

    NOTE(review): dates are compared with the plain ``<=`` / ``>=`` ordering
    of the values returned by ``bibrecord`` -- presumably strings whose
    lexical order matches the chronological one; confirm.
    """
    # Work on a local copy so the caller's record is left untouched.
    record = deepcopy(merged_record)
    date_tag = FIELD_TO_MARC['creation and modification date']

    try:
        creat_mod = record[date_tag]
    except KeyError:
        logger.warning('      No Creation-Modification field available!')
        return record

    # Collect the (non-empty) origins of every field except the
    # creation/modification date itself; a set makes them unique.
    origins = set()
    for field_code in record:
        if field_code == date_tag:
            continue
        for field in record[field_code]:
            try:
                origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
            except IndexError:
                # A field without origin is a problem, but it is not
                # something to be handled here.
                continue
            if origin != '':
                origins.add(origin)

    # Fold every date field with a known origin into one set of values.
    merged = {}
    for field in creat_mod:
        try:
            origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
        except IndexError:
            origin = ''
        if origin not in origins:
            continue

        creation = bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
        modification = bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]
        importance = get_origin_importance(date_tag, origin)

        if not merged:
            # First usable field: take its values as they are.
            merged[CREATION_DATE_SUBFIELD] = creation
            merged[MODIFICATION_DATE_SUBFIELD] = modification
            merged[ORIGIN_SUBFIELD] = origin
            merged['origin_importance'] = importance
            continue

        # Oldest creation date and newest modification date win. The
        # modification date must be read from and written to its own
        # subfield, not the creation one.
        merged[CREATION_DATE_SUBFIELD] = min(merged[CREATION_DATE_SUBFIELD], creation)
        merged[MODIFICATION_DATE_SUBFIELD] = max(merged[MODIFICATION_DATE_SUBFIELD], modification)
        # The origin of the merged field is the most trusted one; on equal
        # importance the origin found first is kept.
        if importance > merged['origin_importance']:
            merged[ORIGIN_SUBFIELD] = origin
            merged['origin_importance'] = importance

    if not merged:
        # No date field shares its origin with the rest of the record (or
        # the field list is empty): there is nothing to build the merged
        # field from, so the existing fields are left as they are.
        logger.warning('      No Creation-Modification field with an origin used in the record!')
        return record

    # Replace all the date fields with the merged one; the remaining
    # elements of the first original field tuple are reused as they are.
    merged_subfields = [
        (MODIFICATION_DATE_SUBFIELD, merged[MODIFICATION_DATE_SUBFIELD]),
        (CREATION_DATE_SUBFIELD, merged[CREATION_DATE_SUBFIELD]),
        (ORIGIN_SUBFIELD, merged[ORIGIN_SUBFIELD]),
    ]
    record[date_tag] = [(merged_subfields, ) + creat_mod[0][1:]]
    return record
Exemplo n.º 3
0
def _split_references_by_merging_type(fields):
    """Split reference fields into 'take_all' and 'priority' groups by origin."""
    groups = {'take_all': [], 'priority': []}
    for field in fields:
        if bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0] in REFERENCES_MERGING_TAKE_ALL_ORIGINS:
            groups['take_all'].append(field)
        else:
            groups['priority'].append(field)
    return groups


def _first_subfield_value(subfields, code):
    """Return the value of the first subfield with the given code, or None."""
    for subfield in subfields:
        if subfield[0] == code:
            return subfield[1]
    return None


def _merge_reference_subfields(known_field, new_field, bibcode_res, tag):
    """Merge the subfields of new_field into known_field and return the merged field."""
    inlist = known_field[0]
    outlist = new_field[0]

    # Start from the subfields of the reference already stored.
    new_subfields = {}
    for subfield in inlist:
        new_subfields[subfield[0]] = subfield[1]
    origin_imp_inlist = get_origin_importance(tag, new_subfields[ORIGIN_SUBFIELD])

    # Origin (and its importance) of the incoming reference. They are looked
    # up afresh for every field so that a field without origin subfield can
    # never reuse the values of a previously processed field; such a field is
    # simply never considered more trusted.
    origin_outlist = _first_subfield_value(outlist, ORIGIN_SUBFIELD)
    if origin_outlist is None:
        origin_imp_outlist = None
    else:
        origin_imp_outlist = get_origin_importance(tag, origin_outlist)
    out_more_trusted = origin_imp_outlist is not None and origin_imp_outlist > origin_imp_inlist

    # Reference extension of the incoming reference, if it exists.
    extension_outlist = _first_subfield_value(outlist, REFERENCE_EXTENSION)

    for subfield in outlist:
        code, value = subfield[0], subfield[1]
        # A subfield that is completely missing is added, unless it is an
        # extension: that one only travels with its reference string.
        if code not in new_subfields and code != REFERENCE_EXTENSION:
            logger.info('      Subfield "%s" added to reference "%s".' % (code, bibcode_res))
            new_subfields[code] = value
        elif code in new_subfields and code == REFERENCE_STRING:
            refstring_out = value
            refstring_in = new_subfields[REFERENCE_STRING]
            # The stored string is empty or just the bibcode while the
            # incoming one has content: take the incoming string whatever
            # its origin is.
            if (refstring_in == bibcode_res or len(refstring_in) == 0) and len(refstring_out) != 0:
                new_subfields[REFERENCE_STRING] = refstring_out
                logger.info('      Reference string (bibcode only or empty) replaced by the one with origin "%s" for reference %s".' % (origin_outlist, bibcode_res))
                # The extension belongs to the string, so it is copied too.
                if extension_outlist is not None:
                    new_subfields[REFERENCE_EXTENSION] = extension_outlist
                    logger.info('      Reference extension replaced by the one with value "%s" for reference %s".' % (extension_outlist, bibcode_res))
                # The origin is updated only if the new one is more trusted.
                if out_more_trusted:
                    # Log first: the message needs the old origin.
                    logger.info('      Reference origin "%s" replaced by the more trusted "%s".' % (new_subfields[ORIGIN_SUBFIELD], origin_outlist))
                    new_subfields[ORIGIN_SUBFIELD] = origin_outlist
            # Otherwise the stored string is a real one and is replaced only
            # by a string coming from a more trusted origin.
            elif out_more_trusted:
                new_subfields[REFERENCE_STRING] = refstring_out
                logger.info('      Reference string replaced by the one with origin "%s" for reference %s".' % (origin_outlist, bibcode_res))
                if extension_outlist is not None:
                    new_subfields[REFERENCE_EXTENSION] = extension_outlist
                    logger.info('      Reference extension replaced by the one with value "%s" for reference %s".' % (extension_outlist, bibcode_res))
                # Log first: the message needs the old origin.
                logger.info('      Reference origin "%s" replaced by the more trusted "%s".' % (new_subfields[ORIGIN_SUBFIELD], origin_outlist))
                new_subfields[ORIGIN_SUBFIELD] = origin_outlist

    # list(...) keeps the subfields a real list on Python 3 as well, where
    # dict.items() is only a view.
    return (list(new_subfields.items()), ) + known_field[1:]


def references_merger(fields1, fields2, tag):
    """Merging function for references.

    When one of the two lists is empty the concatenation of the two is
    returned untouched. Otherwise:

    1. each list is split in references whose origin is in
       ``REFERENCES_MERGING_TAKE_ALL_ORIGINS`` and all the others;
    2. the first groups are merged with ``take_all``, the second ones with
       ``priority_based_merger``, and the two results are joined with
       ``take_all``;
    3. the resolved references (the ones with a non-empty
       ``REFERENCE_RESOLVED_KEY`` subfield) are made unique on that value.
       The reference string, with its extension, is taken from the most
       trusted origin, or from the other one when the string stored so far
       is empty or contains only the bibcode.

    Returns:
        A list with the unique resolved references followed by the
        unresolved ones. Every returned field is a copy of the merged one.

    Raises:
        IndexError: if a field of ``fields1`` or ``fields2`` has no origin
            subfield.
    """
    # If one of the two lists is empty there is nothing to merge.
    if not fields1 or not fields2:
        logger.info('        Only one field for "%s".' % tag)
        return fields1+fields2

    # References are split in two groups: the ones that should be merged
    # all together and the ones where one origin takes over the other.
    # (fields2 in theory should always be of the same origin type)
    groups1 = _split_references_by_merging_type(fields1)
    groups2 = _split_references_by_merging_type(fields2)

    taken_all = take_all(groups1['take_all'], groups2['take_all'], tag)
    prioritized = priority_based_merger(groups1['priority'], groups2['priority'], tag)
    global_list = take_all(taken_all, prioritized, tag)

    # Finally the resolved references are made unique.
    unique_references_dict = {}
    unresolved_references = []
    for field in global_list:
        fieldcp = deepcopy(field)
        try:
            bibcode_res = bibrecord.field_get_subfield_values(fieldcp, REFERENCE_RESOLVED_KEY)[0]
        except IndexError:
            bibcode_res = None

        if not bibcode_res:
            unresolved_references.append(fieldcp)
        elif bibcode_res not in unique_references_dict:
            # First reference found for this bibcode.
            unique_references_dict[bibcode_res] = fieldcp
        else:
            unique_references_dict[bibcode_res] = _merge_reference_subfields(
                unique_references_dict[bibcode_res], fieldcp, bibcode_res, tag)

    # Union of resolved and unresolved references; list(...) is needed on
    # Python 3, where dict.values() cannot be concatenated to a list.
    return list(unique_references_dict.values()) + unresolved_references
Exemplo n.º 4
0
    return unique_references_dict.values() + unresolved_references
    

def get_trusted_and_untrusted_fields(fields1, fields2, tag):
    """
    Selects the most trusted fields.

    The origin of each group of fields is resolved with ``get_origin`` and
    ranked for ``tag`` with ``get_origin_importance``; the group whose
    origin has the higher importance is the trusted one.

    Args:
        fields1: fields coming from record 1.
        fields2: fields coming from record 2.
        tag: tag passed to ``get_origin_importance`` to rank the origins.

    Returns:
        A ``(trusted_fields, untrusted_fields)`` tuple.

    Raises:
        OriginValueNotFound: logged at critical level and re-raised when it
            is raised while resolving an origin or its importance.
        EqualOrigins: when the two origins have the same importance.
    """
    ranked = []
    # fields1 is always resolved before fields2, so a failure on the first
    # group is reported before the second one is even looked at.
    for fields in (fields1, fields2):
        try:
            origin = get_origin(fields)
            ranked.append((origin, get_origin_importance(tag, origin)))
        except OriginValueNotFound as error:
            # "except ... as ..." is valid on Python 2.6+ and on Python 3,
            # unlike the older "except X, error" spelling.
            logger.critical(error)
            raise
    (origin1, origin_val1), (origin2, origin_val2) = ranked

    if origin_val1 > origin_val2:
        logger.info('      Selected fields from record 1 (%s over %s).' % (origin1, origin2))
        return fields1, fields2
    elif origin_val1 < origin_val2:
        logger.info('      Selected fields from record 2 (%s over %s).' % (origin2, origin1))
        return fields2, fields1
    else:
        raise EqualOrigins(str(origin1) + ' - ' + str(origin2))
    

def _get_best_fields(fields1, fields2, tag):