예제 #1
0
class WoRMSCurator(object):
    """
    Curator that checks taxon names against the WoRMS taxonomic name database
    (reached via the AphiaNameService) and rewrites records accordingly.
    """

    def __init__(self):
        """Create the WoRMSService client that performs the name lookups."""
        self._worms = WoRMSService()

    def curate_taxon_name_and_author(self, input_record):
        """
        Update a record's taxon name and author from its WoRMS aphia record.

        The record's 'TaxonName' is looked up through the WoRMS service.  When
        an aphia record is found, the incoming 'TaxonName' and 'Author' values
        are preserved under 'OriginalName' and 'OriginalAuthor', then replaced
        by the aphia record's 'scientificname' and 'authority'; the fields
        'WoRMsExactMatch' and 'lsid' are added as well.  When nothing is found,
        the four added fields are set to None and the name and author are left
        as they were.

        The record is modified in place and the same object is returned.
        """
        name_matched_exactly, worms_entry = self._worms.aphia_record_by_taxon_name(
            input_record['TaxonName'])

        # nothing found in WoRMS: only blank out the fields this method adds
        if worms_entry is None:
            for added_field in ('OriginalName', 'OriginalAuthor',
                                'WoRMsExactMatch', 'lsid'):
                input_record[added_field] = None
            return input_record

        # keep the incoming name and author under new keys before overwriting
        input_record['OriginalName'] = input_record['TaxonName']
        input_record['OriginalAuthor'] = input_record['Author']

        # overwrite name and author with the values held by WoRMS
        input_record['TaxonName'] = worms_entry['scientificname']
        input_record['Author'] = worms_entry['authority']

        # record how the match was made and the identifier of the taxon
        input_record['WoRMsExactMatch'] = name_matched_exactly
        input_record['lsid'] = worms_entry['lsid']

        return input_record
def clean_data_using_worms(input_data_file_name,
                           cleaned_data_file_name,
                           rejected_data_file_name,
                           input_field_delimiter=',',
                           output_field_delimiter=','):
    """
    Clean the scientific names and authorships in a CSV file using WoRMS.

    The 'scientificName' of every input record is looked up in WoRMS, first
    with an exact match and, if that fails, with a fuzzy match.  Records that
    match neither way are written unchanged to the rejected-data file.
    Matched records are written to the cleaned-data file with the extra
    columns 'LSID' and 'WoRMsMatchResult'; after a fuzzy match the scientific
    name is replaced by the WoRMS name, and the authorship is replaced
    whenever the WoRMS authority differs from the input value.  Replaced
    values are kept in 'originalScientificName' and 'originalAuthor'.

    Each output file is created only when the first record destined for it is
    seen.  Every file opened here is closed before the summary messages are
    emitted, and also when processing fails part-way through.

    Args:
        input_data_file_name: path of the CSV file to read; it must provide
            the columns 'scientificName' and 'scientificNameAuthorship'.
        cleaned_data_file_name: path of the CSV file for accepted records.
        rejected_data_file_name: path of the CSV file for rejected records.
        input_field_delimiter: field delimiter of the input file.
        output_field_delimiter: field delimiter of both output files.

    Returns:
        None.  Progress is reported through timestamp().
    """

    worms = WoRMSService()
    accepted_record_count = 0
    rejected_record_count = 0

    # The output writers are created lazily inside the loop.  They and their
    # underlying file objects are tracked explicitly (instead of probing
    # locals()) so the finally clause can close whatever was opened.
    input_data_file = None
    cleaned_data = None
    cleaned_data_file = None
    rejected_data = None
    rejected_data_file = None

    try:

        ##############################################################################################
        # @BEGIN read_input_data_records
        # @PARAM input_data_file_name
        # @PARAM input_field_delimiter
        # @IN input_data @FILE file:{input_data_file_name}
        # @OUT original_record

        # create CSV reader for input records
        timestamp("Reading input records from '{0}'.".format(input_data_file_name))
        # NOTE(review): file modes are kept as in the original; under Python 3
        # the csv module recommends opening files with newline='' - confirm
        # the target interpreter before changing them.
        input_data_file = open(input_data_file_name, 'r')
        input_data = csv.DictReader(input_data_file,
                                    delimiter=input_field_delimiter)

        # iterate over input data records
        record_num = 0
        for original_record in input_data:

            record_num += 1
            # blank separator line; print('') behaves the same on Python 2 and 3,
            # whereas a bare `print` is a no-op expression on Python 3
            print('')
            timestamp('Reading input record {0:03d}.'.format(record_num))

            # @END read_input_data_records

            ##############################################################################################
            # @BEGIN extract_record_fields
            # @IN original_record
            # @OUT original_scientific_name
            # @OUT original_authorship

            # extract values of fields to be validated
            original_scientific_name = original_record['scientificName']
            original_authorship = original_record['scientificNameAuthorship']

            # @END extract_record_fields

            ##############################################################################################
            # @BEGIN find_matching_worms_record
            # @IN original_scientific_name
            # @OUT matching_worms_record
            # @OUT worms_lsid

            worms_match_result = None
            worms_lsid = None

            # first try exact match of the scientific name against WoRMs
            timestamp(
                "Trying WoRMs EXACT match for scientific name: '{0}'.".format(
                    original_scientific_name))
            matching_worms_record = worms.aphia_record_by_exact_taxon_name(
                original_scientific_name)
            if matching_worms_record is not None:
                timestamp('WoRMs EXACT match was SUCCESSFUL.')
                worms_match_result = 'exact'

            # otherwise try a fuzzy match
            else:
                timestamp('EXACT match FAILED.')
                timestamp(
                    "Trying WoRMs FUZZY match for scientific name: '{0}'.".format(
                        original_scientific_name))
                matching_worms_record = worms.aphia_record_by_fuzzy_taxon_name(
                    original_scientific_name)
                if matching_worms_record is not None:
                    timestamp('WoRMs FUZZY match was SUCCESSFUL.')
                    worms_match_result = 'fuzzy'
                else:
                    timestamp('WoRMs FUZZY match FAILED.')

            # if either match succeeds extract the LSID for the taxon
            if matching_worms_record is not None:
                worms_lsid = matching_worms_record['lsid']

            # @END find_matching_worms_record

            ##############################################################################################
            # @BEGIN reject_records_not_in_worms
            # @IN original_record
            # @IN matching_worms_record
            # @PARAM rejected_data_file_name
            # @PARAM output_field_delimiter
            # @OUT rejected_data @FILE file:{rejected_data_file_name}

            # reject the current record if not matched successfully against WoRMs
            if worms_match_result is None:

                # open file for storing rejected data if not already open
                if rejected_data is None:
                    rejected_data_file = open(rejected_data_file_name, 'w')
                    rejected_data = csv.DictWriter(
                        rejected_data_file,
                        input_data.fieldnames,
                        delimiter=output_field_delimiter)
                    rejected_data.writeheader()

                # write current record to rejects file
                timestamp('REJECTED record {0:03d}.'.format(record_num))
                rejected_data.writerow(original_record)
                rejected_record_count += 1

                # skip to processing of next record in input file
                continue

            # @END reject_records_not_in_worms

            ##############################################################################################
            # @BEGIN update_scientific_name
            # @IN original_scientific_name
            # @IN matching_worms_record
            # @OUT updated_scientific_name

            updated_scientific_name = None

            # get scientific name from WoRMs record if the taxon name match was fuzzy
            if worms_match_result == 'fuzzy':
                updated_scientific_name = matching_worms_record['scientificname']

            # @END update_scientific_name

            ##############################################################################################
            # @BEGIN update_authorship
            # @IN matching_worms_record
            # @IN original_authorship
            # @OUT updated_authorship

            updated_authorship = None

            # get the scientific name authorship from the WoRMS record if different from input record
            worms_name_authorship = matching_worms_record['authority']
            if worms_name_authorship != original_authorship:
                updated_authorship = worms_name_authorship

            # @END update_authorship

            ##############################################################################################
            # @BEGIN compose_cleaned_record
            # @IN original_record
            # @IN worms_lsid
            # @IN updated_scientific_name
            # @IN original_scientific_name
            # @IN updated_authorship
            # @IN original_authorship
            # @OUT cleaned_record

            # the cleaned record is the input record itself, extended in place
            cleaned_record = original_record
            cleaned_record['LSID'] = worms_lsid
            cleaned_record['WoRMsMatchResult'] = worms_match_result

            if updated_scientific_name is not None:
                timestamp("UPDATING scientific name from '{0}' to '{1}'.".format(
                    original_scientific_name, updated_scientific_name))
                cleaned_record['scientificName'] = updated_scientific_name
                cleaned_record['originalScientificName'] = original_scientific_name

            if updated_authorship is not None:
                timestamp(
                    "UPDATING scientific name authorship from '{0}' to '{1}'.".
                    format(original_authorship, updated_authorship))
                cleaned_record['scientificNameAuthorship'] = updated_authorship
                cleaned_record['originalAuthor'] = original_authorship

            # @END compose_cleaned_record

            ##############################################################################################
            # @BEGIN write_clean_data_set
            # @PARAM cleaned_data_file_name
            # @PARAM output_field_delimiter
            # @IN cleaned_record
            # @OUT cleaned_data  @FILE file:{cleaned_data_file_name}

            # open file for storing cleaned data if not already open
            if cleaned_data is None:
                cleaned_record_field_names = input_data.fieldnames + \
                    ['LSID', 'WoRMsMatchResult', 'originalScientificName', 'originalAuthor']
                cleaned_data_file = open(cleaned_data_file_name, 'w')
                cleaned_data = csv.DictWriter(cleaned_data_file,
                                              cleaned_record_field_names,
                                              delimiter=output_field_delimiter)
                cleaned_data.writeheader()

            timestamp('ACCEPTED record {0:03d}.'.format(record_num))
            cleaned_data.writerow(cleaned_record)
            accepted_record_count += 1

            # @END write_clean_data_set

    finally:
        # close (and thereby flush) every file that was actually opened
        for opened_file in (input_data_file, cleaned_data_file, rejected_data_file):
            if opened_file is not None:
                opened_file.close()

    print('')
    timestamp("Wrote {0} accepted records to '{1}'.".format(
        accepted_record_count, cleaned_data_file_name))
    timestamp("Wrote {0} rejected records to '{1}'.".format(
        rejected_record_count, rejected_data_file_name))
def clean_data_using_worms(
    input_data_file_name,
    cleaned_data_file_name,
    rejected_data_file_name,
    input_field_delimiter=',',
    output_field_delimiter=','
    ):
    """
    Clean the scientific names and authorships in a CSV file using WoRMS.

    For each input record the 'scientificName' value is matched against
    WoRMS, exactly at first and fuzzily if the exact match fails.  A record
    with no match is copied unchanged to the rejected-data file.  A matched
    record goes to the cleaned-data file with the added columns 'LSID' and
    'WoRMsMatchResult'.  A fuzzy match replaces the scientific name with the
    WoRMS one, and an authority that differs from the input replaces the
    authorship; the replaced values are stored in 'originalScientificName'
    and 'originalAuthor'.

    An output file is only created once the first record for it turns up.
    All files opened by this function are closed before the summary messages
    are emitted, and also if an error interrupts processing.

    Args:
        input_data_file_name: path of the CSV file to read; it must provide
            the columns 'scientificName' and 'scientificNameAuthorship'.
        cleaned_data_file_name: path of the CSV file for accepted records.
        rejected_data_file_name: path of the CSV file for rejected records.
        input_field_delimiter: field delimiter of the input file.
        output_field_delimiter: field delimiter of both output files.

    Returns:
        None.  Progress is reported through timestamp().
    """

    worms = WoRMSService()
    accepted_record_count = 0
    rejected_record_count = 0

    # Writers are created on demand inside the loop; they and their file
    # objects start out as None (rather than being detected via locals())
    # so that the finally clause knows exactly which files need closing.
    input_data_file = None
    cleaned_data = None
    cleaned_data_file = None
    rejected_data = None
    rejected_data_file = None

    try:

        ##############################################################################################
        # @BEGIN read_input_data_records
        # @PARAM input_data_file_name
        # @PARAM input_field_delimiter
        # @IN input_data @FILE file:{input_data_file_name}
        # @OUT original_record

        # create CSV reader for input records
        timestamp("Reading input records from '{0}'.".format(input_data_file_name))
        # NOTE(review): file modes are kept as in the original; under Python 3
        # the csv module recommends opening files with newline='' - confirm
        # the target interpreter before changing them.
        input_data_file = open(input_data_file_name, 'r')
        input_data = csv.DictReader(input_data_file,
                                    delimiter=input_field_delimiter)

        # iterate over input data records
        record_num = 0
        for original_record in input_data:

            record_num += 1
            # blank separator line; print('') behaves the same on Python 2 and 3,
            # whereas a bare `print` is a no-op expression on Python 3
            print('')
            timestamp('Reading input record {0:03d}.'.format(record_num))

            # @END read_input_data_records

            ##############################################################################################
            # @BEGIN extract_record_fields
            # @IN original_record
            # @OUT original_scientific_name
            # @OUT original_authorship

            # extract values of fields to be validated
            original_scientific_name = original_record['scientificName']
            original_authorship = original_record['scientificNameAuthorship']

            # @END extract_record_fields

            ##############################################################################################
            # @BEGIN find_matching_worms_record
            # @IN original_scientific_name
            # @OUT matching_worms_record
            # @OUT worms_lsid

            worms_match_result = None
            worms_lsid = None

            # first try exact match of the scientific name against WoRMs
            timestamp("Trying WoRMs EXACT match for scientific name: '{0}'.".format(original_scientific_name))
            matching_worms_record = worms.aphia_record_by_exact_taxon_name(original_scientific_name)
            if matching_worms_record is not None:
                timestamp('WoRMs EXACT match was SUCCESSFUL.')
                worms_match_result = 'exact'

            # otherwise try a fuzzy match
            else:
                timestamp('EXACT match FAILED.')
                timestamp("Trying WoRMs FUZZY match for scientific name: '{0}'.".format(original_scientific_name))
                matching_worms_record = worms.aphia_record_by_fuzzy_taxon_name(original_scientific_name)
                if matching_worms_record is not None:
                    timestamp('WoRMs FUZZY match was SUCCESSFUL.')
                    worms_match_result = 'fuzzy'
                else:
                    timestamp('WoRMs FUZZY match FAILED.')

            # if either match succeeds extract the LSID for the taxon
            if matching_worms_record is not None:
                worms_lsid = matching_worms_record['lsid']

            # @END find_matching_worms_record

            ##############################################################################################
            # @BEGIN reject_records_not_in_worms
            # @IN original_record
            # @IN matching_worms_record
            # @PARAM rejected_data_file_name
            # @PARAM output_field_delimiter
            # @OUT rejected_data @FILE file:{rejected_data_file_name}

            # reject the current record if not matched successfully against WoRMs
            if worms_match_result is None:

                # open file for storing rejected data if not already open
                if rejected_data is None:
                    rejected_data_file = open(rejected_data_file_name, 'w')
                    rejected_data = csv.DictWriter(rejected_data_file,
                                                   input_data.fieldnames,
                                                   delimiter=output_field_delimiter)
                    rejected_data.writeheader()

                # write current record to rejects file
                timestamp('REJECTED record {0:03d}.'.format(record_num))
                rejected_data.writerow(original_record)
                rejected_record_count += 1

                # skip to processing of next record in input file
                continue

            # @END reject_records_not_in_worms

            ##############################################################################################
            # @BEGIN update_scientific_name
            # @IN original_scientific_name
            # @IN matching_worms_record
            # @OUT updated_scientific_name

            updated_scientific_name = None

            # get scientific name from WoRMs record if the taxon name match was fuzzy
            if worms_match_result == 'fuzzy':
                updated_scientific_name = matching_worms_record['scientificname']

            # @END update_scientific_name

            ##############################################################################################
            # @BEGIN update_authorship
            # @IN matching_worms_record
            # @IN original_authorship
            # @OUT updated_authorship

            updated_authorship = None

            # get the scientific name authorship from the WoRMS record if different from input record
            worms_name_authorship = matching_worms_record['authority']
            if worms_name_authorship != original_authorship:
                updated_authorship = worms_name_authorship

            # @END update_authorship

            ##############################################################################################
            # @BEGIN compose_cleaned_record
            # @IN original_record
            # @IN worms_lsid
            # @IN updated_scientific_name
            # @IN original_scientific_name
            # @IN updated_authorship
            # @IN original_authorship
            # @OUT cleaned_record

            # the cleaned record is the input record itself, extended in place
            cleaned_record = original_record
            cleaned_record['LSID'] = worms_lsid
            cleaned_record['WoRMsMatchResult'] = worms_match_result

            if updated_scientific_name is not None:
                timestamp("UPDATING scientific name from '{0}' to '{1}'.".format(
                    original_scientific_name, updated_scientific_name))
                cleaned_record['scientificName'] = updated_scientific_name
                cleaned_record['originalScientificName'] = original_scientific_name

            if updated_authorship is not None:
                timestamp("UPDATING scientific name authorship from '{0}' to '{1}'.".format(
                    original_authorship, updated_authorship))
                cleaned_record['scientificNameAuthorship'] = updated_authorship
                cleaned_record['originalAuthor'] = original_authorship

            # @END compose_cleaned_record

            ##############################################################################################
            # @BEGIN write_clean_data_set
            # @PARAM cleaned_data_file_name
            # @PARAM output_field_delimiter
            # @IN cleaned_record
            # @OUT cleaned_data  @FILE file:{cleaned_data_file_name}

            # open file for storing cleaned data if not already open
            if cleaned_data is None:
                cleaned_record_field_names = input_data.fieldnames + \
                    ['LSID', 'WoRMsMatchResult', 'originalScientificName', 'originalAuthor']
                cleaned_data_file = open(cleaned_data_file_name, 'w')
                cleaned_data = csv.DictWriter(cleaned_data_file,
                                              cleaned_record_field_names,
                                              delimiter=output_field_delimiter)
                cleaned_data.writeheader()

            timestamp('ACCEPTED record {0:03d}.'.format(record_num))
            cleaned_data.writerow(cleaned_record)
            accepted_record_count += 1

            # @END write_clean_data_set

    finally:
        # close (and thereby flush) every file that was actually opened
        for opened_file in (input_data_file, cleaned_data_file, rejected_data_file):
            if opened_file is not None:
                opened_file.close()

    print('')
    timestamp("Wrote {0} accepted records to '{1}'.".format(accepted_record_count, cleaned_data_file_name))
    timestamp("Wrote {0} rejected records to '{1}'.".format(rejected_record_count, rejected_data_file_name))
예제 #4
0
 def __init__(self):
     """Create a WoRMSService instance and keep it on this object as `_worms`."""
     worms_service = WoRMSService()
     self._worms = worms_service