def darwinize_list(termlist, dwccloudfile, namespace=None):
    ''' Translate the terms in a list to standard Darwin Core terms.
    parameters:
        termlist - list of values to translate (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        namespace - a flag to prepend the Darwin Cloud namespace to translated terms; any
            value containing 'y' enables it (optional; default None)
    returns:
        a list with all translatable terms translated
    '''
    functionname = 'darwinize_list()'

    if termlist is None or len(termlist) == 0:
        s = 'No termlist given in %s.' % functionname
        logging.debug(s)
        return None

    dialect = csv_file_dialect(dwccloudfile)

    # No need to check whether dwccloudfile is given and exists;
    # darwin_cloud_vocab_dict_from_file() does that.
    darwinclouddict = darwin_cloud_vocab_dict_from_file(dwccloudfile)

    if darwinclouddict is None:
        s = 'No Darwin Cloud terms in %s.' % functionname
        logging.debug(s)
        return None

    thelist = []
    for term in termlist:
        thelist.append(ustripstr(term))

    addnamespace = False
    if namespace is not None and 'y' in namespace:
        addnamespace = True

    darwinizedlist = []
    i = 0
    j = 1
    for term in thelist:
        if term in darwinclouddict:
            if darwinclouddict[term]['standard'] is not None and \
                len(darwinclouddict[term]['standard'].strip()) > 0:
                if addnamespace == True:
                    ns = darwinclouddict[term]['namespace']
                    newterm = ns + ':' + darwinclouddict[term]['standard']
                else:
                    newterm = darwinclouddict[term]['standard']
            else:
                newterm = termlist[i].strip()
        else:
            newterm = termlist[i].strip()
            if len(newterm) == 0:
                newterm = 'UNNAMED_COLUMN_%s' % j
                j += 1
        darwinizedlist.append(newterm)
        i += 1

    return darwinizedlist
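
# A minimal usage sketch (not part of the original module): assuming a Darwin Cloud
# vocabulary file at the hypothetical path 'vocabularies/darwin_cloud.txt' with populated
# 'standard' and 'namespace' columns, darwinize_list() could standardize a messy header
# like the one below. Field names that match the vocabulary are replaced by their standard
# Darwin Core terms, unmatched names are passed through stripped, and blank names become
# UNNAMED_COLUMN_n.
def _example_darwinize_list():
    messyheader = ['ID ', 'scientific name', 'Collector', '']
    dwccloudfile = 'vocabularies/darwin_cloud.txt'  # hypothetical path
    # With namespace='y', matched terms are prefixed with the namespace recorded in the
    # vocabulary, e.g., 'dwc:scientificName' instead of 'scientificName'.
    return darwinize_list(messyheader, dwccloudfile, namespace='y')
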
def terms_not_in_darwin_cloud(checklist,
                              dwccloudfile,
                              encoding=None,
                              vetted=True,
                              casesensitive=False):
    ''' Get the list of distinct values in a checklist that are not in the Darwin Cloud
        vocabulary. Verbatim values in the Darwin Cloud vocabulary should be lower-case and
        stripped already, so that is what must be matched here. The Darwin Cloud vocabulary
        should have the case-sensitive standard value.
    parameters:
        checklist - list of values to check against the target list (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        encoding - a string designating the input file encoding (optional; default None)
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
        vetted - set to False if unvetted values should also be returned (optional;
            default True)
        casesensitive - True if the test for inclusion is case sensitive (optional;
            default False)
    returns:
        a sorted list of distinct new values not in the Darwin Cloud vocabulary
    '''
    functionname = 'terms_not_in_darwin_cloud()'

    if checklist is None or len(checklist) == 0:
        s = 'No checklist given in %s.' % functionname
        logging.debug(s)
        return None

    dialect = csv_file_dialect(dwccloudfile)

    # Try to determine the encoding of the inputfile.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(dwccloudfile)
        # csv_file_encoding() always returns an encoding if there is an input file.

    # No need to check if dwccloudfile is given and exists, vocab_dict_from_file() and
    # vetted_vocab_dict_from_file() do that.
    if vetted == True:
        darwinclouddict = vetted_vocab_dict_from_file(dwccloudfile,
                                                      'fieldname',
                                                      dialect=dialect,
                                                      encoding=encoding)
    else:
        darwinclouddict = vocab_dict_from_file(dwccloudfile,
                                               'fieldname',
                                               dialect=dialect,
                                               encoding=encoding)

    dwcloudlist = []
    for key in darwinclouddict:
        dwcloudlist.append(key)

    if casesensitive == True:
        return not_in_list(dwcloudlist, checklist)

    lowerdwclist = []
    for term in dwcloudlist:
        lowerdwclist.append(ustripstr(term))

    notfound = not_in_list(lowerdwclist, checklist, function=ustripstr)

    return notfound
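
# A hedged usage sketch (not part of the original module): given the same hypothetical
# Darwin Cloud vocabulary file, terms_not_in_darwin_cloud() reports which of a file's
# field names have no entry in the vocabulary and therefore need new recommendations.
def _example_terms_not_in_darwin_cloud():
    fieldnames = ['catalogNumber', 'collector number', 'lat', 'long']
    dwccloudfile = 'vocabularies/darwin_cloud.txt'  # hypothetical path
    # With vetted=True (the default), only vetted vocabulary entries count as known,
    # so unvetted field names are returned along with genuinely new ones.
    return terms_not_in_darwin_cloud(fieldnames, dwccloudfile)
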
def terms_not_in_dwc(checklist, casesensitive=False):
    ''' From a list of terms, get those that are not Darwin Core terms.
    parameters:
        checklist - list of values to check against Darwin Core (required)
        casesensitive - True if the test for inclusion is case sensitive (optional; 
            default False)
    returns:
        a sorted list of non-Darwin Core terms from the checklist
    '''
    # No need to check if checklist is given, not_in_list() does that
    if casesensitive == True:
        return not_in_list(simpledwctermlist, checklist)

    lowerdwc = []
    for term in simpledwctermlist:
        lowerdwc.append(ustripstr(term))

    notfound = not_in_list(lowerdwc, checklist, function=ustripstr)
    return notfound
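
# A minimal sketch (not part of the original module) of the case sensitivity behavior
# of terms_not_in_dwc(). The input values are illustrative only.
def _example_terms_not_in_dwc():
    fieldnames = ['basisofrecord', 'country', 'Collector']
    # Case-insensitive check: 'basisofrecord' matches the Darwin Core term
    # basisOfRecord, so it is not reported; 'Collector' is not a Darwin Core term
    # under any capitalization, so it is.
    relaxed = terms_not_in_dwc(fieldnames, casesensitive=False)
    # Case-sensitive check: 'basisofrecord' no longer matches basisOfRecord.
    strict = terms_not_in_dwc(fieldnames, casesensitive=True)
    return relaxed, strict
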
def term_standardizer_report(inputfile,
                             reportfile,
                             vocabfile,
                             key,
                             separator=None,
                             encoding=None,
                             format=None):
    ''' Write a file in which the fields given in the key are replaced by standard values
        from a vocabulary file, with fields appended to preserve the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Try to determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile,
                              dialect=inputdialect,
                              encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile,
                                                                 functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Assume none of the fields is in the file
    headerhaskey = False

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)
    for field in fieldlist:
        if field in cleanedinputheader:
            headerhaskey = True
            break

    if headerhaskey == False:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(vocabfile) == False:
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8', \
        separator=separator, function=ustripstr)
    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    # Determine the dialect of the report file from the requested format
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header, with a field appended
    # for each field in the key: field_orig if the field is already in the header,
    # otherwise the field itself
    outputheader = cleanedinputheader
    for field in fieldlist:
        if field in outputheader:
            outputheader = outputheader + [field + '_orig']
        else:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if os.path.isfile(reportfile) == False:
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile,
                                dialect=inputdialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get dictionary for recommended value for the ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                # Get the standard value
                standard = newvaluedict['standard']

                # Treat standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    i = 0
                    # Update or add new value to field in the fieldlist
                    for field in fieldlist:
                        row[field] = newvalues[i]
                        i += 1

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
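
# A hedged usage sketch (not part of the original module): term_standardizer_report()
# could be used to standardize a composite key such as 'country|stateProvince' in an
# occurrence file. All file paths below are hypothetical.
def _example_term_standardizer_report():
    inputfile = 'data/occurrences.txt'                   # hypothetical input file
    reportfile = 'reports/occurrences_standardized.txt'  # hypothetical output file
    vocabfile = 'vocabularies/country_state.txt'         # hypothetical vocabulary file
    # Rows whose 'country|stateProvince' value matches a vocabulary entry (after
    # upper-casing and stripping) get the recommended standard values; the originals
    # are preserved in country_orig and stateProvince_orig fields appended to the output.
    success = term_standardizer_report(inputfile, reportfile, vocabfile,
                                       'country|stateProvince', separator='|',
                                       format='txt')
    return success
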
def missing_vocab_list_from_file(checklist,
                                 vocabfile,
                                 key,
                                 separator=None,
                                 dialect=None,
                                 encoding=None):
    ''' Given a checklist of values, get values not found in the given vocabulary file. 
       Values can match exactly, or they can match after making them upper case and 
       stripping whitespace.
    parameters:
        checklist - list of values to get from the vocabfile (required)
        vocabfile - full path to the vocabulary lookup file (required)
        key - the field or separator-separated fieldnames that hold the distinct values 
              in the vocabulary file (required)
        separator - string to use as the value separator in the string 
            (optional; default None)
        dialect - csv.dialect object with the attributes of the vocabulary lookup file 
            (default None)
        encoding - a string designating the input file encoding (optional; default None) 
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
    returns:
        a sorted list of distinct checklist values (upper-cased and stripped) that were 
            not found in the vocabulary file
    '''
    functionname = 'missing_vocab_list_from_file()'

    if checklist is None or len(checklist) == 0:
        s = 'No list of values given in %s.' % functionname
        logging.debug(s)
        return None

    vocabdict = vocab_dict_from_file(vocabfile, key, separator, dialect,
                                     encoding)
    if vocabdict is None or len(vocabdict) == 0:
        s = 'No vocabdict constructed in %s.' % functionname
        logging.debug(s)
        return None

    missingvocabset = set()

    # Look through every value in the checklist
    for value in checklist:
        if separator is None:
            terms = [value]
        else:
            try:
                terms = value.split(separator)
            except Exception, e:
                s = 'Exception splitting value: %s Exception: %s ' % (value, e)
                s += 'in %s' % functionname
                logging.debug(s)
                terms = [value]  # cop out
        newvalue = ''
        n = 0
        for term in terms:
            if n == 0:
                newvalue = ustripstr(term)
                n = 1
            else:
                newvalue = newvalue + separator + ustripstr(term)
        # If either the verbatim value or its normalized form is in the vocabulary,
        # it is not missing
        if value in vocabdict or newvalue in vocabdict:
            pass
        # Otherwise, add the upper-cased, stripped value to the set of missing values
        else:
            missingvocabset.add(newvalue)

    return sorted(list(missingvocabset))
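
# A minimal sketch (not part of the original module): missing_vocab_list_from_file()
# could be used to find which values in a data set still need vocabulary entries.
# The vocabulary path, key, and values below are illustrative only.
def _example_missing_vocab_list_from_file():
    values = ['Preserved Specimen', 'fossilspecimen', 'observation ']
    vocabfile = 'vocabularies/basisOfRecord.txt'  # hypothetical path
    # Values are compared both verbatim and after upper-casing and stripping, so only
    # values absent in both forms are returned (upper-cased and stripped).
    return missing_vocab_list_from_file(values, vocabfile, 'basisOfRecord')
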
def matching_vocab_dict_from_file(checklist,
                                  vocabfile,
                                  key,
                                  separator=None,
                                  dialect=None,
                                  encoding=None):
    ''' Given a checklist of values, get matching values from a vocabulary file. Values
       can match exactly, or they can match after making them upper case and stripping 
       whitespace.
    parameters:
        checklist - list of values to get from the vocabfile (required)
        vocabfile - full path to the vocabulary lookup file (required)
        key - the field or separator-separated fieldnames that hold the distinct values 
            in the vocabulary file (required)
        separator - string to use as the value separator in the string 
            (optional; default None)
        dialect - csv.dialect object with the attributes of the vocabulary lookup file 
            (default None)
        encoding - a string designating the input file encoding (optional; default None) 
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
    returns:
        matchingvocabdict - dictionary of complete vocabulary records matching the values 
            in the checklist
    '''
    functionname = 'matching_vocab_dict_from_file()'

    if checklist is None or len(checklist) == 0:
        s = 'No list of values given in %s.' % functionname
        logging.debug(s)
        return None

    #print 'checklist: %s' % checklist

    vocabdict = vocab_dict_from_file(vocabfile, key, separator, dialect,
                                     encoding)
    if vocabdict is None or len(vocabdict) == 0:
        s = 'No vocabdict constructed in %s' % functionname
        logging.debug(s)
        return None

    #print 'vocabdict: %s' % vocabdict

    matchingvocabdict = {}

    # Look through every value in the checklist
    for value in checklist:
        if separator is None:
            terms = [value]
        else:
            try:
                terms = value.split(separator)
            except Exception, e:
                s = 'Exception splitting value: %s Exception: %s ' % (value, e)
                s += 'in %s' % functionname
                logging.debug(s)
                terms = [value]  # cop out
        newvalue = ''
        n = 0
        for term in terms:
            if n == 0:
                newvalue = ustripstr(term)
                n = 1
            else:
                newvalue = newvalue + separator + ustripstr(term)

        # If the verbatim value or its normalized form is in the dictionary, record the
        # vocabulary entry for it under the verbatim value
        if newvalue in vocabdict:
            matchingvocabdict[value] = vocabdict[newvalue]
        elif value in vocabdict:
            matchingvocabdict[value] = vocabdict[value]