示例#1
0
def term_list_report(reportfile, termlist, key, separator=None, format=None):
    ''' Write a vocabulary-style report holding one row for every term in a list.
    parameters:
        reportfile - full path to the output report file (required)
        termlist - list of terms to report (required)
        key - the field or separator-separated fieldnames that hold the distinct values
            in the vocabulary file (required)
        separator - string to use as the value separator in the key
            (optional; default None)
        format - string signifying the csv.dialect of the report file. None or 'csv'
            (any case) selects the csv dialect, anything else the tsv dialect (optional)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_list_report()'

    # Without terms there is nothing to report.
    if termlist is None or len(termlist) == 0:
        logging.debug('No term list given in %s.' % functionname)
        return False

    # Without a file name there is nowhere to report to.
    if reportfile is None or len(reportfile) == 0:
        logging.debug('No recommendation file name given in %s.' % functionname)
        return False

    fieldnames = vocabheader(key, separator)

    wantscsv = format is None or format.lower() == 'csv'
    dialect = csv_dialect() if wantscsv else tsv_dialect()

    # write_header() creates the report file; the term rows are appended afterwards.
    write_header(reportfile, fieldnames, dialect)

    if not os.path.isfile(reportfile):
        logging.debug('reportfile: %s not created in %s.' % (reportfile, functionname))
        return False

    with open(reportfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, dialect=dialect, encoding='utf-8',
            fieldnames=fieldnames)
        for term in termlist:
            row = {key: term, 'standard': '', 'vetted': '0'}
            keyfields = [key] if separator is None else key.split(separator)
            # For a composite key, every component field receives the term as well.
            if len(keyfields) > 1:
                row.update(dict.fromkeys(keyfields, term))
            writer.writerow(row)

    logging.debug('Report written to %s in %s.' % (reportfile, functionname))
    return True
示例#2
0
def term_completeness_report(reportfile, fieldcountdict, format=None):
    ''' Write a report with a list of fields and the number of times they are populated.
    parameters:
        reportfile - full path to the output report file (required)
        fieldcountdict - dictionary of field names and the number of rows in which they
            are populated in the inputfile (required)
        format - string signifying the csv.dialect of the report file. None or 'csv'
            (any case) selects the csv dialect, anything else the tsv dialect (optional)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_completeness_report()'

    if fieldcountdict is None or len(fieldcountdict) == 0:
        s = 'No field count dictionary given in %s.' % functionname
        logging.debug(s)
        return False

    if reportfile is None or len(reportfile) == 0:
        s = 'No recommendation file name given in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'csv':
        dialect = csv_dialect()
    else:
        dialect = tsv_dialect()

    # Alphabetically sorted list of field names. Iterating a dictionary yields its
    # keys on both Python 2 and Python 3, unlike iteritems(), which is Python 2 only.
    fieldlist = sorted(fieldcountdict)

    outputheader = ['field', 'count']
    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, dialect)

    if not os.path.isfile(reportfile):
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Append one row per field, in alphabetical order, after the header
    with open(reportfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile,
                                dialect=dialect,
                                encoding='utf-8',
                                fieldnames=outputheader)
        for field in fieldlist:
            writer.writerow({'field': field, 'count': fieldcountdict[field]})

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def term_value_count_report(reportfile, termcountlist, termname='value', format=None):
    ''' Write a report of the counts of values for the term.
    parameters:
        reportfile - full path to the output report file (required)
        termcountlist - list of terms with counts; for each item, item[0] is written to
            the termname column and item[1] to the 'count' column (required)
        termname - name of the term for which counts were made (optional; default 'value')
        format - string signifying the csv.dialect of the report file. None or 'csv'
            (any case) selects the csv dialect, anything else the tsv dialect (optional)
    returns:
        success - True if report was written or if there is nothing to write, else False
    '''
    functionname = 'term_value_count_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No report file given in %s.' % functionname
        logging.debug(s)
        return False

    # An empty list is not an error: there is simply nothing to write.
    if termcountlist is None or len(termcountlist) == 0:
        s = 'No term count list given in %s.' % functionname
        logging.debug(s)
        return True

    # Compare case-insensitively, as the other report writers do, so that 'CSV' is not
    # silently treated as a request for tab-separated output.
    if format is None or format.lower() == 'csv':
        dialect = csv_dialect()
    else:
        dialect = tsv_dialect()

    countreporttermlist = [termname, 'count']

    # Create the outputfile and write the new header to it
    write_header(reportfile, countreporttermlist, dialect)

    if not os.path.isfile(reportfile):
        s = 'reportfile: %s not created in %s' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Append one row per (value, count) item after the header
    with open(reportfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, dialect=dialect,
            fieldnames=countreporttermlist)
        for item in termcountlist:
            writer.writerow({termname: item[0], 'count': item[1]})

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def text_file_field_stripper(options):
    ''' Filter a text file into a new file that keeps only a given list of fields.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written
            (optional; default './', also used when None or empty)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - string with the field, or separator-separated fields, to extract
            from the input file. It is split on separator when one is given, otherwise
            it is used whole as a single field name (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    termlist = None
    separator = None
    encoding = None

    ### Required inputs ###
    # Options are read best-effort: a missing key (KeyError) or an options object that
    # does not support lookup (TypeError) leaves the default in place.
    try:
        workspace = options['workspace']
    except (KeyError, TypeError):
        pass

    # A missing workspace falls back to the current directory. An empty string would
    # otherwise turn the output path into '/<outputfile>' at the filesystem root.
    if workspace is None or len(workspace) == 0:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except (KeyError, TypeError):
        pass

    if inputfile is None or len(inputfile)==0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except (KeyError, TypeError):
        pass

    if termlist is None or len(termlist)==0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        separator = options['separator']
    except (KeyError, TypeError):
        pass

    try:
        encoding = options['encoding']
    except (KeyError, TypeError):
        pass

    # Determine the file dialect
    inputdialect = csv_file_dialect(inputfile)

    # Determine the file encoding
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    # Read the header of the inputfile
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    # Make a clean version of the input header
    cleaninputheader = clean_header(header)

    try:
        format = options['format']
    except (KeyError, TypeError):
        pass

    try:
        outputfile = options['outputfile']
    except (KeyError, TypeError):
        pass

    if outputfile is None or len(outputfile)==0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Prepare the outputfile
    if format is None or format.lower()=='txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Without a usable separator the whole termlist is treated as one field name
    if separator is None or len(separator.strip())==0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if not os.path.isfile(outputfile):
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=cleanoutputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
示例#5
0
def term_standardizer_report(inputfile,
                             reportfile,
                             vocabfile,
                             key,
                             separator=None,
                             encoding=None,
                             format=None):
    ''' Write a file with substitutions from a vocabfile for fields in a key and appended
        terms showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Validate the vocabulary file before doing any work on the input file, so that a
    # missing vocabulary fails fast.
    if vocabfile is None or len(vocabfile) == 0:
        # The message must be formatted before it is handed to logging.debug().
        # Applying % to the call's return value (None) raises a TypeError.
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(vocabfile):
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile,
                              dialect=inputdialect,
                              encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile,
                                                                 functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)
    headerhaskey = any(field in cleanedinputheader for field in fieldlist)

    if not headerhaskey:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8',
        separator=separator, function=ustripstr)
    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields. A key field already in
    # the header gets a companion '<field>_orig' column; a key field that is absent is
    # added under its own name. Concatenation builds new lists, so the cleaned input
    # header itself is never modified.
    outputheader = cleanedinputheader
    for field in fieldlist:
        if field in outputheader:
            outputheader = outputheader + [field + '_orig']
        else:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile,
                                dialect=inputdialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get dictionary for recommended value for the ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                # Get the standard value
                standard = newvaluedict['standard']

                # Treat standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    # Update or add new value to field in the fieldlist
                    for field, newvalue in zip(fieldlist, newvalues):
                        row[field] = newvalue

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
示例#6
0
def uuid_term_appender(inputfile,
                       outputfile,
                       key,
                       guidtype=None,
                       encoding=None,
                       format=None):
    ''' Copy the input file to the output file with one extra field appended, filled
        in every row with a freshly generated global unique identifier (GUID).
    parameters:
        inputfile - full path to the input file (required)
        outputfile - full path to the output file (required)
        key - name of the field to add; must not already be in the input header
            (required)
        guidtype - type of GUID to use to populate the key; passed to get_guid()
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the output file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the output file was written, else False
    '''
    functionname = 'uuid_term_appender()'

    # Argument checks: both file names are needed and the input must exist.
    if outputfile is None or len(outputfile) == 0:
        logging.debug('No outputfile name given in %s.' % functionname)
        return False

    if inputfile is None or len(inputfile) == 0:
        logging.debug('No inputfile file given in %s.' % functionname)
        return False

    if not os.path.isfile(inputfile):
        logging.debug('Inputfile file %s not found in %s.' % (inputfile, functionname))
        return False

    # Sniff the dialect, and the encoding too unless the caller supplied one.
    inputdialect = csv_file_dialect(inputfile)
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        logging.debug('Unable to read header from input file %s in %s.'
                      % (inputfile, functionname))
        return False

    if key is None or len(key.strip()) == 0:
        logging.debug('No key given in %s.' % functionname)
        return False

    # Refuse to overwrite a field that is already present in the input.
    if key in inputheader:
        parts = ['field %s ' % key,
                 'already exists in file %s ' % inputfile,
                 'in %s.' % functionname]
        logging.debug(''.join(parts))
        return False

    wantstsv = format is None or format.lower() == 'txt'
    outputdialect = tsv_dialect() if wantstsv else csv_dialect()

    # The output header is the input header followed by the new GUID field.
    outputheader = inputheader + [key]

    # write_header() creates the output file; the data rows are appended afterwards.
    write_header(outputfile, outputheader, outputdialect)

    if not os.path.isfile(outputfile):
        logging.debug('outputfile: %s was not created in %s.'
                      % (outputfile, functionname))
        return False

    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=outputheader)
        rows = read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=inputheader)
        for row in rows:
            # One new GUID per row, stored in the appended field.
            row[key] = get_guid(guidtype)
            writer.writerow(row)

    logging.debug('Output file written to %s in %s.' % (outputfile, functionname))
    return True
示例#7
0
def term_setter_report(inputfile,
                       reportfile,
                       key,
                       constantvalues=None,
                       separator=None,
                       encoding=None,
                       format=None):
    ''' Write a file substituting constants for fields that already exist in an input file
        and with added fields with constants for fields that do not already exist in an
        inputfile. Field name matching is exact.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        constantvalues - value or separator-separated values to set the field(s) to;
            must split into as many values as key splits into fields (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_setter_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile,
                              dialect=inputdialect,
                              encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile,
                                                                 functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if constantvalues is None or len(constantvalues) == 0:
        s = 'No constantvalues given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Get the fields to set by splitting the key with the separator
    fields = key.split(separator)

    # Get the values to set by splitting the constantvalues with the separator
    addedvalues = constantvalues.split(separator)

    # Abort if there is a mismatch in the lengths of the field and constants lists
    if len(fields) != len(addedvalues):
        s = 'length of field list: %s ' % key
        s += 'does not match length of constants list: %s ' % constantvalues
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Start the outputheader from the inputheader and append the fields that are not
    # in it yet. Concatenation builds new lists, so inputheader itself is not modified.
    outputheader = inputheader
    for field in fields:
        if field not in outputheader:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        # Report the reportfile parameter; this function has no 'outputfile' name, and
        # referring to one here would raise a NameError instead of returning False.
        s = 'reportfile: %s was not created in %s.' % (reportfile,
                                                       functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows with fields set to constant values
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile,
                                dialect=inputdialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=inputheader):
            # Set every field in the key list to its matching constant. The two lists
            # were checked above to have the same length.
            for field, constant in zip(fields, addedvalues):
                row[field] = constant
            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
示例#8
0
def darwinize_header(options):
    ''' Translate field names from input file to Darwin Core field names in outputfile
        using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile
            (optional; default './', also used when None or empty)
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud
           terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt'). When not given, the dialect
            detected for the input file is reused for the output (optional)
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'n') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        workspace - path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    dwccloudfile = None
    outputfile = None
    encoding = None
    namespace = 'n'
    format = None

    ### Required inputs ###
    # Options are read best-effort: a missing key (KeyError) or an options object that
    # does not support lookup (TypeError) leaves the default in place.
    try:
        workspace = options['workspace']
    except (KeyError, TypeError):
        pass

    # A missing workspace falls back to the current directory. An empty string would
    # otherwise turn the output path into '/<outputfile>' at the filesystem root.
    if workspace is None or len(workspace) == 0:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except (KeyError, TypeError):
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        dwccloudfile = options['dwccloudfile']
    except (KeyError, TypeError):
        pass

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(dwccloudfile):
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except (KeyError, TypeError):
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except (KeyError, TypeError):
        pass

    # Detect the encoding only when the caller did not supply one
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    try:
        namespace = options['namespace']
    except (KeyError, TypeError):
        pass

    inputdialect = csv_file_dialect(inputfile)

    try:
        format = options['format']
    except (KeyError, TypeError):
        pass

    # With no format requested, the output keeps the dialect of the input
    if format is None or len(format) == 0:
        outputdialect = inputdialect
    elif format.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order. The writer uses the original header as
    # its fieldnames, so each value lands under the darwinized name written above at
    # the same position.
    # NOTE(review): presumably read_csv_row() yields dictionaries keyed by the
    # original header names - confirm against its definition.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
示例#9
0
def text_file_aggregator(options):
    ''' Join the contents of the files matching a path pattern. Headers and encodings are
        not assumed to be the same. Write a file containing the joined files with one
        header line.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputpath - glob pattern for the set of input files to aggregate, e.g.,
            ./workspace/thefiles*.txt (required)
        outputfile - name of the output file, without path (optional; if missing, a
            name of the form 'aggregate_<uuid>.<format>' is generated)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def option(name, default=None):
        # Best-effort lookup: a missing key or an options value that can not be
        # indexed (e.g., None) falls back to the default.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    workspace = option('workspace', './')
    inputpath = option('inputpath')
    outputfile = option('outputfile')
    format = option('format')

    # An explicitly empty workspace means the default location, not the file system root.
    if not workspace:
        workspace = './'

    ### Required inputs ###
    if not inputpath:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not format:
        format = 'txt'

    if format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    if not outputfile:
        # The format doubles as the file extension, so it needs the '.' separator.
        outputfile = 'aggregate_%s.%s' % (uuid.uuid1(), format)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)

    if aggregateheader is None:
        # Without field names the writer below can not produce a header line.
        message = 'Unable to create composite header. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        # extrasaction='ignore' lets rows carry fields that are not in the header.
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()
        for inputname in glob.glob(inputpath):
            # Each input file is read with its own detected dialect and encoding; the
            # output dialect stays the one chosen from the format option.
            inputdialect = csv_file_dialect(inputname)
            encoding = csv_file_encoding(inputname)
            with open(inputname, 'rU') as inputfile:
                reader = csv.DictReader(utf8_data_encoder(inputfile, encoding),
                    dialect=inputdialect, encoding=encoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                        aggregaterowcount += 1
                    except Exception:
                        # Stop at the first row that can not be written and report it.
                        message = 'failed to write line:\n%s\n' % line
                        message += 'from file %s. %s' % (inputname, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount, success,
                            message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)

    success = True
    artifacts['aggregated_file'] = outputfile
    returnvals = [workspace, outputfile, aggregaterowcount, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
示例#10
0
def composite_header_constructor(options):
    ''' Create a file with a header that contains the merged column names from the
        headers of two input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile1 - full path to one of the input files (optional; passed to
            read_header as given, None if missing)
        inputfile2 - full path to the second input file (optional; passed to
            read_header as given, None if missing)
        outputfile - name of the output file, without path (required)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - path to the directory used for the output file
        compositeheader - header combining two inputs
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'compositeheader', 'outputfile', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Custom outputs ###
    compositeheader = None

    def option(name, default=None):
        # Best-effort lookup: a missing key or an options value that can not be
        # indexed (e.g., None) falls back to the default.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    workspace = option('workspace', './')
    inputfile1 = option('inputfile1')
    inputfile2 = option('inputfile2')
    outputfile = option('outputfile')
    format = option('format', 'txt')

    # An explicitly empty workspace means the default location, not the file system root.
    if not workspace:
        workspace = './'

    ### Required inputs ###
    if not outputfile:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Read the headers of the two files and let read_header figure out the dialects and
    # encodings.
    header1 = read_header(inputfile1)
    header2 = read_header(inputfile2)

    compositeheader = merge_headers(header1, header2)

    # A missing or empty format means the default, 'txt'.
    if not format or format.lower() == 'txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    # Write the resulting header into outputfile
    success = write_header(outputfile, compositeheader, dialect)
    # Only a result equal to False from write_header is treated as a failure.
    if success == False:
        message = 'Header was not written. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Report the header as a plain list, whatever type merge_headers produced.
    if compositeheader is not None:
        compositeheader = list(compositeheader)

    artifacts['composite_header_file'] = outputfile

    returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)