def darwinize_header(options):
    ''' Translate field names from the input file to Darwin Core field names in the
    output file using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud
            terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt')
            (optional; default: same dialect as the input file)
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'n') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        workspace - path to the directory for the output artifacts
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    dwccloudfile = None
    outputfile = None
    encoding = None
    namespace = 'n'
    format = None

    ### Required inputs ###
    # Bare excepts are the file-wide convention for tolerating missing option keys.
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        dwccloudfile = options['dwccloudfile']
    except:
        pass

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(dwccloudfile) == False:
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # The output file always goes into the workspace.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    # Detecting the encoding is expensive; only do it if none was supplied.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    try:
        namespace = options['namespace']
    except:
        pass

    inputdialect = csv_file_dialect(inputfile)

    try:
        format = options['format']
    except:
        pass

    # No format given: mirror the input dialect. Otherwise 'csv' means CSV output
    # and anything else means tab-separated output.
    if format is None or len(format) == 0:
        outputdialect = inputdialect
    elif format.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order.
    # NOTE(review): DictWriter is given encoding='utf-8'; the standard-library csv
    # module has no such parameter, so this assumes csv is unicodecsv — confirm the
    # module's import.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)
            #print 'row: %s' % row

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_value_count_reporter(options):
    ''' Extract the distinct values of a combination of terms in a text file along
    with the number of times each occurs, and write a count report.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'csv')
        termlist - list of fields in the field combination to count (required)
        separator - string that separates the values in the output (e.g., '|')
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = None
    termlist = None
    separator = '|'
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No field list given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        separator = options['separator']
    except:
        pass

    try:
        format = options['format']
    except:
        pass

    # Default the format to csv. Also guard against an empty format string, which
    # would otherwise produce an output file name with no extension.
    if format is None or len(format) == 0:
        format = 'csv'

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    # rootname names files and artifacts; termname labels the field combination.
    rootname = '_'.join(termlist)
    termname = separator.join(termlist)

    if outputfile is None or len(outputfile) == 0:
        outputfile = '%s_count_report_%s.%s' % (rootname, str(uuid.uuid1()), format)
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Get the list of values for the field combination along with their counts.
    counts = extract_value_counts_from_file(inputfile, termlist,
        separator=separator, encoding=encoding)

    # Try to create the report for the term value counts.
    success = term_value_count_report(outputfile, counts, termname=termname,
        format=format)

    if success == False:
        # Report the source of the values (the input file), not the destination.
        message = 'No count report created for %s from %s. ' % (termname, inputfile)
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    s = '%s_count_report_file' % rootname
    artifacts[s] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def dataset_term_standardizer(options):
    ''' Create an output file replacing values from an input file for the fields
    given in key with standard values from a vocabulary, adding new fields to hold
    the original values.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile - path to the input file. Either full path or path within the
            workspace (required)
        outputfile - name of the output file, without path (optional)
        vocabfile - path to the vocabulary file. Either full path or path within the
            workspace (required)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - field or separator-separated fields whose values are to be standardized
            (required)
        separator - string to use as the key field separator (optional; default '|')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    # Startup print kept commented out, consistent with the other actors here.
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    vocabfile = None
    format = 'txt'
    key = None
    separator = '|'
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            # Report failure (success=False), not a hard-coded True, when the
            # input file is missing.
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        vocabfile = options['vocabfile']
    except:
        pass

    if vocabfile is None or len(vocabfile) == 0:
        message = 'No vocab file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if vocab file is at the absolute path or in the workspace.
    if os.path.isfile(vocabfile) == False:
        vocabfile = workspace + '/' + vocabfile

    try:
        key = options['key']
    except:
        pass

    if key is None or len(key) == 0:
        message = 'No key given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        separator = options['separator']
    except:
        pass

    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_changed_report_%s.%s' % \
            (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Produce the standardization report, passing the composite key through as-is.
    success = term_standardizer_report(inputfile, outputfile, vocabfile, key,
        separator=separator, encoding=encoding, format=format)

    if outputfile is not None and not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s. ' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    s = '%s_change_report_file' % slugify(key)
    artifacts[s] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def csv_field_selector(options):
    ''' Create a new file by selecting only the fields in a termlist, in the order
    given in that list.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory to work in (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        termlist - list of fields to extract from the input file (required)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    termlist = None
    encoding = None
    format = 'txt'

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        # Return the standard five values matching returnvars. (Previously this
        # branch referenced an undefined variable and returned only four values.)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Do the field selection. Let the selector figure out the input dialect.
    success = csv_select_fields(inputfile, outputfile, termlist, dialect=None,
        encoding=encoding, format=format)

    if success == False:
        message = 'Unable to select fields from %s. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    artifacts['selected_field_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def utf8_encoder(options):
    ''' Write a copy of the input file transcoded to utf8.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        encoding - the encoding of the input file (optional)
    returns a dictionary with information about the results
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Names of the values in the response dictionary, in order.
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Standard outputs.
    success = False
    message = None

    # Artifacts that persist after the actor finishes.
    artifacts = {}

    # Gather the inputs, tolerating missing option keys.
    inputfile = None
    outputfile = None
    encoding = None

    try:
        workspace = options['workspace']
    except:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except:
        pass

    # Guard clauses: each failure logs the message and returns immediately.
    if not inputfile:
        message = 'No input file given. %s' % __version__
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, outputfile, success, message, artifacts])

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, outputfile, success, message, artifacts])

    try:
        outputfile = options['outputfile']
    except:
        pass

    if not outputfile:
        message = 'No output file given. %s' % __version__
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, outputfile, success, message, artifacts])

    # The output file always goes into the workspace.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    # Delegate the actual transcoding to the utility function.
    success = utf8_file_encoder(inputfile, outputfile, encoding)

    if not success:
        message = 'Unable to translate %s to utf8 encoding. %s' % \
            (inputfile, __version__)
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, outputfile, success, message, artifacts])

    artifacts['utf8_encoded_file'] = outputfile
    logging.debug('Finishing %s' % __version__)
    return response(returnvars,
                    [workspace, outputfile, success, message, artifacts])
workspace, outputfile, rowcount, success, message, artifacts ] logging.debug('message:\n%s' % message) return response(returnvars, returnvals) else: try: with DwCAReader(inputfile) as dwcareader: rowcount = write_core_csv_file(dwcareader, outputfile) except Exception, e: message = 'Error %s reading archive %s. %s' % (e, inputfile, __version__) returnvals = [ workspace, outputfile, rowcount, success, message, artifacts ] logging.debug('message:\n%s' % message) return response(returnvars, returnvals) if rowcount == 0: message = 'Unable to create outputfile %s. %s' % (outputfile, __version__) returnvals = [ workspace, outputfile, rowcount, success, message, artifacts ] logging.debug('message:\n%s' % message) return response(returnvars, returnvals) success = True if success == True: artifacts['dwca_core_to_tsv_outputfile'] = outputfile returnvals = [workspace, outputfile, rowcount, success, message, artifacts]
def text_file_splitter(options):
    ''' Split a text file into chunks with headers. Put the chunk files in the
    workspace.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        chunksize - the maximum number of records in an output file
            (optional; default 10000)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the output was written
        filepattern - the pattern for the split file names
        chunks - the number of files created from the split
        rowcount - the number of rows in the file that was split, not counting header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = [
        'workspace', 'filepattern', 'chunks', 'rowcount', 'success', 'message'
    ]

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    filepattern = None
    chunks = None
    rowcount = None

    ### Establish variables ###
    workspace = './'
    inputfile = None
    chunksize = 10000

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, filepattern, chunks, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, filepattern, chunks, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Coerce chunksize to an int so a string-valued option works too. An unusable
    # value falls back to the default.
    try:
        chunksize = int(options['chunksize'])
    except:
        pass

    # filepattern is the file name without path or extension.
    path, fileext, filepattern = split_path(inputfile)

    rowcount = 0
    chunks = 0
    dest = None

    # Open the input in universal-newline mode. The with-statement guarantees the
    # input file is closed even if writing a chunk fails.
    with open(inputfile, 'rU') as infile:
        # The first line of the file is the header, repeated in every chunk file.
        header = next(infile)

        try:
            # Iterate through the entire input file.
            for line in infile:
                # Every chunksize rows, start a new chunk file with the header.
                if rowcount % chunksize == 0:
                    if dest:
                        dest.close()
                    destfile = workspace + '/' + filepattern + '-' + \
                        str(chunks) + '.' + fileext
                    dest = open(destfile, 'w')
                    dest.write(header)
                    chunks += 1
                # Write a line to the current chunk and keep going.
                dest.write(line)
                rowcount += 1
        finally:
            # Close the last chunk file even if an error interrupted the loop.
            if dest:
                dest.close()

    success = True

    # Prepare the response dictionary
    returnvals = [workspace, filepattern, chunks, rowcount, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def darwin_cloud_collector(options):
    ''' Collect field names from the inputfile that are not Simple Darwin Core and
    accumulate them in the outputfile.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        addedvalues - new values added to the output file
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Names of the values in the response dictionary, in order.
    returnvars = ['workspace', 'addedvalues', 'outputfile', 'success', 'message',
                  'artifacts']

    # Standard outputs.
    success = False
    message = None

    # Custom outputs.
    addedvalues = None

    # Artifacts that persist after the actor finishes.
    artifacts = {}

    # Gather the inputs, tolerating missing option keys.
    workspace = './'
    inputfile = None
    outputfile = None
    encoding = None

    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    # Guard clauses: each failure logs the message and returns immediately.
    if not inputfile:
        message = 'No input file given. %s' % __version__
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, addedvalues, outputfile, success, message,
                         artifacts])

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, addedvalues, outputfile, success, message,
                         artifacts])

    if not outputfile:
        message = 'No output file given. %s' % __version__
        logging.debug('message:\n%s' % message)
        return response(returnvars,
                        [workspace, addedvalues, outputfile, success, message,
                         artifacts])

    # The output vocabulary file always lives in the workspace.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    # Read the header and let read_header figure out the dialect and encoding.
    header = read_header(inputfile, encoding=encoding)

    # Keep only the field names that are not (case-insensitively) Darwin Core terms.
    nondwc = terms_not_in_dwc(header, casesensitive=False)

    # Accumulate the non-Darwin-Core field names into the vocabulary file.
    addedvalues = distinct_vocabs_to_file(outputfile, nondwc, 'fieldname',
                                          dialect=vocab_dialect())
    success = True

    if addedvalues is not None:
        artifacts['darwin_cloud_collector_file'] = outputfile

    logging.debug('Finishing %s' % __version__)
    return response(returnvars,
                    [workspace, addedvalues, outputfile, success, message,
                     artifacts])
def term_unknown_reporter(options):
    ''' Report the list of values of a field (or field combination) in an input file
    that are not found in a given vocabulary file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - path to the input file. Either full path or path within the
            workspace (required)
        vocabfile - path to the vocabulary file. Either full path or path within the
            workspace (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - the field or separator-separated fieldnames that hold the distinct
            values in the vocabulary file (required)
        separator - string to use as the value separator in the key
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    #logging.debug( 'Started %s' % __version__ )
    #logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    vocabfile = None
    outputfile = None
    format = 'txt'
    key = None
    separator = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s.' % (inputfile, __version__)
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        vocabfile = options['vocabfile']
    except:
        pass

    if vocabfile is None or len(vocabfile) == 0:
        message = 'No vocab file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if vocab file is at the absolute path or in the workspace.
    vocabfileat = None
    if os.path.isfile(vocabfile) == True:
        vocabfileat = vocabfile
    else:
        vocabfileat = workspace + '/' + vocabfile
    vocabfile = vocabfileat

    try:
        key = options['key']
    except:
        pass

    if key is None or len(key) == 0:
        message = 'No key in term_unknown_reporter. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        separator = options['separator']
    except:
        pass

    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    # NOTE(review): the default file name says 'standardization_report' while the
    # artifact key below says 'unknown_report' — presumably copied over from the
    # standardizer actor; confirm intent before renaming either.
    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_standardization_report_%s.%s' % \
            (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Get a list of distinct values of the term in the input file
    if separator is None or len(separator) == 0:
        fields = [key]
    else:
        fields = key.split(separator)

    # Let extract_values_from_file figure out the dialect of inputfile.
    # Values are normalized through ustripstr before the vocabulary comparison.
    checklist = extract_values_from_file(inputfile, fields, separator,
        encoding=encoding, function=ustripstr)

    if checklist is None or len(checklist) == 0:
        message = 'No values of %s from %s. %s' % (key, inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Get a dictionary of checklist values not found in the vocabfile, which is assumed
    # to be in utf-8 encoding.
    missingvocablist = missing_vocab_list_from_file(checklist, vocabfile, key,
        separator=separator, encoding='utf-8')

    # Nothing missing from the vocabulary is a success, not an error.
    if missingvocablist is None or len(missingvocablist) == 0:
        message = 'No missing values of %s from %s ' % (key, inputfile)
        message += 'found in %s. %s' % (vocabfile, __version__)
        success = True
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # TODO: Use Allan's DQ report framework
    # Validation, Improvement, Measure
    # Create a series of term reports
    success = term_list_report(outputfile, missingvocablist, key, format=format)

    if outputfile is not None and not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s.' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    s = '%s_unknown_report_file' % key
    artifacts[s] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def composite_header_constructor(options):
    ''' Write a file whose header is the distinct union of the column names found
    in two input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile1 - full path to one of the input files (optional)
        inputfile2 - full path to the second input file (optional)
        outputfile - name of the output file, without path (required)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        compositeheader - header combining two inputs
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in the order the values are supplied
    returnvars = ['workspace', 'compositeheader', 'outputfile', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Persistent objects created by this actor
    artifacts = {}

    ### Establish variables ###
    compositeheader = None

    ### Inputs ###
    workspace = options.get('workspace', './')
    inputfile1 = options.get('inputfile1')
    inputfile2 = options.get('inputfile2')
    outputfile = options.get('outputfile')
    format = options.get('format', 'txt')

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message,
            artifacts]
        return response(returnvars, returnvals)

    # The output file lives in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Read the two headers and merge them; read_header works out the dialects and
    # encodings of the files by itself.
    header1 = read_header(inputfile1)
    header2 = read_header(inputfile2)
    compositeheader = merge_headers(header1, header2)

    # Choose the output dialect from the requested format (default is tab-separated)
    if format is None or format.lower() == 'txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    # Write the merged header into the output file
    success = write_header(outputfile, compositeheader, dialect)
    if success == False:
        message = 'Header was not written. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message,
            artifacts]
        return response(returnvars, returnvals)

    # Return the composite header as a list rather than whatever merge_headers gave
    if compositeheader is not None:
        compositeheader = list(compositeheader)

    artifacts['composite_header_file'] = outputfile

    returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_token_reporter(options):
    ''' Produce a dictionary of counts of tokens for a given term in an input file
    and write a token report for it.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        termname - the name of the term for which to count rows (required)
        encoding - string signifying the encoding of the input file. If known, it 
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        tokens - a dictionary of tokens from the term in the inputfile
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary
    returnvars = ['workspace', 'outputfile', 'tokens', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    tokens = None
    outputfile = None

    # Persistent objects created by this actor
    artifacts = {}

    ### Inputs ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, tokens, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, tokens, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    termname = options.get('termname')
    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        returnvals = [workspace, outputfile, tokens, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    encoding = options.get('encoding')
    outputfile = options.get('outputfile')

    # Invent an output file name when none was supplied, then anchor it in the
    # workspace either way.
    if outputfile is None or len(outputfile) == 0:
        outputfile = '%s_token_report_%s.txt' % (termname, str(uuid.uuid1()))
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Count the tokens found in the values of termname, then report on them
    tokens = term_token_count_from_file(inputfile, termname, encoding=encoding)
    success = token_report(outputfile, tokens)

    if success == True:
        artifacts['%s_token_report_file' % termname] = outputfile

    returnvals = [workspace, outputfile, tokens, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def vocab_counter(options):
    ''' Extract the distinct values of a given term in a text file, along with the
    number of times each value occurs.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
        termname - the name of the term for which to find distinct values (required)
        encoding - string signifying the encoding of the input file. If known, it 
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        extractedvalues - a list of distinct values of the term in the inputfile, with
            a count of the number of times it occurs
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary
    returnvars = ['workspace', 'extractedvalues', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    extractedvalues = None

    ### Inputs ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    termname = options.get('termname')
    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    encoding = options.get('encoding')

    # Tally the distinct values of termname in the input file
    extractedvalues = extract_value_counts_from_file(inputfile, [termname],
        encoding=encoding)
    success = True

    returnvals = [workspace, extractedvalues, success, message]
    # Stash a copy of the response in the options dictionary as well, presumably for
    # downstream consumers of the same options object.
    options['vocab_counter_response'] = response(returnvars, returnvals)
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def vocab_extractor(options):
    ''' Extract a list of the distinct values of a set of terms in a text file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory to work in (optional)
        inputfile - full path to the input file (required)
        termlist - list of fields to extract from the input file (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it 
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory worked in
        extractedvalues - a list of distinct values of the term in the inputfile
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list of keys in the response dictionary
    returnvars = ['workspace', 'extractedvalues', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    extractedvalues = None

    # NOTE: removed an unused 'artifacts' dictionary; this actor returns no artifacts.

    ### Establish variables ###
    workspace = './'
    inputfile = None
    termlist = None
    separator = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        separator = options['separator']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    # Interpret termlist as a single field name, or split it on the separator.
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Extract the distinct values from the inputfile, applying the function to strip
    # white space and make lower case.
    # Let extract_values_from_file figure out the dialect and encoding of inputfile.
    extractedvalues = extract_values_from_file(inputfile, theterms,
        separator=separator, encoding=encoding, function=ustripstr)
    success = True

    returnvals = [workspace, extractedvalues, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it 
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - list of fields to extract from the input file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    termlist = None
    separator = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        separator = options['separator']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    # Interpret termlist as a single field name, or split it on the separator.
    # (This was previously computed twice with identical logic; the duplicate
    # computation has been removed.)
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the file dialect
    inputdialect = csv_file_dialect(inputfile)

    # Determine the file encoding if it was not given.
    # csv_file_encoding() always returns an encoding if there is an input file.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header of the input file and make a clean version of it
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    cleaninputheader = clean_header(header)

    try:
        format = options['format']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Prepare the outputfile dialect
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if os.path.isfile(outputfile) == False:
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=cleanoutputheader)

        # Iterate through all rows in the input file, keeping only the wanted fields
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
                header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def text_file_aggregator(options):
    ''' Join the contents of files in a given path. Headers and encodings are not 
    assumed to be the same. Write a file containing the joined files with one header 
    line.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputpath - full path to the input file set (glob pattern). (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting
            header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputpath = None
    outputfile = None
    format = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputpath = options['inputpath']
    except:
        pass

    if inputpath is None or len(inputpath) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        format = options['format']
    except:
        pass

    if format is None or len(format) == 0:
        format = 'txt'

    if format.lower() == 'txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        # Bug fix: the generated file name previously omitted the '.' before the
        # extension (producing e.g. 'aggregate_<uuid>txt').
        outputfile = 'aggregate_' + str(uuid.uuid1()) + '.' + format

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)
    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        writer = csv.DictWriter(outfile, dialect=dialect, encoding='utf-8',
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()

        for file in glob.glob(inputpath):
            # Each input file may have its own dialect and encoding. Use distinct
            # names so the output dialect chosen above is not shadowed.
            filedialect = csv_file_dialect(file)
            fileencoding = csv_file_encoding(file)
            with open(file, 'rU') as inputfile:
                reader = csv.DictReader(utf8_data_encoder(inputfile, fileencoding),
                    dialect=filedialect, encoding=fileencoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                        aggregaterowcount += 1
                    except:
                        message = 'failed to write line:\n%s\n' % line
                        message += 'to file %s. %s' % (file, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount,
                            success, message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)

    success = True
    artifacts['aggregated_file'] = outputfile

    if aggregateheader is not None:
        aggregateheader = list(aggregateheader)

    returnvals = [workspace, outputfile, aggregaterowcount, success, message,
        artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def parse_dynamic_properties(options):
    ''' Parse the values of the dynamic properties column of the input file and write
    each property as a separate column in a new csv file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Persistent objects created by this actor
    artifacts = {}

    ### Inputs ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    outputfile = None

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Invent an output file name when none was supplied
    outputfile = options.get('outputfile')
    if outputfile is None or len(outputfile) == 0:
        outputfile = 'parsed_props_' + str(uuid.uuid1()) + '.csv'

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    ### Optional inputs ###
    # TODO: output format (csv or tsv)

    # Do the actual parsing now that the preparation is complete
    success = parse_props(inputfile, outputfile)

    # Record the artifact if the parse succeeded
    if success == True:
        artifacts['parsed_props_output_file'] = outputfile

    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def dataset_guid_setter(options):
    ''' Create an output file adding a field populated by global unique identifiers.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile - path to the input file. Either full path or path within the
            workspace (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - field whose values are to be set to GUID values (required)
        encoding - string signifying the encoding of the input file. If known, it 
            speeds up processing a great deal. (optional; default None) (e.g., 'utf-8')
        guidtype - type of GUID to use to populate the key (optional; default 'uuid')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    # Debug print disabled for consistency with the other actors in this module.
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list of keys in the response dictionary
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    key = None
    guidtype = 'uuid'
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            # Bug fix: this failure path previously returned success hard-coded to
            # True; report the actual (False) success value instead.
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        key = options['key']
    except:
        pass

    if key is None or len(key) == 0:
        message = 'No key given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    # (Removed an unused read of options['separator']; separator is not used here.)
    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        guidtype = options['guidtype']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_corrected_report_%s.%s' % \
            (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Run the core operation
    success = uuid_term_appender(inputfile, outputfile, key, guidtype=guidtype,
        encoding=encoding, format=format)

    # Check to see if the outputfile was created
    if outputfile is not None and not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s. ' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Add artifacts to the output dictionary if all went well
    s = '%s_setter_report_file' % slugify(key)
    artifacts[s] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def outcomestats(options):
    """Produce an xlsx report of outcome statistics from an input file using a
    configuration file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        configfile - full path to the configuration file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    """
    setup_actor_logging(options)

    # Debug print disabled for consistency with the other actors in this module.
    #print options

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Make a dictionary for artifacts left behind
    artifacts = {}

    # outputs
    workspace = None
    outputfile = None
    success = False
    message = None

    # inputs
    try:
        workspace = options['workspace']
    except:
        workspace = None

    if workspace is None or len(workspace) == 0:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except:
        inputfile = None

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file not found'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        outputfile = None

    if outputfile is None or len(outputfile) == 0:
        outputfile = 'outcomeStats_' + str(uuid.uuid1()) + '.xlsx'

    try:
        configfile = options['configfile']
    except:
        configfile = None

    if configfile is None or len(configfile) == 0:
        message = 'No config file given'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        # Bug fix: previously this branch logged the error but fell through and
        # called stats_to_xlsx() with configfile=None; return the error response.
        return response(returnvars, returnvals)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Do the actual work now that the preparation is complete
    success = stats_to_xlsx(inputfile, outputfile, configfile)

    # Add artifacts to the output dictionary if all went well
    if success == True:
        artifacts['output_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def do_stuffer(options):
    ''' Prepare inputs from the options dictionary and invoke do_stuff(), mapping
    each parameter of do_stuff's signature onto a value from the options.
    options - a dictionary of parameters
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        (any further parameters of do_stuff must also be present in options)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    raises KeyError when a parameter of do_stuff is not found among the options.
    '''
    print("path" + os.getcwd())
    setup_actor_logging(options)

    logging.debug('options: %s' % options)

    # Keys of the response dictionary
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Persistent objects created by this actor
    artifacts = {}

    ### Inputs ###
    inputfile = None
    outputfile = None

    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given.'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found.' % inputfile
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Invent an output file name when none was supplied, then anchor it in the
    # workspace either way.
    outputfile = options.get('outputfile')
    if outputfile is None or len(outputfile) == 0:
        outputfile = 'dwca_' + str(uuid.uuid1()) + '.zip'
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    ### Optional inputs ###
    # Build the positional argument list for do_stuff from its own signature.
    params = []
    for arg in inspect.getargspec(do_stuff).args:
        if arg == 'inputfile':
            params.append(inputfile)
        elif arg == 'outputfile':
            params.append(outputfile)
        elif arg == 'workspace':
            params.append(workspace)
        elif arg in options:
            params.append(options[arg])
        else:
            raise KeyError(
                '%s not supplied as a parameter of %s in yaml config'
                % (arg, do_stuff.func_name))

    # Do the actual work now that the preparation is complete
    success = do_stuff(*params)

    # Record the artifact if all went well
    if success == True:
        artifacts['template_output_file'] = outputfile

    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing')
    return response(returnvars, returnvals)
def csv_fieldcount_checker(options):
    ''' Get the first row in a csv file where the number of fields is less than
    the number of fields in the header.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        firstbadrowindex - the line number of the first row in the inputfile
            where the field count does not match
        row - the content of the first line in the inputfile where the field
            count does not match.
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)
    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'firstbadrowindex', 'row', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    firstbadrowindex = 0
    row = None

    ### Establish variables ###
    # dict.get replaces the bare except clauses that silently swallowed
    # unrelated errors in the original.
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')

    ### Required inputs ###
    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # csv_field_checker returns None when every row matches the header width,
    # otherwise a (row_index, row_content) pair for the first mismatch.
    result = csv_field_checker(inputfile)
    if result is not None:
        firstbadrowindex = result[0]
        row = result[1]
        # NOTE(review): success stays False when a mismatching row is found;
        # callers appear to rely on the message to distinguish this outcome.
        message = 'Row with incorrect number fields found. %s' % __version__
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    success = True
    returnvals = [workspace, firstbadrowindex, row, success, message]
    logging.info('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_counter(options):
    ''' Get a count of the rows that are populated for a given term.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
        termname - the name of the term for which to count rows (required)
        encoding - string signifying the encoding of the input file. If known,
            it speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        rowcount - the number of rows in the inputfile that have a value for
            the term
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)
    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'rowcount', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    rowcount = None

    ### Establish variables ###
    # dict.get replaces the bare except clauses that silently swallowed
    # unrelated errors in the original.
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    termname = options.get('termname')
    encoding = options.get('encoding')

    ### Required inputs ###
    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Do the actual work now that the preparation is complete.
    rowcount = term_rowcount_from_file(inputfile, termname, encoding=encoding)

    success = True
    returnvals = [workspace, rowcount, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def dwca_core_to_tsv(options):
    ''' Save the core of the archive to a tsv file with DwC term names as headers.
    options - a dictionary of parameters
        loglevel - the level at which to log (e.g., DEBUG)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input Darwin Core archive file (required)
        outputfile - file name of the tsv output file, no path (optional)
        archivetype - archive type ('standard' or 'gbif')
            (optional; default 'standard')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        rowcount - the number of rows in the Darwin Core archive file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    setup_actor_logging(options)
    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = [
        'workspace', 'outputfile', 'rowcount', 'success', 'message', 'artifacts'
    ]

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    rowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    archivetype = 'standard'

    ### Required inputs ###
    # Bare except keeps the default when the key is absent.
    try:
        workspace = options['workspace']
    except:
        pass
    try:
        inputfile = options['inputfile']
    except:
        pass
    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [
            workspace, outputfile, rowcount, success, message, artifacts
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)
    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            # Found relative to the workspace; use that path from here on.
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            returnvals = [
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)
    try:
        outputfile = options['outputfile']
    except:
        pass
    if outputfile is None or len(outputfile) == 0:
        # Invent a unique output file name when none was supplied.
        outputfile = 'dwca_%s.txt' % str(uuid.uuid1())
    # Construct the output file path in the workspace.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)
    try:
        archivetype = options['archivetype']
    except:
        pass
    # Note: The DwCAReader creates a temporary directory of its own and cleans it up
    # Make a reader based on whether the archive is standard or a GBIF download.
    dwcareader = None
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            with GBIFResultsReader(inputfile) as dwcareader:
                rowcount = write_core_csv_file(dwcareader, outputfile)
        except Exception, e:
            message = 'Error %s ' % e
            message += 'reading GBIF archive: %s. %s' % (inputfile, __version__)
            returnvals = [
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)
    # NOTE(review): the function as visible here ends after the 'gbif' branch.
    # There is no 'standard' archive branch, no success=True assignment, and no
    # final return, so the function implicitly returns None on the success path
    # and for standard archives. This looks like a truncated or incomplete
    # body — confirm against the full source before relying on it.
def downloader(options):
    ''' Download a file from a URL.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        url - URL to the file to download (required)
        outputfile - name of the output file, without path (optional)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)
    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    # dict.get replaces the bare except clauses that silently swallowed
    # unrelated errors in the original.
    workspace = options.get('workspace', './')
    url = options.get('url')
    outputfile = options.get('outputfile')

    ### Required inputs ###
    # The url is documented as required; fail fast with an explanatory
    # message, consistent with the other actors in this file, rather than
    # passing a missing URL on to download_file.
    if url is None or len(url) == 0:
        message = 'No URL given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if outputfile is None or len(outputfile) == 0:
        # Invent a unique output file name when none was supplied.
        outputfile = 'dwca_' + str(uuid.uuid1()) + '.zip'

    # Construct the output file path in the workspace.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Do the actual work now that the preparation is complete.
    success = download_file(url, outputfile)

    # Add artifacts to the output dictionary if all went well.
    if success:
        artifacts['downloaded_file'] = outputfile

    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)