def updateDbFile(db_file, field, values, updates, out_file=None, out_args=default_out_args):
    """
    Updates the values of a field in a database file

    Arguments:
      db_file : the database file name.
      field : the field to update.
      values : a list of values specifying which rows to update.
      updates : a list of replacement values, positionally paired with values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-update',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count = 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Apply the first matching value->update pair only.  Stopping at the
        # first match prevents chained rewrites (e.g. values=['a','b'],
        # updates=['b','c'] turning 'a' into 'c' in one pass) and prevents
        # pass_count being incremented more than once per record.
        for old, new in zip(values, updates):
            if rec[field] == old:
                rec[field] = new
                pass_count += 1
                break

        # Write records
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def mergeDbFiles(db_files, drop=False, out_file=None, out_args=default_out_args):
    """
    Merges a list of database files into a single output file

    Arguments:
      db_files : list of database file names.
      drop : if True drop columns not present in all files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'merge'
    log['FILES'] = ','.join([os.path.basename(f) for f in db_files])
    log['DROP'] = drop
    printLog(log)

    # Open every input file
    handles = [open(f, 'rt') for f in db_files]
    readers = [TSVReader(h) for h in handles]
    result_count = sum(countDbFile(f) for f in db_files)

    # Output columns: intersection of headers when dropping, union otherwise.
    # First-seen column order across the inputs is preserved via field_order.
    field_list = [r.fields for r in readers]
    field_sets = [set(fields) for fields in field_list]
    if drop:
        field_set = set.intersection(*field_sets)
    else:
        field_set = set.union(*field_sets)
    field_order = OrderedDict((f, None) for f in chain(*field_list))
    out_fields = [f for f in field_order if f in field_set]

    # Open output file
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        __, __, out_args['out_type'] = splitName(db_files[0])
        pass_handle = getOutputHandle(db_files[0], out_label='parse-merge',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    writer = TSVWriter(pass_handle, out_fields)

    # Copy every record from every input, in file order
    start_time = time()
    rec_count = 0
    for reader in readers:
        for rec in reader:
            # Progress reflects the previous iteration
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += 1
            writer.writeDict(rec)

    # Final log
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close all handles
    pass_handle.close()
    for h in handles:
        h.close()

    return pass_handle.name
def selectDbFile(db_file, fields, values, logic='any', regex=False, out_file=None, out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to check for selection criteria.
      values : a list of values defining selection targets.
      logic : one of 'any' or 'all' defining whether one or all fields must have a match.
      regex : if False do exact full string matches; if True allow partial regex matches.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.

    Raises:
      ValueError : if logic is not one of 'any' or 'all'.
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns):
            # rec.get(f, False) yields False for a missing field; guard so
            # re.search is never handed a non-string (which raises TypeError).
            if not isinstance(x, str):
                return False
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Define logic function; fail fast on a bad argument instead of the
    # original's deferred NameError at first use of _logic_func
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError('logic must be "any" or "all", not %r' % logic)

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-select',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Check for selection values in all fields
        select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writeDict(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def sortDbFile(db_file, field, numeric=False, descend=False, out_file=None, out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
      db_file : the database filename.
      field : the field name to sort by.
      numeric : if True sort field numerically; if False sort field alphabetically.
      descend : if True sort in descending order; if False sort in ascending order.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    # Console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    reader = TSVReader(db_handle)
    out_fields = reader.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-sort',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    writer = TSVWriter(pass_handle, out_fields)

    # Load the whole table into memory, keyed by input position
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    records = dict(enumerate(reader))
    result_count = len(records)

    # Extract a sort key per record; empty/missing numeric values coerce to 0.0
    sort_keys = {idx: rec[field] for idx, rec in records.items()}
    if numeric:
        sort_keys = {idx: float(v or 0) for idx, v in sort_keys.items()}
    ordering = sorted(sort_keys, key=sort_keys.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Write records in sorted order
    start_time = time()
    rec_count = 0
    for idx in ordering:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        writer.writeDict(records[idx])

    # Final log
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def renameDbFile(db_file, fields, names, out_file=None, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to rename.
      names : a list of new names for fields.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Get header and rename fields
    out_fields = list(db_iter.fields)
    for f, n in zip(fields, names):
        i = out_fields.index(f)
        out_fields[i] = n

    # Open writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-rename',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Hoisted out of the record loop; the original rebuilt zip(fields, names)
    # for every record (flagged by its own TODO)
    rename_pairs = list(zip(fields, names))

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Re-key each record from the old field name to the new one
        for old, new in rename_pairs:
            rec[new] = rec.pop(old)

        # Write
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file : filename of the tab-delimited database file to split.
      field : the field name by which to split db_file.
      num_split : the numerical threshold by which to group sequences;
                  if None treat field as textual.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list : a list of output file names.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        with open(db_file, 'rt') as tmp_handle:
            tmp_iter = TSVReader(tmp_handle)
            tag_list = list(set([row[field] for row in tmp_iter]))

        # Map characters forbidden in file names to safe replacements.
        # The original table spelled several keys with spurious backslashes
        # ('\/', '\%', '\|'), which are invalid escape sequences in Python 3;
        # str.maketrans with plain characters is equivalent and warning-free
        # (no replacement string contains a forbidden character, so the
        # single-pass translate matches the original chained replaces).
        # NOTE(review): '<' -> 'gt' and '>' -> 'lt' look swapped, but the
        # mapping is kept as-is so existing output file names do not change.
        forbidden = str.maketrans({'/': 'f', '\\': 'b', '?': 'q', '%': 'p',
                                   '*': 's', ':': 'c', '|': 'pi', '"': 'dq',
                                   "'": 'sq', '<': 'gt', '>': 'lt', ' ': '_'})

        # Replace forbidden characters to build one file label per tag
        tag_dict = {tag: tag.translate(forbidden) for tag in tag_list}

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             out_label='%s-%s' % (field, label),
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'],
                                             out_type=out_args['out_type'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: TSVWriter(handles_dict[tag], fields=out_fields)
                        for tag in tag_dict}

        # Iterate over records
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writeDict(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 out_label='under-%.1f' % num_split,
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir'],
                                                 out_type=out_args['out_type']),
                        'atleast': getOutputHandle(db_file,
                                                   out_label='atleast-%.1f' % num_split,
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'],
                                                   out_type=out_args['out_type'])}

        # Create Db writer instances
        writers_dict = {'under': TSVWriter(handles_dict['under'], fields=out_fields),
                        'atleast': TSVWriter(handles_dict['atleast'], fields=out_fields)}

        # Iterate over records
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writeDict(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    db_handle.close()
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
def dropDbFile(db_file, fields, out_file=None, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to drop.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # Was erroneously logged as 'add' in the original; this is the drop subcommand
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Exclude dropped field from output
    out_fields = [f for f in db_iter.fields if f not in fields]

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-drop',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write row; presumably the writer emits only out_fields, so dropped
        # columns are omitted (matches how the other subcommands use TSVWriter)
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles; the original leaked db_handle
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def indexDbFile(db_file, field=default_index_field, out_file=None, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file : the database file name.
      field : the name of the index field to add.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    reader = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Output columns are the input columns plus the new index field
    out_fields = list(reader.fields) + [field]

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-index',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Copy records, stamping each with its 1-based position
    start_time = time()
    rec_count = 0
    for rec_count, rec in enumerate(reader, start=1):
        # Progress reflects the previous iteration
        printProgress(rec_count - 1, result_count, 0.05, start_time=start_time)
        rec[field] = rec_count
        writer.writeDict(rec)

    # Final log
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def addDbFile(db_file, fields, values, out_file=None, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to add.
      values : a list of values to assign to all rows of each field.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Append only fields that are not already present.  The original did an
    # unconditional extend(), which emitted a duplicate column header whenever
    # a requested field already existed, while add_dict (below) skipped it.
    existing = set(db_iter.fields)
    out_fields = list(db_iter.fields)
    out_fields.extend(f for f in fields if f not in existing)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-add',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append; pre-existing columns are left untouched
    add_dict = {k: v for k, v in zip(fields, values) if k not in existing}

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name