def convertGUIDANCELog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GUIDANCE output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        Array containing the original alignment, indexed as
        arr[rows, columns]
    nams: list
        List of sequence names in the original alignment
    trimfile: tuple
        Three paths, unpacked as (trimfile_cols, trimfile_rows,
        out_trimmed): the column-trimmed alignment, the stem of the
        row-trimmed alignment and the path the combined trimmed
        alignment is written to
    logfile: tuple
        Two paths, unpacked as (logfile_cols, logfile_rows): the log of
        removed columns and the file of removed sequences
    outfile: str
        Path of the "removed" style summary file to write

    Raises
    ------
    AssertionError
        If the removed and retained rows / columns do not add up to the
        original alignment
    '''
    trimfile_cols, trimfile_rows, out_trimmed = trimfile
    logfile_cols, logfile_rows = logfile
    t_arr_cols, _ = utilityFunctions.FastaToArray(trimfile_cols)

    # The column number is the last space separated token of the first
    # tab separated field on each line of the log
    with open(logfile_cols) as handle:
        removed_cols = [
            int(line.strip().split("\t")[0].split(" ")[-1])
            for line in handle
        ]
    # Shift by one to get array indices (presumably the log is 1-based)
    removed_cols = np.array(removed_cols) - 1
    all_ints = set(np.arange(0, np.shape(arr)[1]))
    keep = sorted(all_ints - set(removed_cols))

    # Prefer the ".With_Names" variants of the row files when they exist
    rows_with_names = "%s.With_Names" % trimfile_rows
    removed_with_names = "%s.With_Names" % logfile_rows
    if os.path.exists(rows_with_names):
        _, t_nams_rows = utilityFunctions.FastaToArray(rows_with_names)
        if os.path.exists(removed_with_names):
            removed_source = removed_with_names
        else:
            removed_source = logfile_rows
        _, t_nams_rows_rm = utilityFunctions.FastaToArray(removed_source)
    else:
        # No file of retained rows - treat every row as removed
        t_nams_rows = list()
        _, t_nams_rows_rm = utilityFunctions.FastaToArray(
            removed_with_names)

    # Sanity checks: removed + retained must reproduce the original
    assert len(t_nams_rows) + len(t_nams_rows_rm) == len(nams)
    assert len(removed_cols) + np.shape(t_arr_cols)[1] == np.shape(arr)[1]
    assert (arr[:, keep] == t_arr_cols).all()
    allnams = sorted(t_nams_rows + t_nams_rows_rm)
    assert allnams == sorted(nams)

    # The context manager closes the file even if a write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (
            ",".join([str(x) for x in sorted(removed_cols)])))
        out.write("otherc\t%s\n" % (
            ",".join([str(x) for x in sorted(t_nams_rows_rm)])))

    # Keep only the retained rows, then only the retained columns
    which_nams = np.where(np.isin(nams, t_nams_rows))[0]
    new_arr = arr[which_nams, ]
    new_arr = new_arr[:, keep]
    utilityFunctions.writeOutfile(out_trimmed, new_arr, nams,
                                  t_nams_rows_rm)
def convertZorroLog(arr, nams, trimfile, logfile, outfile, thresh):
    '''
    Convert the Zorro output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        Array containing the original alignment, indexed as
        arr[rows, columns]
    nams: list
        List of sequence names in the original alignment
    trimfile: str
        Path the trimmed alignment is written to
    logfile: str
        Path to the Zorro output, read as one float score per line
    outfile: str
        Path of the "removed" style summary file to write
    thresh: float
        Columns scoring below this value are removed, columns scoring
        at or above it are kept
    '''
    # The context manager closes the score file once it has been read
    with open(logfile) as handle:
        scores = np.array([float(line.strip()) for line in handle])

    which = np.where(scores < thresh)[0]
    keeps = np.where(scores >= thresh)[0]
    removed = set(which)
    new_arr = arr[:, keeps]
    utilityFunctions.writeOutfile(trimfile, new_arr, nams, removed)

    # Write the indices in ascending order so the output is deterministic
    # and matches the ordering used for the GUIDANCE conversion
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (
            ",".join([str(x) for x in sorted(removed)])))
def runConsensus(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Make consensus sequences

    Does nothing unless one of args.make_consensus, args.all_options or
    args.interpret is set. Otherwise writes two files based on
    args.outfile_stem: "<stem>_consensus.fasta" containing only the
    consensus and "<stem>_with_consensus.fasta" containing the alignment
    with the consensus appended as the final row.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment (not used here)
    orig_nams: list
        List of sequence names in the original alignment (not used here)
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    if not (args.make_consensus or args.all_options or args.interpret):
        return

    log.info("Building consensus sequence")
    if not args.silent:
        print("Building consensus sequence")
    cons, coverage = consensusSeq.findConsensus(arr, log,
                                                args.consensus_type)
    consarr = np.array(cons)
    # Combine the consensus with the alignment. np.vstack is used as
    # np.row_stack is a deprecated alias of it
    arr_plus_cons = np.vstack((arr, consarr))
    cons = "".join(cons)
    # Remove the gaps from the consensus if this option is specified
    if not args.consensus_keep_gaps:
        cons = cons.replace("-", "")
    # Output file of just the consensus sequence - the context manager
    # closes the file even if the write fails
    with open("%s_consensus.fasta" % args.outfile_stem, "w") as out:
        out.write(">%s\n%s\n" % (args.consensus_name, cons))
    # Output file of the consensus and the alignment
    outf = "%s_with_consensus.fasta" % args.outfile_stem
    utilityFunctions.writeOutfile(outf, arr_plus_cons,
                                  nams + [args.consensus_name],
                                  removed_seqs)
def runTtoU(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Make a copy of the alignment with U replaced by T

    The conversion is delegated to utilityFunctions.replaceUbyT. With
    args.replace_input the original alignment is converted and written
    to "<stem>_T_input.fasta"; with args.replace_output the cleaned
    alignment is converted and written to "<stem>_T_output.fasta",
    where <stem> is args.outfile_stem.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment
    orig_nams: list
        List of sequence names in the original alignment
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment (not used here)
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    # Replace U with T in the input
    if args.replace_input:
        message = "Generating a T instead of U version of the input alignment"
        log.info(message)
        if not args.silent:
            print(message)
        outf = "%s_T_input.fasta" % (args.outfile_stem)
        T_arr = utilityFunctions.replaceUbyT(orig_arr)
        # Write to file
        utilityFunctions.writeOutfile(outf, T_arr, orig_nams, removed_seqs)

    # Replace U with T in the output
    if args.replace_output:
        # Kept as one literal: a backslash continuation inside the
        # string puts stray characters into the message
        message = ("Generating a T instead of U version of "
                   "the output alignment")
        log.info(message)
        if not args.silent:
            print(message)
        outf = "%s_T_output.fasta" % (args.outfile_stem)
        T_arr = utilityFunctions.replaceUbyT(arr)
        # Write to file
        # NOTE(review): the cleaned array is written with orig_nams,
        # presumably relying on writeOutfile skipping the names in
        # removed_seqs - confirm against writeOutfile
        utilityFunctions.writeOutfile(outf, T_arr, orig_nams, removed_seqs)
def runUnalign(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Write copies of the alignment which have been passed through
    utilityFunctions.unAlign (reported to the user as "gap free")

    With args.unalign_input the original alignment is processed and
    written to "<stem>_unaligned_input.fasta"; with args.unalign_output
    the cleaned alignment is processed and written to
    "<stem>_unaligned_output.fasta", where <stem> is args.outfile_stem.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment
    orig_nams: list
        List of sequence names in the original alignment
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    def announce(message):
        # Report progress in the log and, unless silenced, on stdout
        log.info(message)
        if not args.silent:
            print(message)

    # Original alignment
    if args.unalign_input:
        announce("Generating a gap free version of the input alignment")
        input_path = "%s_unaligned_input.fasta" % (args.outfile_stem)
        gap_free_input = utilityFunctions.unAlign(orig_arr)
        utilityFunctions.writeOutfile(input_path, gap_free_input,
                                      orig_nams, removed_seqs)

    # Cleaned alignment
    if args.unalign_output:
        announce("Generating a gap free version of the output alignment")
        output_path = "%s_unaligned_output.fasta" % (args.outfile_stem)
        gap_free_output = utilityFunctions.unAlign(arr)
        utilityFunctions.writeOutfile(output_path, gap_free_output,
                                      nams, removed_seqs)
def _removeGapOnlyStep(args, log, arr, relativePositions, rmfile,
                       markupdict, removed_cols):
    '''
    Run one pass of gap only column removal and update the trackers

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    arr: np.array
        Array containing the alignment in its current state
    relativePositions: list
        Position tracker passed to and returned by
        parsingFunctions.removeGapOnly
    rmfile: file
        Open "removed" file passed on to parsingFunctions.removeGapOnly
    markupdict: dict
        Dictionary of what each function has removed; its
        'remove_gaponly' entry is created or extended in place
    removed_cols: set
        Columns removed so far

    Returns
    -------
    arr: np.array
        The alignment after the pass
    relativePositions: list
        The updated position tracker
    removed_cols: set
        A new set combining removed_cols with the columns removed here
    '''
    log.info("Removing gap only columns")
    if not args.silent:
        print("Removing gap only columns")
    arr, r, relativePositions = parsingFunctions.removeGapOnly(
        arr, relativePositions, rmfile, log)
    # Track what has been removed - several passes can contribute to
    # the same markupdict entry
    if 'remove_gaponly' in markupdict:
        markupdict['remove_gaponly'].update(r)
    else:
        markupdict['remove_gaponly'] = r
    removed_cols = removed_cols | r
    # Check there are some columns left
    utilityFunctions.checkArrLength(arr, log)
    return arr, relativePositions, removed_cols


def runCleaning(args, log, arr, nams):
    '''
    Run the cleaning functions

    The stages run in a fixed order - remove divergent, remove
    insertions, crop ends, remove short - each one only if its own
    option, args.all_options or args.clean is set. Each stage is
    followed by a gap only column removal pass when args.remove_gaponly
    is also set (or args.all_options / args.clean). If only
    args.remove_gaponly is set a single gap only pass is run. The
    resulting alignment is then written with utilityFunctions.writeOutfile.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    arr: np.array
        Array containing the original alignment
    nams: list
        List of sequence names in the original alignment

    Returns
    -------
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names remaining in the cleaned alignment
    markupdict: dict
        Dictionary where the keys are function names and the values are
        lists of columns, rows or positions which have been removed
    removed_seqs: set
        set of the names of sequences which have been removed

    Raises
    ------
    AssertionError
        If insertions are being removed and args.insertion_min_size is
        not less than args.insertion_max_size
    '''
    # Set everything up
    orig_nams = copy.copy(nams)
    markupdict, relativePositions, R = setupTrackers(args, arr)
    outfile, rmfile = setupOutfiles(args)
    # removed_cols and removed_positions are tracked here but are not
    # part of the return value
    removed_seqs, removed_cols, removed_positions = R
    run_everything = args.all_options or args.clean

    # Remove divergent sequences
    if args.remove_divergent or run_everything:
        log.info("Removing divergent sequences")
        if not args.silent:
            print("Removing divergent sequences")
        minperc = args.remove_divergent_minperc
        arr, r = parsingFunctions.removeDivergent(arr, nams, rmfile,
                                                  log, minperc)
        # Track what has been removed
        markupdict['remove_divergent'] = r
        removed_seqs = removed_seqs | r
        nams = utilityFunctions.updateNams(nams, r)
        # Check there are some sequences left
        utilityFunctions.checkArrLength(arr, log)

    # Remove gaps created by remove divergent
    if (args.remove_divergent and args.remove_gaponly) or run_everything:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Remove insertions
    if args.remove_insertions or run_everything:
        log.info("Removing insertions")
        if not args.silent:
            print("Removing insertions")
        assert args.insertion_min_size < args.insertion_max_size, (
            "insertion_min_size must be less than insertion_max_size")
        A = parsingFunctions.removeInsertions(arr,
                                              relativePositions,
                                              rmfile,
                                              log,
                                              args.insertion_min_size,
                                              args.insertion_max_size,
                                              args.insertion_min_flank)
        # Track what has been removed
        arr, r, relativePositions = A
        markupdict['remove_insertions'] = r
        removed_cols = removed_cols | r
        # Check there are some columns left
        utilityFunctions.checkArrLength(arr, log)

    # Remove gaps created by remove insertions
    if (args.remove_insertions and args.remove_gaponly) or run_everything:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Crop Ends
    if args.crop_ends or run_everything:
        # doesn't remove any whole columns or rows
        log.info("Cropping ends")
        if not args.silent:
            print("Cropping ends")
        arr, r = parsingFunctions.cropEnds(arr, nams, relativePositions,
                                           rmfile, log,
                                           args.crop_ends_mingap_perc,
                                           args.crop_ends_redefine_perc)
        # Track what has been removed
        markupdict['crop_ends'] = r
        removed_positions.update(r)
        # Check there are still some positions left
        utilityFunctions.checkArrLength(arr, log)

    # Remove empty columns created by crop ends
    if (args.crop_ends and args.remove_gaponly) or run_everything:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Remove short
    if args.remove_short or run_everything:
        log.info("Removing short sequences")
        if not args.silent:
            print("Removing short sequences")
        arr, r = parsingFunctions.removeTooShort(arr, nams, rmfile, log,
                                                 args.remove_min_length)
        # Track what has been removed
        markupdict['remove_short'] = r
        removed_seqs = removed_seqs | r
        nams = utilityFunctions.updateNams(nams, r)
        # Check there are still some sequences left
        utilityFunctions.checkArrLength(arr, log)

    # Remove empty columns created by remove short
    if (args.remove_short and args.remove_gaponly) or run_everything:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Gap only removal requested on its own, with no other stage
    if args.remove_gaponly and not (args.all_options or
                                    args.remove_divergent or
                                    args.remove_insertions or
                                    args.crop_ends or
                                    args.remove_short or
                                    args.clean):
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Write the output file
    utilityFunctions.writeOutfile(outfile, arr, orig_nams,
                                  removed_seqs, rmfile)
    return (arr, nams, markupdict, removed_seqs)
def testWriteOutfile(self):
    '''
    Check that writeOutfile leaves a regular file at the requested path
    '''
    destination = self.outfile
    utilityFunctions.writeOutfile(destination,
                                  self.in_array,
                                  self.nams,
                                  self.removed)
    was_created = os.path.isfile(destination)
    self.assertTrue(was_created)