def generate_backbone(self):
    """Split the input sequences into a random backbone and a query set.

    Reads ``self.options.sequence_file``, draws ``backbone_size`` sequences
    at random as the backbone (``min(100, 20% of the taxa)`` when the option
    is unset), writes both sets to temporary FASTA files and runs a
    ``SateAlignJob`` on the backbone.

    On return the shared options have ``placement_size``, ``alignment_file``,
    ``tree_file`` (both open file objects under ``outdir``) and
    ``fragment_file`` (the query temp file) filled in.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample() requires a real sequence; dict key views are rejected
    # on Python 3.11+, so materialise the names first (order is unchanged).
    backbone_names = random.sample(list(sequences.keys()),
                                   options().backbone_size)
    backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
    # Whatever is left after removing the backbone is the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    sate_job = SateAlignJob()
    # The alignment job is given 'protein' where the options say 'amino'.
    molecule_type = options().molecule
    if molecule_type == 'amino':
        molecule_type = 'protein'
    sate_job.setup(backbone, options().backbone_size, self.options.outdir,
                   molecule_type, options().cpu)
    sate_job.run()
    sate_job.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are intentionally left open: they are stored on the
    # options object for later stages to read from.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s"
              % (options().alignment_file, options().tree_file))
    options().fragment_file = query
def check_options(self):
    """Validate the input combination and fill in backbone-size defaults.

    Two combinations are accepted: tree + alignment + sequence file (the
    sequence file is then used as the fragment file), or a sequence file
    alone (the backbone is generated from it).  Anything else logs an error
    and exits with status -1.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if ``backbone_size`` differs from the number of taxa
            in the backbone alignment.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    self.check_outputprefix()
    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            ("Either specify the backbone alignment and tree and query "
             "sequences or only the query sequences. Any other "
             "combination is invalid"))
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and may be absent.
        sys.exit(-1)

    sequences = MutableAlignment()
    # Re-open by name and close the handle once parsed.
    with open(self.options.alignment_file.name) as backbone_handle:
        sequences.read_file_object(backbone_handle)
    backbone_size = sequences.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    if options().backbone_size != backbone_size:
        # Raised explicitly (not via ``assert``) so the check is kept when
        # Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size
    return ExhaustiveAlgorithm.check_options(self)
def check_options(self):
    """Require a backbone tree and alignment, then run the base checks.

    Logs an error and exits with status -1 when either the tree file or the
    alignment file option is unset.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    options().info_file = "A_dummy_value"
    if options().tree_file is None or options().alignment_file is None:
        _LOG.error(
            "Specify the backbone alignment and tree and query sequences")
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and may be absent.
        sys.exit(-1)

    # The parsed alignment is not used afterwards; the read is kept because
    # it presumably serves as an early readability/format check -- TODO
    # confirm.  The handle is now closed once parsing is done.
    sequences = MutableAlignment()
    with open(self.options.alignment_file.name) as backbone_handle:
        sequences.read_file_object(backbone_handle)
    return ExhaustiveAlgorithm.check_options(self)
def check_options(self):
    """Ensure backbone tree and alignment are given, then defer to the base.

    When the tree file or the alignment file option is missing, an error is
    logged and the process exits with status -1.

    Returns:
        The result of ``ExhaustiveAlgorithm.check_options``.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    options().info_file = "A_dummy_value"
    backbone_missing = (options().tree_file is None
                        or options().alignment_file is None)
    if backbone_missing:
        _LOG.error(
            "Specify the backbone alignment and tree and query sequences")
        # The site-provided ``exit`` is intended for the interactive shell;
        # sys.exit raises the same SystemExit and is always available.
        sys.exit(-1)

    # Parse result is discarded; the read is retained as it presumably acts
    # as a sanity check on the file -- TODO confirm.  Using ``with`` closes
    # the handle that the original left open.
    with open(self.options.alignment_file.name) as backbone_handle:
        MutableAlignment().read_file_object(backbone_handle)
    return ExhaustiveAlgorithm.check_options(self)
def generate_backbone(self):
    """Pick a backbone from the full-length sequences and align it with PASTA.

    Steps:
      1. Read ``self.options.sequence_file``.
      2. If ``median_full_length`` is set, move every sequence whose length
         falls outside ``median * (1 -/+ backbone_threshold)`` into the
         fragment set.  A value of ``-1`` means "use the median length of
         the input".
      3. Sample ``backbone_size`` of the remaining sequences (default
         ``min(1000, number of taxa)``, clamped to what is available), write
         them to a temp file and run a ``PastaAlignJob`` on them.
      4. Store ``placement_size``, ``alignment_file`` and ``tree_file`` on
         the shared options.
      5. Write the remaining sequences plus fragments as the query file, or,
         when none remain, copy the backbone alignment to the final output
         name and exit with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    fragments = MutableAlignment()
    if options().median_full_length is not None:
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            middle = count // 2  # floor division: valid index on Py2 and Py3
            # Odd count -> the single middle value; even count -> mean of the
            # two middle values.  (The original had these branches swapped.)
            if count % 2:
                options().median_full_length = seq_lengths[middle]
            else:
                options().median_full_length = (
                    seq_lengths[middle - 1] + seq_lengths[middle]) / 2.0

        median = options().median_full_length
        threshold = options().backbone_threshold
        min_length = int(median * (1 - threshold))
        max_length = int(median * (1 + threshold))
        frag_names = [name for name in sequences
                      if len(sequences[name]) > max_length
                      or len(sequences[name]) < min_length]
        if frag_names:
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    # Never ask for more backbone sequences than are available.
    if options().backbone_size > len(sequences.keys()):
        options().backbone_size = len(sequences.keys())

    # random.sample() requires a real sequence; dict key views are rejected
    # on Python 3.11+, so materialise the names first (order is unchanged).
    backbone_names = random.sample(list(sequences.keys()),
                                   options().backbone_size)
    backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pasta_job = PastaAlignJob()
    # The alignment job is given 'protein' where the options say 'amino'.
    molecule_type = options().molecule
    if molecule_type == 'amino':
        molecule_type = 'protein'
    pasta_job.setup(backbone, options().backbone_size, self.options.outdir,
                    molecule_type, options().cpu)
    pasta_job.run()
    pasta_job.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are intentionally left open: they are stored on the
    # options object for later stages to read from.
    options().alignment_file = open(self.options.outdir + "/pasta.fasta")
    options().tree_file = open(self.options.outdir + "/pasta.fasttree")
    _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s"
              % (options().alignment_file, options().tree_file))

    # NOTE(review): presumably merges the fragments into the leftover
    # full-length sequences to form the query set -- confirm against
    # MutableAlignment.set_alignment.
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        _LOG.info("No query sequences to align. Final alignment saved as %s"
                  % self.get_output_filename("alignment.fasta"))
        shutil.copyfile(self.options.outdir + "/pasta.fasta",
                        self.get_output_filename("alignment.fasta"))
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def read_alignment_and_tree(self):
    """Load the alignment and tree named by the options.

    Returns:
        tuple: ``(alignment, tree)`` where ``alignment`` is a
        ``MutableAlignment`` read from ``self.options.alignment_file`` and
        ``tree`` is a ``PhylogeneticTree`` wrapping the newick tree read
        from ``self.options.tree_file``.
    """
    alignment_source = self.options.alignment_file
    _LOG.info("Reading input alignment: %s" % (alignment_source))
    backbone_alignment = MutableAlignment()
    backbone_alignment.read_file_object(alignment_source)

    tree_source = self.options.tree_file
    _LOG.info("Reading input tree: %s" % (tree_source))
    newick_tree = dendropy.Tree(stream=tree_source,
                                schema="newick",
                                preserve_underscores=True)
    backbone_tree = PhylogeneticTree(newick_tree)

    return (backbone_alignment, backbone_tree)
def read_alignment_and_tree(self):
    """Read the input alignment and the input tree given in the options.

    Returns:
        tuple: ``(alignment, tree)`` -- a ``MutableAlignment`` filled from
        ``self.options.alignment_file`` and a ``PhylogeneticTree`` built
        around the newick tree parsed from ``self.options.tree_file``.
    """
    alignment_source = self.options.alignment_file
    _LOG.info("Reading input alignment: %s" % (alignment_source))
    loaded_alignment = MutableAlignment()
    loaded_alignment.read_file_object(alignment_source)

    tree_source = self.options.tree_file
    _LOG.info("Reading input tree: %s" % tree_source)
    parsed_tree = dendropy.Tree.get_from_stream(tree_source,
                                                schema="newick",
                                                preserve_underscores=True)

    return (loaded_alignment, PhylogeneticTree(parsed_tree))
def main():
    """Split an input alignment into fragment and full-length files.

    Sequences whose length is at most ``args.threshold`` go to
    ``<output>.frag.fas``; all others go to ``<output>.full.fas``.

    Raises:
        AssertionError: if ``args.input`` is not a readable regular file.
    """
    args = parse_args()

    readable = os.path.isfile(args.input) and os.access(args.input, os.R_OK)
    if not readable:
        # Raised explicitly (not via ``assert``) so the check is kept when
        # Python runs with -O; the exception type is unchanged.
        raise AssertionError("Input file %s does not exist\n" % args.input)

    sequences = MutableAlignment()
    sequences.read_file_object(args.input)

    frag = MutableAlignment()
    full = MutableAlignment()
    for key, seq in sequences.items():
        target = frag if len(seq) <= args.threshold else full
        target[key] = seq

    frag.write_to_path("%s.frag.fas" % args.output)
    full.write_to_path("%s.full.fas" % args.output)
def main():
    """Partition the input sequences by length into two output files.

    A sequence no longer than ``args.threshold`` is written to
    ``<output>.frag.fas``; every other sequence is written to
    ``<output>.full.fas``.

    Raises:
        AssertionError: if ``args.input`` is not a readable regular file.
    """
    args = parse_args()

    input_ok = os.path.isfile(args.input) and os.access(args.input, os.R_OK)
    if not input_ok:
        # ``assert`` statements are removed under ``python -O``; raising the
        # same exception type by hand keeps the validation in place.
        raise AssertionError("Input file %s does not exist\n" % args.input)

    sequences = MutableAlignment()
    sequences.read_file_object(args.input)

    frag = MutableAlignment()
    full = MutableAlignment()
    for key, seq in sequences.items():
        if len(seq) <= args.threshold:
            frag[key] = seq
        else:
            full[key] = seq

    frag.write_to_path("%s.frag.fas" % args.output)
    full.write_to_path("%s.full.fas" % args.output)
def check_options(self):
    """Validate the inputs and derive backbone/placement/alignment sizes.

    Accepted input combinations: tree + alignment + sequence file (the
    sequence file becomes the fragment file), or a sequence file alone (the
    backbone is generated from it).  Anything else logs an error and exits
    with status -1.

    When ``alignment_size`` is unset it is chosen automatically: 10 for
    amino-acid data; otherwise 10, doubled while the doubled value stays
    below the number of backbone taxa if the average p-distance of the
    backbone exceeds 0.60.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if ``backbone_size`` differs from the number of taxa
            in the backbone alignment.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            "Either specify the backbone alignment and tree and query "
            "sequences or only the query sequences. Any other "
            "combination is invalid")
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and may be absent.
        sys.exit(-1)

    # Parse the backbone alignment once and close the handle; the parsed
    # object is reused below for the p-distance estimate.
    backbone = MutableAlignment()
    with open(self.options.alignment_file.name) as backbone_handle:
        backbone.read_file_object(backbone_handle)
    backbone_size = backbone.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    if options().backbone_size != backbone_size:
        # Raised explicitly (not via ``assert``) so the check is kept when
        # Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            "Backbone parameter needs to match actual size of backbone; "
            "backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().alignment_size is None:
        _LOG.info(
            "Alignment subset size not given. Calculating subset size. ")
        if options().molecule == 'amino':
            _LOG.warning(
                "Automated alignment subset selection not implemented for "
                "protein alignment. Setting to 10.")
            options().alignment_size = 10
        else:
            averagep, _maxp = backbone.get_p_distance()
            align_size = 10
            if averagep > .60:
                while align_size * 2 < backbone.get_num_taxa():
                    align_size = align_size * 2
            # Format spec fixed: the original "%f0.2" printed the full float
            # followed by a literal "0.2".
            _LOG.info(
                "Average p-distance of backbone is %0.2f. "
                "Alignment subset size set to %d. " % (averagep, align_size))
            options().alignment_size = align_size
    return ExhaustiveAlgorithm.check_options(self)
def output_results(self):
    """Write the result alignment and its companion files.

    Outputs, in order:
      * ``alignment.fasta`` -- the alignment as held in ``self.results``;
      * ``insertion_columns.txt`` -- indexes of the insertion columns;
      * when ``backtranslation_sequence_file`` is set and backtranslation
        succeeds: ``backtranslated_alignment.fasta`` and, after removing
        insertion columns, ``backtranslated_alignment_masked.fasta``;
      * ``alignment_masked.fasta`` -- the result alignment after its
        insertion columns have been removed.

    A backtranslation failure is logged as a warning and does not stop the
    remaining outputs from being written.
    """
    result_alignment = self.results
    _LOG.info("Generating output. ")

    unmasked_path = self.get_output_filename("alignment.fasta")
    result_alignment.write_to_path(unmasked_path)
    _LOG.info("Unmasked alignment written to %s" % unmasked_path)

    columns_path = self.get_output_filename("insertion_columns.txt")
    result_alignment.write_insertion_column_indexes(columns_path)
    _LOG.info("The index of insertion columns written to %s" % columns_path)

    if self.options.backtranslation_sequence_file:
        backtranslated_path = self.get_output_filename(
            "backtranslated_alignment.fasta")
        dna_sequences = MutableAlignment()
        dna_sequences.read_file_object(
            self.options.backtranslation_sequence_file)
        try:
            backtranslated = backtranslate(self.results, dna_sequences)
        except Exception as error:
            # Best effort: report the problem and carry on with the
            # remaining outputs.
            _LOG.warning(
                "Backtranslation failed due to following error: "
                + str(error)
                + ".\nNo translated DNA sequence will be written to a file.")
        else:
            backtranslated.write_to_path(backtranslated_path)
            _LOG.info("Backtranslated alignment written to %s"
                      % backtranslated_path)

            backtranslated.remove_insertion_columns()
            backtranslated_masked_path = self.get_output_filename(
                "backtranslated_alignment_masked.fasta")
            backtranslated.write_to_path(backtranslated_masked_path)
            _LOG.info("Backtranslated masked alignment written to %s"
                      % backtranslated_masked_path)

    # Masking mutates the result alignment, so it is done last.
    result_alignment.remove_insertion_columns()
    masked_path = self.get_output_filename("alignment_masked.fasta")
    result_alignment.write_to_path(masked_path)
    _LOG.info("Masked alignment written to %s" % masked_path)
def check_options(self):
    """Validate the input combination and fill in backbone-size defaults.

    Accepted input combinations: tree + alignment + sequence file (the
    sequence file becomes the fragment file), or a sequence file alone (the
    backbone is generated from it).  Anything else logs an error and exits
    with status -1.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if ``backbone_size`` differs from the number of taxa
            in the backbone alignment.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            "Either specify the backbone alignment and tree and query "
            "sequences or only the query sequences. Any other "
            "combination is invalid")
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and may be absent.
        sys.exit(-1)

    sequences = MutableAlignment()
    # Re-open by name and close the handle once parsed.
    with open(self.options.alignment_file.name) as backbone_handle:
        sequences.read_file_object(backbone_handle)
    backbone_size = sequences.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    if options().backbone_size != backbone_size:
        # Raised explicitly (not via ``assert``) so the check is kept when
        # Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            "Backbone parameter needs to match actual size of backbone; "
            "backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size
    return ExhaustiveAlgorithm.check_options(self)
def generate_backbone(self):
    """Draw a random backbone from the input and align it with SATe.

    The sequences in ``self.options.sequence_file`` are divided into a
    randomly sampled backbone of ``backbone_size`` entries (``min(100, 20%
    of the taxa)`` when unset) and a query set holding the rest.  Both are
    written to temporary FASTA files and a ``SateAlignJob`` is run on the
    backbone.

    On return the shared options have ``placement_size``, ``alignment_file``,
    ``tree_file`` (both open file objects under ``outdir``) and
    ``fragment_file`` (the query temp file) filled in.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(100,
                                      int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample() only accepts sequences; passing a dict key view raises
    # TypeError on Python 3.11+.  list() keeps the original ordering.
    candidate_names = list(sequences.keys())
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(candidate_names, options().backbone_size))
    # Drop the backbone entries so ``sequences`` holds only the queries.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    align_job = SateAlignJob()
    # The alignment job is given 'protein' where the options say 'amino'.
    molecule_type = options().molecule
    if molecule_type == 'amino':
        molecule_type = 'protein'
    align_job.setup(backbone, options().backbone_size, self.options.outdir,
                    molecule_type, options().cpu)
    align_job.run()
    align_job.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are intentionally left open: they are stored on the
    # options object for later stages to read from.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    options().fragment_file = query
def check_options(self):
    """Check the inputs and work out backbone, placement and subset sizes.

    Accepted input combinations: tree + alignment + sequence file (the
    sequence file becomes the fragment file), or a sequence file alone (the
    backbone is generated from it).  Anything else logs an error and exits
    with status -1.

    When ``alignment_size`` is unset it is chosen automatically: 10 for
    amino-acid data; otherwise 10, doubled while the doubled value stays
    below the number of backbone taxa if the average p-distance of the
    backbone exceeds 0.60.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if ``backbone_size`` differs from the number of taxa
            in the backbone alignment.
    """
    # Local import: the file-level import block is not visible from here.
    import sys

    options().info_file = "A_dummy_value"

    tree_given = options().tree_file is not None
    alignment_given = options().alignment_file is not None
    sequences_given = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if tree_given and alignment_given and sequences_given:
        options().fragment_file = options().sequence_file
    elif not tree_given and not alignment_given and sequences_given:
        self.generate_backbone()
    else:
        _LOG.error(
            "Either specify the backbone alignment and tree and query "
            "sequences or only the query sequences. Any other "
            "combination is invalid")
        # The site-provided ``exit`` is intended for the interactive shell;
        # sys.exit raises the same SystemExit and is always available.
        sys.exit(-1)

    # One parse of the backbone alignment serves both the size check and
    # the p-distance estimate; the handle is closed as soon as it is read.
    backbone = MutableAlignment()
    with open(self.options.alignment_file.name) as backbone_handle:
        backbone.read_file_object(backbone_handle)
    backbone_size = backbone.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    if options().backbone_size != backbone_size:
        # ``assert`` is removed under ``python -O``; raise the same
        # exception type by hand so the validation always runs.
        raise AssertionError(
            "Backbone parameter needs to match actual size of backbone; "
            "backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().alignment_size is None:
        _LOG.info(
            "Alignment subset size not given. Calculating subset size. ")
        if options().molecule == 'amino':
            _LOG.warning(
                "Automated alignment subset selection not implemented for "
                "protein alignment. Setting to 10.")
            options().alignment_size = 10
        else:
            averagep, _maxp = backbone.get_p_distance()
            align_size = 10
            if averagep > .60:
                while align_size * 2 < backbone.get_num_taxa():
                    align_size = align_size * 2
            # Format spec fixed: the original "%f0.2" printed the full float
            # followed by a literal "0.2".
            _LOG.info(
                "Average p-distance of backbone is %0.2f. "
                "Alignment subset size set to %d. " % (averagep, align_size))
            options().alignment_size = align_size
    return ExhaustiveAlgorithm.check_options(self)
def generate_backbone(self):
    """Select full-length sequences as a backbone and align them with PASTA.

    Steps:
      1. Read ``self.options.sequence_file`` and degap the sequences.
      2. If ``median_full_length`` or ``full_length_range`` is set, move
         every sequence whose length lies outside the full-length window
         into the fragment set.  The window is the two integers in
         ``full_length_range`` when given, otherwise
         ``median * (1 -/+ backbone_threshold)``; a ``median_full_length``
         of ``-1`` means "use the median length of the input".
      3. Sample ``backbone_size`` of the remaining sequences (default
         ``min(1000, number of taxa)``, clamped to what is available) from
         the sorted names, write them to a temp file and run a
         ``PastaAlignJob`` on them.
      4. Copy the job's alignment and tree to the output names and store
         ``placement_size``, ``alignment_file`` and ``tree_file`` on the
         shared options.
      5. Write the remaining sequences plus fragments as the query file, or,
         when none remain, output the backbone alignment as the final
         result and exit with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()

    fragments = MutableAlignment()
    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            middle = count // 2
            # Odd count -> the single middle value; even count -> mean of the
            # two middle values.  The original had these branches swapped
            # and indexed one past the end for a single sequence.
            if count % 2:
                options().median_full_length = seq_lengths[middle]
            else:
                options().median_full_length = (
                    seq_lengths[middle - 1] + seq_lengths[middle]) / 2.0

        if options().full_length_range is not None:
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            median = options().median_full_length
            threshold = options().backbone_threshold
            min_length = int(median * (1 - threshold))
            max_length = int(median * (1 + threshold))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))

        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length
        ]
        if frag_names:
            _LOG.info("Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    # Never ask for more backbone sequences than are available.
    if options().backbone_size > len(list(sequences.keys())):
        options().backbone_size = len(list(sequences.keys()))

    # Sampling from the sorted names makes the draw depend only on the
    # random state, not on the order the sequences were read in.
    sample = sorted(
        random.sample(sorted(sequences.keys()), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pasta_job = PastaAlignJob()
    # The alignment job is given 'protein' where the options say 'amino'.
    molecule_type = options().molecule
    if molecule_type == 'amino':
        molecule_type = 'protein'
    pasta_job.setup(backbone, options().backbone_size, molecule_type,
                    options().cpu, **vars(options().pasta))
    pasta_job.run()
    (a_file, t_file) = pasta_job.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are intentionally left open: they are stored on the
    # options object for later stages to read from.
    options().alignment_file = open(self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # NOTE(review): presumably merges the fragments into the leftover
    # full-length sequences to form the query set -- confirm against
    # MutableAlignment.set_alignment.
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        # Nothing left to place: the backbone alignment is the final result.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_handle:
            sequences.read_file_object(backbone_handle)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)