def main(argv):
    """Write a table of the monophyletic "ingroup" clades found in a tree.

    Usage: GetClades.py -t <treefile> [-o <outfile>] <species> [<species> ...]

    The tree is read from <treefile>, re-rooted on its midpoint outgroup when
    it has more than three leaves, and handed to add_species() together with
    the species names.  Every node yielded by
    get_monophyletic(values=["ingroup"], target_attr="species") becomes one
    tab-separated output row with one column per species named on the command
    line: a column holds "<cluster>_<clade number>" when some leaf name in
    the clade contains that species string, and is empty otherwise.

    Args:
        argv: command-line arguments without the program name.

    Exits with status 2 on a usage error (unknown option or no tree file),
    and exits without writing anything when the tree file has size zero.
    """
    usage = 'GetClades.py -t <treefile> -o <outfile> <species>'
    treefile = ''
    outfilename = ''

    # Parse the whole argument vector.  getopt stops at the first positional
    # argument, so everything after the options is the species list; this
    # also lets -o and the --tree=<file> spelling work, which slicing argv
    # at a fixed position could not.
    try:
        opts, positional = getopt.getopt(
            argv, "ht:o:",
            ["tree=", "alignment=", "species=", "consensus=", "log="])
    except getopt.GetoptError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage)
        sys.exit(2)
    species_list = list(positional)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-t", "--tree"):
            treefile = arg
        elif opt == "-o":
            outfilename = arg

    if not treefile:
        # Without a tree there is nothing to do; report usage instead of
        # failing later inside path.getsize('').
        print(usage)
        sys.exit(2)

    # Derive the output name from the tree file unless -o supplied one.
    # Only a trailing ".nwk" is replaced; any other name gets the suffix
    # appended so the output can never overwrite the input tree.
    if not outfilename:
        if treefile.endswith(".nwk"):
            outfilename = treefile[:-len(".nwk")] + "_clades.txt"
        else:
            outfilename = treefile + "_clades.txt"

    # Skip tree files with a size of zero (usually caused by too few
    # sequences to make a tree).
    if path.getsize(treefile) == 0:
        sys.exit()

    # Read in the tree and root it by midpoint unless there are only three
    # taxa, which was causing an error.
    t = Tree(treefile)
    if len(t) > 3:
        t.set_outgroup(t.get_midpoint_outgroup())

    # add_species is defined elsewhere in the project; it receives a single
    # list holding the tree followed by the species names.
    t = add_species([t] + species_list)

    # Cluster name: tree file name without directory or extension(s).
    cluster_name = path.basename(treefile).split(".")[0] + "_"

    # One row per clade, in the order the clades are found; clade numbers
    # start at 1.
    rows = []
    clades = t.get_monophyletic(values=["ingroup"], target_attr="species")
    for clade_num, node in enumerate(clades, 1):
        label = cluster_name + str(clade_num)
        row = [""] * len(species_list)
        for leaf in node:
            for column, species in enumerate(species_list):
                if species in leaf.name:
                    row[column] = label
        rows.append(row)

    # The with-block closes the file even if a write fails.
    with open(outfilename, "w") as outfile_handle:
        for row in rows:
            outfile_handle.write("\t".join(row) + "\n")
def main(argv):
    """Append per-clade consensus sequences for one species to a FASTA file.

    Usage: MakeConsensus.py -t <treefile> -a <alignmentfile> -s <species>
                            -c <consensusfile> -l <logfile>

    The tree is rooted on its midpoint outgroup and annotated by
    add_species().  For every node yielded by
    get_monophyletic(values=[species], target_attr="species") the aligned
    sequences of its leaves are collected (leaves whose name contains
    'kraussiana' or 'willdenowii' are left out as reference sequences):

    * two or more sequences -> a consensus is built with smart_consensus()
      and written under a unique "<name>_cons" id;
    * exactly one sequence  -> it is written ungapped under its own id;
    * none                  -> the clade is skipped.

    Each written record is appended to <consensusfile>, and one
    "seq_id, consensus_name, cluster" line per contributing sequence is
    appended to <logfile>.  Consensus names already used are kept in the
    pickle file "names.p" in the working directory, so later runs do not
    reuse them.

    Args:
        argv: command-line arguments without the program name.

    Exits with status 2 on a usage error (unknown option, or no tree or
    alignment file given).
    """
    usage = ('MakeConsensus.py -t <treefile> -a <alignmentfile> -s <species> '
             '-c <consensusfile> -l <logfile>')
    treefile = ''
    alnfile = ''
    consfilename = ''
    logfilename = ''
    species_name = ''

    try:
        opts, args = getopt.getopt(
            argv, "ht:a:c:l:s:",
            ["tree=", "alignment=", "species=", "consensus=", "log="])
    except getopt.GetoptError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-t", "--tree"):
            treefile = arg
        elif opt in ("-a", "--alignment"):
            alnfile = arg
        elif opt in ("-c", "--consensus"):
            consfilename = arg
        elif opt in ("-l", "--log"):
            logfilename = arg
        elif opt in ("-s", "--species"):
            species_name = arg

    if not treefile or not alnfile:
        # Neither input can be defaulted; report usage rather than failing
        # later while reading them.
        print(usage)
        sys.exit(2)

    # Names used so far, kept to ensure that there are no duplicates.  If a
    # saved list exists, use it instead of starting empty.
    namefile = "names.p"
    if path.exists(namefile):
        # NOTE(review): unpickling executes whatever the file encodes; only
        # run this where names.p is known to come from this script.
        with open(namefile, "rb") as handle:
            name_list = pickle.load(handle)
    else:
        name_list = {}
    # Always initialised, so the renaming loop below works whether or not a
    # saved name list was loaded.
    name_num = 1

    # Read in the tree and root it by midpoint.
    t = Tree(treefile)
    t.set_outgroup(t.get_midpoint_outgroup())

    # Read in the alignment and index record positions by sequence id so
    # individual sequences can be fetched by leaf name.
    aln = AlignIO.read(alnfile, "phylip-relaxed")
    aln_index = {record.id: position for position, record in enumerate(aln)}

    # Add the "species" feature to leaves for easy lookup (add_species is
    # defined elsewhere in the project).
    t = add_species(t)

    # Cluster label for the log: alignment file name without directory or
    # extension(s).  It does not change between clades.
    cluster_num = path.split(alnfile)[1].split(".")[0]

    # Cycle through monophyletic groups and build consensus sequences.
    for node in t.get_monophyletic(values=[species_name],
                                   target_attr="species"):
        # Exclude reference sequences from the consensus.
        species_seqs = [
            aln[aln_index[leaf.name]]
            for leaf in node
            if 'kraussiana' not in leaf.name
            and 'willdenowii' not in leaf.name
        ]

        if len(species_seqs) > 1:
            species_aln = MultipleSeqAlignment(species_seqs)
            # pad_ends/smart_consensus are project helpers: per the original
            # notes, the first pad_ends call converts end gaps to ambiguous
            # characters and the second converts them back to gaps.
            consensus = smart_consensus(
                MultipleSeqAlignment(pad_ends(species_aln)))
            consensus = pad_ends([consensus], '-', 'N')[0]

            # Pick a name and number it until it is unused.
            name = pick_name(species_seqs) + "_cons"
            while name in name_list:
                name = name.split("_")[0] + "_" + str(name_num) + "_cons"
                name_num = name_num + 1
            name_list[name] = 1
            name_num = 1

            consensus = SeqRecord(Seq(str(consensus).replace('-', '')),
                                  id=name, description=name)
        elif len(species_seqs) == 1:
            # For singletons, just write an ungapped version of the original.
            consensus = species_seqs[0]
            consensus.seq = consensus.seq.ungap("-")
            name = consensus.id
        else:
            continue

        # Append the record to the consensus file.
        with open(consfilename, "a") as consfile:
            consfile.write(consensus.format("fasta"))

        # Append a summary of the sequences this record represents.
        with open(logfilename, "a") as logfile:
            for seq in species_seqs:
                logfile.write(
                    "%s\n" % ", ".join([seq.id, name, cluster_num]))

    # Save the list of names for future runs of the script.
    with open(namefile, "wb") as handle:
        pickle.dump(name_list, handle)