Exemplo n.º 1
0
def main(argv):
    """Write a tab-separated table of target-species clades found in a tree.

    ``argv`` is the argument list without the program name.  Only the first
    two entries are parsed as options (``-h`` or ``-t/--tree <treefile>``);
    every later entry is taken as a species name.

    The tree is read, midpoint-rooted when ``len(t) > 3``, passed through
    ``add_species`` and then split into monophyletic groups whose ``species``
    attribute is ``"ingroup"``.  One output line is written per group, with
    one column per species: the column holds ``<cluster>_<clade number>``
    when some leaf name in the group contains that species string and is
    empty otherwise.

    The output file name is the tree file name with ``.nwk`` replaced by
    ``_clades.txt``.

    Exits with status 0 after ``-h`` and for empty tree files, and with
    status 2 for bad options or a missing tree file argument.
    """
    usage = 'GetClades.py -t <treefile> -o <outfile> <species>'
    treefile = ''
    species_list = argv[2:]
    # NOTE(review): the usage text advertises "-o <outfile>", but only
    # argv[0:2] is parsed and the output name is always derived from the
    # tree file, so "-o" and its value would end up in species_list.
    try:
        opts, _ = getopt.getopt(argv[0:2], "ht:o:",
                                ["tree=", "alignment=", "species=", "consensus=", "log="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-t", "--tree"):
            treefile = arg

    # without a tree file there is nothing to do; report usage instead of
    # failing later inside path.getsize('')
    if not treefile:
        print(usage)
        sys.exit(2)

    # create outfilename from treefile
    outfilename = treefile.replace(".nwk", "_clades.txt")
    if outfilename == treefile:
        # no ".nwk" in the name: writing to the same path would truncate the
        # input tree, so append the suffix instead
        outfilename = treefile + "_clades.txt"

    # skip tree files with a size of zero (usually caused by too few
    # sequences to make a tree)
    if path.getsize(treefile) == 0:
        sys.exit()

    # read in tree, root by midpoint, and add info about target species
    t = Tree(treefile)
    if len(t) > 3:
        # midpoint rooting is skipped for very small trees; the original
        # author noted that three-taxon trees were causing an error here
        R = t.get_midpoint_outgroup()
        t.set_outgroup(R)
    t = add_species([t] + species_list)

    # extract cluster name from treefile (remove path and extension)
    cluster_name = path.split(treefile)[1].split(".")[0] + "_"

    # one row per clade, in discovery order; clade numbers start at 1
    clade_rows = []
    ingroup_clades = t.get_monophyletic(values=["ingroup"], target_attr="species")
    for clade_num, node in enumerate(ingroup_clades, 1):
        label = cluster_name + str(clade_num)
        row = [""] * len(species_list)  # one (initially empty) column per species
        for leaf in node:
            for i, species in enumerate(species_list):
                if species in leaf.name:
                    row[i] = label
        clade_rows.append(row)

    # write results; the with-block closes the file even if a write fails
    with open(outfilename, "w") as outfile_handle:
        for row in clade_rows:
            outfile_handle.write("\t".join(row) + "\n")
Exemplo n.º 2
0
def main(argv):
    """Build per-clade consensus sequences for one species and log them.

    ``argv`` is the argument list without the program name.  Recognised
    options: ``-t/--tree``, ``-a/--alignment``, ``-s/--species``,
    ``-c/--consensus``, ``-l/--log`` and ``-h``.

    The tree is midpoint-rooted and passed through ``add_species``.  For
    every monophyletic group whose ``species`` attribute equals the requested
    species, the aligned sequences of its leaves are collected, skipping
    leaves whose name contains 'kraussiana' or 'willdenowii'.  Groups with
    several sequences get a consensus record, singletons are written
    ungapped, and empty groups are skipped.  Each record is appended to the
    consensus file in FASTA format, and one line per source sequence is
    appended to the log file.

    Consensus names already used are kept in the pickle file ``names.p`` in
    the current directory, so that names stay unique across runs.

    Exits with status 0 after ``-h`` and with status 2 for bad options or
    when the tree or alignment file is not given.
    """
    usage = 'MakeConsensus.py -t <treefile> -a <alignmentfile> -s <species> -c <consensusfile> -l <logfile>'
    treefile = ''
    alnfile = ''  # was initialised under a different name, so omitting -a raised NameError
    consfilename = ''
    logfilename = ''
    species_name = ''
    try:
        opts, _ = getopt.getopt(argv, "ht:a:c:l:s:",
                                ["tree=", "alignment=", "species=", "consensus=", "log="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-t", "--tree"):
            treefile = arg
        elif opt in ("-a", "--alignment"):
            alnfile = arg
        elif opt in ("-c", "--consensus"):
            consfilename = arg
        elif opt in ("-l", "--log"):
            logfilename = arg
        elif opt in ("-s", "--species"):
            species_name = arg

    # the tree and the alignment are always read; report usage instead of
    # failing later inside the parsers
    if not treefile or not alnfile:
        print(usage)
        sys.exit(2)

    # make a list of names to ensure that there are no duplicates.
    # if saved list exists, use it instead
    namefile = "names.p"
    if path.exists(namefile):
        # NOTE: unpickling can execute arbitrary code; names.p must come from
        # a trusted earlier run of this script
        with open(namefile, "rb") as name_handle:
            name_list = pickle.load(name_handle)
    else:
        name_list = {}
    name_num = 1

    # read in tree and root by midpoint
    # NOTE(review): there is no guard for very small trees here; confirm
    # whether three-taxon trees can reach this script
    t = Tree(treefile)
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)

    # read in alignment and create index of sequence ids to access
    # individual sequences by id
    aln = AlignIO.read(alnfile, "phylip-relaxed")
    aln_index = {}
    for seq_num, seq in enumerate(aln):
        aln_index[seq.id] = seq_num

    # cluster number is the alignment file name without path and extension
    cluster_num = path.split(alnfile)[1].split(".")[0]

    # Add "species" feature to leaves on tree for easy lookup
    t = add_species(t)

    # cycle through monophyletic groups and build consensus sequences
    for node in t.get_monophyletic(values=[species_name], target_attr="species"):
        species_seqs = []
        for leaf in node:
            # exclude reference sequence from consensus
            if 'kraussiana' not in leaf.name and 'willdenowii' not in leaf.name:
                species_seqs.append(aln[aln_index[leaf.name]])

        if len(species_seqs) > 1:
            # create consensus sequence from sequence list
            species_aln = MultipleSeqAlignment(species_seqs)
            # use pad_ends to convert end gaps to ambiguous
            consensus = smart_consensus(MultipleSeqAlignment(pad_ends(species_aln)))
            # convert back to gap characters (need to convert seq to a list)
            consensus = pad_ends([consensus], '-', 'N')[0]
            name = pick_name(species_seqs) + "_cons"
            # on a clash, retry with "<prefix>_<n>_cons" until the name is free
            while name in name_list:
                name = name.split("_")[0] + "_" + str(name_num) + "_cons"
                name_num = name_num + 1
            name_list[name] = 1
            name_num = 1
            consensus = SeqRecord(Seq(str(consensus).replace('-', '')),
                                  id=name,
                                  description=name)
        elif len(species_seqs) == 1:
            # for singletons, just write ungapped version of original
            consensus = species_seqs[0]
            consensus.seq = consensus.seq.ungap("-")
            name = consensus.id
        else:
            continue

        # write consensus sequence to consensus file
        with open(consfilename, "a") as consfile:
            consfile.write(consensus.format("fasta"))

        # write summary of seqs represented by cons to log file
        with open(logfilename, "a") as logfile:
            for seq in species_seqs:
                logfile.write("%s\n" % ", ".join([seq.id, name, cluster_num]))

    # save list of names for future uses of script
    with open(namefile, "wb") as name_handle:
        pickle.dump(name_list, name_handle)