def main(fin, fout, fout_weights):
    """
    Collapse repeated codon columns of an interleaved phylip alignment.

    Each distinct codon column is written once to fout, in order of first
    appearance, and the number of times it occurred in the input is
    written to fout_weights, one count per line in the same order.
    @param fin: open file for reading interleaved phylip alignment
    @param fout: open file for writing interleaved phylip alignment
    @param fout_weights: open file for writing codon column multiplicities,
    or None to skip writing the multiplicities
    @raise ValueError: if the alignment reader yields no records at all
    """
    # init the list of unique columns
    unique_col_list = []
    col_to_count = defaultdict(int)
    # The first record yielded by the reader is taken as the taxon names
    # and every later record is treated as a codon column.
    # NOTE(review): columns are used as dict keys and joined with spaces,
    # so they are presumably tuples of codon strings -- confirm in phylip.
    taxon_names = None
    for col in phylip.read_interleaved_codon_alignment(fin):
        if taxon_names is None:
            taxon_names = col
            continue
        if col not in col_to_count:
            unique_col_list.append(col)
        col_to_count[col] += 1
    if taxon_names is None:
        raise ValueError('the alignment has no taxon names')
    # get some output formatting info
    ntaxa = len(taxon_names)
    ljust_spacing = max([len(name) for name in taxon_names] + [9])
    # Write the interleaved phylip header.
    # Files are written with write() instead of the print statement
    # so that the code runs under both python 2 and python 3.
    fout.write(' %d %d\n' % (ntaxa, 3 * len(unique_col_list)))
    # write the output files one paragraph at a time
    ncols_per_paragraph = 15
    for offset in range(0, len(unique_col_list), ncols_per_paragraph):
        # Transpose the column list back into a paragraph.
        # The list() is required because python 3 zip cannot be indexed.
        cols = unique_col_list[offset:offset + ncols_per_paragraph]
        paragraph = list(zip(*cols))
        # write the weights corresponding to these columns
        if fout_weights is not None:
            fout_weights.write(
                    ''.join(str(col_to_count[col]) + '\n' for col in cols))
        # Write the paragraph.
        # Only the first paragraph is labeled with the taxon names.
        for i in range(ntaxa):
            label = '' if offset else taxon_names[i]
            # The single space after the padded label reproduces the
            # separator that the python 2 print statement used to insert.
            fout.write(
                    label.ljust(ljust_spacing) + ' ' +
                    ' '.join(paragraph[i]) + '\n')
        # a blank line ends each paragraph
        fout.write('\n')
def _require_sequential_indices(indices, description):
    """
    Raise ValueError unless the indices are 0, 1, 2, ... in order.
    @param indices: sequence of strings from the first column of a table
    @param description: short name of the table, used in the error message
    """
    # Compare against a list because under python 3 a range object
    # never compares equal to a list.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'the %s indices should count up from 0' % description)


def main(fin, fin_gcode, fin_taxa, fout):
    """
    Write the codon alignment as a tab separated table of integers.

    The genetic code file has tab separated rows of
    (index, amino acid, codon) and the optional taxon file has
    tab separated rows of (index, taxon name).
    In both files the indices must be 0, 1, 2, ... in order.
    Requested taxa that are not in the alignment get -1 in every row.
    @param fin: interleaved phylip codon alignment file open for reading
    @param fin_gcode: open file for reading the genetic code
    @param fin_taxa: optional open file for defining taxon subset and order
    @param fout: open file for writing the integer ndarray as text
    @raise ValueError: if the indices of a table are not sequential, or if
    a taxon subset is requested but the alignment has no taxon names
    """
    # read the description of the genetic code
    gcode_rows = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, _aminos, codons = zip(*gcode_rows)
    _require_sequential_indices(indices, 'genetic code')
    # Read the interleaved phylip alignment.
    # The first record yielded by the reader is taken as the taxon names
    # and every later record is treated as a codon column.
    taxon_names = None
    cols = []
    for col in phylip.read_interleaved_codon_alignment(fin):
        if taxon_names is None:
            taxon_names = col
        else:
            cols.append(col)
    # define the ndarray of integers
    # NOTE(review): the slicing below assumes that M_full has one row per
    # codon column and one column per taxon -- confirm in the design module.
    M_full = design.get_pattern_array(codons, cols)
    if fin_taxa is None:
        M = M_full
    else:
        # read the ordered taxon subset
        taxa_rows = list(csv.reader(fin_taxa, delimiter='\t'))
        indices, requested_taxa = zip(*taxa_rows)
        _require_sequential_indices(indices, 'taxon')
        if taxon_names is None:
            raise ValueError('the alignment has no taxon names')
        # init the pattern ndarray with unknown codon states
        M = np.full((len(cols), len(requested_taxa)), -1, dtype=int)
        # construct the inverse map of the default taxon ordering
        name_to_phylip_index = dict(
                (name, i) for i, name in enumerate(taxon_names))
        # Redefine the columns according to the user ordering and subsetting.
        # In this code we are pretending to be a database software.
        for i, name in enumerate(requested_taxa):
            phylip_index = name_to_phylip_index.get(name)
            if phylip_index is not None:
                M[:, i] = M_full[:, phylip_index]
    # write the ndarray of integers
    np.savetxt(fout, M, fmt='%d', delimiter='\t')