def mergeRegionAssemblies(regionDirList, options):
    """
    Merge per-region Trinity assemblies into a single FASTA file.

    For each prefix in regionDirList the file
    '<prefix>.trinity.Trinity.fasta' is parsed when it exists; prefixes
    without such a file are skipped.  Every sequence is renamed to
    'Transcript<n>|c<x>_g<y>_<i>' where n counts sequences across all
    regions, x and y are running counters that advance each time a
    component or gene label is first met within the current region, and
    i is the third '_'-separated field of the original id.  The record
    description is '<chrid>:<regid> len=<length>': chrid is the
    second-to-last '/'-separated part of the prefix and regid is the last
    part with its first six characters dropped.

    The merged records are written to '<options.outpath>/Transcript.fa'.

    @param regionDirList: iterable of '/'-separated path prefixes
    @param options: object with an outpath attribute
    """
    transcript_total = 0
    component_total = 0
    gene_total = 0
    sequences = {}
    descriptions = {}
    id_order = []
    for tmpdir in regionDirList:
        fasta_path = '%s.trinity.Trinity.fasta' % tmpdir
        if not os.path.isfile(fasta_path):
            continue
        # labels are remembered per region; the counters are global
        seen_components = set()
        seen_genes = set()
        parsed = Fasta.Parse(fasta_path)
        for seqid in parsed.id:
            transcript_total += 1
            component, gene, isoform = seqid.split('|')[1].split('_')
            if component not in seen_components:
                seen_components.add(component)
                component_total += 1
            if gene not in seen_genes:
                seen_genes.add(gene)
                gene_total += 1
            newid = 'Transcript%d|c%d_g%d_%s' % (
                transcript_total, component_total, gene_total, isoform)
            path_parts = tmpdir.split('/')
            chrid = path_parts[-2]
            regid = path_parts[-1][6:]
            sequence = parsed.seq[seqid]
            sequences[newid] = sequence
            descriptions[newid] = '%s:%s len=%d' % (
                chrid, regid, len(sequence))
            id_order.append(newid)
    Fasta.write(
        sequences, '%s/Transcript.fa' % options.outpath,
        orderlist=id_order, description=descriptions)
def get_response_content(fs):
    """
    Analyze an alignment under a three-component nucleotide mixture model.

    @param fs: object exposing the fields tree, weight_a..weight_c,
        matrix_a..matrix_c and alignment as attributes
    @return: the string produced by do_analysis followed by a newline
    @raise HandlingError: if a rate matrix is not 4x4 or the alignment
        is rejected by the Fasta module
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # collect the mixture weights and the rate matrices
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    if any(R.shape != (4, 4) for R in matrices):
        raise HandlingError('expected each nucleotide rate matrix to be 4x4')
    # read the alignment and restrict it to nucleotide columns
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # turn the weights into proportions
    total_weight = sum(weights)
    mixture_proportions = [w / total_weight for w in weights]
    # wrap each matrix in a rate matrix object over the states A, C, G, T
    ordered_states = list('ACGT')
    rate_matrix_objects = [
        RateMatrix.RateMatrix(R.tolist(), ordered_states)
        for R in matrices]
    # build and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
        mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # report
    return do_analysis(mixture_model, alignment, tree) + '\n'
def get_response_content(fs):
    """
    Simulate ancestral sequences under the Jukes-Cantor model.

    @param fs: object exposing the fields tree and fasta as attributes
    @return: the fasta sequences of all tree nodes in preorder,
        newline separated and newline terminated
    @raise HandlingError: on an invalid alignment or a failed simulation
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the alignment and restrict it to nucleotide columns
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # extend the alignment with simulated ancestral sequences
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, rate_matrix_object)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # order the output sequences by a preorder traversal of the tree
    records = [
        alignment.get_fasta_sequence(node.name)
        for node in tree.preorder()]
    return '\n'.join(records) + '\n'
def get_response_content(fs):
    """
    Report the maximum likelihood distance between two aligned sequences.

    @param fs: object exposing the fields fasta, matrix and states
        as attributes
    @return: a one-line report of the estimated distance
    @raise HandlingError: if the alignment is invalid, does not hold
        exactly two sequences, or the states do not match the matrix
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # read the rate matrix and the ordered states
    R = fs.matrix
    ordered_states = Util.get_stripped_lines(fs.states.splitlines())
    if len(ordered_states) != len(R):
        raise HandlingError(
            'the number of ordered states must be the same '
            'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # the objective function is minimized at the mle distance
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    objective = Objective(alignment.sequences, rate_matrix_object)
    # Golden section search; the bracket is only a starting suggestion.
    mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
    # format the response
    return 'maximum likelihood distance: %s\n' % (mle_distance,)
def test_seq_len(self):
    """Every record read from the gzipped fixture has one of the known lengths."""
    seq_lens = [12, 14, 9, 11, 16]
    _file = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    # The context manager closes the gzip handle even when an assertion
    # fails part way through the file.
    with gzip.open(_file, 'rt') as ifile:
        for _fas in Fasta.Fasta(ifile):
            # assertIn reports the offending value on failure
            self.assertIn(_fas.length, seq_lens)
def make_xml(start_pos, stop_pos, nsamples):
    """
    Write the BEAST xml input to a temporary file.

    @param start_pos: value substituted for the 'from' attribute of the
        patterns element
    @param stop_pos: value substituted for the 'to' attribute of the
        patterns element
    @param nsamples: forwarded to get_xml_post_alignment
    @return: location of xml file, location of log file
    """
    out = StringIO()
    out.write('%s\n' % (g_xml_pre_alignment,))
    out.write(""" <!-- The sequence alignment (each sequence refers to a taxon above). --> <alignment id="alignment" dataType="nucleotide"> """ + '\n')
    # one sequence element per fasta record
    fasta_lines = g_fasta_string.splitlines()
    for header, seq in Fasta.gen_header_sequence_pairs(fasta_lines):
        out.write('<sequence>\n')
        out.write('<taxon idref="%s"/>' % header + '\n')
        out.write('%s\n' % (seq,))
        out.write('</sequence>\n')
    out.write('</alignment>\n')
    out.write(""" <patterns id="firsthalf.patterns" from="%d" to="%d"> <alignment idref="alignment"/> </patterns> """ % (start_pos, stop_pos) + '\n')
    out.write('%s\n' % (get_xml_post_alignment(nsamples),))
    # the log file name is chosen here so that it can be named in the xml
    log_loc = Util.get_tmp_filename(prefix='beast', suffix='.log')
    out.write('%s\n' % (get_log_xml(log_loc),))
    xml_loc = Util.create_tmp_file(
        out.getvalue(), prefix='beast', suffix='.xml')
    return xml_loc, log_loc
def get_response_content(fs):
    """
    Sample sequences on a tree with JC69 and return them as fasta text.

    @param fs: object exposing the fields tree, order and length
        as attributes
    @return: the sampled alignment as a fasta string plus a newline
    @raise HandlingError: if a non-empty list of ordered names does not
        name exactly the tips of the tree
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the sequence order if it exists
    ordered_names = Util.get_stripped_lines(fs.order.splitlines())
    if ordered_names:
        observed_name_set = set(ordered_names)
        expected_name_set = set(node.get_name() for node in tree.gen_tips())
        extra_names = observed_name_set - expected_name_set
        missing_names = expected_name_set - observed_name_set
        if extra_names:
            msg_a = 'the list of ordered names includes these names '
            msg_b = 'not found in the tree: %s' % str(tuple(extra_names))
            raise HandlingError(msg_a + msg_b)
        if missing_names:
            msg_a = 'the tree includes these names not found in the list '
            msg_b = 'of ordered names: %s' % str(tuple(missing_names))
            raise HandlingError(msg_a + msg_b)
    else:
        # No explicit order was given: fall back to the tip order of the
        # tree.  The loop variable must be the one whose name is read;
        # the previous code iterated with 'name' but read 'tip'.
        ordered_names = [tip.get_name() for tip in tree.gen_tips()]
    # do the sampling
    sampled_sequences = JC69.sample_sequences(tree, ordered_names, fs.length)
    alignment = Fasta.create_alignment(ordered_names, sampled_sequences)
    # return the response
    return alignment.to_fasta_string() + '\n'
def main():
    """Estimate per-column mle rates for the Brown example data and print them."""
    # the alignment
    print('creating the alignment...')
    alignment = Fasta.Alignment(
        StringIO(Fasta.brown_example_alignment.strip()))
    # the tree
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    # a normalized HKY85 rate matrix with uniform nucleotide frequencies
    print('creating the rate matrix object...')
    distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    kappa = 2.0
    unscaled = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
        unscaled.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # the mle rates and their stockholm rendering
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
def simulate_ancestral_alignment(tree, alignment, substitution_model):
    """
    @param tree: a newick tree with branch lengths
    @param alignment: a Fasta Alignment object with headers that match
        the tree tip names
    @param substitution_model: a way to simulate ancestral states
        from a tree given its leaf states
    @return: a Fasta Alignment object holding the input sequences
        followed by the simulated ancestral sequences
    @raise SimulationError: if a branch length is missing or not
        positive, or an internal node is unnamed
    """
    # validate the tree before doing any work
    for node in tree.gen_non_root_nodes():
        branch_length = node.get_branch_length()
        if branch_length is None or branch_length <= 0:
            raise SimulationError('all branch lengths should be positive')
    for node in tree.gen_internal_nodes():
        if not node.name:
            raise SimulationError('all internal nodes should be named')
    # one growing list of states per internal node
    simulated_ancestors = dict(
        (node.name, []) for node in tree.gen_internal_nodes())
    for col in alignment.columns:
        # pin each tip to its observed letter in this column
        name_to_letter = dict(zip(alignment.headers, col))
        for tip in tree.gen_tips():
            tip.state = name_to_letter[tip.name]
        # sample the internal states and record them
        substitution_model.simulate_ancestral_states(tree)
        for node in tree.gen_internal_nodes_preorder():
            simulated_ancestors[node.name].append(node.state)
    # assemble fasta text: the observed sequences, then the ancestors
    fasta_lines = [alignment.to_fasta_string()]
    for header, sequence in simulated_ancestors.items():
        fasta_lines.append('>' + header)
        fasta_lines.append(''.join(sequence))
    fasta_string = '\n'.join(fasta_lines) + '\n'
    return Fasta.Alignment(StringIO(fasta_string))
def make_xml(start_pos, stop_pos, nsamples):
    """
    Assemble the BEAST xml document and store it in a temporary file.

    @param start_pos: formatted into the 'from' attribute of the
        patterns element
    @param stop_pos: formatted into the 'to' attribute of the
        patterns element
    @param nsamples: passed on to get_xml_post_alignment
    @return: location of xml file, location of log file
    """
    out = StringIO()

    def emit(text):
        # write one newline-terminated chunk to the document
        out.write('%s\n' % (text,))

    emit(g_xml_pre_alignment)
    emit(""" <!-- The sequence alignment (each sequence refers to a taxon above). --> <alignment id="alignment" dataType="nucleotide"> """)
    # each fasta record becomes a sequence element
    for header, seq in Fasta.gen_header_sequence_pairs(
            g_fasta_string.splitlines()):
        emit('<sequence>')
        emit('<taxon idref="%s"/>' % header)
        emit(seq)
        emit('</sequence>')
    emit('</alignment>')
    emit(""" <patterns id="firsthalf.patterns" from="%d" to="%d"> <alignment idref="alignment"/> </patterns> """ % (start_pos, stop_pos))
    emit(get_xml_post_alignment(nsamples))
    # the log location has to be known before the logging xml is written
    log_loc = Util.get_tmp_filename(prefix='beast', suffix='.log')
    emit(get_log_xml(log_loc))
    xml_loc = Util.create_tmp_file(
        out.getvalue(), prefix='beast', suffix='.xml')
    return xml_loc, log_loc
def get_response_content(fs):
    """
    Simulate a pair of nucleotide sequences under the F84 model.

    @param fs: object exposing the fields A, C, G, T, kappa, distance
        and length as attributes
    @return: the simulated pair as fasta text plus a newline
    @raise HandlingError: if the purine or pyrimidine frequency is not
        positive or kappa is too small for the frequencies
    """
    # normalize the nucleotide weights to probabilities
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    weight_total = float(sum(nt_weights))
    nt_probs = [w / weight_total for w in nt_weights]
    # Check that kappa is compatible with the purine (R) and
    # pyrimidine (Y) frequencies.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        raise HandlingError(
            'kappa must be greater than max(-R, -Y) '
            'where R and Y are the purine and pyrimidine frequencies')
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate the two sequences
    first, second = PairLikelihood.simulate_sequence_pair(
        fs.distance, model, fs.length)[:2]
    # round-trip through an alignment object to get canonical fasta text
    fasta_text = '\n'.join([
        '>first', ''.join(first),
        '>second', ''.join(second)]) + '\n'
    return Fasta.Alignment(StringIO(fasta_text)).to_fasta_string() + '\n'
def parseGumby(gumbyFile, exonFile, baseSeq):
    """
    Parse a gumby output file into an alignment.Alignment.

    Lines of gumbyFile are grouped into blocks; a new block begins at
    every line starting with "start", from which the p-value, length and
    score of the block are read.  Each completed block is handed to
    procData together with the exon intervals and the result is appended
    to the returned object.  According to the original author's note the
    intent is to drop blocks overlapping exons and blocks consisting only
    of gaps on baseSeq; that filtering is not visible here and is
    presumably done by procData -- TODO confirm.

    @param gumbyFile: path of the gumby output file
    @param exonFile: path of a whitespace separated exon file, or None;
        columns 4 and 5 are read as integers for rows whose first column,
        lower-cased, equals baseSeq
    @param baseSeq: name of the base sequence, forwarded to procData
    @return: an alignment.Alignment extended with the procData results
    """
    # NOTE(review): infile (and fh below) are never closed explicitly.
    infile = open(gumbyFile, "r")
    exons = []
    if exonFile!=None:
        fh = open(exonFile, "r")
        for l in fh:
            fs = l.split()
            # keep only the exons annotated on the base sequence
            if fs[0].lower()!=baseSeq:
                continue
            exons.append([ int(fs[3]), int(fs[4]) ] )
    # print exons
    # a sequence line looks like "<name> <start> <end> <sequence>";
    # compile is presumably re.compile imported by name -- TODO confirm
    re1 = compile("[a-z]+[ ]+[0-9]+[ ]+[0-9]+")
    seqs = {}
    pos = {}
    i = -1  # index of the current block; -1 until the first "start" line
    resultLst = alignment.Alignment()
    for l in infile:
        l = l.strip()
        # map the marker characters '*', '<' and '>' to gaps
        l = l.replace("*","-")
        l = l.replace("<", "-")
        l = l.replace(">", "-")
        if l.startswith("start"):
            # a new block begins: flush the previous one, if any
            if i!=-1:
                resultLst.extend(procData(baseSeq, exons, i, seqs, pos, pval, length, score))
            f = l.split()
            pval = float(f[-1])
            length = int(f[6].strip(","))
            score = int(f[8].strip(","))
            i+=1
            seqs={}
        if re1.match(l):
            f = l.split()
            name = f[0]
            # coordinates in the file are shifted down by one
            start = int(f[1])-1
            end = int(f[2])-1
            seq = f[3]
            if name not in seqs:
                # first chunk of this sequence within the block
                faseq = Fasta.FastaSeq(name, seq)
                faseq.chrom = name
                faseq.start = start
                faseq.end = end
                seqs[name] = faseq
            else:
                # continuation chunk: append to the sequence seen so far
                faseq = seqs[f[0]]
                faseq.nucl += f[3]
            pos[name] = (name, start,end)
    # flush the last block
    # NOTE(review): pval, length and score are only bound once a "start"
    # line has been read; for a file without one this call raises
    # UnboundLocalError.
    resultLst.extend(procData(baseSeq, exons, i, seqs, pos, pval, length, score))
    return resultLst
def test_string_filename(self):
    """Records read via a plain filename carry only known sequences."""
    known_seq = [
        'GGGACAGGGGGC',
        'GGGACTGGGGGGGC',
        'ATGGCATAT',
        'ATGGCATATCA',
        'ATCGGAGGGATACGAG',
    ]
    ifile = os.path.join(os.getcwd(), "_test_fasta.fa")
    _test_fasta = Fasta.Fasta(ifile)
    for _fas in _test_fasta:
        # assertIn reports the unexpected sequence on failure, where
        # assertTrue(x in y) only says "False is not true"
        self.assertIn(_fas.sequence, known_seq)
def test_open_filename(self):
    """Records read via an open gzip handle carry only known sequences."""
    known_seq = [
        'GGGACAGGGGGC',
        'GGGACTGGGGGGGC',
        'ATGGCATAT',
        'ATGGCATATCA',
        'ATCGGAGGGATACGAG',
    ]
    _file = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    # The context manager closes the gzip handle even when an assertion
    # fails part way through the file.
    with gzip.open(_file, 'rt') as ifile:
        for _fas in Fasta.Fasta(ifile):
            # assertIn reports the unexpected sequence on failure
            self.assertIn(_fas.sequence, known_seq)
def get_amino_acid_alignment(table):
    """
    @param table: a table of data in some random format
        sent by Ferran Casals; the first row is the header, the first
        five columns are annotations and every further column holds
        the codons of one taxon
    @return: a Fasta amino acid alignment object
    @raise HandlingError: if the table layout is unexpected or a codon
        is a stop codon or is not recognized
    """
    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    header = table[0]
    ncols = len(header)
    if ncols < 6:
        raise HandlingError(
            'the first row of the table has %d columns '
            'but at least six were expected' % ncols)
    # upper-cased title of each fixed column and the complaint raised
    # when the header disagrees; checked from left to right
    fixed_columns = (
        ('VARIANT',
            'expected the first column to be the variant'),
        ('CHR',
            'expected the second column to be the chromosome'),
        ('POSITION',
            'expected the third column to be the position'),
        ('AMINO ACID CHANGE',
            'expected the fourth column to be the amino acid change'),
        ('ALLELES',
            'expected the fifth column to be the nucleotide change'),
    )
    for title, (expected, complaint) in zip(header, fixed_columns):
        if title.upper() != expected:
            raise HandlingError(complaint)
    data_rows = table[1:]
    for row in data_rows:
        if len(row) != ncols:
            raise HandlingError(
                'each row should have the same number of columns')
    # the taxa are named by the remaining header columns
    taxa = header[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # transpose the rows so that each taxon gets its column of codons
    codon_sequences = list(zip(*data_rows))[5:]
    # translate every column of codons into an amino acid string
    aa_sequences = []
    for codon_sequence in codon_sequences:
        letters = []
        for raw_codon in codon_sequence:
            codon = raw_codon.upper()
            if codon == 'ND':
                # 'ND' entries become gaps
                letters.append('-')
            elif codon in Codon.g_non_stop_codons:
                letters.append(Codon.g_codon_to_aa_letter[codon])
            elif codon in Codon.g_stop_codons:
                raise HandlingError(
                    'one of the codons is a stop codon: %s' % codon)
            else:
                raise HandlingError(
                    'one of the codons is invalid: %s' % codon)
        aa_sequences.append(''.join(letters))
    return Fasta.create_alignment(taxa, aa_sequences)
def calcGC(targetDic, faPath):
    """
    Compute the GC fraction of every target interval.

    @param targetDic: mapping chrid -> {item: sequence whose first two
        entries are used as slice bounds into the chromosome sequence}
    @param faPath: path handed to Fasta.Parse
    @return: mapping chrid -> {item: GC fraction as a float}
    @raise ZeroDivisionError: if an interval selects an empty sequence
    """
    gc_by_chrom = {}
    fa = Fasta.Parse(faPath)
    for chrid in targetDic:
        # progress output, one line per chromosome
        print(chrid)
        gc_by_chrom[chrid] = {}
        for item in targetDic[chrid]:
            bounds = targetDic[chrid][item]
            seq = fa.seq[chrid][bounds[0]:bounds[1]].upper()
            gc_count = seq.count('G') + seq.count('C')
            # float() forces true division; with two integers Python 2
            # floors the quotient, which turned every fraction into 0 or 1
            gc_by_chrom[chrid][item] = float(gc_count) / len(seq)
    return gc_by_chrom
def test_likelihood_calculation(self):
    """Smoke test: the log likelihood of the sample data can be computed."""
    # the tree
    tree = Newick.parse(sample_tree_string, Newick.NewickTree)
    # the mixture model, deserialized from the sample xml
    model = deserialize_mixture_model(get_sample_xml_string())
    # the codon alignment
    alignment = Fasta.CodonAlignment(
        StringIO(long_sample_codon_alignment_string))
    # the computation must run through without raising
    log_likelihood = PhyLikelihood.get_log_likelihood(
        tree, alignment, model)
def get_amino_acid_alignment(table):
    """
    @param table: a table of data in some random format
        sent by Ferran Casals; row one is the header, columns one to
        five are annotations and each later column lists the codons
        of one taxon
    @return: a Fasta amino acid alignment object
    @raise HandlingError: if the table layout is unexpected or a codon
        is a stop codon or is not recognized
    """

    def translate(raw_codon):
        # map one table entry to an amino acid letter, or '-' for 'ND'
        codon = raw_codon.upper()
        if codon == 'ND':
            return '-'
        if codon in Codon.g_non_stop_codons:
            return Codon.g_codon_to_aa_letter[codon]
        if codon in Codon.g_stop_codons:
            raise HandlingError('one of the codons is a stop codon: %s' % codon)
        raise HandlingError('one of the codons is invalid: %s' % codon)

    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    first_row = table[0]
    width = len(first_row)
    if width < 6:
        raise HandlingError('the first row of the table has %d columns '
                'but at least six were expected' % width)
    # validate the titles of the five annotation columns in order
    required_titles = [
        ('VARIANT', 'expected the first column to be the variant'),
        ('CHR', 'expected the second column to be the chromosome'),
        ('POSITION', 'expected the third column to be the position'),
        ('AMINO ACID CHANGE',
            'expected the fourth column to be the amino acid change'),
        ('ALLELES',
            'expected the fifth column to be the nucleotide change'),
    ]
    for index, (required, message) in enumerate(required_titles):
        if first_row[index].upper() != required:
            raise HandlingError(message)
    remaining_rows = table[1:]
    if any(len(row) != width for row in remaining_rows):
        raise HandlingError(
            'each row should have the same number of columns')
    # the ordered taxa follow the annotation columns
    taxa = first_row[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # one tuple of codons per taxon
    codon_columns = list(zip(*remaining_rows))[5:]
    # translate column by column, codon by codon
    aa_sequences = []
    for codon_column in codon_columns:
        aa_sequences.append(
            ''.join([translate(codon) for codon in codon_column]))
    return Fasta.create_alignment(taxa, aa_sequences)
def runPairWiseDiffs(self, fastaFileNames):
    """
    Load the sequences and the cached alignment matrices, then update them.

    Every fasta file is read into a dict keyed by the file's base name
    (extension removed) and then by the safeName() of each record title.
    For every base name the pickled matrix
    '<statsdir>/<base name>_matrix.pickle' is loaded when it exists and is
    not empty; otherwise an empty dict is used.  Both structures are then
    passed to self.updateAlignmentMatrix.

    @param fastaFileNames: iterable of fasta file paths
    """
    # Trailing comma: Python 2 print statement without a newline, so that
    # the final 'done' lands on the same line.
    print('Calculating pairwise diffenrences...'),

    # Read in fasta sequences into a dictionary:
    completeSets = {}
    for fastaFileName in fastaFileNames:
        baseName = os.path.splitext(os.path.basename(fastaFileName))[0]
        records = completeSets[baseName] = {}
        # 'with' closes the file even if parsing raises
        with open(fastaFileName, 'r') as fastaFile:
            fastaIterator = Fasta.Iterator(
                fastaFile, parser=Fasta.RecordParser())
            for fastaRecord in fastaIterator:
                newName = safeName(copy.copy(fastaRecord.title))
                records[newName] = fastaRecord.sequence

    # Load existing alignment matrix
    alignmentMatrices = {}
    for fastaFileBaseName in completeSets:
        alignmentMatrices[fastaFileBaseName] = {}
        alignmentMatrixFileName = os.path.join(
            self.options.statsdir, fastaFileBaseName + "_matrix.pickle")
        if (os.path.exists(alignmentMatrixFileName)
                and os.path.getsize(alignmentMatrixFileName) > 0):
            # NOTE: unpickling runs code chosen by whoever wrote the file,
            # so the stats directory must only contain trusted files.
            with open(alignmentMatrixFileName, 'r') as alignmentMatrixFile:
                alignmentMatrices[fastaFileBaseName] = pickle.load(
                    alignmentMatrixFile)

    # Add any new alignments to alignment matrix (and save to them to file)
    self.updateAlignmentMatrix(alignmentMatrices, completeSets)

    print('done')
def test_simulation(self):
    """Smoke test: ancestral simulation runs on the Brown example data."""
    tree_string = '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;'
    # the example tree, with named internal nodes
    tree = Newick.parse(tree_string, Newick.NewickTree)
    tree.assert_valid()
    # the example alignment
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # the simulation must run through without raising
    simulated_alignment = simulate_ancestral_alignment(
        tree, alignment, rate_matrix_object)
def get_response_content(fs):
    """
    Describe whether the submitted text is a valid (nucleotide) alignment.

    @param fs: object exposing the field fasta as an attribute
    @return: a multi-line report ending with one '<header>: <length>'
        line per fasta record
    """
    out = StringIO()
    # first check: is it an alignment at all?
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        out.write('This is a valid alignment.\n')
    except Fasta.AlignmentError as e:
        alignment = None
        out.write('This is not a valid alignment: %s\n' % (e,))
    # second check: does it survive being forced to nucleotides?
    if alignment:
        try:
            columns_before = len(alignment.columns)
            alignment.force_nucleotide()
            removed_column_count = columns_before - len(alignment.columns)
            if removed_column_count:
                out.write(
                    'After removing %d' % removed_column_count
                    + ' columns this is a valid nucleotide alignment.\n')
            else:
                out.write('This is a valid nucleotide alignment.\n')
        except Fasta.AlignmentError as e:
            out.write(
                'This is not a valid nucleotide alignment: %s\n' % (e,))
    # finally list the length of every record
    for header, seq in Fasta.gen_header_sequence_pairs(StringIO(fs.fasta)):
        out.write('%s: %d' % (header, len(seq)) + '\n')
    return out.getvalue()
def test_gc_content(self):
    """The gc value of each record matches the expected one to 4 places."""
    # expected GC fraction for every sequence in the fixture
    expected_gc = {
        'GGGACAGGGGGC': 0.833333333333,
        'GGGACTGGGGGGGC': 0.857142857143,
        'ATGGCATAT': 0.333333333333,
        'ATGGCATATCA': 0.363636363636,
        'ATCGGAGGGATACGAG': 0.5625,
    }
    fixture_path = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    for record in Fasta.Fasta(fixture_path):
        self.assertAlmostEqual(
            record.gc, expected_gc[record.sequence], places=4)
def get_response_content(fs):
    """
    Analyze an alignment under a serialized Direct RNA mixture model.

    @param fs: object exposing the fields tree, alignment and model
        as attributes
    @return: the string produced by do_analysis followed by a newline
    @raise HandlingError: if the alignment is rejected by the Fasta module
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the alignment and restrict it to nucleotide columns
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # deserialize the mixture model and normalize it
    mixture_model = DirectRna.deserialize_mixture_model(fs.model)
    mixture_model.normalize()
    # report
    report = do_analysis(mixture_model, alignment, tree)
    return report + '\n'
def test_likelihood(self):
    """The Jukes-Cantor log likelihood of the Brown example is reproduced."""
    # the example tree
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    tree.assert_valid()
    # the example alignment
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # compare against the reference value
    observed = get_log_likelihood(tree, alignment, rate_matrix_object)
    self.assertAlmostEqual(observed, -4146.26547208)
def get_response_content(fs):
    """
    Return the JC69 maximum likelihood distance matrix of an alignment.

    @param fs: object exposing the fields fasta and infinity as attributes
    @return: the matrix as formatted by MatrixUtil.m_to_string plus a
        newline; infinite distances are shown as fs.infinity
    @raise HandlingError: if the alignment is invalid or holds fewer
        than two sequences
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # Build the distance matrix row by row,
    # substituting the requested representation for infinite entries.
    infinite = float('inf')
    rows = []
    for raw_row in JC69.get_ML_distance_matrix(alignment.sequences):
        rows.append([
            fs.infinity if entry == infinite else entry
            for entry in raw_row])
    # format the response
    return MatrixUtil.m_to_string(rows) + '\n'
def protsplit(protstr, prefix='', minlen=0, offset=0, dir='fwd'):
    """
    Split a translated sequence into runs of standard amino acids.

    Every maximal run of the 20 letters 'ACDEFGHIKLMNPQRSTVWY' that is at
    least minlen long becomes one Fasta object named
    '<prefix>_<nucleotide start>'.  The nucleotide start is
    run_start * 3 + offset, or, when dir is 'rvs',
    len(protstr) * 3 - run_start * 3 - offset.

    @param protstr: the protein string to split
    @param prefix: leading part of each fragment name
    @param minlen: shorter runs are dropped
    @param offset: nucleotide offset added (or subtracted for 'rvs')
    @param dir: 'rvs' selects the reverse coordinate formula
    @return: list of Fasta objects in order of appearance
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    run_pattern = re.compile('[%s]+' % amino_acids)
    nucleotide_len = len(protstr) * 3
    reverse = (dir == 'rvs')
    fragments = []
    for run in run_pattern.finditer(protstr):
        first, last = run.span()
        if last - first < minlen:
            continue
        if reverse:
            position = nucleotide_len - first * 3 - offset
        else:
            position = first * 3 + offset
        fragments.append(
            Fasta(run.group(), '%s_%i' % (prefix, position)))
    return fragments
def parse(gff_file, base=None):
    """
    Read the calls of a gff file, attaching inserted sequences.

    The companion file is named by removing every '.gff' from gff_file
    and appending '.ins'; when it exists it is parsed with
    Fasta.parse(..., todict=True).  For each non-comment line a Call is
    built; its inserted attribute is taken from the companion file when
    the call id is present there, otherwise from the 'Iseq' attribute of
    the call (which is then removed), if any.

    @param gff_file: path of the gff file
    @param base: forwarded to Call
    @return: list of Call objects in file order
    @raise Exception: whatever the parsing of a line raises, after the
        offending line has been written to stderr
    """
    ins_file = gff_file.replace(".gff", "") + ".ins"
    if os.path.exists(ins_file):
        insertions = Fasta.parse(ins_file, todict=True)
    else:
        insertions = None
    calls = []
    # 'with' makes sure the handle is closed, also when a line is bad
    with open(gff_file, "r") as handle:
        for entry in handle:
            if entry.startswith("#"):
                continue
            try:
                call = Call(entry, base=base)
                if insertions is not None and call.id in insertions:
                    call.inserted = str(insertions[call.id].seq)
                elif "Iseq" in call.attributes:
                    call.inserted = call.attributes["Iseq"]
                    del call.attributes["Iseq"]
            except Exception:
                # report which line failed, then let the error propagate
                sys.stderr.write("Unable to parse line: %s\n" % entry)
                raise
            calls.append(call)
    return calls
def get_response_content(fs):
    """
    Compute the matrix of pairwise maximum likelihood distances.

    @param fs: object exposing the fields fasta, matrix and states
        as attributes
    @return: the response text: a title line followed by the matrix
        as formatted by MatrixUtil.m_to_string
    @raise HandlingError: if the alignment is invalid, holds fewer than
        two sequences, or the states do not match the matrix
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # read the rate matrix and the ordered states
    R = fs.matrix
    ordered_states = Util.get_stripped_lines(StringIO(fs.states))
    if len(ordered_states) != len(R):
        raise HandlingError(
            'the number of ordered states must be the same '
            'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # fill the symmetric matrix from its upper triangle
    n = alignment.get_sequence_count()
    distances = [[0] * n for _ in range(n)]
    sequences = list(alignment.sequences)
    for i, sequence_a in enumerate(sequences):
        for j in range(i + 1, len(sequences)):
            # the objective function of this sequence pair
            objective = Objective(
                (sequence_a, sequences[j]), rate_matrix_object)
            # Golden section search; the bracket is only a suggestion.
            mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
            distances[i][j] = mle_distance
            distances[j][i] = mle_distance
    # format the response
    return 'maximum likelihood distance matrix:\n%s\n' % (
        MatrixUtil.m_to_string(distances),)
def get_response_content(fs):
    """
    Estimate the F84 parameters of a sequence pair by numeric optimization.

    @param fs: object exposing the field fasta as an attribute
    @return: a report with the ML distance, kappa, nucleotide
        frequencies and the log likelihood, one per line
    @raise HandlingError: if the input is not a gapless unambiguous
        nucleotide alignment of exactly two sequences
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # it must consist of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # forcing to nucleotides must not remove any column
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if columns_before != alignment.get_column_count():
        raise HandlingError(
            'expected a gapless unambiguous nucleotide alignment')
    # minimize the objective numerically, starting from its own guess
    sequence_pair = alignment.sequences
    objective = F84.Objective(sequence_pair)
    start = list(objective.get_initial_parameters())
    distance, kappa, wC, wG, wT = scipy.optimize.fmin(
        objective, start, ftol=1e-10, disp=0)
    # recover the nucleotide distribution and the likelihood
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
        distance, sequence_pair, model)
    # format the response, one labelled value per line
    report = (
        ('ML distance:', distance),
        ('ML kappa:', kappa),
        ('ML A frequency:', A),
        ('ML C frequency:', C),
        ('ML G frequency:', G),
        ('ML T frequency:', T),
        ('log likelihood:', log_likelihood),
    )
    return ''.join('%s %s\n' % entry for entry in report)
def get_response_content(fs):
    """
    Compute the Jukes-Cantor log likelihood of an alignment on a tree.

    @param fs: object exposing the fields tree and fasta as attributes
    @return: the log likelihood as text followed by a newline
    @raise HandlingError: if the alignment is rejected by the Fasta module
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the alignment and restrict it to nucleotide columns
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # evaluate and format the log likelihood
    log_likelihood = PhyLikelihood.get_log_likelihood(
        tree, alignment, rate_matrix_object)
    return '%s\n' % str(log_likelihood)
def simulate_alignment(tree, substitution_model, ncolumns, seed=None):
    """
    @param tree: a newick tree with branch lengths
    @param substitution_model: a way to simulate states on a tree
    @param ncolumns: the number of columns to simulate
    @param seed: a random number seed; when given, the global state of
        the random module is saved beforehand and restored afterwards
    @return: a Fasta Alignment object of the simulated sequences
    @raise SimulationError: if a branch length is missing or not
        positive, or a leaf is unnamed or shares its name with another
    """
    # Check the input.
    for node in tree.gen_non_root_nodes():
        branch_length = node.get_branch_length()
        if branch_length is None or branch_length <= 0:
            raise SimulationError('all branch lengths should be positive')
    tip_names = [node.name for node in tree.gen_tips()]
    for name in tip_names:
        if not name:
            raise SimulationError('each leaf should have a name')
    if len(tip_names) != len(set(tip_names)):
        raise SimulationError('each leaf should have a unique name')
    # Save the rng state and seed the rng if we are using a seed.
    use_seed = seed is not None
    if use_seed:
        old_rng_state = random.getstate()
        random.seed(seed)
    try:
        # Simulate the states on the tree.
        simulated_sequences = dict((name, []) for name in tip_names)
        for column_index in range(ncolumns):
            substitution_model.simulate_states(tree)
            for node in tree.gen_tips():
                simulated_sequences[node.name].append(node.state)
    finally:
        # Restore the rng state even when the simulation raises, so that
        # a failed call does not leave the global rng seeded.
        if use_seed:
            random.setstate(old_rng_state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    for header, sequence in simulated_sequences.items():
        sio.write('>' + header + '\n')
        sio.write(''.join(sequence) + '\n')
    return Fasta.Alignment(StringIO(sio.getvalue()))
def get_response_content(fs):
    """
    Estimate the F84 parameters of a sequence pair in closed form.

    @param fs: object exposing the field fasta as an attribute
    @return: a report with the distance, kappa, nucleotide frequencies
        and the log likelihood, one per line
    @raise HandlingError: if the input is not a gapless unambiguous
        nucleotide alignment of exactly two sequences
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # it must consist of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # forcing to nucleotides must not remove any column
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if columns_before != alignment.get_column_count():
        raise HandlingError(
            'expected a gapless unambiguous nucleotide alignment')
    # closed form maximum likelihood estimates
    sequence_pair = alignment.sequences
    distance, kappa, A, C, G, T = F84.get_closed_form_estimates(
        sequence_pair)
    # log likelihood at the estimates
    rate_matrix_object = F84.create_rate_matrix(kappa, (A, C, G, T))
    log_likelihood = PairLikelihood.get_log_likelihood(
        distance, sequence_pair, rate_matrix_object)
    # format the response, one labelled value per line
    report = (
        ('distance:', distance),
        ('kappa:', kappa),
        ('A frequency:', A),
        ('C frequency:', C),
        ('G frequency:', G),
        ('T frequency:', T),
        ('log likelihood:', log_likelihood),
    )
    return ''.join('%s %s\n' % entry for entry in report)
def parse(gff_file, base=None):
    """
    Read the calls of a gff file, attaching inserted sequences.

    The companion file is named by removing every '.gff' from gff_file
    and appending '.ins'; when it exists it is parsed with
    Fasta.parse(..., todict=True), otherwise a warning is logged.  For
    each non-comment line a Call is built; its inserted attribute is
    taken from the companion file when the call id is present there,
    otherwise from the 'Iseq' attribute of the call (which is then
    removed), if any.

    @param gff_file: path of the gff file
    @param base: forwarded to Call
    @return: list of Call objects in file order
    @raise Exception: whatever the parsing of a line raises, after the
        offending line has been logged
    """
    logger = logging.getLogger(parse.__name__)
    ins_file = gff_file.replace(".gff", "") + ".ins"
    if os.path.exists(ins_file):
        insertions = Fasta.parse(ins_file, todict=True)
    else:
        insertions = None
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning("Insertion sequence file %s missing" % ins_file)
    calls = []
    # 'with' makes sure the handle is closed, also when a line is bad
    with open(gff_file, "r") as handle:
        for entry in handle:
            if entry.startswith("#"):
                continue
            try:
                call = Call(entry, base=base)
                if insertions is not None and call.id in insertions:
                    call.inserted = str(insertions[call.id].seq)
                elif "Iseq" in call.attributes:
                    call.inserted = call.attributes["Iseq"]
                    del call.attributes["Iseq"]
            except Exception:
                # report which line failed, then let the error propagate
                logger.error("Unable to parse line: %s" % entry)
                raise
            calls.append(call)
    return calls
def convert(inputVCFFile='', outputVariantFile='', parameters=None):
    """
    Extract insertions and deletions from VCF files into a variant file.

    Every record of every input file is compared with the reference; a
    mismatch is reported on stderr but does not stop the conversion.
    For records whose QUAL is at least parameters['minQual'], each ALT
    allele other than "<DEL>" whose length differs from REF is turned
    into a Variant4; those of type "ins" or "del" are written as
    '<chrom> <pos + offset - 1> <variant string>'.

    @param inputVCFFile: comma separated list of VCF file names
    @param outputVariantFile: path of the file to write
    @param parameters: mapping with the keys 'refFile' (reference fasta)
        and 'minQual' (quality threshold); None is treated as empty
    @raise KeyError: if a required key is missing from parameters
    """
    if parameters is None:
        parameters = {}
    # 'with' closes the output even when a record cannot be processed
    with open(outputVariantFile, 'w') as fo:
        fa = Fasta.Fasta(fname=parameters['refFile'])
        for vcffile in inputVCFFile.split(','):
            vcf = VCFFile.VCFFile(fname=vcffile, mode='r')
            # close every reader, not just the last one opened
            try:
                while True:
                    dat = vcf.readline()
                    # an empty dict marks the end of the file
                    if dat == {}:
                        break
                    pos = int(dat['POS'])
                    chrom = dat['CHROM']
                    ref = dat['REF']
                    rseq = ''.join(fa.get(chrom, pos, len(ref)))
                    if rseq != ref:
                        sys.stderr.write("REFSEQ inconsistency\n")
                    if float(dat['QUAL']) < parameters['minQual']:
                        continue
                    for alt in dat['ALT'].split(','):
                        # equal lengths cannot be an insertion or deletion
                        if alt == "<DEL>" or len(alt) == len(ref):
                            continue
                        var = Variant.Variant4(ref=ref, alt=alt)
                        if var.type == "ins" or var.type == "del":
                            fo.write("%s %d %s\n" % (
                                chrom, pos + var.offset - 1, var.str))
            finally:
                vcf.close()
import re

# Choose the input stream: stdin when run as a script, otherwise a fixed
# sample file inside the homework directory.
if __name__ != '__main__':
    workDir = '/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3'
    os.chdir(workDir)
    sys.path.append(os.path.abspath(os.getcwd()))
    inputFilePath = 'data/4_subsequence_1.txt'
    inFile = open(inputFilePath, 'r')
else:
    #print(sys.argv)
    inFile = sys.stdin

import Fasta

# read inputs: the first record is searched, the second is the pattern
dnas = Fasta.parse_fasta(inFile)
inFile.close()
s = dnas[0]
t = dnas[1]

# For each character of t, advance through s to its next occurrence and
# record that index. The scan never moves backwards, so the recorded
# indexes are strictly increasing. Running off the end of s raises
# IndexError.
subseqIndexes = []
si = -1
for ti, tchar in enumerate(t):
    si += 1
    while s[si] != tchar:
        si += 1
    subseqIndexes.append(si)
def get_response_content(fs):
    """
    Simulate a data set on a sampled coalescent tree and write BEAST xml.

    The tree is sampled with kingman.sample, per-branch rates are sampled
    with sample_b_to_rate, and an alignment is sampled on the tree whose
    branch lengths are time multiplied by rate. The xml embeds, as xml
    comments, the rate autocorrelation, the root height, and the log
    likelihood of the alignment both with the sampled rates and with all
    rates equal to one.

    @param fs: field storage providing C{nleaves} and C{nsites}
    @return: the xml document as a string
    """
    # init the response and get the user variables
    out = StringIO()
    nleaves = fs.nleaves
    nvertices = nleaves * 2 - 1
    # nbranches is only referenced by the disabled optimizer code below
    nbranches = nvertices - 1
    nsites = fs.nsites

    def emit_comment(*lines):
        # write the given lines wrapped in an xml comment
        print >> out, '<!--'
        for line in lines:
            print >> out, line
        print >> out, '-->'

    # sample the coalescent tree with timelike branch lengths
    R, B = kingman.sample(fs.nleaves)
    r = Ftree.R_to_root(R)
    # leaf vertices are named by single uppercase letters
    leaf_letters = string.uppercase[:nleaves]
    N = dict(enumerate(leaf_letters))
    N_leaves = dict(N)
    # an internal vertex is named by the sorted names of the leaves under it
    v_to_leaves = R_to_v_to_leaves(R)
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) <= 1:
            continue
        N[v] = ''.join(sorted(N[leaf] for leaf in leaves))
    # get vertex ages
    v_to_age = kingman.RB_to_v_to_age(R, B)
    # sample the rates on the branches
    b_to_rate = sample_b_to_rate(R)
    xycorr = get_correlation(R, b_to_rate)
    # define B_subs in terms of substitutions instead of time
    B_subs = dict(
            (branch, duration * b_to_rate[branch])
            for branch, duration in B.items())
    # sample the alignment
    v_to_seq = sample_v_to_seq(R, B_subs, nsites)
    # get the log likelihood; this is kind of horrible
    pairs = [(N[v], ''.join(v_to_seq[v])) for v in range(nleaves)]
    headers, sequences = zip(*pairs)
    alignment = Fasta.create_alignment(headers, sequences)
    newick_string = FtreeIO.RBN_to_newick(R, B_subs, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    ll = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are all 1.0, i.e. on the tree with time branch lengths
    newick_string = FtreeIO.RBN_to_newick(R, B, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    ll_unity = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are numerically optimized
    # TODO incorporate the result into the xml file
    # TODO speed up the likelihood evaluation (beagle? C module?)
    #f = Opt(R, B, N_leaves, alignment)
    #X_logs = [0.0] * nbranches
    #result = scipy.optimize.fmin(f, X_logs, full_output=True)
    #print result
    #
    # xml header and the summary of the simulated truth
    print >> out, '<?xml version="1.0"?>'
    print >> out, '<beast>'
    print >> out
    print >> out, '<!-- actual rate autocorrelation', xycorr, '-->'
    print >> out, '<!-- actual root height', v_to_age[r], '-->'
    print >> out, '<!-- actual log likelihood', ll, '-->'
    print >> out, '<!-- ll if rates were unity', ll_unity, '-->'
    print >> out
    # taxa
    emit_comment(
            'predefine the taxa as in',
            'http://beast.bio.ed.ac.uk/Introduction_to_XML_format')
    print >> out, get_leaf_taxon_defn(list(leaf_letters))
    print >> out
    # alignment
    emit_comment(
            'define the alignment as in',
            'http://beast.bio.ed.ac.uk/Introduction_to_XML_format')
    # NOTE(review): 'leaves' is whatever the last iteration of the naming
    # loop above left bound; confirm that this is the intended leaf set.
    print >> out, get_alignment_defn(leaves, N, v_to_seq)
    print >> out
    # starting tree
    emit_comment(
            'specify the starting tree as in',
            'http://beast.bio.ed.ac.uk/Tutorial_4')
    print >> out, get_starting_tree_defn(R, B, N_leaves)
    print >> out
    # tree model
    emit_comment(
            'connect the tree model as in',
            'http://beast.bio.ed.ac.uk/Tutorial_4')
    print >> out, g_tree_model_defn
    print >> out
    print >> out, g_uncorrelated_relaxed_clock_info
    print >> out
    # The mrca constraint sections below are disabled.
    #print >> out, '<!--'
    #print >> out, 'create a list of taxa for which to constrain the mrca as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_subset_defn(N, v, leaves)
    #print >> out
    #print >> out, '<!--'
    #print >> out, 'create a tmrcaStatistic that will record the height as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_stat_defn(N[v])
    print >> out
    print >> out, g_likelihood_info
    print >> out
    # mcmc
    emit_comment(
            'run the mcmc',
            'http://beast.bio.ed.ac.uk/Tutorial_3.1')
    print >> out, get_mcmc_defn(v_to_leaves, v_to_age, N)
    print >> out
    print >> out, '</beast>'
    # return the response
    return out.getvalue()
#!/usr/bin/env python
"""
fastaSplit.py <fasta file> <output dir>
"""
import sys
import Fasta

# Both positional arguments are read below, so show the usage text unless
# at least two were supplied (previously a single argument fell through to
# an IndexError on sys.argv[2]).
if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
    sys.exit(__doc__)

# Split the fasta file named by the first argument into the directory
# named by the second.
Fasta.split(sys.argv[1], oDir=sys.argv[2])
codonMapFile = sys.argv[1] else: workDir = "/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3" os.chdir(workDir) sys.path.append(os.path.abspath(os.getcwd())) inputFilePath = "data/rosalind_splc.txt" inFile = open(inputFilePath, "r") codonMapFile = "inputs/rna-codon.txt" import Dna import Rna import Fasta # read inputs lines = Fasta.parse_fasta(inFile) inFile.close() dna = lines[0] introns = lines[1:] # splice dna re_pattern = "|".join(introns) splicedDna = re.sub(re_pattern, "", dna, 0) sdna = Dna.Dna(splicedDna) # transcribe and translate rna = Rna.Rna(sdna.transcribe(), codonMapFile=codonMapFile) protein = rna.translate() print(protein)
#!/usr/bin/env python
"""
reverse_comp.py <filename>

Prints the reverse complement of a DNA string (in Fasta format).
"""
import sys
import Fasta
import Sequence

# Exactly one argument is expected; anything else, or a help flag, prints
# the usage text and exits.
if "-h" in sys.argv or "--help" in sys.argv or len(sys.argv) != 2:
    sys.exit(__doc__)

iFilename = sys.argv[1]
header, seq = Fasta.load(iFilename)
# The sequence is upper-cased before it is reverse complemented.
seq = Sequence.reverse_complement(seq.upper())

# Emit the record with the sequence wrapped at 80 characters per line.
print(">%s" % header)
for i in xrange(0, len(seq), 80):
    chunk = seq[i:i + 80]
    print(chunk)
#!/usr/bin/env python
"""
fastaLength.py <input filename>
"""
import sys
import Fasta

# Without an argument, or with a help flag, print the usage text and exit.
if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) == 1:
    sys.exit(__doc__)

# Print each record's header followed by the length of its sequence.
faFile = Fasta.load_mfa_iter(sys.argv[1])
for h, s in faFile:
    print("%s %s" % (h, len(s)))
#!/usr/bin/env python
"""
translate.py <filename>

Translates a DNA sequence to a protein sequence
"""
import sys
import Fasta
import Sequence

# Exactly one argument is expected; anything else, or a help flag, prints
# the usage text and exits.
if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) != 2:
    sys.exit(__doc__)

# number of residues written per output line
w = 60

iFilename = sys.argv[1]
faFile = Fasta.load_mfa_iter(iFilename)
for header, seq in faFile:
    protein = Sequence.translate(seq)
    # Write the translated record wrapped at w characters per line.
    print('>%s' % header)
    for i in xrange(0, len(protein), w):
        chunk = protein[i:i + w]
        print(chunk)
import re if __name__ == '__main__': #print(sys.argv) inFile = sys.stdin else: workDir = '/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3' os.chdir(workDir) sys.path.append(os.path.abspath(os.getcwd())) inputFilePath = 'data/6_edit_alignment_1.txt' inFile = open(inputFilePath, 'r') import Fasta # read inputs proteins = Fasta.parse_fasta(inFile) inFile.close() protein1 = proteins[0] protein2 = proteins[1] sigma = 1 ## create scores and backtrack arrays len1 = len(protein1) len2 = len(protein2) scores = [[0]*(len2+1) for x in range(len1+1)] backtracks = [[-1]*(len2+1) for x in range(len1+1)] ## put first row and first column of scores and backtrack arrays for i in range(1,len1+1):
#!/usr/bin/env python
import re
import Fasta

# Named regions of the scaffold as (start, end) coordinate pairs.
classes = {
    'Extended B': (14850000, 15460000),
    'Class I-II': (15460000, 17100000),
    'Class III': (17100000, 17950000),
    'Framework': (17950000, 19265000),
    'Extended A': (19265000, 19400000),
}

header, seq = Fasta.load('scaffold_42.fa')

# Save each region to its own fasta file. The file name is the region name
# with whitespace replaced by underscores, and the header is extended with
# the coordinates.
for label in classes:
    start, end = classes[label]
    out_name = '_'.join(label.split()) + '.fa'
    region_header = '%s.%s-%s' % (header, start, end)
    # presumably the coordinates are 1-based and inclusive, hence start-1
    Fasta.save(out_name, region_header, seq[start - 1:end])
def get_header_seq_pairs():
    """
    Read the module-level fasta string.

    @return: a list of the items produced by
    Fasta.gen_header_sequence_pairs for the lines of g_fasta_string
    """
    fasta_lines = g_fasta_string.splitlines()
    pair_source = Fasta.gen_header_sequence_pairs(fasta_lines)
    return [pair for pair in pair_source]