Exemplo n.º 1
0
	def __init__(self,alnFile1,alnFile2):
		"""Read two alignment files and keep their contents and paths.

		alnFile1, alnFile2 -- paths passed to read_fasta; each call is
		unpacked as a (names, sequences) pair.
		"""
		names_a, seqs_a = read_fasta(alnFile1)
		names_b, seqs_b = read_fasta(alnFile2)
		# The source paths are kept alongside the parsed data.
		self.alnFile1, self.alnFile2 = alnFile1, alnFile2
		self.name1, self.name2 = names_a, names_b
		self.aln1, self.aln2 = seqs_a, seqs_b
Exemplo n.º 2
0
def write_alignment(sequence_file,fout):
    """Write each sequence of a FASTA file to fout as a <sequence> element.

    sequence_file -- path passed to read_fasta, which is unpacked as a
                     (taxon names, sequences) pair.
    fout          -- writable object; only its write() method is used.

    Entries whose taxon name is exactly "out" are skipped.
    """
    labels, rows = read_fasta(sequence_file)
    for label, residues in zip(labels, rows):
        if label == "out":
            continue
        opening = "\t\t<sequence>\n"
        taxon_ref = "\t\t\t<taxon idref=\"" + label + "\"/>\n"
        payload = "\t\t\t" + residues + "\n"
        closing = "\t\t</sequence>\n"
        fout.write(opening)
        fout.write(taxon_ref)
        fout.write(payload)
        fout.write(closing)
Exemplo n.º 3
0
	def sub_merge(self,n1,n2):
		"""Sample both alignments, merge the samples, and return all three.

		n1, n2 -- values passed (as strings) to utils/sampling.py for
		          self.alnFile1 and self.alnFile2 respectively.

		Returns (sub_aln1, sub_aln2, sub_merged): the sequences read back from
		the two sampled files and from the standardized merged file.

		Raises subprocess.CalledProcessError if any external command fails.
		The scratch files are created in the current working directory and are
		removed whether or not the pipeline succeeds.
		"""
		import os  # local import so the method does not depend on a file-level one

		scratch_files = ("temp1.fas", "temp2.fas", "temp.fas", "merged.fas")
		try:
			# randomly sample sequences from aln1 and aln2
			subprocess.check_call(["python","utils/sampling.py",self.alnFile1,"temp1.fas",str(n1)])
			subprocess.check_call(["python","utils/sampling.py",self.alnFile2,"temp2.fas",str(n2)])
			# and call opal (or perhaps another merger) to merge them
			subprocess.check_call(["java","-Xmx256M","-jar","opal_2.1.3/Opal.jar","--in","temp1.fas","--in2","temp2.fas","--out","temp.fas"])
			subprocess.check_call(["utils/stdFAS.py","temp.fas","merged.fas"])

			# Only the sequences are returned; the names are discarded.
			_, sub_aln1 = read_fasta("temp1.fas")
			_, sub_aln2 = read_fasta("temp2.fas")
			_, sub_merged = read_fasta("merged.fas")
		finally:
			# Clean up even on failure; a step that failed early may not have
			# produced every file, so only remove the ones that exist.
			for path in scratch_files:
				if os.path.exists(path):
					os.remove(path)

		return sub_aln1, sub_aln2, sub_merged
Exemplo n.º 4
0
def main():
    """Combine the alignments named on the command line into one FASTA file.

    Every argument except the last is an input file; the last argument is the
    output path. Each input is read with read_fasta and folded into a shared
    dict via add_one_aln, which also returns the updated running value that
    is passed on to print_concatenated.
    """
    input_paths = argv[1:-1]
    output_path = argv[-1]

    merged = {}
    running_length = 0
    for path in input_paths:
        print(path)
        incoming_names, incoming_seqs = read_fasta(path)
        running_length = add_one_aln(
            merged, running_length, incoming_names, incoming_seqs)

    out_names, out_seqs = print_concatenated(merged, running_length)
    write_fasta(output_path, out_names, out_seqs)
Exemplo n.º 5
0
#! /usr/bin/env python

from sys import argv
from sequence_lib import read_fasta

# Print the relative frequency of every non-gap character in a FASTA file.
input_file = argv[1]

names, sequences = read_fasta(input_file)

# Tally each character, ignoring the gap symbol '-'.
freq = {}
for s in sequences:
    for c in s:
        if c == '-':
            continue
        freq[c] = freq.get(c, 0) + 1

# The number of counted characters is the sum of the per-character tallies.
total = sum(freq.values())

# One "<char> <fraction>" line per character, in sorted character order.
for c, count in sorted(freq.items()):
    fraction = float(count) / total
    print(c + " " + str(fraction))
Exemplo n.º 6
0
#! /usr/bin/env python

from sequence_lib import read_fasta, write_fasta
from sys import argv

# Command-line arguments: the input sequence file and two output paths.
seqfile = argv[1]
reducedFile = argv[2] # output
identicalSeqsFile = argv[3] # output

names,sequences = read_fasta(seqfile)

# Pair every sequence with its original index and sort the pairs, so that
# identical sequences end up next to each other.
sorted_seqs = sorted((s,i) for i,s in enumerate(sequences))

# Seed the reduced lists with the first entry in sorted order.
# NOTE(review): indexing [0] raises IndexError when the input has no sequences.
reduced_names = [names[sorted_seqs[0][1]]]
reduced_seqs = [sorted_seqs[0][0]]

# NOTE(review): prev_seq is not read anywhere in the visible part of the script.
prev_seq = sorted_seqs[0]
L = len(sorted_seqs)
i=1

# NOTE(review): presumably marks that a run of duplicates is already being
# written; the code that updates it is not visible here -- confirm.
found_identical = False
# True until the first name is written; used to put a newline between groups
# without emitting a leading blank line.
first_write = True
with open(identicalSeqsFile,"w") as f:
    while i<L:
        # Equal neighbours in sorted order are identical sequences.
        if sorted_seqs[i][0] == sorted_seqs[i-1][0]:
            if not found_identical:
                if not first_write:
                    f.write("\n")
                else:
                    first_write = False
                # Start the group with the name of the earlier of the two entries.
                f.write(names[sorted_seqs[i-1][1]] + " ")
                # NOTE(review): the excerpt ends here; the rest of the loop body
                # (including the increment of i) is not visible in this view.
Exemplo n.º 7
0
#! /usr/bin/env python

from sequence_lib import read_fasta, p_distance
from sys import argv

# Print the mean p_distance over all unordered pairs of sequences in a file.
seq_file = argv[1]

names, aln = read_fasta(seq_file)

L = len(aln)
if L < 2:
    # With fewer than two sequences there are no pairs, and the average below
    # would divide by zero; stop with a readable message instead.
    raise SystemExit(
        "need at least two sequences to compute a mean pairwise distance, "
        "got " + str(L))

# Sum the distance over every unordered pair (i < j).
d = 0
for i in range(L - 1):
    for j in range(i + 1, L):
        d += p_distance(aln[i], aln[j])

# There are L * (L - 1) / 2 pairs, hence the factor of two.
print(2 * d / (L * (L - 1)))
Exemplo n.º 8
0
#! /usr/bin/env python

from sequence_lib import read_fasta, write_fasta
from sys import argv

# Copy a FASTA file while dropping every third character of each sequence.
infile, outfile = argv[1], argv[2]

taxa, seqs = read_fasta(infile)

# Keep offsets 0 and 1 of each consecutive triplet; offset 2 is removed.
new_seqs = [
    "".join(ch for pos, ch in enumerate(seq) if pos % 3 != 2)
    for seq in seqs
]

write_fasta(outfile, taxa, new_seqs)
Exemplo n.º 9
0
#! /usr/bin/env python
# Usage: python mask_aln.py <file_in> <path_out> <mask_levels>

from sys import argv
import os.path
from sequence_lib import count_gaps, read_fasta, write_fasta

# Write one masked copy of the input alignment per mask level given on the
# command line, keeping only columns whose gap count is within the limit.
file_in = argv[1]
path_in, file_name = os.path.split(file_in)
# '-' as the output path means "write next to the input file".
path_out = path_in if (argv[2] == '-') else argv[2]
base_name, ext = os.path.splitext(file_name)

taxon_names, seq_aln = read_fasta(file_in)
# NOTE(review): gap_count is indexed by column below, so count_gaps presumably
# returns one gap count per alignment column -- confirm.
gap_count = count_gaps(seq_aln)
N = len(gap_count)
taxon_count = len(taxon_names)

for msk_lev in argv[3:]:
    # A column is kept when its gap count is at most (1 - level) * #taxa.
    gap_limit = taxon_count * (1 - float(msk_lev))
    chosen_cols = [col for col in range(N) if gap_count[col] <= gap_limit]
    # Build each masked row in one join instead of repeated concatenation.
    msk_aln = [
        "".join(seq_aln[row][col] for col in chosen_cols)
        for row in range(taxon_count)
    ]
    # os.path.join handles an empty directory part: the old path_out + "/"
    # produced a path in the filesystem root when the input had no directory
    # and '-' was passed as the output path.
    output_file = os.path.join(
        path_out, base_name + "_msk" + str(msk_lev) + ext)
    write_fasta(output_file, taxon_names, msk_aln)
Exemplo n.º 10
0
#! /usr/bin/env python

from sys import argv
from sequence_lib import read_fasta, write_fasta
from random import sample

# Write a copy of an alignment restricted to a random subset of its columns.
inputfile = argv[1]
outputfile = argv[2]
nsites = int(argv[3])

seq_names, seq_aln = read_fasta(inputfile)

# Draw nsites distinct column indices, using the first sequence's length as
# the number of columns, then put them in ascending order.
alignment_length = len(seq_aln[0])
sites = sample(range(alignment_length), nsites)
sites.sort()

# Project every sequence onto the chosen columns.
new_aln = ["".join(row[col] for col in sites) for row in seq_aln]

write_fasta(outputfile, seq_names, new_aln)
Exemplo n.º 11
0
                           mapping,
                           idx,
                           remove_gaps=True):
    for i, taxon in enumerate(taxon_names):
        seq_len = len([x for x in sequences[i]
                       if x != '-']) if remove_gaps else len(sequences[i])
        if taxon not in mapping:
            mapping[taxon] = [(idx, seq_len)]
        else:
            mapping[taxon].append((idx, seq_len))


mapping = {}

for idx, filename in enumerate(argv[1:]):
    taxon_names, sequences = read_fasta(filename)
    report_sequence_length(taxon_names, sequences, mapping, idx)

max_idx = len(argv) - 1

for taxon in mapping:
    string = taxon
    arr = mapping[taxon]
    i = 0
    for idx, seq_len in arr:
        while i < idx:
            string += " 0"
            i += 1
        string += (" " + str(seq_len))
        i += 1
    while i < max_idx: