Пример #1
0
def addIllumina(readsetfh, orgname, rundata, nembasedir, outlog):
	"""Write the assembled Illumina contigs of every experiment to the readset.

	For each experiment in rundata.expsIllumina three assembly files are
	expected below <nembasedir>/illumina: the "single" and "paired" Trinity
	assemblies and the Oases transcripts. An experiment missing any of the
	three is reported on stdout and in outlog and is skipped. When both
	Trinity files are empty the Oases transcripts are used; otherwise the
	single Trinity file is written, followed by the paired one. Every record
	is renamed to Contig_Illumina_<n> (n keeps counting across experiments)
	and handed to writeSeq together with readsetfh.
	"""
	illuminadir = os.path.join(nembasedir, "illumina")
	counter = 1
	for exp in rundata.expsIllumina:
		trinitydir = os.path.join(illuminadir, "_trinityASM_%s" % exp, "assemblydir")
		single = os.path.join(trinitydir, "single", "Trinity.fasta")
		paired = os.path.join(trinitydir, "paired", "Trinity.fasta")
		oases = os.path.join(illuminadir, "_oasesASM_%s" % exp, "assemblydir",
			"transcripts.fa")
		if not all(os.path.exists(path) for path in (single, paired, oases)):
			message = "Warning: Experiment %s not found for %s" % (exp, orgname)
			print(message)
			outlog.write(message + "\n")
			continue
		# Fall back to Oases only when Trinity produced nothing at all.
		if os.stat(single).st_size == 0 and os.stat(paired).st_size == 0:
			print("Trinity assembly empty, using Oases")
			sources = (oases,)
		else:
			sources = (single, paired)
		for source in sources:
			for rec in fasta_itr(source):
				rec.header = "Contig_Illumina_%s" % counter
				writeSeq(rec, readsetfh)
				counter += 1
Пример #2
0
def getFromFastaByName(cf):
	"""Copy the FASTA records whose header matches one of a list of names.

	The names come from getNames(cf, <'regexp' parameter>); if getNames does
	not report constants.OK, its status is returned unchanged and nothing is
	written. A name that has a findall method (e.g. a compiled regular
	expression) matches when findall yields at least one hit on the header.
	Any other name matches when it equals the header or, with the 'partial'
	parameter set, when it occurs anywhere in the header. With the 'negate'
	parameter set, the records matching no name are written instead.

	Returns constants.OK once every record of the 'fastafile' input has been
	processed.
	"""
	names, status = getNames(cf, cf.get_parameter('regexp', 'boolean'))
	if not status == constants.OK:
		return status
	fastafile = cf.get_input('fastafile')
	# The with block closes the output file even if reading the input fails.
	with open(cf.get_output('outputfile'), 'w') as outputfile:
		partial = cf.get_parameter('partial', 'boolean')
		negate = cf.get_parameter('negate', 'boolean')
		for rec in fasta_itr(fastafile):
			found = False
			for name in names:
				if hasattr(name, 'findall'):
					if len(name.findall(rec.header)) > 0:
						found = True
						break
				elif (partial and rec.header.find(name) >= 0) or \
					rec.header == name:
					found = True
					break
			# Keep matches normally, non-matches when negated.
			if found != bool(negate):
				outputfile.write(str(rec) + "\n")
	return constants.OK
Пример #3
0
def getOverRepClusters(cf):
	"""Write the seeds of over-represented clusters to a FASTA file.

	The number of records in the 'fastqfile' input is taken as the total. In
	the tab-delimited 'resultsuc' input, every row whose first column is 'H'
	counts as one member of the cluster named in the row's last column. A
	record of the 'resultsfa' input is written to the 'resultsfa' output when
	its header is one of the counted clusters and that cluster's count is at
	least 'percRep' percent of the total.

	Returns constants.OK.
	"""
	fastqfile = cf.get_input('fastqfile')
	resultsuc = cf.get_input('resultsuc')
	resultsfa = cf.get_input('resultsfa')
	percRep = cf.get_parameter('percRep', 'float')
	output = cf.get_output('resultsfa')
	fqp = FastqParser()
	# Every handle is opened in a with block so it is closed even on errors.
	with open(fastqfile, 'rb') as fastqfh:
		totalSeqs = sum(1 for rec in fqp.parse(fastqfh))
	clusterCounts = {}
	with open(resultsuc, 'rb') as ucfh:
		reader = csv.reader(ucfh, quoting=csv.QUOTE_NONE, delimiter='\t')
		for row in reader:
			if row[0] == 'H':
				clusterCounts[row[-1]] = clusterCounts.get(row[-1], 0) + 1
	with open(output, 'wb') as outfh:
		for rec in fasta_itr(resultsfa):
			if rec.header not in clusterCounts:
				continue
			clusterRep = (float(clusterCounts[rec.header]) / float(totalSeqs)) * 100
			if clusterRep >= percRep:
				outfh.write(str(rec) + '\n')
	return constants.OK
Пример #4
0
def fasta_read(file_name):
    """Return a dict mapping each record header in a FASTA file to its sequence.

    If a header occurs more than once, the last record with that header wins.
    """
    return dict(
        (entry.header, entry.sequence)
        for entry in fasta.fasta_itr(file_name))
Пример #5
0
def getAllData(dataFile):
	"""Return [header, sequence] pairs for every record of a FASTA file.

	Both fields are stripped of surrounding whitespace and have all
	parentheses and single quotes removed.
	"""
	unwanted = '[()\'\']'
	pairs = []
	for entry in fasta.fasta_itr(dataFile):
		cleanSequence = re.sub(unwanted, '', entry.sequence.strip())
		cleanHeader = re.sub(unwanted, '', entry.header.strip())
		pairs.append([cleanHeader, cleanSequence])
	return pairs
Пример #6
0
def printMIRAContigs(orgdir):
	"""Copy the unpadded MIRA assembly of orgdir to <orgdir>/contigs.fa.

	Reads <orgdir>/mira_assembly/mira_d_results/mira_out.unpadded.fasta and
	writes str(rec) plus a newline for every record. An existing contigs.fa
	is overwritten.
	"""
	miraasm = os.path.join(orgdir, "mira_assembly", "mira_d_results",
		"mira_out.unpadded.fasta")
	# The with block closes contigs.fa even if reading the assembly fails.
	with open(os.path.join(orgdir, "contigs.fa"), 'w') as outfh:
		for rec in fasta_itr(miraasm):
			outfh.write(str(rec) + "\n")
def fasta_read(file_name):
	"""Load a FASTA file into a dict keyed by record header.

	Each value is the sequence of the record; a repeated header keeps the
	sequence of its last occurrence.
	"""
	sequences_by_header = {}
	for entry in fasta.fasta_itr(file_name):
		sequences_by_header[entry.header] = entry.sequence
	return sequences_by_header
Пример #8
0
def getAllData(dataFile):
    """Collect the cleaned [header, sequence] pair of every FASTA record.

    Cleaning strips surrounding whitespace and removes every parenthesis and
    single quote from the text.
    """

    def scrub(text):
        return re.sub('[()\'\']', '', text.strip())

    def cleaned_pair(entry):
        # The sequence is cleaned before the header, as in the original loop.
        sequence = scrub(entry.sequence)
        header = scrub(entry.header)
        return [header, sequence]

    return [cleaned_pair(entry) for entry in fasta.fasta_itr(dataFile)]
Пример #9
0
def appendUnassembledReads(orgdir, asmreads):
	"""Append the unassembled reads to the final contig file.

	Every record of <orgdir>/reads.fa whose header is not contained in
	asmreads is appended to <orgdir>/contigs.fa.

	asmreads is tested with the `in` operator, so a dict keyed by read header
	(as required by the previous has_key call) or any other container of
	headers works.
	"""
	readfile = os.path.join(orgdir, "reads.fa")
	# The with block closes contigs.fa even if reading the reads fails.
	with open(os.path.join(orgdir, "contigs.fa"), 'a') as outfh:
		for rec in fasta_itr(readfile):
			if rec.header not in asmreads:
				outfh.write(str(rec) + "\n")
Пример #10
0
def fasta_merge(cf):
    """Concatenate the records of several FASTA files into the 'output' file.

    The files come from get_array(cf, "fastafiles"), which is iterated as
    (key, path) pairs; the keys are ignored. The string form of that array is
    also passed to cf.write_log.

    Returns constants.OK.
    """
    # The with block closes the output even if one of the inputs fails.
    with open(cf.get_output("output"), "w") as outfh:
        fastafiles = get_array(cf, "fastafiles")
        cf.write_log(str(fastafiles))
        for _key, fastafile in fastafiles:
            for rec in fasta_itr(fastafile):
                outfh.write(str(rec) + "\n")
    return constants.OK
Пример #11
0
def CreateNegDict(NegativeFileName):
	"""Read a FASTA file into two parallel lists.

	Returns a tuple (NegSequences, NegHeaders): the sequence and the header
	of record i are at index i of the respective list.

	The file is read only through fasta.fasta_itr, which receives the file
	name; the extra handle the previous version opened and never used or
	closed is gone.
	"""
	NegSequences = []
	NegHeaders = []
	for record in fasta.fasta_itr(NegativeFileName):
		NegSequences.append(record.sequence)
		NegHeaders.append(record.header)
	return NegSequences, NegHeaders
Пример #12
0
def addESTs(readsetfh, orgname, nembasedir, outlog):
	"""Write the EST contig set of orgname to the readset.

	The records are read from <nembasedir>/est/_outputCtgSet_<orgname>/output,
	renamed to Contig_EST_<n> (n starting at 1) and handed to writeSeq
	together with readsetfh. If that file does not exist, a note is written
	to outlog and nothing else happens.
	"""
	estset = os.path.join(nembasedir, "est", "_outputCtgSet_%s" % orgname, "output")
	if os.path.exists(estset):
		for number, rec in enumerate(fasta_itr(estset), 1):
			rec.header = "Contig_EST_%s" % number
			writeSeq(rec, readsetfh)
	else:
		outlog.write("ESTs for %s not found \n" % orgname)
Пример #13
0
def add454(readsetfh, orgname, rundata, nembasedir, outlog):
	"""Write the CAP3 results of every 454 experiment to the readset.

	For each experiment in rundata.exps454 the files output.cap.contigs and
	output.cap.singlets are expected in
	<nembasedir>/454/_cap3Asm_<exp>/assemblydir. An experiment missing either
	file is reported on stdout and in outlog and is skipped. Otherwise the
	contigs are written first, then the singlets; every record is renamed to
	Contig_454_<n> (n keeps counting across experiments) and handed to
	writeSeq together with readsetfh.
	"""
	counter = 1
	for exp in rundata.exps454:
		asmdir = os.path.join(nembasedir, "454", "_cap3Asm_%s" % exp, "assemblydir")
		sources = (
			os.path.join(asmdir, "output.cap.contigs"),
			os.path.join(asmdir, "output.cap.singlets"),
		)
		if not all(os.path.exists(path) for path in sources):
			message = "Warning: Experiment %s not found for %s" % (exp, orgname)
			print(message)
			outlog.write(message + "\n")
			continue
		for source in sources:
			for rec in fasta_itr(source):
				rec.header = "Contig_454_%s" % counter
				writeSeq(rec, readsetfh)
				counter += 1
Пример #14
0
def get_seqs(f):
  """Read a FASTA file and return its records, GC values and lengths.

  The sequence of every record is upper-cased in place before GC() is
  applied to it.

  Returns a tuple (seqs, fg_gc_list, fg_lengths) of parallel lists: the
  records, GC(record.sequence) for each record, and the sequence lengths.

  The file is read only through fasta.fasta_itr(f); the separate handle the
  previous version opened was never read and leaked on errors, so it is gone.
  """
  seqs = []
  fg_gc_list = []
  fg_lengths = []
  for record in fasta.fasta_itr(f):
    record.sequence = record.sequence.upper()
    seqs.append(record)
    fg_gc_list.append(GC(record.sequence))
    fg_lengths.append(len(record.sequence))
  return seqs, fg_gc_list, fg_lengths
Пример #15
0
def renameContigs(indir, outdir):
	"""Rename the contigs in indir and write them to outdir.

	For every entry d of indir, the directory <outdir>/d is created and the
	records of <indir>/d/contigs.fa are written to <outdir>/d/contigs.fa with
	their header replaced by <code>_<n>, where code is getSpeciesCode(d) and
	n numbers the records of that file starting at 1.

	os.mkdir raises if <outdir>/d already exists.
	"""
	for d in os.listdir(indir):
		code = getSpeciesCode(d)
		os.mkdir(os.path.join(outdir, d))
		infile = os.path.join(indir, d, "contigs.fa")
		outfile = os.path.join(outdir, d, "contigs.fa")
		# The with block closes each output file even if reading infile fails.
		with open(outfile, 'w') as outfh:
			for index, rec in enumerate(fasta_itr(infile), 1):
				rec.header = code + "_" + str(index)
				outfh.write(str(rec) + "\n")
#!/usr/bin/python
#
# This software is freely provided for any use.
# 
# erik garrison <*****@*****.**>

import fasta
from fasta import fasta_itr
import sys

# Print every record of a FASTA file whose sequence is longer than a cutoff.
# Usage: <script> <contig_file> <contig length cutoff>
if len(sys.argv) < 3:
    # A single pre-formatted string prints identically under Python 2 and 3.
    print("usage: %s <contig_file> <contig length cutoff>" % sys.argv[0])
    # sys.exit() instead of the exit() helper, which the site module injects
    # for interactive use and which is missing under "python -S".
    # The exit status is unchanged.
    sys.exit()

contig_file = sys.argv[1]
cutoff = int(sys.argv[2])  # raises ValueError if the cutoff is not an integer

for rec in fasta_itr(contig_file):
    # Strictly longer: records of exactly `cutoff` bases are dropped.
    if len(rec.sequence) > cutoff:
        print(rec)
Пример #17
0
import re
import os

# Resolve the input paths from the command line and open the files.
# argv[1] is the positive FASTA file, argv[2] the negative FASTA file.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
PosFileBaseName = os.path.basename(PositiveFileName)
print PosFileBaseName
# The output is named after the positive file with a "NON" prefix; only the
# base name is used, so the path is relative to the working directory.
OutputFileName = "NON" + PosFileBaseName
print OutputFileName
# NOTE(review): PositiveFile and NegativeFile are not used in the visible part
# of the script (fasta_itr below is given the file names) - confirm they are
# needed further down.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName, "w")

# Length of every positive sequence, in file order.
PosLengths = []
for record in fasta.fasta_itr(
        PositiveFileName):  # Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)

# Parallel lists: NegSequences[i] and NegHeaders[i] belong to the same record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(
        NegativeFileName
):  # Create parallel arrays for negative sequences/headers
    sequence = record.sequence
    header = record.header

    NegSequences.append(sequence)
    NegHeaders.append(header)

# Filled by code beyond the visible part of this snippet.
indexArr = []
Пример #18
0
import fasta
import sys
import random

# Command line: argv[1] positive FASTA file, argv[2] negative FASTA file,
# argv[3] output file.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
OutputFileName = sys.argv[3]
# Earlier scheme that derived the output name from the positive file name:
#OutputFileName = PositiveFileName.lstrip("TTP_PARCLIP_ConversionSpecificity")
#OutputFileName = OutputFileName.rstrip(".txt")
#OutputFileName = "Non_TTPPARCLIP" + OutputFileName + ".txt"
# NOTE(review): PositiveFile and NegativeFile are not used in the visible part
# of the script (fasta_itr below is given the file names) - confirm they are
# needed further down.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName, "w")

# Length of every positive sequence, in file order.
PosLengths = []
for record in fasta.fasta_itr(
        PositiveFileName):  # Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)

# Parallel lists: NegSequences[i] and NegHeaders[i] belong to the same record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName):
    sequence = record.sequence
    header = record.header

    NegSequences.append(sequence)
    NegHeaders.append(header)

for number in PosLengths:
    gotLength = False
    seqLength = 0
Пример #19
0
import re
import os

# Resolve the input paths from the command line and open the files.
# argv[1] is the positive FASTA file, argv[2] the negative FASTA file.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
PosFileBaseName = os.path.basename(PositiveFileName);
print PosFileBaseName
# The output is named after the positive file with a "NON" prefix; only the
# base name is used, so the path is relative to the working directory.
OutputFileName = "NON" + PosFileBaseName 
print OutputFileName;
# NOTE(review): PositiveFile and NegativeFile are not used in the visible part
# of the script (fasta_itr below is given the file names) - confirm they are
# needed further down.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName , "w")

# Length of every positive sequence, in file order.
PosLengths = []
for record in fasta.fasta_itr(PositiveFileName):						# Remember lengths of positive sequences
	seqLength = len(record.sequence)
	PosLengths.append(seqLength)

# Parallel lists: NegSequences[i] and NegHeaders[i] belong to the same record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName):						# Create parallel arrays for negative sequences/headers
	sequence = record.sequence
	header = record.header
	
	NegSequences.append(sequence)
	NegHeaders.append(header)

# Filled by code beyond the visible part of this snippet.
indexArr = []
# Iterate through the lengths in the positive length array, picking a random
# negative sequence and taking LENGTH nucleotides from a random start point.
Пример #20
0
import fasta
import sys
import random

# Command line: argv[1] positive FASTA file, argv[2] negative FASTA file,
# argv[3] output file.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
OutputFileName = sys.argv[3];
# Earlier scheme that derived the output name from the positive file name:
#OutputFileName = PositiveFileName.lstrip("TTP_PARCLIP_ConversionSpecificity")
#OutputFileName = OutputFileName.rstrip(".txt")
#OutputFileName = "Non_TTPPARCLIP" + OutputFileName + ".txt"
# NOTE(review): PositiveFile and NegativeFile are not used in the visible part
# of the script (fasta_itr below is given the file names) - confirm they are
# needed further down.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName , "w")

# Length of every positive sequence, in file order.
PosLengths = []
for record in fasta.fasta_itr(PositiveFileName):						# Remember lengths of positive sequences
	seqLength = len(record.sequence)
	PosLengths.append(seqLength)

# Parallel lists: NegSequences[i] and NegHeaders[i] belong to the same record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName):
	sequence = record.sequence
	header = record.header
	
	NegSequences.append(sequence)
	NegHeaders.append(header)


for number in PosLengths:
	gotLength = False
# 
# erik garrison <*****@*****.**>

import fasta
from fasta import fasta_itr
import sys

# Tally the base composition of every record in a FASTA file.
# Usage: <script> <fasta file>
if len(sys.argv) < 2:
    # A single pre-formatted string prints identically under Python 2 and 3.
    print("usage: %s <fasta file>" % sys.argv[0])
    # sys.exit() instead of the exit() helper, which the site module injects
    # for interactive use and which is missing under "python -S".
    # The exit status is unchanged.
    sys.exit()

fasta_file = sys.argv[1]

# Tab-separated column header for the report.
print('\t'.join(["header", "length", "a", "t", "g", "c", "at", "gc", "other"]))

for rec in fasta_itr(fasta_file):
    l = len(rec.sequence)
    # Counters are floats from the start.
    a,t,g,c,at,gc,other = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    for bp in rec.sequence:
        # Only upper-case A/T/G/C are recognised; every other character,
        # including lower-case bases, is counted as "other".
        if bp == 'A':
            a += 1
        elif bp == 'T':
            t += 1
        elif bp == 'G':
            g += 1
        elif bp == 'C':
            c += 1
        else:
            other += 1
    at = a + t
    gc = g + c
    # NOTE(review): no per-record row is printed in the visible code; the
    # snippet appears to be cut off here.
Пример #22
0
#os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path)
#print os.environ["HMMERDB"]
# Absolute paths of the input FASTA file, the requested output file and the
# directory that will hold the output.
fname = os.path.abspath(options.input_fasta)
out_fname = os.path.abspath(options.out_fname)
out_dir = os.path.dirname(out_fname)

# Translation table that maps every nucleotide code in the first string to the
# code at the same position in the second (lower case first, then upper case).
tr = string.maketrans(
    "gatcryswkmbdhvnGATCRYSWKMBDHVN",
    "ctagyrswmkvhdbnCTAGYRSWMKVHDBN",
)


def rev_record(record):
    """Return the FASTA text of record on the opposite strand.

    The header line is ">" + record.header + "|rev"; the sequence is reversed,
    translated with the module-level table `tr` and passed through format().
    """
    flipped = record.sequence[::-1].translate(tr)
    return "".join([">", record.header, "|rev\n", format(flipped)])


# Load every record up front; `headers` keeps [header, sequence length] in
# the same order so that index i identifies a record.
records = list(fasta.fasta_itr(fname))
headers = [[rec.header, len(rec.sequence)] for rec in records]

# Write <out_fname>.fa with record i stored as "s<i>" and, right after it, its
# reversed and `tr`-translated sequence as "s<i>|rev". The with block closes
# the file even if a write fails.
with open(out_fname + '.fa', 'w') as ff:
    for (i, rec) in enumerate(records):
        ff.write('>s' + str(i) + '\n' + format(rec.sequence) + '\n')
        ff.write('>s' + str(i) + '|rev\n' +
                 format(rec.sequence[::-1].translate(tr)) + '\n')
#sys.exit(1)
# A temporary FASTA file; records are named s<int> to ease parsing.


def parse_hmmsearch(kingdom, moltype, src):
    # function to parse hmmsearch output
    resu = []
Пример #23
0
    parser.print_help()
    sys.exit(1)

#print "%s"% os.path.abspath(options.hmm_path)
#os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path)
#print os.environ["HMMERDB"]
# Translation table that maps every nucleotide code in the first string to the
# code at the same position in the second (lower case first, then upper case).
tr = string.maketrans(
    "gatcryswkmbdhvnGATCRYSWKMBDHVN",
    "ctagyrswmkvhdbnCTAGYRSWMKVHDBN",
)

# Absolute path of the input FASTA file.
fname = os.path.abspath(options.input_fasta)


def rev_record(record):
    """Return the FASTA text of record on the opposite strand.

    The header line is ">" + record.header + "|rev"; the sequence is reversed,
    translated with the module-level table `tr` and passed through format().
    """
    flipped = record.sequence[::-1].translate(tr)
    return "".join([">", record.header, "|rev\n", format(flipped)])

    
# Load every record up front; `headers` keeps [header, sequence length] in
# the same order so that index i identifies a record.
records = list(fasta.fasta_itr(fname))
headers = [[rec.header,len(rec.sequence)] for rec in records]


# Reserve a temporary file name. Only the name is needed, so the handle is
# closed right away: it was previously left open while the file was reopened
# by name, which leaks a descriptor and does not work on Windows.
# delete=False keeps the file on disk after the close.
temp_fasta = tempfile.NamedTemporaryFile(delete=False)
temp_fasta.close()
# Record i is stored as "s<i>" and, right after it, its reversed and
# `tr`-translated sequence as "s<i>|rev". The with block closes the file even
# if a write fails.
with open(temp_fasta.name,'w') as ff:
    for (i, rec) in enumerate(records):
        ff.write('>s'+str(i)+'\n'+format(rec.sequence)+'\n')
        ff.write('>s'+str(i)+'|rev\n'+format(rec.sequence[::-1].translate(tr))+'\n')
#sys.exit(1)
# A temporary FASTA file; records are named s<int> to ease parsing.

def parse_hmmsearch(kingdom, moltype, src):
# function to parse hmmsearch output
    resu = []
Пример #24
0
#os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path)
#print os.environ["HMMERDB"]
# Absolute paths of the input FASTA file, the requested output file and the
# directory that will hold the output.
fname = os.path.abspath(options.input_fasta)
out_fname = os.path.abspath(options.out_fname)
out_dir = os.path.dirname(out_fname)

# Translation table that maps every nucleotide code in the first string to the
# code at the same position in the second (lower case first, then upper case).
tr = string.maketrans(
    "gatcryswkmbdhvnGATCRYSWKMBDHVN",
    "ctagyrswmkvhdbnCTAGYRSWMKVHDBN",
)


def rev_record(record):
    """Return the FASTA text of record on the opposite strand.

    The header line is ">" + record.header + "|rev"; the sequence is reversed,
    translated with the module-level table `tr` and passed through format().
    """
    flipped = record.sequence[::-1].translate(tr)
    return "".join([">", record.header, "|rev\n", format(flipped)])


# Load every record up front; `headers` keeps [header, sequence length] in
# the same order so that index i identifies a record.
records = list(fasta.fasta_itr(fname))
headers = [[rec.header, len(rec.sequence)] for rec in records]

# Write <out_fname>.fa with record i stored as "s<i>" and, right after it, its
# reversed and `tr`-translated sequence as "s<i>|rev". The with block closes
# the file even if a write fails.
with open(out_fname + '.fa', 'w') as ff:
    for (i, rec) in enumerate(records):
        ff.write('>s' + str(i) + '\n' + format(rec.sequence) + '\n')
        ff.write('>s' + str(i) + '|rev\n' +
                 format(rec.sequence[::-1].translate(tr)) + '\n')
#sys.exit(1)
# A temporary FASTA file; records are named s<int> to ease parsing.


def parse_hmmsearch(kingdom, moltype, src):
    # function to parse hmmsearch output
    resu = []