def break_contigs(contigs_file, sam_file, output_file):
    contigs = list(SeqIO.parse(open(contigs_file, "r"), "fasta"))
    # sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    sam = sam_parser.Samfile(sam_file)
    # last two arguments: K (k-mer size) and the minimum zero-coverage stretch length at which to break
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    coverage_breaker.OutputBroken(output_file)
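
A minimal invocation sketch; the three paths below are hypothetical:

# Break contigs at low-coverage stretches detected from the SAM alignments.
break_contigs("contigs.fasta", "alignments.sam", "broken_contigs.fasta")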
def collect_contigs(dataset, output_dir, output_base, format):
    output = open(output_base + "." + format, "w")
    for barcode in dataset:
        file = os.path.join(output_dir, barcode.id, "truseq_long_reads." + format)
        if os.path.exists(file):
            contigs = SeqIO.parse(open(file), format)
            for contig in contigs:
                contig.id = barcode.id + "-" + contig.id
                SeqIO.write(contig, output, format)
    output.close()
Example #3
def collect_contigs(dataset, barcodes_dir, output_base, format):
    output = open(output_base + "." + format, "w")
    for barcode in dataset:
        file = os.path.join(barcodes_dir, barcode.id,
                            "truseq_long_reads." + format)
        if os.path.exists(file):
            contigs = SeqIO.parse(open(file), format)
            for contig in contigs:
                contig.id = barcode.id + "-" + contig.id
                SeqIO.write(contig, output, format)
    output.close()
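
A usage sketch for collect_contigs; the barcode objects only need an .id attribute, and every name below is hypothetical:

from types import SimpleNamespace

# Any iterable of objects exposing .id works as the dataset.
barcodes = [SimpleNamespace(id="BC01"), SimpleNamespace(id="BC02")]
collect_contigs(barcodes, "barcodes_dir", "all_long_reads", "fasta")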
def main():
    # Params
    parser = argparse.ArgumentParser()
    parser.add_argument("--refseq_in", required=True, help="RefSeq fasta file", type=str)
    parser.add_argument("--tss_in", required=True, help="RefSeq fasta file", type=str)
    parser.add_argument("--wigs_in", required=True,
                        help="Term-Seq coverage file(s) (.wig), Must contain forward and reverse files", type=str)
    parser.add_argument("--gff_out", required=True, help="GFF output file name for terminators", type=str)
    parser.add_argument("--distance", required=True, help="Distance to look for terminator after a TSS", type=int)
    args = parser.parse_args()

    # ---------------------------
    print("Loading sequence file...")
    fasta_parsed = SeqIO.parse(glob.glob(args.refseq_in)[0], "fasta")
    wig_files = glob.glob(args.wigs_in)
    f_wigs_parsed, r_wigs_parsed = WM(wig_files, fasta_parsed).build_matrix()
    accession = ""

    # Re-parse: SeqIO.parse returns a one-shot generator, and it was exhausted building the matrix above
    fasta_parsed = SeqIO.parse(glob.glob(args.refseq_in)[0], "fasta")
    for seq_record in fasta_parsed:
        f_seq_str = str(seq_record.seq)
        accession = seq_record.id
        print(f_wigs_parsed[accession].to_string())
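
The re-parse above is needed because SeqIO.parse yields a one-shot generator, not a list. A minimal illustration (the FASTA file name is hypothetical):

from Bio import SeqIO

records = SeqIO.parse("example.fasta", "fasta")  # a generator, not a list
print(sum(1 for _ in records))  # consumes it: prints the record count
print(sum(1 for _ in records))  # prints 0: the generator is now exhausted

Wrapping the parser in list() is the usual fix when several passes are needed.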
Example #5
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")
    contigs = list(SeqIO.parse(open(contigs_file, "r"), "fasta"))
    sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter, n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in " + output_file + ".fastq")
Example #6
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")
    contigs = list(SeqIO.parse(open(contigs_file, "r"), "fasta"))
    sam = sam_parser.SamChain(
        [sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(
        contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern,
                                                       150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter,
                            n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in " +
             output_file + ".fastq")
Example #7
import numpy
from Bio import SeqIO

# filename must point at the input FASTA (not defined in this excerpt);
# records are loaded first so num_records exists when the matrix is built.
with open(filename, "r") as datafile:
    records = list(SeqIO.parse(datafile, "fasta"))
    num_records = len(records)

clusters = []
linkage_matrix = numpy.zeros((num_records - 1, 4))  # zeros takes a shape tuple

def init_clusters():
    global clusters  # rebind the module-level list, not a local
    clusters = [list(range(num_records))]

def split_cluster():
    max_c = None
    max_sum = -1
    max_i = -1
    max_j = -1

    for c in clusters:
        total = 0    # renamed from 'sum', which shadowed the builtin
        max_d = -1   # largest pairwise distance seen within this cluster
        max_di = -1
        max_dj = -1
        for i in range(len(c)):
            for j in range(i + 1, len(c)):
                d = get_distance(c[i], c[j])
                if d > max_d:
                    max_d = d
                    max_di = c[i]
                    max_dj = c[j]
                total += d
        # keep the cluster with the largest total pairwise distance,
        # together with its most distant pair of records
        if total > max_sum:
            max_sum = total
            max_c = c
            max_i = max_di
            max_j = max_dj
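
get_distance is not defined in this excerpt; a minimal stand-in, assuming the parsed records hold equal-length (e.g. pre-aligned) sequences and that plain Hamming distance is acceptable:

def get_distance(i, j):
    # Hypothetical helper: Hamming distance between records i and j.
    a, b = records[i].seq, records[j].seq
    return sum(1 for x, y in zip(a, b) if x != y)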
Example #8
                l += len(ins[i][1])
                last = ins[i][0]
            i += 1
        else:
            if last < d[j][0]:
                result.append(seq[last:d[j][0]])
                l += d[j][0] - last
                sys.stdout.write("Deletion: " + str(l) + " " + str(d[j][1]) +
                                 "\n")
                last = d[j][0] + d[j][1]
            j += 1
    result.append(seq[last:])
    return "".join(result)


def Generate(input, output, numins, numdel):
    reference = list(input)
    result = "".join([str(ch.seq) for ch in reference])  # str(): ''.join needs strings, not Seq objects
    l = sum([len(ch) for ch in reference])
    ins = GroupByChrom(GenerateInsertions(numins, result), reference)
    d = GroupByChrom(GenerateDeletions(numdel, result), reference)
    for ch_ins, ch_d, chrom in zip(ins, d, reference):  # itertools.izip is Python 2 only
        sys.stdout.write("Chromosome " + chrom.id + "\n")
        rec = SeqIO.SeqRecord(Apply(str(chrom.seq), ch_ins, ch_d), chrom.id)
        SeqIO.write(rec, output, "fasta")


if __name__ == '__main__':
    Generate(SeqIO.parse(open(sys.argv[1], "r"), "fasta"),
             open(sys.argv[2], "w"), int(sys.argv[3]), int(sys.argv[4]))

import re
import numpy as np

def Xbuffer(df):
    # Pad qstart left by BUFFER, clamping at 0; np.where keeps this
    # vectorized (the original note also considered pybedtools for this).
    df['qstart'] = np.where(df['qstart'] - BUFFER < 0, 0, df['qstart'] - BUFFER)
    return df

df = Xbuffer(df)
df = Ybuffer(df)  # counterpart not shown in this excerpt; presumably pads qend

#rerank blast file by e value

df = df.sort_values(by=['evalue', 'bitscore'], ascending=[True, False])

#create new file for each blasted TE 

TEfile = open(TE + '.fas', "w")  # TE (element name) and INPUT (FASTA path) must be defined elsewhere
for record in SeqIO.parse(INPUT, "fasta"):
    # rewrite '__<name>___' ids as '#<name>/' and tag them as consensus
    record.id = re.sub(r'__(.*)___', r'#\1/', record.id)
    record.id = 'CONSENSUS' + record.id
    SeqIO.write(record, TEfile, "fasta")

#add the top 40 blast hits to the new file (full records, not just the ids)
top_ids = set(df['qseqid'].head(40))
for record in SeqIO.parse(INPUT, "fasta"):
    if record.id in top_ids:
        SeqIO.write(record, TEfile, "fasta")
TEfile.close()
#align with muscle 
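
The alignment step stops at the comment above; a minimal sketch using subprocess, assuming MUSCLE 3.8's -in/-out flags and the TE + '.fas' file written earlier:

import subprocess

# Hypothetical invocation; MUSCLE v5 uses -align/-output instead of -in/-out.
subprocess.run(["muscle", "-in", TE + ".fas", "-out", TE + ".afa"], check=True)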
3/168: import SeqIO from Bio
3/169: from Bio import SeqIO
3/170: SeqIO
 4/1: import sys
 4/2:
from Bio import SeqIO
from Bio.Alphabet import DNAAlphabet

sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta", DNAAlphabet)
 4/3: len(sprot)
 4/4: sprot
 4/5:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet

sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta", ProteinAlphabet())
 4/6: sprot
 4/7: len(sprot)
 4/8:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet

sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta")
 4/9: sprot
4/10: len(sprot)
4/11: sprot
4/12:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet

sprot_raw = "uniprot_sprot.fasta"
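
The len(sprot) calls above fail because SeqIO.parse returns a generator (and Bio.Alphabet was removed in Biopython 1.78, so the alphabet arguments no longer work). A minimal way to count the records is to consume the generator:

from Bio import SeqIO

# len() raises TypeError on a generator; count by iterating instead.
num_records = sum(1 for _ in SeqIO.parse("uniprot_sprot.fasta", "fasta"))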