def break_contigs(contigs_file, sam_file, output_file):
    contigs = list(SeqIO.parse(open(contigs_file, "r"), "fasta"))
    # sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    sam = sam_parser.Samfile(sam_file)
    # Last two arguments: K, and the minimum zero-coverage stretch length that triggers a break
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    coverage_breaker.OutputBroken(output_file)
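# Usage sketch (all three file names are placeholders; SeqIO, sam_parser, and
# break_by_coverage are this project's own imports, assumed available here):
#
#     break_contigs("scaffolds.fasta", "alignment.sam", "scaffolds.broken.fasta")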
def collect_contigs(dataset, barcodes_dir, output_base, format):
    output = open(output_base + "." + format, "w")
    for barcode in dataset:
        file = os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)
        if os.path.exists(file):
            contigs = SeqIO.parse(open(file), format)
            for contig in contigs:
                contig.id = barcode.id + "-" + contig.id
                SeqIO.write(contig, output, format)
    output.close()
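# Example call (hedged): assumes `dataset` yields barcode objects with an .id
# attribute and a per-barcode layout <barcodes_dir>/<id>/truseq_long_reads.<format>:
#
#     collect_contigs(dataset, "barcodes", "all_truseq_contigs", "fasta")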
def main():
    # Params
    parser = argparse.ArgumentParser()
    parser.add_argument("--refseq_in", required=True, help="RefSeq fasta file", type=str)
    parser.add_argument("--tss_in", required=True, help="TSS input file", type=str)
    parser.add_argument("--wigs_in", required=True,
                        help="Term-Seq coverage file(s) (.wig); must contain forward and reverse files",
                        type=str)
    parser.add_argument("--gff_out", required=True, help="GFF output file name for terminators", type=str)
    parser.add_argument("--distance", required=True, help="Distance to look for a terminator after a TSS", type=int)
    args = parser.parse_args()
    # ---------------------------
    print("Loading sequence file...")
    fasta_parsed = SeqIO.parse(glob.glob(args.refseq_in)[0], "fasta")
    wig_files = glob.glob(args.wigs_in)
    f_wigs_parsed, r_wigs_parsed = WM(wig_files, fasta_parsed).build_matrix()
    accession = ""
    # Re-parse the fasta: the previous SeqIO.parse iterator was exhausted by build_matrix()
    fasta_parsed = SeqIO.parse(glob.glob(args.refseq_in)[0], "fasta")
    for seq_record in fasta_parsed:
        f_seq_str = str(seq_record.seq)
        accession = seq_record.id
        print(f_wigs_parsed[accession].to_string())
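# Example invocation (the script name and all file names are placeholders):
#
#     python find_terminators.py --refseq_in genome.fa --tss_in tss_annotations.gff \
#         --wigs_in "termseq_*.wig" --gff_out terminators.gff --distance 300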
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")
    contigs = list(SeqIO.parse(open(contigs_file, "r"), "fasta"))
    sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter, n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in " + output_file + ".fastq")
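# Usage sketch (placeholder paths; `pattern` and `rc_pattern` are module-level
# globals in this file, and `log` is any logger exposing .info()):
#
#     moleculo_postprocessing("scaffolds.fasta", "scaffolds.final",
#                             ["lib1.sam", "lib2.sam"], log)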
import numpy
from Bio import SeqIO

# `filename` is assumed to be defined elsewhere in this script
with open(filename, "r") as datafile:
    records = list(SeqIO.parse(datafile, "fasta"))
num_records = len(records)

clusters = []
# scipy-style linkage matrix: one row per merge, four columns
linkage_matrix = numpy.zeros((num_records - 1, 4))

def init_clusters():
    # Start with a single cluster containing every record index
    global clusters
    clusters = [list(range(num_records))]

def split_cluster():
    # Pick the cluster with the largest total pairwise distance, remembering
    # the most distant pair inside it as the seeds for the split
    max_c = None
    max_sum = -1
    max_i = -1
    max_j = -1
    for c in clusters:
        dist_sum = 0
        max_d = -1
        max_di = -1
        max_dj = -1
        for i in range(len(c)):
            for j in range(i + 1, len(c)):
                d = get_distance(c[i], c[j])
                if d > max_d:
                    max_d = d
                    max_di = c[i]
                    max_dj = c[j]
                dist_sum += d
        if dist_sum > max_sum:
            max_sum = dist_sum
            max_c = c
            max_i = max_di
            max_j = max_dj
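# `get_distance` is not defined in this fragment. A minimal stand-in, assuming
# the records are pre-aligned sequences of comparable length, is a normalized
# Hamming distance over record indices:
def get_distance(i, j):
    a, b = str(records[i].seq), str(records[j].seq)
    return sum(x != y for x, y in zip(a, b)) / float(min(len(a), len(b)))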
# Fragment: the tail of an Apply(seq, ins, d) routine that interleaves
# insertions and deletions while tracking a running offset l.
            l += len(ins[i][1])
            last = ins[i][0]
            i += 1
        else:
            if last < d[j][0]:
                result.append(seq[last:d[j][0]])
            l += d[j][0] - last
            sys.stdout.write("Deletion: " + str(l) + " " + str(d[j][1]) + "\n")
            last = d[j][0] + d[j][1]
            j += 1
    result.append(seq[last:])
    return "".join(result)

def Generate(input, output, numins, numdel):
    reference = list(input)
    result = "".join([str(ch.seq) for ch in reference])
    l = sum([len(ch) for ch in reference])
    ins = GroupByChrom(GenerateInsertions(numins, result), reference)
    d = GroupByChrom(GenerateDeletions(numdel, result), reference)
    for ch_ins, ch_d, chrom in zip(ins, d, reference):  # izip in the original; zip on Python 3
        sys.stdout.write("Chromosome " + chrom.id + "\n")
        rec = SeqIO.SeqRecord(Apply(str(chrom.seq), ch_ins, ch_d), chrom.id)
        SeqIO.write(rec, output, "fasta")

if __name__ == '__main__':
    Generate(SeqIO.parse(open(sys.argv[1], "r"), "fasta"),
             open(sys.argv[2], "w"),
             int(sys.argv[3]),
             int(sys.argv[4]))  # the original passed sys.argv[3] twice, so numdel was never read
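# Example run (hypothetical script name; argv is input fasta, output fasta,
# number of insertions, number of deletions):
#
#     python generate_mutations.py reference.fasta mutated.fasta 100 100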
# Subtract BUFFER from qstart, clamping at zero
# (np.where on the column, or pybedtools, are alternative ways to apply the shift)
df['qstart'] = (df['qstart'] - BUFFER).clip(lower=0)
df = df.apply(Xbuffer, axis=1)
df = df.apply(Ybuffer, axis=1)

# Re-rank the blast table by e-value (ascending), then bitscore (descending);
# sort_values returns a new frame, so assign it back
df = df.sort_values(by=['evalue', 'bitscore'], ascending=[True, False])

# Create a new file for each blasted TE: the renamed consensus first, then the
# top 40 blast hits
with open(TE + '.fas', "w") as TEfile:
    for record in SeqIO.parse(INPUT, "fasta"):
        # Rewrite the consensus header: '__name___' becomes '#name/'
        record.id = 'CONSENSUS' + re.sub(r'__(.*)___', r'#\1/', record.id)
        SeqIO.write(record, TEfile, "fasta")  # write the record, not record.id
    # Add the top 40 blast hits to the new file: write every record whose id
    # appears among the top-40 qseqids
    top_hits = set(df['qseqid'].head(40))
    for record in SeqIO.parse(INPUT, "fasta"):
        if record.id in top_hits:
            SeqIO.write(record, TEfile, "fasta")
# Align with muscle
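# One way to run the "align with muscle" step (assumptions: muscle v3-style
# -in/-out flags, the binary on PATH, and TE defined as above):
import subprocess
subprocess.run(["muscle", "-in", TE + ".fas", "-out", TE + ".aln"], check=True)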
3/168: import SeqIO from Bio
3/169: from Bio import SeqIO
3/170: SeqIO
4/1: import sys
4/2:
from Bio import SeqIO
from Bio.Alphabet import DNAAlphabet
sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta", DNAAlphabet)
4/3: len(sprot)
4/4: sprot
4/5:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta", ProteinAlphabet())
4/6: sprot
4/7: len(sprot)
4/8:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
sprot = SeqIO.parse("uniprot_sprot.fasta", "fasta")
4/9: sprot
4/10: len(sprot)
4/11: sprot
4/12:
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
sprot_raw = "uniprot_sprot.fasta"
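# The history above trips over two Biopython pitfalls: SeqIO.parse() returns an
# iterator, so len(sprot) raises TypeError, and Bio.Alphabet was removed in
# Biopython 1.78, so the alphabet argument is no longer accepted. Materializing
# the records first makes len() work:
from Bio import SeqIO
sprot = list(SeqIO.parse("uniprot_sprot.fasta", "fasta"))
print(len(sprot))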