Python Ace.parse示例，Bio.Sequencing.Ace.parse Python示例

示例#1

0

显示文件

文件： ace2fasta.py 项目： Ecological-and-Evolutionary-Genomics/Scripts

def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file to `out_file` as FASTA alignments.

    For each contig yielded by ``Ace.parse`` an alignment is built holding
    the consensus sequence followed by every read whose name does not
    contain "pseudo".  Each read is clipped with ``cut_ends`` (quality
    clipping bounds) and padded with ``pad_read`` before being added.  The
    FASTA rendering of each alignment is appended to the output file.

    in_file  -- path of the ACE file to read
    out_file -- path of the FASTA file to (over)write
    """
    # Both files are context-managed so the input handle is closed as well.
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Iterating the parser directly ends cleanly at end of input and
            # lets genuine parsing/IO errors propagate instead of being
            # reported as "All contigs treated".
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))

                # The consensus sequence is always the first record.
                align.add_sequence(contig.name, contig.sequence)

                contig_len = len(contig.sequence)
                # contig.reads and contig.af are addressed with the same index.
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)

                output_file.write(align.format("fasta"))
            print("All contigs treated")

示例#2

0

显示文件

def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file to `out_file` as FASTA alignments.

    For each contig yielded by ``Ace.parse`` an alignment is built holding
    the consensus sequence followed by every read whose name does not
    contain "pseudo".  Each read is clipped with ``cut_ends`` (quality
    clipping bounds) and padded with ``pad_read`` before being added.  The
    FASTA rendering of each alignment is appended to the output file.

    in_file  -- path of the ACE file to read
    out_file -- path of the FASTA file to (over)write
    """
    # Both files are context-managed so the input handle is closed as well.
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Iterating the parser directly ends cleanly at end of input and
            # lets genuine parsing/IO errors propagate instead of being
            # reported as "All contigs treated".
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))

                # The consensus sequence is always the first record.
                align.add_sequence(contig.name, contig.sequence)

                contig_len = len(contig.sequence)
                # contig.reads and contig.af are addressed with the same index.
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)

                output_file.write(align.format("fasta"))
            print("All contigs treated")

示例#3

0

显示文件

文件： ace2gene_expression.py 项目： Ecological-and-Evolutionary-Genomics/Scripts

def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.

    Writes a tab-separated matrix to `out_file`: a header line
    (gene_name, gene_length, one column per tag, XX_noTag) followed by one
    line per reported contig.  A sequence is counted under every tag found
    in its name; a sequence matching no tag, and whose name does not contain
    "Consensus", is counted under XX_noTag.  The gene length is the length
    of the first alignment record with "*" characters removed.

    in_ace   -- path of the ACE file to read
    out_file -- path of the matrix file to (over)write
    tags     -- iterable of tag strings searched for in sequence names
    min_seq  -- contigs whose alignment yields fewer records than this
                (as returned by read_fasta_2list) are skipped
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    tag_list = list(tags)
    # Single source of truth for the column order: the header and every data
    # row are written from this list so the columns can never drift apart
    # (the rows used to be written in sorted-key order instead).
    columns = tag_list + [no_tag]
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write("gene_name\tgene_length\t" +
                              "\t".join(columns) + "\n")
            # Iterating the parser directly lets real parsing errors
            # propagate instead of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_len = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue
                # NOTE(review): raises IndexError when the first record name
                # does not contain "Contig_<digits>" - confirm that is wanted.
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                contig_length = str(len(sequences[0][1].replace("*", "")))
                print("Treating " + contig_name)
                counts = dict.fromkeys(columns, 0)
                for fasta in sequences:
                    name = fasta[0]
                    found_tag = False
                    for tag in tag_list:
                        if tag in name:
                            counts[tag] += 1
                            found_tag = True
                    if not found_tag and "Consensus" not in name:
                        counts[no_tag] += 1
                row = [contig_name, contig_length]
                row.extend([str(counts[column]) for column in columns])
                output_file.write("\t".join(row) + "\n")
            print("***All contigs treated***")

示例#4

0

显示文件

文件： ace2gene_expression.py 项目： zhiping393/Scripts

def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.

    Writes a tab-separated matrix to `out_file`: a header line
    (gene_name, gene_length, one column per tag, XX_noTag) followed by one
    line per reported contig.  A sequence is counted under every tag found
    in its name; a sequence matching no tag, and whose name does not contain
    "Consensus", is counted under XX_noTag.  The gene length is the length
    of the first alignment record with "*" characters removed.

    in_ace   -- path of the ACE file to read
    out_file -- path of the matrix file to (over)write
    tags     -- iterable of tag strings searched for in sequence names
    min_seq  -- contigs whose alignment yields fewer records than this
                (as returned by read_fasta_2list) are skipped
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    tag_list = list(tags)
    # Single source of truth for the column order: the header and every data
    # row are written from this list so the columns can never drift apart
    # (the rows used to be written in sorted-key order instead).
    columns = tag_list + [no_tag]
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write("gene_name\tgene_length\t" +
                              "\t".join(columns) + "\n")
            # Iterating the parser directly lets real parsing errors
            # propagate instead of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_len = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue
                # NOTE(review): raises IndexError when the first record name
                # does not contain "Contig_<digits>" - confirm that is wanted.
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                contig_length = str(len(sequences[0][1].replace("*", "")))
                print("Treating " + contig_name)
                counts = dict.fromkeys(columns, 0)
                for fasta in sequences:
                    name = fasta[0]
                    found_tag = False
                    for tag in tag_list:
                        if tag in name:
                            counts[tag] += 1
                            found_tag = True
                    if not found_tag and "Consensus" not in name:
                        counts[no_tag] += 1
                row = [contig_name, contig_length]
                row.extend([str(counts[column]) for column in columns])
                output_file.write("\t".join(row) + "\n")
            print("***All contigs treated***")

示例#5

0

显示文件

文件： ace_to_alignment.py 项目： nickloman/benchtop-sequencing-comparison

def parse_ace(ace_file):
    """Build an alignment from the first contig of an ACE file.

    Only the first contig yielded by ``Ace.parse`` is used.  The alignment
    holds the contig consensus followed by every read, clipped with
    ``cut_ends`` (quality clipping bounds) and padded with ``pad_read``.
    Each read is named ``<read name>_<coru>`` using the ``coru`` field of
    the AF record at the same index.

    Returns a ``(contig, alignment)`` tuple.  ``StopIteration`` propagates
    if the parser yields no contig.
    """
    # The handle is only needed to pull the first contig, so it is closed
    # right away instead of being left open for the life of the process.
    with open(ace_file, 'r') as handle:
        contig = next(Ace.parse(handle))

    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)

    contig_len = len(contig.sequence)
    for readn, read in enumerate(contig.reads):
        af = contig.af[readn]
        seq = cut_ends(read.rd.sequence,
                       read.qa.qual_clipping_start,
                       read.qa.qual_clipping_end)
        seq = pad_read(seq, af.padded_start, contig_len)
        align.add_sequence(read.rd.name + "_" + af.coru, seq)

    return contig, align

示例#6

0

显示文件

文件： AceIO.py 项目： manucorreia/biopython

def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Each contig's consensus becomes one SeqRecord whose id and name are the
    contig name.  The alphabet is guessed from the consensus letters: DNA
    unless it contains "U" (then RNA), or the generic nucleotide alphabet
    when it contains both "U" and "T".  Any "*" gap characters are mapped
    to "-" and the alphabet is then wrapped in Gapped."""

    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        #Assume its DNA unless there is a U in it,
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" in consensus_seq_str:
            #Very odd! Error?
            #Imported here so this rarely taken branch cannot fail with a
            #NameError (the name used to be misspelt "generic_ncleotide").
            from Bio.Alphabet import generic_nucleotide
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO - Consensus base quality (BQ lines).  Note that any gaps
        #(* character) in the consensus does not get a quality entry.
        #This really needs Biopython support for per-letter-annotation.

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        yield SeqRecord(consensus_seq,
                        id=ace_contig.name,
                        name=ace_contig.name)

示例#7

0

显示文件

文件： AceIO.py 项目： chapmanb/biosqlweb

def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Each contig's consensus becomes one SeqRecord whose id and name are the
    contig name.  The alphabet is guessed from the consensus letters: DNA
    unless it contains "U" (then RNA), or the generic nucleotide alphabet
    when it contains both "U" and "T".  Any "*" gap characters are mapped
    to "-" and the alphabet is then wrapped in Gapped."""

    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        #Assume its DNA unless there is a U in it,
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" in consensus_seq_str:
            #Very odd! Error?
            #Imported here so this rarely taken branch cannot fail with a
            #NameError (the name used to be misspelt "generic_ncleotide").
            from Bio.Alphabet import generic_nucleotide
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO - Consensus base quality (BQ lines).  Note that any gaps
        #(* character) in the consensus does not get a quality entry.
        #This really needs Biopython support for per-letter-annotation.

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        yield SeqRecord(consensus_seq,
                        id=ace_contig.name,
                        name=ace_contig.name)

示例#8

0

显示文件

文件： haplotype_finder.py 项目： Ghoribi/Scripts

def get_haplotypes(in_ace, out_file, out_bamova, win_len, step,
                   coverage, stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file

    Contigs with ``len(contig.reads) - 1 < coverage`` are skipped.  For the
    others an alignment of the consensus and the non-"pseudo" reads is
    built, every consensus "*" column is removed, and the reads are passed
    through correct_sequence, snp_positions and best_snps.  When best_snps
    does not return "Empty", its last element is taken as the haplotype
    records and the contig is reported if there are at least `ngroups`
    groups (first three characters of each record's first field) and every
    group holds at least `nhaplo` records.

    For a reported contig:
      * `out_bamova` receives "Marker<n>" and one "Population" line per
        group holding the number of records for each variant;
      * "fasta_output/<contig name>.fasta" is written (the directory is not
        created here, so it has to exist already);
      * `out_file` receives the contig name and one "Marker<n>" line per
        record.

    in_ace     -- path of the ACE file to read
    out_file   -- path of the haplotype report to (over)write
    out_bamova -- path of the per-marker count file to (over)write
    win_len, step, stars -- accepted for interface compatibility; not used
    coverage   -- minimum read count (see above); also given to best_snps
    ngroups    -- minimum number of groups for a contig to be reported
    nhaplo     -- minimum number of records required in every group
    """
    max_contigs = 100000  # stop once more contigs than this have been seen
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    # The input file is context-managed too, so it is closed on exit.
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_handle):
                    contig_counter += 1
                    # Check coverage before doing any alignment work.
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1
                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    contig_len = len(contig.sequence)
                    for readn, read in enumerate(contig.reads):
                        if "pseudo" in read.rd.name:
                            continue
                        # Keep only the part of the read inside both the
                        # quality and the alignment clipping ranges.  The
                        # end bound used to be tested as "clipe2 < clipe2",
                        # which is never true, so the alignment clipping end
                        # was silently ignored.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, contig.af[readn].padded_start,
                                       contig_len)
                        align.add_sequence(read.rd.name, seq)
                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Drop every column where the consensus holds "*",
                    # from the right so earlier positions stay valid.
                    error_positions = multi_find("*", concensus)[::-1]
                    for p in error_positions:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)

                    pass_haplo = False
                    if haplotypes != "Empty":
                        records = haplotypes[-1]
                        variants = sorted(set([h[-1] for h in records]))
                        groups = sorted(set([h[0][:3] for h in records]))
                        if len(groups) >= ngroups:
                            pass_haplo = True
                            for g in groups:
                                members = [h for h in records
                                           if h[0].startswith(g)]
                                if len(members) < nhaplo:
                                    pass_haplo = False
                                    break
                    if pass_haplo:
                        print(contig.name)
                        marker = "Marker" + str(marker_number)
                        bamova_file.write(marker + "\n")
                        for group_number, g in enumerate(groups):
                            bamova_file.write("Population\t" +
                                              str(group_number))
                            for v in variants:
                                matching = [h for h in records
                                            if h[-1] == v and
                                            h[0].startswith(g)]
                                bamova_file.write("\t" + str(len(matching)))
                            bamova_file.write("\n")
                        fasta_path = "fasta_output/" + contig.name + ".fasta"
                        with open(fasta_path, "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in records:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                # Re-base the positions so the first is 1.
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    marker + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1
                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > max_contigs:
                        break
    print("\n" + str(ntreated) + " contigs out of " + str(contig_counter) +
          " were treated")

示例#9

0

显示文件

文件： pairwise_diff.py 项目： Ghoribi/Scripts

def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For each contig of the ACE file, every read kept in the alignment is
    compared with the consensus in consecutive windows of `window_len`
    positions, starting after the read's leading "-" padding.  A window
    containing "-" only contributes its non-gap length; any other window
    contributes `window_len` and the difference count reported by
    count_diff, provided the second element of count_diff's result equals
    False.  The number of "*" in the read is then subtracted from its
    length.  A read's index is count / length and enters the contig mean
    only when the length reaches `min_len_seq` and at least one difference
    was counted.

    One line "<contig name>\\t<mean index>" is written per contig, with
    "NA" when no read contributed.

    in_ace   -- path of the ACE file to read
    out_file -- path of the result file to (over)write
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Iterating the parser directly lets real parsing errors
            # propagate instead of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_len = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)
                consensus = sequences[0][1]
                len_contig = len(consensus)
                number_indexes = 0
                total_indexes = 0
                for record in sequences[1:]:
                    read_seq = record[1]
                    # Length of the leading run of "-" padding.
                    start = len(read_seq) - len(read_seq.lstrip("-"))
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = consensus[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            # count_diff is defined elsewhere; presumably it
                            # returns (differences, over_max_diff flag) -
                            # TODO confirm.
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Count "*" in the sequence string; calling .count on
                    # the whole [name, sequence] record never matched.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq
                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(contig_name + "\t" + str(mean_index) +
                                  "\n")
            print("***All contigs treated***")

示例#10

0

显示文件

def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For each contig of the ACE file, every read kept in the alignment is
    compared with the consensus in consecutive windows of `window_len`
    positions, starting after the read's leading "-" padding.  A window
    containing "-" only contributes its non-gap length; any other window
    contributes `window_len` and the difference count reported by
    count_diff, provided the second element of count_diff's result equals
    False.  The number of "*" in the read is then subtracted from its
    length.  A read's index is count / length and enters the contig mean
    only when the length reaches `min_len_seq` and at least one difference
    was counted.

    One line "<contig name>\\t<mean index>" is written per contig, with
    "NA" when no read contributed.

    in_ace   -- path of the ACE file to read
    out_file -- path of the result file to (over)write
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Iterating the parser directly lets real parsing errors
            # propagate instead of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_len = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)
                consensus = sequences[0][1]
                len_contig = len(consensus)
                number_indexes = 0
                total_indexes = 0
                for record in sequences[1:]:
                    read_seq = record[1]
                    # Length of the leading run of "-" padding.
                    start = len(read_seq) - len(read_seq.lstrip("-"))
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = consensus[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            # count_diff is defined elsewhere; presumably it
                            # returns (differences, over_max_diff flag) -
                            # TODO confirm.
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Count "*" in the sequence string; calling .count on
                    # the whole [name, sequence] record never matched.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq
                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(contig_name + "\t" + str(mean_index) +
                                  "\n")
            print("***All contigs treated***")

示例#11

0

显示文件

文件： snpcount.py 项目： Ghoribi/Scripts

def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.

    For every contig whose name is a key of `snp_dict`, and for every
    position listed there, the nucleotide found at that position is tallied
    per tag over all alignment records.  A record is attributed to the
    first tag contained in its name, or to "XX_noTag" when none matches.
    A record is ignored at a position when its window around the position
    holds too many "-" (more than ``(win_len - 1) // 2 - 3``) or shows more
    than `max_del` "*"-versus-letter mismatches against the reference
    window.

    The result is written to `out_file` as tab-separated lines:
    contig, position, tag, then the counts for A, C, G, T, N, * and -.

    in_ace   -- path of the ACE file to read
    out_file -- path of the result file to (over)write
    snp_dict -- mapping of contig name to an iterable of SNP positions
    tags     -- iterable of tag strings searched for in record names
    win_len  -- window length used for the "-" threshold above
    max_del  -- maximum tolerated "*" mismatches inside the window
    stars    -- when equal to True, positions are first converted with
                correct_position against the reference sequence
    """
    nucleotides = "ACGTN*-"
    no_tag = "XX_noTag"
    flank = 5  # window half-width on each side of the position
    tag_list = list(tags)
    # Floor division keeps the Python 2 integer result explicit.
    win_buffer = (win_len - 1) // 2
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write(
                "Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
            # Iterating the parser directly lets real parsing errors
            # propagate instead of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_len = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    # Keep only the part of the read inside both the quality
                    # and the alignment clipping ranges.  The end bound used
                    # to be tested as "clipe2 < clipe2", which is never
                    # true, so the alignment clipping end was ignored.
                    clipst = max(read.qa.qual_clipping_start,
                                 read.qa.align_clipping_start)
                    clipe = min(read.qa.qual_clipping_end,
                                read.qa.align_clipping_end)
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_len)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)
                try:
                    positions = snp_dict[contig_name]
                except KeyError:
                    # No SNP registered for this contig.
                    continue
                reference = sequences[0][1]
                d = {}
                for pos in positions:
                    if stars == True:
                        pos_ok = correct_position(pos, reference)
                    else:
                        pos_ok = pos
                    left = max(pos_ok - flank, 0)
                    # takes into account the middle nucleotide
                    right = pos_ok + 1 + flank
                    ref_window = reference[left:right]
                    # NOTE(review): the window is centred on index pos_ok
                    # while the nucleotide read below is at pos_ok - 1;
                    # confirm this offset is intended.
                    counts = d.setdefault(pos, {})
                    for label in [no_tag] + tag_list:
                        counts.setdefault(label,
                                          dict.fromkeys(nucleotides, 0))
                    for fasta in sequences:
                        window = fasta[1][left:right]
                        if window.count("-") > win_buffer - 3:
                            continue  # Need at least 3 nucleotides on each side
                        # Default first, so an empty `tags` no longer leaves
                        # the label unbound (or stale from a previous record).
                        t = no_tag
                        for tag in tag_list:
                            if tag in fasta[0]:
                                t = tag
                                break
                        del_count = 0
                        if len(ref_window) == len(window):
                            for ref_nuc, nuc in zip(ref_window, window):
                                if (ref_nuc.isalpha() and nuc == "*" or
                                        nuc.isalpha() and ref_nuc == "*"):
                                    del_count += 1
                        if del_count > max_del:
                            continue
                        n = fasta[1][pos_ok - 1].upper()
                        counts[t][n] += 1
                for p in sorted(d):
                    for t in sorted(d[p]):
                        row = [contig_name, str(p), str(t)]
                        row.extend([str(d[p][t][n]) for n in nucleotides])
                        output_file.write("\t".join(row) + "\n")
            print("***All contigs treated***")

示例#12

0

显示文件

文件： haplotype_finder.py 项目： zhiping393/Scripts

def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file

    Contigs with ``len(contig.reads) - 1 < coverage`` are skipped.  For the
    others an alignment of the consensus and the non-"pseudo" reads is
    built, every consensus "*" column is removed, and the reads are passed
    through correct_sequence, snp_positions and best_snps.  When best_snps
    does not return "Empty", its last element is taken as the haplotype
    records and the contig is reported if there are at least `ngroups`
    groups (first three characters of each record's first field) and every
    group holds at least `nhaplo` records.

    For a reported contig:
      * `out_bamova` receives "Marker<n>" and one "Population" line per
        group holding the number of records for each variant;
      * "fasta_output/<contig name>.fasta" is written (the directory is not
        created here, so it has to exist already);
      * `out_file` receives the contig name and one "Marker<n>" line per
        record.

    in_ace     -- path of the ACE file to read
    out_file   -- path of the haplotype report to (over)write
    out_bamova -- path of the per-marker count file to (over)write
    win_len, step, stars -- accepted for interface compatibility; not used
    coverage   -- minimum read count (see above); also given to best_snps
    ngroups    -- minimum number of groups for a contig to be reported
    nhaplo     -- minimum number of records required in every group
    """
    max_contigs = 100000  # stop once more contigs than this have been seen
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    # The input file is context-managed too, so it is closed on exit.
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_handle):
                    contig_counter += 1
                    # Check coverage before doing any alignment work.
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1
                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    contig_len = len(contig.sequence)
                    for readn, read in enumerate(contig.reads):
                        if "pseudo" in read.rd.name:
                            continue
                        # Keep only the part of the read inside both the
                        # quality and the alignment clipping ranges.  The
                        # end bound used to be tested as "clipe2 < clipe2",
                        # which is never true, so the alignment clipping end
                        # was silently ignored.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, contig.af[readn].padded_start,
                                       contig_len)
                        align.add_sequence(read.rd.name, seq)
                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Drop every column where the consensus holds "*",
                    # from the right so earlier positions stay valid.
                    error_positions = multi_find("*", concensus)[::-1]
                    for p in error_positions:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)

                    pass_haplo = False
                    if haplotypes != "Empty":
                        records = haplotypes[-1]
                        variants = sorted(set([h[-1] for h in records]))
                        groups = sorted(set([h[0][:3] for h in records]))
                        if len(groups) >= ngroups:
                            pass_haplo = True
                            for g in groups:
                                members = [h for h in records
                                           if h[0].startswith(g)]
                                if len(members) < nhaplo:
                                    pass_haplo = False
                                    break
                    if pass_haplo:
                        print(contig.name)
                        marker = "Marker" + str(marker_number)
                        bamova_file.write(marker + "\n")
                        for group_number, g in enumerate(groups):
                            bamova_file.write("Population\t" +
                                              str(group_number))
                            for v in variants:
                                matching = [h for h in records
                                            if h[-1] == v and
                                            h[0].startswith(g)]
                                bamova_file.write("\t" + str(len(matching)))
                            bamova_file.write("\n")
                        fasta_path = "fasta_output/" + contig.name + ".fasta"
                        with open(fasta_path, "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in records:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                # Re-base the positions so the first is 1.
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    marker + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1
                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > max_contigs:
                        break
    print("\n" + str(ntreated) + " contigs out of " + str(contig_counter) +
          " were treated")

示例#13

0

显示文件

文件： BLAST2Network.py 项目： BB-24865/blast2network

            part_site_comp_fh.writerow(row)

    cutoff = []

    if (p.use_reads):

        print "Working on ace file {}".format(p.read_fn)

        contig_read_dict = {}
        contig_read_len_dict = {}

        from Bio.Sequencing import Ace

        with open(p.use_reads, 'rU') as ace_fh:

            for contig in Ace.parse(ace_fh):
                """rd (reads) - read with name, sequence, etc
				qa (read qual) - which parts used as consensus
				ds - file name of read's chromatogram file
				af - loc of read within contig
				bs (base segment) - which read chosen at consensus at each pos
				rt (transient read tags) - generated by crossmatch and phrap
				ct (consensus tag)
				wa (whole assembly tag) - hosts assembly program name, version, etc
				wr
				reads - info about read supporting ace contig
				contig - holds info about contig from ace record"""

                contig_name = "{}".format(contig.name)  # contig00001

                if not contig_name in contig_read_dict:

示例#14

0

显示文件

def AceIterator(handle):
    """Yield one SeqRecord per contig found in an ACE file.

    Parsing is delegated to Bio.Sequencing.Ace.  Because the file is read
    in a single pass, any WA, CT, RT or WR footer tags are ignored.

    The per-base consensus qualities stored in the ACE file are treated as
    PHRED style scores and, as with FASTQ or QUAL files read through
    Bio.SeqIO, are exposed through the SeqRecord's letter_annotations
    dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/consed_sample.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, record.seq[:10]+"...", len(record)
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 agccccgggc... 1475
    90

    ACE files carry no base quality for gaps in the consensus sequence, so
    gap positions are given a quality of zero here.  Zero is arguably
    misleading, since the evidence for a gap may be very strong.  Earlier
    Biopython releases used None instead, but that complicated usage and
    made it impossible to write the gapped sequence out as FASTQ.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/contig1.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, "..." + record.seq[85:95]+"..."
    ...     print record.letter_annotations["phred_quality"][85:95]
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for contig in Ace.parse(handle):
        raw_consensus = contig.sequence

        # Guess the alphabet: DNA by default, RNA when a U is present, and
        # only "some nucleotide" when both U and T occur (suspicious input).
        if "U" not in raw_consensus:
            alphabet = generic_dna
        elif "T" in raw_consensus:
            alphabet = generic_nucleotide
        else:
            alphabet = generic_rna

        if "*" not in raw_consensus:
            consensus_seq = Seq(raw_consensus, alphabet)
        else:
            # ACE writes gaps as "*"; translate them to "-" so the record
            # looks like those produced from other file formats.
            assert "-" not in raw_consensus
            consensus_seq = Seq(raw_consensus.replace("*", "-"),
                                Gapped(alphabet, gap_char="-"))

        # TODO? - Base segments (BS lines), i.e. which read phrap picked as
        # the consensus at each position.  Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        # Perhaps as SeqFeature objects?

        record = SeqRecord(consensus_seq,
                           id=contig.name,
                           name=contig.name)

        # Consensus base qualities (BQ lines) have no entry for gap
        # positions, so walk the gapped consensus and insert a zero for
        # every gap while consuming the stored qualities in order.
        qualities = []
        used = 0
        for letter in consensus_seq:
            if letter != "-":
                qualities.append(contig.quality[used])
                used += 1
            else:
                qualities.append(0)
        # Every stored quality must have been matched to a non-gap base.
        assert used == len(contig.quality)
        record.letter_annotations["phred_quality"] = qualities

        yield record

示例#15

0

显示文件

文件： aceread.py 项目： demis001/pyWrairLib

 def _get_gen(self):
     """Return a generator over the contigs parsed from ``self.ace_filename``.

     The file is opened immediately, so a missing or unreadable file still
     fails at call time.  The handle is closed once the returned generator
     is exhausted, closed, or garbage collected after it has started,
     instead of being left open for the life of the process.
     """
     handle = open(self.ace_filename)

     def _contigs():
         # NOTE(review): assumes ace.parse is itself lazy (a generator), so
         # deferring the call to the first iteration changes nothing for
         # callers -- confirm against the imported ``ace`` module.
         try:
             for contig in ace.parse(handle):
                 yield contig
         finally:
             handle.close()

     return _contigs()

示例#16

0

显示文件

def AceIterator(source):
    """Yield one SeqRecord per contig found in an ACE file.

    Parsing is delegated to Bio.Sequencing.Ace.  Because the file is read
    in a single pass, any WA, CT, RT or WR footer tags are ignored.

    The per-base consensus qualities stored in the ACE file are treated as
    PHRED style scores and, as with FASTQ or QUAL files read through
    Bio.SeqIO, are exposed through the SeqRecord's letter_annotations
    dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    ACE files carry no base quality for gaps in the consensus sequence, so
    gap positions are given a quality of zero here.  Zero is arguably
    misleading, since the evidence for a gap may be very strong.  Earlier
    Biopython releases used None instead, but that complicated usage and
    made it impossible to write the gapped sequence out as FASTQ.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for contig in Ace.parse(source):
        raw_consensus = contig.sequence

        # Guess the alphabet: DNA by default, RNA when a U is present, and
        # only "some nucleotide" when both U and T occur (suspicious input).
        if "U" not in raw_consensus:
            alphabet = generic_dna
        elif "T" in raw_consensus:
            alphabet = generic_nucleotide
        else:
            alphabet = generic_rna

        # ACE writes gaps as "*"; translate them to "-" so the record looks
        # like those produced from other file formats.
        consensus_text = raw_consensus
        if "*" in raw_consensus:
            assert "-" not in raw_consensus
            consensus_text = raw_consensus.replace("*", "-")
        consensus_seq = Seq(consensus_text, alphabet)

        # TODO? - Base segments (BS lines), i.e. which read phrap picked as
        # the consensus at each position.  Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        # Perhaps as SeqFeature objects?

        record = SeqRecord(consensus_seq,
                           id=contig.name,
                           name=contig.name)

        # Consensus base qualities (BQ lines) have no entry for gap
        # positions, so walk the gapped consensus and insert a zero for
        # every gap while consuming the stored qualities in order.
        qualities = []
        used = 0
        for letter in consensus_seq:
            if letter != "-":
                qualities.append(contig.quality[used])
                used += 1
            else:
                qualities.append(0)
        # Every stored quality must have been matched to a non-gap base.
        assert used == len(contig.quality)
        record.letter_annotations["phred_quality"] = qualities

        yield record

示例#17

0

显示文件

def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNP loci.

    Each contig of the ACE file is rebuilt as an alignment of its consensus
    plus its clipped, padded reads.  For every SNP position listed for that
    contig, the base carried by each sequence is counted per tag and one
    tab-separated line per (position, tag) is written to ``out_file``.

    Parameters:
        in_ace:   path of the ACE file to read.
        out_file: path of the tab-separated report to write.
        snp_dict: mapping from contig name (``Contig_<number>``) to the
                  positions to genotype; contigs without an entry are
                  skipped.
        tags:     strings looked for in the sequence names; a sequence is
                  counted under the first tag its name contains, or under
                  ``XX_noTag`` when none matches (or ``tags`` is empty).
        win_len:  window length; only used to derive the gap threshold
                  ``(win_len - 1) // 2 - 3``.  The window itself is fixed at
                  5 bases either side of the position.
        max_del:  maximum number of window columns where exactly one of
                  reference/read has ``*`` and the other a letter.
        stars:    when equal to ``True``, positions are first translated
                  with ``correct_position`` against the consensus.

    Raises:
        KeyError: if the base found at a SNP position is not one of
                  ``ACGTN*-`` after upper-casing.
        Errors raised by the ACE parser are propagated instead of being
        reported as a normal end of file.
    """
    nucleotides = "ACGTN*-"
    no_tag = "XX_noTag"
    # Floor division keeps the integer result the original Python 2 "/"
    # produced for integer window lengths.
    win_buffer = (win_len - 1) // 2
    # Kept as an equality test so that only True (or 1) enables correction.
    use_stars = (stars == True)

    with open(in_ace, "r") as ace_handle, \
            open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        for contig in Ace.parse(ace_handle):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Use the most restrictive of quality and alignment
                # clipping at both ends.  The end comparison used to be
                # "clipe2 < clipe2", which never applied the alignment end.
                clip_start = max(read.qa.qual_clipping_start,
                                 read.qa.align_clipping_start)
                clip_end = min(read.qa.qual_clipping_end,
                               read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clip_start, clip_end)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)

            # First entry is the consensus, the others are the reads.
            sequences = read_fasta(align.format("fasta"))
            reference = sequences[0][1]
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating " + contig_name)
            try:
                positions = snp_dict[contig_name]
            except KeyError:
                continue  # No SNP listed for this contig

            d = {}
            for pos in positions:
                if use_stars:
                    pos_ok = correct_position(pos, reference)
                else:
                    pos_ok = pos
                left = max(pos_ok - 5, 0)
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = reference[left:right]

                # Make sure every tag (and the no-tag bucket) has a zeroed
                # counter for every symbol, without resetting existing ones.
                pos_counts = d.setdefault(pos, {})
                for tag in [no_tag] + list(tags):
                    tag_counts = pos_counts.setdefault(tag, {})
                    for nuc in nucleotides:
                        tag_counts.setdefault(nuc, 0)

                for fasta in sequences:
                    name = fasta[0]
                    s = fasta[1]  # Sequence
                    window = s[left:right]
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    # First matching tag wins; defaulting here also covers
                    # an empty ``tags``, which used to leave the tag unset.
                    t = next((tag for tag in tags if tag in name), no_tag)
                    del_count = 0
                    if len(ref_window) == len(window):
                        for ref_base, base in zip(ref_window, window):
                            if (ref_base.isalpha() and base == "*") or \
                               (base.isalpha() and ref_base == "*"):
                                del_count += 1
                    if del_count > max_del:
                        continue
                    n = s[pos_ok - 1].upper()
                    d[pos][t][n] += 1

            for p in sorted(d):
                for t in sorted(d[p]):
                    fields = [contig_name, str(p), str(t)]
                    fields.extend(str(d[p][t][n]) for n in nucleotides)
                    output_file.write("\t".join(fields) + "\n")
        print("***All contigs treated***")