Example #1
def compute_mes(interval, matrix5, matrix3, genome):
    genome = pysam.FastaFile(genome)
    # 5' splice site: 3 bases in exon and 6 bases in intron
    # 3' splice site: 20 bases in intron and 3 bases in exon
    if interval['strand'] == '+':
        seq5 = genome.fetch(interval.chrom, interval.end - 3,
                            interval.end + 6).upper()
        seq3 = genome.fetch(interval.chrom, interval.start - 20,
                            interval.start + 3).upper()
    else:
        seq5 = reverse_complement(
            genome.fetch(interval.chrom, interval.start - 6,
                         interval.start + 3).upper())
        seq3 = reverse_complement(
            genome.fetch(interval.chrom, interval.end - 3,
                         interval.end + 20).upper())
    name_format_str = '{seq5}:{mes5}|{seq3}:{mes3}'
    if set(seq5).issubset('ACGT') and set(seq3).issubset('ACGT'):
        mes5 = maxent_fast.score5(seq5, matrix=matrix5)
        mes3 = maxent_fast.score3(seq3, matrix=matrix3)
        interval['seq5'] = seq5
        interval['mes5'] = mes5
        interval['seq3'] = seq3
        interval['mes3'] = mes3

    else:
        interval['seq5'] = seq5
        interval['mes5'] = 'NA'
        interval['seq3'] = seq3
        interval['mes3'] = 'NA'
    return interval
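
Every example on this page assumes a reverse_complement helper from its surrounding codebase. A minimal sketch of what such a helper typically looks like (an assumption; some snippets below wrap the result in "".join(...), suggesting their local versions may return an iterable rather than a string):

# Sketch only: the actual helpers behind these examples may differ,
# e.g. by handling IUPAC ambiguity codes or returning a list.
_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')

def reverse_complement(seq):
    """Return the reverse complement of a DNA string."""
    return seq.translate(_COMPLEMENT)[::-1]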
Example #2
def calculate_gc_percentage(interval, genome, extend=0):
    chrom = interval['chrom']
    start = int(interval['start'])
    end = int(interval['end'])
    strand = interval['strand']
    genome = pysam.FastaFile(genome)
    if strand == '+' or strand == '.':
        mid_region_seq = genome.fetch(chrom, start, end).upper()
        interval['GC_exon'] = seq_gc_content(mid_region_seq)
        if extend != 0:
            up_intron_seq = genome.fetch(chrom, start - extend, start).upper()
            dn_intron_seq = genome.fetch(chrom, end, end + extend).upper()
            interval['GC_up_intron'] = seq_gc_content(up_intron_seq)
            interval['GC_dn_intron'] = seq_gc_content(dn_intron_seq)
    else:
        mid_region_seq = reverse_complement(
            genome.fetch(chrom, start, end).upper())
        interval['GC_exon'] = seq_gc_content(mid_region_seq)
        if extend != 0:
            up_intron_seq = reverse_complement(
                genome.fetch(chrom, end, end + extend).upper())
            dn_intron_seq = reverse_complement(
                genome.fetch(chrom, start - extend, start).upper())
            interval['GC_up_intron'] = seq_gc_content(up_intron_seq)
            interval['GC_dn_intron'] = seq_gc_content(dn_intron_seq)
    return interval
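
seq_gc_content is another helper this snippet assumes; a plausible sketch matching the usage above (hypothetical, not the original implementation):

def seq_gc_content(seq):
    # Fraction of G/C bases in an upper-case DNA string.
    return (seq.count('G') + seq.count('C')) / len(seq) if seq else 0.0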
Example #3
    def bind_chroseq(self, refdic, gap=0, intron=False):
        """
        gap=0 and intron=False will output the CDS seq;
        gap>0 will not work with intron=True;
        gap>0 and intron=False will output exon seqs separated by "N".
        :param refdic: the reference genome
        """
        # need self.exon, self.strand
        # need to note that the chr_select is 0 based
        # while the exon in bigg is 1 based
        if self.exon is None:
            self.get_exon()

        seq_l = []
        for n, exon_one in enumerate(self.exon):
            chro = self.chrom
            start, end = exon_one
            _, seq = chr_select(refdic, chro, start, end)
            seq_l.append(seq.upper())  # upper case for exon
            if intron is False and n < len(self.exon) - 1:
                seq_l.append(gap * "N")
            if intron and n < len(self.intron):
                intron_one = self.intron[n]
                start_i, end_i = intron_one
                _, seq_intron = chr_select(refdic, chro, start_i, end_i)
                seq_l.append(seq_intron.lower())  # lower case for intron
        seq_raw = "".join(seq_l)
        if self.strand == "+":
            seq_out = seq_raw
        else:
            seq_out = reverse_complement(seq_raw)

        self.seq_chro = seq_out
Example #4
def find_reverse_palindromes(sequence):
    """Find every reverse palindromes of size >= 4, <= 12 and
    return their indices and associated length within a sequence.

    Args:
        sequence: string, dna sequence

    Returns:
        list of starting indices and length of each reverse palindrome
        found
    """
    output = []

    for i in range(len(sequence)):
        for j in range(4, 13):
            if i + j > len(sequence):
                break

            subseq = sequence[i:i + j]
            rc = reverse_complement(subseq)

            if subseq == rc:
                output.append((i + 1, j))

    return output
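
A quick sanity check with the EcoRI site GAATTC, which equals its own reverse complement (positions are 1-based):

print(find_reverse_palindromes('GAATTC'))
# [(1, 6), (2, 4)]  -- GAATTC itself, plus the embedded AATT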
Example #5
def compute_possible_dna_origins(DNA, final_peptides):
    """
    Given a string of DNA and a peptide string, find the
    substrings of the DNA that could have encoded the peptide

    (Encoding process: DNA -> RNA -> peptide)

    NOTE: for each candidate DNA substring, we must also check its reverse
    complement, since either strand may be transcribed into RNA

    :param DNA: String - The strand of DNA
    :param final_peptides: String - The peptide produced after transcription and translation

    :return: origins: Array - A list of possible DNA origins for the peptide
    """
    encoders = recursive_find_rna_encoders(set(), final_peptides)
    freq_dict = FrequencyDict(DNA, len(final_peptides) * 3)

    res = []

    for encoder in encoders:
        enc_dna = rna_to_dna(encoder)
        enc_rev = reverse_complement(enc_dna, as_string=True)

        freq = freq_dict.get(enc_dna, 0)
        res.extend([enc_dna] * freq)

        freq = freq_dict.get(enc_rev, 0)
        res.extend([enc_rev] * freq)

    return res
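
rna_to_dna comes from the surrounding module; a minimal sketch consistent with its name (an assumption):

def rna_to_dna(rna):
    # Back-transcribe an RNA string to its DNA equivalent.
    return rna.replace('U', 'T').replace('u', 't')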
Example #6
def most_frequent_kmers(DNA, k, mutation_thresh=0, reverse=False):
    """
    Returns the set of most frequent k-mers in DNA

    We track each pattern along with its frequency in the DNA
    using a frequency dictionary.

    :param DNA: String - DNA
    :param k: Integer - Length of the k-mer
    :param mutation_thresh: Allows for a certain number of mismatches
    :param reverse: Boolean - Also count each k-mer's reverse complement
    :return: Set - Set of most frequent k-mers
    """
    freq_dict = dictionaries.FrequencyDict(DNA, k, mutation_thresh)

    kmers_found = set()
    current_highest_freq = 0

    for kmer, frequency in freq_dict.items():

        rev = reverse_complement(kmer, as_string=True)
        if reverse and rev in freq_dict:
            frequency += freq_dict[rev]
        if frequency > current_highest_freq:
            current_highest_freq = frequency
            kmers_found = set([kmer])
        elif frequency == current_highest_freq:
            kmers_found.add(kmer)

    return kmers_found
Example #7
    def _test_profile_split(self, sequences, length):
        counts = utils.counts(sequences, length)
        profile = klib.Profile(utils.as_array(counts, length))
        left, right = profile.split()

        assert len(left) == len(right)
        assert sum(left) + sum(right) == sum(counts.values()) * 2

        indices_left = {}
        indices_right = {}
        indices_palindrome = {}

        for s, c in counts.items():
            r = utils.reverse_complement(s)
            if s < r:
                indices_left[utils.count_index(s)] = c * 2
            elif s > r:
                indices_right[utils.count_index(r)] = c * 2
            else:
                indices_palindrome[utils.count_index(s)] = c

        assert ([c for c in left if c > 0] == [
            c for i, c in sorted(
                list(indices_left.items()) + list(indices_palindrome.items()))
        ])
        assert ([c for c in right if c > 0] == [
            c for i, c in sorted(
                list(indices_right.items()) + list(indices_palindrome.items()))
        ])
Example #8
def main(input_str):

    """
    Main function: takes string input and returns the best results depending
    on scoring. A single result includes the sh-miR sequence,
    its score, and a link to the 2D structure from the mfold program
    """
    sequence = check_input(input_str)
    seq1, seq2, shift_left, shift_right = sequence
    if not seq2:
        seq2 = reverse_complement(seq1)
    all_frames = get_all()
    if 'error' in all_frames:  # database error handler
        return all_frames

    frames = get_frames(seq1, seq2, shift_left, shift_right, all_frames)
    original_frames = [Backbone(**elem) for elem in all_frames]

    frames_with_score = []
    for frame_tuple, original in zip(frames, original_frames):
        score = 0
        frame, insert1, insert2 = frame_tuple
        mfold_data = mfold(frame.template(insert1, insert2))
        if 'error' in mfold_data:
            return mfold_data
        pdf, ss = mfold_data[0], mfold_data[1]
        score += score_frame(frame_tuple, ss, original)
        score += score_homogeneity(original)
        score += two_same_strands_score(seq1, original)
        frames_with_score.append(
            (score, frame.template(insert1, insert2), frame.name, pdf))

    sorted_frames = [elem for elem in sorted(frames_with_score,
                                             key=lambda x: x[0], reverse=True)
                     if elem[0] > 60]
    return {'result': sorted_frames[:3]}
Example #9
def main(fasta_block):
    fasta_dict = fasta_breakup(fasta_block)
    dna = list(fasta_dict.values())[0]

    restriction_sites = []
    for block_len in range(4, 14, 2):
        half = block_len // 2
        idx = 0
        while idx <= len(dna) - block_len:
            current = dna[idx:idx + block_len]
            # reverse palindrome: the first half equals the reverse
            # complement of the second half
            if current[:half] == "".join(reverse_complement(current[half:])):
                print("palindrome, yo", current, current[:half],
                      "".join(reverse_complement(current[half:])))
                restriction_sites.append([idx + 1, block_len])
            idx += 1
    print(restriction_sites)
    return restriction_sites
Example #10
def d_duval_(seq, alg, **kwargs):
    factors1 = [len(i) for i in alg(seq, **kwargs)]
    complement = reverse_complement(seq)
    factors2 = [len(i) for i in reversed(alg(complement, **kwargs))]

    rest = seq
    result = []
    while factors1 and factors2:
        if factors1[0] < factors2[0]:
            n = factors1.pop(0)
            factors2[0] -= n
            if factors2[0] == 0:
                factors2.pop(0)
        else:
            n = factors2.pop(0)
            factors1[0] -= n
            if factors1[0] == 0:
                factors1.pop(0)
        f, rest = rest[:n], rest[n:]
        result.append(f)

    while factors1:
        n = factors1.pop(0)
        f, rest = rest[:n], rest[n:]
        result.append(f)
    while factors2:
        n = factors2.pop(0)
        f, rest = rest[:n], rest[n:]
        result.append(f)

    return result
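
The alg argument is presumably a factorization routine such as Duval's algorithm for Lyndon factorization, whose factor boundaries d_duval_ then reconciles between the sequence and its reverse complement. A standard sketch of Duval's algorithm, for reference (the alg actually passed in may differ):

def duval(s):
    # Duval's algorithm: factor s into its Lyndon words, e.g.
    # duval('banana') == ['b', 'an', 'an', 'a'].
    factors, i = [], 0
    while i < len(s):
        j, k = i + 1, i
        while j < len(s) and s[k] <= s[j]:
            k = i if s[k] < s[j] else k + 1
            j += 1
        while i <= k:
            factors.append(s[i:i + j - k])
            i += j - k
    return factors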
Example #11
    def test_profile_reverse_complement_palindrome(self):
        counts = utils.counts(['ACCTAGGT'], 8)
        profile = klib.Profile(utils.as_array(counts, 8))

        for i in range(profile.length):
            assert (profile.binary_to_dna(profile.reverse_complement(i)) ==
                    utils.reverse_complement(profile.binary_to_dna(i)))
Example #12
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    adenylation_domains = []

    fasta_seqs = []
    for fs in SeqIO.parse(fasta, 'fasta'):
        revcom = False
        seq = str(fs.seq)
        pepseq, rf = get_pepseq(seq)
        # NB: the original test 'rf < 0 == lagging_strand' chains to
        # '(rf < 0) and (0 == lagging_strand)'; '(rf < 0) != lagging_strand'
        # keeps the default behaviour and is presumably the intent.
        if (rf < 0) != lagging_strand:
            revcom = True
            seq = utils.reverse_complement(seq)
        fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq,
                           'rf': rf, 'revcom': revcom})
    for fs in fasta_seqs:
        utils.run_cmd([hmmsearch, '--domtblout', 'dump',
                       os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        with open('dump') as f:
            out = f.read()
        res_stream = StringIO(out)
        os.remove('dump')
        results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))

        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end

                adenylation_domains.append(
                    (AdenylationDomain(fs['seq'][s * 3:e * 3], known,
                                       '{}_{}'.format(fs['id'], i),
                                       fs['revcom']), s, e))

    return adenylation_domains
Example #13
    def test_profile_reverse_complement(self):
        counts = utils.counts(utils.SEQUENCES, 8)
        profile = klib.Profile(utils.as_array(counts, 8))

        for i in range(profile.length):
            assert (profile.binary_to_dna(profile.reverse_complement(i)) ==
                    utils.reverse_complement(profile.binary_to_dna(i)))
Example #17
    def test_profile_balance_palindrome(self):
        counts = utils.counts(['AATT'], 4)
        profile = klib.Profile(utils.as_array(counts, 4))
        profile.balance()

        counts.update(dict((utils.reverse_complement(s), c)
                           for s, c in counts.items()))
        utils.test_profile(profile, counts, 4)
Example #18
    def test_profile_balance(self):
        counts = utils.counts(utils.SEQUENCES, 8)
        profile = klib.Profile(utils.as_array(counts, 8))
        profile.balance()

        counts.update(dict((utils.reverse_complement(s), c)
                           for s, c in counts.items()))
        utils.test_profile(profile, counts, 8)
Example #22
def split_assembled_genome(gtf_path, fasta_path, od='.', L=98):
    trs = parse_gtf(gtf_path)
    print('GTF parsed')
    scaffolds = parse_fasta(fasta_path)
    print('Scaffolds parsed')

    wrong_scaffolds = 0
    for tr, data in trs.items():
        processed = []
        unprocessed = []
        if data['scaffold'] in scaffolds:
            sequence = scaffolds[data['scaffold']]
        else:
            print(f'{data["scaffold"]} not in FASTA file {fasta_path}')
            wrong_scaffolds += 1
            continue
        for exon in data['exons']:
            processed.append((int(exon['start']) - 1, int(exon['end']) - 1))
            unprocessed.append(
                (int(exon['start']) - L, int(exon['end']) + L - 2))

        processed = list(merge_intervals(processed))
        unprocessed = merge_intervals(unprocessed)

        processed = ''.join(map(lambda iv: sequence[iv[0]:iv[1]], processed))
        splice_junctions = []
        for iv in unprocessed:
            if iv[1] - iv[0] < 3 * L - 3:
                # extended exon too short to yield two distinct windows
                splice_junctions.append(iv)
            else:
                splice_junctions.append((iv[0], iv[0] + 2 * L - 2))
                splice_junctions.append((iv[1] - (2 * L) + 2, iv[1]))

        splice_junctions = [
            collapse_N(sequence[iv[0]:iv[1]].upper())
            for iv in splice_junctions
        ]

        processed = collapse_N(processed).upper()
        if data['strand'] == '-':
            processed = reverse_complement(processed)
            splice_junctions = map(reverse_complement, splice_junctions)

        with open(f'{od}/processed_transcripts.fasta', 'a') as fh:
            fh.write(f'>{tr}\n')
            fh.write('\n'.join(
                [processed[i:i + 80] for i in range(0, len(processed), 80)]))
            fh.write('\n')

        with open(f'{od}/splice_junctions.fasta', 'a') as fh:
            for i, sj in enumerate(splice_junctions):
                fh.write(f'>{tr}:{i}\n')
                fh.write(f'{sj}\n')
    print('DONE!')
    print(
        f'{wrong_scaffolds} scaffolds were not found, and the corresponding annotations were ignored.'
    )
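
merge_intervals and collapse_N come from the surrounding module; a plausible sketch of the interval merge, assuming half-open (start, end) pairs (hypothetical, matching the usage above):

def merge_intervals(intervals):
    # Merge overlapping (start, end) pairs, returned sorted by start.
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged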
Example #23
    def get_sequences(self, locations, width=200):

        # need to ensure that most locations on the forward
        # and reverse strands are mappable
        half = width // 2  # integer division so the slice bounds stay ints
        seqs = [utils.makestr(
                    self.genome[loc[0]][int(loc[1]) - half:int(loc[1]) + half])
                if loc[3] == '+'
                else utils.reverse_complement(utils.makestr(
                    self.genome[loc[0]][int(loc[2]) - half + 1:
                                        int(loc[2]) + half + 1]))
                for loc in locations]

        return seqs
Example #25
    def test_balance(self):
        counts = utils.counts(utils.SEQUENCES, 8)
        filename = self.empty()

        with utils.open_profile(self.profile(counts, 8)) as input_handle:
            with utils.open_profile(filename, 'w') as output_handle:
                kmer.balance(input_handle, output_handle)
        counts.update(dict((utils.reverse_complement(s), c)
                           for s, c in counts.items()))
        utils.test_profile_file(filename, counts, 8)
Example #27
    def getSeq(self, g):
        seq = ''
        for node in self.nodeIds:
            nodeSeq = g.nodes[abs(node)].nodeSeq
            if node < 0:
                nodeSeq = reverse_complement(nodeSeq)
            if seq != '':
                nodeSeq = nodeSeq[g.overlap:]
            seq = seq + nodeSeq
        return seq
Example #28
def frequent_words_with_mismatches_with_revcomp(text: str, k: int,
                                                d: int) -> Set[str]:
    """
    >>> frequent_words_with_mismatches_with_revcomp("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1) == {'ATGT', 'ACAT'}
    True
    """
    freq_map = _frequent_words_helper(text, k, d)
    freq_map_rc = _frequent_words_helper(reverse_complement(text), k, d)
    freq_map += freq_map_rc
    _, maxval = freq_map.most_common(1)[0]
    res = {kmer for kmer, count in freq_map.items() if count == maxval}
    return res
Example #29
def find_wells():
    mm = mismatch_reporters()
    ls = load_seqs()
    successes = {}
    for k, v in mm.items():
        if 'fwd' not in k:
            continue
        successes[k] = [(k1, v1)
                        for k1, v1 in ls.items()
                        if v[6:-1] in zutils.reverse_complement(v1)]
    print('\n\n')
    print('              ' + '\n              '.join(
        sorted([str((k, len([elt[0] for elt in v]), [elt[0] for elt in v]))
                for k, v in successes.items()])))
    return successes
Example #30
    def get_key_factor(factors, normalize=False):
        if len(factors) > 3:
            longest = max(factors[1:-1], key=lambda factor: len(factor))
        else:
            longest = max(factors, key=lambda factor: len(factor))

        if normalize:
            reverse = reverse_complement(longest)
            if reverse < longest:
                longest = reverse

        if 10 < len(longest) < 20:
            longest = longest[:10] + longest[-(len(longest) - 10):]
        elif len(longest) > 20:
            longest = longest[:10] + longest[-10:]

        return longest
Example #31
def main(fasta_block):
    dna_dict = fasta_breakup(fasta_block)
    dna_target = list(dna_dict.values())[0]

    rna_target = dna_to_rna(dna_target)
    rev_comp_dna = "".join(reverse_complement(dna_target))
    rev_comp_rna = dna_to_rna(rev_comp_dna)

    protein_strings = []
    for start in find_all(rna_target, 'AUG'):
        protein_strings.append(translate(start, rna_target))

    for start in find_all(rev_comp_rna, 'AUG'):
        protein_strings.append(translate(start, rev_comp_rna))

    # dedupe and remove Nones
    protein_strings = [prot for prot in protein_strings if prot is not None]
    cleaned_proteins = list(set(protein_strings))
    return "\n".join(cleaned_proteins)
Example #32
    def make_oligos(self):
        if not self.library:
            print('Must have mutation library before making oligos.')
            return

        for aa in self.library:
            seq = self.nt_seq

            for s, e, c in [(res[1], res[2], dgn)
                            for res, dgn, tf in zip(self.code,
                                                    self.library[aa]['dgn'],
                                                    self.library[aa]['cdns'])
                            if tf]:
                # TODO: Codon usage..
                c = ''.join([next(iter(utils.dgn_to_nts[nt])) for nt in c])
                seq = seq[:s] + c + seq[e:]

            oligo_positions = []
            mut_positions = []
            self.oligos[aa] = []
            for s in self.library[aa]['oligo_set']:
                mut_start = s[0]*3
                mut_end = s[-1]*3+3
                mut = seq[mut_start:mut_end]
                pre_len = (oligo_length - len(mut)) // 2

                oligo_start = mut_start-pre_len
                oligo_end = oligo_start + oligo_length

                oligo_seq = (self.nt_seq[oligo_start:mut_start] + mut.lower() +
                             self.nt_seq[mut_end:oligo_end])
                if self.revcom:
                    oligo_seq = utils.reverse_complement(oligo_seq)
                self.oligos[aa].append(oligo_seq)

                oligo_positions.append([oligo_start, oligo_end])
                mut_positions.append([mut_start, mut_end])

            for i, pos in enumerate(oligo_positions[1:]):
                if mut_positions[i][1] > pos[0]:
                    print('Oligo clash detected..')

        return
Example #33
def mismatch_reporters():
    enzymes = {'fwd': ['CTAGA', 'G'],
               'rev': ['GATCC', 'T']}

    primers, names = [], []
    for k, r in reporters.items():
        for m_ct in 0, 2, 3:
            if m_ct > 0:
                nts = [n for n in nt_list if n != ids[k]]
            else:
                nts = [ids[k]]
            for mm in nts:
                rep = re.sub('N', mm, r, m_ct)
                rep = re.sub('N', ids[k], rep)
                
                fwd = list(enzymes['fwd'])
                fwd.insert(1, rep)
                rev = list(enzymes['rev'])
                rev.insert(1, zutils.reverse_complement(rep))

                names.append('{0}_{1}mm={2}_rev'.format(k, m_ct, mm))
                primers.append(''.join(rev))
                names.append('{0}_{1}mm={2}_fwd'.format(k, m_ct, mm))
                primers.append(''.join(fwd))
              
    return dict(zip(names, primers))
Example #34
def locate_breakpoint(input_file, output_file, reference_file, margin=200):
    class Breakpoint_locator(object):
        def __init__(self):

            self.consensus = None
            self.seq_around_bp = None
            self.seq_start = None
            self.seq_end = None
            self.seq_dir = None
            self.bp_pos_consensus = None
            self.bp_pos_reference = None
            self.tmp_dir = tempfile.mkdtemp()

        def __del__(self):
            shutil.rmtree(self.tmp_dir)

        def initialize(self, cluster_id, consensus, seq_around_bp, seq_start,
                       seq_end, seq_dir):

            self.cluster_id = cluster_id
            self.consensus = consensus
            self.seq_around_bp = seq_around_bp
            self.seq_start = seq_start
            self.seq_end = seq_end
            self.seq_dir = seq_dir
            self.bp_pos_consensus = None
            self.bp_pos_reference = None
            if len(self.consensus) >= 1000:
                self.consensus = self.consensus[:1000]

        def locate_by_alignment(self):

            with open(self.tmp_dir + '/' + self.cluster_id + ".query.fa",
                      'w') as hout:
                print(">query_%s\n%s" % (self.cluster_id, self.seq_around_bp),
                      file=hout)

            with open(self.tmp_dir + '/' + self.cluster_id + ".target.fa",
                      'w') as hout:
                print(">target_%s\n%s" % (self.cluster_id, self.consensus),
                      file=hout)

            alignment_info = nanomonsv.long_read_validate.ssw_check(
                self.tmp_dir + '/' + self.cluster_id + ".target.fa",
                self.tmp_dir + '/' + self.cluster_id + ".query.fa")

            # print(self.seq_start, self.seq_end, self.seq_dir)
            # print(alignment_info["query_" + self.cluster_id])

            if "query_" + self.cluster_id not in alignment_info: return
            _, tstart_a, tend_a, qstart_a, qend_a, strand_a = alignment_info[
                "query_" + self.cluster_id]
            if strand_a != '+': return

            self.bp_pos_consensus = tend_a
            if self.seq_dir == '+':
                self.bp_pos_reference = self.seq_end - (
                    len(self.seq_around_bp) - qend_a)
            else:
                self.bp_pos_reference = self.seq_start + (
                    len(self.seq_around_bp) - qend_a)

            # print(self.bp_pos_reference, self.bp_pos_consensus)

    bp_loc = Breakpoint_locator()
    fasta_file = pysam.FastaFile(reference_file)

    with open(input_file, 'r') as hin, open(output_file, 'w') as hout:
        for row in csv.reader(hin, delimiter='\t'):
            if row[4] == '+':
                seq_around_bp = fasta_file.fetch(row[1],
                                                 int(row[2]) - margin,
                                                 int(row[3]))
                bp_loc.initialize(row[0], row[5], seq_around_bp,
                                  int(row[2]) - margin + 1, int(row[3]), '+')
            else:
                seq_around_bp = fasta_file.fetch(row[1], int(row[2]),
                                                 int(row[3]) + margin)
                seq_around_bp = reverse_complement(seq_around_bp)
                bp_loc.initialize(row[0], row[5], seq_around_bp,
                                  int(row[2]) + 1,
                                  int(row[3]) + margin, '-')

            bp_loc.locate_by_alignment()

            if bp_loc.bp_pos_reference is not None:
                print("%s\t%s\t%d\t%s\t%s" %
                      (row[0], row[1], bp_loc.bp_pos_reference, row[4],
                       row[5][bp_loc.bp_pos_consensus:]),
                      file=hout)

    del bp_loc
    fasta_file.close()
Example #35
    def get_node_seq(self, nodeId):
        if nodeId < 0:
            nodeSeq = reverse_complement(self.nodes[-nodeId].nodeSeq.strip())
        else:
            nodeSeq = self.nodes[nodeId].nodeSeq.strip()
        return nodeSeq
Example #36
def SNV_main(args, mut_df=None, frameshift_df=None):

    opts = vars(args)

    ############################
    # read in necessary files
    ############################

    # read in position to trinucleotide file
    logger.info('reading pos_to_nuc_dictionary...')
    # read in data frame
    pos_to_nuc_df = pd.read_table('db/merged_pos_to_context_class_final.txt',
                                  sep='\t',
                                  names=('pos', 'trinucleotide',
                                         'coefficient'))

    if len(pos_to_nuc_df[
            pos_to_nuc_df['trinucleotide'].astype(str).str.len() != 4]) != 0:
        logger.info('something is wrong with reading pos_to_nuc_dictionary...')
        sys.exit()

    # make dictionary
    pos_to_nuc = pos_to_nuc_df.set_index('pos')['trinucleotide'].to_dict()
    pos_to_nuc_keys = pos_to_nuc.keys()

    # read in trinucleotide to position file
    logger.info('reading nuc_to_pos_dictionary...')
    nuc_to_pos_dict = {}
    nuc_to_cumsum_dict = {}
    for k in strand_trinucs:
        df = pd.read_table('db/' + k + '_data.txt',
                           sep='\t',
                           names=('pos', 'trinucleotide', 'coefficient'))
        nuc_to_pos_dict[k] = df['pos'].values

        p = df['coefficient'].values
        cdf = np.cumsum(p)
        cdf /= cdf[-1]
        nuc_to_cumsum_dict[k] = cdf

    # read in cancer gene file
    logger.info('reading cancer_gene_dictionary...')
    pos_to_codon_dict = {}
    pos_to_gene_dict = {}
    tmp_Chr_pos = ''
    with open('db/cancergene_pos_list_final.txt', 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split(' ')
            if len(F) < 4:
                continue
            if opts['gene'] and F[0] != opts['gene']:
                continue
            # NB: the original test used '|', which Python parses as part of a
            # chained comparison; 'not in' expresses the intended check that a
            # codon entry is either 3 bases or the 11-char 'splice_site' tag.
            if len(F[3]) not in (3, 11):
                logger.info("codon frame error...: {0}, {1}, {2}".format(
                    F[0], F[1], F[3]))
                sys.exit()

            pos_to_gene_dict.setdefault(F[1], []).append(F[0])
            pos_to_codon_dict.setdefault(F[1], []).append(';'.join(
                [F[2], F[3], F[4]]))

    pos_to_gene_dict_keys = pos_to_gene_dict.keys()

    #############################
    # modify mutation dataframe
    #############################

    # get mutation df
    mut_df = pd.read_csv(opts['maf_file'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2': 'Tumor_Allele',
        'Tumor_Seq_Allele': 'Tumor_Allele',
    }
    mut_df.rename(columns=rename_dict, inplace=True)

    # drop rows with missing info
    na_cols = [
        'Gene', 'Reference_Allele', 'Tumor_Allele', 'Start_Position',
        'Chromosome'
    ]
    mut_df = mut_df.dropna(subset=na_cols)
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    if opts['gene']:
        mut_df = gene_analysis(mut_df, opts['gene'], pos_to_nuc_keys,
                               opts['maf_file'])

    #############################
    # SNV dataframe
    #############################

    # select valid single nucleotide variants only and corrects for 1-based coordinates!! (important)
    snv_df = filtering.snv_mutation_df(mut_df, opts['unique'])
    # get chromosome-position
    snv_df['Chrom_Pos'] = snv_df['Chromosome'] + ':' + snv_df[
        'Start_Position'].astype(str)

    # remove SNVs of non-coding regions
    orig_len = len(snv_df)
    snv_df = snv_df[snv_df['Chrom_Pos'].isin(pos_to_nuc_keys)]
    after_len = len(snv_df)

    log_msg = ('Dropped {num_dropped} non-coding SNV mutations.'.format(
        num_dropped=orig_len - after_len))
    logger.info(log_msg)

    #############################
    # SNV check
    #############################

    # get trinucleotide context
    snv_df['trinucleotide'] = snv_df['Chrom_Pos'].apply(
        lambda x: pos_to_nuc[x])
    snv_df['trinucleotide'] = snv_df['trinucleotide'].astype('category')
    snv_df['Chrom_Pos'] = snv_df['Chrom_Pos'].astype('category')

    # check if the mutation is in the gene_list
    snv_df['gene'] = snv_df['Chrom_Pos'].map(pos_to_gene_dict)

    tmp_snv_df = snv_df.dropna(subset=['gene'])
    outcome = []
    chr_pos = tmp_snv_df.Chrom_Pos.values
    t_allele = tmp_snv_df.Tumor_Allele.values
    n_allele = tmp_snv_df.Reference_Allele.values

    # check if the mutation is synonymous / non-synonymous / splice site
    for idx in range(tmp_snv_df.shape[0]):
        tmp_outcome = []
        # there are genes with different reading frames
        for item in pos_to_codon_dict[chr_pos[idx]]:

            pos_in_codon = item.split(';')[0]
            codon_seq = item.split(';')[1]
            strand = item.split(';')[2]

            if pos_in_codon == 'splice_site':
                tmp_outcome.append('splice_site')
                continue

            # check if base change causes amino acid change
            codon_seq_list = list(codon_seq)

            if codon_seq_list[int(pos_in_codon)] == n_allele[idx]:
                codon_seq_list[int(pos_in_codon)] = t_allele[idx]
            elif codon_seq_list[int(pos_in_codon)] == utils.reverse_complement(
                    n_allele[idx]):
                codon_seq_list[int(pos_in_codon)] = utils.reverse_complement(
                    t_allele[idx])
            else:
                print("error: " + chr_pos[idx] +
                      str(pos_to_codon_dict[chr_pos[idx]]))

            new_codon_seq = ''.join(codon_seq_list)

            if codon_table[codon_seq] == codon_table[new_codon_seq]:
                tmp_outcome.append('synonymous')
            else:
                tmp_outcome.append('non-synonymous')

        outcome.append(':'.join(tmp_outcome))

    tmp_snv_df['outcome'] = outcome
    tmp_snv_df['original'] = 'original'
    tmp_snv_df.to_csv(opts['output_prefix'] + '.final_snv_result.csv',
                      columns=[
                          'Tumor_Sample', 'original', 'Gene', 'Chromosome',
                          'Start_Position', 'End_Position', 'Reference_Allele',
                          'Tumor_Allele', 'Chrom_Pos', 'gene', 'outcome'
                      ],
                      index=False)

    #############################
    # SNV simulation
    #############################

    max_num_sim = opts['simulation_number']  # number of simulations

    for num_sim in range(max_num_sim):

        log_msg = ('Performing simulation {num_simulation}...'.format(
            num_simulation=num_sim + 1))
        logger.info(log_msg)

        # randomization
        trinuc = snv_df['trinucleotide'].values
        new_pos_list = []
        for idx in range(snv_df.shape[0]):
            wr = weighted_choice(nuc_to_pos_dict[trinuc[idx]],
                                 nuc_to_cumsum_dict[trinuc[idx]])
            new_pos_list.append(wr)

        snv_df['New_chr_pos'] = new_pos_list

        # check if new chr_pos is in gene_list
        snv_df['New_gene'] = snv_df['New_chr_pos'].map(pos_to_gene_dict)

        tmp_snv_df = snv_df.dropna(subset=['New_gene'])
        # print(tmp_snv_df)

        outcome = []
        chr_pos = tmp_snv_df.New_chr_pos.values
        t_allele = tmp_snv_df.Tumor_Allele.values
        n_allele = tmp_snv_df.Reference_Allele.values

        for idx in range(tmp_snv_df.shape[0]):
            tmp_outcome = []
            for item in pos_to_codon_dict[chr_pos[idx]]:

                pos_in_codon = item.split(';')[0]
                codon_seq = item.split(';')[1]
                strand = item.split(';')[2]

                if pos_in_codon == 'splice_site':
                    tmp_outcome.append('splice_site')
                    continue

                codon_seq_list = list(codon_seq)

                if codon_seq_list[int(pos_in_codon)] == n_allele[idx]:
                    codon_seq_list[int(pos_in_codon)] = t_allele[idx]
                elif codon_seq_list[int(
                        pos_in_codon)] == utils.reverse_complement(
                            n_allele[idx]):
                    codon_seq_list[int(
                        pos_in_codon)] = utils.reverse_complement(
                            t_allele[idx])
                else:
                    print("error: " + chr_pos[idx] +
                          str(pos_to_codon_dict[chr_pos[idx]]))

                new_codon_seq = ''.join(codon_seq_list)

                if codon_table[codon_seq] == codon_table[new_codon_seq]:
                    tmp_outcome.append('synonymous')
                else:
                    tmp_outcome.append('non-synonymous')

            outcome.append(':'.join(tmp_outcome))

        tmp_snv_df['New_outcome'] = outcome
        tmp_snv_df['simulation_num'] = 'simulation' + str(int(num_sim) + 1)
        tmp_snv_df.to_csv(opts['output_prefix'] + '.final_snv_result.csv',
                          columns=[
                              'Tumor_Sample', 'simulation_num', 'Gene',
                              'Chromosome', 'Start_Position', 'End_Position',
                              'Reference_Allele', 'Tumor_Allele',
                              'New_chr_pos', 'New_gene', 'New_outcome'
                          ],
                          mode='a',
                          header=False,
                          index=False)

    log_msg = ('Successfully finished. gene:{gene}, maf:{maf}'.format(
        gene=opts['gene'], maf=opts['maf_file']))
    logger.info(log_msg)
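
weighted_choice is called with a position array and the normalized cumulative distribution built above (cdf = np.cumsum(p); cdf /= cdf[-1]); a minimal inverse-CDF sampling sketch consistent with that usage (an assumption, not the original helper):

import numpy as np

def weighted_choice(values, cumsum):
    # Draw one element of `values` with probability proportional to the
    # weights whose normalized cumulative sum is `cumsum`.
    return values[np.searchsorted(cumsum, np.random.random())]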
Example #37
def context_generator(fa_file, chroms, min_length=3, max_length=5, padding=1):
    """ Creates context and target k-mers using provided fasta and
    fasta index file. Using a 1 base sliding window approach with
    random k-mer sizes between min and max length. Both polarities
    are sampled randomly.

    E.g. min_length=3, max_length=5, padding=1

        rnd_kmer_sizes = [4, 3, 5]
        CATATCA -> ['CATA', 'ATA', 'TATCA']

        -> ('chr?', 'ATA', ['CATA', 'TATCA'])

        DNA sequences will be converted into ints for the final result

        -> ('chr?', 12, [140, 1140])

    Args:
          fa_file (str): Path to fasta file with accompanying
                         Samtools index file (*.fai).
          chroms (list): Ordered list of chromosome/parent ids which will
                         be included when iterating over the fasta file.
       min_length (int): Minimal allowed k-mer size (nt).
       max_length (int): Maximum allowed k-mer size (nt).
          padding (int): Number of k-mers, on each side, added to the context.

    Yields:
        chromosome_id (str), target_seq (int), list(context_seqs (ints))
    """
    kmer_sizes = np.arange(min_length, max_length + 1)

    with pysam.FastaFile(fa_file) as ref:
        for chrom in chroms:
            chr_seq = ref.fetch(chrom)
            for subseq_pos in range(0, len(chr_seq)):

                # Create random kmer sizes.
                rnd_kmer_sizes = np.random.choice(kmer_sizes, padding * 2 + 1)

                # Extract sub-sequence from provided fasta file.
                subseq = chr_seq[subseq_pos:subseq_pos + rnd_kmer_sizes.size +
                                 rnd_kmer_sizes.max()]

                if len(subseq) < rnd_kmer_sizes.size + rnd_kmer_sizes.max():
                    continue

                # Randomly use either strand for learning (data augmentation).
                if np.random.randint(2):
                    subseq = reverse_complement(subseq)

                try:
                    num_kmers = []

                    for i, size in enumerate(rnd_kmer_sizes):
                        kmer_seq = subseq[i:i + size]
                        number_seq = multisize_patten2number(
                            kmer_seq, min_length, max_length)
                        num_kmers.append(number_seq)

                    context = np.array(num_kmers[:padding] +
                                       num_kmers[-padding:])
                    # np.random.shuffle(context)

                    target = num_kmers[padding]

                    yield chrom, target, context

                except (KeyError, IndexError, ValueError):
                    # Could not convert the pattern to a number
                    # (e.g. ambiguous bases in the window); skip it.
                    pass
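
multisize_patten2number is imported from elsewhere; an encoding that reproduces the docstring's numbers ('ATA' -> 12, 'CATA' -> 140, 'TATCA' -> 1140) gives each k-mer length its own disjoint integer range. The sketch below is reverse-engineered from those numbers and is not necessarily the original implementation:

def multisize_patten2number(kmer, min_length, max_length):
    # Base-4 encode the k-mer (A=0, C=1, G=2, T=3), then offset so that
    # every length in [min_length, len(kmer)) occupies an earlier range.
    digits = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    offset = sum(4 ** k for k in range(min_length, len(kmer)))
    number = 0
    for nt in kmer:
        number = number * 4 + digits[nt]  # KeyError on ambiguous bases
    return offset + number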
Example #38
def gencode_codon_list(target_gene, output):

    gencode_df = pd.read_table(
        'db/gencode_coding.modified.bed',
        names=('chr', 'start', 'end', 'ID', 'type', 'strand', 'gene',
               'order', 'sum'))
    gencode_df = gencode_df[gencode_df['gene'] == target_gene]
    gencode_df = gencode_df[gencode_df['chr'] != 'chrY']
    ID_list = gencode_df['ID'].unique()

    # select only unique ID
    ID_list = list(set(ID_list))

    # get sequence for each refID
    hin = open("db/gencode_coding.modified.bed", 'r')
    gene_seq_dict = {}
    for line in hin:
        F = line.rstrip('\n').split('\t')
        chrom = F[0].replace('chr','')
        if chrom not in chroms:
            continue
        if F[4] != "coding":
            continue
        coding_start = int(F[1])
        coding_end   = int(F[2])
        strand = F[5]
        ID = F[3]
        gene = F[6]
        if gene != target_gene:
            continue
        for item in ID_list:
            if item not in gene_seq_dict: gene_seq_dict[item] = ''
            if ID == item:
                exon_seq = fa.fetch(reference=chrom, start=coding_start,
                                    end=coding_end).upper()
                gene_seq_dict[item] = gene_seq_dict[item] + exon_seq
    hin.close()

    hout = open(output, 'a')
    # get codon information from refgene
    hin = open("db/gencode_coding.modified.bed", 'r')
    exon_length_dict = {}
    for line in hin:
        F = line.rstrip('\n').split('\t')
        chrom = F[0].replace('chr','')
        if chrom not in chroms:
            continue
        if F[4] == "coding":
            coding_start = int(F[1])
            coding_end   = int(F[2])
            strand = F[5]
            ID = F[3]
            gene = F[6]
            if gene != target_gene:
                continue
            for item in ID_list:
                if item != ID: continue
                if item not in exon_length_dict: exon_length_dict[item] = 0
                if strand == '+':
                    for pos in range(coding_end - coding_start):
                        # relative pos in the gene
                        pos2 = pos + exon_length_dict[item]
                        codon_pos = pos2 // 3
                        codon_start = codon_pos * 3
                        pos_in_codon = pos2 % 3
                        print(gene, item,
                              ':'.join([chrom, str(coding_start + pos)]),
                              pos_in_codon,
                              gene_seq_dict[item][codon_start:codon_start + 3],
                              strand, file=hout)
                    exon_length_dict[item] = exon_length_dict[item] + (coding_end - coding_start)

                if strand == '-':
                    for pos in range(coding_end - coding_start):
                        # relative pos in the gene
                        pos2 = pos + exon_length_dict[item]
                        codon_pos = pos2 // 3
                        codon_start = codon_pos * 3
                        pos_in_codon = pos2 % 3
                        print(gene, item,
                              ':'.join([chrom, str(coding_start + pos)]),
                              2 - pos_in_codon,
                              utils.reverse_complement(
                                  gene_seq_dict[item][codon_start:codon_start + 3]),
                              strand, file=hout)
                    exon_length_dict[item] = exon_length_dict[item] + (coding_end - coding_start)

        elif F[4] == "intron":
            start = int(F[1])
            end   = int(F[2])
            strand = F[5]
            ID = F[3]
            gene = F[6]
            if gene != target_gene:
                continue
            for pos in (start, start + 1, end - 2, end - 1):
                print(gene, ID, ':'.join([chrom, str(pos)]),
                      'splice_site', 'splice_site', strand, file=hout)
    hin.close()
    hout.close()
Example #39
def count_read_by_alignment(input_file,
                            bam_file,
                            reference,
                            output_file,
                            validate_sequence_length=200,
                            score_ratio_thres=1.4,
                            start_pos_thres=0.2,
                            end_pos_thres=0.8):
    class Alignment_counter(object):
        def __init__(self, score_ratio_thres, start_pos_thres, end_pos_thres):
            self.key = ''
            # self.hout = open(output_file, 'w')
            self.query_seq = None
            self.target_seq_list = []
            self.score_ratio_thres = score_ratio_thres
            self.start_pos_thres = start_pos_thres
            self.end_pos_thres = end_pos_thres
            self.tmp_dir = tempfile.mkdtemp()

        def __del__(self):
            # self.hout.close()
            shutil.rmtree(self.tmp_dir)

        def initialize(self, key, query_seq):
            self.key = key
            self.query_seq = query_seq
            self.target_seq_list = []

        def add_query_seq(self, read_name, read_seq):
            self.target_seq_list.append((read_name, read_seq))

        def count_alignment(self):

            if len(self.target_seq_list) == 0:
                return None
            cluster_id, _, _, _ = self.key.split('\t')
            """
            with open(self.tmp_dir + '/' + cluster_id + ".target.fa", 'w') as hout_ta:
                for read in self.target_seq_list:
                    print(">%s\n%s" % (read[0], read[1]), file = hout_ta)

            with open(self.tmp_dir + '/' + cluster_id + ".query.fa", 'w') as hout_qu:
                print(">query_%s\n%s" % (cluster_id, self.query_seq), file = hout_qu)

            alignment_info = nanomonsv.long_read_validate.ssw_check(
                self.tmp_dir + '/' + cluster_id + ".query.fa",
                self.tmp_dir + '/' + cluster_id + ".target.fa")

            all_rnames = list(set(alignment_info.keys()))
            supporting_reads = [rname for rname in all_rnames if \
                alignment_info[rname][0] > self.score_ratio_thres * len(self.query_seq) and \
                alignment_info[rname][1] < self.start_pos_thres * len(self.query_seq) and \
                alignment_info[rname][2] > self.end_pos_thres * len(self.query_seq)]

            if "15033196" in self.key:
                for a in alignment_info: print(a, alignment_info[a], a in supporting_reads)
                print(' ')
            """
            # print(cluster_id)

            all_rnames = []
            supporting_reads = []
            align_res = []
            for target_read in self.target_seq_list:
                all_rnames.append(target_read[0])
                tres = edlib.align(self.query_seq,
                                   target_read[1],
                                   mode="HW",
                                   task="locations")
                # print(target_read[0])
                # print(tres)
                if tres["editDistance"] < len(self.query_seq) * 0.25:  # and \
                    # tres["locations"][0][0] < 0.2 * len(self.query_seq) and \
                    # tres["locations"][0][1] > 0.8 * len(self.query_seq):
                    supporting_reads.append(target_read[0])

            return (len(all_rnames), len(supporting_reads))

    bam_hin = pysam.AlignmentFile(bam_file, 'rb')
    reference_fasta = pysam.FastaFile(reference)
    rname2key = {}
    key2contig = {}
    with open(input_file, 'r') as hin:
        for row in csv.reader(hin, delimiter='\t'):
            key = '\t'.join(row[:4])
            for read in bam_hin.fetch(row[1], max(int(row[2]) - 100, 0),
                                      int(row[2]) + 100):
                if read.is_secondary: continue
                if read.qname not in rname2key: rname2key[read.qname] = []
                rname2key[read.qname].append(key)

            if row[3] == '+':
                contig = reference_fasta.fetch(
                    row[1], max(int(row[2]) - validate_sequence_length - 1, 0),
                    int(row[2]))
            else:
                contig = reference_fasta.fetch(
                    row[1],
                    int(row[2]) - 1,
                    int(row[2]) + validate_sequence_length - 1)
                contig = reverse_complement(contig)
            contig = contig + row[4][:validate_sequence_length]
            key2contig[key] = contig

    for rname in rname2key:
        keys = list(set(rname2key[rname]))
        rname2key[rname] = keys

    with open(output_file + ".tmp.long_read_seq.unsorted", 'w') as hout:
        for read in bam_hin.fetch():

            if read.is_secondary or read.is_supplementary: continue
            if read.qname in rname2key:
                read_seq = (reverse_complement(read.query_sequence)
                            if read.is_reverse else read.query_sequence)
                for key in rname2key[read.qname]:
                    print("%s\t%s\t%s" % (key, read.qname, read_seq),
                          file=hout)

    bam_hin.close()

    with open(output_file + ".tmp.long_read_seq.sorted", 'w') as hout:
        subprocess.check_call(
            ["sort", "-k1,1", output_file + ".tmp.long_read_seq.unsorted"],
            stdout=hout)
    os.remove(output_file + ".tmp.long_read_seq.unsorted")

    # key2count = {}
    alignment_counter = Alignment_counter(score_ratio_thres, start_pos_thres,
                                          end_pos_thres)
    with open(output_file + ".tmp.long_read_seq.sorted",
              'r') as hin, open(output_file, 'w') as hout:
        for row in csv.reader(hin, delimiter='\t'):
            key = '\t'.join(row[:4])
            if key != alignment_counter.key:
                acount = alignment_counter.count_alignment()
                if acount is not None:
                    print("%s\t%d\t%d" %
                          (alignment_counter.key, acount[0], acount[1]),
                          file=hout)
                    # key2count[alignment_counter.key] = acount

                alignment_counter.initialize(key, key2contig[key])
            alignment_counter.add_query_seq(row[4], row[5])

        acount = alignment_counter.count_alignment()
        if acount is not None:
            print("%s\t%d\t%d" % (alignment_counter.key, acount[0], acount[1]),
                  file=hout)
            # key2count[alignment_counter.key] = acount

    del alignment_counter
    os.remove(output_file + ".tmp.long_read_seq.sorted")
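
For reference, edlib.align(..., mode="HW", task="locations") performs an infix (semi-global) alignment: leading and trailing bases of the target are free, so the contig only needs to occur somewhere inside the long read. A small usage example (expected output per the edlib docs; locations are 0-based with inclusive ends):

import edlib

res = edlib.align("ACGT", "TTTACGTTTT", mode="HW", task="locations")
print(res["editDistance"], res["locations"])  # expected: 0 [(3, 6)]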
Example #40
import neural_network as nn
import utils as util
from random import shuffle
import numpy as np
import random

# read in sites
posfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-positives.txt'
negfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/yeast-upstream-1k-negative.fa'
testfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-test.txt'
poslist = util.read_pos(posfile)
finaltestlist = util.read_pos(testfile)
posreversecomp = []
for i in poslist:
    posreversecomp.append(util.reverse_complement(i))

poslist = poslist + posreversecomp

neglist = util.read_fasta(negfile)
negreversecomp = []
for i in neglist:
    negreversecomp.append(util.reverse_complement(i))
neglist = neglist + negreversecomp

# drop negatives that also appear as positives; build the set once and
# filter rather than removing items from the list while iterating over it
posset = set(poslist)
neglist = [i for i in neglist if i not in posset]

# print('negs', neglist[:10])
print('neg', len(neglist))