# Scan every .fast5 file matched under dirpath. For each read whose id is in
# rDNA_read_ids: summarise CpG methylation, map the read in split_length
# pieces against the rDNA reference, open a figure, and collect plot
# coordinates for averaged CpG scores and for modified 'A' positions.

# Collected across all files/reads: the read[n-1:n+3] slice around every 'A'
# position whose mod_base_table entry has a non-zero second column.
contexts = []
split_length = 200
# Plain string concatenation: dirpath must already end with a path separator
# for this pattern to match files inside the directory.
for fast5 in glob.glob(dirpath + '*.fast5'):
    dataset = guppy_fast5_extraction(fast5)
    for data in dataset:
        # Each item is indexed as data[0] = read id, data[1] = FASTQ text,
        # data[2] = per-base modification table.
        read_id = data[0]
        if read_id not in rDNA_read_ids:
            continue
        # Whitespace-split the FASTQ text; tokens 6 and 8 are taken as the
        # sequence and the quality string.
        # NOTE(review): this relies on the header having a fixed number of
        # whitespace-separated fields - confirm against the guppy output.
        split_fastq = data[1].split()
        read = split_fastq[6]
        quality = split_fastq[8]
        mod_base_table = data[2]
        cpg_scores = make_methylation_summary(read, mod_base_table)
        averaged_cpg = calculate_meth_stats(cpg_scores)
        utilities.split_mapping_and_sam_analysis(split_length, read_id, read, quality, '../clive/rDNA_index/humRibosomal.fa')
        # NOTE(review): 'test' is presumably the name under which the mapping
        # step above stored its result - confirm in utilities.
        # lc, fig and ax are not used again within the visible part of this
        # script.
        lc = plot_read_structure('test', split_length, 0)
        fig = plt.figure()
        plt.subplots_adjust(left=0.2)
        ax = fig.add_subplot()
        # x/y: one point per entry of averaged_cpg (key -> score).
        x = []
        y = []
        for n, score in averaged_cpg.items():
            # NOTE(review): 200 is hard-coded here although split_length
            # (also 200) is defined above - presumably they are meant to be
            # the same quantity; confirm. The 10000 factor looks like a plot
            # scale - confirm.
            x.append(n * 200)
            y.append(score * 10000)
        # xa/ya: one point per 'A' base with a non-zero value in column 1 of
        # its mod_base_table row.
        xa = []
        ya = []
        for n, base in enumerate(mod_base_table):
            if base[1] != 0 and read[n] == 'A':
                # NOTE(review): for n == 0 the slice start is -1, which
                # Python resolves from the end of the string, so the
                # recorded context is normally empty rather than the first
                # three bases.
                contexts.append(read[n-1:n+3])
                xa.append(n)
                # The 40 factor looks like a plot scale - confirm.
                ya.append(base[1] * 40)
Пример #2
0
def _append_fasta_record(path, header, sequence):
    """Append one record (header line, sequence line, blank line) to path."""
    with open(path, 'a') as fw:
        fw.write('>' + header + '\n')
        fw.write(sequence + '\n\n')


def find_boundaries_from_fastq(fastq, split_length):
    """Find boundary containing reads from a .fastq file.

    Every read of at least 40000 bases is written to a temporary FASTQ via
    make_temp_fastq, mapped with ``bwa mem`` against the rDNA reference, and
    passed to find_end_reads. For each read with a detected boundary, the
    10 kb of sequence adjacent to the boundary is appended to
    ``boundary_seq1.fa`` or ``boundary_seq2.fa`` and the read structure is
    plotted to ``end_reads/<read id>.png``.

    Args:
        fastq (str): filename
        split_length (int): split length handed on to make_temp_fastq,
            find_end_reads and plot_read_structure
    Returns:
        boundaries: list of (boundary coordinate, boundary rDNA coordinate,
                             direction, side of non rDNA, read, header).
            As the code currently stands this list is always empty: the
            refinement step that fills it is skipped by a ``continue``
            (see the note in the loop body).
    """
    boundaries = []
    with open(fastq) as f:
        # Read the file four lines (one FASTQ record) at a time.
        for each_fastq in itertools.zip_longest(*[iter(f)] * 4):
            # zip_longest pads a truncated final record with None; skip it
            # instead of crashing on None.strip().
            if None in each_fastq:
                continue
            header = each_fastq[0].strip()
            read = each_fastq[1].strip()
            if len(read) < 40000:
                continue
            quality = each_fastq[3].strip()
            make_temp_fastq(split_length, header, read, quality)
            # The alignment itself is redirected into temp_sam.sam by the
            # shell; whatever else bwa prints goes to FNULL.
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 '
                '/home/yutaro/nanopore/clive/rDNA_index/'
                'humRibosomal.fa temp_files/temp_fastq.fastq > '
                'temp_files/temp_sam.sam',
                shell=True,
                stdout=FNULL,
                stderr=subprocess.STDOUT)
            # rDNA_coordinate=1 when using find_true_boundary2
            temp_boundary = find_end_reads('temp_files/temp_sam.sam',
                                           split_length,
                                           rDNA_coordinate=1)
            if not temp_boundary:
                continue

            header = header.split()[0]
            boundary = int(temp_boundary[0])
            direction = temp_boundary[1]
            side = temp_boundary[2]
            if direction == '+':
                # Computed for both sides. Previously it was only assigned
                # when side == 'right', so the other side either raised
                # UnboundLocalError or wrote a stale sequence left over from
                # an earlier read. This mirrors the '-' branch below.
                bound_seq = read[boundary:boundary + 10000]
                goes_to_seq1 = side == 'right'
            else:
                # Clamp at 0: a negative start would be resolved from the
                # end of the read and normally give an empty slice.
                start = max(boundary - 10000, 0)
                bound_seq = str(Seq(read[start:boundary])
                                .reverse_complement())
                goes_to_seq1 = side == 'left'
            if goes_to_seq1:
                _append_fasta_record('boundary_seq1.fa', header, bound_seq)
            else:
                _append_fasta_record('boundary_seq2.fa', header, bound_seq)

            plot_read_structure(header,
                                split_length,
                                savename='end_reads/' + header + '.png',
                                title=str(temp_boundary[0]))
            # NOTE(review): this `continue` makes the refinement below
            # unreachable, so `boundaries` is never filled. It is kept to
            # preserve current behaviour; remove it to re-enable
            # find_true_boundary2.
            continue
            true_boundary = find_true_boundary2(header, read, quality,
                                                temp_boundary)
            if true_boundary:
                boundaries.append(
                    (true_boundary[0], true_boundary[1], temp_boundary[1],
                     temp_boundary[2], read, header.split()[0]))
    return boundaries
Пример #3
0
 # Plot the read structure of every read belonging to a boundary group with
 # more than two members, one output directory per group.
 group2 = pd.read_pickle('group2.pkl')
 # The FASTA files hold three lines per record (header, sequence, blank), so
 # record k starts on line 3 * k; map record number -> read id (header text
 # without the leading '>').
 with open('boundary_seq1.fa') as f:
     num2id1 = {n // 3: line.strip()[1:]
                for n, line in enumerate(f) if n % 3 == 0}
 with open('boundary_seq2.fa') as f:
     num2id2 = {n // 3: line.strip()[1:]
                for n, line in enumerate(f) if n % 3 == 0}
 for n, i in enumerate(group1):
     if len(i) > 2:
         # Create the per-group directory here as well; this loop used to
         # rely on it already existing, unlike the group2 loop.
         os.makedirs('boundary1/' + str(n), exist_ok=True)
         for item in i:
             plot_read_structure(num2id1[item],
                                 200,
                                 savename='boundary1/' + str(n) + '/' +
                                 num2id1[item])
 for n, i in enumerate(group2):
     if len(i) > 2:
         # exist_ok replaces the former try/finally around os.mkdir, which
         # did not suppress the error: it plotted the group and then
         # re-raised, stopping the script on any existing directory.
         os.makedirs('boundary2/' + str(n), exist_ok=True)
         for item in i:
             plot_read_structure(num2id2[item],
                                 200,
                                 savename='boundary2/' + str(n) + '/' +
                                 num2id2[item])
 # Stops the script here; the statements that follow never run.
 quit()
 seq_list1 = []
 with open('boundary_seq1.fa') as f:
     for fa in itertools.zip_longest(*[iter(f)] * 3):