Exemplo n.º 1
0
def fix_fusion(ref_f, genome_fa, out_file, no_fix, secondary_flag=0,
               denovo_flag=0):
    """
    Realign fusion junctions and write them out as tab-separated records.

    The junctions come from fix_bed(), which is given the temporary file
    'annotated_fusion.txt.tmp'. Each junction becomes one line in out_file;
    the temporary file is removed once all records have been written.

    Args:
        ref_f: annotation file, handed to parse_ref().
        genome_fa: genome FASTA file, handed to check_fasta().
        out_file: path of the main output file.
        no_fix: forwarded unchanged to fix_bed().
        secondary_flag: when true, junctions whose key starts with
            'secondary' are written to 'low_conf_<out_file>'; when false,
            such junctions are skipped.
        denovo_flag: forwarded to fix_bed(); when true, a record whose
            chrom/start/end/block layout was already written is dropped.
    """
    print('Start to fix fusion junctions...')
    fa = check_fasta(genome_fa)
    ref = parse_ref(ref_f, 2)
    annotated_fusion_f = 'annotated_fusion.txt.tmp'
    fusions, fusion_names, fixed_flag = fix_bed(annotated_fusion_f, ref, fa,
                                                no_fix, denovo_flag)
    total = 0
    annotations = set()  # records already emitted (only used in denovo mode)
    fixed_fusion_f = out_file
    secondary_f = None
    if secondary_flag:
        secondary_f = open('low_conf_%s' % out_file, 'w')
    try:
        with open(fixed_fusion_f, 'w') as outf:
            for fus in fusion_names:
                is_secondary = fus.startswith('secondary')
                if is_secondary and secondary_f is None:
                    # No low-confidence output was requested, so there is
                    # nowhere to write this record.
                    continue
                reads = str(fusions[fus])
                fixed = fixed_flag[fus]
                if fixed > 0:
                    total += 1
                fixed = str(fixed)
                name = 'circular_RNA/' + reads
                if is_secondary:
                    _, loc, strand, left_info, right_info = fus.split('|')
                    secondary_f.write('\t'.join([loc, name, fixed, strand,
                                                 left_info, right_info]))
                    secondary_f.write('\n')
                    continue
                gene, iso, chrom, strand, index = fus.split()
                starts, ends = ref['\t'.join([gene, iso, chrom, strand])]
                exon_num = len(starts)
                intron_num = exon_num - 1
                if ',' in index:  # back spliced exons
                    s, e = [int(x) for x in index.split(',')]
                    # Report 1-based exon numbers counted along the
                    # transcript, i.e. reversed for the minus strand.
                    if strand == '+':
                        index_info = ','.join(str(x + 1)
                                              for x in range(s, e + 1))
                    else:
                        index_info = ','.join(str(exon_num - x)
                                              for x in range(s, e + 1))
                    start = str(starts[s])
                    end = str(ends[e])
                    length = str(e - s + 1)
                    sizes, offsets = generate_bed(int(start),
                                                  starts[s:(e + 1)],
                                                  ends[s:(e + 1)])
                    annotation_info = '\t'.join([chrom, start, end, sizes,
                                                 offsets])
                    # remove circular RNA info duplications in denovo mode
                    if denovo_flag and annotation_info in annotations:
                        continue
                    # Flanking introns; 'None' when the circle touches the
                    # first or last exon of the isoform.
                    if s == 0:
                        left_intron = 'None'
                    else:
                        left_intron = '%s:%d-%d' % (chrom, ends[s - 1],
                                                    starts[s])
                    if e == len(ends) - 1:
                        right_intron = 'None'
                    else:
                        right_intron = '%s:%d-%d' % (chrom, ends[e],
                                                     starts[e + 1])
                    intron = '|'.join([left_intron, right_intron])
                    bed = '\t'.join([chrom, start, end, name, fixed, strand,
                                     start, start, '0,0,0', length, sizes,
                                     offsets, reads, 'circRNA', gene, iso,
                                     index_info, intron])
                else:  # ciRNAs
                    index, start, end = index.split('|')
                    size = str(int(end) - int(start))
                    annotation_info = '\t'.join([chrom, start, end, size,
                                                 '0'])
                    # remove circular RNA info duplications in denovo mode
                    if denovo_flag and annotation_info in annotations:
                        continue
                    index = int(index)
                    if strand == '+':
                        index_info = str(index + 1)
                    else:
                        index_info = str(intron_num - index)
                    intron = '%s:%d-%d' % (chrom, ends[index],
                                           starts[index + 1])
                    bed = '\t'.join([chrom, start, end, name, fixed, strand,
                                     start, start, '0,0,0', '1', size, '0',
                                     reads, 'ciRNA', gene, iso, index_info,
                                     intron])
                if denovo_flag:  # in denovo mode
                    annotations.add(annotation_info)
                outf.write(bed + '\n')
    finally:
        # Close the low-confidence file even if a record failed to parse.
        if secondary_f is not None:
            secondary_f.close()

    # Only reached on success, so a failed run keeps its temporary input.
    os.remove(annotated_fusion_f)

    print('Fixed %d fusion junctions!' % total)
Exemplo n.º 2
0
def fix_fusion(ref_f, genome_fa, out_dir, no_fix, denovo_flag=0):
    """
    Realign fusion junctions and write them to '<out_dir>/circ_fusion.txt'.

    The junctions come from fix_bed(), which is given
    '<out_dir>/annotated_fusion.txt'. Each junction becomes one
    tab-separated line in the output file.

    Args:
        ref_f: annotation file, handed to parse_ref().
        genome_fa: genome FASTA file, handed to check_fasta().
        out_dir: directory holding the input and receiving the output.
        no_fix: forwarded unchanged to fix_bed().
        denovo_flag: forwarded to fix_bed(); when true, a record whose
            chrom/start/end/block layout was already written is dropped.
    """
    print('Start to fix fusion junctions...')
    fa = check_fasta(genome_fa)
    ref = parse_ref(ref_f, 2)
    annotated_fusion_f = '%s/annotated_fusion.txt' % out_dir
    fusions, fusion_names, fixed_flag = fix_bed(annotated_fusion_f, ref, fa,
                                                no_fix, denovo_flag)
    total = 0
    annotations = set()  # records already emitted (only used in denovo mode)
    fixed_fusion_f = '%s/circ_fusion.txt' % out_dir
    with open(fixed_fusion_f, 'w') as outf:
        for fus in fusion_names:
            reads = str(fusions[fus])
            fixed = fixed_flag[fus]
            if fixed > 0:
                total += 1
            fixed = str(fixed)
            name = 'circular_RNA/' + reads
            gene, iso, chrom, strand, index = fus.split()
            starts, ends = ref['\t'.join([gene, iso, chrom, strand])]
            exon_num = len(starts)
            intron_num = exon_num - 1
            if ',' in index:  # back spliced exons
                s, e = [int(x) for x in index.split(',')]
                # Report 1-based exon numbers counted along the transcript,
                # i.e. reversed for the minus strand. range() is used rather
                # than xrange() so this runs on Python 2 and Python 3.
                if strand == '+':
                    index_info = ','.join(str(x + 1) for x in range(s, e + 1))
                else:
                    index_info = ','.join(str(exon_num - x)
                                          for x in range(s, e + 1))
                start = str(starts[s])
                end = str(ends[e])
                length = str(e - s + 1)
                sizes, offsets = generate_bed(int(start), starts[s:(e + 1)],
                                              ends[s:(e + 1)])
                annotation_info = '\t'.join([chrom, start, end, sizes,
                                             offsets])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                # Flanking introns; 'None' when the circle touches the first
                # or last exon of the isoform.
                if s == 0:
                    left_intron = 'None'
                else:
                    left_intron = '%s:%d-%d' % (chrom, ends[s - 1], starts[s])
                if e == len(ends) - 1:
                    right_intron = 'None'
                else:
                    right_intron = '%s:%d-%d' % (chrom, ends[e], starts[e + 1])
                intron = '|'.join([left_intron, right_intron])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', length, sizes, offsets,
                                 reads, 'circRNA', gene, iso, index_info,
                                 intron])
            else:  # ciRNAs
                index, start, end = index.split('|')
                size = str(int(end) - int(start))
                annotation_info = '\t'.join([chrom, start, end, size, '0'])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                index = int(index)
                if strand == '+':
                    index_info = str(index + 1)
                else:
                    index_info = str(intron_num - index)
                intron = '%s:%d-%d' % (chrom, ends[index], starts[index + 1])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', '1', size, '0',
                                 reads, 'ciRNA', gene, iso, index_info,
                                 intron])
            if denovo_flag:  # in denovo mode
                annotations.add(annotation_info)
            outf.write(bed + '\n')
    print('Fixed %d fusion junctions!' % total)