Пример #1
0
def read_coord(intervals, infos, read_name, cigar, leftpos, XI, XQ, chrom, flag):
    """Record the chromosome, chain and covered interval of one alignment.

    Appends ``chrom`` and the chain sign to ``infos[read_name][XI]`` and
    appends ``(int(XQ), read_interval)`` to ``intervals[read_name][XI]``.
    Both containers are modified in place; nothing is returned.

    :param intervals: nested mapping read_name -> XI -> list of (xq, interval)
    :param infos: nested mapping read_name -> XI -> list of chrom/chain items
    :param read_name: name of the read
    :param cigar: CIGAR string of the alignment
    :param leftpos: leftmost position of the alignment
    :param XI: key of the alignment within the read (used as the inner key)
    :param XQ: value convertible to int, stored alongside the interval
    :param chrom: chromosome name
    :param flag: integer bit flag; bit 16 set means '-' chain
        (presumably the SAM reverse-strand bit - confirm against caller)
    """
    query_pos = int(XQ)
    strand = '-' if flag & 16 else '+'

    info_record = infos[read_name][XI]
    info_record.append(chrom)
    info_record.append(strand)

    # local alignment is always bound by M
    pieces = get_read_interval(cigar, leftpos, output='interval')
    covered = one_interval(pieces)
    intervals[read_name][XI].append((query_pos, covered))
Пример #2
0
def return_mate_tuple(line_dict, second_mates, chrom, chain):
    """
    Find the second mate that accompanies a chimeric junction read.

    :param line_dict: dictionary of elements in a single line of
        Chimeric.out.junction, output of star_line_dict
    :param second_mates: list of reads with the same name from SAM file
    :param chrom: chromosome of the chimeric junction
    :param chain: chain of the chimeric junction
    :return: None if no suitable mate is found, otherwise a tuple
        (chim_part1, chim_part2, mate intervals, result of mate_intersection)
    """
    chim_part1 = get_read_interval(cigar=line_dict['cigar1'],
                                   leftpos=line_dict['coord1'])
    chim_part2 = get_read_interval(cigar=line_dict['cigar2'],
                                   leftpos=line_dict['coord2'])

    for candidate in second_mates:
        # Records whose CIGAR equals a chimeric part are the chimeric read itself
        if candidate['cigar'] in (line_dict['cigar1'], line_dict['cigar2']):
            continue

        if candidate['NH'] > 1:
            # check if mapping to this chromosome is unique
            hits_on_chrom = sum(
                1 for record in second_mates if record['chrom'] == chrom
            )
            if hits_on_chrom > 1:
                continue

        # The mate must lie on the same chromosome and on the opposite chain
        if candidate['chrom'] != chrom or candidate['chain'] == chain:
            continue

        mate2_dict = get_read_interval(cigar=candidate['cigar'],
                                       leftpos=candidate['leftpos'])
        interval_intersection = mate_intersection(chim_part1=chim_part1,
                                                  chim_part2=chim_part2,
                                                  read_dict2=mate2_dict)
        return chim_part1, chim_part2, mate2_dict, interval_intersection

    return None
Пример #3
0
 def test_get_read_interval(self):
     """Compare get_read_interval(output='dict') with expected segments.

     Every case pairs a CIGAR string with the ordered (name, start, end)
     segments expected when the alignment starts at position 100.
     """
     start = 100
     cases = [
         # soft-clipped ends are expected to produce no segment
         ('20M30S', [('M1', 100.0, 119.0)]),
         ('30S20M', [('M1', 100.0, 119.0)]),
         ('30S20M30S', [('M1', 100.0, 119.0)]),
         # insertion: the second match is expected right after the first
         ('20M2I20M', [('M1', 100.0, 119.0),
                       ('M2', 120.0, 139.0)]),
         ('20M2D20M', [('M1', 100.0, 119.0),
                       ('D1', 120.0, 121.0),
                       ('M2', 122.0, 141.0)]),
         ('20M32N20M', [('M1', 100.0, 119.0),
                        ('N1', 120.0, 151.0),
                        ('M2', 152.0, 171.0)]),
         ('20S20M32N20M', [('M1', 100.0, 119.0),
                           ('N1', 120.0, 151.0),
                           ('M2', 152.0, 171.0)]),
         ('20S20M32N20M1000p76M', [('M1', 100.0, 119.0),
                                   ('N1', 120.0, 151.0),
                                   ('M2', 152.0, 171.0),
                                   ('p1', 172.0, 1171.0),
                                   ('M3', 1172.0, 1247.0)]),
     ]
     expected = [
         OrderedDict((name, interval([lo, hi])) for name, lo, hi in segments)
         for _, segments in cases
     ]
     actual = [
         ptes.get_read_interval(cigar=cigar_string, leftpos=start, output='dict')
         for cigar_string, _ in cases
     ]
     for want, got in zip(expected, actual):
         self.assertEqual(want, got)
Пример #4
0
def main():
    """Convert STAR Chimeric.out.junction output into read and junction tables.

    Reads one chimeric junction file (or, in list mode, a list of files paired
    with a list of tags), keeps junctions whose two parts lie on the same
    chromosome and chain, and writes into the output folder:

    * ``chim_reads.csv`` - one row per accepted chimeric read;
    * ``junc_dict.json`` (or ``junc_dict.json.gz``) - read intervals grouped
      by junction (chrom, chain, donor, acceptor);
    * ``chim_junctions.csv`` - output of ``reads_to_junctions``.

    If no read passes the filters, only a warning is logged and no file is
    written.
    """
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)

    # NOTE: counters are cumulative over all input files in list mode
    skipped = {'non-filtered': 0,    # different chromosomes and/or chains
               'chrM': 0,      # mapping to chrM
               'PE': 0,   # junction between the mates, -1 in STAR output
               'non-chim': 0}   # STAR counts very long (>1Mb) junctions as chimeric

    junc_dict = defaultdict(dict)

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        pairs = [(args.input, args.tag)]

    read_columns = ['read_name', 'chrom', 'chain',
                    'donor', 'acceptor', 'annot_donor',
                    'annot_acceptor', 'letters_ss',
                    'chim_dist']
    # Non-empty per-file dataframes; stays empty if nothing passes the filters
    chim_reads_df_list = []

    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        # Initialised before the loop so that an empty file reports 0 rows
        # instead of raising NameError on the loop variable.
        n_rows = 0
        PTES_logger.info('Input file: %s ' % chim_name)

        PTES_logger.info('Reading STAR output...')
        with open(chim_name, 'r') as input_file:
            for n_rows, line in enumerate(input_file, start=1):
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                donor_ss = line_dict['donor_ss']
                acceptor_ss = line_dict['acceptor_ss']
                chim_dist = abs(donor_ss - acceptor_ss)
                # Rows are skipped when the junction is too long or when the
                # donor precedes the acceptor along the chain.
                if chim_dist > 1000000 \
                        or (chain == '+' and donor_ss < acceptor_ss) \
                        or (chain == '-' and donor_ss > acceptor_ss):
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'], leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'], leftpos=line_dict['coord2'])
                junc_dict[(chrom, chain, donor_ss, acceptor_ss)].update(
                    {read_name: (chim_part1, chim_part2)})

                annot_donor = 0
                annot_acceptor = 0
                if donor_ss in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if acceptor_ss in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1

                read_names_list.append({
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': donor_ss,
                    'acceptor': acceptor_ss,
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': chim_dist,
                })

        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % n_rows)
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        if not read_names_list:
            # Selecting columns from an empty frame would raise KeyError
            PTES_logger.warning('Creating reads dataframe... empty dataframe')
            continue
        reads_df = pd.DataFrame(read_names_list)[read_columns].sort_values(
            by=['chrom', 'chain', 'donor', 'acceptor']
        ).reset_index(drop=True)  # reorder columns
        reads_df['id'] = tag
        chim_reads_df_list.append(reads_df)

    all_reads_df = None
    if chim_reads_df_list:
        if args.list:
            # pd.concat raises ValueError on an empty list, hence the guard
            all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)
        else:
            all_reads_df = chim_reads_df_list[0]

    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json...')
        # JSON object keys must be strings, so the tuple keys are stringified
        serializable_juncs = {str(k): v for k, v in junc_dict.items()}
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(json.dumps(serializable_juncs).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump(serializable_juncs, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
Пример #5
0
 if junction_type == '1':
     junction_letters = 'GT/AG'
 elif junction_type == '2':
     junction_letters = 'CT/AC'            
 else:
     junction_letters = 'unknown'
 read_name = line_list[9]
 coord1 = int(line_list[10])
 cigar1 = line_list[11]
 coord2 = int(line_list[12])
 cigar2 = line_list[13]
 track_lists = []
 windows_min = []
 windows_max = []
 if 'p' in cigar1:
     mate1 = get_read_interval(cigar1.split('p')[0].rstrip('-0123456789'), coord1)
     mate2 = get_read_interval(*split_cigar(cigar1, coord1))   
     chim_part2 = get_read_interval(cigar2, coord2)            
     bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r')
     bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r')
     bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate2', color='r')
     track_lists = [bed1, bed2, bed3]
 elif 'p' in cigar2:
     mate1 = get_read_interval(cigar2.split('p')[0].rstrip('-0123456789'), coord2)
     mate2 = get_read_interval(*split_cigar(cigar2, coord2))   
     chim_part2 = get_read_interval(cigar1, coord1)
     bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r')
     bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r')
     bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate1', color='r')
     track_lists = [bed1, bed2, bed3]
 else:   #single-read mode
Пример #6
0
def main():
    """Collect reads mapped with an intron from STAR SAM output.

    Reads one SAM file (or, in list mode, a list of SAM files with matching
    lists of tags and, optionally, chimeric junction files).  When a chimeric
    junction file is given, only SAM reads whose name occurs in it are used.
    For every read whose CIGAR contains 'N', the first intron ('N1') defines
    the donor/acceptor pair.  Writes into the output folder:

    * ``norm_split_reads.csv`` - one row per read mapped with an intron;
    * ``norm_dict.json`` (or ``norm_dict.json.gz``) - read intervals grouped
      by junction (chrom, chain, donor, acceptor);
    * ``norm_junctions.csv`` - output of ``reads_to_junctions``.

    If no such read is found, the json file is still written but the two
    tables are skipped with a warning.
    """
    # Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="STAR output, Chimeric.out.junction.filtered \
                                OR list of such files")
    parser.add_argument("-s",
                        "--sam",
                        type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-gz",
                        "--gzip",
                        type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l",
        "--list",
        type=str,
        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf",
                        "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)
    norm_read_names_list = []

    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [x.strip('\n') for x in sam_names_file]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [x.strip('\n') for x in chim_names_file]
        else:
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        # None means "no filtering by chimeric read names" for this SAM file
        names_set = None
        if chim_name:
            names_set = set()  # only names with chimeric output
            with open(chim_name, 'r') as chim_file:
                for chim_line in chim_file:
                    fields = chim_line.strip('\n').split('\t')
                    # Read name is the 10th column; blank or short lines
                    # are skipped instead of raising IndexError.
                    if len(fields) > 9:
                        names_set.add(fields[9])

        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):
                    continue
                row = line.strip().split('\t')
                if len(row) <= 1:
                    continue
                read_name = row[0]
                if names_set is not None and read_name not in names_set:
                    continue
                sam_attrs = parse_sam_row(row)
                if not sam_attrs:
                    continue
                if 'N' not in sam_attrs['cigar']:  # read mapped without intron
                    continue
                read_dict = get_read_interval(cigar=sam_attrs['cigar'],
                                              leftpos=sam_attrs['leftpos'],
                                              output='dict')
                intron = read_dict['N1'][0]  # counts first N as intron
                if sam_attrs['chain'] == '+':
                    donor_ss = int(intron.inf - 1)
                    acceptor_ss = int(intron.sup + 1)
                elif sam_attrs['chain'] == '-':
                    donor_ss = int(intron.sup + 1)
                    acceptor_ss = int(intron.inf - 1)
                else:
                    # Without this branch the splice sites of the previous
                    # read would be reused (or NameError raised).
                    PTES_logger.warning(
                        'Unknown chain %s for read %s, skipping'
                        % (sam_attrs['chain'], read_name))
                    continue
                norm_junc_dict[(sam_attrs['chrom'], sam_attrs['chain'],
                                donor_ss, acceptor_ss)].update(
                                    {read_name: (read_dict,)})
                norm_read_names_list.append({
                    'read_name': read_name,
                    'chrom': sam_attrs['chrom'],
                    'chain': sam_attrs['chain'],
                    'donor': donor_ss,
                    'acceptor': acceptor_ss,
                    'id': tag
                })

    norm_read_df = None
    if norm_read_names_list:
        norm_read_df = pd.DataFrame(norm_read_names_list)[[
            'read_name',
            'chrom',
            'chain',
            'donor',
            'acceptor',
            'id',
        ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
            drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    else:
        # Selecting columns from an empty frame would raise KeyError
        PTES_logger.warning(
            'Creating norm split reads dataframe... empty dataframe')

    # Writing junc_dict
    PTES_logger.info('Writing intervals to json files...')
    # JSON object keys must be strings, so the tuple keys are stringified
    serializable_juncs = {str(k1): v1 for k1, v1 in norm_junc_dict.items()}
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'),
                           'w') as norm_json:
            norm_json.write(json.dumps(serializable_juncs).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'),
                  'w') as norm_json:
            json.dump(serializable_juncs, norm_json, indent=2)

    PTES_logger.info('Writing intervals to json files... done')

    # Writing junctions dataframe
    if norm_read_df is None:
        # NOTE(review): assumes reads_to_junctions cannot handle an empty,
        # column-less frame - confirm against its implementation.
        PTES_logger.warning('Empty dataframe')
        return
    PTES_logger.info('Creating junctions dataframe...')
    junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                      gtf_donors=gtf_donors,
                                      gtf_acceptors=gtf_acceptors)
    junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'),
                        sep='\t')
    PTES_logger.info('Creating junctions dataframe... done')