def read_coord(intervals, infos, read_name, cigar, leftpos, XI, XQ, chrom, flag):
    """Record one local alignment of a read.

    Appends chromosome and strand to ``infos[read_name][XI]`` and a
    ``(alignment_score, interval)`` pair to ``intervals[read_name][XI]``.

    :param intervals: mapping read_name -> XI -> list of (score, interval)
    :param infos: mapping read_name -> XI -> list of [chrom, chain, ...]
    :param read_name: read name from SAM
    :param cigar: CIGAR string of the alignment
    :param leftpos: leftmost mapping position
    :param XI: alignment index tag
    :param XQ: alignment score tag (string, converted to int)
    :param chrom: chromosome name
    :param flag: SAM bitwise flag; bit 16 set means reverse strand
    """
    score = int(XQ)
    strand = '+' if flag & 16 == 0 else '-'
    info = infos[read_name][XI]
    info.append(chrom)
    info.append(strand)
    # local alignment is always bound by M
    aligned = one_interval(get_read_interval(cigar, leftpos, output='interval'))
    intervals[read_name][XI].append((score, aligned))
def return_mate_tuple(line_dict, second_mates, chrom, chain):
    """
    Takes single line from Chimeric.out.junction file
    :param line_dict: dictionary of elements in single line, output of star_line_dict
    :param second_mates: list of reads with the same name from SAM file
    :param chrom: chromosome
    :param chain: chain
    :return: None or tuple of 3 read intervals and type of mapping
        (mate_inside/mate_outside)
    """
    chim_part1 = get_read_interval(
        cigar=line_dict['cigar1'],
        leftpos=line_dict['coord1'],
    )
    chim_part2 = get_read_interval(
        cigar=line_dict['cigar2'],
        leftpos=line_dict['coord2'],
    )
    chimeric_cigars = (line_dict['cigar1'], line_dict['cigar2'])
    for mate in second_mates:
        # skip the chimeric parts themselves
        if mate['cigar'] in chimeric_cigars:
            continue
        if mate['NH'] > 1:
            # check if mapping to this chromosome is unique
            hits_on_chrom = sum(
                1 for mapping in second_mates if mapping['chrom'] == chrom
            )
            if hits_on_chrom > 1:
                continue
        if mate['chrom'] == chrom and mate['chain'] != chain:
            mate2_dict = get_read_interval(cigar=mate['cigar'],
                                           leftpos=mate['leftpos'])
            interval_intersection = mate_intersection(chim_part1=chim_part1,
                                                      chim_part2=chim_part2,
                                                      read_dict2=mate2_dict)
            return chim_part1, chim_part2, mate2_dict, interval_intersection
    return None
def test_get_read_interval(self):
    """CIGAR strings with a fixed left position must translate into the
    expected OrderedDicts of feature name -> genomic interval."""
    start = 100
    cases = [
        ('20M30S',
         OrderedDict([('M1', interval([100.0, 119.0]))])),
        ('30S20M',
         OrderedDict([('M1', interval([100.0, 119.0]))])),
        ('30S20M30S',
         OrderedDict([('M1', interval([100.0, 119.0]))])),
        ('20M2I20M',
         OrderedDict([('M1', interval([100.0, 119.0])),
                      ('M2', interval([120.0, 139.0]))])),
        ('20M2D20M',
         OrderedDict([('M1', interval([100.0, 119.0])),
                      ('D1', interval([120.0, 121.0])),
                      ('M2', interval([122.0, 141.0]))])),
        ('20M32N20M',
         OrderedDict([('M1', interval([100.0, 119.0])),
                      ('N1', interval([120.0, 151.0])),
                      ('M2', interval([152.0, 171.0]))])),
        ('20S20M32N20M',
         OrderedDict([('M1', interval([100.0, 119.0])),
                      ('N1', interval([120.0, 151.0])),
                      ('M2', interval([152.0, 171.0]))])),
        ('20S20M32N20M1000p76M',
         OrderedDict([('M1', interval([100.0, 119.0])),
                      ('N1', interval([120.0, 151.0])),
                      ('M2', interval([152.0, 171.0])),
                      ('p1', interval([172.0, 1171.0])),
                      ('M3', interval([1172.0, 1247.0]))])),
    ]
    results = [
        ptes.get_read_interval(cigar=cigar, leftpos=start, output='dict')
        for cigar, _ in cases
    ]
    for (_, expected), actual in zip(cases, results):
        self.assertEqual(expected, actual)
def main():
    """CLI entry point: parse STAR ``Chimeric.out.junction`` output (single
    file or list mode), collect chimeric-junction reads, and write a reads
    table, a junctions table, and a JSON (or .json.gz) dump of read intervals.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)
    skipped = {'non-filtered': 0,  # different chromosomes and/or chains
               'chrM': 0,          # mapping to chrM
               'PE': 0,            # junction between the mates, -1 in STAR output
               'non-chim': 0}      # STAR counts very long (>1Mb) junctions as chimeric
    junc_dict = defaultdict(dict)

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        pairs = [(args.input, args.tag)]

    all_reads_df = None  # FIX: was unbound when the only input yielded an empty dataframe
    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        n_rows = 0  # FIX: enumerate index was off by one and unbound for empty files
        PTES_logger.info('Input file: %s ' % chim_name)
        PTES_logger.info('Reading STAR output...')
        with open(chim_name, 'r') as input_file:
            for line in input_file:
                n_rows += 1
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                # keep only junctions where both parts share chromosome and chain
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                # discard very distant junctions and "normal" splice orientation
                if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                        or chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss'] \
                        or chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']:
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'],
                                               leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'],
                                               leftpos=line_dict['coord2'])
                junc_dict[
                    (chrom, chain, line_dict['donor_ss'], line_dict['acceptor_ss'])
                ].update({read_name: (chim_part1, chim_part2)})
                annot_donor = 0
                annot_acceptor = 0
                if line_dict['donor_ss'] in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1
                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                }
                read_names_list.append(read_attrs)
        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % n_rows)
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            # reorder columns; raises KeyError on an empty dataframe
            reads_df = reads_df[
                ['read_name', 'chrom', 'chain', 'donor', 'acceptor',
                 'annot_donor', 'annot_acceptor', 'letters_ss', 'chim_dist']
            ].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(drop=True)
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        # FIX: pd.concat raises ValueError when given an empty list
        if chim_reads_df_list:
            all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)

    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()}, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
if junction_type == '1': junction_letters = 'GT/AG' elif junction_type == '2': junction_letters = 'CT/AC' else: junction_letters = 'unknown' read_name = line_list[9] coord1 = int(line_list[10]) cigar1 = line_list[11] coord2 = int(line_list[12]) cigar2 = line_list[13] track_lists = [] windows_min = [] windows_max = [] if 'p' in cigar1: mate1 = get_read_interval(cigar1.split('p')[0].rstrip('-0123456789'), coord1) mate2 = get_read_interval(*split_cigar(cigar1, coord1)) chim_part2 = get_read_interval(cigar2, coord2) bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r') bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r') bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate2', color='r') track_lists = [bed1, bed2, bed3] elif 'p' in cigar2: mate1 = get_read_interval(cigar2.split('p')[0].rstrip('-0123456789'), coord2) mate2 = get_read_interval(*split_cigar(cigar2, coord2)) chim_part2 = get_read_interval(cigar1, coord1) bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r') bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r') bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate1', color='r') track_lists = [bed1, bed2, bed3] else: #single-read mode
def main():
    """CLI entry point: scan filtered STAR SAM output for normal (non-chimeric)
    split reads containing introns ('N' in CIGAR), optionally restricted to
    read names present in a Chimeric.out.junction file, and write a reads
    table, a junctions table, and a JSON (or .json.gz) dump of read intervals.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR output, Chimeric.out.junction.filtered OR list of such files")
    parser.add_argument("-s", "--sam", type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l", "--list", type=str,
        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t", "--tag", type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)
    norm_read_names_list = []

    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [x.strip('\n') for x in sam_names_file.readlines()]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        else:
            # no chimeric filter: pair every SAM file with None
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        if chim_name:
            # column 9 of Chimeric.out.junction is the read name
            with open(chim_name, 'r') as chim_file:
                names_list = [x.strip('\n').split('\t')[9] for x in chim_file.readlines()]
            names_set = set(names_list)  # only names with chimeric output
        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):  # SAM header
                    continue
                row = line.strip().split('\t')
                sam_attrs = None
                if len(row) > 1:
                    read_name = row[0]
                    if chim_name:
                        if read_name in names_set:
                            sam_attrs = parse_sam_row(row)
                    else:
                        sam_attrs = parse_sam_row(row)
                if sam_attrs and 'N' in sam_attrs['cigar']:  # read mapped with intron
                    read_dict = get_read_interval(
                        cigar=sam_attrs['cigar'],
                        leftpos=sam_attrs['leftpos'],
                        output='dict')
                    # NOTE(review): only the first intron (N1) is used even if
                    # the read spans several introns — confirm this is intended
                    if sam_attrs['chain'] == '+':
                        donor_ss = int(read_dict['N1'][0].inf - 1)  # first N counts as intron
                        acceptor_ss = int(read_dict['N1'][0].sup + 1)
                    elif sam_attrs['chain'] == '-':
                        donor_ss = int(read_dict['N1'][0].sup + 1)
                        acceptor_ss = int(read_dict['N1'][0].inf - 1)
                    else:
                        # FIX: unknown chain previously left donor_ss/acceptor_ss
                        # unbound and crashed with NameError below
                        continue
                    norm_junc_dict[
                        (sam_attrs['chrom'], sam_attrs['chain'], donor_ss, acceptor_ss)
                    ].update({read_name: tuple([read_dict])})
                    norm_read_names_list.append({
                        'read_name': read_name,
                        'chrom': sam_attrs['chrom'],
                        'chain': sam_attrs['chain'],
                        'donor': donor_ss,
                        'acceptor': acceptor_ss,
                        'id': tag,
                    })

    norm_read_df = None  # FIX: was unbound when no split reads were found
    try:
        norm_read_df = pd.DataFrame(norm_read_names_list)
        # column selection raises KeyError on an empty dataframe
        norm_read_df = norm_read_df[[
            'read_name', 'chrom', 'chain', 'donor', 'acceptor', 'id',
        ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'), sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    except KeyError:
        norm_read_df = None  # discard the half-built empty frame
        PTES_logger.warning('Creating norm split reads dataframe... empty dataframe')

    # Writing junc_dict
    PTES_logger.info('Writing intervals to json files...')
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'), 'w') as norm_json:
            norm_json.write(
                json.dumps({str(k1): v1 for k1, v1 in norm_junc_dict.items()}).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'), 'w') as norm_json:
            json.dump({str(k1): v1 for k1, v1 in norm_junc_dict.items()}, norm_json, indent=2)
    PTES_logger.info('Writing intervals to json files... done')

    # Writing junctions dataframe
    # FIX: previously ran unconditionally and crashed with NameError when the
    # reads dataframe could not be created
    if norm_read_df is not None:
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                          gtf_donors=gtf_donors,
                                          gtf_acceptors=gtf_acceptors)
        junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Creating junctions dataframe... skipped: no split reads')