def _parse_tx_infos(self, gtf_path):
    """Parse transcript infos from GTF file or load from cache

    If the cache file ``_tx_cache.bin`` exists in the current working
    directory, its unpickled content is returned and ``gtf_path`` is not
    read at all.  Otherwise the gzip-compressed GTF file is scanned, one
    ``TranscriptInfo`` is built per ``transcript`` record and the list is
    written to the cache file.

    In case of successful loading from GTF, result will be cached.

    :param gtf_path: path to the gzip-compressed GTF file
    :return: list of ``TranscriptInfo`` objects (or whatever the cache
        file contains when it is used)
    """
    cache_path = '_tx_cache.bin'
    # NOTE(review): the cache is not keyed by gtf_path, so a cache created
    # from a different GTF file is returned as-is.  pickle must only be
    # used on trusted files; do not point this at a shared directory.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    result = []
    with gzip.open(gtf_path, 'rt') as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                print('processed {}'.format(i), file=sys.stderr)
            if line.startswith('#'):
                continue
            # Pre-filter on the third column so that non-transcript lines
            # are never fully parsed; blank or truncated lines have fewer
            # than three columns and are skipped instead of raising
            # IndexError.
            fields = line.split('\t', 3)
            if len(fields) < 3 or fields[2] != 'transcript':
                continue
            record = GTFFeature.parse(line)
            if record.feature != 'transcript':
                continue
            result.append(
                TranscriptInfo(record.attrs['gene_id'],
                               record.attrs['transcript_id'],
                               record.attrs['transcript_type'],
                               record.seqname,
                               record.start,
                               record.end))

    # Write to a temporary file first and move it into place afterwards, so
    # that a failed or interrupted dump never leaves a truncated cache file
    # that would break every later run.
    tmp_path = cache_path + '.tmp'
    with open(tmp_path, 'wb') as g:
        pickle.dump(result, g)
    os.replace(tmp_path, cache_path)

    print(len(result), file=sys.stderr)
    return result
def handle_match(self, linc_tx, match, nm):
    """Process a single genomic match of a lincRNA

    Every record returned by the tabix query for the matched region is
    inspected.  Exon records of protein-coding transcripts that lie on the
    strand opposite to the match are considered candidate lincRNA-to-coding
    gene interactions; for each of them one TSV line is printed to
    ``self.args.output_tsv``.

    :param linc_tx: the lincRNA transcript, passed through to OutputRecord
    :param match: alignment record; its ``flag``, ``reference_id``, ``pos``,
        ``reference_end``, ``query_sequence`` and ``cigar`` are read
    :param nm: converted with ``int()`` for the output record (presumably
        the NM edit distance of the match -- TODO confirm with caller)
    """
    cigar_ops = 'MIDNSHP=X'
    counted_ops = 'MI=X'

    def ali_length(cigar):
        # add up the lengths of all M, I, = and X operations
        total = 0
        for op, num in cigar:
            if cigar_ops[op] in counted_ops:
                total += num
        return total

    # flag bit 16 set means the match is reported on the minus strand
    if match.flag & 16:
        match_strand = '-'
    else:
        match_strand = '+'
    chrom = self.sam_file.getrname(match.reference_id)
    region = Region(chrom, match.pos, match.reference_end)

    try:
        for fields in self.tabix.query(*region.to_tuple()):
            exon = GTFFeature.parse(arr=fields)
            # keep only exons of protein-coding transcripts that are on the
            # other strand than the match
            is_candidate = (
                exon.feature == 'exon'
                and exon.attrs['transcript_type'] == 'protein_coding'
                and match_strand != exon.strand)
            if not is_candidate:
                continue

            # NOTE(review): the classification result is not used below;
            # confirm whether this call is still needed.
            self.classify_overlap(
                [region.start, region.end], [exon.start, exon.end])

            window_5 = exon.get_5_prime_window(region)
            window_3 = exon.get_3_prime_window(region)
            classes = self.compute_classes(exon, window_5, window_3)

            # assemble the output record and emit it as one TSV line
            query_length = len(match.query_sequence)
            aligned_length = ali_length(match.cigar)
            record = OutputRecord(
                region, exon, linc_tx, query_length, aligned_length,
                int(nm), window_5, window_3, classes)
            print(record.to_tsv(), file=self.args.output_tsv)
    except tabix.TabixError:
        pass  # swallow, probably some unplaced region