Example #1
File: tabix_test.py Project: humburg/pysam
 def testDisjointIterators(self):
     # two iterators working on the same file
     tabix = pysam.TabixFile(self.filename)
     a = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
     b = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
     # both iterators are at top of file
     self.assertEqual(str(a), str(b))
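
Note: the .next() calls above are Python 2 idiom. A minimal Python 3 sketch of the same check, assuming a tabix-indexed GTF at the hypothetical path "example.gtf.gz":

import pysam

with pysam.TabixFile("example.gtf.gz") as tabix:
    # each fetch() with multiple_iterators=True gets its own file handle
    a = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
    b = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
    assert str(a) == str(b)  # both independent iterators start at the top of the file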
Example #3
def getGene(chr, start, end, strand):
    tmp = {'chr': chr, 'start': start, 'end': end, 'strand': strand}
    geneList1 = []
    geneList2 = []
    for gtf in tabixfile.fetch(tmp['chr'],
                               tmp['start'] - 1,
                               tmp['start'],
                               parser=pysam.asGTF()):
        if gtf.feature == 'gene':
            gn = gtf.gene_name
            if gtf.strand == tmp['strand'] or tmp['strand'] == 'U':
                geneList1.append(gn)
            else:
                if gn[-4:] == '-AS1':
                    geneList1.append(gn[0:(len(gn) - 4)])
                else:
                    geneList1.append(gn + '-AS1')
    for gtf in tabixfile.fetch(tmp['chr'],
                               tmp['end'] - 1,
                               tmp['end'],
                               parser=pysam.asGTF()):
        if gtf.feature == 'gene':
            gn = gtf.gene_name
            if gtf.strand == tmp['strand'] or tmp['strand'] == 'U':
                geneList2.append(gn)
            else:
                if gn[-4:] == '-AS1':
                    geneList2.append(gn[0:(len(gn) - 4)])
                else:
                    geneList2.append(gn + '-AS1')
    geneCom = list(set(geneList1) & set(geneList2))
    if len(geneCom) == 0:
        for gtf in tabixfile.fetch(tmp['chr'],
                                   tmp['start'] - 1,
                                   tmp['end'] - 1,
                                   parser=pysam.asGTF()):
            if gtf.feature == 'gene':
                gn = gtf.gene_name
                if gtf.strand == tmp['strand'] or tmp['strand'] == 'U':
                    geneCom.append(gn)
                else:
                    if gn[-4:] == '-AS1':
                        geneCom.append(gn[0:(len(gn) - 4)])
                    else:
                        geneCom.append(gn + '-AS1')
    geneCom = list(set(geneCom))
    noAS = []
    if len(geneCom) > 1:
        for j in geneCom:
            if j[-4:] != '-AS1':
                noAS.append(j)
        if len(noAS) > 0:
            geneCom = noAS
    return (';'.join(geneCom))
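
A note on coordinates: pysam's fetch() takes 0-based, half-open intervals, which is why getGene() above queries (start - 1, start) to cover a single 1-based position. A minimal sketch with hypothetical values:

pos = 11869  # a 1-based genomic coordinate
for gtf in tabixfile.fetch('chr1', pos - 1, pos, parser=pysam.asGTF()):
    print(gtf.feature, gtf.gene_name)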
Example #4
    def testJoinedIterators(self):

        # two iterators working on the same file
        tabix = pysam.TabixFile(self.filename)
        a = tabix.fetch(parser=pysam.asGTF()).next()
        b = tabix.fetch(parser=pysam.asGTF()).next()
        # the first two lines differ only by the feature field
        self.assertEqual(a.feature, "UTR")
        self.assertEqual(b.feature, "exon")
        self.assertEqual(re.sub("UTR", "", str(a)), re.sub("exon", "", str(b)))
Example #6
File: gtf.py Project: soh-i/Ivy
 def subset_of_feature_in_region(self, contig=None, start=None, end=None, types=None):
     '''
     Example:
      # return dict of rna and tss id
      for rna_tss in tabix.subset_of_feature_in_region(contig="chr2L", end=9839,
                                                 types=["transcript_id", 'tss_id']):
          print rna_tss
     '''
     
     for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end,
                                      parser=pysam.asGTF()):
         if isinstance(types, str):
             try:
                 yield gtf.asDict()[types]
             except KeyError:
                 print 'key \'{0}\' is not found in {1}'.format(types, self.ingtf)
                 
         elif isinstance(types, list):
             tmp = dict()
             for t in types:
                 try:
                     tmp.update({t: gtf.asDict()[t]})
                 except KeyError:
                     print 'key \'{0}\' is not found in {1}'.format(t, self.ingtf)
             yield tmp
Example #7
File: gtf.py Project: soh-i/Ivy
 def gene_in_region(self, contig=None, start=None, end=None):
     for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end,
                                      parser=pysam.asGTF()):
         try:
             yield gtf.asDict()['gene_name']
         except KeyError:
             print 'key \'{0}\' is not found in {1}'.format('gene_name', self.ingtf)
Example #8
def coverage(bam_paths,
             gtf_path,
             transcript_ids=None,
             verbose=False,
             agg_func=None):

    # Setup record iterator from gtf file.
    gtf_file = pysam.Tabixfile(gtf_path, parser=pysam.asGTF())
    gtf_records = (rec for rec in gtf_file.fetch() if rec.feature == 'exon')

    if transcript_ids is not None:
        transcript_ids = set(transcript_ids)
        gtf_records = (rec for rec in gtf_records
                       if rec['transcript_id'] in transcript_ids)

    if verbose:
        gtf_records = tqdm(gtf_records, leave=False)

    # Build frame.
    rows = _coverage_gen(bam_paths, gtf_records, agg_func=agg_func)
    index_names = ['transcript_id', 'chr', 'start', 'end', 'strand']

    result = pd.DataFrame.from_records(rows,
                                       columns=index_names + list(bam_paths))
    result = result.set_index(index_names)

    return result
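
A hypothetical invocation of coverage() above (the paths and transcript id are placeholders):

cov = coverage(['sample1.bam', 'sample2.bam'], 'genes.gtf.gz',
               transcript_ids=['ENST00000000233'], verbose=True)
print(cov.head())  # rows indexed by (transcript_id, chr, start, end, strand)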
Example #9
def read_gtf(lines, scaffolds, contig_prefix):
    table = {}  # gene_id -> transcript_id -> exon_number -> feature -> [items]
    for gtf in text.parse_lines(lines, pysam.asGTF()):
        if not filter_gtf_record(gtf):
            update_gtf_table(table, gtf, scaffolds, contig_prefix)

    return table
Example #10
def _TestMultipleIteratorsHelper(filename, multiple_iterators):
    """open file within scope, return iterator."""

    tabix = pysam.TabixFile(filename)
    iterator = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=multiple_iterators)
    tabix.close()
    return iterator
Example #12
    def testCopy(self):
        a = self.tabix.fetch(parser=pysam.asTuple()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)

        a = self.tabix.fetch(parser=pysam.asGTF()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)
Example #13
def getgenenames(coords, gtf):
    eventstogenes = {}  # {event : gene}
    tabixfile = pysam.Tabixfile(gtf)
    for event in coords:
        isoform1genes = []
        isoform2genes = []
        isoform1chrm = coords[event][0][0]
        isoform1start = coords[event][0][1]
        isoform1end = coords[event][0][2]
        isoform1strand = coords[event][0][3]
        isoform2chrm = coords[event][1][0]
        isoform2start = coords[event][1][1]
        isoform2end = coords[event][1][2]
        isoform2strand = coords[event][1][3]
        # pysam's fetch() takes no strand argument, so the strand is not passed here
        for entry in tabixfile.fetch(isoform1chrm,
                                     isoform1start,
                                     isoform1end,
                                     parser=pysam.asGTF()):
            isoform1genes.append(entry.gene_name)
        for entry in tabixfile.fetch(isoform2chrm,
                                     isoform2start,
                                     isoform2end,
                                     parser=pysam.asGTF()):
            isoform2genes.append(entry.gene_name)

        #Collapse all duplicates
        isoform1genes = list(set(isoform1genes))
        isoform2genes = list(set(isoform2genes))
        #Get genes that overlap both isoforms
        isoformintersection = set(isoform1genes).intersection(
            set(isoform2genes))

        if len(isoformintersection) > 1:
            print 'WARNING: more than one gene found for event {0}.'.format(
                event)
            print event, list(isoformintersection)
        elif len(isoformintersection) == 1:
            eventstogenes[event] = list(isoformintersection)[0]
        elif len(isoformintersection) == 0:
            print 'No gene found for event {0}!!'.format(event)

    print 'Found genes for {0} of {1} events.'.format(len(eventstogenes),
                                                      len(coords))
    return eventstogenes
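
pysam's fetch() only accepts reference, start and end (plus region/parser keywords), which is why the strand arguments were dropped above. If strand filtering was intended, it can be applied per record instead, e.g. (a sketch):

isoform1genes = [entry.gene_name
                 for entry in tabixfile.fetch(isoform1chrm, isoform1start,
                                              isoform1end, parser=pysam.asGTF())
                 if entry.strand == isoform1strand]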
Example #16
File: _tabix.py Project: zqfang/geneviz
    def __init__(self, filename, **kwargs):
        file_path = str(filename)
        if not file_path.endswith('.gz'):
            if os.path.exists(file_path + '.gz'):
                file_path += '.gz'
            else:
                file_path = self.compress(file_path, create_index=True)

        super().__init__(file_path, parser=pysam.asGTF(), **kwargs)
Example #17
def get_part_from_gtf(annotation, reference=None, feature="CDS"):
    """Return all records of a given feature type from a GTF annotation.

    annotation: path to a bgzip-compressed, tabix-indexed GTF file,
                e.g. "0-References/genome.gtf.gz"
    reference:  None, or a chromosome name in the GTF file, e.g. 'I', 'XII' or 'chr1'
    feature:    'CDS', 'ORF', or any valid feature in the GTF
    """

    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    return [gtf for gtf in tabixfile.fetch(reference=reference) if (gtf.feature == feature)]
Example #18
File: tabix.py Project: jrderuiter/ngs-tk
    def __init__(self, file_path):
        file_path = str(file_path)
        if not file_path.endswith('.gz'):
            if os.path.exists(file_path + '.gz'):
                file_path += '.gz'
            else:
                file_path = self.compress(file_path)

        super().__init__(file_path, parser=pysam.asGTF())
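
The compress() helper is not shown in these wrapper classes. A minimal sketch of what it might do, using pysam's bgzip/tabix utilities (the "gff" preset also covers GTF):

import pysam

def compress(path, create_index=False):
    gz_path = path + '.gz'
    pysam.tabix_compress(path, gz_path, force=True)  # bgzip, not plain gzip
    if create_index:
        pysam.tabix_index(gz_path, preset='gff', force=True)  # writes the .tbi index
    return gz_path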
Example #19
def filterOut(chr, exfeat, pos):
    if len(exfeat) == 0: return 0
    if fo and chr not in contigo: return 0
    elif not fo: return 0
    res = [(kk.feature).lower() for kk in tabixo.fetch(
        reference=chr, start=pos, end=pos + 1, parser=pysam.asGTF())]
    for i in exfeat:
        if i in res: return 1
    return 0
Example #20
    def _open_file(self):  # type: (...) -> pysam.TabixFile
        # Open gtf file.
        gtf_file = pysam.TabixFile(
            native_str(self._gtf_path), parser=pysam.asGTF())

        # Yield file object and ensure it is closed.
        try:
            yield gtf_file
        finally:
            gtf_file.close()
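
The yield/finally pattern above only behaves as a context manager once decorated, presumably with contextlib.contextmanager. A self-contained sketch:

import contextlib
import pysam

@contextlib.contextmanager
def open_gtf(gtf_path):
    gtf_file = pysam.TabixFile(gtf_path, parser=pysam.asGTF())
    try:
        yield gtf_file    # hand the open file to the with-block
    finally:
        gtf_file.close()  # always closed, even on error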
Example #21
File: tabix_test.py Project: humburg/pysam
    def testRead(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
            c = self.compare[x]
            self.assertEqual(len(c), len(r))
            self.assertEqual(list(c), list(r))
            self.assertEqual(c, str(r).split("\t"))
            self.assertTrue(r.gene_id.startswith("ENSG"))
            if r.feature != 'gene':
                self.assertTrue(r.transcript_id.startswith("ENST"))
            self.assertEqual(c[0], r.contig)
Example #23
File: data.py Project: weallen/Pyseqwill
def load_kg_gtf(gtf_file_name):
    f = pysam.TabixFile(gtf_file_name)
    gtf = f.fetch(parser=pysam.asGTF())
    feats = []
    for row in gtf:
        attr = parse_gtf_attr(row.attributes)
        # use row.contig (pysam's field name) and qualify score/strand/frame,
        # which were undefined bare names in the original snippet
        currfeat = GFFFeature(row.contig, row.source, row.feature,
                              int(row.start), int(row.end), row.score,
                              row.strand, row.frame, attr)
        feats.append(currfeat)
    return feats
Example #24
def read_transcripts(gtf_file, region, genome=None, retry=0):
    u"""
    Read transcripts from tabix indexed gtf files

    The original function check if the junctions corresponding to any exists exons, I disable this here

    :param gtf_file: path to bgzip gtf files (with tabix index), only ordered exons in this gtf file
    :param region: splice region
    :param retry: if the gtf chromosome and input chromosome does not match. eg: chr9:1-100:+ <-> 9:1-100:+
    :param genome: path to genome fasta file
    :return: SpliceRegion
    """
    if not os.path.exists(gtf_file):
        raise FileNotFoundError("%s not found" % gtf_file)

    try:
        logger.info("Reading from %s" % gtf_file)

        if genome:
            with pysam.FastaFile(genome) as fa:
                region.sequence = fa.fetch(region.chromosome, region.start - 1,
                                           region.end + 1)

        with pysam.Tabixfile(gtf_file, 'r') as gtf_tabix:
            relevant_exons_iterator = gtf_tabix.fetch(region.chromosome,
                                                      region.start - 1,
                                                      region.end + 1,
                                                      parser=pysam.asGTF())

            # min_exon_start, max_exon_end, exons_list = float("inf"), float("-inf"),  []
            for line in relevant_exons_iterator:
                try:
                    region.add_gtf(line)
                except IndexError as err:
                    logger.error(err)

    except ValueError as err:
        logger.warn(err)

        # handle the mismatch of chromosomes here
        if retry < 2:
            if not region.chromosome.startswith("chr"):
                logger.info("Guess need 'chr'")
                region.chromosome = "chr" + region.chromosome
            else:
                logger.info("Guess 'chr' is redundant")
                region.chromosome = region.chromosome.replace("chr", "")

            return read_transcripts(gtf_file=gtf_file,
                                    region=region,
                                    retry=retry + 1)

    return region
Example #25
    def testSetting(self):

        for r in self.tabix.fetch(parser=pysam.asGTF()):
            r.contig = r.contig + "_test"          
            r.source = r.source + "_test"
            r.feature = r.feature + "_test"
            r.start += 10
            r.end += 10
            r.score = 20
            r.strand = "+"
            r.frame = 0
            r.attributes = 'gene_id "0001";'
Example #27
File: tabix.py Project: rajithbt/pyim
    def __init__(self, file_path):
        file_path = str(file_path)

        if not os.path.exists(file_path):
            raise IOError('File does not exist ({})'.format(file_path))

        if not file_path.endswith('.gz'):
            if os.path.exists(file_path + '.gz'):
                file_path += '.gz'
            else:
                file_path = self.compress(file_path)

        super().__init__(file_path, parser=pysam.asGTF())
Example #28
File: gtf.py Project: soh-i/Ivy
 def fetch_gtf(self, contig=None, start=None, end=None):
     '''
      Yields:
       pysam GTF records
     
     Example:
      # return dict of each GTF line
      for _ in tabix.fetch_gtf(contig="chr2L", end=9839):
          print _.asDict()
     '''
     
     for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end,
                                      parser=pysam.asGTF()):
         yield gtf
Example #29
File: util.py Project: sofiaff/imfusion
    def from_gtf(
        cls,
        gtf_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        record_filter=None  # type: Callable[[Any], bool]
    ):  # type: (...) -> TranscriptReference
        """Builds an Reference instance from the given GTF file."""

        # Open gtf file.
        gtf = pysam.TabixFile(native_str(gtf_path), parser=pysam.asGTF())

        if chromosomes is None:
            chromosomes = gtf.contigs

        # Build the trees.
        transcript_trees = {}
        exon_trees = {}

        for chrom in chromosomes:
            # Collect exons and transcripts.
            transcripts = []
            exons = []

            records = gtf.fetch(reference=chrom)

            if record_filter is not None:
                records = (rec for rec in records if record_filter(rec))

            for record in records:
                if record.feature == 'transcript':
                    transcripts.append(cls._record_to_transcript(record))
                elif record.feature == 'exon':
                    exons.append(cls._record_to_exon(record))

            # Build transcript lookup tree.
            transcript_trees[chrom] = IntervalTree.from_tuples(
                (tr.start, tr.end, tr) for tr in transcripts)

            # Build exon lookup tree.
            keyfunc = lambda rec: rec.transcript_id

            exons = sorted(exons, key=keyfunc)
            grouped = itertools.groupby(exons, key=keyfunc)

            for tr_id, grp in grouped:
                exon_trees[tr_id] = IntervalTree.from_tuples(
                    (exon.start, exon.end, exon) for exon in grp)

        return cls(transcript_trees, exon_trees)
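
A hypothetical lookup against the trees built above, assuming the intervaltree package (whose Interval objects unpack as (begin, end, data)):

for begin, end, transcript in transcript_trees['1'].overlap(1000, 2000):
    print(transcript)  # transcripts overlapping 1000-2000 on chromosome '1'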
Example #31
def overlap_annotation(junc, anno):
    ts_set = anno.fetch(junc.chrom, junc.start, junc.end, parser=pysam.asGTF())
    exon_set = [x for x in ts_set if x.feature == 'exon']
    for idx, exon in enumerate(exon_set):
        if idx == len(exon_set) - 1:
            continue
        # (loop body truncated in the original example)
Example #32
    def dfgene(self):
        if 'dfgene' not in self._data:
            xgene = {}
            with pysam.TabixFile(self.gtffile) as tbx:  # pylint: disable=maybe-no-member
                for gtf in tbx.fetch(parser=pysam.asGTF()):  # pylint: disable=maybe-no-member
                    if gtf.feature == 'gene':
                        xgene[gtf.gene_id] = (gtf.contig, gtf.start, gtf.end,
                                              gtf.strand, gtf.gene_id)
            self._data['dfgene'] = pd.DataFrame(
                np.array(list(xgene.values()),
                         dtype=[('Chrom', 'S10'), ('Start', 'i4'),
                                ('End', 'i4'), ('Strand', 'S1'),
                                ('GeneID', 'S30')]))

        return self._data['dfgene']
Example #33
def annotate_inv(genes, chrom, start, end, strand):
    feats = set()
    parents = set()
    antisense_feats = set()
    antisense_parents = set()
    overlapping_feats = list(
        genes.fetch(chrom, start, end, parser=pysam.asGTF()))
    if not len(overlapping_feats):
        return None
    else:
        for record in overlapping_feats:
            if record.strand == strand:
                feats.add(record.feature)
                try:
                    parents.add(
                        re.search(r'Parent=(.*?)[;:\.]',
                                  record.attributes).group(1))
                except AttributeError:
                    parents.add(
                        re.search(r'ID=(.*?)[;:\.]',
                                  record.attributes).group(1))
            else:
                antisense_feats.add(record.feature)
                try:
                    antisense_parents.add(
                        re.search(r'Parent=(.*?)[;:\.]',
                                  record.attributes).group(1))
                except AttributeError:
                    antisense_parents.add(
                        re.search(r'ID=(.*?)[;:\.]',
                                  record.attributes).group(1))
    if not len(feats) and len(antisense_feats):
        feats = 'antisense'
        parents = antisense_parents
    else:
        feats = feats.difference({'gene', 'exon', 'protein', 'mRNA'})
        if not feats:
            feats = 'intron'
        else:
            for ftype in PRIORITY:
                if ftype in feats:
                    feats = ftype
                    break
            else:
                feats = '|'.join(feats)
    return [feats, '|'.join(parents)]
Example #34
File: gtf.py Project: soh-i/Ivy
 def strand_info(self, contig=None, start=None, end=None):
     """
     Args:
      contig(str)='', start(int)='', end=''
     
     Returns:
      strand information [+-], or [.] is 404
     """
     
     found = "."
     for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end,
                                      parser=pysam.asGTF()):
         if gtf.strand:
             found = gtf.strand
             break
         else:
             continue
     return found
Example #35
    def testSetting(self):

        for r in self.tabix.fetch(parser=pysam.asGTF()):
            r.contig = r.contig + "_test_contig"
            r.source = r.source + "_test_source"
            r.feature = r.feature + "_test_feature"
            r.start += 10
            r.end += 10
            r.score = 20
            r.strand = "+"
            r.frame = 0
            r.attributes = 'gene_id "0001";'
            r.transcript_id = "0002"
            sr = str(r)
            self.assertTrue("_test_contig" in sr)
            self.assertTrue("_test_source" in sr)
            self.assertTrue("_test_feature" in sr)
            self.assertTrue("gene_id \"0001\"" in sr)
            self.assertTrue("transcript_id \"0002\"" in sr)
Example #36
def main():
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('--frac', type=float, default=0.0)
    parser.add_argument('gtf_file')
    args = parser.parse_args()

    all_t_ids = set()
    t_ids = set()
    for f in pysam.tabix_iterator(open(args.gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            frac = float(f.frac)
            keep = (frac >= args.frac)
            all_t_ids.add(t_id)
            if keep:
                t_ids.add(t_id)
                print str(f)
        elif f.feature == 'exon':
            t_id = f.transcript_id
            assert t_id in all_t_ids
            if t_id in t_ids:
                print str(f)
Example #38
File: annoFL.py Project: yangence/circfull
def getCircType(tmp):
    exon_start=[int(i) for i in tmp['exon_start'].split(',')]
    exon_end=[int(i) for i in tmp['exon_end'].split(',')]

    canGene_full=[] # all in the gene with same sense
    canGene_part=[] # only a part in the gene with same sense
    canGene_anti=[] # antisense
    for gtf in tabixfile.fetch(tmp['chr'], tmp['start']-1, tmp['end']-1,parser=pysam.asGTF()):
        if gtf.feature=='gene':
            if gtf.strand==tmp['strand']:
                if (gtf.start<=tmp['start']) and (gtf.end>=tmp['end']):
                    canGene_full.append(gtf.gene_id)
                else:
                    canGene_part.append(gtf.gene_id)
            else:
                canGene_anti.append(gtf.gene_id)                
    start_type=''
    end_type=''
    circ_type=''
    geneName=''
    if len(canGene_full)>0:
        circ_type='full'
        exon_score=-1
        inexon_score=-1
        geneName=''
        for i in canGene_full:
            geneExon=gene2exon_dict[i]
            start_type_tmp,end_type_tmp,exon_score_tmp,inexon_score_tmp=compareExon2gene(geneExon,exon_start,exon_end)
            if exon_score_tmp>exon_score:
                start_type=start_type_tmp
                end_type=end_type_tmp
                exon_score=exon_score_tmp
                inexon_score=inexon_score_tmp
                geneName=gene2class_dict[i]
            elif exon_score_tmp==exon_score:
                if inexon_score_tmp>inexon_score:
                    start_type=start_type_tmp
                    end_type=end_type_tmp
                    inexon_score=inexon_score_tmp
                    geneName=gene2class_dict[i]
            else:
                continue
    elif len(canGene_part)>0:
        if len(canGene_part)>1:
            min_start,max_end,geneName,geneID_minmax=getMinMax(canGene_part)
            if (min_start<tmp['start']) and (max_end>=tmp['end']):
                circ_type='read through'
                geneExon_1=gene2exon_dict[geneID_minmax[0]]
                start_type_tmp_1,end_type_tmp_1,exon_score_tmp_1,inexon_score_tmp1=comparePartExon2gene(geneExon_1,exon_start,exon_end)   
                geneExon_2=gene2exon_dict[geneID_minmax[1]]
                start_type_tmp_2,end_type_tmp_2,exon_score_tmp_2,inexon_score_tmp2=comparePartExon2gene(geneExon_2,exon_start,exon_end)   
                return(combin2type(start_type_tmp_1,start_type_tmp_2),combin2type(end_type_tmp_1,end_type_tmp_2),circ_type,geneName)
        circ_type='part'
        exon_score=-1
        inexon_score=-1
        geneName=''
        for i in canGene_part:
            geneExon=gene2exon_dict[i]
            start_type_tmp,end_type_tmp,exon_score_tmp,inexon_score_tmp=comparePartExon2gene(geneExon,exon_start,exon_end)
            if exon_score_tmp>exon_score:
                start_type=start_type_tmp
                end_type=end_type_tmp
                exon_score=exon_score_tmp
                inexon_score=inexon_score_tmp
                geneName=gene2class_dict[i]
            elif exon_score_tmp==exon_score:
                if inexon_score_tmp>inexon_score:
                    start_type=start_type_tmp
                    end_type=end_type_tmp
                    inexon_score=inexon_score_tmp
                    geneName=gene2class_dict[i]
            else:
                continue  
    elif len(canGene_anti)>0:
        circ_type='antisense'
        geneName=';'.join([gene2class_dict[i]+'-AS1' for i in canGene_anti])
        start_type=','.join(['a']* len(exon_start))
        end_type=','.join(['a']* len(exon_end))
    else:
        circ_type='intergenic'
        start_type=','.join(['intergenic']* len(exon_start))
        end_type=','.join(['intergenic']* len(exon_end))
    return(start_type,end_type,circ_type,geneName)
Example #39
def create_pysam_tabix(path):
    if path is not None and os.path.exists(path):
        return pysam.Tabixfile(path, parser=pysam.asGTF())
    return None
Example #40
def get_part_from_gtf(annotation, reference=None, feature="CDS"):
    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    return [
        gtf for gtf in tabixfile.fetch(reference=reference)
        if (gtf.feature == feature)
    ]
Example #41
File: GTF.py Project: prasoonnema/cgat
def readFromFile(infile):
    """read records from file and return as list."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
Example #42
    def testGTF(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
            self.assertEqual("\t".join(self.compare[x]), str(r))
Example #43
                               args.max_dist_ann,
                               args.min_len_tail_contig,
                               args.min_num_tail_reads,
                               args.min_num_bridge_reads,
                               args.min_bridge_read_tail_len,
                               has_pas = args.has_polyadenylation_signal)
print 'Loading KLEAT {}/{} ...DONE\r'.format(N,N)

# Group KLEAT data
sprint('Grouping kleat data ...')
kleats = Kleat.groupKleat(kleats)
print 'DONE'

# Parse ensembl
sprint('Loading ensembl annotation ...')
ensembl = pysam.TabixFile(args.ensembl, parser=pysam.asGTF())
print 'DONE'

# Parse aceview
sprint('Loading aceview annotation ...')
aceview = pysam.TabixFile(args.aceview, parser=pysam.asGTF())
print 'DONE'

# Parse refseq
sprint('Loading refseq annotation ...')
refseq = pysam.TabixFile(args.refseq, parser=pysam.asGTF())
print 'DONE'

# Parse ucsc
sprint('Loading ucsc annotation ...')
ucsc = pysam.TabixFile(args.ucsc, parser=pysam.asGTF())
Example #44
def getStrand(x):
    exon_leftSeq_first = x['exon_leftSeq_first'].split(',')[1:]
    exon_rightSeq_first = x['exon_rightSeq_first'].split(',')[:-1]
    exon_leftSeq_second = x['exon_leftSeq_second'].split(',')[1:]
    exon_rightSeq_second = x['exon_rightSeq_second'].split(',')[:-1]
    exon_motif_left = [
        x['exon_leftSeq_first'].split(',')[0] +
        x['exon_rightSeq_first'].split(',')[-1]
    ]
    exon_motif_right = [
        x['exon_leftSeq_second'].split(',')[0] +
        x['exon_rightSeq_second'].split(',')[-1]
    ]
    strand_first = 'U'
    strand_second = 'U'
    if len(exon_leftSeq_first) > 0:
        for i in range(len(exon_leftSeq_first)):
            exon_motif_left.append(exon_leftSeq_first[i] +
                                   exon_rightSeq_first[i])
    if len(exon_leftSeq_second) > 0:
        for i in range(len(exon_leftSeq_second)):
            exon_motif_right.append(exon_leftSeq_second[i] +
                                    exon_rightSeq_second[i])
    for i in exon_motif_left:
        if i in ['AGGT', 'AGGC']:
            strand_first = '+'
            break
        elif i in ['ACCT', 'GCCT']:
            strand_first = '-'
            break
    for i in exon_motif_right:
        if i in ['AGGT', 'AGGC']:
            strand_second = '+'
            break
        elif i in ['ACCT', 'GCCT']:
            strand_second = '-'
            break

    if strand_first == 'U':
        if x['chr_first'] in tabixfile.contigs:
            for gtf in tabixfile.fetch(x['chr_first'],
                                       x['start_first'] - 1,
                                       x['start_first'],
                                       parser=pysam.asGTF()):
                if gtf.start == x['start_first'] - 1:
                    strand_first = gtf.strand
                    break
    if strand_first == 'U':
        if x['chr_first'] in tabixfile.contigs:
            for gtf in tabixfile.fetch(x['chr_first'],
                                       x['end_first'] - 1,
                                       x['end_first'],
                                       parser=pysam.asGTF()):
                if gtf.end == x['end_first'] - 1:
                    strand_first = gtf.strand
                    break
    if strand_second == 'U':
        if x['chr_second'] in tabixfile.contigs:
            for gtf in tabixfile.fetch(x['chr_second'],
                                       x['start_second'] - 1,
                                       x['start_second'],
                                       parser=pysam.asGTF()):
                if gtf.start == x['start_second'] - 1:
                    strand_second = gtf.strand
                    break
    if strand_second == 'U':
        if x['chr_second'] in tabixfile.contigs:
            for gtf in tabixfile.fetch(x['chr_second'],
                                       x['end_second'] - 1,
                                       x['end_second'],
                                       parser=pysam.asGTF()):
                if gtf.end == x['end_second'] - 1:
                    strand_second = gtf.strand
                    break
    return ([strand_first, strand_second])
Example #45
def _aggregate_gtf(gtf_file, sample_id, gtf_expr_attr, output_fh, stats_fh,
                   is_ref=False):
    def _init_t_dict():
        return {'_id': None, 'num_exons': 0, 'length': 0}

    t_dict = collections.defaultdict(_init_t_dict)
    cur_t_id = 1
    exprs = []
    for f in pysam.tabix_iterator(open(gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            if t_id in t_dict:
                m = 'GTF "%s" transcript_id "%s" not unique' % (gtf_file, t_id)
                raise GTFError(m)
            t_item = t_dict[t_id]
            # rename transcript id
            new_t_id = "%s.T%d" % (sample_id, cur_t_id)
            cur_t_id += 1
            t_item['_id'] = new_t_id
            if is_ref:
                expr = 0.0
            else:
                expr = float(f[gtf_expr_attr])
            exprs.append(expr)
            # prepare attributes
            attrs = {GTF.Attr.TRANSCRIPT_ID: new_t_id,
                     GTF.Attr.SAMPLE_ID: sample_id,
                     GTF.Attr.REF: str(int(is_ref)),
                     GTF.Attr.EXPR: str(expr)}
            # save attributes
            f.fromDict(attrs)
            print >>output_fh, str(f)
        elif f.feature == 'exon':
            t_id = f.transcript_id
            t_item = t_dict[t_id]
            # update statistics
            t_item['num_exons'] += 1
            t_item['length'] += (f.end - f.start)
            # replace transcript id
            f.fromDict({GTF.Attr.TRANSCRIPT_ID: t_item['_id']})
            print >>output_fh, str(f)

    # process statistics
    num_exons = []
    lengths = []
    for t_item in t_dict.itervalues():
        lengths.append(t_item['length'])
        num_exons.append(t_item['num_exons'])

    # compute and write stats
    quantiles = range(0, 101)
    expr_qs = (scoreatpercentile(exprs, q) for q in quantiles)
    expr_qs = ','.join(map(str, expr_qs))
    length_qs = (int(round(scoreatpercentile(lengths, q)))
                 for q in quantiles)
    length_qs = ','.join(map(str, length_qs))
    num_exon_qs = (int(round(scoreatpercentile(num_exons, q)))
                   for q in quantiles)
    num_exon_qs = ','.join(map(str, num_exon_qs))
    fields = [sample_id, len(t_dict), expr_qs, length_qs, num_exon_qs]
    print >>stats_fh, '\t'.join(map(str, fields))
Example #46
 def helper(self, thread=0):
     for contig in self.tabix_reader.contigs[thread::self.num_threads]:
         load_genes_helper(self.tabix_reader.fetch(contig, start=0, parser=pysam.asGTF(), multiple_iterators=True),
                           features=self.features)
Example #47
def gtf_iterator(gtf_path):
    return tabix.TabixIterator(gtf_path, parser=pysam.asGTF())
Example #48
File: annoFL.py Project: yangence/circfull
def bed2df(bedFile,gtfFile):
    global tabixfile,gene2strand_dict,gene2trans_dict,gene2exon_dict,trans2exon_dict,trans2gene_dict,gene2status_dict,gene2class_dict,trans2class_dict
    FL_bed=pd.read_csv(bedFile,sep='\t',header=None)
    FL_bed.iloc[:,10]=FL_bed.iloc[:,10].map(str)
    FL_bed.iloc[:,11]=FL_bed.iloc[:,11].map(str)
    FL=pd.DataFrame(FL_bed.apply(lambda x: bed2FL(x),axis=1).tolist(),columns=['chr','start','end','isoID','strand','exon_start','exon_end','len'])
    tabixfile=pysam.TabixFile(gtfFile)
    gene2strand_dict={}
    gene2trans_dict={}
    gene2exon_dict={}
    trans2exon_dict={}
    trans2gene_dict={}
    gene2status_dict={}
    gene2class_dict={}
    trans2class_dict={}
    
    for gtf in tabixfile.fetch(parser=pysam.asGTF()):
        if gtf.feature=='exon':
            current_geneID=gtf.gene_id
            current_transID=gtf.transcript_id
            if current_geneID in gene2exon_dict:
                gene2exon_dict[current_geneID].append([gtf.start,gtf.end])
            else:
                gene2exon_dict[current_geneID]=[[gtf.start,gtf.end]]
            if current_transID in trans2exon_dict:
                trans2exon_dict[current_transID].append([gtf.start,gtf.end])
            else:
                trans2exon_dict[current_transID]=[[gtf.start,gtf.end]]
        if gtf.feature=='transcript':
            current_geneID=gtf.gene_id
            current_transID=gtf.transcript_id
            if 'transcript_status' in gtf.keys():
                trans2class_dict[current_transID]=gtf.transcript_status
            else:
                trans2class_dict[current_transID]='NOVEL'
            if current_geneID in gene2trans_dict:
                gene2trans_dict[current_geneID].append(current_transID)
            else:
                gene2trans_dict[current_geneID]=[current_transID]
            trans2gene_dict[current_transID]=current_geneID
        if gtf.feature=='gene':
            current_geneID=gtf.gene_id
            gene2strand_dict[current_geneID]=gtf.strand
            if 'gene_status' in gtf.keys():
                gene2status_dict[current_geneID]=gtf.gene_status
            else:
                gene2status_dict[current_geneID]='NOVEL'
            gene2class_dict[current_geneID]=gtf.gene_name
    for i in gene2exon_dict.keys():
        gene2exon_dict[i]=noDup_list(gene2exon_dict[i])
        
    start_type_list=[]
    end_type_list=[]
    circ_type_list=[]
    geneName_list=[]
    for i in range(FL.shape[0]):
        tmp=FL.iloc[i,:]
        start_type,end_type,circ_type,geneName=getCircType(tmp)
        start_type_list.append(start_type)
        end_type_list.append(end_type)
        circ_type_list.append(circ_type)
        geneName_list.append(geneName)
        
    FL['start_type']=start_type_list
    FL['end_type']=end_type_list
    FL['geneName']=geneName_list
    
    BSJ_type_list=[]
    for i in range(FL.shape[0]):
        circ_type=circ_type_list[i]
        start_type=start_type_list[i].split(',')
        end_type=end_type_list[i].split(',')
        if circ_type in ['intergenic','antisense','read through']:
            BSJ_type=circ_type
        elif circ_type=='part':
            if start_type[0] =='intergenic' and end_type[-1]=='intergenic':
                BSJ_type='novel UTR5;3'
            elif start_type[0] =='intergenic':
                if FL.iloc[i,:]['strand']=='+':
                    BSJ_type='novel UTR5'
                else:
                    BSJ_type='novel UTR3'
            else:
                if FL.iloc[i,:]['strand']=='+':
                    BSJ_type='novel UTR3'
                else:
                    BSJ_type='novel UTR5'
        else:
            num=len(start_type)
            # judge BSJ type
            if start_type[0]=='m':
                if end_type[-1]=='m':
                    BSJ_type='m'
                elif end_type[-1] in ['inE','outE']:
                    if FL.iloc[i,:]['strand']=='+':
                        BSJ_type='AS5'
                    else:
                        BSJ_type='AS3'
                elif end_type[-1]=='intron':
                    BSJ_type='Intron retention'
                else:
                    print('1')
            elif start_type[0] in ['inE','outE']:
                if end_type[-1]=='m':
                    if FL.iloc[i,:]['strand']=='+':
                        BSJ_type='AS5'
                    else:
                        BSJ_type='AS3'
                elif end_type[-1] in ['inE','outE']:
                    BSJ_type='AS5,AS3'
                elif end_type[-1]=='intron':
                    BSJ_type='Intron retention'
                else:
                     print('1')
            elif start_type[0] =='intron':
                BSJ_type='Intron retention'
            else:
                 print('1')
                            
        BSJ_type_list.append(BSJ_type)
        
    FSJ_type_list=[]
    for i in range(FL.shape[0]):
        FSJ_type=[]
        circ_type=circ_type_list[i]
        start_type=start_type_list[i].split(',')
        end_type=end_type_list[i].split(',')
        if circ_type in ['intergenic','antisense','read through']:       
            FSJ_type=circ_type
        else:
            num=len(start_type)
        #judge FSJ type
            if num==1:
                FSJ_type='1'
            else:
                for j in range(num-1):
                    donor=end_type[j]
                    acceptor=start_type[j+1]
                    if donor=='m':
                        if acceptor=='m':
                            FSJ_type.append('m')
                        elif acceptor in ['inE','outE']:
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('AS3')
                            else:
                                FSJ_type.append('AS5')
                        elif acceptor=='intron':
                            FSJ_type.append('Intron retention')
                        else:
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('novel UTR3')
                            else:
                                FSJ_type.append('novel UTR5')
                    elif donor in ['inE','outE']:
                        if acceptor=='m':
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('AS5')
                            else:
                                FSJ_type.append('AS3')
                        elif acceptor in ['inE','outE']:
                            FSJ_type.append('AS5,AS3')
                        elif acceptor=='intron':
                            FSJ_type.append('Intron retention')
                        else:
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('novel UTR3')
                            else:
                                FSJ_type.append('novel UTR5')
                    elif donor=='intron':
                        if acceptor=='intergenic':
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('novel UTR3')
                            else:
                                FSJ_type.append('novel UTR5')
                        else:
                            FSJ_type.append('Intron retention')
                    else:
                        if acceptor=='intergenic':
                            FSJ_type.append('novel UTR5;3')
                        else:
                            if FL.iloc[i,:]['strand']=='+':
                                FSJ_type.append('novel UTR5')
                            else:
                                FSJ_type.append('novel UTR3')
                FSJ_type=','.join(FSJ_type)          
        FSJ_type_list.append(FSJ_type)
        
    detail_BSJ_type=[]
    com_BSJ_type=[]
    for i in BSJ_type_list:
        if i=='AS3':
            n='N3SS'
            m='NSS'
        elif i=='AS5':
            n='N5SS'
            m='NSS'
        elif i=='AS5,AS3':
            n='N5SS,N3SS'
            m='NSS'
        elif i=='Intron retention':
            n='intronic'
            m='intronic'
        elif i=='antisense':
            n='antisense'
            m='antisense'
        elif i=='intergenic':
            n='intergenic'
            m='intergenic'
        elif i=='m':
            n='exonic'
            m='exonic'
        elif i=='novel UTR3':
            n='novel UTR3'
            m='novel UTR'
        elif i=='novel UTR5':
            n='novel UTR5'
            m='novel UTR'
        elif i=='novel UTR5;3':
            n='novel UTR5,UTR3'
            m='novel UTR'
        elif i=='read through':
            n='read through'
            m='read through'
        else:
            n='unknown'
            m='unknown'
        detail_BSJ_type.append(n)
        com_BSJ_type.append(m)
    FL['detail_type']=detail_BSJ_type
    FL['type']=com_BSJ_type
    return(FL)
Example #49
	def load_data(self, filepath):
		"""Loads GFF data"""
		tabix = pysam.TabixFile(filepath)

		for row in tabix.fetch(self.interval.chrom, self.interval.start, self.interval.end, parser = pysam.asGTF()):

			feature_interval = genomic_interval(row.contig, row.start, row.end, strand = row.strand)
			a = self.parse_gff_attributes(row.attributes)

			if row.feature == "gene":
				self.genes[a["gene_id"]] = a["gene_name"]
			elif row.feature == "transcript":
				self.transcripts[a["gene_id"]] = self.transcripts.get(a["gene_id"], []) + [(a["transcript_id"], feature_interval)]
			elif row.feature == "exon":
				self.exons[a["transcript_id"]] = self.exons.get(a["transcript_id"], []) + [(a["exon_id"], feature_interval)]
			elif row.feature == "CDS":
				self.cds[a["exon_id"]] = self.cds.get(a["exon_id"], []) + [(a["ID"], feature_interval)]
			elif row.feature == "UTR":
				self.utrs[a["exon_id"]] = self.utrs.get(a["exon_id"], []) + [(a["ID"], feature_interval)]

		tabix.close()
Example #52
def iterator(infile):
    """return a simple iterator over all entries in a file."""
    return pysam.tabix_iterator(infile, pysam.asGTF())
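
A hypothetical use of iterator() above; pysam.tabix_iterator takes an open file object rather than a path, so a plain uncompressed GTF works:

with open('genes.gtf') as infile:
    for gtf in iterator(infile):
        print(gtf.contig, gtf.feature, gtf.start, gtf.end)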
Example #53
def exploreBAM(myinput):
    inputs = myinput.split('$')
    chr, bamfile = inputs[0], inputs[1]
    outfile = os.path.join(outfolder, 'table_%s_%s' % (chr, pid))
    #outfile2=os.path.join(outfolder,'subs_%s_%s'%(chr,pid))
    d, di = {}, {}
    bam = pysam.Samfile(bamfile, "rb")
    fasta = pysam.Fastafile(fastafile)
    ktabix = pysam.Tabixfile(kfile)
    lenregion = dicregions[chr]
    if uann: tabix = pysam.Tabixfile(annfile)
    if expos: extabix = pysam.Tabixfile(exfile)
    out = open(outfile, 'w')
    #if not custsub:
    #	dsubs=dict([(x+y, 0) for x in 'ACGT' for y in 'ACGT'])
    #	out2=open(outfile2,'w')
    #header='Region\tPosition\tReference\tCoverage\tMeanQuality\tBaseCount\tSubs\tFrequency\n'
    #out.write(header)
    sys.stderr.write('Started analysis on region: %s\n' % (chr))
    if blatr:
        badblat = os.path.join(blatfolder, 'blatseqs_%s.bad' % (chr))
        if os.path.exists(badblat):
            sys.stderr.write('Using Blat mapping for region %s\n' % (chr))
            f = open(badblat)
            for i in f:
                l = (i.strip()).split()
                d[l[0] + '_' + l[1]] = int(l[1])
            f.close()
            sys.stderr.write('Found %i reads for region %s\n' % (len(d), chr))
    if exss:
        if os.path.exists(splicefile):
            sys.stderr.write('Loading known splice sites for region %s\n' %
                             (chr))
            f = open(splicefile)
            for i in f:
                l = (i.strip()).split()
                if l[0] != chr: continue
                st, tp, cc = l[4], l[3], int(l[1])
                if st == '+' and tp == 'D':
                    for j in range(nss):
                        di[cc + (j + 1)] = 0
                if st == '+' and tp == 'A':
                    for j in range(nss):
                        di[cc - (j + 1)] = 0
                if st == '-' and tp == 'D':
                    for j in range(nss):
                        di[cc - (j + 1)] = 0
                if st == '-' and tp == 'A':
                    for j in range(nss):
                        di[cc + (j + 1)] = 0
            f.close()
            sys.stderr.write('Loaded %i positions for %s\n' % (len(di), chr))
    if chr in ktabix.contigs:
        for kpos in range(0, lenregion, chunckval):
            startk, endk = kpos, (kpos + chunckval) - 1
            kres = [
                kk
                for kk in ktabix.fetch(reference=chr, start=startk, end=endk)
            ]
            if len(kres) == 0: continue
            kdic = getd(kres)
            #print kdic
            # else explore bam to find exact positions
            for pileupcolumn in bam.pileup(chr, startk, endk):
                if not startk <= pileupcolumn.pos <= endk: continue
                if not kdic.has_key(pileupcolumn.pos + 1): continue
                ref = fasta.fetch(chr, pileupcolumn.pos,
                                  pileupcolumn.pos + 1).upper()
                seq, qual, strand, squal, blatc = '', 0, '', '', ''
                if rmsh:
                    if ((pileupcolumn.pos + 1) - homo) - 1 < 0: sequp = ''
                    else:
                        sequp = (fasta.fetch(
                            chr, ((pileupcolumn.pos + 1) - homo) - 1,
                            (pileupcolumn.pos + 1) - 1)).upper()
                    seqdw = (fasta.fetch(chr, pileupcolumn.pos + 1,
                                         (pileupcolumn.pos + 1) +
                                         homo)).upper()
                for pileupread in pileupcolumn.pileups:  # for each base of the multiple alignment
                    if not isinstance(pileupread.query_position, (int, long)):
                        continue
                    s, q, t, qq = pileupread.alignment.seq[
                        pileupread.query_position].upper(), ord(
                            pileupread.alignment.qual[
                                pileupread.query_position]
                        ) - QVAL, '*', pileupread.alignment.qual[
                            pileupread.query_position]
                    # exclude intronic positions near splice sites
                    if exss and di.has_key(pileupcolumn.pos + 1): continue
                    # multiple hit
                    if exh and pileupread.alignment.is_secondary: continue
                    # duplicates
                    if exd and pileupread.alignment.is_duplicate: continue
                    # if paired-end
                    if conc and pileupread.alignment.is_paired:
                        # if the pair is not concordant
                        if not pileupread.alignment.is_proper_pair: continue
                        # if concordant but in the same orientation
                        flag = pileupread.alignment.flag
                        if pileupread.alignment.is_duplicate:
                            flag = flag - 1024
                        if pileupread.alignment.is_secondary: flag = flag - 256
                        if flag in [67, 131, 115, 179]: continue
                    # mapping quality
                    if mq and pileupread.alignment.mapq < MAPQ: continue
                    # if the quality >= the minimum quality
                    if q >= MQUAL and pileupcolumn.pos in pileupread.alignment.positions:
                        #tags=dict(pileupread.alignment.tags)
                        # infer the strand for each position
                        if getstrand:
                            # use the mapping info if the library is strand-oriented
                            if pileupread.alignment.is_read1:
                                if unchange1:
                                    if pileupread.alignment.is_reverse: t = '-'
                                    else: t = '+'
                                else:
                                    if pileupread.alignment.is_reverse: t = '+'
                                    else: t = '-'
                            elif pileupread.alignment.is_read2:
                                if unchange2:
                                    if pileupread.alignment.is_reverse: t = '-'
                                    else: t = '+'
                                else:
                                    if pileupread.alignment.is_reverse: t = '+'
                                    else: t = '-'
                            else:  # for single ends
                                if unchange1:
                                    if pileupread.alignment.is_reverse: t = '-'
                                    else: t = '+'
                                else:
                                    if pileupread.alignment.is_reverse: t = '+'
                                    else: t = '-'
                        if rmnuc:
                            # trimming filter: discard bases that fall within the
                            # first rmp[0] or last rmp[1] positions of the read
                            rlen = pileupread.alignment.rlen  # length of this read
                            qp = pileupread.query_position
                            if pileupread.alignment.is_reverse:
                                if qp > (rlen - rmp[0]) - 1: continue
                                if qp < rmp[1]: continue
                            else:
                                if qp < rmp[0]: continue
                                if qp > (rlen - rmp[1]) - 1: continue
                        # flag reads that do not map uniquely according to BLAT
                        if blatr:
                            rt = 0
                            if pileupread.alignment.is_read1: rt = 1
                            elif pileupread.alignment.is_read2: rt = 2
                            rname = pileupread.alignment.qname + '_%i' % (rt)
                            if rname in d: blatc += '0'  # read is in the non-unique set
                            else: blatc += '1'
                        # discard bases whose surrounding sequence forms a
                        # homopolymeric stretch around the reference base
                        if rmsh and rmHomo(sequp, seqdw, homo, ref): continue
                        seq += s
                        qual += q
                        strand += t
                        squal += qq
                if seq.strip() != '':
                    if blatr:
                        if testBlat(blatc):
                            seq, qual, squal, strand = normByBlat(
                                seq, strand, squal, blatc)
                        else:
                            continue
                    try:
                        mystrand = kdic[pileupcolumn.pos + 1]
                    except KeyError:
                        mystrand = '2'
                    if uann and not getstrand:
                        if chr in tabix.contigs:
                            sres = [kk.strand for kk in tabix.fetch(
                                reference=chr,
                                start=pileupcolumn.pos,
                                end=pileupcolumn.pos + 1,
                                parser=pysam.asGTF())]
                            mystrand = vstrand(sres)
                    if getstrand and not uann:
                        mystr = vstand(strand)
                        if mystr == '-': mystrand = '0'
                        elif mystr == '+': mystrand = '1'
                        else: mystrand = '2'
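                    # strand codes: '1' = plus strand, '0' = minus strand, '2' = unknown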
                    if mystrand == '0':
                        seq = comp(seq)
                        ref = comp(ref)
                    if getstrand and mystrand in ['1', '0'] and corrstr:
                        seq, qual, squal = normByStrand(
                            seq, strand, squal, mystrand)
                    if uann and mystrand in ['1', '0'] and corrstr:
                        seq, qual, squal = normByStrand(
                            seq, strand, squal, mystrand)
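                    # BaseCount (defined elsewhere) presumably returns coverage,
                    # base composition, observed substitutions and their frequencies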
                    cov, bcomp, subs, freq = BaseCount(seq, ref)
                    if cov < MINCOV: continue
                    if exms and subs.count(' ') > 0: continue
                    mqua = meanq(qual, len(seq))
                    if expos:
                        if chr in extabix.contigs:
                            exres = [kk for kk in extabix.fetch(
                                reference=chr,
                                start=pileupcolumn.pos,
                                end=pileupcolumn.pos + 1)]
                            if len(exres) > 0: continue
                    line = '\t'.join([
                        chr,
                        str(pileupcolumn.pos + 1), ref, mystrand,
                        str(cov), mqua,
                        str(bcomp), subs, freq
                    ]) + '\n'
                    out.write(line)
    bam.close()
    fasta.close()
    ktabix.close()
    out.close()
    if uann: tabix.close()
    if expos: extabix.close()
    sys.stderr.write('Job completed for region: %s\n' % (chr))
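The worker above assumes the caller has already opened every handle it uses (bam, fasta, ktabix, out, plus tabix when uann is set and extabix when expos is set). A minimal setup sketch, where every file name is an illustrative assumption rather than part of the original tool:

import pysam

# hypothetical inputs; substitute real paths
bam = pysam.AlignmentFile('sample.bam', 'rb')      # aligned reads
fasta = pysam.FastaFile('genome.fa')               # indexed reference genome
ktabix = pysam.TabixFile('known_sites.txt.gz')     # tabix-indexed known positions
tabix = pysam.TabixFile('annotation.gtf.gz')       # used only when uann is set
extabix = pysam.TabixFile('excluded.bed.gz')       # used only when expos is set
out = open('outTable_chr21.txt', 'w')              # per-region output table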
예제 #54
0
def readFromFile(infile):
    """read records from file and return as list."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
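A quick usage sketch for readFromFile, assuming an uncompressed GTF file (the filename is illustrative); pysam.tabix_iterator parses an open text file line by line:

with open('annotation.gtf') as infile:
    records = readFromFile(infile)
for gff in records[:5]:
    # the asGTF proxy exposes contig/feature/start/end/strand attributes
    print('%s\t%s\t%i\t%i' % (gff.contig, gff.feature, gff.start, gff.end))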
예제 #56
0
파일: GTF.py 프로젝트: prasoonnema/cgat
def iterator(infile):
    """return a simple iterator over all entries in a file."""
    return pysam.tabix_iterator(infile, pysam.asGTF())
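Unlike readFromFile, this iterator streams records one at a time, so large annotations can be scanned in constant memory. A hedged sketch (filename again illustrative):

# count exon records without materializing the whole file
with open('annotation.gtf') as infile:
    n_exons = sum(1 for gff in iterator(infile) if gff.feature == 'exon')
print('exons: %i' % n_exons)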