Exemplo n.º 1
0
    def parseLine(self,line):
        """Parse one tab-separated BLAT PSL alignment line into a Feature.

        Columns used (0-based), following the psLayout version 3 header:

            0  match        1  mismatch     8  strand
            9  Q name       10 Q size       11 Q start     12 Q end
            13 T name       14 T size       15 T start     16 T end

        Example line:

            236 0 0 0 0 0 0 0 + TRINITY_DN4669_c0_g1_i1 237 0 236 Gm16144_ENSMUST00000131093 1843 1272 1508 1 236, 0, 1272,

        The returned feature has type1/type2 set to 'blat', the query and
        target (hit) ids, coordinates and lengths filled in, and the raw
        match / mismatch counts stored in ``hitattr``.  ``score`` is the
        number of matching bases as an integer percentage of the query
        length.

        Raises IndexError if the line has fewer than 17 columns, ValueError
        if a numeric column is not an integer, and ZeroDivisionError if the
        query size column is 0.
        """
        ff = line.rstrip('\n').split('\t')

        f = Feature()

        match    = int(ff[0])
        mismatch = int(ff[1])
        qlen     = int(ff[10])

        f.type1  = 'blat'
        f.type2  = 'blat'

        # Query side.
        f.qid    = ff[9]
        f.qstart = int(ff[11])
        f.qend   = int(ff[12])
        f.qlen   = qlen

        # Target (hit) side.
        f.hid    = ff[13]
        f.hstart = int(ff[15])
        f.hend   = int(ff[16])
        f.hlen   = int(ff[14])

        f.score = int(100*match/qlen)

        f.hitattr['match'] = match
        f.hitattr['mismatch'] = mismatch

        # The strand column is always a string here, so only the "+" / "-"
        # spellings can match; anything else (e.g. ".") leaves the
        # feature's strand untouched.
        strand = ff[8]
        if strand == "+":
            f.strand = 1
        elif strand == "-":
            f.strand = -1

        return f
Exemplo n.º 2
0
    def parseLine(self, line):
        """Parse one tab-separated LASTZ "general" format line into a Feature.

        Expected columns (0-based):

            0 score   1 name1   2 strand1  3 size1   4 zstart1  5 end1
            6 name2   7 strand2 8 size2    9 zstart2 10 end2
            11 identity  12 idPct  13 coverage  14 covPct

        Example line:

            12413 98004798 + 1579 278 1520 F27C8.1 - 1482 200 1455 709/1185 59.8% 1255/1482 84.7%

        Sequence 1 is stored as the query (q*) and sequence 2 as the hit
        (h*).  ``idPct`` and ``covPct`` are stored, without the percent sign,
        as floats in ``hitattr['pid']`` and ``hitattr['cov']``.

        The feature's strand is the relative orientation of the two
        sequences: 1 when both strand columns agree, -1 when they differ.
        If either strand column is neither "+" nor "-" the strand is left
        untouched.

        Raises IndexError if the line has fewer than 15 columns and
        ValueError if a numeric column cannot be converted.
        """
        line = line.rstrip('\n')
        ff = line.split('\t')

        f = Feature()

        f.type1 = 'lastz'
        f.type2 = 'lastz'

        # Sequence 1 -> query.
        f.qid = ff[1]
        f.qlen = int(ff[3])
        f.qstart = int(ff[4])
        f.qend = int(ff[5])

        # Sequence 2 -> hit.
        f.hid = ff[6]
        f.hlen = int(ff[8])
        f.hstart = int(ff[9])
        f.hend = int(ff[10])

        f.score = int(ff[0])

        f.hitattr['pid'] = float(ff[12].replace('%', ''))
        f.hitattr['cov'] = float(ff[14].replace('%', ''))

        # Combine the two per-sequence strands into one relative strand and
        # store it on the feature (previously the value was computed into a
        # local variable and then discarded).
        sign = {"+": 1, "-": -1}
        qstrand = ff[2]
        hstrand = ff[7]
        if qstrand in sign and hstrand in sign:
            f.strand = sign[qstrand] * sign[hstrand]

        return f
Exemplo n.º 3
0
    def parseLine(self,line):
        """Parse one tab-separated GTF line into a Feature.

        Example line:

            chr1 unknown CDS 3054734 3054733 . + -1 gene_id "ENSMUSG00000090025"; gene_name "ENSMUSG00000090025"; transcript_id "ENSMUST00000160944";

        Columns 0-2 become qid / type1 / type2 and columns 3-4 the integer
        qstart / qend.  Score (column 5), strand (column 6) and phase
        (column 7) are only set when the column holds a usable value; a "."
        leaves the corresponding attribute untouched.

        Column 8 is split on ';' into ``key "value"`` pairs.  Each pair is
        stored in ``hitattr`` with the surrounding double quotes removed from
        the value, and ``transcript_id`` is additionally copied to ``hid``.

        Raises IndexError if the line has fewer than 9 columns and
        ValueError if a numeric column cannot be converted.
        """
        line = line.rstrip('\n')
        ff = line.split('\t')

        f = Feature()

        f.qid   = ff[0]
        f.type1 = ff[1]
        f.type2 = ff[2]

        f.qstart = int(ff[3])
        f.qend   = int(ff[4])

        if ff[5] != ".":
            # Python has no double(); float is the floating point type.
            f.score = float(ff[5])

        # The strand column is always a string, so only "+" / "-" can match.
        if ff[6] == "+":
            f.strand = 1
        elif ff[6] == "-":
            f.strand = -1

        if ff[7] != ".":
            f.phase = int(ff[7])

        for feat in ff[8].split(';'):
            # Split on the first run of whitespace only, so values that
            # contain spaces (or pairs separated by more than one space) are
            # kept instead of being silently dropped.  Empty segments, such
            # as the one after the trailing ';', and bare keys are skipped.
            tmp = feat.strip().split(None, 1)

            if len(tmp) != 2:
                continue

            key = tmp[0]
            val = tmp[1].strip().strip('"')

            f.hitattr[key] = val

            if key == "transcript_id":
                f.hid = val

        return f
Exemplo n.º 4
0
    def nextGFF(self):
        """Read lines from ``self.fh`` and return the next GFF record.

        Lines starting with '#' are skipped.  A line starting with '##FASTA'
        ends parsing and None is returned.  None is also returned when
        ``self.fh`` is exhausted.

        Example record:

            NZ_JODT01000001.1 RefSeq gene 283 1188 . - . ID=gene0;Name=IH25_RS0100010;gbkey=Gene;locus_tag=IH25_RS0100010

        Field mapping (tab-separated, 0-based):

            0 qid    1 type1    2 type2    3 qstart (int)    4 qend (int)
            5 score  - 0 for ".", otherwise a number (int when the text is
                       an integer, float for decimal values)
            6 strand - "+" -> 1, "-" -> -1, "." -> 0, anything else is kept
                       as the original string
            7 phase  - kept as the original string
            8 attributes (optional) - ';'-separated ``key=value`` pairs
                       stored as a dict in ``hitattr``

        Raises Exception if a record line has fewer than 8 fields and
        ValueError if start, end or score is not numeric.
        """
        for line in self.fh:

            if line is None:
                return

            # An embedded ##FASTA section marks the end of the records.
            if re.search('^##FASTA', line):
                return None

            # Directives and comments.
            if re.search('^#', line):
                continue

            line = line.rstrip('\n')
            ff = line.split('\t')

            if len(ff) < 8:
                raise Exception(
                    "GFF line needs 8 or more fields to parse [%s]" % line)

            f = Feature()

            f.qid = ff[0]
            f.type1 = ff[1]
            f.type2 = ff[2]
            f.qstart = int(ff[3])
            f.qend = int(ff[4])
            f.phase = ff[7]

            score = ff[5]
            if score == ".":
                f.score = 0
            else:
                # Integer scores stay ints as before; decimal scores no
                # longer raise ValueError and are returned as floats.
                try:
                    f.score = int(score)
                except ValueError:
                    f.score = float(score)

            # Unrecognised strand symbols (e.g. "?") are passed through.
            f.strand = {"+": 1, "-": -1, ".": 0}.get(ff[6], ff[6])

            if len(ff) > 8:
                hitattr = {}

                for hff in ff[8].split(';'):
                    # A trailing ';' or an empty attribute column produces
                    # empty segments; skip them instead of failing.
                    if not hff.strip():
                        continue

                    # Split on the first '=' only so values that themselves
                    # contain '=' are kept whole.  A key without '=' is
                    # stored with an empty string value.
                    key, _, val = hff.partition('=')
                    hitattr[key] = val

                f.hitattr = hitattr

            return f