Exemplo n.º 1
0
    def setUp(self):
        """Create the Feature fixture used by the tests and store it on self.feat.

        The fixture describes one hit: query ``chr1`` 10200-10340 against
        hit ``D89D80`` 1-141, with placeholder ``NNN`` sequences.
        """
        fixture = self.feat = Feature()

        # (attribute, value) pairs, applied in this order.
        fixture_values = (
            ("qid", "chr1"),
            ("qstart", 10200),
            ("qend", 10340),
            ("score", 100),
            ("pid", 90.0),
            ("qlen", 100000000),
            ("qseq", "NNN"),
            ("phase", "."),
            ("strand", 1),
            ("hid", "D89D80"),
            ("hstart", 1),
            ("hend", 141),
            ("hlen", 150),
            ("hseq", "NNN"),
        )

        for attr_name, attr_value in fixture_values:
            setattr(fixture, attr_name, attr_value)
Exemplo n.º 2
0
    def createAlignmentGFF(self, id1, id2, rstart, rend, qstart, qend, rstrand,
                           qstrand, tmprseq, tmpqseq, insertpos):
        """Wrap one pairwise alignment in a Feature and return it.

        The feature is positioned on the reference (``id1``, ``rstart``,
        ``rend``).  The two aligned sequences are stored in ``hitattr`` as
        ``{'id': ..., 'seq': ...}`` dicts under ``qseq`` (reference) and
        ``hseq`` (query), together with ``insertpos``.

        NOTE(review): ``qstart``, ``qend``, ``rstrand`` and ``qstrand`` are
        accepted but not used here.
        """
        aln_feature = Feature()

        aln_feature.qid = id1
        aln_feature.qstart = rstart
        aln_feature.qend = rend

        aln_feature.hitattr['qseq'] = {'id': id1, 'seq': tmprseq}
        aln_feature.hitattr['hseq'] = {'id': id2, 'seq': tmpqseq}

        # NOTE(review): 'hid' receives id1 (the reference id), not id2 --
        # confirm this is intended before relying on it as the hit id.
        aln_feature.hitattr['hid'] = id1

        aln_feature.hitattr['insertpos'] = insertpos

        return aln_feature
Exemplo n.º 3
0
    def postProcessOutput(self):
        """Parse the tabular BLAST output file into Features grouped by query id.

        Reads ``self.input_files[0]`` as tab-separated text.  Columns 0-11 are
        read as: query id, hit id, percent identity, alignment length
        (ignored), mismatches, gaps, query start/end, hit start/end, e-value
        and score.  When all four optional columns 12-15 are present they are
        read as query length, hit length, query sequence and hit sequence.

        The result is stored in ``self.data`` as ``{qid: [Feature, ...]}``.

        Raises:
            IndexError: if a non-blank line has fewer than 12 columns.
            ValueError: if a numeric column cannot be converted.
        """
        super(BlastOutput6ParserAnalysis,self).postProcessOutput()

        data = {}

        with open(self.input_files[0]) as fp:

            for line in fp:

                line = line.rstrip('\n')

                # Blank lines (e.g. a trailing newline) carry no hit.
                if not line.strip():
                    continue

                ff = line.split('\t')

                feat = Feature()

                feat.qid    = ff[0]
                feat.hid    = ff[1]
                feat.pid    = float(ff[2])
                feat.mm     = int(ff[4])
                feat.gaps   = int(ff[5])
                feat.qstart = int(ff[6])
                feat.qend   = int(ff[7])
                feat.hstart = int(ff[8])
                feat.hend   = int(ff[9])
                feat.exval  = float(ff[10])
                feat.score  = float(ff[11])

                # The optional block is only usable when every one of
                # columns 12-15 exists; testing "> 12" alone would index
                # past the end of a 13-15 column row.
                if len(ff) > 15:
                    feat.qlen = int(ff[12])
                    feat.hlen = int(ff[13])
                    feat.qseq = ff[14]
                    feat.hseq = ff[15]

                data.setdefault(feat.qid, []).append(feat)

        self.data = data
Exemplo n.º 4
0
    def parseBlastOutput6(file):
        """Parse a tabular BLAST output file into Features grouped by query id.

        Columns 0-11 are read as: query id, hit id, percent identity,
        alignment length (ignored), mismatches, gaps, query start/end,
        hit start/end, e-value and score.  When columns 12-15 are all present
        they are read as query length, hit length, query sequence and hit
        sequence.

        Args:
            file: path of the tab-separated BLAST output to read.

        Returns:
            dict mapping each query id to the list of its Feature hits, in
            file order.

        Raises:
            IndexError: if a non-blank line has fewer than 12 columns.
            ValueError: if a numeric column cannot be converted.
        """
        hits = {}

        with open(file) as fp:

            for line in fp:

                line = line.rstrip('\n')

                # Blank lines (e.g. a trailing newline) carry no hit.
                if not line.strip():
                    continue

                ff = line.split('\t')

                feat = Feature()

                feat.qid    = ff[0]
                feat.hid    = ff[1]
                feat.pid    = float(ff[2])
                feat.mm     = int(ff[4])
                feat.gaps   = int(ff[5])
                feat.qstart = int(ff[6])
                feat.qend   = int(ff[7])
                feat.hstart = int(ff[8])
                feat.hend   = int(ff[9])
                feat.exval  = float(ff[10])
                feat.score  = float(ff[11])

                # Optional extended columns: only read when all four exist.
                if len(ff) > 15:
                    feat.qlen = int(ff[12])
                    feat.hlen = int(ff[13])
                    feat.qseq = ff[14]
                    feat.hseq = ff[15]

                hits.setdefault(feat.qid, []).append(feat)

        return hits
Exemplo n.º 5
0
def main(args):
    """Compare CDS features with a nucmer delta alignment and print a report.

    Loads the reference and query FASTA files, collects the ``CDS`` entries
    of the GFF file per sequence id, and rebuilds every alignment of the
    MUMmer delta file as a pair of gapped sequences.  Each CDS is then given
    one status (``NOALIGN``, ``PARTALIGN``, ``REVSTRAND``, ``MUTATION`` or
    ``IDENTICAL``) and its report lines are printed to stdout, prefixed with
    that status.

    Args:
        args: object providing ``deltafile``, ``reffile``, ``queryfile`` and
            ``gfffile`` attributes.

    Raises:
        Exception: if the delta file is not of type NUCMER, or if an
            alignment names a sequence missing from the FASTA files.
    """

    logging.info(" ========> Converting mummer delta format for %s %s %s" %
                 (args.deltafile, args.reffile, args.queryfile))

    logging.info("ARGS %s" % args)

    ref = FastaFile(args.reffile)
    qry = FastaFile(args.queryfile)
    gff = GFFFactory(args.gfffile)

    # ---- Collect the CDS features, keyed by the sequence they sit on ----
    g = gff.nextGFF()

    gffs = {}

    while g is not None:

        if g.type2 == "CDS":
            if g.qid not in gffs:
                gffs[g.qid] = []

            gffs[g.qid].append(g)

        g = gff.nextGFF()

    # ---- Load every reference and query sequence, keyed by id ----
    refseqs = {}
    qryseqs = {}

    seq = ref.nextSeq()

    while seq is not None:
        refseqs[seq['id']] = seq
        seq = ref.nextSeq()

    seq = qry.nextSeq()

    while seq is not None:
        qryseqs[seq['id']] = seq
        seq = qry.nextSeq()

    # ---- Parse the delta file ----
    fh = open(args.deltafile)

    alns = {}
    lnum = 0

    line = fh.readline()

    id1 = None
    id2 = None
    len1 = None
    len2 = None

    while line != "":  # Can't use for line in fh: because we read the alignment in chunks

        lnum = lnum + 1

        line = line.rstrip('\n')
        ff = line.split(' ')

        if lnum == 1:
            # The first line lists the two original input files separated
            # by a space.

            if1 = ff[0]
            if2 = ff[1]

            print("Input files [%s][%s]\n" % (if1, if2))

        elif lnum == 2:
            # The second line specifies the alignment data type, either
            # "NUCMER" or "PROMER".

            alntype = ff[0]

            if alntype != "NUCMER":
                # The original referenced an undefined name here, which hid
                # this message behind a NameError.
                raise Exception(
                    "Only NUCMER alignments are currently parsed - we have [%s]"
                    % alntype)

        else:
            # Every group of alignment regions has a FASTA-style header:
            # the two sequence ids after a '>' separated by a space,
            # followed by the lengths of those sequences in the same order.
            #
            # An example header might look like: >tagA1 tagB1 500 2000000

            if ff[0].startswith(">"):
                id1 = ff[0].replace(">", '')
                id2 = ff[1]

                len1 = int(ff[2])
                len2 = int(ff[3])

            else:
                # Alignment header.  The first four numbers are the start and
                # end in the reference and the start and end in the query,
                # always measured in DNA bases.  The next three are the
                # number of errors (non-identities), similarity errors
                # (non-positive match scores) and non-alpha characters.
                #
                # An example header might look like:
                # 5198 22885 5389 23089 20 20 0

                rstart = int(ff[0])
                rend = int(ff[1])
                qstart = int(ff[2])
                qend = int(ff[3])

                # A reversed coordinate pair marks a reverse-strand match.
                qstrand = -1 if rend < rstart else 1
                hstrand = -1 if qend < qstart else 1

                errors = int(ff[4])
                simerrs = int(ff[5])
                nonalpha = int(ff[6])

                if id1 not in refseqs:
                    raise Exception(
                        "Can't find reference sequence [%s] in ref file [%s]" %
                        (id1, args.reffile))

                if id2 not in qryseqs:
                    raise Exception(
                        "Can't find query sequence [%s] in query file [%s]" %
                        (id2, args.queryfile))
                rseq = refseqs[id1]
                qseq = qryseqs[id2]

                # Each alignment header is followed by signed integers, one
                # per line, terminated by a 0.  Each value is the distance
                # to the next insertion in the reference (positive) or
                # deletion in the reference (negative).
                #
                # For example the delta sequence (1, -3, 4, 0) represents an
                # insertion at positions 1 and 7 in the reference and an
                # insertion at position 3 in the query:
                #   A = acgtagctgag$   B = cggtagtgag$
                #   A = acg.tagctgag$  B = .cggtag.tgag$

                count = fh.readline()
                count = count.rstrip('\n')
                count = int(count)

                tmprseq = rseq['seq']
                tmpqseq = qseq['seq']

                # NOTE(review): both slices below stop one base short of an
                # inclusive 1-based [start, end] range -- confirm intended.
                if rend > rstart:
                    tmprseq = tmprseq[rstart - 1:rend - 1]
                else:
                    tmprseq = tmprseq[rend:rstart]
                    tmprseq = reverse_complement(tmprseq)

                if qend > qstart:
                    tmpqseq = tmpqseq[qstart - 1:qend - 1]
                else:
                    tmpqseq = tmpqseq[qend:qstart]
                    tmpqseq = reverse_complement(tmpqseq)

                insertpos = 0

                while count != 0:
                    if count < 0:
                        # Insertion in the query sequence, so a gap goes
                        # into the reference.

                        insertpos = insertpos + abs(count)
                        tmprseq = tmprseq[:insertpos -
                                          1] + "-" + tmprseq[insertpos - 1:]

                    elif count > 0:
                        # Insertion in the reference sequence, so a gap goes
                        # into the query.
                        insertpos = insertpos + abs(count)
                        tmpqseq = tmpqseq[:insertpos -
                                          1] + "-" + tmpqseq[insertpos - 1:]

                    count = fh.readline()
                    count = count.rstrip('\n')
                    count = int(count)

                seq1 = {}
                seq2 = {}
                seq1['id'] = id1
                seq2['id'] = id2
                seq1['seq'] = tmprseq
                seq2['seq'] = tmpqseq

                # NOTE(review): hard-coded sequence id; looks like leftover
                # debugging output.
                if (seq1 != seq2 and id1 == "GG739696.1"):
                    print(prettyPrint([seq1, seq2]))

                if id1 not in alns:
                    alns[id1] = []

                tmpgff = Feature()

                tmpgff.qid = id1
                tmpgff.qstart = rstart
                tmpgff.qend = rend

                tmpgff.hitattr['qseq'] = seq1
                tmpgff.hitattr['hseq'] = seq2
                # NOTE(review): 'hid' is set to id1 (reference id), not id2.
                tmpgff.hitattr['hid'] = id1

                tmpgff.hitattr['insertpos'] = insertpos

                alns[id1].append(tmpgff)

        line = fh.readline()

    # The delta file is fully consumed; release the handle.
    fh.close()

    # ---- Classify every CDS against the alignments on its sequence ----
    gnum = 1

    for seqid in gffs:
        for g in gffs[seqid]:

            outstr = []

            name = g.hitattr['Name']
            prod = g.hitattr['product']

            found = False
            foundgff = None
            status = "NEW"

            if seqid in alns:
                for tmpgff in alns[seqid]:

                    if g.overlaps(tmpgff):

                        if tmpgff.contains(g):
                            found = True
                            foundgff = tmpgff
                        else:

                            # Clip the gene to the aligned region to get the
                            # overlapping interval.
                            ostart = g.qstart
                            oend = g.qend

                            if tmpgff.qstart > g.qstart:
                                ostart = tmpgff.qstart

                            if tmpgff.qend < g.qend:
                                oend = tmpgff.qend

                            frac = int(100 * (oend - ostart + 1) /
                                       (g.qend - g.qstart + 1))

                            status = "PARTALIGN"
                            outstr.append(
                                "============1 Processing gene %d %s %s" %
                                (gnum, name, prod))
                            outstr.append(
                                "Contig coords from gff file %s %d-%d" %
                                (g.qid, g.qstart, g.qend))
                            outstr.append(
                                "Partial overlap of %d percent overlap coords are %d %d"
                                % (frac, ostart, oend))

            if not found:
                if status == "NEW":
                    status = "NOALIGN"
                    outstr.append("============2 Processing gene %d %s %s" %
                                  (gnum, name, prod))
                    outstr.append(
                        "Contig coords from gff file %s %d-%d %s %s" %
                        (g.qid, g.qstart, g.qend, name, prod))
                    # NOTE(review): tmpgff here is whichever alignment was
                    # handled last (or unbound if none was ever parsed); it
                    # is not specific to this gene.
                    outstr.append(
                        "ERROR: No align for %s %s qstart/end %d %d %s" %
                        (name, tmpgff.qid, tmpgff.qstart, tmpgff.qend, prod))
            else:
                # NOTE(review): qstrand is left over from the last alignment
                # parsed from the delta file, not taken from foundgff.
                if qstrand == -1:
                    status = "REVSTRAND"
                    outstr.append("===========3 Processing gene %d %s %s" %
                                  (gnum, name, prod))
                    outstr.append(
                        "Contig coords from gff file %s %d-%d %s %s" %
                        (g.qid, g.qstart, g.qend, name, prod))
                    outstr.append(
                        "ERROR: can't deal with reverse strand reference alignments"
                    )
                else:
                    gstrand = g.strand
                    gstart = g.qstart
                    gend = g.qend

                    astrand = foundgff.strand
                    astart = foundgff.qstart
                    aend = foundgff.qend

                    # Translate the gene's coordinates into positions within
                    # the gapped alignment strings.
                    apos1 = findAlnPos(foundgff, gstart)
                    apos2 = findAlnPos(foundgff, gend)

                    if gstrand == 1:
                        qseq = foundgff.hitattr['qseq']['seq'][apos1:apos2]
                        hseq = foundgff.hitattr['hseq']['seq'][apos1:apos2]
                    else:
                        qseq = foundgff.hitattr['qseq']['seq'][apos1 +
                                                               1:apos2 + 1]
                        hseq = foundgff.hitattr['hseq']['seq'][apos1 +
                                                               1:apos2 + 1]

                        qseq = reverse_complement(qseq)
                        hseq = reverse_complement(hseq)

                    if qseq != hseq:
                        status = "MUTATION"
                        outstr.append("===========4 Processing gene %d %s %s" %
                                      (gnum, name, prod))

                        # NOTE(review): id1/id2 are the ids of the last
                        # header read from the delta file.
                        outstr.append("DNA alignment\n")
                        tmpstr = prettyPrint([{
                            'id': id1,
                            'seq': qseq
                        }, {
                            'id': id2,
                            'seq': hseq
                        }])
                        tmpff = tmpstr.split('\n')
                        for f in tmpff:
                            outstr.append(f)

                        qpep = translate(qseq)
                        hpep = translate(hseq)

                        tmpstr = prettyPrint([{
                            'id': id1,
                            'seq': qpep
                        }, {
                            'id': id2,
                            'seq': hpep
                        }])
                        outstr.append("PEP alignment\n")
                        tmpff = tmpstr.split('\n')
                        for f in tmpff:
                            outstr.append(f)

                    else:
                        status = "IDENTICAL"
                        outstr.append(
                            "============5 Processing gene %d %s %s" %
                            (gnum, name, prod))
                        outstr.append("NO CHANGE for this alignment %s %s %s" %
                                      (tmpgff.qid, name, prod))

            # Single-argument print(...) behaves the same under Python 2
            # and Python 3.
            for i in outstr:
                print("%-15s %s" % (status, i))
            print("\n")
            gnum = gnum + 1
Exemplo n.º 6
0
    def parseLine(self,line):
        """Parse one tab-separated BLAT PSL alignment line into a Feature.

        Columns used (0-based): 0 match, 1 mismatch, 8 strand, 9 query name,
        10 query size, 11 query start, 12 query end, 13 target name,
        14 target size, 15 target start, 16 target end.

        Example line (psLayout version 3):
        236 0 0 0 0 0 0 0 + TRINITY_DN4669_c0_g1_i1 237 0 236 Gm16144_ENSMUST00000131093 1843 1272 1508 1 236, 0, 1272,

        Args:
            line: one data line of PSL output (header lines are not handled).

        Returns:
            Feature with query/hit coordinates and lengths set, ``type1`` and
            ``type2`` set to ``'blat'``, ``score`` set to the integer
            percentage of the query length that matched, and ``match`` /
            ``mismatch`` stored in ``hitattr``.

        Raises:
            IndexError: if the line has fewer than 17 columns.
            ValueError: if a numeric column cannot be converted.
        """
        line = line.rstrip('\n')
        ff = line.split('\t')

        f = Feature()

        match  = int(ff[0])
        mismatch = int(ff[1])

        strand = ff[8]
        qid    = ff[9]
        qlen   = int(ff[10])
        qstart = int(ff[11])
        qend   = int(ff[12])
        hid    = ff[13]
        hlen   = int(ff[14])
        hstart = int(ff[15])
        hend   = int(ff[16])

        f.qid    = qid
        f.type1  = 'blat'
        f.type2  = 'blat'
        f.qstart = qstart
        f.qend   = qend

        f.hid    = hid
        f.hstart = hstart
        f.hend   = hend

        f.score = int(100*match/qlen)

        f.qlen  = qlen
        f.hlen  = hlen

        f.hitattr['match'] = match
        f.hitattr['mismatch'] = mismatch

        # strand is always a str here (it comes from split), so only the
        # "+" / "-" symbols can match; any other value leaves f.strand
        # untouched.
        if strand == "+":
            f.strand = 1
        elif strand == "-":
            f.strand = -1

        return f
Exemplo n.º 7
0
    def parseLine(self,line):
        """Parse one tab-separated GTF line into a Feature.

        Example line:
        chr1 unknown CDS 3054734 3054733 . + -1 gene_id "ENSMUSG00000090025"; gene_name "ENSMUSG00000090025"; transcript_id "ENSMUST00000160944";

        Columns 0-2 become ``qid``, ``type1`` and ``type2``; columns 3-4
        become ``qstart`` and ``qend``.  Score, strand and phase are only set
        when the column is not ``"."``.  Each ``key "value"`` pair of the
        attribute column is stored in ``hitattr`` with the quotes removed,
        and ``transcript_id`` is also copied to ``hid``.

        Args:
            line: one GTF data line.

        Returns:
            The populated Feature.

        Raises:
            IndexError: if the line has fewer than 9 columns.
            ValueError: if a numeric column cannot be converted.
        """
        line = line.rstrip('\n')
        ff = line.split('\t')

        f = Feature()

        f.qid   = ff[0]
        f.type1 = ff[1]
        f.type2 = ff[2]

        f.qstart = int(ff[3])
        f.qend   = int(ff[4])

        if ff[5] != ".":
            # Python has no builtin named double; float is the conversion
            # for a decimal score.
            f.score = float(ff[5])

        # The column is a str, so only the "+" / "-" symbols can match;
        # anything else leaves f.strand untouched.
        if ff[6] == "+":
            f.strand = 1
        elif ff[6] == "-":
            f.strand = -1

        if ff[7] != ".":
            f.phase = int(ff[7])


        featf = ff[8].split(';')

        for feat in featf:
            feat = feat.strip()
            tmp  = feat.split(' ')

            # Only plain `key "value"` pairs are kept; a value containing a
            # space splits into more than two pieces and is skipped.
            if len(tmp) == 2:

                key = tmp[0].strip()
                val = tmp[1].strip()
                val = val.strip('"')

                f.hitattr[key] = val

                if key == "transcript_id":
                    f.hid = val
        return f
Exemplo n.º 8
0
    def parseLine(self, line):
        """Parse one tab-separated lastz general-format line into a Feature.

        Expected columns:
        score name1 strand1 size1 zstart1 end1 name2 strand2 size2 zstart2 end2 identity idPct coverage covPct

        Example line:
        12413 98004798 + 1579 278 1520 F27C8.1 - 1482 200 1455 709/1185 59.8% 1255/1482 84.7%

        Args:
            line: one lastz data line (the ``#score ...`` header line is not
                handled).

        Returns:
            Feature with query (name1) and hit (name2) coordinates and
            lengths, ``type1``/``type2`` set to ``'lastz'``, the integer
            score, and ``pid`` / ``cov`` (percentages as floats) in
            ``hitattr``.  ``strand`` is 1 when both strands agree and -1
            when they differ; it is left untouched if either strand column
            is not ``+`` or ``-``.

        Raises:
            IndexError: if the line has fewer than 15 columns.
            ValueError: if a numeric column cannot be converted.
        """

        line = line.rstrip('\n')
        ff = line.split('\t')

        f = Feature()

        qstrand = ff[2]
        hstrand = ff[7]

        qid = ff[1]
        qlen = int(ff[3])
        qstart = int(ff[4])
        qend = int(ff[5])
        hid = ff[6]
        hlen = int(ff[8])
        hstart = int(ff[9])
        hend = int(ff[10])

        f.qid = qid
        f.type1 = 'lastz'
        f.type2 = 'lastz'
        f.qstart = qstart
        f.qend = qend

        f.hid = hid
        f.hstart = hstart
        f.hend = hend

        f.score = int(ff[0])

        f.qlen = qlen
        f.hlen = hlen

        pid = ff[12].replace('%', '')
        cov = ff[14].replace('%', '')

        f.hitattr['pid'] = float(pid)
        f.hitattr['cov'] = float(cov)

        # The original computed the combined strand into a local variable
        # and then dropped it; store it on the feature instead.
        if qstrand in ("+", "-") and hstrand in ("+", "-"):
            f.strand = 1 if qstrand == hstrand else -1

        return f
Exemplo n.º 9
0
    def nextGFF(self):
        """Return the next feature read from ``self.fh``, or None at the end.

        Comment/directive lines (starting with ``#``) and blank lines are
        skipped.  A ``##FASTA`` directive ends parsing and returns None.

        Example input:
            ##gff-version 3
            ##sequence-region NZ_JODT01000001.1 1 388890
            NZ_JODT01000001.1  RefSeq  gene  283  1188  .  -  .  ID=gene0;Name=IH25_RS0100010;gbkey=Gene

        Field handling:
            * score: ``"."`` becomes 0; otherwise an int when the text is an
              integer, else a float.
            * strand: ``+`` -> 1, ``-`` -> -1, ``.`` -> 0; any other value is
              kept as the raw string.
            * phase: kept as the raw string.
            * column 9 (optional): ``key=value`` pairs separated by ``;`` are
              stored as a dict in ``hitattr``.  Empty fragments (such as the
              one after a trailing ``;``) are ignored, a fragment without
              ``=`` is stored with an empty value, and only the first ``=``
              separates key from value.

        Returns:
            A Feature, or None when no more features are available.

        Raises:
            Exception: if a data line has fewer than 8 tab-separated fields.
            ValueError: if start, end or score cannot be converted.
        """

        for line in self.fh:

            # An embedded FASTA section marks the end of the annotations.
            if line.startswith('##FASTA'):
                return None

            if line.startswith('#'):
                continue

            line = line.rstrip('\n')

            if not line.strip():
                continue

            ff = line.split('\t')

            if len(ff) < 8:
                raise Exception(
                    "GFF line needs 8 or more fields to parse [%s]" % line)

            f = Feature()

            f.qid = ff[0]
            f.type1 = ff[1]
            f.type2 = ff[2]
            f.qstart = int(ff[3])
            f.qend = int(ff[4])
            f.phase = ff[7]

            if ff[5] == ".":
                f.score = 0
            else:
                # Integer scores keep their int type; decimal scores, which
                # int() rejects, fall back to float.
                try:
                    f.score = int(ff[5])
                except ValueError:
                    f.score = float(ff[5])

            f.strand = {"+": 1, "-": -1, ".": 0}.get(ff[6], ff[6])

            if len(ff) > 8:

                hitattr = {}

                for hff in ff[8].split(';'):

                    if not hff.strip():
                        continue

                    # Split on the first '=' only so values containing '='
                    # are kept whole.
                    key, _, val = hff.partition('=')

                    hitattr[key] = val

                f.hitattr = hitattr

            return f
Exemplo n.º 10
0
        qcov = float(fields[14])
        hcov = float(fields[15])

        qid = fields[17]
        hid = fields[18]

        strand = 1

        if hend < hstart:
            strand = -1
            tmp = hend
            hend = hstart
            hstart = tmp

        tmpgff = Feature()

        tmpgff.qid = qid
        tmpgff.qstart = qstart
        tmpgff.qend = qend
        tmpgff.qlen = qlen
        tmpgff.qcov = qcov

        tmpgff.hitattr['hid'] = hid
        tmpgff.hitattr['hstart'] = hstart
        tmpgff.hitattr['hend'] = hend
        tmpgff.hitattr['hlen'] = hlen
        tmpgff.hitattr['hcov'] = hcov

        tmpgff.pid = pid