def check_F2(refseq, spec, VERBOSE=0):
    '''Check a consensus of fragment F2: end of gag, start of pol

    NOTE: check_F2 is defined a second time further down in this file; the
    later definition is the one bound to the name once the module is loaded.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F2' to name the fragment for the
                   length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    if not check_length_fragment(refseq,
                                 'F2' + spec,
                                 VERBOSE=VERBOSE,
                                 tolerance=80):
        return False

    # gag: only its end is required to be in this fragment
    gag = 'gag'
    (_, gag_end, _, gag_end_found) = locate_gene(refseq,
                                                 gag,
                                                 VERBOSE=VERBOSE)
    if not gag_end_found:
        print('ERROR: end of ' + gag + ' not found in F2!')
        return False
    if VERBOSE >= 3:
        print('OK: end of ' + gag + ' found')

    # Drop leading nucleotides so that the stretch up to the gag end is a
    # whole number of codons
    upstream = refseq[:gag_end]
    gag_prot = upstream[len(upstream) % 3:].seq.translate()
    if not check_has_end(gag_prot, gag, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(gag_prot, gag, VERBOSE=VERBOSE):
        return False

    # pol: only its start is required to be in this fragment
    pol = 'pol'
    (pol_start, _, pol_start_found, _) = locate_gene(refseq,
                                                     pol,
                                                     VERBOSE=VERBOSE)
    if not pol_start_found:
        print('ERROR: start of ' + pol + ' not found in F2!')
        return False
    if VERBOSE >= 3:
        print('OK: start of ' + pol + ' found')

    # Drop trailing nucleotides that do not make up a full codon
    downstream = refseq[pol_start:]
    n_trailing = len(downstream) % 3
    pol_prot = downstream[:len(downstream) - n_trailing].seq.translate()
    if not check_start_aminoacid(pol_prot, pol, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(pol_prot, pol, VERBOSE=VERBOSE):
        return False

    return True
def check_F2(refseq, spec, VERBOSE=0):
    '''Check a consensus of fragment F2: end of gag, start of pol

    NOTE: an earlier definition of check_F2 exists higher up in this file;
    this later one replaces it when the module is loaded.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F2' to name the fragment for the
                   length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    fragment_name = 'F2'+spec
    if not check_length_fragment(refseq, fragment_name, VERBOSE=VERBOSE, tolerance=80):
        return False

    # --- gag: the fragment must contain its end ---
    name_gag = 'gag'
    (_, end_gag, _, found_end_gag) = locate_gene(refseq, name_gag, VERBOSE=VERBOSE)
    if not found_end_gag:
        print('ERROR: end of '+name_gag+' not found in F2!')
        return False
    if VERBOSE >= 3:
        print('OK: end of '+name_gag+' found')

    # Trim from the left so that what remains up to the gag end is in frame
    seq_to_gag_end = refseq[:end_gag]
    n_skip = len(seq_to_gag_end) % 3
    prot_gag = seq_to_gag_end[n_skip:].seq.translate()
    if not check_has_end(prot_gag, name_gag, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(prot_gag, name_gag, VERBOSE=VERBOSE):
        return False

    # --- pol: the fragment must contain its start ---
    name_pol = 'pol'
    (start_pol, _, found_start_pol, _) = locate_gene(refseq, name_pol, VERBOSE=VERBOSE)
    if not found_start_pol:
        print('ERROR: start of '+name_pol+' not found in F2!')
        return False
    if VERBOSE >= 3:
        print('OK: start of '+name_pol+' found')

    # Trim from the right so that only whole codons are translated
    seq_from_pol_start = refseq[start_pol:]
    n_keep = len(seq_from_pol_start) - len(seq_from_pol_start) % 3
    prot_pol = seq_from_pol_start[:n_keep].seq.translate()
    if not check_start_aminoacid(prot_pol, name_pol, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(prot_pol, name_pol, VERBOSE=VERBOSE):
        return False

    return True
def check_F3(refseq, spec, VERBOSE=0):
    '''Check a consensus of fragment F3, which lies within pol

    NOTE: check_F3 is defined a second time further down in this file; the
    later definition is the one bound to the name once the module is loaded.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F3' to name the fragment for the
                   length check; 'bo' selects the check for the end of pol
       VERBOSE (int): verbosity level

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    if not check_length_fragment(refseq,
                                 'F3' + spec,
                                 VERBOSE=VERBOSE,
                                 tolerance=50):
        return False

    genename = 'pol'

    # Only F3bo should contain the end of pol; anything else has only the
    # middle of the gene (it's all pol!)
    if spec != 'bo':
        # No gene edge is available to fix the frame, so the fragment is
        # accepted if any of the three reading frames has no stop codon
        open_frame_found = False
        for offset in xrange(3):
            shifted = refseq[offset:]
            n_whole = len(shifted) - (len(shifted) % 3)
            prot = shifted[:n_whole].seq.translate()
            if check_has_premature_stops_noend(prot, genename, VERBOSE=0):
                open_frame_found = True
                break

        if not open_frame_found:
            if VERBOSE >= 1:
                print('ERROR: ' + genename + ' has premature stop codons in all reading frames!')
            return False

        if VERBOSE >= 3:
            print('OK: ' + genename + ' has no premature stop codons')
        return True

    (_, pol_end, _, pol_end_found) = locate_gene(refseq,
                                                 genename,
                                                 VERBOSE=VERBOSE)
    if not pol_end_found:
        print('ERROR: end of ' + genename + ' not found in F3!')
        return False
    if VERBOSE >= 3:
        print('OK: end of ' + genename + ' found')

    # Drop leading nucleotides so that the stretch up to the pol end is a
    # whole number of codons
    upstream = refseq[:pol_end]
    prot = upstream[len(upstream) % 3:].seq.translate()
    if not check_has_end(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
        return False

    return True
def check_F3(refseq, spec, VERBOSE=0):
    '''Check a consensus of fragment F3, which lies within pol

    NOTE: an earlier definition of check_F3 exists higher up in this file;
    this later one replaces it when the module is loaded.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F3' to name the fragment for the
                   length check; 'bo' selects the check for the end of pol
       VERBOSE (int): verbosity level

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    fragment_name = 'F3'+spec
    if not check_length_fragment(refseq, fragment_name, VERBOSE=VERBOSE, tolerance=50):
        return False

    genename = 'pol'

    if spec == 'bo':
        # F3bo is the only variant expected to contain the end of pol
        (_, end_pol, _, found_end_pol) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if not found_end_pol:
            print('ERROR: end of '+genename+' not found in F3!')
            return False
        if VERBOSE >= 3:
            print('OK: end of '+genename+' found')

        # Trim from the left so that what remains up to the pol end is in frame
        seq_to_end = refseq[:end_pol]
        n_skip = len(seq_to_end) % 3
        prot = seq_to_end[n_skip:].seq.translate()
        if not check_has_end(prot, genename, VERBOSE=VERBOSE):
            return False
        if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
            return False
        return True

    # Any other variant has only the middle of the gene (it's all pol!), so
    # there is no edge to fix the frame: try all 3 reading frames and accept
    # the first one without stop codons
    for frame in xrange(3):
        seq_in_frame = refseq[frame:]
        n_keep = len(seq_in_frame) - (len(seq_in_frame) % 3)
        prot = seq_in_frame[:n_keep].seq.translate()
        if not check_has_premature_stops_noend(prot, genename, VERBOSE=0):
            continue
        if VERBOSE >= 3:
            print('OK: '+genename+' has no premature stop codons')
        return True

    if VERBOSE >= 1:
        print('ERROR: '+genename+' has premature stop codons in all reading frames!')
    return False
def _translate_downstream(refseq, start):
    '''Translate refseq from start to the end of the record

    A trailing partial codon is dropped before translating.

    Returns:
       (gene, prot, first_stop): the nucleotide sequence that was translated,
       its translation, and the index of the first '*' in the translation
       (-1 if there is none).
    '''
    gene = refseq.seq[start:]
    gene = gene[:len(gene) - (len(gene) % 3)]
    prot = gene.translate()
    return (gene, prot, prot.find('*'))


def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The checks run in three groups:
      - single-exon genes (gag, pol, env, vpr, vpu): both edges found, length
        close to the HXB2 gene, complete codons, start amino acid, final stop,
        no premature stops. A stop codon up to 90 nucleotides away from the
        expected end is tolerated, and a bad vpu start codon is only reported.
      - vif: same checks, but the stop codon may be up to 3 codons downstream.
      - two-exon genes (tat, rev): each exon is checked on its own, then the
        two exons are joined and checked as a whole gene. The second exon is
        searched starting 2000 nucleotides after the end of the first one.

    Parameters:
       refseq: sequence record of the genome; it is sliced, its .seq is
               translated, and it is passed to FeatureLocation.extract
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    from Bio.SeqFeature import FeatureLocation

    # Check single-exon genes
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end,
         start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: '+genename+' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of '+genename+' found')

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                         VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        gene = refseq[start: end].seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            (gene_new, prot_new, end_new) = _translate_downstream(refseq, start)
            if end_new == -1:
                # no stop codon at all downstream of the start
                return False
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print(genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            if genename != 'vpu':
                return False
            else:
                print('ERROR IN VPU STARTING CODON, CONTINUING!')

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit shorter or longer
            (gene_new, prot_new, end_new) = _translate_downstream(refseq, start)
            if end_new == -1:
                # no stop codon at all downstream of the start
                return False
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print(genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print(genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print('ERROR: '+genename+' not found in genomewide!')
        return False
    elif VERBOSE >= 3:
        print('OK: start and end of '+genename+' found')

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    gene = refseq[start: end].seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2: try 1 to 3 extra codons
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print('WARNING: '+genename+' actually ends '+str(nc)+' codons downstream')
                break
        else:
            print('ERROR: '+genename+' does not end, not even slightly downstream')
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # First exon: must be found, have the right length and start
        genename = genename_whole+'1'
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: '+genename+' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of '+genename+' found')

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                         VERBOSE=VERBOSE, maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start: end]
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        prot = geneseq.seq.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # Second exon: searched only from 2000 nucleotides past the first one
        genename = genename_whole+'2'
        (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: '+genename+' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of '+genename+' found')

        # Convert back to coordinates in the whole refseq
        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                         VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        prot = geneseq.seq.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False

            # rev2 can end a bit early
            end_new = prot.rfind('*')
            if end_new != -1:
                if len(prot) - 1 - end_new < 20:
                    print('REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!')
                    prot = prot[:end_new + 1]
                    end = start + frame + 3 * (end_new + 1)
                else:
                    return False
            else:
                # rev2 can also end quite a bit late. Keep reading in the
                # same frame used for prot above, so that prot_new is a
                # continuation of prot.
                (_, prot_new, end_new) = _translate_downstream(refseq, start + frame)

                # end_new == -1 means no stop codon was found at all: that
                # is a failure, not a late end
                if (end_new != -1) and ((start + 3 * end_new) - end < 200):
                    print('REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!')
                    prot = prot_new[:end_new + 1]
                    end = start + frame + 3 * (end_new + 1)
                else:
                    return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # Whole gene: join the two exons and check the spliced product
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        gene = gene_loc.extract(refseq).seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def check_F6(refseq, spec, VERBOSE=0):
    '''Check fragment F6: end of env, tat2, rev2

    env must have its end in the fragment; tat2 and rev2 (the second exons of
    tat and rev) must be found with both edges. The env stop codon may lie up
    to 200 nucleotides before or after the expected end; rev2 may end up to
    20 amino acids early or, if it has no stop at all, somewhat late.
    Premature stops in tat2 are reported but do not fail the check.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F6' to name the fragment for the
                   length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    check = check_length_fragment(refseq, 'F6'+spec, VERBOSE=VERBOSE, tolerance=50)
    if not check:
        return False

    # Check env (there should be end)
    genename = 'env'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not end_found):
        print('ERROR: end of '+genename+' not found in F6!')
        return False
    elif VERBOSE >= 3:
        print('OK: end of '+genename+' found')

    geneseq = refseq[:end]
    gene_HXB2 = get_gene_HXB2(genename)
    frame = get_frame(geneseq, gene_HXB2, genename)
    geneseq = geneseq[frame:]
    geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)]
    prot = geneseq.seq.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        # env can end a bit early or late: look for the first stop codon in
        # the same frame over the whole fragment
        gene_new = refseq.seq[frame:]
        gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
        prot_new = gene_new.translate()
        end_new = prot_new.find('*')
        if end_new == -1:
            # no stop codon anywhere in this frame: env does not end
            return False

        end_diff = (frame + 3 * end_new) - end
        if 0 < end_diff < 200:
            print('ENV ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!')
            prot = prot_new[:end_new + 1]
        elif -200 < end_diff < 0:
            print('ENV ENDS '+str(len(prot) - 1 - end_new)+' AMINO ACIDS UPSTREAM!')
            prot = prot_new[:end_new + 1]
        else:
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print(prot)
        return False

    # Check tat2 (second exon of tat, should be complete)
    genename = 'tat2'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print('ERROR: '+genename+' not found in F6!')
        return False
    elif VERBOSE >= 3:
        print('OK: start and end of '+genename+' found')

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    # The exon is trimmed from the left, so the reading frame is fixed by
    # its end
    geneseq = refseq[start: end]
    geneseq = geneseq[len(geneseq) % 3:]
    prot = geneseq.seq.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Premature stops in tat2 are reported only, not treated as a failure
    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print('ERROR IN TAT2 PREMATURE STOPS, CONTINUING!')

    # Check rev2 (second exon of rev, should be complete)
    genename = 'rev2'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print('ERROR: '+genename+' not found in F6!')
        return False
    elif VERBOSE >= 3:
        print('OK: start and end of '+genename+' found')

    # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=45)
    if not check:
        return False

    geneseq = refseq[start: end]
    geneseq = geneseq[len(geneseq) % 3:]
    prot = geneseq.seq.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        # rev2 can end a bit early
        end_new = prot.rfind('*')
        if end_new != -1:
            if len(prot) - 1 - end_new < 20:
                print('REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!')
                prot = prot[:end_new + 1]
            else:
                return False
        else:
            # rev2 can also end quite a bit late
            gene_new = refseq.seq[start:]
            gene_new = gene_new[(end - start) % 3:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')

            # end_new == -1 means no stop codon was found at all: that is a
            # failure, not a late end
            if (end_new != -1) and ((start + 3 * end_new) - end < 200):
                print('REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!')
                prot = prot_new[:end_new + 1]
            else:
                return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    return True
def check_F5(refseq, spec, VERBOSE=0):
    '''Check fragment F5: start of env, plus vpu for the 'ao' variant

    The spec 'a+bo' is treated as 'bo'. env must have its start in the
    fragment. vpu is checked only when the (inner) spec is 'ao'; there, a
    wrong vpu start codon is reported but does not fail the check.

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): fragment variant; after the 'a+bo' -> 'bo' mapping it is
                   appended to 'F5' to name the fragment for the length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    spec_inner = 'bo' if spec == 'a+bo' else spec

    check = check_length_fragment(refseq, 'F5'+spec_inner, VERBOSE=VERBOSE, tolerance=70)
    if not check:
        return False

    # Check env (there should be the start)
    genename = 'env'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found):
        print('ERROR: start of '+genename+' not found in F5!')
        return False
    elif VERBOSE >= 3:
        print('OK: start of '+genename+' found')

    # Translate from the env start, dropping a trailing partial codon
    geneseq = refseq[start:]
    geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
    prot = geneseq.seq.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check vpu (should be complete in F5ao)
    if spec_inner == 'ao':
        genename = 'vpu'
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            # This message used to name F4, copied over from check_F4
            print('ERROR: '+genename+' not found in F5!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of '+genename+' found')

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                         VERBOSE=VERBOSE, maxdiff=15)
        if not check:
            return False

        gene = refseq[start: end].seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        # A wrong vpu start codon is tolerated on purpose: report and go on
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            print('ERROR IN VPU STARTING CODON, CONTINUING!')

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def _locate_complete_gene_F4(refseq, genename, maxdiff, VERBOSE=0):
    '''Locate a gene that must be complete in F4 and check its length

    Returns:
       (start, end) if both edges are found and the length is within maxdiff
       of the HXB2 gene, None otherwise.
    '''
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print('ERROR: '+genename+' not found in F4!')
        return None
    elif VERBOSE >= 3:
        print('OK: start and end of '+genename+' found')

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=maxdiff)
    if not check:
        return None

    return (start, end)


def check_F4(refseq, spec, VERBOSE=0):
    '''Check fragment F4: pol, vif, vpr, vpu, tat1, rev1, env

    The checks run in this order:
      - pol: its end must be in the fragment; the stop codon may lie up to
        200 nucleotides before or after the expected end
      - env: its start must be in the fragment
      - vif: complete; the stop codon may be up to 3 codons downstream
      - vpu: complete; a wrong start codon is reported but tolerated
      - vpr: complete
      - tat1, rev1 (first exons of tat and rev): both edges found, right
        start amino acid, no stop codons

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F4' to name the fragment for the
                   length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    check = check_length_fragment(refseq, 'F4'+spec, VERBOSE=VERBOSE, tolerance=50)
    if not check:
        return False

    # Check pol (there should be end)
    genename = 'pol'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not end_found):
        print('ERROR: end of '+genename+' not found in F4!')
        return False
    elif VERBOSE >= 3:
        print('OK: end of '+genename+' found')

    geneseq = refseq[:end]
    gene_HXB2 = get_gene_HXB2(genename)
    frame = get_frame(geneseq, gene_HXB2, genename)
    geneseq = geneseq[frame:]
    geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)]
    prot = geneseq.seq.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        # it can end a bit early or late: look for the first stop codon in
        # the same frame over the whole fragment
        gene_new = refseq.seq[frame:]
        gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
        prot_new = gene_new.translate()
        end_new = prot_new.find('*')
        if end_new == -1:
            # no stop codon anywhere in this frame: pol does not end
            return False

        end_diff = (frame + 3 * end_new) - end
        if 0 < end_diff < 200:
            print(genename.upper()+' ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!')
            prot = prot_new[:end_new + 1]
        elif -200 < end_diff < 0:
            print(genename.upper()+' ENDS '+str(len(prot) - 1 - end_new)+' AMINO ACIDS UPSTREAM!')
            prot = prot_new[:end_new + 1]
        else:
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print(prot)
        return False

    # Check env (there should be the start)
    genename = 'env'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found):
        print('ERROR: start of '+genename+' not found in F4!')
        return False
    elif VERBOSE >= 3:
        print('OK: start of '+genename+' found')

    geneseq = refseq[start:]
    geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
    prot = geneseq.seq.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check vif (should be complete)
    genename = 'vif'
    edges = _locate_complete_gene_F4(refseq, genename, 15, VERBOSE=VERBOSE)
    if edges is None:
        return False
    (start, end) = edges

    gene = refseq[start: end].seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # The end is checked silently first, because a missing stop is not yet
    # an error for vif
    check = check_has_end(prot, genename, VERBOSE=0)
    if check:
        if VERBOSE >= 3:
            print('OK: '+genename+' ends with a *')
    else:
        # Vif tends to be a bit longer than in HXB2: try 1 to 3 extra codons
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print('WARNING: '+genename+' actually ends '+str(nc)+' codons downstream')
                break
        else:
            print('ERROR: '+genename+' does not end, not even slightly downstream')
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check vpu (should be complete)
    genename = 'vpu'
    edges = _locate_complete_gene_F4(refseq, genename, 15, VERBOSE=VERBOSE)
    if edges is None:
        return False
    (start, end) = edges

    gene = refseq[start: end].seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    # A wrong vpu start codon is tolerated on purpose: report and go on
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print('ERROR IN VPU STARTING CODON, CONTINUING!')

    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check vpr (should be complete)
    genename = 'vpr'
    edges = _locate_complete_gene_F4(refseq, genename, 15, VERBOSE=VERBOSE)
    if edges is None:
        return False
    (start, end) = edges

    gene = refseq[start: end].seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check tat1 and rev1 (first exons of tat and rev, should be complete).
    # They are first exons, so no final stop codon is looked for.
    for (genename, maxdiff) in (('tat1', 35), ('rev1', 15)):
        edges = _locate_complete_gene_F4(refseq, genename, maxdiff, VERBOSE=VERBOSE)
        if edges is None:
            return False
        (start, end) = edges

        geneseq = refseq[start: end]
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        prot = geneseq.seq.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def check_F1(refseq, spec, VERBOSE=0):
    '''Check a consensus of fragment F1: complete gag, start of pol

    Parameters:
       refseq: sequence of the fragment; it is sliced and its .seq translated
       spec (str): suffix appended to 'F1' to name the fragment for the
                   length check
       VERBOSE (int): verbosity level; 'OK' messages are printed at >= 3

    Returns:
       True if every check passes, False at the first one that fails.
    '''
    fragment_name = 'F1'+spec
    if not check_length_fragment(refseq, fragment_name, VERBOSE=VERBOSE, tolerance=50):
        return False

    # --- gag: both edges must be in the fragment ---
    name_gag = 'gag'
    (start_gag, end_gag,
     found_start_gag, found_end_gag) = locate_gene(refseq, name_gag, VERBOSE=VERBOSE)
    if not (found_start_gag and found_end_gag):
        print('ERROR: '+name_gag+' not found in F1!')
        return False
    if VERBOSE >= 3:
        print('OK: start and end of '+name_gag+' found')

    # The length must be within 30 nucleotides of the HXB2 gene
    len_gag_HXB2 = len(get_gene_HXB2(name_gag))
    if not check_has_similar_length(end_gag - start_gag, len_gag_HXB2, name_gag,
                                    VERBOSE=VERBOSE, maxdiff=30):
        return False

    seq_gag = refseq[start_gag: end_gag].seq
    if not check_has_complete_codons(seq_gag, name_gag, VERBOSE=VERBOSE):
        return False

    prot_gag = seq_gag.translate()
    if not check_start_aminoacid(prot_gag, name_gag, VERBOSE=VERBOSE):
        return False
    if not check_has_end(prot_gag, name_gag, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(prot_gag, name_gag, VERBOSE=VERBOSE):
        return False

    # --- pol: the fragment must contain its start ---
    name_pol = 'pol'
    (start_pol, _, found_start_pol, _) = locate_gene(refseq, name_pol, VERBOSE=VERBOSE)
    if not found_start_pol:
        print('ERROR: start of '+name_pol+' not found in F1!')
        return False
    if VERBOSE >= 3:
        print('OK: start of '+name_pol+' found')

    # Trim from the right so that only whole codons are translated
    seq_from_pol_start = refseq[start_pol:]
    n_keep = len(seq_from_pol_start) - len(seq_from_pol_start) % 3
    prot_pol = seq_from_pol_start[:n_keep].seq.translate()
    if not check_start_aminoacid(prot_pol, name_pol, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(prot_pol, name_pol, VERBOSE=VERBOSE):
        return False

    return True
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)'''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::], [edges[0], None])
                    pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name+suff, output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feature_type, id=name, strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems():
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            fea = ref_ann.features[map(attrgetter('id'), ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start: end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The single-exon genes (gag, pol, env, vpr, vpu), then vif, then the
    two-exon genes (tat, rev) are located and run through the helper checks
    (length similar to HXB2, complete codons, start amino acid, end,
    premature stops). Some deviations are tolerated and only reported, see
    the comments in the body.

    Parameters:
      refseq: genomewide consensus; it is sliced and its .seq is translated
      VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
      True if all genes pass, False at the first failed check.
    '''
    # Check single-exon genes
    # Maximal length difference to HXB2 tolerated for each gene
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end, start_found, end_found) = locate_gene(refseq,
                                                           genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Translate from the start to the end of the sequence and take
            # the first stop codon as the candidate end
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            # Distance between the end of that stop codon and the located end
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new -
                    1) + ' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # A wrong start codon is tolerated for vpu only
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new -
                    1) + ' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper() + ' ENDS ' + str(
                    end_new + 1 -
                    (end - start) // 3) + ' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: ' + genename + ' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start,
                                     len(gene_HXB2),
                                     genename,
                                     VERBOSE=VERBOSE,
                                     maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start:end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2
        # Try extending by 1, 2 and 3 codons; the for/else fails only if
        # none of the extensions passes the end check
        for nc in xrange(1, 4):
            gene_ext = refseq[start:end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: ' + genename + ' actually ends ' + str(
                        nc) + ' codons downstream'
                break
        else:
            print 'ERROR: ' + genename + ' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # First exon: must be found, have a similar length and start properly
        genename = genename_whole + '1'
        (start, end, start_found, end_found) = locate_gene(refseq,
                                                           genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start:end]
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # Second exon: searched only from 2000 nucleotides after the end of
        # the first exon; coordinates are shifted back further below
        genename = genename_whole + '2'
        (start, end, start_found,
         end_found) = locate_gene(refseq[end_exon1 + 2000:],
                                  genename,
                                  VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False

            else:
                # rev2 can end a bit early
                end_new = prot.rfind('*')
                if end_new != -1:
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS ' + str(len(prot) - end_new -
                                                 1) + ' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    # NOTE(review): the offset used here is (end - start) % 3
                    # whereas prot above was translated with the offset from
                    # get_frame -- confirm the two always agree
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')

                    # find() gives -1 when there is no stop codon at all;
                    # that must fail instead of passing the distance test
                    if (end_new != -1) and ((start + 3 * end_new) - end < 200):
                        print 'REV2 ENDS ' + str(end_new - len(prot) + 1
                                                 ) + ' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # Whole gene: join the two exons and check the spliced sequence
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def check_F6(refseq, spec, VERBOSE=0):
    '''Check fragment F6: end of env, tat2, rev2

    Parameters:
      refseq: fragment sequence; it is sliced and its .seq is translated
      spec: fragment specifier, appended to 'F6' for the length check
      VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
      True if all checks pass, False at the first fatal failure. Premature
      stops in tat2 are reported but are not fatal.
    '''
    check = check_length_fragment(refseq,
                                  'F6' + spec,
                                  VERBOSE=VERBOSE,
                                  tolerance=50)
    if not check:
        return False

    # Check env (there should be end)
    genename = 'env'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not end_found):
        print 'ERROR: end of ' + genename + ' not found in F6!'
        return False
    elif VERBOSE >= 3:
        print 'OK: end of ' + genename + ' found'

    # Translate from the fragment start up to the located end of env, in
    # the reading frame returned by get_frame
    geneseq = refseq[:end]
    gene_HXB2 = get_gene_HXB2(genename)
    frame = get_frame(geneseq, gene_HXB2, genename)
    geneseq = geneseq[frame:]
    geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)]
    gene = geneseq.seq
    prot = gene.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    # env can end a bit early or late
    if not check:
        # Translate the whole fragment in the same frame and take the first
        # stop codon as the candidate end
        gene_new = refseq.seq[frame:]
        gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
        prot_new = gene_new.translate()
        end_new = prot_new.find('*')

        end_diff = (frame + 3 * end_new) - end
        if 0 < end_diff < 200:
            print 'ENV ENDS ' + str(end_new - len(prot) +
                                    1) + ' AMINO ACIDS DOWNSTREAM!'
            prot = prot_new[:end_new + 1]
        elif -200 < end_diff < 0:
            print 'ENV ENDS ' + str(len(prot) - 1 -
                                    end_new) + ' AMINO ACIDS UPSTREAM!'
            prot = prot_new[:end_new + 1]
        else:
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print prot
        return False

    # Check tat2 (second exon of tat, should be complete)
    genename = 'tat2'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: ' + genename + ' not found in F6!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start,
                                     len(gene_HXB2),
                                     genename,
                                     VERBOSE=VERBOSE,
                                     maxdiff=15)
    if not check:
        return False

    # Drop leading nucleotides so that the exon ends on a codon boundary
    geneseq = refseq[start:end]
    geneseq = geneseq[len(geneseq) % 3:]
    gene = geneseq.seq
    prot = gene.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Premature stops in tat2 are only reported, the check goes on
    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        print 'ERROR IN TAT2 PREMATURE STOPS, CONTINUING!'

    # Check rev2 (second exon of rev, should be complete)
    genename = 'rev2'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: ' + genename + ' not found in F6!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start,
                                     len(gene_HXB2),
                                     genename,
                                     VERBOSE=VERBOSE,
                                     maxdiff=45)
    if not check:
        return False

    geneseq = refseq[start:end]
    geneseq = geneseq[len(geneseq) % 3:]
    gene = geneseq.seq
    prot = gene.translate()
    check = check_has_end(prot, genename, VERBOSE=VERBOSE)
    if not check:
        # rev2 can end a bit early
        end_new = prot.rfind('*')
        if end_new != -1:
            if len(prot) - 1 - end_new < 20:
                print 'REV2 ENDS ' + str(len(prot) - end_new -
                                         1) + ' AMINO ACIDS UPSTREAM!'
                prot = prot[:end_new + 1]
            else:
                return False
        else:
            # rev2 can also end quite a bit late
            gene_new = refseq.seq[start:]
            gene_new = gene_new[(end - start) % 3:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')

            # find() gives -1 when there is no stop codon at all; that must
            # fail instead of passing the distance test
            if (end_new != -1) and ((start + 3 * end_new) - end < 200):
                print 'REV2 ENDS ' + str(end_new - len(prot) +
                                         1) + ' AMINO ACIDS DOWNSTREAM!'
                prot = prot_new[:end_new + 1]
            else:
                return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    return True
def check_F5(refseq, spec, VERBOSE=0):
    '''Check fragment F5: env

    Parameters:
      refseq: fragment sequence; it is sliced and its .seq is translated
      spec: fragment specifier; 'a+bo' is treated as 'bo', anything else is
        used as is. vpu is checked in addition when the specifier is 'ao'
      VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
      True if all checks pass, False at the first fatal failure. A wrong
      vpu start codon is reported but is not fatal.
    '''
    if spec == 'a+bo':
        spec_inner = 'bo'
    else:
        spec_inner = spec

    check = check_length_fragment(refseq,
                                  'F5' + spec_inner,
                                  VERBOSE=VERBOSE,
                                  tolerance=70)
    if not check:
        return False

    # Check env (there should be the start)
    genename = 'env'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found):
        print 'ERROR: start of ' + genename + ' not found in F5!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start of ' + genename + ' found'

    # Translate from the gene start to the fragment end, dropping the
    # trailing nucleotides that do not fill a whole codon
    geneseq = refseq[start:]
    geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
    gene = geneseq.seq
    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check vpu (should be complete in F5ao)
    if spec_inner == 'ao':
        genename = 'vpu'
        (start, end, start_found, end_found) = locate_gene(refseq,
                                                           genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            # The message names this fragment (it used to say F4)
            print 'ERROR: ' + genename + ' not found in F5!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start:end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        # A wrong vpu start codon is reported only, on purpose
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def check_F4(refseq, spec, VERBOSE=0):
    '''Check fragment F4: pol, vif, vpr, vpu, tat1, rev1, env

    The end of pol, the start of env, the complete vif, vpu and vpr genes
    and the first exons of tat and rev are checked, in this order.

    Parameters:
      refseq: fragment sequence; it is sliced and its .seq is translated
      spec: fragment specifier, appended to 'F4' for the length check
      VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
      True if all checks pass, False at the first fatal failure. A wrong
      vpu start codon is reported but is not fatal.
    '''
    if not check_length_fragment(refseq, 'F4' + spec,
                                 VERBOSE=VERBOSE, tolerance=50):
        return False

    # ---- pol: only its end is expected in this fragment ----
    genename = 'pol'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not end_found:
        print 'ERROR: end of ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: end of ' + genename + ' found'

    # Translate from the fragment start to the located end, in the reading
    # frame returned by get_frame
    upstream = refseq[:end]
    gene_HXB2 = get_gene_HXB2(genename)
    frame = get_frame(upstream, gene_HXB2, genename)
    upstream = upstream[frame:]
    upstream = upstream[:len(upstream) - (len(upstream) % 3)]
    prot = upstream.seq.translate()
    if not check_has_end(prot, genename, VERBOSE=VERBOSE):
        # it can end a bit early or late: translate the whole fragment in
        # the same frame and use its first stop codon
        orf = refseq.seq[frame:]
        orf = orf[:len(orf) - (len(orf) % 3)]
        prot_whole = orf.translate()
        stop_pos = prot_whole.find('*')

        shift = (frame + 3 * stop_pos) - end
        if 0 < shift < 200:
            print genename.upper() + ' ENDS ' + str(
                stop_pos - len(prot) + 1) + ' AMINO ACIDS DOWNSTREAM!'
        elif -200 < shift < 0:
            print genename.upper() + ' ENDS ' + str(
                len(prot) - 1 - stop_pos) + ' AMINO ACIDS UPSTREAM!'
        else:
            return False
        prot = prot_whole[:stop_pos + 1]

    if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
        print prot
        return False

    # ---- env: only its start is expected in this fragment ----
    genename = 'env'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not start_found:
        print 'ERROR: start of ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start of ' + genename + ' found'

    downstream = refseq[start:]
    downstream = downstream[:len(downstream) - len(downstream) % 3]
    prot = downstream.seq.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE):
        return False

    # ---- vif: complete gene, may run a few codons past the located end ----
    genename = 'vif'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not (start_found and end_found):
        print 'ERROR: ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    if not check_has_similar_length(end - start, len(gene_HXB2), genename,
                                    VERBOSE=VERBOSE, maxdiff=15):
        return False

    gene = refseq[start:end].seq
    if not check_has_complete_codons(gene, genename, VERBOSE=VERBOSE):
        return False

    prot = gene.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        return False

    # The end check runs silently, the outcome is reported here instead
    if check_has_end(prot, genename, VERBOSE=0):
        if VERBOSE >= 3:
            print 'OK: ' + genename + ' ends with a *'
    else:
        # Vif tends to be a bit longer than in HXB2: try 1 to 3 more codons
        for nc in xrange(1, 4):
            prot_longer = refseq[start:end + 3 * nc].seq.translate()
            if check_has_end(prot_longer, genename, VERBOSE=0):
                prot = prot_longer
                if VERBOSE:
                    print 'WARNING: ' + genename + ' actually ends ' + str(
                        nc) + ' codons downstream'
                break
        else:
            print 'ERROR: ' + genename + ' does not end, not even slightly downstream'
            return False

    if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
        return False

    # ---- vpu: complete gene, a wrong start codon is tolerated ----
    genename = 'vpu'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not (start_found and end_found):
        print 'ERROR: ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    if not check_has_similar_length(end - start, len(gene_HXB2), genename,
                                    VERBOSE=VERBOSE, maxdiff=15):
        return False

    gene = refseq[start:end].seq
    if not check_has_complete_codons(gene, genename, VERBOSE=VERBOSE):
        return False

    prot = gene.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        # reported only, the remaining checks still run
        print 'ERROR IN VPU STARTING CODON, CONTINUING!'

    if not check_has_end(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
        return False

    # ---- vpr: complete gene ----
    genename = 'vpr'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not (start_found and end_found):
        print 'ERROR: ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    if not check_has_similar_length(end - start, len(gene_HXB2), genename,
                                    VERBOSE=VERBOSE, maxdiff=15):
        return False

    gene = refseq[start:end].seq
    if not check_has_complete_codons(gene, genename, VERBOSE=VERBOSE):
        return False

    prot = gene.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_end(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops(prot, genename, VERBOSE=VERBOSE):
        return False

    # ---- tat1: first exon of tat, should be complete ----
    genename = 'tat1'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not (start_found and end_found):
        print 'ERROR: ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    if not check_has_similar_length(end - start, len(gene_HXB2), genename,
                                    VERBOSE=VERBOSE, maxdiff=35):
        return False

    # Drop the trailing nucleotides that do not fill a whole codon
    exon = refseq[start:end]
    exon = exon[:len(exon) - len(exon) % 3]
    prot = exon.seq.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE):
        return False

    # ---- rev1: first exon of rev, should be complete ----
    genename = 'rev1'
    start, end, start_found, end_found = locate_gene(refseq, genename,
                                                     VERBOSE=VERBOSE)
    if not (start_found and end_found):
        print 'ERROR: ' + genename + ' not found in F4!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    if not check_has_similar_length(end - start, len(gene_HXB2), genename,
                                    VERBOSE=VERBOSE, maxdiff=15):
        return False

    exon = refseq[start:end]
    exon = exon[:len(exon) - len(exon) % 3]
    prot = exon.seq.translate()
    if not check_start_aminoacid(prot, genename, VERBOSE=VERBOSE):
        return False
    if not check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE):
        return False

    return True
def check_F1(refseq, spec, VERBOSE=0):
    '''Check fragment F1: complete gag plus the start of pol.

    Parameters:
      refseq: sequence of the fragment; it is sliced by position and the
              .seq attribute of the slices is translated.
      spec: suffix appended to 'F1' to name the fragment in the length check.
      VERBOSE: verbosity level, forwarded to all helpers. At 3 or more the
               successful gene lookups are reported too.

    Returns:
      False as soon as one check fails, True if all of them pass.
    '''
    length_ok = check_length_fragment(refseq,
                                      'F1' + spec,
                                      VERBOSE=VERBOSE,
                                      tolerance=50)
    if not length_ok:
        return False

    # --- gag: both edges must lie within the fragment ---
    genename = 'gag'
    gag_start, gag_end, has_start, has_end = locate_gene(refseq,
                                                         genename,
                                                         VERBOSE=VERBOSE)
    if not (has_start and has_end):
        print 'ERROR: ' + genename + ' not found in F1!'
        return False
    if VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    # Length must be close to the HXB2 version of the gene
    hxb2_gene = get_gene_HXB2(genename)
    if not check_has_similar_length(gag_end - gag_start,
                                    len(hxb2_gene),
                                    genename,
                                    VERBOSE=VERBOSE,
                                    maxdiff=30):
        return False

    gag_dna = refseq[gag_start:gag_end].seq
    if not check_has_complete_codons(gag_dna, genename, VERBOSE=VERBOSE):
        return False

    # Protein-level checks, run in order and stopping at the first failure
    gag_prot = gag_dna.translate()
    gag_checks = (check_start_aminoacid,
                  check_has_end,
                  check_has_premature_stops)
    if not all(run_check(gag_prot, genename, VERBOSE=VERBOSE)
               for run_check in gag_checks):
        return False

    # --- pol: only the start is expected in this fragment ---
    genename = 'pol'
    pol_start, _, has_start, _ = locate_gene(refseq,
                                             genename,
                                             VERBOSE=VERBOSE)
    if not has_start:
        print 'ERROR: start of ' + genename + ' not found in F1!'
        return False
    if VERBOSE >= 3:
        print 'OK: start of ' + genename + ' found'

    # Drop the trailing incomplete codon, the gene runs off the fragment
    pol_part = refseq[pol_start:]
    n_complete = len(pol_part) - len(pol_part) % 3
    pol_prot = pol_part[:n_complete].seq.translate()

    if not check_start_aminoacid(pol_prot, genename, VERBOSE=VERBOSE):
        return False

    if not check_has_premature_stops_noend(pol_prot, genename,
                                           VERBOSE=VERBOSE):
        return False

    return True
# Example #16
# 0
def annotate_sequence(seqrecord,
                      additional_edges={},
                      additional_features=['chunk'],
                      VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)'''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges
    }
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::],
                                                 [edges[0], None])
                    pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat,
                                                           edges,
                                                           min_distance=1)
                locations = [
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems(
    ):
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            fea = ref_ann.features[map(attrgetter('id'),
                                       ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord,
                                                seq,
                                                score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start:end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)