예제 #1
0
def main_gradient_2_sans_proj(u_p, v_p, Y, u_, v_, N, M, lambda_, beta_u,
                              beta_v, lambda_1, lambda_2, dt):
    """Iterate the unprojected "gradient 2" noisy update and report overlaps.

    Every step subtracts a gradient term (``f.gradient_u_2`` /
    ``f.gradient_v_2``) and a linear shrinkage term from the current
    iterate and adds zero-mean Gaussian noise with standard deviation
    ``sqrt(dt)``, scaled by ``sqrt(2 / (lambda * beta))``.  Both updates of
    a step are computed from the previous ``(u_p, v_p)`` pair.

    The number of steps is read from the module-level name ``iteration``.
    With zero steps the overlaps of the starting iterates are reported
    (the previous version failed on an unbound local in that case).

    Args:
        u_p, v_p: Starting iterates.
            NOTE(review): presumably 1-D tensors of length ``N`` / ``M``
            -- confirm against the caller.
        Y: Observation forwarded to the gradient helpers.
        u_, v_: Reference vectors the final iterates are compared with.
        N, M: Sizes of the noise vectors; also forwarded to the helpers.
        lambda_: Forwarded to the gradient helpers.
        beta_u, beta_v: Enter the noise scale and the shrinkage term.
        lambda_1, lambda_2: Enter the drift, noise and shrinkage scales.
        dt: Step size; it is passed to ``torch.sqrt``.

    Returns:
        Tuple ``(res_u, res_v)`` with the values returned by ``f.overlap``
        for the final ``u`` and ``v`` iterates.
    """
    # Loop-invariant factors: computed once instead of on every step.
    sqrt_dt = torch.sqrt(dt)
    noise_scale_u = torch.sqrt(2 / (lambda_1 * beta_u))
    noise_scale_v = torch.sqrt(2 / (lambda_2 * beta_v))
    shrink_u = (N - 1) / (N * lambda_1 * beta_u)
    shrink_v = (M - 1) / (M * lambda_2 * beta_v)

    for _ in range(iteration):
        # u update: the gradient is evaluated before the noise is drawn so
        # the order of random draws matches the original implementation.
        drift_u = (1 / lambda_1) * f.gradient_u_2(N, M, u_p, v_p, Y,
                                                  lambda_) * dt
        noise_u = noise_scale_u * torch.empty(N).normal_(mean=0, std=sqrt_dt)
        u_n = u_p - drift_u + noise_u - shrink_u * u_p * dt

        # v update: still uses the *previous* u iterate.
        drift_v = 1 / lambda_2 * f.gradient_v_2(N, M, u_p, v_p, Y,
                                                lambda_) * dt
        noise_v = noise_scale_v * torch.empty(M).normal_(mean=0, std=sqrt_dt)
        v_n = v_p - drift_v + noise_v - shrink_v * v_p * dt

        # Re-assign for the next step
        u_p, v_p = u_n, v_n

    # u_p / v_p hold the latest iterates and are always bound, even when
    # the loop body never ran.
    res_u = f.overlap(u_, u_p, N)
    res_v = f.overlap(v_, v_p, M)
    print("g2_u: ", res_u)
    print("g2_v: ", res_v)

    return (res_u, res_v)
예제 #2
0
def main_gradient_1_avec_proj(u_p, v_p, Y, u_, v_, N, M, lambda_, beta_u,
                              beta_v, lambda_1, lambda_2, dt):
    """Iterate the projected "gradient 1" noisy update and report overlaps.

    Every step contracts both the gradient (``f.gradient_u_1`` /
    ``f.gradient_v_1``) and a freshly drawn Gaussian noise vector with the
    result of ``f.proj`` for the current iterate (``torch.tensordot`` over
    one dimension), then subtracts a linear shrinkage term.  Both updates
    of a step are computed from the previous ``(u_p, v_p)`` pair.

    The number of steps is read from the module-level name ``iteration``.
    With zero steps the overlaps of the starting iterates are reported
    (the previous version failed on an unbound local in that case).

    Args:
        u_p, v_p: Starting iterates.
            NOTE(review): presumably 1-D tensors of length ``N`` / ``M``
            -- confirm against the caller.
        Y: Observation forwarded to the gradient helpers.
        u_, v_: Reference vectors the final iterates are compared with.
        N, M: Sizes of the noise vectors; also forwarded to the helpers.
        lambda_: Forwarded to the gradient helpers.
        beta_u, beta_v: Enter the noise scale and the shrinkage term.
        lambda_1, lambda_2: Enter the drift, noise and shrinkage scales.
        dt: Step size; it is passed to ``torch.sqrt``.

    Returns:
        Tuple ``(res_u, res_v)`` with the values returned by ``f.overlap``
        for the final ``u`` and ``v`` iterates.
    """
    # Loop-invariant factors: computed once instead of on every step.
    sqrt_dt = torch.sqrt(dt)
    noise_scale_u = torch.sqrt(2 / (lambda_1 * beta_u))
    noise_scale_v = torch.sqrt(2 / (lambda_2 * beta_v))
    shrink_u = (N - 1) / (N * lambda_1 * beta_u)
    shrink_v = (M - 1) / (M * lambda_2 * beta_v)

    for _ in range(iteration):
        # u update: helper calls and the random draw happen in the same
        # order as in the original implementation.
        drift_u = (1 / lambda_1) * torch.tensordot(
            f.proj(u_p, N), f.gradient_u_1(N, M, u_p, v_p, Y, lambda_), 1) * dt
        noise_u = noise_scale_u * torch.tensordot(
            f.proj(u_p, N),
            torch.empty(N).normal_(mean=0, std=sqrt_dt), 1)
        u_n = u_p - drift_u + noise_u - shrink_u * u_p * dt

        # v update: still uses the *previous* u iterate.
        drift_v = (1 / lambda_2) * torch.tensordot(
            f.proj(v_p, M), f.gradient_v_1(N, M, u_p, v_p, Y, lambda_), 1) * dt
        noise_v = noise_scale_v * torch.tensordot(
            f.proj(v_p, M),
            torch.empty(M).normal_(mean=0, std=sqrt_dt), 1)
        v_n = v_p - drift_v + noise_v - shrink_v * v_p * dt

        # Re-assign for the next step
        u_p, v_p = u_n, v_n

    # u_p / v_p hold the latest iterates and are always bound, even when
    # the loop body never ran.
    res_u = f.overlap(u_, u_p, N)
    res_v = f.overlap(v_, v_p, M)
    print("g1_u_proj: ", res_u)
    print("g1_v_proj: ", res_v)

    return (res_u, res_v)
예제 #3
0
def cmp_run_overlaps(PRa, ka, PRb, kb):
    """Compare the hit clusters of two runs and return overlap fractions.

    The cluster lists are looked up in ``PRa.hit_clusters`` and
    ``PRb.hit_clusters`` under the key
    ``(id_a, id_b, linkage_type, alpha, cut, nd)`` built from ``ka`` and
    ``kb``.  A cluster ``c`` is grouped by ``(str(c[0]), str(c[3]))`` and
    contributes ``(c[2] - c[1]) + (c[5] - c[4])`` to its run's coverage.
    A cluster of run B is compared only with clusters of run A in the same
    group; a pair counts when ``util.overlap`` is positive for both the
    ``[1], [2]`` and the ``[4], [5]`` intervals.

    Diagnostic lines are printed for each counted pair and for the final
    totals.  The print calls use a single pre-formatted argument so they
    produce the same text on Python 2 and Python 3.

    Args:
        PRa, PRb: Objects exposing a ``hit_clusters`` mapping.
        ka, kb: Mappings providing the six key fields listed above.

    Returns:
        Tuple ``(combined, frac_a, frac_b)``:
        ``(overlap_a + overlap_b) / (coverage_a + coverage_b)``,
        ``overlap_a / coverage_a`` and ``overlap_b / coverage_b``.
        A fraction whose denominator is zero is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    key_fields = ('id_a', 'id_b', 'linkage_type', 'alpha', 'cut', 'nd')
    clusts_a = PRa.hit_clusters[tuple(ka[field] for field in key_fields)]
    clusts_b = PRb.hit_clusters[tuple(kb[field] for field in key_fields)]

    def _fraction(numerator, denominator):
        # Empty runs have no coverage; report "no overlap" rather than crash.
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    chr_groups_a = {}

    total_coverage_a = 0
    total_coverage_b = 0
    overlap_a = 0
    overlap_b = 0

    # Index run A by group key and accumulate its coverage.
    for c in clusts_a:
        k = (str(c[0]), str(c[3]))
        chr_groups_a.setdefault(k, []).append(c)
        total_coverage_a = total_coverage_a + (c[2] - c[1]) + (c[5] - c[4])

    # Match each cluster of run B against the same-group clusters of run A.
    for c in clusts_b:
        k = (str(c[0]), str(c[3]))
        total_coverage_b = total_coverage_b + (c[2] - c[1]) + (c[5] - c[4])

        for rc in chr_groups_a.get(k, ()):
            ov_a = util.overlap((c[1], c[2]), (rc[1], rc[2]))
            ov_b = util.overlap((c[4], c[5]), (rc[4], rc[5]))
            if ov_a > 0 and ov_b > 0:
                print("%s" % (k,))
                print("%s %s -> %s" % ((c[1], c[2]), (rc[1], rc[2]), ov_a))
                print("%s %s -> %s" % ((c[4], c[5]), (rc[4], rc[5]), ov_b))
                overlap_a = overlap_a + ov_a
                overlap_b = overlap_b + ov_b

    print("%s %s %s %s" % (overlap_a, overlap_b, total_coverage_a,
                           total_coverage_b))

    return (_fraction(overlap_a + overlap_b,
                      total_coverage_a + total_coverage_b),
            _fraction(overlap_a, total_coverage_a),
            _fraction(overlap_b, total_coverage_b))
예제 #4
0
def newAnnotation(request):
    """Annotate every occurrence of the selected text, replacing overlaps.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair
    returned by ``util.findIndices(document.text, text)`` that the current
    user has not already annotated with exactly those bounds, every
    existing annotation of that user on the document that overlaps the
    span (per ``util.overlap``) is deleted and a new ``Annotation`` of the
    chosen type is saved.

    The category value ``'Delete'`` is a sentinel meaning "no new
    annotation".  It is now tested on the raw POST value *before* the
    integer conversion; previously ``int()`` ran first, so the branch was
    unreachable and ``'Delete'`` raised ``ValueError``.

    Args:
        request: Request object providing ``POST`` and ``user.id``.

    Returns:
        Whatever ``documentByAnnotator`` returns for this document and
        user.  When no text was selected, an error message is passed to it.

    Raises:
        KeyError: A required POST field is missing.
        ValueError: ``document_id`` or a non-``'Delete'`` category id is
            not an integer.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    if category == 'Delete':
        return documentByAnnotator(request, document_id, annotator_id=request.user.id, error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(request, document_id, annotator_id=request.user.id, error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    indices = util.findIndices(document.text, text)

    for ind in indices:
        begin, end = ind[0], ind[1]

        # Identical span already annotated by this user: nothing to do.
        if Annotation.objects.filter(document=document, begin_index=begin, end_index=end, annotator=annotator).exists():
            continue

        # Re-query on every pass: earlier passes may have added or deleted
        # annotations of this user.
        allNamedEntities = Annotation.objects.filter(document=document, annotator=annotator)
        for absNE in allNamedEntities:
            if util.overlap((absNE.begin_index, absNE.end_index), (begin, end)):
                absNE.delete()

        # add new one
        annotation = Annotation(document=document, annotation=text, begin_index=begin, end_index=end, annotation_type=annotation_type, annotator=annotator)
        annotation.save()

    return documentByAnnotator(request, document_id, annotator_id=request.user.id, error=None)
예제 #5
0
def run_cba(Xtr,
            Ytr,
            Xt,
            Yt,
            lb,
            support=0.20,
            confidence=0.5,
            k=None,
            log=None):
    """Fit a CBA classifier, log its metrics and return the test predictions.

    Train and test frames are built by concatenating features and labels
    column-wise and converting them with ``TransactionDB.from_DataFrame``.
    After fitting, the rule list may be truncated to its first ``k``
    entries.  Every remaining rule gets a ``covered`` attribute: the set
    of training-transaction positions ``i`` for which
    ``rule.antecedent <= transaction`` holds.

    Args:
        Xtr, Ytr: Training features and labels (concatenated with pandas).
        Xt, Yt: Test features and labels.
        lb: Object whose ``transform`` is applied to labels before the AUC.
        support, confidence: Forwarded to ``CBA``.
        k: Optional maximum number of rules to keep.
        log: Callable ``log(name, value[, index])``; when ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The list of predictions for the test set, converted to ``int``.
        (Previously nothing was returned; the sibling runners ``run_ours``
        and ``run_ids`` already return their predictions.)
    """
    txns_train = TransactionDB.from_DataFrame(pd.concat([Xtr, Ytr], axis=1))
    txns_test = TransactionDB.from_DataFrame(pd.concat([Xt, Yt], axis=1))
    cba = CBA(support=support, confidence=confidence, algorithm="m1")
    cba.fit(txns_train)

    if k is not None:
        cba.clf.rules = cba.clf.rules[:k]

    Y_pred = [int(i) for i in cba.predict(txns_test)]

    for r in cba.clf.rules:
        r.covered = {
            i for i, rd in enumerate(txns_train) if r.antecedent <= rd
        }

    if log is None:
        from logger import log
    log('cba-k', len(cba.clf.rules))
    log('cba-rules', str(cba.clf.rules))
    # One entry per rule; a plain loop because only the side effect matters.
    for i, r in enumerate(cba.clf.rules):
        log('cba-nconds', len(r), i)
    log('cba-auc', roc_auc_score(lb.transform(Yt.values),
                                 lb.transform(Y_pred)))
    log('cba-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('cba-disp', dispersion_(cba.clf.rules, average=True))
    log('cba-overlap', overlap(cba.clf.rules))
    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
예제 #6
0
def run_ours(Xtr,
             Ytr,
             Xt,
             Yt,
             lb,
             nsample,
             lambda_mode,
             q,
             sample_mode,
             k=None,
             rerun=True,
             eps=0.01,
             min_recall_per_class=0.8,
             log=None):
    """Train a ``DecisionSet``, log its metrics and return test predictions.

    All log entries are prefixed with ``'ours0'`` or ``'ours1'`` depending
    on ``rerun``.  When ``k`` is ``None`` a rule budget of 100 is used.

    Args:
        Xtr, Ytr: Training features and labels, passed to ``dec.train``.
        Xt, Yt: Test features and labels; each row of ``Xt.values`` is
            converted with ``Transaction(feat2item(row))`` for prediction.
        lb: Object whose ``transform`` is applied to labels before the AUC.
        nsample, lambda_mode, q, sample_mode: Forwarded to ``dec.train`` as
            ``nsamp``, ``lamb``, ``q`` and ``mode``, and logged.
        k: Maximum number of rules (``max_k``); defaults to 100.
        rerun, min_recall_per_class: Forwarded to ``dec.train``.
        eps: Passed to the ``DecisionSet`` constructor.
        log: Callable ``log(name, value[, index])``; when ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The value of ``dec.predict_all`` for the test transactions.
    """
    name = 'ours{}'.format(int(rerun))
    if k is None:
        k = 100

    dec = DecisionSet(eps)
    dec.train(Xtr, Ytr,
              max_k=k, nsamp=nsample, lamb=lambda_mode, q=q,
              mode=sample_mode, rerun=rerun,
              min_recall_per_class=min_recall_per_class)
    print('default:', dec.default)

    test_transactions = []
    for row in Xt.values:
        test_transactions.append(Transaction(feat2item(row)))
    Y_pred = dec.predict_all(test_transactions)

    if log is None:
        from logger import log

    log('{}-default'.format(name), dec.default)
    log('{}-k'.format(name), len(dec.rules))
    log('{}-maxk'.format(name), k)
    for position, rule in enumerate(dec.rules):
        log('{}-nconds'.format(name), len(rule), position)
    log('{}-q'.format(name), q)
    log('{}-nsample'.format(name), nsample)
    log('{}-lamb'.format(name), lambda_mode)
    log('{}-seq'.format(name), dec.seq)

    auc_tag = '{}-auc'.format(name)
    log(auc_tag, roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred)))
    log('{}-bacc'.format(name), balanced_accuracy_score(Yt, Y_pred))
    log('{}-disp'.format(name), dispersion(dec.rules, average=True))
    log('{}-overlap'.format(name), overlap(dec.rules))
    log('{}-mode'.format(name), sample_mode)

    # Per-label training precision, then per-label training recall.
    for label, value in precision(dec).items():
        log('{}-precisions-tr'.format(name), value, label)
    for label, value in recall(dec.rules).items():
        log('{}-recall-tr'.format(name), value, label)

    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
예제 #7
0
def run_ids(Xtr, Ytr, Xt, Yt, lb, min_freq, lambs, log=None):
    """Fit an IDS rule set, log its metrics and return the test predictions.

    ``IDS`` is called with the training data and returns the rule
    collection, a frequency count and a default label.  Every rule is
    printed with its class label and correct/total cover on the training
    data, then receives a ``covered`` attribute holding the set of items
    returned by ``rule.get_cover(Xtr)``.

    Args:
        Xtr, Ytr: Training features and labels (``Ytr.values`` is passed
            to ``IDS``).
        Xt, Yt: Test features and labels.
        lb: Object whose ``transform`` is applied to labels before the AUC.
        min_freq: Forwarded to ``IDS`` as ``freq`` and logged.
        lambs: Sequence forwarded to ``IDS``; each entry is logged with
            its position.
        log: Callable ``log(name, value[, index])``; when ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The value of ``IDS_predict`` for the test features.
    """
    ids, nfreq, default = IDS(Xtr, Ytr.values, lambs, freq=min_freq)

    # First pass: human-readable summary of each rule.
    for rule in ids:
        n_correct = len(rule.get_correct_cover(Xtr, Ytr))
        n_cover = len(rule.get_cover(Xtr))
        print('class: ',
              rule.class_label,
              ', cover: {}/{}'.format(n_correct, n_cover),
              end='; ')
        rule.print_rule()

    # Second pass: remember which training rows each rule covers.
    for rule in ids:
        rule.covered = set(rule.get_cover(Xtr))

    Y_pred = IDS_predict(ids, Xt, default=default)

    if log is None:
        from logger import log

    for position, lamb in enumerate(lambs):
        log('ids-lambda', lamb, position)
    log('ids-k', len(ids))
    for position, rule in enumerate(ids):
        log('ids-nconds', rule.get_length(), position)
    log('ids-nfreq', nfreq)
    log('ids-freq', min_freq)
    log('ids-default', default)

    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log('ids-auc', auc)
    log('ids-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('ids-disp', dispersion_(ids, average=True))
    log('ids-overlap', overlap(ids))
    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
def findOverlapIntervals(name1, name2, cutoffRatio):
    """Return ``[(name1, name2)]`` when the two intervals overlap enough.

    ``name1`` and ``name2`` are indexable records whose items ``[1]`` and
    ``[2]`` are the interval start and end (converted with ``int``) and
    whose item ``[0]`` is compared between the two records.  The overlap
    returned by ``util.overlap(start1, end1, start2, end2)`` is divided by
    each interval's length; the pair is reported when the larger of the
    two ratios exceeds ``cutoffRatio``.

    A zero-length interval contributes a ratio of ``0.0`` instead of
    raising ``ZeroDivisionError``.  When the ``[0]`` items differ an error
    line is printed, but the pair is still evaluated, as before.  The
    print call uses a single pre-formatted argument so the text is the
    same on Python 2 and Python 3.

    Args:
        name1, name2: Interval records ``(key, start, end, ...)``.
        cutoffRatio: Threshold the maximum overlap ratio must exceed.

    Returns:
        A list that is either empty or holds the single tuple
        ``(name1, name2)``.
    """
    interval1Start = int(name1[1])
    interval1End = int(name1[2])
    interval2Start = int(name2[1])
    interval2End = int(name2[2])

    overlap = util.overlap(interval1Start, interval1End, interval2Start,
                           interval2End)

    def _ratio(length):
        # Guard against degenerate (empty) intervals.
        if not length:
            return 0.0
        return float(overlap) / float(length)

    maxOverlapRatio = max(_ratio(interval1End - interval1Start),
                          _ratio(interval2End - interval2Start))

    if name1[0] != name2[0]:
        print("Error!!!  %s %s" % (name1, name2))

    if maxOverlapRatio > cutoffRatio:
        return [(name1, name2)]
    return []
def containsOverlapBorders(borders):
    """Find the first pair of same-key borders whose intervals overlap.

    Each border is indexed as ``(key, start, end)``.  Pairs are scanned in
    index order (``i < j``); only pairs with equal keys are passed to
    ``util.overlap(start_i, end_i, start_j, end_j)``.

    Returns:
        The tuple ``(i, j)`` of the first pair with a positive overlap, or
        ``False`` when no such pair exists.
    """
    total = len(borders)
    for i, left in enumerate(borders):
        for j in range(i + 1, total):
            right = borders[j]
            if left[0] != right[0]:
                continue
            if util.overlap(left[1], left[2], right[1], right[2]) > 0:
                return (i, j)
    return False
예제 #10
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def viewSortDet(gtImages, detlist, numim=numpy.inf, opt="all", usetr=True,
                usedf=False, ovr=0.5, show=None):
    """Label sorted detections as true or false positives against ground truth.

    Ground-truth boxes of the first ``min(gtImages.getTotal(), numim)``
    images are collected per image base name (file name without directory
    and extension).  ``detlist`` is sorted in place with ``cmpscore`` and
    each detection is matched against the remaining ground-truth boxes of
    its image: the first box with ``overlap(det_box, gt_box) >= ovr`` is
    consumed (removed from the list returned by ``getBBox``) and the
    detection counts as a true positive; otherwise it is a false positive.

    Args:
        gtImages: Ground-truth provider (``getTotal``, ``getBBox``,
            ``getImageName``, ``getImageByName2``).
        detlist: Detections indexed as ``det[0]`` image name, ``det[1]``
            score, ``det[2:6]`` box coordinates.  Sorted in place.
        numim: Upper bound on the number of ground-truth images read.
        opt, usetr, usedf: Accepted for interface compatibility; unused.
        ovr: Minimum overlap for a match.
        show: When true, display every detection with pylab and wait for
            Enter.  The original body read an undeclared name ``show``;
            with the default ``None`` a module-level ``show`` is still
            honoured if one exists, otherwise display is off.

    Returns:
        Tuple ``(tp, fp, thr, tot)``: per-detection true/false-positive
        indicator arrays, per-detection scores, and the total number of
        ground-truth boxes.
    """
    import functools

    if show is None:
        # Backward compatible: fall back to a module-level flag, if any.
        show = globals().get("show", False)

    dimg = {}
    tot = 0
    for idx in range(min(gtImages.getTotal(), numim)):
        rect = gtImages.getBBox(idx)
        if rect != []:
            dimg[gtImages.getImageName(idx).split("/")[-1].split(".")[0]] = rect
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # cmp_to_key keeps the comparison-function sort working on Python 3.
    detlist.sort(key=functools.cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset per detection so boxes of another image are never drawn.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]), float(detbb[4]))
                if overlap(rb, r) >= ovr:
                    # Each ground-truth box may be matched only once.
                    dimg[detbb[0]].remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            pylab.ioff()
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec: %s Rec: %s" % (detbb[1], prec, rec))
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]), float(detbb[4]))
            # Remaining (unmatched) ground truth in blue.
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            # Detection in green when matched, red otherwise.
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            try:
                raw_input()  # Python 2
            except NameError:
                input()  # Python 3

    return tp, fp, thr, tot
예제 #11
0
    def autoScafMidSeam(self, strands):
        """Join consecutive scaffold strands with crossovers around the active index.

        ``strands`` is a sequence of ``(row, col, strandSetIndex)`` triples.
        For each consecutive pair, the two scaffold strands are looked up
        through ``part.virtualHelixAtCoord``.  Work is done only when the
        current helix is among the neighbours of the previous helix and
        the current helix has an odd number:

        1. Both strands are resized to the pre-crossover positions nearest
           to ``part.activeBaseIndex()`` (searched with ``maxIdx=idx-10``
           and ``minIdx=idx+10``) and joined with two crossovers at the
           new ends, provided both strands can be resized.
        2. For later pairs, the strand two positions back is joined to the
           previous strand with two internal crossovers placed at the
           middle pre-crossover positions inside their overlap.

        List comprehensions and floor division are used so the middle
        element is selected identically on Python 2 and Python 3 (the
        former ``filter(...)[len(...)/2]`` fails on Python 3).

        Args:
            strands: Sequence of ``(row, col, strandSetIndex)`` triples.
        """
        part = self.part()
        strandType = StrandType.Scaffold
        idx = part.activeBaseIndex()
        for i in range(1, len(strands)):
            row1, col1, sSidx1 = strands[i-1]  # previous strand
            row2, col2, sSidx2 = strands[i]  # current strand
            vh1 = part.virtualHelixAtCoord((row1, col1))
            vh2 = part.virtualHelixAtCoord((row2, col2))
            strand1 = vh1.scaffoldStrandSet()._strandList[sSidx1]
            strand2 = vh2.scaffoldStrandSet()._strandList[sSidx2]
            # determine if the pair of strands are neighbors
            neighbors = part.getVirtualHelixNeighbors(vh1)
            if vh2 in neighbors:
                p2 = neighbors.index(vh2)
                if vh2.number() % 2 == 1:
                    # resize and install external xovers
                    try:
                        # resize to the nearest prexover on either side of idx
                        newLo = util.nearest(idx, part.getPreXoversHigh(strandType, p2, maxIdx=idx-10))
                        newHi = util.nearest(idx, part.getPreXoversLow(strandType, p2, minIdx=idx+10))
                        if strand1.canResizeTo(newLo, newHi) and \
                           strand2.canResizeTo(newLo, newHi):
                            # do the resize
                            strand1.resize((newLo, newHi))
                            strand2.resize((newLo, newHi))
                            # install xovers
                            part.createXover(strand1, newHi, strand2, newHi)
                            part.createXover(strand2, newLo, strand1, newLo)
                    except ValueError:
                        pass  # nearest not found in the expanded list

                    # go back and install the internal xovers
                    # NOTE(review): ``i > 2`` skips i == 2 although
                    # strands[i-2] exists there -- confirm this is intended.
                    if i > 2:
                        row0, col0, sSidx0 = strands[i-2]  # two strands back
                        vh0 = part.virtualHelixAtCoord((row0, col0))
                        strand0 = vh0.scaffoldStrandSet()._strandList[sSidx0]
                        if vh0 in neighbors:
                            p0 = neighbors.index(vh0)
                            l0, h0 = strand0.idxs()
                            l1, h1 = strand1.idxs()
                            oLow, oHigh = util.overlap(l0, h0, l1, h1)
                            try:
                                # candidate positions strictly inside the overlap
                                lList = [x for x in part.getPreXoversLow(strandType, p0)
                                         if oLow < x < oHigh]
                                lX = lList[len(lList) // 2]
                                hList = [x for x in part.getPreXoversHigh(strandType, p0)
                                         if oLow < x < oHigh]
                                hX = hList[len(hList) // 2]
                                # install high xover first
                                part.createXover(strand0, hX, strand1, hX)
                                # install low xover after getting new strands
                                # following the breaks caused by the high xover
                                strand3 = vh0.scaffoldStrandSet()._strandList[sSidx0]
                                strand4 = vh1.scaffoldStrandSet()._strandList[sSidx1]
                                part.createXover(strand4, lX, strand3, lX)
                            except IndexError:
                                pass  # no candidate position inside the overlap
예제 #12
0
def distance(hits, i, j):
    """Return the distance between hit ``i`` and hit ``j``.

    A hit is indexed as follows: ``[1]`` and ``[4]`` are compared as the
    chromosomes of its two sides, ``[2]``, ``[3]``, ``[5]`` and ``[6]``
    identify its exons, and ``[7], [8]`` / ``[9], [10]`` are the bounds of
    its two regions.

    The distance is

    * infinity when either chromosome differs,
    * 0 when all four exon fields are equal,
    * 0 when both region pairs have a positive ``util.overlap``,
    * otherwise ``a + b``, where ``a`` and ``b`` are the negated
      non-positive overlap values of the first and second region pair
      (a positive overlap contributes 0).

    Args:
        hits: The output list of hit_index.
        i: The ID of hit i.
        j: The ID of hit j.

    Returns:
        The distance between hit i and hit j.
    """
    first = hits[i]
    second = hits[j]

    # Different chromosomes
    if first[1] != second[1] or first[4] != second[4]:
        return float("inf")

    # Same exons
    if all(first[field] == second[field] for field in (2, 5, 3, 6)):
        return 0

    # Regions of each hit
    first_a = (first[7], first[8])
    first_b = (first[9], first[10])
    second_a = (second[7], second[8])
    second_b = (second[9], second[10])
    gap_a = util.overlap(first_a, second_a)
    gap_b = util.overlap(first_b, second_b)

    # If they all overlap, distance is 0.
    if gap_a > 0 and gap_b > 0:
        return 0

    return -(min(0, gap_a) + min(0, gap_b))
예제 #13
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def VOCprlistfast(gtImages, detlist, show=False, usetr=True, usedf=False,
                  ovr=0.5):
    """Label sorted detections as true or false positives for a PR curve.

    Ground-truth boxes of all images are collected per image base name
    (file name without directory and extension).  ``detlist`` is sorted in
    place with ``cmpscore`` and each detection is matched against the
    remaining ground-truth boxes of its image: the first box with
    ``overlap(det_box, gt_box) >= ovr`` is consumed (removed from the list
    returned by ``getBBox``) and the detection counts as a true positive;
    otherwise it is a false positive.

    Args:
        gtImages: Ground-truth provider (``getTotal``, ``getBBox``,
            ``getImageName``, ``getImageByName2``).
        detlist: Detections indexed as ``det[0]`` image name, ``det[1]``
            score, ``det[2:6]`` box coordinates.  Sorted in place.
        show: When true, display every detection with pylab and wait for
            Enter.
        usetr, usedf: Accepted for interface compatibility; unused.
        ovr: Minimum overlap for a match.

    Returns:
        Tuple ``(tp, fp, tot)``: per-detection true/false-positive
        indicator arrays and the total number of ground-truth boxes.
    """
    import functools

    dimg = {}
    tot = 0
    for idx in range(gtImages.getTotal()):
        rect = gtImages.getBBox(idx)
        if rect != []:
            dimg[gtImages.getImageName(idx).split("/")[-1].split(".")[0]] = rect
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    # cmp_to_key keeps the comparison-function sort working on Python 3.
    detlist.sort(key=functools.cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset per detection so boxes of another image are never drawn.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]), float(detbb[4]))
                if overlap(rb, r) >= ovr:
                    # Each ground-truth box may be matched only once.
                    dimg[detbb[0]].remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        if show:
            pylab.ioff()
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]), float(detbb[4]))
            # Remaining (unmatched) ground truth in blue.
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            # Detection in green when matched, red otherwise.
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            try:
                raw_input()  # Python 2
            except NameError:
                input()  # Python 3

    return tp, fp, tot
예제 #14
0
def run_cn2(Xtr, Ytr, Xt, Yt, lb, k=None, log=None):
    """Fit an unordered CN2 rule learner, log its metrics, return predictions.

    The training data are wrapped in Orange tables, discretized with
    ``DomainDiscretizer`` and used to train ``CN2UnorderedLearner`` (beam
    width 10, constrained continuous search, at least 15 covered
    examples).  The last entry of ``rule_list`` is treated as the default
    rule and is excluded from the per-rule statistics.

    When ``k`` is given, at most ``k`` non-default rules are kept and the
    default rule is re-attached exactly once.  (Previously a ``k`` that
    reached the end of the list kept the default rule inside the slice
    and then appended it a second time.)

    Each non-default rule gets a ``covered`` attribute: the set of
    training-row positions for which ``rule.evaluate_instance`` is true.

    Args:
        Xtr, Ytr: Training features and labels (``.values`` is used).
        Xt, Yt: Test features and labels.
        lb: Object whose ``transform`` is applied to labels before the AUC.
        k: Optional maximum number of non-default rules.
        log: Callable ``log(name, value[, index])``; when ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The predicted class indices for the test set (``argmax`` over the
        classifier output).  Previously nothing was returned; the sibling
        runners ``run_ours`` and ``run_ids`` already return predictions.
    """
    domainx = Domain.from_numpy(Xtr.values)
    domainy = Domain.from_numpy(Ytr.values.reshape((-1, 1)))
    datax = Orange.data.Table.from_numpy(domainx, Xtr.values)
    datay = Orange.data.Table.from_numpy(domainy, Ytr.values.reshape((-1, 1)))
    discretizer = Orange.preprocess.DomainDiscretizer()
    domainx = discretizer(datax)
    domainy = discretizer(datay)
    domain = Domain(domainx.attributes, domainy.attributes[0])
    data = Orange.data.Table.from_numpy(domain, Xtr.values, Y=Ytr.values)

    learner = Orange.classification.CN2UnorderedLearner()
    learner.rule_finder.search_algorithm.beam_width = 10
    learner.rule_finder.search_strategy.constrain_continuous = True
    learner.rule_finder.general_validator.min_covered_examples = 15
    cn2 = learner(data)

    if k is not None:
        r_def = cn2.rule_list[-1]
        # Slice the non-default rules only, so the default is never doubled.
        cn2.rule_list = cn2.rule_list[:-1][:k]
        cn2.rule_list.append(r_def)

    Y_pred = np.argmax(cn2.predict(Xt.values), axis=1)

    ids = np.arange(Xtr.shape[0])
    print('default:', cn2.rule_list[-1].prediction)
    # Skip the last default rule
    for i, r in enumerate(cn2.rule_list[:-1]):
        cov = np.array([r.evaluate_instance(x) for x in data])
        pred = np.array([r.prediction] * sum(cov))
        acc = pred == Ytr.values[cov]
        r.covered = set(ids[cov])
        print(
            'CN2', '#{}, label:{}, len:{}, cov:{}, acc:{}'.format(
                i, r.prediction, r.length,
                sum(cov) / len(ids),
                sum(acc) / sum(cov)))

    if log is None:
        from logger import log
    log('cn2-k', len(cn2.rule_list[:-1]))
    # One entry per rule; a plain loop because only the side effect matters.
    for i, r in enumerate(cn2.rule_list[:-1]):
        log('cn2-nconds', r.length, i)
    log('cn2-auc', roc_auc_score(lb.transform(Yt.values),
                                 lb.transform(Y_pred)))
    log('cn2-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('cn2-disp', dispersion_(cn2.rule_list[:-1], average=True))
    log('cn2-overlap', overlap(cn2.rule_list[:-1]))
    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
예제 #15
0
def newAnnotation(request):
    """Annotate every free occurrence of the selected text; return the HTML.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair
    returned by ``util.findIndices(document.text, text)`` a new
    ``Annotation`` of the chosen type is saved, unless the current user
    already has an annotation with exactly those bounds or any annotation
    overlapping the span (per ``util.overlap``).  Existing annotations are
    never deleted.

    The category value ``'Delete'`` is a sentinel meaning "no new
    annotation".  It is now tested on the raw POST value *before* the
    integer conversion; previously ``int()`` ran first, so the branch was
    unreachable and ``'Delete'`` raised ``ValueError``.

    Args:
        request: Request object providing ``POST`` and ``user.id``.

    Returns:
        An ``HttpResponse`` with the result of ``util.htmlFormat`` for the
        document text and the user's annotations.  For ``'Delete'`` or an
        empty selection, whatever ``documentByAnnotator`` returns.

    Raises:
        KeyError: A required POST field is missing.
        ValueError: ``document_id`` or a non-``'Delete'`` category id is
            not an integer.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    if category == 'Delete':
        return documentByAnnotator(request, document_id, annotator_id=request.user.id, error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(request, document_id, annotator_id=request.user.id, error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    indices = util.findIndices(document.text, text)

    for ind in indices:
        begin, end = ind[0], ind[1]

        # Identical span already annotated by this user: nothing to do.
        if Annotation.objects.filter(document=document, begin_index=begin, end_index=end, annotator=annotator).exists():
            continue

        # Re-query on every pass: earlier passes may have added annotations.
        allNamedEntities = Annotation.objects.filter(document=document, annotator=annotator)
        # don't erase any existing entities: skip spans that overlap one
        if any(util.overlap((absNE.begin_index, absNE.end_index), (begin, end))
               for absNE in allNamedEntities):
            continue

        # add new one
        annotation = Annotation(document=document, annotation=text, begin_index=begin, end_index=end, annotation_type=annotation_type, annotator=annotator)
        annotation.save()

    annotations = Annotation.objects.filter(document=document, annotator=annotator)
    text = util.htmlFormat(document.text, annotations)
    return HttpResponse(text)
예제 #16
0
def overlap_clusters(C):
    """Group clusters that overlap a later cluster of similar length.

    Each cluster is indexed as ``(key, start, end, ...)``.  For every
    index ``i`` the later clusters ``j > i`` are collected when all of the
    following hold: the keys are equal, ``util.overlap`` of the two
    ``(start, end)`` intervals is positive, and the inclusive-length ratio
    ``len_i / len_j`` lies strictly between 0.8 and 1.25.

    ``range`` is used instead of ``xrange`` so the function also runs on
    Python 3.

    Args:
        C: Sequence of clusters.

    Returns:
        A list with one entry per cluster ``i`` that has at least one
        match: the matching indices ``j`` in ascending order followed by
        ``i`` itself.
    """
    overlaps = []

    for i in range(len(C) - 1):
        ci = C[i]
        cio = []
        for j in range(i + 1, len(C)):
            cj = C[j]
            # Ratio of inclusive interval lengths.
            rlen = float(ci[2] - ci[1] + 1) / float(cj[2] - cj[1] + 1)
            if (ci[0] == cj[0]
                    and util.overlap((ci[1], ci[2]), (cj[1], cj[2])) > 0
                    and 0.8 < rlen < 1.25):
                cio.append(j)
        if cio:
            overlaps.append(cio + [i])

    return overlaps
예제 #17
0
def VOCanalysis(gtImages,
                detlist,
                show=False,
                usetr=True,
                usedf=False,
                ovr=0.5):
    """Split detections into true positives, false positives and misses.

    ``gtImages`` is a sequence of records with a ``"name"`` and a
    ``"bbox"`` list.  Ground truth is indexed by image base name (file
    name without directory and extension).  ``detlist`` is sorted in place
    with ``cmpscore``; each detection is assigned to the ground-truth box
    of its image with the largest ``overlap`` (ties go to the later box):

    * overlap ``> ovr`` and box not yet claimed -> true positive,
    * overlap ``> ovr`` but box already claimed -> duplicate detection,
    * otherwise -> false positive.

    The print call uses a single pre-formatted argument and the dictionary
    test uses ``in``, so the function is no longer tied to Python 2
    syntax.  Dead locals of the previous version were removed.

    Args:
        gtImages: Sequence of ``{"name": ..., "bbox": [...]}`` records.
        detlist: Detections indexed as ``det[0]`` image name, ``det[1]``
            score, ``det[2:6]`` box coordinates.  Sorted in place.
        show, usetr, usedf: Accepted for interface compatibility; unused.
        ovr: Overlap threshold (strictly greater-than).

    Returns:
        Tuple ``(tplist, fplist, fp2list, fnlist)``: true positives,
        duplicate detections, detections without sufficient overlap, and
        unmatched ground-truth boxes as ``[name, 0, box[0:4]]``.
    """
    import functools

    def _base_name(path):
        return path.split("/")[-1].split(".")[0]

    # Ground truth per image: boxes plus a "claimed" flag for each box.
    dimg = {}
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            dimg[_base_name(gtImages[idx]["name"])] = {
                "bbox": rect,
                "det": [False] * len(rect)
            }

    tplist = []
    fplist = []
    fp2list = []
    fnlist = []

    # cmp_to_key keeps the comparison-function sort working on Python 3.
    detlist.sort(key=functools.cmp_to_key(cmpscore))
    for detbb in detlist:
        maxovr = 0
        gt = 0
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            for ir, r in enumerate(rect):
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir
        if maxovr > ovr:
            claimed = dimg[detbb[0]]["det"]
            if not claimed[gt]:
                claimed[gt] = True
                tplist.append(detbb)
            else:
                fplist.append(detbb)
        else:
            fp2list.append(detbb)

    totalDetected = 0
    totalnoDetected = 0

    # Every ground-truth box that was never claimed is a false negative.
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = _base_name(gtImages[idx]["name"])
            bboxgt = dimg[name]
            for i in range(len(bboxgt["det"])):
                if bboxgt["det"][i]:
                    totalDetected += 1
                else:
                    fnlist.append([name, 0, bboxgt["bbox"][i][0:4]])
                    totalnoDetected += 1

    print("total Detected %d, total no Detected %d" % (totalDetected,
                                                       totalnoDetected))

    return tplist, fplist, fp2list, fnlist
예제 #18
0
def VOCprRecordOptim(gtImages, detlist, show=False, ovr=0.5, pixels=None):
    """Label detections for a PR curve and record localisation offsets.

    ``gtImages`` is a sequence of records with a ``"name"`` and a
    ``"bbox"`` list; only boxes whose item ``[5]`` equals 0 are counted in
    ``tot``.  ``detlist`` is sorted in place with ``cmpscore``; each
    detection is assigned to the ground-truth box of its image with the
    largest overlap (``overlap``, or ``overlapx`` with ``pixels`` when
    given; ties go to the later box):

    * overlap ``> ovr``, box flag ``[5] == 0`` and box unclaimed -> true
      positive; relative centre offsets and size ratios are recorded,
    * overlap ``> ovr``, box flag ``[5] == 0`` but already claimed ->
      false positive,
    * overlap ``> ovr`` and box flag ``[5] != 0`` -> neither,
    * otherwise -> false positive.

    Fixes over the previous version: ``found`` is now set for true
    positives (it was never set, so the interactive branch treated every
    detection as a miss); ``rect`` is reset per detection so boxes of
    another image are never drawn; ``pixels`` is tested with ``is None``.

    Args:
        gtImages: Ground-truth records.  NOTE(review): the show branch
            also calls ``gtImages.getImageByName2`` -- confirm the type
            supports both indexing and that method.
        detlist: Detections indexed as ``det[0]`` image name, ``det[1]``
            score, ``det[2:6]`` box coordinates.  Sorted in place.
        show: When true, print running precision/recall per detection,
            wait for input, and display misses (or any detection when
            ``s`` is entered).
        ovr: Overlap threshold (strictly greater-than).
        pixels: Optional extra argument selecting ``overlapx``.

    Returns:
        Tuple ``(tp, fp, thr, tot, tx, ty, sx, sy)``: indicator arrays,
        scores, number of counted ground-truth boxes, and for each true
        positive the centre offsets and size ratios relative to the
        detection size.
    """
    import functools

    tx = []
    ty = []
    sx = []
    sy = []
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            dimg[gtImages[idx]["name"].split(
                "/")[-1].split(".")[0]] = {"bbox": rect, "det": [False] * len(rect)}
            for recti in rect:
                if recti[5] == 0:
                    tot = tot + 1

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # cmp_to_key keeps the comparison-function sort working on Python 3.
    detlist.sort(key=functools.cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        maxovr = 0
        gt = 0
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            for ir, r in enumerate(rect):
                rb = (float(detbb[3]), float(detbb[2]),
                      float(detbb[5]), float(detbb[4]))
                if pixels is None:
                    covr = overlap(rb, r)
                else:
                    covr = overlapx(rb, r, pixels)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir

        if maxovr > ovr:
            gtbox = dimg[detbb[0]]["bbox"][gt]
            if gtbox[5] == 0:
                if not dimg[detbb[0]]["det"][gt]:
                    tp[idx] = 1
                    found = True
                    dimg[detbb[0]]["det"][gt] = True
                    # Sizes and centres of ground truth vs. detection.
                    gtx = gtbox[3] - gtbox[1]
                    dtx = detbb[4] - detbb[2]
                    gty = gtbox[2] - gtbox[0]
                    dty = detbb[5] - detbb[3]
                    gtcx = (gtbox[3] + gtbox[1]) / 2.
                    dtcx = (detbb[4] + detbb[2]) / 2.
                    gtcy = (gtbox[2] + gtbox[0]) / 2.
                    dtcy = (detbb[5] + detbb[3]) / 2.
                    tx.append((gtcx - dtcx) / float(dtx))
                    ty.append((gtcy - dtcy) / float(dty))
                    sx.append(gtx / float(dtx))
                    sy.append(gty / float(dty))
                else:
                    fp[idx] = 1
        else:
            fp[idx] = 1

        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            try:
                ss = raw_input()  # Python 2
            except NameError:
                ss = input()  # Python 3
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]),
                      float(detbb[5]), float(detbb[4]))
                # Ground truth in blue.
                for r in rect:
                    pylab.figure(1)
                    pylab.ioff()
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                # Detection in green when matched, red otherwise.
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()

    return tp, fp, thr, tot, tx, ty, sx, sy
def newAnnotation(request):
    """Annotate every occurrence of the selected text with the chosen category.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair returned
    by ``util.findIndices(document.text, text)`` a new ``Annotation`` is saved
    for the requesting user, unless that user already has an annotation with
    the same begin/end indices or one that ``util.overlap`` reports as
    overlapping; existing annotations are never modified or removed.

    Returns:
        The result of ``documentByAnnotator`` when the category value is the
        ``'Delete'`` placeholder or when no text was submitted; otherwise an
        ``HttpResponse`` whose body is ``util.htmlFormat`` applied to the
        document text and the user's annotations.

    Raises:
        ValueError: if ``document_id`` or a non-``'Delete'`` category value is
            not an integer string.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    # Compare the raw POST value: converting it with int() first would raise
    # ValueError for 'Delete' and this branch could never be taken.
    if category == 'Delete':
        return documentByAnnotator(request,
                                   document_id,
                                   annotator_id=request.user.id,
                                   error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(
            request,
            document_id,
            annotator_id=request.user.id,
            error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    for ind in util.findIndices(document.text, text):
        begin, end = ind[0], ind[1]

        # skip occurrences this annotator has already annotated exactly
        if Annotation.objects.filter(document=document,
                                     begin_index=begin,
                                     end_index=end,
                                     annotator=annotator).exists():
            continue

        # Re-query on every occurrence so annotations saved earlier in this
        # loop are taken into account; don't erase any existing entities.
        existing = Annotation.objects.filter(document=document,
                                             annotator=annotator)
        if any(
                util.overlap((known.begin_index, known.end_index),
                             (begin, end)) for known in existing):
            continue

        # add new one
        Annotation(document=document,
                   annotation=text,
                   begin_index=begin,
                   end_index=end,
                   annotation_type=annotation_type,
                   annotator=annotator).save()

    annotations = Annotation.objects.filter(document=document,
                                            annotator=annotator)
    return HttpResponse(util.htmlFormat(document.text, annotations))
예제 #20
0
    def run(self):
        """Filter the input FASTQ file(s) and write the QC reports.

        Driven entirely by ``self.options``.  The steps are:

        1. Collect pre-filter statistics for read1 (and read2 when given) and,
           if ``trim_front``/``trim_tail`` is ``-1``, auto-detect the trimming.
        2. Resolve and create the good/bad/overlap/QC output folders and open
           the writers (no good/bad/overlap writers are opened in ``qc_only``
           mode).
        3. Iterate over the reads, applying in order: barcode processing,
           trimming, debubble, minimum length, polyX, low-quality count,
           N count and, for pairs, overlap analysis with optional error
           correction.  A rejected read is passed to ``self.writeReads`` with
           the bad writers and a reason tag; an accepted read is passed with
           the good writers.
        4. Write ``<read1 basename>.json`` and ``<read1 basename>.html`` into
           the QC folder.

        Returns nothing; the results are the files written.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        self.r1qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r2qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r1qc_prefilter.statFile(self.options.read1_file)
        if self.options.read2_file != None:
            self.r2qc_prefilter.statFile(self.options.read2_file)

        self.r1qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)
        self.r2qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)

        readLen = self.r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = self.r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        # parent folder of good_dir; the default bad/overlap/QC folders are
        # created next to the good folder
        good_parent = os.path.dirname(os.path.dirname(good_dir + "/"))

        #if bad output folder not specified, use the "bad" folder beside the good folder
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.join(good_parent, "bad")

        #if overlap output folder not specified, use the "overlap" folder beside the good folder
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            overlap_dir = os.path.join(good_parent, "overlap")

        #if report folder not specified, use the "QC" folder beside the good folder
        qc_base_folder = self.options.report_output_folder
        if qc_base_folder == None:
            qc_base_folder = os.path.join(good_parent, "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        qc_dir = qc_base_folder

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        # gzip the output when requested or when the input itself is gzipped
        gzip_out = self.options.gzip
        gzip_comp = self.options.compression
        if not gzip_out and self.options.read1_file.endswith(".gz"):
            gzip_out = True

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"), gzip_out, gzip_comp)
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"),
                gzip_out, gzip_comp)

            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"),
                    gzip_out, gzip_comp)

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL_BASES = 0
        GOOD_BASES = 0
        TOTAL_READS = 0
        GOOD_READS = 0
        BAD_READS = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        # never incremented in this method (no indel check is performed);
        # kept so the report keys that include it stay unchanged
        BADINDEL = 0
        BADMISMATCH = 0
        BADDIFF = 0
        READ_CORRECTED = 0
        BASE_CORRECTED = 0
        BASE_SKIPPED_CORRECTION = 0
        BASE_ZERO_QUAL_MASKED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0
        OVERLAP_BASE_SUM = 0
        # error profiling by overlap analysis
        OVERLAP_BASE_ERR = 0
        OVERLAP_ERR_MATRIX = init_error_matrix()

        #adapter trimming by overlap analysis
        TRIMMED_ADAPTER_BASE = 0
        TRIMMED_ADAPTER_READ = 0

        while True:
            r1 = read1_file.nextRead()
            if r1 == None:
                break
            else:
                TOTAL_BASES += len(r1[1])

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
                # count read2 bases here, independently of whether an index2
                # file was supplied
                TOTAL_BASES += len(r2[1])
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break

            TOTAL_READS += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                # reject the pair when either read exceeds the limit
                if (lowQual1 > self.options.unqualified_base_limit
                        or lowQual2 > self.options.unqualified_base_limit):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None and (not self.options.no_overlap):
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                # in this case the adapter is sequenced and should be trimmed
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][0:overlap_len]
                    r2[3] = r2[3][0:overlap_len]
                    TRIMMED_ADAPTER_BASE += abs(offset) * 2
                    TRIMMED_ADAPTER_READ += 1
                    # check the sequence length again after adapter trimmed
                    if len(r1[1]) < self.options.seq_len_req:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADLEN")
                        BADLEN += 1
                        continue
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])

                distance_histgram[distance] += 1
                # if distance is too high, then set it as bad mismatch
                if distance > 3:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADDIFF")
                    BADDIFF += 1
                    continue
                if overlap_len > 30:
                    OVERLAPPED += 1
                    OVERLAP_LEN_SUM += overlap_len
                    # we consider the distance is caused by sequencing error
                    OVERLAP_BASE_SUM += overlap_len * 2
                    OVERLAP_BASE_ERR += distance
                    corrected = 0
                    zero_qual_masked = 0
                    skipped_mismatch = 0
                    if distance > 0:
                        #try to fix low quality base
                        err_mtx = init_error_matrix()
                        # walk the overlap: position o counts from the start of
                        # the overlap in read1 and from the end of read2
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                this_is_corrected = False
                                # read1 base is high quality, read2 base is low quality
                                if util.qualNum(q1) >= 30 and util.qualNum(
                                        q2) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[util.complement(b1)][
                                            util.complement(b2)] += 1
                                    if not self.options.no_correction:
                                        r2[1] = util.changeString(
                                            r2[1], -o - 1, util.complement(b1))
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, q1)
                                        corrected += 1
                                        this_is_corrected = True
                                # read2 base is high quality, read1 base is low quality
                                elif util.qualNum(q2) >= 30 and util.qualNum(
                                        q1) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[b2][b1] += 1
                                    if not self.options.no_correction:
                                        r1[1] = util.changeString(
                                            r1[1],
                                            len(r1[1]) - overlap_len + o, b2)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o, q2)
                                        corrected += 1
                                        this_is_corrected = True
                                if not this_is_corrected:
                                    if self.options.mask_mismatch:
                                        # mask them as zero qual if it is not corrected
                                        zero_qual = '!'
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, zero_qual)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o,
                                            zero_qual)
                                        zero_qual_masked += 1
                                    else:
                                        skipped_mismatch += 1

                                # stop once every reported mismatch is accounted for
                                if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                    break
                        if corrected + zero_qual_masked + skipped_mismatch == distance:
                            merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                            if corrected > 0:
                                READ_CORRECTED += 1
                            BASE_CORRECTED += corrected
                            # multiply by 2 since we mask bases by pair
                            BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                            BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            GOOD_BASES += len(r1[1])
            # count read2 bases whenever there is a read2, regardless of index2
            if r2 != None:
                GOOD_BASES += len(r2[1])
            if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
                self.r1qc_postfilter.statRead(r1)
                if r2 != None:
                    self.r2qc_postfilter.statRead(r2)

            GOOD_READS += 1
            if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
                break

        self.r1qc_postfilter.qc()
        if self.options.read2_file != None:
            self.r2qc_postfilter.qc()

        #close all files
        if not self.options.qc_only:
            good_read1_file.close()
            bad_read1_file.close()
            if self.options.read2_file != None:
                good_read2_file.close()
                bad_read2_file.close()
            if self.options.index1_file != None:
                good_index1_file.close()
                bad_index1_file.close()
            if self.options.index2_file != None:
                good_index2_file.close()
                bad_index2_file.close()
            # overlap writers only exist when store_overlap is set; close the
            # ones that were opened, like the good/bad writers above
            for overlap_writer in (overlap_read1_file, overlap_read2_file,
                                   overlap_index1_file, overlap_index2_file):
                if overlap_writer is not None:
                    overlap_writer.close()

        # print stat numbers
        BAD_READS = TOTAL_READS - GOOD_READS
        result = {}
        result['total_bases'] = TOTAL_BASES
        result['good_bases'] = GOOD_BASES
        result['total_reads'] = TOTAL_READS
        result['good_reads'] = GOOD_READS
        result['bad_reads'] = BAD_READS
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADMISMATCH + BADINDEL + BADDIFF
        result['readlen'] = readLen

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [
            GOOD_READS, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT
        ]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADMISMATCH + BADINDEL + BADDIFF)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        # append "count(percent%)" to every label
        for i in xrange(len(counts)):
            type_percent = 0.0
            if TOTAL_READS > 0:
                type_percent = 100.0 * float(counts[i]) / TOTAL_READS
            labels[i] = labels[i] + ": " + str(
                counts[i]) + "(" + str(type_percent) + "%)"

        reporter.addFigure(
            'Good reads and bad reads after filtering',
            self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                           'filter_stat'), 'filter_stat', "")

        #squeeze qc data for JSON output
        self.r1qc_prefilter.squeeze()
        self.r1qc_postfilter.squeeze()
        if self.options.read2_file != None:
            self.r2qc_prefilter.squeeze()
            self.r2qc_postfilter.squeeze()

        stat = {}
        stat["afterqc_main_summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"][
            "read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"][
            "read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]

        # output more data in JSON file for offline plotting directly from JSON
        stat["base_quality"] = {}
        stat["base_quality"][
            "read1_prefilter"] = self.r1qc_prefilter.baseMeanQual
        stat["base_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.baseMeanQual
        stat["mean_quality"] = {}
        stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual
        stat["mean_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.meanQual
        stat["base_content"] = {}
        stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents
        stat["base_content"][
            "read1_postfilter"] = self.r1qc_postfilter.percents
        stat["gc_content"] = {}
        stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents
        stat["gc_content"][
            "read1_postfilter"] = self.r1qc_postfilter.gcPercents

        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]

            stat["base_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.baseMeanQual
            stat["base_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.baseMeanQual

            stat["mean_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.meanQual
            stat["mean_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.meanQual

            stat["base_content"][
                "read2_prefilter"] = self.r2qc_prefilter.percents
            stat["base_content"][
                "read2_postfilter"] = self.r2qc_postfilter.percents

            stat["gc_content"][
                "read2_prefilter"] = self.r2qc_prefilter.gcPercents
            stat["gc_content"][
                "read2_postfilter"] = self.r2qc_postfilter.gcPercents

            stat["afterqc_overlap"] = {}
            stat["afterqc_overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing so the average is not truncated by
                # integer division
                stat["afterqc_overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / float(OVERLAPPED)
            else:
                stat["afterqc_overlap"]['average_overlap_length'] = 0.0
            stat["afterqc_overlap"]['bad_mismatch_reads'] = BADMISMATCH
            stat["afterqc_overlap"]['bad_diff'] = BADDIFF
            stat["afterqc_overlap"]['bad_indel_reads'] = BADINDEL
            stat["afterqc_overlap"]['corrected_reads'] = READ_CORRECTED
            stat["afterqc_overlap"]['corrected_bases'] = BASE_CORRECTED
            stat["afterqc_overlap"][
                'skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
            stat["afterqc_overlap"]['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
            # NOTE(review): 'zero_qual_skipped' reports the same masked count
            # as 'zero_qual_masked' -- confirm whether this is intended
            stat["afterqc_overlap"][
                'zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
            stat["afterqc_overlap"][
                'trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
            stat["afterqc_overlap"][
                'trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
            if OVERLAP_BASE_SUM > 0:
                stat["afterqc_overlap"]['error_rate'] = float(
                    OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
            else:
                stat["afterqc_overlap"]['error_rate'] = 0.0
            stat["afterqc_overlap"]['error_matrix'] = OVERLAP_ERR_MATRIX
            stat["afterqc_overlap"][
                'edit_distance_histogram'] = distance_histgram[0:10]
            reporter.addFigure(
                'Sequence error distribution',
                self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                                'error_matrix'),
                'error_matrix', "")
            reporter.addFigure(
                'Overlap length distribution',
                self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                                  TOTAL_READS, 'overlap_stat'),
                'overlap_stat', "")

        # serialize first, then write, so the file handle is always closed
        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        stat_path = os.path.join(
            qc_dir,
            os.path.basename(self.options.read1_file) + ".json")
        with open(stat_path, "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.setStat(stat)
        reporter.setVersion(self.options.version)
        reporter.output(
            os.path.join(qc_dir,
                         os.path.basename(self.options.read1_file) + ".html"))
예제 #21
0
def VOCprlist(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
    Match detections against ground-truth boxes, image by image, and
    collect the scores needed for a precision/recall curve.

    Arguments:
    gtImages -- ground-truth provider; must offer getTotal(),
                getImageName(idx), getBBox(idx, usetr=..., usedf=...) and,
                when show is True, getImage(idx).
    detlist -- iterable of detections; each entry is indexed as
               [0] image key, [1] score, [2]..[5] box coordinates.
               The box handed to overlap() is built as
               (entry[3], entry[2], entry[5], entry[4]).
    show -- when True, draw image, ground truth (blue), matched detections
            (green) and unmatched detections (red) with pylab.
    usetr, usedf -- forwarded unchanged to gtImages.getBBox().
    ovr -- minimum overlap() value for a detection to count as a match.

    Returns:
    (tp, fp, tot) -- list of scores of matched detections, list of scores
                     of unmatched detections, and the total number of
                     ground-truth boxes.
    """
    tp = []
    fp = []
    tot = 0
    for idx in range(gtImages.getTotal()):
        imagename = gtImages.getImageName(idx)
        # print(x) with a single argument behaves the same on Python 2 and 3
        print(imagename)
        # Detections refer to an image by its file name without directory
        # and without extension; compute that key once per image.
        imagekey = imagename.split("/")[-1].split(".")[0]
        if show:
            img = gtImages.getImage(idx)
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
        gtboxes = gtImages.getBBox(idx, usetr=usetr, usedf=usedf)
        print(gtboxes)
        # Matched boxes are removed below; work on a copy so the list owned
        # by gtImages is left untouched.
        rect = list(gtboxes)
        if show:
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
        tot = tot + len(rect)
        for data in detlist:
            if data[0] != imagekey:
                continue
            rb = [
                float(data[3]),
                float(data[2]),
                float(data[5]),
                float(data[4])
            ]
            if show:
                pylab.ioff()
                pylab.text(rb[1], rb[0], data[1])
            matched = False
            for ir, r in enumerate(rect):
                if overlap(rb, r) >= ovr:
                    if show:
                        pylab.ioff()
                        box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                    # Each ground-truth box may be matched only once.
                    # Deleting while enumerating is safe because we leave
                    # the loop immediately afterwards.
                    del rect[ir]
                    tp.append(float(data[1]))
                    matched = True
                    break
            if not matched:
                if show:
                    pylab.ioff()
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1)
                fp.append(float(data[1]))
        if show:
            pylab.figure(1)
            pylab.show()
            pylab.draw()
    return tp, fp, tot
예제 #22
0
파일: bestdet.py 프로젝트: ChrisYang/CRFdet
#            except:
#                try:
#                    img=util.myimread(imgpath+"buffy_s5e4/"+l["idim"])
#                except:
#            try:
#                img=util.myimread(imgpath+"buffy_s5e5/"+l["idim"])
#            except:
#                try:
#                    img=util.myimread(imgpath+"buffy_s5e6/"+l["idim"])
#                except:
#                    pass

        #gooddet=-1
        ovr=[]
        for idb,b in enumerate(gt[l["idim"]]):#for each bb gt
            ovr.append(util.overlap(b,l["bbox"]))
        if len(ovr)>0:
            #print "Best ovr",max(ovr)
            if max(ovr)>=0.5:
                detectCRF.visualize2([l],cfg.N,img,text="rank:%d ovr:%.3f scl:%d"%(idl,max(ovr),l["hog"]),bb=gt[l["idim"]][numpy.array(ovr).argmax()],color="w",line=line)
            else:
                detectCRF.visualize2([l],cfg.N,img,text="rank:%d ovr:%.3f scl:%d"%(idl,max(ovr),l["hog"]),bb=gt[l["idim"]][numpy.array(ovr).argmax()],color="r",line=line)
        else:
            detectCRF.visualize2([l],cfg.N,img,text="rank:%d"%(idl),color="r",line=line)
        #pl.figure(100)        
        #pl.clf()
        #pl.imshow(img)
        raw_input()


예제 #23
0
    def run(self):
        """Filter the input reads and write the QC results.

        Reads are taken from self.options.read1_file and, when given, from
        read2_file / index1_file / index2_file in lockstep. Each read (or
        pair) passes through these stages in order: barcode handling,
        front/tail trimming, bubble filter, minimum length, polyX, low
        quality count, N count and, for pairs, overlap analysis with
        mismatch correction. A read failing a stage is written to the "bad"
        writers with the stage's tag and counted; survivors are written to
        the "good" writers. With self.options.qc_only no read files are
        written and processing stops once qc_sample reads have been read.

        Outputs placed in the per-file QC directory ("QC/<read1 basename>"
        next to read1): the pre/post-filter QC plots, "filter-stat.png",
        "overlap.png" (paired input only), "after.json" and "report.html".

        Side effects: may overwrite self.options.trim_front, trim_tail,
        trim_front2 and trim_tail2 (auto-trim, barcode mode) and creates
        the output directories when missing.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contains QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir =  os.path.join(qc_base_folder, os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file is not None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

        # histograms are indexed by overlap length / edit distance, 0..readLen
        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen+1)]
        distance_histgram = [0 for x in xrange(readLen+1)]

        #auto detect trim front and trim tail (-1 means "not specified")
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file is not None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file is not None and (not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        # writers stay None in qc_only mode; writeReads receives them as-is
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"))
            bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"))

            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file is not None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"))
                bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"))
                bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"))
                bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            # stop as soon as any of the opened inputs runs out of reads
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 is None:
                        barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1])<self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
                # either mate exceeding the limit rejects the pair; the
                # second operand must test read2's count (it used to test
                # read1's count twice, so read2 was never checked)
                if lowQual1 > self.options.unqualified_base_limit or lowQual2 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 is not None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset <0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset+overlap_len]
                    r2[3] = r2[3][-offset:-offset+overlap_len]
                    # then calc overlap again
                    (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                if overlap_len>30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance>0:
                        #try to fix low quality base
                        # a hamming distance different from the reported
                        # distance is treated as an indel and the pair is rejected
                        hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        # walk the overlap: position o counted from the start
                        # of the overlap in read1 pairs with position -o-1 in read2
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o-1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o-1]
                            if b1 != b2:
                                # only correct when one base has quality >= 27
                                # and the other has quality <= 16
                                if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                    r2[1] = util.changeString(r2[1], -o-1, util.complement(b1))
                                    r2[3] = util.changeString(r2[3], -o-1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                    r1[1]= util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None)
            r1qc_postfilter.statRead(r1)
            if r2 is not None:
                r2qc_postfilter.statRead(r2)

            GOOD += 1
            # this check is only reached after a good read
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file is not None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #close all files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file is not None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file is not None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file is not None:
                good_index2_file.flush()
                bad_index2_file.flush()
            # overlap writers are created only with store_overlap; flush the
            # ones that exist, like the good/bad writers above
            for overlap_writer in (overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file):
                if overlap_writer is not None:
                    overlap_writer.flush()

        # print stat numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads']=TOTAL
        result['good_reads']=GOOD
        result['bad_reads']=BAD
        result['bad_reads_with_bad_barcode']= BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble']= BADBBL
        result['bad_reads_with_bad_read_length']= BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX']= BADPOL
        result['bad_reads_with_low_quality']=BADLQC
        result['bad_reads_with_too_many_N']= BADNCT
        result['bad_reads_with_bad_overlap']= BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure
        labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777']
        if self.options.read2_file is not None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF9999')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#FFBBBB')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#FFDDDD')

        fig = plt.figure(1)
        plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL))
        fig.subplots_adjust(left = 0.14)
        lefts = xrange(len(counts))
        plt.yticks(lefts, labels)
        plt.ylim(-0.5, len(counts)-0.5)
        plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"))
        plt.close(1)

        stat={}
        # stat["options"]=self.options
        stat["summary"]=result
        stat["command"]=makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file is not None:
            stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"]={}
            stat["overlap"]['overlapped_pairs']=OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing: integer division under Python 2
                # would truncate the mean before it became a float
                stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM)/float(OVERLAPPED)
            else:
                stat["overlap"]['average_overlap_length']=0.0
            stat["overlap"]['bad_edit_distance']=BADOL
            stat["overlap"]['bad_mismatch_bases']=BADMISMATCH
            stat["overlap"]['bad_indel']=BADINDEL
            stat["overlap"]['reads_with_corrected_mismatch_bases']=BASE_CORRECTED
            stat["overlap"]['overlapped_area_edit_distance_histogram']=distance_histgram[0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png"))

        stat_json = json.dumps(stat, sort_keys=True,indent=4, separators=(',', ': '))
        # the with-block closes the file even when the write fails
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
예제 #24
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def viewDet(gtImages,detfile,opt="all",usetr=True,usedf=False,stop=True,t=0.5):
    """
    Show the detections stored in detfile one by one, in order of
    decreasing score, on top of their image.

    Each line of detfile is split on whitespace into an image name followed
    by five numbers: the score and four box coordinates. For every
    detection the image and its ground-truth boxes are fetched from
    gtImages by name; a ground-truth box with overlap() above 0.49 is drawn
    together with the detection (green), ground-truth boxes with a smaller
    positive overlap are drawn thin, and a detection without any match is
    drawn in red.

    Arguments:
    gtImages -- must offer getImageByName2(name) and
                getBBoxByName(name, usetr=..., usedf=...).
    detfile -- path of the detection file.
    opt -- not used by this function.
    usetr, usedf -- forwarded unchanged to gtImages.getBBoxByName().
    stop -- when True wait for the user (raw_input) after each displayed
            detection, otherwise sleep t seconds.
    t -- pause in seconds used when stop is False.
    """
    # the with-block closes the file as soon as the lines are read
    with open(detfile,"r") as detf:
        detect=detf.readlines()
    detlst=numpy.zeros((len(detect),5))
    namelst=[]
    pylab.ioff()
    for idet,el in enumerate(detect):
        aux=el.split()
        namelst.append(aux[0])
        detlst[idet,:]=aux[1:]
    # negate the score column so that argsort yields a descending order
    srt=numpy.argsort(-detlst[:,0])
    ovr=0.49
    pylab.figure()
    bb=numpy.zeros((4))
    for rank in range(detlst.shape[0]):
        pylab.ioff()
        name=namelst[srt[rank]]
        abb=detlst[srt[rank]]
        conf=abb[0]
        # the stored coordinates are swapped pairwise to obtain the order
        # passed to overlap() and box()
        bb[0]=abb[2];bb[1]=abb[1];bb[2]=abb[4];bb[3]=abb[3]
        pylab.clf()
        img=gtImages.getImageByName2(name)
        gtbb=gtImages.getBBoxByName(name,usetr=usetr,usedf=usedf)
        found=False
        for gtb in gtbb:
            pylab.imshow(img)
            pylab.title("%s Confidence: %f"%(name,float(conf)))
            # compute the overlap once and reuse it for print and tests
            covr=overlap(bb[:],gtb[:4])
            print(covr)
            if covr>0:
                if covr>ovr:
                    box(gtb[0],gtb[1],gtb[2],gtb[3],col='y',lw="2")
                    box(bb[0],bb[1],bb[2],bb[3],col='g',lw="2")
                    pylab.show()
                    pylab.draw()
                    if stop:
                        raw_input()
                    else:
                        time.sleep(t)
                    found=True
                else:
                    box(gtb[0],gtb[1],gtb[2],gtb[3],col='y',lw="1")
        if not found:
            pylab.imshow(img)
            box(bb[0],bb[1],bb[2],bb[3],col='r',lw="2")
            pylab.show()
            pylab.draw()
            if stop:
                raw_input()
            else:
                time.sleep(t)
0
def readMotifMatching(combinationList,
                      coordDict,
                      pwmFileNameList,
                      color="black",
                      pwmReferenceList=None):
    """Reads motif predicted binding sites files and creates necessary structures for the statistical test.

    Keyword arguments:
    combinationList -- List of the number of cobinding combinations.
    coordDict -- Dictionary of coordinates where the motif matching was applied.
    pwmFileNameList -- List of PWMs files where each entry's name will represent the name of the motif.
                       Alternatively, it can be a single file containing all the MPBSs and their name on the NAME field.
    color -- Color of the bed entries. Can be 'green', 'red' or 'black'. (default 'black')
    pwmReferenceList -- Optional argument. In case pwmFileNameList is a single file (final motif matching file), this
                        parameter can be set to be a pwmList that will preserve the order of the pwmList. This is useful
                        in the case you want the same combinations of cobinding factors be created. (default None)

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each chromosome) of motif predicted binding sites.
    statDict -- Dictionary of statistics for Fisher test concerning the number of motifs inside enriched regions.
    geneDict -- Dictionary of genes (position NAME in bed file) that contains each motif.

    A motif without any MPBS on a chromosome of coordDict (or a motif of
    pwmReferenceList that is absent from the file) is treated as having no
    binding sites there.
    """

    # Reading all MPBSs
    pwmList = []
    allMpbsDict = dict()
    if (isinstance(pwmFileNameList, list)):
        for pwmFileName in pwmFileNameList:
            # Motif name is the file's base name without its last extension
            pwmList.append(".".join(
                pwmFileName.split("/")[-1].split(".")[:-1]))
            allMpbsDict[
                pwmList[-1]] = bedFunctions.createBedDictFromSingleFile(
                    pwmFileName, separator="\t")
    else:
        if (pwmReferenceList): pwmList = pwmReferenceList
        # The with-block closes the file even if a line fails to parse
        with open(pwmFileNameList, "r") as pwmFile:
            for line in pwmFile:
                ll = line.strip().split("\t")
                motifName = ll[3]
                if (motifName not in allMpbsDict):
                    # First occurrence of this motif
                    if (not pwmReferenceList): pwmList.append(motifName)
                    allMpbsDict[motifName] = dict()
                # Tab-separated fields used: 0 = chromosome, 1 and 2 = integer
                # positions, 3 = motif name, 4 = integer, 5 = kept as text
                allMpbsDict[motifName].setdefault(ll[0], []).append(
                    [int(ll[1]),
                     int(ll[2]), motifName,
                     int(ll[4]), ll[5]])

    # Creating chromosome list
    # NOTE(review): chrList is computed and filtered below but never used
    # afterwards; the counting loop iterates over coordDict's keys. Kept
    # unchanged because the intent cannot be confirmed from here.
    chrList = constants.getChromList(reference=[coordDict])
    # Removing chrX, chrY and chrM
    chrList = [e for e in chrList if e not in ["chrX", "chrY", "chrM"]]

    # Evaluating bed additionals
    if (color == "green"): color = "0,130,0"
    elif (color == "red"): color = "130,0,0"
    elif (color == "black"): color = "0,0,0"

    # Create combinations dictionary keys
    combKeys = []
    for c in combinationList:
        for b in [",".join(e) for e in itertools.combinations(pwmList, c)]:
            combKeys.append(b)

    # Counting statistics
    mpbsDict = dict([(e, dict()) for e in pwmList])
    statDict = dict([(e, [0, 0]) for e in combKeys
                     ])  # Left is evidence / Right is not evidence
    geneDict = dict([(e, []) for e in combKeys])
    for chrName in coordDict:

        for e in mpbsDict:
            mpbsDict[e][chrName] = []  # Creating chrName keys
        counter = dict([(e, 0) for e in pwmList
                        ])  # Counters to iterate over all mpbs dict
        # MPBSs of every motif on this chromosome; an empty list when the
        # motif has none, instead of failing with a KeyError
        chrMpbs = dict([(e, allMpbsDict.get(e, {}).get(chrName, []))
                        for e in pwmList])

        # Iterating on coordinates
        for coord in coordDict[chrName]:

            flagMotifs = dict([(e, False) for e in pwmList
                               ])  # Motifs found on this coordinate

            # Searching for MPBSs that overlapped this coordinate
            for factorName in pwmList:
                factorMpbs = chrMpbs[factorName]
                # The counter is not reset between coordinates, so each
                # MPBS is visited at most once per chromosome
                while (counter[factorName] < len(factorMpbs)):
                    currMpbs = factorMpbs[counter[factorName]]
                    check = util.overlap(coord, currMpbs)
                    if (check == 0):  # Contain overlap
                        flagMotifs[factorName] = True
                        mpbsDict[factorName][chrName].append(
                            currMpbs + [currMpbs[0], currMpbs[1], color])
                    elif (check == -1):
                        break  # Motif is after coord
                    counter[factorName] += 1

            # Updating statistic counts and genes
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = []
            motifsNotFoundKeys = list(combKeys)
            for c in combinationList:
                for b in [
                        ",".join(e)
                        for e in itertools.combinations(motifsFoundList, c)
                ]:
                    motifsFoundKeys.append(b)
                    motifsNotFoundKeys.remove(b)
            for k in motifsFoundKeys:
                statDict[k][0] += 1
                # coord[2] holds the gene names separated by ':'
                for e in coord[2].split(":"):
                    geneDict[k].append(e)
            for k in motifsNotFoundKeys:
                statDict[k][1] += 1

    # Remove repetitive genes from geneList
    for k in geneDict:
        geneDict[k] = list(set(geneDict[k]))

    return mpbsDict, statDict, geneDict
예제 #26
0
def VOCprRecord_wrong(gtImages,
                      detlist,
                      show=False,
                      usetr=True,
                      usedf=False,
                      ovr=0.5):
    """
    Flag every detection as hit or miss for a precision-recall curve,
    without suppressing duplicate detections.

    A detection is a true positive whenever its best overlap with a
    ground-truth box of the same image exceeds ``ovr``; nothing records
    which box was matched, so several detections can be credited to the
    same box.

    gtImages -- indexable collection of records with "name" and "bbox"
                entries (getImageByName2() is also needed when ``show``
                is set)
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, item 1 the score, items 2-5 the box,
                handed to overlap() in the order (3, 2, 5, 4)
    show     -- print running precision/recall and wait for input after
                each detection; misses (or any detection when "s" is
                entered) are plotted
    usetr, usedf -- unused, kept so existing calls keep working
    ovr      -- overlap a detection has to exceed to count as a hit

    Returns (tp, fp, thr, tot): per-detection hit flags, miss flags and
    scores (all in sorted order) and the number of ground-truth boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    # Ground-truth boxes keyed by image base name (no directory/extension).
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": rect}
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        # Start from an empty list so the display code below never draws
        # boxes left over from another image.
        rect = []
        maxovr = 0
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            for r in rect:
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
        # The flag drives both the tp/fp bookkeeping and the colour/filter
        # used when plotting.
        found = maxovr > ovr
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = wait_for_key()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                # Ground truth in blue, detection in green (hit) or red.
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()

    return tp, fp, thr, tot
예제 #27
0
def viewDet(gtImages,
            detfile,
            opt="all",
            usetr=True,
            usedf=False,
            stop=True,
            t=0.5):
    """
    Step through the detections stored in a text file, best score first,
    and plot each one against the ground truth of its image.

    gtImages -- object offering getImageByName2(name) and
                getBBoxByName(name, usetr=..., usedf=...)
    detfile  -- path of a whitespace separated text file; each line holds
                an image name followed by five numbers: the confidence and
                four box values (used in the order 2, 1, 4, 3)
    opt      -- unused, kept so existing calls keep working
    usetr, usedf -- forwarded to getBBoxByName()
    stop     -- wait for user input after each plot instead of sleeping
    t        -- seconds to sleep between plots when ``stop`` is false

    Returns None; the function only prints overlaps and draws figures.
    """
    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    def pause():
        # Block on the user or just delay, depending on ``stop``.
        if stop:
            wait_for_key()
        else:
            time.sleep(t)

    # The with-block closes the file even on errors; blank lines are
    # dropped because they have no name/score fields to split.
    with open(detfile, "r") as detf:
        detect = [line for line in detf if line.strip()]
    detlst = numpy.zeros((len(detect), 5))
    namelst = []
    pylab.ioff()
    for row, line in enumerate(detect):
        aux = line.split()
        namelst.append(aux[0])
        detlst[row, :] = aux[1:]
    # Indices of the detections by decreasing confidence.
    srt = numpy.argsort(-detlst[:, 0])
    ovr = 0.49
    pylab.figure()
    bb = numpy.zeros((4))
    for pos in srt:
        pylab.ioff()
        name = namelst[pos]
        abb = detlst[pos]
        conf = abb[0]
        bb[0] = abb[2]
        bb[1] = abb[1]
        bb[2] = abb[4]
        bb[3] = abb[3]
        pylab.clf()
        img = gtImages.getImageByName2(name)
        gtbb = gtImages.getBBoxByName(name, usetr=usetr, usedf=usedf)
        found = False
        for gtbox in gtbb:
            pylab.imshow(img)
            pylab.title("%s Confidence: %f" % (name, float(conf)))
            # Evaluate the overlap once and reuse it for print and tests.
            cur = overlap(bb[:], gtbox[:4])
            print(cur)
            if cur > ovr:
                # Match: thick yellow ground truth plus green detection.
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="2")
                box(bb[0], bb[1], bb[2], bb[3], col='g', lw="2")
                pylab.show()
                pylab.draw()
                pause()
                found = True
            elif cur > 0:
                # Partial overlap: thin yellow ground truth only.
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="1")
        if not found:
            # No ground-truth box matched: show the detection in red.
            pylab.imshow(img)
            box(bb[0], bb[1], bb[2], bb[3], col='r', lw="2")
            pylab.show()
            pylab.draw()
            pause()
예제 #28
0
def VOCprlistfast(gtImages,
                  detlist,
                  show=False,
                  usetr=True,
                  usedf=False,
                  ovr=0.5):
    """
    Flag every detection as true or false positive for a
    precision-recall curve, letting each ground-truth box match once.

    gtImages -- object offering getTotal(), getBBox(idx),
                getImageName(idx) and, when ``show`` is set,
                getImageByName2(name)
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, items 2-5 the box, handed to
                overlap() in the order (3, 2, 5, 4)
    show     -- plot every detection and wait for user input
    usetr, usedf -- unused, kept so existing calls keep working
    ovr      -- minimum overlap (inclusive) for a match

    Returns (tp, fp, tot): per-detection hit and miss flags in sorted
    order and the number of ground-truth boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    # Unmatched ground-truth boxes keyed by image base name.
    dimg = {}
    tot = 0
    for idx in range(gtImages.getTotal()):
        rect = gtImages.getBBox(idx)
        if rect != []:
            name = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            # Work on a copy: matched boxes are removed below and that
            # must not alter the list handed out by getBBox().
            dimg[name] = list(rect)
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset so the display code never draws boxes of another image.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            for r in rect:
                if overlap(rb, r) >= ovr:
                    # Consume the box so a later detection cannot reuse it;
                    # safe because the loop stops right after the removal.
                    rect.remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        if show:
            pylab.ioff()
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            # Still unmatched ground truth in blue, detection in green
            # (hit) or red (miss).
            for r in rect:
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            wait_for_key()

    return tp, fp, tot
예제 #29
0
def overlap_clusters2(C):
    """
    Group clusters that overlap on either of their two sides and reduce
    every group to a list of merged regions.

    C -- sequence of clusters; items 0-2 describe the first side and
         items 3-5 the second side as (name, start, end). The names are
         only compared for equality and (start, end) is passed to
         util.overlap() -- presumably sequence name plus coordinates;
         confirm against the caller.

    Returns a list of groups. Each group is a list of tuples
    (name, start, end, 'id_a' or 'id_b'); groups found to be
    near-duplicates of another group are dropped.
    """
    overlaps = []

    for i in range(len(C)):
        # Progress indicator, rewritten in place on one terminal line.
        sys.stdout.write("\r%d / %d" % (i, len(C)))
        sys.stdout.flush()
        K = [C[i]]
        for j in range(i + 1, len(C)):
            cj = C[j]
            for ck in K:
                len1 = float(cj[2] - cj[1] + 1) / float(ck[2] - ck[1] + 1)
                len2 = float(cj[5] - cj[4] + 1) / float(ck[5] - ck[4] + 1)

                # A cluster joins the group when, on at least one side, it
                # shares the name, overlaps, and has a comparable length
                # (ratio strictly between 0.3 and 3).
                side_a = (cj[0] == ck[0]
                          and util.overlap((cj[1], cj[2]),
                                           (ck[1], ck[2])) > 0
                          and 0.3 < len1 < 3)
                if side_a or (cj[3] == ck[3]
                              and util.overlap((cj[4], cj[5]),
                                               (ck[4], ck[5])) > 0
                              and 0.3 < len2 < 3):
                    K.append(cj)
                    break
        if len(K) > 1:
            overlaps.append(K)

    # Overlapping regions, one list per group
    OR = []

    # Within a group, merge regions that overlap into one maximal region
    for group in overlaps:
        R1 = [(c[0], c[1], c[2], 'id_a') for c in group]
        R2 = [(c[3], c[4], c[5], 'id_b') for c in group]

        # List of unique regions
        UR = []

        for R in [R1, R2]:
            # Visit every index, including the last one: a final region
            # that was not merged into an earlier one is unique as well
            # and has to be kept.
            for i in range(len(R)):
                mr = R[i]  # maximal region
                if mr is None:
                    continue  # already absorbed by an earlier region
                for j in range(i + 1, len(R)):
                    rj = R[j]
                    if rj is None:
                        continue
                    # If the two regions overlap, expand the maximal region
                    if (mr[0] == rj[0] and util.overlap(
                            (mr[1], mr[2]), (rj[1], rj[2])) > 0):
                        mr = (mr[0], min(mr[1], rj[1]), max(mr[2], rj[2]),
                              mr[3])
                        R[j] = None
                UR.append(mr)
        OR.append(UR)

    # Remove duplicates among groups (check for subsets)
    for i in range(len(OR) - 1):
        for j in range(i + 1, len(OR)):
            shared = sum(1 for region in OR[j] if region in OR[i])
            # Compare against the size of the group being tested; using the
            # number of groups here would make the threshold meaningless.
            if shared >= 0.8 * len(OR[j]):
                # Keep the larger group, empty the other one.
                if len(OR[i]) > len(OR[j]):
                    OR[j] = []
                else:
                    OR[i] = []

    return [x for x in OR if len(x) > 0]
예제 #30
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def VOCprRecordthr(gtImages, detlist, show=False, ovr=0.5, pixels=None):
    """
    Flag every detection as true or false positive for a
    precision-recall curve, crediting each ground-truth box once.

    gtImages -- indexable collection of records with "name" and "bbox"
                entries (getImageByName2() is also needed when ``show``
                is set). Item 5 of a box decides whether it is counted:
                only boxes with 0 there add to the total and can yield a
                true positive (presumably the PASCAL "difficult" flag --
                confirm against the data loader)
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, item 1 the score, items 2-5 the box,
                handed to the overlap function in the order (3, 2, 5, 4)
    show     -- print running precision/recall and wait for input after
                each detection; non-hits (or any detection when "s" is
                entered) are plotted
    ovr      -- overlap a detection has to exceed to match a box
    pixels   -- when given, overlapx(det, gt, pixels) is used instead of
                overlap(det, gt)

    Returns (tp, fp, thr, tot, posd): per-detection hit flags, miss flags
    and scores (all in sorted order), the number of counted ground-truth
    boxes, and the scores of the true positives.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    # Ground truth keyed by image base name, with one "already detected"
    # flag per box.
    dimg = {}
    tot = 0
    posd = []
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": rect, "det": [False] * len(rect)}
            tot = tot + sum(1 for recti in rect if recti[5] == 0)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        # Reset so the display code never draws boxes of another image.
        rect = []
        maxovr = 0
        gt = 0
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            for ir, r in enumerate(rect):
                # Identity test: "== None" misbehaves for array-like pixels.
                if pixels is None:
                    covr = overlap(rb, r)
                else:
                    covr = overlapx(rb, r, pixels)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir

        # Same decision tree as the PASCAL 2010 reference this code was
        # written from: enough overlap with an uncounted box is ignored,
        # with a fresh counted box it is a true positive, with an already
        # detected box a false positive (multiple detection); too little
        # overlap is a false positive.
        if maxovr > ovr:
            entry = dimg[detbb[0]]
            if entry["bbox"][gt][5] == 0:
                if not entry["det"][gt]:
                    tp[idx] = 1
                    entry["det"][gt] = True
                    posd.append(detbb[1])
                else:
                    fp[idx] = 1
        else:
            fp[idx] = 1
        # Drives the colour/filter used when plotting below.
        found = tp[idx] == 1

        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = wait_for_key()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                # Ground truth in blue, detection in green (hit) or red.
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()

    return tp, fp, thr, tot, posd
예제 #31
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def VOCprRecord_wrong(gtImages, detlist, show=False, usetr=True, usedf=False,
                      ovr=0.5):
    """
    Flag every detection as hit or miss for a precision-recall curve,
    without suppressing duplicate detections.

    A detection is a true positive whenever its best overlap with a
    ground-truth box of the same image exceeds ``ovr``; nothing records
    which box was matched, so several detections can be credited to the
    same box.

    gtImages -- indexable collection of records with "name" and "bbox"
                entries (getImageByName2() is also needed when ``show``
                is set)
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, item 1 the score, items 2-5 the box,
                handed to overlap() in the order (3, 2, 5, 4)
    show     -- print running precision/recall and wait for input after
                each detection; misses (or any detection when "s" is
                entered) are plotted
    usetr, usedf -- unused, kept so existing calls keep working
    ovr      -- overlap a detection has to exceed to count as a hit

    Returns (tp, fp, thr, tot): per-detection hit flags, miss flags and
    scores (all in sorted order) and the number of ground-truth boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    # Ground-truth boxes keyed by image base name (no directory/extension).
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": rect}
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        # Start from an empty list so the display code below never draws
        # boxes left over from another image.
        rect = []
        maxovr = 0
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            for r in rect:
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
        # The flag drives both the tp/fp bookkeeping and the colour/filter
        # used when plotting.
        found = maxovr > ovr
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = wait_for_key()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                # Ground truth in blue, detection in green (hit) or red.
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()

    return tp, fp, thr, tot
예제 #32
0
def viewSortDet(gtImages,
                detlist,
                numim=numpy.inf,
                opt="all",
                usetr=True,
                usedf=False,
                ovr=0.5,
                show=True):
    """
    Walk through the detections from best to worst, flag each one as
    true or false positive and display it on its image.

    gtImages -- object offering getTotal(), getBBox(idx),
                getImageName(idx) and getImageByName2(name)
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, item 1 the score, items 2-5 the box,
                handed to overlap() in the order (3, 2, 5, 4)
    numim    -- upper bound on how many ground-truth images are loaded
    opt, usetr, usedf -- unused, kept so existing calls keep working
    ovr      -- minimum overlap (inclusive) for a match
    show     -- print running precision/recall, plot every detection and
                wait for user input. The body always tested this name, so
                it is now a keyword argument; it defaults to True because
                the function is a viewer.

    Returns (tp, fp, thr, tot): per-detection hit flags, miss flags and
    scores (all in sorted order) and the number of ground-truth boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    # Unmatched ground-truth boxes keyed by image base name.
    dimg = {}
    tot = 0
    # int() keeps range() happy when numim is given as a float.
    for idx in range(int(min(gtImages.getTotal(), numim))):
        rect = gtImages.getBBox(idx)
        if rect != []:
            name = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            # Work on a copy: matched boxes are removed below and that
            # must not alter the list handed out by getBBox().
            dimg[name] = list(rect)
        tot = tot + len(rect)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset so the display code never draws boxes of another image.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            for r in rect:
                if overlap(rb, r) >= ovr:
                    # Consume the box so a later detection cannot reuse it;
                    # safe because the loop stops right after the removal.
                    rect.remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            pylab.ioff()
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec: %s Rec: %s" % (detbb[1], prec, rec))
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            # Still unmatched ground truth in blue, detection in green
            # (hit) or red (miss).
            for r in rect:
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            wait_for_key()

    return tp, fp, thr, tot
예제 #33
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def VOCanalysis(gtImages, detlist, show=False, usetr=True, usedf=False,
                ovr=0.5):
    """
    Split the detections into true positives and two kinds of false
    positives, and list the ground-truth boxes that were never detected.

    gtImages -- indexable collection of records with "name" and "bbox"
                entries
    detlist  -- detections, sorted in place with ``cmpscore``. Item 0 is
                the image base name, items 2-5 the box, handed to
                overlap() in the order (3, 2, 5, 4)
    show, usetr, usedf -- unused, kept so existing calls keep working
    ovr      -- overlap a detection has to exceed to match a box

    Returns (tplist, fplist, fp2list, fnlist):
      tplist  -- detections that were the first to match a box
      fplist  -- detections matching a box that was already detected
      fp2list -- detections without sufficient overlap with any box
      fnlist  -- [name, 0, box[0:4]] for every undetected box
    """
    from functools import cmp_to_key

    # Ground truth keyed by image base name, with one "already detected"
    # flag per box.
    dimg = {}
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": rect, "det": [False] * len(rect)}

    tplist = []
    fplist = []
    fp2list = []
    fnlist = []

    # key=cmp_to_key(...) orders exactly like sort(cmpscore) and also works
    # on Python 3, where the positional cmp argument no longer exists.
    detlist.sort(key=cmp_to_key(cmpscore))
    for detbb in detlist:
        maxovr = 0
        gt = 0
        if detbb[0] in dimg:
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            # Remember the best overlapping box (the last one on ties).
            for ir, r in enumerate(dimg[detbb[0]]["bbox"]):
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir
        if maxovr > ovr:
            flags = dimg[detbb[0]]["det"]
            if not flags[gt]:
                flags[gt] = True
                tplist.append(detbb)
            else:
                # The box was already claimed by a better-ranked detection.
                fplist.append(detbb)
        else:
            fp2list.append(detbb)

    totalDetected = 0
    totalnoDetected = 0

    # Second pass over the ground truth: count the detected boxes and
    # collect the rest as false negatives.
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            bboxgt = dimg[name]
            for gtbox, detected in zip(bboxgt["bbox"], bboxgt["det"]):
                if detected:
                    totalDetected += 1
                else:
                    fnlist.append([name, 0, gtbox[0:4]])
                    totalnoDetected += 1

    print("total Detected %d, total no Detected %d" %
          (totalDetected, totalnoDetected))

    return tplist, fplist, fp2list, fnlist
예제 #34
0
    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contains QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file),
                                      "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file != None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)

        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            r1 = read1_file.nextRead()
            if r1 == None:
                break

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break

            TOTAL += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        #try to fix low quality base
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # print(TOTAL, o, b1, b2, q1, q2)
                                if util.qualNum(q1) >= 27 and util.qualNum(
                                        q2) <= 16:
                                    r2[1] = util.changeString(
                                        r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(
                                        q1) <= 16:
                                    r1[1] = util.changeString(
                                        r1[1],
                                        len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(
                                        r1[3],
                                        len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
                r1qc_postfilter.statRead(r1)
                if r2 != None:
                    r2qc_postfilter.statRead(r2)

            GOOD += 1
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #close all files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file != None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file != None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file != None:
                good_index2_file.flush()
                bad_index2_file.flush()

        # print stat numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(
                100.0 * float(counts[i]) / TOTAL) + "%)"

        fig = plt.figure(1)
        plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
                  fontsize=12,
                  color='#666666')
        plt.axis('equal')
        patches, texts = plt.pie(counts, colors=colors, radius=0.7)
        patches, labels, dummy = zip(*sorted(
            zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
        plt.legend(patches, labels, loc='upper left', fontsize=9)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"),
                    bbox_inches='tight')
        plt.close(1)

        stat = {}
        # stat["options"]=self.options
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[
            0:10]
        stat["kmer_content"][
            "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM / OVERLAPPED)
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"][
                'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"][
                'overlapped_area_edit_distance_histogram'] = distance_histgram[
                    0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                                os.path.join(qc_dir, "overlap.png"))

        stat_file = open(os.path.join(qc_dir, "after.json"), "w")
        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
예제 #35
0
파일: VOCpr.py 프로젝트: ballasn/facedet
def VOCprRecordOptim(gtImages, detlist, show=False, ovr=0.5, pixels=None):
    """
    Score detections against ground truth for a precision/recall curve and
    record how every true positive is shifted/scaled with respect to the
    ground-truth box it matched.

    Args:
        gtImages: sized, integer-indexable collection of ground-truth
            records.  Each record is read through its "name" and "bbox"
            keys.  Every box must expose at least six entries: entries 0-3
            are coordinates and entry 5 is a flag; only boxes whose flag is
            0 are counted in ``tot`` and scored (presumably "not
            difficult" -- confirm against the dataset code).  With
            ``show=True`` it must also provide ``getImageByName2(name)``.
        detlist: list of detections; it is sorted IN PLACE with the
            module-level ``cmpscore`` comparator.  Entry 0 is the image key
            (file name without directory and extension), entry 1 the score
            and entries 2-5 the box coordinates.
        show: when true, print running precision/recall after every
            detection, wait for a line on stdin and plot the detection if
            the line is "s" or the detection is not a true positive.
        ovr: a detection matches only if its best overlap is strictly
            greater than this value.
        pixels: when not None, ``overlapx(det, gt, pixels)`` is used instead
            of ``overlap(det, gt)``.

    Returns:
        ``(tp, fp, thr, tot, tx, ty, sx, sy)`` where ``tp``/``fp``/``thr``
        are numpy arrays aligned with the sorted ``detlist`` (true-positive
        flag, false-positive flag, score), ``tot`` is the number of counted
        ground-truth boxes, and ``tx``/``ty``/``sx``/``sy`` are lists with
        one entry per true positive: centre offset and size ratio of the
        ground-truth box relative to the detection, along each axis.
    """
    # Function-scope import keeps the block self-contained.  cmp_to_key lets
    # the cmp-style comparator work on Python 2.7 and Python 3 alike
    # (list.sort(cmpfunc) is Python 2 only).
    from functools import cmp_to_key

    tx = []
    ty = []
    sx = []
    sy = []
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        record = gtImages[idx]
        rect = record["bbox"][:]
        # Images without any annotation are left out of the lookup table.
        if len(rect) == 0:
            continue
        key = record["name"].split("/")[-1].split(".")[0]
        dimg[key] = {"bbox": rect, "det": [False] * len(rect)}
        tot += sum(1 for gtbox in rect if gtbox[5] == 0)

    if show:
        # raw_input only exists on Python 2; input is its Python 3 name.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    detlist.sort(key=cmp_to_key(cmpscore))
    for idx, detbb in enumerate(detlist):
        found = False
        maxovr = 0
        gt = 0
        entry = dimg.get(detbb[0])
        # Reset per detection so the plot never shows boxes that belong to
        # a previously processed image.
        rect = []
        if entry is not None:
            rect = entry["bbox"]
            rb = (float(detbb[3]), float(detbb[2]),
                  float(detbb[5]), float(detbb[4]))
            # Keep the ground-truth box with the largest overlap; ">="
            # means the last box wins on ties, as before.
            for ir, r in enumerate(rect):
                if pixels is None:
                    covr = overlap(rb, r)
                else:
                    covr = overlapx(rb, r, pixels)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir

        # The explicit entry check avoids a KeyError when ovr is negative
        # and the detection's image has no ground truth at all.
        if entry is not None and maxovr > ovr:
            gtbox = entry["bbox"][gt]
            # Matches to boxes with a non-zero flag are neither TP nor FP.
            if gtbox[5] == 0:
                if not entry["det"][gt]:
                    tp[idx] = 1
                    # Mark the match so the show branch can draw it green.
                    found = True
                    entry["det"][gt] = True
                    gtx = gtbox[3] - gtbox[1]
                    dtx = detbb[4] - detbb[2]
                    gty = gtbox[2] - gtbox[0]
                    dty = detbb[5] - detbb[3]
                    gtcx = (gtbox[3] + gtbox[1]) / 2.
                    dtcx = (detbb[4] + detbb[2]) / 2.
                    gtcy = (gtbox[2] + gtbox[0]) / 2.
                    dtcy = (detbb[5] + detbb[3]) / 2.
                    tx.append((gtcx - dtcx) / float(dtx))
                    ty.append((gtcy - dtcy) / float(dty))
                    sx.append(gtx / float(dtx))
                    sy.append(gty / float(dty))
                else:
                    # Ground-truth box already claimed by a better-ranked
                    # detection: duplicates count as false positives.
                    fp[idx] = 1
        else:
            fp[idx] = 1

        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr:", detbb[1], "Prec:%.3f" % prec, "Rec:%.3f" % rec)
            ss = read_line()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]),
                      float(detbb[5]), float(detbb[4]))
                # Ground truth in blue, detection in green (true positive)
                # or red (anything else).
                for r in rect:
                    pylab.figure(1)
                    pylab.ioff()
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()

    return tp, fp, thr, tot, tx, ty, sx, sy
예제 #36
0
    def setComplementSequence(self, sequenceString, strand):
        """
        Write into this strand's sequence the characters of sequenceString
        that fall inside the index range shared with the given
        complementary strand.

        sequenceString may be None, in which case the shared range is
        filled with blanks.  Positions outside the shared range keep what
        this strand already stored (blanks if it stored nothing).

        Both strings are brought into the same left-to-right orientation
        before they are spliced: when this strand is drawn 5' to 3' the
        incoming string is reversed up front, otherwise this strand's own
        sequence is reversed for the splice and the result is reversed back
        afterwards.

        Returns the updated self._sequence, which is None when the result
        contains nothing but whitespace.
        """
        ownLow, ownHigh = self._baseIdxLow, self._baseIdxHigh
        compLow, compHigh = strand.idxs()

        # index range covered by both strands
        lowIdx, highIdx = util.overlap(ownLow, ownHigh, compLow, compHigh)

        totalLength = self.totalLength()

        # incoming characters, oriented left to right
        if sequenceString is None:
            useSeq = ' ' * totalLength
        elif self._isDrawn5to3:
            useSeq = sequenceString[::-1]
        else:
            useSeq = sequenceString
        incoming = array(ARRAY_TYPE, sixb(useSeq))

        # this strand's current characters, in the same orientation
        if self._sequence is None:
            ownText = ' ' * totalLength
        elif self._isDrawn5to3:
            ownText = self._sequence
        else:
            ownText = self._sequence[::-1]
        merged = array(ARRAY_TYPE, sixb(ownText))

        # insertions shift the character offsets on both strands
        ownLead = self.insertionLengthBetweenIdxs(ownLow, lowIdx - 1)
        spanInsertions = self.insertionLengthBetweenIdxs(lowIdx, highIdx)
        compLead = strand.insertionLengthBetweenIdxs(compLow, lowIdx - 1)

        srcStart = lowIdx - compLow + compLead
        srcEnd = srcStart + spanInsertions + highIdx - lowIdx + 1
        dstStart = lowIdx - ownLow + ownLead
        dstEnd = highIdx - ownLow + 1 + ownLead + spanInsertions
        merged[dstStart:dstEnd] = incoming[srcStart:srcEnd]

        self._sequence = tostring(merged)

        # undo the orientation flip applied to our own sequence above
        if not self._isDrawn5to3:
            self._sequence = self._sequence[::-1]

        # an all-blank result is stored as "no sequence"
        if not self._sequence.strip():
            self._sequence = None

        return self._sequence
예제 #37
0
파일: VOCpr.py 프로젝트: ChrisYang/CRFdet
def VOCprlist(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
    Split detection scores into true and false positives, image by image,
    as input for a precision/recall curve.

    For every ground-truth image the detections whose key equals the image
    file name (without directory and extension) are visited in ``detlist``
    order.  A detection is a true positive if it overlaps a still unmatched
    ground-truth box by at least ``ovr``; that box is then removed so it
    cannot be matched twice.  Every other detection of the image is a false
    positive.  Detections whose key matches no image are ignored.

    Args:
        gtImages: object providing ``getTotal()``, ``getImageName(idx)``,
            ``getBBox(idx, usetr=..., usedf=...)`` and, when ``show`` is
            true, ``getImage(idx)``.  The list returned by ``getBBox`` is
            modified in place.
        detlist: iterable of detections; entry 0 is the image key, entry 1
            the score and entries 2-5 the box coordinates.
        show: when true, plot ground truth (blue), true positives (green)
            and false positives (red) with pylab.
        usetr, usedf: passed through to ``gtImages.getBBox``.
        ovr: minimum overlap (inclusive) for a match.

    Returns:
        ``(tp, fp, tot)``: the scores of the true positives, the scores of
        the false positives and the total number of ground-truth boxes.
    """
    tp = []
    fp = []
    tot = 0
    for idx in range(gtImages.getTotal()):
        image_path = gtImages.getImageName(idx)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(image_path)
        if show:
            img = gtImages.getImage(idx)
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
        rect = gtImages.getBBox(idx, usetr=usetr, usedf=usedf)
        print(rect)
        if show:
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
        # Count before matching: matched boxes are deleted from rect below.
        tot = tot + len(rect)

        # The key depends only on the image, so compute it once per image
        # rather than once per detection.
        image_key = image_path.split("/")[-1].split(".")[0]
        for data in detlist:
            if data[0] != image_key:
                continue
            rb = [float(data[3]), float(data[2]),
                  float(data[5]), float(data[4])]
            if show:
                pylab.ioff()
                pylab.text(rb[1], rb[0], data[1])

            # First remaining ground-truth box that overlaps enough, if any.
            matched = None
            for pos, r in enumerate(rect):
                if overlap(rb, r) >= ovr:
                    matched = pos
                    break

            if matched is not None:
                if show:
                    pylab.ioff()
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                del rect[matched]
                tp.append(float(data[1]))
            else:
                if show:
                    pylab.ioff()
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1)
                fp.append(float(data[1]))
        if show:
            pylab.figure(1)
            pylab.show()
            pylab.draw()
    return tp, fp, tot
예제 #38
0
    def setComplementSequence(self, sequenceString, strand):
        """
        Write into this strand's sequence the characters of sequenceString
        that fall inside the index range shared with the given
        complementary strand.

        sequenceString may be None, in which case the shared range is
        filled with blanks.  Positions outside the shared range keep what
        this strand already stored (blanks if it stored nothing).

        Sequences are stored 5' to 3', so both strings are brought into the
        same left-to-right orientation before they are spliced: when this
        strand is drawn 5' to 3' the incoming string is reversed up front,
        otherwise this strand's own sequence is reversed for the splice and
        the result is reversed back afterwards.

        Returns the updated self._sequence, which is None when the result
        contains nothing but whitespace.
        """
        sLowIdx, sHighIdx = self._baseIdxLow, self._baseIdxHigh
        cLowIdx, cHighIdx = strand.idxs()

        # index range covered by both strands
        lowIdx, highIdx = util.overlap(sLowIdx, sHighIdx, cLowIdx, cHighIdx)

        totalLength = self.totalLength()

        # incoming characters, oriented left to right
        if sequenceString is None:
            # blank filler, so a partial overlap clears the shared range
            useSeq = ' ' * totalLength
        elif self._isDrawn5to3:
            useSeq = sequenceString[::-1]
        else:
            useSeq = sequenceString
        temp = array('c', useSeq)

        # this strand's current characters, in the same orientation
        if self._sequence is None:
            ownSeq = ' ' * totalLength
        elif self._isDrawn5to3:
            ownSeq = self._sequence
        else:
            ownSeq = self._sequence[::-1]
        tempSelf = array('c', ownSeq)

        # insertions shift the character offsets on both strands
        a = self.insertionLengthBetweenIdxs(sLowIdx, lowIdx - 1)
        b = self.insertionLengthBetweenIdxs(lowIdx, highIdx)
        c = strand.insertionLengthBetweenIdxs(cLowIdx, lowIdx - 1)

        start = lowIdx - cLowIdx + c
        end = start + b + highIdx - lowIdx + 1
        destStart = lowIdx - sLowIdx + a
        destEnd = highIdx - sLowIdx + 1 + a + b
        tempSelf[destStart:destEnd] = temp[start:end]

        self._sequence = tempSelf.tostring()

        # undo the orientation flip applied to our own sequence above
        if not self._isDrawn5to3:
            self._sequence = self._sequence[::-1]

        # an all-blank result is stored as "no sequence"
        if not self._sequence.strip():
            self._sequence = None

        return self._sequence