Example #1
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min())/2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
        ]
    return output
Example #2
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min()) / 2),
        present("Range",
                np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode",
                stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value",
                normaltest(np_values)[1])
    ]
    return output
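Examples #1 and #2 lean on two small helpers, present and tupleToString, alongside the trimean and midhinge functions shown further down. A minimal sketch of what those helpers might look like; the names come from the calls above, but the bodies are assumptions rather than code from the original project:

def present(label, value):
    # Hypothetical helper: pair a metric name with its value for reporting.
    return "%s: %s" % (label, value)

def tupleToString(t):
    # Hypothetical helper: render a confidence-interval tuple as plain text.
    return "(%s, %s)" % (t[0], t[1])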
Example #3
 def _compute_params(self, features):
     m = features.shape[1]
     percentiles = np.ones((2, m)) * np.nan
     for j in xrange(m):
         percentiles[0, j] = scoreatpercentile(features[:, j], self.lower_q)
         percentiles[1, j] = scoreatpercentile(features[:, j], self.upper_q)
     return percentiles
Example #5
 def _compute_percentiles(features):
     m = features.shape[1]
     percentiles = np.ones((2, m)) * np.nan
     for j in xrange(m):
         percentiles[0, j] = scoreatpercentile(features[:, j], 1)
         percentiles[1, j] = scoreatpercentile(features[:, j], 99)
     return percentiles
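The per-column loop in Examples #3 and #5 can also be written as a single vectorized call. A minimal sketch, assuming only NumPy (np.percentile with its default linear interpolation matches scoreatpercentile here):

import numpy as np

def compute_percentiles_vectorized(features):
    # Same idea as the loop above: 1st and 99th percentile of every column,
    # returned as a (2, m) array.
    return np.percentile(features, [1, 99], axis=0)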
Example #6
def showStatistics(key=None, domain=True, dist=False, cumdist=False, clip=None, vmin=None, vmax=None, percentile=False):
    """Show the values corresponding with key in the specified mode.

    key is one of the keys of SelectableStatsValues
    mode is one of ['On Domain','Histogram','Cumulative Histogram']
    """
    S = selection.check(single=True)
    if S:
        func, onEdges = SelectableStatsValues[key]
        kargs = {}
        if key == "Curvature":
            kargs["neighbours"] = _stat_dia.results["neighbours"]
        val = func(S, **kargs)
        if key == "Curvature":
            ind = CurvatureValues.index(_stat_dia.results["curval"])
            val = val[ind]
            val = val[S.elems]

        # !! THIS SHOULD BE IMPLEMENTED AS A GENERAL VALUE CLIPPER
        # !! e.g popping up when clicking the legend
        # !! and the values should be changeable

        if clip:
            clip = clip.lower()
            if percentile:
                try:
                    from scipy.stats.stats import scoreatpercentile
                except ImportError:
                    warning(
                        """The 'percentile' clipping option is not available.
Most likely because 'python-scipy' is not installed on your system."""
                    )
                    return

                Q1 = scoreatpercentile(val, vmin)
                Q3 = scoreatpercentile(val, vmax)
                factor = 3
                if vmin:
                    vmin = Q1 - factor * (Q3 - Q1)
                if vmax:
                    vmax = Q3 + factor * (Q3 - Q1)

            if clip == "top":
                val = val.clip(max=vmax)
            elif clip == "bottom":
                val = val.clip(min=vmin)
            else:
                val = val.clip(vmin, vmax)

        if domain:
            clear()
            lights(False)
            showSurfaceValue(S, key, val, onEdges)
        if dist:
            showHistogram(key, val, cumulative=False)
        if cumdist:
            showHistogram(key, val, cumulative=True)
Example #7
 def hit_detection_proliferation(res, who, who_mitotic_hits, ctrl_points, whis=1.5):
     q3=scoreatpercentile(ctrl_points, per=75)
     q1=scoreatpercentile(ctrl_points, per=25)
     
     val=q1-whis*(q3-q1)
     
     hits=np.where(res<=val)[0]
     
     return filter(lambda x: x not in who_mitotic_hits, np.array(who)[hits])
Example #8
def normalize(data):
    wtdata = array(data)
    wtdata[wtdata < 0] = 0
    q1 = stats.scoreatpercentile(wtdata, 25)
    q3 = stats.scoreatpercentile(wtdata, 75)
    interquart = q3 - q1
    tenperc = stats.scoreatpercentile(wtdata[wtdata <= q1 + 1.5 * interquart], 90)
    maxav = wtdata[wtdata >= tenperc].mean()
    wtdata = wtdata / maxav
    return wtdata
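A short usage sketch for the normalize function above; the input values are made up, and the imports are the ones the snippet itself relies on:

from numpy import array
from scipy import stats

raw = [0.1, 0.4, -0.2, 0.9, 1.1, 0.8, 0.7]  # hypothetical reactivity values
scaled = normalize(raw)
# Negatives are clipped to 0, then the data is divided by the mean of the values
# at or above the 90th percentile of the non-outlier range, so the top values land near 1.0.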
Example #9
    def hit_detection_pheno_score(res, who, ctrl_points, whis=1.5):
        '''
        Permissive hit detection: keep experiments whose Interphase phenotypic score falls at or
        below the bottom whisker, i.e. q1 - 1.5*IQR.
        '''
        q3=scoreatpercentile(ctrl_points[:,CLASSES.index('Interphase')], per=75)
        q1=scoreatpercentile(ctrl_points[:,CLASSES.index('Interphase')], per=25)
        
        val=q1-whis*(q3-q1)
        
        hits=np.where(res[:,CLASSES.index('Interphase')]<=val)[0]
        
        return res[hits], np.array(who)[hits]
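Examples #7 and #9 both flag hits with the standard boxplot lower-whisker rule, q1 - whis*(q3 - q1). A minimal standalone sketch of that rule with made-up control values:

import numpy as np
from scipy.stats import scoreatpercentile

ctrl = np.array([0.9, 1.0, 1.1, 1.05, 0.95, 1.2, 0.85])  # hypothetical control scores
q1 = scoreatpercentile(ctrl, 25)
q3 = scoreatpercentile(ctrl, 75)
lower_whisker = q1 - 1.5 * (q3 - q1)
# Experiments scoring at or below lower_whisker would be reported as hits.
hits = np.where(ctrl <= lower_whisker)[0]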
Example #10
File: mapping.py  Project: mmagnus/hitrace
def normalize(bonuses):
    l = len(bonuses)
    wtdata = array(bonuses)
    if wtdata.min() < 0:
        wtdata -= wtdata.min()
    interquart = stats.scoreatpercentile(wtdata, 75) - stats.scoreatpercentile(wtdata, 25)
    tenperc = stats.scoreatpercentile(wtdata, 90)
    maxcount = 0
    maxav = 0.
    for i in range(l):
        if wtdata[i] >= tenperc:
            maxav += wtdata[i]
            maxcount += 1
    maxav /= maxcount
    wtdata = wtdata/maxav
    return wtdata
Example #11
def graphWorking(trajdist, mds_transformed, genes=None, percentile=10, only_high_degree=True, clustering_labels=None):
    couleurs=['green', 'red', 'blue', 'orange', 'yellow', "white", 'purple', 'pink', 'grey']
    newtrajdist=np.array(trajdist)
    newtrajdist[np.where(newtrajdist>scoreatpercentile(newtrajdist.flatten(), percentile))]=0
    
    G=nx.from_numpy_matrix(newtrajdist); labels={}
    # now the edges carry the weights we want, but we cannot plot yet because node positions must be provided; use the MDS results for that
    for el in G.nodes():
        G.node[el]['pos']=mds_transformed[el][:2]
        if genes is not None:
            labels[el]=genes[el]
    
    pos=nx.get_node_attributes(G,'pos')

    p.figure(figsize=(8,8))
    
    if not only_high_degree:
        nx.draw_networkx_edges(G,pos,alpha=0.4)
        node_color=[float(G.degree(v)) for v in G] if clustering_labels is None else [couleurs[k] for k in clustering_labels]
        nx.draw_networkx_nodes(G,pos,
                           node_size=60,
                           node_color=node_color,
                           cmap=p.cm.Reds_r)
    else:
        degrees=np.array([float(G.degree(v)) for v in G])
        wh_=np.where(degrees>=scoreatpercentile(degrees, 90))[0]
        
        pos={x:pos[x] for x in pos if x in wh_}
        labels={x:labels[x] for x in pos}
        
        for k in range(len(genes)):
            if k not in pos:
                G.remove_node(k)
        node_color=[float(G.degree(v)) for v in G]
        nx.draw_networkx_edges(G,pos,alpha=0.4)
        nx.draw_networkx_nodes(G,pos,
                           node_size=60,
                           node_color=node_color,
                           cmap=p.cm.Reds_r)
        
    
    nx.draw_networkx_labels(G, pos, labels, font_size=10)
    
    p.axis('off')
    p.show()
        
    return G
Example #12
    def compute_fft_stats(self):
        if config.args.conflicts == config.ADD_FILES and \
                (os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_SIGNALS_TERM)) &
                os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_SIGNALS_START)) &
                os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_STATS_DF_FILE))):
            self.logger.info(
                "Skipping FFT statistics calculation: files already exist")
        else:
            self.logger.info("Computing FFT statistics")
            count_vector_list = self.fivepseq_counts.get_count_vector_list(
                FivePSeqCounts.FULL_LENGTH)

            span_size = self.fivepseq_counts.annotation.span_size

            lengths = [0] * len(count_vector_list)
            for i in range(len(count_vector_list)):
                count_vector = count_vector_list[i][
                    span_size:len(count_vector_list[i]) - span_size]
                lengths[i] = len(count_vector)

            # align start
            size = int(stats.scoreatpercentile(lengths, per=25))
            num = 3 * len(count_vector_list) // 4
            start_array = np.zeros((num, size))
            term_array = np.zeros((num, size))

            ind = 0
            for i in range(len(count_vector_list)):
                count_vector = count_vector_list[i][
                    span_size:len(count_vector_list[i]) - span_size]
                if (len(count_vector)) > size:
                    start_array[ind, :] = count_vector[0:size]
                    term_array[ind, :] = count_vector[len(count_vector) -
                                                      size:len(count_vector)]
                    ind += 1
            start_mean = start_array.mean(axis=0)
            term_mean = term_array.mean(axis=0)

            self.fft_stats_start = self.fft_stats_on_vector(start_mean, 5)
            self.fft_stats_term = self.fft_stats_on_vector(term_mean, 5)
            self.fft_stats_df = pd.DataFrame(data=[
                self.fft_stats_start[1], self.fft_stats_start[2],
                self.fft_stats_start[3], self.fft_stats_term[1],
                self.fft_stats_term[2], self.fft_stats_term[3]
            ],
                                             index=[
                                                 "START_periods",
                                                 "START_signals",
                                                 "START_scales",
                                                 "TERM_periods",
                                                 "TERM_signals", "TERM_scales"
                                             ])
            self.logger.info("Writing FFT stats")
            self.fivepseq_out.write_df_to_file(self.fft_stats_df,
                                               FivePSeqOut.FFT_STATS_DF_FILE)
            self.fivepseq_out.write_series_to_file(
                self.fft_stats_start[0], FivePSeqOut.FFT_SIGNALS_START)
            self.fivepseq_out.write_series_to_file(
                self.fft_stats_term[0], FivePSeqOut.FFT_SIGNALS_TERM)
Example #13
def createFeldmanRanking(protectedCandidates, nonProtectedCandidates, k):
    """
    creates a ranking that promotes the protected candidates by adjusting the distribution of the
    qualifications of the protected and non-protected group

    IMPORTANT: THIS METHOD MODIFIES THE ORIGINAL LIST OF PROTECTED CANDIDATES!
    I.e. it modifies the qualification of the
    protected candidates. If the original list has to be preserved, it has to be deep-copied into a
    new data structure, before handed over into this method.

    steps:
        1. take a protected candidate x
        2. determine the percentile of that candidate within their group percentile(x)
        3. find a non-protected candidate y that has the same percentile(y) == percentile(x)
        4. assign the score of y to x
        5. goto 1

    Parameters:
    ----------
    :param protectedCandidates: array of protected candidates
    :param nonProtectedCandidates: array of non-protected candidates
    :param k: length of the ranking to return

    Return:
    ------
    a ranking of protected and non-protected candidates, which tries to have a better share of
    protected and non-protected candidates
    """

    # ensure candidates are sorted by descending qualifications
    protectedCandidates.sort(key=lambda candidate: candidate.qualification,
                             reverse=True)
    nonProtectedCandidates.sort(key=lambda candidate: candidate.qualification,
                                reverse=True)

    protectedQualifications = [
        protectedCandidates[i].qualification
        for i in range(len(protectedCandidates))
    ]
    nonProtectedQualifications = [
        nonProtectedCandidates[i].qualification
        for i in range(len(nonProtectedCandidates))
    ]

    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to required length
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.qualification)
        # find score of a non-protected in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score

    # create a colorblind ranking
    return createFairRanking(k, protectedCandidates, nonProtectedCandidates,
                             ESSENTIALLY_ZERO, 0.1)
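The distribution-matching step described in the createFeldmanRanking docstring can be shown in isolation: take a candidate's percentile within its own group, then read off the score at that same percentile in the other group. A minimal sketch with made-up scores:

from scipy.stats import percentileofscore, scoreatpercentile

protected_scores = [0.40, 0.55, 0.60, 0.72]      # hypothetical qualifications
non_protected_scores = [0.50, 0.65, 0.80, 0.90]

x = 0.60                                         # one protected candidate's score
p = percentileofscore(protected_scores, x)       # percentile of x in its own group
matched = scoreatpercentile(non_protected_scores, p)
# matched is the non-protected score at the same percentile; assigning it to the
# protected candidate gives both groups the same score distribution.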
Example #14
    def get_goat_indicies(self):
        """Return the indicies of goat user.
        Speration is done at 30th (or 70th) percentile.

        Source: Exploiting the "Doddington Zoo" Effect in Biometric Fusion
        """

        sorted_genuine = self._genuine_scores.copy()
        sorted_genuine.sort()

        if self._type == 'distance':
            score = scoreatpercentile(sorted_genuine, 70)
            indicies = np.where(self._genuine_scores>score)
        elif self._type == 'score':
            score = scoreatpercentile(sorted_genuine, 30)
            indicies = np.where(self._genuine_scores<score)

        return indicies[0]
Example #15
def normalize(bonuses):
    l = len(bonuses)
    wtdata = array(bonuses)
    if wtdata.min() < 0:
        wtdata -= wtdata.min()
    interquart = stats.scoreatpercentile(wtdata, 75) - stats.scoreatpercentile(
        wtdata, 25)
    tenperc = stats.scoreatpercentile(wtdata, 90)
    maxcount = 0
    maxav = 0.

    for i in xrange(l):
        if wtdata[i] >= tenperc:
            maxav += wtdata[i]
            maxcount += 1
    maxav /= maxcount
    wtdata = wtdata / maxav
    return wtdata
Example #16
 def _create_cache_percentiles(self, predicate, resume=False):
     controls = self._get_controls(predicate)
     for i, (plate, imKeys) in enumerate(make_progress_bar('Percentiles')(controls.items())):
         filename = self._percentiles_filename(plate)
         if i == 0:
             _check_directory(os.path.dirname(filename), resume)
         if resume and os.path.exists(filename):
             continue
         features = self.cache.load(imKeys)[0]
         if len(features) == 0:
             logger.warning('No DMSO features for plate %s' % str(plate))
             percentiles = np.zeros((0, len(self.cache.colnames)))
         else:
             m = features.shape[1]
             percentiles = np.ones((2, m)) * np.nan
             for j in xrange(m):
                 percentiles[0, j] = scoreatpercentile(features[:, j], 1)
                 percentiles[1, j] = scoreatpercentile(features[:, j], 99)
         np.save(filename, percentiles)
Example #17
    def get_lamb_indicies(self):
        """Return the indicies of lambs users.

        BUG: seems to not work
        """

        lambs_scores = []
        for userid in self._users_id:
            presentations = self.get_impostor_presentations_of_user(userid)
            best_matchings = []

            # loop over the other gallery users to get the best matching
            gallery_ids = presentations.get_raw_gallery_userid(unique=True)
            for gallery_id in gallery_ids:
                assert gallery_id != userid

                # Select only one gallery user
                indicies = presentations._data[:,self.GALLERY] ==  gallery_id
                impscores = presentations.get_raw_scores()[indicies]

                if self._type == 'score':
                    selected_score = np.min(impscores)
                elif self._type == 'distance' :
                    selected_score = np.max(impscores)
                else:
                    raise "Impossible"


                best_matchings.append(selected_score)
            lambs_scores.append(np.mean(best_matchings))

        lamb_score_sorted = np.sort(lambs_scores)
        if self._type == 'distance':
            score = scoreatpercentile(lamb_score_sorted, 10)
            indicies = np.where(self._genuine_scores<score)
        elif self._type == 'score':
            score = scoreatpercentile(lamb_score_sorted, 90)
            indicies = np.where(self._genuine_scores>score)
        else:
            raise "Impossible"

        return indicies[0]
Example #18
 def _create_cache_percentiles(self, predicate, resume=False):
     controls = self._get_controls(predicate)
     for i, (plate, imKeys) in enumerate(
             make_progress_bar('Percentiles')(controls.items())):
         filename = self._percentiles_filename(plate)
         if i == 0:
             _check_directory(os.path.dirname(filename), resume)
         if resume and os.path.exists(filename):
             continue
         features = self.cache.load(imKeys)[0]
         if len(features) == 0:
             logger.warning('No DMSO features for plate %s' % str(plate))
             percentiles = np.zeros((0, len(self.cache.colnames)))
         else:
             m = features.shape[1]
             percentiles = np.ones((2, m)) * np.nan
             for j in xrange(m):
                 percentiles[0, j] = scoreatpercentile(features[:, j], 1)
                 percentiles[1, j] = scoreatpercentile(features[:, j], 99)
         np.save(filename, percentiles)
Example #19
 def extract(self, point_cloud, neighborhood, target_point_cloud, target_index, volume_description):
     """
     Extract the feature value(s) of the point cloud at location of the target.
     :param point_cloud: environment (search space) point cloud
     :param neighborhood: array of indices of points within the point_cloud argument
     :param target_point_cloud: point cloud that contains target point
     :param target_index: index of the target point within the target point cloud
     :param volume_description: cell, sphere, cylinder or voxel size description
     :return: feature value
     """
     source_data = point_cloud[point][self.data_key]['data'][neighborhood]
     return stats.scoreatpercentile(source_data, self.percentile)
Example #20
def graphVisualClustering(genesTarget, mds_dist, genes, G):
    result=[[] for k in range(len(genesTarget))]
    arr=np.array([gene in genesTarget for gene in genes])
    degrees=np.array([float(G.degree(v)) for v in G])
    high_degrees=np.where(degrees>=scoreatpercentile(degrees, 90))[0]
    
    for el in high_degrees:
        gene=genes[el]
        distances=cdist(mds_dist[np.newaxis, el], mds_dist[np.where(arr)])
        print el, gene, np.argmax(distances)
        result[np.argmax(distances)].append(gene)
        
    return result
Example #21
def greyscale(infile, outfile):
    f = lasfile.File(infile,None,'rb')
    h = f.header

    # Projections for converting between the two
    #utmproj = Proj(init="epsg:26910")
    utmproj = Proj(r'+proj=utm +zone=10 +datum=NAD83 +units=us-ft +no_defs')
    latproj = Proj(proj='latlong',datum='WGS84')
    # Conversion from feet to meters
    conv = 1.0/0.3048

    tile_width = int(h.max[0] - h.min[0]) + 1
    tile_height = int(h.max[1] - h.min[1]) + 1
    height_offset = h.min[2]

    img = Image.new("RGB", (tile_width, tile_height))

    print "Image size: %d x %d" % (tile_width, tile_height)


    zvals = []

    for point in f:
        zvals.append(point.z)
    
    # Get a reasonable max height
    max_height = stats.scoreatpercentile(zvals, 99.5)
    depth_scale = 255/(max_height - h.min[2])
    print "Computed max height (99.5 percentile): %.2f" % (max_height)
    i = 0
    for point in f:
        if point.z > max_height:
            z = max_height
        else:
            z = point.z
        
        x = int(point.x - h.min[0])
        y = int(point.y - h.min[1])
    
        intensity = 255 - int(((z - height_offset) * depth_scale))
        img.putpixel((x,y), (intensity,intensity,intensity))
    
        i += 1


    img.save(outfile)
Example #22
def plotInternalConsistency(M, tick_labels, cmap=mpl.cm.bwr, second_labels=None):
    
    norm = mpl.colors.Normalize(0,scoreatpercentile(M.flatten(), 95))
    f=p.figure()
    ax=f.add_subplot(111)
    ax.matshow(M, cmap=cmap, norm=norm)
    p.yticks(range(0,M.shape[0],15),tick_labels[::15])
    ax.tick_params(labelsize=6)
#     if second_labels is not None:
#         ax_=ax.twinx()
#         ax_.set_yticks(range(M.shape[0]))
#         ax_.set_yticklabels(second_labels)
#         ax_.tick_params(labelsize=4)
#         ax_.set_yscale(ax.get_yscale())
    p.show()
    
    return
Example #23
def feldmanRanking(protectedCandidates, nonProtectedCandidates, k,
                   dataSetName):

    # ensure candidates are sorted by descending qualifications
    nonProtectedCandidates.sort(key=lambda candidate: candidate.learnedScores,
                                reverse=True)
    nonProtectedQualifications = [
        nonProtectedCandidates[i].learnedScores
        for i in range(len(nonProtectedCandidates))
    ]

    protectedCandidates.sort(key=lambda candidate: candidate.learnedScores,
                             reverse=True)
    protectedQualifications = [
        protectedCandidates[i].learnedScores
        for i in range(len(protectedCandidates))
    ]

    ranking = []

    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to required length
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.learnedScores)
        # find score of a non-protected in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score
        ranking.append(candidate)

    ranking += nonProtectedCandidates

    # create a colorblind ranking
    ranking.sort(key=lambda candidate: candidate.qualification, reverse=True)

    rankingResultsPath = "FeldmanEtAl/" + dataSetName + "ranking.csv"

    return ranking, rankingResultsPath
Example #24
    gene = affy2entrez[p].replace('"','')
    if gene in genecols.keys():
        genecols[gene].append(i)
    else:
        genecols[gene] = [i]
o.write(l[0]+','+l[1]+',')
for i, g in enumerate(genecols):
    o.write(g)
    if i < len(genecols)-1:
        o.write(',')
    else:
        o.write('\n')
print 'Number of probes'
print len(probes)
i = 0
while True:
    i += 1
    l = f.readline()
    if not l:
        break
    fields = l.strip(' \n').split()
    a = array([float(x) for x in fields[2:]])
    o.write(fields[0]+','+fields[1]+',')
    for j, g in enumerate(genecols):
        o.write(str(stats.scoreatpercentile(a[genecols[g]], 50)))
        if j < len(genecols)-1:
            o.write(',')
        else:
            o.write('\n')
    
Example #25
def trimean(values):
    return (stats.scoreatpercentile(values, 25) + 2.0*stats.scoreatpercentile(values, 50) + stats.scoreatpercentile(values, 75))/4.0
Example #26
def trimean(values):
    return (stats.scoreatpercentile(values, 25) +
            2.0 * stats.scoreatpercentile(values, 50) +
            stats.scoreatpercentile(values, 75)) / 4.0
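The trimean above weights the median twice as heavily as the two quartiles, which makes it less sensitive to extreme values than the mean. A quick illustration with a made-up array containing one large outlier:

import numpy as np
from scipy import stats

values = np.array([1, 2, 3, 4, 5, 6, 100])
print(np.mean(values))   # pulled up by the outlier (about 17.3)
print(trimean(values))   # stays near the bulk of the data (4.0)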
Example #27
 rdat = RDATFile()
 rdat.load(open(args.rdatdir+'/'+filename))
 for cname in rdat.constructs:
     construct = rdat.constructs[cname]
     struct = SecondaryStructure(construct.structure)
     frags = struct.explode()
     for data in construct.data:
         if (('mutation' not in data.annotations) or \
                 ('mutation' in data.annotations and \
                 'WT' in data.annotations['mutation'])):
             if 'modifier' in data.annotations:
                 if args.normalize:
                     normvals = normalize(data.values)
                 else:
                     normvals = data.values
                     iqr = scoreatpercentile(normvals, 75) - scoreatpercentile(normvals, 25)
         for fragtype in frags:
             db['all'].extend(normvals)
             if data.errors:
                 db['all'].extend(data.errors)
             dbidx['all'] = dict([((construct.name, construct.seqpos[i]), v) for i, v in enumerate(normvals)])
             fraglist = frags[fragtype]
             for frag in fraglist:
                 vals = []
                 valerrors = []
                 pos = []
                 for idx in frag:
                     try:
                         iddx = construct.seqpos.index(idx + construct.offset + 1)
                         if ('DMS' in data.annotations['modifier'] and construct.sequence[idx].upper() not in ['A', 'C']) or\
                            ('CMCT' in data.annotations['modifier'] and construct.sequence[idx].upper() not in ['G', 'U']) or\
Example #28
def plot_outlier_detection(df_floats: pd.DataFrame,
                           y_labels: np.ndarray,
                           clf,
                           clf_name: str = None,
                           scaler=None):
    """ Return contourf plot of the anomaly detection model.

        Plots contourf plot of decision scores marking area where observations would be considered inliers.

        Parameters
            ----------
            df_floats: pd.DataFrame with elements as floats.

            y_labels: numpy array of the same length as df_floats that assigns 0/1 (inlier/outlier) to each observation
                    according to fitted model.

            clf: fitted model.

            clf_name: name of fitted model.

            scaler: estimator that was used to change range of df_floats DataFrame

        Returns
            -------
            Contourf plot.

        """
    if df_floats.shape[1] > 2:
        print('Plotting first two variables...')
    elif df_floats.shape[1] < 2:
        print('Sorry, cannot plot fewer than two variables.')
        return

    # predict raw anomaly score
    y_scores = clf.decision_function(df_floats.iloc[:, [0, 1]]) * -1
    # threshold value to consider a datapoint inlier or outlier
    threshold = stats.scoreatpercentile(
        y_scores, 100 * len(y_labels[y_labels == 1]) / len(y_labels))

    # Specifies interval over which the np.linspace will be created
    x_lim_min, x_lim_max = df_floats.iloc[:, 0].min(), df_floats.iloc[:,
                                                                      0].max()
    x_delta = 0.05 * (x_lim_max - x_lim_min)
    y_lim_min, y_lim_max = df_floats.iloc[:, 1].min(), df_floats.iloc[:,
                                                                      1].max()
    y_delta = 0.05 * (y_lim_max - y_lim_min)
    # coordinate array for vectorized evaluation of raw anomaly scores
    # TODO: Coarser grid sometimes returns error when plotting (threshold out of [zz.min();zz.max()]).
    xx, yy = np.meshgrid(
        np.linspace(x_lim_min - x_delta, x_lim_max + x_delta, 100),
        np.linspace(y_lim_min - y_delta, y_lim_max + y_delta, 100))
    # decision function calculates the raw anomaly score for every point
    zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    zz = zz.reshape(xx.shape)

    # undo the scaling so the plot is in the same scale as input data
    if scaler:
        df_floats.iloc[:, [0, 1]] = scaler.inverse_transform(
            df_floats.iloc[:, [0, 1]])
        x_lim_min, x_lim_max = df_floats.iloc[:, 0].min(
        ), df_floats.iloc[:, 0].max()
        x_delta = 0.05 * (x_lim_max - x_lim_min)
        y_lim_min, y_lim_max = df_floats.iloc[:, 1].min(
        ), df_floats.iloc[:, 1].max()
        y_delta = 0.05 * (y_lim_max - y_lim_min)
        xx, yy = np.meshgrid(
            np.linspace(x_lim_min - x_delta, x_lim_max + x_delta, 100),
            np.linspace(y_lim_min - y_delta, y_lim_max + y_delta, 100))

    # inliers_1 - inlier feature 1,  inliers_2 - inlier feature 2
    inliers_1 = (df_floats.iloc[:, 0][y_labels == 0]).values.reshape(-1, 1)
    inliers_2 = (df_floats.iloc[:, 1][y_labels == 0]).values.reshape(-1, 1)
    # outliers_1 - outlier feature 1, outliers_2 - outlier feature 2
    outliers_1 = df_floats.iloc[:, 0][y_labels == 1].values.reshape(-1, 1)
    outliers_2 = df_floats.iloc[:, 1][y_labels == 1].values.reshape(-1, 1)

    plt.figure(figsize=(10, 10))
    # fill blue map colormap from minimum anomaly score to threshold value
    plt.contourf(xx,
                 yy,
                 zz,
                 levels=np.linspace(zz.min(), threshold, 50),
                 cmap=plt.cm.Blues_r)
    plt.colorbar(
        plt.contourf(xx,
                     yy,
                     zz,
                     levels=np.linspace(zz.min(), threshold, 50),
                     cmap=plt.cm.Blues_r))
    # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
    plt.contourf(xx, yy, zz, levels=[threshold, zz.max()], colors='orange')
    # draw red contour line where anomaly score is equal to threshold
    a = plt.contour(xx, yy, zz, levels=[threshold], linewidths=2, colors='red')
    # draw inliers as white dots
    b = plt.scatter(inliers_1, inliers_2, c='white', s=20, edgecolor='k')
    # draw outliers as black dots
    c = plt.scatter(outliers_1, outliers_2, c='black', s=20, edgecolor='k')

    plt.axis('tight')
    # loc=2 is used for the top left corner
    plt.legend([a.collections[0], b, c],
               ['learned decision function', 'inliers', 'outliers'],
               prop=matplotlib.font_manager.FontProperties(size=20),
               loc=2)

    plt.xlim((x_lim_min - x_delta, x_lim_max + x_delta))
    plt.ylim((y_lim_min - y_delta, y_lim_max + y_delta))
    plt.xlabel(df_floats.columns[0])
    plt.ylabel(df_floats.columns[1])
    if clf_name:
        plt.title(clf_name)
    plt.show()

    return
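The contour threshold in plot_outlier_detection is taken at the percentile given by the labelled outlier share of the raw anomaly scores. A minimal sketch of just that step, with made-up arrays standing in for a fitted detector's output:

import numpy as np
from scipy import stats

y_scores = np.array([0.2, 0.5, 0.1, 0.9, 0.3, 0.8, 0.4, 0.7])  # hypothetical raw anomaly scores
y_labels = np.array([0, 0, 0, 1, 0, 1, 0, 0])                  # 1 marks labelled outliers
# Cut the score range at the percentile corresponding to the outlier fraction,
# exactly as the plotting function does before drawing its contour levels.
threshold = stats.scoreatpercentile(
    y_scores, 100.0 * (y_labels == 1).sum() / len(y_labels))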
Example #29
def variableSelector(predictorTrain, targetTrain, method, verbose=False):

    varSelected = []

    #SelectKBest
    if (method == 'kb'):

        selector = SelectKBest(mutual_info_classif,
                               k='all').fit(predictorTrain, targetTrain)
        #selector = SelectKBest(mutual_info_classif).fit(predictNormWithPCA,target)
        #selector = SelectPercentile(mutual_info_classif, percentile=100).fit(predictorTrain,targetTrain)

        scores = selector.scores_

        if (verbose):
            print "Scores de selection"
            print scores

        threshold = stats.scoreatpercentile(scores, 33)

        for i in range(len(scores)):
            if (scores[i] > threshold):
                varSelected.append(i)

        if (verbose):
            print scores[0]

        if (verbose):
            plt.hist(scores, bins=30)  #, bins = range(1,len(scores)))
            plt.title("Score de la selection de variables avec SelectKBest")
            plt.ylabel('Nombre de variables')
            plt.xlabel('Score')
            plt.show()

    #Random Forest
    elif (method == 'rf'):
        clf = RandomForestClassifier(n_estimators=20)  # Random Forest
        clf.fit(predictorTrain, targetTrain)

        indexes = range(len(predictorTrain[0]))
        if (verbose):
            print "Features sorted by their score:"
            print sorted(zip(
                map(lambda x: round(x, 4), clf.feature_importances_), indexes),
                         reverse=True)
            plt.hist(clf.feature_importances_,
                     bins=30)  # , bins = range(1,len(scores)))
            plt.title("Score de la selection de variables avec Random Forest")
            plt.ylabel('Nombre de variables')
            plt.xlabel('Score')
            plt.show()

        scores = defaultdict(list)
        scoresTab = []

        for train_idx, test_idx in ShuffleSplit(
                n_splits=10, random_state=0,
                test_size=.3).split(predictorTrain):

            X_train, X_test = predictorTrain[train_idx], predictorTrain[
                test_idx]
            Y_train, Y_test = targetTrain[train_idx], targetTrain[test_idx]

            clf.fit(X_train, Y_train)
            acc = r2_score(Y_test, clf.predict(X_test))

            for i in range(predictorTrain.shape[1]):
                X_t = X_test.copy()
                np.random.shuffle(X_t[:, i])
                shuff_acc = r2_score(Y_test, clf.predict(X_t))

                scores[indexes[i]].append((acc - shuff_acc) / acc)
                scoresTab.append((acc - shuff_acc) / acc)

        threshold = stats.scoreatpercentile(scoresTab, 25)

        for feat, score in scores.items():
            scorebis = np.mean(score)
            if scorebis > threshold:
                varSelected.append(feat)

        varSelected = np.unique(varSelected)

    return varSelected
Example #30
def summary_string(data):
    """ Returns the string representation of the data in `vector` """
    summary = (np.nanmin(data), stats.scoreatpercentile(data,25), np.median(data), stats.scoreatpercentile(data,75), np.nanmax(data))
    return "[%s, %s, %s, %s, %s]" % summary
Example #31
def midhinge(values):
    return (stats.scoreatpercentile(values, 25) + stats.scoreatpercentile(values, 75))/2.0
Example #32
 def _extract_one(self, point_cloud, neighborhood):
     source_data = point_cloud[point][self.data_key]['data'][neighborhood]
     return stats.scoreatpercentile(source_data, self.percentile)
Example #33
def midhinge(values):
    return (stats.scoreatpercentile(values, 25) +
            stats.scoreatpercentile(values, 75)) / 2.0
Example #34
 rdat = RDATFile()
 rdat.load(open(args.rdatdir + '/' + filename))
 for cname in rdat.constructs:
     construct = rdat.constructs[cname]
     struct = SecondaryStructure(construct.structure)
     frags = struct.explode()
     for data in construct.data:
         if (('mutation' not in data.annotations) or \
                 ('mutation' in data.annotations and \
                 'WT' in data.annotations['mutation'])):
             if 'modifier' in data.annotations:
                 if args.normalize:
                     normvals = normalize(data.values)
                 else:
                     normvals = data.values
                     iqr = scoreatpercentile(
                         normvals, 75) - scoreatpercentile(normvals, 25)
         for fragtype in frags:
             db['all'].extend(normvals)
             if data.errors:
                 db['all'].extend(data.errors)
             dbidx['all'] = dict([((construct.name, construct.seqpos[i]), v)
                                  for i, v in enumerate(normvals)])
             fraglist = frags[fragtype]
             for frag in fraglist:
                 vals = []
                 valerrors = []
                 pos = []
                 for idx in frag:
                     try:
                         iddx = construct.seqpos.index(idx +
                                                       construct.offset + 1)