Example #1
File: chuang.py Project: Al3n70rn/EgoNet
	def summary(self):
		"""summary basic statistic for identified subnetwork"""
		print str(len(self.netdict))+' subnetwork generated:'
		n, (smin, smax), sm, sv, ss, sk = describe([self.netdict[key][0] for key in self.netdict])
		print 'Subnet MI socre range ['+str(smin)+', '+str(smax)+'] of mean '+str(sm)+' and var '+str(sv)
		n, (smin, smax), sm, sv, ss, sk = describe([len(self.netdict[key][1]) for key in self.netdict])
		print 'Subnet nodes size ['+str(smin)+', '+str(smax)+'] of mean '+str(int(sm))+' and var '+str(int(sv))
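For reference, scipy.stats.describe returns a DescribeResult namedtuple with the fields nobs, minmax, mean, variance, skewness and kurtosis, so the positional unpacking used above can also be written with named attributes. A minimal stand-alone sketch with made-up scores:

from scipy.stats import describe

scores = [0.12, 0.45, 0.33, 0.80, 0.27]          # made-up values
d = describe(scores)                             # DescribeResult namedtuple
print(d.nobs, d.minmax, d.mean, d.variance)
print(d.skewness, d.kurtosis)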
Example #2
def calc_agreements(nr_of_abstracts=150):
    # Loop over the abstracts and calculate the kappa and alpha per abstract
    aggregate = []
    for i in range(0, nr_of_abstracts):
        # try:
            annotators = round_robin(i)
            annotations_A = flatten(get_annotations(i, annotators[0]))
            annotations_B = flatten(get_annotations(i, annotators[1]))
            annotations = __str_combine_annotations(annotations_A, annotations_B)
            a = AnnotationTask(annotations, agreement_fn)
            aggregate.append({
                "kappa" : a.kappa(),
                "alpha" : a.alpha(),
                "annotator_A" : annotators[0],
                "annotator_B" : annotators[1] })
        # except:
        #     print("Could not calculate kappa for abstract %i" % (i + 1))
        #     pass

    # Summary statistics
    kappa = describe([a['kappa'] for a in aggregate])
    print("number of abstracts %i" % kappa[0])
    print("[kappa] mean: " + str(kappa[2]))
    print("[kappa] variance: " + str(kappa[3]))
    alpha = describe([a['alpha'] for a in aggregate])
    print("[alpha] mean: " + str(alpha[2]))
    print("[alpha] variance: " + str(alpha[3]))
Example #3
def hipgaleq():
    """Print summary of GAL-EQ comparison with SLALIB galeq (HIP)."""
    hip_tab = get_hipdata()
    sla_tab = get_sla("slalib_hip_galeq.txt")

    dummy = np.zeros((len(hip_tab['px']),))
    v6l = convert.cat2v6(hip_tab['glon'], hip_tab['glat'], dummy, dummy,
                         dummy, dummy, tpm.CJ)

    # The actual epoch of galactic data is J2000. But in SLALIB
    # the input is taken to be B1950.0. So use tpm.B1950 as epoch
    # in the conversion.
    v6o = convert.convertv6(v6l, s1=4, s2=6, epoch=tpm.B1950)
    cat = convert.v62cat(v6o, tpm.CJ)

    cat = cat2array(cat)

    ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0]
    ra_diff = np.abs(ra_diff * 3600.0)
    dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1]
    dec_diff = np.abs(dec_diff * 3600.0)

    print("Comparison with SLALIB galeq using HIPPARCOS data.")
    fs = "{0} {1}\n" + \
        "Min:  {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n"
    x = stats.describe(ra_diff)
    print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
    x = stats.describe(dec_diff)
    print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
def meta_data():
	totin = 0
	totout = 0
	inqual = 0
	outqual = 0
	indist = list()
	outdist = list()
	for count, qual in zip(open('%s/clpair.numturkers'%DICT_DIR).readlines(), open('%s/clpair.turkerqual'%DICT_DIR).readlines()):
		lang, num = count.strip().split('\t')
		lang, score = qual.strip().split('\t')
		num = int(float(num.strip()))
		score = float(score.strip())
		indist += [score] * num
		totin += num
		inqual += (num*score)		
	for count, qual in zip(open('%s/nonclpair.numturkers'%DICT_DIR).readlines(), open('%s/nonclpair.turkerqual'%DICT_DIR).readlines()):
		lang, num = count.strip().split('\t')
		lang, score = qual.strip().split('\t')
		num = int(float(num.strip()))
		score = float(score.strip())
		outdist += [score] * num
		totout += num
		outqual += (num*score)		
	i_n, (i_min, i_max), i_m, i_v, i_s, i_k = stats.describe(indist)
	i_moe = math.sqrt(i_v)/math.sqrt(i_n) * 2.576
	o_n, (o_min, o_max), o_m, o_v, o_s, o_k = stats.describe(outdist)
	o_moe = math.sqrt(o_v)/math.sqrt(o_n) * 2.576
	print 'In region: %d Turkers, Avg. score %0.3f (%0.3f, %0.3f)'%(i_n, i_m, i_m - i_moe, i_m + i_moe)
	print 'Out of region: %d Turkers, Avg. score %0.3f (%0.3f, %0.3f)'%(o_n, o_m, o_m - o_moe, o_m + o_moe)
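The moe values above are the half-widths of a normal-approximation confidence interval, MOE = z * s / sqrt(n), with z = 2.576 for a 99% confidence level. A small stand-alone sketch of the same calculation with made-up scores:

import math
from scipy import stats

scores = [3.2, 4.1, 3.8, 4.5, 3.9, 4.0]                  # made-up scores
n, minmax, mean, var, skew, kurt = stats.describe(scores)
moe = 2.576 * math.sqrt(var) / math.sqrt(n)              # 99% margin of error
print('%d values, avg. %0.3f (%0.3f, %0.3f)' % (n, mean, mean - moe, mean + moe))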
Example #5
def hipecleq():
    """Print summary of ECL-EQ comparison with SLALIB ecleq (HIP)."""
    hip_tab = get_hipdata()
    sla_tab = get_sla("slalib_hip_ecleq.txt")

    dummy = np.zeros((len(hip_tab['px']),))
    v6l = convert.cat2v6(hip_tab['elon2'], hip_tab['elat2'], dummy, dummy,
                         dummy, dummy, tpm.CJ)

    v6o = convert.convertv6(v6l, s1=3, s2=6)
    cat = convert.v62cat(v6o, tpm.CJ)

    cat = cat2array(cat)

    ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0]
    ra_diff = np.abs(ra_diff * 3600.0)
    dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1]
    dec_diff = np.abs(dec_diff * 3600.0)

    print("Comparison with SLALIB ecleq using HIPPARCOS data.")
    fs = "{0} {1}\n" + \
        "Min:  {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n"
    x = stats.describe(ra_diff)
    print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
    x = stats.describe(dec_diff)
    print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
Example #6
def hipeqgal():
    """Print summary of EQ-GAL comparison with SLALIB eqgal (HIP)."""
    hip_tab = get_hipdata()
    sla_tab = get_sla("slalib_hip_eqgal.txt")

    dummy = np.zeros((len(hip_tab['px']),))
    v6l = convert.cat2v6(hip_tab['raj2'], hip_tab['decj2'], dummy, dummy,
                         dummy, dummy, tpm.CJ)

    v6o = convert.convertv6(v6l, s1=6, s2=4)
    # The galactic coordinates are at epoch J2000. But SLALIB
    # results are for B1950. So apply proper motion here.
    v6o = convert.proper_motion(v6o, tpm.B1950, tpm.J2000)
    cat = convert.v62cat(v6o, tpm.CJ)

    cat = cat2array(cat)

    ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0]
    ra_diff = np.abs(ra_diff * 3600.0)
    dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1]
    dec_diff = np.abs(dec_diff * 3600.0)

    print("Comparison with SLALIB eqgal using HIPPARCOS data.")
    fs = "{0} {1}\n" + \
        "Min:  {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n"
    x = stats.describe(ra_diff)
    print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
    x = stats.describe(dec_diff)
    print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2],
                    x[3] ** 0.5))
Example #7
    def statsAnalysis( self ):
        ge_arr = numpy.array( self.geneexpdict.values() )
        descstats = stats.describe( ge_arr ) # descriptive statistics for the log fold change values: size of array, (min,max), mean, var, skewness, kurtosis
        print descstats
        raw_avg_logfc = numpy.mean( ge_arr );
        raw_stdev_logfc = numpy.std( ge_arr );
        print "raw mean and sd: ", raw_avg_logfc, raw_stdev_logfc;
        stats.probplot( ge_arr, plot=matplotlib.pyplot )
        matplotlib.pyplot.savefig('qqplot_raw.png')
        matplotlib.pyplot.close();

        # if the distribution is not central, the n and nn labels could be assigned to genes with > 0 log(fc). To avoid this, convert gene exp values to z-scores and recalculate mean and sd
        for k in self.geneexpdict.keys():
            v = self.geneexpdict[k];
            zscore = (v - raw_avg_logfc)/float(raw_stdev_logfc);
            self.geneexpdict[k] = zscore;

        # recompute distribution parameters
        ge_arr = numpy.array( self.geneexpdict.values() )
        descstats = stats.describe( ge_arr ) # descriptive statistics for the log fold change values: size of array, (min,max), mean, var, skewness, kurtosis
        print descstats
        self.avg_logfoldchange = numpy.mean( ge_arr );
        self.stdev_logfoldchange = numpy.std( ge_arr );
        print "centralized mean and sd: ", self.avg_logfoldchange, self.stdev_logfoldchange;
        stats.probplot( ge_arr, plot=matplotlib.pyplot )
        matplotlib.pyplot.savefig('qqplot_centralized.png')
        matplotlib.pyplot.close();
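The centering step in statsAnalysis above converts each value to a z-score using the raw mean and standard deviation, so the recomputed distribution has mean near 0 and variance near 1. A compact sketch of the same idea on a made-up array:

import numpy
from scipy import stats

logfc = numpy.array([0.4, -1.2, 2.3, 0.1, -0.5, 1.7])   # made-up log fold changes
desc = stats.describe(logfc)
zscores = (logfc - desc.mean) / numpy.sqrt(desc.variance)
print(stats.describe(zscores))                           # mean ~0, variance ~1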
def anova(lists):
	base = lists['all']
	print 'all', stats.describe(base)
	for l in lists:
		if not l == 'all':
			print l, stats.describe(lists[l])
			print stats.f_oneway(base, lists[l])
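The anova helper above pairs stats.describe with stats.f_oneway, which performs a one-way ANOVA and returns the F statistic and p-value. A self-contained sketch with randomly generated groups:

import numpy as np
from scipy import stats

base = np.random.normal(0.0, 1.0, size=50)     # baseline sample
group = np.random.normal(0.3, 1.0, size=50)    # shifted sample
print('all', stats.describe(base))
print('grp', stats.describe(group))
print(stats.f_oneway(base, group))             # F statistic and p-value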
Example #9
def main():
    here = os.path.dirname(os.path.realpath(__file__))
    filenames = ['cumul_HeLa-S3_Cytoplasm.stat', 'cumul_HeLa-S3_Nucleus.stat',
                 'cumul_HeLa-S3_Whole_Cell.stat']
    #filenames = ['cumul_K562_Nucleus.stat']
    for filename in filenames:
        tuning_file = os.path.join(os.path.split(here)[0], 'output', filename)
        #tuning_file = os.path.join(os.path.split(here)[0], 'output', 'test.stat')
        contents = {0:'cumul', 1:'pA_to_cumul_dist', 2:'pA_cumul', 3: 'd_stream_covr',
                    4: 'u_stream_covr', 5: 'rpkm', 6:'utr_length', 7: 'strand'}

        tuning_handle = open(tuning_file, 'rb')
        header = tuning_handle.next().split()

        # Get the stats dictionary
        data = {}

        for line in tuning_handle:
            (utr_id, default_cumul, pA_to_cumul_dist, pA_cumul, d_stream_covr,
             u_stream_covr, rpkm, utr_length, strand) = line.split()

            if float(rpkm) < 2:
                continue

            data[utr_id] = (int(default_cumul), int(pA_to_cumul_dist),
                            float(pA_cumul), float(d_stream_covr),
                            float(u_stream_covr), float(rpkm), int(utr_length),
                            strand)

        # Print the distribution of pA_cumul
        # Print the mean and the standard deviation as well
        # TODO wait for the calculation to finish. Then look at the mean and std and
        # maybe plot as well. For now, I move on!
        # AS WELL! Get a measure on how good your changes are: get the mean of the
        # distances from the cut-off to the actual polyas. This distance should
        # decrease with each iteration.

        # The relative cumulative length of the pA clusters
        pA_cumuls = [vals[2] for vals in data.itervalues()]
        (n_cumul, min_max_cumul, mean_cumul, var_cumul) = stats.describe(pA_cumuls)[:4]

        print filename, mean_cumul, var_cumul

        # The before/after coverage ratio of the pA clusters
        beg_aft = [math.log(vals[3]/vals[4], 2) for vals in data.itervalues() if vals[4]!=0]
        (n_ratio, min_max_ratio, mean_ratio, var_ratio) = stats.describe(beg_aft)[:4]

        print filename, mean_ratio, var_ratio

        box_plot(beg_aft, filename)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(beg_aft, bins=30)
        ax.set_title(filename)
        plt.show()
Example #10
File: egonet.py Project: Al3n70rn/EgoNet
	def summary(self):
		"""summary basic statistic for identified subnetwork"""
		print str(len(self.netdict))+' subnetwork generated:'
		n, (smin, smax), sm, sv, ss, sk = describe([self.netdict[key][0] for key in self.netdict])
		print 'Subnet socre ['+str(smin)+', '+str(smax)+'] of mean '+str(sm)+' and var '+str(sv)
		n, (smin, smax), sm, sv, ss, sk = describe([len(self.netdict[key][1].nodes()) for key in self.netdict])
		print 'Subnet nodes size ['+str(smin)+', '+str(smax)+'] of mean '+str(int(sm))+' and var '+str(int(sv))
		counter = Counter(self.depth)
		print 'Subnet depth summary:'
		for each in sorted(counter.keys()):
			print 'depth '+str(each)+': '+str(counter[each])
Example #11
 def print_statistics(self):
     from scipy import stats
     sz, (mn, mx), avg, var,skew, kurt = stats.describe(self.score1)
     s1_stats =  (sz, mn, mx, avg, var,skew, kurt)[:5]
     sz, (mn, mx), avg, var,skew, kurt = stats.describe(self.score2)
     s2_stats =  (sz, mn, mx, avg, var,skew, kurt)[:5]
     stat_string = ("\n\tTrace 1 \t Trace2\n" + '-'*40 + '\n' +
                    "Length \t %d \t\t %d \n" % (s1_stats[0], s2_stats[0]) +
                    "Min \t %f \t %f \n" % (s1_stats[1], s2_stats[1]) +
                    "Max \t %f \t %f \n" % (s1_stats[2], s2_stats[2]) +
                    "Average  %f \t %f \n" % (s1_stats[3], s2_stats[3]) +
                    "Variance %f \t %f \n" % (s1_stats[4], s2_stats[4]) )
     print stat_string
def print_statistics(a1, a2):
    sta1 = scs.describe(a1)
    sta2 = scs.describe(a2)
    print "%14s %14s %14s" % ("statistic", "data set 1", "data set 2")
    print 45 * "-"
    print "%14s %14.3f %14.3f" % ("size", sta1[0], sta2[0])
    print "%14s %14.3f %14.3f" % ("min", sta1[1][0], sta2[1][0])
    print "%14s %14.3f %14.3f" % ("max", sta1[1][1], sta2[1][1])
    print "%14s %14.3f %14.3f" % ("mean", sta1[2], sta2[2])
    print "%14s %14.3f %14.3f" % ("std", np.sqrt(sta1[3]),
    np.sqrt(sta2[3]))
    print "%14s %14.3f %14.3f" % ("skew", sta1[4], sta2[4])
    print "%14s %14.3f %14.3f" % ("kurtosis", sta1[5], sta2[5])
def eda_var(df, var, units):
    #statistical summary
    print("Statistical Summary of " + var + ".")
    print(df[var].describe())
    print("Distribution Analysis " + var + ".")
    print(st.describe(df[var]))
    # frequency distributions
    print("Frequency distribution of " + var +".")
    print( df[var].value_counts().sort_values())
    print("Normalized frequency distribution of " + var + ".")                              
    print(df[var].value_counts( normalize=True))
    #eda_plot(df,var, units)    
    dist_plot(usage,var)
Example #14
def make_density(data,**kwargs):
    mat=array(data)
    if mat.shape[1]==2 :
        #data is couples (x,y)
        mat=mat.T
    normed=kwargs.get('normed',False)
    nbin=kwargs.get('bins',100)
    logy=kwargs.get('logy',False)
    remove0=1-kwargs.get('include_zeroes',1)
    bintype='linear'
    xmin,xmax=np.min(mat[0]),np.max(mat[0])
    if kwargs.get('logx',False):
        numat=mat[0][mat[0]>10**(-100)]
        xmin,xmax=min(nozero(nonan(numat))),max(nonan(numat))
        bintype='log'
        bins=np.logspace(log10(xmin),log10(xmax),nbin,False)
        binw=[bins[i+1]-bins[i] for i in xrange(nbin-1)]
        binw.append(xmax-bins[-1])
    xspan=xmax-xmin
    try:
        bins
    except: #linear spacing case
        binw=xspan/nbin
        bins=np.linspace(xmin,xmax,nbin,False)
    binnage=[[] for i in xrange(nbin)]
    for x,y in mat.T :
        if x<xmin or x>xmax:
            continue
        if bintype=='linear':
            xbin=int(floor(float(x-xmin)/binw))
        else :
            xbin=bisct.bisect_left(bins,x)
        if xbin ==nbin: #maxvalue
            xbin=nbin-1
        if remove0 and abs(y)>10**(-40):
            binnage[xbin].append(y)
    res=array([stats.describe(i)[2:]+(min(i),max(i)) if i else stats.describe([0])[2:]+(0,0) for i in binnage])
    sspercen=scipy.stats.scoreatpercentile
    if kwargs.get('relative',1):
        quantile=array([array([sspercen(i,50),sspercen(i,50)-sspercen(i,5),sspercen(i,95)-sspercen(i,50)]) if i else array([0,0,0]) for i in binnage])
        res2=array([-res[:,-2]+res[:,0],res[:,-1]-res[:,0]])
    else:
        quantile=array([array([sspercen(i,50),sspercen(i,5),sspercen(i,95)]) if i else array([0,0,0]) for i in binnage])
        res2=array([res[:,-2],res[:,-1]])
    quantile=quantile.T
    if normed :
        if bintype=='linear':
            res[:,0]/=sum(res[:,0])*binw
        else :
            res[:,0]/=sum(np.dot(res[:,0],binw))
    return bins,res[:,0],res[:,1],res2,quantile[0],quantile[1:],array([len(i) for i in binnage])
Example #15
    def describe_out(self):
        stat_train = pd.DataFrame(columns=['Min', 'Max', 'Mean', 'Median', 'SD', 'Skew', 'Kurt'])
        for i in range(self.features_index[0], self.returns_next_days_index[1]+1):
            data = self.train_data.iloc[:,i]
            n, min_max, mean, var, skew, kurt = stats.describe(data)
            stat_train.loc[self.train_data.columns[i]] = [min_max[0], min_max[1], mean, data.median(), scipy.sqrt(var), skew, kurt]
        stat_train.to_csv('../data/stat_train.csv')

        stat_test = pd.DataFrame(columns=['Min', 'Max', 'Mean', 'Median', 'SD', 'Skew', 'Kurt'])
        for i in range(self.features_index[0], self.returns_predict_index[0]):
            data = self.test_data.iloc[:,i]
            n, min_max, mean, var, skew, kurt = stats.describe(data)
            stat_test.loc[self.test_data.columns[i]] = [min_max[0], min_max[1], mean, data.median(), scipy.sqrt(var), skew, kurt]
        stat_test.to_csv('../data/stat_test.csv')
Example #16
def gen_feature_mfcc(x, debug=False):
	d = len(x[0])
	if debug: print (" - - entering gen_feature_mfcc")
	v = x[:,0:d-1] # the last column is 0, ignore it
	if np.isnan(v).any() or np.isinf(v).any():
		raise Exception('MFCC contains NaN or Inf')
	xn, (xmin, xmax), xmean, xvar, xskew, xkurt =  stats.describe(v)
	if debug: print stats.describe(v)
	d = np.diff(v, 1, 0)
	dmean = np.mean(d, 0)
	x = np.concatenate((xmean,xmin,xmax,xvar,dmean))
	if np.isnan(x).any() or np.isinf(x).any():
		raise Exception('Feature vector contains NaN or Inf')
	return x
Example #17
def simulate_seasons(N, start_week, players):
    winner_equity = defaultdict(float)
    last_weeks = []
    for i in range(N):
        if i % 1000 == 0:
            print i
        last_week, winners = simulate_season(start_week, players)
        for winner in winners:
            winner_equity[winner] += 1. / len(winners)
        last_weeks.append(last_week)

    pp({p: "%.2f" % (100 * e / N) for p, e in winner_equity.iteritems()})
    print stats.describe(last_weeks)
    print stats.histogram(last_weeks)
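Note that stats.histogram used above was deprecated and later removed from SciPy; on current versions the same summary can be produced with numpy.histogram. A small sketch with dummy data:

import numpy as np
from scipy import stats

last_weeks = [12, 14, 13, 15, 12, 16, 14]      # dummy data
print(stats.describe(last_weeks))
counts, bin_edges = np.histogram(last_weeks, bins=5)
print(counts, bin_edges)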
Example #18
 def _init_fld2val(self, name, vals):
     """Describe summary statistics for a list of numbers."""
     #pylint: disable=no-member
     vals_stats = stats.describe(vals)
     stddev = math.sqrt(vals_stats[3]) # stats variance
     p25 = np.percentile(vals, 25)
     p50 = np.percentile(vals, 50) # median
     p75 = np.percentile(vals, 75)
     fld2val = {
         'name':name,
         'qty'.format(ITEMS=self.desc):vals_stats[0], # stats nobs
         'range':self._get_str_range(vals_stats),
         '25th percentile':p25,
         'median':p50,
         '75th percentile':p75,
         'mean':vals_stats[2], # stats mean
         'stddev':stddev}
     fmtflds = set(['25th percentile', 'median', '75th percentile', 'mean', 'stddev'])
     mkint = "," in self.fmtstr
     for key, val in fld2val.items():
         if key in fmtflds:
             if mkint:
                 val = int(round(val))
             val = self.fmtstr.format(val)
             fld2val[key] = val
     return fld2val
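The helper above mixes describe output with numpy percentiles; the magic indices vals_stats[0], [2] and [3] are nobs, mean and variance. A stand-alone sketch of the same summary using the named attributes and a hypothetical field name:

import math
import numpy as np
from scipy import stats

vals = [12, 7, 19, 3, 25, 14, 9]                     # made-up values
d = stats.describe(vals)
summary = {
    'name': 'example_field',                         # hypothetical field name
    'qty': d.nobs,
    'range': '{} to {}'.format(*d.minmax),
    '25th percentile': np.percentile(vals, 25),
    'median': np.percentile(vals, 50),
    '75th percentile': np.percentile(vals, 75),
    'mean': d.mean,
    'stddev': math.sqrt(d.variance),
}
print(summary)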
    def processAlgorithm(self, progress):
        layer = QGisLayers.getObjectFromUri(self.getParameterValue(self.INPUT_LAYER))
        valuesFieldName = self.getParameterValue(self.VALUES_FIELD_NAME)
        categoriesFieldName = self.getParameterValue(self.CATEGORIES_FIELD_NAME)

        output = self.getOutputFromName(self.OUTPUT)
        valuesField = layer.fieldNameIndex(valuesFieldName)
        categoriesField = layer.fieldNameIndex(categoriesFieldName)
        
        features = QGisLayers.features(layer)
        nFeat = len(features)
        values = {}
        for feat in features:
            attrs = feat.attributes()
            value = float(attrs[valuesField].toDouble()[0])
            cat = unicode(attrs[categoriesField].toString())
            if cat not in values:
                values[cat] = []
            values[cat].append(value)
        
        fields = [QgsField("category", QVariant.String), QgsField("mean", QVariant.Double), QgsField("variance", QVariant.Double)]
        writer = output.getTableWriter(fields)
        for cat, value in values.items():                       
            n, min_max, mean, var, skew, kurt = stats.describe(value)
            record = [cat, mean, math.sqrt(var)]            
            writer.addRecord(record)
Example #20
def main():
    parser = OptionParser(usage="usage: %prog [options] file", version="%prog 0.1")
    # parser.add_option("-t", "--template",
    #                   action="store", type="string", dest="template",
    #                   help="declare output format")
    parser.add_option(
        "-s",
        "--separator",
        action="store",
        type="string",
        dest="separator",
        help="seperator in the output file [defualt = ' ']",
        default=" ",
    )

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    data = np.loadtxt(args[0])
    # with open(args[0]) as fd:
    #    data2 = np.fromfile(fd, sep='\n', dtype=float)
    # print(data2)

    # df = pd.DataFrame(data)

    # print(data)
    # print(df)

    # numpy
    mean = data.mean()
    med = np.median(data)
    var = data.var()
    sdev = data.std()
    delta = confidence_interval(data)

    # pandas
    """
    mean = df.mean()
    med  = df.median()
    var  = df.var()
    sdev = df.std()
    """

    # scipy stats
    sem = stats.sem(data)  # stand error of mean

    #'''
    print("mean              : {}".format(mean))
    print("median            : {}".format(med))
    print("variance          : {}".format(var))
    print("standard deviation: {}".format(sdev))
    print("stats.sem         : {}".format(sem))
    print("conf. interval    : {}".format(delta))
    #'''

    # print('{0}{1}{2}'.format(mean[0], options.separator, sdev[0]))

    dn, dmin_max, dmean, dvar, dskew, dkurt = stats.describe(data)
    dstd = math.sqrt(dvar)
def examples_normexpand():
    skewnorm = SkewNorm_gen()
    rvs = skewnorm.rvs(5, size=100)
    normexpan = NormExpan_gen(rvs, mode="sample")

    smvsk = stats.describe(rvs)[2:]
    print "sample: mu,sig,sk,kur"
    print smvsk

    dmvsk = normexpan.stats(moments="mvsk")
    print "normexpan: mu,sig,sk,kur"
    print dmvsk
    print "mvsk diff distribution - sample"
    print np.array(dmvsk) - np.array(smvsk)
    print "normexpan attributes mvsk"
    print mc2mvsk(normexpan.cnt)
    print normexpan.mvsk

    mc, mnc = mvsk2m(dmvsk)
    print "central moments"
    print mc
    print "non-central moments"
    print mnc

    pdffn = pdf_moments(mc)
    print "\npdf approximation from moments"
    print "pdf at", mc[0] - 1, mc[0] + 1
    print pdffn([mc[0] - 1, mc[0] + 1])
    print normexpan.pdf([mc[0] - 1, mc[0] + 1])
    def topic_entropy(self):

        # Get stats for the whole corpus (sample)
        cstats = {}
        sql1 = "SELECT avg(topic_entropy), max(topic_entropy), min(topic_entropy) FROM doctopic"
        sql2 = "INSERT INTO corpusstats (mean_topic_entropy,max_topic_entropy,min_topic_entropy) VALUES (?,?,?)"
        for r in self.curin.execute(sql1):
            cstats['avg_h'], cstats['max_h'], cstats['min_h'] = r
            self.curout.execute(sql2,r)
        self.connout.commit()
        
        # Get stats for each doc (trial, observation)
        dstats = {}
        all_h = []
        sql3 = "SELECT doc_id, doc_label, topic_entropy FROM doctopic"
        for r in self.curin.execute(sql3):
            dstats[r[0]] = {}
            dstats[r[0]]['label'] = r[1]
            dstats[r[0]]['entropy'] = r[2]
            all_h.append(r[2])
            
        all_h = np.array(all_h)
        dr = sps.describe(all_h)
        print(dr)
        for k in dr:
            print(k)
def compute_statistics(serie):
    """
        Computa as estatísticas de SERIE utilizando stats.describe
    """
    sizeData, (minimum,maximum),arithmeticMean,variance,skeness,kurtosis = stats.describe(serie)

    print "Size Data  = ",sizeData , "Minimo,Maximo = ",(minimum,maximum), "Média = ", arithmeticMean , "Variância = ", variance
Example #24
 def _testCombine():
     A = random(10000)
     B = 10 * random(1000)
     C = hstack([A,B])
     run3 = RunningStatistics(A)
     run3.push(B)
     _compareDescriptions(run3.describe, describe(C))
Example #25
def examples_normexpand():
    skewnorm = SkewNorm_gen()
    rvs = skewnorm.rvs(5,size=100)
    normexpan = NormExpan_gen(rvs, mode='sample')

    smvsk = stats.describe(rvs)[2:]
    print('sample: mu,sig,sk,kur')
    print(smvsk)

    dmvsk = normexpan.stats(moments='mvsk')
    print('normexpan: mu,sig,sk,kur')
    print(dmvsk)
    print('mvsk diff distribution - sample')
    print(np.array(dmvsk) - np.array(smvsk))
    print('normexpan attributes mvsk')
    print(mc2mvsk(normexpan.cnt))
    print(normexpan.mvsk)

    from statsmodels.stats.moment_helpers import mvsk2mnc, mnc2mc
    mnc = mvsk2mnc(dmvsk)
    mc = mnc2mc(mnc)
    print('central moments')
    print(mc)
    print('non-central moments')
    print(mnc)


    pdffn = pdf_moments(mc)
    print('\npdf approximation from moments')
    print('pdf at', mc[0]-1,mc[0]+1)
    print(pdffn([mc[0]-1,mc[0]+1]))
    print(normexpan.pdf([mc[0]-1,mc[0]+1]))
Example #26
def avg_interest_rtt(ec, run):
    logs_dir = ec.run_dir
    
    # Parse downloaded CCND logs
    (graph,
      content_names,
      interest_expiry_count,
      interest_dupnonce_count,
      interest_count,
      content_count) = ccn_parser.process_content_history_logs(
        logs_dir, ec.netgraph.topology)

    # statistics on RTT
    rtts = [content_names[content_name]["rtt"] \
            for content_name in content_names.keys()]

    # sample mean and standard deviation
    sample = numpy.array(rtts)
    n, min_max, mean, var, skew, kurt = stats.describe(sample)
    std = math.sqrt(var)
    ci = stats.t.interval(0.95, n-1, loc = mean, 
            scale = std/math.sqrt(n))

    global metrics
    metrics.append((mean, ci[0], ci[1]))
    
    return mean
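The RTT summary above wraps a 95% Student-t confidence interval around the sample mean via stats.t.interval, with n-1 degrees of freedom and the standard error as scale. A stand-alone sketch of that pattern with made-up samples:

import math
import numpy
from scipy import stats

sample = numpy.array([10.2, 11.5, 9.8, 10.9, 12.1, 10.4])   # made-up RTTs
n, min_max, mean, var, skew, kurt = stats.describe(sample)
std = math.sqrt(var)
ci = stats.t.interval(0.95, n - 1, loc=mean, scale=std / math.sqrt(n))
print(mean, ci)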
Example #27
def fixme():
    num_freqs = 8
    min_frequency = 3.0
    max_frequency = 60.0

    morlet_transform = morlet.MorletWaveletTransform(5, np.logspace(np.log10(min_frequency), np.log10(max_frequency), num_freqs),  1000, 4096)
    # morlet_transform = morlet.MorletWaveletTransform(5, min_frequency, max_frequency, num_freqs, 1000, 4096)

    # morlet_transform = morlet.MorletWaveletTransform()
    # morlet_transform.init_flex(5, np.logspace(np.log10(min_frequency), np.log10(max_frequency), num_freqs),  1000, 4096)
    # morlet_transform.init(5, min_frequency, max_frequency, num_freqs, 1000, 4096)

    samplerate = 1000.
    frequency = 60.0
    modulation_frequency = 80.0

    duration = 4.096

    n_points = int(np.round(duration*samplerate))
    x = np.arange(n_points, dtype=np.float)
    signal = np.sin(x*(2*np.pi*frequency/samplerate))-np.cos(x*(2*np.pi*frequency/samplerate))

    powers=np.empty(shape=(signal.shape[0]*num_freqs,), dtype=np.float)
    num_of_iterations = 100
    # for i in range(num_of_iterations):
    #     morlet_transform.multiphasevec(signal,powers)
    morlet_transform.multiphasevec(signal,powers)

    powers = powers.reshape(8, powers.shape[0] // 8)

    print(describe(powers))
 def calculate_statistics(self):
     """Przelicza statystyki zwiazane ze zbiorem kursow indeksu"""
     values = []
     for quote in self.quotes:
         values.append(quote.value)
     n, (smin, smax), sm, sv, ss, sk = stats.describe(values)
     return (sm, sv, ss, sk)
Example #29
 def batch_filter(self, X, Y):
     n, min_max, mean, var, skew, kurt = stats.describe(Y)
     sd = math.sqrt(var)
     y_index = Y[(Y > mean - self.range * sd).values & (Y < mean + self.range * sd).values].index.tolist()
     X = X.iloc[y_index, :]
     Y = Y[y_index]
     return X, Y
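batch_filter above keeps only the rows whose target lies within self.range standard deviations of the mean. A short sketch of the same filtering on a plain pandas Series, with a hypothetical cut-off of 2 standard deviations:

import math
import pandas as pd
from scipy import stats

Y = pd.Series([1.0, 1.2, 0.9, 1.1, 15.0, 1.05])   # one obvious outlier
n, min_max, mean, var, skew, kurt = stats.describe(Y)
sd = math.sqrt(var)
k = 2                                             # hypothetical range in standard deviations
mask = (Y > mean - k * sd) & (Y < mean + k * sd)
print(Y[mask])                                    # the outlier is dropped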
Example #30
def testMtimesNHands(d, p1, p2, deck = deckShuffle(6), M = 100, N = 10000):
    '''
    Simulate a larger number of games and record the count averages against the house.
    '''
    list1=[]
    for j in range(M):
      count = 0
      for i in range(N):
          P = basicStrategy(d, p1, p2, deck)
          D = dealerPlay(d, deck) 
          count += playerWin(D[0], P[0])[0] 
          if len(deck)<10: deck = deckShuffle()  
      list1.append(count)
    l = 0
    for i in list1:
      l = l + i/(M*1.0)

    plt.hist(list1, color='k', alpha = .15)
    plt.show()

    print '-------'
    print '''Summary stats for Strategy 1'''
    print '-------'
    Size,  Range, Mean, variance, skewness, kurtosis = describe(list1)
    print 'Number of observations: ', Size
    print 'Mean: ',  Mean
    print 'Min to Max: ', Range
    print 'Variance: ', variance
    print 'Standard Dev.: ', sqrt(variance)
    print 'Skewness: ', skewness
    print 'Kurtosis: ', kurtosis
    print '-------'
    print '-------'
    return l/(N*1.0), list1
import numpy as np
from scipy import stats
import math as mt

dados = [40000, 18000, 12000, 250000, 30000, 140000, 300000, 40000, 800000]
media = np.mean(dados)
mediana = np.median(dados)
quartis = np.quantile(dados, [0, 0.25, 0.75, 1])
desvio_padrao = np.std(
    dados, ddof=1
)  # ddof=1 applies Bessel's correction, i.e. the sample standard deviation
describe = stats.describe(dados)

print(describe)
print(mt.sqrt(describe[3]))
print(desvio_padrao)
Example #32
    s += w
    rank[mx] = 1
m_gmean2 = np.exp(m_gmean2 / s)

# %% [code]
top_mean = 0
s = 0
for n in [0, 1, 3, 7, 26]:
    top_mean += concat_sub.iloc[:, n] * scores[top[n]]
    s += scores[top[n]]
top_mean /= s

# %% [code]
m_gmean = np.exp(0.3 * np.log(m_gmean1) + 0.2 * np.log(m_gmean2) +
                 0.5 * np.log(top_mean))
describe(m_gmean)

# %% [code]
concat_sub['isFraud'] = m_gmean
concat_sub[['isFraud']].to_csv('stack_gmean.csv')

all_files2 = glob.glob("/tmp/lgmodels/*.csv")
all_files2.sort(key=lambda s: s.split('.')[1], reverse=True)

aa_outs = [
    pd.read_csv(all_files2[f], index_col=0) for f in range(len(all_files2))
]
aa_concat_sub = pd.concat(aa_outs, axis=1)
# aa_concat_sub.columns = all_files

aa_corr = aa_concat_sub.corr()
Example #33
def run(data_path, cfg):

    print "Running SPC image conversion..."

    # get the base name of the directory
    base_dir_name = os.path.basename(os.path.abspath(data_path))

    # list the directory for tif images
    print "Listing directory " + base_dir_name + "..."

    image_list = []
    if cfg.get('MergeSubDirs', "false").lower() == "true":
        sub_directory_list = sorted(
            glob.glob(os.path.join(data_path, "[0-9]" * 10)))
        for sub_directory in sub_directory_list:
            print "Listing sub directory " + sub_directory + "..."
            image_list += glob.glob(os.path.join(sub_directory, "*.tif"))
    else:
        image_list += glob.glob(os.path.join(data_path, "*.tif"))

    image_list = sorted(image_list)

    # skip if no images were found
    if len(image_list) == 0:
        print "No images were found. skipping this directory."
        return

    # Get the total number of images in the directory
    total_images = len(image_list)

    # Create the output directories for the images and web app files
    subdir = os.path.join(data_path, '..', base_dir_name + '_static_html')
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    image_dir = os.path.join(subdir, 'images')
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    print "Starting image conversion and page generation..."

    # loop over the images and do the processing
    images_per_dir = cfg.get('ImagesPerDir', 2000)

    if cfg.get("BayerPattern").lower() == "rg":
        bayer_conv = cv2.COLOR_BAYER_RG2RGB
    if cfg.get("BayerPattern").lower() == "bg":
        bayer_conv = cv2.COLOR_BAYER_BG2RGB

    print "Loading images...\r",
    bundle_queue = Queue()
    for index, image in enumerate(image_list):

        reldir = 'images/' + str(
            images_per_dir * int(index / images_per_dir)).zfill(5)
        absdir = os.path.join(
            image_dir,
            str(images_per_dir * int(index / images_per_dir)).zfill(5))

        filename = os.path.basename(image)

        if not os.path.exists(absdir):
            os.makedirs(absdir)

        bundle = {}
        bundle['image_path'] = image
        bundle['image'] = cvtools.import_image(os.path.dirname(image),
                                               filename,
                                               bayer_pattern=bayer_conv)
        bundle['data_path'] = data_path
        bundle['image_dir'] = absdir
        bundle['reldir'] = reldir
        bundle['cfg'] = cfg
        bundle['total_images'] = total_images

        bundle_queue.put(bundle)
        print "Loading images... (" + str(index) + " of " + str(
            total_images) + ")\r",

        #if index > 2000:
        #    total_images = index
        #    break

    # Get the number of processes to use based on CPUs
    n_threads = multiprocessing.cpu_count() - 1
    if n_threads < 1:
        n_threads = 1

    # Create the set of processes and start them
    start_time = time.time()
    output_queue = Queue()
    processes = []
    for i in range(0, n_threads):
        p = Process(target=process_bundle_list,
                    args=(bundle_queue, output_queue))
        p.start()
        processes.append(p)

    # Monitor processing of the images and save processed images to disk as they become available
    print "\nProcessing Images...\r",
    counter = 0
    entry_list = []
    use_jpeg = cfg.get("UseJpeg").lower() == 'true'
    raw_color = cfg.get("SaveRawColor").lower() == 'true'
    while True:
        print "Processing and saving images... (" + str(counter).zfill(
            5) + " of " + str(total_images).zfill(5) + ")\r",

        if counter >= total_images:
            break

        #if output_queue.qsize() == 0:

        try:
            output = output_queue.get()
            if output:
                entry_list.append(output['entry'])
                output_path = os.path.join(output['image_path'],
                                           output['prefix'])
                if use_jpeg:
                    if raw_color:
                        cv2.imwrite(
                            os.path.join(output_path + "_rawcolor.jpeg"),
                            output['features']['rawcolor'])
                    cv2.imwrite(os.path.join(output_path + ".jpeg"),
                                output['features']['image'])
                else:
                    if raw_color:
                        cv2.imwrite(
                            os.path.join(output_path + "_rawcolor.png"),
                            output['features']['rawcolor'])
                    cv2.imwrite(os.path.join(output_path + ".png"),
                                output['features']['image'])

                cv2.imwrite(os.path.join(output_path + "_binary.png"),
                            output['features']['binary'])

            counter = counter + 1
        except:
            time.sleep(0.05)

    # Record the total time for processing
    proc_time = int(math.floor(time.time() - start_time))

    # Terminate the processes in case they are stuck
    for p in processes:
        p.terminate()

    print "\nPostprocessing..."

    # sort the entries by height and build the output
    entry_list.sort(key=itemgetter('maj_axis_len'), reverse=True)

    # Create histograms of several key features

    # image resolution in mm/pixel
    image_res = cfg.get('PixelSize', 22.1) / 1000

    #print "Image resolution is set to: " + str(image_res) + " mm/pixel."

    # Get arrays from the dict of features
    total_images = len(entry_list)
    nbins = int(np.ceil(np.sqrt(total_images)))
    maj_len = np.array(map(itemgetter('maj_axis_len'), entry_list)) * image_res
    min_len = np.array(map(itemgetter('min_axis_len'), entry_list)) * image_res
    aspect_ratio = np.array(map(itemgetter('aspect_ratio'), entry_list))
    orientation = np.array(map(itemgetter('orientation'), entry_list))
    area = np.array(map(itemgetter('area'),
                        entry_list)) * image_res * image_res
    unixtime = np.array(map(itemgetter('timestamp'), entry_list))
    elapsed_seconds = unixtime - np.min(unixtime)
    file_size = np.array(map(itemgetter('file_size'), entry_list)) / 1000.0

    #print unixtime

    total_seconds = max(elapsed_seconds)
    print "Total seconds recorded: " + str(total_seconds)
    if total_seconds < 1:
        total_seconds = 1

    print "\nComputing histograms..."

    # Compute histograms
    all_hists = {}
    hist = np.histogram(area, nbins)
    all_hists['area'] = json.dumps(zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(maj_len, nbins)
    all_hists['major_axis_length'] = json.dumps(
        zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(min_len, nbins)
    all_hists['minor_axis_length'] = json.dumps(
        zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(aspect_ratio, nbins)
    all_hists['aspect_ratio'] = json.dumps(
        zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(elapsed_seconds, np.uint32(total_seconds))
    all_hists['elapsed_seconds'] = json.dumps(
        zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(orientation, nbins)
    all_hists['orientation'] = json.dumps(
        zip(hist[1].tolist(), hist[0].tolist()))
    hist = np.histogram(file_size, nbins)

    print "\nComputing stats..."

    all_hists['file_size'] = json.dumps(zip(hist[1].tolist(),
                                            hist[0].tolist()))
    # Compute general stats from features
    all_stats = {}
    all_stats['area'] = stats.describe(area)
    all_stats['major_axis_length'] = stats.describe(maj_len)
    all_stats['minor_axis_length'] = stats.describe(min_len)
    all_stats['aspect_ratio'] = stats.describe(aspect_ratio)
    all_stats['elapsed_seconds'] = stats.describe(elapsed_seconds)
    all_stats['orientation'] = stats.describe(orientation)
    all_stats['file_size'] = stats.describe(file_size)

    print "Building web app..."

    # Load html template for rendering
    template = ""
    with open(os.path.join('app', 'index.html'), "r") as fconv:
        template = fconv.read()

    # Define the render context from the processed histograms, images, and stats
    context = {}
    context['version'] = '1.0.1.05'
    context['total_images'] = total_images
    context['proc_time'] = proc_time
    context['duration'] = total_seconds
    context['compression_ratio'] = int(
        (1000.0 * 24 * total_images) / np.sum(file_size))
    context['rois_per_second'] = total_images / context['duration']
    context['kb_per_second'] = int(np.sum(file_size) / context['duration'])
    context['recording_started'] = datetime.datetime.fromtimestamp(
        np.min(unixtime)).strftime('%Y-%m-%d %H:%M:%S')
    context['app_title'] = "SPC Convert: " + base_dir_name
    context['dir_name'] = base_dir_name
    context['raw_color'] = raw_color
    context['image_res'] = image_res
    if use_jpeg:
        context['image_ext'] = '.jpeg'
    else:
        context['image_ext'] = '.png'
    context['stats_names'] = [{
        "name": "Min"
    }, {
        "name": "Max"
    }, {
        "name": "Mean"
    }, {
        "name": "Standard Deviation"
    }, {
        "name": "Skewness"
    }, {
        "name": "Kurtosis"
    }]

    # define the charts to display from the histogram data
    charts = []
    for chart_name, data_values in all_hists.iteritems():
        chart = {}
        chart['source'] = 'js/' + chart_name + '.js'
        chart['name'] = chart_name
        units = ""
        if chart_name == 'area':
            units = " (mm*mm)"
        if chart_name == 'major_axis_length' or chart_name == 'minor_axis_length':
            units = " (mm)"
        if chart_name == 'file_size':
            units = " (kB)"
        if chart_name == 'elapsed_seconds':
            units = " (s)"
        if chart_name == 'orientation':
            units = " (deg)"
        chart['title'] = 'Histogram of ' + chart_name + units
        chart['x_title'] = chart_name + units
        chart['y_title'] = 'counts'
        chart['stats_title'] = chart_name
        chart['data'] = data_values
        chart['stats'] = []
        chart['stats'].append({
            "name":
            "Min",
            "value":
            "{:10.3f}".format(all_stats[chart_name][1][0])
        })
        chart['stats'].append({
            "name":
            "Max",
            "value":
            "{:10.3f}".format(all_stats[chart_name][1][1])
        })
        chart['stats'].append({
            "name":
            "Mean",
            "value":
            "{:10.3f}".format(all_stats[chart_name][2])
        })
        chart['stats'].append({
            "name":
            "Standard Deviation",
            "value":
            "{:10.3f}".format(math.sqrt(all_stats[chart_name][3]))
        })
        chart['stats'].append({
            "name":
            "Skewness",
            "value":
            "{:10.3f}".format(all_stats[chart_name][4])
        })
        chart['stats'].append({
            "name":
            "Kurtosis",
            "value":
            "{:10.3f}".format(all_stats[chart_name][5])
        })
        charts.append(chart)

    context['charts'] = charts

    # render the html page and save to disk
    page = pystache.render(template, context)

    with open(os.path.join(subdir, 'spcdata.html'), "w") as fconv:
        fconv.write(page)

    # remove any old app files and try to copy over new ones
    try:
        shutil.rmtree(os.path.join(subdir, "css"), ignore_errors=True)
        shutil.copytree("app/css", os.path.join(subdir, "css"))
        shutil.rmtree(os.path.join(subdir, "js"), ignore_errors=True)
        shutil.copytree("app/js", os.path.join(subdir, "js"))
    except:
        print "Error copying supporting files for html."

    # Load roistore.js database for rendering
    template = ""
    with open(os.path.join('app', 'js', 'database-template.js'), "r") as fconv:
        template = fconv.read()

    context = {}
    context['image_items'] = entry_list
    context['table'] = base_dir_name

    # render the javascript page and save to disk
    page = pystache.render(template, context)

    with open(os.path.join(subdir, 'js', 'database.js'), "w") as fconv:
        fconv.write(page)

    print "Done."
Example #34
def calculate_stats(generator, args, anchor_params):
    """ Calculates stats for anchor coverage over given dataset.
        Output stats include:
        - Average number of positive & negative anchors per image
        - Max/min number of positive anchors in dataset
        - Proportion of positive to negative anchors across dataset
    """

    annotations_count = []
    missed_annotations_count = []
    positive_anchors_count = []
    negative_anchors_count = []

    num_images = generator.size()
    image_scale = None
    image_shape = None

    print("\n")
    for i in range(num_images):
        print("Processing {}/{} ".format(i, num_images), end="\r")

        annotations = generator.load_annotations(i)

        # Skip if there is no annotation label
        if len(annotations['labels']) == 0:
            continue

        # Resize the image and annotations
        # Save the relevant image properties (scale and shape) once and reuse - as we know that all images will have same properties
        # Saving these properties significantly speeds up the process
        if args.resize:
            if (image_scale is None):
                image = generator.load_image(i)
                image, image_scale = generator.resize_image(image)
                image_shape = image.shape

            annotations['bboxes'] *= image_scale
        else:
            if (image_shape is None):
                image = generator.load_image(i)
                image_shape = image.shape

        anchors = anchors_for_shape(image_shape, anchor_params=anchor_params)
        positive_indices, _, _, max_indices = compute_gt_annotations_for_visualisation(
            anchors,
            annotations['bboxes'],
            negative_overlap=args.negative_overlap_iou,
            positive_overlap=args.positive_overlap_iou)

        num_annotations = annotations['bboxes'].shape[0]
        missed_annotations = num_annotations - len(
            set(max_indices[positive_indices]))
        num_positive_anchors = annotations['bboxes'][
            max_indices[positive_indices], :].shape[0]

        annotations_count.append(num_annotations)
        missed_annotations_count.append(missed_annotations)
        positive_anchors_count.append(num_positive_anchors)
        negative_anchors_count.append(anchors.shape[0] - num_positive_anchors)

    prop = sum(positive_anchors_count) / sum(negative_anchors_count)
    missed_annotations_stats = stats.describe(missed_annotations_count)
    positive_anchors_stats = stats.describe(positive_anchors_count)
    negative_anchors_stats = stats.describe(negative_anchors_count)

    print("##############################")
    print(
        f"\nResults for parameters:\nPositive IoU: {args.positive_overlap_iou}\nNegative IoU: {args.negative_overlap_iou}"
    )
    print(
        f"\nAnchor parameters: \nsizes: {anchor_params.sizes}\nstrides: {anchor_params.strides}\nratios: {anchor_params.ratios}\nscales: {anchor_params.scales}"
    )
    print("\n-------")
    print(f"\nTotal annotations: {sum(annotations_count)}")
    print(
        f"\nMissed annotations: \nMin, Max: {missed_annotations_stats.minmax} \nMean: {missed_annotations_stats.mean:.3f}"
    )
    print(f"\nProportion of pos/neg anchors: {prop:.5f}")
    print(
        f"\nPositive anchors: \nMin, Max: {positive_anchors_stats.minmax}\nMean: {positive_anchors_stats.mean:.3f}"
    )
    print(
        f"\nNegative anchors: \nMin, Max: {negative_anchors_stats.minmax}\nMean: {negative_anchors_stats.mean:.3f}"
    )

    print("\n")
Example #35
spixels = []
for i in range(0, 13):  # for every pixel:
    for j in range(0, 13):
        spixels.append(pixels[i + 200, j + 200])

print('first sample:', fpixels)
print('second sample:', spixels)

averagef = grades_average(fpixels)
averages = grades_average(spixels)

skewf = skew(fpixels)
skews = skew(spixels)

nf, min_maxf, meanf, varf, skewf, kurtf = stats.describe(fpixels)
ns, min_maxs, means, vars, skews, kurts = stats.describe(spixels)
print('meanf:', meanf)
print('means:', means)
print('varf:', varf)
print('vars:', vars)
print('skewf:', skewf)
print('skews:', skews)
print('kurtf:', kurtf)
print('kurts:', kurts)
#variancef = grades_variance(fpixels, averagef)
#variances = grades_variance(spixels, averages)

ax = pl.subplot(111)
#ax.bar(2, meanf, width=1)
#ax.bar(4, means, width=1)
nvar.myhist(risk_factors["CP1"].diff().dropna())
nvar.myhist(risk_factors["CP2"].diff().dropna())
nvar.myhist(risk_factors["CP3"].diff().dropna())

# the correlation matrix of daily change in CP1\CP2\CP3
risk_factors.diff().dropna().corr()
# Covariance matrix of daily change in CP1,CP2,CP3
risk_factors.diff().dropna().cov()
# Cholesky decompostion
C = linalg.cholesky(risk_factors.diff().dropna().cov())

import scipy.stats as stats

print(
    "means for daily changes in 3 componets:\t",
    stats.describe(risk_factors.diff().dropna()).mean,
)
print(
    "variance for daily changes 3 componets:\t",
    stats.describe(risk_factors.diff().dropna()).variance,
)


# Use montecarlo simulation to analyze the daily Var
# Mainly use the first 3 risk-factors
def myvar(C, dstd, components, num_of_sim=10000):
    # COV = C*C'
    # dstd is the standard deviation of the real ex quots
    # num_of_sim is the number of simulations
    # components is the first 3 principal components of the normalized ex quots
    result = pd.DataFrame(columns=quots_dropna.columns)
def main():
    loc_mention_embeddings = "/Users/elliotschumacher/Dropbox/git/synonym_detection/resources/bilm/out_max/mention_embeddings"
    loc_concept_embeddings = "/Users/elliotschumacher/Dropbox/git/synonym_detection/resources/bilm/out_max/embedding_output"

    dev_file = "/Users/elliotschumacher/Dropbox/concept/share_clef/SPLIT_2017-12-08-13-38-01/train/dev_fix_concrete.tar"

    test_dict = {}
    for (comm, filename) in file_io.CommunicationReader(dev_file):
        for menset in comm.entityMentionSetList[0].mentionList:
            test_dict[menset.uuid.uuidString] = menset


    with open(os.path.join(loc_mention_embeddings, 'mention_representations.npy'),
              'rb') as mention_representations_npy, \
            open(os.path.join(loc_mention_embeddings, 'mention_to_info.pkl'), 'rb') as mention_to_info_pkl, \
            open(os.path.join(loc_mention_embeddings, 'id_to_mention_info.pkl'), 'rb') as id_to_mention_info_pkl:
        mention_representations = np.load(mention_representations_npy)
        id_to_mention_info = pickle.load(id_to_mention_info_pkl)
        mention_to_info = pickle.load(mention_to_info_pkl)

    with open(os.path.join(loc_concept_embeddings, 'concept_representations.npy'),
              'rb') as concept_representations_npy, \
            open(os.path.join(loc_concept_embeddings, 'id_to_concept_name_alt.pkl'),
                 'rb') as id_to_concept_name_alt_pkl, \
            open(os.path.join(loc_concept_embeddings, 'concept_to_id_name_alt.pkl'),
                 'rb') as concept_to_id_name_alt_pkl:
        concept_representations = np.load(concept_representations_npy)
        id_to_concept_info = pickle.load(id_to_concept_name_alt_pkl)
        cui_to_concept_info = pickle.load(concept_to_id_name_alt_pkl)

    output_file = "elmo_exp.csv"
    result_list = []
    input_csv = "/Users/elliotschumacher/Dropbox/git/concept-linker/results/run_2019_03_06_11_01_30_b13/eval_759.csv"
    eval_csv = pd.DataFrame.from_csv(input_csv)

    cos_sims = {}
    shuffled_keys = list(mention_to_info.keys())
    for mention_uuid1 in list(mention_to_info.keys()):
        if mention_uuid1 in test_dict:
            menset = test_dict[mention_uuid1]
            if menset.entityType in cui_to_concept_info:
                random.shuffle(shuffled_keys)
                for i in range(10):
                    mention_uuid2 = shuffled_keys[i]
                    if mention_uuid1 != mention_uuid2:
                        m_indx1 = mention_to_info[mention_uuid1]["index"]
                        m_indx2 = mention_to_info[mention_uuid2]["index"]
                        cos_sim = cosine_similarity(
                            [mention_representations[m_indx1, :]],
                            [mention_representations[m_indx2, :]])[0][0]
                        min_uuid = min(mention_uuid1, mention_uuid2)
                        max_uuid = max(mention_uuid1, mention_uuid2)
                        cos_sims[min_uuid, max_uuid] = cos_sim

    print("Stats for mention cos similarity")
    print(describe(list(cos_sims.values())))

    outer_concept_list = list(cui_to_concept_info)
    inner_concept_list = list(cui_to_concept_info)
    random.shuffle(outer_concept_list)
    cos_sims_cui = {}

    for cui1 in outer_concept_list[:1000]:
        c_indx1 = cui_to_concept_info[cui1][0]["index"]
        c_indexes = random.sample(range(0, len(inner_concept_list)), 10)
        for cui2_indx in c_indexes:
            cui2 = inner_concept_list[cui2_indx]
            c_indx2 = cui_to_concept_info[cui2][0]["index"]
            if c_indx1 != c_indx2:
                cos_sim = \
                cosine_similarity([concept_representations[c_indx1, :]], [concept_representations[c_indx2, :]])[0][0]
                min_uuid = min(cui1, cui2)
                max_uuid = max(cui1, cui2)
                cos_sims_cui[min_uuid, max_uuid] = cos_sim

    print("Stats for concept cos similarity")
    print(describe(list(cos_sims_cui.values())))
    #df = pd.DataFrame([list(cos_sims.keys()), list(cos_sims_cui.keys())], columns=['Mention', 'Concept'])
    plt.hist([list(cos_sims.values()),
              list(cos_sims_cui.values())],
             color=['r', 'b'],
             alpha=0.5)
    plt.gca().legend(('Mentions', 'Concepts'))

    plt.show()

    for _, row in eval_csv.iterrows():
        menset = test_dict[row["~~mention_uuid"]]
        if menset.entityType in cui_to_concept_info:
            mention_info = mention_to_info[menset.uuid.uuidString]
            concept_info = cui_to_concept_info[menset.entityType][0]

            m_indx = mention_info["index"]
            c_indx = concept_info["index"]
            sentence = " ".join([
                w.text.strip()
                for w in menset.tokens.tokenization.tokenList.tokenList
            ])

            m_rep = mention_representations[m_indx, :]
            c_rep = concept_representations[c_indx, :]
            cos_dist = cdist(concept_representations,
                             m_rep.reshape(1, -1),
                             metric='cosine')
            ranking = st.rankdata(cos_dist)

            gold_rank = ranking[c_indx]

            cos_sim = cosine_similarity([m_rep], [c_rep])[0][0]

            print("Cosine dist:{0}, sim:{1}".format(cos_dist[c_indx], cos_sim))
            row["cos_dist"] = cos_sim
            row["sentence"] = sentence
            row["cos_rank"] = gold_rank
            result_list.append(row)

    dataframe = pd.DataFrame.from_records(result_list)
    dataframe.to_csv(output_file, index=False)
"""

# %%
# plot histograms and print descriptive statistics
quantitative_variables = [
    'budget', 'revenue', 'runtime', 'vote_average', 'release_year'
]
for variable in quantitative_variables:
    plt.hist(data[variable], 12, density=1, facecolor='c')
    plt.grid(True)
    plt.xlabel("Значения")
    plt.ylabel("Относительная частота")
    plt.title(f'Распределение по {variable}')
    plt.savefig(f'{variable}.png', bbox_inches='tight')
    plt.show()
    print(sp.describe(data[variable], ddof=1, bias=False))
    print(data[variable].describe())

# %%
# split the "|"-separated values in the director, cast, genres columns
data2 = data
data2['director'] = data.director.apply(lambda x: str(x).split('|'))
data2['cast'] = data.cast.apply(lambda x: str(x).split('|'))
data2['genres'] = data.genres.apply(lambda x: str(x).split('|'))
data3 = data2.explode('director')
data4 = data3.explode('cast')
data5 = data4.explode('genres')
data5

# %%
# plot the genre distribution histogram
Example #39
for line in codecs.open("u.data","r",encoding="latin-1"):
    user,movie,rating,date=line.strip().split("\t")
    user_index=int(user)-1
    movie_index=int(movie)-1
    
    R[user_index,movie_index]=float(rating)

print(R[0,10])
#%% 12-3

from scipy import stats

user_mean_li=[]
for i in range(0,R.shape[0]):
    user_rating=[x for x in R[i] if x>0.0]
    user_mean_li.append(stats.describe(user_rating).mean)

stats.describe(user_mean_li)
#%%
import matplotlib.pyplot as plt
#plt.plot(user_info_li)
#plt.plot(movie_info_li)
#plt.plot(user_mean_li)
#plt.plot(R)
# matrix visualization...?

print(R.shape) #(943, 1682)
print(R.shape[0]) #943
print(R.shape[1]) #1682
print(R[0,2])
print(R[0])
Example #40
def test_stellar_structure_equations(
    file_name="../Example Stars/low_mass_star.txt",
    config=StellarConfiguration()):
    data = np.loadtxt(file_name).T * example_star_units[:, None]

    diff = config.T_prime_radiative(data[ex_r_index, :], data[ex_rho_index, :], data[ex_T_index, :],
                                    data[ex_L_index, :]) - \
        config.T_prime_convective(data[ex_r_index, :], data[ex_rho_index, :], data[ex_T_index, :],
                                  data[ex_M_index, :])

    rho_prime_actual = config.rho_prime(data[ex_r_index, :],
                                        data[ex_rho_index, :],
                                        data[ex_T_index, :],
                                        data[ex_M_index, :],
                                        data[ex_L_index, :])
    rho_prime_expected = data[ex_rho_prime_index, :]
    print(
        "Rho Prime Percentage Error:",
        stats.describe(
            (rho_prime_actual - rho_prime_expected) / rho_prime_expected))

    T_prime_actual = config.T_prime(data[ex_r_index, :], data[ex_rho_index, :],
                                    data[ex_T_index, :], data[ex_M_index, :],
                                    data[ex_L_index, :])
    T_prime_expected = data[ex_T_prime_index, :]
    print(
        "T Prime Percentage Error:",
        stats.describe((T_prime_actual - T_prime_expected) / T_prime_expected))

    M_prime_actual = config.M_prime(data[ex_r_index, :], data[ex_rho_index, :])
    M_prime_expected = data[ex_M_prime_index, :]
    print(
        "M PrimePercentage Error:",
        stats.describe((M_prime_actual - M_prime_expected) / M_prime_expected))

    L_prime_actual = config.L_prime(data[ex_r_index, :], data[ex_rho_index, :],
                                    data[ex_T_index, :])
    L_prime_expected = data[ex_L_prime_index, :]
    print(
        "L prime Percentage Error:",
        stats.describe((L_prime_actual - L_prime_expected) / L_prime_expected))

    P_actual = config.P(data[ex_rho_index, :], data[ex_T_index, :])
    P_expected = data[ex_P_index, :]
    print("P Percentage Error:",
          stats.describe((P_actual - P_expected) / P_expected))

    P_degeneracy_actual = config.P_degeneracy(data[ex_rho_index, :])
    P_degeneracy_expected = data[ex_P_degeneracy_index, :]
    print(
        "P_degeneracy Percentage Error:",
        stats.describe((P_degeneracy_actual - P_degeneracy_expected) /
                       P_degeneracy_expected))

    P_gas_actual = config.P_gas(data[ex_rho_index, :], data[ex_T_index, :])
    P_gas_expected = data[ex_P_gas_index, :]
    print("P_gas Percentage Error:",
          stats.describe((P_gas_actual - P_gas_expected) / P_gas_expected))

    kappa_actual = config.kappa(data[ex_rho_index, :], data[ex_T_index, :])
    kappa_expected = data[ex_kappa_index, :]
    print("Kappa Percentage Error:",
          stats.describe((kappa_actual - kappa_expected) / kappa_expected))
示例#41
0
    model = Word2Vec(sentences=corpus,
                     size=size,
                     window=window,
                     min_count=1,
                     workers=multiprocessing.cpu_count(),
                     sg=0)
    return model


#%% Hyperparameter optimization
total_word_num(data)
num_word_by_document = []
for i in range(len(data)):
    num_word_by_document.append(len(data[i]))
from scipy import stats
desc = stats.describe(num_word_by_document)  # keep the result separate so the stats module is not shadowed
desc.mean
np.sqrt(desc.variance)
size = [100, 200, 300, 400, 500]
window = [5, 6, 7, 8, 9, 10]

# Getting the closest word to the keyword
print("Word2Vec model training")
words_by_window = []
for w in window:
    words_by_size = []
    for s in size:
        model = model_w2v(data, s, w)
        words_by_size.append(model.wv.most_similar(keyword, topn=10))
        print(
            "Central words to the keyword (hidden layer: {}, window: {}):\n{}\n"
            .format(s, w, words_by_size[-1]))
示例#42
0
def topological_features(floormap, prepare_for_doom=False):
    """
    Create the level graph from the floormap and compute some topological features on the graph.
    :param floormap:
    :param prepare_for_doom: (Default:False) If true each node will also contain vertices and walls information for converting the level to a WAD file.
    :return: (room map, room_graph, dict of metrics)
    """
    roommap, room_graph, dist = create_graph(floormap,
                                             return_dist=True,
                                             room_coordinates=prepare_for_doom)
    room_props = regionprops(roommap)
    for r in range(1, roommap.max() + 1):
        # Room Size
        room_graph.node[r]["area"] = room_props[r - 1]["area"]
        room_graph.node[r]["perimeter"] = room_props[r - 1]["perimeter"]
        mask = (roommap == r)
        max_dist = np.max(mask * dist)
        room_graph.node[r]["max_dist"] = max_dist
        room_graph.node[r]["centroid"] = room_props[r - 1]["centroid"]

        # TODO: Add information about other maps, such as enemies, etc.

    centroid_distance = dict()
    for i, j in room_graph.edges():
        # Decorate the edges with the distance
        if i == 0 or j == 0:
            continue
        centroid_distance[(i, j)] = np.linalg.norm(
            np.asarray(room_graph.node[i]["centroid"]) -
            np.asarray(room_graph.node[j]["centroid"])).item()
    nx.set_edge_attributes(room_graph,
                           name='centroid_distance',
                           values=centroid_distance)

    # To compute correct metrics we need to remove node 0, which is the background
    graph_no_background = room_graph.copy()
    graph_no_background.remove_node(0)
    metrics = dict()
    # Computing metrics from "Predicting the Global Structure of Indoor Environments: A costructive Machine Learning Approach", (Luperto, Amigoni, 2018)
    #####
    metrics["nodes"] = len(nx.nodes(graph_no_background))
    pl_list = list()
    diam_list = list()
    assort_list = list()
    for cc in nx.connected_component_subgraphs(graph_no_background):
        if len(cc.edges()) > 0:
            pl_list += [nx.average_shortest_path_length(cc)]
            diam_list += [nx.diameter(cc)]
            assort_list += [nx.degree_assortativity_coefficient(cc)]

    metrics["avg-path-length"] = np.mean(pl_list) if len(pl_list) > 0 else 0
    metrics["diameter-mean"] = np.mean(diam_list) if len(diam_list) > 0 else 0
    metrics["art-points"] = len(
        list(nx.articulation_points(graph_no_background)))
    metrics["assortativity-mean"] = np.mean(assort_list) if len(
        assort_list) > 0 else 0
    try:
        # Centrality measures
        metrics["betw-cen"] = nx.betweenness_centrality(graph_no_background)
        metrics["closn-cen"] = nx.closeness_centrality(graph_no_background)
        # These metrics may throw exceptions
        # metrics["eig-cen"] = nx.eigenvector_centrality_numpy(graph_no_background)
        # metrics["katz-cen"] = nx.katz_centrality_numpy(graph_no_background)

        # Describing node stat distributions and removing them from the dict
        for met in ['betw-cen', 'closn-cen']:
            values = list(metrics['{}'.format(met)].values())
            st = describe(values)

            metrics["{}-min".format(met)] = st.minmax[0]
            metrics["{}-max".format(met)] = st.minmax[1]
            metrics["{}-mean".format(met)] = st.mean
            metrics["{}-var".format(met)] = st.variance
            metrics["{}-skew".format(met)] = st.skewness
            metrics["{}-kurt".format(met)] = st.kurtosis
            # Quartiles
            metrics["{}-Q1".format(met)] = np.percentile(values, 25)
            metrics["{}-Q2".format(met)] = np.percentile(values, 50)
            metrics["{}-Q3".format(met)] = np.percentile(values, 75)
            del metrics[met]
    except:
        warnings.warn("Unable to compute centrality for this level")
        metrics["betw-cen"] = np.nan
        metrics["closn-cen"] = np.nan
    #####

    # Metrics on distance map. Ignoring black space surrounding the level
    cleandist = np.where(dist == 0, np.nan, dist)
    dstat = describe(cleandist, axis=None, nan_policy='omit')
    metrics["distmap-max"] = dstat.minmax[1]
    metrics["distmap-mean"] = dstat.mean
    metrics["distmap-var"] = dstat.variance
    metrics["distmap-skew"] = dstat.skewness
    metrics["distmap-kurt"] = dstat.kurtosis
    # Quartiles (computed on the distance map itself, ignoring the NaN background)
    metrics["distmap-Q1"] = np.nanpercentile(cleandist, 25)
    metrics["distmap-Q2"] = np.nanpercentile(cleandist, 50)
    metrics["distmap-Q3"] = np.nanpercentile(cleandist, 75)

    return roommap, room_graph, metrics
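
# A hedged usage sketch, not part of the original module: build a toy binary floormap
# and compute its metrics. It assumes the module-level helpers (create_graph, numpy as
# np, networkx as nx, regionprops, describe) are importable; note that the
# room_graph.node[...] and connected_component_subgraphs calls above require an older
# networkx (< 2.4).
toy_floormap = np.zeros((32, 32), dtype=int)
toy_floormap[2:12, 2:12] = 1       # one rectangular room
toy_floormap[2:12, 14:28] = 1      # a second room
toy_floormap[6:8, 12:14] = 1       # corridor connecting them
roommap, room_graph, metrics = topological_features(toy_floormap)
print(metrics["nodes"], metrics["avg-path-length"], metrics["diameter-mean"])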
示例#43
0
def merge_text_events_with_timeseries(problem_type, data, text_reader, w2i_lookup, conf_max_len,
                                      dump_information=False, fname=None):
    text_not_found = 0
    sucessful = 0

    text_event_lens = []
    data_with_text = []

    if dump_information:
        text_count_by_hour = {}
        patient_count_by_hour = {}
        text_len_by_hour = {}

    maximum_index_output = -1
    for batch in data:
        ip, op, _ = batch['data']
        X = ip[0]
        mask = ip[2]
        if problem_type == 'decom':
            ts = batch['decomp_ts']
            output = op[1]
        elif problem_type == 'los':
            ts = batch['los_ts']
            output = op[2]
            maximum_index_output = max(maximum_index_output, output.max())
        assert_shapes(X, mask, output)
        text_event_dictionary = text_reader.read_all_text_events_json(
            batch['names'])

        max_len = -1
        for i, name in enumerate(batch['names']):
            if name not in text_event_dictionary:
                continue
            text_events = text_event_dictionary[name]
            hours = map(lambda x: x[0], text_events)
            hours = list(filter(lambda h: h <= X.shape[1], hours))
            max_len = max(max_len, len(hours))

        final_items = []
        for i, name in enumerate(batch['names']):
            # timerow represents 1 patient.
            # first timestep is 5.
            if name not in text_event_dictionary:
                text_not_found += 1
                continue
            else:
                sucessful += 1
            # if sucessful % 5000 == 0:
            #    print("Scccessful:", sucessful)
            mask_i = mask[i]
            X_i = X[i]
            output_i = output[i]
            ts_i = ts[i]
            if len(ts_i) == 0:
                continue
            text_events = text_event_dictionary[name]
            assert len(text_events[0]) == 2
            hours = list(map(lambda x: x[0], text_events))[:max_len]
            texts = list(map(lambda x: x[1], text_events))[:max_len]
            if dump_information:
                assert fname is not None
                count = 0
                length = 0
                for t in ts_i:
                    if t in patient_count_by_hour:
                        patient_count_by_hour[t] += 1
                    else:
                        patient_count_by_hour[t] = 1
                    if t in hours:
                        count += 1
                        length += len(texts[hours.index(t)])
                    if t not in text_count_by_hour:
                        text_count_by_hour[t] = 0
                        text_len_by_hour[t] = 0
                    text_count_by_hour[t] += count
                    text_len_by_hour[t] += length

            assert len(hours) == len(texts)
            text_event_lens.append(len(texts))

            # generate 2D TimeMask for 1DConvolution.
            time_mask = np.zeros((mask_i.shape[0], max_len))

            if max(ts_i) >= mask_i.shape[0]:
                ts_i = [ti for ti in ts_i if ti < mask_i.shape[0]]

            for t in ts_i:
                for ind, h in enumerate(hours):
                    if h > t:
                        break
                    time_mask[t][ind] = t-h+1
                    assert time_mask[t][ind] >= 0

            final_items.append(
                {'X': X_i, 'Out': output_i, 'Mask': mask_i, 'Text': texts, 'TimeMask': time_mask})

        if len(final_items) >= 1:
            # Now post process.
            X = np.stack(list(map(lambda x: x['X'], final_items)))
            Output = np.stack(list(map(lambda x: x['Out'], final_items)))
            Mask = np.stack(list(map(lambda x: x['Mask'], final_items)))
            TimeMask = np.stack(
                list(map(lambda x: x['TimeMask'], final_items)))
            Texts, _ = generate_tensor_text(
                list(map(lambda x: x['Text'], final_items)), w2i_lookup, conf_max_len)
            try:
                assert_shapes(X, Mask, Output, TimeMask, Texts)
                data_with_text.append(
                    {'X': X, 'Output': Output, 'Mask': Mask, 'TimeMask': TimeMask, 'Texts': Texts})
            except:
                print("Merge failed due to shape issue")

    print("Text Not found for patients: ", text_not_found)
    print("Sucessful for patients: ", sucessful)
    print("Maximum value in Output: ", maximum_index_output)

    text_event_lens = np.array(text_event_lens)
    from scipy import stats
    print(stats.describe(text_event_lens))

    if dump_information:
        with open(fname, 'wb') as f:
            pickle.dump({'text_count_by_hour': text_count_by_hour,
                         'patient_count_by_hour': patient_count_by_hour,
                         'text_lens_by_hour': text_len_by_hour},
                        f, pickle.HIGHEST_PROTOCOL)

    return data_with_text, text_event_lens
示例#44
0
def runAnalysis(trainFilename,
                testFilename,
                labelFilename,
                labelCol,
                labelName,
                trainYr=1999,
                testYr=2009,
                grams=(2, 5),
                addWrdCnt=False,
                addCntry=False):

    # Incorporate gram specific path
    if grams[1] == grams[0]:
        gramDir = 'grams' + str(grams[1])
    if grams[1] != grams[0]:
        gramDir = 'grams' + str(grams[0]) + '_' + str(grams[1])
    ###

    # Load data
    trainData = buildData(textFile=trainFilename,
                          sYr=trainYr,
                          labelFile=labelFilename)

    testData = buildData(textFile=testFilename,
                         sYr=testYr,
                         labelFile=labelFilename)
    ####

    # Divide into train and test and convert
    # to appropriate format
    vectorizer = TfidfVectorizer(ngram_range=grams)

    xTrain = vectorizer.fit_transform(trainData[:, 1])
    yTrain = np.array([int(x) for x in list(trainData[:, labelCol])])

    print('Saving tfidf')
    vec_file = '{}_tfidf.pkl'.format(labelName)
    joblib.dump(vectorizer, vec_file)

    xTest = vectorizer.transform(testData[:, 1])
    yTest = np.array([int(x) for x in list(testData[:, labelCol])])

    # Add other features
    if (addWrdCnt):
        wTrain = csr_matrix(np.array(list(trainData[:, 2]))).transpose()
        wTest = csr_matrix(np.array(list(testData[:, 2]))).transpose()

        xTrain = hstack((xTrain, wTrain))
        xTest = hstack((xTest, wTest))

    if (addCntry):
        cntryYr = [x.split('_')[0] for x in trainData[:, 0]]
        from pandas import factorize
        cntryYr = factorize(cntryYr)[0]
        cTrain = csr_matrix(np.array(list(cntryYr))).transpose()

        cntryYr = [x.split('_')[0] for x in testData[:, 0]]
        cntryYr = factorize(cntryYr)[0]
        cTest = csr_matrix(np.array(list(cntryYr))).transpose()

        xTrain = hstack((xTrain, cTrain))
        xTest = hstack((xTest, cTest))
    #####

    # Run SVM with linear kernel
    print('Fitting SVM')
    svmClass = LinearSVC().fit(xTrain, yTrain)
    yConfSVM = list(svmClass.decision_function(xTest))
    yPredSVM = svmClass.predict(xTest)

    svm_class_file = '{}_svm_class.pkl'.format(labelName)
    joblib.dump(svmClass, svm_class_file)

    print('SVM2')
    svmClass_2 = SVC(kernel='linear', probability=True).fit(xTrain, yTrain)
    yProbSVM = svmClass_2.predict_proba(xTest)

    svm_class2_file = '{}_svm_class2.pkl'.format(labelName)
    joblib.dump(svmClass_2, svm_class2_file)
    #####

    # Performance stats
    outpath = _get_data('../../results', gramDir)
    if addWrdCnt:
        outName = (labelName + '_train' + trainFilename.split('_')[1] +
                   '_test' + testFilename.split('_')[1] + '_xtraFt' + '.txt')
        outName = os.path.join(outpath, outName)
    else:
        outName = (labelName + '_train' + trainFilename.split('_')[1] +
                   '_test' + testFilename.split('_')[1] + '.txt')
        outName = os.path.join(outpath, outName)
    orig_stdout = sys.stdout
    out = open(outName, 'w')
    sys.stdout = out
    print '\nTrain Data from: ' + trainFilename
    print '\t\tTrain Data Cases: ' + str(xTrain.shape[0])
    print '\t\tMean of y in train: ' + str(round(describe(yTrain)[2],
                                                 3)) + '\n'
    print 'Test Data from: ' + testFilename
    print '\t\tTest Data Cases: ' + str(xTest.shape[0])
    print '\t\tMean of y in test: ' + str(round(describe(yTest)[2], 3)) + '\n'
    prStats('SVM', grams, yTest, yPredSVM)
    out.close()
    sys.stdout = orig_stdout
    #####

    # Print data with prediction
    trainCntry = np.array([[x.split('_')[0].replace(',', '')]
                           for x in list(trainData[:, 0])])
    trainYr = np.array([[x.split('_')[1]] for x in list(trainData[:, 0])])
    testCntry = np.array([[x.split('_')[0].replace(',', '')]
                          for x in list(testData[:, 0])])
    testYr = np.array([[x.split('_')[1]] for x in list(testData[:, 0])])

    vDat = np.array(
        [[x] for x in flatten([['train'] * trainData.shape[0], ['test'] *
                               testData.shape[0]])])

    trainLab = np.array([[x] for x in list(trainData[:, labelCol])])
    testLab = np.array([[x] for x in list(testData[:, labelCol])])

    if labelName[0:6] == 'polCat':
        probSVM = [';'.join(['%s' % x for x in row]) for row in yProbSVM]
        confSVM = [
            ';'.join(['%s' % x for x in sublist]) for sublist in yConfSVM
        ]
    if labelName[0:6] != 'polCat':
        probSVM = [x[1] for x in yProbSVM]
        confSVM = yConfSVM

    filler = [-9999] * trainData.shape[0]
    predSVM = np.array([[x] for x in flatten([filler, list(yPredSVM)])])
    probSVM = np.array([[x] for x in flatten([filler, probSVM])])
    confSVM = np.array([[x] for x in flatten([filler, confSVM])])

    output = np.hstack((np.vstack(
        (trainCntry, testCntry)), np.vstack(
            (trainYr, testYr)), vDat, np.vstack(
                (trainLab, testLab)), np.hstack((confSVM, probSVM, predSVM))))

    outCSV = outName.replace('.txt', '.csv')
    outCSV = os.path.join(outpath, outCSV)
    with open(outCSV, 'wb') as f:
        f.write(b'country,year,data,' + labelName +
                ',confSVM,probSVM,predSVM\n')
        np.savetxt(f, output, delimiter=',', fmt="%s")

    # Print top features for classes from SVM
    infFeatures(outpath, outName.replace('.txt', '._wrdFtr.csv'), vectorizer,
                svmClass, 500)
示例#45
0
def doStats(warmupdata,
            Data,
            doGraphs=False,
            doWriteStdout=False,
            graphFilenameStub=''):
    # Mean, min, max, variance
    (nsamples, (min, max), mean, unbiasedvar, skew, kurtosis) = stat.describe(
        Data
    )  # Unbiasedvar is actually the reduced-bias estimator of population variance (~1/(N-1)...)
    # Standard error of mean:
    sem = stat.sem(Data)  # Not yet correlation corrected

    # Compute the autocorrelation function:
    Data_dcshift = Data - mean
    #DataNorm=np.sum(np.square(Data_dcshift))
    cor = np.correlate(Data_dcshift, Data_dcshift, mode='same') / unbiasedvar
    autocor = cor[int(cor.size / 2):]
    autocor = autocor / np.arange(nsamples - 1, nsamples - 1 - autocor.size,
                                  -1)  # Note -1 for 0-based indexing

    # Choose where to cutoff the autocorrelation time sum
    cutoff = autocor.size
    j = 0
    while j < cutoff:
        if autocor[j] < np.sqrt(2. / (nsamples - j)):
            cutoff = np.minimum(cutoff, 5 * j)
        j = j + 1
    # Compute correlation time
    kappa = 1. + 2. * np.sum(autocor[1:int(2. * cutoff / 5.)])
    # We can also make an array of all possible cutoffs
    if doGraphs:
        kappa_cutoffdep = np.ones(autocor.size)
        for jc in range(1, autocor.size):
            kappa_cutoffdep[jc] = 1 + 2 * np.sum(autocor[1:jc])
    # Update the standard error of the mean for a correlation correction
    semcc = sem * np.sqrt(kappa)
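    # (Added note) sqrt(kappa) rescales the naive standard error so that it reflects
    # the effective number of independent samples, N_eff = nsamples / kappa (printed
    # below as "Effective # samples").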

    # Manual (non-Numpy) autocorrelation function for transparency - verified equal
    #j=0
    #cutoff=nsamples
    #autocor_m=np.zeros(cutoff)
    #while j < cutoff:
    #    autocor_m[j]=0.
    #    for i in range(0,nsamples-j):
    #        autocor_m[j] = autocor_m[j] + (Data[warmup+i]-mean)*(Data[warmup+i+j]-mean)
    #    autocor_m[j]=autocor_m[j]/(unbiasedvar*(nsamples-j))
    #    if autocor_m[j] < np.sqrt(2./(nsamples-j)):
    #        cutoff = np.minimum(cutoff,5*j)
    #    j=j+1

    if doWriteStdout == True:
        print("  - Mean                    = ", mean, " +/- ", semcc)
        print("  - Equilibrated samples    = ", nsamples)
        print("  - Correlation time        = ", kappa)
        print("  - Effective # samples     = ", nsamples / kappa)
        print("  - Reduced-bias variance   = ", unbiasedvar)
        # note that there is no unbiased estimator for the population standard deviation. We can use sqrt(var) as a indicative estimator.
        print(
            "  - S.D. (unbiased, biased) = ", np.sqrt(unbiasedvar),
            np.std(Data, ddof=0)
        )  # ddof is correction to 1/N...using ddof=1 returns regular reduced-bias estimator
        print("  - Skewness                = ", skew)
        print("  - Kurtosis                = ", kurtosis)
        print("  - Min, Max                = ", min, max)
        #print  ( Reduced bias estimator - test vs. above from sqrt(var))

    if doGraphs:
        import matplotlib.pyplot as pl  # If we import pylab instead, we get matplotlib.pyplot and numpy both under the global namespace for MATLAB-like syntax and reduced typing (useful for interactive use)

        # Plot some things
        pl.figure(num=1, figsize=(15, 10))
        #
        pl.subplot(221)  # Select first panel in 2x2 grid...
        pl.title("Trace of Data")
        pl.plot(np.concatenate([warmupdata, Data]))
        pl.ylim([0.98 * min, 1.02 * max])
        pl.axhline(mean, color='red')
        pl.axvspan(0, len(warmupdata), color='green', alpha=0.5)
        pl.axvline(len(warmupdata), color='green')
        pl.xlabel("Sample index")
        #
        # Generate a histogram of the data
        # Not needed now - just do a histogram plot directly
        #hist=stat.histogram(Data)
        #print hist
        #hist=np.histogram(Data)
        #print hist[0]
        #print hist[1]
        #print type(hist[0])
        pl.subplot(222)
        pl.title("Histogram of Data")
        n, bins, patches = pl.hist(Data,
                                   25,
                                   normed=1,
                                   facecolor="green",
                                   alpha=0.5)
        #ygauss=stat.norm(bins,mean,np.sqrt(unbiasedvar))
        pl.plot(bins, stat.norm.pdf(bins, mean, np.sqrt(unbiasedvar)), 'r--')
        #
        pl.subplot(223)
        pl.plot(autocor[:cutoff])
        x = np.arange(0, cutoff)
        pl.plot(x, np.exp(-x / kappa))
        pl.title("Autocorrelation function")
        pl.xlabel('$\\tau$')
        pl.ylabel('$C\\left(\\tau\\right)$')
        pl.axhline(0, color='black')
        #pl.xlim(0,plotxmax)
        #
        pl.subplot(224)
        pl.plot(kappa_cutoffdep)
        pl.title("Correlation time estimator vs. cutoff")
        #pl.xlabel('$\\tau_{cut}$')
        #pl.ylabel('$\\Kappa$')
        #pl.axhline(0,color='black')
        #
        pl.savefig("stats_{0}.png".format(graphFilenameStub))

    return (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor)
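
# A minimal usage sketch, not part of the original script: generate a synthetic series,
# treat the first 10% as warm-up, and run the analysis. It assumes the module imports
# numpy as np and scipy.stats as stat, as the function body above does.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    series = rng.normal(size=5000)
    nwarm = len(series) // 10
    nsamples, minmax, mean, semcc, kappa, var, autocor = doStats(
        series[:nwarm], series[nwarm:], doWriteStdout=True)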
示例#46
0
    start_t = time()

    for theta in np.linspace(0, tau, RESOLUTION)[1053:]:  #
        new_circle = rotate_list_of_points(circle, theta)
        sextant = ary(circle_to_sextant(new_circle))
        '''
		#plot the initial state
		plt.scatter(sextant[:,0], sextant[:,1])
		plt.title("Intital distribution")
		plt.show()
		'''
        all_points = [Point(p) for p in sextant]

        step = 0
        force_mag = [quadrature(p.get_force(all_points)) for p in all_points]
        des = describe(force_mag)
        while des.minmax[
                1] > F_LIMIT and des.mean > F_LIMIT / 2.5:  #For F_LIMIT = .5% of radius, this should take a bit more than 200 iterations.
            forces = ary([p.get_force(all_points) for p in all_points])
            force_mag = [quadrature(f) for f in forces]
            '''
			plt.hist(force_mag, bins = 200)
			sns.kdeplot(force_mag)
			plt.title("forces magnitudes at step "+str(step))
			plt.show()
			'''
            #get the statistical information about the forces' magnitudes.
            des = describe(force_mag)

            #calculate step size to take
            if des.minmax[1] < F_LIMIT * 15 and des.mean < F_LIMIT * 15 / 2.5:
示例#47
0
# %%
j1.mode()
# %% [markdown]
# ## combinaciones, permutaciones y exponenciales
# ## scipy.special

# %%
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

# %%

# %%
x = [1, 2, 3, 4, 5]
stats.describe(x)

# %%
normal = stats.norm()
# %%
x = np.arange(-3, 3.1, 0.1)
# %%
x
# %%
plt.plot(x, normal.pdf(x))
plt.show()
# %%
normal.expect()
# %%
normal.interval(0.95)
# %%
示例#48
0
    def extractfeatures(self, DICOMImages, image_pos_pat, image_ori_pat,
                        series_path, phases_series, VOI_mesh):
        """ Start pixVals for collection pixel values at VOI """
        pixVals_margin = []
        pixVals = []
        Fmargin = {}
        voxel_frameS = {}

        # necessary to read point coords
        VOIPnt = [0, 0, 0]
        ijk = [0, 0, 0]
        pco = [0, 0, 0]

        for i in range(len(DICOMImages)):
            # find mapping to Dicom space
            [transformed_image, transform_cube
             ] = Display().dicomTransform(DICOMImages[i], image_pos_pat,
                                          image_ori_pat)
            if (i == 0):
                # create mask from segmentation
                np_VOI_mask = self.createMaskfromMesh(VOI_mesh,
                                                      transformed_image)

            for j in range(VOI_mesh.GetNumberOfPoints()):
                VOI_mesh.GetPoint(j, VOIPnt)

                # extract pixID at location VOIPnt
                pixId = transformed_image.FindPoint(VOIPnt[0], VOIPnt[1],
                                                    VOIPnt[2])
                im_pt = [0, 0, 0]

                transformed_image.GetPoint(pixId, im_pt)
                inorout = transformed_image.ComputeStructuredCoordinates(
                    im_pt, ijk, pco)
                if (inorout == 0):
                    pass
                else:
                    pixValx = transformed_image.GetScalarComponentAsFloat(
                        ijk[0], ijk[1], ijk[2], 0)
                    pixVals_margin.append(pixValx)

            # Now collect pixVals
            print "\n Saving %s" % 'Fmargin' + str(i)
            Fmargin['Fmargin' + str(i)] = pixVals_margin
            pixVals_margin = []

            # extract pixID at inside VOIPnt
            VOI_scalars = transformed_image.GetPointData().GetScalars()
            np_VOI_imagedata = vtk_to_numpy(VOI_scalars)

            dims = transformed_image.GetDimensions()
            spacing = transformed_image.GetSpacing()
            np_VOI_imagedata = np_VOI_imagedata.reshape(
                dims[2], dims[1], dims[0])
            np_VOI_imagedata = np_VOI_imagedata.transpose(2, 1, 0)

            #################### Get the internal pixels and mask out everything else
            VOI_imagedata = np_VOI_imagedata[nonzero(np_VOI_mask)]

            for j in range(len(VOI_imagedata)):
                pixValx = VOI_imagedata[j]
                pixVals.append(pixValx)

            # Now collect pixVals
            print "\n Saving %s" % 'F' + str(i)
            voxel_frameS['F' + str(i)] = pixVals
            pixVals = []

        ##############################################################
        # Initialize features
        self.i_var = []
        self.alln_F_r_i = []
        self.allmin_F_r_i = []
        self.allmax_F_r_i = []
        self.allmean_F_r_i = []
        self.allvar_F_r_i = []
        self.allskew_F_r_i = []
        self.allkurt_F_r_i = []
        F_r_0 = array(voxel_frameS['F' + str(0)]).astype(float)
        n, min_max, meanFr, var_F_r_0, skew, kurt = stats.describe(F_r_0)
        self.i_var_max = 0

        # Collect to Compute inhomogeneity variance of uptake and other variables
        for k in range(1, len(DICOMImages)):
            F_r_i = array(voxel_frameS['F' + str(k)]).astype(float)
            print "\nF_r_i parameters %s" % str(k)
            n_F_r_i, min_max_F_r_i, mean_F_r_i, var_F_r_i, skew_F_r_i, kurt_F_r_i = stats.describe(
                F_r_i)

            print("Number of internal voxels: {0:d}".format(n_F_r_i))
            self.alln_F_r_i.append(n_F_r_i)
            print("Minimum: {0:8.6f} Maximum: {1:8.6f}".format(
                min_max_F_r_i[0], min_max_F_r_i[1]))
            self.allmin_F_r_i.append(min_max_F_r_i[0])
            self.allmax_F_r_i.append(min_max_F_r_i[1])
            print("Mean: {0:8.6f}".format(mean_F_r_i))
            self.allmean_F_r_i.append(mean_F_r_i)
            print("Variance F_r_i: {0:8.6f}".format(var_F_r_i))
            self.allvar_F_r_i.append(var_F_r_i)
            print("Skew : {0:8.6f}".format(skew_F_r_i))
            self.allskew_F_r_i.append(skew_F_r_i)
            print("Kurtosis: {0:8.6f}".format(kurt_F_r_i))
            self.allkurt_F_r_i.append(kurt_F_r_i)

            print("Variance of uptake: {0:8.6f}".format(var_F_r_i / var_F_r_0))
            self.i_var.append(var_F_r_i / var_F_r_0)

            # Find max of change in variance of uptake
            if (self.i_var[k - 1] > self.i_var_max):
                self.i_var_max = self.i_var[k - 1]

        print("\nMax Variance of uptake: {0:8.6f}\n".format(self.i_var_max))

        # Collect to Compute change in variance of uptake
        self.ii_var = []
        self.ii_var_min = 1000
        for k in range(len(DICOMImages) - 1):
            F_r_i = array(voxel_frameS['F' + str(k)]).astype(float)
            F_r_iplus = array(voxel_frameS['F' + str(k + 1)]).astype(float)
            n, min_max, meanFr, var_F_r_ith, skew, kurt = stats.describe(F_r_i)
            n, min_max, meanFr, var_F_r_iplus, skew, kurt = stats.describe(
                F_r_iplus)
            """change Variance of uptake:"""
            self.ii_var.append(var_F_r_ith / var_F_r_iplus)

            # Find max of change in variance of uptake
            if (var_F_r_ith / var_F_r_iplus < self.ii_var_min):
                self.ii_var_min = var_F_r_ith / var_F_r_iplus

        print("Min change Variance of uptake: {0:8.6f}\n".format(
            self.ii_var_min))

        # Extract features for sharpness of lesion margin, compute Margin gradient iii_var
        # The gradient is computed using convolution with a 3D sobel filter using scipy.ndimage.filters.sobel
        # The function generic_gradient_magnitude calculates a gradient magnitude using the function passed through derivative to calculate first derivatives.
        F_rmargin_0 = array(Fmargin['Fmargin' + str(0)]).astype(float)
        self.iii_var_max = -1000
        iii_Sobelvar = []

        # Collect to Compute variance of uptake and other variables
        for k in range(1, len(DICOMImages)):
            F_rmargin_i = array(Fmargin['Fmargin' + str(k)]).astype(float)

            margin_delta = F_rmargin_i - F_rmargin_0
            # using first sobel and then prewitt
            sobel_grad_margin_delta = generic_gradient_magnitude(
                margin_delta, sobel)

            # compute feature Margin Gradient
            n, min_max, mean_sobel_grad_margin, var, skew, kurt = stats.describe(
                sobel_grad_margin_delta)
            n, min_max, mean_F_rmargin_i, var_F_r_ith, skew, kurt = stats.describe(
                F_rmargin_i)
            """Margin Gradient"""
            iii_Sobelvar.append(mean_sobel_grad_margin / mean_F_rmargin_i)

            # Find max of Margin Gradient
            if (iii_Sobelvar[k - 1] > self.iii_var_max):
                self.iii_var_max = iii_Sobelvar[k - 1]
                self.iii_var_max_k = k

        print("Max Margin Gradient: {0:8.6f}".format(self.iii_var_max))
        print("k for Max Margin Gradient: {0:8.6f}".format(self.iii_var_max_k))

        # compute iv feature Variance of Margin Gradient
        # note: only computed from the subtraction frames of i and 0 where the margin gradient iii_var is maximum.
        self.ivVariance = []
        F_rmargin_iv = array(Fmargin['Fmargin' +
                                     str(self.iii_var_max_k)]).astype(float)
        n, min_max, mean_F_rmargin_iv, var_F_r_ith, skew, kurt = stats.describe(
            F_rmargin_iv)

        margin_delta_iv = F_rmargin_iv - F_rmargin_0

        # using first sobel and then prewitt
        sobel_grad_margin_delta_iv = generic_gradient_magnitude(
            margin_delta_iv, sobel)
        n, min_max, mean_sobel, var_sobel_grad_margin_delta_iv, skew, kurt = stats.describe(
            sobel_grad_margin_delta_iv)

        self.ivVariance = var_sobel_grad_margin_delta_iv / mean_F_rmargin_iv**2

        print("Variance of spatial Margin Gradient: {0:8.6f}".format(
            self.ivVariance))

        # Extract Shape features: pre-requisite is the Volume and the diameter of the lesion
        ####################################
        # Measure VOI
        ###################################
        VOI_massProperty = vtk.vtkMassProperties()
        VOI_massProperty.SetInputData(VOI_mesh)
        VOI_massProperty.Update()

        # get VOI volume
        # VTK is unitless. The units you get out are the units you put in.
        # If your input polydata has points defined in terms of millimetres, then
        # the volume will be in cubic millimetres.
        VOI_vol = VOI_massProperty.GetVolume()  # mm3
        VOI_surface = VOI_massProperty.GetSurfaceArea()  # mm2

        # just print the results
        print "\nVolume lesion = ", VOI_vol
        print "Surface lesion  = ", VOI_surface

        # Calculate the effective diameter of the surface D=2(sqrt3(3V/(4pi)))
        diam_root = (3 * VOI_vol) / (4 * pi)
        self.VOI_efect_diameter = 2 * pow(diam_root, 1.0 / 3)
        print "VOI_efect_diameter = ", self.VOI_efect_diameter

        centerOfMassFilter = vtk.vtkCenterOfMass()
        centerOfMassFilter.SetInputData(VOI_mesh)
        centerOfMassFilter.SetUseScalarsAsWeights(False)
        centerOfMassFilter.Update()

        # centroid of lesion
        self.lesion_centroid = [0, 0, 0]
        self.lesion_centroid = centerOfMassFilter.GetCenter()
        print "lesion_centroid = ", self.lesion_centroid

        # create a sphere to compute the volume of lesion within a sphere of effective diameter
        sphere_effectD = vtk.vtkSphereSource()
        sphere_effectD.SetRadius(self.VOI_efect_diameter / 2)  #VOI_diameter/2
        sphere_effectD.SetCenter(self.lesion_centroid)
        sphere_effectD.Update()

        # compute volume of lesion within a sphere of effective diameter
        sphereVOI_massProperty = vtk.vtkMassProperties()
        sphereVOI_massProperty.SetInputData(sphere_effectD.GetOutput())
        sphereVOI_massProperty.Update()
        sphereVOI_vol = sphereVOI_massProperty.GetVolume()  # mm3

        # just print the results
        print "Volume sphereVOI = ", sphereVOI_vol

        # Compute Shape of lesion in 3D
        # Circularity
        epsilon = 0.001
        self.circularity = sphereVOI_vol / (VOI_vol + epsilon)
        print("\nCircularity: {0:8.6f}".format(self.circularity))

        self.irregularity = 1 - pi * (self.VOI_efect_diameter / VOI_surface)
        print("Irregularity: {0:8.6f}".format(self.irregularity))

        ####################################
        # Radial gradient analysis ref[9] white paper
        ###################################
        # Radial gradient analysis is based on examination of the angles between voxel-value gradients
        # and lines intersecting a single point near the center of the suspect lesion, lines in radial directions.
        # Radial gradient values are given by the dot product of the gradient direction and the radial direction.
        RGH_mean = []
        self.max_RGH_mean = 0
        self.max_RGH_mean_k = 0
        RGH_var = []
        self.max_RGH_var = 0
        self.max_RGH_var_k = 0
        H_norm_p = []

        # do subtraction of timepost-pre
        ####################
        for i in range(1, len(DICOMImages)):
            subtractedImage = Display().subImage(DICOMImages, i)
            [transformed_image, transform_cube
             ] = Display().dicomTransform(subtractedImage, image_pos_pat,
                                          image_ori_pat)

            for j in range(VOI_mesh.GetNumberOfPoints()):
                VOI_mesh.GetPoint(j, VOIPnt)

                r = array(VOIPnt)
                rc = array(self.lesion_centroid)
                norm_rdir = (r - rc) / linalg.norm(r - rc)

                # Find point for gradient vectors at the margin point
                pixId = transformed_image.FindPoint(VOIPnt[0], VOIPnt[1],
                                                    VOIPnt[2])
                sub_pt = [0, 0, 0]
                transformed_image.GetPoint(pixId, sub_pt)

                ijk = [0, 0, 0]
                pco = [0, 0, 0]

                grad_pt = [0, 0, 0]

                inorout = transformed_image.ComputeStructuredCoordinates(
                    sub_pt, ijk, pco)
                if (inorout == 0):
                    print "point outside data"
                else:
                    transformed_image.GetPointGradient(
                        ijk[0], ijk[1], ijk[2],
                        transformed_image.GetPointData().GetScalars(), grad_pt)

                #############
                # Compute vector in the direction gradient at margin point
                grad_marginpt = array([grad_pt])
                norm_grad_marginpt = grad_marginpt / linalg.norm(grad_marginpt)

                # Compute dot product (unit vector for dot product)
                p_dot = dot(norm_grad_marginpt, norm_rdir)
                norm_p_dot = np.abs(p_dot)[0]  #linalg.norm(p_dot)

                H_norm_p.append(norm_p_dot)

            # The histogram of radial gradient values quantifying the frequency of occurrence of the dot products in a given region of interest
            # radial gradient histogram. The hist() function now has a lot more options
            # first create a single histogram

            # the histogram of the data with histtype='step'
#            plt.figure()
#            nsamples, bins, patches = plt.hist(array(H_norm_p), 50, normed=1, histtype='bar',facecolor='blue', alpha=0.75)
#           n, min_max, mean_bins, var_bins, skew, kurt = stats.describe(nsamples)

            mean_bins = np.mean(H_norm_p)
            var_bins = np.var(H_norm_p)

            print("\n mean RGB: {0:8.6f}".format(mean_bins))
            print("variance RGB: {0:8.6f}".format(var_bins))

            # Append data
            RGH_mean.append(mean_bins)
            RGH_var.append(var_bins)

            # Find max of RGH Gradient
            if (RGH_mean[i - 1] > self.max_RGH_mean):
                self.max_RGH_mean = RGH_mean[i - 1]
                self.max_RGH_mean_k = i

            if (RGH_var[i - 1] > self.max_RGH_var):
                self.max_RGH_var = RGH_var[i - 1]
                self.max_RGH_var_k = i

            # add a line showing the expected distribution
            # create a histogram by providing the bin edges (unequally spaced)
            plt.xlabel('normalized dot product |R.G|')
            plt.ylabel('Probability')
            plt.title('radial gradient histogram')
            plt.grid(True)

        ################# Jacob's lesion margin sharpness
        #initializations
        VOI_outlinept_normal = [0, 0, 0]
        VOI_outlinept = [0, 0, 0]
        inpt = [0, 0, 0]
        outpt = [0, 0, 0]
        im_pts = [0, 0, 0]
        ijk_in = [0, 0, 0]
        ijk_out = [0, 0, 0]
        pco = [0, 0, 0]
        SIout_pixVal = []
        lastSIout_pixVal = []

        # get model_point_normals
        VOI_point_normals = vtk.vtkPolyDataNormals()
        VOI_point_normals.SetInputData(VOI_mesh)
        VOI_point_normals.SetComputePointNormals(1)
        VOI_point_normals.SetComputeCellNormals(0)
        VOI_point_normals.SplittingOff()
        VOI_point_normals.FlipNormalsOff()
        VOI_point_normals.ConsistencyOn()
        VOI_point_normals.Update()

        # Retrieve model normals
        VOI_normalsRetrieved = VOI_point_normals.GetOutput().GetPointData(
        ).GetNormals()
        VOI_n = VOI_normalsRetrieved.GetNumberOfTuples()

        # obtain vols of interest
        [transf_pre_dicomReader,
         transform_cube] = Display().dicomTransform(DICOMImages[0],
                                                    image_pos_pat,
                                                    image_ori_pat)
        [transf_last_dicomReader, transform_cube
         ] = Display().dicomTransform(DICOMImages[len(DICOMImages) - 1],
                                      image_pos_pat, image_ori_pat)

        num_margin = []
        den_margin = []

        for i in range(1, len(DICOMImages)):
            #initializations
            SIout_pixVal = []
            lastSIout_pixVal = []

            subtractedImage = Display().subImage(DICOMImages, i)
            [transf_sub_pre_dicomReader, transform_cube
             ] = Display().dicomTransform(subtractedImage, image_pos_pat,
                                          image_ori_pat)

            for k in range(VOI_n):
                VOI_outlinept_normal = VOI_normalsRetrieved.GetTuple3(k)
                VOI_mesh.GetPoint(k, VOI_outlinept)

                #   "d for radial lenght: %f" % d
                d = sqrt(spacing[0]**2 + spacing[1]**2 + spacing[2]**2)

                inpt[0] = VOI_outlinept[0] - VOI_outlinept_normal[0] * d
                inpt[1] = VOI_outlinept[1] - VOI_outlinept_normal[1] * d
                inpt[2] = VOI_outlinept[2] - VOI_outlinept_normal[2] * d

                outpt[0] = VOI_outlinept[0] + VOI_outlinept_normal[0] * d
                outpt[1] = VOI_outlinept[1] + VOI_outlinept_normal[1] * d
                outpt[2] = VOI_outlinept[2] + VOI_outlinept_normal[2] * d

                # get pre-contrast SIin to normalized RSIgroup [See equation 1] from paper
                prepixin = transf_pre_dicomReader.FindPoint(
                    inpt[0], inpt[1], inpt[2])
                transf_pre_dicomReader.GetPoint(prepixin, im_pts)
                transf_pre_dicomReader.ComputeStructuredCoordinates(
                    im_pts, ijk_in, pco)
                #print ijk_in

                # get pre-contrast SIout in 6-c-neighbordhood to normalized RSIgroup [See equation 1] from paper
                prepixout = transf_pre_dicomReader.FindPoint(
                    outpt[0], outpt[1], outpt[2])
                transf_pre_dicomReader.GetPoint(prepixout, im_pts)
                transf_pre_dicomReader.ComputeStructuredCoordinates(
                    im_pts, ijk_out, pco)
                #print ijk_out

                # get t-post SIin
                SIin_pixVal = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_in[0], ijk_in[1], ijk_in[2], 0)
                preSIin_pixVal = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_in[0], ijk_in[1], ijk_in[2], 0) + epsilon

                RSIin = SIin_pixVal / preSIin_pixVal
                ####

                # get t-post SIout  6-c-neighbordhood
                #cn1
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))
                #cn2
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))
                #cn3
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))
                #cn4
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))
                #cn5
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))
                #cn6
                SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) + epsilon
                SIout_pixVal.append(float(SIout / preSIout))

                RSIout = mean(SIout_pixVal)
                ###

                # get last-post SIout 6-c-neighbordhood
                #cn1
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))
                #cn2
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))
                #cn3
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))
                #cn4
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))
                #cn5
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))
                #cn6
                SIout = transf_last_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0)
                preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat(
                    ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) + epsilon
                lastSIout_pixVal.append(float(SIout / preSIout))

                # calculate
                RSIoutf = mean(lastSIout_pixVal)

                ### compute feature
                num_margin.append(RSIin - RSIout)
                den_margin.append(RSIin - RSIoutf)
                #print num_margin
                #print den_margin

                SIout_pixVal = []
                lastSIout_pixVal = []

        self.edge_sharp_mean = mean(array(num_margin).astype(float)) / mean(
            array(den_margin).astype(float))
        self.edge_sharp_std = std(array(num_margin).astype(float)) / std(
            array(den_margin).astype(float))
        print "\n edge_sharp_mean: "
        print self.edge_sharp_mean

        print "\n edge_sharp_std: "
        print self.edge_sharp_std

        ##################################################
        # organize into a dataframe
        self.morphologyFeatures = DataFrame(
            data=array([[
                mean(self.allmin_F_r_i),
                mean(self.allmax_F_r_i),
                mean(self.allmean_F_r_i),
                mean(self.allvar_F_r_i),
                mean(self.allskew_F_r_i),
                mean(self.allkurt_F_r_i), self.i_var_max, self.ii_var_min,
                self.iii_var_max, self.iii_var_max_k, self.ivVariance,
                self.circularity, self.irregularity, self.edge_sharp_mean,
                self.edge_sharp_std, self.max_RGH_mean, self.max_RGH_mean_k,
                self.max_RGH_var, self.max_RGH_var_k
            ]]),
            columns=[
                'min_F_r_i', 'max_F_r_i', 'mean_F_r_i', 'var_F_r_i',
                'skew_F_r_i', 'kurt_F_r_i', 'iMax_Variance_uptake',
                'iiMin_change_Variance_uptake', 'iiiMax_Margin_Gradient',
                'k_Max_Margin_Grad', 'ivVariance', 'circularity',
                'irregularity', 'edge_sharp_mean', 'edge_sharp_std',
                'max_RGH_mean', 'max_RGH_mean_k', 'max_RGH_var',
                'max_RGH_var_k'
            ])

        return self.morphologyFeatures
示例#49
0
from scipy import stats

# Generating a normal distribution sample
# with 100 elements
sample = np.random.randn(100)

# The harmonic mean: Sample values have to
# be greater than 0.
out = stats.hmean(sample[sample > 0])
print('Harmonic mean = ' + str(out))

# The mean, where values below -1 and above 1 are
# removed for the mean calculation
out = stats.tmean(sample, limits=(-1, 1))
print('\nTrimmed mean = ' + str(out))

# Calculating the skewness of the sample
out = stats.skew(sample)
print('\nSkewness = ' + str(out))

# Additionally, there is a handy summary function called
# describe, which gives a quick look at the data.
out = stats.describe(sample)
print('\nSize = ' + str(out[0]))
print('Min = ' + str(out[1][0]))
print('Max = ' + str(out[1][1]))
print('Mean = ' + str(out[2]))
print('Variance = ' + str(out[3]))
print('Skewness = ' + str(out[4]))
print('Kurtosis = ' + str(out[5]))
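
# The index-based access above works, but describe() actually returns a DescribeResult
# namedtuple, so the same values can also be read by field name:
print('Mean = ' + str(out.mean))
print('Variance = ' + str(out.variance))
print('Skewness = ' + str(out.skewness))
print('Kurtosis = ' + str(out.kurtosis))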
示例#50
0
 def five_figure_summary(self,col_position):
     statistics = stats.describe(self.array[1:,col_position].astype(np.float))
     return f"Five-figure stats of column {col_position}: {statistics}"
示例#51
0
        "./Outputs/avg_fare_info/1/model for fleet size 1500 surge 2fdemand 0.0perc_k 0pro_s 0 repl0.p",
        "rb",
    )
)

report = m.get_service_rate_per_zone()
report
report.LOS.describe()
print("total_demand = {}".format(report.total.sum()))

total_demand = 20000
system_LOS = report.served.sum() / total_demand
system_LOS
np.sum(m.operator.revenues)
drivers_fares = [np.sum(v.collected_fares) for v in m.vehilcs]
stats.describe(drivers_fares)

np.median(drivers_fares)

# print("vehicle utilization = {}".format(report.idle.sum()/(report.idle.sum() + report.incoming.sum())))


z = m.zones[10]


l = [z.id for z in m.zones]
l.index(236)

directory = "./Outputs/zone_demand_viz/"
if not os.path.exists(directory):
    os.makedirs(directory)
示例#52
0
    def __init__(self, samples):

        # Now compute the pedestals
        from scipy.stats import describe
        from numpy import floor

        # Did we receive any samples?
        if not len(samples):
            raise Exception("Did not receive any samples!")

        # Some board information
        self.board_id = samples[0].board_id

        # Use the first event to determine the list of channels
        self.chan_list = samples[0].channels.keys()

        # Sanity checks on the pedestal samples
        for sample in samples:

            if not isinstance(sample, event):
                raise Exception(
                    "Encountered non-event in list of samples.  Nonsense.")

            if not self.chan_list == sample.channels.keys():
                raise Exception(
                    "Provided sample events for pedestaling have inhomogeneous channel content.  This .... is ... U N A C C E P T A B L E E E E ---- U N A C C E P T A B L E E E E E E E"
                )
            if not sample.keep_offset:
                raise Exception(
                    "Refusing to build pedestals with torn data (ROI mode)")

        # Set up for pedestals
        self.mean = {}
        self.variance = {}

        # Gotta do it manually
        for chan_id in self.chan_list:
            caps = [[] for x in range(0, len(samples[0].channels[chan_id]))]
            self.mean[chan_id] = []
            self.variance[chan_id] = []

            for evt in samples:
                for capacitor, ampl in enumerate(evt.channels[chan_id]):
                    if not ampl is None:
                        caps[capacitor].append(ampl)

            # Now we have it in filtered capacitor form
            for cap in caps:
                # Cap is a list
                N = len(cap)
                if N == 0:
                    self.mean[chan_id].append(None)
                    self.variance[chan_id].append(None)
                elif N == 1:
                    self.mean[chan_id].append(cap)
                    self.variance[chan_id].append(None)
                else:
                    bs, bs, mean, variance, *bs = describe(cap)

                    #
                    # Because the numpy call returns numpy scalar types rather than plain Python numbers
                    #
                    # Note that we save the mean as an integer.
                    # This lets us do integer subtraction without conversion when removing pedestals
                    # directly from the data as it comes in.
                    #
                    # Since noise is a least 30 ADC counts, this changes nothing.
                    #
                    self.mean[chan_id].append(round(float(mean)))
                    self.variance[chan_id].append(float(variance))

        # Remove the samples because python can't pickle it
        del (self.chan_list)
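
# A hedged illustration, separate from the class above: the integer means stored in
# self.mean are meant to be subtracted directly from raw samples. With a hypothetical
# per-channel pedestal mapping and one event's raw capacitor amplitudes (None marking
# a missing capacitor), the correction is plain element-wise integer subtraction:
ped_mean = {7: [512, 498, 505]}
raw = {7: [530, None, 519]}
corrected = {ch: [a - ped_mean[ch][i] if a is not None else None
                  for i, a in enumerate(vals)]
             for ch, vals in raw.items()}
print(corrected)  # {7: [18, None, 14]}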
示例#53
0
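# The scraped snippet below starts mid-script; this preamble is a hedged reconstruction
# of what it assumes: imports, an empty Data list, and some model function `func` for
# curve_fit. The cubic form used here is only a guess, suggested by the cubic
# np.polyfit call near the end.
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import curve_fit

Data = []

def func(x, a, b, c, d):
    # hypothetical model; the original definition is not shown in the snippet
    return a * x**3 + b * x**2 + c * x + d
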
with open('SalaryData.csv', 'r') as csvfile:
    read = csv.reader(csvfile, delimiter=',', quotechar='|')
    f = 0
    for row in read:
        if f != 0:
            Data.append(row)
        f = f + 1
Exp = []
TExp = []
Sal = []
for i in Data:
    Exp.append(int(i[2]))
    TExp.append(int(i[4]))
    Sal.append(int(i[5]))
print("Stats of Experience: ")
print(stats.describe(Exp))
print("Stats of Total Experience: ")
print(stats.describe(TExp))
print("Stats of Salary: ")
print(stats.describe(Sal))
fig = plt.figure()
fig.suptitle('Experience V/S Salary', fontsize=14, fontweight='bold')
plt.scatter(Exp, Sal)
Sal = np.array(Sal)
Exp = np.array(Exp)
popt, pcov = curve_fit(func, Exp, Sal)
SSR = (sum((func(Exp, *popt) - Sal)**2) / Exp.size)
print(SSR)
plt.plot(np.unique(Exp), np.poly1d(np.polyfit(Exp, Sal, 3))(np.unique(Exp)))
#plt.plot(Exp, func(Exp, *popt))
plt.xlabel("Experience")
示例#54
0
print(zarr)

#fill with consecutive integers
cnt = 0
for i in np.arange(3):
    for j in np.arange(5):
        cnt += 1
        zarr[i, j] = cnt

print(zarr)

#read an external csv file into an array
phone = np.genfromtxt('c:/Java/phone-01.csv',delimiter=',')  #build the array from the text file
print(phone)
print(np.mean(phone[:,2]))  #print the mean of the screen-size column
print(np.median(phone[:,2]))  #median

print('total count : ', len(phone))

p_col3 = phone[:,2]
print(np.percentile(p_col3, 0))   #quartile boundary: minimum
print(np.percentile(p_col3, 25))  #first quartile
print(np.percentile(p_col3, 50))  #second quartile (median)
print(np.percentile(p_col3, 75))  #third quartile
print(np.percentile(p_col3, 100)) #quartile boundary: maximum

#scipy provides a describe function that computes
#several descriptive statistics in one call
from scipy.stats import describe
print(describe(phone))
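
# Note (not in the original): describe() summarizes a 2-D array column by column
# (axis=0 by default); pass a single column or axis=None to change that.
print(describe(phone[:, 2]))        # screen-size column only
print(describe(phone, axis=None))   # all values pooled together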
示例#55
0
from scipy import stats
import numpy as np

s = np.genfromtxt('Brrrr.log', dtype='float')
#print(s)
s2 = np.array(s)

x2, x1, b = s2[:, 0], s2[:, 1], s2[:, 2]
'''
print(x2)
print()
print(b)
'''
print(stats.describe(x2))
print()
print(stats.describe(x1))
print()
print(stats.describe(b))

print()
from math import sqrt
a = -.00256515 + sqrt(.00069877876)
b = 3.805212 + sqrt(1338.377)
c = -242.17783 + sqrt(160203168.52084109)
print("a:", a)
print("b:", b)
print("c:", c)
示例#56
0
def main():

    #Path to config file
    _, config_path = deepcopy(sys.argv)
    with open(config_path, "r") as f:
        config = json.load(f)
        data_dir = config["data_dir"]
        if not os.path.isdir(data_dir):
            os.mkdir(data_dir)
        specs = config["specifications"]

    with open(os.path.join(data_dir, "config_copy.json"), 'w',
              newline='') as f:
        f.write(json.dumps(config))
    vol_dat = []
    run_num = 0
    for run_vals in specs:
        concentrations = run_vals["concentrations"]
        sample_volume = run_vals["sample_volume"]
        sample_diffusive_const = run_vals["sample_diffusive_const"]
        num_timesteps = run_vals["Number of timesteps"]
        molecular_radius = run_vals["Molecular Radius"]
        min_droplet_vol = run_vals["Min Droplet Volume"]
        num_droplets = []

        means = []
        for c in concentrations:
            print("Started run for concentration " + str(c) + "uM", flush=True)
            sample = Sample(sample_volume, sample_diffusive_const, c,
                            molecular_radius, min_droplet_vol)
            aggs = sample.simulate(num_timesteps)
            volumes = []
            for agg in aggs:
                if agg.is_droplet():
                    volumes.append(agg.volume())
            vol_dat += [volumes]
            if len(volumes) == 0:
                num_droplets.append(0)
                means.append(0)
            else:
                nobs, minmax, mean, variance, skewness, kurtosis = sp.describe(
                    volumes)
                num_droplets.append(nobs)
                means.append(mean)
            print("Finished run for concentration " + str(c) + "uM",
                  flush=True)
        run_num += 1

        run_dir = os.path.join(data_dir, "run" + str(run_num))
        os.mkdir(run_dir)
        with open(os.path.join(run_dir, "volumes_dat.csv"), 'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerows(vol_dat)  # note: vol_dat accumulates across runs, so this file also contains earlier runs

        plt.rc('font', family='serif')
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(1, 1, 1)
        for item in (ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(20)
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]):
            item.set_fontsize(30)
        ax.plot(concentrations, num_droplets, "sb:")
        ax.set_title("Number of Droplets")
        ax.set_xlabel("Concentration (uM)")
        ax.set_ylabel("Number of Droplets")
        plt.savefig(os.path.join(run_dir, 'num_droplets.png'),
                    bbox_inches='tight')

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(1, 1, 1)
        for item in (ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(20)
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]):
            item.set_fontsize(30)
        ax.plot(concentrations, means, "sb:")
        ax.set_title("Mean Droplet Volume")
        ax.set_xlabel("Concentration (uM)")
        ax.set_ylabel("Mean Droplet Volume (um^3)")
        plt.savefig(os.path.join(run_dir, 'mean_droplet_volume.png'),
                    bbox_inches='tight')
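
# A minimal sketch of driving the script above (assuming the imports and the Sample
# class this snippet relies on are available): the JSON config path is read from argv.
if __name__ == "__main__":
    main()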
示例#57
0
    # Handle errors while reading the year; this also skips the file header.
    try:
        invoice_year = time.strptime(line_items[4], '%m/%d/%y %H:%M').tm_year

    except ValueError:
        continue

    # Ignore purchases that did not happen in 2011.
    if invoice_year != 2011:
        continue

    # Fill the data structures with the parsed information.
    # Since we count distinct products, store the product codes in a set.
    user_product_dic.setdefault(user_code, set())
    user_product_dic[user_code].add(product_id)

    product_user_dic.setdefault(product_id, set())
    product_user_dic[product_id].add(user_code)

    product_id_name_dic[product_id] = product_name

# With the data structures filled, build a list of how many distinct products each user bought.
product_per_user_li = [len(x) for x in user_product_dic.values()]

# Print the final number of users and products used in this chapter.
print('# of users:', len(user_product_dic))
print('# of products:', len(product_user_dic))

# Print basic statistics on the number of distinct products purchased per user.
print(stats.describe(product_per_user_li))
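
# A minimal sketch (assuming product_per_user_li built as above): quartiles of the
# products-per-user counts, complementing the stats.describe summary.
import numpy as np
print(np.percentile(product_per_user_li, [0, 25, 50, 75, 100]))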
示例#58
0
def fxHighStressTest(port0, baudRate, port1 = "", commandFreq = 1000,
		positionAmplitude = 10000, currentAmplitude = 2500,
		positionFreq = 1, currentFreq = 5, currentAsymmetricG = 1.25,
		numberOfLoops = 720):
	global times		# Elapsed time since start of run
	global currentRequests
	global positionRequests
	global readDeviceTimes	# Timing data for fxReadDevice()
	global sendMotorTimes	# Timing data for fxSendMotorCommand
	global setGainsTimes	# Timing data for fxSetGains()
	global cycleStopTimes
	global data0		# Contains state of ActPack0 
	global data1 		# Contains state of ActPack1

	########### One vs two devices ############
	secondDevice = False
	if (port1 != ""):
		secondDevice = True

	if (secondDevice):
		print("Running high stress test with two devices")
	else:
		print("Running high stress test with one device")

	########### Debug & Data Logging ############
	debugLoggingLevel = 6 # 6 is least verbose, 0 is most verbose
	dataLog = False # When True, device data is logged to a file

	delay_time = float(1/(float(commandFreq)))
	print('Delay time: ', delay_time)

	########### Open the device(s) and start streaming ############
	devId0 = fxOpen(port0, baudRate, debugLoggingLevel) 
	fxStartStreaming(devId0, commandFreq, dataLog)
	print('Connected to device with Id:', devId0)

	devId1 = -1
	if (secondDevice):
		print('Port: ', port1)
		print('BaudRate: ', baudRate)
		print('debugLoggingLevel: ', debugLoggingLevel)
		devId1 = fxOpen(port1, baudRate, debugLoggingLevel)
		fxStartStreaming(devId1, commandFreq, dataLog)
		print('Connected to device with Id:', devId1)

	############# Main Code ############
	######## Make your changes here #########

	# Get initial position:
	print('Reading initial position...')

	# Give the device time to consume the startStreaming command and start streaming
	sleep(0.1)

	data0 = fxReadDevice(devId0)
	initialPos0 = data0.encoderAngle	# May be used to offset subsequent readings
	print("Initial position 0:", initialPos0)

	initialPos1 = 0
	if (secondDevice):
		data1 = fxReadDevice(devId1)
		initialPos1 = data1.encoderAngle
		print("Initial position 1:", initialPos1)

	# Generate control profiles
	print('Command table #1 - Position Sine:')
	positionSamples = sinGenerator(positionAmplitude, positionFreq, commandFreq)
	print(np.int64(positionSamples))
	print('Command table #2 - Current Sine:')
	currentSamples = sinGenerator(currentAmplitude, currentFreq, commandFreq)
	print("number of samples is: ", len(currentSamples))
	print(np.int64(currentSamples))
	print('Command table #3 - Current Sine:')
	currentSamplesLine = lineGenerator(0, 0.15, commandFreq)
	#print(np.int64(currentSamplesLine))

	# Initialize lists
	# cycleStopTimes = []

	try:
		t0 = time()	# Record start time of experiment
		i = 0
		for reps in range(0, numberOfLoops):

			print("")
			print("Rep #", reps+1,"out of",numberOfLoops)
			print("-------------------")

			# Step 0: set position controller
			# -------------------------------
			print("Step 0: set position controller")

			sleep(delay_time)	# Important in loop 2+
			if (i):	# Second or later iterations in loop
				# setPositionCtrl(  devId0, devId1, secondDevice, data0.encoderAngle, initialPos1)
				sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1,
					current0=0, current1=0, motorCmd=FxPosition,
					position0=data0.encoderAngle, position1=initialPos1,
					posReq=0, setGains=True)
				# ToDo: data1.encoderAngle
			else:	# First loop iteration
				# setPositionCtrl(  devId0, devId1, secondDevice, initialPos0, initialPos1)
				sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0=0, initialPos1=0,
					current0=0, current1=0, motorCmd=FxPosition,
					position0=initialPos0, position1=initialPos1, posReq=0, setGains=True)

			# Step 1: go to initial position
			# -------------------------------
			if (i):	# Second or later iterations in loop
				print("Step 1: go to initial position")
				linSamples = linearInterp(data0.encoderAngle-initialPos0, 0, 100)
				#print(np.int64(linSamples))

				for sample in linSamples:

					sleep(delay_time)
					sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1,
						current0=0, current1=0, motorCmd=FxPosition,
						position0=sample+initialPos0, position1=sample+initialPos1,
						posReq=sample, setGains=False)
					"""
					# set controller to the next sample
					# read ActPack data
					tstart = time()
					data0  = fxReadDevice(devId0)
					tstop  = time()
					readDeviceTimes.append(tstop - tstart)
					if (secondDevice):
						data1 = fxReadDevice(devId1)

					# Position setpoint:
					tstart = time()
					fxSendMotorCommand(devId0, FxPosition, sample + initialPos0)
					tstop = time()
					sendMotorTimes.append(tstop - tstart)

					currentMeasurements0.append(data0.motorCurrent)
					positionMeasurements0.append(data0.encoderAngle - initialPos0)
					if (secondDevice):
						fxSendMotorCommand(devId1, FxPosition, sample + initialPos1)
						currentMeasurements1.append(data1.motorCurrent)
						positionMeasurements1.append(data1.encoderAngle-initialPos1)

					times.append(time() - t0)
					currentRequests.append(0)
					positionRequests.append(sample)	# BAB: sample+initialPos0 ???
					"""
					i = i + 1
			else:	# First time in loop
				print("Step 1: skipped, first round")

			# Step 2: position sine wave
			# --------------------------
			print("Step 2: track position sine wave")

			for sample in positionSamples:

				sleep(delay_time)
				sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1,
					current0=0, current1=0, motorCmd=FxPosition,
					position0=sample+initialPos0, position1=sample+initialPos1,
					posReq=0, setGains=False)

				"""
				# set controller to the next sample
				# read ActPack data
				tstart = time()
				data0 = fxReadDevice(devId0)
				tstop = time()
				readDeviceTimes.append(tstop - tstart)
				if (secondDevice):
					data1 = fxReadDevice(devId1)

				# Position setpoint:
				tstart = time()
				fxSendMotorCommand(devId0, FxPosition, sample + initialPos0)
				tstop = time()
				sendMotorTimes.append(tstop - tstart)

				currentMeasurements0.append(data0.motorCurrent)
				positionMeasurements0.append(data0.encoderAngle - initialPos0)
				if (secondDevice):
					fxSendMotorCommand(devId1, FxPosition, sample + initialPos1)
					currentMeasurements1.append(data1.motorCurrent)
					positionMeasurements1.append(data1.encoderAngle - initialPos1)

				times.append(time() - t0)
				currentRequests.append(0)
				positionRequests.append(sample)	# BAB: sample+initialPos0 ???
				"""
				i = i + 1

			# Step 3: set current controller
			# -------------------------------
			print("Step 3: set current controller")
			# setCurrentCtrl(   devId0, devId1, secondDevice, 0, 0)
			sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1,
				current0=0, current1=0, motorCmd=FxCurrent,
				position0=0, position1=0, posReq=0, setGains=True)


			# Step 4: current setpoint
			# --------------------------
			print("Step 4: track current sine wave")
			for sample in currentSamples:

				sleep(delay_time)
				# We use more current on the "way back" to come back closer to
				# the starting point
				if(sample <= 0):	#No change
					compensatedSample = sample
				else:			#Apply gain
					compensatedSample = np.int64(currentAsymmetricG * sample)

				sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1,
					current0=compensatedSample, current1=compensatedSample,
					motorCmd=FxCurrent, position0=0, position1=0, posReq=0, setGains=False)

				# set controller to the next sample
				# read ActPack data
				"""
				tstart = time()
				data0 = fxReadDevice(devId0)
				tstop = time()
				readDeviceTimes.append(tstop - tstart)
				if (secondDevice):
					data1 = fxReadDevice(devId1)

				# Position setpoint:
				tstart = time()
				fxSendMotorCommand(devId0, FxCurrent, compensatedSample)
				tstop = time()
				sendMotorTimes.append(tstop - tstart)

				currentMeasurements0.append(data0.motorCurrent)
				positionMeasurements0.append(data0.encoderAngle - initialPos0)
				if (secondDevice):
					fxSendMotorCommand(devId1, FxCurrent, compensatedSample)
					currentMeasurements1.append(data1.motorCurrent)
					positionMeasurements1.append(data1.encoderAngle - initialPos1)

				times.append(time() - t0)
				currentRequests.append(compensatedSample)
				positionRequests.append(0)
				"""
				i = i + 1
				
			# Step 5: short pause at 0 current to allow a slow-down
			# -----------------------------------------------------
			print("Step 5: motor slow-down, zero current")

			for sample in currentSamplesLine:

				sleep(delay_time)
				sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1,
				 	current0=sample, current1=sample, motorCmd=FxCurrent,
				 	position0=0, position1=0, posReq=0, setGains=False)

				"""
				# set controller to the next sample
				# read ActPack data
				tstart = time()
				data0 = fxReadDevice(devId0)
				tstop = time()
				readDeviceTimes.append(tstop - tstart)
				if (secondDevice):
					data1 = fxReadDevice(devId1)

				# Position setpoint:
				tstart = time()
				fxSendMotorCommand(devId0, FxCurrent, sample)
				tstop = time()
				sendMotorTimes.append(tstop - tstart)

				currentMeasurements0.append(data0.motorCurrent)
				positionMeasurements0.append(data0.encoderAngle - initialPos0)
				if (secondDevice):
					fxSendMotorCommand(devId1, FxCurrent, sample)
					currentMeasurements1.append(data1.motorCurrent)
					positionMeasurements1.append(data1.encoderAngle - initialPos1)

				times.append(time() - t0)
				currentRequests.append(sample)
				positionRequests.append(0)
				"""
				i = i + 1

			# We'll draw a line at the end of every period
			cycleStopTimes.append(time() - t0)
			elapsed_time = time() - t0
	except KeyboardInterrupt:
		print ('Keypress detected.  Exiting gracefully ...')

	fxClose(devId0)
	fxClose(devId1)

	######## Stats: #########
	print("")
	print("Final Stats:")
	print("------------")
	actual_period = cycleStopTimes[0]
	command_frequency = i / elapsed_time
	print("Number of commands sent: " + str(i))
	print("Total time (s): " + str(elapsed_time))
	print("Requested command frequency: "+"{:.2f}".format(commandFreq))
	print("Actual command frequency (Hz): "+"{:.2f}".format(command_frequency))
	print("")
	print('currentSamplesLine: ',		len(currentSamplesLine))
	print('size(times)',			len(times))
	print('size(currentRequests): ',	len(currentRequests))
	print('size(currentMeasurements0): ',	len(currentMeasurements0))
	print('size(setGainsTimes): ',		len(setGainsTimes))
	print('')

	######## Summary stats about individual arrays: #########
	print('\n\ntimes: ',			stats.describe(times))
	print('\n\ncurrentRequests: ',		stats.describe(currentRequests))
	print('\n\ncurrentMeasurements0: ',	stats.describe(currentMeasurements0))
	# print('\n\ncurrentMeasurements1: ',	stats.describe(currentMeasurements1))
	print('\n\npositionRequests: ',		stats.describe(positionRequests))
	print('\n\npositionMeasurements0: ',	stats.describe(positionMeasurements0))
	# print('\n\npositionMeasurements1: ',	stats.describe(positionMeasurements1))
	print('\n\nreadDeviceTimes: ',		stats.describe(readDeviceTimes))
	print('\n\nsendMotorTimes: ',		stats.describe(sendMotorTimes))
	print('\n\nsetGainsTimes: ',		stats.describe(setGainsTimes))


	######## End of Main Code #########

	######## Plotting Code, you can edit this ##################

	###### Begin Create unique data filename and save desired and measured values
	now = datetime.now().strftime("%Y-%m-%d_%H-%M")
	data_fn = 'log/' + now + '_Current.csv'
	print('Do create Current  data file ['+ data_fn + ']')
	# NON-PYTHONIC, but efficient write to file:
	# with open(data_fn, 'w') as df:
	# 	for i in range(len(currentRequests)):
	# 		df.write(str(times[i]) + ',' + str(currentRequests[i]) + ','
	# 			+ str(currentMeasurements0[i]) + '\n')

	data_fn = 'log/' + now + '_Position.csv'
	print('Do create Position data file ['+ data_fn + ']')
	# with open(data_fn, 'w') as df:
	# 	for i in range(len(positionRequests)):
	# 		df.write(str(times[i]) + ',' + str(positionRequests[i]) + ','
	# 			+ str(positionMeasurements0[i]) + '\n')
	###### End Create unique data filename and save desired and measured values

	# Current Plot:
	plt.figure(1)
	title = "Motor Current"
	plt.plot(times, currentRequests, color = 'b', label = 'desired current')
	plt.plot(times, currentMeasurements0, color = 'r', label = 'measured current')
	plt.xlabel("Time (s)")
	plt.ylabel("Motor current (mA)")
	plt.title(title)
	plt.legend(loc='upper right')

	# Draw a vertical line at the end of each cycle
	for endpoints in cycleStopTimes:
		plt.axvline(x=endpoints)

	# Position Plot:
	plt.figure(2)
	title = "Motor Position"
	plt.plot(times, positionRequests, color = 'b', label = 'desired position')
	plt.plot(times, positionMeasurements0, color = 'r', label = 'measured position')
	plt.xlabel("Time (s)")
	plt.ylabel("Encoder position")
	plt.title(title)
	plt.legend(loc='upper right')

	# Draw a vertical line at the end of each cycle
	for endpoints in cycleStopTimes:
		plt.axvline(x=endpoints)

	plt.figure(3)
	# Convert command times into millisec
	sendMotorTimes = [i * 1000 for i in sendMotorTimes]
	plt.plot(times, sendMotorTimes, color='b', label='Send Motor Times')
	plt.xlabel("Time (ms)")
	plt.ylabel("Command Time (ms)")
	plt.title("Send Motor Times")
	plt.legend(loc='upper right')

	plt.figure(4)
	plt.yscale('log')
	plt.hist(sendMotorTimes, bins=100, label = 'Send Motor Times')
	plt.yscale('log')
	plt.xlabel("Time (ms)")
	plt.ylabel("Occurrences")
	plt.title("Send Motor Commands")
	plt.legend(loc='upper right')

	plt.figure(5)
	# Convert command times into millisec
	readDeviceTimes = [i * 1000 for i in readDeviceTimes]
	plt.plot(times, readDeviceTimes, color='b', label='Read Device Times')
	plt.xlabel("Time (ms)")
	plt.ylabel("Command Time (ms)")
	plt.title("Read Device Commands")
	plt.legend(loc='upper right')

	plt.figure(6)
	plt.yscale('log')
	plt.hist(readDeviceTimes, bins=100, label = 'Read Device Times')
	plt.yscale('log')
	plt.xlabel("Time (ms)")
	plt.ylabel("Occurrences")
	plt.title("Read Device Commands")
	plt.legend(loc='upper right')

	plt.figure(7)
	# Convert command times into millisec
	setGainsTimes = [i * 1000 for i in setGainsTimes]
	plt.plot(times, setGainsTimes, color='b', label='Set Gains Times')
	plt.xlabel("Time (ms)")
	plt.ylabel("Command Time (ms)")
	plt.title("Set Gains Commands")
	plt.legend(loc='upper right')

	plt.figure(8)
	plt.yscale('log')
	# Remove 0 values in histogram
	setGainsTimes = [i for i in setGainsTimes if i > 0]
	plt.hist(setGainsTimes, bins=100, label = 'Set Gains Times')
	plt.yscale('log')
	plt.xlabel("Time (ms)")
	plt.ylabel("Occurrences")
	plt.title("Set Gains Commands")
	plt.legend(loc='upper right')


	# #######
	# *** ToDo: add plotting for 2nd device here ***
	# #######
	
	plt.show()
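
# A minimal usage sketch; the port name, baud rate, and reduced loop count below are
# placeholders (not values from the original script), and the module-level lists this
# function appends to (times, currentRequests, ...) are assumed to be initialized
# elsewhere in the original file.
if __name__ == "__main__":
	fxHighStressTest("COM3", 230400, commandFreq=500, numberOfLoops=10)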
示例#59
0
dset_sbert_data = dset_sbert.remove_columns(['dataset', 'identifier', 'length', 'text'])
dset_sbert_tfidf = concatenate_datasets([dsets_tokenized, dset_sbert_data], axis=1)


import torch
def mean_sbert(x):
    t = torch.tensor(x["sbert_top_128"])
    t16 = t[:, :16].mean(1)
    t128 = t.mean(1)
    return dict(sbert_top_16_avg=t16.tolist(), sbert_top_128_avg=t128.tolist())


dset_sbert_tfidf = dset_sbert_tfidf.map(mean_sbert,  batched=True, batch_size=4096)


dset_sbert_tfidf.save_to_disk("/home/ahemf/processed/dsets_448_sbert_tfidf")
sbert_tfidf = dset_sbert_tfidf.remove_columns(['sbert', 'sbert_top_128'])
sbert_tfidf.save_to_disk("/home/ahemf/processed/sbert_tfidf")

sum(sbert_tfidf["length"]) / 1_000_000_000 == 9.504
sum(sbert_tfidf["length"]) == 9504256152

from scipy.stats import describe
describe(sbert_tfidf["perplexity"])  # DescribeResult(nobs=8252138, minmax=(1.0, 2666.609619140625), mean=50.22365915303506, variance=378.6735535134215, skewness=5.66997263818954, kurtosis=204.6866296365421)
describe(np.log1p(sbert_tfidf["perplexity"]))  # DescribeResult(nobs=8252138, minmax=(0.6931471805599453, 7.888938076667314), mean=3.8780924745662606, variance=0.11594320687589393, skewness=-0.2525622031627243, kurtosis=3.5463577790908074)
describe(sbert_tfidf["sbert_top_128_avg"])  # DescribeResult(nobs=8252138, minmax=(0.23806016147136688, 0.9999999403953552), mean=0.48508123392669744, variance=0.00398569604138435, skewness=1.0084712141737993, kurtosis=3.5581830353889874)
import pandas as pd
pd.Series(sbert_tfidf["sbert_top_128_avg"]).describe()


from scipy import optimize
optimize.minimize(f, x0=0)  # minimize a scalar function f starting from x0

from scipy import integrate
res, err = integrate.quad(f, 0, np.inf)  # numerically integrate f from 0 to infinity

from scipy import interpolate
interpolate.interp1d(x, y, kind='quadratic', fill_value='extrapolate')  # quadratic interpolator through (x, y)

from scipy import stats

stats.norm.rvs(size=10) # normally distributed sample
stats.t.rvs(10, size=100)
stats.norm.pdf(x), stats.norm.cdf(x) # probability density and cumulative distribution functions

stats.describe(x) # calculate statistics for sample

np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1, loc=squared_errors.mean(), scale=stats.sem(squared_errors))) # calculate confidence interval for the test RMSE
# this is equivalent to (with m = len(squared_errors) and mean = squared_errors.mean()):
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt) # helps to check if distribution is normal
plt.show()

### STATSMODELS

import statsmodels.api as sm