Example #1
 def __init__(self, trainframe, classifier):
     '''abstract class for prediction.
     Parameters: 
         `trainframe`: pandas.DataFrame
             Labeled data. Note that to conserve space this frame will be altered
             IN PLACE and should not be reused!
         `classifier`: scikit-learn classifier
             must support predict_proba 
             '''
     print 'Data example: '
     print trainframe[0:10]
     print 'Outcomes overall:'
     print pd.value_counts(trainframe['OutcomeType'].values, sort=False)
     
     self.trainframe = trainframe
     self.classifier = classifier
     
     self.y = trainframe['OutcomeType'].copy()
     self.length = trainframe.shape[0]
     for label in 'AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype':
         try: 
             trainframe.drop(label, 1, inplace=True)
         except KeyError:
             pass
     self.clean_train_data()
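
The docstring above only requires that classifier expose predict_proba. A minimal, hedged usage sketch (the concrete predictor subclass name and CSV path are hypothetical; RandomForestClassifier is one scikit-learn classifier that satisfies the contract):

# Sketch only: the names below are illustrative, not taken from the original project.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # supports predict_proba

train = pd.read_csv('train.csv')                      # labeled outcome data
clf = RandomForestClassifier(n_estimators=100)
predictor = SomeConcretePredictor(train, clf)         # hypothetical concrete subclass of the abstract class above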
Example #2
def draw(domain_length):
    """
    绘制柱装图
    :param domain_length:
    """
    x_label = []
    x = pd.value_counts(pd.Series(domain_length)).index[:25]
    y = pd.value_counts(pd.Series(domain_length)).values[:25]/1000.0
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))
    fig = plt.figure()
    fig.add_subplot(111)
    plt.bar(x,y,align='center')
    x_min,x_max = x.min(), x.max()
    y_min,y_max = y.min(), y.max()
    plt.xlabel(u'顶级域名')
    plt.ylabel(u'域名个数(K)')
    plt.xlim(x_min-1, x_max+1)
    plt.ylim(y_min, y_max+10)
    plt.xticks(x,x_label,rotation=50)
    # plt.grid(axis='y')
    plt.subplots_adjust(top=0.95,bottom=0.15,left=0.08,right=0.97)
    plt.savefig(u"各个顶级域名含有的域名数量",dpi=140)
    plt.show()
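
A minimal usage sketch for draw, assuming domain_length is a flat list of top-level-domain strings (the sample values below are illustrative):

# Illustrative call only; the TLD values are made up.
domain_length = ['com', 'net', 'com', 'org', 'cn', 'com', 'net', 'org', 'com']
draw(domain_length)   # bars show the 25 most frequent TLDs, counts in thousands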
Example #3
def slide_14():
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]

    cats = pd.cut(ages, bins)
    print cats

    # use codes instead of labels
    # print cats.labels
    print cats.codes
    # print cats.levels
    # use categories instead of levels
    print cats.categories
    print pd.value_counts(cats)

    print pd.cut(ages, [18, 26, 36, 61, 100], right=False)

    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    print pd.cut(ages, bins, labels=group_names)

    data = np.random.rand(20)
    print data
    print pd.cut(data, 3, precision=2)

    data = np.random.randn(1000)
    cats = pd.qcut(data, 3)
    print cats
    print pd.value_counts(cats)
    print pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
Example #4
def statsFromRun(feat,DL,RW):
    left=pd.Series()
    light=pd.Series()
    next_waypoint=pd.Series()
    oncoming=pd.Series()
    right=pd.Series()
    for f in feat:
        left= left.add(pd.value_counts(f.left.ravel()), fill_value=0)
        light= light.add(pd.value_counts(f.light.ravel()), fill_value=0)
        next_waypoint= next_waypoint.add(pd.value_counts(f.next_waypoint.ravel()), fill_value=0)
        oncoming= oncoming.add(pd.value_counts(f.oncoming.ravel()), fill_value=0)
        right= right.add(pd.value_counts(f.right.ravel()), fill_value=0)

    fig, axes = plt.subplots(nrows=2, ncols=3,figsize=(14,6))
    fig.suptitle( "Runs:{}".format(len(feat)))

    left.plot(kind='bar', title="Left",ax=axes[0,0])
    light.plot(kind='bar', title="light",ax=axes[0,1])
    next_waypoint.plot(kind='bar', title="next_waypoint",ax=axes[0,2])
    oncoming.plot(kind='bar', title="oncoming",ax=axes[1,0])
    right.plot(kind='bar', title="right",ax=axes[1,2])
    axes[1,1].plot(DL,label="Deadlines")
    axes[1,1].plot(RW,label="Rewards")
    avgDist=3
    axes[1,1].plot(     #add a line to the graph representing the avg of all point within avgDist of the current run. 
        [(np.mean(DL[i-avgDist:i+avgDist])+np.mean(RW[i-avgDist:i+avgDist]))/2 for i in range(len(DL))],
        label="Avg {:2.2f}".format( # use the last half avg in the label
            (np.mean(DL[len(DL)/2:len(DL)])+np.mean(RW[len(DL)/2:len(DL)]))/2)) 
    #axes[1,1].xlabel('Run')
    axes[1,1].legend(loc=2)
    #axes[1,1].title("Deadline and Rewards per Run")
    
    plt.show()
    plt.close()
Example #5
def read_file(filename):
    for df in pd.read_csv(filename, index_col='id',keep_default_na=False, na_values=[""], chunksize=50000):
        for ds in df:
            factor = pd.cut(df[ds], 10)
            print pd.value_counts(factor)
            break
        break
Example #6
def feature_userBased(filename):

    df = pd.read_csv(filename, sep=',', header=0)
    csv_file = open(filename[0:-4]+'_user.csv', 'w')
    a = csv.writer(csv_file, delimiter=',')
    for user_id, group in df.groupby('user_id'):
        dict =  pd.value_counts(group.behavior_type, sort=False)
        num_skim = 0
        num_collect = 0
        num_cart = 0
        num_buy = 0
        if (1 in dict):
            num_skim = int(dict[1])               # number of page views
        if (2 in dict):
            num_collect = int(dict[2])            # number of favorites
        if 3 in dict:
            num_cart = int(dict[3])               # number of add-to-cart actions
        if 4 in dict:
            num_buy = int(dict[4])                # number of purchases

        conversion_rate = 0
        conversion_rate = (num_buy*1.0)/dict.sum()
        conversion_rate = float('%.4f'% conversion_rate)


        buy_group = group[group['behavior_type']==4]
        num_item_buy = (pd.value_counts(buy_group.item_id)).shape[0]   # number of distinct items this user bought
        item = [user_id, num_skim, num_collect, num_cart, num_buy, conversion_rate, num_item_buy]
        a.writerow(item)
Example #7
File: main.py Project: zjgtan/zh
def kftest(df, column, label, tag):
    df_tmp = df.loc[:, [column, label]]
    df_tmp = df_tmp.dropna()
    
    col = dict(pd.value_counts(df_tmp[column]))
    lab = dict(pd.value_counts(df_tmp[label]))
    f_obs = []
    f_exp = []

    obs_d = {}
    for i in col:
        for j in lab:
            obs = sum([1 \
                    if df_tmp.iloc[k][column] == i and df_tmp.iloc[k][label] == j \
                    else 0 for k in range(len(df_tmp))])

            obs_d.setdefault(j, {})
            obs_d[j][i] = obs

            f_obs.append(obs)

            f_exp.append(1. * lab[j] / (sum(lab.values())) * col[i])

    statics, p_value = chisquare(f_obs, f_exp, ddof=len(f_obs) - 2)

    str1 = "%d(%f),%d(%f),%d(%f),%f,%f" % (col[tag], col[tag] * 1. / sum(col.values()), 
            obs_d[0][tag],
            1. * obs_d[0][tag] / sum(obs_d[0].values()),
            obs_d[1][tag],
            1. * obs_d[1][tag] / sum(obs_d[1].values()),
            statics,
            p_value)
    return str1
Example #8
def dist_by_group(grp, predictor):
    df = pd.concat([grp, predictor], axis=1) 
    colnames = df.columns.values
    grouped = df.groupby(colnames[0])
    agg_df = grouped.apply(lambda x: pd.value_counts(x.iloc[:,1])/sum(pd.value_counts(x.iloc[:,1])))
    agg_df = agg_df.unstack()
    return agg_df
def discretize_bins_quantiles(df,col_name,number_of_bins, verbose = False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.qcut(df[col_name],number_of_bins, labels = False)
    
    if verbose:
        print pd.value_counts(df[new_col])
        
    return new_col
def discretize_bins_values(df,col_name, bins, verbose = False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.cut(df[col_name], bins = bins, include_lowest = True, labels = False)
    
    if verbose:
        print pd.value_counts(df[new_col])
        
    return new_col
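
A minimal usage sketch for the two helpers above, assuming a DataFrame with a numeric column (the column name, bin edges, and values are illustrative):

# Illustrative data only.
import pandas as pd

frame = pd.DataFrame({'income': [12, 35, 7, 90, 54, 23, 61, 48]})
q_col = discretize_bins_quantiles(frame, 'income', number_of_bins=4)
v_col = discretize_bins_values(frame, 'income', bins=[0, 25, 50, 100])
print frame[[q_col, v_col]]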
Example #11
def run_test2():

    orig_animals = ['cat', 'dog', 'mouse']
    animals = orig_animals * 3

    raw_data = { 'animal' : animals,
                'score' : get_rand_num_array(len(animals))
    }

    # make DataFrame
    #
    df = pd.DataFrame(raw_data, columns = ['animal', 'score'])

    print '-' * 10
    print df
    print '-' * 10
    #return

    # Create array for bins
    #
    bins = get_bin_list(step=20, low_num=0, high_num=100)

    # For each score assign it to a bin
    #
    labels = pd.cut(df['score'], bins)

    # Same as above but adding the bin value as a column to the DataFrame
    #
    df['bin_label'] = pd.cut(df['score'], bins)
    print type(df)
    print df.describe()
    print '-' * 10

    from collections import Counter
    c = Counter(df['bin_label'])
    print '-' * 10
    print c

    vcounts = pd.value_counts(df['bin_label'])
    print vcounts
    #print 'by_bin', by_bin
    print '-' * 10
    vcounts = df['bin_label'].value_counts()
    d = vcounts.to_dict()
    keys = d.keys()
    keys.sort()
    for k in keys:
        print k, d[k], type(k)

    return
    # Show the count in each bin
    #
    vc_series = pd.value_counts(df['bin_label'])
    print '\n', 'vc_series', vc_series
    print '-' * 10

    print vc_series.axes
    import ipdb; ipdb.set_trace()
Example #12
def cont_var_to_disc(df, column_name, max_value, number_of_bins):
	'''function that can discretize a continuous variable'''
	df[column_name] = df[column_name].apply(lambda x: cap_values(x, max_value))
	variable_name = column_name + "_bins"
	df[variable_name] = pd.cut(df[column_name], bins=number_of_bins, labels=False)
	print pd.value_counts(df[variable_name])
	#This is useful if you use all of the features in the model, but not if you specify features explicitly
	#df.drop(column_name, axis=1, inplace=True)
	return df
Example #13
def pd_01():
    obj=Series(['c','a','d','a','a','b','c'])
    uniques=obj.unique()
    print uniques
    uniques.sort()  # ndarray.sort() sorts in place and returns None
    print uniques
    print pd.value_counts(obj,sort=False)
    mask=obj.isin(['b','c'])
    print mask
    print obj[mask]
Example #14
def queryVolatile(sym,startdate,dbconn):
    
    df=stockeod.getAllDataFrame(sym,startdate,dbconn)
    #df['chg']=pd.Series(np.random.randn(sLength), index=df.index)
    df['chg']=1
    df['lschg']=1
    p1=0.0
    p0=0.0
    for index, row in df.iterrows():
        if index==0:
            df.ix[0,['chg']] = 0
        else:            
            p1 =  df.ix[index,'sadjclose']
            p0 =  df.ix[index-1,'sadjclose']
            pclose = df.ix[index,'sclose']
            plow = df.ix[index,'slow']
            chg = 100*(p1 / p0 - 1)
            #prev_close = pclose / (chg/100+1)
            lschg = abs((plow/pclose-1)*100)
            #print index,plow,prev_close,chg,lschg
            #lschg = lschg - chg
            
                
            #if chg>=0:
            #    chg+=0.4
            #else:
            #    chg-=0.4
                    
            df.ix[index,'chg'] = chg #int(round(chg))
            df.ix[index,'lschg'] = lschg
            
    #print df[['symbol','sdate','sopen','sadjclose','chg','lschg']]
    print df
    #bins = [-1000,-5,-3,-1,1,3,5,1000]
    #cats = pd.cut(df['chg'],bins)
    #cats.plot(kind='kde')
    fig = plt.figure()
    ax1 = fig.add_subplot(2,1,1)
    ax2 = fig.add_subplot(2,1,2)
    ax1.set_xlim([-20,20])
    mybins=[-15,-5,-3,-1,1,3,5,15]
    
    cats = pd.cut(df['chg'],mybins)
    print "change percent\n", pd.value_counts(cats)
    
    shadowbins=[0,1,3,5,15]
    shadowcats = pd.cut(df['lschg'],shadowbins)
    print "shadow line change percent\n", pd.value_counts(shadowcats)
    df['chg'].hist(ax=ax1,bins=mybins)
    df['chg'].plot(ax=ax1,kind='kde')
    df['lschg'].hist(ax=ax2,bins=shadowbins)
    df['lschg'].plot(ax=ax2,kind='kde')
    plt.show()
Example #15
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "今から全部のnutrientsを扱うよ"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "なんか重複多い"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "infoとnutrients両方にdescriptionとgroupがあるから変えよう"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
Example #16
def check_frequency(ax, true_data, sampling_data):
    true_seq = list(chain.from_iterable(true_data))
    sampling_seq = list(chain.from_iterable(sampling_data))

    f1 = pd.value_counts(true_seq) / len(true_seq)
    f2 = pd.value_counts(sampling_seq) / len(sampling_seq)
    freq = merge_and_sort(f1, f2)

    jsd = JSD(freq.true_data, freq.sampling_data)
    freq.index = [str(x)[:20] for x in freq.index]
    freq.plot(ax=ax, kind='bar')
    ax.set_title("Frequency(JSD: %.5f)" % jsd)
Example #17
def cut_data():
    ages=[20,22,25,27,21,23,37,31,61,45,41,32]
    bins=[18,25,35,60,100]
    cats=pd.cut(ages,bins)
    print cats
    print cats.levels
    print cats.labels
    print pd.value_counts(cats)
    print pd.cut(ages,[18,25,35,60,100],right=False)
    group_names=['Youth','YoungAdult','MiddleAged','Senior']
    print pd.cut(ages,bins,labels=group_names)
    data=np.random.randn(20)
    print pd.cut(data,4,precision=2)
Example #18
def draw(entire_tlds,sub_tlds,first_tlds):
    """
    绘制柱装图
    :param entire_tlds:
    """
    fig =plt.figure()
    x_label = []
    x = pd.value_counts(pd.Series(entire_tlds)).index[:25]
    y = pd.value_counts(pd.Series(entire_tlds)).values[:25]/1000.0
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))

    ax = fig.add_subplot(121)
    ax.bar(x,y)
    ax.set_xticks(x)
    ax.set_xticklabels(x_label,rotation=50)
    plt.grid(axis='y')
    plt.ylabel(u'域名数量(K)')
    plt.xlabel(u'二级顶级域名')

    x_label = []
    x = pd.value_counts(pd.Series(sub_tlds)).index[:20]  # deduplicated
    y = pd.value_counts(pd.Series(sub_tlds)).values[:20]/1000.0  # deduplicated
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))
    ax2 = fig.add_subplot(122)
    ax2.bar(x,y)
    ax2.set_xticks(x)
    ax2.set_xticklabels(x_label,rotation=50)
    plt.grid(axis='y')
    plt.xlabel(u'第二级顶级域名')

    plt.subplots_adjust(top=0.96,bottom=0.15,left=0.06,right=0.98,wspace=0.10)
    plt.savefig(u"二级顶级域名",dpi=140)


    # x_label = []
    # x = pd.value_counts(pd.Series(first_tlds)).index[:20]
    # y = pd.value_counts(pd.Series(first_tlds)).values[:20]
    # for label in x:
    #     x_label.append(str(label))
    # x = np.arange(len(y))
    # ax3 = fig.add_subplot(223)
    # ax3.bar(x,y)
    # ax3.set_xticks(x)
    # ax3.set_xticklabels(x_label,rotation=50)
    # plt.grid()

    plt.show()
Example #19
def get_binned_data( df, bin_count=10 ):
    v_max, v_min = df.max(), df.min()
    bins = [(v_max-v_min)/(bin_count+1)*i+v_min for i in range(bin_count+1)]
    labels = ["{0} {1:.1f}".format(i, (v_max-v_min)/(bin_count+1)*(i+0.5)+v_min) for i in range(bin_count)]

    categories = pd.cut(df, bins, labels=labels)
    #print( categories)
    print( df)
    print(pd.value_counts( categories ))

    ret_df = pd.DataFrame(index=labels)
    ret_df['count'] = pd.value_counts(categories)

    return ret_df
def density_(df, n=100):
    x = pd.cut(df.t1, n)
    y = pd.cut(df.t2, n)
    x_counts = pd.value_counts(x)
    y_counts = pd.value_counts(y)
    x_mid = map(get_mid, x_counts.index)
    y_mid = map(get_mid, y_counts.index)
    lower = min(min(x_mid), min(y_mid))
    upper = max(max(x_mid), max(y_mid))
    arr = np.linspace(lower, upper, 100)
    grid = np.meshgrid(arr, arr)
    x_counts.index = x_mid
    y_counts.index = y_mid
    x_counts = x_counts.sort_index()
    y_counts = y_counts.sort_index()
Example #21
	def barz(self):
		"--barplot"
		"frame"
		bz = pd.DataFrame({'he': self.he, 'inc': self.inc, 'alrmV':self.alrmV } )
		print '**alrmV', self.alrmV[:10], bz.alrmV.value_counts()
	
		"parse data"
		#confs/dmhi-current/reports/.txt	
		"group/count unique/ sort -> value_counts"
		#self.df['Label'].idx(1).count()
		
		
		"counts"
		at = pd.value_counts(bz.alrmV); #print "**alert-types-10\n", at.shape, at[:10]
		inc = bz.inc.value_counts(); #print "incorrect\n", inc #1=incorrect
		he = bz.he.value_counts(); #print "hardeasy\n", he
		#inc.plot(kind='bar')

		"group by"  #gender, geography, timeofday
		grouped = bz.groupby(['he','inc'])#.sum().plot(kind='bar', stacked=True)		
		#key = [k for (k,v) in grouped.groups]
		#print 'key', key
		#print grouped.size()
		#print 'PPP', grouped.value_counts()
		pew=grouped['alrmV'].value_counts().unstack().fillna(0.)
		print 'heic vals(\n' 
		pprint.pprint(pew)

		pew.plot(kind='bar',stacked=True)
def player_performance_plots(database,table,player_name):
    conn = MySQLdb.connect(user="******",passwd="xxxx",db=database,
                           cursorclass=MySQLdb.cursors.DictCursor)
    cmd_target = 'SELECT * FROM '+ table + ' WHERE player IN (\''+ player_name +'\');'
    player_frame = pd.read_sql(cmd_target, con=conn)
    conn.close()
    player_values = player_frame['pos_streak_list'].values
    streaks = [ast.literal_eval(x) for x in player_values]
    streak_data = np.concatenate(streaks)
    x=range(len(streak_data))
    y=streak_data
    df_streaks = pd.DataFrame(dict(streaks=x, streak_length=y))
    streak_counts = pd.value_counts(df_streaks.values.ravel())

    xData = streak_counts.index[:15]
    xData_1 = [x-1 for x in xData]
    yData = streak_counts.values[:15]
    # yData_1 = yData*(1000)/yData[0]

    popt, pcov = optimize.curve_fit(exp_func, xData, yData)

    yEXP = exp_func(xData, *popt)

    plt.figure()
    sns.factorplot("streak_length", data=df_streaks,kind="bar",palette="Blues",size=6,aspect=2,legend_out=False);
    plt.plot(xData_1, yData, label='Data', marker='o')
    plt.plot(xData_1, yEXP, 'r-',ls='--', label="Exp Fit")
    plt.legend()
    plt.show()
    a,b,c = popt
    return streak_counts
Example #23
 def test_series_groupby(self):
     """Test boxplot groupby using a series of data labels."""
     vals = dist._box_reshape(self.df.y, self.df.g, None, None)[0]
     nt.assert_equal(len(vals), 3)
     want_lengths = pd.value_counts(self.df.g)[["a", "b", "c"]]
     got_lengths = [len(a) for a in vals]
     npt.assert_array_equal(want_lengths, got_lengths)
Example #24
 def test_series_groupby_order(self):
     """Test a series-based groupby with a forced ordering."""
     order = ["c", "a", "b"]
     vals = dist._box_reshape(self.df.y, self.df.g, None, order)[0]
     want_lengths = pd.value_counts(self.df.g)[order]
     got_lengths = [len(a) for a in vals]
     npt.assert_array_equal(want_lengths, got_lengths)
Example #25
def fix_barcode_columns(df, patients=None, tissue_code='All', get_batch=False):
    """
    Takes a TCGA barcode and reformats it into a MultiIndex if all tissue_codes 
    are desired, or just pulls the correct tissue codes and filters the 
    DataFrame.

    df: pandas DataFrame
    patients: patient list to filter on
    tissue_code: ['01','11','All']  #if all returns MultiIndex

    """
    if get_batch is False:
        df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15]) for i 
                                                in df.columns])
    else:
        df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15], i[21:24]) for i 
                                                in df.columns])
    if patients is not None:
        df = df.ix[:, patients]
    if tissue_code != 'All':
        try:
            df = df.T.xs(tissue_code, level=1).T  # pandas bug
            df = df.groupby(axis=1, level=0).first()
        except KeyError:  # try a different cross-section
            new_code = pd.value_counts(df.columns.get_level_values(1)).idxmax()
            df = df.T.xs(new_code, level=1).T  # pandas bug
            df = df.groupby(axis=1, level=0).first()
            
    else:
        df = df.groupby(axis=1, level=[0, 1]).first()
    return df
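
For reference, a quick sketch of what the slicing above pulls out of a TCGA barcode (the sample barcode is illustrative):

# Illustrative barcode only.
barcode = 'TCGA-02-0001-01C-01D-0182-01'
barcode[:12]    # 'TCGA-02-0001' -> patient identifier
barcode[13:15]  # '01'           -> tissue code (e.g. '01' tumor, '11' normal, per the docstring)
barcode[21:24]  # '018'          -> the extra field used when get_batch=True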
	def is_noninformative_feature(self, feature_name):
		value_counts = pd.value_counts(self.data[feature_name], dropna = False)
		if len(value_counts) == 1:
			return True 
		elif value_counts.max()*1./self.data.shape[0] >= self.params["FRAC_OF_FEAT_TO_BE_NONINFORMATIVE"]:
			return True 
		return False 
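
The check above flags a feature as non-informative when it has a single value or when one value's share of the rows reaches the configured fraction. A standalone sketch of the same pattern (data and threshold are illustrative):

# Illustrative data and threshold only.
import pandas as pd

s = pd.Series(['a', 'a', 'a', 'a', 'b', None])
counts = pd.value_counts(s, dropna=False)
len(counts) == 1                          # False: more than one distinct value
counts.max() * 1. / len(s) >= 0.6         # True: 'a' alone covers 4 of 6 rows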
Example #27
    def plot_layer(self, layer):
        layer = {k: v for k, v in layer.iteritems() if k in self.VALID_AES}
        layer.update(self.manual_aes)
        x = layer.pop('x')
        if 'weight' not in layer:
            counts = pd.value_counts(x)
            labels = counts.index.tolist()
            weights = counts.tolist()
        else:
            weights = layer.pop('weight')
            if not isinstance(x[0], Timestamp):
                labels = x
            else:
                df = pd.DataFrame({'weights':weights, 'timepoint': pd.to_datetime(x)})
                df = df.set_index('timepoint')
                ts = pd.TimeSeries(df.weights, index=df.index)
                ts = ts.resample('W', how='sum')
                ts = ts.fillna(0)
                weights = ts.values.tolist()
                labels = ts.index.to_pydatetime().tolist()
        indentation = np.arange(len(labels)) + 0.2
        width = 0.35
        idx = np.argsort(labels)
        labels, weights = np.array(labels)[idx], np.array(weights)[idx]
        labels = sorted(labels)

        plt.bar(indentation, weights, width, **layer)
        plt.autoscale()
        return [
                {"function": "set_xticks", "args": [indentation+width/2]},
                {"function": "set_xticklabels", "args": [labels]}
            ]
Example #28
def featureExtraction(filename):
    """
    提取特征
    :param filename:
    :return:
    """
    df = pd.read_csv(filename, sep=',', header=0)
    df = df.drop_duplicates()
    csv_file = open(filename[0:-4]+'_num.csv', 'w')
    a = csv.writer(csv_file, delimiter=',')
    for (u, i), group in df.groupby(['user_id', 'item_id']):
        num_skim = 0
        num_collect = 0
        num_cart = 0
        num_buy = 0
        dict =  pd.value_counts(group.behavior_type, sort=False)
        if (1 in dict):
            num_skim = int(dict[1])
        if (2 in dict):
            num_collect = int(dict[2])
        if 3 in dict:
            num_cart = int(dict[3])
        if 4 in dict:
            num_buy = int(dict[4])
        item = [u, i, num_skim, num_collect, num_cart, num_buy]
        a.writerow(item)
def several_tools_per_phase(supertupel):
    """What can we gather from people who use a tool more than once?"""

    title = "Mehrfach genutzte Tools pro Forschungszyklus"

    filename = "19_tools_in_mehreren_phasen." + EXTENSION

    tisp_user_how_many = []
    einer = []
    alle = []

    for st in supertupel:
        # the user uses %d tools in multiple phases
        tisp_user_how_many.append(len(st))
        for tool in st:
            alle.append(category_lookup(tool[0]))
                
    tisp_series2 = pd.Series(alle)
    pvc2 = pd.value_counts(tisp_series2.values)
    print("Welche Kategorien werden mehrfach genutzt?")
    # print(pvc2)

    fig, axes = plt.subplots(nrows=1, ncols=1)#, figsize=(20,10))

    axes.set_xlabel("BenutzerInnen", alpha=ALPHA_VALUE, ha='left')
    axes.set_ylabel("Kategorien", alpha=ALPHA_VALUE, ha='left')

    # Here we go!
    pvc2.plot(kind="barh", ax=axes, color=COLOURS[0], width=WIDTH)
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)
    print("Antwort in %s" % filename)
    titles.write("%s: Abb. 4.:%s\n" % (filename,title))
Example #30
def process_dataset(ds):
    # Deal with missing data: (1) kick (2) filled with median
    ds["Age"] = ds["Age"].fillna(ds["Age"].median())
    ds["Fare"] = ds["Fare"].fillna(ds["Fare"].median())
    ds["Embarked"] = ds["Embarked"].fillna('S')
    # Categorized
    ds.loc[ds["Sex"] == "male", "Sex"] = 0
    ds.loc[ds["Sex"] == "female", "Sex"] = 1
    ds.loc[ds["Embarked"] == 'S', "Embarked"] = 0
    ds.loc[ds["Embarked"] == 'C', "Embarked"] = 1
    ds.loc[ds["Embarked"] == 'Q', "Embarked"] = 2
    # Binning
    binning(ds, "Fare")
    binning(ds, "Age")
    # Create new feature
    ds["FamilySize"] = ds["SibSp"] + ds["Parch"]
    ds["NameLength"] = ds["Name"].apply(lambda x: len(x))
    titles = ds["Name"].apply(get_title)
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5,
                     "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8,
                     "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10,
                     "Sir": 9, "Capt": 7, "Ms": 2, "Dona": 10}
    for k, v in title_mapping.items():
        titles[titles == k] = v
    ds["Title"] = titles
    family_ids = ds.apply(get_family_id, axis=1)
    family_ids[ds["FamilySize"] < 3] = -1
    print(pd.value_counts(family_ids))
    ds["FamilyId"] = family_ids
    return ds
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

prime_nos = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]

number_bins = [0, 10, 20, 30, 40, 50]

category = pd.cut(prime_nos, number_bins)
print category

print category.categories

#value_counts() to get the count of prime numbers within each range
print pd.value_counts(category)

#limits number of bins
print pd.cut(prime_nos, 3, precision=1)
Example #32
    net = Graph(DATADIR + DATASET + '\\links.txt',
                typ='dir',
                order=ORDER,
                withdiag=WITHDIAG)
    print('READ TIME: %.2f' % (time.time() - pt))

    f.write('%d %d %d\n' % (net.nVertices, net.nEdges, DIMENSION))

    pt = time.time()
    grouping_model = Louvain(net, rand=RANDOM_GROUPING)
    groups = grouping_model.execute(merge=MERGE)
    print('GROUP TIME: %.2f' % (time.time() - pt))

    group_sizes = [len(t) for t in groups]
    print('Grouping Results:')
    print(pd.value_counts(group_sizes))
    inv_index_original = groups2inv_index(groups, net.nVertices)
    # sizes_index = [group_sizes[t - 1] for t in inv_index_original]

    pt = time.time()
    # k_set = sample(net, k=K_SIZE, method='deg_deter')
    k_set = sample(net, k=K_SIZE,
                   method=SAMPLE_METHOD)  #, vertex_group_sizes=sizes_index)
    print('SAMPLE TIME: %.2f' % (time.time() - pt))

    inv_index = groups2inv_index(groups, net.nVertices, k_set)
    pure_override_nodes(groups, inv_index)
    groups = [k_set] + groups

    pt = time.time()
    model = Optimizer(net,
Example #33
from pyecharts import Bar
import pandas as pd
import re
d = pd.read_csv('BGM_week_v1.csv', nrows=250)
star = d["week"]
result = star.values.tolist()
result1 = pd.value_counts(result)
sum_ = 0
week1 = result1["星期一"]
week2 = result1["星期二"] + result1["火曜日"]
week3 = result1["星期三"] + result1["水曜日"]
week4 = result1["星期四"] + result1["周四"] + result1["木曜日"]
week5 = result1["星期五"]
week6 = result1["星期六"]
week7 = result1["星期天"] + result1["周日"]

value = [
    week1, week2, week3, week4, week5, week6, week7
]

for i in value:
    sum_ += i
other = 250-sum_
value.append(other)
attr = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日", "剧场版"]
bar = Bar('Top250动画 TV动画放送日期', "count")
bar.add("count", attr, value)
bar.render('Top250动画 TV动画放送日期.html')
import numpy as np
import pandas as pd
#import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# Also, we can pass a unique name to each label.
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins, labels=bin_names)
print(pd.value_counts(new_cats))

#we can also calculate their cumulative sum
# pd.value_counts(new_cats).cumsum()
print(pd.value_counts(new_cats).cumsum())
Example #35
    def test_index(self):

        # test that the various tests get properly aggregated, no duplicate indices
        self.assertEqual(max(pd.value_counts(self.summary.index)), 1)
Example #36
def handle_b(b, n):
    global df7
    global d_test
    # process each CSV file in the directory
    for each_src in data_dirs:
        df5 = DataFrame(
            pd.read_csv(
                each_src,
                names=['code', 'from', 'to', 'date', 'time', 'ci', 'lac']))
        # drop some erroneous entries
        df7 = df5[df5['date'] > 20000000]
        # daily
        if modes == 'd':
            # list the unique dates
            d_test = sorted(df7['date'].unique())
            for i in d_test:
                # filter the source records for each day
                temp_df = df7[df7['date'] == int(i)]
                # rank by number of outgoing calls
                temp_count0 = pd.value_counts(temp_df['from'])
                # keep the top n
                temp_count = DataFrame(temp_count0[0:int(n)],
                                       columns=['degree'])
                # add a third column (date) and export to csv
                temp_count['when'] = int(i)
                temp_dir = 'temp_count/d/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
            print 'done'
        # weekly
        if modes == 'w':
            d_test2 = sorted(df7['date'].unique())
            date_to_week = {}
            for i in d_test2:
                # get the week-of-year for this date
                temp_date = datetime.datetime.strptime(str(i), "%Y%m%d")
                n = temp_date.strftime('%W')
                # add the mapping
                date_to_week[i] = n
            # map dates to week numbers
            df7['week'] = df7['date'].map(date_to_week)
            # list the unique week numbers
            d_test = sorted(df7['week'].unique())
            for i in d_test:
                temp_df = df7[df7['week'] == i]
                temp_count0 = pd.value_counts(temp_df['from'])
                temp_count = DataFrame(temp_count0[0:int(n)],
                                       columns=['degree'])
                temp_count['when'] = '第' + str(i) + '周'
                temp_dir = 'temp_count/w/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
                print 'Done'
        # monthly
        if modes == 'm':
            d_test2 = sorted(df7['date'].unique())
            date_to_month = {}
            for i in d_test2:
                temp_date = datetime.datetime.strptime(str(i), "%Y%m%d")
                n = temp_date.strftime('%m')
                date_to_month[i] = n
            df7['month'] = df7['date'].map(date_to_month)
            d_test = sorted(df7['month'].unique())
            for i in d_test:
                temp_df = df7[df7['month'] == i]
                temp_count0 = pd.value_counts(temp_df['from'])
                temp_count = DataFrame(temp_count0[0:int(n)],
                                       columns=['degree'])
                temp_count['when'] = '第' + str(i) + '月'
                temp_dir = 'temp_count/m/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
                print 'donE'
Example #37
import numpy as np
data.replace('n/a', np.nan, inplace=True)
data.emp_length.fillna(value=0, inplace=True)
data['emp_length'].replace(to_replace='[^0-9]+',
                           value='',
                           inplace=True,
                           regex=True)

data['emp_length'] = data['emp_length'].astype(int)
data['term'] = data['term'].apply(lambda x: x.lstrip())

import seaborn as sns
import matplotlib

s = pd.value_counts(data['emp_length']).to_frame().reset_index()
s.columns = ['type', 'count']


def emp_dur_graph(graph_title):
    sns.set_style('whitegrid')
    ax = sns.barplot(x='type', y='count', data=s)
    ax.set(xlabel='', ylabel='', title=graph_title)
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ','))
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=0)


emp_dur_graph('Distribution of employment length for issued loans')

from matplotlib import pyplot as plt
print(plt.style.available)
Example #38
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
Location = r"C:\Users\rfsas\Documents\MBA\Spring 2020 Class docs\ISM 6419 - Data Visualization\week 7\datasets\gradedata.csv"
df = pd.read_csv(Location)
df.head()

# In[2]:

bins = [0, 60, 70, 80, 90, 100]
group_names = ['F', 'D', 'C', 'B', 'A']
df['lettergrade'] = pd.cut(df['grade'], bins, labels=group_names)
df

# In[3]:

pd.value_counts(df['lettergrade'])

# In[4]:

bins = [0, 80, 100]
group_names = ['fail', 'pass']
df['Pass/Fail'] = pd.cut(df['grade'], bins, labels=group_names)
df

# In[ ]:
x = len(pac_internado)/all_data
y = len(pac_não_internado)/all_data

print('Pacientes Internados :', round(x*100,2),'%')
print('Pacientes Não Internados :', round(y*100,2),'%')

#Pacientes Internados : 11.72 %
#Pacientes Não Internados : 88.28 %

#Checking the % distribution of the target variable - chart - option 1

import matplotlib.pyplot as plt

labels = ['Pacientes Não Internados','Pacientes Internados']
classes = pd.value_counts(df['CONT_STATUS_INT'], sort = True)
plt.figure(figsize = (14, 7))
classes.plot(kind = 'bar', rot = 0)
plt.title("Target Class Distribution")
plt.xticks(range(2), labels)
plt.xlabel("Class")
plt.ylabel("Frequency")

#Checking the % distribution of the target variable - chart - option 2

import seaborn as sns

import matplotlib.pyplot as plt

targets = df['DESC_STATUS_INT'].values
sns.set(style = "darkgrid")
#Delete the duplicate way-2 (only 1 column)
import pandas as pd
data = pd.read_csv('c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/gender.csv')
id = data["id"]
val = data["Claim Value"]
non_du_id = []
for x in sorted(set(id)):
    non_du_id.append(x)

#Delete the duplicate way-3 (whole dataframe)
import pandas as pd
data = pd.read_csv('c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/gender.csv')
df = pd.DataFrame(data)
mask = df.duplicated(keep=False)
print(pd.value_counts(mask))
# False    239712
# True         79
# dtype: int64
mask1 = df.drop_duplicates(keep=False)
print(len(mask1))
# directly return   239712

non_du_id = df[~mask]
non_du_id.to_csv('non_du_id-2.csv')

import pandas as pd
data = pd.read_csv(
    'c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/Aspnet_user/final_column_3.csv'
)
df = data[data["Claim Type"].str.match("diabetes_type")]
Example #41
Let's divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and
older. To do so, you have to use cut, a function in pandas:'''
bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)
cats   # categorical object

'''
The object pandas returns is a special Categorical object. You can treat it like
an array of strings indicating the bin name;
initially it contains a levels array indicating the distinct category names along 
with a labeling for the ages data in the labels attribute:'''
cats.labels

cats.value_counts()
pd.value_counts(cats)
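
'''
Note: in newer pandas versions the levels and labels attributes mentioned above are
deprecated; the same information is exposed as categories and codes:'''
cats.categories
cats.codes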

'''
Which side is closed or open can be changed by passing right=False:'''
pd.cut(ages, bins, right=False)

'''
You can also pass your own bin names by passing a list or array to the labels
option:'''
group_names = ['youth','youngadult','middleage','senior']

cats2 = pd.cut(ages, bins, labels=group_names)
pd.value_counts(cats2)

'''
If you pass cut an integer number of bins instead of explicit bin edges, it will
compute equal-length bins based on the minimum and maximum values in the data.'''
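
'''
For example (a sketch; pandas computes four equal-width bins from the data's
minimum and maximum):'''
pd.cut(ages, 4, precision=2)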
Example #42
descrip_speed=accidentes['Speed_limit'].describe()

quick_report1=accidentes.describe().transpose()

#%%

quick_report2=accidentes.describe(include=['object']).transpose()
## includes object-type variables

#%%
quick_report3=accidentes.mode().transpose()

## for the mode

#%%
accidentes_por_dia=pd.value_counts(accidentes['Date'])

#%%
##statistics on the number of vehicles

print "mean value:{}".format(accidentes['Number_of_Vehicles'].mean())
print "min value:{}".format(accidentes['Number_of_Vehicles'].min())
print "max value:{}".format(accidentes['Number_of_Vehicles'].max())
print "mode value:{}".format(accidentes['Number_of_Vehicles'].mode())
print "std value:{}".format(accidentes['Number_of_Vehicles'].std())

#%%

vehicle_counts=accidentes.groupby('Date').agg({'Number_of_Vehicles':np.sum})
casualty_counts=accidentes.groupby('Date').agg({'Number_of_Casualties':np.sum})
#%%
Example #43
		timetoswtich.extend(swarmans[9])
		switch_facs.extend(swarmans[10])
		interpolation_list.extend(swarmans[2])
		
		#lists by groups
		usernames_list.extend(swarmans[3])
		initialpull_list.extend(swarmans[4])
		interpolation_final.extend(swarmans[5])

iterate_sheets(10) 
	
swarm_repeatability=[]
swarm_instance_repeatability=[]
crowd_instance_repeatability=[]
for q in range(len(All_swarmanswers)):
	swarmcounts=pd.value_counts(All_swarmanswers[q])
	dictionary=dict(swarmcounts)
	swarm_repeatability.append(list(swarmcounts)[0]/sum(swarmcounts))
	for s in range(len(All_swarmanswers[q])):
		ans=All_swarmanswers[q][s]
		count= dictionary[ans]
		swarm_instance_repeatability.append((count-1)/(sum(swarmcounts)-1))

### repeatability by question graph ###
x_labels=np.arange(1,26,1)
fig, ax = plt.subplots()
rects = ax.bar(np.arange(1,26,1), swarm_repeatability,alpha=0.4)
ax.set_title('Repeatability by Question',size=16)
ax.set_xlabel('Question',size=14)
ax.set_ylabel('Repeatability',size=14)
plt.xticks(x_labels,x_labels)
Example #44
    'Quantidade por sexo': quantidade_sexo,
    'Percentual': percentual_sexo
})

tabela.rename(index={0: 'Masculino', 1: 'Feminino'}, inplace=True)

tabela.rename_axis('Sexo', axis='columns', inplace=True)

tabela
"""Distribuição de renda"""

labels = ['E', 'D', 'C', 'B', 'A']
classes = [0, 1576, 3152, 7880, 15760, 200000]

#Frequency count per class
frequencia_renda = pd.value_counts(
    pd.cut(dados.Renda, bins=classes, labels=labels, include_lowest=True))
#Percentage calculation for each class
percentual_renda = pd.value_counts(pd.cut(
    dados.Renda, bins=classes, labels=labels, include_lowest=True),
                                   normalize=True) * 100

#Creating the dataframe to serve as the table for the analysis
tabela_renda = pd.DataFrame({
    'Frequência por classe': frequencia_renda,
    'Percentual por classe': percentual_renda
})
tabela_renda.sort_index(ascending=False, inplace=True)
tabela_renda.rename_axis('Classe', axis='columns')

tabela_renda['Frequência por classe'].plot.bar(width=1,
                                               color='red',
Example #45
# signature dishes
import jieba
delicious = []
for i in range(750):
    try:
        recommend = jieba.lcut(data['recommend'][i])
        while ',' in recommend:
            recommend.remove(',')
        while '(' in recommend:
            recommend.remove('(')
        while ')' in recommend:
            recommend.remove(')')
        delicious.extend(recommend)
    except:
        continue
delicious = pd.value_counts(delicious)
from pyecharts import WordCloud
wordcloud = WordCloud(width=1000, height=600)
wordcloud.add("",
              delicious.index,
              delicious.values,
              word_size_range=[12, 150],
              is_more_utils=True)
wordcloud.render("delicious.html")

#
from sklearn.cluster import KMeans
# for better clustering, convert star ratings to numbers
for i in range(750):
    try:
        if data.loc[i, 'star'] == '五星商户':
# # EDA and Data Cleaning

# The variables are broken into 4 categories: Client Data, Last Contact Info, Other, and Social and Economic Variables.
# I have performed EDA on each category separately to get a better picture
# #### EDA-Part 1

# In[116]:

bank_client = data.iloc[:, 0:7]
bank_client.head()

# In[117]:

#Checking for unique job titles and their counts in data
bank_client['job'].value_counts()
pd.value_counts(bank_client['job']).plot.bar()

# In[118]:

#Checking for counts of different marital status in data
bank_client['marital'].value_counts()
pd.value_counts(bank_client['marital']).plot.bar()

# In[119]:

#Checking for educational background unique counts
bank_client['education'].value_counts()
pd.value_counts(bank_client['education']).plot.bar()

# In[120]:
prime_nos = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
number_bins = [0, 10, 20, 30, 40, 50]

# In[127]:

category = pd.cut(prime_nos, number_bins)
category

# In[128]:

category.categories

# In[130]:

pd.value_counts(category)

# In[131]:

# Limits
pd.cut(prime_nos, 3, precision=1)

# ### Observation

# In[132]:

df = DataFrame(np.random.randn(1000, 5))
#basic observation
df.head()

# In[133]:
Example #48
data = scale(df_select)

# Define number of clusters
noOfClusters = 4

# Train a model
model = KMeans(init='k-means++', n_clusters=noOfClusters, n_init=20).fit(data)

# In[33]:

print(90 * '_')
print("\nCount of players in each cluster")
print(90 * '_')

pd.value_counts(model.labels_, sort=False)

# In[34]:

# Create a composite dataframe for plotting
# ... Use custom function declared in customplot.py (which we imported at the beginning of this notebook)

P = pd_centers(featuresUsed=select5features, centers=model.cluster_centers_)
P

# <h1 style="font-size:2em;color:#2467C0">Visualization of Clusters</h1>
# We now have 4 clusters based on the features we selected, so we can treat them as profiles for similar groups of players. We can visualize these profiles by plotting the centers for each cluster, i.e., the average values for each feature within the cluster. We will use matplotlib for this visualization. We will learn more about matplotlib in Week 5.
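
# A minimal sketch of that visualization, assuming only the fitted model and the selected feature names (the pd_centers/customplot helper is not reproduced here):

# Sketch only: draw each cluster center as a line across the selected features.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
for i, center in enumerate(model.cluster_centers_):
    plt.plot(range(len(select5features)), center, marker='o', label='Cluster %d' % i)
plt.xticks(range(len(select5features)), select5features, rotation=45)
plt.legend()
plt.show()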

# In[35]:

# For plotting the graph inside the notebook itself, we use the following command
Example #49
                        clf.fit(x_train, y_train)  # fit the model on the training data
                        predictions.append(bool(clf.predict(
                            [x_predict])))  # predict on the new sample and record the result

                        # 3 - place the trade
                        if predictions[-1] == True:  # if the prediction is "up": buy
                            print(quote.datetime, "预测下一交易日为 涨")
                            target_pos.set_target_volume(10)
                        else:  # if the prediction is "down": sell
                            print(quote.datetime, "预测下一交易日为 跌")
                            target_pos.set_target_volume(-10)
                        break

    except BacktestFinished:  # backtest finished: collect the predictions and compute accuracy
        klines["pre_close"] = klines["close"].shift(
            1)  # 增加 pre_close(上一交易日的收盘价) 字段
        klines = klines[-len(predictions) + 1:]  # keep the K-line data within the backtest period
        klines["prediction"] = predictions[:-1]  # add the predicted up/down for each trading day (shifted so each row's prediction refers to that same day)
        results = (klines["close"] - klines["pre_close"] >=
                   0) == klines["prediction"]

        print(klines)
        print("----回测结束----")
        print("预测结果正误:\n", results)
        print("预测结果数目统计: 总计", len(results), "个预测结果")
        print(pd.value_counts(results))
        print("预测的准确率:")
        print((pd.value_counts(results)[True]) / len(results))
Example #50
df = pd.read_csv('csv/raw_26_April_Sensors.csv')

## Analyze and Visualize Dataset

# General Information
df.info()
print('The dataset contains ' + str(df.shape[0]) + ' data samples and ' +
      str(df.shape[1]) + ' data columns')

# Identifying NaN Values
print(df.isnull().sum())

# Overview of numerical data
print(df.describe())

print('Dataset contains ' + str(pd.value_counts(df['RecordID'].values)[0]) +
      ' "safe" data samples as well as ' +
      str(pd.value_counts(df['RecordID'].values)[1]) +
      ' "relatevely safe" data samples and ' +
      str(pd.value_counts(df['RecordID'].values)[2]) +
      ' "unsafe" data samples')

# Overview of dataset rows
print(df.head(20))

# Numerical Data Distribution
SENSOR_DATA_COLUMNS = [
    'GyroX1', 'GyroY1', 'GyroZ1', 'AccX1', 'AccY1', 'AccZ1', 'MagX1', 'MagY1',
    'MagZ1', 'GyroX2', 'GyroY2', 'GyroZ2', 'AccX2', 'AccY2', 'AccZ2', 'MagX2',
    'MagY2', 'MagZ2', 'GyroX3', 'GyroY3', 'GyroZ3', 'AccX3', 'AccY3', 'AccZ3',
    'MagX3', 'MagY3', 'MagZ3'
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
        tweets = pd.DataFrame()
        tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
        tweets['language'] = map(lambda tweet: tweet['lang'], tweets_data)
        tweets['country'] = map(
            lambda tweet: tweet['place']['country']
            if tweet['place'] != None else None, tweets_data)
    except:
        continue

#convert the map object to list for plotting
tweets_list_by_lang = list(tweets['language'][0])
tweets_by_lang_count = pd.value_counts(tweets_list_by_lang)[:5]

#plot the top 5 languages on the received tweets with filtered tags
fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=7)
ax.set_xlabel('Languages', fontsize=7)
ax.set_ylabel('Number of tweets', fontsize=7)
ax.set_title('Top 5 languages', fontsize=7, fontweight='bold')
tweets_by_lang_count.plot(ax=ax, kind='bar', color='green')
plt.show()

#convert the map object to list for plotting
tweets_list_by_Country = list(tweets['country'][0])
tweets_by_Country_count = pd.value_counts(tweets_list_by_Country)[:5]
from gensim.models import doc2vec
from collections import namedtuple

np.random.seed(0)

if __name__ == "__main__":
    SPLIT_SIZE = 0.3
    VECTOR_SIZE = 100

    # load data
    train_df = pd.read_csv('./kaggledata/records.tsv', sep='\t', header=0)

    raw_docs_train = train_df['Review'].values
    sentiment_train = train_df['Score'].values
    num_labels = len(np.unique(sentiment_train))
    print pd.value_counts(sentiment_train)
    print sentiment_train
    print "Label's categories amount: " + str(num_labels)

    # text pre-processing
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer("english")

    print "pre-processing train docs..."
    processed_docs_train = []
    for doc in raw_docs_train:
        tokens = word_tokenize(doc.lower())
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
Example #53
    def value_counts(self, dropna: bool = True):
        from pandas import value_counts

        return value_counts(self._ndarray, dropna=dropna).astype("Int64")
Example #54
 def select_unimportant(self, delt):
     delete = pd.value_counts(delt)[pd.value_counts(delt) > 1].index
     return delete
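
A standalone sketch of the selection above (the data is illustrative): it returns the values that occur more than once in delt, which the caller then deletes.

# Illustrative data only.
import pandas as pd

delt = ['colA', 'colB', 'colA', 'colC', 'colB']
counts = pd.value_counts(delt)
counts[counts > 1].index   # the values seen more than once -> candidates to delete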
Example #55
print scores.mean()
##############feature extraction######################
titanic['Familysize'] = titanic['SibSp'] + titanic['Parch']  # total number of family members
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))  # length of the name
import re


def get_title(name):
    title_reserch = re.search('([A-Za-z]+)\.', name)
    if title_reserch:
        return title_reserch.group(1)
    return ""


titles = titanic['Name'].apply(get_title)
print pandas.value_counts(titles)
# convert the titles to numeric values
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "Dr": 5,
    "Rev": 6,
    "Col": 7,
    "Major": 8,
    "Mlle": 9,
    "Countess": 10,
    "Ms": 11,
    "Lady": 12,
    "Jonkheer": 13,
Example #56
    def plot_company_ranking(self):

        # df sel contains the data of the subset of arjen
        df_sel = self.read_input_file()

        table_name = 'company'
        data_df = read_sql_table(table_name,
                                 connection=self.connection,
                                 reset=self.reset)
        # df[df["datetime"].isnull]
        data_df.dropna(axis=0, subset=["datetime"], inplace=True)

        data_df.set_index(KVK_KEY, inplace=True, drop=True)

        if self.dump_to_file:
            data_df.to_csv(table_name + ".csv")

        df_sel = pd.concat([data_df, df_sel], axis=1, join="inner")

        count_sel = pd.value_counts(df_sel["ranking"]).sort_index()
        count_sel.index = count_sel.index.astype(int)
        tot_sel = count_sel.sum()
        count_sel = 100 * (count_sel / tot_sel)
        print("counted sel {}".format(tot_sel))

        count_all = pd.value_counts(data_df["ranking"]).sort_index()
        count_all.index = count_all.index.astype(int)
        tot_all = count_all.sum()
        count_all = 100 * (count_all / tot_all)
        print("counted all {}".format(tot_all))

        count_all = pd.concat([count_all, count_sel], axis=1)

        count_all.columns = [f"All (N={tot_all})", f"Sel (N={tot_sel})"]

        fig, axis = plt.subplots(figsize=(6.5, 5))
        plt.subplots_adjust(left=0.1, right=0.9, top=0.85)
        axis.set_xlabel("Ranking [-]")
        axis.set_ylim([0, 40])
        axis.set_ylabel("% kvks")

        count_all.plot(kind="bar", ax=axis, label="# kvks", rot=0)
        axis.set_xlim([-1, 10])

        ax2 = axis.twinx()
        ax2.set_ylabel("cumulative %")

        cum_sum_all = count_all.cumsum()
        cum_sum_sel = pd.DataFrame(index=count_sel.index,
                                   data=count_sel.cumsum().values,
                                   columns=[count_all.columns[1]])
        #
        cum_sum_all.plot(y=[cum_sum_all.columns[0]],
                         ax=ax2,
                         style="--o",
                         color="tab:red",
                         legend=False)
        cum_sum_sel.plot(y=[cum_sum_sel.columns[0]],
                         ax=ax2,
                         style="--x",
                         color="tab:green",
                         legend=False)

        ax2.set_ylim([0, 110])
        ax2.set_xlim([-1, 10])
        #
        ax2.tick_params(axis="y", labelcolor="black")
        axis.legend(bbox_to_anchor=(0.65, 1.22), title="% KVK")
        ax2.legend(bbox_to_anchor=(1.05, 1.22), title="Cumulative %")

        logger.info("plot fig")
        plt.savefig("url_score_NL.jpg")
        logger.info("save to csv sel")
        cum_sum_sel.to_csv("url_score_DH.csv")
        logger.info("save to csv all")
        cum_sum_all.to_csv("url_score_NL.csv")
    'column9', 'column14'
]
disease_data = disease_data[
    features]  # Putting side by side features with the same input data type

# Analyzing Output Distribution
data_size = disease_data.shape[0]
sick = disease_data[disease_data['column14'] == 1]
not_sick = disease_data[disease_data['column14'] == 0]
x = len(sick) / data_size
y = len(not_sick) / data_size
print('Sick :', x * 100, '%')
print('Not sick :', y * 100, '%')
plt.figure(14)  # Plotting output feature for distribution analysis
labels = ['Sick', 'Not Sick']
graph = pd.value_counts(disease_data['column14'], sort=True)
graph.plot(kind='bar', rot=0)
plt.title("Transaction class distribution")
plt.xticks(range(2), labels)
plt.xlabel("Class")
plt.ylabel("Frequency")

plt.figure(15)
sns.heatmap(disease_data.corr(), annot=True)  # Correlation Matrix of the Data

# Checking and Removing Outliers using Z-score function
z = np.abs(stats.zscore(disease_data))
threshold = 3
disease_data = disease_data[(z < 3).all(axis=1)]

# Scatter Plot - Uncomment only if needed. High computational time required.
Example #58
meanval = dataemp['overall-ratings'].mean()

# # Classifying labels: 1 = satisfied, happy employee; 0 = employee not satisfied with the job. A rating greater than the mean overall-rating is considered satisfied and one less than the mean is considered unsatisfied for classification

# In[20]:

dataemp['label'] = dataemp['overall-ratings'].apply(lambda x: 1
                                                    if x > meanval else 0)

# In[21]:

dataemp.head()

# In[22]:

pd.value_counts(dataemp['label']).plot.bar()
plt.show()

# In[23]:


def datatext_preprocess(total_text):
    removepunc = [
        char for char in total_text if char not in string.punctuation
    ]
    removepunc = ''.join(removepunc)
    re.sub('[^A-Za-z]+', '', removepunc)
    return ' '.join([
        word for word in removepunc.split()
        if word.lower() not in stopwords.words('english')
    ])
Example #59
from datetime import datetime
ffp = data['FFP_DATE'].apply(lambda x:datetime.strptime(x,'%Y/%m/%d'))
ffp_year = ffp.map(lambda x : x.year)
# histogram of member enrollments by year
fig = plt.figure(figsize = (8 ,5))  # set the figure size
plt.rcParams['font.sans-serif'] = 'SimHei'  # enable Chinese font display
plt.rcParams['axes.unicode_minus'] = False
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数')
plt.show()
plt.close

# extract the number of members of each gender
male = pd.value_counts(data['GENDER'])['男']
female = pd.value_counts(data['GENDER'])['女']
# pie chart of the member gender ratio
fig = plt.figure(figsize = (7 ,4))  # set the figure size
plt.pie([ male, female], labels=['男','女'], colors=['lightskyblue', 'lightcoral'],
       autopct='%1.1f%%')
plt.title('会员性别比例')
plt.show()
plt.close

# # extract the number of members at each tier
# lv_four = pd.value_counts(data['FFP_TIER'])[4]
# lv_five = pd.value_counts(data['FFP_TIER'])[5]
# lv_six = pd.value_counts(data['FFP_TIER'])[6]
# # bar chart of member counts by tier
# fig = plt.figure(figsize = (8 ,5))  # set the figure size
Example #60
def case_study_example_question(case_question):
	'''
	param case_question: question index to use as example
	generates graphs of group interpolation over time
	prints p-values of individuals and groups
	'''
	########## Case Study  on GROUPS; Interpolation through time graph #############
	print('Case Question: Initial vs Survey by Swarms pvalue = %.6f' %(stats.ttest_rel(swarm_initial_mean[case_question],crowd_avg_byquestion[case_question])[1]))
	print('Case Question: Initial vs Final by Swarms pvalue = %.6f' %(stats.ttest_rel(swarm_initial_mean[case_question],All_interpolations[case_question])[1]))
	print('Case Question: Survey vs Final by Swarms pvalue = %.6f' %(stats.ttest_rel(All_interpolations[case_question],crowd_avg_byquestion[case_question])[1]))

	### Graph of Interpolation over time for each of 10 groups on this question ###
	initial_interpoltion=[]
	for i in range(10):
		group=i
		impulse_array=np.array((imp_throughtime[case_question][group]))
		interpolation_through_time=[]
		time=np.arange(4,len(impulse_array)+4,4) ## starting from 1 second; 4 timesteps = 1 second
		time_list=time/4 #timesteps >> seconds
		percenttime_bins=np.arange(.1,1.1,.1)

		initial_interpoltion.append(impulse_linear_interpolation(sum(impulse_array[4:12]))) ##initial interpolation from 1-3 seconds
		for t in range(len(percenttime_bins)-1):
			timestep1=int(percenttime_bins[t]*(len(impulse_array)) -1)	#lower bin
			timestep2=int(percenttime_bins[t+1]*(len(impulse_array)) -1)	#upper bin
			time_impulse=sum(impulse_array[timestep1:timestep2]) #start at one second #change lower index to 4 and upper index timestep2  if wanting cumulative interp over time
			time_interpolation=impulse_linear_interpolation(time_impulse)
			interpolation_through_time.append(time_interpolation)
		
		plt.plot(percenttime_bins[1:],interpolation_through_time,marker='o',label='Group %s'%(group+1),color='C%s'%i)
		plt.xlabel('Percent Time',size=14)
		plt.ylabel('Interpolation',size=14)

	plt.title('Case Study: Interpolation Through Time',size=16)
	plt.show()


	######## CASE STUDY on INDIVIDUALS: #######
	surveyavg=np.mean(crowd_avg_byquestion[case_question])		# survey average on this question (10 groups)
	swarmavg=np.mean(All_interpolations[case_question])			#swarm interpolation on this question (10 groups)
	swarm_init=np.array(swarm_initial_question[case_question]) 	#initial individuals' answers on this question
	survey_init=np.array(survey_initial_question[case_question])	#survey individuals' answers on this question
	final_interp=np.array(swarm_final_interp_question[case_question])	#final individuals' interpolations on this question

	print('Case Question: Initial vs Survey by Individuals pvalue = %.6f' %(stats.ttest_rel(swarm_init,survey_init)[1]))
	print('Case Question: Initial vs Final by Individuals pvalue = %.6f' %(stats.ttest_rel(swarm_init,final_interp)[1]))
	print('Case Question: Survey vs Final by Individuals pvalue = %.6f' %(stats.ttest_rel(survey_init,final_interp)[1]))

	### making dictionary of faction support frequency ###
	countsvals=(dict(pd.value_counts(survey_init)))
	countsvals2=dict(pd.value_counts(swarm_init))
	values1=[]
	for i in list(countsvals.keys()):
		values1.append(i+.15) #offsetting bins to graph side by side
	values2=[]
	for i in list(countsvals2.keys()):
		values2.append(i-.15) #offsetting bins to graph side by side
	frac_vals=[]
	for i in range(len(countsvals.values())):
		frac_vals.append( list(countsvals.values())[i]/(sum(countsvals.values())))
	frac_vals_2=[]
	for i in range(len(countsvals2.values())):
		frac_vals_2.append(list(countsvals2.values())[i]/(sum(countsvals2.values())))

	#### Graph of Individuals' Survey, Swarm Initial, and Swarm Final Interpolation ### 
	plt.title('Case Question: Individuals Answers')
	plt.bar(values1,frac_vals,width=.3,color='C0',alpha=.4,label='Survey: Mean = %.1f, std = %.2f'%(np.mean(survey_init),np.std(survey_init)))
	plt.bar(values2,frac_vals_2,width=.3,color='C1',alpha=.4,label='Swarm Initial: Mean = %.1f, std = %.2f'%(np.mean(swarm_init),np.std(swarm_init)))
	weights = np.ones_like(final_interp)/float(len(final_interp))
	plt.hist(final_interp,label='Swarm Final: Mean = %.1f, std = %.2f'%(np.mean(final_interp),np.std(final_interp)),color='C2',bins=np.arange(1,5,.3),alpha=.4,weights=weights)
	plt.xlabel('Answer')
	plt.ylabel('Frequency')
	plt.legend()
	plt.show()

	### individual mean (mean diff, CI) of swarm initial vs swarm final ###
	diff=np.array(final_interp) - np.array(swarm_init)	
	print('Case Question: Mean Final - Initial Mean Difference = %.3f , p = %.3f'%(np.mean(diff), stats.ttest_rel(final_interp,swarm_init)[1]))

	#### bootstrapped individual standard deviation (mean diff, CI) of swarm initial vs swarm final ###
	bootstrapped=bootstrap(range(len(swarm_init)),1000,len(swarm_init)) ## bootstrapping individuals
	init_std=[]
	final_std=[]
	for i in range(len(bootstrapped)):
		init_std .append(np.std(list(swarm_init[bootstrapped[i]])) ) #std of individuals initially
		final_std.append(np.std(list(final_interp[bootstrapped[i]])) ) #std of individuals final
	p_val=stats.ttest_rel(init_std,final_std)[1]
	print('Case Question: Mean Final - Initial Standard Deviation Difference = %.3f, p= %.3f'%(np.mean(np.array(final_std)-np.array(init_std)),p_val))