Example #1
    def decision_tree(cls,df,length,neighbour_details):

        df=pd.read_csv('tile_features.csv')
        length=len(df)
        #print(df.head())
        #df.loc[len(df)] = neighbour_details
        df2 = pd.DataFrame(neighbour_details, columns=['A','B','C','D','E'])
        #df = df.append(df2, ignore_index=True)
        
        df = df.append(df2, ignore_index=True)
        df.to_csv('tile_features.csv', mode='a', header=False)
Example #2
def get_binned_data( df, bin_count=10 ):
    v_max, v_min = df.max(), df.min()
    bins = [(v_max-v_min)/bin_count*i+v_min for i in range(bin_count+1)]
    labels = ["{0} {1:.1f}".format(i, (v_max-v_min)/bin_count*(i+0.5)+v_min) for i in range(bin_count)]

    categories = pd.cut(df, bins, labels=labels)
    #print( categories)
    print( df)
    print(pd.value_counts( categories ))

    ret_df = pd.DataFrame(index=labels)
    ret_df['count'] = pd.value_counts(categories)

    return ret_df
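
# A minimal usage sketch (illustrative only, assumes pandas is imported as pd):
# bin a numeric Series into 10 equal-width buckets and print the per-bucket counts.
sample = pd.Series(range(100))
print(get_binned_data(sample, bin_count=10))
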
def fitDataToPandas(input):
    output = []
    target = []
    for element in input:
        current = [element.math, element.lit,
                   element.read, element.high, element.music,
                   element.polits]
        target.append(element.lecpref)
        output.append(current)

    return (pd.DataFrame(output), np.array(target))
Example #4
 <link rel="fluid-icon" href="https://github.com/fluidicon.png" title="GitHub">
 <meta property="fb:app_id" content="1401488693436528">
 <link rel="assets" href="https://assets-cdn.github.com/">
 ...
 """

soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title.string)

# 31. Given DataFrame, apply label encoder
d = ['A', 'B', 'C', 'D', 'E', 'AA', 'AB']
le = LabelEncoder()
print(le.fit_transform(d))

# 32. Output?
df = pd.DataFrame({'Id': [1, 2, 3, 4], 'val': [2, 5, np.nan, 6]})
print(
    df.val == np.nan
)  # NaN never compares equal to anything, including itself, so this is always False
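
# The correct way to test for missing values is isna()/isnull() (standard pandas API):
print(df.val.isna())  # True for the NaN entry, False elsewhere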

# 33. Data stored in HDF5 format -- how to find the structure of the data?
hf.keys()
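
# A minimal sketch, assuming `hf` is an h5py.File handle (the file name here is hypothetical);
# a pandas.HDFStore exposes .keys() in the same spirit.
import h5py
with h5py.File('data.h5', 'r') as hf:
    print(list(hf.keys()))  # top-level group/dataset names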

# 34.
reviews = [
    'movie is unwatchable no matter how decent the first half is  . ',
    'somewhat funny and well  paced action thriller that has jamie foxx as a hapless  fast  talking hoodlum who is chosen by an overly demanding',
    'morse is okay as the agent who comes up with the ingenious plan to get whoever did it at all cost .'
]

counts = Counter()
# Replacing the contents of a column
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(4,3), index=range(1,5),
                  columns=list('ABC'))

df['A'] = ['dog', 'cat', 'bard', 'man']

# Replacing multiple strings == replace() makes a copy, producing a separate DataFrame!
df_copy = df.replace({'dog': 'DOGMAN','cat':'DOGMAN'})
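
# replace() returns a new DataFrame, so the original column is untouched:
print(df['A'].tolist())       # ['dog', 'cat', 'bard', 'man']
print(df_copy['A'].tolist())  # ['DOGMAN', 'DOGMAN', 'bard', 'man']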

Example #6
# consider the case where y = n * x, so the relation between x and y is a simple scaling.

n = 1  # amount of correlation
x = np.random.uniform(
    1, 2, 1000)  # drawing 1000 samples from a uniform random variable

y = x.copy() * n  # here I make y = n * x

# centring of data for PCA to make it work better.

x = x - np.mean(x)  # centre the x and remove its mean
y = y - np.mean(y)  # centre y and remove its mean.

# create dataframe with x and y

data = pd.DataFrame({'x': x, 'y': y})

# plot the original data

plt.scatter(data.x, data.y)

# initiate the PCA and choose 2 output variables

pca = PCA(n_components=2)

# create the transformation model for this data to rotate it (using rotation matrices)

pcaTr = pca.fit(data)
rotatedData = pcaTr.transform(
    data)  # transform the data based on the rotation matrix of pcaTr
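
# Because y is an exact multiple of x, essentially all variance should land on the first
# principal component; a quick sanity check using standard sklearn PCA attributes:
print(pcaTr.explained_variance_ratio_)  # expected to be roughly [1.0, 0.0]
print(pcaTr.components_)                # the rotation (eigenvector) matrix used above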
Example #7
marathon_2017['25K'] = marathon_2017['25K'].astype('m8[s]').astype(np.int64)
marathon_2017['30K'] = marathon_2017['30K'].astype('m8[s]').astype(np.int64)
marathon_2017['35K'] = marathon_2017['35K'].astype('m8[s]').astype(np.int64)
marathon_2017['40K'] = marathon_2017['40K'].astype('m8[s]').astype(np.int64)

# Define function name to_seconds
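# The helper itself is not shown in this snippet; a plausible sketch, assuming the split
# times arrive as 'H:MM:SS' strings (purely illustrative):
def to_seconds(time_str):
    h, m, s = (int(part) for part in time_str.split(':'))
    return h * 3600 + m * 60 + s
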
check_time = 7200
Lat = 0
Long = 0
Location = ''
points = [[42.247835, -71.474357], [42.274032, -71.423979],
          [42.282364, -71.364801], [42.297870, -71.284260],
          [42.324830, -71.259660], [42.345680, -71.215169],
          [42.352089, -71.124947], [42.351510, -71.086980]]

marathon_location = pd.DataFrame(columns=['Lat', 'Long'])
for index, record in marathon_2017.iterrows():
    if (record['40K'] < check_time):
        Lat = points[7][0]
        Long = points[7][1]
    elif (record['35K'] < check_time):
        Lat = points[6][0]
        Long = points[6][1]
    elif (record['30K'] < check_time):
        Lat = points[5][0]
        Long = points[5][1]
    elif (record['25K'] < check_time):
        Lat = points[4][0]
        Long = points[4][1]
    elif (record['20K'] < check_time):
        Lat = points[3][0]
Example #8
studentgender = pandas.Series(gender)
print(studentgender)

chocolate = pandas.Series(chocolate)
print(chocolate)

#DataFrame
chocolatedata = chocolates
chocolatesdf = pandas.DataFrame(chocolatedata)
   
students ={"steve": 32, "Lia":28, "vin":45, "katie":38}
studentsdata = [students]
print(studentsdata)

studentsdf = pandas.DataFrame(studentsdata, index=["age"])


studentlist =[[ "steve", 32,"male"], ["lia", 28,"female"], ["vin",45, "male"], ["katie", 38, "female"]]
studentlistdf = pandas.DataFrame(studentlist,columns=["name","age", "gender"],index= ["1","2","3","4"])
print(studentlistdf)

print(studentinfo)
print(studentgender)

studentdf1 = [studentinfo,studentgender]
print(studentdf1)

studentdf2 = pandas.DataFrame(studentdf1,index=["age","gender"])
print(studentdf2)
## taking sample from training and test train
training = training.sample(2000, random_state=19)
testing = test.sample(2000, random_state=17)

## combining random samples
combi = training.append(testing)
y = combi['origin']
combi.drop('origin',axis=1,inplace=True)


## modelling
model = RandomForestClassifier(n_estimators = 50, max_depth = 5,min_samples_leaf = 5)

drop_list = []
for i in combi.columns:
    score = cross_val_score(model, pd.DataFrame(combi[i]), y, cv=10, scoring='roc_auc')
    if (np.mean(score) > 0.8):
        drop_list.append(i)
    print(i, np.mean(score))
# -- No drifting features
# All features are important and train and test can be analyzed together as they come from the same distribution



##############################################################################################################

## Appending the original train and test datasets, replacing NAs and treating outliers using the aforementioned logic

train['source']= 'train'
test['source'] = 'test'
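
# A minimal sketch of the append step described above (the NA replacement and outlier
# treatment mentioned in the comment are not reproduced here):
data = pd.concat([train, test], ignore_index=True, sort=False)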
Example #10
def munge_data(messy_table):
    print('grouping trips...')
    trip_i1 = 0
    trips = []
    cur_announced_and_arrived = pd.DataFrame(columns=[
        'datetime', 'station_id', 'projection', 'trip_id', 'train_id',
        'line_id'
    ])
    #go through every station
    for st in const.stations:
        trip_id = 1
        #go through all rows
        for i in range(messy_table.shape[0]):
            #first train
            if i == 0:
                train = messy_table.ix[i][st + '_Time1_Train']
                trip_i1 = i
            # new train
            if train != messy_table.ix[i][st + '_Time1_Train']:
                announced_report = ()
                arrived_report = ()
                excep = False
                #go through all rows in current trip
                for j in [trip_i1, i - 1]:
                    try:
                        est_secs = int(messy_table.ix[j][st + '_Time1_Est'])
                        m, s = divmod(est_secs, 60)
                        h, m = divmod(m, 60)
                        eta = str(h) + ":" + str(m) + ":" + str(s)

                        report = pd.DataFrame(
                            {
                                'datetime':
                                datetime.strptime(
                                    messy_table.ix[j]['dateadded'],
                                    '%Y-%m-%d %H:%M:%S'),
                                'station_id':
                                st,
                                'projection':
                                eta,
                                'trip_id':
                                trip_id,
                                'train_id':
                                str(messy_table.ix[j][st + '_Time1_Train']),
                                'line_id':
                                messy_table.ix[j][st + '_Time1_LineID']
                            },
                            index=[0])
                        if j == trip_i1:
                            announced_report = report
                        elif j == i - 1:
                            arrived_report = report
                    except:
                        excep = True
                        print("bad time " +
                              str(messy_table.ix[j][st + '_Time1_Est']))
                #insert only good trips
                if not excep and len(announced_report) == 6 and len(
                        arrived_report) == 6:
                    x = announced_report.ix[0]['projection'].split(":")
                    an_proj = int(x[0]) + 60 * int(x[1]) + int(x[2])
                    x = arrived_report.ix[0]['projection'].split(":")
                    arr = int(x[0]) + 60 * int(x[1]) + int(x[2])
                    if an_proj > 0 and arr == 0 and an_proj > arr:
                        cur_announced_and_arrived = cur_announced_and_arrived.append(
                            announced_report)
                        cur_announced_and_arrived = cur_announced_and_arrived.append(
                            arrived_report)
                        trips.append(announced_report)
                        trips.append(arrived_report)
                        trip_id += 1
                    excep = False

                #go to next trip
                train = messy_table.ix[i][st + '_Time1_Train']
                trip_i1 = i

    print("done")
    return cur_announced_and_arrived
os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/train')

r = pd.read_csv('train.tsv', sep = '\t',names = ['price', 'isNew','rooms', 'floor', 'location', 'sqrMetres'])

reg = linear_model.LinearRegression()

r.head()
r.corr()
sns.set(style='whitegrid', context='notebook')

c=['price', 'isNew', 'rooms', 'floor', 'sqrMetres']
sns.pairplot(r[c], size=3)
plt.show()

reg.fit(pd.DataFrame(r, columns=['sqrMetres', 'floor', 'rooms', 'isNew']), r['price'])

os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/dev-0')
r2 = pd.read_csv('in.tsv', sep = '\t',names = ['isNew','rooms', 'floor', 'location', 'sqrMetres'])
x_d=pd.DataFrame(r2,columns=['sqrMetres', 'floor', 'rooms', 'isNew'])
y_d=reg.predict(x_d)
y_d=pd.Series(y_d)
y_d.to_csv('out.tsv', sep='\t', header=False, index=False)

os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/test-A')

r3 = pd.read_csv('in.tsv', sep = '\t',names = ['isNew','rooms', 'floor', 'location', 'sqrMetres'])
x_d2=pd.DataFrame(r3,columns=['sqrMetres', 'floor', 'rooms', 'isNew'])
y_d2=reg.predict(x_d2)
y_d2=pd.Series(y_d2)
y_d2.to_csv('out.tsv', sep='\t', header=False, index=False)
#        ps1[idx] = np.expm1(model_lassoC1.predict(it))
#    else:
#        ps1[idx] = np.expm1(model_lasso.predict(it))
#    idx = idx + 1

#X_test["s1Y"] = ps1

p = ps1 + clf.predict(X_test)  # + clf1.predict(X_test)

idx = 0
for index, it in X_test.iterrows():
    if it['OverallQual'] == 1.25:
        p[idx] = ps1[idx]
    idx = idx + 1

#solution = pd.DataFrame({"id":test.Id, "SalePrice":p}, columns=['id', 'SalePrice'])
solution = pd.DataFrame({"id": p, "SalePrice": test.Id})
solution.to_csv("lasso_sol22_Median.csv", index=False)

#p_pred = np.expm1(model_lasso.predict(xtrain))
p_pred = (
    np.expm1(model_lasso.predict(xtrain))
    #+ np.expm1(model_lasso_1.predict(xtrain)) + np.expm1(model_lasso_2.predict(xtrain))
    #+ np.expm1(model_lasso_3.predict(xtrain)) + np.expm1(model_lasso_4.predict(xtrain))
    # + np.expm1(model_lasso_5.predict(xtrain)) + np.expm1(model_lasso_6.predict(xtrain))
) / 1

lasso_pred = p_pred

lasso = pd.DataFrame(data=lasso_pred)
print(lasso_pred)
def main():
    filepath = "Data/Master_30_4.csv"
    # there are 835 assessors, each evaluated approximately 66 recordings
    
    #preparing the data
    DF_x,DF_y = splitXY(filepath)
    DF_x = DF_x.fillna(-1)    
    DF_y.isnull().sum() #only 5 missing values
    DF_y = TransformTarget(DF_y)  
    DF_train_x,DF_test_x,DF_train_y,DF_test_y = SplitTrainTest(DF_x,DF_y,train_percent = 0.75)
    
    #setting the hyperparameters to test
    nEstimators = [10,100,200,500,1000]  
    C = [10**x for x in range(-5,5)]
    
    #cross validation for different classifiers
    GBC_func = []
    for i in nEstimators:
        GBC_func.append(wrapper(GradientBoostingClassifier,n_estimators=i,learning_rate = 0.1))
    GBC_Score = CrossVal(DF_train_x,DF_train_y,GBC_func,k=3)
    
    RF_func = []    
    for i in nEstimators:
        RF_func.append(wrapper(RandomForestClassifier,n_estimators=i))
    RF_Score = CrossVal(DF_train_x,DF_train_y,RF_func,k=3)
    
    Logit_func = []
    for i in C:
        Logit_func.append(wrapper(LogisticRegression,C=i))
    Logit_Score = CrossVal(DF_train_x,DF_train_y,Logit_func,k=3)
    
    SVM_func = []
    for i in C:
        SVM_func.append(wrapper(SVC, C=i, kernel='poly', degree = 2))
    SVM_Score = CrossVal(DF_train_x,DF_train_y,SVM_func,k=3) 
    
    target_cols = list(DF_test_y.columns)
    
    #plots for the cross validation results
    Logit_score_avg = np.mean(Logit_Score, axis=1)
    position = 231    
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(C),Logit_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/logit_plots_5.5.15.png")
    
    RF_score_avg = np.mean(RF_Score, axis=1)
    position = 231    
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(nEstimators),RF_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/rf_plots_5.5.15.png")
    
    GBC_score_avg = np.mean(GBC_Score, axis=1)
    position = 231    
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(nEstimators),GBC_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/GBC_plots_5.5.15.png")
    
    SVM_score_avg = np.mean(SVM_Score, axis=1)
    position = 231    
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(C),SVM_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/SVM_plots_5.5.15.png")
    
    
    #getting feature importance
    train_x, train_y,test_x, test_y = TrainTestClean(DF_train_x,DF_train_y,DF_test_x,DF_test_y)
    columns = train_x.columns.values.tolist()
    
    GBC_clf = GradientBoostingClassifier(n_estimators=500,learning_rate = 0.1)  
    GBC_clf.fit(train_x,train_y[[0]].squeeze())
    GBC_feature_importance= pd.DataFrame()
    GBC_feature_importance['features']=list(train_x.columns)
    GBC_feature_importance['importance'] = GBC_clf.feature_importances_
    GBC_feature_importance['importance'] = GBC_feature_importance['importance'] / max(GBC_feature_importance['importance'])
    GBC_feature_importance = GBC_feature_importance.sort('importance',ascending=False)
    
    RF_clf = RandomForestClassifier(n_estimators=500)
    RF_clf.fit(train_x,train_y[[0]].squeeze())
    RF_feature_importance = pd.DataFrame()
    RF_feature_importance['features'] = list(train_x.columns)
    RF_feature_importance['importance'] = RF_clf.feature_importances_
    RF_feature_importance['importance'] = RF_feature_importance['importance'] / max(RF_feature_importance['importance'])
    RF_feature_importance = RF_feature_importance.sort('importance',ascending=False)    
    
    feature_importance = pd.DataFrame()
    feature_importance['features'] = list(train_x.columns)[2:156]
    target_labels = train_y.columns.values.tolist()  
    
    #RF_models = fitModels(RF_clf,train_x[columns[2:156]],train_y)
    #for i in range(len(RF_models)):
     #   feature_importance[target_labels[i]] = RF_models[i].feature_importances_
      #  feature_importance[target_labels[i]] = feature_importance[target_labels[i]]/max(feature_importance[target_labels[i]])
    
    #RF_testScores = testScore(RF_models,test_x[columns[2:156]],test_y)
    
    #feature importance for logistic regression
    logit_ind_scores = np.zeros((len(columns),len(target_labels)))
    Logit_clf = LogisticRegression(C = 0.01)
    for i in range(len(target_labels)):  
        for j in range(len(columns)):
            #cols = [col for col in train_x.columns if col not in [columns[j]]]
            Logit_clf.fit(train_x[columns[j:j+1]],train_y[target_labels[i]].squeeze())
            temp_score = Logit_clf.score(test_x[columns[j:j+1]],test_y[target_labels[i]].squeeze())
            print "target variable " + target_labels[i] + " without " + columns[j] + ": " + str(temp_score)
            logit_ind_scores[j][i] = temp_score
    
    
    np.savetxt("figures/logit_lasso_xval_score_5.12.15.csv",Logit_lasso_score_avg)
    np.savetxt("figures/logit_scores_features_importance_5.12.15.csv",logit_scores_coef)
    np.savetxt("figures/logit_scores_importance_5.7.15.csv",scores)
    
    #scores for logit with all features
    actual_scores = np.zeros(len(target_labels))
    for i in range(len(target_labels)):
        Logit_clf.fit(train_x,train_y[target_labels[i]].squeeze())
        actual_scores[i]=Logit_clf.score(test_x,test_y[target_labels[i]].squeeze())
    
    #testing without the time features
    unwanted_features = ['AssignmentDurationInSeconds', 'WorkTimeInSeconds']
    DF_train_x_2 = DF_train_x.drop(unwanted_features,axis=1)
    
    RF_func = []    
    for i in nEstimators:
        RF_func.append(wrapper(RandomForestClassifier,n_estimators=i))
    RF_Score_2 = CrossVal(DF_train_x_2,DF_train_y,RF_func,k=2)
    
    Logit_func = []
    for i in C:
        Logit_func.append(wrapper(LogisticRegression,C=i))
    Logit_Score_2 = CrossVal(DF_train_x_2,DF_train_y,Logit_func,k=2)

    #running the algorithms with only audio data, without audio data, with everything
    Audio_Features = columns[65:]
    GBC_audio_scores_avg = np.zeros((3,len(target_labels)))
    GBC = []
    GBC.append(GradientBoostingClassifier(n_estimators=500,learning_rate = 0.1))
    GBC_Scores_all = CrossVal(DF_train_x,DF_train_y,GBC,k=3)
    DF_train_x_audio = pd.concat([DF_train_x.ix[:,['WorkerId']],DF_train_x[Audio_Features]],axis=1,join_axes=[DF_train_x.index])
    GBC_Scores_audio = CrossVal(DF_train_x_audio,DF_train_y,GBC,k=3)
    GBC_Scores_noaudio = CrossVal(DF_train_x.drop(Audio_Features,axis=1),DF_train_y,GBC,k=3)
    GBC_audio_scores_avg[0] = np.mean(GBC_Scores_all, axis=1).ravel()
    GBC_audio_scores_avg[1] = np.mean(GBC_Scores_audio, axis=1).ravel()
    GBC_audio_scores_avg[2] = np.mean(GBC_Scores_noaudio, axis=1).ravel()
    GBC_labels = ["all","only_audio","no_audio"]
      
    fig = plt.figure()
    for i in range(GBC_audio_scores_avg.shape[0]):
        ax = fig.add_subplot(111)
        ax.plot(range(len(target_labels)),GBC_audio_scores_avg[i],label = GBC_labels[i])
        plt.xticks(range(len(target_labels)), target_labels, size="small")
        ax.set_autoscaley_on(True)
    plt.tight_layout()
    plt.legend(loc=4)
    fig.savefig("figures/GBC_audio_scores_5.5.15.png")
    
    GBC_fits = []
    GBC_feature_importance= pd.DataFrame()
    GBC_feature_importance['features']=train_x.columns.values.tolist()
    for i in range(len(target_labels)):    
        GBC_clf.fit(train_x,train_y[[i]].squeeze())
        GBC_fits.append(GBC_clf)
        col_name = target_labels[i] + " importance"
        GBC_feature_importance[col_name] = GBC_clf.feature_importances_
        GBC_feature_importance[col_name] = GBC_feature_importance[col_name] / max(GBC_feature_importance[col_name])
        print col_name + " completed"
        
    for i in range(len(target_labels)):
        col_name = target_labels[i] + " importance"
        GBC_feature_importance = GBC_feature_importance.sort(col_name,ascending=False)
        print "top features for " + col_name
        print GBC_feature_importance[[0,i+1]].head(20)
        print "bottom features for " + col_name
        print GBC_feature_importance[[0,i+1]].tail(20)
        
    GBC_feature_importance.to_csv("figures/GBC_feature_importance.csv")
    
    
    lasso_opt_c = np.array([0.01,0.1,0.1,1,0.01,1])
    
    lasso_opt_model = []
    lasso_feature_coef = pd.DataFrame()
    lasso_feature_coef['features'] = columns
    for i in lasso_opt_c:
        lasso_opt_model.append(wrapper(LogisticRegression,C=i,penalty='l1'))
    for i in range(len(lasso_opt_model)):
        lasso_opt_model[i].fit(train_x,train_y[[i]].squeeze())
    
    for i, model in enumerate(lasso_opt_model):
        lasso_feature_coef[target_labels[i]] = model.coef_.ravel()
    
    lasso_feature_coef.to_csv("figures/lasso_feature_coef.5.13.15.csv")
    
    #running on test
    logit_opt_model = []
    train_x,train_y,test_x,test_y = TrainTestClean(DF_train_x,DF_train_y,DF_test_x,DF_test_y)
    logit_opt_c = np.array([0.01,0.1,0.1,1,0.01,1])  #change the parameters based on xval results
    for i in logit_opt_c:
        logit_opt_model.append(wrapper(LogisticRegression,C=i))
    test_logit_scores = testScore(logit_opt_model,test_x,test_y)
    
    #running GBC with optimized features
    GBC_opt_model = []    
    GBC_opt_est = np.array([500,200,500,500,100,500])
    GBC_opt_score = np.zeros(6)
    for i in GBC_opt_est:
        GBC_opt_model.append(wrapper(GradientBoostingClassifier,n_estimators=i,learning_rate = 0.1))
    for i in range(len(GBC_opt_model)):
        drop_cols = lasso_feature_coef['features'].where(lasso_feature_coef[target_labels[i]] == 0)
        opt_cols = [col for col in columns if col not in drop_cols]
        nonzero_train_x = train_x[opt_cols]
        nonzero_test_x = test_x[opt_cols]
        
        GBC_opt_model[i].fit(nonzero_train_x,train_y[[i]].squeeze())
        GBC_opt_score[i] = GBC_opt_model[i].score(nonzero_test_x,test_y[[i]].squeeze())
    
    SVM_func=[]
    SVM_score = np.zeros(6)
    for i in logit_opt_c:
         
        SVM_func.append(wrapper(SVC, C=i, kernel='poly', degree = 2))
    for i in range(len(SVM_func)):
        drop_cols = lasso_feature_coef['features'].where(lasso_feature_coef[target_labels[i]] == 0)
        opt_cols = [col for col in columns if col not in drop_cols]
        nonzero_train_x = train_x[opt_cols]
        nonzero_test_x = test_x[opt_cols]
        
        SVM_func[i].fit(nonzero_train_x,train_y[[i]].squeeze())
        SVM_score[i] = SVM_func[i].score(nonzero_test_x,test_y[[i]].squeeze())
        print SVM_score[i]
Example #14
df = pd.DataFrame(data)

# Starting from pandas Series
listado_lenguajes = ['Python','C#','Java']
d = {'Lenguaje': pd.Series(['Sin compilar','Compilado','Compilado'], index=listado_lenguajes),
    'Dificultad': pd.Series(['Media','Alta','Muy Alta'], index=listado_lenguajes)}
# If the indices did not match, what happens is a merge: every label appears in the DataFrame's overall index, and if a Series
# does not include a given index label, the value for that label in that column shows up as NaN.
df = pd.DataFrame(d)
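
# Quick illustration of the index alignment described above (hypothetical mini-example):
s1 = pd.Series([1, 2], index=['a', 'b'])
s2 = pd.Series([3, 4], index=['b', 'c'])
print(pd.DataFrame({'col1': s1, 'col2': s2}))  # rows 'a' and 'c' get NaN in the missing column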

# Up to here we have created DataFrames in several different ways. Remember that values imported from a csv with read_csv or from Excel with read_excel,
# as well as values imported via SQL with pyodbc, are shown and interpreted by Python as a DataFrame; we can also build a DataFrame from
# a csv keeping only the data we want, by creating a reader and selecting the rows (see the sketch right below).
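
# A sketch of the reader-based approach mentioned above (file name and row filter are hypothetical):
import csv
with open('lenguajes.csv', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    rows = [row for row in reader if row]  # keep only the rows we are interested in
df_desde_csv = pd.DataFrame(rows, columns=header)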

# Creating a DataFrame with selected columns -- in this case only Lenguaje and Dificultad
df = pd.DataFrame(data, columns=['Lenguaje','Dificultad'])

# We can create new columns as follows.
df['Experiencia'] = 'variable asignar' # the whole column is created with the same value; it can also be made conditional on another column of the DataFrame
df['Rentabilidad'] = df['Salario'] > 35000 # every row meeting this criterion in the Salario column is marked True in the new Rentabilidad column.


# Deleting data from a DataFrame.
df.pop('Nombre Columna Borrar') # --> it can also be stored in a variable: columna_borrada = df.pop('Nombre Columna Borrar')
del df['Nombre columna borrar'] # does the same thing: deletes a column.

# Inserting a column.
salarios = pd.Series([600000,350000,400000]) # --> this column will be inserted into our DataFrame.
df.insert(2,'Bien Pagado', salarios)

# Sorting a DataFrame by index or by column value, for example:
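# (illustrative sketch; the column name follows the example above)
df_por_indice = df.sort_index()                 # sort by index labels
df_por_valor = df.sort_values(by='Dificultad')  # sort by the values of a column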
Example #15
   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)


bos = pd.dataframe(boston.data)
Traceback (most recent call last):

  File "<ipython-input-25-e72d214f7ed6>", line 1, in <module>
    bos = pd.dataframe(boston.data)

AttributeError: module 'pandas' has no attribute 'dataframe'


bos = pd.dataFrame(boston.data)
Traceback (most recent call last):

  File "<ipython-input-26-c36d6e58d4a3>", line 1, in <module>
    bos = pd.dataFrame(boston.data)

AttributeError: module 'pandas' has no attribute 'dataFrame'


bos = pd.DataFrame(boston.data)

bos.head()
Out[28]: 
        0     1     2    3      4      5     6       7    8      9     10  \
0  0.00632  18.0  2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
1  0.02731   0.0  7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
Example #16
import numpy as np
import pandas as pd

gis = open("teste.txt", 'rb')
matriz = []

noData = -9999

i = 0
for lin in gis:
    if (i > 5):
        matriz.append([int(float(x)) for x in lin.split()])
    else:
        i += 1

matriz = np.array(matriz)
m = pd.DataFrame(matriz)

for r in range(matriz.shape[0]):
    for c in range(matriz.shape[1]):
        if (m.iloc[r, c]) == noData:
            pass
        else:
            pass
Example #17
"""
Example dataframe insertion into the MongoDB database
"""

from DB_Interface import DB_Interface
import pandas as pd

password = "******"
dbname = "chatbot_training_db"
ex_dict = {'a': 1, 'b': 2, 'c': 3}
example_df = pd.DataFrame(ex_dict, index=[0])  # an all-scalar dict needs an explicit index

interface = DB_Interface("mongodb+srv://user-main:" + password +
                         "@cluster-idchannel.k6c0f.mongodb.net/" + dbname +
                         "?retryWrites=true&w=majority")
interface.add_to_db(dbname, example_df)
Example #18
def features_for_new(X, train, rank = 6):
    X_f = pd.DataFrame()
    X_f[feat_c] = train.columns[-rank:]
    pass
Example #19
# Brics countries 
# Brazil, Russia, India, China, South Africa

import pandas as pd
dict = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98]
    }

brics = pd.DataFrame(dict)
brics.index = ["BR", "RU", "IN", "CH", "SA"]

#or 
brics = pd.read_csv("path/to/brics.csv", index_col = 0)

type(brics["country"]) # series
type(brick[["country"]]) dataframe

brick[["country", "capital"]] # column access

brics[1:4] # row access only through slicing, not a single index

# loc - label-based
# iloc - integer position-based

brics.loc["RU"] # returns as a column
brics.loc[["RU"]] # returns as a row

brics.loc[["RU", "IN", "CH"], ["country", "capital"]]