def decision_tree(cls, df, length, neighbour_details):
    df = pd.read_csv('tile_features.csv')
    length = len(df)
    #print(df.head())
    #df.loc[len(df)] = neighbour_details
    df2 = pd.DataFrame(neighbour_details, columns=['A', 'B', 'C', 'D', 'E'])
    df = pd.concat([df, df2], ignore_index=True)
    # append only the new rows to the file; writing the full df with mode='a'
    # would duplicate the rows already stored in tile_features.csv
    df2.to_csv('tile_features.csv', mode='a', header=False)
def get_binned_data(df, bin_count=10):
    v_max, v_min = df.max(), df.min()
    # note: dividing the range by bin_count + 1 leaves values near the maximum
    # outside the last bin edge, so they come out as NaN from pd.cut
    bins = [(v_max - v_min) / (bin_count + 1) * i + v_min for i in range(bin_count + 1)]
    labels = ["{0} {1:.1f}".format(i, (v_max - v_min) / (bin_count + 1) * (i + 0.5) + v_min)
              for i in range(bin_count)]
    categories = pd.cut(df, bins, labels=labels)
    #print(categories)
    print(df)
    print(pd.value_counts(categories))
    ret_df = pd.DataFrame()
    ret_df.index = labels
    ret_df['count'] = pd.value_counts(categories)
    return ret_df
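# A minimal usage sketch for get_binned_data (assumes pandas/numpy are imported
# as pd/np): bin a random numeric Series into 5 labelled intervals.
s = pd.Series(np.random.normal(50, 10, 500))
binned = get_binned_data(s, bin_count=5)
print(binned)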
def fitDataToPandas(input):
    output = []
    target = []
    for element in input:
        current = [element.math, element.lit, element.read,
                   element.high, element.music, element.polits]
        target.append(element.lecpref)
        output.append(current)
    return (pd.DataFrame(output), np.array(target))
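# A hedged usage sketch: the record type below is hypothetical (a namedtuple with
# the attributes the function reads); it only illustrates feeding the returned
# (X, y) pair into a scikit-learn classifier. Assumes pd/np are already imported.
from collections import namedtuple
from sklearn.ensemble import RandomForestClassifier

Student = namedtuple('Student', ['math', 'lit', 'read', 'high', 'music', 'polits', 'lecpref'])
records = [Student(3, 4, 5, 2, 1, 4, 'A'), Student(5, 2, 3, 4, 2, 1, 'B')]

X, y = fitDataToPandas(records)
clf = RandomForestClassifier(n_estimators=10).fit(X, y)
print(clf.predict(X))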
<link rel="fluid-icon" href="https://github.com/fluidicon.png" title="GitHub">
<meta property="fb:app_id" content="1401488693436528">
<link rel="assets" href="https://assets-cdn.github.com/">
...
"""
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title.string)

# 31. Given a DataFrame, apply a label encoder
d = ['A', 'B', 'C', 'D', 'E', 'AA', 'AB']
le = LabelEncoder()
print(le.fit_transform(d))

# 32. Output?
df = pd.DataFrame({'Id': [1, 2, 3, 4], 'val': [2, 5, np.nan, 6]})
print(df.val == np.nan)
# np.nan never compares equal to anything, including itself, so this is always False
# (the correct checks are shown after this block)

# 33. Stored in HDF5 format, how do you find the structure of the data?
hf.keys()

# 34.
reviews = [
    'movie is unwatchable no matter how decent the first half is . ',
    'somewhat funny and well paced action thriller that has jamie foxx as a hapless fast talking hoodlum who is chosen by an overly demanding',
    'morse is okay as the agent who comes up with the ingenious plan to get whoever did it at all cost .'
]
counts = Counter()
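# Follow-up to item 32: the standard pandas/numpy calls that do detect the missing value.
print(df['val'].isna())        # element-wise True where the value is NaN
print(df['val'].isna().sum())  # count of missing values
print(np.isnan(df['val']))     # numpy equivalent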
# Replacing the contents of a column
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(4, 3), index=range(1, 5),
                  columns=list('ABC'))
df['A'] = ['dog', 'cat', 'bard', 'man']

# Replacing several strings at once == replace() returns a copy, i.e. a separate DataFrame! ==
df_copy = df.replace({'dog': 'DOGMAN', 'cat': 'DOGMAN'})
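# A small extension sketch (standard pandas API, not part of the original snippet):
# replace() also accepts regular expressions, so any value starting with "b" can be
# rewritten in one call.
df_regex = df.replace(to_replace=r'^b.*', value='B-ANIMAL', regex=True)
print(df_regex)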
# Consider the case where y = n * x, so the relation between x and y is linear (perfectly correlated).
n = 1  # amount of correlation
x = np.random.uniform(1, 2, 1000)  # 1000 samples from a uniform random variable
y = x.copy() * n  # here y = n * x

# Centring the data helps PCA work better.
x = x - np.mean(x)  # centre x by removing its mean
y = y - np.mean(y)  # centre y by removing its mean

# create a dataframe with x and y
data = pd.DataFrame({'x': x, 'y': y})

# plot the original (correlated) data
plt.scatter(data.x, data.y)

# initialise PCA and choose 2 output components
pca = PCA(n_components=2)

# fit the transformation model for this data (internally a rotation matrix)
pcaTr = pca.fit(data)
rotatedData = pcaTr.transform(data)  # transform the data using the rotation found by pcaTr
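# A quick inspection sketch using standard scikit-learn attributes: the fitted
# components are the rotation axes, and since y = x essentially all of the
# variance should fall on the first component.
print(pcaTr.components_)                # rows are the principal axes
print(pcaTr.explained_variance_ratio_)  # expect roughly [1.0, 0.0] here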
marathon_2017['25K'] = marathon_2017['25K'].astype('m8[s]').astype(np.int64)
marathon_2017['30K'] = marathon_2017['30K'].astype('m8[s]').astype(np.int64)
marathon_2017['35K'] = marathon_2017['35K'].astype('m8[s]').astype(np.int64)
marathon_2017['40K'] = marathon_2017['40K'].astype('m8[s]').astype(np.int64)

# Define function named to_seconds (see the sketch after this block)
check_time = 7200
Lat = 0
Long = 0
Location = ''
points = [[42.247835, -71.474357], [42.274032, -71.423979], [42.282364, -71.364801],
          [42.297870, -71.284260], [42.324830, -71.259660], [42.345680, -71.215169],
          [42.352089, -71.124947], [42.351510, -71.086980]]
marathon_location = pd.DataFrame(columns=['Lat', 'Long'])
for index, record in marathon_2017.iterrows():
    if record['40K'] < check_time:
        Lat = points[7][0]
        Long = points[7][1]
    elif record['35K'] < check_time:
        Lat = points[6][0]
        Long = points[6][1]
    elif record['30K'] < check_time:
        Lat = points[5][0]
        Long = points[5][1]
    elif record['25K'] < check_time:
        Lat = points[4][0]
        Long = points[4][1]
    elif record['20K'] < check_time:
        Lat = points[3][0]
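# The comment above names a to_seconds helper, but no definition follows in this
# snippet. A minimal sketch of what such a helper might look like (an assumption,
# not the original author's code), converting an "H:MM:SS" string to seconds:
def to_seconds(time_str):
    h, m, s = (int(part) for part in time_str.split(':'))
    return h * 3600 + m * 60 + s

print(to_seconds('2:05:30'))  # 7530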
studentgender = pandas.Series(gender)
print(studentgender)
chocolate = pandas.Series(chocolate)
print(chocolate)

# DataFrame
chocolatedata = [chocolates]
chocolatesdf = pandas.DataFrame(chocolatedata)

students = {"steve": 32, "Lia": 28, "vin": 45, "katie": 38}
studentsdata = [students]
print(studentsdata)
studentsdf = pandas.DataFrame(studentsdata, index=["age"])

studentlist = [["steve", 32, "male"],
               ["lia", 28, "female"],
               ["vin", 45, "male"],
               ["katie", 38, "female"]]
studentlistdf = pandas.DataFrame(studentlist, columns=["name", "age", "gender"],
                                 index=["1", "2", "3", "4"])
print(studentlistdf)

print(studentinfo)
print(studentgender)
studentdf1 = [studentinfo, studentgender]
print(studentdf1)
studentdf2 = pandas.DataFrame(studentdf1, index=["age", "gender"])
print(studentdf2)
## taking samples from the train and test sets
training = training.sample(2000, random_state=19)
testing = test.sample(2000, random_state=17)

## combining the random samples
combi = training.append(testing)
y = combi['origin']
combi.drop('origin', axis=1, inplace=True)

## modelling
model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=5)
drop_list = []
for i in combi.columns:
    score = cross_val_score(model, pd.DataFrame(combi[i]), y, cv=10, scoring='roc_auc')
    if np.mean(score) > 0.8:
        drop_list.append(i)
        print(i, np.mean(score))

# -- No drifting features
# All features are important; train and test can be analysed together as they come from the same distribution
##############################################################################################################

## Appending the original train and test datasets, replacing NAs and treating outliers using the aforementioned logic
train['source'] = 'train'
test['source'] = 'test'
def munge_data(messy_table):
    print('grouping trips...')
    trip_i1 = 0
    trips = []
    cur_announced_and_arrived = pd.DataFrame(columns=[
        'datetime', 'station_id', 'projection', 'trip_id', 'train_id', 'line_id'
    ])
    # go through every station
    for st in const.stations:
        trip_id = 1
        # go through all rows (.ix is deprecated, so positional .iloc is used here)
        for i in range(messy_table.shape[0]):
            # first train
            if i == 0:
                train = messy_table.iloc[i][st + '_Time1_Train']
                trip_i1 = i
            # new train
            if train != messy_table.iloc[i][st + '_Time1_Train']:
                announced_report = ()
                arrived_report = ()
                excep = False
                # build a report for the first and last rows of the current trip
                for j in [trip_i1, i - 1]:
                    try:
                        est_secs = int(messy_table.iloc[j][st + '_Time1_Est'])
                        m, s = divmod(est_secs, 60)
                        h, m = divmod(m, 60)
                        eta = str(h) + ":" + str(m) + ":" + str(s)
                        report = pd.DataFrame(
                            {
                                'datetime': datetime.strptime(
                                    messy_table.iloc[j]['dateadded'],
                                    '%Y-%m-%d %H:%M:%S'),
                                'station_id': st,
                                'projection': eta,
                                'trip_id': trip_id,
                                'train_id': str(messy_table.iloc[j][st + '_Time1_Train']),
                                'line_id': messy_table.iloc[j][st + '_Time1_LineID']
                            },
                            index=[0])
                        if j == trip_i1:
                            announced_report = report
                        elif j == i - 1:
                            arrived_report = report
                    except Exception:
                        excep = True
                        print("bad time " + str(messy_table.iloc[j][st + '_Time1_Est']))
                # insert only good trips; each report is a one-row, six-column DataFrame
                if not excep and len(announced_report) == 1 and len(arrived_report) == 1:
                    x = announced_report.iloc[0]['projection'].split(":")
                    an_proj = 3600 * int(x[0]) + 60 * int(x[1]) + int(x[2])
                    x = arrived_report.iloc[0]['projection'].split(":")
                    arr = 3600 * int(x[0]) + 60 * int(x[1]) + int(x[2])
                    if an_proj > 0 and arr == 0 and an_proj > arr:
                        cur_announced_and_arrived = cur_announced_and_arrived.append(announced_report)
                        cur_announced_and_arrived = cur_announced_and_arrived.append(arrived_report)
                        trips.append(announced_report)
                        trips.append(arrived_report)
                        trip_id += 1
                excep = False
                # go to next trip
                train = messy_table.iloc[i][st + '_Time1_Train']
                trip_i1 = i
    print("done")
    return cur_announced_and_arrived
os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/train')
r = pd.read_csv('train.tsv', sep='\t', names=['price', 'isNew', 'rooms', 'floor', 'location', 'sqrMetres'])
reg = linear_model.LinearRegression()
r.head()
r.corr()
sns.set(style='whitegrid', context='notebook')
c = ['price', 'isNew', 'rooms', 'floor', 'sqrMetres']
sns.pairplot(r[c], size=3)
plt.show()
reg.fit(pd.DataFrame(r, columns=['sqrMetres', 'floor', 'rooms', 'isNew']), r['price'])

os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/dev-0')
r2 = pd.read_csv('in.tsv', sep='\t', names=['isNew', 'rooms', 'floor', 'location', 'sqrMetres'])
x_d = pd.DataFrame(r2, columns=['sqrMetres', 'floor', 'rooms', 'isNew'])
y_d = reg.predict(x_d)
y_d = pd.Series(y_d)
y_d.to_csv('out.tsv', sep='\t', header=False, index=False)

os.chdir('/home/justyna/Pulpit/uczeniem/umz-template/zajecia1/zadanie3/test-A')
r3 = pd.read_csv('in.tsv', sep='\t', names=['isNew', 'rooms', 'floor', 'location', 'sqrMetres'])
x_d2 = pd.DataFrame(r3, columns=['sqrMetres', 'floor', 'rooms', 'isNew'])
y_d2 = reg.predict(x_d2)
y_d2 = pd.Series(y_d2)
y_d2.to_csv('out.tsv', sep='\t', header=False, index=False)
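# A quick sanity check of the fitted model, using standard scikit-learn attributes
# (not part of the original script): the learned coefficients, the intercept, and
# the R^2 score on the training data.
print(reg.coef_)
print(reg.intercept_)
print(reg.score(pd.DataFrame(r, columns=['sqrMetres', 'floor', 'rooms', 'isNew']), r['price']))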
#        ps1[idx] = np.expm1(model_lassoC1.predict(it))
#    else:
#        ps1[idx] = np.expm1(model_lasso.predict(it))
#    idx = idx + 1
#X_test["s1Y"] = ps1

p = ps1 + clf.predict(X_test)  # + clf1.predict(X_test)

idx = 0
for index, it in X_test.iterrows():
    if it['OverallQual'] == 1.25:
        p[idx] = ps1[idx]
    idx = idx + 1

#solution = pd.DataFrame({"id":test.Id, "SalePrice":p}, columns=['id', 'SalePrice'])
solution = pd.DataFrame({"id": test.Id, "SalePrice": p})
solution.to_csv("lasso_sol22_Median.csv", index=False)

#p_pred = np.expm1(model_lasso.predict(xtrain))
p_pred = (
    np.expm1(model_lasso.predict(xtrain))
    #+ np.expm1(model_lasso_1.predict(xtrain))
    + np.expm1(model_lasso_2.predict(xtrain))
    #+ np.expm1(model_lasso_3.predict(xtrain))
    + np.expm1(model_lasso_4.predict(xtrain))
    #+ np.expm1(model_lasso_5.predict(xtrain))
    + np.expm1(model_lasso_6.predict(xtrain))
) / 1
lasso_pred = p_pred
lasso = pd.DataFrame(data=lasso_pred)
print(lasso_pred)
def main():
    filepath = "Data/Master_30_4.csv"
    # there are 835 assessors, each evaluated approximately 66 recordings

    # preparing the data
    DF_x, DF_y = splitXY(filepath)
    DF_x = DF_x.fillna(-1)
    DF_y.isnull().sum()  # only 5 missing values
    DF_y = TransformTarget(DF_y)
    DF_train_x, DF_test_x, DF_train_y, DF_test_y = SplitTrainTest(DF_x, DF_y, train_percent=0.75)

    # setting the hyperparameters to test
    nEstimators = [10, 100, 200, 500, 1000]
    C = [10**x for x in range(-5, 5)]

    # cross validation for different classifiers
    GBC_func = []
    for i in nEstimators:
        GBC_func.append(wrapper(GradientBoostingClassifier, n_estimators=i, learning_rate=0.1))
    GBC_Score = CrossVal(DF_train_x, DF_train_y, GBC_func, k=3)

    RF_func = []
    for i in nEstimators:
        RF_func.append(wrapper(RandomForestClassifier, n_estimators=i))
    RF_Score = CrossVal(DF_train_x, DF_train_y, RF_func, k=3)

    Logit_func = []
    for i in C:
        Logit_func.append(wrapper(LogisticRegression, C=i))
    Logit_Score = CrossVal(DF_train_x, DF_train_y, Logit_func, k=3)

    SVM_func = []
    for i in C:
        SVM_func.append(wrapper(SVC, C=i, kernel='poly', degree=2))
    SVM_Score = CrossVal(DF_train_x, DF_train_y, SVM_func, k=3)

    target_cols = list(DF_test_y.columns)

    # plots for the cross validation results
    Logit_score_avg = np.mean(Logit_Score, axis=1)
    position = 231
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(C), Logit_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/logit_plots_5.5.15.png")

    RF_score_avg = np.mean(RF_Score, axis=1)
    position = 231
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(nEstimators), RF_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/rf_plots_5.5.15.png")

    GBC_score_avg = np.mean(GBC_Score, axis=1)
    position = 231
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(nEstimators), GBC_score_avg[i])
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/GBC_plots_5.5.15.png")

    SVM_score_avg = np.mean(SVM_Score, axis=1)
    position = 231
    fig = plt.figure()
    for i in range(len(target_cols)):
        ax = fig.add_subplot(position)
        ax.plot(np.log(C), SVM_score_avg[i])  # SVM was cross-validated over C, not nEstimators
        ax.set_title(target_cols[i])
        ax.set_autoscaley_on(True)
        position += 1
    plt.tight_layout()
    fig.savefig("figures/SVM_plots_5.5.15.png")

    # getting feature importance
    train_x, train_y, test_x, test_y = TrainTestClean(DF_train_x, DF_train_y, DF_test_x, DF_test_y)
    columns = train_x.columns.values.tolist()

    GBC_clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1)
    GBC_clf.fit(train_x, train_y[[0]].squeeze())
    GBC_feature_importance = pd.DataFrame()
    GBC_feature_importance['features'] = list(train_x.columns)
    GBC_feature_importance['importance'] = GBC_clf.feature_importances_
    GBC_feature_importance['importance'] = GBC_feature_importance['importance'] / max(GBC_feature_importance['importance'])
    GBC_feature_importance = GBC_feature_importance.sort_values('importance', ascending=False)

    RF_clf = RandomForestClassifier(n_estimators=500)
    RF_clf.fit(train_x, train_y[[0]].squeeze())
    RF_feature_importance = pd.DataFrame()
    RF_feature_importance['features'] = list(train_x.columns)
    RF_feature_importance['importance'] = RF_clf.feature_importances_
    RF_feature_importance['importance'] = RF_feature_importance['importance'] / max(RF_feature_importance['importance'])
    RF_feature_importance = RF_feature_importance.sort_values('importance', ascending=False)

    feature_importance = pd.DataFrame()
    feature_importance['features'] = list(train_x.columns)[2:156]
    target_labels = train_y.columns.values.tolist()

    #RF_models = fitModels(RF_clf,train_x[columns[2:156]],train_y)
    #for i in range(len(RF_models)):
    #    feature_importance[target_labels[i]] = RF_models[i].feature_importances_
    #    feature_importance[target_labels[i]] = feature_importance[target_labels[i]]/max(feature_importance[target_labels[i]])
    #RF_testScores = testScore(RF_models,test_x[columns[2:156]],test_y)

    # feature importance for logistic regression
    logit_ind_scores = np.zeros((len(columns), len(target_labels)))
    Logit_clf = LogisticRegression(C=0.01)
    for i in range(len(target_labels)):
        for j in range(len(columns)):
            #cols = [col for col in train_x.columns if col not in [columns[j]]]
            Logit_clf.fit(train_x[columns[j:j + 1]], train_y[target_labels[i]].squeeze())
            temp_score = Logit_clf.score(test_x[columns[j:j + 1]], test_y[target_labels[i]].squeeze())
            print("target variable " + target_labels[i] + " using only " + columns[j] + ": " + str(temp_score))
            logit_ind_scores[j][i] = temp_score

    np.savetxt("figures/logit_lasso_xval_score_5.12.15.csv", Logit_lasso_score_avg)
    np.savetxt("figures/logit_scores_features_importance_5.12.15.csv", logit_scores_coef)
    np.savetxt("figures/logit_scores_importance_5.7.15.csv", scores)

    # scores for logit with all features
    actual_scores = np.zeros(len(target_labels))
    for i in range(len(target_labels)):
        Logit_clf.fit(train_x, train_y[target_labels[i]].squeeze())
        actual_scores[i] = Logit_clf.score(test_x, test_y[target_labels[i]].squeeze())

    # testing without the time features
    unwanted_features = ['AssignmentDurationInSeconds', 'WorkTimeInSeconds']
    DF_train_x_2 = DF_train_x.drop(unwanted_features, axis=1)
    RF_func = []
    for i in nEstimators:
        RF_func.append(wrapper(RandomForestClassifier, n_estimators=i))
    RF_Score_2 = CrossVal(DF_train_x_2, DF_train_y, RF_func, k=2)
    Logit_func = []
    for i in C:
        Logit_func.append(wrapper(LogisticRegression, C=i))
    Logit_Score_2 = CrossVal(DF_train_x_2, DF_train_y, Logit_func, k=2)

    # running the algorithms with only audio data, without audio data, and with everything
    Audio_Features = columns[65:]
    GBC_audio_scores_avg = np.zeros((3, len(target_labels)))
    GBC = []
    GBC.append(GradientBoostingClassifier(n_estimators=500, learning_rate=0.1))
    GBC_Scores_all = CrossVal(DF_train_x, DF_train_y, GBC, k=3)
    DF_train_x_audio = pd.concat([DF_train_x.loc[:, ['WorkerId']], DF_train_x[Audio_Features]],
                                 axis=1, join_axes=[DF_train_x.index])
    GBC_Scores_audio = CrossVal(DF_train_x_audio, DF_train_y, GBC, k=3)
    GBC_Scores_noaudio = CrossVal(DF_train_x.drop(Audio_Features, axis=1), DF_train_y, GBC, k=3)
    GBC_audio_scores_avg[0] = np.mean(GBC_Scores_all, axis=1).ravel()
    GBC_audio_scores_avg[1] = np.mean(GBC_Scores_audio, axis=1).ravel()
    GBC_audio_scores_avg[2] = np.mean(GBC_Scores_noaudio, axis=1).ravel()
    GBC_labels = ["all", "only_audio", "no_audio"]
    fig = plt.figure()
    for i in range(GBC_audio_scores_avg.shape[0]):
        ax = fig.add_subplot(111)
        ax.plot(range(len(target_labels)), GBC_audio_scores_avg[i], label=GBC_labels[i])
        plt.xticks(range(len(target_labels)), target_labels, size="small")
        ax.set_autoscaley_on(True)
    plt.tight_layout()
    plt.legend(loc=4)
    fig.savefig("figures/GBC_audio_scores_5.5.15.png")

    GBC_fits = []
    GBC_feature_importance = pd.DataFrame()
    GBC_feature_importance['features'] = train_x.columns.values.tolist()
    for i in range(len(target_labels)):
        GBC_clf.fit(train_x, train_y[[i]].squeeze())
        GBC_fits.append(GBC_clf)
        col_name = target_labels[i] + " importance"
        GBC_feature_importance[col_name] = GBC_clf.feature_importances_
        GBC_feature_importance[col_name] = GBC_feature_importance[col_name] / max(GBC_feature_importance[col_name])
        print(col_name + " completed")

    for i in range(len(target_labels)):
        col_name = target_labels[i] + " importance"
        GBC_feature_importance = GBC_feature_importance.sort_values(col_name, ascending=False)
        print("top features for " + col_name)
        print(GBC_feature_importance.iloc[:, [0, i + 1]].head(20))
        print("bottom features for " + col_name)
        print(GBC_feature_importance.iloc[:, [0, i + 1]].tail(20))
    GBC_feature_importance.to_csv("figures/GBC_feature_importance.csv")

    lasso_opt_c = np.array([0.01, 0.1, 0.1, 1, 0.01, 1])
    lasso_opt_model = []
    lasso_feature_coef = pd.DataFrame()
    lasso_feature_coef['features'] = columns
    for i in lasso_opt_c:
        lasso_opt_model.append(wrapper(LogisticRegression, C=i, penalty='l1'))
    for i in range(len(lasso_opt_model)):
        lasso_opt_model[i].fit(train_x, train_y[[i]].squeeze())
        lasso_feature_coef[target_labels[i]] = lasso_opt_model[i].coef_.ravel()
    lasso_feature_coef.to_csv("figures/lasso_feature_coef.5.13.15.csv")

    # running on test
    logit_opt_model = []
    train_x, train_y, test_x, test_y = TrainTestClean(DF_train_x, DF_train_y, DF_test_x, DF_test_y)
    logit_opt_c = np.array([0.01, 0.1, 0.1, 1, 0.01, 1])  # change the parameters based on xval results
    for i in logit_opt_c:
        logit_opt_model.append(wrapper(LogisticRegression, C=i))
    test_logit_scores = testScore(logit_opt_model, test_x, test_y)

    # running GBC with optimized features
    GBC_opt_model = []
    GBC_opt_est = np.array([500, 200, 500, 500, 100, 500])
    GBC_opt_score = np.zeros(6)
    for i in GBC_opt_est:
        GBC_opt_model.append(wrapper(GradientBoostingClassifier, n_estimators=i, learning_rate=0.1))
    for i in range(len(GBC_opt_model)):
        drop_cols = lasso_feature_coef['features'].where(lasso_feature_coef[target_labels[i]] == 0)
        opt_cols = [col for col in columns if col not in drop_cols]
        nonzero_train_x = train_x[opt_cols]
        nonzero_test_x = test_x[opt_cols]
        GBC_opt_model[i].fit(nonzero_train_x, train_y[[i]].squeeze())
        GBC_opt_score[i] = GBC_opt_model[i].score(nonzero_test_x, test_y[[i]].squeeze())

    SVM_func = []
    SVM_score = np.zeros(6)
    for i in logit_opt_c:
        SVM_func.append(wrapper(SVC, C=i, kernel='poly', degree=2))
    for i in range(len(SVM_func)):
        drop_cols = lasso_feature_coef['features'].where(lasso_feature_coef[target_labels[i]] == 0)
        opt_cols = [col for col in columns if col not in drop_cols]
        nonzero_train_x = train_x[opt_cols]
        nonzero_test_x = test_x[opt_cols]
        SVM_func[i].fit(nonzero_train_x, train_y[[i]].squeeze())
        SVM_score[i] = SVM_func[i].score(nonzero_test_x, test_y[[i]].squeeze())
        print(SVM_score[i])
df = pd.DataFrame(data)

# Building a DataFrame from pandas Series
listado_lenguajes = ['Python', 'C#', 'Java']
d = {'Lenguaje': pd.Series(['Sin compilar', 'Compilado', 'Compilado'], index=listado_lenguajes),
     'Dificultad': pd.Series(['Media', 'Alta', 'Muy Alta'], index=listado_lenguajes)}
# If the indices did not match, the result is a merge: every index appears in the DataFrame's
# overall index, and if a Series does not include a given index, the value for that index in
# that column shows up as NaN.
df = pd.DataFrame(d)

# Up to this point we have created DataFrames in several ways. Remember that values imported
# from a csv with read_csv or from Excel with read_excel, as well as values imported via SQL
# with pyodbc, are presented to Python as a DataFrame. We can also build a DataFrame from a csv
# selecting only the data we want, by creating a reader and picking the rows.

# Creating a DataFrame with selected columns -- in this case only Lenguaje and Dificultad
df = pd.DataFrame(data, columns=['Lenguaje', 'Dificultad'])

# We can create new columns as follows.
df['Experiencia'] = 'variable asignar'  # the whole column is filled with the same value.
# We can also make it conditional on another column of the DataFrame:
df['Rentabilidad'] = df['Salario'] > 35000  # rows meeting the condition on Salario get True in the new Rentabilidad column.

# Deleting data from a DataFrame.
df.pop('Nombre Columna Borrar')  # --> we can store the removed column in a variable
columna_borrada = df.pop('Nombre Columna Borrar')
del df['Nombre columna borrar']  # does the same thing: deletes a column.

# Inserting a column.
salarios = pd.Series([600000, 350000, 400000])  # --> this column will be inserted into our DataFrame.
df.insert(2, 'Bien Pagado', salarios)

# Sorting a DataFrame by index or by column value (see the sketch after this block).
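# A minimal sorting sketch following on from the last comment (standard pandas API):
df_por_indice = df.sort_index()                                 # sort by index
df_por_columna = df.sort_values('Dificultad', ascending=True)   # sort by a column's values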
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings of the Tenth International Conference on Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)

bos = pd.dataframe(boston.data)
Traceback (most recent call last):
  File "<ipython-input-25-e72d214f7ed6>", line 1, in <module>
    bos = pd.dataframe(boston.data)
AttributeError: module 'pandas' has no attribute 'dataframe'

bos = pd.dataFrame(boston.data)
Traceback (most recent call last):
  File "<ipython-input-26-c36d6e58d4a3>", line 1, in <module>
    bos = pd.dataFrame(boston.data)
AttributeError: module 'pandas' has no attribute 'dataFrame'

bos = pd.DataFrame(boston.data)
bos.head()
Out[28]:
         0     1     2    3      4      5     6       7    8      9    10  \
0  0.00632  18.0  2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3
1  0.02731   0.0  7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8
import numpy as np
import pandas as pd

# Read an ASCII grid file, skipping the 6 header lines.
gis = open("teste.txt", 'rb')
matriz = []
noData = -9999
i = 0
for lin in gis:
    if i > 5:
        matriz.append([int(float(x)) for x in lin.split()])
    else:
        i += 1

matriz = np.array(matriz)
m = pd.DataFrame(matriz)
for r in range(matriz.shape[0]):
    for c in range(matriz.shape[1]):
        if m.iloc[r, c] == noData:
            pass
        else:
            pass
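# A more compact alternative sketch (assuming the file is a standard ESRI ASCII
# grid with a 6-line header, which the loop above also assumes): numpy can skip
# the header rows directly.
matriz_alt = np.loadtxt("teste.txt", skiprows=6)
m_alt = pd.DataFrame(matriz_alt.astype(np.int64))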
""" Example dataframe insertion into the MongoDB database """ from DB_Interface import DB_Interface import pandas as pd password = "******" dbname = "chatbot_training_db" ex_dict = {'a': 1, 'b': 2, 'c': 3} example_df = pd.dataFrame(ex_dict) interface = DB_Interface("mongodb+srv://user-main:" + password + "@cluster-idchannel.k6c0f.mongodb.net/" + dbname + "?retryWrites=true&w=majority") interface.add_to_db(dbname, example_df)
def features_for_new(X, train, rank=6):
    # 'feat_c' is not defined in this snippet; the intent appears to be keeping
    # the names of the last `rank` columns of `train`.
    X_f = pd.DataFrame()
    X_f[feat_c] = train.columns[-rank:]
# BRICS countries
# Brazil, Russia, India, China, South Africa
import pandas as pd

dict = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98]
}
brics = pd.DataFrame(dict)
brics.index = ["BR", "RU", "IN", "CH", "SA"]

# or
brics = pd.read_csv("path/to/brics.csv", index_col=0)

type(brics["country"])    # Series
type(brics[["country"]])  # DataFrame
brics[["country", "capital"]]  # column access
brics[1:4]  # row access only through slicing, not ""

# loc  - label-based
# iloc - integer position-based (see the sketch after this block)
brics.loc["RU"]    # returns the row as a Series (prints like a column)
brics.loc[["RU"]]  # returns the row as a one-row DataFrame
brics.loc[["RU", "IN", "CH"], ["country", "capital"]]
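# A short iloc counterpart to the loc examples above (standard pandas API):
brics.iloc[1]                   # second row as a Series
brics.iloc[[1]]                 # second row as a one-row DataFrame
brics.iloc[[1, 2, 3], [0, 1]]   # rows 2-4, columns country and capital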