# Imports
import numpy as np
import pandas as pd

try:
    from IPython.display import display
except ImportError:
    # Outside IPython/Jupyter there is no rich display; fall back to print.
    display = print

# Dataset: read_csv is a pandas function (numpy has no read_csv).
data = pd.read_csv("data/winequality-red.csv")

# Show the first five rows.
display(data.head(n=5))
import torch.nn as nn import torch .nn.parallel import torch.optim as optim import torch.utils.data from torch.autogrid import Variable #importing the dataset movies=pd.read_csv('ml-lm/movies.dat',sep='::',header=None,engine='python',encoding='latin-1') users=pd.read_csv('ml-lm/users.dat',sep='::',header='None',engine='python',encoding='latin-1') rating=pd.read_csv('ml-lm/ratings.dat',sep='::',header=None,engine='python',encoding='latin-1') #Preparing the training set and the test set training_set=pd.read_csv('ml-100k/ul.base',delimiter='\t') training_set=np.array(training_set,dtype='int') test_set=np.read_csv('ml-100k/ul.test',delimiter='\t') test_set=np.array(test_set,dtype='int') #Getting the number of user and movies nb_users=int(max(max(training_set[:,0]),max(test_set[:,0]))) nb_movies=int(max(max(training_set[:,0]),max(test_set[:,1]))) #Converting the data into and array in lines and movies in columns def convert(data): new_data=[] for id_users in range(1,nb_users+1) id_movies = data[:,1][data[:,0]==id_users] id_rating= data[:,2][data[:,0]==id_users] ratings=np.zeros(nb_movies)
# NOTE(review): `csv` and the open file handle `f` are expected to come from
# earlier lines of this script.
rdr = csv.reader(f)
for line in rdr:  # the original iterated over the undefined name `rdf`
    print(line)
f.close()

# Writing a CSV file
import csv

# newline='' lets the csv module control line endings itself, as the csv
# documentation recommends; the with-block closes the file even on error.
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
    wr = csv.writer(f)
    wr.writerow([1, "Alice", True])
    wr.writerow([2, "Bob", False])

# Intermediate: a tidier way to read, kept as a reference example.
# with open('./train.csv') as csvfile:
#     rdr = csv.DictReader(csvfile)
#     for i in rdr:
#         print(i)

# Advanced: reading with pandas (read_csv lives in pandas, not numpy)
import numpy as np
import pandas as pd

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# Take a look at the training data
train.describe(include="all")
# numpy gets its conventional alias; the original bound it to `pd`, which the
# pandas import on the next line silently overwrote.
import numpy as np
import pandas as pd

titanic_df = pd.read_csv('titanictrain.csv')

# Replace the 0/1 codes in 'Survived' with readable labels.
titanic_df['Survived'] = titanic_df['Survived'].map({0: 'Died', 1: 'Survived'})
pc.set_edgecolor('black') plt.tight_layout() for pc in violin_plot2['bodies']: pc.set_facecolor('teal') pc.set_edgecolor('black') plt.tight_layout() for pc in violin_plot3['bodies']: pc.set_facecolor('teal') pc.set_edgecolor('black') plt.tight_layout() plt.show() if __name__ == '__main__': obesity = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Obesity_in_Adults_-_CDPHE_Community_Level_Estimates_(Census_Tracts) .csv') overweight = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Overweight_and_Obese_Adults_-_CDPHE_Community_Level_Estimates_(Census_Tracts).csv') diabetes = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Diabetes_in_Adults_-_CDPHE_Community_Level_Estimates__Census_Tracts_.csv') fn.sort_census_estimate_highest(obesity) plot_sort_census_estimate_lowest(diabetes, 'Diabetes') plot_sort_census_estimate_lowest(overweight, 'Overweight') print(plot_sort_census_estimate_lowest(obesity, 'Obesity') plot_sort_census_estimate_highest(obesity, 'Obesity') plot_sort_census_estimate_highest(overweight, 'Overweight') plot_sort_census_estimate_highest(diabetes, 'Diabetes') plot_census_estimate(obesity, 'Obesity') plot_census_estimate(overweight, 'Overweight') plot_census_estimate(obesity, 'Diabetes')
def read_matrixfile(name):
    """Read a comma-separated numeric file into a 2-D NumPy array.

    The original called ``np.read_csv``, which does not exist in NumPy and
    raised ``AttributeError`` on every call.

    Args:
        name: Path (or open file object) of the comma-separated file.

    Returns:
        A 2-D ``numpy.ndarray`` of floats; a single-row file still yields a
        2-D array because of ``ndmin=2``.

    Raises:
        ValueError: If a field cannot be parsed as a number.
    """
    # NOTE(review): assumes the file has no header row -- confirm with callers.
    return np.loadtxt(name, delimiter=",", ndmin=2)
return s else: msg('%s is not a subset of "rgbk", please provide a string permutation of these four characters') raise argparse.ArgumentTypeError(msg) parser.add_argument( '-c', '--colors', dest='color' type=color_subset, default="rgbk", help='Specify the colors to process as a string of characters' ) parser.add_argument( '-b, --batch', dest='batch', type=string, default=None, help='Pass option to read an external csv which will batch process the entries' ) sys.argv = ["sketchy.py "Code/sketchseries/Validation/test1.jpg"] args = parser.parse_args() if args.batch not None: # loop through configurations stored in a csv file # For guidance on the format of the csv, see # csv example batches = pd.read_csv(args.batch, sep=',') for batch in batches: sketchy(params..) else: sketchy(args)
# Polynomial Regression
import numpy as np
import matplotlib.pyplot as plt  # the plotting calls below use the `plt` alias
import pandas as pd

# NOTE(review): the dataset path is empty in the original script -- fill in
# the CSV location before running.
dataset = pd.read_csv('')  # read_csv is a pandas function, not a numpy one
# NOTE(review): X takes every column but the last while y takes column 3;
# confirm these match the layout of the intended dataset.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Fitting Linear Regression
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Fitting Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures()
X_poly = poly_reg.fit_transform(X)  # was `fit.transform`, an attribute error
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly, y)

# Visualising the Linear Regression Model
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title('Thruth or Bluff')
plt.xlabel('Position')
plt.ylabel('Salary')
plt.show()

# Visualising the Polynomial Regression Model
which has already been explicitly provided for us so we don't need to do any extraction. """ import numpy as pd import pandas as pd import tensorflow as tf from keras import Sequential from keras.layers import Input, Dense from keras.models import Model from google.colab import drive drive.mount('/content/drive') filename = "drive/Team Drives/Deep Learning Project/events.csv" df = pd.read_csv(filename) columns = list(df.columns.values) df.head() attempted_shot = [] corner_kick = [] isFoul = [] isYellowCard1 = [] isYellowCard2 = [] straight_red_card = [] substitution = [] free_kick_awarded = [] off_sides = [] is_hand_ball = [] penalty_awarded = []
import cv2
import numpy as np
import pandas as pd

# Load the label table; read_csv is provided by pandas (numpy has none).
obj_list = pd.read_csv('code/csv_labels.csv')

# Preview the first three rows.
print(obj_list[:3])
# numpy gets its conventional alias; the original bound it to `pd`, which the
# pandas import immediately overwrote.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the training data
train = pd.read_csv('train.csv')

# Pull data into target (y) and predictors (X)
train_y = train.SalePrice

# Features we are using to predict the Sale Price of a house
predictors_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']

# Create training predictors data
train_X = train[predictors_cols]

my_model = RandomForestRegressor()

# Fit the model: capture patterns from the provided data.
# This is the heart of modeling.
my_model.fit(train_X, train_y)

# Read test data
test = pd.read_csv('test.csv')

# Pull the same features/columns as the training data from the test data
test_X = test[predictors_cols]

# Use the model to make predictions
predicted_prices = my_model.predict(test_X)
print(predicted_prices)
# Note: you need these packages installed. The easiest way is to install
# Anaconda.
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# The table is loaded with pandas and plays the same role as the Table package
# designed for the course. The file was converted from xls to csv with an
# online converter, which is why the filename is 07423-0001-Data1.csv. Adjust
# the path to wherever your csv file lives (it will differ on Windows).
Table = pd.read_csv('~/Downloads/07423-0001-Data1.csv')

# Column V14 holds the sale prices.
prices = Table['V14']

# Split the prices by the guarantee code in column V40.
# NOTE(review): presumably V40 == 1 means guaranteed and V40 == 2 means not
# guaranteed (going by the variable names) -- confirm against the codebook.
guarantee = Table['V14'][Table.V40 == 1]
notG = Table['V14'][Table.V40 == 2]

# Assuming the student has correctly implemented the KS distance function in
# lab 8, which we simulate here with scipy. The significance level is 0.05:
# below it we reject the null hypothesis that the two price distributions do
# not differ. The result is printed because a bare expression shows nothing
# when this file runs as a script.
print(stats.ks_2samp(guarantee, notG))

# The author reports a p-value of about 1.55*10^-17 for the test above, which
# leads to rejecting the null hypothesis.
# Other variables can be taken into account with more queries on the table, or
# with loops that apply a boolean filter per category.

# The author reports that the male/female price difference is significant,
# with a KS-test p-value of 2.9943622527702614e-19.
male = Table['V14'][Table.V15 == 1]
female = Table['V14'][Table.V15 == 2]
print(stats.ks_2samp(male, female))

# Further analysis can be done by looking at histograms of the two samples
# with matplotlib.pyplot.
plt.figure()
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The original bound the table to `plt`, clobbering the matplotlib alias, and
# called the non-existent `np.read_csv`.
housing = pd.read_csv("Data/py.csv")

# Single-column DataFrame holding only the lot areas.
lotArea = housing[['lotArea']]

# Summary statistics of the lot-area column (`Print` was an undefined name).
print(housing['lotArea'].describe())
# coding: utf-8

# In[90]:

# numpy gets its conventional alias; the original bound it to `pd`, which the
# pandas import below overwrote.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# In[91]:

# Reading data
dataset = pd.read_csv('titanic_data.csv')
dataset.head(5)

# In[92]:

# Removing unnecessary columns
# NOTE(review): 'fair' may be a misspelling of a 'Fare' column -- confirm
# against the CSV header; drop() raises KeyError for a label that is absent.
dataset.drop(
    ['PassengerId', 'Name', 'Ticket', 'fair', 'Embarked', 'Cabin'],
    axis='columns',
    inplace=True,
)
dataset.head(5)

# In[93]:

# Fill na values with mean
# Generate trades for analysis number_trades = 1000 # number of days mean_profit = 0.001 # 0.1% per trade standard_dev = 0.003 # 0.3% per trade #trades = make_trade_list(1000) #print (trades) ########## ########## Read a text file containing the list of trades # # Read trades for analysis filename = 'trades.csv' trades = np.read_csv(filename) #print (trades) ########## # Set the parameters describing the personal risk tolerance # of the trader. drawdown_tolerance = 0.10 desired_accuracy = 0.003 initial_capital = 100000.0 for rep in range(5): # Fraction is initially set to use all available funds
Original file is located at https://colab.research.google.com/drive/1kH3mMxTT6-t5GYHiiYPLbgKsGScc-_Ts """ !pip install tqdm import sys, os import datetime import numpy as pd import pandas as pd from tqdm import tqdm from google.colab import drive drive.mount('/content/drive') # Load text from first half of each game df = pd.read_csv('drive/Team Drives/Deep Learning Project/ken_cnn/text_dataset_60min.csv', sep='\t') df.head(3) df.shape # Combine text comments into one long text for each game texts = {} duplicate_ids = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): game_id = row['id_odsp'] text = str(row['text']) if not game_id in texts: texts[game_id] = [] else: duplicate_ids.append(idx)
Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/1PkR06z-NeA0mRWvv0aiqRiOIOhuLs6G7 """ import numpy as pd import pandas as pd from google.colab import drive drive.mount('/content/drive') # Read dataset events_fp = "drive/Team Drives/Deep Learning Project/events.csv" events_df = pd.read_csv(events_fp, index_col="id_odsp") events_df = events_df.iloc[:, 1:] # Select only text from the first half of games events_df = events_df[events_df['time'] <= 60] events_df = events_df[['text']] events_df.head(5) # Read dataset ginf_fp = "drive/Team Drives/Deep Learning Project/ginf.csv" ginf_df = pd.read_csv(ginf_fp, index_col="id_odsp") ginf_df = ginf_df.iloc[:, 1:] # Select only games that have detail ginf_df = ginf_df[ginf_df['adv_stats'] == True] ginf_df = ginf_df[['ht', 'at', 'fthg',
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 999)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.expand_frame_repr', False)

# Carry out feature-engineering work on the titanic, hitters and diabetes
# datasets.
# Feature Engineering - Data Pre-Processing
# (filler note in the original: "hemming and hawing")

# ------------------------------------------------Titanic-----------------------------------------------------------#
# NOTE(review): the path starts with a backslash -- confirm it resolves on the
# machine this runs on.
df_titanic_ = pd.read_csv(r"\titanic.csv")
df_titanic = df_titanic_.copy()  # work on a copy, keep the raw frame intact

# Feature engineering

# Cabin flag: 1 where 'Cabin' is missing, 0 otherwise
df_titanic["NEW_CABIN_BOOL"] = df_titanic["Cabin"].isnull().astype("int")

# Name letter count
df_titanic["NEW_NAME_COUNT"] = df_titanic["Name"].str.len()

# Name word count
df_titanic["NEW_NAME_WORD_COUNT"] = df_titanic["Name"].apply(
    lambda x: len(str(x).split(" "))
)

# Is doctor? Counts the words in the name that start with "Dr".
df_titanic["NEW_NAME_DR"] = df_titanic["Name"].apply(
    lambda name: len([word for word in name.split() if word.startswith("Dr")])
)

# Name titles: the letters between a space and the following period.
# Raw string: in a normal literal '\.' is an invalid escape sequence, which
# newer Python versions warn about.
df_titanic["NEW_TITLE"] = df_titanic.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Family size
y_current = (teta1*x_train)+teta0 cost = sum(np.array([data**2 for data in (y_train-y_current)])*np.array(weights))/N teta1_gradient = -(2.0/N)*sum(x_train*(y_train - y_current)*weights) teta0_gradient = -(2.0/N)*sum((y_train - y_current)*weights) teta1 = teta1 -(learningRate *teta1_gradient) teta0 = teta0 -(learningRate *teta0_gradient) predict_y = [] for entry in x_test: predict = teta1*entry+teta0 predict_y.extend(predict) return np.array(predict_y)[:,np.newaxis] micMatrix = np.matrix(pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledMicData1_22.csv")) RNAMatrix = np.matrix(pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledRNAData1_22.csv")) geneNumber= micMatrix.shape[1] trainMic = micMatrix[0:95,] trainRNA = RNAMatrix[0:95,] testMic = micMatrix[95:104,] testRNA = RNAMatrix[95:104,] Nx = testRNA.shape[0] Ny = testRNA.shape[1] mylist= np.zeros((Nx,Ny)).tolist() predictedMatrix = np.matrix(mylist) predictedMatrixrid = np.matrix(mylist)
from pandas import DataFrame import torch.nn as nn from torch.utils.data import TensorDataset from torch.utils.data import DataLoader teams = [ "Virtus Bologna", "Banco di Sardegna Sassari", "Gemani Basket Brescia", "EA7 Emporio Armani Milano", "Enel Brindisi", "Vanoli Cremona", "Umana Venezia", "Dolomiti Energia Trento", "Fortituto Kontatto Bologna", "OpenJobMetis Varese", "Red October Cantu", "Grissin Bon Reggio Emilia", "Universo De'Longhi Treviso", "Virtus Roma", "Flexx Pistoia", "Pallacanestro Trieste", "Consultinvest VL Pesaro" ] #Show Game Results frame = pd.read_csv("serieAUpdated.csv") columns = [ 'Team', 'PFH', 'PAH', 'PFA', 'PAA', 'eFGFH', 'eFGAH', 'eFGFA', 'eFGAA', 'TOFH', 'TOAH', 'TOFA', 'TOAA', 'ORFH', 'ORAH', 'ORFA', 'ORAA', 'FTRFH', 'FTRAH', 'FTRFA', 'FTRAA' ] data = [] print(frame) #Loop Through Game Results to Create Team Averages for i in teams: row = [] homeFrame = frame[frame.homeTeam.str.contains(i)] awayFrame = frame[frame.awayTeam.str.contains(i)] PFH = homeFrame["homeScore"].mean() PAH = homeFrame["awayScore"].mean()
# NOTE(review): `my_date`, `datetime`, `pd` and `np` are expected to be defined
# by earlier lines of this script.
my_date.day

first_two = [datetime(2016, 1, 1), datetime(2016, 1, 2)]

# Datetime Index
dt_ind = pd.DatetimeIndex(first_two)
data = np.random.randn(2, 2)
cols = ['a', 'b']
df = pd.DataFrame(data, dt_ind, cols)
df.index.argmax()  # position of the latest index entry (argmin for the first)
df.index.max()  # latest date

# Time Resampling
df = pd.read_csv('data/walmart_stock.csv')  # alternative: parse_dates=True, index_col='Date'
df.info()

# Let pandas infer the date layout. The original passed format='', which
# current pandas rejects because no input matches an empty format, and then
# converted the column a second time; one conversion is enough.
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df.head()
df.resample(rule='A').mean()  # year end frequency
# Q - quarterly
# BQ - business quarterly


def first_day(entry):
    """Return the first element of ``entry``."""
    return entry[0]
cols = [] for term in [ s for s in data.columns.values if ("理賠金額" in s) and not ('年理賠金額' in s) ]: cols.append((term, term.replace('理賠金額', '單位理賠金額'))) #'理賠金額轉換為單位理賠金額' for col in cols: data[col[0]] = data[col[0]] / data['B5總保額'] #進model ###################################################################################### import numpy as pd import pandas as pd data2016 = pd.read_csv('data2016(notfill).csv') data2017 = pd.read_csv('data2017(notfill).csv') data2018 = pd.read_csv('data2018(notfill).csv') data2018 = data2018.drop([ 'Unnamed: 0', 'RANK', 'MIN_of_RANK', 'MIN_of_MIN_of_RANK', '婚姻居住狀況', '婚姻狀況_5', '役別', '被保人聾啞', '被保人肢體殘缺畸形_部位及程度', '被保人肥胖瘦弱' ], axis=1, errors='ignore') #data2017.to_csv('data2017(notfill).csv',index=False) ID_2018 = data2018['被保人ID'] data2016 = data2016.drop([ '理賠金額_住院2016', '理賠金額_住院2017', '理賠金額_住院2018', '理賠金額_手術2017', '理賠金額_手術2018', '被保人ID', '近三年理賠金額_住院_13_15', '近五年理賠金額_住院_11_15', '近14年理賠金額_住院_02_15', '近三年理賠金額_手術_13_15', '近五年理賠金額_手術_11_15', '近14年理賠金額_手術_02_15' ],
""" Created on Sun Dec 13 15:26:12 2020 @author: deger """ ## Kütüphaneler import numpy as pd import pandas as pd import matplotlib.pyplot as plt ## Verilerin import edilmesi #csv dosylarını okumak için pandas'ın read_csv metodunu çağırıyoruz. #read_csv() metodunun ilk parametresi file_path olduğu için verileri çekeceğimiz dosyanın yolunu pandas'a gösteriyoruz. veriler = pd.read_csv('./veriler.csv') #------------------Sütun isimleri------------------------------------------------- """ verileri bu şekilde okuttuğumuzda eğer başka bir header değeri atamadıysak, pandas her zaman en üstteki satırı sütun isimleri olarak görür """ #-------------------pandas.read_csv() metodunun parametreleri:------------------------------------- """ pandas.read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None,
} predict_y = [] for entry in test_x: bandwidth = adaptiveBandwidth(train_x, entry, 10) nks = [np.sum((j - entry)**2) / bandwidth for j in train_x] ks = [kernels[kernelType](i) for i in nks] dividend = sum([ks[i] * train_y[i] for i in range(len(ks))]) devisor = sum(ks) predict = dividend / divisor predict_y.extend(predict) return np.array(predict_y)[:, np.newaxis] #inputs are from files shuffledMicData and shuffledRNAData which are the outputs of ExtractingTrainTestCaseAndControlSamples.R micMatrix = np.matrix( pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledMicData1_22.csv")) RNAMatrix = np.matrix( pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledRNAData1_22.csv")) geneNumber = micMatrix.shape[1] trainMic = micMatrix[0:95, ] trainRNA = RNAMatrix[0:95, ] testMic = micMatrix[95:104, ] testRNA = RNAMatrix[95:104, ] Nx = testRNA.shape[0] Ny = testRNA.shape[1] mylist = np.zeros((Nx, Ny)).tolist() predictedMatrix = np.matrix(mylist) predictedMatrixrid = np.matrix(mylist)
""" Created on Thu May 11 10:35:05 2017 @author: wanjun 数据清洗 """ #%% import numpy as pd import pandas as pd from sklearn import preprocessing import talib from matplotlib import pyplot as plt #%% file = '/Users/wanjun/Desktop/LSTM模型/data/data_train_latest.csv' file_1 = '/Users/wanjun/Desktop/LSTM模型/data/MINUTE_zhuli_IF_20170509.csv' data = pd.read_csv(file) data_1 = pd.read_csv(file_1, index_col=1, parse_dates=True) data.index = data_1.index data['datetime'] = data_1.index #%% data = data.sort_index() data = data['2016-12-19':] index = data.drop_duplicates('datetime').resample('D').mean().dropna().index data_clean = pd.DataFrame(columns=['open', 'high', 'low', 'volume', 'close']) lst_len = [] lst = [] for i in index: i = str(i)[:10] temp = data[i] start = i + ' 09:30:00' end = i + ' 15:01:00'