def main():
    """Build the decision tree classifier and run the test case.

    Reads the input data, grows a decision tree from the root node over the
    configured attributes, runs the test case against the tree and prints the
    elapsed wall-clock time.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # (where the parentheses are just grouping) and on Python 3.
    print("----------------------- DECISION TREE CLASSIFIER -----------------------")
    start_time = time()

    # Reading data from input files; readData fills the dict in place
    # (data["ratings"] is read from it below).
    data = {}
    readData(data)

    # NOTE(review): `Lists` is defined outside this function -- confirm it
    # holds the rating values expected by relativeEntropy.
    listRatings = Lists

    # List of attributes that will be considered to build the decision tree
    attrs = ["genders", "ages", "occupations", "genres"]

    # Creating the root node
    initial_node = NodeTree(None, "root", None)

    # Calling the recursive function that builds the decision tree
    relativeEntropy(data, data["ratings"], attrs, listRatings, initial_node)

    # Decision tree leaves debug
    # printLeaves(initial_node)

    # Call the test case
    testCase(data, initial_node)

    # %s formats the float with str(), exactly like the old print statement.
    print("\nExecution time: %s seg" % (time() - start_time))
def main():
    """Run the a priori classifier on the input data and time the execution.

    Reads the input data, runs the a priori classifier over it, runs the test
    case and prints the elapsed wall-clock time.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # (where the parentheses are just grouping) and on Python 3.
    print("----------------------- A PRIORI CLASSIFIER -----------------------")
    start_time = time()

    # readData receives an empty dict and is expected to fill it in place.
    data = {}
    readData(data)

    aPrioriClassifier(data)
    testCase(data)

    # A bare `print` is a no-op expression on Python 3; print("") emits the
    # same single newline on both major versions.
    print("")
    # %s formats the float with str(), exactly like the old print statement.
    print("Execution time: %s" % (time() - start_time))
def run(self):
    """Read and clean the tweet file, then write the required columns as CSV."""
    tweets = helper.cleanTweetData(helper.readData(self.tweet_file))

    # Check if the run directory exists before attempting to write the data,
    # and create it if it does not.
    run_dir = './output/{}'.format(task_run_time)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    selected = tweets[self.required_cols]
    selected.to_csv(self.output().path, index=False)
def run(self):
    """Train a model on the upstream features file and persist it with joblib."""
    # Create the run directory if it does not exist yet.
    run_dir = './output/{}'.format(task_run_time)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    # Debug output of the task inputs.
    print(self.input())

    features_path = self.input()['features'].path
    features_df = helper.readData(features_path)

    trained = helper.trainModel(features_df)
    joblib.dump(trained, self.output().path)
def run(self):
    """Label each tweet with a city name and write the training features.

    Writes the feature table as CSV to the 'features' output and dumps the
    encoder returned by helper.createTrainData to the 'encoder' output.
    """
    # Create the run directory if it does not exist yet.
    run_dir = './output/{}'.format(task_run_time)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    tweet_df = helper.readData(self.input().path)
    cities_df = helper.readData(self.cities_file)

    # Drop any null values in the city names so that we won't run into
    # errors in the later stages.
    cities_df.dropna(subset=['name'], inplace=True)

    # In order to find the closest city, build the lookup tree once up front
    # (the helper is expected to use the scipy KDTree implementation).
    tree = helper.buildTree(cities_df)

    def city_for_row(row):
        # Resolve one tweet's coordinates to a city name via the tree.
        return helper.closestCity(row['tweet_lat'], row['tweet_long'],
                                  tree, cities_df)

    # Add a new column to the tweet frame, since it holds the output label.
    tweet_df['name'] = tweet_df.apply(city_for_row, axis=1)

    # Keep only the sentiment and the city, exposing the sentiment as 'label'.
    tweet_df = tweet_df[['airline_sentiment', 'name']]
    tweet_df.rename(columns={'airline_sentiment': 'label'}, inplace=True)

    feature_df, encoder = helper.createTrainData(tweet_df, cities_df)
    feature_df.to_csv(self.output()['features'].path, index=False)
    joblib.dump(encoder, self.output()['encoder'].path)
def run(self):
    """Score the cities file with the stored model and encoder; write CSV."""
    # Create the run directory if it does not exist yet.
    run_dir = './output/{}'.format(task_run_time)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    # Load the artifacts produced by the upstream tasks.
    encoder_path = self.input()['inp2']['encoder'].path
    encoder = joblib.load(encoder_path)
    model_path = self.input()['inp1'].path
    model = joblib.load(model_path)

    cities_df = helper.readData(self.cities_file)
    scored = helper.scoreData(cities_df, model, encoder)
    scored.to_csv(self.output().path, index=False)
def plotLineGraph(x_axis, y_axis, i, header):
    """Draw y_axis against x_axis as a green line on figure `i` titled `header`."""
    figure = plt.figure(i)
    figure.suptitle(header)
    plt.plot(x_axis, y_axis, color='green')


# ---- Experiment configuration ----
data_csv_file = "TrimedData.csv"
y_label = "Praise"
x_label = "Comments"

# Alternative lambda grids tried previously:
# lambda_vals = [4, 4.5, 5, 5.5, 6, 6.5]  # [2, 2.25, 2.5, 2.75, 3, 3.5, 4]
# lambda_vals = [5, 10, 13, 15, 20, 30]
# lambda_vals = [10, 11, 12, 13, 14, 15]
lambda_vals = [0, 0.001, 0.01, 0.1, 0.5, 1, 2, 3, 5, 7.5, 10, 12.5, 15]
# The full grid above is immediately overridden by this single value.
lambda_vals = [2]

# ---- Load the data and keep the rows that have a value for the label ----
df = hlp.readData(data_csv_file, x_label)
df = hlp.getNotNull(df, y_label)
print(df.shape)

# Dump the label column for manual inspection.
x = df.loc[:, [y_label]]
x.to_csv("check.csv")

# ---- Feature extraction ----
# Doc2Vec variant:
# feature_model = gensim.models.Doc2Vec.load("comments2vec.d2v")
# features = hlp.DFToFeatureX(feature_model, df, x_label)
feature_model = gensim.models.Word2Vec.load("cmtWord2vec.d2v")
features = hlp.DFToFeatureX_W(feature_model, df, x_label)
# IDF-weighted variant:
# idf_model = idf.Idf.load("idf_model.json")
# features = hlp.DFToFeatureX_W_Tdf(feature_model, idf_model, df, x_label)
# exit()
# features = hlp.PCA_reduce_feature(features, 50)

# ---- Binarise the target: every non-zero label value becomes 1 ----
y_output = df.loc[:, [y_label]]
y_output[y_output != 0] = 1
import numpy as np uai_file = sys.argv[1] task_id = int(sys.argv[2]) train_file = sys.argv[3] test_file = sys.argv[4] dataset = 'dataset' + uai_file.split('.')[0] modelPath = 'hw5-data/' + dataset + '/' + uai_file trainPath = 'hw5-data/' + dataset + '/' + train_file testPath = 'hw5-data/' + dataset + '/' + test_file bn = BN() bn.readModel(modelPath) if task_id in [1, 3]: train_df = readData(trainPath, is_fully_observed=True) else: train_df = readData(trainPath, is_fully_observed=False) test_df = readData(testPath, is_fully_observed=True) print('-' * 50) if task_id == 1: model = FOD_learner() model.estimate(train_df, bn) print('log likelihood difference: ', log_pointwise_difference(model, bn, test_df)) elif task_id == 2: lst = [] for i in range(5):
#!/usr/bin/python3 import ipdb import helper import time import math def is_square(n): r=math.sqrt(n) if (int(r)**2==n): return int(r) else: return -1 st=time.clock() n,testdata=helper.readData("input") #n=int(input()) #testdata=input() for i in range(n): T=int(testdata[i]) #N=int(input()) side={} a=1 for a in range(1,int(T/2)): for b in range(1,T-a): if a**2+(b**2) >=T**2: break c=a**2+(b**2) c=is_square(c) if c>0: if a+b+c<=T:
import pandas as pd
import numpy as np
import helper as hlp
import matplotlib.pyplot as plt

# Write one stratified train/test CSV pair per label column of the dataset.
x_label = "Comments"
file = "Data/unique.csv"
df = hlp.readData(file, x_label)

# Every column other than the text column is treated as a label column.
y_labels = df.columns
y_labels = np.delete(y_labels, np.flatnonzero(y_labels == x_label))

for col in y_labels:
    # Sample on the text column plus this label, stratifying by the label.
    # NOTE(review): 80 is presumably the train share in percent -- confirm
    # against hlp.stratified_sampling.
    train, test = hlp.stratified_sampling(df[[x_label, col]], df[col], 80)

    out_prefix = "Data/" + col
    train.to_csv(out_prefix + "_train.csv")
    test.to_csv(out_prefix + "_test.csv")
sum1=np.dot(np.transpose(diff),covinv) prob[1]=prob[1]+(1/const)*np.exp((-0.5)*(np.sum(np.dot(sum1,diff),0))) prob[1]=prob[1]/train1.shape[0] for i in range(train2.shape[0]): diff=test-train2[i] sum1=np.dot(np.transpose(diff),covinv) prob[2]=prob[2]+(1/const)*np.exp((-0.5)*(np.sum(np.dot(sum1,diff),0))) prob[2]=prob[2]/train2.shape[0] return prob if name=='__main__': train0,test0,train1,test1,train2,test2,a,b,c=helper.readData() train=np.vstack((train0,np.vstack((train1,train2)))) test=np.vstack((test0,np.vstack((test1,test2)))) t=int(math.ceil(a/36))+int(math.ceil(b/36))+int(math.ceil(c/36)) label=np.zeros(t) label[:int(math.ceil(a/36))]=0 label[int(math.ceil(a/36)):int(math.ceil(a/36))+int(math.ceil(b/36))]=1 label[int(math.ceil(a/36))+int(math.ceil(b/36)):]=2 print "l,",label.shape point=np.zeros(131) pro=np.zeros((131,3)) t0=train0.shape[0] t1=train0.shape[0]+train1.shape[0]