示例#1
0
def main():
    """Main function"""

    print "----------------------- DECISION TREE CLASSIFIER -----------------------"
    start_time = time()
    data = {}

    # Reading data from input files
    readData(data)

    listRatings = Lists

    # List of attributes that will be considered to build the decision tree
    attrs = ["genders", "ages", "occupations", "genres"]

    # Creating the root node
    initial_node = NodeTree(None, "root", None)

    # Calling the recursive function that builds the decision tree
    relativeEntropy(data, data["ratings"], attrs, listRatings, initial_node)

    # Decision tree leaves debug
    # printLeaves(initial_node)

    # Call the test case
    testCase(data, initial_node)

    print "\nExecution time:", time() - start_time, "seg"
示例#2
0
def main():
    """Main function"""

    print "----------------------- A PRIORI CLASSIFIER -----------------------"
    start_time = time()
    data = {}

    readData(data)

    aPrioriClassifier(data)

    testCase(data)

    print
    print "Execution time:", time() - start_time
示例#3
0
    def run(self):
        """Clean the raw tweet data and write the required columns as CSV.

        Reads ``self.tweet_file`` with ``helper.readData``, passes the result
        through ``helper.cleanTweetData`` and writes the ``self.required_cols``
        columns to this task's output path (without the index).

        Raises:
            OSError: if the run's output directory cannot be created.
        """
        df = helper.cleanTweetData(helper.readData(self.tweet_file))

        # Make sure the run's output directory exists before writing.
        # Creating it directly and tolerating "already exists" avoids the
        # check-then-create race when another task makes the same directory
        # between an exists() test and makedirs().
        output_dir = './output/{}'.format(task_run_time)
        try:
            os.makedirs(output_dir)
        except OSError:
            # Anything other than "the directory is already there" is real.
            if not os.path.isdir(output_dir):
                raise

        df[self.required_cols].to_csv(self.output().path, index=False)
示例#4
0
    def run(self):
        """Train a model on the upstream features and persist it with joblib.

        Reads the CSV at ``self.input()['features'].path`` with
        ``helper.readData``, trains via ``helper.trainModel`` and dumps the
        resulting model to this task's output path.

        Raises:
            OSError: if the run's output directory cannot be created.
        """
        # Create the run's output directory, tolerating a concurrent creator.
        # A separate exists() check followed by makedirs() can fail when
        # another task creates the directory in between.
        output_dir = './output/{}'.format(task_run_time)
        try:
            os.makedirs(output_dir)
        except OSError:
            # Anything other than "the directory is already there" is real.
            if not os.path.isdir(output_dir):
                raise

        # Echo the resolved inputs of this task (kept from the original).
        print(self.input())

        features_df = helper.readData(self.input()['features'].path)
        model = helper.trainModel(features_df)

        joblib.dump(model, self.output().path)
示例#5
0
    def run(self):
        """Attach the closest city to each tweet and write the training data.

        Reads the upstream tweet file and ``self.cities_file``, looks up the
        closest city for every tweet via ``helper.closestCity``, and hands the
        resulting (label, name) frame to ``helper.createTrainData``. The
        returned feature frame is written as CSV to the ``'features'`` output
        and the returned encoder is dumped with joblib to the ``'encoder'``
        output.

        Raises:
            OSError: if the run's output directory cannot be created.
        """
        # Create the run's output directory, tolerating a concurrent creator.
        # A separate exists() check followed by makedirs() can fail when
        # another task creates the directory in between.
        output_dir = './output/{}'.format(task_run_time)
        try:
            os.makedirs(output_dir)
        except OSError:
            # Anything other than "the directory is already there" is real.
            if not os.path.isdir(output_dir):
                raise

        tweet_df = helper.readData(self.input().path)
        cities_df = helper.readData(self.cities_file)

        # Drop rows with a null city name so that later stages do not run
        # into errors.
        cities_df.dropna(subset=['name'], inplace=True)

        # Lookup structure for the closest-city search (a KD-tree built with
        # scipy, per the original author's note).
        tree = helper.buildTree(cities_df)

        # New 'name' column: helper.closestCity's result for each tweet's
        # coordinates.
        tweet_df['name'] = tweet_df.apply(
            lambda row: helper.closestCity(row['tweet_lat'],
                                           row['tweet_long'],
                                           tree, cities_df),
            axis=1)

        # Keep only the sentiment and city columns. The rename returns a new
        # frame instead of mutating a column slice in place, which pandas may
        # flag as an operation on a copy.
        tweet_df = tweet_df[['airline_sentiment', 'name']].rename(
            columns={'airline_sentiment': 'label'})

        feature_df, encoder = helper.createTrainData(tweet_df, cities_df)

        feature_df.to_csv(self.output()['features'].path, index=False)
        joblib.dump(encoder, self.output()['encoder'].path)
示例#6
0
    def run(self):
        """Score the cities with the trained model and write the result as CSV.

        Loads the encoder from ``self.input()['inp2']['encoder']`` and the
        model from ``self.input()['inp1']`` with joblib, reads
        ``self.cities_file`` and writes the frame returned by
        ``helper.scoreData`` to this task's output path (without the index).

        Raises:
            OSError: if the run's output directory cannot be created.
        """
        # Create the run's output directory, tolerating a concurrent creator.
        # A separate exists() check followed by makedirs() can fail when
        # another task creates the directory in between.
        output_dir = './output/{}'.format(task_run_time)
        try:
            os.makedirs(output_dir)
        except OSError:
            # Anything other than "the directory is already there" is real.
            if not os.path.isdir(output_dir):
                raise

        inputs = self.input()
        encoder = joblib.load(inputs['inp2']['encoder'].path)
        model = joblib.load(inputs['inp1'].path)

        cities_df = helper.readData(self.cities_file)
        score_df = helper.scoreData(cities_df, model, encoder)

        score_df.to_csv(self.output().path, index=False)
示例#7
0
def plotLineGraph(x_axis, y_axis, i, header):
    """Draw ``y_axis`` against ``x_axis`` as a green line on figure ``i``.

    ``header`` is set as the super-title of the figure obtained from
    ``plt.figure(i)``. Nothing is returned and the figure is not shown here.
    """
    plt.figure(i).suptitle(header)
    plt.plot(x_axis, y_axis, color='green')


# Experiment setup: load the data, build word-vector features for the text
# column and derive a 0/1 target from the label column.
data_csv_file = "TrimedData.csv"
y_label = "Praise"
x_label = "Comments"
# Candidate lambda values from earlier experiments are kept below for
# reference. NOTE: the last uncommented assignment wins, so only [2] is in
# effect; the longer list two lines above it is overwritten immediately.
# lambda_vals = [4, 4.5, 5, 5.5, 6, 6.5] #[2, 2.25, 2.5, 2.75, 3, 3.5, 4]
# lambda_vals = [5, 10, 13, 15, 20, 30]
lambda_vals = [0, 0.001, 0.01, 0.1, 0.5, 1, 2, 3, 5, 7.5, 10, 12.5, 15]
# lambda_vals = [10, 11, 12, 13, 14, 15]
lambda_vals = [2]
# Load the CSV, then filter on the label column via hlp.getNotNull
# (presumably drops rows whose label is null -- confirm in helper).
df = hlp.readData(data_csv_file, x_label)
df = hlp.getNotNull(df, y_label)
print(df.shape)
# Dump the label column to check.csv for manual inspection.
x = df.loc[:, [y_label]]
x.to_csv("check.csv")
# Alternative feature extraction using a Doc2Vec model (disabled).
# feature_model = gensim.models.Doc2Vec.load("comments2vec.d2v")
# features = hlp.DFToFeatureX(feature_model, df, x_label)

# Active feature extraction: a saved Word2Vec model applied to the text
# column through hlp.DFToFeatureX_W.
feature_model = gensim.models.Word2Vec.load("cmtWord2vec.d2v")
features = hlp.DFToFeatureX_W(feature_model, df, x_label)
# Further disabled variants: IDF-based features and PCA reduction.
# idf_model = idf.Idf.load("idf_model.json")
# features = hlp.DFToFeatureX_W_Tdf(feature_model, idf_model, df, x_label)
# exit()
# features = hlp.PCA_reduce_feature(features, 50)
# Binarize the target: every non-zero label becomes 1, zeros stay 0.
y_output = df.loc[:, [y_label]]
y_output[y_output != 0] = 1
示例#8
0
import numpy as np

# Command-line arguments: <uai_file> <task_id> <train_file> <test_file>
uai_file = sys.argv[1]
task_id = int(sys.argv[2])
train_file = sys.argv[3]
test_file = sys.argv[4]

# All three files live under hw5-data/dataset<stem>/, where <stem> is the
# part of the model file name before its first dot.
dataset = 'dataset' + uai_file.split('.')[0]
modelPath = 'hw5-data/{}/{}'.format(dataset, uai_file)
trainPath = 'hw5-data/{}/{}'.format(dataset, train_file)
testPath = 'hw5-data/{}/{}'.format(dataset, test_file)

bn = BN()
bn.readModel(modelPath)

# Tasks 1 and 3 read the training data as fully observed; every other task
# reads it as not fully observed. The test data is always fully observed.
train_df = readData(trainPath, is_fully_observed=(task_id in [1, 3]))
test_df = readData(testPath, is_fully_observed=True)

print('-' * 50)

if task_id == 1:
    model = FOD_learner()
    model.estimate(train_df, bn)
    print('log likelihood difference: ',
          log_pointwise_difference(model, bn, test_df))
elif task_id == 2:
    lst = []
    for i in range(5):
示例#9
0
#!/usr/bin/python3
import ipdb
import helper
import time
import math


def is_square(n):
    """Return the integer square root of ``n`` if it is a perfect square.

    Args:
        n: the integer to test.

    Returns:
        The non-negative integer ``r`` with ``r * r == n`` when one exists
        (``0`` for ``n == 0``), otherwise ``-1``. Negative input is never a
        perfect square and yields ``-1``.
    """
    if n < 0:
        return -1

    if n < (1 << 52):
        # Small values are exactly representable as doubles, so the float
        # square root of a perfect square is exact. This keeps the common
        # case as cheap as before.
        root = int(math.sqrt(n))
    else:
        # Beyond double precision the float root can be off (or overflow),
        # so compute floor(sqrt(n)) with integer Newton iteration instead.
        # The start value is a power of two that is >= sqrt(n), from which
        # the iteration decreases monotonically to the floor of the root.
        root = 1 << ((n.bit_length() + 1) // 2)
        while True:
            better = (root + n // root) // 2
            if better >= root:
                break
            root = better

    # The final comparison is done in exact integer arithmetic.
    return root if root * root == n else -1
# Start the timer before the input is loaded.
# NOTE(review): time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement. Switch it together with
# whatever later reads `st`, so both ends use the same clock.
st=time.clock()
# helper.readData("input") is unpacked into a count `n` and `testdata`,
# which the loop below indexes once per test case.
n,testdata=helper.readData("input")
# Alternative input path reading from stdin (disabled):
#n=int(input())
#testdata=input()
for i in range(n):
    T=int(testdata[i])
    #N=int(input())
    side={}
    a=1

    for a in range(1,int(T/2)):
        for b in range(1,T-a):
            if a**2+(b**2) >=T**2:
                break
            c=a**2+(b**2)
            c=is_square(c)
            if  c>0:
                if a+b+c<=T:
示例#10
0
import pandas as pd
import numpy as np
import helper as hlp
import matplotlib.pyplot as plt

# Split the data set into train/test CSV files, one pair per label column.
x_label = "Comments"
file = "Data/unique.csv"

df = hlp.readData(file, x_label)
# Every column except the text column is treated as a label column.
y_labels = df.columns
y_labels = np.delete(y_labels, np.argwhere(y_labels == x_label))
for col in y_labels:
    # Sample on the text column plus this label, passing the label values
    # as the second argument. NOTE(review): 80 is presumably the training
    # percentage -- confirm against hlp.stratified_sampling.
    train, test = hlp.stratified_sampling(df[[x_label, col]], df[col], 80)
    # Written as Data/<label>_train.csv and Data/<label>_test.csv.
    train.to_csv("Data/" + col + "_train.csv")
    test.to_csv("Data/" + col + "_test.csv")
示例#11
0
		sum1=np.dot(np.transpose(diff),covinv)
		prob[1]=prob[1]+(1/const)*np.exp((-0.5)*(np.sum(np.dot(sum1,diff),0)))
	prob[1]=prob[1]/train1.shape[0]

	for i in range(train2.shape[0]):
		diff=test-train2[i]
		sum1=np.dot(np.transpose(diff),covinv)
		prob[2]=prob[2]+(1/const)*np.exp((-0.5)*(np.sum(np.dot(sum1,diff),0)))
	prob[2]=prob[2]/train2.shape[0]
	
	return prob

if name=='__main__':
	

	train0,test0,train1,test1,train2,test2,a,b,c=helper.readData()
	train=np.vstack((train0,np.vstack((train1,train2))))
	test=np.vstack((test0,np.vstack((test1,test2))))
	
	t=int(math.ceil(a/36))+int(math.ceil(b/36))+int(math.ceil(c/36))
	label=np.zeros(t)
	label[:int(math.ceil(a/36))]=0
	label[int(math.ceil(a/36)):int(math.ceil(a/36))+int(math.ceil(b/36))]=1
	label[int(math.ceil(a/36))+int(math.ceil(b/36)):]=2

	print "l,",label.shape
	point=np.zeros(131)
	pro=np.zeros((131,3))
	
	t0=train0.shape[0]
	t1=train0.shape[0]+train1.shape[0]