def init_command_line_config(self):
    print "using command line argparser"
    from argparser import argdict
    conf = argdict
    self.pageLimit = conf["page_limit"]
    self.linkLimit = conf["link_limit"]
    self.relevantThreshold = conf["relevancy_threshold"]
    classifierString = conf["classifier"]
    self.classifier = None
    if "NB" in classifierString.upper():
        self.classifier = NaiveBayesClassifier()
    elif "SVM" in classifierString.upper():
        self.classifier = SVMClassifier()
    seeds = conf["seeds"]
    self.seedUrls = []
    urlsPerSeed = 10
    for keyword in seeds:
        if keyword is not None and keyword != "":
            if "http" in keyword:
                self.seedUrls.append(keyword)
            else:
                seedUrlGenerator = google.search(keyword)
                searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
        else:
            raise Exception("Seed is not valid: (" + str(keyword) + ") -- it must be a keyword or URL")
    print "seed urls: "
    print self.seedUrls
    self.blacklistDomains = conf["blacklist_domains"]
    self.labeled = {}
    self.labeled["relevantUrls"] = conf["relevant_urls"]
    for url in self.labeled["relevantUrls"]:
        if url is None or url == "":
            raise Exception("Relevant URL is not valid: (" + str(url) + ")")
    self.labeled["irrelevantUrls"] = conf["irrelevant_urls"]  # assumed key for the irrelevant-URL list
    for url in self.labeled["irrelevantUrls"]:
        if url is None or url == "":
            raise Exception("Irrelevant URL is not valid: (" + str(url) + ")")
    self.vsm = {
        "on": conf["vsm"],
        "filterModel": conf["vsm_filter"],
        "minRepositoryDocNum": conf["min_repo_doc_num"],
        "filterIrrelevantThreshold": conf["irrelevancy_threshold"],
        "filterRelevantThreshold": conf["relevancy_threshold"]
    }
def init_config_file_config(self):
    print "using config file"
    conf = Config("config.ini")
    self.pageLimit = conf["pageLimit"]
    self.linkLimit = conf["linkLimit"]
    self.relevantThreshold = conf["relevantThreshold"]
    classifierString = conf["classifier"]
    self.classifier = None
    if "NB" in classifierString.upper():
        self.classifier = NaiveBayesClassifier()
    elif "SVM" in classifierString.upper():
        self.classifier = SVMClassifier()
    seedKeywords = linesFromFile(conf["seedFile"])
    self.seedUrls = []
    urlsPerSeed = 10
    for keyword in seedKeywords:
        if "http" in keyword:
            self.seedUrls.append(keyword)
        else:
            seedUrlGenerator = google.search(keyword)
            searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
            self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
    print "seed urls: "
    print self.seedUrls
    self.blacklistDomains = linesFromFile(conf["blacklistFile"])
    self.trainingDocsPath = os.path.abspath(conf["trainingDocs"])
    self.labeled = {}
    self.labeled["relevantPath"] = os.path.join(self.trainingDocsPath, "relevant.txt")
    self.labeled["irrelevantPath"] = os.path.join(self.trainingDocsPath, "irrelevant.txt")
    self.labeled["relevantUrls"] = linesFromFile(self.labeled["relevantPath"])
    self.labeled["irrelevantUrls"] = linesFromFile(self.labeled["irrelevantPath"])
    self.vsm = {
        "on": conf["useVSM"],
        "filterModel": conf["VSMFilterModel"],
        "minRepositoryDocNum": conf["minRepositoryDocNum"],
        "filterIrrelevantThreshold": conf["filterIrrelevantThreshold"],
        "filterRelevantThreshold": conf["filterRelevantThreshold"]
    }
    print "done"
def scoreClassifier(postType, featureDimension):
    # MLP
    acc_list = []
    for k in range(2, 11):
        l = MLPClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                          'CSV Files_' + postType + '/label_hr24.csv', k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('MLP Classifier')
    plt.savefig('Images_' + postType + "/" + "MLP_Classifier.png")
    plt.close()
    # plt.show()

    # LR
    acc_list = []
    avgCoeff = np.zeros(shape=(1, featureDimension))
    for k in range(2, 11):
        l = LRClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                         'CSV Files_' + postType + '/label_hr24.csv', k, featureDimension)
        accuracy, lrCoeff = l.kfold_validator()
        acc_list.append(accuracy)
        avgCoeff = avgCoeff + lrCoeff
    avgCoeff /= 9
    print(avgCoeff)
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Logistic Regression')
    plt.savefig('Images_' + postType + "/" + "Logistic_Regression.png")
    plt.close()
    # plt.show()

    # SVM
    acc_list = []
    for k in range(2, 11):
        l = SVMClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                          'CSV Files_' + postType + '/label_hr24.csv', k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Support Vector Machine')
    plt.savefig('Images_' + postType + "/" + "Support_Vector_Machine.png")
    plt.close()
    # plt.show()

    # NB
    acc_list = []
    for k in range(2, 11):
        l = NBClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                         'CSV Files_' + postType + '/label_hr24.csv', k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Gaussian Naive Bayes')
    plt.savefig('Images_' + postType + "/" + "Gaussian_Naive_Bayes.png")
    plt.close()
def vcClassifier(postType, featureDimension):
    gp = GraphPlotting.GraphPlot()
    hours = [1, 6, 12, 24]

    # LR
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = LRClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k, featureDimension)
            acc_list.append(l.kfold_validator()[0])
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Logistic Regression", postType)

    # SVM
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = SVMClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                              'CSV Files/label_std_hr' + str(hour) + '.csv', k, featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Support Vector Machines", postType)

    # NB
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = NBClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k, featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Naive Bayes", postType)
def svm_classification(k, lenData, pctTest, params, C=1, gamma=1, kernel="rbf"):
    clear_csv()
    samples = []
    print(params)
    if params[0] == "PAIS":
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)
    svmClassifier = SVMClassifier(kernel, C, gamma)
    firstRound = cross_validation(k, svmClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures", "First")
    secondRound = cross_validation(k, svmClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures", "Second")
    secondWithFirst = cross_validation(k, svmClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude", "testingFeaturesFirstInclude", "Second")
    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]
    show_accuracy("SVM", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def initialize_data_vars(self, data_dict):
    # Seen/unseen data I/O
    self.seen_data_input = data_dict['seen_data_input']
    self.seen_data_output = data_dict['seen_data_output']
    self.unseen_data_input = data_dict['unseen_data_input']
    self.unseen_data_output = data_dict['unseen_data_output']
    self.seen_attr_mat = data_dict['seen_attr_mat']
    self.unseen_attr_mat = data_dict['unseen_attr_mat']
    self.seen_class_ids = data_dict['seen_class_ids']
    self.unseen_class_ids = data_dict['unseen_class_ids']
    self.num_attr = self.seen_attr_mat.shape[1]
    self.num_seen_classes = self.seen_attr_mat.shape[0]
    self.num_unseen_classes = self.unseen_attr_mat.shape[0]

    # Create training dataset
    print('Creating training dataset...')
    self.X_train = self.seen_data_input
    self.a_train = self.seen_attr_mat[self.seen_data_output, :]

    # Create test dataset
    print('Creating test dataset...')
    self.X_test = self.unseen_data_input
    self.a_test = self.unseen_attr_mat[self.unseen_data_output, :]

    if self.normalize:  # assumed instance flag, analogous to self.binary
        # Updates self.seen_mean and self.seen_std
        self.X_train, self.X_test = self.preprocess(self.X_train, self.X_test, clamp_thresh=3.0)

    # One model per attribute: classifiers for binary attributes, regressors otherwise
    self.clfs = []
    if self.binary:
        for _ in range(self.num_attr):
            self.clfs.append(SVMClassifier())
    else:
        for _ in range(self.num_attr):
            self.clfs.append(SVMRegressor())
    self.pprint()
def classfiySVM():
    svmInst = SVMClassifier(finalList, classList, testFinalList, testClassList)
    svmInst.classify()
def main():
    conf = FCConfig("config.ini")
    seedUrls = linesFromFile(conf["seedFile"])
    repositoryDocNames = linesFromFile(conf["docsFile"])

    if conf["labelFile"]:
        print "Using labels"
        labels = intLinesFromFile(conf["labelFile"])
        relevantDocs = [doc for doc, lab in zip(repositoryDocNames, labels) if lab == 1]
        irrelevantDocs = [doc for doc, lab in zip(repositoryDocNames, labels) if lab == 0]
    else:
        # use VSM model to label training docs
        vsmModel = None
        if conf["VSMFilterModel"].lower() == "tf-idf":
            vsmModel = TfidfScorer(getUrlTexts(seedUrls))
        elif conf["VSMFilterModel"].lower() == "lsi":
            vsmModel = LSIScorer(getUrlTexts(seedUrls))
        print "constructed vsm model"
        relevantDocs, irrelevantDocs = vsmModel.labelDocs(
            repositoryDocNames,
            conf["minRepositoryDocNum"],
            conf["filterIrrelevantThreshold"],
            conf["filterRelevantThreshold"])
        print len(relevantDocs), len(irrelevantDocs)

    # Train classifier
    classifier = None
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    trainSize = conf["trainDocNum"]
    if trainSize > testSize:
        raise Exception("Training size is larger than test size")
    trainDocs = relevantDocs[:trainSize] + irrelevantDocs[:trainSize]
    trainLabels = [1] * trainSize + [0] * trainSize
    if conf["classifier"].upper() == "NB":
        classifier = NaiveBayesClassifier()
    elif conf["classifier"].upper() == "SVM":
        classifier = SVMClassifier()
    classifier.trainClassifierFromNames(trainDocs, trainLabels)
    print "Training complete"

    # Test classifier
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    testDocs = relevantDocs[:testSize] + irrelevantDocs[:testSize]
    testLabels = [1] * testSize + [0] * testSize
    predictedLabels = list(classifier.predictFromNames(testDocs))

    # Statistical analysis (recall and precision)
    allRelevant = testSize
    allIrrelevant = testSize
    predictedRelevant = predictedLabels.count(1)
    predictedIrrelevant = predictedLabels.count(0)
    correctlyRelevant = 0
    for i in range(0, testSize):
        if predictedLabels[i] == 1:
            correctlyRelevant += 1
    correctlyIrrelevant = 0
    for i in range(testSize, 2 * testSize):
        if predictedLabels[i] == 0:
            correctlyIrrelevant += 1
    relevantRecall = float(correctlyRelevant) / allRelevant
    relevantPrecision = float(correctlyRelevant) / predictedRelevant
    irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
    irrelevantPrecision = float(correctlyIrrelevant) / predictedIrrelevant
    print relevantRecall, relevantPrecision

    # Seed the frontier with (priority, url) tuples and crawl
    t = [(-1, p) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    crawler = Crawler(priorityQueue, classifier, 10)
    crawler.crawl()
    print crawler.relevantPagesCount
    print crawler.pagesCount
class FocusedCrawler:

    def init_command_line_config(self):
        print "using command line argparser"
        from argparser import argdict
        conf = argdict
        self.pageLimit = conf["page_limit"]
        self.linkLimit = conf["link_limit"]
        self.relevantThreshold = conf["relevancy_threshold"]
        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()
        seeds = conf["seeds"]
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seeds:
            if keyword is not None and keyword != "":
                if "http" in keyword:
                    self.seedUrls.append(keyword)
                else:
                    seedUrlGenerator = google.search(keyword)
                    searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                    self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
            else:
                raise Exception("Seed is not valid: (" + str(keyword) + ") -- it must be a keyword or URL")
        print "seed urls: "
        print self.seedUrls
        self.blacklistDomains = conf["blacklist_domains"]
        self.labeled = {}
        self.labeled["relevantUrls"] = conf["relevant_urls"]
        for url in self.labeled["relevantUrls"]:
            if url is None or url == "":
                raise Exception("Relevant URL is not valid: (" + str(url) + ")")
        self.labeled["irrelevantUrls"] = conf["irrelevant_urls"]  # assumed key for the irrelevant-URL list
        for url in self.labeled["irrelevantUrls"]:
            if url is None or url == "":
                raise Exception("Irrelevant URL is not valid: (" + str(url) + ")")
        self.vsm = {
            "on": conf["vsm"],
            "filterModel": conf["vsm_filter"],
            "minRepositoryDocNum": conf["min_repo_doc_num"],
            "filterIrrelevantThreshold": conf["irrelevancy_threshold"],
            "filterRelevantThreshold": conf["relevancy_threshold"]
        }

    def init_config(self):
        if len(sys.argv) > 1:
            self.init_command_line_config()
        else:
            self.init_config_file_config()

    def init_config_file_config(self):
        print "using config file"
        conf = Config("config.ini")
        self.pageLimit = conf["pageLimit"]
        self.linkLimit = conf["linkLimit"]
        self.relevantThreshold = conf["relevantThreshold"]
        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()
        seedKeywords = linesFromFile(conf["seedFile"])
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seedKeywords:
            if "http" in keyword:
                self.seedUrls.append(keyword)
            else:
                seedUrlGenerator = google.search(keyword)
                searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
        print "seed urls: "
        print self.seedUrls
        self.blacklistDomains = linesFromFile(conf["blacklistFile"])
        self.trainingDocsPath = os.path.abspath(conf["trainingDocs"])
        self.labeled = {}
        self.labeled["relevantPath"] = os.path.join(self.trainingDocsPath, "relevant.txt")
        self.labeled["irrelevantPath"] = os.path.join(self.trainingDocsPath, "irrelevant.txt")
        self.labeled["relevantUrls"] = linesFromFile(self.labeled["relevantPath"])
        self.labeled["irrelevantUrls"] = linesFromFile(self.labeled["irrelevantPath"])
        self.vsm = {
            "on": conf["useVSM"],
            "filterModel": conf["VSMFilterModel"],
            "minRepositoryDocNum": conf["minRepositoryDocNum"],
            "filterIrrelevantThreshold": conf["filterIrrelevantThreshold"],
            "filterRelevantThreshold": conf["filterRelevantThreshold"]
        }
        print "done"

    def setup_model(self):
        if self.vsm["on"]:
            self.setup_vsm_model()
        else:
            self.setup_labeled_model()
        self.testSize = min(len(self.relevantDocs), len(self.irrelevantDocs))
    def setup_labeled_model(self):
        print "Using labels provided by relevant.txt & irrelevant.txt"
        if self.labeled["irrelevantUrls"] is not None and len(self.labeled["irrelevantUrls"]) > 0:
            self.irrelevantDocs = [Webpage(url).save_tmp() for url in self.labeled["irrelevantUrls"]]
        else:
            raise Exception("Irrelevant URLs must be provided for classification")
        if self.labeled["relevantUrls"] is not None and len(self.labeled["relevantUrls"]) > 0:
            self.relevantDocs = [Webpage(url).save_tmp() for url in self.labeled["relevantUrls"]]
        else:
            raise Exception("Relevant URLs must be provided for classification")
        print "Found {} relevantDocs & {} irrelevantDocs".format(len(self.relevantDocs), len(self.irrelevantDocs))

    def setup_vsm_model(self):
        # use VSM model to label training docs
        self.vsm["model"] = None
        if self.vsm["filterModel"].lower() == "tf-idf":
            self.vsm["model"] = TfidfScorer(getUrlTexts(self.seedUrls))
        elif self.vsm["filterModel"].lower() == "lsi":
            self.vsm["model"] = LSIScorer(getUrlTexts(self.seedUrls))
        if self.vsm["model"] is None:
            print "No filter model specified. Cannot construct vsm model"
            sys.exit()
        else:
            print "constructed vsm model"
        self.relevantDocs, self.irrelevantDocs = self.vsm["model"].labelDocs(
            self.repositoryDocNames,
            self.vsm["minRepositoryDocNum"],
            self.vsm["filterIrrelevantThreshold"],
            self.vsm["filterRelevantThreshold"])

    def train_classifier(self):
        print "Training classifier"
        trainDocs = self.relevantDocs + self.irrelevantDocs
        trainLabels = [1] * len(self.relevantDocs) + [0] * len(self.irrelevantDocs)
        self.classifier.trainClassifierFromNames(trainDocs, trainLabels)
        print "Training complete"

    # Statistical analysis (recall and precision)
    def stat_analysis(self):
        testDocs = self.relevantDocs[:self.testSize] + self.irrelevantDocs[:self.testSize]
        testLabels = [1] * self.testSize + [0] * self.testSize
        self.predictedLabels = list(self.classifier.predictFromNames(testDocs))
        allRelevant = self.testSize
        allIrrelevant = self.testSize
        self.predictedRelevant = self.predictedLabels.count(1)
        self.predictedIrrelevant = self.predictedLabels.count(0)
        correctlyRelevant = 0
        for i in range(0, self.testSize):
            if self.predictedLabels[i] == 1:
                correctlyRelevant += 1
        correctlyIrrelevant = 0
        for i in range(self.testSize, 2 * self.testSize):
            if self.predictedLabels[i] == 0:
                correctlyIrrelevant += 1
        self.relevantRecall = float(correctlyRelevant) / allRelevant
        self.relevantPrecision = float(correctlyRelevant) / self.predictedRelevant
        self.irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
        self.irrelevantPrecision = float(correctlyIrrelevant) / self.predictedIrrelevant
        print self.relevantRecall, self.relevantPrecision

    def crawl(self):
        t = [(-1, p) for p in self.seedUrls]
        priorityQueue = PriorityQueue(t)
        crawler = Crawler(
            priorityQueue,
            self.classifier,
            self.pageLimit,
            self.linkLimit,
            self.relevantThreshold,
            self.blacklistDomains)
        crawler.crawl()
        print crawler.relevantPages
        print float(len(crawler.relevantPages)) / len(crawler.visited)  # float ratio (avoid Python 2 integer division)

    def cleanup_tmp_html_files(self):
        for filepath in self.relevantDocs + self.irrelevantDocs:
            os.remove(filepath)
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))


if __name__ == '__main__':
    trainFilePath = sys.argv[1]  # please give the path to your reformatted quasar-s json train file
    valFilePath = sys.argv[2]  # provide the path to val file
    retrievalInstance = Retrieval()
    featurizerInstance = [
        TfIdfFeaturizer(),
        CountFeaturizer(),
        HashVectorizer()
    ]
    classifierInstance = [
        NNClassifier(),
        SVMClassifier(),
        MultinomialNaiveBayes()
    ]
    # Run the pipeline for every featurizer/classifier combination
    for feature in featurizerInstance:
        for classifier in classifierInstance:
            trainInstance = Pipeline(trainFilePath, valFilePath, retrievalInstance,
                                     feature, classifier)
            print("-------------------------------------------------")
def SVMDriver(self):
    svm = SVMClassifier(self.train_x, self.train_y, self.test_x, self.test_y)

    # NuSVC variant
    svm.train_1()
    svm_labels = svm.predic()
    svm_acc = svm.getAccuracy()
    svm.printResult()
    self.acc['svm-NuSVC'] = {
        'accuracy': svm_acc,
        'train-time': svm.trainTime(),
        'test-time': svm.testTime(),
    }

    # LinearSVC variant
    svm.train_2()
    svm_labels_2 = svm.predic()
    svm_acc = svm.getAccuracy()
    svm.printResult()
    self.acc['svm-LinearSVC'] = {
        'accuracy': svm_acc,
        'train-time': svm.trainTime(),
        'test-time': svm.testTime(),
    }

    # SVC variant
    svm.train_3()
    svm_labels_3 = svm.predic()
    svm_acc = svm.getAccuracy()
    svm.printResult()
    self.acc['svm-SVC'] = {
        'accuracy': svm_acc,
        'train-time': svm.trainTime(),
        'test-time': svm.testTime(),
    }
def DirectAttributePrediction(classifier='SVM', predicate_type='binary', C=10.0):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./', name='train', predicate_type=predicate_type)
    test_attributes = get_class_attributes('./', name='test', predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)
    print('X_train to dense...')
    X_train = X_train.toarray()
    Xplat_train, Xplat_val, yplat_train, yplat_val = train_test_split(
        X_train, y_train, test_size=0.10, random_state=42)

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)
    print('X_test to dense...')
    X_test = X_test.toarray()

    # CHOOSING SVM
    if classifier == 'SVM':
        platt_params = []
        for i in range(N_ATTRIBUTES):
            print('--------- Attribute %d/%d ---------' % (i + 1, N_ATTRIBUTES))
            t0 = time()

            # SVM classifier for binary predicates, regressor otherwise
            if predicate_type == 'binary':
                clf = SVMClassifier()
            else:
                clf = SVMRegressor()

            # Training
            clf.fit(X_train, y_train[:, i])
            print('Fitted classifier in: %fs' % (time() - t0))
            if predicate_type == 'binary':
                clf.set_platt_params(Xplat_val, yplat_val[:, i])

            # Predicting
            print('Predicting for attribute %d...' % (i + 1))
            y_pred[:, i] = clf.predict(X_test)
            if predicate_type == 'binary':
                y_proba[:, i] = clf.predict_proba(X_test)

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_SVM', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/platt_params_SVM', platt_params)
            np.savetxt('./DAP_' + predicate_type + '/probabilities_SVM', y_proba)

    # CHOOSING NEURAL NETWORK
    if classifier == 'NN':
        if predicate_type != 'binary':
            clf = NeuralNetworkRegressor(dim_features=X_train.shape[1], nb_attributes=N_ATTRIBUTES)
        else:
            clf = NeuralNetworkClassifier(dim_features=X_train.shape[1], nb_attributes=N_ATTRIBUTES)

        print('Fitting Neural Network...')
        clf.fit(X_train, y_train)
        print('Predicting attributes...')
        y_pred = np.array(clf.predict(X_test))
        y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
        y_proba = y_pred

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/probabilities_NN', y_proba)
from VectorizedDataset import VectorizedDataset
from SpectrumKernel import SpectrumKernel
from SVMClassifier import SVMClassifier
import kernels
import utils

path = 'data/'
Kernel = SpectrumKernel
n_grams_list = [4, 3, 2]
VD = VectorizedDataset(Kernel, path, n_grams_list)

# Train one SVM per dataset split, then report train/validation accuracy
SVMC = {}
Cs = [0.9, 1.4, 1.2]
for i in range(3):
    SVMC[i] = SVMClassifier(kernel=kernels.rbf, C=Cs[i])
    SVMC[i].fit(VD.X[i]['train'], VD.Y[i]['train'])
    print("fit done for training {}".format(i))
    print("Training accuracy for classifier {} : ".format(i) +
          str(utils.compute_val_accuracy(SVMC[i], VD.X[i]['train'], VD.Y[i]['train'])))
    print("Validation accuracy for classifier {} : ".format(i) +
          str(utils.compute_val_accuracy(SVMC[i], VD.X[i]['val'], VD.Y[i]['val'])))

utils.generate_submission_file(SVMC, path, n_grams_list, submission_filename='Yte.csv')