def question_j():
    """Train and evaluate four multiclass classifiers on LSI-reduced data.

    Fetches four newsgroup categories through ``utils.fetch_data``, builds
    the matrices with ``utils.model_data``, reduces them to 50 components
    with ``TruncatedSVD`` and then, for one-vs-one and one-vs-rest wrappers
    around Gaussian naive Bayes and a linear SVM, fits on the training split
    and hands the test predictions to ``utils.calculate_stats``.

    The SVD is fitted on the training matrix only and the test matrix is
    projected with that same fitted model, so both splits live in the same
    latent space. Re-fitting on the test matrix (the previous behaviour)
    yields components unrelated to the ones the classifiers were trained on.

    NOTE(review): projecting the test matrix requires that
    ``utils.model_data`` produces the same feature columns for both splits;
    that helper is not visible here. When the column counts differ, the
    function falls back to the previous behaviour and logs a warning instead
    of failing.
    """
    logging.info("<Question J> Multiclass Classification")

    category = [
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'misc.forsale',
        'soc.religion.christian',
    ]
    train, test = utils.fetch_data(category)

    logging.info("Creating TFxIDF Vector Representations")
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)

    logging.info("Performing LSI on TFxIDF Matrices")
    # Fit the LSI projection on the training data only.
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    if test_idf.shape[1] == train_idf.shape[1]:
        # Reuse the train-fitted components for the test split.
        test_lsi = svd.transform(test_idf)
    else:
        logging.warning(
            "Train/test matrices have different feature counts "
            "({0} vs {1}); fitting a separate SVD on the test set, "
            "so the two projections are not comparable".format(
                train_idf.shape[1], test_idf.shape[1]))
        test_lsi = svd.fit_transform(test_idf)

    logging.info("TFxIDF Matrices Transformed")
    logging.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    # Keep each display name next to its estimator so they cannot drift apart.
    classifiers = [
        ('OneVsOneClassifier Naive Bayes',
         OneVsOneClassifier(GaussianNB())),
        ('OneVsOneClassifier SVM',
         OneVsOneClassifier(svm.SVC(kernel='linear'))),
        ('OneVsRestClassifier Naive Bayes',
         OneVsRestClassifier(GaussianNB())),
        ('OneVsRestClassifier SVM',
         OneVsRestClassifier(svm.SVC(kernel='linear'))),
    ]

    # perform classification
    for clf_n, clf in classifiers:
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)

        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
def main():
    """Sweep the SVD dimensionality and plot k-means clustering metrics.

    Relabels the full 20-newsgroups corpus into six coarse classes, then for
    every ``d`` in ``range(2, 75)`` reduces the matrix returned by
    ``utils.model_data`` with TruncatedSVD -> log1p -> Normalizer, clusters
    it with ``KMeans(n_clusters=6)`` and records
    ``utils.calculate_stats(data.target, kmeans.labels_)``. One curve per
    metric is written to ``plots/part6.png``.

    The relabelling goes through the newsgroup *name*: ``data.target`` holds
    indices into ``data.target_names``, which need not be in the same order
    as the concatenated class lists, so a purely positional mapping can
    assign documents to the wrong class.

    NOTE(review): assumes the six module-level class lists contain newsgroup
    name strings and that ``utils.calculate_stats`` returns a sequence
    ordered like ``metric_names`` below -- neither is visible from here.

    Raises:
        KeyError: if a newsgroup present in the data is missing from every
            class list.
    """
    import os

    # Relabelling: newsgroup name -> index of the coarse class containing it.
    classes = [
        computer_technologies, recreational_activity, science, miscellaneus,
        politics, religion
    ]
    class_of = {
        category: cnum
        for cnum, members in enumerate(classes)
        for category in members
    }

    data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
    # Resolve each target index to its name before looking up the class.
    data.target = [class_of[data.target_names[t]] for t in data.target]

    data_idf = utils.model_data(data, 'part6')

    # Find Effective dimensions to retrieve data
    k = 6
    ds = range(2, 75)
    svd_metrics = []

    print("Varying Dimensions")
    for d in ds:
        print("Set d = ", d)
        svd = TruncatedSVD(n_components=d)
        poly = FunctionTransformer(np.log1p)
        normalizer = Normalizer(copy=False)
        svd_pipeline = make_pipeline(svd, poly, normalizer)
        X_SVD = svd_pipeline.fit_transform(data_idf)

        kmeans = KMeans(n_clusters=k).fit(X_SVD)
        svd_metrics.append(utils.calculate_stats(data.target, kmeans.labels_))

    metric_names = [
        'homogeneity_score', 'completeness_score', 'adjusted_rand_score',
        'adjusted_mutual_info_score'
    ]

    # One curve per metric: the idx-th entry of every recorded stats result.
    for idx, metric_name in enumerate(metric_names):
        plt.plot(ds, [stats[idx] for stats in svd_metrics], label=metric_name)

    plt.xlabel('Dimensions')
    plt.ylabel('Metric Value')
    plt.legend(loc='best')

    # savefig does not create missing directories.
    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/part6.png', format='png')
    plt.clf()
type=int, default=4096, help='Neurons in the Hidden Layer') parser.add_argument('--epochs', type=int, default=5, help='Epochs') parser.add_argument('--gpu', type=str, default='cuda', help='GPU or CPU') parser.add_argument('--save_dir', type=str, default='checkpoint.pth', help='Path to checkpoint') arg, unknown = parser.parse_known_args() train_transforms, valid_transforms, test_transforms = data_transforms() train_data, valid_data, test_data = data_loader(train_transforms, valid_transforms, test_transforms) trainloader, validloader, testloader = model_data(train_data, valid_data, test_data) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if arg.arch == 'vgg': input_size = 25088 model = models.vgg16(pretrained=True) elif arg.aech == 'densenet': input_size = 25088 model = models.densenet121(pretrained=True) for param in model.parameters(): param.requires_grad = False model.classifier = nn.Sequential(nn.Linear(input_size, arg.hidden_units), nn.ReLU(), nn.Dropout(0.5),
def question_d():
    """Run both processed splits through ``model_data`` and ``apply_lsi``.

    Reads the module-level ``proc_train_set`` and ``proc_test_set``. The pair
    returned by ``utils.apply_lsi`` is unpacked and discarded; nothing is
    returned.
    """
    logging.info("<Question D>Reducing data to 50 dimensional vector")

    # Training split first, then test split.
    tfidf_train, tfidf_test = (
        utils.model_data(split) for split in (proc_train_set, proc_test_set)
    )

    # The two results are not used here; unpack them into throwaway names.
    _, _ = utils.apply_lsi(tfidf_train, tfidf_test)
def question_b():
    """Log the Question B banner and run ``utils.model_data`` on the train set.

    Reads the module-level ``proc_train_set``; the value returned by
    ``utils.model_data`` is discarded and nothing is returned.
    """
    banner = "<Question B> Getting the TFxIDF representation"
    logging.info(banner)

    utils.model_data(proc_train_set)
def question_1():
    """Log the Question 1 banner and run ``utils.model_data`` on ``data_set``.

    Reads the module-level ``data_set`` and passes ``"train_idf"`` as the
    second positional argument (presumably a label for the result -- confirm
    against ``utils.model_data``). The return value is discarded.
    """
    banner = "<Question 1> Getting the TFxIDF representation"
    logging.info(banner)

    label = "train_idf"
    utils.model_data(data_set, label)