示例#1
0
def question_j():
    """Run the multiclass classification experiment for Question J.

    Fetches a four-category subset through ``utils.fetch_data``, builds the
    matrices produced by ``utils.model_data`` for the train and test splits,
    reduces them to 50 components with truncated SVD (LSI), then trains and
    evaluates four classifiers (one-vs-one and one-vs-rest wrappers around
    Gaussian Naive Bayes and a linear-kernel SVM). Each classifier's
    predictions are handed to ``utils.calculate_stats``.

    Returns:
        None. Progress is reported through ``logging``.
    """
    logging.info("<Question J> Multiclass Classification")
    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utils.fetch_data(category)

    logging.info("Creating TFxIDF Vector Representations")
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)

    logging.info("Performing LSI on TFxIDF Matrices")
    # Apply LSI to the TFxIDF matrices. The decomposition is learned on the
    # training split only; the test split is merely projected onto the same
    # components so that both splits live in one latent space. Refitting on
    # the test split would hand the classifiers unrelated features.
    # NOTE(review): transform() needs test_idf to have the same columns as
    # train_idf - confirm utils.model_data shares one vocabulary across calls.
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logging.info("TFxIDF Matrices Transformed")

    logging.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    # Display name paired with the estimator it describes.
    classifiers = [
        ('OneVsOneClassifier Naive Bayes', OneVsOneClassifier(GaussianNB())),
        ('OneVsOneClassifier SVM',
         OneVsOneClassifier(svm.SVC(kernel='linear'))),
        ('OneVsRestClassifier Naive Bayes', OneVsRestClassifier(GaussianNB())),
        ('OneVsRestClassifier SVM',
         OneVsRestClassifier(svm.SVC(kernel='linear'))),
    ]

    # Train each classifier on the reduced training data and score it on the
    # projected test data.
    for clf_n, clf in classifiers:
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
示例#2
0
def main():
    """Cluster the 20 newsgroups corpus into 6 groups over a range of SVD sizes.

    Every document is relabelled with the index of the class list (in
    ``classes``) that contains its newsgroup. For each dimension ``d`` in
    ``range(2, 75)`` the matrix from ``utils.model_data`` is passed through
    truncated SVD, ``log1p`` and normalisation, clustered with k-means, and
    scored with ``utils.calculate_stats``. The scores are plotted against
    ``d`` and written to ``plots/part6.png``.

    Raises:
        KeyError: if the corpus contains a newsgroup that is absent from
            every list in ``classes``.
    """
    import os  # local import: only needed to create the output directory

    # Relabel newsgroups with the index of the coarse class they belong to.
    classes = [
        computer_technologies, recreational_activity, science, miscellaneus,
        politics, religion
    ]
    class_of_category = {}
    for cnum, members in enumerate(classes):
        for category in members:
            class_of_category[category] = cnum

    data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
    # data.target holds indices into data.target_names, so translate through
    # the newsgroup name; the position of a name inside the hand-written class
    # lists need not match the dataset's own ordering.
    data.target = [
        class_of_category[data.target_names[label]] for label in data.target
    ]
    data_idf = utils.model_data(data, 'part6')

    # Find effective dimensions to retrieve data
    k = 6
    ds = range(2, 75, 1)
    svd_metrics = []
    print("Varying Dimensions")
    for d in ds:
        print("Set d = ", d)
        svd = TruncatedSVD(n_components=d)
        poly = FunctionTransformer(np.log1p)
        normalizer = Normalizer(copy=False)
        svd_pipeline = make_pipeline(svd, poly, normalizer)
        X_SVD = svd_pipeline.fit_transform(data_idf)
        kmeans = KMeans(n_clusters=k).fit(X_SVD)
        svd_metrics.append(utils.calculate_stats(data.target, kmeans.labels_))

    # NOTE(review): assumes utils.calculate_stats returns its scores in this
    # order - confirm against the helper.
    metric_names = [
        'homogeneity_score', 'completeness_score', 'adjusted_rand_score',
        'adjusted_mutual_info_score'
    ]

    for idx, metric_name in enumerate(metric_names):
        plt.plot(ds, [stats[idx] for stats in svd_metrics], label=metric_name)
    plt.xlabel('Dimensions')
    plt.ylabel('Metric Value')
    plt.legend(loc='best')
    # Make sure the target directory exists so the sweep's results are not
    # lost to a missing-folder error at the very end.
    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/part6.png', format='png')
    plt.clf()
示例#3
0
                    type=int,
                    default=4096,
                    help='Neurons in the Hidden Layer')
parser.add_argument('--epochs', type=int, default=5, help='Epochs')
parser.add_argument('--gpu', type=str, default='cuda', help='GPU or CPU')
parser.add_argument('--save_dir',
                    type=str,
                    default='checkpoint.pth',
                    help='Path to checkpoint')
# parse_known_args returns unrecognised options separately instead of
# aborting on them.
arg, unknown = parser.parse_known_args()

# Build the transforms, datasets and loaders through the project helpers.
train_transforms, valid_transforms, test_transforms = data_transforms()
train_data, valid_data, test_data = data_loader(train_transforms,
                                                valid_transforms,
                                                test_transforms)
trainloader, validloader, testloader = model_data(train_data, valid_data,
                                                  test_data)

# NOTE(review): the --gpu option is not consulted here; the device depends
# only on CUDA availability - confirm whether that is intended.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# input_size must equal the number of features the pretrained backbone feeds
# into its classifier head: 25088 for VGG-16, 1024 for DenseNet-121.
if arg.arch == 'vgg':
    input_size = 25088
    model = models.vgg16(pretrained=True)
elif arg.arch == 'densenet':
    input_size = 1024
    model = models.densenet121(pretrained=True)
else:
    # Fail with a clear message rather than a NameError on `model` below.
    raise ValueError(
        "Unsupported architecture '{0}'; expected 'vgg' or 'densenet'".format(
            arg.arch))

# Freeze the pretrained weights so only the replacement classifier is trained.
for param in model.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(nn.Linear(input_size, arg.hidden_units),
                                 nn.ReLU(), nn.Dropout(0.5),
示例#4
0
def question_d():
    """Run the Question D step: LSI reduction of the train and test matrices.

    Builds one matrix per split with ``utils.model_data`` and passes both to
    ``utils.apply_lsi``. The two values that call returns are unpacked and
    then discarded; nothing is returned.
    """
    logging.info("<Question D>Reducing data to 50 dimensional vector")
    idf_for_train = utils.model_data(proc_train_set)
    idf_for_test = utils.model_data(proc_test_set)
    # Keep the two-value unpacking even though neither result is used here.
    _reduced_train, _reduced_test = utils.apply_lsi(idf_for_train,
                                                    idf_for_test)
示例#5
0
def question_b():
    """Run the Question B step on the processed training set.

    Logs the step banner and calls ``utils.model_data`` on
    ``proc_train_set``; the result of that call is not kept or returned.
    """
    banner = "<Question B> Getting the TFxIDF representation"
    logging.info(banner)
    utils.model_data(proc_train_set)
示例#6
0
def question_1():
    """Run the Question 1 step on ``data_set``.

    Logs the step banner and calls ``utils.model_data`` with ``data_set`` and
    the string ``"train_idf"`` as its second positional argument; the result
    of that call is not kept or returned.
    """
    banner = "<Question 1> Getting the TFxIDF representation"
    logging.info(banner)
    output_tag = "train_idf"
    utils.model_data(data_set, output_tag)