Example #1
# scikit-learn imports used below; SVM, utils, and the BATCH_SIZE /
# LEARNING_RATE / NUM_CLASSES constants come from the surrounding project
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def main(arguments):
    # load the breast cancer dataset once; take its feature matrix here
    dataset = datasets.load_breast_cancer()
    features = dataset.data

    # standardize the features
    features = StandardScaler().fit_transform(features)

    # get the number of features
    num_features = features.shape[1]

    # load the corresponding labels for the features
    labels = dataset.target

    # transform the labels to {-1, +1}
    labels[labels == 0] = -1

    # split the dataset to 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # trim each split so its size is an exact multiple of BATCH_SIZE
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    # instantiate the SVM class
    model = SVM(
        alpha=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        svm_c=arguments.svm_c,
        num_classes=NUM_CLASSES,
        num_features=num_features,
    )

    # train the instantiated model
    model.train(
        epochs=arguments.num_epochs,
        log_path=arguments.log_path,
        train_data=[train_features, train_labels],
        train_size=train_features.shape[0],
        validation_data=[test_features, test_labels],
        validation_size=test_features.shape[0],
        result_path=arguments.result_path,
    )

    test_conf, test_accuracy = utils.plot_confusion_matrix(
        phase="testing",
        path=arguments.result_path,
        class_names=["benign", "malignant"])

    print("True negatives : {}".format(test_conf[0][0]))
    print("False negatives : {}".format(test_conf[1][0]))
    print("True positives : {}".format(test_conf[1][1]))
    print("False positives : {}".format(test_conf[0][1]))
    print("Testing accuracy : {}".format(test_accuracy))
Example #2
File: main.py Project: EthanWelsh/Yulp
def main():

    reviews = retrieve_reviews(5000)

    # Split reviews into a training and testing portion
    train_reviews = reviews[:4500]
    test_reviews = reviews[4500:]  # no "+ 1": slices are end-exclusive, so this starts right after the training split

    # Separate text and label to use during the training process
    text, labels = zip(*train_reviews)

    vector = FeatureVector()

    # Add features into feature vector
    vector.append(sentiment.SentimentAnalysis())
    vector.append(tfidf.TfIdf())
    vector.append(readability.Readability())
    vector.append(food_sophistication.FoodSophistication())
    vector.append(average_word_length.AverageWordLength())
    vector.append(rarity.Rarity())
    vector.append(spelling.Spelling())
    vector.append(sentence_topic.SentenceTopic())

    # Train all of the features individually
    vector.train(text, labels)

    model = SVM(vector)
    model.train(text, labels)

    # Separate text and label to use during the testing process
    text, labels = zip(*test_reviews)

    matches = 0
    distance = {}

    for i in range(len(labels)):
        predicted_score = model.predict(text[i])
        actual_score = labels[i]

        # count how many predicted scores match with the actual ones
        if predicted_score == actual_score:
            matches += 1

        # get a histogram of how far predicted scores differ from the actual
        dist = abs(predicted_score - actual_score)
        distance[dist] = distance.get(dist, 0) + 1

    print('Matches = {:.2%}'.format(matches / len(labels)))

    for dist, count in distance.items():
        print("{} : {}".format(dist, count))
Example #3
def main():

    reviews = retrieve_reviews(5000)

    # Split reviews into a training and testing portion
    train_reviews = reviews[:4000]
    test_reviews = reviews[4000:]  # the original "[4001 + 1:]" silently dropped two reviews

    # Separate text and label to use during the training process
    text, labels = zip(*train_reviews)

    vector = FeatureVector()

    # Add features into feature vector
    vector.append(average_word_length.AverageWordLength())
    vector.append(sentiment_analysis.SentimentAnalysis())
    vector.append(rarity_analysis.Rarity())
    vector.append(tfidf.TfIdf())
    vector.append(readability.Readability())
    vector.append(spelling.Spelling())

    # Train all of the features individually
    vector.train(text, labels)

    model = SVM(vector)
    model.train(text, labels)

    # Separate text and label to use during the testing process
    text, labels = zip(*test_reviews)

    matches = 0
    distance = {}

    for i in range(len(labels)):
        predicted_score = model.predict(text[i])
        actual_score = labels[i]

        # count how many predicted scores match with the actual ones
        if predicted_score == actual_score:
            matches += 1

        # get a histogram of how far predicted scores differ from the actual
        dist = abs(predicted_score - actual_score)
        distance[dist] = distance.get(dist, 0) + 1

    print('Matches = {0:.2f}%'.format((matches / len(labels)) * 100))

    for dist, count in distance.items():
        print("{} : {}".format(dist, count))
Example #4
# standard-library / NumPy imports used below; flatten, negative_sampling,
# log, SVM, and feature_attr_name come from the surrounding project
import time
from datetime import timedelta

import numpy as np


def svm_experiment(train_data, validation_data, test_data):
    # merge train and validation
    log.write("Preparing data training")
    # build feature matrix
    train_data = flatten([train_data, validation_data])
    train_feature_matrix = []
    train_label_vector = []
    for doc in train_data:
        gold_labels = flatten(doc["gold_labels"])
        # one feature row (and one gold label) per sentence, in document order
        for idx, _sentence in enumerate(flatten(doc["paragraphs"])):
            sentence_feature = []
            for attr in feature_attr_name:
                sentence_feature.append(doc[attr][idx])
            train_feature_matrix.append(sentence_feature)
            train_label_vector.append(gold_labels[idx])

    # keep one random 1/20th slice of the merged training data
    n_data = len(train_feature_matrix)
    split_length = int(n_data / 20)
    rand_split = np.random.randint(20)
    offset = rand_split * split_length
    end = offset + split_length

    train_feature_matrix = np.array(train_feature_matrix[offset:end])
    train_label_vector = np.array(train_label_vector[offset:end])
    train_feature_matrix, train_label_vector = negative_sampling(
        train_feature_matrix, train_label_vector)

    log.write("Preparing data testing")
    test_feature_matrix = []
    for doc in test_data:
        for idx, _sentence in enumerate(flatten(doc["paragraphs"])):
            sentence_feature = []
            for attr in feature_attr_name:
                sentence_feature.append(doc[attr][idx])
            test_feature_matrix.append(sentence_feature)
    test_feature_matrix = np.array(test_feature_matrix)

    log.write("Training SVM")
    conf = {"kernel": "rbf_kernel", "degree": 2, "sigma": 1, "C": 100}
    log.write(conf)
    svm_clf = SVM(kernel=conf["kernel"], C=conf["C"], sigma=conf["sigma"])
    svm_clf.fit(train_feature_matrix, train_label_vector)
    t1 = time.time()
    log.write("Testing SVM")
    predicted_labels, val = svm_clf.predict(test_feature_matrix)
    t2 = time.time()
    print('Prediction time: {}'.format(timedelta(seconds=t2 - t1)))
    # map the raw SVM outputs {-1, +1} onto the {0, 1} labels used downstream
    predicted_labels = [1 if i > -1 else 0 for i in predicted_labels]
    return predicted_labels
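
The config names an rbf_kernel with sigma=1; that project's kernel code is not shown, but the standard Gaussian RBF kernel it presumably refers to is K(x, y) = exp(-||x - y||^2 / (2 * sigma^2)). A minimal NumPy sketch:

import numpy as np

def rbf_kernel(x, y, sigma=1.0):
    # Gaussian RBF kernel between two feature vectors
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-np.dot(diff, diff) / (2.0 * sigma ** 2))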
Example #5
    def test_train_improper_arguments(self):
        fv = FeatureVector()
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())

        # the labels are deliberately one row short of the reviews, so
        # train() is expected to reject the mismatched arguments
        with pytest.raises(TypeError):
            SVM(feature_vector=fv).train(
                reviews=TestSVM.sample_reviews,
                labels=TestSVM.sample_labels[:-1, :])
Example #6
    def test_svm_predict(self):

        fv = FeatureVector()
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())

        svm = SVM(feature_vector=fv)
        svm.train(reviews=TestSVM.sample_reviews, labels=TestSVM.sample_labels)

        # a trained binary classifier should only ever emit the labels 0 or 1
        assert svm.predict(['HI']) in (0, 1)
        assert svm.predict(['earth']) in (0, 1)
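
The BogusFeature fixture these tests rely on is not shown. A plausible stand-in is a feature that satisfies the interface while returning a constant; the class body below is an assumption, not the project's actual fixture.

class BogusFeature:
    # hypothetical test double: trains to a no-op, emits a constant value
    def train(self, texts, labels):
        pass  # nothing to fit

    def score(self, text):
        return 0.5  # constant feature value, enough to exercise the SVM plumbing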
Example #7
    def test_train(self):

        fv = FeatureVector()
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())
        fv.append(TestSVM.BogusFeature())

        try:
            SVM(feature_vector=fv).train(
                reviews=TestSVM.sample_reviews,
                labels=TestSVM.sample_labels)
        except Exception:  # narrower than a bare except:, which would also swallow KeyboardInterrupt
            pytest.fail('SVM training failed')
Example #8
    def __init__(self,
                 classifier_type,
                 classifier_params,
                 kernel_name,
                 kernel_params,
                 id_model=0):

        # define kernel
        self.kernel = Kernel(kernel_name, kernel_params)
        # kernel tag used for kernel matrix saves
        self.tag_kernel = str(id_model) + "_kernel_" + kernel_name

        # define classifier
        if classifier_type == "svm":
            self.classifier = SVM(classifier_params)
        elif classifier_type == "l_regression":
            self.classifier = LogisticRegression(lambda_regularisation=0.01)

        # kernel matrices
        self.kernel_mat_train = None
        self.kernel_mat_val = None
        self.kernel_mat_test = None

        # load / save settings for the kernel matrices
        self.do_save_kernel = kernel_params["save_kernel"]
        self.save_name = kernel_params["save_name"]
        if kernel_params["load_kernel"]:
            self.kernel_mat_train = load_object("train_" + self.tag_kernel +
                                                "_" +
                                                kernel_params["load_name"])
            self.kernel_mat_val = load_object("val_" + self.tag_kernel + "_" +
                                              kernel_params["load_name"])
            self.kernel_mat_test = load_object("test_" + self.tag_kernel +
                                               "_" +
                                               kernel_params["load_name"])
            if self.kernel_mat_train is None:
                print("## kernel load failed: kernel not found")
            else:
                print("## kernel matrix loaded")