def test_labels():
    """Check that TextToLabels maps "ham" and "spam" to distinct indices.

    The encoder is built from the labels of the SMSSpam training split and
    must encode a small sample sequence with those same indices.
    """
    # keep only the label column of the training examples
    train_pairs = classify.read_smsspam("smsspam/SMSSpamCollection.train")
    train_labels = [tag for tag, _ in train_pairs]

    # build the label encoder from the training labels
    encoder = classify.TextToLabels(train_labels)

    # the two classes must not share an index
    ham = encoder.index("ham")
    spam = encoder.index("spam")
    assert ham != spam

    # encoding a sequence must yield the matching indices, in order
    expected = [ham, spam, spam]
    encoded = encoder(["ham", "spam", "spam"])
    assert np.all(encoded == expected)
def test_read_smsspam():
    """Check the shape, label set and example count read from "AGBIGnp.out"."""
    allowed_labels = {"Positive", "Negative"}

    # pre-initialise the counter so it stays 0 when the reader yields nothing
    n_seen = 0
    for n_seen, pair in enumerate(classify.read_smsspam("AGBIGnp.out"), start=1):
        # every example must be a (label, text) pair
        assert len(pair) == 2
        tag, _ = pair
        # and the label must be one of the two expected classes
        assert tag in allowed_labels

    assert n_seen == 1553
def test_read_smsspam():
    """Check the shape, label set and example count of the SMSSpam training split."""
    allowed_labels = {"ham", "spam"}
    train_path = "smsspam/SMSSpamCollection.train"

    # pre-initialise the counter so it stays 0 when the reader yields nothing
    n_seen = 0
    for n_seen, pair in enumerate(classify.read_smsspam(train_path), start=1):
        # every example must be a (label, text) pair
        assert len(pair) == 2
        tag, _ = pair
        # and the label must be one of the two expected classes
        assert tag in allowed_labels

    assert n_seen == 3345
def test_labels():
    """Check that TextToLabels maps "no" and "yes" to distinct indices.

    The encoder is built from the labels found in "AGBIG_annotation.outt" and
    must encode a small sample sequence with those same indices.
    """
    # keep only the label column of the training examples
    train_pairs = classify.read_smsspam("AGBIG_annotation.outt")
    train_labels = [tag for tag, _ in train_pairs]

    # build the label encoder from the training labels
    encoder = classify.TextToLabels(train_labels)

    # the two classes must not share an index
    no_idx = encoder.index("no")
    yes_idx = encoder.index("yes")
    assert no_idx != yes_idx

    # encoding a sequence must yield the matching indices, in order
    expected = [no_idx, yes_idx, yes_idx]
    encoded = encoder(["no", "yes", "yes"])
    assert np.all(encoded == expected)
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Train on the SMSSpam training split and check quality on the devel split.

    capsys: pytest capture fixture; when not None the scores are printed with
        capturing disabled.
    min_f1: F1 on the "spam" label must be strictly greater than this.
    min_accuracy: accuracy must be strictly greater than this.
    """
    # training split: labels and texts as two parallel tuples
    train_pairs = classify.read_smsspam("smsspam/SMSSpamCollection.train")
    y_train_raw, x_train_raw = zip(*train_pairs)

    # development split, same layout
    devel_pairs = classify.read_smsspam("smsspam/SMSSpamCollection.devel")
    y_devel_raw, x_devel_raw = zip(*devel_pairs)

    # both encoders are fitted on the training data only
    featurize = classify.TextToFeatures(x_train_raw)
    encode = classify.TextToLabels(y_train_raw)

    # fit the model on the training data
    model = classify.Classifier()
    model.train(featurize(x_train_raw), encode(y_train_raw))

    # predict on the development data; predictions must be 0/1 valued
    y_pred = model.predict(featurize(x_devel_raw))
    assert np.array_equal(y_pred, y_pred.astype(bool))

    # score the predictions, treating "spam" as the positive class
    y_true = encode(y_devel_raw)
    positive = encode.index("spam")
    f1 = f1_score(y_true, y_pred, pos_label=positive)
    accuracy = accuracy_score(y_true, y_pred)

    # report the scores, bypassing pytest's output capture
    if capsys is not None:
        with capsys.disabled():
            template = "\n{:.1%} F1 and {:.1%} accuracy on SMSSpam development data"
            print(template.format(f1, accuracy))

    # both scores must clear their thresholds
    assert f1 > min_f1
    assert accuracy > min_accuracy
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Run a 2-fold cross-validation of a CountVectorizer + AdaBoost pipeline.

    The examples are read from "a_lil_morenp.out". For every fold the number
    of "Positive" predictions is printed; afterwards the mean fold score is
    reported.

    capsys: pytest capture fixture; when not None the mean score is printed
        with capturing disabled.
    min_f1, min_accuracy: kept for signature compatibility; this test does not
        assert any threshold.
    """
    full_examples = classify.read_smsspam("a_lil_morenp.out")
    full_labels, full_texts = zip(*full_examples)

    # convert once so the folds can be selected by index arrays
    texts = np.array(full_texts)
    labels = np.array(full_labels)

    # NOTE(review): an integer max_df is an absolute document count, so
    # max_df=1 drops every term that occurs in more than one document.
    # If "no upper limit" was intended this should be 1.0 -- confirm.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(binary=False, ngram_range=(1, 1),
                                       max_df=1)),
        ('classifier', AdaBoostClassifier()),
    ])
    k_fold = KFold(n_splits=2)

    scores = []
    positives = []  # "Positive" predictions of the most recent fold
    for train_indices, test_indices in k_fold.split(texts):
        test_text = texts[test_indices]
        pipeline.fit(texts[train_indices], labels[train_indices])
        scores.append(pipeline.score(test_text, labels[test_indices]))

        predictions = pipeline.predict(test_text)
        positives = [label for label in predictions if label == "Positive"]
        print("Positive ", len(positives), " total ", len(predictions))

    score = sum(scores) / len(scores)

    # report the mean fold score, bypassing pytest's output capture
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} score on MTURK development data"
            # format the score first, then append the last fold's positives as
            # text; adding the list to the template string raised TypeError
            print(msg.format(score) + str(positives))
def test_labels():
    """Check that TextToLabels maps "Positive" and "Negative" to distinct indices.

    The encoder is built from the labels found in "AGBIGnp.out". Only the
    Positive/Negative pair is checked; the checks for the Facts/Logic,
    Affiliation, Humor and Warning labels are currently disabled.
    """
    # keep only the label column of the training examples
    train_pairs = classify.read_smsspam("AGBIGnp.out")
    train_labels = [tag for tag, _ in train_pairs]

    # build the label encoder from the training labels
    encoder = classify.TextToLabels(train_labels)

    # the two classes must not share an index
    positive = encoder.index("Positive")
    negative = encoder.index("Negative")
    assert negative != positive

    # encoding a sequence must yield the matching indices, in order
    expected = [positive, negative]
    encoded = encoder(["Positive", "Negative"])
    assert np.all(encoded == expected)
def test_features():
    """Check the feature matrix TextToFeatures builds for two made-up sentences.

    The extractor is fitted on the texts in "AGBIGnp.out". The matrix must
    have one row per sentence, and the unigram features "illegals" and "wall"
    must each be active only in the sentence that contains the word.
    """
    # get the texts from the training data
    examples = classify.read_smsspam("AGBIGnp.out")
    texts = [text for _, text in examples]

    # create the feature extractor from the training texts
    to_features = classify.TextToFeatures(texts)

    # extract features for some made-up sentences
    features = to_features(["illegals should leave", "Build the wall"])

    # make sure there is one row of features for each sentence
    assert len(features.shape) == 2
    n_rows, _ = features.shape
    assert n_rows == 2

    # "illegals" occurs only in the first sentence and "wall" only in the
    # second, so the selected columns must be nonzero at exactly
    # (row 0, column 0) and (row 1, column 1). The previous check required
    # every nonzero entry to sit in row 0, which cannot hold for "wall".
    indices = [to_features.index(f) for f in ["illegals", "wall"]]
    assert len(set(indices)) > 1
    row_indices, col_indices = features[:, indices].nonzero()
    assert len(col_indices) == 2
    hits = set(zip(np.asarray(row_indices).ravel().tolist(),
                   np.asarray(col_indices).ravel().tolist()))
    assert hits == {(0, 0), (1, 1)}
def test_features():
    """Check the feature matrix TextToFeatures builds for two made-up sentences.

    The extractor is fitted on the texts in "AGBIG_annotation.outt". The
    matrix must have one row per sentence, and the features "need" and
    "to you" must be nonzero in the first sentence only.
    """
    # keep only the text column of the training examples
    train_pairs = classify.read_smsspam("AGBIG_annotation.outt")
    corpus = [body for _, body in train_pairs]

    # fit the feature extractor on the training texts
    extractor = classify.TextToFeatures(corpus)

    # featurize two made-up sentences
    sentences = [
        "There are some things that I need to send to you.",
        "Hello!",
    ]
    matrix = extractor(sentences)

    # the result must be two-dimensional with one row per sentence
    assert len(matrix.shape) == 2
    n_rows, _ = matrix.shape
    assert n_rows == 2

    # one unigram and one bigram feature, both taken from the first sentence
    selected = [extractor.index(name) for name in ("need", "to you")]
    assert len(set(selected)) > 1

    # both must be active, and only in row 0
    rows, cols = matrix[:, selected].nonzero()
    assert np.all(rows == 0)
    assert len(cols) == 2
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Evaluate classifiers on the MTURK data in two ways.

    1. A 2-fold cross-validation of a CountVectorizer + AdaBoost pipeline on
       "a_lil_more.out", printing per-fold "yes" counts and the mean score.
    2. A train/devel evaluation of classify.Classifier, trained on
       "AGBIG_annotation.outt" and scored on "AGBIG_annotation.outd".

    capsys: pytest capture fixture; when not None the scores are printed with
        capturing disabled.
    min_f1, min_accuracy: kept for signature compatibility; this test does not
        assert any threshold.
    """
    # ---- K-fold test ----
    full_examples = classify.read_smsspam("a_lil_more.out")
    full_labels, full_texts = zip(*full_examples)

    # convert once so the folds can be selected by index arrays
    texts = np.array(full_texts)
    labels = np.array(full_labels)

    # NOTE(review): an integer max_df is an absolute document count, so
    # max_df=1 drops every term that occurs in more than one document.
    # If "no upper limit" was intended this should be 1.0 -- confirm.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(binary=False, ngram_range=(1, 1),
                                       max_df=1)),
        ('classifier', AdaBoostClassifier()),
    ])
    k_fold = KFold(n_splits=2)

    scores = []
    predictions = []  # predictions of the most recent fold
    for train_indices, test_indices in k_fold.split(texts):
        test_text = texts[test_indices]
        pipeline.fit(texts[train_indices], labels[train_indices])
        scores.append(pipeline.score(test_text, labels[test_indices]))

        predictions = pipeline.predict(test_text)
        probabilities = pipeline.predict_proba(test_text)

        # class probabilities of the examples predicted as "yes"
        yes_probas = [proba
                      for label, proba in zip(predictions, probabilities)
                      if label == "yes"]
        # a fold may predict no "yes" at all; unguarded indexing of the first
        # entry would raise IndexError in that case
        first_proba = yes_probas[0] if yes_probas else None
        print("yes ", len(yes_probas), " total ", len(predictions),
              " proba ", first_proba)

    score = sum(scores) / len(scores)

    # report the mean fold score, bypassing pytest's output capture
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} score on MTURK development data"
            # format the score first, then append the last fold's predictions
            # as text; adding the array to the template string raised an error
            print(msg.format(score) + str(predictions))

    # NOTE: disabled export sketch, kept for reference. `clf` was an unfitted
    # MLPClassifier(max_iter=1000) that nothing else in this test used.
    # f = open("classify.js", "w")
    # porter = Porter(clf, language='js')
    # output = porter.export(embed_data=True)
    # f.write(output)
    # f.close()

    # ---- Normal validation ----
    # get texts and labels from the training data
    train_examples = classify.read_smsspam("AGBIG_annotation.outt")
    train_labels, train_texts = zip(*train_examples)

    # get texts and labels from the development data
    devel_examples = classify.read_smsspam("AGBIG_annotation.outd")
    devel_labels, devel_texts = zip(*devel_examples)

    # create the feature extractor and label encoder
    to_features = classify.TextToFeatures(train_texts)
    to_labels = classify.TextToLabels(train_labels)

    # train (fit) the classifier on the training data
    classifier = classify.Classifier()
    classifier.train(to_features(train_texts), to_labels(train_labels))

    # make predictions on the development data; they must be 0/1 valued
    predicted_indices = classifier.predict(to_features(devel_texts))
    assert np.array_equal(predicted_indices, predicted_indices.astype(bool))

    # measure performance of predictions, treating "yes" as the positive class
    devel_indices = to_labels(devel_labels)
    yes_label = to_labels.index("yes")
    f1 = f1_score(devel_indices, predicted_indices, pos_label=yes_label)
    accuracy = accuracy_score(devel_indices, predicted_indices)

    # print out performance
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} F1 and {:.1%} accuracy on MTURK development data"
            print(msg.format(f1, accuracy))