Example #1
import numpy as np
import pandas as pd

import machine_learning


def gen_predictions(learner_dict, unlabeled_datasets):
    '''
    Generates predictions for the unlabeled data from each learner and writes
    the labeled data and the predictions out to CSV files.
    '''
    labeled_datasets = pd.DataFrame(columns=['origText', 'classLabel'])
    for name, classifier in learner_dict.items():
        # Reload each learner's labeled data from the CSV it was saved to.
        df = pd.read_csv('%s_learner.csv' % classifier.className).set_index('Unnamed: 0')
        classifier.labeled_datasets = machine_learning.ActiveLearningDataset(
            df, classLabel="classLabel", origText="origText")
        # Unlabeled rows have no class label yet, so treat missing labels as 0.
        unlabeled_datasets['classLabel'] = unlabeled_datasets.classLabel.replace({np.nan: 0})
        classifier.unlabeled_datasets = machine_learning.ActiveLearningDataset(
            unlabeled_datasets, classLabel="classLabel", origText="origText")
        # Collect this learner's labeled examples, tagging positives with the learner's name.
        new_data = classifier.labeled_datasets.data[
            [classifier.labeled_datasets.origText, classifier.labeled_datasets.classLabel]]
        new_data[new_data.columns[1]] = new_data[new_data.columns[1]].replace({1: name})
        labeled_datasets = pd.concat([labeled_datasets, new_data])

    final_preds = pd.DataFrame(columns=list(learner_dict.keys()))
    for name, learner in learner_dict.items():
        origData = learner.unlabeled_datasets.data[[learner.labeled_datasets.origText]]
        point_sets = [learner.unlabeled_datasets.get_samples().values]
        if learner.nbc:
            # Naive Bayes learner: score the unlabeled pool with Gaussian class-conditionals.
            ml_class = machine_learning.NaiveBayes(
                learner.labeled_datasets.data, 1, learner.labeled_datasets.classLabel)
            ml_class.testing = learner.test_datasets.data.drop(
                learner.test_datasets.origText, axis=1)
            ml_class.predictProbabilities('Gaussian')
            ml_class.getPredictions()
            scores = ml_class.testingProbs
        elif learner.models[0].probability:
            # SVM trained with probability estimates.
            scores = []
            for example_index in range(len(point_sets[0])):
                prediction = learner.models[0].predict_probability(point_sets[0][example_index])
                scores.append(prediction)
            scores = pd.DataFrame(index=learner.unlabeled_datasets.data.index, data=scores)
        else:
            # Fall back to the raw SVM decision values.
            scores = []
            for example_index in range(len(point_sets[0])):
                score = learner.models[0].predict_values(point_sets[0][example_index])
                scores.append(score)
            scores = pd.DataFrame(index=learner.unlabeled_datasets.data.index, data=scores)
        scores.columns = [0, 1]
        if len(learner_dict) == 1:
            final_preds = scores
        else:
            final_preds[name] = scores[1]
        # Drop anything that has already been labeled.
        final_preds = final_preds.drop([each for each in labeled_datasets.index
                                        if each in final_preds.index])
        origData = origData.drop([each for each in labeled_datasets.index
                                  if each in origData.index])
    # Predict, for each unlabeled example, the class whose learner assigned the highest score.
    predictions = pd.DataFrame(final_preds.idxmax(axis=1))
    predictions['origText'] = origData[origData.columns[0]]
    labeled_datasets.to_csv('all_labeled_data.csv')
    predictions.to_csv('all_unlabeled_data_predictions.csv')
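A minimal usage sketch, hedged: `classifiers` is the per-class learner dict built in Example #5 below, and `unlabeled_df` is a hypothetical dataframe with the same feature columns plus 'origText' and 'classLabel'.

# Hypothetical call: one trained learner per class name, plus the pooled unlabeled data.
gen_predictions(classifiers, unlabeled_df)
# The function writes its output to CSV, so the predictions can be read back afterwards.
preds = pd.read_csv('all_unlabeled_data_predictions.csv', index_col=0)
print(preds.head())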
Example #2
def __init__(self, unlabeled_datasets=pd.DataFrame(), test_datasets=pd.DataFrame(),
             models=[], probability=0, NBC=False, className='Class'):
    # just using default parameters for now
    self.params = svm_parameter(weight=[1, 1000], probability=probability)
    self.unlabeled_datasets = unlabeled_datasets
    self.test_datasets = test_datasets
    # initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
    self.labeled_datasets = machine_learning.ActiveLearningDataset(
        pd.DataFrame(columns=unlabeled_datasets.data.columns))
    self.models = models
    self.test_results = []
    self.nbc = NBC
    self.className = className
Example #3
def undersample_labeled_datasets(self, k=None):
    '''
    Undersamples the current labeled datasets and returns the balanced copy.
    '''
    if self.labeled_datasets.data.shape[0] > 0:
        if not k:
            # undersample the majority class down to the number of minority examples
            k = (self.labeled_datasets.number_of_majority_examples()
                 - self.labeled_datasets.number_of_minority_examples())
        # we copy the dataset rather than mutate the class members
        copied_dataset = machine_learning.ActiveLearningDataset(self.labeled_datasets.copy())
        print("removing %s majority instances" % k)
        removed_instances = copied_dataset.undersample(k)
    else:
        raise Exception("No labeled data has been provided!")
    return copied_dataset
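A short usage sketch, assuming a learner that already holds labeled data (as the `active_learner` built in Example #4 does); the variable names and the explicit k value are illustrative.

# Build a class-balanced copy of the labeled data without mutating the learner's own copy.
balanced = active_learner.undersample_labeled_datasets()
# Or remove an explicit (hypothetical) number of majority-class instances.
balanced_small = active_learner.undersample_labeled_datasets(k=5)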
Example #4
import numpy as np
import pandas as pd

import learner
import machine_learning

# textData (holding the tf-idf features) and all_data (the raw text with
# sentiment labels) are assumed to have been built in earlier steps.

# Scale the features so each row vector is of unit modulus
textData.bagofwords = textData.tfidf_df.apply(lambda x: x / np.linalg.norm(x), axis=1)

# Include dummy variables for each class label in the dataframe
# 1 - Positive, 0 - Negative
textData.bagofwords["classLabel"] = pd.get_dummies(all_data['sentiment'])['pos']

# Include the original text in the tf-idf dataframe
textData.bagofwords["origText"] = all_data.text

# Choose one example from each class (positive & negative)
labeled_data = textData.bagofwords.loc[[0, 500]]

# Shuffle the remaining dataset
remaining = textData.bagofwords[~textData.bagofwords.index.isin([0, 500])]
shuffle = textData.bagofwords.loc[np.random.permutation(remaining.index)]

# Use the first 150 rows as the pool of unlabeled data and the remainder as test data
unlabeled_data = shuffle[0:150]
test_data = shuffle[150:]

data1 = machine_learning.ActiveLearningDataset(labeled_data,classLabel="classLabel",origText="origText")
data2 = machine_learning.ActiveLearningDataset(unlabeled_data,classLabel="classLabel",origText="origText")
data3 = machine_learning.ActiveLearningDataset(test_data,classLabel="classLabel",origText="origText")

active_learner = learner.learner(data1,test_datasets=data3,probability=0,NBC=True)
length = len(data1.data)
active_learner.pick_initial_training_set(length)
active_learner.rebuild_models(undersample_first=True)

active_learner.unlabeled_datasets.add_data(data2.data)

active_learner.active_learn(10, num_to_label_at_each_iteration=2)
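The first line of this example normalizes each tf-idf row to unit length; a minimal standalone illustration of that transform on made-up numbers:

import numpy as np
import pandas as pd

# Two toy tf-idf rows; after the transform every row has L2 norm 1.
toy = pd.DataFrame([[3.0, 4.0], [1.0, 1.0]], columns=['term_a', 'term_b'])
unit = toy.apply(lambda x: x / np.linalg.norm(x), axis=1)
print(unit)                           # rows become [0.6, 0.8] and [0.7071, 0.7071]
print(np.linalg.norm(unit, axis=1))   # [1. 1.]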
Example #5
        #Make copies of the datasets
        curr_labeledData = labeledData.copy()
        curr_unlabeledData = unlabeledData.copy()
        curr_testData = testData.copy()

        #Overwrite the old classLabel with binary class labels
        curr_labeledData['classLabel'] = classDummies[col].loc[
            curr_labeledData.index]
        curr_unlabeledData['classLabel'] = classDummies[col].loc[
            curr_unlabeledData.index]
        curr_testData['classLabel'] = classDummies[col].loc[
            curr_testData.index]

        data1 = machine_learning.ActiveLearningDataset(curr_labeledData,
                                                       classLabel="classLabel",
                                                       origText="origText")
        data2 = machine_learning.ActiveLearningDataset(curr_unlabeledData,
                                                       classLabel="classLabel",
                                                       origText="origText")
        data3 = machine_learning.ActiveLearningDataset(curr_testData,
                                                       classLabel="classLabel",
                                                       origText="origText")

        #Create learner, with labeled dataset as initial training
        active_learner = learner.learner(data1,
                                         test_datasets=data3,
                                         NBC=False,
                                         className=classDefinitions[col])
        active_learner.load()
        classifiers[col] = active_learner
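The snippet above references an outer loop variable `col`, so it is the body of a one-vs-rest loop over class columns. A hedged sketch of the enclosing scaffolding it appears to assume (the names `classDummies` and `classDefinitions` come from the snippet itself; how they are built is an assumption):

# Assumed scaffolding: one binary (one-vs-rest) learner per class column.
# classDummies: 0/1 indicator dataframe with one column per class (e.g. a pd.get_dummies output).
# classDefinitions: mapping from each column to a human-readable class name.
classifiers = {}
for col in classDummies.columns:
    # The body shown above runs here: copy the datasets, overwrite 'classLabel'
    # with classDummies[col], wrap them as ActiveLearningDatasets, build and
    # load the learner, and store it in classifiers[col].
    ...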