import numpy as np
import pandas as pd

import machine_learning  # project module providing ActiveLearningDataset and NaiveBayes


def gen_predictions(learner_dict, unlabeled_datasets):
    '''Returns predictions for unlabeled data.'''
    labeled_datasets = pd.DataFrame(columns=['origText', 'classLabel'])

    # Reload each learner's labeled data from disk and point every learner at
    # the shared pool of unlabeled examples.
    for name, classifier in learner_dict.items():
        df = pd.read_csv('%s_learner.csv' % classifier.className).set_index('Unnamed: 0')
        classifier.labeled_datasets = machine_learning.ActiveLearningDataset(
            df, classLabel="classLabel", origText="origText")
        unlabeled_datasets['classLabel'] = unlabeled_datasets.classLabel.fillna(0)
        classifier.unlabeled_datasets = machine_learning.ActiveLearningDataset(
            unlabeled_datasets, classLabel="classLabel", origText="origText")
        # Collect this learner's labeled examples, tagging positives with the
        # learner's name (copy first to avoid chained-assignment surprises).
        new_data = classifier.labeled_datasets.data[
            [classifier.labeled_datasets.origText, classifier.labeled_datasets.classLabel]].copy()
        new_data[new_data.columns[1]] = new_data[new_data.columns[1]].replace({1: name})
        labeled_datasets = pd.concat([labeled_datasets, new_data])

    final_preds = pd.DataFrame(columns=list(learner_dict.keys()))
    for name, learner in learner_dict.items():
        origData = learner.unlabeled_datasets.data[[learner.labeled_datasets.origText]]
        point_sets = [learner.unlabeled_datasets.get_samples().values]
        if learner.nbc:
            # Naive Bayes path: fit on the labeled data, score held-out examples.
            ml_class = machine_learning.NaiveBayes(
                learner.labeled_datasets.data, 1, learner.labeled_datasets.classLabel)
            ml_class.testing = learner.test_datasets.data.drop(
                learner.test_datasets.origText, axis=1)
            ml_class.predictProbabilities('Gaussian')
            ml_class.getPredictions()
            scores = ml_class.testingProbs
        elif learner.models[0].probability:
            # SVM trained with probability estimates.
            scores = []
            for example_index in range(len(point_sets[0])):
                prediction = learner.models[0].predict_probability(point_sets[0][example_index])
                scores.append(prediction)
        else:
            # Fall back to raw SVM decision values.
            scores = []
            for example_index in range(len(point_sets[0])):
                score = learner.models[0].predict_values(point_sets[0][example_index])
                scores.append(score)
        scores = pd.DataFrame(index=learner.unlabeled_datasets.data.index, data=scores)
        scores.columns = [0, 1]
        if len(learner_dict) == 1:
            final_preds = scores
        else:
            final_preds[name] = scores[1]

    # Drop anything that has already been labeled, then assign each remaining
    # pool example to the class whose learner scored it highest.
    final_preds = final_preds.drop(
        [each for each in labeled_datasets.index if each in final_preds.index])
    origData = origData.drop(
        [each for each in labeled_datasets.index if each in origData.index])
    predictions = pd.DataFrame(final_preds.idxmax(axis=1))
    predictions['origText'] = origData[origData.columns[0]]
    labeled_datasets.to_csv('all_labeled_data.csv')
    predictions.to_csv('all_unlabeled_data_predictions.csv')
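# A minimal driver sketch for gen_predictions. The file name, pool path, and
# class names here are hypothetical; it assumes one '<className>_learner.csv'
# per class was written during training, a pool CSV with 'origText' and
# 'classLabel' columns, and the learner class and its load() method shown below.
import pandas as pd

import learner  # project module defining the learner class below

unlabeled_pool = pd.read_csv('unlabeled_pool.csv', index_col=0)  # hypothetical path

learner_dict = {}
for class_name in ['pos', 'neg']:  # hypothetical class names
    clf = learner.learner(NBC=False, className=class_name)
    clf.load()  # restore the per-class models, as in the per-class loop below
    learner_dict[class_name] = clf

gen_predictions(learner_dict, unlabeled_pool)
# -> writes all_labeled_data.csv and all_unlabeled_data_predictions.csv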
def __init__(self, unlabeled_datasets=pd.DataFrame(), test_datasets=pd.DataFrame(),
             models=None, probability=0, NBC=False, className='Class'):
    # Just using a default parameter set for now; svm_parameter comes from the
    # libsvm Python bindings.
    self.params = svm_parameter(weight=[1, 1000], probability=probability)
    self.unlabeled_datasets = unlabeled_datasets
    self.test_datasets = test_datasets
    # Initialize an empty labeled dataset (i.e., all data is unlabeled to begin
    # with); fall back to the raw DataFrame's columns when no wrapped pool is given.
    pool_columns = (unlabeled_datasets.data.columns
                    if hasattr(unlabeled_datasets, 'data')
                    else unlabeled_datasets.columns)
    self.labeled_datasets = machine_learning.ActiveLearningDataset(
        pd.DataFrame(columns=pool_columns))
    # Avoid a shared mutable default for the model list.
    self.models = models if models is not None else []
    self.test_results = []
    self.nbc = NBC
    self.className = className
def undersample_labeled_datasets(self, k=None):
    '''Undersamples the current labeled datasets.'''
    if self.labeled_datasets.data.shape[0] > 0:
        if not k:
            # Undersample the majority class down to the size of the minority class.
            k = (self.labeled_datasets.number_of_majority_examples() -
                 self.labeled_datasets.number_of_minority_examples())
        # Copy the dataset rather than mutating the class member.
        copied_dataset = machine_learning.ActiveLearningDataset(self.labeled_datasets.copy())
        print("removing %s majority instances" % k)
        removed_instances = copied_dataset.undersample(k)
    else:
        raise Exception("No labeled data has been provided!")
    return copied_dataset
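# Usage sketch (hypothetical): take a class-balanced copy of the labeled data
# without mutating the learner's own pool. rebuild_models(undersample_first=True)
# in the training script below runs through this same method; 'active_learner'
# here stands for an instance like the one built in that script.
balanced = active_learner.undersample_labeled_datasets()
print("balanced labeled set: %d rows" % balanced.data.shape[0])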
# Scale the features so each vector is of unit modulus.
textData.bagofwords = textData.tfidf_df.apply(lambda x: x / np.linalg.norm(x), axis=1)

# Include dummy variables for each class label in the dataframe:
#   1 - positive, 0 - negative.
textData.bagofwords["classLabel"] = pd.get_dummies(all_data['sentiment'])['pos']

# Include the original text in the tf-idf dataframe.
textData.bagofwords["origText"] = all_data.text

# Choose one example from each class (positive and negative).
labeled_data = textData.bagofwords.loc[[0, 500]]

# Shuffle the remaining dataset.
shuffle = textData.bagofwords.loc[np.random.permutation(
    textData.bagofwords[~textData.bagofwords.index.isin([0, 500])].index)]

# Use 150 examples for the pool of unlabeled data and the rest for the test data.
unlabeled_data = shuffle[0:150]
test_data = shuffle[150:]

data1 = machine_learning.ActiveLearningDataset(labeled_data, classLabel="classLabel", origText="origText")
data2 = machine_learning.ActiveLearningDataset(unlabeled_data, classLabel="classLabel", origText="origText")
data3 = machine_learning.ActiveLearningDataset(test_data, classLabel="classLabel", origText="origText")

# Seed the learner with the labeled pair, then grow the labeled set actively.
active_learner = learner.learner(data1, test_datasets=data3, probability=0, NBC=True)
length = len(data1.data)
active_learner.pick_initial_training_set(length)
active_learner.rebuild_models(undersample_first=True)
active_learner.unlabeled_datasets.add_data(data2.data)
active_learner.active_learn(10, num_to_label_at_each_iteration=2)
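# Note on the run above: pick_initial_training_set moves the two hand-picked
# seeds (rows 0 and 500) into the labeled set, rebuild_models trains on an
# undersampled (class-balanced) copy, and active_learn then queries labels
# from the 150-example pool two at a time, rebuilding the models as it goes.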
# Make copies of the datasets.
curr_labeledData = labeledData.copy()
curr_unlabeledData = unlabeledData.copy()
curr_testData = testData.copy()

# Overwrite the old classLabel with binary (one-vs-rest) class labels.
curr_labeledData['classLabel'] = classDummies[col].loc[curr_labeledData.index]
curr_unlabeledData['classLabel'] = classDummies[col].loc[curr_unlabeledData.index]
curr_testData['classLabel'] = classDummies[col].loc[curr_testData.index]

data1 = machine_learning.ActiveLearningDataset(curr_labeledData, classLabel="classLabel", origText="origText")
data2 = machine_learning.ActiveLearningDataset(curr_unlabeledData, classLabel="classLabel", origText="origText")
data3 = machine_learning.ActiveLearningDataset(curr_testData, classLabel="classLabel", origText="origText")

# Create a learner, with the labeled dataset as its initial training data,
# and restore its previously trained models.
active_learner = learner.learner(data1, test_datasets=data3, NBC=False,
                                 className=classDefinitions[col])
active_learner.load()
classifiers[col] = active_learner
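# A sketch of the one-vs-rest scaffolding the block above appears to sit in.
# The enclosing loop, 'all_data', and the identity classDefinitions mapping are
# assumptions; the loop body mirrors the snippet above, condensed to the
# labeled split only.
classDummies = pd.get_dummies(all_data['sentiment'])  # one 0/1 column per class
classDefinitions = {col: col for col in classDummies.columns}
classifiers = {}

for col in classDummies.columns:
    curr = labeledData.copy()
    curr['classLabel'] = classDummies[col].loc[curr.index]
    data1 = machine_learning.ActiveLearningDataset(
        curr, classLabel="classLabel", origText="origText")
    clf = learner.learner(data1, NBC=False, className=classDefinitions[col])
    clf.load()
    classifiers[col] = clf

# The per-class learners can then feed gen_predictions (defined above).
gen_predictions(classifiers, unlabeledData)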