示例#1
0
    def start(self):
        print self.get_name()
        trial = []
        self._setup_options(self.config)
        t0 = time()
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)
        self.print_lap("Loaded", t0)
        # self.data = self.vectorize(self.data)
        cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials, split=self.split)
        t = 0
        for train_index, test_index in cv:
            # get the data of this cv iteration
            # train, test = exputil.sample_data(self.data, train_index, test_index)
            train, test = self._sample_data(self.data, train_index, test_index)
            self.print_lap("\nSampled", t0)
            # get the expert and student
            learner = exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                          vct=self.vct, sent_tk=self.sent_tokenizer, seed=(t * 10 + 10),  cost_model=self.cost_model)

            expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'), size=len(train.data))

            expert.fit(train.data, y=train.target, vct=self.vct)

            # do active learning
            results = self.main_loop(learner, expert, self.budget, self.bootstrap_size, train, test)
            self.print_lap("\nTrial %s" % t, t0)

            # save the results
            trial.append(results)
            t += 1
        self.report_results(trial)
    def start(self, n_jobs=1, pre_dispatch='2*n_jobs'):
        trial = []
        self._setup_options(self.config)
        print self.get_name()
        t0 = time()
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)
        self.print_lap("Loaded", t0)

        self.data = self.vectorize(self.data)

        cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials, split=self.split)

        seeds = np.arange(len(cv)) * 10 + 10

        expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'), size=(len(self.data.train.target),self.data.train.sizes.max()))

        expert.fit(self.data.train.bow, y=self.data.train.target, vct=self.vct)

        lrnr_setup= {'vct':self.vct, "sent_tk":self.sent_tokenizer,  "cost_model":self.cost_model,
                     'validation_set':self.validation_set}

        lrnr_type = cfgutil.get_section_option(self.config, 'learner', 'type')

        neu_threshold = cfgutil.get_section_option(self.config, 'expert', 'threshold')

        if lrnr_type in ['utility-cheat','const-cheat','const-cheat-noisy']:
            lrnr_setup.update({'snip_model':expert.oracle, 'threshold':neu_threshold})

        learners = [exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                        seed=s, **lrnr_setup) for s in seeds]
        self.print_lap("\nPreprocessed", t0)
        # ===================================
        parallel = Parallel(n_jobs=n_jobs, verbose=True,
                            pre_dispatch=pre_dispatch)
        scores = parallel(delayed(self.main_loop_jobs,check_pickle=False)(learners[t], expert, self.budget, self.bootstrap_size,
                                                  self.data, tr[0],tr[1], t)
                         for t, tr in enumerate(cv))
        # ===================================

        self.print_lap("\nDone trials", t0)

        # save the results

        self.report_results(scores)
示例#3
0
    def start(self):
        """Run an interactive session comparing two active-learning students.

        Two students share identical candidate pools (the second pool is a
        copy whose remaining queue is later shuffled). After a shared
        bootstrap, a seeded coin flip picks which student queries the human
        expert each round, until the combined spent budget reaches
        ``2 * self.budget``. Annotation times/labels are recorded per
        student and both final models are evaluated at the end.
        """
        import copy
        from collections import deque
        from time import time

        self.set_options(self.config)
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)

        # candidate ordering long enough to cover bootstrap + budget
        sequence = self.get_sequence(len(self.data.train.target), self.budget+self.bootstrap_size)

        # empty test index list: everything sampled goes to the pool side
        pool, test = self._sample_data(self.data, sequence, [])
        # pool2, _ = self._sample_data(self.data, sequence, [])
        # pool2 = copy.deepcopy(pool)
        pool2 = self.copy_pool(pool)
        # pool2.remaining = []

        student1, student2 = self.get_student(self.config, [pool, pool2], sequence)

        expert = self.get_expert(self.config, self.data.train.target_names)

        combined_budget = 0
        # dedicated RNG so turn-taking is reproducible and independent of self.rnd_state
        coin = np.random.RandomState(9187465)

        i = 0
        # expert_labels = self.start_record()
        student = {'learner1':student1, 'learner2':student2}
        expert_times = {'learner1':[], 'learner2':[]}
        expert_labels = {'learner1': self.start_record(), 'learner2': self.start_record()}
        original_sequence = []

        raw_input("\n*** Press <return> to start ***")

        t0 = time()
        while combined_budget < (2 * self.budget):
            if i == 0:
                ## Bootstrap
                # bootstrap: pick the shared initial training set once, on the first iteration
                train = self.bootstrap(student['learner1'].pool, self.bootstrap_size, bunch.Bunch(index=[], target=[]))

                # both students start from the same bootstrap labels (learner2 gets copies)
                student['learner1'].train = train
                student['learner2'].train = bunch.Bunch(index=copy.copy(train.index), target=copy.copy(train.target))

                student['learner1'].student = self.retrain(student['learner1'].student, student['learner1'].pool,
                                                           student['learner1'].train)

                student['learner2'].student = self.retrain(student['learner2'].student, student['learner2'].pool,
                                                           student['learner2'].train)

                # bootstrap documents are no longer queryable for either student
                for t in train.index:
                    student['learner1'].pool.remaining.remove(t)
                    student['learner2'].pool.remaining.remove(t)

                # subsample the remaining pool down to the budget size (without replacement)
                tmp_list = list(student['learner1'].pool.remaining)
                pool_sample = self.rnd_state.choice(tmp_list, self.budget, False)
                student['learner1'].pool.remaining = deque(pool_sample)
                original_sequence = copy.copy(train.index) + list(pool_sample)

                # learner2 sees the same candidates but in a shuffled order
                self.rnd_state.shuffle(pool_sample)
                student['learner2'].pool.remaining = deque(pool_sample)
            else:
                # select student: fair coin decides whose turn it is this round
                next_turn = coin.random_sample()

                if next_turn < .5:
                    curr_student = 'learner1'
                else:  # first1 student
                    curr_student = 'learner2'

                # one active-learning cycle: query selection + expert annotation
                query, labels = self.al_cycle(student[curr_student], expert)

                # print len(student['learner1'].pool.remaining), len(student['learner2'].pool.remaining)

                # (query, labels) may be (None, None), e.g. a skipped/neutral round — then nothing is recorded
                if query is not None and labels is not None:

                    # progress
                    print "\n%.1f %% completed" % (100. * combined_budget / (2 * self.budget))
                    # re-train the learner
                    student[curr_student].student = self.retrain(student[curr_student].student,
                                                                 student[curr_student].pool, student[curr_student].train)

                    #We can evaluate later
                    step_oracle = self.evaluate_oracle(query, labels, labels=np.unique(student[curr_student].pool.target))

                    # record labels
                    expert_labels[curr_student] = self.record_labels(expert_labels[curr_student], query, labels,
                                                                     time=expert.get_annotation_time(),
                                                                     pause=expert.get_pause())

                    if self.debug:
                        self._debug(student[curr_student], expert, query, step_oracle)

                    # spent budget is tracked per student; the loop stops on the combined total
                    combined_budget = student['learner1'].budget + student['learner2'].budget

            i += 1

        t1 = time()
        print "\nTotal annotation time: %.3f secs (%.3f mins)" % ((t1-t0), (t1-t0)/60)

        self.save_results(student, expert_times, expert_labels)
        ##TODO evaluate the students after getting labels
        # self.evaluate_student(student['learner1'], student['learner1'].train.index, pool, test, order=False)
        # self.evaluate_student(student['learner2'], student['learner1'].train.index, pool, test, order=True)

        # evaluate each final model on the labels its expert actually produced
        t = bunch.Bunch(index=expert_labels['learner1']['index'], target=expert_labels['learner1']['labels'])
        self.evaluate_student(student['learner1'].student.model, t, original_sequence, pool, test,
                              name=student['learner1'].name, order=False)

        t = bunch.Bunch(index=expert_labels['learner2']['index'], target=expert_labels['learner2']['labels'])
        self.evaluate_student(student['learner2'].student.model, t, original_sequence, pool, test,
                              name=student['learner2'].name, order=True)
示例#4
0
    def start(self):
        """Run a two-student active-learning session with a shared expert.

        Like the interactive variant but non-interactive: no prompt,
        progress printing, timing, or final evaluation. After a shared
        bootstrap, a seeded coin flip alternates queries between the two
        students until the combined spent budget reaches
        ``2 * self.budget``; labels are recorded per student as it runs.
        """
        import copy
        from collections import deque

        self.set_options(self.config)
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)

        # candidate ordering long enough to cover bootstrap + budget
        sequence = self.get_sequence(len(self.data.train.target), self.budget+self.bootstrap_size)

        # empty test index list: everything sampled goes to the pool side
        pool, test = self._sample_data(self.data, sequence, [])
        # pool2, _ = self._sample_data(self.data, sequence, [])
        # pool2 = copy.deepcopy(pool)
        pool2 = self.copy_pool(pool)
        # pool2.remaining = []

        student1, student2 = self.get_student(self.config, [pool, pool2], sequence)

        expert = self.get_expert(self.config, self.data.train.target_names)

        combined_budget = 0
        # dedicated RNG so turn-taking is reproducible and independent of self.rnd_state
        coin = np.random.RandomState(9187465)

        i = 0
        # expert_labels = self.start_record()
        student = {'learner1':student1, 'learner2':student2}
        expert_times = {'learner1':[], 'learner2':[]}
        expert_labels = {'learner1': self.start_record(), 'learner2': self.start_record()}
        original_sequence = []
        while combined_budget < (2 * self.budget):
            if i == 0:
                ## Bootstrap
                # bootstrap: pick the shared initial training set once, on the first iteration
                train = self.bootstrap(student['learner1'].pool, self.bootstrap_size, bunch.Bunch(index=[], target=[]))

                # both students start from the same bootstrap labels (learner2 gets copies)
                student['learner1'].train = train
                student['learner2'].train = bunch.Bunch(index=copy.copy(train.index), target=copy.copy(train.target))

                student['learner1'].student = self.retrain(student['learner1'].student, student['learner1'].pool,
                                                           student['learner1'].train)

                student['learner2'].student = self.retrain(student['learner2'].student, student['learner2'].pool,
                                                           student['learner2'].train)

                # bootstrap documents are no longer queryable for either student
                for t in train.index:
                    student['learner1'].pool.remaining.remove(t)
                    student['learner2'].pool.remaining.remove(t)

                # subsample the remaining pool down to the budget size (without replacement)
                tmp_list = list(student['learner1'].pool.remaining)
                pool_sample = self.rnd_state.choice(tmp_list, self.budget, False)
                student['learner1'].pool.remaining = deque(pool_sample)

                # learner2 sees the same candidates but in a shuffled order
                self.rnd_state.shuffle(pool_sample)
                student['learner2'].pool.remaining = deque(pool_sample)
            else:
                # select student: fair coin decides whose turn it is this round
                next_turn = coin.random_sample()

                if next_turn < .5:
                    curr_student = 'learner1'
                else:  # first1 student
                    curr_student = 'learner2'

                # one active-learning cycle: query selection + expert annotation
                query, labels = self.al_cycle(student[curr_student], expert)

                # print len(student['learner1'].pool.remaining), len(student['learner2'].pool.remaining)

                # (query, labels) may be (None, None), e.g. a skipped/neutral round — then nothing is recorded
                if query is not None and labels is not None:
                    # re-train the learner
                    student[curr_student].student = self.retrain(student[curr_student].student,
                                                                 student[curr_student].pool, student[curr_student].train)

                    #We can evaluate later
                    step_oracle = self.evaluate_oracle(query, labels, labels=np.unique(student[curr_student].pool.target))

                    # record labels
                    expert_labels[curr_student] = self.record_labels(expert_labels[curr_student], query, labels,
                                                                     time=expert.get_annotation_time())

                    if self.debug:
                        self._debug(student[curr_student], expert, query, step_oracle, names=self.data.train.target_names)

                    # spent budget is tracked per student; the loop stops on the combined total
                    combined_budget = student['learner1'].budget + student['learner2'].budget

            i += 1
示例#5
0
# Script: build an IMDB "expert" classifier restricted to an annotated
# vocabulary, then prepare vectorizers and data for the snippet experiments.
with open("imdb-vocab-annotated.txt") as f:
    lines = f.readlines()
    # one vocabulary term per line; strip trailing newlines/whitespace
    vocab = [l.strip() for l in lines]

print "Dictionary size: %d" %len(vocab)
#print vocab

# Loading Data
print "Loading the data"
# TF-IDF restricted to the annotated vocabulary only
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, vocabulary=vocab)

# cache the loaded dataset locally so reruns skip the expensive load
# NOTE(review): pickle.load on a local cache file — fine here, but do not
# point this at untrusted input.
if os.path.isfile("imdb-data.pkl"):
    data = pickle.load(open("imdb-data.pkl", 'rb'))
else:
    data = datautil.load_dataset("imdb", "C:\\Users\\mbilgic\\Desktop\\aclIMDB", categories=None, rnd=5463, shuffle=True)
    data.train.data = np.array(data.train.data, dtype=object)
    data.test.data = np.array(data.test.data, dtype=object)
    pickle.dump(data, open("imdb-data.pkl", 'wb'))

print "Fitting the vectorizer"
# the expert is fit on the TEST split's bag-of-words representation
data.test.bow = vct.fit_transform(data.test.data)

# Fit the expert
print "Training the expert"
expert = LogisticRegression('l2', C=1)
expert.fit(data.test.bow, data.test.target)

terms = np.array(vct.get_feature_names())

# per-term weights of the (binary) expert model
coefs = expert.coef_[0]

# In[2]:

## Get the data ready
import re 
# token pattern keeps single-character tokens (default drops them)
vct = CountVectorizer(min_df=2, token_pattern=re.compile(r'(?u)\b\w+\b'))

# vct_doc = CountVectorizer(encoding='ISO-8859-1', min_df=2, max_df=1.0, binary=True, token_pattern='\\b\\w+\\b')
vct_doc = exputil.get_vectorizer({'vectorizer':'bow', 'limit':None, 'min_size':2})



# sentence tokenizer used to split documents into snippets
sent_tk = nltk.data.load('tokenizers/punkt/english.pickle')

imdb =  load_dataset("imdb",IMDB_DATA, keep_subject=True)

imdb.train.bow = vct_doc.fit_transform(imdb.train.data)
imdb.test.bow = vct_doc.transform(imdb.test.data)


# In[3]:
class Document(object):
    def __init__(self, raw_text, lbl, sent_tk, vct_gral, sent_lbl=None):
        self.sentences = sent_tk.tokenize_sents([raw_text])[0]
        self.doc_label = lbl
        self.sent_bow = vct_gral.transform(self.sentences) # counts per sentence
        if sent_lbl is not None:
            self.sent_labels = [lbl] * len(self.sentences)
        else:
示例#7
0
import utilities.experimentutils as exputil
import learner
import utilities.datautils as datautil
import numpy as np
import experiment.base as exp
import nltk

if __name__ == '__main__':
        
    ## Get the data ready
    imdb_path = 'C:/Users/mbilgic/Desktop/aclImdb/'
    rnd = np.random.RandomState(2345)
    clf = exputil.get_classifier('lrl2',parameter=1)
    expert = exputil.get_classifier('lrl2',parameter=1)
    vct = exputil.get_vectorizer({'vectorizer':"tfidf", 'limit':None, 'min_size':None})
    data = datautil.load_dataset('imdb', imdb_path, categories=None, rnd=5463, shuffle=True)
    data.train.bow = vct.fit_transform(data.train.data)
    expert = exputil.get_classifier('lrl2',parameter=1)
    
    ## Set the learner options and expert
    sent_tk = nltk.data.load('tokenizers/punkt/english.pickle')
    student = learner.strategy.StructuredLearner(clf)
    student.set_sent_tokenizer(sent_tk)
    student.set_vct(vct)
    student.set_snippet_utility('sr')
    student.set_calibration(True)
    expert.fit(data.train.bow, data.train.target)
    
    ## Get the boostrap and train
    data.train.remaining = rnd.permutation(len(data.train.target))
    ## balanced bootstrap