class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models

    Strings are first turned into a bag-of-words matrix by a ``Vectorizer``
    and then embedded with one of the supported scikit-learn decompositions:

    'kpca'  KernelPCA with an RBF kernel (gamma=1.)
    'nmf'   NMF

    For any other ``modeltype`` no model is created; ``fit`` is then a no-op
    and ``predict`` returns None.
    '''

    # model types for which __init__ creates self.model
    SUPPORTED_MODELS = ('kpca', 'nmf')

    def __init__(self,folder='model',modeltype='kpca',topics=10):
        '''
        creates the BoW transformer and the (untrained) topic model

        INPUT
        folder      folder handed to the Vectorizer
        modeltype   'kpca' or 'nmf'
        topics      number of components of the embedding
        '''
        # the BoW transformer used by fit() and predict()
        self.bow = Vectorizer(folder=folder,steps=['hashing','tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        # strings are compared by value; ``is`` tests object identity and only
        # works by accident for interned literals
        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf',gamma=1.,n_components=topics)
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform list of strings into BoW matrix
        X = self.bow.transform(X)

        # both supported models are trained the same way; the embedding of the
        # training data itself is not kept
        if self.modeltype in self.SUPPORTED_MODELS:
            self.model.fit_transform(X)

    def predict(self,X):
        '''
        embeds a string or a list of strings with the trained topic model

        INPUT
        X   a single string or a list of strings

        OUTPUT
        whatever the underlying model's transform() returns for the inputs,
        or None for an unsupported modeltype
        '''
        # wrap a single item into a list; a list is passed through unchanged
        # (the former ``X is not list`` compared against the type object and
        # therefore wrapped every input, lists included)
        if not isinstance(X, list):
            X = [X]
        X = self.bow.transform(X)

        if self.modeltype in self.SUPPORTED_MODELS:
            return self.model.transform(X)
        return None
예제 #2
0
def main():
    """Run the prediction pipeline end to end and print the first ten predictions.

    Each stage (model loading, data loading, preprocessing, prediction) runs
    inside its own ``timer`` context.
    """
    with timer("model loading"):
        # Load the model and the vectorizing pipeline.
        mlp = ModelMLP()
        mlp.load_model()
        vec = Vectorizer()
        vec.load_vectorizer()

    with timer("data loading"):
        # Load the data to run predictions on.
        frame = load_data_from_gcs()

    with timer("preprocess"):
        frame = preprocess(frame)

    with timer("predict"):
        # The "price" column is dropped before vectorizing.
        features = vec.transform(frame.drop(columns="price"))
        predictions = mlp.predict(features)

        print(predictions[:10])
예제 #3
0
class Trainer(object):
    """Trains the classifier with training data and does the cross validation.

    Three classifiers (MultinomialNB, BernoulliNB and LinearSVC) are trained
    on the same feature vectors; during cross validation their predictions
    are combined by majority vote.
    """

    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Initializes all types of training data we have.

        Populates ``self.data`` (tweets) and ``self.classification`` (their
        labels) from 'full-corpus.csv' in the data directory.
        """
        corpus_path = os.path.join(datasettings.DATA_DIRECTORY,
                                   'full-corpus.csv')
        # NOTE(review): closing the file right after parsing assumes
        # parse_training_corpus reads it eagerly -- confirm.
        with open(corpus_path) as corpus_file:
            classification, tweets = parse_training_corpus(corpus_file)

        # NOTE: the IMDB reviews are parsed but not (yet) merged into the
        # training data; the calls are kept so existing behaviour (including
        # any error on a missing directory) is unchanged.
        parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))
        parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))

        self.data = tweets
        self.classification = classification

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.

        Returns:
            A tuple (classification_vector, feature_vector).
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        # A list is built explicitly because numpy.array() cannot consume the
        # lazy map object Python 3 returns.
        classification_vector = numpy.array(
            [SENTIMENT_MAP.get(s.lower(), 0) for s in self.classification])

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Build sentiment dictionary and build vector of weights for tweets.

        Returns:
            A list with one integer per tweet in ``self.data``: the sum of
            the lexicon scores of its whitespace-separated words (words not
            in the lexicon count as 0).
        """
        word_scores = {}
        # AFINN-111 is read second, so its scores win for words in both files.
        for lexicon in ('AFINN-96.txt', 'AFINN-111.txt'):
            lexicon_path = os.path.join(datasettings.DATA_DIRECTORY, lexicon)
            with open(lexicon_path) as lexicon_file:
                for line in lexicon_file:
                    fields = line.split('\t')
                    word_scores[fields[0]] = int(fields[1])

        return [sum(word_scores.get(word, 0) for word in tweet.split())
                for tweet in self.data]

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.

        Returns:
            The feature vector produced by the vectorizer for test_data.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation.

        Returns:
            The result of metrics.precision_recall_fscore_support with macro
            averaging over the positive, negative and neutral labels.
        """
        return metrics.precision_recall_fscore_support(
            true, predicted,
            pos_label=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
                ],
            average='macro')

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Requires train_and_validate() to have set the feature/classification
        vectors and the three classifiers. One score per fold is appended to
        ``self.scores``.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        # Only the validated label array is used below; the features are
        # indexed through self.feature_vector as before.
        _, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        classifiers = (self.classifier1, self.classifier2, self.classifier3)

        for train, test in cv:
            for clf in classifiers:
                clf.fit(self.feature_vector[train],
                        self.classification_vector[train])
            fold_predictions = [clf.predict(self.feature_vector[test])
                                for clf in classifiers]

            classification = []
            for predictions in zip(*fold_predictions):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                        negative_count == positive_count):
                    # Three-way tie: fall back to the first classifier.
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                        neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                        positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                        negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False,
                           serialize=False):
        """Trains the classifiers with the training data and optionally validates.

        Args:
            cross_validate: Falsy to fit the classifiers on all the data;
                otherwise the value is passed as the number of folds to
                cross_validate().
            mean: Unused; kept for interface compatibility.
            serialize: When true, pickle the classifiers and the vectorizer
                into the data directory.

        Returns:
            ``self.scores`` (None unless cross validation ran).
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1',
                                         C=1000,dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            for clf in (self.classifier1, self.classifier2,
                        self.classifier3):
                clf.fit(self.feature_vector, self.classification_vector)

        if serialize:
            # Context managers make sure the pickles are flushed and closed.
            classifiers_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'classifiers.pickle')
            with open(classifiers_path, 'wb') as classifiers_file:
                cPickle.dump([self.classifier1,
                              self.classifier2,
                              self.classifier3], classifiers_file)
            vectorizer_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'vectorizer.pickle')
            with open(vectorizer_path, 'wb') as vectorizer_file:
                cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.

        Args:
            mean: When true each score is printed as a single mean accuracy;
                otherwise as precision, recall and F-score.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for i, score in enumerate(self.scores):
            print("Cross Validation: %d" % (i + 1))
            print("*" * 40)
            if mean:
                print("Mean Accuracy: %f" % (score))
            else:
                print("Precision\tRecall\t\tF-Score")
                print("~~~~~~~~~\t~~~~~~\t\t~~~~~~~")
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print("%f\t%f\t%f" % (precision, recall, f_score))

            print("")
예제 #4
0
class CRF:
    """Class for training and predicting CRF based named entity recognition.

    Main API functions are:
        CRF.fit():        training using avg. perceptron
        CRF.predict():    tag every sequence of a GermEval file
        CRF.inference():  prediction using the Viterbi algorithm
        save_weights() / load_weights(): persist weights and the vectorizer
    """
    def __init__(self):

        # vectorizer class
        # based on composition instead of inheritance principles
        self.vectorizer = Vectorizer()

        # weights learned and used by model
        self.weights = np.array([])
        # (index, tag) pairs and the reverse tag -> index lookup; set in fit()
        self.tag_enums = []

        self.tag_dict = {}

    def fit(self, file_name, iterations=5):
        """ Wrapper function for initializing and training CRF model
            params:
                file_name: training data file in GermEval format
                iterations: number of iterations. default to 5"""

        tags, tokens = self.read_file(file_name)
        # TODO: clean up this setup sequence
        # build essential indices maps for the vectorizer
        self.vectorizer.build_tag_map(tags)
        self.vectorizer.build_word_map(tokens)
        self.vectorizer.build_name_list("data/first_names.txt")

        ### EDIT HERE TO ADD FUNCTIONS ###
        # add feature functions here #
        # descriptions in vectorizer file #
        self.vectorizer.add_feature("word",
                                    self.vectorizer.sparse_feat_word_in,
                                    len(self.vectorizer.word_map))
        self.vectorizer.add_feature("prev word",
                                    self.vectorizer.sparse_feat_prev_word,
                                    len(self.vectorizer.word_map))

        self.vectorizer.add_feature(
            "word tag", self.vectorizer.sparse_feat_word_and_tag,
            len(self.vectorizer.word_map) * len(self.vectorizer.tag_map), True)
        self.vectorizer.add_feature("DE name gazetter",
                                    self.vectorizer.sparse_feat_in_names, 2)
        self.vectorizer.add_feature("Caps",
                                    self.vectorizer.sparse_feat_is_all_cap, 2)
        self.vectorizer.add_feature("hyphenated",
                                    self.vectorizer.sparse_feat_hyphenated, 2)

        # tag transitions must be added last so that the Viterbi can know where to look
        self.vectorizer.add_feature(
            "prev tag", self.vectorizer.sparse_feat_prev_tag,
            len(self.vectorizer.tag_map) * len(self.vectorizer.tag_map), True)

        # fit vectorizer
        self.vectorizer.fit(tokens, tags)
        # initialize weights
        self.tag_enums = list(enumerate(self.vectorizer.tag_list))
        self.tag_dict = {word: idx for idx, word in self.tag_enums}
        self.initialize_weights(self.vectorizer.vector_size + 1)
        # perceptron train
        self.train(tokens, tags, iterations)
        print("Classifier Fitted")

    def predict(self, file_name):
        """ Wrapper for viterbi inference. Takes filename in GermEval format
            returns predicted tag sequence and actual tag seq in that order.
            returned in a flattened way"""
        tags, tokens = self.read_file(file_name)
        predicted_list = []

        actual_list = []
        for token_seq, tag_seq in tqdm(list(zip(tokens, tags))):
            predicted_tags = self.inference(token_seq)
            predicted_list.extend(predicted_tags)
            actual_list.extend(tag_seq)

        return predicted_list, actual_list

    def read_file(self, fname):
        """ GermEval file parser
            returns list of sequences of both tags and tokens
            (tag sequences first, token sequences second).

            Lines are tab separated: column 1 is the token, column 2 the tag.
            A line with fewer than two columns ends the current sequence and
            lines whose first column is '#' are skipped.
             """
        tag_seq = []
        tok_seq = []
        curr_tok = []
        curr_tag = []

        with open(fname, 'r') as df:
            for line in df:
                fields = line.strip().split("\t")
                if len(fields) < 2:
                    # sequence boundary
                    tag_seq.append(curr_tag)
                    tok_seq.append(curr_tok)
                    curr_tok = []
                    curr_tag = []
                elif fields[0] != '#':
                    curr_tok.append(fields[1])
                    curr_tag.append(fields[2])

        # a file without a trailing blank line would otherwise silently lose
        # its last sequence
        if curr_tok:
            tag_seq.append(curr_tag)
            tok_seq.append(curr_tok)
        return tag_seq, tok_seq

    def inference(self, token_seq, feats_list=False, int_tags=False):
        """ Viterbi Algorithm for decoding. Takes list of tokens and
            returns either list of predicted tags or additionally list of feature vectors

            params:
                token_seq: list of tokens
                feats_list: if True return (tags, features of the best path)
                            instead of only the tags
                int_tags: if True return tag indices instead of the entries
                          of vectorizer.tag_list
              """
        # check input for empty sequence; the return shape follows feats_list
        # so that callers expecting a plain tag list get one
        if len(token_seq) < 1:
            if feats_list:
                return [], []
            return []

        tag_len = len(self.vectorizer.tag_list)
        seq_len = len(token_seq)

        # initialize viterbi/ backpointer charts
        # TODO: revisit the chart representation
        viterbi_chart = np.zeros((seq_len, tag_len))
        bp_chart = np.full((seq_len, tag_len), -1)
        feature_chart = [[{} for j in range(tag_len)] for i in range(seq_len)]
        # initialize first trellis
        for i, tag in self.tag_enums:
            viterbi_chart[0][i] = self.vectorizer.feature_dot(
                token_seq, tag, 0, self.weights)
            feature_chart[0][i] = self.vectorizer.join_features(
                token_seq, [tag], 0, 0)

        # for each word
        for i in range(1, seq_len):
            # for each state
            for j, tag_1 in self.tag_enums:
                best_val = -1000000000000000000
                idx = -1

                # argmax
                # go through states with known transition; the transition
                # weights are looked up at offset partitions[-1]
                tag_curr_id = self.tag_dict[
                    tag_1] * self.vectorizer.tag_sentinel + self.vectorizer.partitions[
                        -1]

                for tag_2 in self.vectorizer.tag2tag[tag_1]:
                    ind_tag_2 = self.tag_dict[tag_2]
                    ind = tag_curr_id + ind_tag_2

                    vs = viterbi_chart[i - 1][ind_tag_2] + self.weights[ind]

                    if vs > best_val:
                        best_val = vs
                        idx = ind_tag_2

                # update charts
                bp_chart[i][j] = idx
                tag_2 = self.tag_enums[idx][1]

                # full feature vector for the chosen (previous tag, tag) pair
                feature = self.vectorizer.feature(token_seq, tag_1, tag_2, i)

                viterbi_chart[i][j] = viterbi_chart[i - 1][idx] + sum(
                    [val * self.weights[k] for k, val in feature.items()])
                feature_chart[i][j] = feature

        # find max and initialize backtrace

        best = np.argmax(viterbi_chart[seq_len - 1])
        # deque to append first
        res = deque()
        feat_vs = deque()
        res.append(best)
        feat_vs.append(feature_chart[len(token_seq) - 1][best])
        # extract path
        for i in range(seq_len - 1, 0, -1):
            res.appendleft(bp_chart[i][int(best)])
            feat_vs.appendleft(feature_chart[i][int(best)])
            best = bp_chart[i][int(best)]

        feat_vs.appendleft(feature_chart[0][int(best)])

        if not int_tags:
            res = [self.vectorizer.tag_list[int(w)] for w in res]

        # always hand back a list of tags (never the internal deque)
        if feats_list:
            return list(res), feat_vs
        return list(res)

    def get_wrong_tags(self, y, tags):
        """ returns the positions at which prediction y and gold tags differ """
        return [i for i in range(len(tags)) if y[i] != tags[i]]

    def train(self, tok_seq, tag_seq, iters=5, learning_rate=1):
        """ Train CRF model using avg. Perceptron algorithm.
                takes list of token/tag sequences and attempts to learn
                something useful. learning rate can be set, but probably
                not that useful. iters set the number of iteration default to 5 """

        # NOTE(review): this is an alias, not a copy -- if add_weights updates
        # its argument in place, self.weights (read by inference) changes
        # with it; confirm that this is intended.
        avg_weights = self.weights

        # zip training data to allow it to shuffle
        # convert to list because shuffling won't work otherwise
        new_tags = [[self.vectorizer.tag_map[tg] for tg in tg_list]
                    for tg_list in tag_seq]
        train_data = list(enumerate(zip(tok_seq, new_tags)))

        num_words = sum([len(seq) for seq in tok_seq])
        num_samples = len(tok_seq)

        # pre calculate gold vectors
        gold_data = self.vectorizer.transform(tok_seq, tag_seq)
        # iterative loop
        for i in range(iters):
            shuffle(train_data)
            print("starting epoch:", i + 1)
            wrong = 0
            for idx, (tokens, tags) in tqdm(train_data):
                y, y_feats = self.inference(tokens,
                                            feats_list=True,
                                            int_tags=True)
                wrong_tags = self.get_wrong_tags(y, tags)

                # if predicted wrong

                if len(wrong_tags) > 0:

                    # collect wrong for accuracy displayed after epoch
                    wrong += len(wrong_tags)

                    predicted = fn.reduce(self.vectorizer.sum_features,
                                          [y_feats[i] for i in wrong_tags])

                    gold_wrong = [gold_data[idx][i] for i in wrong_tags]
                    gold = fn.reduce(self.vectorizer.sum_features, gold_wrong)

                    diff = self.vectorizer.subtract_features(gold, predicted)

                    self.vectorizer.add_weights(avg_weights,
                                                diff,
                                                lr=learning_rate)

            print("accuracy:", (num_words - wrong) / num_words)

        # average
        self.weights = avg_weights / (num_samples * iters)

    def initialize_weights(self, size, fill=0):
        """ (re)sets self.weights to a float64 vector of length size filled with fill """
        # dtype to double just in case
        # but normal float or half could also work
        self.weights = np.full((size, ), fill, dtype=np.float64)

    def save_weights(self, weights_fname, vectorizer_fname):
        """ save to binary.
            arg1: name of weight filename
            arg2: name of vectorizer filename"""

        with open(weights_fname, "wb") as w_file:
            np.save(w_file, self.weights)
        with open(vectorizer_fname, "wb") as vec_file:
            pickle.dump(self.vectorizer, vec_file)

    def load_weights(self, weights_fname, vectorizer_fname):
        """ load from binary.
            arg1: name of weight filename
            arg2: name of vectorizer filename

            NOTE: the vectorizer is unpickled, which can execute arbitrary
            code -- only load files from a trusted source."""

        with open(weights_fname, "rb") as w_file:
            self.weights = np.load(w_file)

        with open(vectorizer_fname, "rb") as vec_file:
            self.vectorizer = pickle.load(vec_file)
def test_with_nested_CV(folder='model',folds=5, plot=True, steps=None):
    '''
    
    Evaluates the classifer by doing nested CV 
    i.e. keeping 1/folds of the data out of the training and doing training 
    (including model selection for regularizer) on the training set and testing
    on the held-out data
    
    Also prints some stats and figures and writes the report and the
    confusion matrix to text files in folder
    
    INPUT
    folder  folder with model files
    folds   number of folds
    plot    if True, save a plot of the confusion matrix to folder/conf_mat.pdf
    steps   list of vectorizer steps, defaults to ['hashing','tfidf']

    '''
    # start timer
    import time
    # a fresh list per call instead of a shared mutable default argument
    if steps is None:
        steps = ['hashing','tfidf']
    t0 = time.time()
    # create bag of words representations
    # NOTE(review): the instance is not used below; the call is kept because
    # constructing it may build the vectorizer -- confirm
    Vectorizer(steps=steps)

    # load data
    vec = Vectorizer(folder=folder)
    data = get_speech_text(folder=folder)
    for key in data.keys():
        data[key] = vec.transform(data[key])
    # fix the label order once; it defines the numerical class ids
    labels = list(data.keys())
    # suffix shared by the output file names
    suffix = '_'.join(sorted(steps))
    # create numerical labels
    Y = hstack([ones(data[label].shape[0])*i for i,label in enumerate(labels)])
    # create data matrix
    X = vstack([data[label] for label in labels])
    # permute data 
    # integer fold size; samples beyond fsize*folds are left out
    fsize = len(Y)//folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx,:]
    idx = reshape(arange(fsize*folds),(folds,fsize))
    Y = Y[:fsize*folds]
    # allocate matrices for predictions
    predicted = zeros(fsize*folds)
    predicted_prob = zeros((fsize*folds,len(data)))
        
    # the regularization parameters to choose from 
    parameters = {'C': (10.**arange(-4,4,1.)).tolist()}
    
    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold,:]
        trainidx = idx[setdiff1d(arange(folds),ifold),:].flatten()
        text_clf = LogisticRegression(class_weight='auto',dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within inner CV loop and use the outer testfold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=(folds-1))
        gs_clf.fit(X[trainidx,:],Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx,:])
        predicted_prob[testidx,:] = gs_clf.predict_proba(X[testidx,:])
        print('************ Fold %d *************'%(ifold+1))
        print(metrics.classification_report(Y[testidx], predicted[testidx],target_names=labels))
    
    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n'%total_time
    dimstr = 'Vocabulary size: %d\n'%X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print('********************************')
    print('************ Total *************')
    print('********************************')
    report += metrics.classification_report(Y, predicted,target_names=labels)
    # dump metrics to file; the context managers close the files again
    # (mode 'wb' is kept from the original)
    with open(folder+'/report_%s.txt'%suffix,'wb') as report_file:
        report_file.write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y,predicted)
    with open(folder+'/conf_mat_%s.txt'%suffix,'wb') as conf_mat_file:
        conf_mat_file.write(json.dumps(conf_mat.tolist()))
    print(conf_mat)
    
    if plot:
        # plot confusion matrix
        import pylab
        # one tick per class instead of a hard-coded 4
        ticks = arange(len(labels))
        ticklabels = [x.decode('utf-8') for x in labels]
        pylab.figure(figsize=(16,16))
        pylab.imshow(conf_mat,interpolation='nearest')
        pylab.colorbar()
        pylab.xticks(ticks,ticklabels)
        pylab.yticks(ticks,ticklabels)
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family' : 'normal', 'size'   : 30}
        pylab.rc('font', **font)
        pylab.savefig(folder+'/conf_mat.pdf',bbox_inches='tight')
class Classifier:

    def __init__(self,folder='model',train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train 

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        model_file = self.folder+'/classifier.pickle'
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(model_file)) or train:
            print('Training classifier')
            self.train()
        print('Loading classifier')
        # the pickle is written in binary mode, so it is read in binary mode,
        # and the file is closed again after loading
        # NOTE: unpickling can execute arbitrary code -- only load model
        # files from a trusted source
        with open(model_file,'rb') as fh:
            clfdict = cPickle.load(fh)
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self,text):
        '''
        Applies the Bag-of-Word extractor and the classifier to some text. 

        INPUT
        text    a string to assign to a party

        OUTPUT
        dict with the input under 'text' and, under 'prediction', one
        {'party', 'probability'} entry per stored label
        
        '''

        # transform string into BoW representation
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x).flatten()
        # transform the predictions into json output
        result = {'text':text,'prediction':[]}
        # the i-th stored label is paired with the i-th probability
        for pidx in range(len(self.parties)): 
            result['prediction'].append(
                {   'party':self.parties[pidx],
                    'probability':probabilities[pidx]
                })
        return result

    def bow(self,text):
        '''
        transforms a string or a list of strings with the Bag-of-Word extractor
        '''
        # a single item is wrapped so the vectorizer always gets a list
        if not isinstance(text, list):
            text = [text]
        return self.bow_vectorizer.transform(text)
   
    def train(self,folds = 2):
        '''
        trains a classifier on the bag of word vectors extracted with extract_bundestag speeches.py
        and stores it, together with the labels, in folder/classifier.pickle

        INPUT
        folds   number of cross-validation folds for optimizing the regularizer of the classifier

        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
            for key in data:
                data[key] = self.bow(data[key])
        except Exception:
            print('Could not load text data file in %s\n'%self.folder + \
                  'Try executing [python downloader.py --download --parse]')
            raise
        # fix the label order once; it defines the numerical class ids
        labels = list(data.keys())
        # create numerical labels for each party
        Y = hstack([ones(data[label].shape[0])*i for i,label in enumerate(labels)])
        # create the data matrix
        X = vstack([data[label] for label in labels])
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx,:]
        # the classifier, accounting for unbalanced classes
        text_clf = LogisticRegression(class_weight='auto',dual=True)
        # the regularizer
        parameters = {'C': (10.**arange(-5,5,1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf, parameters, cv=folds, n_jobs=-1,verbose=2)
        gs_clf.fit(X,Y)
        print("Classifier reached mean %0.2f accuracy with regularizer: %f"%(gs_clf.best_score_, gs_clf.best_params_['C']))
        # dump classifier to pickle
        with open(self.folder+'/classifier.pickle','wb') as fh:
            cPickle.dump({'classifier':gs_clf,'labels':labels},fh,-1)
예제 #7
0
class Trainer(object):
    """Trains the classifier with training data and does the cross validation.

    Typical use, as far as this class shows: call initialize_training_data()
    to load the corpus, then train_and_validate() to fit the three
    classifiers (optionally cross validating and pickling them), then
    build_ui() to print the collected scores.
    """
    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        # Filled by cross_validate(); stays None until it has run.
        self.scores = None

    def initialize_training_data(self):
        """Loads the hand classified tweets used for training.

        Sets self.data to the tweets and self.classification to their
        labels, both as returned by parse_training_corpus.
        """
        corpus_path = os.path.join(datasettings.DATA_DIRECTORY,
                                   'full-corpus.csv')
        # NOTE(review): assumes parse_training_corpus consumes the file
        # before returning, so the handle can be closed right away - confirm.
        with open(corpus_path) as corpus_file:
            classification, tweets = parse_training_corpus(corpus_file)

        # NOTE(review): the IMDB reviews are parsed and labelled here but are
        # not merged into self.data / self.classification below - confirm
        # whether that is intentional before removing or wiring them in.
        reviews_positive = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))
        class_positive = ['positive'] * len(reviews_positive)

        reviews_negative = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))
        class_negative = ['negative'] * len(reviews_negative)

        self.data = tweets
        self.classification = classification

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.

        Returns:
            A (classification_vector, feature_vector) tuple: the numeric
            labels as a numpy array and the result of the vectorizer's
            fit_transform on self.data.
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        classification_vector = numpy.array(
            [SENTIMENT_MAP.get(s.lower(), 0) for s in self.classification])

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Builds the sentiment dictionary and the vector of weights for tweets.

        Each line of the two AFINN files is split on tabs into a word and an
        integer score. A tweet's weight is the sum of the scores of its
        whitespace separated words; words without a score add nothing.

        Returns:
            A list with one summed score per tweet in self.data.
        """
        word_scores = {}
        # AFINN-111 is read last, so its score wins for words in both files.
        for file_name in ('AFINN-96.txt', 'AFINN-111.txt'):
            path = os.path.join(datasettings.DATA_DIRECTORY, file_name)
            with open(path) as afinn_file:
                for line in afinn_file:
                    fields = line.split('\t')
                    word_scores[fields[0]] = int(fields[1])

        # dict.get keeps the lookup a single hash probe per word.
        return [sum(word_scores.get(word, 0) for word in tweet.split())
                for tweet in self.data]

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.

        Returns:
            The feature vector the vectorizer produces for test_data.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation.

        Returns:
            The result of metrics.precision_recall_fscore_support, macro
            averaged over the positive, negative and neutral labels.
        """
        return metrics.precision_recall_fscore_support(
            true,
            predicted,
            pos_label=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
            ],
            average='macro')

    @staticmethod
    def _majority_vote(predictions):
        """Returns the label most classifiers agree on for one sample.

        Only votes for the labels 0, 1 and -1 are counted. When no single
        label has strictly the most votes, the first classifier's prediction
        is used, so every sample always receives exactly one label.
        """
        tallies = [(predictions.count(label), label) for label in (0, 1, -1)]
        top = max(count for count, _ in tallies)
        winners = [label for count, label in tallies if count == top]
        if len(winners) == 1:
            return winners[0]
        return predictions[0]

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Relies on self.feature_vector, self.classification_vector and the
        three classifiers, which train_and_validate() sets up. One score per
        fold is appended to self.scores.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        # Only the validated label array is used; the folds index the
        # stored feature vector directly.
        _, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(k,
                                       self.feature_vector,
                                       self.classification_vector,
                                       classifier=True)
        classifiers = (self.classifier1, self.classifier2, self.classifier3)

        for train, test in cv:
            train_features = self.feature_vector[train]
            train_labels = self.classification_vector[train]
            test_features = self.feature_vector[test]

            # One prediction array per classifier, in classifier order.
            votes = []
            for classifier in classifiers:
                classifier.fit(train_features, train_labels)
                votes.append(classifier.predict(test_features))

            # zip(*votes) yields, per test sample, the tuple of the three
            # classifiers' predictions.
            classification = numpy.array(
                [self._majority_vote(predictions)
                 for predictions in zip(*votes)])

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self,
                           cross_validate=False,
                           mean=False,
                           serialize=False):
        """Trains the classifiers with the training data and validates them.

        Args:
            cross_validate: Falsy to fit on all the data; otherwise the value
                is passed to cross_validate() as the number of folds.
            mean: Not used by this method.
            serialize: When true, the three classifiers and the vectorizer
                are pickled into datasettings.DATA_DIRECTORY.

        Returns:
            self.scores, which is only filled when cross validation ran.
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2',
                                         penalty='l1',
                                         C=1000,
                                         dual=False,
                                         tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            self.classifier1.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier2.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier3.fit(self.feature_vector,
                                 self.classification_vector)

        if serialize:
            classifiers_path = os.path.join(datasettings.DATA_DIRECTORY,
                                            'classifiers.pickle')
            with open(classifiers_path, 'wb') as classifiers_file:
                cPickle.dump(
                    [self.classifier1, self.classifier2, self.classifier3],
                    classifiers_file)
            vectorizer_path = os.path.join(datasettings.DATA_DIRECTORY,
                                           'vectorizer.pickle')
            with open(vectorizer_path, 'wb') as vectorizer_file:
                cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.

        Args:
            mean: When true each score is printed as one number; otherwise
                its first three entries are printed as precision, recall
                and F-score.
        """
        # Each print call gets exactly one argument, so the output is the
        # same whether print is a statement or a function.
        for i, score in enumerate(self.scores):
            print("Cross Validation: %d" % (i + 1))
            print("*" * 40)
            if mean:
                print("Mean Accuracy: %f" % (score))
            else:
                print("Precision\tRecall\t\tF-Score")
                print("~~~~~~~~~\t~~~~~~\t\t~~~~~~~")
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print("%f\t%f\t%f" % (precision, recall, f_score))

            print("")
예제 #8
0
class Classifier:
    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train 

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        modelfile = self.folder + '/classifier.pickle'
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(modelfile)) or train:
            print('Training classifier')
            self.train()
        print('Loading classifier')
        # the pickle is written in binary mode with the highest protocol, so
        # it has to be read back in binary mode as well
        # NOTE: unpickling can execute arbitrary code - only point folder at
        # model files you trust
        with open(modelfile, 'rb') as picklefile:
            clfdict = cPickle.load(picklefile)
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self, text):
        '''
        Applies the Bag-of-Word extractor and the classifier to some text. 

        INPUT
        text    a string to assign to a party

        OUTPUT
        a dict with the input under 'text' and, under 'prediction', one
        {'party', 'probability'} dict per entry of self.parties

        '''

        # transform string into sparse matrix
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x).flatten()
        # transform the predictions into json output
        result = {'text': text, 'prediction': []}
        # entry pidx of the flattened probabilities is reported for the
        # party stored at position pidx of self.parties
        for pidx, party in enumerate(self.parties):
            result['prediction'].append({
                'party': party,
                'probability': probabilities[pidx]
            })
        return result

    def bow(self, text):
        '''
        Transforms text into its bag-of-words representation

        INPUT
        text    a single document or a list of documents; anything that is
                not a list is treated as one document

        '''
        # isinstance (rather than an exact type comparison) lets list
        # subclasses through as they are instead of nesting them in a new list
        if not isinstance(text, list):
            text = [text]
        return self.bow_vectorizer.transform(text)

    def train(self, folds=2):
        '''
        trains a classifier on the bag of word vectors extracted with extract_bundestag speeches.py

        The regularizer C is chosen by a cross-validated grid search; the
        fitted grid search object is pickled together with the party labels
        to <self.folder>/classifier.pickle

        INPUT
        folds   number of cross-validation folds for optimizing the regularizer of the classifier

        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
        except Exception:
            # only loading is guarded, so the hint below is not printed for
            # unrelated failures in the vectorizer
            print('Could not load text data file in %s\n'
                  'Try executing [python downloader.py --download --parse]'
                  % self.folder)
            raise
        # fix the party order once, so that the numerical labels, the rows of
        # the data matrix and the stored label names are guaranteed to agree
        labels = list(data.keys())
        # transform the texts of each party into bag-of-words vectors
        for key in labels:
            data[key] = self.bow(data[key])
        # create numerical labels for each party (index into labels)
        Y = hstack([ones(data[key].shape[0]) * idx
                    for idx, key in enumerate(labels)])
        # create the data matrix
        X = vstack([data[key] for key in labels])
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx, :]
        # the classifier, accounting for unbalanced classes
        # NOTE(review): class_weight='auto' is the spelling of older
        # scikit-learn releases (later ones use 'balanced') - confirm against
        # the installed version
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # the regularizer
        parameters = {'C': (10.**arange(-5, 5, 1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf,
                              parameters,
                              cv=folds,
                              n_jobs=-1,
                              verbose=2)
        gs_clf.fit(X, Y)
        print("Classifier reached mean %0.2f accuracy with regularizer: %f" % (
            gs_clf.best_score_, gs_clf.best_params_['C']))
        # dump classifier to pickle; the with block closes the file handle
        with open(self.folder + '/classifier.pickle', 'wb') as picklefile:
            cPickle.dump({
                'classifier': gs_clf,
                'labels': labels
            }, picklefile, -1)
def test_with_nested_CV(folder='model',
                        folds=5,
                        plot=True,
                        steps=['hashing', 'tfidf']):
    '''
    
    Evaluates the classifer by doing nested CV 
    i.e. keeping 1/folds of the data out of the training and doing training 
    (including model selection for regularizer) on the training set and testing
    on the held-out data
    
    Also prints some stats and figures; the report and the confusion matrix
    are written to <folder>/report_<steps>.txt and <folder>/conf_mat_<steps>.txt
    
    INPUT
    folder  folder with model files
    folds   number of folds
    plot    if True, the confusion matrix is also saved as <folder>/conf_mat.pdf
    steps   vectorizer steps; passed to Vectorizer and used in the file names

    '''
    # start timer
    import time
    t0 = time.time()
    # create bag of words representations
    # NOTE(review): vv is not used below and the data is transformed with
    # vec, which is built without steps - presumably constructing it has a
    # side effect the following lines rely on; confirm before removing
    vv = Vectorizer(steps=steps)

    # load data
    vec = Vectorizer(folder=folder)
    data = get_speech_text(folder=folder)
    # fix the label order once, so that numerical labels, data rows, report
    # names and plot ticks are guaranteed to agree
    labels = list(data.keys())
    for key in labels:
        data[key] = vec.transform(data[key])
    # create numerical labels (index into labels)
    Y = hstack([ones(data[key].shape[0]) * ilabel
                for ilabel, key in enumerate(labels)])
    # create data matrix
    X = vstack([data[key] for key in labels])
    # permute data
    # floor division: fsize is used as an array shape and has to be an int;
    # samples beyond fsize * folds are left out of the evaluation
    fsize = len(Y) // folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx, :]
    # row ifold of idx holds the sample indices of fold ifold
    idx = reshape(arange(fsize * folds), (folds, fsize))
    Y = Y[:fsize * folds]
    # allocate matrices for predictions
    predicted = zeros(fsize * folds)
    predicted_prob = zeros((fsize * folds, len(data)))

    # the regularization parameters to choose from
    parameters = {'C': (10.**arange(-4, 4, 1.)).tolist()}

    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold, :]
        trainidx = idx[setdiff1d(arange(folds), ifold), :].flatten()
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within inner CV loop and use the outer testfold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=(folds - 1))
        gs_clf.fit(X[trainidx, :], Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx, :])
        predicted_prob[testidx, :] = gs_clf.predict_proba(X[testidx, :])
        print('************ Fold %d *************' % (ifold + 1))
        print(metrics.classification_report(Y[testidx],
                                            predicted[testidx],
                                            target_names=labels))

    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n' % total_time
    dimstr = 'Vocabulary size: %d\n' % X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print('********************************')
    print('************ Total *************')
    print('********************************')
    report += metrics.classification_report(Y,
                                            predicted,
                                            target_names=labels)
    # dump metrics to file; the with blocks close the file handles
    with open(folder + '/report_%s.txt' % '_'.join(sorted(steps)),
              'wb') as reportfile:
        reportfile.write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y, predicted)
    with open(folder + '/conf_mat_%s.txt' % '_'.join(sorted(steps)),
              'wb') as conffile:
        conffile.write(json.dumps(conf_mat.tolist()))
    print(conf_mat)

    if plot:
        # print confusion matrix
        import pylab
        pylab.figure(figsize=(16, 16))
        pylab.imshow(conf_mat, interpolation='nearest')
        pylab.colorbar()
        # one tick per label instead of a hard-coded count of four
        pylab.xticks(arange(len(labels)), [x.decode('utf-8') for x in labels])
        pylab.yticks(arange(len(labels)), [x.decode('utf-8') for x in labels])
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family': 'normal', 'size': 30}
        pylab.rc('font', **font)
        pylab.savefig(folder + '/conf_mat.pdf', bbox_inches='tight')