class CRFTrainer(object):
    """Wrap a FrankWolfeSSVM learner configured with a ChainCRF model."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Record the settings and build the underlying learner.

        c_value is forwarded as the learner's ``C`` argument.  Only the
        'ChainCRF' classifier name is accepted; any other value raises
        TypeError.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Guard clause: reject unsupported classifier names up front.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value,
                                  max_iter=50)

    def load_data(self):
        """Return (X, y, folds) taken from the mapping load_letters() gives."""
        dataset = load_letters()
        samples = np.array(dataset['data'])
        targets = np.array(dataset['labels'])
        return samples, targets, dataset['folds']

    def train(self, X_train, y_train):
        """Fit the learner.

        X_train is a numpy array of samples where each sample has the
        shape (n_letters, n_features).
        """
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the learner's score on the given test data."""
        score = self.clf.score(X_test, y_test)
        return score

    def classify(self, input_data):
        """Run the classifier on input data and return the first prediction."""
        predictions = self.clf.predict(input_data)
        return predictions[0]
Exemplo n.º 2
0
def fit_predict(train_docs, test_docs, dataset, C, class_weight, constraints,
                compat_features, second_order, coparents, grandparents,
                siblings, exact_test=False):
    """Fit an ArgumentGraphCRF with FrankWolfeSSVM and predict on test_docs.

    The vectorizers are built from train_docs only and then applied to both
    document sets.  When exact_test is true, ``clf.model.exact`` is switched
    on after fitting and before predicting.

    Returns the tuple (clf, Y_te, Y_pred): the fitted learner, the test
    documents' ``label`` attributes and the predictions for the test set.
    """
    train_stats = stats_train(train_docs)

    prop_vect, _ = prop_vectorizer(
        train_docs,
        which=dataset,
        stats=train_stats,
        n_most_common_tok=None,
        n_most_common_dep=2000,
        return_transf=True,
    )
    link_vect = link_vectorizer(train_docs, train_stats, n_most_common=500)

    # The second-order vectorizer is only built on request.
    if second_order:
        sec_ord_vect = second_order_vectorizer(train_docs)
    else:
        sec_ord_vect = None

    # Only the last two entries of the five-element stats tuple are needed.
    _, _, _, pmi_in, pmi_out = train_stats

    def _featurize(docs):
        # Vectorize every document, then collect the gold labels.
        feats = []
        for doc in docs:
            feats.append(_vectorize(doc, pmi_in, pmi_out, prop_vect,
                                    link_vect, sec_ord_vect))
        targets = [doc.label for doc in docs]
        return feats, targets

    X_tr, Y_tr = _featurize(train_docs)
    X_te, Y_te = _featurize(test_docs)

    crf_options = dict(
        class_weight=class_weight,
        constraints=constraints,
        compat_features=compat_features,
        coparents=coparents,
        grandparents=grandparents,
        siblings=siblings,
    )
    model = ArgumentGraphCRF(**crf_options)

    learner_options = dict(
        C=C,
        random_state=0,
        verbose=1,
        check_dual_every=25,
        show_loss_every=25,
        max_iter=100,
        tol=0,
    )
    clf = FrankWolfeSSVM(model, **learner_options)
    clf.fit(X_tr, Y_tr)

    if exact_test:
        clf.model.exact = True

    return clf, Y_te, clf.predict(X_te)
Exemplo n.º 3
0
class CRFTrainer(object):
    """Trainer that fits a ChainCRF model through a FrankWolfeSSVM learner."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Keep the configuration and create the learner.

        Raises TypeError unless classifier_name equals 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        chain_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain_model, C=self.c_value,
                                  max_iter=50)

    def load_data(self):
        """Load the letters data and return (X, y, folds)."""
        letters = load_letters()
        folds = letters['folds']
        X = np.array(letters['data'])
        y = np.array(letters['labels'])
        return X, y, folds

    # X is a numpy array of samples; each sample is (letters, values).
    def train(self, X_train, y_train):
        """Fit the learner on the training samples and labels."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the score the learner reports for the test data."""
        result = self.clf.score(X_test, y_test)
        return result

    # Run the classifier on the input data.
    def classify(self, input_data):
        """Predict on input_data and return the first prediction."""
        first, = self.clf.predict(input_data)[:1]
        return first
Exemplo n.º 4
0
    def structraining(self, bags, mentions, retweets, labels):
        """Train and score an EdgeFeatureGraphCRF on per-user star graphs.

        For every user in ``bags`` that has an entry in ``labels`` a graph
        is built whose node 0 is the user and whose other nodes are the
        labelled users found in ``mentions[user_id]`` and
        ``retweets[user_id]``.  Every edge connects node 0 to one
        neighbour and carries a single feature: 0 for a mention, 1 for a
        retweet.  Neighbours without a bag get an all-zero feature row of
        length ``self.top_seq``.

        The graphs are split 70 % / 30 % in iteration order, the model is
        fitted on the first part and the score on the second part is
        printed.  Nothing is returned.
        """
        total_datas = []
        total_labels = []
        print('num_user', len(bags))
        for user_id, bag in bags.items():
            if user_id not in labels:
                continue

            # Node 0 is always the user itself.
            node_rows = [bag]
            node_labels = [labels[user_id]]
            edge_pairs = []
            edge_kinds = []

            # Fix: the mention loop used to iterate over
            # enumerate(mentioned_ids), so each "id" was an (index, id)
            # tuple that never matched a key of `labels` and mention
            # edges were silently dropped.
            neighbour_groups = ((0, mentions[user_id]),
                                (1, retweets[user_id]))
            for edge_kind, neighbour_ids in neighbour_groups:
                for neighbour_id in neighbour_ids:
                    if neighbour_id not in labels:
                        continue
                    node_labels.append(labels[neighbour_id])
                    if neighbour_id in bags:
                        node_rows.append(bags[neighbour_id])
                    else:
                        node_rows.append(np.zeros(self.top_seq))
                    # The neighbour just added is the last node so far.
                    edge_pairs.append([0, len(node_labels) - 1])
                    edge_kinds.append([edge_kind])

            # Stack once instead of growing the arrays row by row; the
            # empty leading block keeps the float dtype and width check.
            features = np.vstack(
                [np.empty((0, self.top_seq))] + node_rows)
            edge_nodes = np.array(edge_pairs, dtype=int).reshape(-1, 2)
            edge_features = np.array(edge_kinds,
                                     dtype=float).reshape(-1, 1)
            clique_labels = np.array(node_labels)

            total_datas.append((features, edge_nodes, edge_features))
            total_labels.append(clique_labels)

        ratio = int(len(total_datas) * 0.7)
        print(ratio)
        X_train, y_train = total_datas[:ratio], total_labels[:ratio]
        X_test, y_test = total_datas[ratio:], total_labels[ratio:]

        model = EdgeFeatureGraphCRF(inference_method="max-product")
        ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
        ssvm.fit(X_train, y_train)
        result = ssvm.score(X_test, y_test)
        print(result)
Exemplo n.º 5
0
def n_cross_valid_crf(X, Y, K, command):
    """Run K-fold cross-validation of a ChainCRF trained by FrankWolfeSSVM.

    The accuracy of every fold is printed.  ``command`` selects what else
    happens per fold:

    * 'metrics_F1'       -- call metrics_crf(y_test, y_pred)
    * 'confusion_matrix' -- call confusion_matrix_CRF(y_test, y_pred)
    * 'write_results'    -- collect write_results_CRF(...) rows and, after
      the last fold, print them sorted by their first element as
      tab-separated pairs.

    The print calls use the single-argument parenthesised form, which
    behaves the same on Python 2 and Python 3 (the original print
    statements were a syntax error on Python 3).
    """
    list_write = []

    # NOTE(review): iterating the KFold object directly matches the legacy
    # sklearn.cross_validation API -- confirm against the file's imports.
    cv = KFold(len(X), K, shuffle=True, random_state=0)
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False,
                       class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        print('Accuracy of linear-crf %f:' % ssvm.score(x_test, y_test))
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif command == 'write_results':
            list_write += write_results_CRF(testcv, y_test, y_pred)

        print('------------------------------------------------------')
        print('------------------------------------------------------')

    if command == 'write_results':
        # Sort the collected rows by their index (first element).
        for value in sorted(list_write, key=itemgetter(0)):
            pred_list = value[1]
            test_list = value[2]
            for i, pred in enumerate(pred_list):
                print(str(pred) + '\t' + str(test_list[i]))
class CRFTrainer(object):
    """Convenience wrapper: ChainCRF model trained with FrankWolfeSSVM."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Remember c_value / classifier_name and instantiate the learner.

        A classifier_name other than 'ChainCRF' results in a TypeError.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        supported = self.classifier_name == 'ChainCRF'
        if not supported:
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(
            model=ChainCRF(),
            C=self.c_value,
            max_iter=50,
        )

    def load_data(self):
        """Fetch the letters data and hand back (X, y, folds)."""
        bundle = load_letters()
        raw_X, raw_y, folds = bundle['data'], bundle['labels'], bundle['folds']
        return np.array(raw_X), np.array(raw_y), folds

    def train(self, X_train, y_train):
        """Fit the learner.

        X_train is a numpy array of samples where each sample has the
        shape (n_letters, n_features).
        """
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Score the fitted learner on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Run the classifier on input data; return the first prediction."""
        predicted = self.clf.predict(input_data)
        return predicted[0]
Exemplo n.º 7
0
def pick_best_C_value(train_sentences, sentence_labels, test_SF,
                      test_sentences, test_sentence_labels):
    """Try 19 values of C and return the one giving the highest F1.

    C starts at 0.10 and grows by 0.05 per round.  Each round fits a
    ChainCRF with FrankWolfeSSVM, stores the predictions for
    ``test_sentences`` in ``test_SF['predicted_labels']``, pickles the
    learner to 'models/ote/otemodel.sav' and evaluates with
    evaluating_ote(test_SF).  On ties the later (larger) C wins.

    ``test_sentence_labels`` is accepted but not used by this function.

    Changes from the original: the model file is written through a
    ``with`` block so the handle is always closed, and the print calls
    use the parenthesised form that works on Python 2 and Python 3.
    """
    i = 0.10
    best_C = i
    f_old = 0
    for _ in range(1, 20):
        print("----------------- Training on C-value %f" % i)
        modelCRF = ChainCRF()
        ssvm = FrankWolfeSSVM(model=modelCRF, C=i, max_iter=20,
                              random_state=5)
        ssvm.fit(train_sentences, sentence_labels)
        print("\n")
        print("-------- Training complete --------")

        predictions = ssvm.predict(test_sentences)
        test_SF['predicted_labels'] = predictions

        # Saving model
        # NOTE(review): the file is overwritten every round, so it ends up
        # holding the model for the last C tried, not for best_C -- confirm
        # that this is intended.
        print("Saving model....")
        with open('models/ote/otemodel.sav', 'wb') as model_file:
            pickle.dump(ssvm, model_file)

        # Evaluating the trained CRF model; only the F1 value is needed.
        _, _, f1, _, _, _ = evaluating_ote(test_SF)
        if f1 >= f_old:
            # Remember the value of C that produced this F1.
            f_old = f1
            best_C = i

        i = i + 0.05
    return best_C
def train(trainSetX, trainSetY, testSetX, testSetY):
    """Train an EdgeFeatureGraphCRF with FrankWolfeSSVM and report results.

    The learner is given a SaveLogger writing to
    'imagesegmentation-horse-hog_96_lbp_test.model' (save_every=1).  After
    fitting, evaluatePerformance is called on the training set and on the
    test set.  Nothing is returned.

    The print calls use the single-argument parenthesised form so the
    function parses on Python 3 as well as Python 2; the unused local that
    captured the training-set result has been dropped.
    """
    modelLogger = SaveLogger('imagesegmentation-horse-hog_96_lbp_test.model',
                             save_every=1)

    # NOTE(review): this message is printed although loading is disabled
    # (the modelLogger.load() call below is commented out).
    print('Loading trained model for  CRF')
    #clf = modelLogger.load()

    # Train the first-layer CRF from scratch.
    print('Training CRF...')
    start_time = time.time()
    crf = EdgeFeatureGraphCRF()  #antisymmetric_edge_features=[1,2]
    clf = FrankWolfeSSVM(model=crf,
                         C=10.,
                         tol=.1,
                         verbose=3,
                         show_loss_every=1,
                         logger=modelLogger)  # #max_iter=50
    ##clf = OneSlackSSVM(model=crf, verbose=1, show_loss_every=1, logger=modelLogger)
    clf.fit(numpy.array(trainSetX), numpy.array(trainSetY))
    print('Training CRF took ' + str(time.time() - start_time) + ' seconds')

    print('SUPERPIXELWISE ACCURACY')
    print('-----------------------------------------------------------------------')
    print('')
    print('TRAINING SET RESULTS')
    evaluatePerformance(clf, numpy.array(trainSetX), numpy.array(trainSetY))
    print('')
    print('TEST SET RESULTS')
    evaluatePerformance(clf, numpy.array(testSetX), numpy.array(testSetY))
    print('-----------------------------------------------------------------------')
Exemplo n.º 9
0
class CRFModel(object):
    """ChainCRF model trained with a FrankWolfeSSVM learner."""

    def __init__(self, c_val=1.0):
        """Create the learner; c_val is passed on as its ``C`` argument."""
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=50)

    def load_data(self):
        """Return (X, y, folds) from the mapping load_letters() provides."""
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    def train(self, X_train, y_train):
        """Fit the learner on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the learner's score on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Predict on input_data and return the first prediction."""
        return self.clf.predict(input_data)[0]

    @staticmethod
    def convert_to_letters(indices):
        """Map 0-based alphabet indices to a lowercase string.

        The function was defined inside the class without ``self``, so
        calling it on an instance raised TypeError.  As a staticmethod it
        works both as ``CRFModel.convert_to_letters(idx)`` and as
        ``instance.convert_to_letters(idx)``.
        """
        alphabets = np.array(list(string.ascii_lowercase))
        output = np.take(alphabets, indices)
        return ''.join(output)
Exemplo n.º 10
0
def train_SSVM(X_train, y_train):
    """Fit a linear-chain CRF with FrankWolfeSSVM and return the learner.

    The learner is created with C=0.001 and max_iter=11.  The completion
    message is printed with a parenthesised print call so the function
    parses on Python 3 as well as Python 2.
    """
    # Train using linear chain CRF
    # https://groups.google.com/forum/#!topic/pystruct/KIkF7fzCyDI
    model = ChainCRF()

    # Notes kept from earlier experiments:
    #   NSlackSSVM(model=model, C=.1, max_iter=11) was almost similar to
    #   FrankWolfeSSVM; OneSlackSSVM(model=model) did not work as well;
    #   c=0.2 -> 62.86 % accuracy <==> c=0.1
    ssvm = FrankWolfeSSVM(model=model, C=0.001, max_iter=11)

    ssvm.fit(X_train, y_train)
    print("Learning complete...")

    return ssvm
Exemplo n.º 11
0
class CRFTrainer(object):
    """ChainCRF trainer with feature loading and cross-validation helpers."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Store the settings and build the FrankWolfeSSVM learner.

        Only 'ChainCRF' is supported as classifier_name; any other value
        raises TypeError.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name
        # Only the chain CRF is used to analyze the data, hence the check.
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            # Learner that will be used with the CRF model.
            self.clf = FrankWolfeSSVM(model=model,
                                      C=self.c_value,
                                      max_iter=100)
        else:
            raise TypeError('Invalid classifier type')

    def load_clean_data(self):
        '''
        Load the data frame, run the featurize steps on it and return
        (df, X, y), where X is a numpy array of samples and each sample
        has the shape (n_letters, n_features).
        '''
        df = featurize.get_data()
        featurize.split_words(df)
        featurize.first_letter_uppercase(df)
        featurize.has_number(df)
        featurize.has_slash(df)
        featurize.spacy_pos_tagger(df)
        featurize.pos_ngrams(df)
        featurize.encoding_labels(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        '''
        Conduct 5-fold cross validation and print the score of each fold.

        Every fold trains a fresh learner with C=0.5 and max_iter=15;
        self.clf and self.c_value are not used here.
        '''
        # NOTE(review): KFold(len(X), n_folds=...) iterated directly is the
        # legacy sklearn.cross_validation API -- confirm against imports.
        kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
        for train_idx, test_idx in kf:
            xtrain, xval = X_train[train_idx], X_train[test_idx]
            ytrain, yval = y_train[train_idx], y_train[test_idx]
            model = ChainCRF()
            ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
            ssvm.fit(xtrain, ytrain)
            # Parenthesised print: valid on Python 2 and Python 3.
            print(ssvm.score(xval, yval))

    def train(self, X_train, y_train):
        '''
        Fit the learner on the training data.
        '''
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        '''
        Return the learner's score on the test data.
        '''
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        '''
        Run the classifier on input data and return the first prediction.
        '''
        return self.clf.predict(input_data)[0]
Exemplo n.º 12
0
def train_SSVM(X_train, y_train):
    """Fit a linear-chain CRF with FrankWolfeSSVM and return the learner.

    The learner is created with C=0.001 and max_iter=11.  The completion
    message is printed with a parenthesised print call so the function
    parses on Python 3 as well as Python 2.
    """
    # Train using linear chain CRF
    # https://groups.google.com/forum/#!topic/pystruct/KIkF7fzCyDI
    model = ChainCRF()

    # Notes kept from earlier experiments:
    #   NSlackSSVM(model=model, C=.1, max_iter=11) was almost similar to
    #   FrankWolfeSSVM; OneSlackSSVM(model=model) did not work as well;
    #   c=0.2 -> 62.86 % accuracy <==> c=0.1
    ssvm = FrankWolfeSSVM(model=model, C=0.001, max_iter=11)

    ssvm.fit(X_train, y_train)
    print("Learning complete...")

    return ssvm
Exemplo n.º 13
0
    def create_crf(self):
        """Build the CRF learner, load the training data and train it.

        Training and evaluation are repeated while the score is at most
        0.90 and attempts remain: ten attempts when load_training_data
        reports that a fold was generated, otherwise one.  Exceptions
        raised during an attempt are logged and the attempt is counted.

        :return: True when the final score is greater than zero.
        """
        # Loading the nltk tagger is slow but only has to happen once.
        self.nltk_tagger = nltk.tag._get_tagger()
        self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50)

        loaded = self.load_training_data()
        self.X, self.y, self.label_code, self.folds, generate_fold = loaded

        score = 0
        # Retrying only makes sense when the fold was generated.
        if generate_fold:
            attempts = 10
        else:
            attempts = 1

        for _ in range(attempts):
            # Stop as soon as the score is no longer <= 0.90.
            if not (score <= 0.90):
                break
            try:
                X_train, y_train = self.get_train_data()
                self.train(X_train, y_train)

                X_test, y_test = self.get_test_data()
                score = self.evaluate(X_test, y_test)
            except Exception as e:
                log = current_app.logger
                log.error('Exception: %s' % (str(e)))
                log.error(traceback.format_exc())

        return score > 0
Exemplo n.º 14
0
def test_multinomial_blocks_frankwolfe_batch():
    """Batch-mode FrankWolfeSSVM must reproduce the training labels."""
    samples, targets = generate_blocks_multinomial(n_samples=10, noise=0.3,
                                                   seed=0)
    learner = FrankWolfeSSVM(model=GridCRF(inference_method='qpbo'),
                             C=1, max_iter=500, batch_mode=True)
    learner.fit(samples, targets)
    assert_array_equal(targets, learner.predict(samples))
Exemplo n.º 15
0
class CRFTrainer(object):
    """Holds a FrankWolfeSSVM learner that trains a ChainCRF model."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Set up the trainer.

        c_value becomes the learner's ``C``; classifier_name must be
        'ChainCRF', otherwise TypeError is raised.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        if not self.classifier_name == 'ChainCRF':
            raise TypeError('Invalid classifier type')

        learner_kwargs = dict(model=ChainCRF(), C=self.c_value, max_iter=50)
        self.clf = FrankWolfeSSVM(**learner_kwargs)

    def load_data(self):
        """Read the letters data; return (X, y, folds)."""
        letters = load_letters()
        X = np.array(letters['data'])
        y = np.array(letters['labels'])
        return X, y, letters['folds']

    # X is a numpy array of samples; each sample is (letters, values).
    def train(self, X_train, y_train):
        """Fit the learner on X_train / y_train."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the score of the learner on X_test / y_test."""
        return self.clf.score(X_test, y_test)

    # Run the classifier on the input data.
    def classify(self, input_data):
        """Return the first prediction made for input_data."""
        all_predictions = self.clf.predict(input_data)
        return all_predictions[0]
Exemplo n.º 16
0
def test_multinomial_blocks_frankwolfe():
    """FrankWolfeSSVM (50 iterations, verbose) must fit the blocks exactly."""
    data, truth = generate_blocks_multinomial(n_samples=10, noise=0.5, seed=0)
    grid_model = GridCRF(inference_method='qpbo')
    learner = FrankWolfeSSVM(model=grid_model, C=1, max_iter=50, verbose=3)
    learner.fit(data, truth)
    assert_array_equal(truth, learner.predict(data))
def scope_trainer(sentence_dicts):
    """Train a ChainCRF scope model; return (learner, fitted DictVectorizer).

    Features are extracted in 'training' mode, vectorized to a dense array
    and split into sequences by split_data before fitting.
    """
    instances, labels, splits = extract_features_scope(sentence_dicts,
                                                       'training')
    scope_vec = DictVectorizer()
    dense_features = scope_vec.fit_transform(instances).toarray()
    X_train, y_train = split_data(dense_features, labels, splits)

    learner = FrankWolfeSSVM(model=ChainCRF(), C=0.20, max_iter=10)
    learner.fit(X_train, y_train)
    return learner, scope_vec
Exemplo n.º 18
0
 def __init__(self, c_value, classifier_name='ChainCRF'):
     """Store the settings and create the FrankWolfeSSVM learner.

     Raises TypeError unless classifier_name equals 'ChainCRF'.
     """
     self.c_value = c_value
     self.classifier_name = classifier_name
     # Guard clause: only the chain CRF is supported.
     if self.classifier_name != 'ChainCRF':
         raise TypeError('Invalid classifier type')
     self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value,
                               max_iter=50)
Exemplo n.º 19
0
def test_multinomial_blocks_frankwolfe():
    """Line-search, non-batch FrankWolfeSSVM must fit the blocks exactly."""
    inputs, expected = generate_blocks_multinomial(
        n_samples=50, noise=0.5, seed=0)
    learner = FrankWolfeSSVM(
        model=GridCRF(inference_method='qpbo'),
        C=1,
        line_search=True,
        batch_mode=False,
        check_dual_every=500,
    )
    learner.fit(inputs, expected)
    predicted = learner.predict(inputs)
    assert_array_equal(expected, predicted)
Exemplo n.º 20
0
def main():
    """Fit a ChainCRF labeller from the command line and optionally pickle it.

    Options: --untokfile/-u and --biofile/-b (both default to stdin),
    --outfile/-o (binary output, default None) and --debug/-d.  When an
    output file is given, a dict with the keys 'model', 'feats' and
    'labels' is pickled into it.
    """
    parser = argparse.ArgumentParser(
        description="learn to segment and tokenize (really, any labeling)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # (flags, keyword options) for every command-line argument, in order.
    option_specs = [
        (("--untokfile", "-u"),
         dict(nargs='?', type=argparse.FileType('r'), default=sys.stdin,
              help="untok file")),
        (("--biofile", "-b"),
         dict(nargs='?', type=argparse.FileType('r'), default=sys.stdin,
              help="bio file. must match untok file and be space separated")),
        (("--outfile", "-o"),
         dict(nargs='?', type=argparse.FileType('wb'), default=None,
              help="output file")),
        (("--debug", "-d"),
         dict(action='store_true', default=False, help="debug mode")),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    untokfile = prepfile(args.untokfile, 'r')
    biofile = prepfile(args.biofile, 'r')

    data, labels, datamap, labelmap = prepdata(untokfile, biofile, args.debug)

    # A SubgradientSSVM(model=model, C=.1) was tried here as an alternative.
    ssvm = FrankWolfeSSVM(model=ChainCRF(), max_iter=100, C=.1)
    ssvm.fit(data, labels)

    # Nothing to persist without an output file.
    if args.outfile is None:
        return

    ret = {'model': ssvm, 'feats': datamap, 'labels': labelmap}
    pickle.dump(ret, args.outfile)
Exemplo n.º 21
0
def train_scope_learner(sentence_dicts, C_value):
    """Train a ChainCRF scope learner with the given C.

    Returns (fitted FrankWolfeSSVM, fitted DictVectorizer).  The first
    item returned by extract_features_scope is not used here.
    """
    extracted = extract_features_scope(sentence_dicts, 'training')
    _, scope_instances, scope_labels, sentence_splits = extracted

    vectorizer = DictVectorizer()
    dense_features = vectorizer.fit_transform(scope_instances).toarray()
    X_train, y_train = make_splits(dense_features, scope_labels,
                                   sentence_splits)

    scope_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, vectorizer
Exemplo n.º 22
0
def test_multinomial_blocks_frankwolfe():
    """FrankWolfeSSVM with line search must reproduce the training labels."""
    X, Y = generate_blocks_multinomial(n_samples=50, noise=0.5, seed=0)
    learner_options = dict(C=1, line_search=True, batch_mode=False,
                           check_dual_every=500)
    clf = FrankWolfeSSVM(model=GridCRF(inference_method='qpbo'),
                         **learner_options)
    clf.fit(X, Y)
    assert_array_equal(Y, clf.predict(X))
Exemplo n.º 23
0
 def __init__(self, c_value, classifier_name='ChainCRF'):
     """Keep the settings and build the learner (max_iter=100).

     Raises TypeError unless classifier_name equals 'ChainCRF'.
     """
     self.c_value = c_value
     self.classifier_name = classifier_name
     # Only the chain CRF is used to analyze the data; reject anything else.
     if self.classifier_name != 'ChainCRF':
         raise TypeError('Invalid classifier type')
     # Learner that will be used with the CRF model.
     chain_model = ChainCRF()
     self.clf = FrankWolfeSSVM(model=chain_model, C=self.c_value,
                               max_iter=100)
Exemplo n.º 24
0
    def __init__(self):
        """Create one Pipeline per classifier.

        Every pipeline consists of an 'extract' step followed by a 'clf'
        step.
        """
        # Multinomial Naive Bayes
        mnb_steps = [
            ('extract', ExtractFeatures()),
            ('clf', MultinomialNB(alpha=0.5)),
        ]
        self.classifierMNB = Pipeline(mnb_steps)

        # Maximum Entropy, via logistic regression.  (A pipeline based on
        # nltk's MaxentClassifier was sketched here earlier but is disabled.)
        maxent_steps = [
            ('extract', ExtractFeatures()),
            ('clf', linear_model.LogisticRegression()),
        ]
        self.classifierMaxEnt_LogReg = Pipeline(maxent_steps)

        # CRF
        crf_learner = FrankWolfeSSVM(model=ChainCRF(), C=2, max_iter=10,
                                     tol=0.01)
        crf_steps = [
            ('extract', ExtractFeaturesToArray()),
            ('clf', crf_learner),
        ]
        self.classifierCRF = Pipeline(crf_steps)

        # Support Vector Machine
        svm_steps = [
            ('extract', ExtractFeatures()),
            ('clf', svm.LinearSVC()),
        ]
        self.classifierSVM = Pipeline(svm_steps)
Exemplo n.º 25
0
def graph_crf():
    """Fit a directed GraphCRF on a toy four-node graph and print predictions.

    One hand-built sample (node features, edges, labels) is repeated 100
    times as the training set; the predictions for the first three
    samples are printed.

    Changes from the original: the result is printed with a parenthesised
    print call (valid on Python 2 and Python 3) and the unused GraphCRF
    instance that was created at the top has been removed.
    """
    # Node features.  At most two attributes per node: variables have one
    # attribute (the assigned value), so their second attribute is zero.
    feature_1 = [1, 0]  # var_1
    feature_2 = [2, 0]  # var_2
    # The function has two attributes, so an indicator is used for both.
    feature_3 = [1, 1]  # function
    # The `if` has a single condition, which checks for value 1.
    feature_4 = [1, 0]  # if
    features = np.array([feature_1, feature_2, feature_3, feature_4])

    # Four edges: (v1, v2), (v1, func), (v2, func), (v1, if)
    edge_1 = [0, 1]  # (v1, v2)
    edge_2 = [0, 2]  # (v1, func)
    edge_3 = [1, 2]  # (v2, func)
    edge_4 = [0, 3]  # (v1, if)
    edges = np.array([edge_1, edge_2, edge_3, edge_4])

    X_train_sample = (features, edges)

    # Enumerated action values, one per node (variable, function, if, ...).
    y_train_sample = np.array([0, 0, 1, 2])

    # Build a full training set by repeating the single sample.
    n_samples = 100
    X_train = [X_train_sample] * n_samples
    y_train = [y_train_sample] * n_samples

    model = GraphCRF(directed=True, inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)

    # Predict on the first three samples.
    output = ssvm.predict(X_train[0:3])
    print(output)
Exemplo n.º 26
0
def main():
    """Command-line entry point: fit a ChainCRF labeller and pickle it.

    Reads the untokenized file and the matching BIO file (both default to
    stdin), fits a FrankWolfeSSVM over a ChainCRF and, if --outfile was
    given, pickles a dict holding the model, the feature map and the
    label map.
    """
    parser = argparse.ArgumentParser(
        description="learn to segment and tokenize (really, any labeling)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument
    add("--untokfile", "-u", nargs="?", type=argparse.FileType("r"), default=sys.stdin, help="untok file")
    add(
        "--biofile",
        "-b",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="bio file. must match untok file and be space separated",
    )
    add("--outfile", "-o", nargs="?", type=argparse.FileType("wb"), default=None, help="output file")
    add("--debug", "-d", action="store_true", default=False, help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    untokfile = prepfile(args.untokfile, "r")
    biofile = prepfile(args.biofile, "r")

    data, labels, datamap, labelmap = prepdata(untokfile, biofile, args.debug)

    # SubgradientSSVM(model=model, C=.1) was tried here as an alternative.
    chain_model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=chain_model, max_iter=100, C=0.1)
    ssvm.fit(data, labels)

    ret = {"model": ssvm, "feats": datamap, "labels": labelmap}
    if args.outfile is None:
        # No output file requested: nothing is persisted.
        return
    pickle.dump(ret, args.outfile)
Exemplo n.º 27
0
    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Store the settings and create the FrankWolfeSSVM learner.

        Raises TypeError unless classifier_name equals 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Guard clause: only the chain CRF is supported.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value,
                                  max_iter=50)
Exemplo n.º 28
0
def test_svm_as_crf_pickling_bcfw():
    """A FrankWolfeSSVM saved by SaveLogger must score like the live one.

    Each iris sample becomes a one-node graph without edges.  Both the
    fitted learner and the copy reloaded through the logger have to score
    above 0.97 on the held-out split.

    Changes from the original: ``np.int`` (removed in NumPy 1.24) is
    replaced by the builtin ``int`` it aliased, and the temporary file
    from mkstemp is closed and deleted instead of being leaked.
    """
    import os

    iris = load_iris()
    X, y = iris.data, iris.target

    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)

    # mkstemp returns an open descriptor; only the path is needed here.
    fd, file_name = mkstemp()
    os.close(fd)
    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        os.remove(file_name)
Exemplo n.º 29
0
def test_svm_as_crf_pickling_batch():
    """A FrankWolfeSSVM saved by SaveLogger must score like the live one.

    Same set-up as the bcfw variant, but the learner is created with an
    explicit ``batch_mode=False``.

    Changes from the original: ``np.int`` (removed in NumPy 1.24) is
    replaced by the builtin ``int`` it aliased, and the temporary file
    from mkstemp is closed and deleted instead of being leaked.
    """
    import os

    iris = load_iris()
    X, y = iris.data, iris.target

    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)

    # mkstemp returns an open descriptor; only the path is needed here.
    fd, file_name = mkstemp()
    os.close(fd)
    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        # NOTE(review): the test name says "batch" but batch_mode is False;
        # kept as in the original -- confirm the intended mode.
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50,
                             batch_mode=False)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        os.remove(file_name)
Exemplo n.º 30
0
def model_test(k, head, tail):
    """
    Train a CRF on one fold and evaluate it on the remaining data.

    The ids in ``dataId[head:tail]`` form the training set and all other
    ids form the test set.  The learner is selected by the module-level
    ``LEARNER`` setting.  Precision, recall and F1 are computed from
    statistic_result(...) and from statistic_campaign_result(...), and the
    six values are put on ``result_Queue`` as one list.  ``k`` is only
    used in the timing message.

    Changes from the original: an unsupported ``LEARNER`` now raises
    ValueError instead of failing later with a NameError on ``ssvm``, and
    the trailing ``del`` statement was removed because it raised
    UnboundLocalError for ``Y_pred`` whenever the training set was empty
    (locals are released on return anyway).
    """
    each_fold_time = time.time()  # start timing

    # Divide the ids into train set and test set.
    train_id = dataId[head:tail]
    test_id = dataId[:head] + dataId[tail:]

    X_train = X_arr[train_id, :]
    Y_train = Y_arr[train_id]
    X_test = X_arr[test_id, :]
    Y_test = Y_arr[test_id]
    campTest = Camp_arr[test_id]

    if len(X_train) > 0:
        # Instantiate the CRF.
        EFGCRF = EdgeFeatureGraphCRF(inference_method='qpbo',
                                     class_weight=CLASS_WEIGHT)
        if LEARNER == "OneSlackSSVM":
            # Learn the model parameters with OneSlackSSVM.
            ssvm = OneSlackSSVM(EFGCRF,
                                C=.1,
                                tol=.1,
                                max_iter=100,
                                switch_to='ad3')
        elif LEARNER == "FrankWolfeSSVM":
            # Learn the model parameters with FrankWolfeSSVM.
            ssvm = FrankWolfeSSVM(EFGCRF, C=.1, tol=.1, max_iter=100)
        else:
            # No supported learner selected: fail with a clear message.
            raise ValueError("Unsupported LEARNER: {!r}".format(LEARNER))

        ssvm.fit(X_train, Y_train)
        Y_pred = ssvm.predict(X_test)

        df_result = statistic_result(Y_pred, Y_test, campTest)
        V_precision = precision_score(df_result["label"], df_result["pred"])
        V_recall = recall_score(df_result["label"], df_result["pred"])
        V_f1 = f1_score(df_result["label"], df_result["pred"])

        camps_pred, camps_lbl = statistic_campaign_result(Y_pred, Y_test)
        C_precision = precision_score(camps_lbl, camps_pred)
        C_recall = recall_score(camps_lbl, camps_pred)
        C_f1 = f1_score(camps_lbl, camps_pred)

        result_Queue.put(
            [V_precision, V_recall, V_f1, C_precision, C_recall, C_f1])

    else:
        print("TRAIN SET is NULL")

    print("the {}th fold using time: {:.4f} min".format(
        k + 1, (time.time() - each_fold_time) / 60))
Exemplo n.º 31
0
def n_cross_valid_crf_candidate(list_line, X, Y, K):
    """Run K-fold cross-validation of a chain CRF and write the collected lists.

    For every fold a ChainCRF is trained with FrankWolfeSSVM and the result of
    ``metrics_crf_candidate`` is accumulated across folds. The first three
    accumulated lists are written with ``write_file`` under the names
    ``good_svc``, ``good_road`` and ``good_busstop``.

    Args:
        list_line: Sequence of text lines; every third line (index 0, 3, 6,
            ...) is stripped and split on tabs to form the text of a sample.
        X: Indexable (array-like) collection of feature sequences.
        Y: Indexable (array-like) collection of label sequences.
        K: Number of folds.
    """
    # Only every third line carries the tab-separated text of a sample.
    list_text = np.array([list_line[i].strip().split('\t')
                          for i in range(0, len(list_line), 3)])

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    list_write = []
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]
        list_text_test = list_text[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=10)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)
        list_wrong = metrics_crf_candidate(list_text_test, y_test, y_pred)
        if len(list_write) == 0:
            list_write = list_wrong
        else:
            # Merge this fold's svc / road / busstop lists exactly once.
            # (The previous loop repeated the merge len(list_wrong) times and
            # therefore duplicated every later fold's entries.)
            for idx in range(3):
                list_write[idx] = list_write[idx] + list_wrong[idx]

    write_file('d:/', 'good_svc', list_write[0])
    write_file('d:/', 'good_road', list_write[1])
    write_file('d:/', 'good_busstop', list_write[2])
Exemplo n.º 32
0
def MLfitCRF(data_train, data_test, records, folds):
    """Fit a chain CRF on the training pair and print its score on the test pair.

    Args:
        data_train: Pair ``(features, labels)``; each element is wrapped in a
            one-element array before fitting, i.e. treated as a single
            training sequence.
        data_test: Pair ``(features, labels)`` passed to ``score`` as arrays.
        records: Unused.
        folds: Unused.
    """
    fvector = np.array([data_train[0]])
    labels = np.array([data_train[1]])

    # Create the CRF model and the structured-SVM learner, then train.
    CRFmodel = ChainCRF()
    ssvm = FrankWolfeSSVM(model=CRFmodel, C=0.1)
    ssvm.fit(fvector, labels)

    # Model testing. The score must be computed on the test features; the
    # original referenced the undefined name ``fvector_train`` here.
    fvector_test = np.array(data_test[0])
    labels_test = np.array(data_test[1])
    score = ssvm.score(fvector_test, labels_test)

    print(score)

    return
Exemplo n.º 33
0
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    """Train a linear-chain CRF, print its accuracy and run one report.

    ``command`` selects the report: ``'metrics_F1'`` calls ``metrics_crf``,
    ``'confusion_matrix'`` calls ``confusion_matrix_CRF`` and
    ``'write_results'`` prints the prediction / truth pairs returned by
    ``write_CRFs_compare``, one tab-separated pair per line. Any other value
    only prints the accuracy.
    """
    chain = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    learner = FrankWolfeSSVM(model=chain, C=1.0, max_iter=100)
    learner.fit(X_training, Y_training)
    predictions = learner.predict(X_testing)

    print('Accuracy of linear-crf %f:' % learner.score(X_testing, Y_testing))

    if command == 'metrics_F1':
        metrics_crf(Y_testing, predictions)
        return
    if command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, predictions)
        return
    if command != 'write_results':
        return

    for value in write_CRFs_compare(Y_testing, predictions):
        pred_list = value[0]
        test_list = value[1]
        for idx, pred in enumerate(pred_list):
            print(str(pred) + '\t' + str(test_list[idx]))
Exemplo n.º 34
0
def chaincrf_test():
    """Train a chain CRF on picture data and print its test score.

    Loads ``num_pics`` pictures, rearranges them according to ``mode``
    (0: by pixel, 1: by row, 2: by picture), trains on the leading 66% of the
    samples and scores on the remaining ones.
    """
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)

    print(X.shape)
    print(Y.shape)

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "

    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print(X.shape)
    print(Y.shape)

    # Single split point so that train and test are disjoint. The original
    # started the test slice at floor(0.34 * num_pics), which overlapped the
    # training slice, and used float slice indices.
    # NOTE(review): the split is still based on num_pics, not on len(X) after
    # the rearrangement - confirm the arrange_* helpers keep the sample count.
    train_pct = 0.66
    split = int(math.floor(train_pct * num_pics))
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split], Y[split:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print(outstr)
    print(results)
Exemplo n.º 35
0
def chaincrf_test():
	"""Train a chain CRF on picture data and print its test score.

	Loads ``num_pics`` pictures, rearranges them according to ``mode``
	(0: by pixel, 1: by row, 2: by picture), trains on the leading 66% of the
	samples and scores on the remaining ones.
	"""
	num_pics = 3000
	X, Y = load_pictures(num_pics)
	X = np.array(X)
	Y = np.array(Y)

	print(X.shape)
	print(Y.shape)

	# 0: pixel, 1: row, 2: picture
	mode = 0
	outstr = "Test score with data arranged by "

	if mode == 0:
		X, Y = arrange_by_pixel(X, Y)
		outstr += "pixel:"
	elif mode == 1:
		X, Y = arrange_by_row(X, Y)
		outstr += "row:"
	elif mode == 2:
		X, Y = arrange_by_picture(X, Y)
		outstr += "picture:"

	print(X.shape)
	print(Y.shape)

	# Single split point so that train and test are disjoint. The original
	# started the test slice at floor(0.34 * num_pics), which overlapped the
	# training slice, and used float slice indices.
	# NOTE(review): the split is still based on num_pics, not on len(X) after
	# the rearrangement - confirm the arrange_* helpers keep the sample count.
	train_pct = 0.66
	split = int(math.floor(train_pct * num_pics))
	X_train, X_test = X[:split], X[split:]
	Y_train, Y_test = Y[:split], Y[split:]

	model = ChainCRF()
	ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
	ssvm.fit(X_train, Y_train)
	results = ssvm.score(X_test, Y_test)
	print(outstr)
	print(results)
Exemplo n.º 36
0
def learn(train_set):
    """Train a unary GraphCRF on the given training items and pickle it.

    Features and classes are collected from ``get_features_value`` and
    ``get_segments_classes`` for every entry of ``train_set``. Each feature
    row becomes its own graph with an empty edge list. The fitted learner is
    written to the file ``classifier`` in the working directory.

    Args:
        train_set: Iterable of items understood by ``get_features_value`` and
            ``get_segments_classes``.

    Returns:
        The fitted ``FrankWolfeSSVM`` instance.
    """
    X = []
    y = []
    for num in train_set:
        X += get_features_value(num)
        y += get_segments_classes(num)

    X = np.array(X)

    # One graph per row: the row as a 2-D feature array plus an edge array
    # with zero rows. The builtin ``int`` replaces the ``np.int`` alias, which
    # newer NumPy releases no longer provide.
    X = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    y = np.vstack(y)

    pbl = GraphCRF(inference_method='unary')
    svm = FrankWolfeSSVM(pbl, C=10, max_iter=50)

    svm.fit(X, y)

    # Close the file deterministically instead of leaking the handle.
    with open("classifier", "wb+") as model_file:
        cPickle.dump(svm, model_file)
    return svm
Exemplo n.º 37
0
 def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
   """Set up either a trainer or a tagger for the chosen algorithm.

   Args:
     do_train: When true a trainer is created, otherwise a tagger.
     trained_model_name: Path of the pickled model that is loaded when
       ``algorithm`` is not ``"crf"`` and ``do_train`` is false.
     algorithm: ``"crf"`` uses ``Trainer`` / ``Tagger``; any other value
       uses a ``FrankWolfeSSVM`` over a ``ChainCRF``.
   """
   self.trained_model_name = trained_model_name
   self.fp = FeatureProcessing()
   self.do_train = do_train
   self.algorithm = algorithm
   if algorithm == "crf":
     if do_train:
       self.trainer = Trainer()
     else:
       self.tagger = Tagger()
   else:
     if do_train:
       model = ChainCRF()
       self.trainer = FrankWolfeSSVM(model=model)
       self.feat_index = {}
       self.label_index = {}
     else:
       # NOTE: unpickling can execute arbitrary code; only load model and
       # index files that come from a trusted source.
       # Context managers close the files that were previously leaked.
       with open(self.trained_model_name, "rb") as model_file:
         self.tagger = pickle.load(model_file)
       with open("ssvm_feat_index.pkl", "rb") as feat_file:
         self.feat_index = pickle.load(feat_file)
       with open("ssvm_label_index.pkl", "rb") as label_file:
         label_index = pickle.load(label_file)
       # Invert the loaded mapping so that values map back to their keys.
       self.rev_label_index = {i: x for x, i in label_index.items()}
Exemplo n.º 38
0
def Chain_CRF(x, y, x_test, model_args):
    """Fit an undirected chain CRF and return flattened test predictions.

    Only the first 11 feature columns of ``x`` and ``x_test`` are used. The
    rows are regrouped into sequences of 21600 steps before training.

    Args:
        x: 2-D training feature array.
        y: Training labels, reshaped to ``(-1, 21600)``.
        x_test: 2-D test feature array.
        model_args: Mapping providing ``'C'`` and ``'max_iter'`` for the
            ``FrankWolfeSSVM`` learner.

    Returns:
        The predictions for ``x_test`` as a flattened array.
    """
    n_columns = 11
    seq_len = 21600

    # Keep the leading feature columns, then regroup rows into sequences.
    train_feats = x[:, :n_columns]
    test_feats = x_test[:, :n_columns]
    feat_dim = train_feats.shape[-1]
    train_seqs = train_feats.reshape(-1, seq_len, feat_dim)
    test_seqs = test_feats.reshape(-1, seq_len, feat_dim)
    train_labels = y.reshape(-1, seq_len)

    learner = FrankWolfeSSVM(model=ChainCRF(directed=False),
                             C=model_args['C'],
                             max_iter=model_args['max_iter'])
    learner.fit(train_seqs, train_labels)
    predicted = np.array(learner.predict(test_seqs))
    return predicted.flatten()
Exemplo n.º 39
0
def chain_crf():
    """Train a chain CRF on the letters data (fold 1) and print diagnostics.

    Prints the loaded data set, the sizes of the train / test partitions, the
    shapes of two sample sequences, the return value of ``fit`` and finally
    the score on the samples that are not in fold 1.
    """
    letters = load_letters()
    x = letters['data']
    y = letters['labels']
    folds = letters['folds']
    print("Letters : ")
    print(letters)

    x, y = np.array(x), np.array(y)
    in_train = folds == 1
    in_test = folds != 1
    x_train, x_test = x[in_train], x[in_test]
    y_train, y_test = y[in_train], y[in_test]
    print(len(x_train))
    print(len(x_test))
    print("Done")

    # Shapes of the first and the eleventh training sequence.
    for sample_idx in (0, 10):
        print(x_train[sample_idx].shape)
        print(y_train[sample_idx].shape)

    learner = FrankWolfeSSVM(model=ChainCRF(), C=.1, max_iter=10)
    print(learner.fit(x_train, y_train))
    print(learner.score(x_test, y_test))
Exemplo n.º 40
0
def test_ssvm_objectives():
    """Check that each learner's last recorded objective matches objective_primal.

    Not a very strong test, but it at least makes sure that the objective
    function is evaluated for every learner.
    """
    X, Y = generate_blocks_multinomial(n_samples=10, noise=1.5, seed=0)
    n_labels = len(np.unique(Y))
    crf = GridCRF(n_states=n_labels, inference_method=inference_method)

    # n-slack cutting plane
    n_slack = NSlackSSVM(model=crf, max_iter=5, C=1, tol=.1)
    n_slack.fit(X, Y)
    expected = objective_primal(n_slack.model, n_slack.w, X, Y, n_slack.C)
    assert_almost_equal(n_slack.primal_objective_curve_[-1], expected)

    # one-slack cutting plane
    one_slack = OneSlackSSVM(model=crf, max_iter=5, C=1, tol=.1)
    one_slack.fit(X, Y)
    expected = objective_primal(one_slack.model, one_slack.w, X, Y,
                                one_slack.C, variant='one_slack')
    assert_almost_equal(one_slack.primal_objective_curve_[-1], expected)

    # subgradient descent; should also work in batch mode
    subgradient = SubgradientSSVM(model=crf, max_iter=5, C=1, batch_size=-1)
    subgradient.fit(X, Y)
    expected = objective_primal(subgradient.model, subgradient.w, X, Y,
                                subgradient.C)
    assert_almost_equal(subgradient.objective_curve_[-1], expected)

    # batch Frank-Wolfe
    batch_fw = FrankWolfeSSVM(model=crf, max_iter=5, C=1, batch_mode=True)
    batch_fw.fit(X, Y)
    expected = objective_primal(batch_fw.model, batch_fw.w, X, Y, batch_fw.C)
    assert_almost_equal(batch_fw.primal_objective_curve_[-1], expected)

    # block-coordinate Frank-Wolfe
    block_fw = FrankWolfeSSVM(model=crf, max_iter=5, C=1, batch_mode=False)
    block_fw.fit(X, Y)
    expected = objective_primal(block_fw.model, block_fw.w, X, Y, block_fw.C)
    assert_almost_equal(block_fw.primal_objective_curve_[-1], expected)
Exemplo n.º 41
0
def build_models(X_train, y_train):
    '''
    PURPOSE:    fit three classifiers on the same training data
    INPUT:      X_train - per-sequence feature arrays
                y_train - per-sequence label arrays
    OUTPUT:     nmb (MultinomialNB obj) - fitted on the stacked sequences
                svm (LinearSVC obj) - fitted on the stacked sequences
                ssvm (FrankWolfeSSVM obj) - chain CRF fitted on the sequences
    '''
    # The two flat classifiers see all sequences stacked into one data set.
    stacked_X = np.vstack(X_train)
    stacked_y = np.hstack(y_train)

    # Multinomial Naive Bayes classifier
    nmb = MultinomialNB()
    nmb.fit(stacked_X, stacked_y)

    # Linear support vector machine classifier
    svm = LinearSVC(dual=False, C=.1)
    svm.fit(stacked_X, stacked_y)

    # Chain conditional random field, trained on the unstacked sequences
    ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.5, max_iter=15)
    ssvm.fit(X_train, y_train)

    return nmb, svm, ssvm
Exemplo n.º 42
0
class CRFModel(object):
    """Thin wrapper around a FrankWolfeSSVM learner with a ChainCRF model."""

    def __init__(self, c_val=1.0):
        chain_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain_model, C=c_val, max_iter=100)

    def load_data(self):
        """Return the letters data as ``(X, y, folds)``."""
        dataset = load_letters()
        features = np.array(dataset['data'])
        targets = np.array(dataset['labels'])
        return features, targets, dataset['folds']

    def train(self, X_train, y_train):
        """Fit the wrapped learner on the given sequences."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the wrapped learner's score on the given data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Return the prediction for the first item of ``input_data``."""
        predictions = self.clf.predict(input_data)
        return predictions[0]
Exemplo n.º 43
0
def main():
  """Train a chain CRF from the input files and optionally pickle it.

  Parses the command line, prepares the data with ``prepdata`` and fits a
  ``FrankWolfeSSVM`` over a ``ChainCRF``. When ``--outfile`` is given, a dict
  holding the fitted learner (``'model'``), the feature map (``'feats'``) and
  the label map (``'labels'``) is pickled to it.
  """
  parser = argparse.ArgumentParser(
      description="learn to tokenize",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--untokfile", "-u", nargs='?',
                      type=argparse.FileType('r'), default=sys.stdin,
                      help="untok file")
  parser.add_argument("--biofile", "-b", nargs='?',
                      type=argparse.FileType('r'), default=sys.stdin,
                      help="bio file")
  parser.add_argument("--outfile", "-o", nargs='?',
                      type=argparse.FileType('wb'), default=None,
                      help="output file")
  parser.add_argument("--debug", "-d", action='store_true', default=False,
                      help="debug mode")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  untokfile = prepfile(args.untokfile, 'r')
  biofile = prepfile(args.biofile, 'r')

  data, labels, datamap, labelmap = prepdata(untokfile, biofile, args.debug)

  # Chain CRF trained with Frank-Wolfe.
  ssvm = FrankWolfeSSVM(model=ChainCRF(), max_iter=100, C=.1)
  ssvm.fit(data, labels)

  # Bundle everything that is needed to apply the model later.
  ret = {'model': ssvm, 'feats': datamap, 'labels': labelmap}
  if args.outfile is not None:
    pickle.dump(ret, args.outfile)
Exemplo n.º 44
0
def CRF_pred_label(X, Y, command):
    """Train a chain CRF on ``X`` / ``Y`` and write predicted labels for demo data.

    The demo texts and features are loaded for ``command``, the CRF predicts
    one label sequence per feature sequence, and each sequence is written as
    one tab-separated line via ``write_file``.

    Args:
        X: Training feature sequences.
        Y: Training label sequences.
        command: Data source; one of ``'twitter'``, ``'sgforums'`` or
            ``'facebook'``.

    Raises:
        ValueError: If ``command`` is not a known data source (previously
            this ended in a NameError after the model had been trained).
    """
    base_dir = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/'
    # Text filter applied per data source; the branches differ only in this.
    filter_modes = {
        'twitter': 'removeLink',
        'sgforums': 'removePunc',
        'facebook': 'removeLink',
    }
    if command not in filter_modes:
        raise ValueError('Unknown command: %r' % (command,))

    texts = load_demo_text(command)
    convert_texts = filterText_demo(texts, filter_modes[command], command)
    X_ftr = load_demo_ftr(command)
    print('%d %d' % (len(convert_texts), len(X_ftr)))
    path_write = base_dir + command
    name_write = 'pred_label_' + command

    crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X, Y)
    y_pred = ssvm.predict(X_ftr)

    # One tab-separated line of labels per predicted sequence.
    list_write = ['\t'.join(str(label) for label in line).strip()
                  for line in y_pred]

    write_file(path_write, name_write, list_write)
Exemplo n.º 45
0
def trainModel_Basic(num_iter=5,inference="qpbo",trainer="NSlack",num_train=2,num_test=1,C=0.1,edges="180x180_dist1_diag0",inputs=[1,1,1,1,1,1],features="all",directed=False,savePred=False):
    """Train a GraphCRF on image slices, evaluate it per test folder and log results.

    Training slices come from ``extractSlices2(train_path, ...)``; every folder
    below ``test_path`` is evaluated with ``extractTestSlices2``. Per-folder
    results go to ``CRFResults/<filename>.csv``, the per-layer scores are
    pickled to ``CRFModel/<filename>_layer.pkl`` and a summary row is appended
    to ``CRFResults/TestResultFinal.csv`` (all relative to the working
    directory).

    Args:
        num_iter: ``max_iter`` of the structured SVM learner.
        inference: Inference method for the CRF; ``'ogm'`` selects
            ``('ogm', {'alg': 'fm'})``.
        trainer: ``"Frank"`` -> FrankWolfeSSVM, ``"NSlack"`` -> NSlackSSVM,
            anything else -> OneSlackSSVM.
        num_train: Passed to ``extractSlices2``.
        num_test: Maximum number of test folders to evaluate; ``-1`` means
            no limit.
        C: Regularisation parameter of the learner.
        edges: Name of the edge file (without ``.npy``) below
            ``/home/bmi/CRF/edges/``.
        inputs: Passed through to the slice extraction helpers. The default
            is a shared list; this function does not modify it.
        features: Label used in the output file names and the summary row.
        directed: Passed to ``GraphCRF``; when ``True`` the label
            ``'+directed'`` is appended to ``features``.
        savePred: Currently unused; saving of model and predictions is
            disabled.
    """
    padding = (30, 30, 30, 30)

    if directed == True:
        features += '+directed'

    resultsDir = os.getcwd() + '/CRFResults'
    nameLen = len(os.listdir(resultsDir))
    # Remember the edge-file name; ``edges`` is rebound to the loaded array below.
    edgeFeature = edges
    filename = (str(nameLen) + '_CRF_iter_' + str(num_iter) + "_" + inference
                + "_" + trainer + "_" + features + "_" + str(num_train) + "_"
                + str(num_test) + "_" + edgeFeature)

    print("Loading training slices")

    start = time.clock()
    train = extractSlices2(train_path, num_train, padding, inputs=inputs)
    end = time.clock()
    train_load_time = (end - start) / 60.0

    [trainLayers, trainTruth, sliceShape] = train
    print("Training slices loaded in %f" % (train_load_time))

    n_features = len(trainLayers[0][0, 0])
    print("Layer shape is : ")
    print(trainLayers[0].shape)

    print("Training the model")
    edges = np.load("/home/bmi/CRF/edges/" + edges + ".npy")

    # Every slice uses the same edge array.
    G = [edges for x in trainLayers]

    print(trainLayers[0].shape)

    # Flatten each slice to (n_pixels, n_features) / (n_pixels,).
    trainLayers = np.array([x.reshape((sliceShape[0] * sliceShape[1], n_features)) for x in trainLayers])
    trainTruth = np.array([x.reshape((sliceShape[0] * sliceShape[1],)).astype(int) for x in trainTruth])

    if inference == 'ogm':
        crf = GraphCRF(inference_method=('ogm', {'alg': 'fm'}), directed=directed)
    else:
        crf = GraphCRF(inference_method=inference, directed=directed)

    if trainer == "Frank":
        svm = FrankWolfeSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=6, verbose=1)
    elif trainer == "NSlack":
        svm = NSlackSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=-1, verbose=1)
    else:
        svm = OneSlackSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=-1, verbose=1)

    start = time.clock()
    # Materialise the pairs: they are consumed twice (fit and predict).
    asdf = list(zip(trainLayers, G))
    svm.fit(asdf, trainTruth)
    end = time.clock()
    train_time = (end - start) / 60.0
    print("The training took %f" % (train_time))
    print("Model parameter size :")
    print(svm.w.shape)

    print("making predictions on train data")
    predTrain = svm.predict(asdf)
    trainDice = []
    for i in range(len(trainLayers)):
        diceScore = accuracy(predTrain[i], trainTruth[i])
        trainDice.append(diceScore)
    meanTrainDice = sum(trainDice) / len(trainLayers)

    del trainLayers, trainTruth

    # ------------------------------------------------------------------
    # Per-folder evaluation on the test data
    # ------------------------------------------------------------------
    overallDicePerPatient = []
    extDicePerPatient = []
    PatientTruthLayers = []
    PatientPredLayers = []
    PREC = []
    RECALL = []
    F1 = []
    LayerwiseDiceTotal = []

    counter = 0
    print("Loading the test slices")
    # The context manager closes the per-run CSV even if evaluation fails.
    with open(os.getcwd() + "/CRFResults/" + filename + ".csv", 'a') as testResultFile:
        # NOTE(review): the header's last column is named "F1", but the rows
        # below write extDice followed by the per-layer scores - confirm which
        # one is intended before relying on this file.
        testResultFile.write("folderName,numLayers, Overall Dice, precision , recall, F1"+"\n")

        for folder in os.listdir(test_path):
            path = test_path + "/" + folder
            layerDiceScores = ''

            data = extractTestSlices2(path, padding, inputs=inputs)
            if data == 0:
                # Nothing could be extracted for this folder. Skip it instead
                # of evaluating stale (or unbound) variables from an earlier
                # iteration, as the original code did.
                continue
            [testLayers, testTruth, sliceShape, startSlice, endSlice] = data

            GTest = [edges for x in testLayers]
            testLayers = np.array([x.reshape((sliceShape[0] * sliceShape[1], n_features)) for x in testLayers])
            testTruth = np.array([x.reshape((sliceShape[0] * sliceShape[1],)).astype(int) for x in testTruth])

            asdfTest = list(zip(testLayers, GTest))
            predTest = svm.predict(asdfTest)

            LayerwiseDice = []

            for i in range(len(testLayers)):
                diceScore = accuracy(predTest[i], testTruth[i])
                layerDiceScores += "," + str(diceScore)
                if math.isnan(diceScore):
                    # A NaN score counts as 1.0 when both prediction and
                    # truth sum to zero; otherwise the layer is left out.
                    if sum(predTest[i]) == 0 and sum(testTruth[i]) == 0:
                        LayerwiseDice.append(1.0)
                    continue
                LayerwiseDice.append(diceScore)

            LayerwiseDiceTotal.append(LayerwiseDice)

            flatTruth = np.hstack(testTruth)
            flatPred = np.hstack(predTest)
            overallTestDice = accuracy(flatPred, flatTruth)
            # Mean over the first ten and the last ten layer scores.
            outerLayers = list(range(10)) + list(range(len(LayerwiseDice) - 10, len(LayerwiseDice)))
            extDice = np.mean(np.array(LayerwiseDice)[outerLayers])
            prec = precision_score(flatTruth, flatPred)
            recall = recall_score(flatTruth, flatPred)
            f1 = f1_score(flatTruth, flatPred)
            print("Patient %d : Overall test DICE for %s is : %f and extDice is %f"%(counter,folder,overallTestDice,extDice))
            print("Precision : %f  Recall : %f  F1 : %f " %(prec,recall,f1))
            print("__________________________________________")

            testResultFile.write(folder+","+str(len(testLayers)) + ","+ str(overallTestDice) + ","+str(prec)+","+str(recall)+","+str(extDice)+layerDiceScores+"\n" )
            overallDicePerPatient.append(overallTestDice)
            extDicePerPatient.append(extDice)
            PREC.append(prec)
            RECALL.append(recall)
            F1.append(f1)

            PatientTruthLayers.append(testTruth)
            PatientPredLayers.append(predTest)

            counter += 1
            if counter == num_test and num_test != -1:
                break

    print("Done testing slices")
    overallDice = sum(overallDicePerPatient) / len(PatientTruthLayers)
    overallPrec = sum(PREC) / len(PatientTruthLayers)
    overallRecall = sum(RECALL) / len(PatientTruthLayers)
    overallExtDice = np.mean(extDicePerPatient)
    print("Overall DICE : %f Precision : %f Recall : %f extDice : %f  "%(overallDice,overallPrec,overallRecall,overallExtDice))
    print("############################################")

    # Pickling of the trained model and of the predictions is disabled, which
    # is why ``savePred`` has no effect. Only the per-layer scores are saved.
    with open(os.getcwd() + "/CRFModel/" + filename + "_layer.pkl", 'wb') as layerDataLog:
        cPickle.dump(LayerwiseDiceTotal, layerDataLog, protocol=cPickle.HIGHEST_PROTOCOL)

    # Append one summary row for this run to the shared results file.
    resultLog = os.getcwd() + "/CRFResults/TestResultFinal.csv"
    with open(resultLog, 'a') as resultFile:
        resultFile.write(time.ctime()+","+str(num_iter)+","+str(num_train)+","+str(num_test)+","+inference+","+
        trainer+","+str(C)+","+str(train_time)+","+str(meanTrainDice)+","+str(overallDice)+","+
        str(np.std(overallDicePerPatient))+","+edgeFeature+","+"None"+","+features+","+filename +","+ str(overallPrec) +","+ str(overallRecall) +","+ str(overallExtDice)+","+"Flair(5)+T2(9)-Without last 4 train Layers"+"\n")

    return
Exemplo n.º 46
0
net_base_path = '/media/ohadsh/sheard/googleDrive/Master/courses/probabilistic_graphical_models/outputs/part_3/training_2016_06_11/'
# Load the pickled predictions of the pre-trained network.
train_name = 'train_pred_-1.pkl'
test_name = 'test_pred_-1.pkl'
# Pickle data must be read in binary mode; text mode ('r') fails on Python 3
# and can corrupt binary pickles on platforms that translate line endings.
with open(os.path.join(net_base_path, train_name), 'rb') as f:
    train_net_pred = cPickle.load(f)
with open(os.path.join(net_base_path, test_name), 'rb') as f:
    test_net_pred = cPickle.load(f)

# Rearrange the network predictions for the CRF
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train LCCRF on the raw inputs
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(X_train, y_train)

# Train LCCRF+NN on the rearranged network predictions
chain_model = ChainCRF(directed=True)
chain_ssvm_nn = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm_nn.fit(nn_predictions_train, y_train)

# This figure is a hard-coded string, not computed here.
print("Test score with linear NN: 84.15%")

print("Test score with LCCRF: %f" % chain_ssvm.score(X_test, y_test))

print("Test score with LCCRF+NN: %f" % chain_ssvm_nn.score(nn_predictions_test, y_test))

# Plot some word sequences
n_words = 4
Exemplo n.º 47
0
import loader
import util
from sklearn import preprocessing


# Location of the data set and the name of the feature file (without ".csv").
directory = "/Users/thijs/dev/boilerplate/src/main/resources/dataset/"
featureset = "features10"

print("Load files")
# Load the features and labels from their CSV files below `directory`.
features, labels = \
  loader.loadBinary(featureset+'.csv', 'labels.csv', directory)

# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

print("Loaded")
# print(labels)

# features = preprocessing.scale(features)


from pystruct.models import BinaryClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM,
                               SubgradientSSVM, FrankWolfeSSVM)
# Fit a Frank-Wolfe structured SVM over the BinaryClf model.
clf = FrankWolfeSSVM(BinaryClf(),verbose=True)
# print(clf)
clf.fit(features,labels)
# Score on the training data itself; the value is only used by the
# commented-out print below.
trscore = clf.score(features,labels)

# print("Training score: {0}".format(trscore))
# "Klaar" is Dutch for "done".
print("Klaar")
Exemplo n.º 48
0
def run_crf(w2v, words_before, words_after, shallow_parse):
    """Cross-validate a chain CRF and print the words it tags in each abstract.

    For each of the 5 folds the learner is fitted, its score is printed and,
    for every test abstract, the words whose predicted label is 1 are printed
    individually and then joined into one ``predicted: ...`` line.

    Args:
        w2v: Passed through to ``abstract2features``.
        words_before: Passed to ``abstract2features`` when building the
            training data.
        words_after: Passed to ``abstract2features`` when building the test
            data.
        shallow_parse: Passed through to ``abstract2features``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)

    # Create the model.
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # list() keeps the ids indexable by fold position on Python 3 as well.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): both calls receive the complete pmids_dict; the fold
        # indices are not used to select the training or test abstracts here.
        # Confirm whether abstract2features should be given the per-fold ids.
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        for pmid, x, y in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes a list and returns a list
            prediction = ssvm.predict([x])[0]

            # Pair every word with its predicted label. The original indexed
            # abstract_words with the label value itself, so it always picked
            # the word at position 1.
            tagged_words = [word for word, label
                            in zip(abstract_words, prediction) if label == 1]
            for word in tagged_words:
                print("word: {}".format(word))

            if tagged_words:
                output = 'predicted: {}'.format(' '.join(tagged_words))
            else:
                output = 'Predicted nothing!'
            print(output)
Exemplo n.º 49
0
    # break
    list_y.append(len(y[i]))
print 'Shape of targets:', y.shape
print 'Max length:', max(list_y)

# Fold 1 is the training partition, everything else is the test partition.
features_train, features_test = features[folds == 1], features[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Build (features, edges) pairs for GraphCRF.
# NOTE(review): the edge array is derived from f_t.shape[0] (the number of
# training samples), not from the individual features_i, and the same array
# is reused for the test samples below - confirm this is intended.
f_t = features_train
X_train = [(features_i, np.vstack([np.arange(f_t.shape[0] - 1), np.arange(1, f_t.shape[0])])) for features_i in f_t]
print 'Loading X_train'
f_test = features_test
X_test = [(features_i, np.vstack([np.arange(f_t.shape[0] - 1), np.arange(1, f_t.shape[0])])) for features_i in f_test]
print 'Loading X_test'

# Debug output: sizes and types of the training data, then the length of
# every (features, edges) pair.
print len(X_train), len(y_train)
print type(X_train), type(y_train)
for each in X_train:
    print len(each)

# Train a directed GraphCRF with Frank-Wolfe; `start` is only read by the
# commented-out timing print below.
start = time()
model = GraphCRF(directed=True, inference_method="max-product")
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
#
# print 'accuracy of GraphCRF %f:' % ssvm.score(X_test, y_test), ' time spend: %f' % (time()-start)




Exemplo n.º 50
0
net_base_path = '/media/ohadsh/sheard/googleDrive/Master/courses/probabilistic_graphical_models/outputs/part_3/training_2016_06_11/'
# Load the pickled predictions of the pre-trained network.
train_name = 'train_pred_-2.pkl'
test_name = 'test_pred_-2.pkl'
# Pickle data must be read in binary mode; text mode ('r') fails on Python 3
# and can corrupt binary pickles on platforms that translate line endings.
with open(os.path.join(net_base_path, train_name), 'rb') as f:
    train_net_pred = cPickle.load(f)
with open(os.path.join(net_base_path, test_name), 'rb') as f:
    test_net_pred = cPickle.load(f)

# Rearrange the network predictions for the CRF
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train a CRF in which every letter forms its own length-1 sequence.
# NOTE(review): 5375 and 128 are hard-coded; presumably the number of letters
# and the feature size of the stacked predictions - confirm against the data.
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(np.vstack(nn_predictions_train).reshape((5375, 1, 128)), np.hstack(y_train).reshape(5375, 1))

# Train a linear-chain CRF on the sequences of network predictions
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(nn_predictions_train, y_train)


# # Create linear regression object
# regr = LinearRegression()
# # Train the model using the training sets
# regr.fit(np.vstack(nn_predictions_train), np.hstack(y_train))

# print("Test score with linear regression: %f" % regr.score(np.vstack(nn_predictions_test),
#                                                    np.hstack(y_test)))
Exemplo n.º 51
0
from pystruct.models import GridCRF
from pystruct.learners import (NSlackSSVM, OneSlackSSVM, SubgradientSSVM,
                               FrankWolfeSSVM)
from pystruct.datasets import generate_crosses_explicit

# Generate the data set and build a GridCRF with one state per distinct label,
# using AD3 inference with branch-and-bound enabled.
X, Y = generate_crosses_explicit(n_samples=50, noise=10, size=6, n_crosses=1)
n_labels = len(np.unique(Y))
crf = GridCRF(n_states=n_labels, inference_method=("ad3", {'branch_and_bound': True}))

# One learner of each kind, all sharing the same CRF model.
n_slack_svm = NSlackSSVM(crf, check_constraints=False,
                         max_iter=50, batch_size=1, tol=0.001)
one_slack_svm = OneSlackSSVM(crf, check_constraints=False,
                             max_iter=100, tol=0.001, inference_cache=50)
subgradient_svm = SubgradientSSVM(crf, learning_rate=0.001, max_iter=20,
                                  decay_exponent=0, momentum=0)
bcfw_svm = FrankWolfeSSVM(crf, max_iter=50, check_dual_every=4)

# n-slack cutting plane ssvm
n_slack_svm.fit(X, Y)

# 1-slack cutting plane ssvm
one_slack_svm.fit(X, Y)

# online subgradient ssvm
subgradient_svm.fit(X, Y)

# Block coordinate Frank-Wolfe
bcfw_svm.fit(X, Y)

# Don't plot the objective from cached inference for 1-slack: invert
# cached_constraint_ to mark the iterations without cached constraints.
inference_run = ~np.array(one_slack_svm.cached_constraint_)
# print x
# for value in x:
#     print value
# Label sequences for the two training samples.
y = [0, 1, 1, 2, 2]
y_1 = [0, 1, 1, 2, 2]
# print y

# Collect the two samples; `x` and `x_1` are defined outside this excerpt.
list_x, list_y = [], []
list_x.append(np.array(x))
list_x.append(np.array(x_1))
list_y.append(y)
list_y.append(y_1)

# crf = ChainCRF(inference_method='max-product')
# Fit an undirected chain CRF on the two samples.
crf = ChainCRF(inference_method="max-product", directed=False)
ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
ssvm.fit(np.array(list_x), np.array(list_y))

test_x = np.array(list_x)
test_y = np.array(list_y)
# print np.array(list_x)[0].shape[1]

# A single test sequence with two steps of four features each.
x_test = [[1, 0, 0, 0], [1, 0, 1, 0]]
list_x_test = list()
list_x_test.append(x_test)

pred = ssvm.predict(np.array(list_x_test))
# for value in pred:
#     print value

Exemplo n.º 53
0

# for value in X:
#     print value.shape
#
# print X_train.shape
# print y_train.shape
#
# print type(X_train)

# for value in y_train:
#     print value
#
# for i in range(0, len(X_train)):
#     if i == 15:
#         print X_train[i], len(X_train[i])
#         for f in X_train[i]:
#             print len(f)
#             break
#         print y_train[i], len(X_train[i])
#     # break
#

# Time the whole train-and-score run.
start = time()

# Directed chain CRF with max-product inference, trained with Frank-Wolfe.
model = ChainCRF(inference_method='max-product', directed=True)
ssvm = FrankWolfeSSVM(model=model, C=1.0, max_iter=10)

ssvm.fit(X_train, y_train)

# Report the test score together with the elapsed time.
print 'accuracy of linear-crf %f:' % ssvm.score(X_test, y_test), ' time spend: %f' %(time()-start)
Exemplo n.º 54
0
letters = load_letters()
X, y, folds = letters["data"], letters["labels"], letters["folds"]
# we convert the lists to object arrays, as that makes slicing much more
# convenient
X, y = np.array(X), np.array(y)
# Fold 1 is used for training, all other folds for testing.
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=0.1)
# flatten input: the SVM is fitted on all sequences stacked together
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequences: pick n_words random test words (fixed seed) and
# size the subplot grid by the longest selected word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
Exemplo n.º 55
0
class PassageTagger(object):
  """Tag the clauses of a passage with a CRF tagger or a structured SVM.

  With ``algorithm="crf"`` the work is delegated to ``Trainer`` / ``Tagger``
  objects that are fed ``ItemSequence`` instances.  Any other value selects
  a ``ChainCRF`` model fitted with ``FrankWolfeSSVM``; the fitted model and
  its feature/label indices are pickled to disk after training and loaded
  back when the tagger is created with ``do_train=False``.
  """

  def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Create the trainer (``do_train=True``) or load/create the tagger.

    ``trained_model_name`` is the path the model is written to by ``train``
    and read from when tagging.
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
      if do_train:
        self.trainer = Trainer()
      else:
        self.tagger = Tagger()
    else:
      if do_train:
        model = ChainCRF()
        self.trainer = FrankWolfeSSVM(model=model)
        # Both indices are filled in by train().
        self.feat_index = {}
        self.label_index = {}
      else:
        self.tagger = self._load_pickle(self.trained_model_name)
        self.feat_index = self._load_pickle("ssvm_feat_index.pkl")
        label_index = self._load_pickle("ssvm_label_index.pkl")
        # Predictions come back as integer ids; invert the index to map
        # them back to label strings.
        self.rev_label_index = {i: x for x, i in label_index.items()}

  @staticmethod
  def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were written by train() or come from a trusted source.
    with open(path, "rb") as handle:
      return pickle.load(handle)

  @staticmethod
  def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing (and thereby flushing) the file."""
    with open(path, "wb") as handle:
      pickle.dump(obj, handle)

  def _vectorize(self, feat_seqs):
    """Turn sequences of feature-count dicts into a list of numpy arrays.

    Each dict becomes a row with one column per entry of ``self.feat_index``;
    features missing from the index are ignored.
    """
    n_features = len(self.feat_index)
    Xs = []
    for fs in feat_seqs:
      X = []
      for feat_dict in fs:
        x = [0] * n_features
        for f in feat_dict:
          if f in self.feat_index:
            x[self.feat_index[f]] = feat_dict[f]
        X.append(x)
      Xs.append(numpy.asarray(X))
    return Xs

  def read_input(self, filename):
    """Read blank-line separated sequences of clauses from a UTF-8 file.

    When ``self.do_train`` is set each non-blank line must be
    ``clause<TAB>label``; otherwise the whole line is the clause.

    Returns ``(str_seqs, feat_seqs, label_seqs)``: the clauses, a dict of
    feature counts per clause, and the labels (empty lists when not
    training), each grouped by sequence.
    """
    str_seqs = []
    feat_seqs = []
    label_seqs = []
    str_seq = []
    feat_seq = []
    label_seq = []
    # The context manager closes the file even if a line is malformed.
    with codecs.open(filename, "r", "utf-8") as infile:
      for line in infile:
        lnstrp = line.strip()
        if lnstrp == "":
          # A blank line ends the current sequence; consecutive blank
          # lines must not create empty sequences.
          if str_seq:
            str_seqs.append(str_seq)
            feat_seqs.append(feat_seq)
            label_seqs.append(label_seq)
            str_seq = []
            feat_seq = []
            label_seq = []
          continue
        if self.do_train:
          clause, label = lnstrp.split("\t")
          label_seq.append(label)
        else:
          clause = lnstrp
        str_seq.append(clause)
        # Count how often each feature occurs in the clause.
        feat_dict = {}
        for f in self.fp.get_features(clause):
          feat_dict[f] = feat_dict.get(f, 0) + 1
        feat_seq.append(feat_dict)
    # The file may not end with a blank line: keep the last sequence too.
    if str_seq:
      str_seqs.append(str_seq)
      feat_seqs.append(feat_seq)
      label_seqs.append(label_seq)
    return str_seqs, feat_seqs, label_seqs

  def predict(self, feat_seqs):
    """Return one list of predicted labels per sequence in ``feat_seqs``."""
    # sys.stderr.write works on both Python 2 and 3, unlike ``print >>``.
    sys.stderr.write("Tagging %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      self.tagger.open(self.trained_model_name)
      return [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    pred_ind_seqs = self.tagger.predict(self._vectorize(feat_seqs))
    return [[self.rev_label_index[pred_ind] for pred_ind in ps]
            for ps in pred_ind_seqs]

  def train(self, feat_seqs, label_seqs):
    """Fit the model on the given sequences and persist it to disk.

    The CRF trainer writes the model to ``self.trained_model_name``; the
    structured SVM is pickled to that path, with its feature and label
    indices stored in ``ssvm_feat_index.pkl`` and ``ssvm_label_index.pkl``.
    """
    sys.stderr.write("Training on %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
      return

    # Assign a column to every feature, in order of first appearance.
    for fs in feat_seqs:
      for feat_dict in fs:
        for f in feat_dict:
          if f not in self.feat_index:
            self.feat_index[f] = len(self.feat_index)
    Xs = self._vectorize(feat_seqs)

    # Assign an integer id to every label, in order of first appearance.
    for ls in label_seqs:
      for label in ls:
        if label not in self.label_index:
          self.label_index[label] = len(self.label_index)
    Ys = [numpy.asarray([self.label_index[label] for label in ls])
          for ls in label_seqs]

    self.trainer.fit(Xs, Ys)
    self._dump_pickle(self.trainer, self.trained_model_name)
    self._dump_pickle(self.feat_index, "ssvm_feat_index.pkl")
    self._dump_pickle(self.label_index, "ssvm_label_index.pkl")
Exemplo n.º 56
0
# Compare several structured-SVM learners on the same multi-class model.
# Scale the inputs by a constant factor of 16 (presumably the maximum
# feature value of the dataset loaded earlier -- TODO confirm).
X = X / 16.
#y = y.astype(np.int) - 1
X_train, X_test, y_train, y_test = train_test_split(X, y)

# we add a constant 1 feature for the bias
X_train_bias = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test_bias = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

# A single model instance is shared by every learner created below.
model = MultiClassClf(n_features=X_train_bias.shape[1], n_classes=10)
n_slack_svm = NSlackSSVM(model, verbose=2, check_constraints=False, C=0.1,
                         batch_size=100, tol=1e-2)
one_slack_svm = OneSlackSSVM(model, verbose=2, C=.10, tol=.001)
subgradient_svm = SubgradientSSVM(model, C=0.1, learning_rate=0.000001,
                                  max_iter=1000, verbose=0)

# Two Frank-Wolfe learners that differ only in the batch_mode flag.
fw_bc_svm = FrankWolfeSSVM(model, C=.1, max_iter=50)
fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)

# n-slack cutting plane ssvm
# Only fit() is timed; prediction happens after the clock is read.
start = time()
n_slack_svm.fit(X_train_bias, y_train)
time_n_slack_svm = time() - start
y_pred = np.hstack(n_slack_svm.predict(X_test_bias))
# The score is the fraction of test samples predicted correctly.
print("Score with pystruct n-slack ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_n_slack_svm))

## 1-slack cutting plane ssvm
start = time()
one_slack_svm.fit(X_train_bias, y_train)
time_one_slack_svm = time() - start
# NOTE(review): the example appears to be cut off here in this copy; the
# 1-slack score is never printed and the subgradient / Frank-Wolfe learners
# defined above are not used in the visible code.
y_pred = np.hstack(one_slack_svm.predict(X_test_bias))
def _lexicon_flags(lexicon, candidate):
    """Return one 0/1 flag per lexicon entry, 1 where the entry equals candidate.

    A candidate of None (word too short for the affix, or position outside
    the sentence) yields all zeros.
    """
    if candidate is None:
        return [0 for _ in lexicon]
    return [1 if entry == candidate else 0 for entry in lexicon]


def _tag_at(tags, position):
    """Return the POS tag at position, or None when it is outside the sentence."""
    if 0 <= position < len(tags):
        return tags[position]
    return None


def _morph_flags(word):
    """Return the nine 0/1 morphological flags describing a non-empty word."""
    length = len(word)
    capitals = sum(1 for letter in word if letter.isupper())
    lowers = sum(1 for letter in word if letter.islower())
    return [
        int(word[0].isupper()),                              # first letter is capital
        int(word[0].islower() and capitals > 0),             # capitals, except 1st letter
        int(capitals == length),                             # all letters are capitals
        int(lowers == length),                               # all letters are lower case
        int(len(re.findall(r"\d", word)) == length),         # all characters are digits
        int(len(re.findall(r"[a-zA-Z]", word)) == length),   # all characters are letters
        int(len(re.findall(r"[.]", word)) > 0),              # contains a '.'
        int(len(re.findall(r"[-]", word)) > 0),              # contains a '-'
        int(len(re.findall(r'''[][,;"'?():_`]''', word)) > 0),  # other punctuation
    ]


def _word_feature_vector(index, word, tags, lexicons):
    """Build the flat feature list for the word at index of a tagged sentence.

    The layout is: POS flags of the current, previous, 2nd previous, next and
    2nd next word, morphological flags, frequent-term flags, prefix flags of
    length 1-3 and suffix flags of length 1-3.
    """
    pos_lexicon = lexicons["pos"]
    features = []
    for offset in (0, -1, -2, 1, 2):
        features += _lexicon_flags(pos_lexicon, _tag_at(tags, index + offset))
    features += _morph_flags(word)
    features += _lexicon_flags(lexicons["term"], word.lower())
    # An affix of length n only exists when the word has at least n characters.
    features += _lexicon_flags(lexicons["prefix1"], word[0])
    features += _lexicon_flags(lexicons["prefix2"], word[:2] if len(word) > 1 else None)
    features += _lexicon_flags(lexicons["prefix3"], word[:3] if len(word) > 2 else None)
    features += _lexicon_flags(lexicons["suffix1"], word[-1])
    features += _lexicon_flags(lexicons["suffix2"], word[-2:] if len(word) > 1 else None)
    features += _lexicon_flags(lexicons["suffix3"], word[-3:] if len(word) > 2 else None)
    return features


def _sentence_feature_matrix(text, lexicons):
    """Tokenize and POS-tag text; return (words, array of shape (n_words, n_features)).

    For a text without tokens the returned word list is empty and the array
    must not be used.
    """
    words = nltk.word_tokenize(text)
    tags = [tag for _, tag in nltk.pos_tag(words)]
    rows = [_word_feature_vector(i, w, tags, lexicons) for i, w in enumerate(words)]
    return words, np.array(rows, dtype=np.float64)


def _iob_label(word, term_word_lists, last_label):
    """Return the IOB label of word: 1 ("B"), 2 ("I") or 0 ("O").

    term_word_lists holds the whitespace-split aspect terms of the sentence
    and last_label is the label given to the previous word (None at the
    start of a sentence).
    """
    lowered = word.lower()
    for term_words in term_word_lists:
        for term_index, term in enumerate(term_words):
            if lowered != term:
                continue
            if term_index == 0:
                return 1
            if last_label in (1, 2):
                return 2
            # A continuation word that does not follow a term word is "O";
            # from here on it can only become "B" through another term.
            last_label = 0
    return 0


def _emit_aspect_term(instance, term):
    """Add term, with its offsets in the lower-cased text, to instance."""
    start, end = find_offsets(instance.text.lower(), term)
    instance.add_aspect_term(term=term, offsets={'from': str(start), 'to': str(end)})


def classify(traincorpus, testcorpus):
    """Train a chain CRF aspect-term tagger and annotate the test corpus.

    Every word of every training sentence is turned into a binary feature
    vector (lexicon, morphological and POS-context features) and labelled
    with the IOB scheme from the sentence's aspect terms.  The fitted model
    then tags the test sentences and the predicted terms replace the
    ``aspect_terms`` of each test instance.

    Sentences without any token are skipped during training and receive no
    aspect terms at test time.  A term that runs up to the last word of a
    sentence is emitted as well, and an "I" tag that does not continue a
    term no longer causes a stale or empty term to be added.

    Returns ``testcorpus.corpus`` with the predicted aspect terms.
    """
    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)

    lexicons = {
        name: load_lexicon("lexica/restaurants/ote/" + name)
        for name in ("pos", "term", "prefix1", "prefix2", "prefix3",
                     "suffix1", "suffix2", "suffix3")
    }

    train_sentences = []  # one (n_words, n_features) array per sentence
    sentence_labels = []  # one int array of IOB labels per sentence

    print('Creating train feature vectors...')

    for instance in traincorpus.corpus:
        words, features = _sentence_feature_matrix(instance.text, lexicons)
        if not words:
            continue  # nothing to learn from a sentence without tokens

        term_word_lists = [term.split() for term in set(instance.get_aspect_terms())]
        labels = []
        last_label = None
        for word in words:
            last_label = _iob_label(word, term_word_lists, last_label)
            labels.append(last_label)

        train_sentences.append(features)
        sentence_labels.append(np.array(labels, dtype=np.int64))

    # the chain-crf needs a list (representing the sentences), that
    # contains a 2d-array(n_words, n_features), which in turn contains the
    # features extracted from each word. the sentence labels must be
    # an array of type int
    ssvm.fit(train_sentences, sentence_labels)

    print('Done!')
    print('Creating test feature vectors...')

    test_sentences = []
    tagged_instances = []  # instances aligned with test_sentences
    for instance in testcorpus.corpus:
        words, features = _sentence_feature_matrix(instance.text, lexicons)
        if not words:
            continue
        test_sentences.append(features)
        tagged_instances.append(instance)

    print('Done!')
    print('Predicting aspect terms...')

    # the predict function returns a list (symbolizing the sentences),
    # which contains the predicted label for each word
    predictions = ssvm.predict(test_sentences)

    for instance in testcorpus.corpus:
        instance.aspect_terms = []

    for instance, sentence_predictions in zip(tagged_instances, predictions):
        lowered_text = instance.text.lower()
        predicted_term = ""
        last_prediction = None
        for word_index, word_prediction in enumerate(sentence_predictions):
            if word_prediction == 1:
                # "B" closes any open term and starts a new one.
                if predicted_term:
                    _emit_aspect_term(instance, predicted_term)
                predicted_term = find_term(lowered_text, word_index)
                last_prediction = 1
            elif word_prediction == 2:
                # "I" extends the term only when it follows "B" or "I".
                if last_prediction in (1, 2):
                    c = find_term(lowered_text, word_index)
                    if len(predicted_term) > 0:
                        predicted_term = predicted_term + " " + c
                    else:
                        predicted_term = c
                last_prediction = 2
            elif word_prediction == 0:
                # "O" closes the open term, if any.
                if predicted_term:
                    _emit_aspect_term(instance, predicted_term)
                    predicted_term = ""
                last_prediction = 0
        # A term that reaches the end of the sentence is still open here.
        if predicted_term:
            _emit_aspect_term(instance, predicted_term)

    print('Done!')
    return testcorpus.corpus
Exemplo n.º 58
0
# Fit a chain CRF on the first 70% of the data and report the training score.
# print("Shuffle results")
# features, labels = util.shuffle(features, labels)


# 70% of the sequences go to training, the rest to testing.
trsize = int(0.7*len(labels))
# NOTE(review): the slices below start at 1 and at trsize+1, so the samples
# at index 0 and at index trsize end up in neither set -- this looks like an
# off-by-one ([:trsize] / [trsize:] would use every sample); confirm whether
# skipping them is intended.
X_train = features[1:trsize]
y_train = labels[1:trsize]

X_test = features[trsize+1:]
y_test = labels[trsize+1:]

# X_train = X_test = features
# y_train = y_test = labels
# trsize = len(labels)

# Evaluate the chain
model = ChainCRF()
C=0.0001
max_iter=50
ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True)
print(ssvm)
# Prints whatever fit() returns, then the learned weight vector.
print(ssvm.fit(X_train, y_train))
print(ssvm.w)
# Only the training score is computed; the test-set evaluation is disabled.
trscore = ssvm.score(X_train,y_train)
# testscore = ssvm.score(X_test,y_test)
print("Training score: {0}".format(trscore))
# print("Test score: {0}".format(testscore))

# Save the result
# util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)