def chain_crf():
    letters = load_letters()
    x, y, folds = letters['data'], letters['labels'], letters['folds']
    print "Letters : "
    print letters
    # print "Data : "
    # print letters['data']
    # print "Labels : "
    # print letters['labels']
    x, y = np.array(x), np.array(y)
    x_train, x_test = x[folds == 1], x[folds != 1]
    y_train, y_test = y[folds == 1], y[folds != 1]
    print len(x_train)
    print len(x_test)
    print "Done"
    print x_train[0].shape
    print y_train[0].shape
    print x_train[10].shape
    print y_train[10].shape

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    print ssvm.fit(x_train, y_train)
    print ssvm.score(x_test, y_test)
def cross_val(self, X_train, y_train):
    ''' method to conduct 5-fold cross validation '''
    kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
    for train_idx, test_idx in kf:
        xtrain, xval = X_train[train_idx], X_train[test_idx]
        ytrain, yval = y_train[train_idx], y_train[test_idx]
        model = ChainCRF()
        ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
        ssvm.fit(xtrain, ytrain)
        print ssvm.score(xval, yval)
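# KFold(len(X_train), n_folds=5, ...) above is the pre-0.18 scikit-learn signature.
# A minimal sketch of the same 5-fold loop with the current sklearn.model_selection API;
# X_train / y_train are assumed to be the same object arrays used by cross_val above.
from sklearn.model_selection import KFold
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM

kf = KFold(n_splits=5, shuffle=False)
for train_idx, test_idx in kf.split(X_train):
    xtrain, xval = X_train[train_idx], X_train[test_idx]
    ytrain, yval = y_train[train_idx], y_train[test_idx]
    ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.5, max_iter=15)
    ssvm.fit(xtrain, ytrain)
    print(ssvm.score(xval, yval))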
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds  # X is a numpy array of samples; each sample has the shape (n_letters, n_features)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
class CRFTrainer(object):
    # define an init function to initialize the values
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        # using chain crf to analyze the data, so add an error check for this:
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            # define the classifier to use with CRF model
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=100)
        else:
            raise TypeError('Invalid classifier type')

    def load_clean_data(self):
        '''
        load the data into X and y, where X is a numpy array of samples
        where each sample has the shape (n_letters, n_features)
        '''
        df = featurize.get_data()
        featurize.split_words(df)
        featurize.first_letter_uppercase(df)
        featurize.has_number(df)
        featurize.has_slash(df)
        featurize.spacy_pos_tagger(df)
        featurize.pos_ngrams(df)
        featurize.encoding_labels(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        ''' method to conduct 5-fold cross validation '''
        kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
        for train_idx, test_idx in kf:
            xtrain, xval = X_train[train_idx], X_train[test_idx]
            ytrain, yval = y_train[train_idx], y_train[test_idx]
            model = ChainCRF()
            ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
            ssvm.fit(xtrain, ytrain)
            print ssvm.score(xval, yval)

    def train(self, X_train, y_train):
        ''' training method '''
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        ''' method to evaluate the performance of the model '''
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        ''' method to run the classifier on input data '''
        return self.clf.predict(input_data)[0]
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=50)

    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        return self.clf.predict(input_data)[0]


def convert_to_letters(indices):
    alphabets = np.array(list(string.ascii_lowercase))
    output = np.take(alphabets, indices)
    output = ''.join(output)
    return output
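# A hypothetical driver for the CRFModel class above, tying classify() to
# convert_to_letters(); it assumes the usual load_letters fold layout
# (train on fold 1, test on the rest) used throughout these snippets.
crf = CRFModel(c_val=1.0)
X, y, folds = crf.load_data()
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

crf.train(X_train, y_train)
print('accuracy:', crf.evaluate(X_test, y_test))

# decode one predicted word back into letters
predicted = crf.classify([X_test[0]])
print('truth:    ', convert_to_letters(y_test[0]))
print('predicted:', convert_to_letters(predicted))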
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds
        # X is a numpy array of samples where each sample
        # has the shape (n_letters, n_features)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
def n_cross_valid_crf(X, Y, K, command):
    # cross validation for crf
    if command == 'write_results':
        list_write = list()

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        print 'Accuracy of linear-crf %f:' % ssvm.score(x_test, y_test)
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif command == 'write_results':
            list_write += write_results_CRF(testcv, y_test, y_pred)
        print '------------------------------------------------------'
        print '------------------------------------------------------'

    if command == 'write_results':
        list_write = sorted(list_write, key=itemgetter(0))  # sorted list based on index
        for value in list_write:
            pred_list = value[1]
            test_list = value[2]
            for i in range(0, len(pred_list)):
                print str(pred_list[i]) + '\t' + str(test_list[i])
def structraining(self, bags, mentions, retweets, labels):
    total_datas = []
    total_labels = []
    print('num_user', len(bags.keys()))
    for user_id, bag in bags.items():
        if not user_id in labels:
            continue
        features = np.empty((0, self.top_seq))
        edge_nodes = np.empty((0, 2))
        edge_features = np.empty((0, 1))
        clique_labels = np.array([labels[user_id]])
        features = np.vstack([features, bag])

        mentioned_ids = mentions[user_id]
        cnt = 0
        # iterate over the ids directly; enumerate() here would yield (index, id)
        # tuples and the membership tests below would never match
        for mentioned_id in mentioned_ids:
            if not mentioned_id in labels:
                continue
            clique_labels = np.append(clique_labels, np.array([labels[mentioned_id]]))
            if mentioned_id in bags:
                features = np.vstack([features, bags[mentioned_id]])
            else:
                features = np.vstack([features, np.zeros(self.top_seq)])
            edge_nodes = np.vstack([edge_nodes, np.array([0, cnt + 1])])
            edge_features = np.vstack([edge_features, np.array([[0]])])
            cnt += 1

        num_mentioned = edge_nodes.shape[0]
        retweet_ids = retweets[user_id]
        cnt = 0
        for retweet_id in retweet_ids:
            if not retweet_id in labels:
                continue
            clique_labels = np.append(clique_labels, np.array([labels[retweet_id]]))
            if retweet_id in bags:
                features = np.vstack([features, bags[retweet_id]])
            else:
                features = np.vstack([features, np.zeros(self.top_seq)])
            edge_nodes = np.vstack([edge_nodes, np.array([0, cnt + 1 + num_mentioned])])
            edge_features = np.vstack([edge_features, np.array([[1]])])
            cnt += 1

        total_datas.append((features, edge_nodes.astype(int), edge_features))
        total_labels.append(clique_labels)

    ratio = len(total_datas) * 0.7
    ratio = int(ratio)
    print(ratio)
    X_train, y_train = total_datas[:ratio], total_labels[:ratio]
    X_test, y_test = total_datas[ratio:], total_labels[ratio:]

    model = EdgeFeatureGraphCRF(inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
    ssvm.fit(X_train, y_train)
    result = ssvm.score(X_test, y_test)
    print(result)
def test_svm_as_crf_pickling_batch():
    iris = load_iris()
    X, y = iris.data, iris.target
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=np.int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)
    _, file_name = mkstemp()

    pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
    logger = SaveLogger(file_name)
    svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50, batch_mode=False)
    svm.fit(X_train, y_train)

    assert_less(.97, svm.score(X_test, y_test))
    assert_less(.97, logger.load().score(X_test, y_test))
def test_svm_as_crf_pickling_bcfw():
    iris = load_iris()
    X, y = iris.data, iris.target
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=np.int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)
    _, file_name = mkstemp()

    pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
    logger = SaveLogger(file_name)
    svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50)
    svm.fit(X_train, y_train)

    assert_less(.97, svm.score(X_test, y_test))
    assert_less(.97, logger.load().score(X_test, y_test))
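# The two pickling tests above reuse GraphCRF as a plain multiclass classifier by
# wrapping every iris row as a one-node graph with an empty edge list; a stripped-down
# sketch of that wrapping outside the test harness (illustrative only).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from pystruct.models import GraphCRF
from pystruct.learners import FrankWolfeSSVM

iris = load_iris()
# one node per sample, no edges, and a length-1 label vector per sample
X_graphs = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in iris.data]
Y = iris.target.reshape(-1, 1)

X_tr, X_te, y_tr, y_te = train_test_split(X_graphs, Y, random_state=1)
svm = FrankWolfeSSVM(GraphCRF(n_features=4, n_states=3, inference_method='unary'),
                     C=10, max_iter=50)
svm.fit(X_tr, y_tr)
print(svm.score(X_te, y_te))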
def MLfitCRF(data_train, data_test, records, folds):
    fvector = np.array([data_train[0]])
    labels = np.array([data_train[1]])

    # create CRF model
    CRFmodel = ChainCRF()
    # create ML classifier
    ssvm = FrankWolfeSSVM(model=CRFmodel, C=0.1)
    # training
    ssvm.fit(fvector, labels)

    # model testing
    fvector_test = np.array(data_test[0])
    labels_test = np.array(data_test[1])
    score = ssvm.score(fvector_test, labels_test)  # was fvector_train, which is undefined
    print score
    return
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X_training, Y_training)
    y_pred = ssvm.predict(X_testing)

    list_write = list()
    print 'Accuracy of linear-crf %f:' % ssvm.score(X_testing, Y_testing)
    if command == 'metrics_F1':
        metrics_crf(Y_testing, y_pred)
    elif command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, y_pred)
    elif command == 'write_results':
        list_write = write_CRFs_compare(Y_testing, y_pred)
        for value in list_write:
            pred_list = value[0]
            test_list = value[1]
            for i in range(0, len(pred_list)):
                print str(pred_list[i]) + '\t' + str(test_list[i])
def chaincrf_test():
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)
    print X.shape
    print Y.shape

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "
    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print X.shape
    print Y.shape

    train_pct = 0.66
    # split with an integer index; the original sliced with math.floor (a float)
    # and started the test set at test_pct * num_pics, overlapping the training data
    split = int(math.floor(train_pct * num_pics))
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split], Y[split:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print outstr
    print results
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=100)

    # Load the training data
    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    # Train the CRF
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    # Evaluate the accuracy of the CRF
    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the CRF on unknown data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
for train_index, test_index in kf.split(X):
    print(' ')
    print('train index {}'.format(train_index))
    print('test index {}'.format(test_index))
    print('{} jackets for training, {} for testing'.format(
        len(train_index), len(test_index)))
    X_train = X[train_index]
    Y_train = Y[train_index]
    X_test = X[test_index]
    Y_test = Y[test_index]

    """ YOUR S-SVM TRAINING CODE HERE """
    ssvm.fit(X_train, Y_train)

    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = ssvm.predict(X_test)
    wrong_segments_crf.append(np.sum(Y_pred != Y_test))
    score = ssvm.score(X_test, Y_test)
    scores_crf[fold] = score

    """ figure showing the result of classification of segments for
        each jacket in the testing part of present fold """
    if plot_labeling:
        for ti, pred in zip(test_index, Y_pred):
            print(ti)
            print(pred)
            s = segments[ti]
            plot_segments(s,
                          caption='SSVM predictions for jacket ' + str(ti + 1),
                          labels_segments=pred)

    """ YOUR LINEAR SVM TRAINING CODE HERE """
    svm.fit(X_train.reshape((-1, num_features)), Y_train.reshape((-1)))

    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = svm.predict(X_test.reshape((-1, num_features))).reshape(
# print("Shuffle results") # features, labels = util.shuffle(features, labels) trsize = int(0.7*len(labels)) X_train = features[1:trsize] y_train = labels[1:trsize] X_test = features[trsize+1:] y_test = labels[trsize+1:] # X_train = X_test = features # y_train = y_test = labels # trsize = len(labels) # Evaluate the chain model = ChainCRF() C=0.0001 max_iter=50 ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True) print(ssvm) print(ssvm.fit(X_train, y_train)) print(ssvm.w) trscore = ssvm.score(X_train,y_train) # testscore = ssvm.score(X_test,y_test) print("Training score: {0}".format(trscore)) # print("Test score: {0}".format(testscore)) # Save the result # util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train LCCRF
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(X_train, y_train)

# Train LCCRF+NN
chain_model = ChainCRF(directed=True)
chain_ssvm_nn = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm_nn.fit(nn_predictions_train, y_train)

print("Test score with linear NN: 84.15%")
print("Test score with LCCRF: %f" % chain_ssvm.score(X_test, y_test))
print("Test score with LCCRF+NN: %f" % chain_ssvm_nn.score(nn_predictions_test, y_test))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'NN', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'LCCRF+NN', color="#FFD700", size=25)
class CRFClassifierText(object):
    IGNORE_IF = re.compile(r'(in press|submitted|to appear)', flags=re.IGNORECASE)
    QUOTES_AROUND_ETAL_REMOVE = re.compile(r'(.*)(")(et al\.?)(")(.*)', re.IGNORECASE)
    TO_ADD_DOT_AFTER_INITIALS = re.compile(r'\b([A-Z]{1}(?!\.))([\s,]+)([A-Z12(]|and)')
    TO_ADD_SEPARATE_INITIALS = re.compile(r'\b([A-Z]{1})([A-Z]{1})([,\s]{1})')
    SEPARATE_AUTHOR = re.compile(r'^((.*?)([\d\":]+))(.*)$')
    TO_REMOVE_HYPEN_NEAR_INITIAL = [
        re.compile(r'([A-Z]\.)(\-)([A-Z]\.)'),
        re.compile(r'([A-Z])(\-)(\.)'),
        re.compile(r'([A-Z])(\-)([A-Z])\b')
    ]
    URL_EXTRACTOR = re.compile(r'((url\s*)?(http)s?://[A-z0-9\-\.\/\={}?&%]+)', re.IGNORECASE)
    MONTH_NAME_EXTRACTOR = re.compile(
        r'\b([Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|([Nn]ov|[Dd]ec)(?:ember)?)\b'
    )
    URL_TO_DOI = re.compile(
        r'((url\s*)?(https\s*:\s*//\s*|http\s*:\s*//\s*)((.*?)doi(.*?)org/))|(DOI:https\s*://\s*)',
        flags=re.IGNORECASE)
    URL_TO_ARXIV = re.compile(
        r'((url\s*)?(https://|http://)(arxiv.org/(abs|pdf)/))', flags=re.IGNORECASE)
    URL_TO_ASCL = re.compile(r'((url\s*)?(https://|http://)(ascl.net/))', flags=re.IGNORECASE)
    ADD_COLON_TO_IDENTIFIER = re.compile(r'(\s+(DOI|arXiv|ascl))(:?\s*)', flags=re.IGNORECASE)
    IS_START_WITH_YEAR = re.compile(r'(^[12][089]\d\d)')
    START_WITH_AUTHOR = re.compile(r'([A-Za-z].*$)')
    WORD_BREAKER_REMOVE = [re.compile(r'([A-Za-z]+)([\-]+\s+)([A-Za-z]+)')]
    TOKENS_NOT_IDENTIFIED = re.compile(r'\w+\b(?!\|)')
    REFERENCE_TOKENIZER = re.compile(r'([\s.,():;\[\]\'\"#\/])')
    TAGGED_MULTI_WORD_TOKENIZER = re.compile(r'([\s.,])')
    # is all capital
    IS_ALL_CAPITAL = re.compile(r'^([A-Z]+)$')
    # is only the first character capital
    IS_FIRST_CAPITAL = re.compile(r'^([A-Z][a-z]+)$')
    # is alphabet only, consider hyphenated words also
    IS_ALPHABET = re.compile(r'^(?=.*[a-zA-Z])([a-zA-Z\-]+)$')
    # is numeric only, consider the page range with - being also numeric
    # also include arxiv id with a dot to be numeric
    # note that this differs from function is_numeric in the
    # sense that this recognizes numeric even if it was not identified/tagged
    IS_NUMERIC = re.compile(r'^(?=.*[0-9])([0-9\-\.]+)$')
    # is alphanumeric, must have at least one digit and one alphabet character
    IS_ALPHANUMERIC = re.compile(r'^(?=.*[0-9])(?=.*[a-zA-Z])([a-zA-Z0-9]+)$')
    ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS = re.compile(r'(\|[a-z\_]+\|)(\|[a-z\_]+\|)')
    REGEX_PATTERN_WHOLE_WORD_ONLY = r'(?:\b|\B)%s(?:\b|\B)'

    nltk_tagger = None
    crf = None
    X = y = label_code = folds = None

    def __init__(self):
        """
        """
        self.originator_token = OriginatorToken(self.REFERENCE_TOKENIZER)
        self.numeric_token = NumericToken()
        self.pub_token = PubToken()
        self.unknown_tokens = []
        self.filename = os.path.dirname(__file__) + '/serialized_files/crfModelText.pkl'

    def create_crf(self):
        """
        :return:
        """
        # to load nltk tagger, a time consuming, one time needed operation
        self.nltk_tagger = nltk.tag._get_tagger()
        self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50)
        self.X, self.y, self.label_code, self.folds, generate_fold = self.load_training_data()

        score = 0
        # only need to iterate through if fold was generated
        num_tries = 10 if generate_fold else 1
        while (score <= 0.90) and (num_tries > 0):
            try:
                X_train, y_train = self.get_train_data()
                self.train(X_train, y_train)
                X_test, y_test = self.get_test_data()
                score = self.evaluate(X_test, y_test)
            except Exception as e:
                current_app.logger.error('Exception: %s' % (str(e)))
                current_app.logger.error(traceback.format_exc())
                pass
            num_tries -= 1
        return (score > 0)

    def format_training_data(self, the_data):
        """
        :param the_data:
        :return:
        """
        # get label, word in the original presentation
        labels = [[elem[0] for elem in ref] for ref in the_data]
        words = [[elem[1] for elem in ref] for ref in the_data]

        # count how many unique labels there are, return a dict to convert from words to numeric words
        label_code = self.encoder(labels)

        numeric_labels = []
        features = []
        for label, word in zip(labels, words):
            # replace of numeric words for the original presentation of label
            numeric_label = []
            for l in label:
                numeric_label.append(label_code[l])
            numeric_labels.append(np.array(numeric_label))

            # get the numeric features for the original presentation of word and insert at index of label
            feature = []
            for idx in range(len(word)):
                feature.append(self.get_data_features(word, idx, label))
            features.append(np.array(feature))
        return features, numeric_labels, label_code

    def get_num_states(self):
        """
        :return:
        """
        num_states = len(np.unique(np.hstack([y for y in self.y[self.folds != 0]])))
        current_app.logger.debug("number of states = %s" % num_states)
        return num_states

    def get_folds_array(self, filename):
        """
        read the distribution of train and test indices from file
        :param filename:
        :return:
        """
        with open(filename, 'r') as f:
            reader = f.readlines()
            for line in reader:
                if line.startswith("STATIC_FOLD"):
                    try:
                        return eval(line.split(" = ")[1])
                    except:
                        return None

    def get_train_data(self):
        """
        :return:
        """
        return self.X[self.folds != 0], self.y[self.folds != 0]

    def get_test_data(self):
        """
        :return:
        """
        return self.X[self.folds == 0], self.y[self.folds == 0]

    def train(self, X_train, y_train):
        """
        :param X_train: is a numpy array of samples where each sample
                        has the shape (n_labels, n_features)
        :param y_train: is numpy array of labels
        :return:
        """
        self.crf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """
        :param X_test:
        :param y_test:
        :return:
        """
        return self.crf.score(X_test, y_test)

    def decoder(self, numeric_label):
        """
        :param numeric_label:
        :return:
        """
        labels = []
        for nl in numeric_label:
            key = next(key for key, value in self.label_code.items() if value == nl)
            labels.append(key)
        return labels

    def encoder(self, labels):
        """
        :param labels:
        :return: dict of labels as key and numeric value is its value
        """
        # assign a numeric value to each label
        label_code = {}
        numeric = -1
        for label in labels:
            for l in label:
                if (numeric >= 0 and l in label_code):
                    continue
                else:
                    numeric = numeric + 1
                    label_code[l] = numeric
        return label_code

    def load_training_data(self):
        """
        load training/test data
        :return:
        """
        training_files_path = os.path.dirname(__file__) + '/training_files/'
        arXiv_text_ref_filenames = [
            training_files_path + 'arxiv.raw',
        ]
        references = []
        for f in arXiv_text_ref_filenames:
            references = references + get_arxiv_tagged_data(f)

        X, y, label_code = self.format_training_data(references)

        # for now use static division. see comments in foldModelText.dat
        generate_fold = False
        if generate_fold:
            folds = list(np.random.choice(range(0, 9), len(y)))
        else:
            folds = self.get_folds_array(training_files_path + 'foldModelText.dat')

        return np.array(X, dtype=object), np.array(y, dtype=object), label_code, np.array(folds), generate_fold

    def save(self):
        """
        save object to a pickle file
        :return:
        """
        try:
            with open(self.filename, "wb") as f:
                pickler = pickle.Pickler(f, -1)
                pickler.dump(self.crf)
                pickler.dump(self.label_code)
                pickler.dump(self.nltk_tagger)
            current_app.logger.info("saved crf in %s." % self.filename)
            return True
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())
            return False

    def load(self):
        """
        :return:
        """
        try:
            with open(self.filename, "rb") as f:
                unpickler = pickle.Unpickler(f)
                self.crf = unpickler.load()
                self.label_code = unpickler.load()
                self.nltk_tagger = unpickler.load()
            current_app.logger.info("loaded crf from %s." % self.filename)
            return self.crf
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())

    def search(self, pattern, text):
        """
        search whole word only in the text
        :param pattern:
        :param text:
        :return: True/False depending if found
        """
        try:
            return re.search(self.REGEX_PATTERN_WHOLE_WORD_ONLY % pattern, text) is not None
        except:
            return False

    def reference(self, refstr, words, labels):
        """
        put identified words into a dict to be passed out
        :param words:
        :param labels:
        :return:
        """
        ref_dict = {}
        ref_dict['authors'] = self.originator_token.collect_tagged_tokens(words, labels)
        if 'DOI' in labels or 'ARXIV' in labels or 'ASCL' in labels:
            ref_dict.update(self.numeric_token.collect_id_tagged_tokens(words, labels))
        if 'YEAR' in labels:
            ref_dict['year'] = words[labels.index('YEAR')]
        if 'VOLUME' in labels:
            volume = self.numeric_token.collect_tagged_numerals_token(words, labels, 'VOLUME')
            if volume:
                ref_dict['volume'] = volume
        if 'PAGE' in labels:
            page = self.numeric_token.collect_tagged_numerals_token(words, labels, 'PAGE')
            if page:
                ref_dict['page'] = page
        if 'ISSUE' in labels:
            ref_dict['issue'] = words[labels.index('ISSUE')]
        if 'ISSN' in labels:
            ref_dict['ISSN'] = words[labels.index('ISSN')]
        if 'JOURNAL' in labels:
            ref_dict['journal'] = self.pub_token.collect_tagged_journal_tokens(words, labels)
        if 'TITLE' in labels:
            title = self.pub_token.collect_tagged_title_tokens(words, labels)
            if title:
                ref_dict['title'] = title
        ref_dict['refstr'] = refstr
        return ref_dict

    def punctuation_features(self, ref_word, ref_label):
        """
        return a feature vector that has 1 in the first cell if ref_word is a punctuation
        followed by 1 in the position corresponding to which one
        :param ref_word:
        :param ref_label:
        :return:
        """
        which = which_punctuation(ref_word, ref_label)
        return [
            1 if which == 0 else 0,   # 0 if punctuation,
            1 if which == 1 else 0,   # 1 if brackets,
            1 if which == 2 else 0,   # 2 if colon,
            1 if which == 3 else 0,   # 3 if comma,
            1 if which == 4 else 0,   # 4 if dot,
            1 if which == 5 else 0,   # 5 if parenthesis,
            1 if which == 6 else 0,   # 6 if quotes (both single and double),
            1 if which == 7 else 0,   # 7 if num signs,
            1 if which == 8 else 0,   # 8 if hyphen,
            1 if which == 9 else 0,   # 9 if forward slash,
            1 if which == 10 else 0,  # 10 if semicolon,
        ]

    def is_token_unknown(self, ref_word, ref_label):
        """
        :param ref_word:
        :param ref_label:
        :return:
        """
        if ref_label:
            return 1 if ref_label == 'NA' else 0
        if ref_word is None:
            return 0
        return int(any(ref_word == token for token in self.unknown_tokens))

    def length_features(self, ref_word):
        """
        distinguish between token of length 1, and longer
        :param ref_word:
        :return:
        """
        return [1 if len(ref_word) == 1 else 0, 1 if len(ref_word) > 1 else 0]

    def get_data_features(self, ref_word_list, index, ref_label_list=None):
        """
        :param ref_word_list: has the form [e1,e2,e3,..]
        :param index: the position of the word in the set, assume it is valid
        :param ref_label_list: labels for ref_word_list available during training only
        :return:
        """
        ref_word = ref_word_list[index]
        ref_label = ref_label_list[index] if ref_label_list else None
        return \
            self.length_features(ref_word) \
            + self.originator_token.author_features(ref_word_list, ref_label_list, index) \
            + self.pub_token.title_features(ref_word_list, ref_label_list, index) \
            + self.pub_token.journal_features(ref_word_list, ref_label_list, index) \
            + self.numeric_token.numeric_features(ref_word, ref_label) \
            + self.numeric_token.identifying_word_features(ref_word, ref_label) \
            + self.punctuation_features(ref_word, ref_label) \
            + self.pub_token.publisher_features(ref_word, ref_label) \
            + self.originator_token.editor_features(ref_word_list, ref_label_list, index) \
            + [
                int(self.IS_ALL_CAPITAL.match(ref_word) is not None),     # is element all capital
                int(self.IS_FIRST_CAPITAL.match(ref_word) is not None),   # is first character capital
                int(self.IS_ALPHABET.match(ref_word) is not None),        # is alphabet only, consider hyphenated words also
                int(self.IS_NUMERIC.match(ref_word) is not None),         # is numeric only, consider the page range with - being also numeric
                int(self.IS_ALPHANUMERIC.match(ref_word) is not None),    # is alphanumeric, must have at least one digit and one alphabet character
                self.is_token_unknown(ref_word, ref_label),               # is it one of the words unable to guess
                self.pub_token.is_token_stopword(ref_word, ref_label),    # is it one of tagged stopwords
            ]

    def segment(self, reference_str):
        """
        going to attempt and segment the reference string
        each token that is identified is removed from reference_str
        in the reverse order the identified tokens are inserted back to reference_str
        before feature extraction
        :param reference_str:
        :return:
        """
        if isinstance(reference_str, list):
            return []

        # start fresh
        self.numeric_token.clear()
        self.originator_token.clear()
        self.pub_token.clear()

        na_url = None
        na_month = None

        # step 1: remove any non essential tokens (ie, urls, months, etc)
        matches = self.URL_EXTRACTOR.findall(reference_str)
        if len(matches) > 0:
            na_url = []
            for i, url in enumerate(matches, start=1):
                na_url.append(url[0])
                reference_str = reference_str.replace(url[0], '|na_url_%d|' % i)
        extractor = self.MONTH_NAME_EXTRACTOR.search(reference_str)
        if extractor:
            na_month = extractor.group().strip()
            reference_str = reference_str.replace(na_month, '|na_month|')

        # step 2: identify doi/arxiv/ascl
        reference_str = self.numeric_token.segment_ids(reference_str)

        # step 3: identify list of authors and editors
        reference_str = self.originator_token.identify(reference_str)

        # step 4: identify title and journal substrings
        # but first remove any numerical identifying words
        reference_str = self.pub_token.identify(
            self.numeric_token.remove_identifying_words(reference_str).strip(),
            self.nltk_tagger, self.originator_token.indices(),
            self.originator_token.have_editor())

        # step 5: identify year, volume, page, issue
        reference_str = self.numeric_token.segment_numerals(reference_str)

        # collect all tokens that have not been identified
        self.unknown_tokens = self.TOKENS_NOT_IDENTIFIED.findall(reference_str)
        if na_url:
            self.unknown_tokens.append(' '.join(na_url))
        if na_month:
            self.unknown_tokens.append(na_month)

        # now put the identified tokens back into the string, and before tokenizing and sending to crf
        # step 5 reverse
        reference_str = self.numeric_token.assemble_stage1(reference_str)
        # step 4 reverse
        reference_str = self.pub_token.assemble(reference_str)
        # step 3 reverse
        reference_str = self.originator_token.assemble(reference_str)

        # tokenize
        ref_words = list(
            filter(None, [
                w.strip() for w in self.REFERENCE_TOKENIZER.split(
                    self.ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS.sub(
                        r'\1 \2', reference_str))
            ]))

        # step 2 reverse
        ref_words = self.numeric_token.assemble_stage2(ref_words)
        # step 1 reverse
        if na_month:
            ref_words[ref_words.index('|na_month|')] = na_month
        if na_url:
            for i, url in enumerate(na_url, start=1):
                ref_words[ref_words.index('|na_url_%d|' % i)] = url
        return ref_words

    def dots_after_initials(self, reference_str):
        """
        :param reference_str:
        :return:
        """
        try:
            author_part = self.SEPARATE_AUTHOR.search(reference_str).group(1)
            # separate first and middle initials if there are any attached, add dot after each
            # make sure there is a dot after single character, repeat to capture middle name
            reference_str = reference_str.replace(
                author_part,
                self.TO_ADD_SEPARATE_INITIALS.sub(
                    r"\1. \2. \3",
                    self.TO_ADD_DOT_AFTER_INITIALS.sub(
                        r"\1.\2\3",
                        self.TO_ADD_DOT_AFTER_INITIALS.sub(
                            r"\1.\2\3", author_part))))
        except:
            pass
        return reference_str

    def pre_processing(self, reference_str):
        """
        :param reference_str:
        :return:
        """
        # remove any numbering that appears before the reference to start with authors
        # exception is the year
        if self.IS_START_WITH_YEAR.search(reference_str) is None:
            reference_str = self.START_WITH_AUTHOR.search(reference_str).group()
        # also if for some reason et al. has been put in double quotes! remove them
        reference_str = self.QUOTES_AROUND_ETAL_REMOVE.sub(r"\1\3\5", reference_str)
        # if there is a hyphen either between initials, or after initials and before dot, remove it
        for rhni, replace in zip(self.TO_REMOVE_HYPEN_NEAR_INITIAL, [r"\1 \3", r"\1\3", r"\1. \3"]):
            reference_str = rhni.sub(replace, reference_str)
        # add dots after initials, separate first and middle if needed
        reference_str = self.dots_after_initials(reference_str)
        # if no colon after the identifier, add it in
        reference_str = self.ADD_COLON_TO_IDENTIFIER.sub(r"\1:", reference_str)
        # if there is a url for DOI turn it into a recognizable DOI
        reference_str = self.URL_TO_DOI.sub(r"DOI:", reference_str)
        # if there is a url for arxiv turn it into a recognizable arxiv
        reference_str = self.URL_TO_ARXIV.sub(r"arXiv:", reference_str)
        # if there is a url for ascl turn it into a recognizable ascl
        reference_str = self.URL_TO_ASCL.sub(r"ascl:", reference_str)
        for rwb in self.WORD_BREAKER_REMOVE:
            reference_str = rwb.sub(r'\1\3', reference_str)
        return reference_str

    def classify(self, reference_str):
        """
        Run the classifier on input data
        :param reference_str:
        :return: list of words and the corresponding list of labels
        """
        reference_str = self.pre_processing(reference_str)
        ref_words = self.segment(reference_str)
        features = []
        for i in range(len(ref_words)):
            features.append(self.get_data_features(ref_words, i, []))
        ref_labels = self.decoder(self.crf.predict([np.array(features)])[0])
        return ref_words, ref_labels

    def parse(self, reference_str):
        """
        :param reference_str:
        :return:
        """
        if self.IGNORE_IF.search(reference_str):
            return None
        words, labels = self.classify(reference_str)
        return self.reference(reference_str, words, labels)

    def tokenize(self, reference_str):
        """
        used for unittest only
        :param reference_str:
        :return:
        """
        if self.IGNORE_IF.search(reference_str):
            return None
        words, _ = self.classify(reference_str)
        return words
def run_crf(w2v, words_before, words_after, shallow_parse):
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True)

    """ Create model """
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)
    fold_gi = []

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)
        print('loaded data...')
        print 'training...'
        ssvm.fit(train_x, train_y)
        print ssvm.score(test_x, test_y)

        for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)):
            abstract_words, _, _ = pmids_dict[pmid]
            print(pmid)
            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)
            predicted = ''
            output = ''
            if len(prediction) > 0:
                for p in prediction:
                    if p == 1:
                        print "word: {}".format(abstract_words[p])
                        if n == 0:
                            predicted += abstract_words[p]
                        else:
                            predicted += ' ' + abstract_words[p]
                if not predicted == '':
                    output = 'predicted: {}'.format(predicted)
                else:
                    output = 'Predicted nothing!'
            else:
                output = 'Predicted nothing!'
            print output
y_test = preprocess_label(y_test)

### CS : best c = 0.01
### Phy: best c = 0.005
### stat: best c = 0.005
'''
C = [0.005, 0.01, 0.02, 0.05, 0.1, 0.2]
score = {}
for i in C:
    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=i, max_iter=100)
    ssvm.fit(x_train, y_train)
    score[i] = ssvm.score(x_dev, y_dev)
print score
'''
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.005, max_iter=100)
ssvm.fit(x_train, y_train)
score = ssvm.score(x_test, y_test)
y_pred = ssvm.predict(x_test)
print 'Micro-averaged F1 score:', f1_score(get_one_list(y_test), get_one_list(y_pred), average='micro')
experiment_util.sequential_error_analysis(
    restore_label(y_test),
    restore_label(y_pred),
    './chaincrf_sequential_error_analysis')
test_datas, test_labels, node_ids = self.get_datas(test_ids, labels, mentions, retweets, bags)
if i == 0:
    x_test_ori, y_test_ori = test_datas, test_labels

print(len(train_datas))
print(len(test_datas))

X_train, y_train = train_datas, train_labels
model = GraphCRF(inference_method="max-product")
ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
ssvm.fit(X_train, y_train)
y_preds = ssvm.predict(test_datas)

result = ssvm.score(x_test_ori, y_test_ori)
print('iter {} result = {}'.format(i, result))

count = 0
for clique_idx, clique in enumerate(y_preds):
    for node_idx, node in enumerate(clique):
        node_id = node_ids[clique_idx][node_idx]
        if node == central_propagation_df.iloc[node_id].values:
            clabels[int(node_id)] = node
            if not int(node_id) in c_idxs:
                c_idxs = np.append(c_idxs, int(node_id))
                count += 1
print('iter {} update {} new labels'.format(i, count))
X = X[:100]
y = y[:100]

# Add edges
for i in range(X.shape[0]):
    X[i] = [X[i], np.vstack([(0, 1), (2, 2)])]

model = GraphCRF(directed=True, inference_method="max-product")
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
print ssvm.score(X_test, y_test)
print ssvm.predict(X_test)
print y_test

'''
for i in range(X.shape[0]):
    X_train, X_test = X[]
    X_test = X[i]
    y_test = y[i]
    X_train = np.delete(X, i)
    y_train = np.delete(y, i)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)
y_train, y_test = y[folds == 1], y[folds != 1]

"""
features_0 = features_train[0]
n_nodes = features_0.shape[0]
edges_0 = np.vstack([np.arange(n_nodes - 1), np.arange(1, n_nodes)])
x = (features_0, edges_0)
"""

f_t = features_train
X_train = [(features_i, np.vstack([np.arange(features_i.shape[0] - 1),
                                   np.arange(1, features_i.shape[0])]))
           for features_i in f_t]

print type(X_train)
print type(X_train[0][1])
print X_train[0][1].shape
print type(y_train)
print type(y_train[0])
print y_train[0]
print y_train[0].shape

from pystruct.models import GraphCRF
from pystruct.learners import FrankWolfeSSVM

model = GraphCRF(directed=True, inference_method="max-product")
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
print "OM SRI SAIRAM"
# score() expects (features, labels); the original passed (y_train, y_test)
print ("Accuracy score with Graph CRF : %f" % ssvm.score(X_train, y_train))
    gssvm.fit(X_train, y_train)
    train_score = gssvm.score(X_train, y_train)
    test_score = gssvm.score(X_test, y_test)
    print("Train / Test score with gchain CRF: %f %f" % (train_score, test_score))
else:
    # Train linear chain CRF
    model = ChainCRF()
    # pdb.set_trace()
    ssvm = FrankWolfeSSVM(model=model, C=C, check_dual_every=10, max_iter=100, verbose=True)
    ssvm.fit(X_train, y_train)
    train_score = ssvm.score(X_train, y_train)
    test_score = ssvm.score(X_test, y_test)
    print("Train / Test score with chain CRF: %f %f" % (train_score, test_score))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
# Train CRF
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(np.vstack(X_train).reshape((5375, 1, 128)), np.hstack(y_train).reshape(5375, 1))

# Train linear chain CRF
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11, verbose=0)
chain_ssvm.fit(X_train, y_train)

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))
print("Test score with CRF: %f" % ssvm.score(X_test, y_test))
print("Test score with Linear Chain CRF: %f" % chain_ssvm.score(X_test, y_test))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)
chain_ssvm.fit(nn_predictions_train, y_train)

# # Create linear regression object
# regr = LinearRegression()
# # Train the model using the training sets
# regr.fit(np.vstack(nn_predictions_train), np.hstack(y_train))
# print("Test score with linear regression: %f" % regr.score(np.vstack(nn_predictions_test),
#                                                             np.hstack(y_test)))

print("Test score with linear NN: 84.15%")
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))
print("Test score with CRF: %f" % ssvm.score(nn_predictions_test, y_test))
print("Test score with Linear Chain CRF: %f" % chain_ssvm.score(nn_predictions_test, y_test))

# # plot some word sequences
# n_words = 4
# rnd = np.random.RandomState(1)
# selected = rnd.randint(len(y_test), size=n_words)
# max_word_len = max([len(y_) for y_ in y_test[selected]])
# fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
# fig.subplots_adjust(wspace=0)
# fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
# fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
# fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
# fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)
# print("Shuffle results") # features, labels = util.shuffle(features, labels) trsize = int(0.7 * len(labels)) X_train = features[1:trsize] y_train = labels[1:trsize] X_test = features[trsize + 1:] y_test = labels[trsize + 1:] # X_train = X_test = features # y_train = y_test = labels # trsize = len(labels) # Evaluate the chain model = ChainCRF() C = 0.0001 max_iter = 50 ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True) print(ssvm) print(ssvm.fit(X_train, y_train)) print(ssvm.w) trscore = ssvm.score(X_train, y_train) # testscore = ssvm.score(X_test,y_test) print("Training score: {0}".format(trscore)) # print("Test score: {0}".format(testscore)) # Save the result # util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)
y_half_train = np.ones_like(X_half_train)
for ind in range(0, X_train.shape[0]):
    # n_letters = 2  # fixed len of word
    n_letters = int(np.floor(X_train[ind].shape[0] / 2))
    X_half_train[2 * ind] = X_train[ind][0:n_letters]
    X_half_train[2 * ind + 1] = X_train[ind][n_letters:]
    y_half_train[2 * ind] = y_train[ind][0:n_letters]
    y_half_train[2 * ind + 1] = y_train[ind][n_letters:]

# Train the model
half_ssvm.fit(X_half_train, y_half_train)

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))
print("Test score with FULL LCCRF: %f" % ssvm.score(X_test, y_test))
print("Test score with HALF LCCRF: %f" % half_ssvm.score(X_test, y_test))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'HALF-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'FULL-LCCRF', color="#FFD700", size=25)
# for value in X:
#     print value.shape
#
# print X_train.shape
# print y_train.shape
#
# print type(X_train)
# for value in y_train:
#     print value
#
# for i in range(0, len(X_train)):
#     if i == 15:
#         print X_train[i], len(X_train[i])
#         for f in X_train[i]:
#             print len(f)
#         break
#     print y_train[i], len(X_train[i])
#
#     break
#
# start the timer uncommented, since the print below measures elapsed time
start = time()
model = ChainCRF(inference_method='max-product', directed=True)
ssvm = FrankWolfeSSVM(model=model, C=1.0, max_iter=10)
ssvm.fit(X_train, y_train)
print 'accuracy of linear-crf %f:' % ssvm.score(X_test, y_test), ' time spent: %f' % (time() - start)
# Train directed chain CRF
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(X_train, y_train)

# Train undirected chain CRF
undirected_model = ChainCRF(directed=False)
undirected_ssvm = FrankWolfeSSVM(model=undirected_model, C=.1, max_iter=11, verbose=0)
undirected_ssvm.fit(X_train, y_train)

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))
print("Test score with directed LCCRF: %f" % ssvm.score(X_test, y_test))
print("Test score with undirected LCCRF: %f" % undirected_ssvm.score(X_test, y_test))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'UD-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'D-LCCRF', color="#FFD700", size=25)
# convenient
X, y = np.array(X), np.array(y)
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=0.1)
# flatten input
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)
Cs = [.5]
test_cs = []
test_g = []

for C in Cs:
    fw_bc_svm = FrankWolfeSSVM(model, C=C, max_iter=1000, check_dual_every=20,
                               line_search=False, verbose=True)
    # fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)
    gfw_bc_svm = GeneralizedFrankWolfeSSVM(gmodel, C=C, max_iter=1000, check_dual_every=5,
                                           line_search=False, verbose=True,
                                           X_test=X_test_bias, Y_test=y_test)

    # VANILLA
    print("CRAMMER-SINGER RUNNING")
    start = time()
    fw_bc_svm.fit(X_train_bias, y_train)
    print("error train %f and test %f" % (fw_bc_svm.score(X_train_bias, y_train),
                                          fw_bc_svm.score(X_test_bias, y_test)))
    test_cs.append(fw_bc_svm.score(X_test_bias, y_test))
    # y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
    # time_fw_bc_svm = time() - start
    # print("Score with cssvm: %f , C=%f (took %f seconds)" %
    #       (np.mean(y_pred == y_test), C, time_fw_bc_svm))
    # pdb.set_trace()

    # GENERALIZED
    print("GENERALIZED METHOD RUNNING")
    start = time()
    gfw_bc_svm.fit(X_train_bias, y_train)
    print("error train %f and test %f" % (gfw_bc_svm.score(X_train_bias, y_train),
                                          gfw_bc_svm.score(X_test_bias, y_test)))
    test_g.append(gfw_bc_svm.score(X_test_bias, y_test))
# convenient
X, y = np.array(X), np.array(y)
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=.1)
# flatten input
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
import numpy as np
import loader
import util
from sklearn import preprocessing

directory = "/Users/thijs/dev/boilerplate/src/main/resources/dataset/"
featureset = "features10"

print("Load files")
features, labels = \
    loader.loadBinary(featureset + '.csv', 'labels.csv', directory)
# print("Shuffle results")
# features, labels = util.shuffle(features, labels)
print("Loaded")
# print(labels)

# features = preprocessing.scale(features)

from pystruct.models import BinaryClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM,
                               SubgradientSSVM, FrankWolfeSSVM)

clf = FrankWolfeSSVM(BinaryClf(), verbose=True)
# print(clf)
clf.fit(features, labels)
trscore = clf.score(features, labels)

# print("Training score: {0}".format(trscore))
print("Klaar")  # Dutch for "Done"