class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        # Initialize the values.
        self.c_value = c_value
        self.classifier_name = classifier_name
        # Using a chain CRF to analyze the data, so add an error check for this:
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            # Define the classifier to use with the CRF model.
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=100)
        else:
            raise TypeError('Invalid classifier type')

    def load_clean_data(self):
        '''
        Load the data into X and y, where X is a numpy array of samples
        and each sample has the shape (n_letters, n_features).
        '''
        df = featurize.get_data()
        featurize.split_words(df)
        featurize.first_letter_uppercase(df)
        featurize.has_number(df)
        featurize.has_slash(df)
        featurize.spacy_pos_tagger(df)
        featurize.pos_ngrams(df)
        featurize.encoding_labels(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        '''
        Conduct 5-fold cross validation.
        '''
        kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
        for train_idx, test_idx in kf:
            xtrain, xval = X_train[train_idx], X_train[test_idx]
            ytrain, yval = y_train[train_idx], y_train[test_idx]
            model = ChainCRF()
            ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
            ssvm.fit(xtrain, ytrain)
            print ssvm.score(xval, yval)

    def train(self, X_train, y_train):
        '''
        Training method.
        '''
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        '''
        Evaluate the performance of the model.
        '''
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        '''
        Run the classifier on input data.
        '''
        return self.clf.predict(input_data)[0]
def test_multinomial_blocks_frankwolfe_batch():
    X, Y = generate_blocks_multinomial(n_samples=10, noise=0.3, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, max_iter=500, batch_mode=True)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        # X is a numpy array of samples where each sample
        # has the shape (n_letters, n_features)
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=50)

    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        return self.clf.predict(input_data)[0]


def convert_to_letters(indices):
    alphabets = np.array(list(string.ascii_lowercase))
    output = np.take(alphabets, indices)
    output = ''.join(output)
    return output
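A minimal end-to-end sketch of how CRFModel and convert_to_letters above fit together (this usage example is an assumption, not part of the original snippet); it trains on one fold of pystruct's OCR letters data and prints the true versus predicted word for a single test sample.

import string
import numpy as np
from pystruct.datasets import load_letters
from pystruct.learners import FrankWolfeSSVM
from pystruct.models import ChainCRF

crf = CRFModel(c_val=1.0)
X, y, folds = crf.load_data()
# Illustrative split: fold 0 for training, everything else for testing.
X_train, y_train = X[folds == 0], y[folds == 0]
X_test, y_test = X[folds != 0], y[folds != 0]

crf.train(X_train, y_train)
print('Accuracy:', crf.evaluate(X_test, y_test))

# classify() expects a list of samples and returns the labels of the first one.
predicted = crf.classify([X_test[0]])
print('Original :', convert_to_letters(y_test[0]))
print('Predicted:', convert_to_letters(predicted))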
def test_multinomial_blocks_frankwolfe():
    X, Y = generate_blocks_multinomial(n_samples=10, noise=0.5, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, max_iter=50, verbose=3)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
def n_cross_valid_crf(X, Y, K, command):
    # cross validation for crf
    if command == 'write_results':
        list_write = list()

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False,
                       class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        print 'Accuracy of linear-crf %f:' % ssvm.score(x_test, y_test)
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif command == 'write_results':
            list_write += write_results_CRF(testcv, y_test, y_pred)

        print '------------------------------------------------------'
        print '------------------------------------------------------'

    if command == 'write_results':
        # sort the list based on index
        list_write = sorted(list_write, key=itemgetter(0))
        for value in list_write:
            pred_list = value[1]
            test_list = value[2]
            for i in range(0, len(pred_list)):
                print str(pred_list[i]) + '\t' + str(test_list[i])
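The cross-validation snippets above use the pre-0.18 scikit-learn KFold signature (KFold(n, n_folds, ...)), which is iterated directly. A minimal sketch of the equivalent split with the current scikit-learn API, under the assumption that the rest of the loop body stays unchanged:

from sklearn.model_selection import KFold

kf = KFold(n_splits=K, shuffle=True, random_state=0)
for traincv, testcv in kf.split(X):
    x_train, x_test = X[traincv], X[testcv]
    y_train, y_test = Y[traincv], Y[testcv]
    # ... build the ChainCRF / FrankWolfeSSVM and fit/score exactly as above ...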
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        # X is a numpy array of samples, where each sample is
        # (letters, feature values)
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
def pick_best_C_value(train_sentences, sentence_labels, test_SF,
                      test_sentences, test_sentence_labels):
    i = 0.10
    best_C = i
    f_old = 0
    for z in range(1, 20):
        print "----------------- Training on C-value %f" % i
        modelCRF = ChainCRF()
        ssvm = FrankWolfeSSVM(model=modelCRF, C=i, max_iter=20, random_state=5)
        ssvm.fit(train_sentences, sentence_labels)
        print "\n"
        print "-------- Training complete --------"
        predictions = ssvm.predict(test_sentences)
        test_SF['predicted_labels'] = predictions

        # Saving model
        print "Saving model...."
        pickle.dump(ssvm, open('models/ote/otemodel.sav', 'wb'))

        # Evaluating the trained CRF model
        p, r, f1, common, retrieved, relevant = evaluating_ote(test_SF)
        if (f1 >= f_old):
            # save the value of 'C'
            f_old = f1
            best_C = i
        i = i + 0.05
    return best_C
def fit_predict(train_docs, test_docs, dataset, C, class_weight, constraints,
                compat_features, second_order, coparents, grandparents,
                siblings, exact_test=False):
    stats = stats_train(train_docs)
    prop_vect, _ = prop_vectorizer(train_docs, which=dataset, stats=stats,
                                   n_most_common_tok=None,
                                   n_most_common_dep=2000,
                                   return_transf=True)
    link_vect = link_vectorizer(train_docs, stats, n_most_common=500)
    sec_ord_vect = (second_order_vectorizer(train_docs)
                    if second_order else None)

    _, _, _, pmi_in, pmi_out = stats

    def _transform_x_y(docs):
        X = [_vectorize(doc, pmi_in, pmi_out, prop_vect, link_vect,
                        sec_ord_vect)
             for doc in docs]
        Y = [doc.label for doc in docs]
        return X, Y

    X_tr, Y_tr = _transform_x_y(train_docs)
    X_te, Y_te = _transform_x_y(test_docs)

    model = ArgumentGraphCRF(class_weight=class_weight,
                             constraints=constraints,
                             compat_features=compat_features,
                             coparents=coparents,
                             grandparents=grandparents,
                             siblings=siblings)

    clf = FrankWolfeSSVM(model, C=C, random_state=0, verbose=1,
                         check_dual_every=25, show_loss_every=25,
                         max_iter=100, tol=0)
    clf.fit(X_tr, Y_tr)

    if exact_test:
        clf.model.exact = True

    Y_pred = clf.predict(X_te)
    return clf, Y_te, Y_pred
def test_multinomial_blocks_frankwolfe():
    X, Y = generate_blocks_multinomial(n_samples=50, noise=0.5, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, line_search=True,
                         batch_mode=False, check_dual_every=500)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
def graph_crf():
    crf = GraphCRF()

    # X_train
    # Creating features: the maximum number of attributes is 2.
    # Variables have only one attribute (the assigned value), so the second
    # attribute is set to zero.
    feature_1 = [1, 0]  # var_1
    feature_2 = [2, 0]  # var_2
    # The function has two attributes, so an indicator variable is used to
    # show those two.
    feature_3 = [1, 1]  # function
    # The if has only one condition, which checks for value 1.
    feature_4 = [1, 0]  # if
    features = np.array([feature_1, feature_2, feature_3, feature_4])

    # Creating edges: there are four edges:
    # (v1, v2), (v1, func), (v2, func), (v1, if)
    edge_1 = [0, 1]  # (v1, v2)
    edge_2 = [0, 2]  # (v1, func)
    edge_3 = [1, 2]  # (v2, func)
    edge_4 = [0, 3]  # (v1, if)
    edges = np.array([edge_1, edge_2, edge_3, edge_4])

    X_train_sample = (features, edges)

    # y_train
    # These are enumerated values for actions.
    # We assume there should be an action for each node (variable, function,
    # if, etc.).
    y_train_sample = np.array([0, 0, 1, 2])

    # Create a full training set by re-sampling the sample above.
    n_samples = 100
    X_train = []
    y_train = []
    for i in range(n_samples):
        X_train.append(X_train_sample)
        y_train.append(y_train_sample)

    model = GraphCRF(directed=True, inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)

    # predict something
    output = ssvm.predict(X_train[0:3])
    print output
def n_cross_valid_crf_candidate(list_line, X, Y, K):
    list_text = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
            list_text.append(split_first)
    list_text = np.array(list_text)

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    list_write = []
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]
        list_text_train, list_text_test = list_text[traincv], list_text[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False,
                       class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=10)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        list_wrong = metrics_crf_candidate(list_text_test, y_test, y_pred)
        if len(list_write) == 0:
            list_write = list_wrong
        else:
            for i in range(0, len(list_wrong)):
                svc = list_wrong[0]
                road = list_wrong[1]
                busstop = list_wrong[2]

                list_write[0] = list_write[0] + svc
                list_write[1] = list_write[1] + road
                list_write[2] = list_write[2] + busstop

    # write_file('d:/', 'wrong_svc', list_write[0])
    # write_file('d:/', 'wrong_road', list_write[1])
    # write_file('d:/', 'wrong_busstop', list_write[2])

    write_file('d:/', 'good_svc', list_write[0])
    write_file('d:/', 'good_road', list_write[1])
    write_file('d:/', 'good_busstop', list_write[2])
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    crf = ChainCRF(inference_method='max-product', directed=False,
                   class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X_training, Y_training)
    y_pred = ssvm.predict(X_testing)

    list_write = list()
    print 'Accuracy of linear-crf %f:' % ssvm.score(X_testing, Y_testing)
    if command == 'metrics_F1':
        metrics_crf(Y_testing, y_pred)
    elif command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, y_pred)
    elif command == 'write_results':
        list_write = write_CRFs_compare(Y_testing, y_pred)
        for value in list_write:
            pred_list = value[0]
            test_list = value[1]
            for i in range(0, len(pred_list)):
                print str(pred_list[i]) + '\t' + str(test_list[i])
def Chain_CRF(x, y, x_test, model_args):
    # Reshape for CRF
    #svc = SVC(class_weight='balanced', kernel='rbf', decision_function_shape='ovr')
    #svc.fit(x, y)
    #x = svc.decision_function(x)
    #x_test = svc.decision_function(x_test)
    #scaler = StandardScaler().fit(x)
    #x = scaler.transform(x)
    #x_test = scaler.transform(x_test)
    x = x[:, :11]
    x_test = x_test[:, :11]
    x = x.reshape(-1, 21600, x.shape[-1])
    x_test = x_test.reshape(-1, 21600, x.shape[-1])
    y = y.reshape(-1, 21600)

    crf = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=crf, C=model_args['C'],
                          max_iter=model_args['max_iter'])
    ssvm.fit(x, y)
    y_pred = np.array(ssvm.predict(x_test))
    return y_pred.flatten()
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=100)

    # Load the training data
    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    # Train the CRF
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    # Evaluate the accuracy of the CRF
    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the CRF on unknown data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
def CRF_pred_label(X, Y, command):
    texts = load_demo_text(command)
    if command == 'twitter':
        convert_texts = filterText_demo(texts, 'removeLink', command)
        X_ftr = load_demo_ftr(command)
        print len(convert_texts), len(X_ftr)
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter'
        name_write = 'pred_label_' + command
    elif command == 'sgforums':
        convert_texts = filterText_demo(texts, 'removePunc', command)
        X_ftr = load_demo_ftr(command)
        print len(convert_texts), len(X_ftr)
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/sgforums'
        name_write = 'pred_label_' + command
    elif command == 'facebook':
        convert_texts = filterText_demo(texts, 'removeLink', command)
        X_ftr = load_demo_ftr(command)
        print len(convert_texts), len(X_ftr)
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/facebook'
        name_write = 'pred_label_' + command

    crf = ChainCRF(inference_method='max-product', directed=False,
                   class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X, Y)
    y_pred = ssvm.predict(X_ftr)

    list_write = list()
    for line in y_pred:
        labels = ''
        for label in line:
            labels += str(label) + '\t'
        list_write.append(labels.strip())

    write_file(path_write, name_write, list_write)
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'HALF-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'FULL-LCCRF', color="#FFD700", size=25)
fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_half = half_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_half, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_half,
                y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)   # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)   # Blue
        a.text(5, 14, abc[y_half], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)    # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

w = ssvm.w[26 * 8 * 16:].reshape(26, 26)
kf = KFold(n_splits=n_folds)
fold = 0
for train_index, test_index in kf.split(X):
    print(' ')
    print('train index {}'.format(train_index))
    print('test index {}'.format(test_index))
    print('{} jackets for training, {} for testing'.format(
        len(train_index), len(test_index)))
    X_train = X[train_index]
    Y_train = Y[train_index]
    X_test = X[test_index]
    Y_test = Y[test_index]

    """ YOUR S-SVM TRAINING CODE HERE """
    ssvm.fit(X_train, Y_train)

    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = ssvm.predict(X_test)
    wrong_segments_crf.append(np.sum(Y_pred != Y_test))
    score = ssvm.score(X_test, Y_test)
    scores_crf[fold] = score

    """ figure showing the result of classification of segments for
        each jacket in the testing part of the present fold """
    if plot_labeling:
        for ti, pred in zip(test_index, Y_pred):
            print(ti)
            print(pred)
            s = segments[ti]
            plot_segments(s,
                          caption='SSVM predictions for jacket ' + str(ti + 1),
                          labels_segments=pred)

    """ YOUR LINEAR SVM TRAINING CODE HERE """
    svm.fit(X_train.reshape((-1, num_features)), Y_train.reshape((-1)))
print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test)) print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test))) # plot some word sequenced n_words = 4 rnd = np.random.RandomState(1) selected = rnd.randint(len(y_test), size=n_words) max_word_len = max([len(y_) for y_ in y_test[selected]]) fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10)) fig.subplots_adjust(wspace=0) for ind, axes_row in zip(selected, axes): y_pred_svm = svm.predict(X_test[ind]) y_pred_chain = ssvm.predict([X_test[ind]])[0] for i, (a, image, y_true, y_svm, y_chain) in enumerate( zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)): a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys) a.text(0, 3, abc[y_true], color="#00AA00", size=25) a.text(0, 14, abc[y_svm], color="#5555FF", size=25) a.text(5, 14, abc[y_chain], color="#FF5555", size=25) a.set_xticks(()) a.set_yticks(()) for ii in range(i + 1, max_word_len): axes_row[ii].set_visible(False) plt.matshow(ssvm.w[26 * 8 * 16:].reshape(26, 26)) plt.title("Transition parameters of the chain CRF.") plt.xticks(np.arange(25), abc) plt.yticks(np.arange(25), abc)
class CRFClassifierText(object): IGNORE_IF = re.compile(r'(in press|submitted|to appear)', flags=re.IGNORECASE) QUOTES_AROUND_ETAL_REMOVE = re.compile(r'(.*)(")(et al\.?)(")(.*)', re.IGNORECASE) TO_ADD_DOT_AFTER_INITIALS = re.compile( r'\b([A-Z]{1}(?!\.))([\s,]+)([A-Z12(]|and)') TO_ADD_SEPARATE_INITIALS = re.compile(r'\b([A-Z]{1})([A-Z]{1})([,\s]{1})') SEPARATE_AUTHOR = re.compile(r'^((.*?)([\d\":]+))(.*)$') TO_REMOVE_HYPEN_NEAR_INITIAL = [ re.compile(r'([A-Z]\.)(\-)([A-Z]\.)'), re.compile(r'([A-Z])(\-)(\.)'), re.compile(r'([A-Z])(\-)([A-Z])\b') ] URL_EXTRACTOR = re.compile(r'((url\s*)?(http)s?://[A-z0-9\-\.\/\={}?&%]+)', re.IGNORECASE) MONTH_NAME_EXTRACTOR = re.compile( r'\b([Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|([Nn]ov|[Dd]ec)(?:ember)?)\b' ) URL_TO_DOI = re.compile( r'((url\s*)?(https\s*:\s*//\s*|http\s*:\s*//\s*)((.*?)doi(.*?)org/))|(DOI:https\s*://\s*)', flags=re.IGNORECASE) URL_TO_ARXIV = re.compile( r'((url\s*)?(https://|http://)(arxiv.org/(abs|pdf)/))', flags=re.IGNORECASE) URL_TO_ASCL = re.compile(r'((url\s*)?(https://|http://)(ascl.net/))', flags=re.IGNORECASE) ADD_COLON_TO_IDENTIFIER = re.compile(r'(\s+(DOI|arXiv|ascl))(:?\s*)', flags=re.IGNORECASE) IS_START_WITH_YEAR = re.compile(r'(^[12][089]\d\d)') START_WITH_AUTHOR = re.compile(r'([A-Za-z].*$)') WORD_BREAKER_REMOVE = [re.compile(r'([A-Za-z]+)([\-]+\s+)([A-Za-z]+)')] TOKENS_NOT_IDENTIFIED = re.compile(r'\w+\b(?!\|)') REFERENCE_TOKENIZER = re.compile(r'([\s.,():;\[\]\'\"#\/])') TAGGED_MULTI_WORD_TOKENIZER = re.compile(r'([\s.,])') # is all capital IS_ALL_CAPITAL = re.compile(r'^([A-Z]+)$') # is only the first character capital IS_FIRST_CAPITAL = re.compile(r'^([A-Z][a-z]+)$') # is alphabet only, consider hyphenated words also IS_ALPHABET = re.compile(r'^(?=.*[a-zA-Z])([a-zA-Z\-]+)$') # is numeric only, consider the page range with - being also numeric # also include arxiv id with a dot to be numeric # note that this differs from function is_numeric in the # sense that this recognizes numeric even if it was not identified/tagged IS_NUMERIC = re.compile(r'^(?=.*[0-9])([0-9\-\.]+)$') # is alphanumeric, must have at least one digit and one alphabet character IS_ALPHANUMERIC = re.compile(r'^(?=.*[0-9])(?=.*[a-zA-Z])([a-zA-Z0-9]+)$') ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS = re.compile( r'(\|[a-z\_]+\|)(\|[a-z\_]+\|)') REGEX_PATTERN_WHOLE_WORD_ONLY = r'(?:\b|\B)%s(?:\b|\B)' nltk_tagger = None crf = None X = y = label_code = folds = None def __init__(self): """ """ self.originator_token = OriginatorToken(self.REFERENCE_TOKENIZER) self.numeric_token = NumericToken() self.pub_token = PubToken() self.unknown_tokens = [] self.filename = os.path.dirname( __file__) + '/serialized_files/crfModelText.pkl' def create_crf(self): """ :return: """ # to load nltk tagger, a time consuming, one time needed operation self.nltk_tagger = nltk.tag._get_tagger() self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50) self.X, self.y, self.label_code, self.folds, generate_fold = self.load_training_data( ) score = 0 # only need to iterate through if fold was generated num_tries = 10 if generate_fold else 1 while (score <= 0.90) and (num_tries > 0): try: X_train, y_train = self.get_train_data() self.train(X_train, y_train) X_test, y_test = self.get_test_data() score = self.evaluate(X_test, y_test) except Exception as e: current_app.logger.error('Exception: %s' % (str(e))) current_app.logger.error(traceback.format_exc()) pass num_tries -= 1 return 
(score > 0) def format_training_data(self, the_data): """ :param the_data: :return: """ # get label, word in the original presentation labels = [[elem[0] for elem in ref] for ref in the_data] words = [[elem[1] for elem in ref] for ref in the_data] # count how many unique labels there are, return a dict to convert from words to numeric words label_code = self.encoder(labels) numeric_labels = [] features = [] for label, word in zip(labels, words): # replace of numeric words for the original presentation of label numeric_label = [] for l in label: numeric_label.append(label_code[l]) numeric_labels.append(np.array(numeric_label)) # get the numeric features for the original presentation of word and insert at index of label feature = [] for idx in range(len(word)): feature.append(self.get_data_features(word, idx, label)) features.append(np.array(feature)) return features, numeric_labels, label_code def get_num_states(self): """ :return: """ num_states = len( np.unique(np.hstack([y for y in self.y[self.folds != 0]]))) current_app.logger.debug("number of states = %s" % num_states) return num_states def get_folds_array(self, filename): """ read the distribution of train and test indices from file :param filename: :return: """ with open(filename, 'r') as f: reader = f.readlines() for line in reader: if line.startswith("STATIC_FOLD"): try: return eval(line.split(" = ")[1]) except: return None def get_train_data(self): """ :return: """ return self.X[self.folds != 0], self.y[self.folds != 0] def get_test_data(self): """ :return: """ return self.X[self.folds == 0], self.y[self.folds == 0] def train(self, X_train, y_train): """ :param X_train: is a numpy array of samples where each sample has the shape (n_labels, n_features) :param y_train: is numpy array of labels :return: """ self.crf.fit(X_train, y_train) def evaluate(self, X_test, y_test): """ :param X_test: :param y_test: :return: """ return self.crf.score(X_test, y_test) def decoder(self, numeric_label): """ :param numeric_label: :return: """ labels = [] for nl in numeric_label: key = next(key for key, value in self.label_code.items() if value == nl) labels.append(key) return labels def encoder(self, labels): """ :param labels: :return: dict of labels as key and numeric value is its value """ # assign a numeric value to each label label_code = {} numeric = -1 for label in labels: for l in label: if (numeric >= 0 and l in label_code): continue else: numeric = numeric + 1 label_code[l] = numeric return label_code def load_training_data(self): """ load training/test data :return: """ training_files_path = os.path.dirname(__file__) + '/training_files/' arXiv_text_ref_filenames = [ training_files_path + 'arxiv.raw', ] references = [] for f in arXiv_text_ref_filenames: references = references + get_arxiv_tagged_data(f) X, y, label_code = self.format_training_data(references) # for now use static division. see comments in foldModelText.dat generate_fold = False if generate_fold: folds = list(np.random.choice(range(0, 9), len(y))) else: folds = self.get_folds_array(training_files_path + 'foldModelText.dat') return np.array(X, dtype=object), np.array( y, dtype=object), label_code, np.array(folds), generate_fold def save(self): """ save object to a pickle file :return: """ try: with open(self.filename, "wb") as f: pickler = pickle.Pickler(f, -1) pickler.dump(self.crf) pickler.dump(self.label_code) pickler.dump(self.nltk_tagger) current_app.logger.info("saved crf in %s." 
% self.filename) return True except Exception as e: current_app.logger.error('Exception: %s' % (str(e))) current_app.logger.error(traceback.format_exc()) return False def load(self): """ :return: """ try: with open(self.filename, "rb") as f: unpickler = pickle.Unpickler(f) self.crf = unpickler.load() self.label_code = unpickler.load() self.nltk_tagger = unpickler.load() current_app.logger.info("loaded crf from %s." % self.filename) return self.crf except Exception as e: current_app.logger.error('Exception: %s' % (str(e))) current_app.logger.error(traceback.format_exc()) def search(self, pattern, text): """ search whole word only in the text :param pattern: :param text: :return: Ture/False depending if found """ try: return re.search(self.REGEX_PATTERN_WHOLE_WORD_ONLY % pattern, text) is not None except: return False def reference(self, refstr, words, labels): """ put identified words into a dict to be passed out :param words: :param labels: :return: """ ref_dict = {} ref_dict['authors'] = self.originator_token.collect_tagged_tokens( words, labels) if 'DOI' in labels or 'ARXIV' in labels or 'ASCL' in labels: ref_dict.update( self.numeric_token.collect_id_tagged_tokens(words, labels)) if 'YEAR' in labels: ref_dict['year'] = words[labels.index('YEAR')] if 'VOLUME' in labels: volume = self.numeric_token.collect_tagged_numerals_token( words, labels, 'VOLUME') if volume: ref_dict['volume'] = volume if 'PAGE' in labels: page = self.numeric_token.collect_tagged_numerals_token( words, labels, 'PAGE') if page: ref_dict['page'] = page if 'ISSUE' in labels: ref_dict['issue'] = words[labels.index('ISSUE')] if 'ISSN' in labels: ref_dict['ISSN'] = words[labels.index('ISSN')] if 'JOURNAL' in labels: ref_dict['journal'] = self.pub_token.collect_tagged_journal_tokens( words, labels) if 'TITLE' in labels: title = self.pub_token.collect_tagged_title_tokens(words, labels) if title: ref_dict['title'] = title ref_dict['refstr'] = refstr return ref_dict def punctuation_features(self, ref_word, ref_label): """ return a feature vector that has 1 in the first cell if ref_word is a punctuation followed by 1 in the position corresponding to which one :param ref_word: :param ref_label: :return: """ which = which_punctuation(ref_word, ref_label) return [ 1 if which == 0 else 0, # 0 if punctuation, 1 if which == 1 else 0, # 1 if brackets, 1 if which == 2 else 0, # 2 if colon, 1 if which == 3 else 0, # 3 if comma, 1 if which == 4 else 0, # 4 if dot, 1 if which == 5 else 0, # 5 if parenthesis, 1 if which == 6 else 0, # 6 if quotes (both single and double), 1 if which == 7 else 0, # 7 if num signs, 1 if which == 8 else 0, # 8 if hypen, 1 if which == 9 else 0, # 9 if forward slash, 1 if which == 10 else 0, # 10 if semicolon, ] def is_token_unknown(self, ref_word, ref_label): """ :param ref_word: :param ref_label: :return: """ if ref_label: return 1 if ref_label == 'NA' else 0 if ref_word is None: return 0 return int(any(ref_word == token for token in self.unknown_tokens)) def length_features(self, ref_word): """ distinguish between token of length 1, and longer :param ref_word: :return: """ return [1 if len(ref_word) == 1 else 0, 1 if len(ref_word) > 1 else 0] def get_data_features(self, ref_word_list, index, ref_label_list=None): """ :param ref_word_list: has the form [e1,e2,e3,..] 
:param index: the position of the word in the set, assume it is valid :param ref_label_list: labels for ref_word_list available during training only :return: """ ref_word = ref_word_list[index] ref_label = ref_label_list[index] if ref_label_list else None return \ self.length_features(ref_word) \ + self.originator_token.author_features(ref_word_list, ref_label_list, index) \ + self.pub_token.title_features(ref_word_list, ref_label_list, index) \ + self.pub_token.journal_features(ref_word_list, ref_label_list, index) \ + self.numeric_token.numeric_features(ref_word, ref_label) \ + self.numeric_token.identifying_word_features(ref_word, ref_label) \ + self.punctuation_features(ref_word, ref_label) \ + self.pub_token.publisher_features(ref_word, ref_label) \ + self.originator_token.editor_features(ref_word_list, ref_label_list, index) \ + [ int(self.IS_ALL_CAPITAL.match(ref_word) is not None), # is element all capital int(self.IS_FIRST_CAPITAL.match(ref_word) is not None), # is first character capital int(self.IS_ALPHABET.match(ref_word) is not None), # is alphabet only, consider hyphenated words also int(self.IS_NUMERIC.match(ref_word) is not None), # is numeric only, consider the page range with - being also numeric int(self.IS_ALPHANUMERIC.match(ref_word) is not None), # is alphanumeric, must at least one digit and one alphabet character self.is_token_unknown(ref_word, ref_label), # is it one of the words unable to guess self.pub_token.is_token_stopword(ref_word, ref_label), # is it one of tagged stopwords ] def segment(self, reference_str): """ going to attempt and segment the reference string each token that is identified is removed from reference_str in the reverse order the identified tokens are inserted back to reference_str before feature extraction :param reference_str: :return: """ if isinstance(reference_str, list): return [] # start fresh self.numeric_token.clear() self.originator_token.clear() self.pub_token.clear() na_url = None na_month = None # step 1: remove any non essential tokens (ie, urls, months, etc) matches = self.URL_EXTRACTOR.findall(reference_str) if len(matches) > 0: na_url = [] for i, url in enumerate(matches, start=1): na_url.append(url[0]) reference_str = reference_str.replace(url[0], '|na_url_%d|' % i) extractor = self.MONTH_NAME_EXTRACTOR.search(reference_str) if extractor: na_month = extractor.group().strip() reference_str = reference_str.replace(na_month, '|na_month|') # step 2: identify doi/arxiv/ascl reference_str = self.numeric_token.segment_ids(reference_str) # step 3: identify list of authors and editors reference_str = self.originator_token.identify(reference_str) # step 4: identify title and journal substrings # but first remove any numerical identifying words reference_str = self.pub_token.identify( self.numeric_token.remove_identifying_words(reference_str).strip(), self.nltk_tagger, self.originator_token.indices(), self.originator_token.have_editor()) # step 5: identify year, volume, page, issue reference_str = self.numeric_token.segment_numerals(reference_str) # collect all tokens that has not been identified self.unknown_tokens = self.TOKENS_NOT_IDENTIFIED.findall(reference_str) if na_url: self.unknown_tokens.append(' '.join(na_url)) if na_month: self.unknown_tokens.append(na_month) # now put the identified tokens back into the string, and before tokenizing and sending to crf # step 5 reverse reference_str = self.numeric_token.assemble_stage1(reference_str) # step 4 reverse reference_str = self.pub_token.assemble(reference_str) # step 3 reverse 
reference_str = self.originator_token.assemble(reference_str) # tokenize ref_words = list( filter(None, [ w.strip() for w in self.REFERENCE_TOKENIZER.split( self.ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS.sub( r'\1 \2', reference_str)) ])) # step 2 reverse ref_words = self.numeric_token.assemble_stage2(ref_words) # step 1 reverse if na_month: ref_words[ref_words.index('|na_month|')] = na_month if na_url: for i, url in enumerate(na_url, start=1): ref_words[ref_words.index('|na_url_%d|' % i)] = url return ref_words def dots_after_initials(self, reference_str): """ :param reference_str: :return: """ try: author_part = self.SEPARATE_AUTHOR.search(reference_str).group(1) # separate first and middle initials if there are any attached, add dot after each # make sure there is a dot after single character, repeat to capture middle name reference_str = reference_str.replace( author_part, self.TO_ADD_SEPARATE_INITIALS.sub( r"\1. \2. \3", self.TO_ADD_DOT_AFTER_INITIALS.sub( r"\1.\2\3", self.TO_ADD_DOT_AFTER_INITIALS.sub( r"\1.\2\3", author_part)))) except: pass return reference_str def pre_processing(self, reference_str): """ :param reference_str: :return: """ # remove any numbering that appears before the reference to start with authors # exception is the year if self.IS_START_WITH_YEAR.search(reference_str) is None: reference_str = self.START_WITH_AUTHOR.search( reference_str).group() # also if for some reason et al. has been put in double quoted! remove them reference_str = self.QUOTES_AROUND_ETAL_REMOVE.sub( r"\1\3\5", reference_str) # if there is a hypen either between initials, or after initials and before dot, remove it for rhni, replace in zip(self.TO_REMOVE_HYPEN_NEAR_INITIAL, [r"\1 \3", r"\1\3", r"\1. \3"]): reference_str = rhni.sub(replace, reference_str) # add dots after initials, separate first and middle if needed reference_str = self.dots_after_initials(reference_str) # if no colon after the identifer, add it in reference_str = self.ADD_COLON_TO_IDENTIFIER.sub(r"\1:", reference_str) # if there is a url for DOI turned it to recognizable DOI reference_str = self.URL_TO_DOI.sub(r"DOI:", reference_str) # if there is a url for arxiv turned it to recognizable arxiv reference_str = self.URL_TO_ARXIV.sub(r"arXiv:", reference_str) # if there is a url for ascl turned it to recognizable ascl reference_str = self.URL_TO_ASCL.sub(r"ascl:", reference_str) for rwb in self.WORD_BREAKER_REMOVE: reference_str = rwb.sub(r'\1\3', reference_str) return reference_str def classify(self, reference_str): """ Run the classifier on input data :param reference_str: :return: list of words and the corresponding list of labels """ reference_str = self.pre_processing(reference_str) ref_words = self.segment(reference_str) features = [] for i in range(len(ref_words)): features.append(self.get_data_features(ref_words, i, [])) ref_labels = self.decoder(self.crf.predict([np.array(features)])[0]) return ref_words, ref_labels def parse(self, reference_str): """ :param reference_str: :return: """ if self.IGNORE_IF.search(reference_str): return None words, labels = self.classify(reference_str) return self.reference(reference_str, words, labels) def tokenize(self, reference_str): """ used for unittest only :param reference_str: :return: """ if self.IGNORE_IF.search(reference_str): return None words, _ = self.classify(reference_str) return words
def classify(traincorpus, testcorpus): model = ChainCRF() ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10) pos_lexicon = load_lexicon("lexica/restaurants/ote/pos") term_lexicon = load_lexicon("lexica/restaurants/ote/term") pre1_lexicon = load_lexicon("lexica/restaurants/ote/prefix1") pre2_lexicon = load_lexicon("lexica/restaurants/ote/prefix2") pre3_lexicon = load_lexicon("lexica/restaurants/ote/prefix3") suf1_lexicon = load_lexicon("lexica/restaurants/ote/suffix1") suf2_lexicon = load_lexicon("lexica/restaurants/ote/suffix2") suf3_lexicon = load_lexicon("lexica/restaurants/ote/suffix3") train_sentences = [] #the list to be used to store our features for the words sentence_labels = [] #the list to be used for labeling if a word is an aspect term print('Creating train feature vectors...') #extracting sentences and appending them labels for instance in traincorpus.corpus: words = nltk.word_tokenize(instance.text) tags = nltk.pos_tag(words) tags_list = [] #the pos list for _, t in tags: tags_list.append(t) last_prediction = "" train_words = [] word_labels = [] for i, w in enumerate(words): word_found = False if words[i] == w: word_found = True pos_feats = [] previous_pos_feats = [] second_previous_pos_feats = [] next_pos_feats = [] second_next_pos_feats = [] morph_feats = [] term_feats = [] pre1_feats = [] pre2_feats = [] pre3_feats = [] suf1_feats = [] suf2_feats = [] suf3_feats = [] target_labels = [] train_word_features = [] #prefix of lengths 1,2,3 lexicon features for p1 in pre1_lexicon: if p1 == w[0]: pre1_feats.append(1) else: pre1_feats.append(0) for p2 in pre2_lexicon: if len(w) > 1: if p2 == w[0]+w[1]: pre2_feats.append(1) else: pre2_feats.append(0) else: pre2_feats.append(0) for p3 in pre3_lexicon: if len(w) > 2: if p3 == w[0]+w[1]+w[2]: pre3_feats.append(1) else: pre3_feats.append(0) else: pre3_feats.append(0) #suffix of lengths 1,2,3 lexicon features for s1 in suf1_lexicon: if s1 == w[-1]: suf1_feats.append(1) else: suf1_feats.append(0) for s2 in suf2_lexicon: if len(w) > 1: if s2 == w[-2]+w[-1]: suf2_feats.append(1) else: suf2_feats.append(0) else: suf2_feats.append(0) for s3 in suf3_lexicon: if len(w) > 2: if s3 == w[-3]+w[-2]+w[-1]: suf3_feats.append(1) else: suf3_feats.append(0) else: suf3_feats.append(0) #frequent term lexicon features for t in term_lexicon: if t == w.lower(): term_feats.append(1) else: term_feats.append(0) #morphological features if w[0].isupper(): #is first letter capital morph_feats.append(1) else: morph_feats.append(0) capitals = 0 lowers = 0 for letter in w: if letter.isupper(): capitals = capitals + 1 if letter.islower(): lowers = lowers + 1 if w[0].islower() and capitals > 0: #contains capitals, except 1st letter morph_feats.append(1) else: morph_feats.append(0) if capitals == len(w): #is all letters capitals morph_feats.append(1) else: morph_feats.append(0) if lowers == len(w): #is all letters lower morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"\d", w)) == len(w): #is all letters digits morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[.]", w)) > 0: #is there a '.' 
morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[-]", w)) > 0: #is there a '-' morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-' morph_feats.append(1) else: morph_feats.append(0) for p in pos_lexicon: #check the POS tag of the current word if tags_list[i] == p: pos_feats.append(1) else: pos_feats.append(0) #check the POS tag of the previous word (if the index is IN list's bounds) if (i-1) >= 0: if tags_list[i-1] == p: previous_pos_feats.append(1) else: previous_pos_feats.append(0) else: previous_pos_feats.append(0) #check the POS tag of the 2nd previous word (if the index is IN list's bounds) if (i-2) >= 0: if tags_list[i-2] == p: second_previous_pos_feats.append(1) else: second_previous_pos_feats.append(0) else: second_previous_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i+1) < len(words): if tags_list[i+1] == p: next_pos_feats.append(1) else: next_pos_feats.append(0) else: next_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i+2) < len(words): if tags_list[i+2] == p: second_next_pos_feats.append(1) else: second_next_pos_feats.append(0) else: second_next_pos_feats.append(0) #label the word, using IOB system, #B:start of aspect term, I:continue of aspect term, O: no aspect term term_found = False for aspect_term in set(instance.get_aspect_terms()): term_words = aspect_term.split() for term_index, term in enumerate(term_words): if (w.lower() == term) and (term_found is False): if term_index == 0: target_labels = [1] #1 is "B" last_prediction = "1" term_found = True else: if (last_prediction == "1") or (last_prediction == "2"): target_labels = [2] #2 is "I" last_prediction = "2" term_found = True else: target_labels = [0] last_prediction = "0" if term_found is False: target_labels = [0] #0 is "O" last_prediction = "0" train_word_features = [pos_feats + previous_pos_feats + second_previous_pos_feats + next_pos_feats + second_next_pos_feats + morph_feats + term_feats + pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats] if word_found is True: train_words.append(train_word_features) word_labels.append(target_labels) train_sentences_array = np.zeros((len(train_words), len(train_words[0][0]))) index_i = 0 for word in train_words: index_j = 0 for features in word: for f in features: train_sentences_array[index_i, index_j] = f index_j = index_j + 1 index_i = index_i + 1 train_sentences.append(train_sentences_array) sentence_labels_array = np.zeros((len(word_labels))) index_i = 0 for label in word_labels: sentence_labels_array[index_i] = label[0] index_i = index_i + 1 sentence_labels.append(sentence_labels_array.astype(np.int64)) #the chain-crf needs a list (representing the sentences), that #contains a 2d-array(n_words, n_features), which in turn contains the #features extracted from each word. 
the sentence labels must be #an array of type int ssvm.fit(train_sentences, sentence_labels) print('Done!') print('Creating test feature vectors...') test_sentences = [] for instance in testcorpus.corpus: words = nltk.word_tokenize(instance.text) tags = nltk.pos_tag(words) tags_list = [] #the pos list for _, t in tags: tags_list.append(t) test_words = [] for i, w in enumerate(words): word_found = False if words[i] == w: word_found = True pos_feats = [] previous_pos_feats = [] second_previous_pos_feats = [] next_pos_feats = [] second_next_pos_feats = [] morph_feats = [] term_feats = [] pre1_feats = [] pre2_feats = [] pre3_feats = [] suf1_feats = [] suf2_feats = [] suf3_feats = [] test_word_features = [] #prefix 1,2,3 lexicon features for p1 in pre1_lexicon: if p1 == w[0]: pre1_feats.append(1) else: pre1_feats.append(0) for p2 in pre2_lexicon: if len(w) > 1: if p2 == w[0]+w[1]: pre2_feats.append(1) else: pre2_feats.append(0) else: pre2_feats.append(0) for p3 in pre3_lexicon: if len(w) > 2: if p3 == w[0]+w[1]+w[2]: pre3_feats.append(1) else: pre3_feats.append(0) else: pre3_feats.append(0) #suffix 1,2,3 lexicon features for s1 in suf1_lexicon: if s1 == w[-1]: suf1_feats.append(1) else: suf1_feats.append(0) for s2 in suf2_lexicon: if len(w) > 1: if s2 == w[-2]+w[-1]: suf2_feats.append(1) else: suf2_feats.append(0) else: suf2_feats.append(0) for s3 in suf3_lexicon: if len(w) > 2: if s3 == w[-3]+w[-2]+w[-1]: suf3_feats.append(1) else: suf3_feats.append(0) else: suf3_feats.append(0) #term lexicon features for t in term_lexicon: if t == w.lower(): term_feats.append(1) else: term_feats.append(0) #morphological features if w[0].isupper(): #is first letter capital morph_feats.append(1) else: morph_feats.append(0) capitals = 0 lowers = 0 for letter in w: if letter.isupper(): capitals = capitals + 1 if letter.islower(): lowers = lowers + 1 if w[0].islower() and capitals > 0: #contains capitals, except 1st letter morph_feats.append(1) else: morph_feats.append(0) if capitals == len(w): #is all letters capitals morph_feats.append(1) else: morph_feats.append(0) if lowers == len(w): #is all letters lower morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"\d", w)) == len(w): #is all letters digits morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[.]", w)) > 0: #is there a '.' 
morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[-]", w)) > 0: #is there a '-' morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-' morph_feats.append(1) else: morph_feats.append(0) for p in pos_lexicon: #check the POS tag of the current word if tags_list[i] == p: pos_feats.append(1) else: pos_feats.append(0) #check the POS tag of the previous word (if the index is IN list's bounds) if (i-1) >= 0: if tags_list[i-1] == p: previous_pos_feats.append(1) else: previous_pos_feats.append(0) else: previous_pos_feats.append(0) #check the POS tag of the 2nd previous word (if the index is IN list's bounds) if (i-2) >= 0: if tags_list[i-2] == p: second_previous_pos_feats.append(1) else: second_previous_pos_feats.append(0) else: second_previous_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i+1) < len(words): if tags_list[i+1] == p: next_pos_feats.append(1) else: next_pos_feats.append(0) else: next_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i+2) < len(words): if tags_list[i+2] == p: second_next_pos_feats.append(1) else: second_next_pos_feats.append(0) else: second_next_pos_feats.append(0) test_word_features = [pos_feats + previous_pos_feats + second_previous_pos_feats + next_pos_feats + second_next_pos_feats + morph_feats + term_feats + pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats] if word_found is True: test_words.append(test_word_features) test_sentences_array = np.zeros((len(test_words), len(test_words[0][0]))) index_i = 0 for word in test_words: index_j = 0 for features in word: for f in features: test_sentences_array[index_i, index_j] = f index_j = index_j + 1 index_i = index_i + 1 test_sentences.append(test_sentences_array) print('Done!') print('Predicting aspect terms...') predictions = ssvm.predict(test_sentences) #the predict function returns a list (symbolizing the sentences), #which contains a list that contains the predicted label for each word for sentence_index, sentence_predictions in enumerate(predictions): testcorpus.corpus[sentence_index].aspect_terms = [] predicted_term = "" last_prediction = "" for word_index, word_prediction in enumerate(sentence_predictions): if word_prediction == 1: if last_prediction == 1 or last_prediction == 2: start, end = find_offsets(testcorpus.corpus[sentence_index].text.lower(), predicted_term) testcorpus.corpus[sentence_index].add_aspect_term(term=predicted_term, offsets={'from': str(start), 'to': str(end)}) c = find_term(testcorpus.corpus[sentence_index].text.lower(), word_index) predicted_term = c last_prediction = 1 elif word_prediction == 2: if last_prediction == 1 or last_prediction == 2: c = find_term(testcorpus.corpus[sentence_index].text.lower(), word_index) if len(predicted_term) > 0: predicted_term = predicted_term + " " + c else: predicted_term = c last_prediction = 2 elif word_prediction == 0: if last_prediction == 1 or last_prediction == 2: start, end = find_offsets(testcorpus.corpus[sentence_index].text.lower(), predicted_term) testcorpus.corpus[sentence_index].add_aspect_term(term=predicted_term, offsets={'from': str(start), 'to': str(end)}) last_prediction = 0 print('Done!') return testcorpus.corpus
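The comment near the end of the training loop above describes the input contract ChainCRF expects: X is a list with one (n_words, n_features) float array per sentence, and y is a list with one integer label array of length n_words per sentence. A tiny synthetic sketch of that contract (the shapes, feature values, and label scheme here are made up for illustration, not taken from the code above):

import numpy as np
from pystruct.learners import FrankWolfeSSVM
from pystruct.models import ChainCRF

# Two toy "sentences": 3 and 4 words, 5 features per word.
X = [np.random.rand(3, 5), np.random.rand(4, 5)]
# One integer label per word (e.g. 0 = O, 1 = B, 2 = I), one array per sentence.
y = [np.array([0, 1, 2]), np.array([0, 0, 1, 0])]

ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.1, max_iter=10)
ssvm.fit(X, y)
print(ssvm.predict(X))  # a list of label arrays, one per sentence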
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'UD-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'D-LCCRF', color="#FFD700", size=25)
fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_undirected = undirected_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_undirected, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm,
                y_pred_undirected, y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)         # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)         # Blue
        a.text(5, 14, abc[y_undirected], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)          # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

w = ssvm.w[26 * 8 * 16:].reshape(26, 26)
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)
fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = chain_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain,
                y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)    # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)    # Blue
        a.text(5, 14, abc[y_chain], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)     # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)
X = X[:100]
y = y[:100]

# Add edges
for i in range(X.shape[0]):
    X[i] = [X[i], np.vstack([(0, 1), (2, 2)])]

model = GraphCRF(directed=True, inference_method="max-product")

X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    X, y, test_size=0.5, random_state=0)

ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
print ssvm.score(X_test, y_test)
print ssvm.predict(X_test)
print y_test
'''
for i in range(X.shape[0]):
    X_train, X_test = X[]
    X_test = X[i]
    y_test = y[i]
    X_train = np.delete(X, i)
    y_train = np.delete(y, i)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)
    print ssvm.model
subgradient_svm.fit(X_train_bias, y_train)
time_subgradient_svm = time() - start
y_pred = np.hstack(subgradient_svm.predict(X_test_bias))

print("Score with pystruct subgradient ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_subgradient_svm))

# the standard one-vs-rest multi-class would probably be as good and faster
# but solving a different model
libsvm = LinearSVC(multi_class='crammer_singer', C=.1)
start = time()
libsvm.fit(X_train, y_train)
time_libsvm = time() - start

print("Score with sklearn and libsvm: %f (took %f seconds)"
      % (libsvm.score(X_test, y_test), time_libsvm))

start = time()
fw_bc_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
time_fw_bc_svm = time() - start
print("Score with pystruct frankwolfe block coordinate ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_fw_bc_svm))

start = time()
fw_batch_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_batch_svm.predict(X_test_bias))
time_fw_batch_svm = time() - start
print("Score with pystruct frankwolfe batch ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_fw_batch_svm))
y_test = preprocess_label(y_test)

### CS  : best c = 0.01
### Phy : best c = 0.005
### stat: best c = 0.005
'''
C = [0.005, 0.01, 0.02, 0.05, 0.1, 0.2]
score = {}
for i in C:
    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=i, max_iter=100)
    ssvm.fit(x_train, y_train)
    score[i] = ssvm.score(x_dev, y_dev)
print score
'''
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.005, max_iter=100)
ssvm.fit(x_train, y_train)
score = ssvm.score(x_test, y_test)
y_pred = ssvm.predict(x_test)

print 'Micro-averaged F1 score:', f1_score(get_one_list(y_test),
                                           get_one_list(y_pred),
                                           average='micro')
experiment_util.sequential_error_analysis(
    restore_label(y_test), restore_label(y_pred),
    './chaincrf_sequential_error_analysis')
def trainModel_Basic(num_iter=5, inference="qpbo", trainer="NSlack", num_train=2, num_test=1, C=0.1, edges="180x180_dist1_diag0", inputs=[1, 1, 1, 1, 1, 1], features="all", directed=False, savePred=False): padding = (30, 30, 30, 30) if directed == True: features += '+directed' resultsDir = os.getcwd() + '/CRFResults' nameLen = len(os.listdir(resultsDir)) edgeFeature = edges filename = str(nameLen) + '_CRF_iter_' + str( num_iter ) + "_" + inference + "_" + trainer + "_" + features + "_" + str( num_train) + "_" + str(num_test) + "_" + edgeFeature print "Loading training slices" start = time.clock() train = extractSlices2(train_path, num_train, padding, inputs=inputs) end = time.clock() train_load_time = (end - start) / 60.0 [trainLayers, trainTruth, sliceShape] = train print "Training slices loaded in %f" % (train_load_time) n_features = len(trainLayers[0][0, 0]) print "Layer shape is : " print trainLayers[0].shape print "Training the model" edges = np.load("/home/bmi/CRF/edges/" + edges + ".npy") G = [edges for x in trainLayers] print trainLayers[0].shape trainLayers = np.array([ x.reshape((sliceShape[0] * sliceShape[1], n_features)) for x in trainLayers ]) trainTruth = np.array([ x.reshape((sliceShape[0] * sliceShape[1], )).astype(int) for x in trainTruth ]) if inference == 'ogm': crf = GraphCRF(inference_method=('ogm', { 'alg': 'fm' }), directed=directed) else: crf = GraphCRF(inference_method=inference, directed=directed) if trainer == "Frank": svm = FrankWolfeSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=6, verbose=1) elif trainer == "NSlack": svm = NSlackSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=-1, verbose=1) else: svm = OneSlackSSVM(model=crf, max_iter=num_iter, C=C, n_jobs=-1, verbose=1) start = time.clock() asdf = zip(trainLayers, G) svm.fit(asdf, trainTruth) end = time.clock() train_time = (end - start) / 60.0 print "The training took %f" % (train_time) print "Model parameter size :" print svm.w.shape print "making predictions on train data" predTrain = svm.predict(asdf) trainDice = [] for i in range(len(trainLayers)): diceScore = accuracy(predTrain[i], trainTruth[i]) trainDice.append(diceScore) meanTrainDice = sum(trainDice) / len(trainLayers) del trainLayers, trainTruth ################################################################################################ overallDicePerPatient = [] # For overall test Dice extDicePerPatient = [] PatientTruthLayers = [] PatientPredLayers = [] PREC = [] RECALL = [] F1 = [] LayerwiseDiceTotal = [] testResultFile = open(os.getcwd() + "/CRFResults/" + filename + ".csv", 'a') testResultFile.write( "folderName,numLayers, Overall Dice, precision , recall, F1" + "\n") counter = 0 print "Loading the test slices" for folder in os.listdir(test_path): path = test_path + "/" + folder layerDiceScores = '' # print path data = extractTestSlices2(path, padding, inputs=inputs) if data != 0: [testLayers, testTruth, sliceShape, startSlice, endSlice] = data # trueTestLayers=testLayers GTest = [edges for x in testLayers] testLayers = np.array([ x.reshape((sliceShape[0] * sliceShape[1], n_features)) for x in testLayers ]) testTruth = np.array([ x.reshape((sliceShape[0] * sliceShape[1], )).astype(int) for x in testTruth ]) asdfTest = zip(testLayers, GTest) predTest = svm.predict(asdfTest) LayerwiseDice = [] for i in range(len(testLayers)): diceScore = accuracy(predTest[i], testTruth[i]) layerDiceScores += "," + str(diceScore) if math.isnan(diceScore): if sum(predTest[i]) == 0 and sum(testTruth[i]) == 0: LayerwiseDice.append(1.0) continue 
LayerwiseDice.append(diceScore) LayerwiseDiceTotal.append(LayerwiseDice) overallTestDice = accuracy(np.hstack(predTest), np.hstack(testTruth)) extDice = np.mean( np.array(LayerwiseDice) [range(10) + range(len(LayerwiseDice) - 10, len(LayerwiseDice))]) prec, recall, f1 = precision_score(np.hstack(testTruth), np.hstack(predTest)), recall_score( np.hstack(testTruth), np.hstack(predTest)), f1_score( np.hstack(testTruth), np.hstack(predTest)) print "Patient %d : Overall test DICE for %s is : %f and extDice is %f" % ( counter, folder, overallTestDice, extDice) print "Precision : %f Recall : %f F1 : %f " % (prec, recall, f1) print "__________________________________________" # testResultFile.write(folder+","+str(len(testLayers))+","+str(meanTestDice)+","+str(overallTestDice) ","+str(np.max(testDice)) +","+ str(np.min(testDice))+"\n" ) testResultFile.write(folder + "," + str(len(testLayers)) + "," + str(overallTestDice) + "," + str(prec) + "," + str(recall) + "," + str(extDice) + layerDiceScores + "\n") overallDicePerPatient.append(overallTestDice) extDicePerPatient.append(extDice) PREC.append(prec), RECALL.append(recall), F1.append(f1) PatientTruthLayers.append(testTruth) PatientPredLayers.append(predTest) counter += 1 if counter == num_test and num_test != -1: break ###################################################################################################### print "Done testing slices" overallDice = sum(overallDicePerPatient) / len(PatientTruthLayers) overallPrec = sum(PREC) / len(PatientTruthLayers) overallRecall = sum(RECALL) / len(PatientTruthLayers) overallExtDice = np.mean(extDicePerPatient) print "Overall DICE : %f Precision : %f Recall : %f extDice : %f " % ( overallDice, overallPrec, overallRecall, overallExtDice) print "############################################" # testOutput=np.array([PatientPredLayers,PatientTruthLayers,trueTestLayers]) testOutput = np.array([PatientPredLayers, PatientTruthLayers]) ########### Saving the models ###################################################################### # print "Saving the model" # modelDir = os.getcwd()+"/CRFModel/" # svmModel = open(modelDir+filename+"_model"+".pkl",'wb') # cPickle.dump(svm,svmModel,protocol=cPickle.HIGHEST_PROTOCOL) # svmModel.close() # # print "saving the predictions" # predFileTest = open(os.getcwd()+"/CRFPred/"+filename+"_pred.pkl",'wb') # cPickle.dump(testOutput,predFileTest,protocol=cPickle.HIGHEST_PROTOCOL) # predFileTest.close() layerDataLog = open(os.getcwd() + "/CRFModel/" + filename + "_layer.pkl", 'wb') cPickle.dump(LayerwiseDiceTotal, layerDataLog, protocol=cPickle.HIGHEST_PROTOCOL) layerDataLog.close() resultLog = os.getcwd() + "/CRFResults/TestResultFinal.csv" resultFile = open(resultLog, 'a') resultFile.write(time.ctime() + "," + str(num_iter) + "," + str(num_train) + "," + str(num_test) + "," + inference + "," + trainer + "," + str(C) + "," + str(train_time) + "," + str(meanTrainDice) + "," + str(overallDice) + "," + str(np.std(overallDicePerPatient)) + "," + edgeFeature + "," + "None" + "," + features + "," + filename + "," + str(overallPrec) + "," + str(overallRecall) + "," + str(overallExtDice) + "," + "Flair(5)+T2(9)-Without last 4 train Layers" + "\n") resultFile.close() testResultFile.close() return
def run_crf(w2v, words_before, words_after, shallow_parse): pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \ parse_summerscales.get_tokens_and_lbls( make_pmids_dict=True, sen=True) """ Create model """ model = ChainCRF(directed=False) ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30) all_pmids = pmids_dict.keys() n = len(all_pmids) n_folds = 5 kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds) fold_gi = [] for fold_idx, (train, test) in enumerate(kf): print("on fold %s" % fold_idx) train_pmids = [all_pmids[pmid_idx] for pmid_idx in train] test_pmids = [all_pmids[pmid_idx] for pmid_idx in test] print('loading data...') train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse) test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse) print('loaded data...') print 'training...' ssvm.fit(train_x, train_y) print ssvm.score(test_x, test_y) for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)): abstract_words, _, _= pmids_dict[pmid] print(pmid) # predict() takes in a list returns another list prediction = ssvm.predict([x]).pop(0) predicted = '' output = '' if len(prediction) > 0: for p in prediction: if p == 1: print "word: {}".format(abstract_words[p]) if n == 0: predicted += abstract_words[p] else: predicted += ' ' + abstract_words[p] if not predicted == '': output = 'predicted: {}'.format(predicted) else: output = 'Predicted nothing!' else: output = 'Predicted nothing!' print output
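# run_crf above uses the pre-0.18 scikit-learn cross-validation API
# (KFold(n, n_folds=...)). For reference, the equivalent split with the modern
# sklearn.model_selection API looks like this (a sketch; the pmid strings are
# stand-ins for the real keys of pmids_dict):
from sklearn.model_selection import KFold

all_pmids = ['pmid%d' % i for i in range(10)]
kf = KFold(n_splits=5, shuffle=True, random_state=1337)
for fold_idx, (train, test) in enumerate(kf.split(all_pmids)):
    train_pmids = [all_pmids[i] for i in train]
    test_pmids = [all_pmids[i] for i in test]
    print("on fold %s: %d train / %d test" % (fold_idx, len(train_pmids), len(test_pmids)))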
print _js_function # processed_function = process_function(_js_function) line_f = _js_function.replace('\n', " ") raw_tokens = tokenizer.init_processing_function(line_f) tr_sets = crfpredictor.generate_type1_prediction(raw_tokens) r_assert = ssvm.predict(tr_sets[0][0:1]) unit_test = assert_pre.unit_test_assembler(r_assert, raw_tokens, 2) response = Response(str(unit_test)) response.headers["content-type"] = "text/plain" return response if __name__ == '__main__': result = tokenizer.read_process_file() train_sets = crfpredictor.generate_type1_prediction(result) ssvm.fit(train_sets[0], train_sets[1]) result_assert = ssvm.predict(train_sets[0][0:1]) test = assert_pre.unit_test_assembler(result_assert, result, 2) for f in test: print f print result_assert app.run() def process_function(_js_function): pass
def classify(traincorpus, testcorpus): model = ChainCRF() ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10) pos_lexicon = load_lexicon("lexica/restaurants/ote/pos") term_lexicon = load_lexicon("lexica/restaurants/ote/term") pre1_lexicon = load_lexicon("lexica/restaurants/ote/prefix1") pre2_lexicon = load_lexicon("lexica/restaurants/ote/prefix2") pre3_lexicon = load_lexicon("lexica/restaurants/ote/prefix3") suf1_lexicon = load_lexicon("lexica/restaurants/ote/suffix1") suf2_lexicon = load_lexicon("lexica/restaurants/ote/suffix2") suf3_lexicon = load_lexicon("lexica/restaurants/ote/suffix3") train_sentences = [ ] #the list to be used to store our features for the words sentence_labels = [ ] #the list to be used for labeling if a word is an aspect term print('Creating train feature vectors...') #extracting sentences and appending them labels for instance in traincorpus.corpus: words = nltk.word_tokenize(instance.text) tags = nltk.pos_tag(words) tags_list = [] #the pos list for _, t in tags: tags_list.append(t) last_prediction = "" train_words = [] word_labels = [] for i, w in enumerate(words): word_found = False if words[i] == w: word_found = True pos_feats = [] previous_pos_feats = [] second_previous_pos_feats = [] next_pos_feats = [] second_next_pos_feats = [] morph_feats = [] term_feats = [] pre1_feats = [] pre2_feats = [] pre3_feats = [] suf1_feats = [] suf2_feats = [] suf3_feats = [] target_labels = [] train_word_features = [] #prefix of lengths 1,2,3 lexicon features for p1 in pre1_lexicon: if p1 == w[0]: pre1_feats.append(1) else: pre1_feats.append(0) for p2 in pre2_lexicon: if len(w) > 1: if p2 == w[0] + w[1]: pre2_feats.append(1) else: pre2_feats.append(0) else: pre2_feats.append(0) for p3 in pre3_lexicon: if len(w) > 2: if p3 == w[0] + w[1] + w[2]: pre3_feats.append(1) else: pre3_feats.append(0) else: pre3_feats.append(0) #suffix of lengths 1,2,3 lexicon features for s1 in suf1_lexicon: if s1 == w[-1]: suf1_feats.append(1) else: suf1_feats.append(0) for s2 in suf2_lexicon: if len(w) > 1: if s2 == w[-2] + w[-1]: suf2_feats.append(1) else: suf2_feats.append(0) else: suf2_feats.append(0) for s3 in suf3_lexicon: if len(w) > 2: if s3 == w[-3] + w[-2] + w[-1]: suf3_feats.append(1) else: suf3_feats.append(0) else: suf3_feats.append(0) #frequent term lexicon features for t in term_lexicon: if t == w.lower(): term_feats.append(1) else: term_feats.append(0) #morphological features if w[0].isupper(): #is first letter capital morph_feats.append(1) else: morph_feats.append(0) capitals = 0 lowers = 0 for letter in w: if letter.isupper(): capitals = capitals + 1 if letter.islower(): lowers = lowers + 1 if w[0].islower( ) and capitals > 0: #contains capitals, except 1st letter morph_feats.append(1) else: morph_feats.append(0) if capitals == len(w): #is all letters capitals morph_feats.append(1) else: morph_feats.append(0) if lowers == len(w): #is all letters lower morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"\d", w)) == len(w): #is all letters digits morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[.]", w)) > 0: #is there a '.' 
morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[-]", w)) > 0: #is there a '-' morph_feats.append(1) else: morph_feats.append(0) if len(re.findall( r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-' morph_feats.append(1) else: morph_feats.append(0) for p in pos_lexicon: #check the POS tag of the current word if tags_list[i] == p: pos_feats.append(1) else: pos_feats.append(0) #check the POS tag of the previous word (if the index is IN list's bounds) if (i - 1) >= 0: if tags_list[i - 1] == p: previous_pos_feats.append(1) else: previous_pos_feats.append(0) else: previous_pos_feats.append(0) #check the POS tag of the 2nd previous word (if the index is IN list's bounds) if (i - 2) >= 0: if tags_list[i - 2] == p: second_previous_pos_feats.append(1) else: second_previous_pos_feats.append(0) else: second_previous_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i + 1) < len(words): if tags_list[i + 1] == p: next_pos_feats.append(1) else: next_pos_feats.append(0) else: next_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i + 2) < len(words): if tags_list[i + 2] == p: second_next_pos_feats.append(1) else: second_next_pos_feats.append(0) else: second_next_pos_feats.append(0) #label the word, using IOB system, #B:start of aspect term, I:continue of aspect term, O: no aspect term term_found = False for aspect_term in set(instance.get_aspect_terms()): term_words = aspect_term.split() for term_index, term in enumerate(term_words): if (w.lower() == term) and (term_found is False): if term_index == 0: target_labels = [1] #1 is "B" last_prediction = "1" term_found = True else: if (last_prediction == "1") or (last_prediction == "2"): target_labels = [2] #2 is "I" last_prediction = "2" term_found = True else: target_labels = [0] last_prediction = "0" if term_found is False: target_labels = [0] #0 is "O" last_prediction = "0" train_word_features = [ pos_feats + previous_pos_feats + second_previous_pos_feats + next_pos_feats + second_next_pos_feats + morph_feats + term_feats + pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats ] if word_found is True: train_words.append(train_word_features) word_labels.append(target_labels) train_sentences_array = np.zeros( (len(train_words), len(train_words[0][0]))) index_i = 0 for word in train_words: index_j = 0 for features in word: for f in features: train_sentences_array[index_i, index_j] = f index_j = index_j + 1 index_i = index_i + 1 train_sentences.append(train_sentences_array) sentence_labels_array = np.zeros((len(word_labels))) index_i = 0 for label in word_labels: sentence_labels_array[index_i] = label[0] index_i = index_i + 1 sentence_labels.append(sentence_labels_array.astype(np.int64)) #the chain-crf needs a list (representing the sentences), that #contains a 2d-array(n_words, n_features), which in turn contains the #features extracted from each word. 
the sentence labels must be #an array of type int ssvm.fit(train_sentences, sentence_labels) print('Done!') print('Creating test feature vectors...') test_sentences = [] for instance in testcorpus.corpus: words = nltk.word_tokenize(instance.text) tags = nltk.pos_tag(words) tags_list = [] #the pos list for _, t in tags: tags_list.append(t) test_words = [] for i, w in enumerate(words): word_found = False if words[i] == w: word_found = True pos_feats = [] previous_pos_feats = [] second_previous_pos_feats = [] next_pos_feats = [] second_next_pos_feats = [] morph_feats = [] term_feats = [] pre1_feats = [] pre2_feats = [] pre3_feats = [] suf1_feats = [] suf2_feats = [] suf3_feats = [] test_word_features = [] #prefix 1,2,3 lexicon features for p1 in pre1_lexicon: if p1 == w[0]: pre1_feats.append(1) else: pre1_feats.append(0) for p2 in pre2_lexicon: if len(w) > 1: if p2 == w[0] + w[1]: pre2_feats.append(1) else: pre2_feats.append(0) else: pre2_feats.append(0) for p3 in pre3_lexicon: if len(w) > 2: if p3 == w[0] + w[1] + w[2]: pre3_feats.append(1) else: pre3_feats.append(0) else: pre3_feats.append(0) #suffix 1,2,3 lexicon features for s1 in suf1_lexicon: if s1 == w[-1]: suf1_feats.append(1) else: suf1_feats.append(0) for s2 in suf2_lexicon: if len(w) > 1: if s2 == w[-2] + w[-1]: suf2_feats.append(1) else: suf2_feats.append(0) else: suf2_feats.append(0) for s3 in suf3_lexicon: if len(w) > 2: if s3 == w[-3] + w[-2] + w[-1]: suf3_feats.append(1) else: suf3_feats.append(0) else: suf3_feats.append(0) #term lexicon features for t in term_lexicon: if t == w.lower(): term_feats.append(1) else: term_feats.append(0) #morphological features if w[0].isupper(): #is first letter capital morph_feats.append(1) else: morph_feats.append(0) capitals = 0 lowers = 0 for letter in w: if letter.isupper(): capitals = capitals + 1 if letter.islower(): lowers = lowers + 1 if w[0].islower( ) and capitals > 0: #contains capitals, except 1st letter morph_feats.append(1) else: morph_feats.append(0) if capitals == len(w): #is all letters capitals morph_feats.append(1) else: morph_feats.append(0) if lowers == len(w): #is all letters lower morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"\d", w)) == len(w): #is all letters digits morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[.]", w)) > 0: #is there a '.' 
morph_feats.append(1) else: morph_feats.append(0) if len(re.findall(r"[-]", w)) > 0: #is there a '-' morph_feats.append(1) else: morph_feats.append(0) if len(re.findall( r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-' morph_feats.append(1) else: morph_feats.append(0) for p in pos_lexicon: #check the POS tag of the current word if tags_list[i] == p: pos_feats.append(1) else: pos_feats.append(0) #check the POS tag of the previous word (if the index is IN list's bounds) if (i - 1) >= 0: if tags_list[i - 1] == p: previous_pos_feats.append(1) else: previous_pos_feats.append(0) else: previous_pos_feats.append(0) #check the POS tag of the 2nd previous word (if the index is IN list's bounds) if (i - 2) >= 0: if tags_list[i - 2] == p: second_previous_pos_feats.append(1) else: second_previous_pos_feats.append(0) else: second_previous_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i + 1) < len(words): if tags_list[i + 1] == p: next_pos_feats.append(1) else: next_pos_feats.append(0) else: next_pos_feats.append(0) #check the POS tag of the next word (if the index is IN list's bounds) if (i + 2) < len(words): if tags_list[i + 2] == p: second_next_pos_feats.append(1) else: second_next_pos_feats.append(0) else: second_next_pos_feats.append(0) test_word_features = [ pos_feats + previous_pos_feats + second_previous_pos_feats + next_pos_feats + second_next_pos_feats + morph_feats + term_feats + pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats ] if word_found is True: test_words.append(test_word_features) test_sentences_array = np.zeros( (len(test_words), len(test_words[0][0]))) index_i = 0 for word in test_words: index_j = 0 for features in word: for f in features: test_sentences_array[index_i, index_j] = f index_j = index_j + 1 index_i = index_i + 1 test_sentences.append(test_sentences_array) print('Done!') print('Predicting aspect terms...') predictions = ssvm.predict(test_sentences) #the predict function returns a list (symbolizing the sentences), #which contains a list that contains the predicted label for each word for sentence_index, sentence_predictions in enumerate(predictions): testcorpus.corpus[sentence_index].aspect_terms = [] predicted_term = "" last_prediction = "" for word_index, word_prediction in enumerate(sentence_predictions): if word_prediction == 1: if last_prediction == 1 or last_prediction == 2: start, end = find_offsets( testcorpus.corpus[sentence_index].text.lower(), predicted_term) testcorpus.corpus[sentence_index].add_aspect_term( term=predicted_term, offsets={ 'from': str(start), 'to': str(end) }) c = find_term(testcorpus.corpus[sentence_index].text.lower(), word_index) predicted_term = c last_prediction = 1 elif word_prediction == 2: if last_prediction == 1 or last_prediction == 2: c = find_term( testcorpus.corpus[sentence_index].text.lower(), word_index) if len(predicted_term) > 0: predicted_term = predicted_term + " " + c else: predicted_term = c last_prediction = 2 elif word_prediction == 0: if last_prediction == 1 or last_prediction == 2: start, end = find_offsets( testcorpus.corpus[sentence_index].text.lower(), predicted_term) testcorpus.corpus[sentence_index].add_aspect_term( term=predicted_term, offsets={ 'from': str(start), 'to': str(end) }) last_prediction = 0 print('Done!') return testcorpus.corpus
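# The comments inside classify() describe the ChainCRF input contract: a list of
# (n_words, n_features) float arrays, one per sentence, plus a matching list of int
# label arrays. A minimal sketch of just that contract with made-up features and
# labels (the sentence lengths, feature width and 3-label IOB scheme below are
# illustrative assumptions):
import numpy as np
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM

rng = np.random.RandomState(1)
X = [rng.rand(n, 4) for n in (5, 7, 6)]   # one (n_words, n_features) array per sentence
y = [np.arange(len(x)) % 3 for x in X]    # IOB-style labels 0/1/2, one int per word

ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.1, max_iter=10)
ssvm.fit(X, y)
print(ssvm.predict(X))  # a list of int label arrays, one per sentence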
list_x.append(np.array(x_1)) list_y.append(y) list_y.append(y_1) # crf = ChainCRF(inference_method='max-product') crf = ChainCRF(inference_method="max-product", directed=False) ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100) ssvm.fit(np.array(list_x), np.array(list_y)) test_x = np.array(list_x) test_y = np.array(list_y) # print np.array(list_x)[0].shape[1] x_test = [[1, 0, 0, 0], [1, 0, 1, 0]] list_x_test = list() list_x_test.append(x_test) pred = ssvm.predict(np.array(list_x_test)) # for value in pred: # print value # file_model = pickle.dumps(ssvm) # load_model = pickle.loads(file_model) joblib.dump(ssvm, "d:/filename.pkl") load_model = joblib.load("d:/filename.pkl") output = load_model.predict(np.array(list_x_test)) for value in output: print value
def main(): tweets_data_train = [] with open('Train\dataset_train.pkl', 'rb') as r: tweets_set = pickle.load(r) for i in range(0, len(tweets_set)): for j in range(0, len(tweets_set[i])): t = tweets_set[i][j][1].encode('ascii', 'ignore') tweets_data_train.append(t) features_train_transformed = get_extra_features(tweets_data_train) print(features_train_transformed.shape) features_train_transformed.dump("Train\extra_features_train.pkl") extra_features_train = numpy.load("Train\extra_features_train.pkl") print "EXTRA FEATURES FOR TRAIN DATA IS SUCCESSFULLY EXTRACTED" tweets_data_test = [] with open('Test\dataset_test.pkl', 'rb') as r: tweets_set = pickle.load(r) for i in range(0, len(tweets_set)): for j in range(0, len(tweets_set[i])): t = tweets_set[i][j][1].encode('ascii', 'ignore') tweets_data_test.append(t) features_test_transformed = get_extra_features(tweets_data_test) features_test_transformed.dump("Test\extra_features_test.pkl") extra_features_test = numpy.load("Test\extra_features_test.pkl") print "EXTRA FEATURES FOR TEST DATA IS SUCCESSFULLY EXTRACTED" #TFIDF VECTORIZER features_train_tfidf, features_test_tfidf = get_main_features( tweets_data_train, tweets_data_test) with open('Train\edges_train.pkl', 'rb') as e: edges_train = pickle.load(e) with open('Train\labels_train.pkl', 'rb') as l: labels_tr = pickle.load(l) with open('Test\edges_test.pkl', 'rb') as e: edges_test = pickle.load(e) with open('Test\labels_test.pkl', 'rb') as l: labels_te = pickle.load(l) #edges=numpy.array(edges) labels_tr = numpy.array(labels_tr) labels_te = numpy.array(labels_te) #labels_1D=numpy.zeros(1) labels_train = array_to_list(labels_tr) labels_test = array_to_list(labels_te) labels_test = numpy.array(labels_test) #labels_1D=numpy.delete(labels_1D,(0),0) """ selector=SelectPercentile(f_classif,percentile=70) selector.fit(features_train_tfidf,labels_1D) features_train_transformed=selector.transform(features_train_tfidf).toarray() features_test_transformed=selector.transform(features_test_tfidf).toarray() print "Features Selection is done successfully """ print features_test_tfidf.shape, extra_features_test.shape features_train_transformed = numpy.concatenate( (features_train_tfidf, extra_features_train), axis=1) features_test_transformed = numpy.concatenate( (features_test_tfidf, extra_features_test), axis=1) print "TFIDF FEATURES ARE SUCCESSFULLY CREATED" features_train = get_features_and_edges(features_train_transformed, edges_train) features_test = get_features_and_edges(features_test_transformed, edges_test) labels_train = numpy.array(labels_train) print labels_train.shape model_name = "GraphCRF_model" model = GraphCRF(directed=True) ssvm = FrankWolfeSSVM(model=model, C=1.0, max_iter=100, logger=SaveLogger(model_name + ".pickle", save_every=100)) start_time = time.time() final_model = ssvm.fit(features_train, labels_train) print("--- Time taken to train the classifier is %s seconds " % (time.time() - start_time)) print "YAAY ! 
A GRAPH CRF MODEL IS SUCCESSFULLY CREATED AND TRAINED" print("Charliehedbo event is the Test Data") pickle.dump(final_model, open('Saved_Model/sdqc_final_model.pkl', 'wb')) ssvm = pickle.load(open('Saved_Model/sdqc_final_model.pkl', 'rb')) #ssvm = SaveLogger(model_name+".pickle").load() X_test = [] y_test = [] for i in range(0, len(features_test)): if features_test[i][0].shape[0] >= 3: X_test.append(features_test[i]) y_test.append(labels_test[i]) #print X_test #print ("Accuracy score with Graph CRF : %f" % ssvm.score(X_test,y_test)) predictions = ssvm.predict(X_test) #PREDICTIONS AND y_TEST ARE LIST OF ARRAYS true = numpy.zeros(1) prediction = numpy.zeros(1) for i in range(0, len(predictions)): true = numpy.hstack((true, y_test[i])) prediction = numpy.hstack((prediction, predictions[i])) true = numpy.delete(true, (0), axis=0) prediction = numpy.delete(prediction, (0), axis=0) print "TOTAL", true.shape[0] print accuracy_score(true, prediction) with open('SDQC_Result.pkl', 'wb') as w: pickle.dump(prediction, w) print( classification_report( true, prediction, target_names=["support", "deny", "query", "comment"])) print confusion_matrix(true, prediction, labels=[0, 1, 2, 3]) plot_cmat(true, prediction)
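# The evaluation loop above flattens the per-graph predictions by hstack-ing onto a
# dummy zeros(1) array and deleting the first element afterwards. np.hstack over the
# lists does the same thing in one step; a small self-contained sketch with toy
# label arrays standing in for ssvm.predict output:
import numpy as np
from sklearn.metrics import accuracy_score

y_test = [np.array([0, 1, 2]), np.array([3, 0])]        # one label array per graph
predictions = [np.array([0, 1, 1]), np.array([3, 0])]   # ssvm.predict-style output
print(accuracy_score(np.hstack(y_test), np.hstack(predictions)))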
ssvm.fit(X_train, y_train) print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test)) print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test))) # plot some word sequenced n_words = 4 rnd = np.random.RandomState(1) selected = rnd.randint(len(y_test), size=n_words) max_word_len = max([len(y_) for y_ in y_test[selected]]) fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10)) fig.subplots_adjust(wspace=0) for ind, axes_row in zip(selected, axes): y_pred_svm = svm.predict(X_test[ind]) y_pred_chain = ssvm.predict([X_test[ind]])[0] for i, (a, image, y_true, y_svm, y_chain) in enumerate( zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain) ): a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys) a.text(0, 3, abc[y_true], color="#00AA00", size=25) a.text(0, 14, abc[y_svm], color="#5555FF", size=25) a.text(5, 14, abc[y_chain], color="#FF5555", size=25) a.set_xticks(()) a.set_yticks(()) for ii in range(i + 1, max_word_len): axes_row[ii].set_visible(False) plt.matshow(ssvm.w[26 * 8 * 16 :].reshape(26, 26)) plt.colorbar() plt.title("Transition parameters of the chain CRF.")
def main(train_SF, test_SF, model): #read files #print("Reading training and testing files") #train_SF = pd.read_csv('data/restaurants/train.csv',sep = '\t') #test_SF = pd.read_csv('data/restaurants/test.csv',sep = '\t') #vectors_filename = "vectors_yelp_200.txt" #filename = 'otemodel.sav' #for OTE model #for laptops #'data/laptops/laptop_train_ote.csv' #'data/laptops/laptop_test_ote.csv' #"gloveVec200.txt" #train_SF = pd.read_csv(trainF,sep = '\t') #test_SF = pd.read_csv(testF,sep = '\t') #vectors_filename = vecF filename = 'otemodel.sav' #for OTE model pos_lexicon = load_lexicon("lexica/pos") #Load word2vec text files #print "Loading word2vec file" #model = gensim.models.Word2Vec.load_word2vec_format(vectors_filename,binary=False) ndim = model.vector_size index2word_set = set(model.index2word) #Cleaning text print "Cleaning text" train_SF['cleanText'] = train_SF['text'].apply(review_to_words) test_SF['cleanText'] = test_SF['text'].apply(review_to_words) test_SF = test_SF[test_SF['cleanText'] != ''].reset_index(drop=True) train_SF = train_SF[train_SF['cleanText'] != ''].reset_index(drop=True) #Extracting vector features print "Extracting vector features" train_vec = [] test_vec = [] for i in range(0, len(train_SF)): train_vec.append( create_vector_features(train_SF['cleanText'][i], model)) for i in range(0, len(test_SF)): test_vec.append(create_vector_features(test_SF['cleanText'][i], model)) train_SF['vector_feats'] = train_vec test_SF['vector_feats'] = test_vec #Extracting morphological features print "Extracting morphological features" train_SF['morph_feats'] = train_SF['cleanText'].apply(create_morph_feats) test_SF['morph_feats'] = test_SF['cleanText'].apply(create_morph_feats) #Extracting POS features print "Extracting POS features" train_pos = [] test_pos = [] for index, row in train_SF.iterrows(): if (index % 1000 == 0): print "Train data - POS features extraction Progress :%d sentences done" % index train_pos.append(create_pos_feats(row['cleanText'], pos_lexicon)) for index, row in test_SF.iterrows(): if (index % 1000 == 0): print "Test data - POS features extraction Progress :%d sentences done" % index test_pos.append(create_pos_feats(row['cleanText'], pos_lexicon)) train_SF['pos_feats'] = train_pos test_SF['pos_feats'] = test_pos #Extracting previous,next Vector Features print "Extracting previous,next Vector features" previous_vector_feats_array = [] next_vector_feats_array = [] second_next_vector_feats_array = [] second_previous_vector_feats_array = [] for i in range(0, len(train_SF)): previous_vector_feats, next_vector_feats, second_next_vector_feats, second_previous_vector_feats = create_next_prev_vector_features( train_SF['cleanText'][i], model) previous_vector_feats_array.append(previous_vector_feats) next_vector_feats_array.append(next_vector_feats) second_next_vector_feats_array.append(second_next_vector_feats) second_previous_vector_feats_array.append(second_previous_vector_feats) train_SF['previous_vector_feats'] = previous_vector_feats_array train_SF['next_vector_feats'] = next_vector_feats_array train_SF[ 'second_previous_vector_feats'] = second_previous_vector_feats_array train_SF['second_next_vector_feats'] = second_next_vector_feats_array #create next prev vector features previous_vector_feats_array = [] next_vector_feats_array = [] second_next_vector_feats_array = [] second_previous_vector_feats_array = [] for i in range(0, len(test_SF)): previous_vector_feats, next_vector_feats, second_next_vector_feats, second_previous_vector_feats = 
create_next_prev_vector_features( test_SF['cleanText'][i], model) previous_vector_feats_array.append(previous_vector_feats) next_vector_feats_array.append(next_vector_feats) second_next_vector_feats_array.append(second_next_vector_feats) second_previous_vector_feats_array.append(second_previous_vector_feats) test_SF['previous_vector_feats'] = previous_vector_feats_array test_SF['next_vector_feats'] = next_vector_feats_array test_SF[ 'second_previous_vector_feats'] = second_previous_vector_feats_array test_SF['second_next_vector_feats'] = second_next_vector_feats_array #Extracting previous,next POS features print "Extracting previous,next POS features" pos_sent_prev_feats_array = [] pos_sent_next_feats_array = [] pos_sent_second_prev_feats_array = [] pos_sent_second_next_feats_array = [] for i in range(0, len(train_SF)): pos_sent_prev_feats, pos_sent_next_feats, pos_sent_second_prev_feats, pos_sent_second_next_feats = create_prev_pos_feats( train_SF['cleanText'][i], pos_lexicon) pos_sent_prev_feats_array.append(pos_sent_prev_feats) pos_sent_next_feats_array.append(pos_sent_next_feats) pos_sent_second_next_feats_array.append(pos_sent_second_next_feats) pos_sent_second_prev_feats_array.append(pos_sent_second_prev_feats) train_SF['pos_sent_prev_feats'] = pos_sent_prev_feats_array train_SF['pos_sent_next_feats'] = pos_sent_next_feats_array train_SF['pos_sent_second_prev_feats'] = pos_sent_second_prev_feats_array train_SF['pos_sent_second_next_feats'] = pos_sent_second_next_feats_array #for test file pos_sent_prev_feats_array = [] pos_sent_next_feats_array = [] pos_sent_second_prev_feats_array = [] pos_sent_second_next_feats_array = [] for i in range(0, len(test_SF)): pos_sent_prev_feats, pos_sent_next_feats, pos_sent_second_prev_feats, pos_sent_second_next_feats = create_prev_pos_feats( test_SF['cleanText'][i], pos_lexicon) pos_sent_prev_feats_array.append(pos_sent_prev_feats) pos_sent_next_feats_array.append(pos_sent_next_feats) pos_sent_second_next_feats_array.append(pos_sent_second_next_feats) pos_sent_second_prev_feats_array.append(pos_sent_second_prev_feats) test_SF['pos_sent_prev_feats'] = pos_sent_prev_feats_array test_SF['pos_sent_next_feats'] = pos_sent_next_feats_array test_SF['pos_sent_second_prev_feats'] = pos_sent_second_prev_feats_array test_SF['pos_sent_second_next_feats'] = pos_sent_second_next_feats_array print "Features extraction complete............" # Creating labels print "Creating labels.." labels_train = [] labels_test = [] for index, rev in train_SF.iterrows(): labels_train.append(create_labels(rev['cleanText'], rev['aspect term'])) for index, rev in test_SF.iterrows(): labels_test.append(create_labels(rev['cleanText'], rev['aspect term'])) train_SF['labels'] = labels_train test_SF['labels'] = labels_test test_SF = test_SF[test_SF['cleanText'] != ''] # Training CRF model... print "Training CRF model...." train_sentences, sentence_labels = create_features_array(train_SF) test_sentences, test_sentence_labels = create_features_array(test_SF) print "Parameter 'C' value selection...." best_C_val = pick_best_C_value(train_sentences, sentence_labels, test_SF, test_sentences, test_sentence_labels) print "C-value found : %f" % best_C_val modelCRF = ChainCRF() ssvm = FrankWolfeSSVM(model=modelCRF, C=best_C_val, max_iter=10, random_state=5) ssvm.fit(train_sentences, sentence_labels) print "Training complete...." predictions = ssvm.predict(test_sentences) test_SF['predicted_labels'] = predictions #Saving model print "Saving model...." 
pickle.dump(ssvm, open(filename, 'wb')) #Evaluating Trained CRF model print "" print " -------------- Evaluation Results --------------" predictions = ssvm.predict(train_sentences) train_SF['predicted_labels'] = predictions p, r, f1, common, retrieved, relevant = evaluating_ote(train_SF) print "--------- Train Set Results ---------" print "Precision : %f" % p print "Recall : %f" % r print "F1 measure : %f" % f1 print "" p, r, f1, common, retrieved, relevant = evaluating_ote(test_SF) print "--------- Test Set Results ---------" print "Precision : %f" % p print "Recall : %f" % r print "F1 measure : %f" % f1 print "" return f1
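# pick_best_C_value, used in the training step above, is project-specific and not
# shown here. A hedged sketch of what such a helper typically does -- fit a ChainCRF
# SSVM for a few candidate C values and keep the one that scores best on held-out
# sentences (the helper name, candidate grid and max_iter below are illustrative
# assumptions, not the original implementation):
import numpy as np
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM

def pick_c(train_X, train_y, dev_X, dev_y, candidates=(0.01, 0.1, 0.5, 1.0)):
    best_c, best_score = None, -np.inf
    for c in candidates:
        ssvm = FrankWolfeSSVM(model=ChainCRF(), C=c, max_iter=10)
        ssvm.fit(train_X, train_y)
        score = ssvm.score(dev_X, dev_y)  # mean per-word accuracy on the dev split
        if score > best_score:
            best_c, best_score = c, score
    return best_c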
def main(): default_train = \ scriptdir+'/../../../data/compression/googlecomp100.train.lbl' default_test = \ scriptdir+'/../../../data/compression/googlecomp.dev.lbl' parser = argparse.ArgumentParser() parser.add_argument('--threshold', '-t', type=float, help='Threshold for predicting 0/1. ') parser.add_argument('--iterations', '-i', type=int, default=50, help='Training iterations.') parser.add_argument('--data', '-d', default=default_train, help='Features and labels') parser.add_argument('--testdata', default=default_test, help='Test data (not needed for crossval).') parser.add_argument('--verbose', '-v', dest='verbose', action='store_true', help='Print avg. loss at every iter.') parser.add_argument('--output', '-o', help="Output file") parser.add_argument('--features', '-f', dest='features', default=[], type=str, nargs='+', help='Used feature types') parser.add_argument('--train', action='store_true', help='If set, will train the model') args = parser.parse_args() featurizer = edge_featurize.Featurizer(args.features) X, y = featurizer.fit_transform(default_train) crf = EdgeFeatureGraphCRF(inference_method="max-product") model = FrankWolfeSSVM(model=crf, C=.1, max_iter=args.iterations) model.fit(X, y) if args.testdata: X_te, y_te = featurizer.transform(args.testdata) pred = model.predict(X_te) pred_flat = [item for sublist in pred for item in sublist] y_te_flat = [item for sublist in y_te for item in sublist] if args.output: with open(args.output, 'w') as of: for sent_pred in pred: for lid in sent_pred: # print(lid) of.write('%s\n' % featurizer.mapper.id2label[lid]) of.write('\n') res = evaluate(pred_flat, y_te_flat) resout = "F1: %f, R: %f, A: %f, P: %f\n" % res print(resout)
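# edge_featurize.Featurizer is project code; what it has to produce for the model
# above are standard EdgeFeatureGraphCRF samples, i.e. (node_features, edges,
# edge_features) triples plus one int label array per sentence. A minimal sketch of
# that format with synthetic data (the shapes and the chain-shaped edge list are
# assumptions for illustration only):
import numpy as np
from pystruct.models import EdgeFeatureGraphCRF
from pystruct.learners import FrankWolfeSSVM

rng = np.random.RandomState(0)
X, y = [], []
for _ in range(4):
    nodes = rng.rand(5, 3)                               # (n_nodes, n_node_features)
    edges = np.array([[i, i + 1] for i in range(4)])     # (n_edges, 2)
    edge_feats = rng.rand(len(edges), 2)                 # (n_edges, n_edge_features)
    X.append((nodes, edges, edge_feats))
    y.append(np.arange(5) % 2)                           # one int label per node

crf = EdgeFeatureGraphCRF(inference_method="max-product")
model = FrankWolfeSSVM(model=crf, C=0.1, max_iter=20)
model.fit(X, y)
print(model.predict(X))  # a list of per-node label arrays, one per sentence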
rnd = np.random.RandomState(1) selected = rnd.randint(len(y_test), size=n_words) max_word_len = max([len(y_) for y_ in y_test[selected]]) fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10)) fig.subplots_adjust(wspace=0) fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25) fig.text(0.4, 0.05, 'NN', color="#5555FF", size=25) fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25) fig.text(0.8, 0.05, 'LCCRF+NN', color="#FFD700", size=25) fig.text(0.05, 0.5, 'Word', color="#000000", size=25) fig.text(0.5, 0.95, 'Letters', color="#000000", size=25) for ind, axes_row in zip(selected, axes): y_pred_nn = nn_predictions_test[ind].argmax(axis=1) y_pred_chain = chain_ssvm.predict([X_test[ind]])[0] y_pred_chain_nn = chain_ssvm_nn.predict([nn_predictions_test[ind]])[0] for i, (a, image, y_true, y_nn, y_chain, y_chain_nn) in enumerate( zip(axes_row, X_test[ind], y_test[ind], y_pred_nn, y_pred_chain, y_pred_chain_nn)): a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys) a.text(0, 3, abc[y_true], color="#00AA00", size=25) # Green a.text(0, 14, abc[y_nn], color="#5555FF", size=25) # Blue a.text(5, 14, abc[y_chain], color="#FF5555", size=25) # Red a.text(5, 3, abc[y_chain_nn], color="#FFD700", size=25) # Yellow a.set_xticks(()) a.set_yticks(()) for ii in range(i + 1, max_word_len): axes_row[ii].set_visible(False) w = chain_ssvm_nn.w[26 * 26:].reshape(26, 26)
format(len(train_index), len(test_index))) X_train = X[train_index] Y_train = Y[train_index] X_test = X[test_index] Y_test = Y[test_index] X_train_vector = np.reshape( X_train, (X_train.shape[0] * X_train.shape[1], X_train.shape[2])) X_test_vector = np.reshape( X_test, (X_test.shape[0] * X_test.shape[1], X_test.shape[2])) Y_train_vector = np.reshape(Y_train, (Y_train.shape[0] * Y_train.shape[1])) Y_test_vector = np.reshape(Y_test, (Y_test.shape[0] * Y_test.shape[1])) """ YOUR S-SVM TRAINING CODE HERE """ ssvm.fit(X_train, Y_train) """ LABEL THE TESTING SET AND PRINT RESULTS """ y_pred_ssvm = ssvm.predict(X_test) ssvm_score = ssvm.score(X_test, Y_test) scores_crf[fold] = ssvm_score wrong_segments_crf.append(np.size(Y_test) - np.sum(y_pred_ssvm == Y_test)) """ figure showing the result of classification of segments for each jacket in the testing part of present fold """ if plot_labeling: for ti, pred in zip(test_index, y_pred_ssvm): print(ti) print(pred) s = segments[ti] plot_segments(s, caption='SSVM predictions for jacket ' + str(ti + 1), labels_segments=pred) """ YOUR LINEAR SVM TRAINING CODE HERE """
C=C, max_iter=300, check_dual_every=50, line_search=False, verbose=True) # fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True) gfw_bc_svm = GeneralizedFrankWolfeSSVM(gmodel, C=C, max_iter=300, check_dual_every=50, line_search=False, verbose=True) if method == 'generalized': start = time() gfw_bc_svm.fit(X_train_bias, y_train) y_pred = np.hstack(gfw_bc_svm.predict(X_test_bias)) time_fw_bc_svm = time() - start print("Score with maxminsvm: %f , C=%f (took %f seconds)" % (np.mean(y_pred == y_test), C, time_fw_bc_svm)) pdb.set_trace() elif method == 'vanilla': start = time() fw_bc_svm.fit(X_train_bias, y_train) y_pred = np.hstack(fw_bc_svm.predict(X_test_bias)) time_fw_bc_svm = time() - start print("Score with cssvm: %f , C=%f (took %f seconds)" % (np.mean(y_pred == y_test), C, time_fw_bc_svm)) pdb.set_trace() # compute error
print('find training data') train_datas, train_labels, _ = self.get_datas(c_idxs, labels, mentions, retweets, bags) test_datas, test_labels, node_ids = self.get_datas(test_ids, labels, mentions, retweets, bags) if i == 0: x_test_ori, y_test_ori = test_datas, test_labels print(len(train_datas)) print(len(test_datas)) X_train, y_train = train_datas, train_labels model = GraphCRF(inference_method="max-product") ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10) ssvm.fit(X_train, y_train) y_preds = ssvm.predict(test_datas) result = ssvm.score(x_test_ori, y_test_ori) print('iter {} result = {}'.format(i, result)) count = 0 for clique_idx, clique in enumerate(y_preds): for node_idx, node in enumerate(clique): node_id = node_ids[clique_idx][node_idx] if node == central_propagation_df.iloc[node_id].values: clabels[int(node_id)] = node if not int(node_id) in c_idxs: c_idxs = np.append(c_idxs, int(node_id)) count += 1 print('iter {} update {} new labels'.format(i, count))