def train(n_queries=10, mode='boreholes'):
    datafile = paths.get_dataset_path(name, mode)
    df = pd.read_csv(datafile)
    df = df.loc[df['Content'] != '[]']  # drop rows with no extracted table content
    clf = Pipeline([
        ('list2str', FunctionTransformer(concat_tables)),
        # ('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.01)),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=0.0025)),  # min_df discourages overfitting
        ('cnb', ComplementNB(alpha=0.2))
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries, clf, datafile,
                                              limit_cols=limit_cols, mode=mode)
    model_loc = paths.get_model_path(name, mode)
    with open(model_loc, "wb") as file:
        pickle.dump(learner, file)
    return learner
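# A minimal, self-contained sketch of the TfidfVectorizer + ComplementNB stage of
# the pipeline above, on invented toy data. alpha mirrors the real pipeline, but
# min_df is left at its default because the toy corpus is tiny, and the sample
# strings and labels are illustrative assumptions only.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB

toy_X = ['depth from depth to lithology', 'hole id easting northing', 'sample id au ppm']
toy_y = ['lith_table', 'collar_table', 'assay_table']
toy_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('cnb', ComplementNB(alpha=0.2)),
])
toy_clf.fit(toy_X, toy_y)
print(toy_clf.predict(['hole id northing']))  # expected: ['collar_table']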
def train(n_queries=10, mode=paths.dataset_version, spec_name=name):
    datafile = paths.get_dataset_path(name, mode)
    model_file = paths.get_model_path(spec_name, mode)
    data = pd.read_csv(datafile)
    cols = list(limit_cols)  # copy so repeated calls don't grow the module-level list
    if 'no_toc' in model_file:
        # the no-TOC variant cannot rely on TOC-matching features
        cols.extend(['MatchesHeading', 'MatchesType'])
    estimator = Pipeline([
        ('text', ColumnTransformer([
            # column index 1 must be the 'Text' column; an int index is used
            # because active learning passes numpy arrays, not DataFrames
            ('cnb', Text2CNBPrediction(), 1)
        ], remainder="passthrough")),
        ('forest', RandomForestClassifier())
    ], verbose=True)
    accuracy, learner = active_learning.train(data, y_column, n_queries, estimator, datafile, cols)
    print(accuracy)
    with open(model_file, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
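# Usage sketch for the no-TOC variant: passing a spec_name whose resolved model
# path contains 'no_toc' drops the TOC-matching feature columns above. The
# spec_name value comes from this repo; n_queries=0 skips interactive querying.
if __name__ == '__main__':
    train(n_queries=0, spec_name='heading_id_intext_no_toc')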
def train(datafile=paths.get_dataset_path(name), model_file=paths.get_model_path(name)):
    data = pd.read_csv(datafile)
    X, Y = data_prep(data, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
        ('clf', ComplementNB(norm=True))
    ])
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # eli5 explain_weights/explain_prediction were used here during development
    # to inspect feature importances
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    report = classification_report(Y, clf.predict(X))  # report over the full dataset
    print(report)
    with open(paths.result_path + name + '_CNB_report.txt', "w") as r:
        r.write(report)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
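# Hedged usage sketch: reload the pickled classifier saved above and score a few
# heading strings. The sample strings are invented for illustration; the model
# path comes from the same paths helper used by train().
def example_heading_predict(model_file=paths.get_model_path(name)):
    with open(model_file, "rb") as f:
        clf = pickle.load(f)
    samples = ['1. introduction', 'appendix a - drill logs']
    return clf.predict(samples)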
def run_model(model_name, model_type='NN', mode=paths.dataset_version):
    nn = NeuralNetwork(model_name, model_type)
    # hand-written sample lines for a quick smoke test
    data = pd.Series(['page 3 of 8', 'bhp hello 3', 'epm3424 \t3 \tfebruary 1900',
                      'epm3424 \tpage 3 \tfebruary 1900', 'epm3424 page \t3 \tfebruary 1900',
                      'epm3424 page 3 \tfebruary 1900', 'epm34985 \t40', '8 \t9 \t10',
                      '8 may 1998 \treport 90', '3 \tbhp annual report'])
    r = nn.predict(data)
    print(r)
    all_predictions = False
    if all_predictions:
        # the dataset, not the pickled model, is the CSV to read here
        df = pd.read_csv(paths.get_dataset_path(name, mode))
        data = df.transformed
        r = nn.predict(data)
        correct = 0
        incorrect = 0
        for i, row in df.iterrows():
            print(row.original, ', ', r[i])
            if str(row.pagenum) != r[i]:
                incorrect += 1
            else:
                correct += 1
        print('real accuracy: ', correct / (correct + incorrect))
def train(n_queries=10, mode=paths.dataset_version):
    # need to define these here because mode may be production
    datafile = paths.get_dataset_path(name, mode)
    model_path = paths.get_model_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries, estimator, datafile,
                                              limit_cols=limit_cols)
    with open(model_path, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def predict(inputs, mode=mode):
    if isinstance(inputs, str):
        inputs = [inputs]
    with open(paths.get_model_path(name, mode), "rb") as file:
        model = pickle.load(file)
    pred = model.predict(inputs)
    proba = model.predict_proba(inputs)
    return pred, proba
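# Example call, assuming a trained model already exists for the current mode;
# the heading string is invented for illustration.
if __name__ == '__main__':
    labels, probabilities = predict('3.1 geology of the tenement area')
    print(labels, probabilities)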
def train_models_pt2():
    heading_id_toc_nn = heading_id_toc.NeuralNetwork()
    heading_id_toc_nn.train()
    # page_id_nn = page_identification.NeuralNetwork()
    # page_id_nn.train()
    heading_id_intext.train()
    # the no-TOC variant is selected via spec_name (train() has no model_file parameter)
    heading_id_intext.train(spec_name='heading_id_intext_no_toc')
def create_dataset(mode=paths.dataset_version):
    # the source of tagged lines; a pickled model path is not a readable CSV,
    # so the dataset path is the likely intent here
    sourcefile = paths.get_dataset_path(name, mode)
    texts = pd.read_csv(sourcefile)
    page_texts = texts.loc[texts.tag == 1]  # keep only lines tagged as page numbers
    page_texts = page_texts.drop(['tag'], axis=1)
    page_texts['pagenum'] = None  # to be filled in during tagging
    return page_texts
def run_model(mode=paths.production):
    nn = NeuralNetwork()
    model_loc = paths.get_model_path(name, mode=mode)
    nn.load_model_from_file(model_loc=model_loc)
    df = pd.read_csv(paths.get_dataset_path(name, mode=mode), usecols=['original'])
    # predict over the dataset itself so the indices line up with df below;
    # a hand-written smoke test can be substituted instead, e.g.
    # data = pd.Series(['page 8', 'bhp hello 3', '12 month report',
    #                   'epm3424 3 february 1900', 'epm23 february 2000', 'epm34985 4000'])
    p, r = nn.predict(df.original)
    for i, row in df.iterrows():
        print(row.original, ', ', p[i], ', ', r[i])
def classify(data, model_name, y_column, limit_cols, mode=paths.dataset_version):
    model_path = paths.get_model_path(model_name, mode)
    if not os.path.exists(model_path):
        # walk up the stack to the calling model module so its own train() can
        # be invoked (0: this function, 1: mlh.get_classified, 2: the model file)
        frame = inspect.stack()[2]
        module = inspect.getmodule(frame[0])
        module.train(n_queries=0, mode=mode, spec_name=model_name)
    with open(model_path, "rb") as file:
        model = joblib.load(file)
    if isinstance(data, pd.DataFrame) and y_column in data.columns:
        # dropping y_column via limit_cols is simpler than passing it to
        # data_prep, which would then also return y
        limit_cols.append(y_column)
    data = data_prep(data, limit_cols)
    pred = model.predict(data)
    proba = model.predict_proba(data)
    return pred, proba
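# Hedged usage sketch of classify(); the toy frame and its column values are
# illustrative assumptions and would need to match the named model's real dataset.
def example_classify():
    toy = pd.DataFrame({'DocID': [1], 'Text': ['page 3 of 8'], 'PageNum': [3],
                        'Marginal': [None]})
    return classify(toy, 'marginal_lines', 'Marginal', limit_cols=['DocID', 'Text'])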
def train(n_queries=10, mode=paths.dataset_version):
    datafile = paths.get_dataset_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries, estimator, datafile,
                                              mode=mode)
    if isinstance(learner, tree.DecisionTreeClassifier):
        # visualise the tree when a single decision tree was trained
        tree.plot_tree(learner, feature_names=include_cols, class_names=True)
        plt.show()
    with open(paths.get_model_path(name, mode), "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def train(n_queries=10, mode=paths.dataset_version):
    datafile = paths.get_dataset_path(name, mode)
    if not os.path.exists(datafile):
        data = create_dataset()
        data.to_csv(datafile, index=False)
    else:
        data = pd.read_csv(datafile)
    clf = RandomForestClassifier()  # previously tree.DecisionTreeClassifier()
    accuracy, clf = al.train(data, y_column, n_queries, clf, datafile, limited_cols)
    print(accuracy)
    model_file = paths.get_model_path(name, mode)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
def train(self, n_queries=10, mode=paths.dataset_version):
    file = paths.get_dataset_path(name, mode)
    df = pd.read_csv(file)
    # dataset columns: 'transformed' holds the text, y_column the binary tag
    self.max_words, self.max_len = check_maxlens(df)
    lstm = KerasClassifier(build_fn=self.LSTM, batch_size=self.batch_size, epochs=self.epochs,
                           validation_split=0.2)
    estimator = Pipeline([
        ('transform_text', FunctionTransformer(transform_text_wrapper)),
        ('transform2', Text2Seq(classes=2)),
        ('lstm', lstm)
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries, estimator, file,
                                              limit_cols=limit_cols)
    self.model = learner
    # the earlier manual Tokenizer/LSTM fit-and-evaluate flow is superseded by
    # the active-learning pipeline above
    self.model_loc = paths.get_model_path(name, mode)
    with open(self.model_loc, "wb") as f:
        pickle.dump(self.model, f)
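# Hypothetical sketch of what check_maxlens() could compute, inferred from how
# its outputs are used above: the vocabulary size and the longest line length in
# words. The 'transformed' column name follows this module's dataset; both the
# function body and the default column are assumptions, not the repo's code.
def check_maxlens_sketch(df, text_col='transformed'):
    tokenised = df[text_col].astype(str).str.split()
    vocab = {word for line in tokenised for word in line}
    return len(vocab), int(tokenised.str.len().max())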
from sklearn import ensemble
import pickle
import re

from report import active_learning, machine_learning_helper as mlh
from report import paths  # assumed import: paths is used below but was not imported

name = 'marginal_lines'
y_column = 'Marginal'
columns = ['DocID', 'PageNum', 'LineNum', 'NormedLineNum', 'Text', 'Words2Width', 'WordsWidth',
           'Width', 'Height', 'Left', 'Top', 'ContainsNum', 'ContainsTab', 'ContainsPage',
           'Centrality', y_column, 'TagMethod']
limit_cols = ['DocID', 'Text', 'LineNum']
include_cols = ['PageNum', 'NormedLineNum', 'Words2Width', 'WordsWidth', 'Width', 'Height',
                'Left', 'Top', 'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality']
estimator = ensemble.RandomForestClassifier()
data_path = paths.get_dataset_path(name)
model_path = paths.get_model_path(name)


def contains_num(string):
    # flag lines containing a standalone number
    if re.search(r'(\s|^)[0-9]+(\s|$)', string):
        return 1
    return 0


def contains_tab(string):
    if re.search(r'\t', string):
        return 1
    return 0


def contains_page(string):
    # body completed to mirror the sibling helpers: flag lines mentioning 'page'
    # (assumed intent, matching the ContainsPage feature column)
    if re.search(r'page', string, re.IGNORECASE):
        return 1
    return 0
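# Quick demonstration of the regex feature helpers on invented sample lines.
if __name__ == '__main__':
    for line in ['page 3 of 8', 'epm3424\t3', 'annual report']:
        print(line, contains_num(line), contains_tab(line), contains_page(line))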
def train(self, n_queries=10, mode=paths.dataset_version):
    file = paths.get_dataset_path(name, mode)
    df = pd.read_csv(file)
    self.Y = df['position']  # try the y position instead of the y value
    self.max_len = 20  # assume a line holds at most 20 words
    self.classes, y_vectorised = self.position2int()
    self.inv_classes = {v: k for k, v in self.classes.items()}
    # one-hot encode each label at the word index where the page number sits
    y_masked = np.zeros((self.Y.size, self.max_len))
    for row, label in zip(y_masked, y_vectorised):
        row[self.inv_classes[label]] = 1
    self.num_classes = len(self.classes)
    nn = KerasClassifier(build_fn=self.NN, batch_size=self.batch_size, epochs=self.epochs,
                         validation_split=0.2)
    clf = Pipeline([
        ('transform_text', FunctionTransformer(transform_text_wrapper)),
        ('transform', Text2Seq(classes=self.num_classes, pad_len=self.max_len)),
        ('nn', nn)
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries, clf, file,
                                              limit_cols=limit_cols)
    self.model = learner
    self.model_loc = paths.get_model_path(name, mode)
    # the earlier manual Tokenizer/NN fit-and-evaluate flow is superseded by the
    # active-learning pipeline above
    with open(self.model_loc, "wb") as f:
        pickle.dump(self.model, f)
    self.classes_loc = paths.get_model_path(name, mode, classes=True)
    joblib.dump(self.inv_classes, self.classes_loc)
    print("End of training stage. Re-run to train again")
    return accuracy
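# Toy illustration of the one-hot position mask built in train() above: each row
# of the mask flags the word index where the page number sits. Values invented.
import numpy as np

toy_positions = [0, 3, 1]  # word index of the page number in each sample line
toy_max_len = 5
toy_mask = np.zeros((len(toy_positions), toy_max_len))
for row, pos in zip(toy_mask, toy_positions):
    row[pos] = 1
print(toy_mask)  # one 1 per row, at indices 0, 3 and 1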