Example #1
def train(n_queries=10, mode='boreholes'):
    datafile = paths.get_dataset_path(name, mode)
    df = pd.read_csv(datafile)
    df = df.loc[df['Content'] != '[]']

    clf = Pipeline(
        [
            ('list2str', FunctionTransformer(concat_tables)),
            #('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.01)),
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=0.0025)),  # min_df discourages overfitting
            ('cnb', ComplementNB(alpha=0.2))
        ],
        verbose=True)
    accuracy, learner = active_learning.train(df,
                                              y_column,
                                              n_queries,
                                              clf,
                                              datafile,
                                              limit_cols=limit_cols,
                                              mode=mode)
    model_loc = paths.get_model_path(name, mode)

    with open(model_loc, "wb") as file:
        pickle.dump(learner, file)
    return learner
def train(
    n_queries=10,
    mode=paths.dataset_version,
    spec_name=name
):  #datafile=settings.get_dataset_path('heading_id_intext'), model_file=settings.get_model_path('heading_id_intext'),
    datafile = paths.get_dataset_path(name, mode)
    model_file = paths.get_model_path(spec_name, mode)
    data = pd.read_csv(datafile)
    if 'no_toc' in model_file:
        limit_cols.extend(['MatchesHeading', 'MatchesType'])

    estimator = Pipeline(
        [
            ('text', ColumnTransformer(
                # Column index 1 must select the 'Text' column; an integer index
                # is used because the active learner passes NumPy arrays rather
                # than DataFrames.
                [('cnb', Text2CNBPrediction(), 1)],
                remainder="passthrough")),
            ('forest', RandomForestClassifier())
        ],
        verbose=True)

    accuracy, learner = active_learning.train(data, y_column, n_queries,
                                              estimator, datafile, limit_cols)

    print(accuracy)
    with open(model_file, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
def train(datafile=paths.get_dataset_path(name),
          model_file=paths.get_model_path(name)
          ):  #settings.heading_classification_model_file):
    data = pd.read_csv(datafile)
    X, Y = data_prep(data, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2))),  #(token_pattern=r'([a-zA-Z]|[0-9])+')),
        ('clf', ComplementNB(norm=True))
    ])

    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #weights = eli5.formatters.as_dataframe.explain_weights_df(clf, feature_names=clf['tfidf'].get_feature_names(), top=10, target_names=y_test)
    #print(weights)
    #prediction = eli5.formatters.as_dataframe.explain_prediction_df(clf, X_test[0], feature_names=clf['tfidf'].get_feature_names(), target_names=y_test)
    #print(prediction)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    report = classification_report(Y, clf.predict(X))
    print(report)
    with open(paths.result_path + name + '_CNB_report.txt', "w") as r:
        r.write(report)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
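The same TF-IDF + ComplementNB idea, reduced to a self-contained sketch on toy headings (all data and labels below are illustrative, not from the project dataset):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

# Toy heading texts: 1 = numbered section heading, 0 = other line.
X = pd.Series(['1 introduction', 'references', 'appendix a drill logs',
               '2 geology of the area', '3 results and discussion', 'acknowledgements'])
Y = pd.Series([1, 0, 0, 1, 1, 0])

clf = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
    ('clf', ComplementNB(norm=True)),
])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,
                                                    random_state=0, stratify=Y)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))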
Example #4
def run_model(model_name, model_type='NN', mode=paths.dataset_version):
    nn = NeuralNetwork(model_name, model_type)
    data = pd.Series(['page 3 of 8',
                      'bhp hello 3',
                      'epm3424 \t3 \tfebruary 1900',
                      'epm3424 \tpage 3 \tfebruary 1900',
                      'epm3424 page \t3 \tfebruary 1900',
                      'epm3424 page 3 \tfebruary 1900',
                      'epm34985 \t40',
                      '8 \t9 \t10',
                      '8 may 1998 \treport 90',
                      '3 \tbhp annual report'])
    #trans_data = data.apply(lambda x: transform_text(x, transform_all=False))
    r = nn.predict(data)
    print(r)

    all_predictions = False
    if all_predictions:
        df = pd.read_csv(paths.get_dataset_path(name, mode))
        #df = pd.read_csv(paths.page_extraction_dataset)
        data = df.transformed
        r = nn.predict(data)
        correct = 0
        incorrect = 0
        for i, row in df.iterrows():

            print(row.original, ', ', r[i])
            if str(row.pagenum) != r[i]:
                incorrect += 1
            else:
                correct += 1
        print('real accuracy: ', correct/(correct+incorrect))
def train(n_queries=10, mode=paths.dataset_version):  # formerly also: datafile=data_path, model='forest'
    datafile = paths.get_dataset_path(name, mode)  # need to define these here because mode may be production
    model_path = paths.get_model_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries, estimator, datafile, limit_cols=limit_cols)
    with open(model_path, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def predict(inputs, mode=mode):
    if isinstance(inputs, str):
        inputs = [inputs]

    with open(paths.get_model_path(name, mode), "rb") as file:
        #with open(paths.heading_classification_model_file, "rb") as file:
        model = pickle.load(file)
    pred = model.predict(inputs)
    proba = model.predict_proba(inputs)
    return pred, proba  #, model
def train_models_pt2():
    heading_id_toc_nn = heading_id_toc.NeuralNetwork()
    heading_id_toc_nn.train()

    # page_id_nn = page_identification.NeuralNetwork()
    # page_id_nn.train()

    heading_id_intext.train()
    heading_id_intext.train(
        model_file=paths.get_model_path('heading_id_intext_no_toc'))
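Every module here persists its fitted learner with pickle and reloads it in predict(). A minimal round-trip sketch of that pattern on a throwaway pipeline (file name and toy data are illustrative):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

clf = Pipeline([('tfidf', TfidfVectorizer()), ('cnb', ComplementNB())])
clf.fit(['page 3 of 8', 'annual report for epm3424', 'drilling results summary'],
        [1, 0, 0])

with open('toy_model.pkl', 'wb') as f:   # mirrors the open(..., "wb") + pickle.dump calls above
    pickle.dump(clf, f)
with open('toy_model.pkl', 'rb') as f:   # mirrors predict()'s open(..., "rb") + pickle.load
    model = pickle.load(f)

print(model.predict(['page 7']), model.predict_proba(['page 7']))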
Example #8
def create_dataset(mode=paths.dataset_version):
    sourcefile = paths.get_model_path(name, mode)
    texts = pd.read_csv(sourcefile)
    page_texts = texts.loc[texts.tag == 1]
    #page_texts.transformed = page_texts.original.apply(lambda x: transform_text(x, transform_all=False))

    page_texts = page_texts.drop(['tag'], axis=1)
    page_texts['pagenum'] = None

    #page_texts.to_csv(settings.page_extraction_dataset, index=False)
    return page_texts
def run_model(mode=paths.production):
    nn = NeuralNetwork()
    model_loc = paths.get_model_path(name, mode=mode)
    nn.load_model_from_file(model_loc=model_loc)
    df = pd.read_csv(paths.get_dataset_path(name, mode=mode), usecols=['original'])
    #df = pd.read_csv(paths.marginals_id_trans_dataset, usecols=['original'])
    #data = df.original
    data = pd.Series(['page 8', 'bhp hello 3', '12 month report', 'epm3424 3 february 1900',
                                 'epm23 february 2000', 'epm34985 4000'])
    p, r = nn.predict(data)

    # print each input line alongside the model's outputs for quick inspection
    for i, text in enumerate(data):
        print(text, ', ', p[i], ', ', r[i])
def classify(data, model_name, y_column, limit_cols, mode=paths.dataset_version):
    #frame = inspect.stack()[9]  # 9 only works if this function is called from get_classified; 1 if called from a model file
    model_path = paths.get_model_path(model_name, mode)
    if not os.path.exists(model_path):
        frame = inspect.stack()[2]  # 0: this, 1: mlh.get_classified, 2: model file
        module = inspect.getmodule(frame[0])  # gets the module this function was called from so the correct training function can be invoked
        module.train(n_queries=0, mode=mode, spec_name=model_name) #datafile=settings.get_dataset_path(model_name, mode), model_file=model_path)
    with open(model_path, "rb") as file:
        model = joblib.load(file)
    if isinstance(data, pd.DataFrame) and y_column in data.columns:
        limit_cols.append(y_column)  #better than passing y_column to data prep to be removed because then y will also be returned
    data = data_prep(data, limit_cols)
    pred = model.predict(data)
    proba = model.predict_proba(data)
    #print(proba)
    return pred, proba
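classify() finds the model module two frames up the call stack so it can trigger that module's train() when no pickled model exists yet. A small self-contained sketch of the stack-inspection idea (function names here are made up for illustration):

import inspect

def lazily_train_if_needed():
    # frame 0 is this function, frame 1 is whoever called it
    caller_frame = inspect.stack()[1]
    caller_module = inspect.getmodule(caller_frame[0])
    return caller_module.__name__ if caller_module else None

def some_model_module_function():
    return lazily_train_if_needed()

print(some_model_module_function())  # e.g. '__main__' when run as a script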
Example #11
def train(n_queries=10, mode=paths.dataset_version):  # formerly also: datafile=data_path
    datafile = paths.get_dataset_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data,
                                              y_column,
                                              n_queries,
                                              estimator,
                                              datafile,
                                              mode=mode)
    if isinstance(learner, tree.DecisionTreeClassifier):
        tree.plot_tree(learner, feature_names=include_cols, class_names=True)
        plt.show()
    with open(paths.get_model_path(name, mode), "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
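When active learning hands back a plain decision tree, the branch above plots it for inspection. A self-contained sketch of sklearn's plot_tree on a standard dataset (dataset and depth are illustrative choices):

import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.datasets import load_iris

iris = load_iris()
clf = tree.DecisionTreeClassifier(max_depth=2).fit(iris.data, iris.target)

# feature_names/class_names label the nodes, as in the call above
tree.plot_tree(clf, feature_names=iris.feature_names, class_names=list(iris.target_names))
plt.show()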
Example #12
def train(
    n_queries=10,
    mode=paths.dataset_version
):  # datafile=settings.get_dataset_path(name), model_file=settings.get_model_path(name),
    datafile = paths.get_dataset_path(name, mode)
    if not os.path.exists(datafile):
        data = create_dataset()
        data.to_csv(datafile, index=False)
    else:
        data = pd.read_csv(datafile)
    clf = RandomForestClassifier()  #tree.DecisionTreeClassifier()
    accuracy, clf = al.train(data, y_column, n_queries, clf, datafile,
                             limited_cols)
    print(accuracy)
    model_file = paths.get_model_path(name, mode)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
    def train(self, n_queries=10, mode=paths.dataset_version):  #settings.marginals_id_trans_dataset):
        file = paths.get_dataset_path(name, mode)
        df = pd.read_csv(file)
        #self.X = df['transformed']
        #self.Y = df['tag']
        self.max_words, self.max_len = check_maxlens(df)

        lstm = KerasClassifier(build_fn=self.LSTM, batch_size=self.batch_size, epochs=self.epochs,
                               validation_split=0.2)

        estimator = Pipeline([
            #('transform1', ColumnTransformer([
            ('transform_text', FunctionTransformer(transform_text_wrapper)),  # 0)
            #    ], remainder="passthrough")),
            ('transform2', Text2Seq(classes=2)),
            ('lstm', lstm)
        ], verbose=True)

        accuracy, learner = active_learning.train(df, y_column, n_queries, estimator, file, limit_cols=limit_cols)
        self.model = learner
        # self.tok = Tokenizer(num_words=self.max_words+1) # only num_words-1 will be taken into account!
        # self.model = self.LSTM()
        #
        # X_train, X_test, Y_train, Y_test = train_test_split(self.X, self.Y, test_size=0.15)
        #
        # self.tok.fit_on_texts(X_train)
        # sequences = self.tok.texts_to_sequences(X_train)
        # sequences_matrix = sequence.pad_sequences(sequences, maxlen=self.max_len)
        # y_binary = to_categorical(Y_train)
        # self.model.summary()
        # self.model.fit(sequences_matrix, y_binary, batch_size=self.batch_size, epochs=self.epochs,
        #           validation_split=0.2) #, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
        #
        # test_sequences = self.tok.texts_to_sequences(X_test)
        # test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=self.max_len)
        #
        # accr = self.model.evaluate(test_sequences_matrix, to_categorical(Y_test))
        # print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
        self.model_loc = paths.get_model_path(name, mode)
        #self.model.save(self.model_loc)
        #joblib.dump(self.tok, self.tok_loc)
        with open(self.model_loc, "wb") as f:
            pickle.dump(self.model, f)
from sklearn import ensemble
import pickle
import re
from report import active_learning, machine_learning_helper as mlh


name = 'marginal_lines'
y_column = 'Marginal'
columns = ['DocID', 'PageNum', 'LineNum', 'NormedLineNum','Text', 'Words2Width', 'WordsWidth', 'Width', 'Height',
           'Left', 'Top', 'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality', y_column, 'TagMethod']
limit_cols=['DocID', 'Text', 'LineNum']
include_cols = ['PageNum', 'NormedLineNum', 'Words2Width', 'WordsWidth', 'Width', 'Height', 'Left', 'Top',
                'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality']
estimator = ensemble.RandomForestClassifier()
data_path = paths.get_dataset_path(name)
model_path = paths.get_model_path(name)


def contains_num(string):
    if re.search(r'(\s|^)[0-9]+(\s|$)', string):
        return 1
    return 0


def contains_tab(string):
    if re.search(r'\t', string):
        return 1
    return 0


def contains_page(string):
    # body assumed by analogy with contains_num/contains_tab: flag lines mentioning "page"
    if re.search(r'page', string, re.IGNORECASE):
        return 1
    return 0
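A quick illustrative check of the line-feature helpers above (run alongside those definitions; contains_page's behaviour assumes the body sketched in for it):

print(contains_num('page 3 of 8'))   # 1: a standalone number is present
print(contains_num('epm3424'))       # 0: the digits are glued to letters
print(contains_tab('8\t9\t10'))      # 1: contains tab characters
print(contains_page('Page 3 of 8'))  # 1: mentions the word "page"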
Example #15
    def train(self, n_queries=10, mode=paths.dataset_version):  #settings.page_extraction_dataset):
        file = paths.get_dataset_path(name, mode)
        df = pd.read_csv(file)
        #self.X = df['transformed']  # self.X only exists here to get proper maxlens
        self.Y = df['position']         # try with y position instead of y value
        #transform = FunctionTransformer(lambda x: num2word(x))  # not sure this will work
        #self.X = self.X.apply(lambda x: num2word(x))
        #self.max_words, self.max_len = check_maxlens(self.X)
        self.max_len = 20  # assuming max num of words in line will be 20
        self.classes, y_vectorised = self.position2int()
        self.inv_classes = {v: k for k, v in self.classes.items()}
        y_masked = np.zeros((self.Y.size, self.max_len))
        for i, j in zip(y_masked, y_vectorised):
            p = self.inv_classes[j]
            i[p] = 1

        self.num_classes = len(self.classes.items())
        nn = KerasClassifier(build_fn=self.NN, batch_size=self.batch_size, epochs=self.epochs, validation_split=0.2)

        clf = Pipeline([
            ('transform_text', FunctionTransformer(transform_text_wrapper)),
            #('num2word', FunctionTransformer(lambda x: num2word(x))),
            ('transform', Text2Seq(classes=self.num_classes, pad_len=self.max_len)),
            ('nn', nn)
        ], verbose=True)

        accuracy, learner = active_learning.train(df, y_column, n_queries, clf, file, limit_cols=limit_cols)
        self.model = learner
        self.model_loc = paths.get_model_path(name, mode)
        # self.tok = Tokenizer(num_words=self.max_words+1) # only num_words-1 will be taken into account!

        # if self.mode_type == 'LSTM':
        #     self.model = self.LSTM()
        # else:
        # self.model = self.NN()
        #
        # X_train, X_test, Y_train, Y_test = train_test_split(self.X, y_masked, test_size=0.15)
        #
        # self.tok.fit_on_texts(self.X)
        # sequences = self.tok.texts_to_sequences(X_train)
        # sequences_matrix = sequence.pad_sequences(sequences, maxlen=self.max_len)
        # #y_binary = to_categorical(Y_train) # y needs to be onehot encoded
        #
        # self.model.summary()
        # self.model.fit(sequences_matrix, Y_train, batch_size=self.batch_size, epochs=self.epochs,
        #           validation_split=0.2) #, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
        #
        # test_sequences = self.tok.texts_to_sequences(X_test)
        # test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=self.max_len)
        #
        # accr = self.model.evaluate(test_sequences_matrix, Y_test)
        # print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
        #self.model.save(self.model_loc)
        with open(self.model_loc, "wb") as f:
            pickle.dump(self.model, f)
        self.classes_loc = paths.get_model_path(name, mode, classes=True)  #self.model_path + self.model_name + 'class_dict.joblib'
        #joblib.dump(self.tok, self.tok_loc)
        joblib.dump(self.inv_classes, self.classes_loc)
        print("End of training stage. Re-run to train again")
        return accuracy
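The y-masking loop above turns each integer page-number position into a one-hot row of length max_len. The same step as a self-contained NumPy sketch (values are illustrative):

import numpy as np

max_len = 20                      # assumed maximum words per line, as above
positions = np.array([3, 0, 7])   # toy 'position' labels
y_masked = np.zeros((positions.size, max_len))
y_masked[np.arange(positions.size), positions] = 1

print(y_masked.sum(axis=1))  # each row has exactly one active position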