def train(datafile=paths.get_dataset_path(name),
          model_file=paths.get_model_path(name)
          ):  #settings.heading_classification_model_file):
    data = pd.read_csv(datafile)
    X, Y = data_prep(data, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2))),  #(token_pattern=r'([a-zA-Z]|[0-9])+')),
        ('clf', ComplementNB(norm=True))
    ])

    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #weights = eli5.formatters.as_dataframe.explain_weights_df(clf, feature_names=clf['tfidf'].get_feature_names(), top=10, target_names=y_test)
    #print(weights)
    #prediction = eli5.formatters.as_dataframe.explain_prediction_df(clf, X_test[0], feature_names=clf['tfidf'].get_feature_names(), target_names=y_test)
    #print(prediction)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    report = classification_report(Y, clf.predict(X))  # report over the full dataset, not just the held-out test split
    print(report)
    with open(paths.result_path + name + '_CNB_report.txt', "w") as r:
        r.write(report)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
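
# A minimal, self-contained sketch of the TfidfVectorizer + ComplementNB pipeline used in
# train() above, fitted on toy headings; the strings, labels and names below are
# illustrative only, not project data.
def _cnb_pipeline_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import ComplementNB
    from sklearn.pipeline import Pipeline

    toy_x = ['1 introduction', '2.1 drilling results', 'appendix a references', '4 conclusions']
    toy_y = ['numbered', 'numbered', 'unnumbered', 'numbered']
    toy_clf = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
        ('clf', ComplementNB(norm=True))
    ])
    toy_clf.fit(toy_x, toy_y)
    return toy_clf.predict(['3 summary of results'])
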
Example #2
def sanitise_datasets():
    rtitle = 'QGMJ'
    rtype = 'WELCOM'
    ref = pd.read_excel(
        'C:/Users/andraszeka/Documents/gsq-boreholes/investigations/QDEX_metada_export.xlsx',
        dtype={'REPNO': int})
    bad = ref.loc[ref.RTITLE.str.contains(rtitle)
                  | ref.RTYPE.str.contains(rtype)]
    bad_docids = bad.REPNO.values
    names = [
        'marginal_lines', 'toc', 'fig', 'heading_id_toc', 'heading_id_intext'
    ]  # page id and page extraction datasets don't have DocID attribute
    datasets = [paths.get_dataset_path(name) for name in names]
    for dataset in datasets:
        if os.path.exists(dataset):
            try:
                data = pd.read_csv(dataset, dtype={'DocID': int})
            except ValueError:
                data = pd.read_csv(dataset)
                data.dropna(subset=['DocID'], inplace=True)
                data.DocID = data.DocID.astype(int)
            prelen = data.shape[0]
            data = data.loc[~data.DocID.isin(bad_docids)]
            postlen = data.shape[0]
            data.to_csv(dataset, index=False)
            print('Removed ', str(prelen - postlen), ' bad values from ',
                  dataset)
def train(
    n_queries=10,
    mode=paths.dataset_version,
    spec_name=name
):  #datafile=settings.get_dataset_path('heading_id_intext'), model_file=settings.get_model_path('heading_id_intext'),
    datafile = paths.get_dataset_path(name, mode)
    model_file = paths.get_model_path(spec_name, mode)
    data = pd.read_csv(datafile)
    if 'no_toc' in model_file:
        limit_cols.extend(['MatchesHeading', 'MatchesType'])  # note: this mutates the module-level limit_cols in place

    estimator = Pipeline([
        ('text', ColumnTransformer([
            # column index 1 must be the 'Text' column; an integer index is used here
            # because the active learner passes numpy arrays rather than DataFrames
            ('cnb', Text2CNBPrediction(), 1)
        ], remainder="passthrough")),
        ('forest', RandomForestClassifier())
    ], verbose=True)

    accuracy, learner = active_learning.train(data, y_column, n_queries,
                                              estimator, datafile, limit_cols)

    print(accuracy)
    with open(model_file, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
Example #4
def train(n_queries=10, mode='boreholes'):
    datafile = paths.get_dataset_path(name, mode)
    df = pd.read_csv(datafile)
    df = df.loc[df['Content'] != '[]']

    clf = Pipeline([
        ('list2str', FunctionTransformer(concat_tables)),
        #('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.01)),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=0.0025)),  # min_df discourages overfitting
        ('cnb', ComplementNB(alpha=0.2))
    ], verbose=True)
    accuracy, learner = active_learning.train(df,
                                              y_column,
                                              n_queries,
                                              clf,
                                              datafile,
                                              limit_cols=limit_cols,
                                              mode=mode)
    model_loc = paths.get_model_path(name, mode)

    with open(model_loc, "wb") as file:
        pickle.dump(learner, file)
    return learner
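
# concat_tables is defined elsewhere in the project; a hypothetical stand-in is sketched
# below to illustrate what the 'list2str' step has to do: flatten each row's collection of
# table values into one string so TfidfVectorizer can consume it. The real implementation
# may differ (e.g. the CSV stores Content as stringified lists).
def concat_tables_sketch(column):
    # join every cell (an iterable of table values) into one space-separated string
    return [' '.join(str(v) for v in cell) for cell in column]
# e.g. concat_tables_sketch([['DEPTH', 'FROM', 'TO'], ['LITHOLOGY']]) -> ['DEPTH FROM TO', 'LITHOLOGY']
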
def create_dataset():
    sourcefile = paths.get_dataset_path('heading_id_intext')
    df = pd.read_csv(sourcefile)
    df = df.loc[df['Heading'] > 0]
    df = df.drop(
        columns=['Heading', 'MatchesHeading', 'MatchesType', 'MatchesI'])
    df['HeadingClass'] = ''
    return df
def train(n_queries=10, mode=paths.dataset_version): #, model='forest') datafile=data_path,
    datafile = paths.get_dataset_path(name, mode)  # need to define these here because mode may be production
    model_path = paths.get_model_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries, estimator, datafile, limit_cols=limit_cols)
    with open(model_path, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def save_dataset(df, name):
    path = paths.get_dataset_path(name)
    if not os.path.exists(paths.dataset_path + '/' + paths.dataset_version):
        os.mkdir(paths.dataset_path + '/' + paths.dataset_version)
    if not os.path.exists(path):
        df.to_csv(path, index=False)
    else:
        print(
            'Dataset already exists here. To prevent overwriting annotation, delete it manually first.'
        )
def create_training_sets_pt2():
    proc_df = heading_id_toc.pre_process_id_dataset(
        datafile=paths.get_dataset_path('heading_id_toc'))
    save_dataset(proc_df, 'processed_heading_id_toc')

    # page_id_df = page_identification.create_dataset()
    # save_dataset(page_id_df, 'page_id')

    heading_id_intext_df = heading_id_intext.create_dataset()
    save_dataset(heading_id_intext_df, 'heading_id_intext')
def create_dataset():
    sourcefile = paths.get_dataset_path('marginal_lines')
    texts = pd.read_csv(sourcefile, usecols=['Text', 'Marginal'])
    texts = texts.loc[texts['Marginal'] > 0]
    new_texts = pd.DataFrame(columns=columns)
    new_texts['original'] = texts['Text']
    new_texts['transformed'] = texts.Text.apply(lambda x: transform_text(x))
    new_texts['tag'] = None
    #print(new_text)
    #new_text.to_csv(settings.marginals_id_trans_dataset, index=False)
    return new_texts
def run_model(mode=paths.production):
    nn = NeuralNetwork()
    model_loc = paths.get_model_path(name, mode=mode)
    nn.load_model_from_file(model_loc=model_loc)
    df = pd.read_csv(paths.get_dataset_path(name, mode=mode), usecols=['original'])
    #df = pd.read_csv(paths.marginals_id_trans_dataset, usecols=['original'])
    #data = df.original
    data = pd.Series(['page 8', 'bhp hello 3', '12 month report', 'epm3424 3 february 1900',
                      'epm23 february 2000', 'epm34985 4000'])
    p, r = nn.predict(data)  #.original)

    # p and r are indexed against the hard-coded data series above, not df, so iterate
    # over data when printing the predictions
    for i, text in data.items():
        print(text, ', ', p[i], ', ', r[i])
Example #11
def create_dataset():
    df = pd.DataFrame(columns=columns)
    pageinfos = sorted(glob.glob('training/restructpageinfo/*.json'))

    for pagesinfo in pageinfos:
        with open(pagesinfo) as f:
            pi = json.load(f)
        #docset = np.zeros((len(pi.items()), 11))
        docid = pagesinfo.split('\\')[-1].replace('_1_restructpageinfo.json', '')
        docset = write_to_dataset(pi, docid)
        pgdf = pd.DataFrame(data=docset, columns=columns)
        df = df.append(pgdf, ignore_index=True)

    prev_dataset = paths.get_dataset_path(name, paths.production)
    df = mlh.add_legacy_y(prev_dataset, df, y_column)
    return df
Example #12
def train(n_queries=10, mode=paths.dataset_version):  #datafile=data_path, ):
    datafile = paths.get_dataset_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data,
                                              y_column,
                                              n_queries,
                                              estimator,
                                              datafile,
                                              mode=mode)
    if isinstance(learner, tree.DecisionTreeClassifier):
        tree.plot_tree(learner, feature_names=include_cols, class_names=True)
        plt.show()
    with open(paths.get_model_path(name, mode), "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
Example #13
def train(
    n_queries=10,
    mode=paths.dataset_version
):  # datafile=settings.get_dataset_path(name), model_file=settings.get_model_path(name),
    datafile = paths.get_dataset_path(name, mode)
    if not os.path.exists(datafile):
        data = create_dataset()
        data.to_csv(datafile, index=False)
    else:
        data = pd.read_csv(datafile)
    clf = RandomForestClassifier()  #tree.DecisionTreeClassifier()
    accuracy, clf = al.train(data, y_column, n_queries, clf, datafile,
                             limited_cols)
    print(accuracy)
    model_file = paths.get_model_path(name, mode)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
    def train(self, n_queries=10, mode=paths.dataset_version):  #settings.marginals_id_trans_dataset):
        file = paths.get_dataset_path(name, mode)
        df = pd.read_csv(file)
        #self.X = df['transformed']
        #self.Y = df['tag']
        self.max_words, self.max_len = check_maxlens(df)

        lstm = KerasClassifier(build_fn=self.LSTM, batch_size=self.batch_size, epochs=self.epochs,
                               validation_split=0.2)

        estimator = Pipeline([
            #('transform1', ColumnTransformer([
            ('transform_text', FunctionTransformer(transform_text_wrapper)),  # 0)
            #    ], remainder="passthrough")),
            ('transform2', Text2Seq(classes=2)),
            ('lstm', lstm)
        ], verbose=True)

        accuracy, learner = active_learning.train(df, y_column, n_queries, estimator, file, limit_cols=limit_cols)
        self.model = learner
        # self.tok = Tokenizer(num_words=self.max_words+1) # only num_words-1 will be taken into account!
        # self.model = self.LSTM()
        #
        # X_train, X_test, Y_train, Y_test = train_test_split(self.X, self.Y, test_size=0.15)
        #
        # self.tok.fit_on_texts(X_train)
        # sequences = self.tok.texts_to_sequences(X_train)
        # sequences_matrix = sequence.pad_sequences(sequences, maxlen=self.max_len)
        # y_binary = to_categorical(Y_train)
        # self.model.summary()
        # self.model.fit(sequences_matrix, y_binary, batch_size=self.batch_size, epochs=self.epochs,
        #           validation_split=0.2) #, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
        #
        # test_sequences = self.tok.texts_to_sequences(X_test)
        # test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=self.max_len)
        #
        # accr = self.model.evaluate(test_sequences_matrix, to_categorical(Y_test))
        # print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
        self.model_loc = paths.get_model_path(name, mode)
        #self.model.save(self.model_loc)
        #joblib.dump(self.tok, self.tok_loc)
        with open(self.model_loc, "wb") as f:
            pickle.dump(self.model, f)
Example #15
def automatically_tag(type,
                      classification_function,
                      y_column,
                      mode=paths.dataset_version):
    source = paths.get_dataset_path(type, mode)  # 'toc'
    df = pd.read_csv(source)
    df = df.reset_index(drop=True)
    new_tags = classification_function(df, masked=False)  # could take a mode parameter if ever used on the production set
    #idx = df.loc[((df['TagMethod'] != 'legacy') != (df['TOCPage'] == df['TOCPage'])) & (df['TagMethod'] != 'manual')].index.values #= new_tags.loc[(df['TagMethod'] != 'legacy') & (df['TagMethod'] != 'manual')]
    # re-tag rows that were previously auto-tagged, or whose TagMethod or y_column is NaN
    # (x != x is only true for NaN) -- i.e. the union of 'auto' and untagged rows
    idx = df.loc[(df['TagMethod'] == 'auto')
                 | (df['TagMethod'] != df['TagMethod'])
                 | (df[y_column] != df[y_column])].index.values
    df.loc[idx, y_column] = new_tags.loc[idx]
    df.loc[idx, 'TagMethod'] = 'auto'
    print(len(idx), " automatically tagged")
    #df['TagMethod'].loc[(df['TagMethod'] != 'legacy') & (df['TagMethod'] != 'manual')] = 'auto'
    if 'proba' in df.columns:
        df = df.drop(columns=['proba'])
    df.to_csv(source, index=False)
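
# A hedged aside on the x != x test used in automatically_tag above (toy values, not
# project data; assumes pandas is imported as pd): NaN is the only value that is not equal
# to itself, so s != s selects exactly the untagged rows.
def _nan_mask_demo():
    s = pd.Series(['auto', float('nan'), 'manual'])
    print(s[s != s])       # only the NaN row survives
    print(s[s == 'auto'])  # only the explicitly auto-tagged row
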
def train(n_queries=10, mode=paths.dataset_version):
    if not os.path.exists(paths.get_dataset_path('page_id', mode)):
        df = create_dataset()
        df.to_csv(paths.get_dataset_path('page_id', mode), index=False)
    nn = NeuralNetwork()
    nn.train(n_queries=n_queries, mode=mode)
Example #17
def create_dataset(
    datafile=paths.get_dataset_path(name), docid=False
):  #datafile=settings.dataset_path + 'heading_id_intext_dataset.csv', docid=False):
    sourcefile = paths.get_dataset_path('marginal_lines')
    df = pd.read_csv(sourcefile,
                     dtype={
                         'DocID': int,
                         'PageNum': int,
                         'LineNum': int,
                         'Heading': int
                     })

    if docid:
        df = df.loc[df['DocID'] == float(docid)]
    # remove ContainsTab, ContainsPage
    df = df.drop(['ContainsTab', 'ContainsPage'], axis=1)
    # remove rows with Marginal == 1 or 2. then remove marginal column
    df = df.loc[(df.Marginal == 0) | (df.Marginal != df.Marginal)]  # keep rows where Marginal is 0 or NaN
    df = df.drop(['Marginal'], axis=1)
    # find ALL the toc pages and remove their lines from the dataset
    # find ALL the fig pages and remove their lines from the dataset
    toc_dataset = pd.read_csv(paths.get_dataset_path('toc'))
    # fig_dataset = pd.read_csv(settings.get_dataset_path('fig'))
    tocs = toc_dataset.loc[toc_dataset.TOCPage == 1]
    #figs = fig_dataset.loc[fig_dataset.FigPage == 1]
    toc_tuples = [(id, page) for id, page in zip(tocs.DocID, tocs.PageNum)]
    #fig_tuples = [(id, page) for id, page in zip(figs.DocID, figs.PageNum)]
    to_drop = []
    for i, row in df.iterrows():
        if (row.DocID, row.PageNum) in toc_tuples:  # or (row.DocID, row.PageNum) in fig_tuples:
            to_drop.append(i)
    df = df.drop(index=to_drop)

    # update contains num to just re.search('[0-9]+')
    df['ContainsNum'] = df.Text.apply(lambda x: contains_num(x))
    # add column: line word count
    df.dropna(subset=['Text'], inplace=True)  # remove nans
    df['WordCount'] = df.Text.apply(lambda x: len(x.split()))

    proc_df = pd.read_csv(paths.get_dataset_path('proc_heading_id_toc'))
    proc_head_df = proc_df.loc[proc_df.Heading > 0]  # the > 0 comparison drops NaN rows (it would not handle None, but the column should hold NaN)
    proc_head_df['Text'] = proc_head_df.apply(
        lambda x: str(x.SectionPrefix) + ' ' + x.SectionText, axis=1)
    series_mh = pd.Series()
    series_mt = pd.Series()
    series_mi = pd.Series()

    for id in df.DocID.unique():
        doc_toc = proc_head_df.loc[proc_head_df.DocID == float(id)]
        df_doc = df.loc[df.DocID == float(id)]
        matches_heading, matches_type, matches_i = compare_lines2headings(
            df_doc.Text, doc_toc)
        print(len(matches_heading) == df_doc.shape[0], id)
        series_mh = series_mh.append(pd.Series(matches_heading),
                                     ignore_index=True)
        series_mt = series_mt.append(pd.Series(matches_type),
                                     ignore_index=True)
        series_mi = series_mi.append(pd.Series(matches_i), ignore_index=True)

    df['MatchesHeading'], df['MatchesType'], df['MatchesI'] = series_mh, series_mt, series_mi
    df['TagMethod'] = None
    df[y_column] = None
    prev_dataset = paths.dataset_path + 'heading_id_intext_dataset.csv'
    df = mlh.add_legacy_y(prev_dataset, df, y_column, line=True)
    if not docid:
        df.to_csv(datafile, index=False)
    #df['Heading'] = 0
    return df
import paths
from sklearn import ensemble
import pickle
import re
from report import active_learning, machine_learning_helper as mlh


name = 'marginal_lines'
y_column = 'Marginal'
columns = ['DocID', 'PageNum', 'LineNum', 'NormedLineNum','Text', 'Words2Width', 'WordsWidth', 'Width', 'Height',
           'Left', 'Top', 'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality', y_column, 'TagMethod']
limit_cols=['DocID', 'Text', 'LineNum']
include_cols = ['PageNum', 'NormedLineNum', 'Words2Width', 'WordsWidth', 'Width', 'Height', 'Left', 'Top',
                'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality']
estimator = ensemble.RandomForestClassifier()
data_path = paths.get_dataset_path(name)
model_path = paths.get_model_path(name)


def contains_num(string):
    if re.search(r'(\s|^)[0-9]+(\s|$)', string):
        return 1
    return 0


def contains_tab(string):
    if re.search(r'\t', string):
        return 1
    return 0
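
# A quick, hypothetical sanity check for the two helpers above, documenting the regex
# behaviour: contains_num only fires on standalone digit runs, so digits glued to letters
# (e.g. 'epm3424') do not count.
def _contains_demo():
    assert contains_num('page 12 of 30') == 1   # '12' is whitespace-delimited
    assert contains_num('epm3424 report') == 0  # digits attached to letters are ignored
    assert contains_tab('col1\tcol2') == 1
    assert contains_tab('no tabs here') == 0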

Example #19
def create_dataset(ids=False, save=True, docids_only=False, training=True):
    if ids:
        save = False
    if save:
        dataset = paths.get_dataset_path('tables', 'boreholes')
        dataset = dataset.split('../')[1]
        #docids = ['32730', '44448', '37802', '2646', '44603']
        ids = paths.get_files_from_path(type='tables', training=training)
    cols = ['DocID', 'TableNum', 'Content', 'FullTable']
    all_columns = pd.DataFrame(columns=cols)
    if docids_only:
        new_ids = []
        for id in ids:
            i = paths.get_files_from_path('tables',
                                          one_docid=id,
                                          training=training)
            new_ids.extend(i)
        ids = new_ids
    for id in ids:
        # try:
        #     texttransforming.save_tables_and_kvs(id)
        # except json.decoder.JSONDecodeError:
        #     print(id)
        #     continue
        docid, file_num = id[0], id[1]
        tables = get_tables(docid, file_num=file_num, training=training)
        #columns = pd.Series([table.columns.values for table in tables])
        full_tables = []
        for table in tables:
            t = table.to_numpy()
            t = t.astype(str)
            t = np.insert(t, 0, table.columns.values, 0)
            full_tables.append(t)

        tables_values = [list(table.columns.values) for table in tables]
        #exclude = ['Unnamed: ', 'nan']
        for i, t in enumerate(tables):
            for j, row in t.iterrows():
                tables_values[i] = np.concatenate((tables_values[i], row.values))
                # keep only values that contain letters; [A-Za-z] avoids the stray
                # punctuation characters that the original [A-z] range would also match
                tables_values[i] = [v for v in tables_values[i] if re.match(r'[A-Za-z]+', str(v))]
                tables_values[i] = [v for v in tables_values[i] if 'Unnamed:' not in str(v)]
                tables_values[i] = [v for v in tables_values[i] if str(v) != 'nan']
        tables_values = pd.Series(tables_values)
        docids = pd.Series([docid for x in range(len(tables_values))])
        tablenums = pd.Series([x + 1 for x in range(len(tables_values))])
        fulls = pd.Series(full_tables)
        series = [docids, tablenums, tables_values, fulls]
        iddf = pd.concat(series, axis=1)
        iddf.columns = cols
        #all_columns = all_columns.append(pd.Series(columns), ignore_index=True)
        all_columns = all_columns.append(iddf, ignore_index=True)
    if save:
        all_columns.to_csv(dataset, index=False)
        print('Done creating ', dataset)
    else:
        return all_columns
Example #20
    def train(self, n_queries=10, mode=paths.dataset_version):  #settings.page_extraction_dataset):
        file = paths.get_dataset_path(name, mode)
        df = pd.read_csv(file)
        #self.X = df['transformed']
        #self.X = df['transformed']  # self.X only exists here to get proper maxlens
        self.Y = df['position']         # try with y position instead of y value
        #transform = FunctionTransformer(lambda x: num2word(x))  # not sure this will work
        #self.X = self.X.apply(lambda x: num2word(x))
        #self.max_words, self.max_len = check_maxlens(self.X)
        self.max_len = 20  # assuming max num of words in line will be 20
        self.classes, y_vectorised = self.position2int()
        self.inv_classes = {v: k for k, v in self.classes.items()}
        y_masked = np.zeros((self.Y.size, self.max_len))
        for i, j in zip(y_masked, y_vectorised):
            p = self.inv_classes[j]
            i[p] = 1

        self.num_classes = len(self.classes.items())
        nn = KerasClassifier(build_fn=self.NN, batch_size=self.batch_size, epochs=self.epochs, validation_split=0.2)

        clf = Pipeline([
            ('transform_text', FunctionTransformer(transform_text_wrapper)),
            #('num2word', FunctionTransformer(lambda x: num2word(x))),
            ('transform', Text2Seq(classes=self.num_classes, pad_len=self.max_len)),
            ('nn', nn)
        ], verbose=True)

        accuracy, learner = active_learning.train(df, y_column, n_queries, clf, file, limit_cols=limit_cols)
        self.model = learner
        self.model_loc = paths.get_model_path(name, mode)
        # self.tok = Tokenizer(num_words=self.max_words+1) # only num_words-1 will be taken into account!

        # if self.mode_type == 'LSTM':
        #     self.model = self.LSTM()
        # else:
        # self.model = self.NN()
        #
        # X_train, X_test, Y_train, Y_test = train_test_split(self.X, y_masked, test_size=0.15)
        #
        # self.tok.fit_on_texts(self.X)
        # sequences = self.tok.texts_to_sequences(X_train)
        # sequences_matrix = sequence.pad_sequences(sequences, maxlen=self.max_len)
        # #y_binary = to_categorical(Y_train) # y needs to be onehot encoded
        #
        # self.model.summary()
        # self.model.fit(sequences_matrix, Y_train, batch_size=self.batch_size, epochs=self.epochs,
        #           validation_split=0.2) #, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
        #
        # test_sequences = self.tok.texts_to_sequences(X_test)
        # test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=self.max_len)
        #
        # accr = self.model.evaluate(test_sequences_matrix, Y_test)
        # print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
        #self.model.save(self.model_loc)
        with open(self.model_loc, "wb") as f:
            pickle.dump(self.model, f)
        self.classes_loc = paths.get_model_path(name, mode, classes=True)  #self.model_path + self.model_name + 'class_dict.joblib'
        #joblib.dump(self.tok, self.tok_loc)
        joblib.dump(self.inv_classes, self.classes_loc)
        print("End of training stage. Re-run to train again")
        return accuracy