Example #1
File: KEA.py  Project: bit212-2019/keep
    def TrainingKEAModel(self, pathToCollectionOfDocs, groundTruthFile, lang,
                         normalization, pathToDFFile, pathToKEAFile,
                         pathToKeaModelsFolder):
        print(f"\nSTEP 2: Compute Document Frequency")
        ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile)
        df = pke.load_document_frequency_file(input_file=pathToDFFile)

        print(
            f"\nSTEP 3: Train KEA Model on top of the following set of docs: {pathToCollectionOfDocs}"
        )

        if os.path.exists(pathToKEAFile):
            print(f"KEA model file already exists here: {pathToKEAFile}")
        else:
            print(
                f"KEA model doesn't exist yet. Creating it at: {pathToKEAFile}. This may take a while."
            )
            # If folder Models does not exist: Create it
            if not os.path.exists(pathToKeaModelsFolder):
                os.makedirs(pathToKeaModelsFolder)

            pke.train_supervised_model(input_dir=pathToCollectionOfDocs,
                                       reference_file=groundTruthFile,
                                       model_file=pathToKEAFile,
                                       extension='txt',
                                       language=lang,
                                       normalization=normalization,
                                       df=df,
                                       model=pke.supervised.Kea())
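
A trained model like the one above is then applied to new documents through pke's Kea extractor. The following is a minimal, hypothetical usage sketch (the document, DF and model paths are placeholders, and the exact candidate_weighting signature may differ between pke versions):

import pke

# load the DF counts that were used at training time (placeholder path)
df = pke.load_document_frequency_file(input_file='df.tsv.gz')

# create a Kea extractor and load a new document (placeholder path)
extractor = pke.supervised.Kea()
extractor.load_document(input='doc.txt', language='en', normalization='stemming')

# select candidate phrases and score them with the trained model
# (placeholder model path)
extractor.candidate_selection()
extractor.candidate_weighting(model_file='kea_model.pickle', df=df)

# print the 10 highest-scoring keyphrases
print(extractor.get_n_best(n=10))
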
Example #2
import codecs
import logging
import sys

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the reference file
reference_file = sys.argv[2]

# path to the df file
df_file = sys.argv[3]
logging.info('loading df counts from '+df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

# path to the model, saved as a pickle
output_mdl = sys.argv[4]

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
Example #3
File: train.py  Project: xiaoman0220/pke
import logging
import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = 'train/'

# path to the reference file
reference_file = "gold-annotation.txt"

# path to the df file
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
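
The df.tsv.gz file loaded above has to be computed beforehand from the training collection. A minimal sketch using pke.compute_document_frequency is shown below; the paths, extension and stoplist are placeholders, and the argument layout (input_dir vs. a list of documents) varies between pke versions:

from string import punctuation

import pke

# count in how many documents each phrase occurs and write the counts as a
# gzipped TSV file, i.e. the format read back by load_document_frequency_file
# (paths, extension and stoplist are placeholders)
pke.compute_document_frequency(input_dir='train/',
                               output_file='df.tsv.gz',
                               extension='xml',
                               language='en',
                               normalization='stemming',
                               stoplist=list(punctuation))
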
Example #4
import pke
#import logging
## Training the model on train set.
#train_input_dir = 'drive/My Drive/Recommendation systems/kea_trained/train_doc/'
reference_file = 'drive/My Drive/Recommendation systems/kea_trained/reference.txt'
output_mdl = "drive/My Drive/Recommendation systems/kea_trained/Models/kea_model.pickle"
#train_df_file = 'drive/My Drive/Recommendation systems/kea_trained/train_DF.tsv.gz'

#logging.info('Loading df counts from {}'.format(df_file))

df_counts = pke.load_document_frequency_file(input_file='train_DF.tsv.gz',
                                             delimiter='\t')

pke.train_supervised_model(input_dir='train_doc/',
                           reference_file='reference.txt',
                           model_file='model/kea_model.pickle',
                           extension='txt',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.Kea())
Example #5
import logging
import os
from glob import glob

import pke

logging.basicConfig(level=logging.INFO)

# 'base' is the root directory of the dataset (assumed; defined earlier in
# the original script) and 'documents' collects (doc_id, text) pairs
base = '.'
documents = []

# path to the collection of documents
for fn in glob(base + os.sep + 'train/*.txt'):
    with open(fn) as f:
        doc = f.read()
    doc_id = os.path.basename(fn).rsplit('.', 1)[0]
    documents.append((doc_id, doc))

logging.info('Loaded {} documents'.format(len(documents)))

# path to the reference file
reference = {}
with open(base + os.sep + 'gold-annotation.txt') as f:
    for line in f:
        doc_id, keywords = line.split(' : ')
        reference[doc_id] = keywords.strip().split(',')

# path to the df file
df_file = base + os.sep + 'df.tsv.gz'
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = base + os.sep + 'model.pickle'
pke.train_supervised_model(documents,
                           reference,
                           model_file=output_mdl,
                           language='en',
                           normalization='stemming',
                           df=df_counts,
                           model=pke.supervised.Kea())
Example #6
###############################################################################

if not only_test:
    # Training a supervised Kea model
    if not os.path.isfile(path_to_kea_file):

        logging.info("Training supervised model {}".format(path_to_kea_file))

        logging.info("loading DF counts from {}".format(path_to_df_file))
        df_counts = pke.load_document_frequency_file(
            input_file=path_to_df_file)

        pke.train_supervised_model(input_dir=path_to_train,
                                   reference_file=params["reference"],
                                   model_file=path_to_kea_file,
                                   extension=params["extension"],
                                   language=params["language"],
                                   normalization=params["normalization"],
                                   df=df_counts,
                                   model=pke.supervised.Kea())

else:
    # No training set is available
    if not os.path.isdir(path_to_leave_one_out_models):
        os.makedirs(path_to_leave_one_out_models)

        logging.info("Training LOO models {}".format(
            path_to_leave_one_out_models))

        logging.info("loading DF counts from {}".format(path_to_df_file))
        df_counts = pke.load_document_frequency_file(
            input_file=path_to_df_file)
Example #7
File: train.py  Project: yuan776/pke
import logging
import os

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = 'train' + os.sep

# path to the reference file
reference_file = "gold-annotation.txt"

# path to the df file
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           extension='xml',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.Kea())