def patent_invention_train(mallet_file, features="invention", version="1",
                           xval=0, verbose=False, stats_file=None):
    """Train a MaxEnt classifier model from an existing .mallet file.

    Thin wrapper around the mallet.py functionality. The .mallet training
    instances file must already exist and must be given as a full path; all
    other files produced during mallet processing end up in the directory
    that contains it. The work itself is delegated to a MalletTraining
    instance, which first writes the .vectors file from the .mallet file and
    then trains the classifier.

    Arguments:
        mallet_file -- full path to the existing .mallet instances file
        features    -- passed through to mallet.MalletTraining
        version     -- passed through to mallet.MalletConfig
        xval        -- passed to mallet.MalletConfig as number_xval
        verbose     -- accepted but not used by this function
        stats_file  -- accepted but not used by this function
    """
    # Everything mallet writes goes next to the instances file.
    output_dir = os.path.dirname(mallet_file)

    # NOTE(review): 'itrain' / 'iclassify' are presumably the file name
    # prefixes for the invention model -- confirm against mallet.MalletConfig.
    training_config = mallet.MalletConfig(
        config.MALLET_DIR,
        'itrain',
        'iclassify',
        version,
        output_dir,
        '/tmp',
        classifier_type="MaxEnt",
        number_xval=xval,
        training_portion=0,
        prune_p=False,
        infogain_pruning="5000",
        count_pruning="3")
    trainer = mallet.MalletTraining(training_config, features)

    # make_utraining_file3 cannot be used here since the annotations are not
    # based on doc_feats; the .mallet file is taken as given and only the
    # vectors file and the model are created.
    trainer.write_train_mallet_vectors_file()
    trainer.mallet_train_classifier()
def patent_utraining_data3(mallet_file, annotation_file, annotation_count,
                           fnames, features=None, version="1", xval=0,
                           verbose=False, stats_file=None):
    """Build training data and a MaxEnt classifier model from annotations.

    Thin wrapper around the mallet.py functionality. Loads the dictionary of
    annotations, sets up the mallet configuration and then hands over to a
    MalletTraining instance, which creates the .mallet file, creates the
    .vectors file from the .mallet file, and creates the model. Output goes
    to the directory that contains mallet_file.

    Arguments:
        mallet_file      -- path of the .mallet file; only its directory is
                            used here, as the training output directory
        annotation_file  -- passed to load_phrase_labels3
        annotation_count -- passed to load_phrase_labels3
        fnames           -- passed to MalletTraining.make_utraining_file3
        features         -- passed through to mallet.MalletTraining
        version          -- passed through to mallet.MalletConfig
        xval             -- passed to mallet.MalletConfig as number_xval
        verbose          -- accepted but not used by this function
        stats_file       -- accepted but not used by this function
    """
    # Annotations are loaded first, before any mallet configuration is built.
    phrase_labels = load_phrase_labels3(annotation_file, annotation_count)

    output_dir = os.path.dirname(mallet_file)
    training_config = mallet.MalletConfig(
        config.MALLET_DIR,
        'train',
        'classify',
        version,
        output_dir,
        '/tmp',
        classifier_type="MaxEnt",
        number_xval=xval,
        training_portion=0,
        prune_p=False,
        infogain_pruning="5000",
        count_pruning="3")

    trainer = mallet.MalletTraining(training_config, features)
    trainer.make_utraining_file3(fnames, phrase_labels)
    trainer.mallet_train_classifier()
def _create_mallet_file(self):
    """Create the mallet training file and write the info stats file.

    Loads the phrase labels, configures a MalletTraining instance that uses
    self.model both as the first MalletConfig argument and as its output
    directory argument, runs make_utraining_file3 over the files of the
    input dataset, and finally passes the statistics collected on the
    trainer to _create_info_stats_file.
    """
    # Presumably this populates self.d_phr2label, which is read further
    # down -- confirm in _load_phrase_labels.
    self._load_phrase_labels()

    training_config = mallet.MalletConfig(
        self.model,
        'train',
        'classify',
        '0',
        self.model,
        '/tmp',
        classifier_type="MaxEnt",
        number_xval=0,
        training_portion=0,
        prune_p=False,
        infogain_pruning="5000",
        count_pruning="3")
    trainer = mallet.MalletTraining(training_config)

    input_files = filename_generator(self.input_dataset.path, self.file_list)
    trainer.make_utraining_file3(input_files, self.d_phr2label,
                                 verbose=VERBOSE)

    # The stats_* attributes are read only after make_utraining_file3 has run.
    self._create_info_stats_file(
        trainer.stats_labeled_count,
        trainer.stats_unlabeled_count,
        trainer.stats_terms,
        trainer.stats_terms_y,
        trainer.stats_terms_n)