def main(sentences, model, language, outfile, processes, gazetteer):
    """ Classifies sentences with a previously trained model and writes
        the results to outfile, one JSON document per line.
    """
    # Optional gazetteer file: invert it for lookup, otherwise use no gazetteer.
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info("Loading model from '%s' ...", model)
    trained_model, extractor = joblib.load(model)
    classifier = SentenceClassifier(trained_model, extractor, language, gazetteer)

    def classify_batch(batch):
        # Deserialize each JSON line, classify the whole batch, and
        # re-serialize every classified sentence.
        parsed = (json.loads(item) for item in batch)
        return (json.dumps(result) for result in classifier.classify_sentences(parsed))

    total = 0
    for line in parallel.map(classify_batch, sentences, batch_size=1000,
                             flatten=True, processes=processes):
        outfile.write(line)
        outfile.write('\n')

        total += 1
        if total % 1000 == 0:
            logger.info('Classified %d sentences', total)

    logger.info('Done, classified %d sentences', total)
    if total > 0:
        logger.info("Dumped classified sentences to '%s'", outfile.name)
def main(training_set, language, outfile, gazetteer, **kwargs):
    """ Trains the classifier """
    # Optional gazetteer file: invert it for lookup, otherwise use no gazetteer.
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    extractor = FactExtractorFeatureExtractor(language)

    # Lazy %-args instead of eager string formatting: consistent with every
    # other logger call here, and skips the formatting when INFO is disabled.
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set ...')
    x, y = extractor.get_features()
    logger.info('Got %d samples with %d features each', *x.shape)

    logger.info('Fitting model ...')
    # The CLI exposes the regularization strength as lowercase 'c',
    # but LinearSVC expects 'C'.
    kwargs['C'] = kwargs.pop('c')
    svc = LinearSVC(**kwargs)
    svc.fit(x, y)

    joblib.dump((svc, extractor), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters """
    # Optional gazetteer file: invert it for lookup, otherwise use no gazetteer.
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    # Grid-search a linear SVM over regularization strength and
    # multi-class strategy, scored by 10-fold weighted F1.
    logger.info('Searching for the best model parameters')
    svc = LinearSVC()
    search = GridSearchCV(
        svc,
        param_grid=[{
            'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
            'multi_class': ['ovr', 'crammer_singer'],
        }],
        scoring='f1_weighted',
        cv=10)
    search.fit(x, y)

    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    # Gold standard is optional; without it we only report CV results.
    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    # Gold sentences are processed WITHOUT add_unknown, so tokens outside the
    # training vocabulary do not grow the feature space.
    # NOTE(review): this reuses the extractor that already holds the training
    # sentences — presumably get_features() here returns only (or at least
    # includes) the newly processed gold rows; confirm against the extractor's
    # implementation before relying on x_gold/y_gold alignment.
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'])
    x_gold, y_gold = extractor.get_features()

    # Baseline: a stratified dummy classifier fit on the training data,
    # to contextualize the best model's gold-standard score.
    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x, y)
    y_dummy = dummy.predict(x_gold)
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_dummy, average='weighted'))

    y_best = search.predict(x_gold)
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_best, average='weighted'))
def main(
    training_set,
    language,
    outfile,
    model_class,
    model_param,
    extractor_class,
    extractor_param,
    gazetteer,
    folds,
    scoring,
    skip_majority,
    evaluate_gold,
):
    """ Trains the classifier """
    # Optional gazetteer file: invert it for lookup, otherwise use no gazetteer.
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        # Gold evaluation consumes the training-set file, so rewind afterwards.
        gold_extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)
        gold_evaluation(map(json.loads, training_set), gold_extractor, gazetteer, model_cls, model_args)
        training_set.seek(0)

    extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)

    # Lazy %-args instead of eager string formatting: consistent with every
    # other logger call here, and skips the formatting when INFO is disabled.
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data["sentence"], data["lu"], data["fes"], add_unknown=True, gazetteer=gazetteer)
    x, y = extractor.get_features(refit=True)
    logger.info("Got %d samples with %d features each", *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info("Fitting model ...")
    model.fit(x, y)

    joblib.dump((model, {"extractor": extractor}), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
def main(training_set, language, outfile, model_class, model_param,
         extractor_class, extractor_param, gazetteer, folds, scoring,
         skip_majority, evaluate_gold):
    """ Trains the classifier """
    # Optional gazetteer file: invert it for lookup, otherwise use no gazetteer.
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        # Gold evaluation reads through the training-set file; rewind it after.
        gold_extractor = initialize(
            extractor_class, [('language', language)] + list(extractor_param), True
        )
        gold_evaluation(
            map(json.loads, training_set), gold_extractor, gazetteer,
            model_cls, model_args
        )
        training_set.seek(0)

    extractor = initialize(
        extractor_class, [('language', language)] + list(extractor_param), True
    )

    logger.info("Building training set from '%s' ..." % training_set.name)
    for line in training_set:
        sentence = json.loads(line)
        extractor.process_sentence(sentence['sentence'], sentence['lu'],
                                   sentence['fes'], add_unknown=True,
                                   gazetteer=gazetteer)

    x, y = extractor.get_features(refit=True)
    logger.info('Got %d samples with %d features each', *x.shape)

    # Restrict the model to the LU feature column selected by the extractor.
    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)

    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info('Fitting model ...')
    model.fit(x, y)

    joblib.dump((model, {'extractor': extractor}), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
def get_training_sets(training_set, language, gazetteer, word2vec_model, independent_lus):
    """ Yields (meta, extractor) pairs, one per combination of gazetteer,
        feature-extractor configuration and (optionally) lexical unit.
    """
    # Materialize the parameter grid: itertools.chain is a one-shot iterator,
    # so without list() it would be exhausted after the first gazetteer and
    # all remaining gazetteers would silently yield nothing.
    extractor_args = list(itertools.chain(
        itertools.product([BagOfTermsFeatureExtractor], [True, False], [0, 1, 2]),
        itertools.product([Word2VecFeatureExtractor], [word2vec_model],
                          [True, False], [0, 1, 2]) if word2vec_model else []))

    lus = set(json.loads(row)['lu'] for row in training_set) if independent_lus else ['$all']

    count = 0
    for gaz in list(gazetteer) + [None]:
        # Load each gazetteer file once per outer iteration under a fresh name:
        # the original rebound the `gazetteer` parameter and then called
        # json.load on it, reading from the wrong object after the first pass.
        if gaz:
            gaz.seek(0)  # the same file may be read again on a later call
            reversed_gazetteer = reverse_gazetteer(json.load(gaz))
        else:
            reversed_gazetteer = {}

        for args in extractor_args:
            for lu in lus:
                logger.debug('%d) gazetteer: %s, extractor params: %s, lu: %s',
                             count, gaz.name if gaz else None, args, lu)
                count += 1

                extractor_cls, init_args = args[0], args[1:]
                extractor = extractor_cls(language, *init_args)

                training_set.seek(0)
                for row in training_set:
                    data = json.loads(row)
                    # Keep only the current LU when building per-LU models;
                    # testing membership in the full `lus` set was always true
                    # and defeated the per-LU split.
                    if not independent_lus or data['lu'] == lu:
                        extractor.process_sentence(data['sentence'], data['lu'],
                                                   data['fes'], add_unknown=True,
                                                   gazetteer=reversed_gazetteer)

                meta = {
                    'lu': lu,
                    'gazetteer': gaz,
                    'extractor_cls': args[0],
                    'extractor_args': [language] + list(args[1:]),
                    'extractor': extractor
                }
                yield meta, extractor
def get_training_sets(training_set, language, gazetteer, word2vec_model, independent_lus):
    """ Yields (meta, extractor) pairs, one per combination of gazetteer,
        feature-extractor configuration and (optionally) lexical unit.
    """
    # Materialize the parameter grid: itertools.chain is a one-shot iterator,
    # so without list() it would be exhausted after the first gazetteer and
    # all remaining gazetteers would silently yield nothing.
    extractor_args = list(itertools.chain(
        itertools.product([BagOfTermsFeatureExtractor], [True, False], [0, 1, 2]),
        itertools.product([Word2VecFeatureExtractor], [word2vec_model],
                          [True, False], [0, 1, 2]) if word2vec_model else []
    ))

    lus = set(json.loads(row)['lu'] for row in training_set) if independent_lus else ['$all']

    count = 0
    for gaz in list(gazetteer) + [None]:
        # Load each gazetteer file once per outer iteration under a fresh name:
        # the original rebound the `gazetteer` parameter and then called
        # json.load on it, reading from the wrong object after the first pass.
        if gaz:
            gaz.seek(0)  # the same file may be read again on a later call
            reversed_gazetteer = reverse_gazetteer(json.load(gaz))
        else:
            reversed_gazetteer = {}

        for args in extractor_args:
            for lu in lus:
                logger.debug('%d) gazetteer: %s, extractor params: %s, lu: %s',
                             count, gaz.name if gaz else None, args, lu)
                count += 1

                extractor_cls, init_args = args[0], args[1:]
                extractor = extractor_cls(language, *init_args)

                training_set.seek(0)
                for row in training_set:
                    data = json.loads(row)
                    # Keep only the current LU when building per-LU models;
                    # testing membership in the full `lus` set was always true
                    # and defeated the per-LU split.
                    if not independent_lus or data['lu'] == lu:
                        extractor.process_sentence(data['sentence'], data['lu'],
                                                   data['fes'], add_unknown=True,
                                                   gazetteer=reversed_gazetteer)

                meta = {
                    'lu': lu,
                    'gazetteer': gaz,
                    'extractor_cls': args[0],
                    'extractor_args': [language] + list(args[1:]),
                    'extractor': extractor
                }
                yield meta, extractor