def train(args):
    """Train a CRF model on labeled example windows from the corpus.

    Steps:
      1. Build a pycrfsuite trainer object.
      2. Create the feature generators. Each generator converts a token in a
         window into a list of feature-value strings (a list, because one
         generator may emit several features per token, e.g. LDA topics).
      3. Stream windows of a fixed maximum token count from the corpus,
         keeping only windows that contain at least one label.
      4. Convert every window into a chain of per-token feature lists, e.g.
         [["w2v=123", "bc=742", "upper=0"], ["w2v=4", "upper=1", "lda4=1"]],
         plus a parallel chain of labels.
      5. Feed feature chains and label chains to the trainer.
      6. Train; this may take several hours for 20k windows.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=True)

    # Building the feature generators may take a few minutes.
    print("Creating features...")
    feature_generators = features.create_features()

    # Window generator: fixed maximum window size, labeled windows only.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.WINDOW_SIZE,
                           feature_generators, only_labeled_windows=True)

    # Hand feature chains (lists of lists of strings) and label chains (lists
    # of strings) to the trainer. The first pass is slow (POS tagging); POS
    # and LDA results are cached, so subsequent runs are significantly faster.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    examples = generate_examples(windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
                                 nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True)
    for feature_values_lists, labels in examples:
        crf_trainer.append(feature_values_lists, labels)

    # Training itself may take several hours.
    print("Training...")
    if cfg.MAX_ITERATIONS is not None and cfg.MAX_ITERATIONS > 0:
        # Cap the optimizer's iteration count as configured; without the cap
        # it stops automatically on its own convergence criterion.
        crf_trainer.set_params({'max_iterations': cfg.MAX_ITERATIONS})
    crf_trainer.train(args.identifier)
def train(args, output_filepath='/home/nitin.jain/ner_aleju/dataset/outfeatures.txt'):
    """Generate feature/label chains for training windows and dump them to disk.

    This variant does not actually train a CRF (the trainer line is disabled);
    it only materializes the feature chains so they can be inspected in a file.
    Each window becomes one line of the form
    str(feature_values_lists + ["------"] + labels).

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
        output_filepath: Where to write the generated features. Defaults to
            the previously hard-coded location, so existing callers behave
            the same.
    """
    #trainer = pycrfsuite.Trainer(verbose=True)

    # Create/Initialize the feature generators; this may take a few minutes.
    print("Creating features...")
    feature_generators = features.create_features()

    # Window generator: fixed maximum window size, labeled windows only.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.WINDOW_SIZE,
                           feature_generators, only_labeled_windows=True)

    # Generating the examples may take a long while (lengthy POS tagging);
    # POS tags and LDA results are cached, so a second run is much faster.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    examples = generate_examples(windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
                                 nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True)

    # FIX: the file was previously opened without a context manager (leaked on
    # any exception during generation) and examples were written back-to-back
    # with no separator, producing one unreadable line. Use `with` and write
    # one example per line.
    with open(output_filepath, 'w') as out_file:
        for feature_values_lists, labels in examples:
            out_file.write(str(feature_values_lists + ["------"] + labels) + '\n')
def train_lda():
    """Train the LDA model on bag-of-words windows from the corpus.

    generate_dictionary() must have been called beforehand, as the saved
    dictionary is loaded from cfg.LDA_DICTIONARY_FILEPATH.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # Dictionary as produced by generate_dictionary().
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # Invert token2id to obtain an id -> word mapping.
    print("Generating id2word...")
    id2word = {word_id: word for word, word_id in dictionary.token2id.items()}

    # NOTE(review): LDA_COUNT_WORKERS, LDA_CHUNK_SIZE and COUNT_EXAMPLES_FOR_LDA
    # are referenced without the cfg. prefix, unlike the other settings —
    # presumably star-imported from the config module; verify.
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS,
                             id2word=id2word, workers=LDA_COUNT_WORKERS,
                             chunksize=LDA_CHUNK_SIZE)

    # Train incrementally on batches of windows, each converted to a bag of
    # words via the dictionary.
    print("Training...")
    update_every_n_windows = 25000
    batch = []
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        lowered = [token.word.lower() for token in window.tokens]
        batch.append(dictionary.doc2bow(lowered))  # each window as bag of words
        if len(batch) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # This is where the LDA model is actually trained.
            lda_model.update(batch)
            batch = []
        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # The trailing partial batch is deliberately not used: a final update with
    # far fewer examples might skew the model.
    #if len(batch) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(batch)

    # Persist the trained model.
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
def test_on_articles(identifier, articles, nb_append=None):
    """Evaluate a trained CRF model on annotated Article objects.

    Prints a full classification report by label (f1, precision, recall).

    Args:
        identifier: Identifier of the trained model to be used.
        articles: A list of Article objects or a generator for such a list.
            May only contain one single Article object.
        nb_append: Optional cap on the number of example windows to test on.
    """
    print("Loading tagger...")
    tagger = pycrfsuite.Tagger()
    tagger.open(identifier)

    # Feature generators; building them may take a while.
    print("Creating features...")
    feature_generators = features.create_features()

    # Window generator over the provided articles.
    print("Loading windows...")
    windows = load_windows(articles, cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=True)

    # Collect (X, Y): per-window feature chains and gold label chains.
    # This may take a while.
    features_per_window = []
    gold_chains = []
    for fvlist, labels in generate_examples(windows, nb_append=nb_append):
        features_per_window.append(fvlist)
        gold_chains.append(labels)

    # Predict one label chain per window.
    print("Testing on %d windows..." % (len(features_per_window)))
    predicted_chains = [tagger.tag(chain) for chain in features_per_window]

    # Precision / recall / f1 per BIO label.
    print(bio_classification_report(gold_chains, predicted_chains))
def train(args):
    """Main training method.

    Builds the feature generators, loads labeled windows from the corpus,
    feeds the resulting feature/label chains to a pycrfsuite trainer and
    trains a model stored under args.identifier.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    trainer = pycrfsuite.Trainer(verbose=True)

    # Feature generators; may take a few minutes to initialize.
    print("Creating features...")
    generators = features.create_features()

    # Window generator: each window has a fixed maximum token count.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.WINDOW_SIZE,
                           generators, only_labeled_windows=True)

    # Append feature chains and label chains to the trainer; the tokens
    # yielded alongside them are not needed for training.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    for feature_values_lists, labels, _tokens in generate_examples(
            windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
            nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True):
        trainer.append(feature_values_lists, labels)

    # Training may take several hours.
    print("Training...")
    if cfg.MAX_ITERATIONS is not None and cfg.MAX_ITERATIONS > 0:
        # Cap the iteration count from the config file; without this the
        # optimizer stops automatically after some iterations.
        trainer.set_params({'max_iterations': cfg.MAX_ITERATIONS})
    trainer.train(args.identifier)
def test_on_articles(identifier, articles, nb_append=None):
    """Run a trained CRF model over annotated articles and report quality.

    Will print a full classification report by label (f1, precision, recall).

    Args:
        identifier: Identifier of the trained model to be used.
        articles: A list of Article objects or a generator for such a list.
            May only contain one single Article object.
        nb_append: Optional maximum number of windows to evaluate.
    """
    print("Loading tagger...")
    tagger = pycrfsuite.Tagger()
    tagger.open(identifier)

    # Create feature generators; this may take a while.
    print("Creating features...")
    feature_generators = features.create_features()

    # Create the window generator.
    print("Loading windows...")
    windows = load_windows(articles, cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=True)

    # Materialize (X, Y) pairs, then split into parallel lists.
    # This may take a while.
    examples = list(generate_examples(windows, nb_append=nb_append))
    all_feature_values_lists = [fvlist for fvlist, _ in examples]
    correct_label_chains = [labels for _, labels in examples]

    # Generate predicted chains of labels.
    print("Testing on %d windows..." % (len(all_feature_values_lists)))
    predicted_label_chains = [tagger.tag(fvlists)
                              for fvlists in all_feature_values_lists]

    # Classification report (precision, recall, f1).
    print(bio_classification_report(correct_label_chains, predicted_label_chains))
def test_on_articles(identifier, articles, nb_append=None):
    """Tag articles with a trained CRF model and extract named entities.

    Merges runs of consecutive entity-labeled tokens into single strings,
    writes the resulting token stream to 'tokens.txt', prints the set of
    tokens predicted as entities and a full classification report by label
    (f1, precision, recall).

    Args:
        identifier: Identifier of the trained model to be used.
        articles: A list of Article objects or a generator for such a list.
            May only contain one single Article object.
        nb_append: Optional maximum number of windows to evaluate.

    Returns:
        List of tokens where each run of consecutive entity tokens has been
        merged into one space-joined string.
    """
    print("Loading tagger...")
    tagger = pycrfsuite.Tagger()
    tagger.open(identifier)

    # Create feature generators; this may take a while.
    print("Creating features...")
    feature_generators = features.create_features()

    # Note: unlike the other test function, unlabeled windows are included.
    print("Loading windows...")
    windows = load_windows(articles, cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=False)

    # Load feature lists / gold label lists (X, Y) plus the raw tokens.
    # This may take a while.
    all_feature_values_lists = []
    correct_label_chains = []
    all_tokens = []
    model_labels = []
    for fvlist, labels, tokens in generate_examples(windows, nb_append=nb_append):
        all_feature_values_lists.append(fvlist)
        correct_label_chains.append(labels)
        all_tokens.extend(tokens)

    # Predict one label chain per window; also flatten the per-token labels.
    print("Testing on %d windows..." % (len(all_feature_values_lists)))
    predicted_label_chains = []
    for fv in all_feature_values_lists:
        tags = tagger.tag(fv)
        predicted_label_chains.append(tags)
        model_labels.extend(tags)

    # FIX: zip() returns a one-shot iterator on Python 3; the original code
    # consumed it in the set() comprehension below and then called len() and
    # indexing on it, which raises TypeError. Materialize it as a list
    # (identical behavior on Python 2).
    result = list(zip(all_tokens, model_labels))
    print("Find named entities")
    result_tokens = []
    print(set([word for word, label in result if label != 'O']))

    # Merge each run of consecutive entity-labeled tokens into one string.
    # FIX: the original loop condition `i < len(result) - 1` silently dropped
    # the last token; iterate over the full range instead.
    i = 0
    while i < len(result):
        if result[i][1] != 'O':
            # Find the end of the entity run [i, j).
            j = i + 1
            while j < len(result) and result[j][1] != 'O':
                j = j + 1
            entities = result[i][0]
            for k in range(i + 1, j):
                entities += ' ' + result[k][0]
            i = j
            result_tokens.append(entities)
        else:
            result_tokens.append(result[i][0])
            i = i + 1

    # Best-effort dump; tokens that fail to write (e.g. encoding issues) are
    # printed to stdout instead, as in the original.
    with open('tokens.txt', 'w') as f:
        for token in result_tokens:
            try:
                f.write(token + '\n')
            except Exception:
                print(token)

    # Print classification report (precision, recall, f1).
    print(bio_classification_report(correct_label_chains, predicted_label_chains))
    return result_tokens