def run():
    for training_condition in ['context', 'context_stress', 'context_phonology']:
        # load data set with dimensionality-reduced word embeddings obtained from the bimodal auto-encoder
        # trained in 'train_bimodal_auto_encoder.py'
        data_set = load_bimodal_data_set(training_condition)

        # softmax model for predicting words from the right context
        softmax = SoftmaxPredictor(data_set=data_set, n_next_words=2000, learning_rate=0.008, input_size=500)

        # train the model
        softmax.train()

        # get word embeddings
        embeddings = softmax.embeddings_over_epochs[-1]

        # write predicted categories to file -- files can be used for additional processing, e.g. significance
        # testing on predictions made at two different stages in the training process
        write_preds_to_file(embeddings, '10-NN', '%s_10-NN_after_stage2' % training_condition)

        # append 10-NN results obtained from the word embeddings to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, '10-NN')
        results_to_disk(micro_f1, macro_f1, classification_report, epoch=softmax.epochs,
                        condition=training_condition, training_stage='AFTER STAGE 2', newfile=False)

        # plot word embeddings, reduced to two dimensions via the t-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage='after_stage_2')

        # plot micro F1 over all training epochs
        plot_metric(plot_name='%s_softmax' % training_condition, plot_type='micro_f1_over_epochs',
                    ys=softmax.f1_over_epochs, label='micro_f1')

        # plot training error over all training epochs
        plot_metric(plot_name='%s_softmax' % training_condition, plot_type='training_error_over_epochs',
                    ys=softmax.training_error_over_epochs, label='training_error')
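
# The evaluation helpers called above (get_f1_and_classification_report, write_preds_to_file, results_to_disk,
# plot_2D_embeddings, plot_metric) are defined elsewhere in the project. As a rough, hypothetical sketch of what
# the '10-NN' scoring could look like with scikit-learn -- the gold labels (`gold_tag_for`) and the 5-fold
# cross-validation set-up are assumptions, not the project's actual implementation:
def _sketch_10nn_evaluation(embeddings_dict, gold_tag_for):
    """Score {word: vector} embeddings with a 10-nearest-neighbour classifier against gold categories."""
    import numpy as np
    from sklearn.metrics import classification_report, f1_score
    from sklearn.model_selection import cross_val_predict
    from sklearn.neighbors import KNeighborsClassifier

    words = sorted(embeddings_dict)
    X = np.array([embeddings_dict[w] for w in words])
    y = np.array([gold_tag_for[w] for w in words])

    # cross-validated predictions: each word is classified by the 10 nearest embeddings in the training folds
    preds = cross_val_predict(KNeighborsClassifier(n_neighbors=10), X, y, cv=5)
    micro_f1 = f1_score(y, preds, average='micro')
    macro_f1 = f1_score(y, preds, average='macro')
    return micro_f1, macro_f1, classification_report(y, preds)
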
def run():
    data_set = load_text_data_set("CDS")

    # exclude the following words, all longer than three syllables, which is not supported by the system used to
    # construct phonological feature vectors
    more_than_three_syllables = [
        "actually",
        "anybody",
        "investigator",
        "helicopter",
        "interesting",
        "cookie_monster",
        "definitely",
        "refrigerator",
        "oh_my_goodness",
        "humpty_dumpty",
        "interested",
        "everybody",
        "father_christmas",
        "alligator",
        "caterpillar",
        "everybody's",
        "hippopotamus",
    ]

    # also exclude the empty string, which occasionally occurs in the corpus
    empty_string = [""]

    vocabulary = get_target_words(
        2000, tags={"n", "v", "adj", "fn"}, exclude_words=more_than_three_syllables + empty_string
    )

    # auto-encoder for reducing the dimensionality of the phonological feature vectors
    phon_ae = PhonAutoEncoder(
        vocabulary=vocabulary, epochs=200, learning_rate=0.1, n_hidden=30, corruption_level=0.1, batch_size=1
    )
    phon_ae.train()

    for training_condition in ["context", "context_stress", "context_phonology"]:
        # if 'phonology' is part of the model name, get dimensionality-reduced phonological feature vectors
        if "phonology" in training_condition:
            phon_vs = phon_ae.get_hidden_vectors()
            phon_vs_size = 30
        else:
            phon_vs = None
            phon_vs_size = 0

        # if 'stress' is part of the model name, get lexical stress feature vectors
        if "stress" in training_condition:
            stress_vs = get_primary_stress_vectors(vocabulary)
            stress_vs_size = 3
        else:
            stress_vs = None
            stress_vs_size = 0

        # auto-encoder for projecting left-context and phonological / lexical stress feature vectors into a
        # shared dimensionality-reduced space
        bm_ae = BiModalAutoEncoder(
            data_set=data_set,
            context_feature_size=2000,
            phon_vectors=phon_vs,
            phon_feature_size=phon_vs_size,
            stress_vectors=stress_vs,
            stress_feature_size=stress_vs_size,
            learning_rate=0.01,
            n_hidden=500,
            corruption_level=0.1,
            batch_size=1,
        )

        # vectors of left-context frequencies, plus phonological and / or lexical stress features
        input_embeddings = dict(zip(bm_ae.vocabulary, bm_ae.embeddings_matrix.get_value()))
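        # For orientation: the 'majority_vote' and 'stratified' baselines evaluated below correspond to always
        # predicting the most frequent category and to sampling categories in proportion to their frequencies,
        # respectively. As a hypothetical illustration (not the project's own implementation), scikit-learn's
        # DummyClassifier reproduces both, assuming a feature matrix `X` and gold labels `y`:
        #
        #     from sklearn.dummy import DummyClassifier
        #     majority = DummyClassifier(strategy="most_frequent").fit(X, y)
        #     stratified = DummyClassifier(strategy="stratified", random_state=0).fit(X, y)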
        # write predicted categories to file -- files can be used for additional processing, e.g. significance
        # testing on predictions made at two different stages in the training process
        write_preds_to_file(input_embeddings, "majority_vote", "%s_majority_vote" % training_condition)
        write_preds_to_file(input_embeddings, "stratified", "%s_stratified_sampling" % training_condition)
        write_preds_to_file(input_embeddings, "10-NN", "%s_10-NN_before_stage1" % training_condition)

        # create a new file with results by training stage -- write majority vote baseline results to this file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "majority_vote")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 1: MAJORITY VOTE",
            newfile=True,
        )

        # append stratified sampling baseline results to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "stratified")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 2: STRATIFIED SAMPLING",
            newfile=False,
        )

        # append 10-NN results obtained from the input vectors to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BEFORE STAGE 1",
            newfile=False,
        )

        # train the bimodal auto-encoder
        bm_ae.train()

        # get the dimensionality-reduced embeddings after training (vectors of hidden unit activation values)
        embeddings = bm_ae.embeddings_over_epochs[-1]

        # append 10-NN results obtained from the hidden embeddings to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch=bm_ae.epochs,
            condition=training_condition,
            training_stage="AFTER STAGE 1",
            newfile=False,
        )

        # plot word embeddings, reduced to two dimensions via the t-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage="after_stage_1")

        # plot micro F1 over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="micro_f1_over_epochs",
            ys=bm_ae.f1_over_epochs,
            label="micro_f1",
        )

        # plot training error over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="training_error_over_epochs",
            ys=bm_ae.training_error_over_epochs,
            label="training_error",
        )

        # create and save a new data set with the new embeddings -- for further training with the softmax model
        save_bimodal_data_set(bm_ae, data_set_name=training_condition, embeddings_dict=embeddings)
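
# Both training stages visualise the learned embeddings in two dimensions via t-SNE. A minimal, hypothetical
# sketch of what plot_2D_embeddings could do with scikit-learn and matplotlib -- the figure layout and file
# naming below are assumptions, not the project's actual implementation:
def _sketch_plot_2D_embeddings(embeddings_dict, training_condition, training_stage):
    """Project {word: vector} embeddings to 2-D with t-SNE and save a labelled scatter plot."""
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE

    words = sorted(embeddings_dict)
    vectors = np.array([embeddings_dict[w] for w in words])
    points = TSNE(n_components=2, random_state=0).fit_transform(vectors)

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], s=5)
    for word, (x, y) in zip(words, points):
        plt.annotate(word, (x, y), fontsize=4)
    plt.title("%s (%s)" % (training_condition, training_stage))
    plt.savefig("%s_embeddings_%s.png" % (training_condition, training_stage), dpi=300)
    plt.close()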