def _print_op_list(ops, dmap):
    for d, o in ops:
        print "#############################################"
        print " ".join(DataTools.mark_unknown(d.tokens, word_embeddings.vocabulary.word2index))
        print "ORIG:\t", _ops2str(d.opinions)
        print o
        print ""
        print " :\t", _ops2str(dmap[d.id].opinions)
def build(self):
    polarities = set()
    categories = set()
    entities = set()
    attributes = set()
    for s in self.sentences:
        for o in s.opinions:
            polarities.add(o.polarity)
            categories.add(o.category)
            entities.add(o.entity)
            attributes.add(o.attribute)

    self.polarity_table = DataTools.Vocabulary()
    self.polarity_table.init_from_vocab(polarities)
    self.category_table = DataTools.Vocabulary()
    self.category_table.init_from_vocab(categories)
    self.entity_table = DataTools.Vocabulary()
    self.entity_table.init_from_vocab(entities)
    self.attribute_table = DataTools.Vocabulary()
    self.attribute_table.init_from_vocab(attributes)
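# Usage sketch for build() above (names are illustrative): once the lookup
# tables are built, each one maps a label string to an integer index, e.g. for
# constructing one-hot targets. Assumes `dataset` is an instance of the
# surrounding class with `sentences` already populated; Vocabulary.get_index
# and len(vocabulary) are used with these signatures elsewhere in this codebase.
def _example_label_tables(dataset):
    dataset.build()
    print("polarity index of 'positive':", dataset.polarity_table.get_index("positive"))
    print("number of distinct categories:", len(dataset.category_table))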
def _print_list(l):
    for d, td, cd in l:
        d.opinions = sorted(d.opinions, key=lambda o: o.start)
        td.opinions = sorted(td.opinions, key=lambda o: o.start)
        cd.opinions = sorted(cd.opinions, key=lambda o: o.start)
        print "#############################################"
        print " ".join(DataTools.mark_unknown(d.tokens, word_embeddings.vocabulary.word2index))
        print ""
        print "ORIG:\t", _ops2str(d.opinions)
        print ""
        print "TOKN:\t", _ops2str(td.opinions)
        print ""
        print "CHAR:\t", _ops2str(cd.opinions)
from nlputils import DataTools

import data

for top_k in [10000, 20000, 50000]:
    word_embeddings = DataTools.Embedding()
    word_embeddings.load(
        "/vol/scstaff/sjebbara/data/embeddings/amazon_review_corpus_en_100D_advanced_W.npy",
        "/vol/scstaff/sjebbara/data/embeddings/amazon_review_corpus_en_100D_advanced_vocab.txt")
    word_embeddings.trim_embeddings(vocab_trim=["<UNK>"], top_k=top_k)
    word_embeddings.vocabulary.set_unknown(word_embeddings.vocabulary.get_index("<UNK>"))
    word_embeddings.add("<pad>", 0, vector_init="zeros")
    word_embeddings.vocabulary.set_padding(0)
    word_embeddings.add(data.SENTENCE_START_TOKEN, 1, vector_init="zeros")
    word_embeddings.add(data.SENTENCE_END_TOKEN, 2, vector_init="zeros")
    word_embeddings.save("../res/embeddings/",
                         "amazon_review_corpus_en_100D_advanced_top-{}".format(top_k))
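# Sanity-check sketch: reload one of the trimmed embedding files written above
# and restore the special-token indices (this mirrors the load calls used for
# training further below).
def _example_load_trimmed(top_k=10000):
    embeddings = DataTools.Embedding()
    embeddings.load(
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_W.npy".format(top_k),
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_vocab.txt".format(top_k))
    embeddings.vocabulary.set_padding(embeddings.vocabulary.get_index("<pad>"))
    embeddings.vocabulary.set_unknown(embeddings.vocabulary.get_index("<UNK>"))
    print("vocabulary size:", len(embeddings.vocabulary))
    print("embedding matrix shape:", embeddings.W.shape)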
def evaluate_aspects(model, documents, word_vocabulary, pos_vocabulary, char_vocabulary, conf, verbose=True):
    print "Evaluate model ..."
    tagging_scheme = AnnotationTools.get_tagging_scheme(conf.tagging_scheme)
    documents = filter(lambda d: not d.out_of_scope, documents)
    eval_batch_size = 100
    batch_generator = DatasetTools.BatchGenerator(
        documents, eval_batch_size,
        get_vectorizer(word_vocabulary, pos_vocabulary, char_vocabulary, tagging_scheme),
        raw_data_name="document")

    results = LearningTools.ExperimentSnapshotResults()
    errors = 0
    for i, batches in enumerate(batch_generator):
        actual_batch_size = len(batches.text_input)
        i_text = i * eval_batch_size + actual_batch_size
        print("Evaluate Batch %d; Text %d:" % (i + 1, i_text))
        predicted_aspect_batch = model.predict_on_batch(batches)
        batches["predicted_aspect_output"] = predicted_aspect_batch
        for instance in DatasetTools.BatchIterator([batches]):
            d = instance["document"]
            true_aspects = instance["aspect_output"]
            predicted_aspect_probas = instance["predicted_aspect_output"]
            if verbose:
                print u"#### Sentence: [{}]: '{}'".format(d.id, d.text)
            tokens = DataTools.mark_unknown(d.tokens, word_vocabulary.word2index)
            true_aspects = true_aspects[-len(tokens):, :]  # remove padding
            predicted_aspect_probas = predicted_aspect_probas[-len(tokens):, :]  # remove padding

            true_aspect_spans_orig = set((o.token_start, o.token_end) for o in d.opinions)
            true_aspect_spans = set(tagging_scheme.encoding2spans(true_aspects))
            if true_aspect_spans != true_aspect_spans_orig:
                print "ERROR: {} vs. {}".format(true_aspect_spans_orig, true_aspect_spans)
                errors += 1
            predicted_aspect_spans = set(tagging_scheme.encoding2spans(predicted_aspect_probas))

            tokens_proba = [
                u"{} ({:.2f},{:.2f},{:.2f})".format(t, pb, pi, po)
                for t, (pb, pi, po) in zip(tokens, predicted_aspect_probas)
            ]
            if verbose:
                print "TRUE: ", tagging_scheme.visualize_tags(
                    tokens_proba,
                    tagging_scheme.spans2tags(len(tokens), true_aspect_spans),
                    spacer=" ")
                print "PRED: ", tagging_scheme.visualize_tags(
                    tokens_proba,
                    tagging_scheme.spans2tags(len(tokens), predicted_aspect_spans),
                    spacer=" ")

            data_sample = DataTools.DataSample()
            data_sample.document = d
            data_sample.true_aspect_spans = true_aspect_spans
            data_sample.predicted_aspect_spans = predicted_aspect_spans
            data_sample.predicted_aspect_probas = predicted_aspect_probas
            results.add(data_sample)

    def extract_aspects(min_confidence=0.75):
        all_true_aspects = set()
        all_predicted_aspects = set()
        for ds in results.data_samples:
            for a in ds.true_aspect_spans:
                all_true_aspects.add((ds.document.id,) + a)
            for a in ds.predicted_aspect_spans:
                # keep a predicted span only if its mean per-token confidence is high enough
                probas = numpy.max(ds.predicted_aspect_probas[a[0]:a[1]], axis=1)
                if numpy.mean(probas) > min_confidence:
                    all_predicted_aspects.add((ds.document.id,) + a)
        return all_true_aspects, all_predicted_aspects

    def score(beta=1, min_confidence=0.):
        all_true_aspects, all_predicted_aspects = results.extract_aspects(min_confidence)
        return EvaluationTools.f1(beta=beta, targets=all_true_aspects,
                                  predictions=all_predicted_aspects)

    results.extract_aspects = extract_aspects
    results.score = score

    f1, p, r = results.score(min_confidence=0)
    print "F1: {:.3f}".format(f1)
    print "P: {:.3f}".format(p)
    print "R: {:.3f}".format(r)
    print "#Errors:", errors
    return results
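# Toy illustration of the span-confidence filter in extract_aspects above: for
# each predicted span, take the per-token maximum tag probability and keep the
# span only if the mean over its tokens exceeds min_confidence. The numbers
# below are made up for demonstration.
def _example_span_confidence(min_confidence=0.75):
    import numpy
    # 3 tokens x 3 tag probabilities (e.g. B/I/O) for one predicted span
    span_probas = numpy.array([[0.9, 0.05, 0.05],
                               [0.1, 0.80, 0.10],
                               [0.2, 0.70, 0.10]])
    confidence = numpy.mean(numpy.max(span_probas, axis=1))  # (0.9 + 0.8 + 0.7) / 3 = 0.8
    print("span confidence: {:.2f} -> kept: {}".format(confidence, confidence > min_confidence))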
def main(conf, plot_scores=True):
    conf.experiment_id = "AspectBasedSentiment_Configuration_" + LearningTools.get_timestamp()
    print(conf)
    base_dirpath = os.path.join(EXPERIMENTS_OUTPUT_DIR,
                                "AspectBasedSentiment_" + conf.timestamp, conf.experiment_id)
    os.makedirs(base_dirpath)

    print("read dataset...")
    # Read documents and split into train/val portions
    if conf.data_split == "original":
        train_dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        blind_test_documents = data.read_semeval2016_restaurant_blind_test(
            conf.scope, conf.text_preprocessing, conf.tokenization_style).sentences
        train_documents, val_documents = DataTools.custom_split(train_dataset.sentences, 0.8, seed=7)
        train_test_splits = [(train_documents, val_documents)]
    elif conf.data_split == "custom":
        dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        train_documents, test_documents = DataTools.custom_split(dataset.sentences, 0.8, seed=7)
        train_test_splits = [(train_documents, test_documents)]
    elif conf.data_split == "cv":
        train_dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        train_test_splits = DataTools.cross_validation_split(
            train_dataset.sentences, conf.n_cross_validation, seed=7)

    # read word embeddings
    word_embeddings = DataTools.Embedding()
    word_embeddings.load(
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_W.npy".format(conf.top_k_vocab),
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_vocab.txt".format(conf.top_k_vocab))
    word_embeddings.vocabulary.set_padding(word_embeddings.vocabulary.get_index("<pad>"))
    word_embeddings.vocabulary.set_unknown(word_embeddings.vocabulary.get_index("<UNK>"))
    conf.word_input_size = len(word_embeddings.vocabulary)
    conf.word_embedding_size = word_embeddings.W.shape[1]

    # build character vocabulary (map from character to index and back)
    char_vocabulary = DataTools.Vocabulary()
    char_vocab = Counter(c for w in word_embeddings.vocabulary.vocab for c in unidecode(w) if c != " ")
    print(char_vocab.most_common())
    char_vocabulary.init_from_vocab(char_vocab)
    char_vocabulary.add_padding("<0>", 0)
    char_vocabulary.add_unknown("<?>", 1)
    char_vocabulary.save(os.path.join(base_dirpath, "char_vocabulary.txt"))
    conf.char_input_size = len(char_vocabulary)

    pos_vocabulary = LexicalTools.pos_vocabulary
    conf.pos_input_size = len(pos_vocabulary)
    if not conf.use_pos:
        pos_vocabulary = None

    # set up plotting
    if plot_scores:
        score_plot = LearningTools.ScorePlot("Aspect Extraction",
                                             n_cross_validation=len(train_test_splits),
                                             n_epochs=conf.n_epochs)

    # iterate over cross-validation splits and train a model per split
    for n, (train_documents, val_documents) in enumerate(train_test_splits):
        cv_dirpath = os.path.join(base_dirpath, "cv-{}".format(n + 1))
        os.makedirs(cv_dirpath)
        conf.save(os.path.join(cv_dirpath, "configuration.conf"))

        best_epoch = 0
        best_score = 0
        model_name = "{}_{}_n-docs={}_batch-size={}_epochs={}_s-size={}_c-size={}_topK={}".format(
            conf.model, conf.dataset, conf.max_documents, conf.batch_size, conf.n_epochs,
            conf.sequence_embedding_size, conf.char_embedding_size, conf.top_k_vocab)
        print("Model:", model_name)
        print(conf)

        # instantiate model using the defined configuration;
        # modelz[0] is the model for tagging sentences, modelz[1] for obtaining a char-level vector for a word
        model_fn = models.__dict__[conf.model]
        modelz = model_fn(word_embedding_weights=[word_embeddings.W], **conf)
        model = modelz[0]
        model.summary()

        models_dirpath = os.path.join(cv_dirpath, "models")
        os.makedirs(models_dirpath)

        for e in range(conf.n_epochs):
            process.train_aspects(model, train_documents, word_embeddings.vocabulary,
                                  pos_vocabulary, char_vocabulary, conf, e, n_epochs=conf.n_epochs)

            print("\n\nEvaluate on TRAIN")
            train_results = process.evaluate_aspects(model, train_documents,
                                                     word_embeddings.vocabulary, pos_vocabulary,
                                                     char_vocabulary, conf, verbose=False)
            print("\n\nEvaluate on VAL")
            val_results = process.evaluate_aspects(model, val_documents, word_embeddings.vocabulary,
                                                   pos_vocabulary, char_vocabulary, conf)

            if conf.data_split == "original":
                predict_documents = blind_test_documents
            else:
                predict_documents = val_documents
            process.predict_and_write(
                os.path.join(cv_dirpath, "epoch={}_predicted_aspects.xml".format(e + 1)),
                model, predict_documents, word_embeddings.vocabulary, pos_vocabulary,
                char_vocabulary, conf)

            f1_train, p_train, r_train = train_results.score(min_confidence=0)
            f1, p, r = val_results.score(min_confidence=0)
            if plot_scores:
                score_plot.add(n, e, f1_train, "F1-Train")
                score_plot.add(n, e, f1, "F1")
                score_plot.add(n, e, p, "P")
                score_plot.add(n, e, r, "R")
                score_plot.print_scores("F1")

            if e > 1 and f1 > best_score:
                model.save_weights(os.path.join(models_dirpath, "weights@{}.h5".format(e + 1)))
                best_score = f1
                best_epoch = e + 1

            with io.open(os.path.join(cv_dirpath, "scores.txt"), "a") as f:
                f.write(u"{:.6f}\n".format(f1))
            print("best model: epoch {} with F1 {:.6f}".format(best_epoch, best_score))

        ############ Save Model Weights ############
        model.save_weights(os.path.join(models_dirpath, "final_weights.h5"))

        if plot_scores:
            numpy.save("../results/scores_{}.npy".format(conf.model), score_plot.scores["F1"])
        print("Best Epoch {} with score {}".format(best_epoch, best_score))
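# Sketch: reload one of the per-epoch checkpoints written by the loop above.
# The epoch number is illustrative; pick the epoch reported as best.
def _example_reload_checkpoint(model, models_dirpath, epoch=5):
    model.load_weights(os.path.join(models_dirpath, "weights@{}.h5".format(epoch)))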
def _ops2str(opinions):
    return "|".join([
        "({}-{}): '{}'".format(
            o.start, o.end,
            " ".join(DataTools.mark_unknown(o.tokens, word_embeddings.vocabulary.word2index)))
        for o in opinions
    ])
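# Example of the string _ops2str produces, using hypothetical opinion objects
# with start/end offsets and pre-tokenized text. Assumes the module-level
# word_embeddings have been loaded; with all tokens in-vocabulary this prints:
#   (4-14): 'great pizza'|(20-27): 'service'
def _example_ops2str():
    class _Op(object):
        def __init__(self, start, end, tokens):
            self.start, self.end, self.tokens = start, end, tokens
    print(_ops2str([_Op(4, 14, ["great", "pizza"]), _Op(20, 27, ["service"])]))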
    os.path.join(experiment_base_dirpath, cv_dirname, "configuration.conf"))
print(conf)

model_fn = models.__dict__[conf.model]
modelz = model_fn(word_embedding_weights=None, **conf)
char_model = modelz[1]

# load the trained weights
char_model.load_weights(os.path.join(experiment_base_dirpath, cv_dirname,
                                     "models/best_model.h5"), by_name=True)

# load resources: character vocabulary and pretrained word embeddings
char_vocabulary = DataTools.Vocabulary()
char_vocabulary.load(os.path.join(experiment_base_dirpath, "char_vocabulary.txt"))
char_vocabulary.set_padding(char_vocabulary.get_index("<0>"))
char_vocabulary.set_unknown(char_vocabulary.get_index("<?>"))
print(char_vocabulary)

word_embeddings = DataTools.Embedding()
word_embeddings.load(
    "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-100000_W.npy",
    "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-100000_vocab.txt")
word_embeddings.vocabulary.set_padding(word_embeddings.vocabulary.get_index("<pad>"))
word_embeddings.vocabulary.set_unknown(word_embeddings.vocabulary.get_index("<UNK>"))
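# Sketch: use char_model to obtain a char-level vector for a single word (this
# is what modelz[1] is for). The input shape is an assumption; the real
# vectorizer may pad character sequences to a fixed length. Also assumes
# get_index falls back to the unknown index for unseen characters.
def _example_char_vector(word):
    import numpy
    char_indices = [char_vocabulary.get_index(c) for c in unidecode(word)]
    char_vector = char_model.predict(numpy.array([char_indices]))[0]
    print("char-level vector for '{}' has shape {}".format(word, char_vector.shape))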