def get_srl_test_data(filepath, config, word_dict, label_dict, allow_new_words=True):
    """Get the SRL test data from file.

    Args:
        filepath: path to the test file; if None or empty, no samples are read.
        config: configuration object (uses use_se_marker, word_embedding, features).
        word_dict: word Dictionary; its accept_new flag is set to allow_new_words.
        label_dict: label Dictionary; frozen here if still accepting new labels.
        allow_new_words: when True, unseen words that have a pretrained
            embedding may be added to word_dict during id conversion.

    Returns:
        (sentences, [word_embedding, None, None],
         [word_embedding_shape] + feature_shapes), where each sentence is a
        tuple (sentence_id, token_ids, feature_ids..., label_ids).
    """
    word_dict.accept_new = allow_new_words
    if label_dict.accept_new:
        # Freeze the label vocabulary: unseen labels map to UNKNOWN_LABEL.
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    # Truthiness covers both None and the empty string.
    samples = get_srl_sentences(filepath, config.use_se_marker) if filepath else []

    word_to_embeddings = get_pretrained_embeddings(WORD_EMBEDDINGS[config.word_embedding])
    if allow_new_words:
        # Pass the embedding table so new words backed by a pretrained
        # vector can be added to the dictionary.
        tokens = [string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
                  for sent in samples]
    else:
        tokens = [string_sequence_to_ids(sent[1], word_dict, True) for sent in samples]

    test_sentences_ids = [sent[0] for sent in samples]
    labels = [string_sequence_to_ids(sent[3], label_dict) for sent in samples]
    srl_features, feature_shapes = features.get_srl_features(samples, config)

    # One tuple per sentence: (id, token_ids, feature_ids..., label_ids).
    sentences = [(sid,) + (toks,) + tuple(feats) + (labs,)
                 for sid, toks, feats, labs
                 in zip(test_sentences_ids, tokens, srl_features, labels)]

    # NOTE(review): assumes every word in word_dict has a pretrained
    # embedding; a word without one raises KeyError here — confirm upstream.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (sentences,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes)
def get_srl_test_data_gemb(filepath, config, word_dict, label_dict, allow_new_words=False):
    """Get the SRL test data for the GEMB setup.

    Unlike get_srl_test_data, the word dictionary is always frozen here:
    pretrained embeddings must not be used to grow the vocabulary at test
    time, so the allow_new_words argument is deliberately ignored (kept only
    for signature compatibility).

    Args:
        filepath: path to the test file; if None or empty, no samples are read.
        config: configuration object (uses use_se_marker, word_embedding, features).
        word_dict: word Dictionary; frozen (accept_new = False) by this call.
        label_dict: label Dictionary; frozen here if still accepting new labels.
        allow_new_words: ignored — always treated as False.

    Returns:
        (sentences, [word_embedding, None, None],
         [word_embedding_shape] + feature_shapes), where each sentence is a
        tuple (token_ids, feature_ids..., label_ids).
    """
    # Deliberately override the caller's value: should not make use of
    # pretrained embeddings at test time.
    allow_new_words = False
    word_dict.accept_new = allow_new_words
    if label_dict.accept_new:
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    samples = get_srl_sentences(filepath, config.use_se_marker) if filepath else []

    word_to_embeddings = get_pretrained_embeddings(WORD_EMBEDDINGS[config.word_embedding])
    # allow_new_words is forced to False above, so the embedding-aware
    # lookup branch of get_srl_test_data is unreachable and was removed.
    tokens = [string_sequence_to_ids(sent[0], word_dict, True) for sent in samples]
    labels = [string_sequence_to_ids(sent[2], label_dict) for sent in samples]
    srl_features, feature_shapes = features.get_srl_features(samples, config)

    # One tuple per sentence: (token_ids, feature_ids..., label_ids).
    sentences = [(toks,) + tuple(feats) + (labs,)
                 for toks, feats, labs in zip(tokens, srl_features, labels)]

    # NOTE(review): assumes every word in word_dict has a pretrained
    # embedding; a word without one raises KeyError here — confirm upstream.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (sentences,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes)
def get_srl_test_data(filepath, config, word_dict, label_dict, lower_case=True, allow_new_words=True): word_dict.accept_new = allow_new_words if label_dict.accept_new: label_dict.set_unknown_token(UNKNOWN_LABEL) label_dict.accept_new = False if filepath != None and filepath != '': print "Getting sentences from",filepath samples = get_srl_sentences(filepath, config.use_se_marker) else: samples = [] embeddings_file = config.embedding_file if config.embedding_file is not None else WORD_EMBEDDINGS[config.word_embedding] print "Reading", embeddings_file word_to_embeddings = get_pretrained_embeddings(embeddings_file) print "Done, got",len(word_to_embeddings) if allow_new_words: tokens = [string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings) for sent in samples] else: tokens = [string_sequence_to_ids(sent[0], word_dict, True) for sent in samples] # for i in range(5): # print tokens[i], samples[i] labels = [string_sequence_to_ids(sent[2], label_dict) for sent in samples] srl_features, feature_shapes = features.get_srl_features(samples, config) sentences = [] for i in range(len(tokens)): sentences.append((tokens[i],) + tuple(srl_features[i]) + (labels[i],)) #word_embedding = [get_embeddings(w, word_to_embeddings) for w in word_dict.idx2str] word_embedding = [word_to_embeddings[w] if w in word_to_embeddings else word_to_embeddings[UNKNOWN_TOKEN] for w in word_dict.idx2str] # for i in range(10): # w = word_dict.idx2str[i] # print(w,word_embedding[i][:10]) word_embedding_shape = [len(word_embedding), len(word_embedding[0])] return (sentences, [word_embedding, None, None], [word_embedding_shape,] + feature_shapes)
def get_srl_data(config, train_data_path, dev_data_path, vocab_path=None, label_path=None): ''' ''' use_se_marker = config.use_se_marker raw_train_sents = get_srl_sentences(train_data_path, use_se_marker) raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker) word_to_embeddings = get_pretrained_embeddings( WORD_EMBEDDINGS[config.word_embedding]) # get pre-trained embeddings # Prepare word dictionary. word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN) if use_se_marker: word_dict.add_all([START_MARKER, END_MARKER]) if vocab_path != None: with open(vocab_path, 'r') as f_vocab: for line in f_vocab: word_dict.add(line.strip()) f_vocab.close() word_dict.accept_new = False print 'Load {} words. Dictionary freezed.'.format(word_dict.size()) # Parpare label dictionary. label_dict = Dictionary() if label_path != None: with open(label_path, 'r') as f_labels: for line in f_labels: label_dict.add(line.strip()) f_labels.close() label_dict.set_unknown_token(UNKNOWN_LABEL) label_dict.accept_new = False print 'Load {} labels. Dictionary freezed.'.format(label_dict.size()) # Get tokens and labels: [sentence_id, word, predicate, label] train_sentences_ids = [sent[0] for sent in raw_train_sents] train_tokens = [ string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings) for sent in raw_train_sents ] train_labels = [ string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents ] if label_dict.accept_new: label_dict.set_unknown_token( UNKNOWN_LABEL) # train corpus contains the label 'O' ? 
label_dict.accept_new = False dev_sentences_ids = [sent[0] for sent in raw_dev_sents] dev_tokens = [ string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings) for sent in raw_dev_sents ] dev_labels = [ string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents ] print 'Total tokens in Dev dataset {}'.format( sum([len(sent[1]) for sent in raw_dev_sents])) # Get features print 'Extracting features' train_features, feature_shapes = features.get_srl_features( raw_train_sents, config) dev_features, feature_shapes2 = features.get_srl_features( raw_dev_sents, config) for f1, f2 in zip(feature_shapes, feature_shapes2): assert f1 == f2 # For additional features. Unused now. feature_dicts = [] for feature in config.features: feature_dicts.append(None) train_sents = [] dev_sents = [] for i in range(len(train_tokens)): train_sents.append((train_sentences_ids[i], ) + (train_tokens[i], ) + tuple(train_features[i]) + (train_labels[i], )) for i in range(len(dev_tokens)): dev_sents.append((dev_sentences_ids[i], ) + (dev_tokens[i], ) + tuple(dev_features[i]) + (dev_labels[i], )) print("Extraced {} words and {} tags".format(word_dict.size(), label_dict.size())) print("Max training sentence length: {}".format( max([len(s[1]) for s in train_sents]))) print("Max development sentence length: {}".format( max([len(s[1]) for s in dev_sents]))) word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str] word_embedding_shape = [len(word_embedding), len(word_embedding[0])] return (train_sents, dev_sents, word_dict, label_dict, [word_embedding, None, None], [word_embedding_shape] + feature_shapes, [ word_dict, ] + feature_dicts)