def load_data(self, data_splits):
    anno = {}
    for data_split in data_splits:
        # data_path = osp.join(cfg.DATA_DIR, cfg.IMDB_NAME, 'format_%s.pkl' % str(data_split))
        data_path = cfg.ANNO_PATH % str(data_split)
        t_anno = load(data_path)
        anno.update(t_anno)
    return anno
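# Hedged usage sketch for load_data (not part of the original code): the split
# names below are illustrative, and it is assumed that cfg.ANNO_PATH is a
# '%s'-style template for the per-split annotation pickles.
#
#     anno = self.load_data(['train', 'val'])
#     # 'anno' now merges both splits; keys from later splits overwrite earlier
#     # ones, since dict.update is applied in split order.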
def __init__(self, model_name, data_name, cv_runs, params_dict, logger):
    print("Loading data...")
    if data_name == "wiki":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKI_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.WIKI_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
        num_types = len(type2id)
        type_info = config.WIKI_TYPE
    elif data_name == "ontonotes":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.ONTONOTES_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
        num_types = len(type2id)
        type_info = config.ONTONOTES_TYPE  # e.g. "./data/corpus/OntoNotes/type.pkl"
    elif data_name == "wikim":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKIM_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.WIKIM_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.WIKIM_TYPE)
        num_types = len(type2id)
        type_info = config.WIKIM_TYPE

    self.id2type = {type2id[x]: x for x in type2id.keys()}

    def type2vec(types):
        # whitespace-separated type string -> multi-hot vector of length num_types
        tmp = np.zeros(num_types)
        for t in types.split():
            tmp[type2id[t]] = 1.0
        return tmp

    labels_train = np.array([type2vec(t) for t in labels_train])  # multi-hot label vectors
    labels = np.array([type2vec(t) for t in labels])  # [test_size, num_types]

    self.embedding = embedding_utils.Embedding.fromCorpus(
        config.EMBEDDING_DATA, list(words_train) + list(words),
        config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
    # MAX_DOCUMENT_LENGTH = 30
    # MENTION_SIZE = 15
    # WINDOW_SIZE = 10

    print("Preprocessing data...")
    textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train])  # 1-D array [total]; sentence length capped at 30
    words_train = np.array([self.embedding.text_transform1(x) for x in words_train])  # 2-D array of word ids, [total, 30]
    mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train])  # [total]; mention length capped at 15
    mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train])  # [total, 15]
    positions_train = np.array([self.embedding.position_transform(x) for x in positions_train])  # [total, 30]

    textlen = np.array([self.embedding.len_transform1(x) for x in words])
    words = np.array([self.embedding.text_transform1(x) for x in words])
    mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
    mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
    positions = np.array([self.embedding.position_transform(x) for x in positions])

    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
    for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
        textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
        words_test, words_valid = words[test_index], words[valid_index]
        mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
        mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
        positions_test, positions_valid = positions[test_index], positions[valid_index]
        labels_test, labels_valid = labels[test_index], labels[valid_index]

    # Each example is a tuple (sentence[30], textlen, mention[15], mentionlen, positions[30], type[num_types]).
    self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
    self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
    self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
    self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))

    self.labels_test = labels_test
    self.labels = labels

    self.model_name = model_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    self.logger = logger

    self.num_types = num_types
    self.type_info = type_info

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables())
    checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
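# --- Illustrative sketch (not part of the original code) ---------------------
# type2vec above turns a whitespace-separated type string into a multi-hot
# vector of length num_types. The toy type2id mapping below is made up purely
# to show the shape of the encoding; the real mapping comes from config.*_TYPE.
def _type2vec_sketch():
    import numpy as np
    type2id = {"/person": 0, "/person/artist": 1, "/location": 2}

    def type2vec(types, num_types=len(type2id)):
        vec = np.zeros(num_types)
        for t in types.split():
            vec[type2id[t]] = 1.0
        return vec

    # "/person /person/artist" -> array([1., 1., 0.])
    return type2vec("/person /person/artist")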
def get_types(model_name, input_file, dev_file, output_file, options):
    checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name)
    type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
    id2type = {type2id[x]: x for x in type2id.keys()}  # rebuilt here because the data is different

    # words, mentions, positions, labels = data_utils.load(input_file)
    # n = len(words)
    embedding = embedding_utils.Embedding.restore(checkpoint_file)

    test_set, test_labels, test_tokenized = create_labelset_input(
        *data_utils.load(input_file), embedding)
    dev_set, dev_labels, dev_tokenized = create_labelset_input(
        *data_utils.load(dev_file), embedding)

    store = StructuredLogitsStore(
        model_name,
        idx2label=id2type,
        hierarchical="hier" in model_name,
        nested=False)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Look up the graph operations needed to feed and run the restored model.
        input_words = graph.get_operation_by_name("input_words").outputs[0]
        input_textlen = graph.get_operation_by_name("input_textlen").outputs[0]
        input_mentions = graph.get_operation_by_name("input_mentions").outputs[0]
        input_mentionlen = graph.get_operation_by_name("input_mentionlen").outputs[0]
        input_positions = graph.get_operation_by_name("input_positions").outputs[0]
        phase = graph.get_operation_by_name("phase").outputs[0]
        dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0]
        rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0]

        pred_op = graph.get_operation_by_name("output/predictions").outputs[0]
        # proba_op = graph.get_operation_by_name("output/proba").outputs[0]
        logit_op = graph.get_operation_by_name("output/scores").outputs[0]
        tune_op = graph.get_operation_by_name("tune").outputs[0]  # K x K
        # results_op = graph.get_operation_by_name("results").outputs[0]  # requires labels

        # Run inference over the test set, then do the same for the dev set.
        test_batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False)
        all_predictions = []
        all_logits = []
        for batch in test_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(*batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate([all_predictions, batch_predictions])
            # probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)
            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=test_labels,
                             tokenized=test_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "test")
        store.score_set("test")

        dev_batches = data_utils.batch_iter(dev_set, 512, 1, shuffle=False)
        all_predictions = []
        all_logits = []
        for batch in dev_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(*batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate([all_predictions, batch_predictions])
            # probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)
            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=dev_labels,
                             tokenized=dev_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "dev")
        store.score_set("dev")
        # np.transpose(prior_utils.create_prior(type_info, hparams.alpha))
        # all_logits.append(logit_predictions)

        # save as pickle
        with open(os.path.join(os.path.dirname(checkpoint_file), "logits.pickle"), "wb") as f:
            pickle.dump(store, f)
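# --- Optional refactoring sketch (not part of the original code) -------------
# The test and dev loops above are identical except for their inputs. A helper
# along these lines could run the batched inference once per split; the tensor
# handles passed in are the ones looked up from the restored graph above.
def _run_split_sketch(sess, batches, input_words, input_textlen, input_mentions,
                      input_mentionlen, input_positions, phase, dense_dropout,
                      rnn_dropout, pred_op, logit_op):
    import numpy as np
    all_predictions, all_logits = [], None
    for batch in batches:
        words_b, textlen_b, mentions_b, mentionlen_b, positions_b, _ = zip(*batch)
        feed = {
            input_words: words_b, input_textlen: textlen_b,
            input_mentions: mentions_b, input_mentionlen: mentionlen_b,
            input_positions: positions_b,
            phase: False, dense_dropout: 1.0, rnn_dropout: 1.0,
        }
        # One sess.run fetches both predictions and logits for the batch.
        preds, logits = sess.run([pred_op, logit_op], feed_dict=feed)
        all_predictions = np.concatenate([all_predictions, preds])
        all_logits = logits if all_logits is None else np.concatenate([all_logits, logits])
    return all_predictions, all_logits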
def load(self):
    self.idx2token = load(self.save_dir + '/idx2token.pkl')
    self.token2idx = load(self.save_dir + '/token2idx.pkl')
    self.word_freq = load(self.save_dir + '/word_freq.pkl')
    self.special = load(self.save_dir + '/special_words.pkl')
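# --- Hypothetical counterpart (not in the original code) ---------------------
# load() above assumes the four pickles already exist under self.save_dir. A
# save() along these lines would produce them; it uses the standard pickle
# module directly rather than assuming a project-specific dump() helper exists.
def save(self):
    import pickle
    for name, obj in [('idx2token', self.idx2token),
                      ('token2idx', self.token2idx),
                      ('word_freq', self.word_freq),
                      ('special_words', self.special)]:
        with open(self.save_dir + '/' + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f)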
def __init__(self, model_name, data_name, cv_runs, params_dict, logger, portion=100, save_name=''):
    print("Loading data...")
    if portion <= 100:
        # use portion% of the clean data plus all noisy data
        self.portion = '-' + str(portion) if portion != 100 else ''
    else:
        # portion > 100 means: use only clean data, (portion / 100)% of it
        portion /= 100
        self.portion = '-' + str(int(portion)) + '-clean'
    print('run task on: ', self.portion, ' dataset: ', data_name)

    if data_name == "ontonotes":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(
            config.ONTONOTES_TRAIN_CLEAN + self.portion)
        words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
        num_types = len(type2id)
        type_info = config.ONTONOTES_TYPE
    elif data_name == "bbn":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(
            config.BBN_TRAIN_CLEAN + self.portion)
        words, mentions, positions, labels = data_utils.load(config.BBN_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.BBN_TYPE)
        num_types = len(type2id)
        type_info = config.BBN_TYPE
    else:
        assert False, 'you have to specify the dataset name with -d (e.g. bbn, ontonotes)'

    self.model_name = model_name
    self.savename = save_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    # self.hparams.alpha = alpha
    self.logger = logger

    self.id2type = {type2id[x]: x for x in type2id.keys()}

    def type2vec(types):
        # only types present in type2id (terminal labels) are marked
        tmp = np.zeros(num_types)
        for t in str(types).split():
            if t in type2id.keys():
                tmp[type2id[t]] = 1.0
        return tmp

    labels_train = np.array([type2vec(t) for t in labels_train])  # multi-hot label vectors
    labels = np.array([type2vec(t) for t in labels])

    tempname = self.data_name + config.testemb
    tempname = os.path.join(config.PKL_DIR, tempname)
    if os.path.exists(tempname):
        self.embedding = pickle.load(open(tempname, 'rb'))
        print('embedding load over')
    else:
        self.embedding = embedding_utils.Embedding.fromCorpus(
            config.EMBEDDING_DATA, list(words_train) + list(words),
            config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
        pickle.dump(self.embedding, open(tempname, 'wb'))
        print('embedding dump over')
    self.embedding.max_document_length = config.MAX_DOCUMENT_LENGTH

    print("Preprocessing data...")
    textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train])  # sentence lengths, capped
    words_train = np.array([self.embedding.text_transform1(x) for x in words_train])  # word-id sequences, padded with <PAD>
    mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train])  # mention lengths
    mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train])  # mention word-id sequences
    positions_train = np.array([self.embedding.position_transform(x) for x in positions_train])  # (start, end) positions
    print('get train data')

    textlen = np.array([self.embedding.len_transform1(x) for x in words])
    words = np.array([self.embedding.text_transform1(x) for x in words])  # padded and truncated
    mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
    mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
    positions = np.array([self.embedding.position_transform(x) for x in positions])
    print('get test data')

    # Alternative (commented out): dump the preprocessed arrays to a pickle and
    # reload them instead of recomputing:
    # pickle.dump([textlen_train, words_train, mentionlen_train, mentions_train, positions_train,
    #              textlen, words, mentionlen, mentions, positions],
    #             open(self.data_name + config.prep + self.portion, 'wb'))
    # print('dump preprocessed data to pkl over...')
    # textlen_train, words_train, mentionlen_train, mentions_train, \
    #     positions_train, textlen, words, mentionlen, mentions, positions = pickle.load(
    #         open(self.data_name + config.prep + self.portion, 'rb'))
    # print('load preprocessed data from pkl over...')

    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
    for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
        # split test/valid by index
        textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
        words_test, words_valid = words[test_index], words[valid_index]
        mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
        mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
        positions_test, positions_valid = positions[test_index], positions[valid_index]
        labels_test, labels_valid = labels[test_index], labels[valid_index]

    self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
    self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
    self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
    self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))

    self.labels_test = labels_test
    self.labels = labels
    self.labels_valid = labels_valid

    self.num_types = num_types
    self.type_info = type_info
    self.logger.info("train set size: %d, test set size: %d" % (len(self.train_set), len(self.full_test_set)))

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables())
    checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
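# --- Illustrative sketch (not part of the original code) ---------------------
# The portion argument above encodes two regimes in one number: values <= 100
# select "portion% clean + all noisy" data, while values > 100 are interpreted
# as (portion / 100)% of the clean data only. The helper below just reproduces
# that suffix logic so the mapping is easy to see.
def _portion_suffix_sketch(portion):
    if portion <= 100:
        return '-' + str(portion) if portion != 100 else ''
    return '-' + str(int(portion / 100)) + '-clean'

# _portion_suffix_sketch(100)  -> ''           (full training file)
# _portion_suffix_sketch(30)   -> '-30'        (30% clean + all noisy)
# _portion_suffix_sketch(3000) -> '-30-clean'  (30% of the clean data only)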