Example #1
    def load_data(self, data_splits):
        # Merge the pickled annotation dicts of all requested splits into one dict.
        anno = {}
        for data_split in data_splits:
            # data_path = osp.join(cfg.DATA_DIR, cfg.IMDB_NAME, 'format_%s.pkl' % str(data_split))
            data_path = cfg.ANNO_PATH % str(data_split)
            t_anno = load(data_path)
            anno.update(t_anno)
        return anno
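The `load` helper and the `cfg` object above come from the surrounding project and are not shown here. A minimal sketch of such a loader, assuming it simply unpickles the file at the given path (the plain-pickle format is an assumption):

import pickle

def load(path):
    # Assumed pickle-based loader: returns whatever object was dumped to `path`,
    # here a dict of annotations that load_data() merges across splits.
    with open(path, "rb") as f:
        return pickle.load(f)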
Example #2
    def __init__(self, model_name, data_name, cv_runs, params_dict, logger):
        print("Loading data...")
        if data_name == "wiki":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKI_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.WIKI_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
            num_types = len(type2id)
            type_info = config.WIKI_TYPE
        elif data_name == "ontonotes":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.ONTONOTES_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
            num_types = len(type2id)
            type_info = config.ONTONOTES_TYPE # "./data/corpus/OntoNotes/type.pkl"
        elif data_name == "wikim":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKIM_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.WIKIM_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.WIKIM_TYPE)
            num_types = len(type2id)
            type_info = config.WIKIM_TYPE

        self.id2type = {type2id[x]:x for x in type2id.keys()}
        def type2vec(types):
            tmp = np.zeros(num_types)
            for t in types.split():
                tmp[type2id[t]] = 1.0
            return tmp
        labels_train = np.array([type2vec(t) for t in labels_train])  # multi-hot label vectors [train_size, num_types]
        labels = np.array([type2vec(t) for t in labels])  # test labels [test_size, num_types]

        self.embedding = embedding_utils.Embedding.fromCorpus(config.EMBEDDING_DATA, list(words_train)+list(words), config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
        # MAX_DOCUMENT_LENGTH = 30
        # MENTION_SIZE = 15
        # WINDOW_SIZE = 10

        print("Preprocessing data...")
        textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train])  # 1-D array [total]; sentence length capped at 30
        words_train = np.array([self.embedding.text_transform1(x) for x in words_train])  # 2-D array of word ids [total, 30]
        mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train])  # [total]; mention length capped at 15
        mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train])  # [total, 15]
        positions_train = np.array([self.embedding.position_transform(x) for x in positions_train])  # [total, 30]

        textlen = np.array([self.embedding.len_transform1(x) for x in words])
        words = np.array([self.embedding.text_transform1(x) for x in words])
        mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
        mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
        positions = np.array([self.embedding.position_transform(x) for x in positions])

        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
        for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
            textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
            words_test, words_valid = words[test_index], words[valid_index]
            mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
            mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
            positions_test, positions_valid = positions[test_index], positions[valid_index]
            labels_test, labels_valid = labels[test_index], labels[valid_index]
        # shapes: words [?, 30], textlen [?], mentions [?, 15], mentionlen [?], positions [?, 30], labels [?, num_types]
        # --> ? tuples of (sentence, len, mention, len, positions, type)
        self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
        self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
        self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
        self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))

        self.labels_test = labels_test
        self.labels = labels

        self.model_name = model_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        self.logger = logger

        self.num_types = num_types
        self.type_info = type_info

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
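`type2vec` above produces multi-hot vectors, since a mention can carry several types at once. A self-contained sketch with a toy type vocabulary (the real `type2id` comes from `pkl_utils._load(config.WIKI_TYPE)`; the type strings below are made up):

import numpy as np

type2id = {"/person": 0, "/person/artist": 1, "/location": 2}  # toy vocabulary
num_types = len(type2id)

def type2vec(types):
    # Space-separated type string -> multi-hot vector over the type vocabulary.
    vec = np.zeros(num_types)
    for t in types.split():
        vec[type2id[t]] = 1.0
    return vec

print(type2vec("/person /person/artist"))  # [1. 1. 0.]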
Example #3
def get_types(model_name, input_file, dev_file, output_file, options):

    checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name)
    type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
    id2type = {type2id[x]: x for x in type2id.keys()}

    #different way? -> data is different!
    # words, mentions, positions, labels = data_utils.load(input_file)
    # n = len(words)

    embedding = embedding_utils.Embedding.restore(checkpoint_file)

    test_set, test_labels, test_tokenized = create_labelset_input(
        *data_utils.load(input_file), embedding)
    dev_set, dev_labels, dev_tokenized = create_labelset_input(
        *data_utils.load(dev_file), embedding)

    store = StructuredLogitsStore(
        model_name,
        idx2label=id2type,
        hierarchical="hier" in model_name,
        nested=False)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # DEFINE operations
        input_words = graph.get_operation_by_name("input_words").outputs[0]
        input_textlen = graph.get_operation_by_name("input_textlen").outputs[0]
        input_mentions = graph.get_operation_by_name(
            "input_mentions").outputs[0]
        input_mentionlen = graph.get_operation_by_name(
            "input_mentionlen").outputs[0]
        input_positions = graph.get_operation_by_name(
            "input_positions").outputs[0]
        phase = graph.get_operation_by_name("phase").outputs[0]
        dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0]
        rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0]

        pred_op = graph.get_operation_by_name("output/predictions").outputs[0]
        #proba_op = graph.get_operation_by_name("output/proba").outputs[0] #proba
        logit_op = graph.get_operation_by_name("output/scores").outputs[0]  # raw scores (logits)
        tune_op = graph.get_operation_by_name("tune").outputs[0]  # K x K
        # results_op = graph.get_operation_by_name("results").outputs[0] # require labels

        # DO THE SAME FOR DEV set!

        test_batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False)

        all_predictions = []
        all_logits = []
        for batch in test_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(
                *batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

            #probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)

            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=test_labels,
                             tokenized=test_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "test")
        store.score_set("test")

        dev_batches = data_utils.batch_iter(dev_set, 512, 1, shuffle=False)

        all_predictions = []
        all_logits = []
        for batch in dev_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(
                *batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

            #probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)

            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=dev_labels,
                             tokenized=dev_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "dev")
        store.score_set("dev")

        #np.transpose(prior_utils.create_prior(type_info, hparams.alpha)
        # all_logits.append(logit_predictions)

    # save as pickle
    with open(os.path.join(os.path.dirname(checkpoint_file), "logits.pickle"),
              "wb") as f:
        pickle.dump(store, f)
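`data_utils.batch_iter` is not shown here. A minimal sketch of a batching helper with the same call signature as `batch_iter(test_set, 512, 1, shuffle=False)` above; the exact behavior of the project's version is an assumption:

import random

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Yield lists of up to `batch_size` examples, making `num_epochs` passes over the data.
    data = list(data)
    for _ in range(num_epochs):
        order = list(range(len(data)))
        if shuffle:
            random.shuffle(order)
        for start in range(0, len(data), batch_size):
            yield [data[i] for i in order[start:start + batch_size]]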
    """     
Example #4
    def load(self):
        # Restore the pickled vocabulary tables from save_dir.
        self.idx2token = load(self.save_dir + '/idx2token.pkl')
        self.token2idx = load(self.save_dir + '/token2idx.pkl')
        self.word_freq = load(self.save_dir + '/word_freq.pkl')
        self.special = load(self.save_dir + '/special_words.pkl')
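The matching save side is not part of this example. A minimal sketch of what it could look like, assuming the same pickle-based helpers; `dump` and `save_vocab` are hypothetical names:

import os
import pickle

def dump(obj, path):
    # Hypothetical counterpart of the load() helper used above.
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def save_vocab(vocab, save_dir):
    # Persist the four tables that load() expects to find under save_dir.
    os.makedirs(save_dir, exist_ok=True)
    dump(vocab.idx2token, save_dir + '/idx2token.pkl')
    dump(vocab.token2idx, save_dir + '/token2idx.pkl')
    dump(vocab.word_freq, save_dir + '/word_freq.pkl')
    dump(vocab.special, save_dir + '/special_words.pkl')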
Example #5
    def __init__(self,
                 model_name,
                 data_name,
                 cv_runs,
                 params_dict,
                 logger,
                 portion=100,
                 save_name=''):
        print("Loading data...")
        if portion <= 100:  # all the data, portion% clean + all noisy
            self.portion = '-' + str(portion) if portion != 100 else ''
        else:
            portion /= 100  # only clean data, portion% clean
            self.portion = '-' + str(int(portion)) + '-clean'
        print('run task on: ', self.portion, ' dataset: ', data_name)
        if data_name == "ontonotes":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(
                config.ONTONOTES_TRAIN_CLEAN + self.portion)
            words, mentions, positions, labels = data_utils.load(
                config.ONTONOTES_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
            num_types = len(type2id)
            type_info = config.ONTONOTES_TYPE
        elif data_name == "bbn":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(
                config.BBN_TRAIN_CLEAN + self.portion)
            words, mentions, positions, labels = data_utils.load(
                config.BBN_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.BBN_TYPE)
            num_types = len(type2id)
            type_info = config.BBN_TYPE
        else:
            assert False, 'you have to specify the name of the dataset with -d (i.e. bbn/....)'
        self.model_name = model_name
        self.savename = save_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        #self.hparams.alpha=alpha
        self.logger = logger

        self.id2type = {type2id[x]: x for x in type2id.keys()}

        def type2vec(types):  # only terminal types will be labeled
            tmp = np.zeros(num_types)
            for t in str(types).split():
                if t in type2id.keys():
                    tmp[type2id[t]] = 1.0
            return tmp

        labels_train = np.array([type2vec(t)
                                 for t in labels_train])  # multi-hot label vectors
        labels = np.array([type2vec(t) for t in labels])

        tempname = self.data_name + config.testemb
        tempname = os.path.join(config.PKL_DIR, tempname)
        if os.path.exists(tempname):
            self.embedding = pickle.load(open(tempname, 'rb'))
            print('embedding load over')
        else:
            self.embedding = embedding_utils.Embedding.fromCorpus(
                config.EMBEDDING_DATA, list(words_train) + list(words),
                config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
            pickle.dump(self.embedding, open(tempname, 'wb'))
            print('embedding dump over')
        self.embedding.max_document_length = config.MAX_DOCUMENT_LENGTH

        print("Preprocessing data...")

        if True:
            textlen_train = np.array([
                self.embedding.len_transform1(x) for x in words_train
            ])  # sequence lengths, capped at MAX_DOCUMENT_LENGTH
            words_train = np.array([
                self.embedding.text_transform1(x) for x in words_train
            ])  # word-id sequences, truncated and zero-padded (<PAD>)
            mentionlen_train = np.array([
                self.embedding.len_transform2(x) for x in mentions_train
            ])  # mention lengths, capped at MENTION_SIZE
            mentions_train = np.array([
                self.embedding.text_transform2(x) for x in mentions_train
            ])  # mention word-id sequences
            positions_train = np.array([
                self.embedding.position_transform(x) for x in positions_train
            ])  # start and end positions
            print('get train data')

            textlen = np.array(
                [self.embedding.len_transform1(x) for x in words])
            words = np.array([
                self.embedding.text_transform1(x) for x in words
            ])  # truncated and zero-padded word-id sequences
            mentionlen = np.array(
                [self.embedding.len_transform2(x) for x in mentions])
            mentions = np.array(
                [self.embedding.text_transform2(x) for x in mentions])
            positions = np.array(
                [self.embedding.position_transform(x) for x in positions])
            print('get test data')
            # pickle.dump([textlen_train, words_train, mentionlen_train, mentions_train, positions_train,
            #              textlen, words, mentionlen, mentions, positions
            #              ], open(os.path.join(self.data_name + config.prep+self.portion, 'wb'))
            # print('dump preprocessed data to pkl over...')
        # else:
        # textlen_train, words_train, mentionlen_train, mentions_train, \
        # positions_train, textlen, words, mentionlen, mentions, positions = pickle.load(
        # 	open(self.data_name + config.prep+self.portion, 'rb'))
        # print('load preprocessed data from pkl over...')

        #if True:
        ss = ShuffleSplit(n_splits=1,
                          test_size=0.1,
                          random_state=config.RANDOM_SEED)
        for test_index, valid_index in ss.split(np.zeros(len(labels)),
                                                labels):  # split by index
            textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
            words_test, words_valid = words[test_index], words[valid_index]
            mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
            mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
            positions_test, positions_valid = positions[test_index], positions[valid_index]
            labels_test, labels_valid = labels[test_index], labels[valid_index]

        self.train_set = list(
            zip(
                words_train,
                textlen_train,
                mentions_train,
                mentionlen_train,
                positions_train,
                labels_train,
            ))
        self.valid_set = list(
            zip(
                words_valid,
                textlen_valid,
                mentions_valid,
                mentionlen_valid,
                positions_valid,
                labels_valid,
            ))
        self.test_set = list(
            zip(
                words_test,
                textlen_test,
                mentions_test,
                mentionlen_test,
                positions_test,
                labels_test,
            ))

        self.full_test_set = list(
            zip(
                words,
                textlen,
                mentions,
                mentionlen,
                positions,
                labels,
            ))

        self.labels_test = labels_test
        self.labels = labels
        self.labels_valid = labels_valid

        self.num_types = num_types
        self.type_info = type_info
        self.logger.info("train set size:%d, test set size: %d" %
                         (len(self.train_set), len(self.full_test_set)))

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
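The `ShuffleSplit` loop in Examples #2 and #5 runs exactly once because `n_splits=1`; its two index arrays carve the original test data into a 90% "test" part and a 10% "valid" part. A toy illustration (the seed 42 stands in for `config.RANDOM_SEED`):

import numpy as np
from sklearn.model_selection import ShuffleSplit

labels = np.zeros((20, 3))  # 20 dummy examples with 3 types
ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
    print(len(test_index), len(valid_index))  # 18 2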