def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            id = alphabet_string.get_index(string)
            ids.append(id)
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
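# A minimal usage sketch (the nested input list and the example call below are
# assumptions; Alphabet.get_index()/close() are taken from the function above):
#   tokenized = [['the', 'cat'], ['a', 'dog']]
#   ids, word_alphabet = map_string_2_id_open(tokenized, 'word')
#   # ids is a nested list of integer indices; word_alphabet is closed (read-only) on return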
Example #2
File: data.py Project: NLP1502/NLP
class Data:
    def __init__(self):
        self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos']
        self.substring_maxlen = 10

        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}

        self.feature_names = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True
        ###
        self.task_name = None

        ### I/O
        self.data_bin_dir = None
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.middle_dir = None
        self.viterbi_inputs_model_name = None

        self.trans_dir = None

        self.decode_dir = None
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None

        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0

        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ###Classification
        ## Dataset Plus
        self.substring_dir = None
        self.bpe_emb_dir = None
        self.pos_emb_dir = None
        self.pretrain_bpe_embedding = None
        self.pretrain_pos_embedding = None
        self.bpe_emb_dim = 30
        self.pos_emb_dim = 30
        self.bpe_alphabet_size = 0
        self.pos_alphabet_size = 0
        self.norm_bpe_emb = False
        self.norm_pos_emb = False
        self.bpe_texts = []
        self.bpe_Ids = []
        self.pos_texts = []
        self.pos_Ids = []
        self.label_size = 0
        self.substring_train_texts = None
        self.substring_train_Ids = None
        self.substring_dev_texts = None
        self.substring_dev_Ids = None
        self.substring_test_texts = None
        self.substring_test_Ids = None
        self.substring_label_alphabet = Alphabet('substring_label', True)

        ###Networks
        self.word_feature_extractor = "LSTM"  # "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  # "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid

        # Training
        self.save_model = True
        self.state_training_name = 'default'
        self.average_batch_loss = False
        self.optimizer = "SGD"  # "SGD"/"Adam"
        self.status = "train"
        self.show_loss_per_batch = 100
        # Hyperparameters
        self.seed_num = None
        self.cnn_layer = 4
        self.iteration = 100
        self.batch_size = 10
        self.char_hidden_dim = 50
        self.trans_hidden_dim = 50
        self.hidden_dim = 200
        self.dropout = 0.5
        self.lstm_layer = 1
        self.bilstm = True

        self.gpu = False
        self.lr = 0.015
        self.lr_decay = 0.05
        self.clip = None
        self.momentum = 0
        self.l2 = 1e-8

        # circul
        self.circul_time = 4
        self.circul_deepth = 2
        self.circul_gather_output_mode = "concat"

        # decode prepare
        self.decode_prepare_mode = 'example'

    def init_substring_instance(self):
        len_names = len(self.substring_names)
        self.substring_train_texts = [[[]
                                       for _ in range(self.substring_maxlen)]
                                      for _ in range(len_names)]
        self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)]
                                  for _ in range(len_names)]
        self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)]
                                     for _ in range(len_names)]
        self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)]
                                   for _ in range(len_names)]
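        # Each *_texts/*_Ids attribute above becomes a [feature][substring_length] grid of
        # empty lists, e.g. substring_train_texts[0][3] is meant to hold the 'word'
        # substrings of length 3 filled in later by generate_instance_substring().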

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Trans alphabet size: %s" % (self.trans_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Tran embedding  dir: %s" % (self.trans_embed_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Tran embedding size: %s" % (self.trans_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Norm   tran     emb: %s" % (self.norm_trans_emb))
        print("++" * 50)
        print("   task name: %s" % (self.task_name))
        print("++" * 50)
        print("   Data bin file directory: %s" % (self.data_bin_dir))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Middle file directory: %s" % (self.middle_dir))
        print(" viterbi inputs model name: %s" %
              (self.viterbi_inputs_model_name))
        if self.typeinfo_dir:
            print("     typeinfo    directory: %s" % (self.typeinfo_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char_seq_feature: %s" % (self.char_seq_feature))
            print("     Model char_hidden_dim: %s" % (self.char_hidden_dim))
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" % (self.trans_hidden_dim))
        if self.use_mapping:
            print("     Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print("     show_loss_per_batch: %s" % (self.show_loss_per_batch))
        print("     save_model: %s" % (self.save_model))
        print("     state_training_name: %s" % (self.state_training_name))
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.iteration))
        print("     BatchSize: %s" % (self.batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper        seed_num: %s" % (self.seed_num))
        print("     Hyper              lr: %s" % (self.lr))
        print("     Hyper        lr_decay: %s" % (self.lr_decay))
        print("     Hyper            clip: %s" % (self.clip))
        print("     Hyper        momentum: %s" % (self.momentum))
        print("     Hyper              l2: %s" % (self.l2))
        print("     Hyper      hidden_dim: %s" % (self.hidden_dim))
        print("     Hyper         dropout: %s" % (self.dropout))
        print("     Hyper      lstm_layer: %s" % (self.lstm_layer))
        print("     Hyper          bilstm: %s" % (self.bilstm))
        print("     Hyper             GPU: %s" % (self.gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)

        print("      substring dir : %s" % (self.substring_dir))
        print("    bpe_emb_dir dir : %s" % (self.bpe_emb_dir))
        print("    pos_emb_dir dir : %s" % (self.pos_emb_dir))
        print("++" * 50)

        print("      circul time   : %s" % (self.circul_time))
        print("      circul deepth : %s" % (self.circul_deepth))
        print(" gather output mode : %s" % (self.circul_gather_output_mode))
        print("++" * 50)

        print(" decode prepare mode : %s" % (self.decode_prepare_mode))
        print("++" * 50)

        sys.stdout.flush()

    def make_substring_label_alphabet(self):
        for label in self.label_alphabet.instances:
            label = label.split('-')[-1]
            self.substring_label_alphabet.add(label)
        self.substring_label_alphabet.close()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = 'feature_' + str(idx)
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_names.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                self.feature_emb_dims[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_norm']
        # exit(0)

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('windows-1252')
                # word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_alphabet_substring(self, input_file_dir, substring_file_prefix):
        ## will not read labels
        input_files = os.listdir(input_file_dir)
        print(input_files)
        for input_file in input_files:
            plus_feature = ''
            input_file_name = os.path.split(input_file)[1]
            if input_file_name.split('.')[0] != substring_file_prefix:
                continue
            if 'bpe' in input_file_name:
                plus_feature = 'bpe'
            elif 'word' in input_file_name:
                plus_feature = 'word'
            if plus_feature == '':
                continue
            in_lines = open(input_file_dir + input_file, 'r').readlines()
            for line in in_lines:
                if len(line.strip()) > 0:
                    pairs = line.strip().split('\t')
                    words = pairs[0].decode('windows-1252')
                    # word = pairs[0].decode('utf-8')
                    if self.number_normalized:
                        words = normalize_word(words)
                    labels = pairs[-1]
                    for word in words.split():
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
            self.word_alphabet_size = self.word_alphabet.size()
            self.char_alphabet_size = self.char_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)

            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                    for line in type_info_lines:
                        line = line.rstrip().split()
                        for i, _ in enumerate(line):
                            line[i] = float(line[i])
                        line = np.array(line)
                        type_info_matrix.append(line)

                print(
                    "Calculate type info distribution, and concatenate word and type......"
                )
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(
                            size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(
                                word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print "type info length:{}".format(len(type_info_matrix))
                self.word_emb_dim += len(type_info_matrix)
                print "new word dim is :{}".format(self.word_emb_dim)

        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" %
                  (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet,
                self.trans_emb_dim, self.norm_trans_emb)

        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding, norm: %s, dir: %s"
                    % (self.feature_names[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def generate_instance_substring(self, substring_file_prefix):
        self.init_substring_instance()
        self.make_substring_label_alphabet()
        input_files = os.listdir(self.substring_dir)
        print(input_files)
        for input_file in input_files:
            input_file_name = os.path.split(input_file)[1]
            input_file_dir = os.path.join(self.substring_dir, input_file_name)
            input_file_name_split = input_file_name.split('.')
            if input_file_name_split[0] != substring_file_prefix:
                continue
            print('dealing with %s' % (input_file_name))
            name = input_file_name_split[1]
            feature_name = input_file_name_split[2]
            f_l = int(input_file_name_split[-1][3:])  #feature_len

            if feature_name == 'word':
                alphabet = self.word_alphabet
            elif feature_name == 'char':
                alphabet = self.char_alphabet
            elif feature_name == 'pos':
                alphabet = self.feature_alphabets[0]
            elif feature_name == 'bpe':
                alphabet = self.feature_alphabets[1]

            s_f_id = self.substring_names.index(
                feature_name)  #substring_feature_id
            if name == "train":
                self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l]\
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testa":
                self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testb":
                self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            else:
                print(
                    "Error: you can only generate train/testa/testb instance! Illegal input:%s"
                    % (name))

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## task:
        the_item = 'task_name'
        if the_item in config:
            self.task_name = config[the_item]

        ## read data:
        the_item = 'data_bin_dir'
        if the_item in config:
            self.data_bin_dir = config[the_item]
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]
        the_item = 'middle_dir'
        if the_item in config:
            self.middle_dir = config[the_item]
        the_item = 'viterbi_inputs_model_name'
        if the_item in config:
            self.viterbi_inputs_model_name = config[the_item]

        the_item = 'substring_dir'
        if the_item in config:
            self.substring_dir = config[the_item]
        the_item = 'bpe_emb_dir'
        if the_item in config:
            self.bpe_emb_dir = config[the_item]
        the_item = 'pos_emb_dir'
        if the_item in config:
            self.pos_emb_dir = config[the_item]

        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'typeinfo_dir'
        if the_item in config:
            self.typeinfo_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'use_mapping'
        if the_item in config:
            self.use_mapping = str2bool(config[the_item])
        the_item = 'mapping_func'
        if the_item in config:
            self.mapping_func = config[the_item]
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        ## read training setting:
        the_item = 'save_model'
        if the_item in config:
            self.save_model = str2bool(config[the_item])
        the_item = 'state_training_name'
        if the_item in config:
            self.state_training_name = config[the_item]
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]
        the_item = 'show_loss_per_batch'
        if the_item in config:
            self.show_loss_per_batch = int(config[the_item])

        ## read Hyperparameters:
        the_item = 'seed_num'
        if the_item in config:
            if config[the_item] != 'None':
                self.seed_num = int(config[the_item])
        the_item = 'cnn_layer'
        if the_item in config:
            self.cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.char_hidden_dim = int(config[the_item])

        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.trans_hidden_dim = int(config[the_item])

        the_item = 'hidden_dim'
        if the_item in config:
            self.hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            if config[the_item] == 'None':
                self.clip = None
            else:
                self.clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.l2 = float(config[the_item])

        ###base2
        the_item = 'feature_name'
        if the_item in config:
            self.feature_name = config[the_item]
        the_item = 'feature_length'
        if the_item in config:
            self.feature_length = int(config[the_item])
        the_item = 'class_num'
        if the_item in config:
            self.class_num = int(config[the_item])
        the_item = 'feature_ans'
        if the_item in config:
            self.feature_ans = config[the_item]

        ###circul
        the_item = 'circul_time'
        if the_item in config:
            self.circul_time = config[the_item]
        the_item = 'circul_deepth'
        if the_item in config:
            self.circul_deepth = config[the_item]
        the_item = 'circul_gather_output_mode'
        if the_item in config:
            self.circul_gather_output_mode = config[the_item]

        ###decode_prepare
        the_item = 'decode_prepare_mode'
        if the_item in config:
            self.decode_prepare_mode = config[the_item]

    def read_arg(self, args):
        if args.task_name != None: self.task_name = args.task_name

        if args.data_bin_dir != None: self.data_bin_dir = args.data_bin_dir
        if args.train_dir != None: self.train_dir = args.train_dir
        if args.dev_dir != None: self.dev_dir = args.dev_dir
        if args.test_dir != None: self.test_dir = args.test_dir
        if args.trans_dir != None: self.trans_dir = args.trans_dir
        if args.word_emb_dir != None: self.word_emb_dir = args.word_emb_dir
        if args.trans_embed_dir != None:
            self.trans_embed_dir = args.trans_embed_dir
        if args.middle_dir != None: self.middle_dir = args.middle_dir
        if args.viterbi_inputs_model_name != None:
            self.viterbi_inputs_model_name = args.viterbi_inputs_model_name

        if args.substring_dir != None: self.substring_dir = args.substring_dir
        if args.bpe_emb_dir != None: self.bpe_emb_dir = args.bpe_emb_dir
        if args.pos_emb_dir != None: self.pos_emb_dir = args.pos_emb_dir

        if args.model_dir != None: self.model_dir = args.model_dir
        if args.norm_word_emb != None: self.norm_word_emb = args.norm_word_emb
        if args.norm_char_emb != None: self.norm_char_emb = args.norm_char_emb
        if args.word_emb_dim != None: self.word_emb_dim = args.word_emb_dim
        if args.char_emb_dim != None: self.char_emb_dim = args.char_emb_dim
        if args.trans_emb_dim != None: self.trans_emb_dim = args.trans_emb_dim

        if args.number_normalized != None:
            self.number_normalized = args.number_normalized
        if args.seg != None: self.seg = args.seg

        if args.use_crf != None: self.use_crf = args.use_crf
        if args.use_char != None: self.use_char = args.use_char
        if args.use_trans != None: self.use_trans = args.use_trans

        if args.word_seq_feature != None:
            self.word_feature_extractor = args.word_seq_feature
        if args.char_seq_feature != None:
            self.char_seq_feature = args.char_seq_feature

        if args.nbest != None: self.nbest = args.nbest

        if args.status != None: self.status = args.status
        if args.state_training_name != None:
            self.state_training_name = args.state_training_name
        if args.save_model != None: self.save_model = args.save_model
        if args.optimizer != None: self.optimizer = args.optimizer
        if args.iteration != None: self.iteration = args.iteration
        if args.batch_size != None: self.batch_size = args.batch_size
        if args.ave_batch_loss != None:
            self.average_batch_loss = args.ave_batch_loss
        if args.show_loss_per_batch != None:
            self.show_loss_per_batch = args.show_loss_per_batch

        if args.seed_num != None: self.seed_num = args.seed_num
        if args.cnn_layer != None: self.cnn_layer = args.cnn_layer
        if args.char_hidden_dim != None:
            self.char_hidden_dim = args.char_hidden_dim
        if args.trans_hidden_dim != None:
            self.trans_hidden_dim = args.trans_hidden_dim
        if args.hidden_dim != None: self.hidden_dim = args.hidden_dim
        if args.dropout != None: self.dropout = args.dropout
        if args.lstm_layer != None: self.lstm_layer = args.lstm_layer
        if args.bilstm != None: self.bilstm = args.bilstm
        if args.learning_rate != None: self.lr = args.learning_rate
        if args.lr_decay != None: self.lr_decay = args.lr_decay
        if args.momentum != None: self.momentum = args.momentum
        if args.l2 != None: self.l2 = args.l2
        if args.gpu != None: self.gpu = args.gpu
        if args.clip != None: self.clip = args.clip

        ###base2
        if args.feature_name != None: self.feature_name = args.feature_name
        if args.feature_length != None:
            self.feature_length = args.feature_length
        if args.class_num != None: self.class_num = args.class_num
        if args.feature_ans != None:
            self.feature_ans = args.feature_ans

        ###circul
        if args.circul_time != None: self.circul_time = args.circul_time
        if args.circul_deepth != None: self.circul_deepth = args.circul_deepth
        if args.circul_gather_output_mode != None:
            self.circul_gather_output_mode = args.circul_gather_output_mode

        ###decode_prepare
        if args.decode_prepare_mode != None:
            self.decode_prepare_mode = args.decode_prepare_mode

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":", 1)) == 2:
                    temp = line.strip().split(":", 1)
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(
                            self.translation_alphabet.get_index(
                                translation.strip()))
                    if ids == []:
                        ids = [0]
                    translation_id_format_temp[word_id] = ids

        for word in self.word_alphabet.instances:
            if self.word_alphabet.get_index(
                    word) in translation_id_format_temp.keys():
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = translation_id_format_temp[
                        self.word_alphabet.get_index(word)]
            else:
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = [0]
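# A minimal driver sketch for the Data class above; the config file name and its
# contents are hypothetical, while the method calls are the ones defined in the class:
#   data = Data()
#   data.read_config('demo.train.config')
#   data.initial_feature_alphabets()
#   data.build_alphabet(data.train_dir)
#   data.fix_alphabet()
#   data.build_pretrain_emb()
#   data.generate_instance('train')
#   data.show_data_summary()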
Example #3
def generate_character_data(sentences_train,
                            sentences_dev,
                            sentences_test,
                            max_sent_length,
                            char_embedd_dim=80):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """
    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length],
                     dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        logger.info('Dimension of char embedding dim is ' +
                    str(char_embedd_dim))
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(
            -scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
                theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(
        sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(
        sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(
        sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(
        MAX_CHAR_LENGTH,
        max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" %
                max_char_length_train)
    logger.info("Maximum character length of dev set is %d" %
                max_char_length_dev)
    logger.info("Maximum character length of test set is %d" %
                max_char_length_test)
    logger.info("Maximum character length used for training is %d" %
                max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
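# Sketch of the expected call (MAX_CHAR_LENGTH, word_end and logger are assumed to be
# module-level names defined elsewhere; the sentence lists are hypothetical):
#   C_train, C_dev, C_test, char_table = generate_character_data(
#       train_sents, dev_sents, test_sents, max_sent_length=120)
#   # each C_* has shape [num_sentences, max_sent_length, max_char_length] of int32 char ids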
Example #4
class Template:
    def __init__(self, args):
        self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
        if args.dataset not in self.config['data_list']:
            raise KeyError("No such dataset named {}.".format(args.dataset))
        self.config['dataset'] = args.dataset
        self.datatype = 'binary'
        if self.config['dataset'] in self.config['datatype']['train_test']:
            self.datatype = 'train_test'
        self.alphabet = Alphabet('word')
        self.set_seed()

    def set_seed(self):
        np.random.seed(self.config['seed'])
        random.seed(self.config['seed'])
    
    def clean_str_sst(self, string):
        """
        Tokenization/string cleaning for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        if self.config['dataset'].startswith('SST'):
            return self.clean_str_sst(string)
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def read_split_file(self, mode):
        filelist = self.config['data_list'][self.config['dataset']]
        try:
            filename = os.path.join(self.config['dirname'], self.config['dataset'], filelist[mode])
        except KeyError:
            return None
        a = open(filename, 'r', encoding='utf-8')
        res = []
        for line in a:
            label, text = int(line[0]), self.clean_str(line[1:]).split()
            res.append((text, label))
        return res
    
    def read_binary_file(self):
        filelist = self.config['data_list'][self.config['dataset']]
        modes = ['pos', 'neg']
        labels = {'pos': 1, 'neg': 0}
        res = []
        for mode in modes:
            filename = os.path.join(self.config['dirname'], self.config['dataset'], filelist[mode])
            # print(filename)
            a = open(filename, 'r', encoding='latin1').read().splitlines()
            for line in a:
                line = self.clean_str(line)
                res.append((line.split(), labels[mode]))
        random.shuffle(res)
        return res
        
        # X, y = zip(*res)
        # train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=self.config['valid_rate'])

    def normalize_word(self, word):
        new_word = ""
        for char in word:
            if char.isdigit():
                new_word += '0'
            else:
                new_word += char
        return new_word

    def execute(self, data_list):
        res_list = {}
        for key, data in data_list.items():
            cur_res = []
            for line, label in data:
                res_line = []
                for word in line:
                    word = self.normalize_word(word)
                    res_line.append(self.alphabet.get_index(word))
                cur_res.append((res_line, label))
            # self.alphabet.close()
            res_list[key] = cur_res
        return res_list

    def load_pretrain_emb(self, embedding_path, skip_first_row, separator):
        embedd_dim = -1
        embedd_dict = dict()
        if os.path.exists(embedding_path[0]):
            embedding_path = embedding_path[0]
        else:
            embedding_path = embedding_path[1]
        with open(embedding_path, 'r', encoding='utf-8') as file:
            i = 0
            j = 0
            for line in tqdm(file, total=3e6):
                if i == 0:
                    i = i + 1
                    if skip_first_row:
                        _ = line.strip()
                        continue
                j = j + 1
                line = line.strip()
                if len(line) == 0:
                    continue
                tokens = line.split(separator)
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    if embedd_dim + 1 == len(tokens):
                        embedd = np.empty([1, embedd_dim])
                        embedd[:] = tokens[1:]
                        embedd_dict[tokens[0]] = embedd
                    else:
                        continue
        return embedd_dict, embedd_dim, embedding_path

    def norm2one(self, vec):
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self, embedding_path, alphabet, skip_first_row=True, separator=" ", embedd_dim=300,
                                 norm=True):
        embedd_dict = dict()
        if embedding_path != None:
            embedd_dict, embedd_dim, embedding_path = self.load_pretrain_emb(embedding_path, skip_first_row, separator)
        scale = np.sqrt(3.0 / embedd_dim)
        pretrain_emb = np.empty([alphabet.size(), embedd_dim])
        perfect_match = 0
        case_match = 0
        not_match = 0
        for alph, index in alphabet.iteritems():
            if alph in embedd_dict:
                if norm:
                    pretrain_emb[index, :] = self.norm2one(embedd_dict[alph])
                else:
                    pretrain_emb[index, :] = embedd_dict[alph]
                perfect_match += 1
            elif alph.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index, :] = self.norm2one(embedd_dict[alph.lower()])
                else:
                    pretrain_emb[index, :] = embedd_dict[alph.lower()]
                case_match += 1
            else:
                pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        print("Embedding: %s\n     pretrain num:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s" % (
            embedding_path, pretrained_size, perfect_match, case_match, not_match, (not_match + 0.) / alphabet.size()))
        pretrain_emb = np.float32(pretrain_emb)
        self.alphabet.pretrained_emb = pretrain_emb
        return pretrain_emb, embedd_dim

    def run_read_file(self):
        data_list = []
        if self.datatype == 'train_test':
            modes = ['train', 'valid', 'test']
            data_list = list(map(self.read_split_file, modes))
            if data_list[1] is None:
                X, y = zip(*data_list[0])
                train_x, valid_x, train_y, valid_y = train_test_split(X, y,
                                    test_size=self.config['valid_rate'])
                data_list[0] = list(zip(train_x, train_y))
                data_list[1] = list(zip(valid_x, valid_y))
            data_list = {
                'train': data_list[0],
                'valid': data_list[1],
                'test': data_list[2]
                }
        elif self.datatype == 'binary':
            datalist = self.read_binary_file()
            X, y = zip(*datalist)
            kf = StratifiedKFold(n_splits=self.config['kfold'], shuffle=True)
            data_list = []
            for train_index, test_index in kf.split(X, y):
                train_x = [X[w] for w in train_index]
                train_y = [y[w] for w in train_index]
                test_x = [X[w] for w in test_index]
                test_y = [y[w] for w in test_index]

                temp = {'train': list(zip(train_x, train_y)), 'test': list(zip(test_x, test_y))}
                temp['valid'] = temp['test']
                data_list.append(temp)
        return data_list

    def forward(self):
        data_list = self.run_read_file()
        if isinstance(data_list, list):
            processed_list = list(map(self.execute, data_list))
        else:
            processed_list = self.execute(data_list)
        pretrained_emb, emb_dim = self.build_pretrain_embedding(self.config['embedding_path'], self.alphabet, norm=True)
        pkl.dump((processed_list, self.alphabet, pretrained_emb, emb_dim), open(self.config['res_path'].format(self.config['dataset']), 'wb'))
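# A minimal usage sketch (the argparse Namespace and the keys expected in config.yaml
# are assumptions based on how they are read above):
#   args = argparse.Namespace(dataset='MR')
#   tpl = Template(args)
#   tpl.forward()   # reads the raw files, indexes them through the shared Alphabet,
#                   # builds the pretrained embedding table and pickles it all to res_path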
Example #5
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
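
A minimal sketch (numpy only, made-up ids) of the padding scheme construct_tensor_char applies above: each word's character ids are right-padded with the word-end id, and rows after the sentence end are zeroed out.

import numpy as np

# One sentence with two words; 3 is the max sentence length, 5 the max word length.
max_sent_length, max_char_length, word_end_id = 3, 5, 1
index_sentences = [[[4, 5], [6, 7, 8]]]
C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
for i, words in enumerate(index_sentences):
    for j, chars in enumerate(words):
        C[i, j, :len(chars)] = chars          # character ids of word j
        C[i, j, len(chars):] = word_end_id    # pad the rest of the word with word_end
    C[i, len(words):, :] = 0                  # zero out rows after the sentence end
print(C[0])
# [[4 5 1 1 1]
#  [6 7 8 1 1]
#  [0 0 0 0 0]]
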
Exemplo n.º 6
class VsmNormer(nn.Module):
    def __init__(self):
        super(VsmNormer, self).__init__()
        self.word_alphabet = Alphabet('word')
        self.embedding_dim = None
        self.word_embedding = None
        self.dict_alphabet = Alphabet('dict')
        self.dict_embedding = None
        self.gpu = opt.gpu

    def transfer_model_into_gpu(self):
        if torch.cuda.is_available():
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            self.dict_embedding = self.dict_embedding.cuda(self.gpu)

    def batch_name_to_ids(self, name):
        tokens = my_tokenize(name)
        length = len(tokens)
        tokens_id = np.zeros((1, length), dtype=np.int64)  # np.int is removed in recent NumPy
        for i, word in enumerate(tokens):
            word = norm_utils.word_preprocess(word)
            tokens_id[0][i] = self.word_alphabet.get_index(word)

        tokens_id = torch.from_numpy(tokens_id)

        if torch.cuda.is_available():
            return tokens_id.cuda(self.gpu)
        else:
            return tokens_id

    def init_vector_for_dict(self, meddra_dict):
        self.dict_embedding = nn.Embedding(len(meddra_dict),
                                           self.embedding_dim)
        if torch.cuda.is_available():
            self.dict_embedding = self.dict_embedding.cuda(self.gpu)

        for concept_id, concept_name in meddra_dict.items():
            self.dict_alphabet.add(concept_id)
            with torch.no_grad():
                tokens_id = self.batch_name_to_ids(concept_name)
                length = tokens_id.size(1)
                emb = self.word_embedding(tokens_id)
                emb = emb.unsqueeze_(1)
                pool = functional.avg_pool2d(emb, (length, 1))
                index = norm_utils.get_dict_index(self.dict_alphabet,
                                                  concept_id)
                self.dict_embedding.weight.data[index] = pool[0][0]

    def compute_similarity(self, mention_rep, concep_rep):
        # mention_rep is (batch, emb_dim) and concep_rep is (concept_num, emb_dim)
        mention_rep_norm = torch.norm(mention_rep, 2, 1, True)  # batch 1
        concep_rep_norm = torch.norm(concep_rep, 2, 1, True)  # concept 1
        a = torch.matmul(mention_rep_norm,
                         torch.t(concep_rep_norm))  # batch, concept
        a = a.clamp(min=1e-8)

        b = torch.matmul(mention_rep, torch.t(concep_rep))  # batch, concept

        return b / a

    def forward(self, mention_word_ids):
        length = mention_word_ids.size(1)
        mention_word_emb = self.word_embedding(mention_word_ids)
        mention_word_emb = mention_word_emb.unsqueeze_(1)
        mention_word_pool = functional.avg_pool2d(mention_word_emb,
                                                  (length, 1))  # batch,1,1,100
        mention_word_pool = mention_word_pool.squeeze_(1).squeeze_(
            1)  # batch,100

        # similarities = torch.t(torch.matmul(self.dict_embedding.weight.data, torch.t(mention_word_pool))) # batch, dict
        similarities = self.compute_similarity(mention_word_pool,
                                               self.dict_embedding.weight.data)

        values, indices = torch.max(similarities, 1)

        return values, indices

    def process_one_doc(self, doc, entities, dict):

        for entity in entities:
            with torch.no_grad():
                tokens_id = self.batch_name_to_ids(entity.name)

                values, indices = self.forward(tokens_id)

                norm_id = norm_utils.get_dict_name(self.dict_alphabet,
                                                   indices.item())
                name = dict[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(name)
                entity.norm_confidences.append(values.item())
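
A small, self-contained sketch of the retrieval step VsmNormer performs, with illustrative tensor shapes: the mention's word embeddings are average-pooled into one vector and compared against every dictionary vector by cosine similarity, the same quantity compute_similarity returns (dot products divided by the product of L2 norms, clamped to avoid division by zero).

import torch

mention_word_emb = torch.randn(2, 4, 100)     # batch, mention length, emb_dim (toy values)
mention_rep = mention_word_emb.mean(dim=1)    # batch, emb_dim  (average pooling)
concept_rep = torch.randn(10, 100)            # concept_num, emb_dim

norms = torch.norm(mention_rep, 2, 1, True) @ torch.norm(concept_rep, 2, 1, True).t()
similarities = (mention_rep @ concept_rep.t()) / norms.clamp(min=1e-8)   # batch, concept_num

values, indices = torch.max(similarities, 1)  # best-matching concept per mention
print(values.shape, indices.shape)            # torch.Size([2]) torch.Size([2])
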
Exemplo n.º 7
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None

        self.trans_dir = None

        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None

        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0

        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ###Networks
        self.word_feature_extractor = "LSTM"  # "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  # "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid

        # Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  # "SGD"/"Adam"
        self.status = "train"
        # Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_trans_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Trans alphabet size: %s" % (self.trans_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Tran embedding  dir: %s" % (self.trans_embed_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Tran embedding size: %s" % (self.trans_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Norm   tran     emb: %s" % (self.norm_trans_emb))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        if self.typeinfo_dir:
            print("     typeinfo    directory: %s" % (self.typeinfo_dir))
        print("     Dset   file directory: %s" % (self.dset_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char_seq_feature: %s" % (self.char_seq_feature))
            print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" %
                  (self.HP_trans_hidden_dim))
        if self.use_mapping:
            print("     Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_norm']
        # exit(0)

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)

            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                    for line in type_info_lines:
                        line = line.rstrip().split()
                        for i, _ in enumerate(line):
                            line[i] = float(line[i])
                        line = np.array(line)
                        type_info_matrix.append(line)

                print(
                    "Calculate type info distribution, and concatenate word and type......"
                )
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(
                            size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(
                                word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print "type info length:{}".format(len(type_info_matrix))
                self.word_emb_dim += len(type_info_matrix)
                print "new word dim is :{}".format(self.word_emb_dim)

        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" %
                  (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet,
                self.trans_emb_dim, self.norm_trans_emb)

        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding:, norm: %s, dir: %s"
                    % (self.feature_name[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]

        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]

        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'typeinfo_dir'
        if the_item in config:
            self.typeinfo_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'use_mapping'
        if the_item in config:
            self.use_mapping = str2bool(config[the_item])
        the_item = 'mapping_func'
        if the_item in config:
            self.mapping_func = config[the_item]
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]

        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])

        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.HP_trans_hidden_dim = int(config[the_item])

        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":", 1)) == 2:
                    temp = line.strip().split(":", 1)
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(
                            self.translation_alphabet.get_index(
                                translation.strip()))
                    if ids == []:
                        ids = [0]
                    translation_id_format_temp[word_id] = ids

        for word in self.word_alphabet.instances:
            if self.word_alphabet.get_index(
                    word) in translation_id_format_temp.keys():
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = translation_id_format_temp[
                        self.word_alphabet.get_index(word)]
            else:
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = [0]
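
A minimal sketch of the mapping build_translation_dict produces, using plain dicts in place of the project's Alphabet (all ids are illustrative): each word id maps to the ids of its translations, and any word without an entry in the translation file falls back to [0].

# Hypothetical word and translation vocabularies (string -> id).
word_index = {'casa': 1, 'gato': 2, 'sol': 3}
trans_index = {'house': 1, 'home': 2, 'cat': 3}

raw_lines = ["casa : house home", "gato : cat"]   # same "word : translations" layout
temp = {}
for line in raw_lines:
    word, translations = [part.strip() for part in line.split(":", 1)]
    temp[word_index[word]] = [trans_index[t] for t in translations.split()] or [0]

# Every word id gets an entry; unseen words default to [0], as in the class above.
translation_id_format = {wid: temp.get(wid, [0]) for wid in word_index.values()}
print(translation_id_format)   # {1: [1, 2], 2: [3], 3: [0]}
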
Exemplo n.º 8
class Data:
    def __init__(self, input_file):
        self.original_data = open(input_file, 'r').readlines()
        self.index_data = []
        self.word_alphabet = Alphabet('word')
        self.gloss_alphabet = Alphabet('gloss')
        self.entity_alphabet = Alphabet('entity')
        self.gaz_alphabet = Alphabet('gaz')
        self.label_alphabet = Alphabet('label')
        self.word_alphabet_size = 0
        self.gloss_alphabet_size = 0
        self.entity_alphabet_size = 0
        self.gaz_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_gaz_hidden_dim = 50
        self.HP_lstm_hidden_dim = 200
        self.HP_dropout = 0.5
        self.gaz_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = False
        self.HP_use_entity = False
        self.HP_use_gloss = True
        self.HP_use_gaz = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        # embedding hyperparameter
        self.word_emb_dim = 200
        self.entity_emb_dim = 50
        self.gloss_features = "CNN"  #["CNN","LSTM"]
        self.gloss_emb_dim = 200
        self.gloss_hidden_dim = 300
        self.pretrain_word_embedding = np.array([])
        self.pretrain_gaz_embedding = None
        self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  #"NYM_200.txt"
        self.gaz_embed_path = None
        self.gaz_emb_dim = 200
        self.HP_fix_gaz_emb = True

    def build_alphabet(self):
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            for word in words:
                self.word_alphabet.add(word)

            sentence_gloss = line["babel_gloss"]
            for word_gloss in sentence_gloss:
                for phrase_gloss in word_gloss:  # a word can match multiple phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            # for definates in phrase_gloss_EN:
                            # for de_word in definates.split():
                            self.gloss_alphabet.add(de_word)

            entitys = line["entity_context"]
            for entity in entitys:
                self.entity_alphabet.add(entity)

            gazs = line["babel_phase"]
            for gaz in gazs:
                for item in gaz:
                    self.gaz_alphabet.add(item)

            labels = line["detection_label"]
            for label in labels:
                self.label_alphabet.add(label)
        print(self.label_alphabet.get_content())
        self.word_alphabet_size = self.word_alphabet.size()
        self.gloss_alphabet_size = self.gloss_alphabet.size()
        self.entity_alphabet_size = self.entity_alphabet.size()
        self.gaz_alphabet_size = self.gaz_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.word_alphabet.close()
        self.gloss_alphabet.close()
        self.entity_alphabet.close()
        self.gaz_alphabet.close()
        self.label_alphabet.close()

    def generate_instance_Ids(self):  # convert the input sentences into their corresponding ids
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            words_Id = []
            for word in words:
                words_Id.append(self.word_alphabet.get_index(word))

            sentence_gloss = line["babel_gloss"]
            sentence_glosses_Id = []
            for word_gloss in sentence_gloss:
                word_glosses_Id = []
                for phrase_gloss in word_gloss:  # a word can match multiple phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]  # this is a list
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            word_glosses_Id.append(
                                self.gloss_alphabet.get_index(de_word))
                sentence_glosses_Id.append(word_glosses_Id)

            entitys = line["entity_context"]
            entitys_Id = []
            for entity in entitys:
                entitys_Id.append(self.entity_alphabet.get_index(entity))

            gazs = line["babel_phase"]
            sentence_gazs_Id = []
            # gazs_Id = [[[take over, take over of, ...], [2, 3, ...]], [[legal, legal procedures, ...], [1, 2, ...]], ..., [[open the window, open the window please, ...], [3, 4, ...]]]
            for gaz in gazs:
                word_gazs_Id = []
                Ids = []
                Lens = []
                for item in gaz:
                    Ids.append(self.gaz_alphabet.get_index(item))
                    Lens.append(len(item.split()))
                word_gazs_Id = [Ids, Lens]
                sentence_gazs_Id.append(word_gazs_Id)

            labels = line["detection_label"]
            labels_Id = []
            for label in labels:
                labels_Id.append(self.label_alphabet.get_index(label))
            self.index_data.append([
                words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id,
                labels_Id
            ])

    def load_pretrain_emb(self, embedding_path):
        lines = open(embedding_path, 'r', encoding="utf-8").readlines()
        statistic = lines[0].strip()  # header line holds two statistics: word count and vector dimension
        # print(statistic)
        embedd_dim = int(statistic.split()[1])
        embedd_dict = dict()
        embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)]  #填充词对应的向量置为全零
        # print(len(embedd_dict["<pad>"]))
        for line in lines[1:]:
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens))
            embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]]
        return embedd_dict, embedd_dim

    def norm2one(self, vec):
        if np.sum(vec) == 0:
            return vec
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self,
                                 embedding_path,
                                 word_alphabet,
                                 embedd_dim=200,
                                 norm=True):
        embedd_dict = dict()
        if embedding_path != None:
            # read the pretrained embedding dictionary
            embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path)
        scale = np.sqrt(3.0 / embedd_dim)
        pretrain_emb = np.zeros([word_alphabet.size(),
                                 embedd_dim])  # pretrain_emb is the embedding matrix reordered by alphabet index
        perfect_match = 0
        case_match = 0
        not_match = 0
        for word, index in word_alphabet.get_alphabet().items():
            if word in embedd_dict:
                # print(word,index)
                # print(len(embedd_dict[word]))
                if norm:
                    pretrain_emb[index] = self.norm2one(embedd_dict[word])
                else:
                    pretrain_emb[index] = embedd_dict[word]
                perfect_match += 1
            elif word.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index] = self.norm2one(
                        embedd_dict[word.lower()])
                else:
                    pretrain_emb[index] = embedd_dict[word.lower()]
                case_match += 1
            else:
                pretrain_emb[index] = np.random.uniform(
                    -scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        # print("pad's embedding:",pretrain_emb[word_alphabet.get_index(",")])
        print(
            "Embedding:\n  pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s"
            % (pretrained_size, perfect_match, case_match, not_match,
               (not_match + 0.) / word_alphabet.size()))
        return pretrain_emb, embedd_dim  # pretrain_emb is the embedding matrix reordered to follow the alphabet order; embedd_dim is the vector dimension

    def generate_embedding(self):
        self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.word_alphabet)
        self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gloss_alphabet)
        self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gaz_alphabet)
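
A small, self-contained sketch of the matching logic inside build_pretrain_embedding above (toy vectors, numpy only): an exact vocabulary hit copies the pretrained vector, a lowercase hit falls back to the lowercased entry, and out-of-vocabulary words are initialised uniformly at random with the same scale.

import numpy as np

embedd_dim = 4
embedd_dict = {'<pad>': [0.0] * embedd_dim,
               'apple': [0.1, 0.2, 0.3, 0.4],
               'banana': [0.5, 0.6, 0.7, 0.8]}
word_index = {'<pad>': 0, 'apple': 1, 'Banana': 2, 'kiwi': 3}   # toy alphabet

scale = np.sqrt(3.0 / embedd_dim)
pretrain_emb = np.zeros([len(word_index), embedd_dim])
for word, index in word_index.items():
    if word in embedd_dict:                        # perfect match
        pretrain_emb[index] = embedd_dict[word]
    elif word.lower() in embedd_dict:              # case match ('Banana' -> 'banana')
        pretrain_emb[index] = embedd_dict[word.lower()]
    else:                                          # OOV: random init ('kiwi')
        pretrain_emb[index] = np.random.uniform(-scale, scale, [1, embedd_dim])
print(pretrain_emb.shape)                          # (4, 4)
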