def generate_pos_training(pos_training_file):
    """Build an HDF5 training matrix of padded POS-tag id sequences.

    One row per document: each document's POS-tag sequence is mapped to
    integer ids via pos_tag_dict and right-padded to max_len_text.

    pos_training_file: basename of the output file, written to
        data/<pos_training_file>.hdf5 under dataset name "input".
    """
    raw_text_dir = read.read_from_json('raw_data_dir')
    # NOTE(review): get_char2id_dict apparently returns the max text length
    # as a side product — confirm against its definition.
    max_len_text = read.get_char2id_dict(raw_text_dir)
    text_pos_text_dict = read.read_json("data/pos/text_pos_text_dict_normalized")
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    pos_tag_dict = read.read_from_json("pos_tag_dict")
    data_size = len(raw_text_dir)
    # Fix: context manager guarantees the HDF5 file is flushed and closed
    # even if an exception occurs (the original handle was never closed).
    with h5py.File("data/" + pos_training_file + ".hdf5", "w") as f:
        dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
        for data_id in range(data_size):
            pos_list = text_pos_text_dict[raw_dir_simple[data_id]]
            print(raw_dir_simple[data_id])
            # One "sentence" per document: map each POS tag to its int id.
            text_inputs = [[pos_tag_dict[pos] for pos in pos_list]]
            data_x = pad_sequences(text_inputs, dtype='int8',
                                   maxlen=max_len_text, padding="post")
            dset[data_id, :] = data_x[0]
def get_list_cd(start=0, end=63):
    """Collect every distinct token tagged "CD" (cardinal number).

    Scans the POS files of documents [start, end) listed in raw_dir_simple
    and saves the de-duplicated (token, tag) entries to data/pos/cd_list.

    start, end: document index range; defaults reproduce the original
        hard-coded 0..63 corpus slice.
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    cd_list = []
    seen = set()  # O(1) membership test instead of scanning cd_list each time
    for data_id in range(start, end):
        pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
        for pos_sen in pos:
            # Iterating an empty sentence is a no-op, so no length guard needed.
            for pos_token in pos_sen:
                if pos_token[1] == "CD":
                    # pos_token is presumably a [token, tag] list from JSON;
                    # use a tuple as the hashable dedup key.
                    key = tuple(pos_token)
                    if key not in seen:
                        seen.add(key)
                        cd_list.append(pos_token)
                        print(pos_token)
    read.save_json("data/pos/cd_list", cd_list)
def get_list_punctuation(start=0, end=63, markers=("/", ":", "-")):
    """Collect every distinct token containing a date-like punctuation mark.

    Scans the POS files of documents [start, end) listed in raw_dir_simple
    for tokens whose surface form contains any of `markers`, and saves the
    de-duplicated (token, tag) entries to data/pos/punctuation_list.

    start, end: document index range; defaults reproduce the original
        hard-coded 0..63 corpus slice.
    markers: punctuation substrings to look for (immutable tuple default,
        so it is safe as a default argument).
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    punctuation_list = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    for data_id in range(start, end):
        pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
        for pos_sen in pos:
            # Iterating an empty sentence is a no-op, so no length guard needed.
            for pos_token in pos_sen:
                if any(e in pos_token[0] for e in markers):
                    # pos_token is presumably a [token, tag] list from JSON;
                    # use a tuple as the hashable dedup key.
                    key = tuple(pos_token)
                    if key not in seen:
                        seen.add(key)
                        punctuation_list.append(pos_token)
                        print(pos_token)
    read.save_json("data/pos/punctuation_list", punctuation_list)
def span2xmlfiles(exp, target):
    """Convert predicted spans into Anafora XML annotation files.

    For the first 10 documents, reads the span/label predictions stored in
    <exp>\\span_label_all<target>, builds an AnaforaData document per input
    file, and writes it to <exp>\\<doc>\\<doc>.TimeNorm.gold.completed.xml.

    exp: experiment directory (paths use Windows-style backslashes,
        matching the original code).
    target: suffix selecting which span_label_all file to convert.
    """
    import anafora
    raw_dir_simple = read1.read_from_json('raw_dir_simple')
    # Hoisted: parse the span file once instead of re-reading it on every
    # loop iteration as the original did.
    all_spans = read1.read_json(exp + "\\span_label_all" + target)
    for data_id in range(0, 10):
        data_spans = all_spans[data_id]
        data = anafora.AnaforaData()
        # enumerate replaces the manual counter; `entity_id` avoids
        # shadowing the builtin `id`.
        for entity_id, data_span in enumerate(data_spans):
            e = anafora.AnaforaEntity()
            # Stored spans appear end-inclusive; Anafora expects end-exclusive,
            # hence the +1 — TODO confirm against the span producer.
            e.spans = ((int(data_span[0]), int(data_span[1]) + 1),)
            e.type = data_span[2]
            e.id = str(entity_id) + "@e@" + raw_dir_simple[data_id]
            data.annotations.append(e)
        print(data)
        data.indent()
        outputfile = exp + "\\" + raw_dir_simple[data_id] + "\\"
        if not os.path.exists(outputfile):
            os.makedirs(outputfile)
        data.to_file(outputfile + raw_dir_simple[data_id] + ".TimeNorm.gold.completed.xml")
#generate_pos() start = 0 end = 63 raw_text_dir = read.read_from_json('raw_data_dir') raw_dir_simple = read.read_from_json('raw_dir_simple') # data_size = len(raw_text_dir) max_len_text = read.get_char2id_dict(raw_text_dir) char2int = read.read_from_json('char2int') int2char = dict((int, char) for char, int in char2int.items()) text_pos_text_dict = dict() for data_id in range(start, end): print raw_dir_simple[data_id] pos = read.read_json("data/pos/" + raw_dir_simple[data_id]) raw_text = read.read_from_dir(raw_text_dir[data_id]) text_inputs = [[char2int[char] for char in raw_text]] postag = list() index = 0 for line in raw_text.splitlines(): if len(line) == 0: postag.append('\n') index += 1 else: token_index = 0 term = "" for char in line: # if term =="leade": # print "ok"