def features_extraction(raw_data_dir, preprocessed_path, model_path, data_folder="", mode="train"): max_len = 350 pad = 3 input_char = list() char2int = read.readfrom_json(char2int_path) total = 0 for data_id in range(0, len(raw_data_dir)): print(raw_data_dir[data_id]) #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id]) sent_span_list_file = read.readfrom_json(preprocessed_file_path + "_sent") print(len(sent_span_list_file)) n_sent = len(sent_span_list_file) for index in range(n_sent): total += 1 input_char.append( get_idx_from_sent("\n", sent_span_list_file[index][0], char2int, max_len, pad)) print("Finished processing file: ", raw_data_dir[data_id]) print(total) input_char = np.asarray(input_char, dtype="int16") if not os.path.exists(model_path): os.makedirs(model_path) read.save_hdf5(model_path + "/input" + data_folder, ["char"], [input_char], ['int16'])
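# get_idx_from_sent() is defined elsewhere in this repository. The sketch below is only an
# assumption of its behaviour, inferred from the call sites in this file: each token of a
# sentence is mapped to its integer index, the sentence is wrapped in `pad` copies of the
# boundary symbol ("\n" for characters/POS, "Cc" for unicode categories), and the result is
# right-padded to max_len + 2 * pad positions (350 + 2 * 3 = 356, the fixed length used
# later in this file). The function name and the unknown-token index 0 are assumptions.
def get_idx_from_sent_sketch(boundary_symbol, sent, token2int, max_len=350, pad=3):
    boundary_idx = token2int.get(boundary_symbol, 0)
    body = [token2int.get(token, 0) for token in sent][:max_len]   # 0 = unknown token
    x = [boundary_idx] * pad + body + [boundary_idx] * pad          # boundary markers
    while len(x) < max_len + 2 * pad:                               # right-pad to fixed length
        x.append(boundary_idx)
    return x
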
def get_train():
    file_dev = read.readfrom_json("data/dev_file_simple")
    train_all_simple = read.readfrom_json("data/train_all_simple")
    train = [
        train_file for train_file in train_all_simple
        if train_file not in file_dev
    ]
    read.savein_json("data/train_simple", train)

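# Minimal usage sketch tying the two steps above together: rebuild the train split, then
# extract features for it. The directories "data/preprocessed" and "data/model_folder" and
# the data_folder suffix are placeholders, not paths taken from this repository.
def example_train_feature_run():
    get_train()
    train_list = read.readfrom_json("data/train_simple")
    features_extraction(train_list, "data/preprocessed", "data/model_folder", data_folder="_train")
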
def generate_output_multiclass(model, input, gold, doc_list_sub, processed_path,
                               output_pred_path, pred=True, data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    labels_index = [non_operator, operator, operator]
    # Run the model and optionally cache the raw predictions.
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    # Decode per-output character spans and build index -> label lookups.
    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index])
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)
        one_hot = read.counterList2Dict(list(enumerate(labels_index[index], 1)))
        one_hot = {y: x for x, y in one_hot.items()}
        int2label = dict((label_int, label) for label, label_int in one_hot.items())
        int2labels.append(int2label)
    # Map sentence-relative spans back to document offsets and write Anafora XML per document.
    n_marks = 3
    sent_index = 0
    for data_id in range(0, len(doc_list_sub)):
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id], doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if len(span_list[0]) < 1:
                    pass
                else:
                    for [posi_start, posi_end, label] in span_list:
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label]
                        ])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id], doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input

def evaluate(xml_path, output_pred_path, raw_data_path, doc_list, output_format):
    gold_count = 0
    pred_count = 0
    true_count = 0
    print('xml_path: %s' % xml_path)
    print('doc_list: %s' % len(doc_list))
    for file_id in range(len(doc_list)):
        print('path: ', os.path.join(xml_path, doc_list[file_id] + "_tag"))
        print('path exists: %s' % os.path.exists(
            os.path.join(xml_path, doc_list[file_id] + "_tag.txt")))
        if os.path.exists(os.path.join(xml_path, doc_list[file_id] + "_tag.txt")):
            gold_tag_dict = get_gold_dict(
                read.readfrom_json(os.path.join(xml_path, doc_list[file_id] + "_tag")))
            output_path = os.path.join(output_pred_path, doc_list[file_id],
                                       doc_list[file_id] + output_format)
            raw_text_path = os.path.join(raw_data_path, doc_list[file_id])
            pre_tag_dict = process.extract_xmltag_anafora_pred(
                output_path, read.readfrom_txt(raw_text_path))
            # Accumulate gold, predicted and correct counts across documents.
            scores = calculate_score(gold_tag_dict, pre_tag_dict)
            gold_count += scores[0]
            pred_count += scores[1]
            true_count += scores[2]
    metrics(true_count, pred_count, gold_count)

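# metrics() is defined elsewhere in this repository. The sketch below is an assumption of
# the standard precision/recall/F1 computation it presumably performs on the three counts
# accumulated above; the name, signature and printed format are not taken from the source.
def metrics_sketch(true_count, pred_count, gold_count):
    precision = true_count / pred_count if pred_count else 0.0
    recall = true_count / gold_count if gold_count else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("precision: %.4f  recall: %.4f  f1: %.4f" % (precision, recall, f1))
    return precision, recall, f1
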
def features_extraction(raw_data_dir, preprocessed_path, model_path, data_folder="", mode="train"): max_len = 350 pad = 3 input_char = list() input_pos = list() input_unic = list() char2int = read.readfrom_json(char2int_path) pos2int = read.readfrom_json(pos2int_path) unicode2int = read.readfrom_json(unicode2int_path) total = 0 for data_id in range(0, len(raw_data_dir)): print(raw_data_dir[data_id]) preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) sent_span_list_file = read.readfrom_json(preprocessed_file_path + "_sent") print(len(sent_span_list_file)) pos_sentences_character = read.readfrom_json(preprocessed_file_path + "_pos") print(len(pos_sentences_character)) unico_sentences_characte = read.readfrom_json(preprocessed_file_path + "_unicodecategory") print(len(unico_sentences_characte)) n_sent = len(sent_span_list_file) for index in range(n_sent): total += 1 input_char.append( get_idx_from_sent("\n", sent_span_list_file[index][0], char2int, max_len, pad)) input_pos.append( get_idx_from_sent("\n", pos_sentences_character[index], pos2int, max_len, pad)) input_unic.append( get_idx_from_sent("Cc", unico_sentences_characte[index], unicode2int, max_len, pad)) print("Finished processing file: ", raw_data_dir[data_id]) print(total) input_char = np.asarray(input_char, dtype="int") input_pos = np.asarray(input_pos, dtype="int") input_unic = np.asarray(input_unic, dtype="int") if not os.path.exists(model_path): os.makedirs(model_path) read.save_hdf5(model_path + "/input" + data_folder, ["char", "pos", "unic"], [input_char, input_pos, input_unic], ['int8', 'int8', 'int8'])
    x = []
    for word in sent:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(0)
    while len(x) < 356:
        x.append(4)
    return x


char, pos, unicate = read.load_hdf5("data/cvcolon_train_input", ["char", "pos", "unic"])
char2int = read.readfrom_json("data/char2int")
int2char = {index: char for char, index in char2int.items()}
# print(char2int)
int2char = dict((c, i) for i, c in char2int.items())
sent = list()
sent_len = list()
for char_x_sent in char:  # 2637 8820 12760 ####2637 6183 3940 7140
    # Map indices back to characters; index 88 and padding index 0 are rendered as spaces.
    sent_single = [
        int2char[i] if i != 88 and i != 0 else ' ' for i in char_x_sent
    ]
    sent.append(sent_single)

import torch

forward_flairTorch = torch.load("data/lm-news-english-forward-v0.2rc.pt")
dictionary = {
    k.decode('utf8'): v
def output_encoding(raw_data_dir, preprocessed_path, model_path, data_folder="",
                    activation="softmax", type="interval"):
    # type is one of ["interval", "operator", "explicit_operator", "implicit_operator"]
    target_labels = defaultdict(float)
    if type not in ["interval", "operator", "explicit_operator", "implicit_operator"]:
        return
    interval = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    max_len = 350
    n_marks = 3
    max_len_text = max_len + 2 * n_marks
    n_output = 0
    final_labels = 0
    if activation == "sigmoid":
        final_labels = interval + operator
        n_output = len(final_labels)
    elif activation == "softmax":
        if "interval" in type:
            final_labels = interval
        elif "operator" in type:
            final_labels = operator
        n_output = len(final_labels) + 1  # +1 for the background (no-label) class
    one_hot = read.counterList2Dict(list(enumerate(final_labels, 1)))
    output_one_hot = {y: x for x, y in one_hot.items()}
    sample_weights_output = []
    outputs = []
    total_with_timex = 0
    n_sent_total = 0
    for data_id in range(0, len(raw_data_dir)):
        # preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path + "_sent")
        tag_span_list_file = read.readfrom_json(preprocessed_file_path + "_tag")
        n_sent = len(tag_span_list_file)
        n_sent_total += n_sent
        for index in range(n_sent):
            sent_info = sent_span_list_file[index]
            tag_info = tag_span_list_file[index]
            sentence_start = sent_info[1]
            label_encoding_sent = np.zeros((max_len_text, n_output))
            if activation == "softmax":
                label_encoding_sent[:, 0] = 1  # default every position to the background class
            sample_weights_sent = np.zeros(max_len_text)
            for label in tag_info:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info_new = list(set(info[2:]))
                if activation == "sigmoid":
                    label_indices = [
                        output_one_hot[token_tag] for token_tag in info_new
                        if token_tag in output_one_hot
                    ]
                    k = np.sum(np.eye(n_output)[[sigmoid_index - 1 for sigmoid_index in label_indices]],
                               axis=0)
                    label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat(
                        [k], posi_end - position, axis=0)
                elif activation == "softmax":
                    if "explicit" in type or "interval" in type:
                        target_label = process.get_explict_label(info_new, interval, operator)
                    elif "implicit" in type.split("_"):
                        target_label = process.get_implict_label(info_new, interval, operator)
                    for token_tag in target_label:
                        if token_tag in final_labels:
                            target_labels[token_tag] += 1.0
                    label_indices = [
                        output_one_hot[token_tag] for token_tag in target_label
                        if token_tag in final_labels
                    ]
                    if len(label_indices) != 0:
                        k = np.sum(np.eye(n_output)[[softmax_index for softmax_index in label_indices]],
                                   axis=0)
                        label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat(
                            [k], posi_end - position, axis=0)
                        t = len(label_indices)
                        if t >= 1:
                            sample_weights_sent[position + n_marks:posi_end + n_marks] = \
                                label_indices[randint(0, t - 1)]
            sample_weights_output.append(sample_weights_sent)
            outputs.append(label_encoding_sent)
            total_with_timex += 1
    # print(total_with_timex)
    print(n_sent_total)
    sample_weights = np.asarray(sample_weights_output)
    sample_weights = get_sample_weights_multiclass(n_output, sample_weights, 0.05)
    # print(target_labels)
    np.save(model_path + "/sample_weights" + data_folder + "_" + type + "_" + activation,
            sample_weights)
    read.save_hdf5(model_path + "/output" + data_folder + "_" + type + "_" + activation,
                   [type + "_" + activation], [outputs], ['int8'])

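# Self-contained illustration of the softmax label encoding built above, with toy sizes in
# place of max_len_text = 356 and the real label inventory: every position starts as the
# background class (column 0), and each annotated character span is overwritten with the
# one-hot row of its label index. All values here are made up for demonstration only.
def encoding_example():
    import numpy as np
    n_output = 4                               # background class + 3 toy labels
    max_len_text = 10
    enc = np.zeros((max_len_text, n_output))
    enc[:, 0] = 1                              # default: background class everywhere
    position, posi_end, label_index = 2, 5, 3  # toy span [2, 5) tagged with label 3
    k = np.eye(n_output)[label_index]
    enc[position:posi_end, :] = np.repeat([k], posi_end - position, axis=0)
    return enc
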
def generate_output_multiclass(sent_len, model, input, doc_list_sub, processed_path,
                               output_pred_path, pred=True, data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
    # Variant that constrains decoding by sentence length and keeps verbose debug output.
    non_operator = read.textfile2list(non_operator_path)
    print('non_operator')
    print(non_operator)
    operator = read.textfile2list(operator_path)
    print('operator')
    print(operator)
    labels_index = [non_operator, operator, operator]
    print('labels_index')
    print(labels_index)
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    print('sent_len')
    print(sent_len)
    print('classes, probs - ')
    print(classes)
    # print(probs)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index], sent_len)
        print('class_loc')
        print(class_loc)
        span = output.loc2span(class_loc, probs[index], post_process=False)
        print('span')
        print(span)
        spans.append(span)
        one_hot = read.counterList2Dict(list(enumerate(labels_index[index], 1)))
        one_hot = {y: x for x, y in one_hot.items()}
        int2label = dict((label_int, label) for label, label_int in one_hot.items())
        int2labels.append(int2label)
    n_marks = 3
    sent_index = 0
    for data_id in range(0, len(doc_list_sub)):
        print('HERE %s' % doc_list_sub[data_id])
        print(os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        print('sent_spans %s' % sent_spans)
        data_span = list()
        for sent_span in sent_spans:
            print('classes - ', range(len(classes)))
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if len(span_list[0]) < 1:
                    pass
                else:
                    for [posi_start, posi_end, label] in span_list:
                        print('posi_start:%s posi_end:%s label:%s' % (posi_start, posi_end, label))
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label]
                        ])
            sent_index += 1
        print('data_span - ', data_span)
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        print('data %s' % data)
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id], doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input

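# End-to-end prediction sketch for the variant above. Every path, the way `model` and
# `sent_len` are obtained, and the packaging of the three input arrays as a list are
# assumptions for illustration, not taken from this repository.
def example_prediction_run(model, doc_list, sent_len):
    char, pos, unic = read.load_hdf5("data/model_folder/input_test", ["char", "pos", "unic"])
    generate_output_multiclass(sent_len, model, [char, pos, unic], doc_list,
                               "data/preprocessed", "data/output_pred",
                               pred=True, data_folder="_test")
    evaluate("data/xml_gold", "data/output_pred", "data/raw", doc_list,
             ".TimeNorm.system.completed.xml")
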