def generate_output_multiclass(model, input, gold, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Decode multiclass predictions into document-level spans and write one XML file per document.

    Parameters:
        model: trained model handed to output.make_prediction_function_multiclass.
        input: encoded network input (explicitly deleted at the end to free memory).
        gold: unused here; kept so the call-site signature stays compatible.
        doc_list_sub: document names; each must have "<doc>/<doc>_sent" JSON under processed_path.
        processed_path: folder holding the per-document sentence-span JSON files.
        output_pred_path: destination folder for saved predictions and XML output.
        pred: when True, persist raw classes/probabilities under output_pred_path.
        data_folder: suffix appended to the saved prediction file names.
        format_abbre: extension appended to every emitted XML file.
    """
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    # One label inventory per prediction head.
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:  # fixed: `pred == True` comparison is non-idiomatic
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = []
    int2labels = []
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index])
        spans.append(output.loc2span(class_loc, probs[index], post_process=False))
        # Fixed: the original inverted this mapping twice ({y: x ...} and then
        # dict((i, c) for c, i in ...)), which for a bijective mapping is a no-op;
        # the helper's result (1-based class id -> label) is what we want directly.
        int2labels.append(read.counterList2Dict(list(enumerate(labels_index[index], 1))))
    n_marks = 3  # sentences were padded with 3 boundary marks during encoding
    sent_index = 0
    for data_id in range(len(doc_list_sub)):
        doc_name = doc_list_sub[data_id]
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_name, doc_name + "_sent"))
        data_span = []
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                # Guard clause replaces the original `if ...: pass / else:` inversion.
                if len(span_list[0]) < 1:
                    continue
                for posi_start, posi_end, label in span_list:
                    # Shift from padded-sentence coordinates back to document offsets;
                    # sent_span[1] is presumably the sentence start offset — matches the
                    # sibling encoder's use of sent_info[1].
                    data_span.append([posi_start - n_marks + sent_span[1],
                                      posi_end - n_marks + sent_span[1],
                                      int2labels[index][label]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_name)
        output_path = os.path.join(output_pred_path, doc_name, doc_name)
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input  # release the large arrays promptly
def generate_output_multiclass(model, input, gold, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Turn per-head multiclass predictions into character spans and emit per-document XML.

    Saves raw predictions when `pred` is True, maps each head's class ids back to
    label strings, converts padded-sentence offsets to document offsets, and
    writes one "<doc><format_abbre>" file per entry of doc_list_sub.
    """
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred == True:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = []
    int2labels = []
    for head, head_classes in enumerate(classes):
        locations = output.found_location_with_constraint(head_classes)
        spans.append(output.loc2span(locations, probs[head], post_process=False))
        # Build label->id, then flip it to id->label for decoding.
        pairs = read.counterList2Dict(list(enumerate(labels_index[head], 1)))
        label2int = {lab: idx for idx, lab in pairs.items()}
        int2labels.append({idx: lab for lab, idx in label2int.items()})
    n_marks = 3
    sent_index = 0
    for doc_name in doc_list_sub:
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_name, doc_name + "_sent"))
        data_span = []
        for sent_span in sent_spans:
            for head in range(len(classes)):
                span_list = spans[head][sent_index]
                if len(span_list[0]) >= 1:
                    for start, end, lab in span_list:
                        # Undo the sentence padding and re-anchor at the sentence start.
                        doc_start = start - n_marks + sent_span[1]
                        doc_end = end - n_marks + sent_span[1]
                        data_span.append([doc_start, doc_end, int2labels[head][lab]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_name)
        output_path = os.path.join(output_pred_path, doc_name, doc_name)
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input
def create_class_weight(n_labels, labels, mu):
    """Compute per-class weights inversely proportional to class frequency.

    Args:
        n_labels: number of softmax classes (class indices 0..n_labels-1).
        labels: iterable of label arrays/sequences; every element is scanned for
            occurrences of each class index.
        mu: scaling factor applied to the inverse-frequency score.

    Returns:
        dict mapping class index -> weight. A class scoring below 1.0 is clamped
        to 1.0; a class that never occurs gets a fixed boost weight of 10.0.
    """
    counts = np.zeros(n_labels, dtype='int32')
    for label_seq in labels:
        arr = np.asarray(label_seq)
        for i in range(n_labels):
            counts[i] += np.count_nonzero(arr == i)
    # Fixed: the original detoured through read.counterList2Dict(list(enumerate(counts, 0)))
    # only to iterate its items and sum its values; iterating counts directly is equivalent.
    total = counts.sum()
    class_weight = {}
    for key, value in enumerate(counts):
        if value != 0:  # fixed: `not value == 0` is non-idiomatic
            score = mu * total / float(value)
            class_weight[key] = score if score > 1.0 else 1.0
        else:
            class_weight[key] = 10.0  # unseen class: fixed boost
    return class_weight
def output_encoding(raw_data_dir, preprocessed_path, model_path, data_folder="", activation="softmax", type="interval"):
    """Build per-token one-hot label encodings plus sample weights and save them.

    For every document in raw_data_dir, reads "<name>_sent" / "<name>_tag" JSON
    from preprocessed_path, encodes each sentence as a (max_len_text, n_output)
    label matrix, then saves the stacked sample weights (.npy) and label
    encodings (HDF5) under model_path.

    Parameters:
        raw_data_dir: list of document names to process.
        preprocessed_path: folder containing the per-document span/tag JSON files.
        model_path: output folder for the generated .npy / HDF5 files.
        data_folder: suffix mixed into the saved file names.
        activation: "sigmoid" (multi-label over interval+operator labels) or
            "softmax" (single label set plus a background class in column 0).
        type: one of "interval", "operator", "explicit_operator",
            "implicit_operator"; anything else silently returns None.
    """
    ###type in "[interval","operator","explicit_operator","implicit_operator"]
    target_labels = defaultdict(float)  # frequency tally of labels actually emitted
    if type not in ["interval", "operator", "explicit_operator", "implicit_operator"]:
        return
    interval = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    max_len = 350  # maximum sentence length in tokens — presumably; verify against encoder
    n_marks = 3  # boundary padding added on each side of a sentence
    max_len_text = max_len + 2 * n_marks
    n_output = 0
    # NOTE(review): final_labels stays 0 if activation matches neither branch below;
    # enumerate(final_labels, 1) would then raise TypeError — confirm callers only
    # ever pass "sigmoid" or "softmax".
    final_labels = 0
    if activation == "sigmoid":
        final_labels = interval + operator
        n_output = len(final_labels)
    elif activation == "softmax":
        if "interval" in type:
            final_labels = interval
        elif "operator" in type:
            final_labels = operator
        n_output = len(final_labels) + 1  # +1 for the background class in column 0
    # label string -> 1-based class id
    one_hot = read.counterList2Dict(list(enumerate(final_labels, 1)))
    output_one_hot = {y: x for x, y in one_hot.items()}
    sample_weights_output = []
    outputs = []
    total_with_timex = 0
    n_sent_total = 0
    for data_id in range(0, len(raw_data_dir)):
        #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path + "_sent")
        tag_span_list_file = read.readfrom_json(preprocessed_file_path + "_tag")
        n_sent = len(tag_span_list_file)
        n_sent_total += n_sent
        for index in range(n_sent):
            sent_info = sent_span_list_file[index]
            tag_info = tag_span_list_file[index]
            sentence_start = sent_info[1]  # document offset of the sentence start
            label_encoding_sent = np.zeros((max_len_text, n_output))
            if activation == "softmax":
                label_encoding_sent[:, 0] = 1  # default every token to the background class
            sample_weights_sent = np.zeros(max_len_text)
            for label in tag_info:
                # assumes each tag entry unpacks as (start_posi, [end_posi, ?, tag, tag, ...]) — TODO confirm
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info_new = list(set(info[2:]))  # de-duplicated tag names for this span
                if activation == "sigmoid":
                    label_indices = [output_one_hot[token_tag] for token_tag in info_new if token_tag in output_one_hot]
                    # Multi-hot row: one column per present label (ids are 1-based, hence -1).
                    k = np.sum(np.eye(n_output)[[sigmoid_index - 1 for sigmoid_index in label_indices]], axis=0)
                    # Offset by n_marks to land in the padded coordinate system.
                    label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat([k], posi_end - position, axis=0)
                elif activation == "softmax":
                    # NOTE(review): target_label is unbound if neither condition matches;
                    # with the type values accepted above this appears unreachable — confirm.
                    if "explicit" in type or "interval" in type:
                        target_label = process.get_explict_label(info_new, interval, operator)
                    elif "implicit" in type.split("_"):
                        target_label = process.get_implict_label(info_new, interval, operator)
                    for token_tag in target_label:
                        if token_tag in final_labels:
                            target_labels[token_tag] += 1.0
                    label_indices = [output_one_hot[token_tag] for token_tag in target_label if token_tag in final_labels]
                    if len(label_indices) != 0:
                        k = np.sum(np.eye(n_output)[[softmax_index for softmax_index in label_indices]], axis=0)
                        label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat([k], posi_end - position, axis=0)
                        t = len(label_indices)
                        if t >= 1:  # always true here given the len != 0 guard above
                            # Weight the span's tokens by one randomly chosen class id among the hits.
                            sample_weights_sent[position + n_marks:posi_end + n_marks] = label_indices[randint(0, t - 1)]
            sample_weights_output.append(sample_weights_sent)
            outputs.append(label_encoding_sent)
            total_with_timex += 1
    #print total_with_timex
    print(n_sent_total)
    sample_weights = np.asarray(sample_weights_output)
    sample_weights = get_sample_weights_multiclass(n_output, sample_weights, 0.05)
    #print target_labels
    np.save(model_path + "/sample_weights" + data_folder + "_" + type + "_" + activation, sample_weights)
    read.save_hdf5(model_path + "/output" + data_folder + "_" + type + "_" + activation, [type + "_" + activation], [outputs], ['int8'])
def generate_output_multiclass(sent_len, model, input, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Decode length-constrained multiclass predictions and write per-document XML.

    Unlike the sibling variant, this one passes `sent_len` through to
    output.found_location_with_constraint and reads "<doc>_sent" directly under
    processed_path (no per-document subfolder).

    Parameters:
        sent_len: per-sentence length information forwarded to the span decoder.
        model: trained model handed to output.make_prediction_function_multiclass.
        input: encoded network input (explicitly deleted at the end to free memory).
        doc_list_sub: document names; one XML file is produced per document.
        processed_path: folder holding "<doc>_sent" sentence-span JSON files.
        output_pred_path: destination folder for saved predictions and XML output.
        pred: when True, persist raw classes/probabilities under output_pred_path.
        data_folder: suffix appended to the saved prediction file names.
        format_abbre: extension appended to every emitted XML file.
    """
    # Fixed: removed ad-hoc debug prints — several misused printf-style
    # formatting (print('x %s', v) emits a tuple, it never interpolates) —
    # and deleted the commented-out dead code block.
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    # One label inventory per prediction head.
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:  # fixed: `pred == True` comparison is non-idiomatic
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = []
    int2labels = []
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index], sent_len)
        spans.append(output.loc2span(class_loc, probs[index], post_process=False))
        # Fixed: the original inverted this mapping twice, which is a no-op for a
        # bijective dict; keep the helper's id -> label result directly.
        int2labels.append(read.counterList2Dict(list(enumerate(labels_index[index], 1))))
    n_marks = 3  # sentences were padded with 3 boundary marks during encoding
    sent_index = 0
    for data_id in range(len(doc_list_sub)):
        doc_name = doc_list_sub[data_id]
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_name + "_sent"))
        data_span = []
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                # Guard clause replaces the original `if ...: pass / else:` inversion.
                if len(span_list[0]) < 1:
                    continue
                for posi_start, posi_end, label in span_list:
                    # Shift from padded-sentence coordinates back to document offsets.
                    data_span.append([posi_start - n_marks + sent_span[1],
                                      posi_end - n_marks + sent_span[1],
                                      int2labels[index][label]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_name)
        output_path = os.path.join(output_pred_path, doc_name, doc_name)
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input  # release the large arrays promptly