def generate_output_multiclass(model, input, gold, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Decode per-token multiclass predictions into character spans and write one XML file per document.

    Args:
        model: trained model passed to ``output.make_prediction_function_multiclass``.
        input: encoded network input (deleted at the end to free memory).
        gold: unused here; kept for interface compatibility with callers.
        doc_list_sub: document ids; each must have a ``<id>/<id>_sent`` JSON under ``processed_path``.
        processed_path: directory of preprocessed sentence-span files.
        output_pred_path: directory where predictions and XML output are written.
        pred: when True, cache raw classes/probabilities to disk.
        data_folder: suffix appended to the cached prediction file names.
        format_abbre: extension for the generated XML files.
    """
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    # One label inventory per output head: head 0 = non-operators, heads 1-2 = operators.
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:  # idiom: truth test instead of "== True"
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index])
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)
        # Build the int -> label lookup for this head (labels are 1-indexed; 0 is background).
        one_hot = read.counterList2Dict(list(enumerate(labels_index[index], 1)))
        one_hot = {y: x for x, y in one_hot.items()}
        int2label = dict((i, char) for char, i in one_hot.items())  # renamed loop var: "int" shadowed the builtin
        int2labels.append(int2label)
    n_marks = 3  # number of boundary-marker tokens prepended to each sentence during encoding
    sent_index = 0
    for data_id in range(len(doc_list_sub)):
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_list_sub[data_id], doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if span_list[0]:  # positive guard replaces "if len(...) < 1: pass else:"
                    for posi_start, posi_end, label in span_list:
                        # Shift from padded-sentence coordinates back to document offsets.
                        data_span.append([posi_start - n_marks + sent_span[1],
                                          posi_end - n_marks + sent_span[1],
                                          int2labels[index][label]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id], doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    # Free the large prediction arrays and encoded input explicitly.
    del classes, probs, input
def get_all_resources(data):
    """Load the input sentences plus the non-specific-term lists (singular and plural).

    Args:
        data: directory containing ``inputdata.csv``.

    Returns:
        Tuple of (sentences, nss_terms, nss_terms_plural).
    """
    sentences = read.read_from_csv(data + "/inputdata.csv")
    nss_terms = read.textfile2list("data/nss_terms.txt")
    nss_terms_plural = read.textfile2list("data/nss_terms_plural.txt")
    # print() call form: the original used Python-2 print statements, inconsistent
    # with the print(...) style used elsewhere in this file.
    print(sentences)
    print(nss_terms)
    print(nss_terms_plural)
    return sentences, nss_terms, nss_terms_plural
def get_list_name(file_list_name):
    """Filter a file list down to THYMEColonFinal entries and save their base names.

    Reads the listing at ``file_list_name``, keeps only paths containing
    "THYMEColonFinal", strips each to its final path component, and writes the
    result as JSON next to the input (".txt" replaced by "_simple").
    """
    full_paths = read.textfile2list(file_list_name)
    base_names = []
    for full_path in full_paths:
        if "THYMEColonFinal" in full_path:
            base_names.append(full_path.split("/")[-1])
    read.savein_json(file_list_name.replace(".txt", "_simple"), base_names)
def generate_output_multiclass(model, input, gold, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Decode per-token multiclass predictions into character spans and write one XML file per document.

    NOTE(review): this file contains multiple definitions of
    ``generate_output_multiclass``; in Python the last one executed wins.
    Consider removing the duplicates or giving the variants distinct names.

    Args:
        model: trained model passed to ``output.make_prediction_function_multiclass``.
        input: encoded network input (deleted at the end to free memory).
        gold: unused here; kept for interface compatibility with callers.
        doc_list_sub: document ids; each must have a ``<id>/<id>_sent`` JSON under ``processed_path``.
        processed_path: directory of preprocessed sentence-span files.
        output_pred_path: directory where predictions and XML output are written.
        pred: when True, cache raw classes/probabilities to disk.
        data_folder: suffix appended to the cached prediction file names.
        format_abbre: extension for the generated XML files.
    """
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    # One label inventory per output head: head 0 = non-operators, heads 1-2 = operators.
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:  # idiom: truth test instead of "== True"
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index])
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)
        # Build the int -> label lookup for this head (labels are 1-indexed; 0 is background).
        one_hot = read.counterList2Dict(list(enumerate(labels_index[index], 1)))
        one_hot = {y: x for x, y in one_hot.items()}
        int2label = dict((i, char) for char, i in one_hot.items())  # renamed loop var: "int" shadowed the builtin
        int2labels.append(int2label)
    n_marks = 3  # number of boundary-marker tokens prepended to each sentence during encoding
    sent_index = 0
    for data_id in range(len(doc_list_sub)):
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_list_sub[data_id], doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if span_list[0]:  # positive guard replaces "if len(...) < 1: pass else:"
                    for posi_start, posi_end, label in span_list:
                        # Shift from padded-sentence coordinates back to document offsets.
                        data_span.append([posi_start - n_marks + sent_span[1],
                                          posi_end - n_marks + sent_span[1],
                                          int2labels[index][label]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id], doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    # Free the large prediction arrays and encoded input explicitly.
    del classes, probs, input
def read_training():
    """Assemble the SMM4H subtask-3 normalization train/test data and label set.

    Reads the four training files and the evaluation file, extracts
    (data, labels) pairs via ``get_label``, and saves the concatenated
    training data, the test data, and the deduplicated union of all labels
    as JSON under data/SMM4H/.
    """
    prefix = "data/SMM4H/subtask3/task_3_normalization_"
    data_train = []
    train_labels = []
    # The four training shards are processed identically; fold them into a loop.
    for shard in ("training1", "training2", "training3", "training4"):
        shard_data, shard_labels = get_label(read.textfile2list(prefix + shard + ".txt"))
        data_train = data_train + shard_data
        train_labels = train_labels + shard_labels
    test_data = read.textfile2list(prefix + "evaluation.txt")
    data_test, labels_test = get_label(test_data)
    labels_all = list(set(train_labels + labels_test))
    read.save_in_json("data/SMM4H/train_ori", data_train)
    read.save_in_json("data/SMM4H/test", data_test)
    read.save_in_json("data/SMM4H/labels_ori", labels_all)
def get_entity(sentences):
    """Extract biomedical entity spans from token-annotated sentences.

    Args:
        sentences: iterable of sentences; each sentence is a sequence of token
            records where index 2 holds the tag and index 3 the referent
            (assumed from the accesses below — TODO confirm schema against caller).

    Returns:
        A list (one entry per sentence) of lists of 7-tuples:
        (sentence_id, entity_id, position_start, position_end, entity_term, tag, referent).
    """
    nss_terms = read.textfile2list("data/nss_terms.txt")
    nss_terms_plural = read.textfile2list("data/nss_terms_plural.txt")
    non_specfic_terms = nss_terms + nss_terms_plural
    entity_list = []
    sid = 0
    for sentence in sentences:
        index = 0
        entity = []
        eid = 0
        # NOTE(review): removed leftover debug hook "if sid == 57: print 123"
        # (Python-2 print statement; dead debug code).
        while index < len(sentence):
            if "Bio" in sentence[index][2]:
                name, end, last_term = get_name(index, sentence)
                start = index
                tag = sentence[index][2]
                referent = sentence[index][3]
                # When a "_Bio" entity ends in a non-specific term (matched in
                # singular form), also emit the specific prefix as its own entity.
                if "_Bio" in sentence[index][2] and last_term != "" and plural_singular(last_term) in non_specfic_terms:
                    total, before, after, before_i = normalize_split(normalize(name))
                    entity.append((sid, eid, start, start + before_i, before, tag, referent))
                    eid = eid + 1
                # (sentence_id, entity_id, position_start, position_end, entity_term, tag, referent)
                entity.append((sid, eid, start, end, name, tag, referent))
                index = end  # skip past the multi-token entity; outer increment advances one more
                eid += 1
            index = index + 1
        sid = sid + 1
        entity_list.append(entity)
    return entity_list
def output_encoding(raw_data_dir, preprocessed_path, model_path, data_folder="", activation="softmax", type="interval"):
    """Build per-token one-hot label encodings and sample weights, then save them.

    For every document in ``raw_data_dir``, reads its ``*_sent`` and ``*_tag``
    JSON files under ``preprocessed_path``, encodes each sentence's tags into a
    (max_len_text, n_output) one-hot matrix, and saves the stacked outputs
    (HDF5) and class-balanced sample weights (.npy) under ``model_path``.

    Args:
        raw_data_dir: list of document ids.
        preprocessed_path: directory containing ``<id>_sent`` / ``<id>_tag`` JSON files.
        model_path: output directory for the saved arrays.
        data_folder: suffix folded into the output file names.
        activation: "sigmoid" (multi-label over all labels) or "softmax"
            (single-label with a background class in column 0).
        type: one of "interval", "operator", "explicit_operator",
            "implicit_operator" (NOTE: parameter name shadows the builtin ``type``).

    Returns:
        None. Returns early (saving nothing) if ``type`` is not recognized.
    """
    ###type in "[interval","operator","explicit_operator","implicit_operator"]
    target_labels = defaultdict(float)  # label -> occurrence count (diagnostic only)
    if type not in ["interval", "operator", "explicit_operator", "implicit_operator"]:
        return
    interval = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    max_len = 350  # maximum sentence length, in tokens
    n_marks = 3    # boundary-marker tokens added to each side of a sentence
    max_len_text = max_len + 2 * n_marks
    n_output = 0
    final_labels = 0
    if activation == "sigmoid":
        # Multi-label: one output column per label, no background class.
        final_labels = interval + operator
        n_output = len(final_labels)
    elif activation == "softmax":
        if "interval" in type:
            final_labels = interval
        elif "operator" in type:
            final_labels = operator
        # +1 for the background class in column 0.
        n_output = len(final_labels) + 1
    # label -> 1-based output index.
    one_hot = read.counterList2Dict(list(enumerate(final_labels, 1)))
    output_one_hot = {y: x for x, y in one_hot.items()}
    sample_weights_output = []
    outputs = []
    total_with_timex = 0
    n_sent_total = 0
    for data_id in range(0, len(raw_data_dir)):
        #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path + "_sent")
        tag_span_list_file = read.readfrom_json(preprocessed_file_path + "_tag")
        n_sent = len(tag_span_list_file)
        n_sent_total += n_sent
        for index in range(n_sent):
            sent_info = sent_span_list_file[index]
            tag_info = tag_span_list_file[index]
            sentence_start = sent_info[1]  # document offset of the sentence start
            label_encoding_sent = np.zeros((max_len_text, n_output))
            if activation == "softmax":
                # Default every position to the background class.
                label_encoding_sent[:, 0] = 1
            sample_weights_sent = np.zeros(max_len_text)
            for label in tag_info:
                # Each tag entry is (position, info) where info[0] is the end
                # offset and info[2:] the tag names — assumed from the accesses
                # below; TODO confirm against the *_tag writer.
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info_new = list(set(info[2:]))
                if activation == "sigmoid":
                    label_indices = [output_one_hot[token_tag] for token_tag in info_new if token_tag in output_one_hot]
                    # Sum of one-hot rows => multi-hot vector (indices shifted to 0-based).
                    k = np.sum(np.eye(n_output)[[sigmoid_index - 1 for sigmoid_index in label_indices]], axis=0)
                    label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat([k], posi_end - position, axis=0)
                elif activation == "softmax":
                    if "explicit" in type or "interval" in type:
                        target_label = process.get_explict_label(info_new, interval, operator)
                    elif "implicit" in type.split("_"):
                        target_label = process.get_implict_label(info_new, interval, operator)
                    for token_tag in target_label:
                        if token_tag in final_labels:
                            target_labels[token_tag] += 1.0
                    label_indices = [output_one_hot[token_tag] for token_tag in target_label if token_tag in final_labels]
                    if len(label_indices) != 0:
                        # Indices are used as-is (column 0 is background), unlike the sigmoid branch.
                        k = np.sum(np.eye(n_output)[[softmax_index for softmax_index in label_indices]], axis=0)
                        label_encoding_sent[position + n_marks:posi_end + n_marks, :] = np.repeat([k], posi_end - position, axis=0)
                        t = len(label_indices)
                        if t >= 1:
                            # NOTE(review): stores a randomly chosen label *index* as the
                            # per-token weight seed; get_sample_weights_multiclass
                            # presumably converts this to a class weight — confirm.
                            sample_weights_sent[position + n_marks:posi_end + n_marks] = label_indices[randint(0, t - 1)]
            sample_weights_output.append(sample_weights_sent)
            outputs.append(label_encoding_sent)
            total_with_timex += 1
    #print total_with_timex
    print(n_sent_total)
    sample_weights = np.asarray(sample_weights_output)
    sample_weights = get_sample_weights_multiclass(n_output, sample_weights, 0.05)
    #print target_labels
    np.save(model_path + "/sample_weights" + data_folder + "_" + type + "_" + activation, sample_weights)
    read.save_hdf5(model_path + "/output" + data_folder + "_" + type + "_" + activation, [type + "_" + activation], [outputs], ['int8'])
def get_pt():
    """Load the UMLS vocabulary ranking and build term<->index lookup dicts.

    Prints the term-to-index mapping. The reverse mapping (index-to-term) is
    built but not used or returned.
    """
    ranking = read.textfile2list("data/umls_vocab_ranking")
    vocab_idx = {}
    idx_vocab = {}
    for position, term in enumerate(ranking):
        vocab_idx[term] = position
        idx_vocab[position] = term
    print(vocab_idx)
def generate_output_multiclass(sent_len, model, input, doc_list_sub, processed_path, output_pred_path, pred=True, data_folder="", format_abbre=".TimeNorm.system.completed.xml"):
    """Decode per-token multiclass predictions into character spans and write one XML file per document.

    Variant that passes per-sentence lengths to the constraint decoder and
    reads ``<id>_sent`` files directly under ``processed_path`` (no per-document
    subdirectory). Removed here: leftover debug prints — several of which used
    ``print('x %s', y)``, which does NOT interpolate (print takes two separate
    arguments) — and large blocks of commented-out dead code.

    Args:
        sent_len: per-sentence lengths forwarded to ``output.found_location_with_constraint``.
        model: trained model passed to ``output.make_prediction_function_multiclass``.
        input: encoded network input (deleted at the end to free memory).
        doc_list_sub: document ids; each must have a ``<id>_sent`` JSON under ``processed_path``.
        processed_path: directory of preprocessed sentence-span files.
        output_pred_path: directory where predictions and XML output are written.
        pred: when True, cache raw classes/probabilities to disk.
        data_folder: suffix appended to the cached prediction file names.
        format_abbre: extension for the generated XML files.
    """
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    # One label inventory per output head: head 0 = non-operators, heads 1-2 = operators.
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(input, model, output_pred_path)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder, probs)
    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index], sent_len)
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)
        # Build the int -> label lookup for this head (labels are 1-indexed; 0 is background).
        one_hot = read.counterList2Dict(list(enumerate(labels_index[index], 1)))
        one_hot = {y: x for x, y in one_hot.items()}
        int2label = dict((i, char) for char, i in one_hot.items())
        int2labels.append(int2label)
    n_marks = 3  # number of boundary-marker tokens prepended to each sentence during encoding
    sent_index = 0
    for data_id in range(len(doc_list_sub)):
        sent_spans = read.readfrom_json(os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if span_list[0]:
                    for posi_start, posi_end, label in span_list:
                        # Shift from padded-sentence coordinates back to document offsets.
                        data_span.append([posi_start - n_marks + sent_span[1],
                                          posi_end - n_marks + sent_span[1],
                                          int2labels[index][label]])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id], doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    # Free the large prediction arrays and encoded input explicitly.
    del classes, probs, input