Example #1
import os

import numpy as np

# `read`, `output`, `span2xmlfiles`, `non_operator_path` and `operator_path`
# are assumed to be helpers/constants from the surrounding project.
def generate_output_multiclass(model,
                               input,
                               gold,
                               doc_list_sub,
                               processed_path,
                               output_pred_path,
                               pred=True,
                               data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(
        input, model, output_pred_path)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder,
                           probs)

    spans = list()
    int2labels = list()
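    # One pass per output head: turn predicted class locations into spans and
    # build an index -> label lookup for that head.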
    for index in range(len(classes)):

        class_loc = output.found_location_with_constraint(classes[index])
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)

        # counterList2Dict turns the enumerated pairs into {index: label};
        # indices start at 1 because index 0 is the background class.
        int2label = read.counterList2Dict(
            list(enumerate(labels_index[index], 1)))
        int2labels.append(int2label)

    n_marks = 3
    sent_index = 0

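    # Walk the documents: map each sentence-level span back to document
    # offsets and serialize the collected spans as XML.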
    for data_id in range(len(doc_list_sub)):
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id],
                         doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if len(span_list[0]) >= 1:
                    # Shift span offsets from padded-sentence space back to
                    # document space: drop the n_marks padding and add the
                    # sentence start offset sent_span[1].
                    for posi_start, posi_end, label in span_list:
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label],
                        ])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id],
                                   doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input
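
A minimal call sketch; `model`, `x_input`, `y_gold` and `doc_list` are hypothetical names standing in for objects produced by the surrounding pipeline.

# Hypothetical driver for the function above; all arguments besides the
# two paths are assumed to come from the project's training code.
generate_output_multiclass(model, x_input, y_gold, doc_list,
                           "processed/", "predictions/")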
Example #2
def get_all_resources(data):
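    # Load the input sentences plus the non-specific term lists (singular and
    # plural) used by the entity extraction in Example #5.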
    sentences = read.read_from_csv(data + "/inputdata.csv")
    nss_terms = read.textfile2list("data/nss_terms.txt")
    nss_terms_plural = read.textfile2list("data/nss_terms_plural.txt")
    print(sentences)
    print(nss_terms)
    print(nss_terms_plural)
    return sentences, nss_terms, nss_terms_plural
def get_list_name(file_list_name):
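    # Keep only the THYMEColonFinal file names and save their basenames as JSON.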
    file_names = read.textfile2list(file_list_name)
    file_simple = [
        file_name.split("/")[-1] for file_name in file_names
        if "THYMEColonFinal" in file_name
    ]
    read.savein_json(file_list_name.replace(".txt", "_simple"), file_simple)
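
A short usage sketch; the "data" directory layout is an assumption read off the paths above.

# Hypothetical call; expects data/inputdata.csv plus the two term files.
sentences, nss_terms, nss_terms_plural = get_all_resources("data")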
Example #5
import os
from collections import defaultdict
from random import randint

import numpy as np

# `read`, `process`, `get_label`, `get_name`, `normalize`, `normalize_split`,
# `plural_singular` and `get_sample_weights_multiclass` are assumed to be
# helpers from the surrounding project.
def read_training():
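    # Merge the four SMM4H subtask-3 training splits, pull out their labels,
    # and save the train/test texts and the combined label set as JSON.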
    training1 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training1.txt")
    training2 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training2.txt")
    training3 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training3.txt")
    training4 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training4.txt")
    data1, labels1 = get_label(training1)
    data2, labels2 = get_label(training2)
    data3, labels3 = get_label(training3)
    data4, labels4 = get_label(training4)

    test_data = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_evaluation.txt")
    data_test, labels_test = get_label(test_data)
    data_train = data1 + data2 + data3 + data4
    labels_all = list(set(labels1 + labels2 + labels3 + labels4 + labels_test))

    read.save_in_json("data/SMM4H/train_ori", data_train)
    read.save_in_json("data/SMM4H/test", data_test)
    read.save_in_json("data/SMM4H/labels_ori", labels_all)
def get_entity(sentences):
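    # Scan token-tagged sentences for "Bio" entities; when an entity ends in a
    # non-specific term, also emit the more specific prefix as its own entity.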
    nss_terms = read.textfile2list("data/nss_terms.txt")
    nss_terms_plural = read.textfile2list("data/nss_terms_plural.txt")
    non_specific_terms = nss_terms + nss_terms_plural
    entity_list = []
    sid = 0
    for sentence in sentences:
        index = 0
        entity = []
        eid = 0
        while index < len(sentence):
            if "Bio" in sentence[index][2]:
                #print sentence[index]
                name, end, last_term = get_name(index, sentence)
                start = index
                tag = sentence[index][2]
                referent = sentence[index][3]
                if "_Bio" in sentence[index][
                        2] and last_term != "" and plural_singular(
                            last_term) in non_specfic_terms:
                    total, before, after, before_i = normalize_split(
                        normalize(name))
                    entity.append((sid, eid, start, start + before_i, before,
                                   tag, referent))
                    eid = eid + 1
                entity.append(
                    (sid, eid, start, end, name, tag, referent)
                )  #### sentence_id, entity_id, position_start, position_end, entity_term, referent
                index = end
                eid += 1
            index = index + 1
        sid = sid + 1
        entity_list.append(entity)
        #print entity
    return entity_list
def output_encoding(raw_data_dir,
                    preprocessed_path,
                    model_path,
                    data_folder="",
                    activation="softmax",
                    type="interval"):
    # type is one of ["interval", "operator", "explicit_operator",
    # "implicit_operator"]
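    # Encode gold tag spans as per-token label matrices: "sigmoid" emits a
    # multi-hot row over all labels; "softmax" emits one-hot rows with index 0
    # reserved for the background class. Arrays are saved under model_path.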
    target_labels = defaultdict(float)
    if type not in [
            "interval", "operator", "explicit_operator", "implicit_operator"
    ]:
        return
    interval = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    max_len = 350
    n_marks = 3
    max_len_text = max_len + 2 * n_marks
    n_output = 0
    final_labels = 0

    if activation == "sigmoid":
        final_labels = interval + operator
        n_output = len(final_labels)
    elif activation == "softmax":
        if "interval" in type:
            final_labels = interval
        elif "operator" in type:
            final_labels = operator
        n_output = len(final_labels) + 1

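    # Label strings -> 1-based indices (0 stays free for the background class).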
    one_hot = read.counterList2Dict(list(enumerate(final_labels, 1)))
    output_one_hot = {y: x for x, y in one_hot.items()}

    sample_weights_output = []
    outputs = []
    total_with_timex = 0
    n_sent_total = 0
    for data_id in range(len(raw_data_dir)):
        #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                 "_sent")
        tag_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                "_tag")
        n_sent = len(tag_span_list_file)
        n_sent_total += n_sent
        for index in range(n_sent):
            sent_info = sent_span_list_file[index]
            tag_info = tag_span_list_file[index]

            sentence_start = sent_info[1]
            label_encoding_sent = np.zeros((max_len_text, n_output))
            if activation == "softmax":
                label_encoding_sent[:, 0] = 1
            sample_weights_sent = np.zeros(max_len_text)

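            # Fill the label matrix for every annotated span in this sentence.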
            for label in tag_info:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info_new = list(set(info[2:]))

                if activation == "sigmoid":

                    label_indices = [
                        output_one_hot[token_tag] for token_tag in info_new
                        if token_tag in output_one_hot
                    ]
                    k = np.sum(np.eye(n_output)[[
                        sigmoid_index - 1 for sigmoid_index in label_indices
                    ]],
                               axis=0)

                    label_encoding_sent[position + n_marks:posi_end +
                                        n_marks, :] = np.repeat([k],
                                                                posi_end -
                                                                position,
                                                                axis=0)

                elif activation == "softmax":
                    if "explicit" in type or "interval" in type:
                        target_label = process.get_explict_label(
                            info_new, interval, operator)
                    elif "implicit" in type.split("_"):
                        target_label = process.get_implict_label(
                            info_new, interval, operator)
                    for token_tag in target_label:
                        if token_tag in final_labels:
                            target_labels[token_tag] += 1.0

                    label_indices = [
                        output_one_hot[token_tag] for token_tag in target_label
                        if token_tag in final_labels
                    ]
                    if len(label_indices) != 0:
                        # One-hot rows (index 0 is the background class),
                        # repeated across the span's tokens.
                        k = np.sum(np.eye(n_output)[label_indices], axis=0)
                        label_encoding_sent[position + n_marks:posi_end + n_marks, :] = \
                            np.repeat([k], posi_end - position, axis=0)
                t = len(label_indices)
                if t >= 1:
                    # Weight the span's tokens by one randomly chosen label index.
                    sample_weights_sent[position + n_marks:posi_end + n_marks] = \
                        label_indices[randint(0, t - 1)]
            sample_weights_output.append(sample_weights_sent)
            outputs.append(label_encoding_sent)
            total_with_timex += 1
            #print total_with_timex
    print(n_sent_total)
    sample_weights = np.asarray(sample_weights_output)
    sample_weights = get_sample_weights_multiclass(n_output, sample_weights,
                                                   0.05)
    #print target_labels
    np.save(
        model_path + "/sample_weights" + data_folder + "_" + type + "_" +
        activation, sample_weights)
    read.save_hdf5(
        model_path + "/output" + data_folder + "_" + type + "_" + activation,
        [type + "_" + activation], [outputs], ['int8'])
def get_pt():
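    # Build vocab <-> index lookup tables from the UMLS vocabulary ranking.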
    # cuis_all = read.read_from_json("data/umls/snomed_rxnorm_dict.txt")
    ranking = read.textfile2list("data/umls_vocab_ranking")
    vocab_idx = {item: idx for idx, item in enumerate(ranking)}
    print(vocab_idx)
    idx_vocab = {idx: item for idx, item in enumerate(ranking)}
Example #9
def generate_output_multiclass(sent_len,
                               model,
                               input,
                               doc_list_sub,
                               processed_path,
                               output_pred_path,
                               pred=True,
                               data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
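    # Debug-instrumented variant of Example #1 that also passes the sentence
    # lengths into the span-decoding constraint.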
    non_operator = read.textfile2list(non_operator_path)
    print('non_operator')
    print(non_operator)
    operator = read.textfile2list(operator_path)
    print('operator')
    print(operator)
    labels_index = [non_operator, operator, operator]
    print('labels_index')
    print(labels_index)
    classes, probs = output.make_prediction_function_multiclass(
        input, model, output_pred_path)
    print('sent_len')
    print(sent_len)
    print('classes, probs - ')
    print(classes)
    #print(probs)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder,
                           probs)

    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(classes[index], sent_len)
        print('class_loc')
        print(class_loc)
        span = output.loc2span(class_loc, probs[index], post_process=False)
        print('span')
        print(span)
        spans.append(span)

        # counterList2Dict turns the enumerated pairs into {index: label}.
        int2label = read.counterList2Dict(
            list(enumerate(labels_index[index], 1)))
        int2labels.append(int2label)

    n_marks = 3
    sent_index = 0

    for data_id in range(len(doc_list_sub)):
        print('HERE %s' % doc_list_sub[data_id])
        print(os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        print('sent_spans %s' % sent_spans)
        data_span = list()
        for sent_span in sent_spans:
            """ print('sent_span - ')
            print(sent_span)
            posi_start = sent_span[1]
            posi_end = sent_span[2]
            label = sent_span[0]
            print('posi_start:%s posi_end:%s, label:%s', (posi_start,posi_end,label))
            data_span.append([posi_start-n_marks+sent_span[1],posi_end-n_marks+ sent_span[1],int2labels[index][label]]) """
            print('classes - ', range(len(classes)))
            for index in range(len(classes)):
                #print('index - ' + index + ':' + sent_index)
                """ print(index)
                print(sent_index) """
                span_list = spans[index][sent_index]
                #print('span_list - ')
                #print(len(span_list[0]))
                #print(span_list)
                if len(span_list[0]) >= 1:
                    for posi_start, posi_end, label in span_list:
                        print('posi_start:%s posi_end:%s label:%s' %
                              (posi_start, posi_end, label))
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label],
                        ])
            sent_index += 1
        print('data_span - ', data_span)
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        print('data %s' % data)
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id],
                                   doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input