Example #1
    def get_septic_labels(self, dir_clamp, dir_labels):

        clamp_obj = Clamp()

        labels = dict()
        n_septic = 0

        for i in range(self.n_sepsis_sample_docs + self.n_negative_docs):
            cur_label = 'non_septic'

            is_infected = False
            n_present_labs = 0

            entities = clamp_obj.get_entities(str(i) + '.txt', dir_clamp)

            for cur_entity in entities:
                if not is_infected:
                    if (self.is_pneumonia_and_empyema(cur_entity.mention) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_meningitis) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_endocarditis) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_other_infections)
                    ) and cur_entity.assertion.lower() == 'present':
                        # infection term is mentioned and not negated in the sentence
                        is_infected = True

                if n_present_labs < 2:
                    if (self.get_regex_match(cur_entity.mention,
                                             self.regex_labs_temp) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_labs_wbc) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_mental_status) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_labs_tachycardia) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_labs_tachypnea) or
                        self.get_regex_match(cur_entity.mention,
                                             self.regex_labs_hyperglycemia)
                    ) and cur_entity.assertion.lower() == 'present':
                        # patient condition term is mentioned and
                        # not negated in the sentence
                        n_present_labs += 1

            # septic if the patient has an infection and
            # at least two of the pre-specified conditions
            if is_infected and n_present_labs >= 2:
                cur_label = 'septic'
                n_septic += 1

            labels[str(i)] = cur_label

        print("Number of instances labeled as septic: {} of total {} instances"
              .format(n_septic, self.n_sepsis_sample_docs + self.n_negative_docs))

        FileUtils.write_json(labels, 'sepsis_labels.json', dir_labels)
        return labels
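For reference, the `get_regex_match` helper used above is not shown in this example; a minimal sketch, assuming it only tests whether a pre-compiled pattern occurs in a mention (the real helper may differ):

import re

def get_regex_match(self, text, pattern):
    # assumed helper: `pattern` is a compiled regex, e.g. re.compile(r'meningitis');
    # returns True if it matches anywhere in `text`
    return re.search(pattern, text) is not None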
Example #2
class Newsgroups:
    PATH_DIR_CORPUS = realpath('../dataset/newsgroups/')
    print(PATH_DIR_CORPUS)
    FNAME_TRAIN = 'train_newsgroups.csv'
    FNAME_VAL = 'val_newsgroups.csv'
    FNAME_TEST = 'test_newsgroups.csv'
    FNAME_LABELDICT = 'newsgroups_labeldict.json'
    PATH_DIR_OUT = realpath('../out/')

    TOKENIZER = spacy_eng_tokenizer
    LABEL_DICT = FileUtils.read_json(FNAME_LABELDICT, PATH_DIR_CORPUS)

    PRETRAINED_EMBS = True
    PATH_DIR_EMBS = '/home/corpora/word_embeddings/'
    FNAME_EMBS = 'glove.840B.300d.txt'
    N_DIM_EMBS = 300
    embs_from_disk = True
    FNAME_EMBS_WT = 'pretrained_embs_newsgroups.npy'

    load_encoder = True
    FNAME_ENCODER = 'corpus_encoder_newsgroups.json'
    PATH_ENCODER = realpath('../out/')

    train_model = True
    model_name = 'lstm'
    n_layers = 3
    n_hid = 600
    n_emb = N_DIM_EMBS
    dropout = 0.3
    bidir = True

    test_mode = 'val'  # val | test
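These attributes are read by name in `process_model` (Example #7 below), so the class itself serves as the configuration object; a typical entry point:

if __name__ == '__main__':
    process_model(Newsgroups)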
Example #3
    def save(self, fname, dir_out='../out/'):

        seqs = self.corpus_encoder.get_decoded_sequences(self.corpus,
                                                         strip_angular=True)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            golds = self.corpus.label_encoder.inverse_transform(self.corpus.get_labels())
            preds = self.corpus.label_encoder.inverse_transform(self.preds)

            if not isinstance(golds, list):
                golds = golds.tolist()
            if not isinstance(preds, list):
                preds = preds.tolist()

        # save the sequences, importance scores, and gold and predicted labels as a JSON file

        FileUtils.write_json(
            {'seq_lst': seqs,
             'imp_scores': self.imp_scores,
             'gold': golds,
             'pred': preds},
            fname, dir_out)
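The written file can be loaded back with the `FileUtils.read_json(fname, dir)` counterpart used elsewhere in these examples; an illustrative round trip (the file name is a placeholder):

data = FileUtils.read_json('explanations.json', '../out/')
assert set(data.keys()) == {'seq_lst', 'imp_scores', 'gold', 'pred'}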
Example #4
    def from_imp(cls, pooling, model, corpus, encoder, dir_in='../out/'):

        fname = 'imp_scores_' + model.model_type + \
                '_hid' + str(model.hidden_dim) + '_emb' + str(model.emb_dim) + \
                '_' + splitext(corpus.fname)[0] + '_' + pooling + '.json'

        json_file = FileUtils.read_json(fname, dir_in)

        inst = cls(pooling, model, corpus, encoder,
                   json_file['imp_scores'],
                   corpus.label_encoder.transform(json_file['pred']))
        return inst
Example #5
def sg_param_search(seqs, scores, eval_obj):

    prec = dict()
    best_prec, best_min_n, best_max_n, best_skip = 0., None, None, None

    for min_n in range(1, 5):
        for max_n in range(min_n, min_n + 4):
            for skip in range(11):
                sg = SeqImpSkipGram.from_seqs(seqs,
                                              scores,
                                              min_n=min_n,
                                              max_n=max_n,
                                              skip=skip,
                                              topk=50)
                cur_prec = eval_obj.avg_prec_sg(sg.top_sg_seqs)
                # key converted to string for JSON serialization
                prec[repr((min_n, max_n, skip))] = cur_prec

                if cur_prec > best_prec:
                    best_prec, best_min_n, best_max_n, best_skip = cur_prec, min_n, max_n, skip

                print("Average precision at min_n {}, max_n {}, skip {} is: {}"
                      .format(min_n, max_n, skip, cur_prec))
                if max_n == 1:
                    # all skip values will give the same unigram.
                    # Hence iterating over it only once.
                    break

    print("Maximum precision {} for min_n {}, max_n {} and skip {}".format(
        best_prec, best_min_n, best_max_n, best_skip))

    FileUtils.write_json(prec, 'sg_param_search.json', '../out/')

    return best_min_n, best_max_n, best_skip
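A typical call, assuming decoded sequences, matching importance scores, and an evaluation object exposing `avg_prec_sg` are already in hand (variable names illustrative):

best_min_n, best_max_n, best_skip = sg_param_search(seqs, scores, eval_obj)
sg = SeqImpSkipGram.from_seqs(seqs, scores, min_n=best_min_n,
                              max_n=best_max_n, skip=best_skip, topk=50)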
Example #6
def main(f_labels,
         dir_labels,
         dir_corpus,
         fname_suffix,
         dir_csv,
         dir_clamp=None):

    label_dict = FileUtils.read_json(f_labels, dir_labels)
    sorted_labels = sorted(label_dict.items())
    fname_lst = [i for i, j in sorted_labels]  # file names in sorted order
    all_labels_lst = [j for i, j in sorted_labels]  # labels sorted by file name
    train_idx, val_idx, test_idx = split_data(fname_lst, all_labels_lst)
    # train_idx, val_idx, test_idx = read_splits('/home/madhumita/sepsis_synthetic/splits/')

    write_csv(train_idx, label_dict, dir_corpus, dir_clamp,
              'train_' + fname_suffix + '.csv', dir_csv, ["label", "text"])

    write_csv(val_idx, label_dict, dir_corpus, dir_clamp,
              'val_' + fname_suffix + '.csv', dir_csv, ["label", "text"])

    write_csv(test_idx, label_dict, dir_corpus, dir_clamp,
              'test_' + fname_suffix + '.csv', dir_csv, ["label", "text"])
Example #7
def process_model(ds):

    train_corp = CSVCorpus(ds.FNAME_TRAIN, realpath(ds.PATH_DIR_CORPUS),
                           'train', ds.TOKENIZER, ds.LABEL_DICT)
    val_corp = CSVCorpus(ds.FNAME_VAL, realpath(ds.PATH_DIR_CORPUS), 'val',
                         ds.TOKENIZER, ds.LABEL_DICT)
    test_corp = CSVCorpus(ds.FNAME_TEST, realpath(ds.PATH_DIR_CORPUS), 'test',
                          ds.TOKENIZER, ds.LABEL_DICT)

    if ds.load_encoder:
        if not exists(realpath(join(ds.PATH_ENCODER, ds.FNAME_ENCODER))):
            raise FileNotFoundError("Encoder not found")
        # load encoder
        corpus_encoder = CorpusEncoder.from_json(ds.FNAME_ENCODER,
                                                 ds.PATH_ENCODER)
    else:
        print("Initializing vocabulary")
        corpus_encoder = CorpusEncoder.from_corpus(train_corp)

        if not exists(realpath(ds.PATH_ENCODER)):
            makedirs(realpath(ds.PATH_ENCODER))
        print("Serializing corpus encoder")
        corpus_encoder.to_json(ds.FNAME_ENCODER, realpath(ds.PATH_ENCODER))

    print("Vocab size:", len(corpus_encoder.vocab))

    if ds.train_model:

        if ds.PRETRAINED_EMBS:
            # get embedding weights matrix
            if ds.embs_from_disk:
                print("Loading word embeddings matrix ...")
                weights = FileUtils.read_numpy(ds.FNAME_EMBS_WT,
                                               realpath(ds.PATH_DIR_OUT))
            else:
                weights = EmbeddingUtils.get_embedding_weight(
                    ds.FNAME_EMBS, realpath(ds.PATH_DIR_EMBS), ds.N_DIM_EMBS,
                    corpus_encoder.vocab.word2idx)
                print("Saving word embeddings matrix ...")
                FileUtils.write_numpy(weights, ds.FNAME_EMBS_WT,
                                      realpath(ds.PATH_DIR_OUT))

            weights = torch.from_numpy(weights).type(torch.FloatTensor)
            print("Word embeddings loaded!")
        else:
            weights = None

        net_params = {
            'n_layers': ds.n_layers,
            'hidden_dim': ds.n_hid,
            'vocab_size': corpus_encoder.vocab.size,
            'padding_idx': corpus_encoder.vocab.pad,
            'embedding_dim': ds.n_emb,
            'emb_weights': weights,
            'dropout': ds.dropout,
            'label_size': len(ds.LABEL_DICT.keys()),
            'batch_size': 64,
            'bidir': ds.bidir
        }

        classifier = LSTMClassifier(**net_params)

        n_epochs = 50
        lr = 0.001
        optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

        classifier.train_model(train_corp, corpus_encoder, n_epochs, optimizer,
                               val_corp)
        # the model file name encodes the corpus (FNAME_TRAIN minus its 'train_' prefix) and hyperparameters
        classifier.save(f_model=splitext(ds.FNAME_TRAIN)[0][6:] + '_' +
                        ds.model_name + '_' + str(ds.n_layers) + 'layer' +
                        '_hid' + str(ds.n_hid) + '_emb' + str(ds.n_emb) +
                        '_dropout' + str(ds.dropout) + '_bidir' +
                        str(ds.bidir) + '.tar',
                        dir_model=realpath(ds.PATH_DIR_OUT))

    else:
        f_model = splitext(ds.FNAME_TRAIN)[0][6:] + '_' + ds.model_name + '_' + \
                  str(ds.n_layers) + 'layer' + \
                  '_hid' + str(ds.n_hid) + \
                  '_emb' + str(ds.n_emb) + \
                  '_dropout' + str(ds.dropout) + \
                  '_bidir' + str(ds.bidir) + '.tar'

        print("Loading model", f_model)
        classifier = LSTMClassifier.load(f_model=f_model,
                                         dir_model=realpath(ds.PATH_DIR_OUT))

    if ds.test_mode == 'val':
        eval_corp = val_corp
    elif ds.test_mode == 'test':
        eval_corp = test_corp
    else:
        raise ValueError("Specify val|test corpus for evaluation")

    print("Testing on {} data".format(ds.test_mode))

    # get predictions
    y_pred, y_true = classifier.predict(eval_corp, corpus_encoder)
    # compute scoring metrics
    print("Macro F1 score: ",
          f1_score(y_true=y_true, y_pred=y_pred, average='macro'))
    print("Accuracy %", accuracy_score(y_true=y_true, y_pred=y_pred) * 100)
Example #8
    def get_septic_notes(self,
                         septic_hadm_ids,
                         fname_notes=FNAME_NOTES,
                         dir_in=PATH_MIMICIII):
        print("Loading notes csv")
        notes_df = PandasUtils.load_csv(fname_notes, dir_in)

        print("Removing error entries")
        prev_len = notes_df.shape[0]
        notes_df = notes_df[notes_df['ISERROR'] != 1]
        assert notes_df.shape[0] < prev_len, "No entries were removed"

        print("Removing leading and trailing spaces")
        notes_df['TEXT'] = notes_df['TEXT'].str.strip()
        print("Converting text to lowercase")
        notes_df['TEXT'] = notes_df['TEXT'].str.lower()

        print("Removing blank and NA entries from TEXT and HADM_ID columns")
        notes_df['TEXT'].replace('', np.nan, inplace=True)
        notes_df.dropna(subset=['HADM_ID', 'TEXT'], inplace=True)

        print("Converting HADM ID to int")
        notes_df['HADM_ID'] = notes_df['HADM_ID'].astype('int64')
        print("Converting chartdate to datetime")
        notes_df['CHARTDATE'] = pd.to_datetime(notes_df['CHARTDATE'],
                                               format='%Y-%m-%d')
        # print("All data types", notes_df.dtypes)

        print("Dropping duplicates")
        notes_df = notes_df.drop_duplicates(
            subset=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
                    'CATEGORY', 'DESCRIPTION', 'TEXT'],
            keep='first')

        print("Adding septic labels")
        notes_df['SEPTIC'] = np.where(
            notes_df['HADM_ID'].isin(septic_hadm_ids), "septic", "non_septic")

        len_all_notes = [
            len(cur_note.split()) for cur_note in list(notes_df['TEXT'])
        ]
        print("Average length of notes: ", statistics.mean(len_all_notes))
        print("Total number of notes: ", len(len_all_notes))

        print("Number of septic notes: ",
              notes_df[notes_df['SEPTIC'] == "septic"].shape[0])

        print("All categories of notes")
        print(set(notes_df['CATEGORY']))

        print("Removing social work notes")
        notes_df = notes_df[notes_df['CATEGORY'] != "Social Work"]

        print("Removing rehabilitation notes ")
        notes_df = notes_df[notes_df['CATEGORY'] != "Rehab Services"]

        print("Removing nutrition notes ")
        notes_df = notes_df[notes_df['CATEGORY'] != "Nutrition"]

        print("Removing discharge notes to prevent direct mention of sepsis")
        notes_df = notes_df[notes_df['CATEGORY'] != "Discharge summary"]

        print("New categories, ", set(notes_df['CATEGORY']))

        print("Total Number of notes: ", notes_df.shape[0])
        print("Number of septic notes: ",
              notes_df[notes_df['SEPTIC'] == "septic"].shape[0])

        # keep only the most recent note per admission (max CHARTDATE per HADM_ID)
        note_subset = notes_df.loc[notes_df.groupby(
            'HADM_ID').CHARTDATE.idxmax()]
        print("Number of notes after selecting last note per admission: ",
              note_subset.shape[0])
        print(
            "Number of septic notes after selecting last note per admission: ",
            note_subset[note_subset['SEPTIC'] == "septic"].shape[0])

        hadm_ids = list(
            note_subset[note_subset['SEPTIC'] == "septic"]['HADM_ID'])

        n_mention_sepsis = 0

        for hadm_id in hadm_ids:
            if 'sepsis' in note_subset[note_subset['HADM_ID'] ==
                                       hadm_id]['TEXT'].item():
                n_mention_sepsis += 1
                # print(note_subset[note_subset['HADM_ID'] == hadm_id]['TEXT'].item())

        print("Number of septic cases that mention sepsis: ",
              n_mention_sepsis)

        print("Serializing data")
        label_dict = {}  # {"HADM_ID": "septic"/"non_septic"}
        for hadm_id in note_subset['HADM_ID'].tolist():
            cur_label = note_subset[note_subset['HADM_ID'] ==
                                    hadm_id]['SEPTIC'].item()
            label_dict[str(hadm_id)] = cur_label
            text = note_subset[note_subset['HADM_ID'] ==
                               hadm_id]['TEXT'].item()
            FileUtils.write_txt(text,
                                str(hadm_id) + '.txt',
                                PATH_MIMICIII_SEPSIS_TEXT)

        # write labels json file
        FileUtils.write_json(label_dict, FNAME_LABELS,
                             PATH_MIMICIII_SEPSIS_LABELS)

        # serialize the selected note subset as CSV
        note_subset.to_csv(
            join(PATH_MIMICIII_SEPSIS, "mimic_sepsis_subset_df.csv"))
Example #9
    def write_dataset(self, dir_out):
        write_csv(self.x_train, self.y_train, 'train_newsgroups.csv', dir_out)
        write_csv(self.x_val, self.y_val, 'val_newsgroups.csv', dir_out)
        write_csv(self.x_test, self.y_test, 'test_newsgroups.csv', dir_out)

        FileUtils.write_json(self.label_dict, 'newsgroups_labeldict.json', dir_out)
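The `write_csv` helper used here takes parallel text and label lists; a minimal sketch, assuming the ["label", "text"] column order seen in Example #6 (the real helper may differ):

import csv
from os.path import join

def write_csv(texts, labels, fname, dir_out):
    # hypothetical sketch: header row, then one (label, text) row per instance
    with open(join(dir_out, fname), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["label", "text"])
        for text, label in zip(texts, labels):
            writer.writerow([label, text])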
Example #10
from src.utils import MathUtils, FileUtils, Validator
from src.mdchar import *

validator = Validator()
mathUtils = MathUtils()
fileUtils = FileUtils()

class RSA:
    def generate_key(self):
        """Gera a chave pública pegando todos os inputs do usuário"""

        [p, q] = validator.get_p_and_q_input()

        # n is the size of the finite set of values; it lets us
        # invert the steps performed when encrypting the message

        n = p * q

        while n <= 26:
            print("[!] P * Q precisa ser maior que 26")
            p = validator.get_prime_input("P")
            q = validator.get_prime_input("Q")
            n = p * q
        totiente = mathUtils.totiente(p, q)

        e = validator.get_e_input(totiente)

        fileUtils.write_file(f"{n} {e}", "public_key")
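`MathUtils.totiente` is assumed to compute Euler's totient of n = p * q; for distinct primes p and q this is exactly (p - 1) * (q - 1). A minimal sketch of the assumed helper:

class MathUtils:
    def totiente(self, p, q):
        # Euler's totient of p * q, valid when p and q are distinct primes
        return (p - 1) * (q - 1)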