Example #1
    def load_pretrained_token_embeddings(self, sess, dataset, parameters, token_to_vector=None):
        if parameters['token_pretrained_embedding_filepath'] == '':
            return
        # Load embeddings
        start_time = time.time()
        print('Load token embeddings... ', end='', flush=True)
        if token_to_vector is None:
            token_to_vector = hd.load_pretrained_token_embeddings(parameters)

        initial_weights = sess.run(self.token_embedding_weights.read_value())
        number_of_loaded_word_vectors = 0
        number_of_token_original_case_found = 0
        number_of_token_lowercase_found = 0
        number_of_token_digits_replaced_with_zeros_found = 0
        number_of_token_lowercase_and_digits_replaced_with_zeros_found = 0
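        # For each vocabulary token, look up a pretrained vector with a four-step
        # fallback: exact match, lowercased form, digits replaced with zeros, and
        # lowercased form with digits replaced with zeros. Unmatched tokens keep
        # whatever initial value the embedding matrix already holds.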
        for token in dataset.token_to_index.keys():
            if token in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[token]
                number_of_token_original_case_found += 1
            elif parameters['check_for_lowercase'] and token.lower() in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[token.lower()]
                number_of_token_lowercase_found += 1
            elif parameters['check_for_digits_replaced_with_zeros'] and re.sub(r'\d', '0', token) in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token)]
                number_of_token_digits_replaced_with_zeros_found += 1
            elif parameters['check_for_lowercase'] and parameters['check_for_digits_replaced_with_zeros'] and re.sub(r'\d', '0', token.lower()) in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token.lower())]
                number_of_token_lowercase_and_digits_replaced_with_zeros_found += 1
            else:
                continue
            number_of_loaded_word_vectors += 1
        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
        print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found))
        print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found))
        print("number_of_token_digits_replaced_with_zeros_found: {0}".format(number_of_token_digits_replaced_with_zeros_found))
        print("number_of_token_lowercase_and_digits_replaced_with_zeros_found: {0}".format(number_of_token_lowercase_and_digits_replaced_with_zeros_found))
        print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors))
        print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size))
        sess.run(self.token_embedding_weights.assign(initial_weights))
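
# A minimal usage sketch (not part of the original listing; names are illustrative):
# assuming a TensorFlow 1.x session and dataset/parameters objects built as in the
# examples below, the method above is called after constructing the EntityLSTM.
#
# import tensorflow as tf
# import entity_lstm as entity_model
#
# model = entity_model.EntityLSTM(dataset, parameters)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     model.load_pretrained_token_embeddings(sess, dataset, parameters)
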
def predict(files, model_path, output_dir, format, use_lstm=True):

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        sys.exit(1)

    # Load model
    #if use_lstm==False:
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        pretrained_dataset_path = parameters['model_folder'] + os.sep + "dataset.pickle"
        with open(pretrained_dataset_path, 'rb') as dataset_file:
            model._pretrained_dataset = pickle.load(dataset_file)
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None
        '''
        updating_notes=[]
        for i,txt in enumerate(sorted(files)):
           note=Document(txt)
           tokenized_sents  = note.getTokenizedSentences()
           updating_notes+=tokenized_sents
        print (updating_notes)
        fictional_labels= copy.deepcopy(tokenized_sents)
        for idx,x in enumerate(fictional_labels):
           for val_id,value in enumerate(x):
                fictional_labels[idx][val_id]='O'
        Datasets_tokens={}
        Datasets_labels={}
        Datasets_tokens['deploy']=tokenized_sents
        Datasets_labels['deploy']=fictional_labels
        dataset = Exp.Dataset() 
        token_to_vector=dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters,token_to_vector=model._pretrained_wordvector, pretrained_dataset=model._pretrained_dataset)
        parameters['Feature_vector_length']=dataset.feature_vector_size
        parameters['use_features_before_final_lstm']=False       
        dataset.update_dataset("", ['deploy'],Datasets_tokens,Datasets_labels)
        model._pretrained_dataset=dataset
        model_LSTM=entity_model.EntityLSTM(dataset,parameters)
        model._current_model=model_LSTM
        '''
        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        sys.exit()

    n = len(files)

    for i, txt in enumerate(sorted(files)):
        note = Document(txt)

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + 'con'
        out_path = os.path.join(output_dir, fname)

        if os.path.exists(out_path):
            print('\tWARNING: prediction file already exists (%s)' % out_path)
            #continue

        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i + 1, n))
        sys.stdout.write('\t%s\n\n' % txt)

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            f.write('%s\n' % output)
        sys.stdout.write('\n')
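
# A hedged usage sketch (illustrative paths, not from the original code): calling
# predict() on a directory of notes with a pickled model.
#
# import glob
# predict(files=glob.glob('data/examples/*.txt'),
#         model_path='models/example.model',
#         output_dir='data/predictions',
#         format='i2b2')
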
Example #3
    def load_dataset(self, avaliable_datasets_sent, avaliable_datasets_labels, dataset_filepaths, parameters, token_to_vector=None, pretrained_dataset=None):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        '''       
        start_time = time.time()
        print('Load dataset... \n')
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector is None:
                token_to_vector = hd.load_pretrained_token_embeddings(parameters)
        else:
            token_to_vector = {}
        
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        
        if parameters['use_pretrained_model']:

            if pretrained_dataset is None:
                pretrained_dataset_path = parameters['model_folder'] + os.sep + "dataset.pickle"
                with open(pretrained_dataset_path, "rb") as dataset_file:
                    pretraining_dataset = pickle.load(dataset_file)
                print("Pre-loading pre-trained dataset objects")
            else:
                pretraining_dataset = pretrained_dataset
                print("Pretrained dataset was pre-loaded")

            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values()
            

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        label_count = {}
        token_count = {}
        character_count = {}
        
        
        features = {}
        features_file_names = {}
        feature_vector_size = {}

        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            if dataset_type in avaliable_datasets_sent:
                # Parse the split that was actually provided.
                labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \
                features_file_names[dataset_type], feature_vector_size[dataset_type] \
                = self._parse_dataset("", dataset_type, sentences_list=avaliable_datasets_sent[dataset_type], tags_list=avaliable_datasets_labels[dataset_type])
            else:
                # Splits that were not supplied are parsed with empty token/label lists
                # so the count dictionaries below still get an entry for them.
                labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \
                features_file_names[dataset_type], feature_vector_size[dataset_type] \
                = self._parse_dataset("", dataset_type, sentences_list=[], tags_list=[])

        
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]
        
        
        
        if parameters['load_all_pretrained_token_embeddings']:
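            # Give every token from the pretrained embeddings (and every token known to
            # the pretraining dataset) an entry with sentinel count -1, so it still gets
            # its own index below even though it never occurs in the current data.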
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character]

        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1   
            
        
        label_count['all'] = {}
        for label in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()):
            label_count['all'][label] = label_count['train'][label] + label_count['valid'][label] + label_count['test'][label] + label_count['deploy'][label]

        token_count['all'] = hd.order_dictionary(token_count['all'], 'value_key', reverse = True)
        label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False)
        character_count['all'] = hd.order_dictionary(character_count['all'], 'value', reverse = True)
        if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))
        
        

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
        if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1

            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not hd.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                token_to_index[token] =  self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        #if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
        #if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))
        
        # Ensure that both B- and I- versions exist for each label
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = hd.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False)

        if parameters['use_pretrained_model']:
            
            print ("USE_PRETRAINED_MODEL ACTIVE")
            self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                         "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        
        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1


        token_to_index = hd.order_dictionary(token_to_index, 'value', reverse = False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = hd.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        label_to_index = hd.order_dictionary(label_to_index, 'value', reverse = False)
        index_to_label = hd.reverse_dictionary(label_to_index)

        character_to_index = hd.order_dictionary(character_to_index, 'value', reverse = False)
        index_to_character = hd.reverse_dictionary(character_to_index)

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        
        
        self.tokens = tokens
        self.labels = labels
        
        
        
        dataset_types = ['train', 'test', 'valid', 'deploy']
        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types)
        
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1


        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices


        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
        
        
        
        self.feature_vector_size = 0
        
        
        self._log()
        
        return token_to_vector
Example #4
def predict(files, model_path, output_dir, format, use_lstm=True):

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        sys.exit(1)

    # Load model
    #if use_lstm==False:
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        pretrained_dataset_path = parameters['model_folder'] + os.sep + "dataset.pickle"
        with open(pretrained_dataset_path, 'rb') as dataset_file:
            model._pretrained_dataset = pickle.load(dataset_file)
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None

        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        sys.exit()

    n = len(files)

    for i, txt in enumerate(sorted(files)):
        note = Document(txt)
        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + 'con'
        out_path = os.path.join(output_dir, fname)

        if os.path.exists(out_path):
            print()
            #print('\tWARNING: prediction file already exists (%s)' % out_path)
            #continue
        '''
        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i+1,n))
        sys.stdout.write('\t%s\n\n' % txt)
        '''
        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        print("-----------OUTPUT----------\n")
        print(output)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            f.write('%s\n' % output)
        sys.stdout.write('\n')