def save_data_test(data_test):
    filename = format_processed_filename(
        PROCESSED_DATA_DIR,
        PROCESSED_CONCEPTS_DATA_TEST_FILENAME_TEMPLATE,
        genre='clinical')
    save_pickle(filename, data_test)
    print('Saved:', filename.name)
def completed_event(self, stop_time, result):
    self.result = result

    exp_results = {
        'id': self.id,
        'name': self.name,
        'repository': self.repository,
        'config': self.config,
        'result': self.result,
    }

    filename = self._get_experiment_filename(self.id)
    save_pickle(filename, exp_results)
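# Usage sketch (hypothetical): a saved experiment record can later be read
# back with the `load_pickle` counterpart of `save_pickle`, e.g.
#
#   exp_results = load_pickle(filename)
#   print(exp_results['name'], exp_results['result'])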
def process_test():
    # load the test data
    _, _, data_mli_test = load_data(load_test=True)
    logging.info('Data: %s', data_mli_test.shape)

    for genre in ['clinical']:
        genre_test = data_mli_test.loc[genre]
        logging.info('Genre: %s, test: %s', genre, genre_test.shape)

        tokenized_test = tokenize_data(genre_test)

        # save the tokenized data into a pickle file
        filename = format_processed_filename(
            PROCESSED_DATA_DIR, PROCESSED_DATA_TEST_FILENAME_TEMPLATE,
            genre=genre)
        save_pickle(filename, tokenized_test)
def save(self, weights_path, weights_name):
    sess = K.get_session()
    variables = tf.trainable_variables()

    # group the variables of the `transition` scope
    # as {layer_name: {var_name: variable}}
    var_dict = dict()
    for v in variables:
        if "transition" in v.name:
            name = re.sub('transition/', '', v.name)
            name = re.sub(':0', '', name)
            layer_name, var_name = name.split('/')
            layer_name = re.sub('dense_', '', layer_name)

            if not var_dict.get(layer_name):
                var_dict[layer_name] = dict()
            var_dict[layer_name][var_name] = v

    for k in var_dict.keys():
        print(k)
        for j in var_dict[k].keys():
            print('---{0}'.format(j))

    # evaluate the variables into plain numpy arrays and pickle them
    weights = sess.run(var_dict)
    save_pickle(weights, weights_path, weights_name)
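# A minimal counterpart sketch (not part of the original class): restoring the
# pickled weights by assigning them back to the matching TF variables. It
# assumes the same 'transition/dense_<N>/<var>' naming scheme as `save` above
# and a hypothetical `load_pickle(weights_path, weights_name)` helper that
# mirrors the `save_pickle` call.
def load(self, weights_path, weights_name):
    weights = load_pickle(weights_path, weights_name)
    sess = K.get_session()
    for v in tf.trainable_variables():
        if 'transition' not in v.name:
            continue
        name = re.sub(':0', '', re.sub('transition/', '', v.name))
        layer_name, var_name = name.split('/')
        layer_name = re.sub('dense_', '', layer_name)
        # assign the saved numpy array to the live variable
        v.load(weights[layer_name][var_name], session=sess)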
def main():
    # load SNLI, MultiNLI and MLI datasets
    data_dev, data_train = load_data()
    logging.info('Data: train - %s, dev - %s', data_train.shape,
                 data_dev.shape)

    if not PROCESSED_DATA_DIR.exists():
        PROCESSED_DATA_DIR.mkdir()

    for genre in GENRES:
        if genre not in data_train.index:
            continue

        genre_train = data_train.loc[genre]
        genre_dev = data_dev.loc[genre]
        logging.info('Genre: %s, train: %s, dev: %s', genre,
                     genre_train.shape, genre_dev.shape)

        tokenized_train = tokenize_data(genre_train)
        tokenized_dev = tokenize_data(genre_dev)

        # save the tokenized (train, dev) tuple into a pickle file
        filename = format_processed_filename(PROCESSED_DATA_DIR,
                                             PROCESSED_DATA_FILENAME_TEMPLATE,
                                             genre=genre)
        save_pickle(filename, (tokenized_train, tokenized_dev))
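# Downstream usage sketch (hypothetical): each per-genre pickle written above
# holds a (train, dev) tuple and can be read back with `load_pickle`, e.g.
#
#   filename = format_processed_filename(PROCESSED_DATA_DIR,
#                                        PROCESSED_DATA_FILENAME_TEMPLATE,
#                                        genre='clinical')
#   tokenized_train, tokenized_dev = load_pickle(filename)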
def process_data(genre_source, genre_target, genre_tune, max_len, lowercase,
                 stem, clean, downsample_source, word_vectors_type,
                 word_vectors_replace_cui, use_umls_attention,
                 use_token_level_attention, padding='pre'):
    """Load data for the target genres, create and fit a tokenizer, and return the input matrices"""
    data_source_train, data_source_dev, data_target_train, data_target_dev, \
        data_tune_train, data_tune_dev = load_processed_genre_data(
            PROCESSED_DATA_DIR, PROCESSED_CONCEPTS_DATA_FILENAME_TEMPLATE,
            genre_source, genre_target, genre_tune)

    _, _, data_clinical_test = load_single_genre_data(
        PROCESSED_DATA_DIR, PROCESSED_CONCEPTS_DATA_FILENAME_TEMPLATE,
        genre='clinical',
        filename_test_template=PROCESSED_CONCEPTS_DATA_TEST_FILENAME_TEMPLATE)

    if clean:
        data_source_train = clean_data(data_source_train)
        data_source_dev = clean_data(data_source_dev)
        data_target_train = clean_data(data_target_train)
        data_target_dev = clean_data(data_target_dev)
        data_tune_train = clean_data(data_tune_train)
        data_tune_dev = clean_data(data_tune_dev)
        data_clinical_test = clean_data(data_clinical_test)
        logging.info('Data cleaned')

    if stem:
        data_source_train = stem_data(data_source_train)
        data_source_dev = stem_data(data_source_dev)
        data_target_train = stem_data(data_target_train)
        data_target_dev = stem_data(data_target_dev)
        data_tune_train = stem_data(data_tune_train)
        data_tune_dev = stem_data(data_tune_dev)
        data_clinical_test = stem_data(data_clinical_test)
        logging.info('Data stemmed')

    if use_token_level_attention:
        data_source_train = create_token_cuis(data_source_train)
        data_source_dev = create_token_cuis(data_source_dev)
        data_target_dev = create_token_cuis(data_target_dev)
        data_clinical_test = create_token_cuis(data_clinical_test)

    word_vectors_replacement = None
    if word_vectors_replace_cui != '':
        word_vectors_replacement_filename = get_word_vectors_filename(
            word_vectors_replace_cui)
        word_vectors_replacement = load_pickle(
            word_vectors_replacement_filename)
        logging.info('Replacement word vectors loaded: %s',
                     word_vectors_replacement_filename.name)

        target_cuis = set(word_vectors_replacement.keys())
        logging.info('Target CUIs: %s', len(target_cuis))

        data_source_train = replace_cui_data(data_source_train, target_cuis)
        data_source_dev = replace_cui_data(data_source_dev, target_cuis)
        data_target_train = replace_cui_data(data_target_train, target_cuis)
        data_target_dev = replace_cui_data(data_target_dev, target_cuis)
        data_tune_train = replace_cui_data(data_tune_train, target_cuis)
        data_tune_dev = replace_cui_data(data_tune_dev, target_cuis)
        data_clinical_test = replace_cui_data(data_clinical_test, target_cuis)
        logging.info('CUIs replaced')

    if downsample_source != 0:
        # downsample train and dev sets to the size of the clinical dataset
        nb_clinical_train = 11232
        nb_clinical_dev = 1395
        data_source_train = downsample_data(data_source_train,
                                            nb_needed=nb_clinical_train)
        data_source_dev = downsample_data(data_source_dev,
                                          nb_needed=nb_clinical_dev)

    # create tokenizer and vocabulary
    sentences_train = data_source_train['premise'] + data_source_train['hypothesis']
    if data_tune_train is not None:
        sentences_train += data_tune_train['premise'] + data_tune_train['hypothesis']
    tokenizer = Tokenizer(lower=lowercase, filters='')
    tokenizer.fit_on_texts(sentences_train)

    # create data matrices
    m_source_train = create_data_matrices(tokenizer, data_source_train,
                                          max_len, padding)
    m_source_dev = create_data_matrices(tokenizer, data_source_dev, max_len,
                                        padding)
    logging.info('Source: %s - train: %s, %s, %s, dev: %s, %s, %s',
                 genre_source,
                 m_source_train['premise'].shape,
                 m_source_train['hypothesis'].shape,
                 m_source_train['label'].shape,
                 m_source_dev['premise'].shape,
                 m_source_dev['hypothesis'].shape,
                 m_source_dev['label'].shape)

    m_tune_train = None
    m_tune_dev = None
    if data_tune_train is not None:
        m_tune_train = create_data_matrices(tokenizer, data_tune_train,
                                            max_len, padding)
        m_tune_dev = create_data_matrices(tokenizer, data_tune_dev, max_len,
                                          padding)
        logging.info('Tune: %s - train: %s, %s, %s, dev: %s, %s, %s',
                     genre_tune,
                     m_tune_train['premise'].shape,
                     m_tune_train['hypothesis'].shape,
                     m_tune_train['label'].shape,
                     m_tune_dev['premise'].shape,
                     m_tune_dev['hypothesis'].shape,
                     m_tune_dev['label'].shape)

    m_target_train = None
    m_target_dev = None
    if data_target_train is not None:
        m_target_train = create_data_matrices(tokenizer, data_target_train,
                                              max_len, padding)
        m_target_dev = create_data_matrices(tokenizer, data_target_dev,
                                            max_len, padding)
        logging.info('Target: %s - train: %s, %s, %s, dev: %s, %s, %s',
                     genre_target,
                     m_target_train['premise'].shape,
                     m_target_train['hypothesis'].shape,
                     m_target_train['label'].shape,
                     m_target_dev['premise'].shape,
                     m_target_dev['hypothesis'].shape,
                     m_target_dev['label'].shape)
    else:
        # target domain was not specified - use the dev set of the source domain
        m_target_dev = m_source_dev
        data_target_dev = data_source_dev
        logging.info('Target: %s - dev: %s, %s, %s', genre_source,
                     m_target_dev['premise'].shape,
                     m_target_dev['hypothesis'].shape,
                     m_target_dev['label'].shape)

    m_clinical_test = create_data_matrices(tokenizer, data_clinical_test,
                                           max_len, padding)
    logging.info('Clinical test: %s, %s, %s',
                 m_clinical_test['premise'].shape,
                 m_clinical_test['hypothesis'].shape,
                 m_clinical_test['label'].shape)

    # create embedding matrix
    if word_vectors_type != 'random':
        word_vectors_filename = get_word_vectors_filename(word_vectors_type)
        word_vectors = load_pickle(word_vectors_filename)
        logging.info('Word vectors loaded: %s', word_vectors_filename.name)

        if word_vectors_replacement is not None:
            word_vectors.update(word_vectors_replacement)
    else:
        # (low, high, dim) for uniformly sampled random vectors
        random_vectors_params = (-0.5, 0.5, 300)
        word_vectors = {}
        for token in tokenizer.word_index.keys():
            word_vectors[token] = np.random.uniform(*random_vectors_params)
        logging.info('Random vectors created: %s', random_vectors_params)

    W_emb = create_embedding_matrix(word_vectors, tokenizer.word_index)

    id_to_token = {i: t for t, i in tokenizer.word_index.items()}
    logging.info('Id to token: %s', len(id_to_token))

    if word_vectors_replace_cui != '' or use_token_level_attention:
        concepts_graph = nx.read_gpickle(str(UMLS_CONCEPTS_GRAPH_FILENAME))
        logging.info('UMLS concepts graph: %s', len(concepts_graph))

        # create UMLS-based attention
        if use_token_level_attention:
            att_source_train = create_umls_attention(
                m_source_train, id_to_token, concepts_graph,
                use_token_level_attention,
                data_source_train['premise_token_cuis'],
                data_source_train['hypothesis_token_cuis'])
            att_source_dev = create_umls_attention(
                m_source_dev, id_to_token, concepts_graph,
                use_token_level_attention,
                data_source_dev['premise_token_cuis'],
                data_source_dev['hypothesis_token_cuis'])
            att_target_dev = create_umls_attention(
                m_target_dev, id_to_token, concepts_graph,
                use_token_level_attention,
                data_target_dev['premise_token_cuis'],
                data_target_dev['hypothesis_token_cuis'])
            att_clinical_test = create_umls_attention(
                m_clinical_test, id_to_token, concepts_graph,
                use_token_level_attention,
                data_clinical_test['premise_token_cuis'],
                data_clinical_test['hypothesis_token_cuis'])

            m_source_train.update(att_source_train)
            m_source_dev.update(att_source_dev)
            m_target_dev.update(att_target_dev)
            m_clinical_test.update(att_clinical_test)

        # create memory
        if not use_token_level_attention:
            memory_source_train = create_memory_matrix(
                m_source_train, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)
            memory_source_dev = create_memory_matrix(
                m_source_dev, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)
            memory_target_dev = create_memory_matrix(
                m_target_dev, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)
            memory_clinical_test = create_memory_matrix(
                m_clinical_test, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)

            m_source_train.update(memory_source_train)
            m_source_dev.update(memory_source_dev)
            m_target_dev.update(memory_target_dev)
            m_clinical_test.update(memory_clinical_test)

    # use WordNet attention
    if genre_source != 'clinical' and word_vectors_replace_cui == '' \
            and use_umls_attention:
        att_source_train = create_wordnet_attention(m_source_train, id_to_token)
        att_source_dev = create_wordnet_attention(m_source_dev, id_to_token)
        att_target_dev = create_wordnet_attention(m_target_dev, id_to_token)
        att_clinical_test = create_wordnet_attention(m_clinical_test, id_to_token)

        m_source_train.update(att_source_train)
        m_source_dev.update(att_source_dev)
        m_target_dev.update(att_target_dev)
        m_clinical_test.update(att_clinical_test)

    # save tokenizer and embeddings matrix for the demo server
    save_pickle(
        DATA_DIR / 'tokenizer_{}_{}.pickled'.format(genre_source, genre_tune),
        tokenizer)
    save_pickle(
        DATA_DIR / 'embeddings_{}_{}.pickled'.format(genre_source, genre_tune),
        W_emb)

    return (m_source_train, m_source_dev, m_tune_train, m_tune_dev,
            m_target_train, m_target_dev, m_clinical_test, W_emb)
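# Invocation sketch (hypothetical argument values; the keyword names are the
# actual parameters of process_data above):
#
#   (m_source_train, m_source_dev, m_tune_train, m_tune_dev, m_target_train,
#    m_target_dev, m_clinical_test, W_emb) = process_data(
#        genre_source='fiction', genre_target='clinical', genre_tune='',
#        max_len=50, lowercase=True, stem=False, clean=True,
#        downsample_source=0, word_vectors_type='glove',
#        word_vectors_replace_cui='', use_umls_attention=False,
#        use_token_level_attention=False)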
def save_embeddings(filename, data_train, data_dev, word_vectors, target_cuis,
                    mode):
    # find with which words the concepts are represented in the data
    concepts_tokens = defaultdict(list)
    data_all = [
        zip(data_train['premise'], data_train['premise_concepts']),
        zip(data_train['hypothesis'], data_train['hypothesis_concepts']),
    ]
    for sentence, concepts in itertools.chain.from_iterable(data_all):
        for concept in concepts:
            cui = concept['cui']
            if cui not in target_cuis:
                continue

            pos_info = concept['pos_info']
            tokens = [sentence[p[0]:p[1]] for p in pos_info]
            concepts_tokens[cui].extend(tokens)

    logging.info('Concepts mode: %s', mode)
    logging.info('Concepts: %s', len(concepts_tokens))

    # filter out concepts without a single-token representation -
    # the rest is handled the same way as the cbow_most_common mode
    if mode == 'single_token':
        concepts_tokens = {
            concept: [t for t in tokens_list if ' ' not in t]
            for concept, tokens_list in concepts_tokens.items()
        }
        concepts_tokens = {
            c: t
            for c, t in concepts_tokens.items() if len(t) > 0
        }
        logging.info('Concepts single tokens: %s', len(concepts_tokens))

    if mode == 'cbow_all':
        # use the set of all tokens that ever represented the concept
        concepts_tokens = {
            cui: [tok for tokens in tokens_list for tok in tokens.split()]
            for cui, tokens_list in concepts_tokens.items()
        }
        concepts_tokens = {
            cui: set(tokens)
            for cui, tokens in concepts_tokens.items()
        }
    elif mode == 'single_most_common':
        concepts_tokens = {
            cui: [tok for tokens in tokens_list for tok in tokens.split()]
            for cui, tokens_list in concepts_tokens.items()
        }
        concepts_tokens_counter = {
            cui: Counter(tokens)
            for cui, tokens in concepts_tokens.items()
        }
        concepts_tokens = {}
        for concept, tokens_counts in concepts_tokens_counter.items():
            # there might be several tokens with the same frequency -
            # take the longest one in this case
            _, nb_most_common = tokens_counts.most_common(1)[0]
            tokens = [
                t for t, c in tokens_counts.most_common()
                if c == nb_most_common
            ]
            tokens = sorted(tokens, key=len, reverse=True)
            concepts_tokens[concept] = tokens[:1]
    elif mode == 'cbow_most_common' or mode == 'single_token':
        concepts_tokens_counter = {
            cui: Counter(tokens)
            for cui, tokens in concepts_tokens.items()
        }
        concepts_tokens = {}
        for concept, tokens_counts in concepts_tokens_counter.items():
            # take the first most common mention that has at least one embedding
            for tokens, counts in tokens_counts.most_common():
                tokens = tokens.split(' ')
                if any(t in word_vectors for t in tokens):
                    concepts_tokens[concept] = tokens
                    break
    else:
        raise ValueError('Unknown mode: {}'.format(mode))

    logging.info('Concepts tokens: %s', len(concepts_tokens))

    # create a word vector for each CUI as the average of its tokens' embeddings
    cuis_embeddings = {}
    for cui, tokens in concepts_tokens.items():
        cui_embeds = [
            word_vectors[token] for token in tokens if token in word_vectors
        ]
        if len(cui_embeds) > 0:
            cuis_embeddings[cui] = np.mean(cui_embeds, axis=0)
    logging.info('Concepts with embeddings: %s', len(cuis_embeddings))

    del word_vectors

    if filename.suffix == '.txt':
        # save embeddings in the retrofitting format
        with open(str(filename), 'w') as f:
            for cui, embeddings in cuis_embeddings.items():
                row = '{} {}\n'.format(cui, ' '.join(embeddings.astype(str)))
                f.write(row)
    else:
        save_pickle(filename, cuis_embeddings)

    logging.info('Embeddings saved: %s', filename.name)
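# Toy illustration of the selection modes (hypothetical data): suppose a CUI
# was mentioned in the data as ['high blood pressure', 'hypertension',
# 'hypertension']. Then:
#   - 'cbow_all'           -> average over {'high', 'blood', 'pressure',
#                             'hypertension'}
#   - 'single_most_common' -> keep only the most frequent token,
#                             'hypertension' (ties broken by token length)
#   - 'cbow_most_common'   -> the most frequent full mention with at least one
#                             embedded token, split into tokens
#   - 'single_token'       -> drop the multi-word mentions first, then proceed
#                             as in 'cbow_most_common'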