Example #1
def ingredients(output_folder):
    folder_name = 'preprocessed_data_to_extract_ingredients'
    # Special preprocessing for easier extraction of the ingredients
    generate_preprocessed_data([
        'ContractionExpander', 'NumberUnifier', 'SpellingCorrector',
        'MedicationRemover', 'SentenceTokenizer', 'WordTokenizer',
        'PunctuationRemover', 'LowerCaseTransformer'
    ],
                               params=None,
                               output_folder=output_folder,
                               folder_name=folder_name)

    generate_ingredients_file(output_folder / folder_name, 'train')
    generate_ingredients_file(output_folder / folder_name, 'test')
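
generate_ingredients_file is a repository helper whose implementation is not shown here. Purely as a hypothetical sketch (the TSV layout is taken from the other examples; the extraction logic is assumed, not the actual implementation):

def generate_ingredients_file(data_folder, split):
    # Read the preprocessed sentence pairs for the given split ('train'/'test')
    df = pd.read_csv(data_folder / f'preprocessed_data_{split}.tsv',
                     sep='\t', index_col='index')
    # Collect the tokens that survive the medication-oriented preprocessing
    tokens = set()
    for column in ('sentence a', 'sentence b'):
        for sentence in df[column]:
            tokens.update(str(sentence).split())
    # Persist one candidate ingredient token per line
    with open(data_folder / f'ingredients_{split}.txt', 'w') as f:
        f.write('\n'.join(sorted(tokens)))
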
def generate_normal(base_folder):
    base_folder.mkdir(parents=True, exist_ok=True)

    #######################
    # example_run_step1.py (generates features for BERT)
    #######################
    output_folder = base_folder / '1_example_run_step1'
    example_run_step1(output_folder)

    #######################
    # run.py (generates BERT scores for each fold)
    #######################
    input_folder = output_folder
    output_folder = base_folder / '2_run'

    env = os.environ.copy()
    # Environment values must be strings; the folders are pathlib.Path objects
    env['FEATURE_PATH'] = str(input_folder)
    env['OUTPUT_FOLDER'] = str(output_folder)
    os.chdir('copied_from_bert')
    retcode = subprocess.call('python run.py', shell=True, env=env)
    assert retcode == 0, 'An error occurred while running BERT'
    os.chdir('..')
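    # A less fragile alternative, sketched here, is to let subprocess switch
    # the working directory itself instead of chdir'ing back and forth:
    #   retcode = subprocess.call('python run.py', shell=True, env=env,
    #                             cwd='copied_from_bert')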

    #######################
    # evaluate_bert_test.py (combines BERT scores from kfolds, applies kfold ensembling on test set)
    #######################
    input_folder = sorted([
        folder for folder in output_folder.iterdir() if folder.is_dir()
    ])[-1]  # The last folder in sorted order (latest timestamp) contains the training result
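    # A sketch of an alternative that picks the newest run folder by
    # modification time rather than by name (assumes one subfolder per run):
    #   input_folder = max((f for f in output_folder.iterdir() if f.is_dir()),
    #                      key=lambda f: f.stat().st_mtime)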
    output_folder = base_folder / '3_evaluate_bert_test'
    evaluate_bert_test(input_folder, output_folder)

    shutil.copy2(output_folder / 'train_scores.csv',
                 base_folder / 'step1_train_scores.csv')
    shutil.copy2(output_folder / 'test_scores.csv',
                 base_folder / 'step1_test_scores.csv')

    #######################
    # example_run_step2.py (voting regression)
    #######################
    input_folder = output_folder
    output_folder = base_folder / '4_example_run_step2'

    env = os.environ.copy()
    # As above, environment values must be strings, not pathlib.Path objects
    env['BERT_SCORES_PATH'] = str(input_folder)
    env['OUTPUT_FOLDER'] = str(output_folder)
    os.chdir('challenge_pipelines')
    retcode = subprocess.call('python example_run_step2.py',
                              shell=True,
                              env=env)
    assert retcode == 0, 'An error occurred while running the voting regression'
    os.chdir('..')

    shutil.copy2(output_folder / '0_dev_prediction.csv',
                 base_folder / 'step2_train_scores.csv')
    shutil.copy2(output_folder / '0_test_prediction.csv',
                 base_folder / 'step2_test_scores.csv')

    #######################
    # ingredients.py (extract ingredients from sentences)
    #######################
    output_folder = base_folder / '5_ingredients'

    ingredients(output_folder)

    # Generate special preprocessing and folds for the graph
    generate_preprocessed_data([
        'ContractionExpander', 'NumberUnifier', 'SpellingCorrector',
        'MedicationRemover', 'SentenceTokenizer', 'WordTokenizer'
    ],
                               params=None,
                               output_folder=output_folder,
                               folder_name='preprocessed_data_ingredients')
    prepare_k_folds(output_folder / 'preprocessed_data_ingredients', k=10)
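    # prepare_k_folds is a repository helper; conceptually the split is
    # similar to the following sklearn sketch (not the actual implementation;
    # train_samples is a placeholder for the preprocessed rows):
    #   from sklearn.model_selection import KFold
    #   kf = KFold(n_splits=10, shuffle=True, random_state=0)
    #   folds = list(kf.split(train_samples))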

    #######################
    # eval_graph.py (medication graph)
    #######################
    ingredients_folder = output_folder
    input_scores_folder = base_folder / '4_example_run_step2'
    output_folder = base_folder / '6_eval_graph'
    eval_graph(ingredients_folder, input_scores_folder, output_folder)

    shutil.copy2(output_folder / 'tablet_similarity_train.csv',
                 base_folder / 'step4_train_scores.csv')
    shutil.copy2(output_folder / 'tablet_similarity_test.csv',
                 base_folder / 'step4_test_scores.csv')
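
A minimal driver for this example, assuming the script is started from the repository root; the output location is an arbitrary choice, not taken from the original code:

if __name__ == '__main__':
    from pathlib import Path

    # Hypothetical target folder; any writable location works
    generate_normal(Path('output') / 'normal_run')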
Example #3
def prepare_input_folder_cluster(folder):
    folder = folder / 'n2c2'
    folder.mkdir(parents=True, exist_ok=True)

    # Apply basic preprocessing for the InferSent embeddings
    preprocessing_name = generate_preprocessed_data([
        'ContractionExpander', 'NumberUnifier', 'SpellingCorrector',
        'LowerCaseTransformer'
    ],
                                                    output_folder=folder)
    df_train = pd.read_csv(folder / preprocessing_name /
                           'preprocessed_data_train.tsv',
                           sep='\t',
                           index_col='index')
    df_test = pd.read_csv(folder / preprocessing_name /
                          'preprocessed_data_test.tsv',
                          sep='\t',
                          index_col='index')

    # Calculate the InferSent embeddings
    df = pd.concat([df_train, df_test])
    df_embeddings = df.copy()

    class SentenceEmbedder:
        def __init__(self):
            self.document_embedding = DocumentEmbeddings('InferSentEmbeddings',
                                                         version=2)

        def get_embeddings(self, sentences):
            sentences = [Sentence(s) for s in sentences]
            self.document_embedding.embed_str(sentences)

            return [sentence.embedding.numpy() for sentence in sentences]

    embedder = SentenceEmbedder()
    df_embeddings['sentence a'] = embedder.get_embeddings(df['sentence a'])
    df_embeddings['sentence b'] = embedder.get_embeddings(df['sentence b'])

    # Calculate kmeans clustering
    vectors = np.array(df_embeddings['sentence a'].tolist() +
                       df_embeddings['sentence b'].tolist())

    # vectors stacks the sentence a embeddings followed by the sentence b
    # embeddings, so the train/test label pattern repeats twice
    n_train, n_test = len(df_train), len(df_test)  # 1642 and 412 pairs
    test_train_labels = 2 * (n_train * ['Training set'] + n_test * ['Test set'])
    train_idx_bool = [t == 'Training set' for t in test_train_labels]
    kmeans = KMeans(n_clusters=10,
                    random_state=1337).fit(vectors[train_idx_bool])
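    # The resulting cluster sizes can be inspected with, e.g.:
    #   np.bincount(kmeans.labels_)  # samples per cluster, length 10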

    # Subset of clusters whose training sentences are kept; cf. the
    # PaperFigures.ipynb notebook for more information on why these
    # clusters were selected
    selected_clusters = [0, 3, 4, 7, 9]
    ix = np.isin(kmeans.labels_[0:n_train], selected_clusters)

    # Create the corresponding train data set (root_path, train_name and the
    # other *_name variables are expected to be defined elsewhere in this script)
    train_data = pd.read_csv(root_path / train_name, sep='\t', header=None)
    train_data = train_data[ix]
    train_data.to_csv(folder / train_name, sep='\t', header=None, index=False)

    # Test data remains unchanged
    shutil.copy2(root_path / test_name, folder / test_name)
    shutil.copy2(root_path / test_labels_name, folder / test_labels_name)
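
The function expects a pathlib.Path and relies on the module-level names root_path, train_name, test_name and test_labels_name. A hedged usage sketch (the folder argument is illustrative):

from pathlib import Path

prepare_input_folder_cluster(Path('experiments') / 'cluster_subset')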
Example #4
                'pearsons': pearsons,
                'train_losses': train_losses
            },
            file,
            indent='\t',
            cls=JSONNumpyEncoder)


if __name__ == "__main__":
    with MeasureTime():
        output_folder = os.environ.get('OUTPUT_FOLDER',
                                       os.path.join(NLP_RAW_DATA, TASK_NAME))

        preprocessing_name = generate_preprocessed_data(
            [
                'ContractionExpander', 'NumberUnifier', 'SpellingCorrector',
                'LowerCaseTransformer'
            ],
            output_folder=output_folder)
        # 'ContractionExpander', 'NumberUnifier', 'SpellingCorrector', 'SentenceTokenizer', 'WordTokenizer', 'PunctuationRemover', 'LowerCaseTransformer'
        # preprocessing_name = 'preprocessed_data_2019-07-30_18-00-28'

        model_type = 'bert'
        model_name = 'biobert_pretrain_output_all_notes_150000'
        experiment_name = preprocessing_name + '_' + model_name

        # model = model_name
        model = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                             'bert_models', model_name)

        input_dir = os.path.join(output_folder, preprocessing_name)
        output_dir = os.path.join(output_folder, experiment_name)
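
JSONNumpyEncoder is referenced above but its definition is not part of this example. A common pattern for such an encoder, given here only as a minimal sketch under the assumption that it has to serialize numpy arrays and scalars, extends json.JSONEncoder:

import json

import numpy as np


class JSONNumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        # Convert numpy containers and scalars to plain Python equivalents
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # Fall back to the default behaviour for everything else
        return super().default(obj)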