def fetch_and_fold(table, engine, targets, n_reports):
    """ Fetch reports from the vcare database

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # data used to train the model
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d', unit='D')

    df_reports = sql2df(engine, table) \
        .loc[:, ['original_date', 'patient_id', 'report']]

    # drop empty reports and align the date column name
    mask = [report is not None for report in df_reports['report']]
    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
    df_reports = df_reports.loc[mask]

    # join the features frame with the complete patient information
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
                                  right_on='id').drop('id', axis=1)
    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]

    # fold the frames so that they all share the same columns
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
    reports_folded = folder.fold(df_reports)
    reports_folded.dropna(inplace=True)
    reports_folded.drop_duplicates(subset=['value'], inplace=True)

    # keep only the first `n_reports` reports per patient
    group_dict = {key2: 'first',
                  'feature': 'first',
                  date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    reports_folded = reports_folded.groupby(key1, as_index=False) \
        .agg(group_dict)

    # parse the text reports (the candidate sections below are currently
    # not passed to the parser)
    sections = ['examens complementaire', 'hopital de jour',
                'examen du patient']
    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
                           col_name='value')
    reports_folded['value'] = parser.transform(reports_folded)

    return reports_folded
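# Hedged usage sketch (not part of the module): the connection string and
# the table/target names are hypothetical placeholders; `sql2df` is
# assumed to work with any SQLAlchemy engine.
if __name__ == '__main__':
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:password@localhost/vcare')
    reports = fetch_and_fold(table='event.report', engine=engine,
                             targets='patient_target', n_reports=5)
    print(reports.head())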
def test_pipeline(self):
    """ Tests transform_and_label on a custom ReportsParser transformer
    """
    df = self.setUp()
    df = df[df['feature'] == 'feat2']
    df.index = pd.RangeIndex(len(df.index))

    pipeline = Pipeline
    df_res = transform_and_label(df, 'key1', 'key2', 'date', 'feature',
                                 'value', pipeline,
                                 steps=[('parser',
                                         ReportsParser(headers=None)),
                                        ('tfidf', TfidfVectorizer())])

    expected = {'key1': ['1', '2', '2'],
                'key2': ['a1', 'a2', 'a2'],
                'feature': ['feat2_0', 'feat2_1', 'feat2_2'],
                'value': [1, 1, 1],
                'date': ['2018-06-14', '2018-05-22', '2017-03-01']}
    df_expected = pd.DataFrame(expected)

    assert_array_equal(df_expected.values, df_res.values)
    assert_list_equal(list(df_expected.columns), list(df_res.columns))
def test_remove_section(self):
    x = self.setUp()
    # note the trailing comma: `sections` expects an iterable of section
    # names, not a bare string
    parsed_x = ReportsParser(sections=('title 1',)).transform(x)
    res_text = parsed_x.values[0]
    expected_text = 'this is div 1 text 1 0 text 1 1 this is a span'
    assert_equal(res_text, expected_text)
def test_transform(self):
    x = self.setUp()
    parsed_x = ReportsParser(strategy='strings', norm=False).transform(x)
    res_text = parsed_x.values[0]
    expected_text = 'this is div 0 text 0 0 text 0 1 ' \
                    'this is div 1 text 1 0 text 1 1 this is a span'
    assert_equal(res_text, expected_text)
def test_transform2(self):
    x = self.setUp(False)
    parsed_x = ReportsParser(strategy='strings', headers='b',
                             is_html=False).transform(x)
    res_text = parsed_x.values[0]
    expected_text = 'this is a text some other text conclusion text'
    assert_equal(res_text, expected_text)
def test_remove_section2(self):
    x = self.setUp(False)
    parsed_x = ReportsParser(sections=('other bold title',
                                       'last bold title'),
                             headers='b', is_html=False).transform(x)
    res_text = parsed_x.values[0]
    expected_text = 'some other text conclusion text'
    assert_equal(res_text, expected_text)
def test_transform_tokens(self):
    x = self.setUp()
    parsed_x = ReportsParser(strategy='tokens', norm=True).transform(x)
    res_tokens = parsed_x.values[0]
    expected_tokens = ['this', 'is', 'div', '0', 'text', '0', '0',
                       'text', '0', '1', 'this', 'is', 'div', '1',
                       'text', '1', '0', 'text', '1', '1',
                       'this', 'is', 'a', 'span']
    assert_list_equal(res_tokens, expected_tokens)
from clintk.text_parser.parser import ReportsParser
from clintk.text2vec.transformers import AverageWords2Vector, Text2Vector
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set()

# load data
path = 'data/reports.csv'
df = pd.read_csv(path, sep=';', encoding='utf-8').head(1000)

# parse the reports
sections = ['hopital de jour', 'examen du patient',
            'examens complementaires']
parser = ReportsParser(strategy='tokens', remove_sections=sections)
X = parser.transform(df)

###################### vectorization using word2vec aggregation
# text2vec = AverageWords2Vector(sg=1, min_count=5).fit(X)
# x_w2v = text2vec.transform(X)
text2vec = Text2Vector()
text2vec.fit(X)
x_w2v = text2vec.transform(X)

# plot the result using a t-SNE reduction
X_embedded = TSNE().fit_transform(x_w2v)

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=10)
plt.show()

###################### vectorization using TF-IDF
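# Minimal TF-IDF sketch reusing the CountVectorizer, TfidfTransformer and
# Pipeline imports above. It assumes `X` holds token lists
# (strategy='tokens'), so documents are joined back into strings first.
docs = [' '.join(tokens) for tokens in X]

tfidf = Pipeline([('count', CountVectorizer()),
                  ('tfidf', TfidfTransformer())])
x_tfidf = tfidf.fit_transform(docs)

# same t-SNE visualisation as above, on the TF-IDF vectors
x_tfidf_embedded = TSNE().fit_transform(x_tfidf.toarray())

plt.scatter(x_tfidf_embedded[:, 0], x_tfidf_embedded[:, 1], s=10)
plt.show()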
def fetch_and_fold(path, engine, targets, n_reports):
    """ Fetch reports from simbad data

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    df = pd.read_excel(path)

    # normalize nip, e.g. '123456' + 'AB' -> '1234-56 AB'
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)
    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # keep only consultation reports
    df = df[df['CR NAT'] == 'CC']
    # mask to keep only the first one
    mask = (df['date'] == df['DATE_SIGN_OK'])
    df = df[mask]
    # df_rh = df[df['CR NAT'] == 'RH']

    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # keep only the first `n_reports` reports per patient
    group_dict = {'date': 'first',
                  'DATE_SIGN_OK': 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    df = df.groupby('nip', as_index=False).agg(group_dict)

    # # filter uninformative reports and keep the first one
    # df_rh = df_rh[~(df_rh['value'].str.match('Examen du', na=False))]
    # df_rh.dropna(inplace=True)
    # df_rh.drop_duplicates(subset=['value'], inplace=True)
    #
    # df_rh = df_rh.groupby('nip', as_index=False).agg(group_dict)
    #
    # df = pd.concat([df_cc, df_rh], ignore_index=True)

    # remove tags that would block parsing
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    # filter by date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    df = df.merge(df_targets, on='nip')

    parser = ReportsParser(headers='b', is_html=False, norm=False,
                           n_jobs=-1, col_name='value')
    df['value'] = parser.transform(df)

    df['feature'] = ['report'] * df.shape[0]
    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    df = df[df['value'] != '']
    df.drop_duplicates(inplace=True)

    return df
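# Hedged usage sketch: the Excel export path and connection string are
# hypothetical. Because this fetcher returns the same folded columns as
# the vcare one, both outputs could be concatenated with pd.concat.
if __name__ == '__main__':
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:password@localhost/vcare')
    df_simbad = fetch_and_fold('data/simbad_export.xlsx', engine,
                               'patient_target', n_reports=1)
    print(df_simbad.shape)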
""" This script is intended to show the usage of word2vec for our use-case """ import pandas as pd import matplotlib.pyplot as plt from gensim.models import word2vec from sklearn.manifold import TSNE from clintk.text_parser.parser import ReportsParser path = '../data/reports.csv' df = pd.read_csv(path, sep=';').head(1000) parser = ReportsParser(strategy='tokens') X = parser.transform(df) w2v = word2vec.Word2Vec(X, size=128, window=8, sample=0.1) word_vectors = w2v[w2v.wv.vocab] words = w2v.wv.vocab words_embedded = TSNE().fit_transform(word_vectors) plt.scatter(words_embedded[:, 0], words_embedded[:, 1], s=10) for i, word in enumerate(words): plt.annotate(word, xy=(words_embedded[i, 0], words_embedded[i, 1])) plt.show()
def fetch_and_fold(path, engine, targets, n_reports):
    """ Fetch radiology reports from the vcare database

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, [key2, 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d', unit='D')

    df_rad = pd.read_excel(path, sep=';', usecols=7, parse_dates=[1, 2, 5])

    # filter SESSION and keep prescreen values
    mask = df_rad['SESSION'].isin(['SCA ', 'IRM '])
    # mask_prescreen = df_rad['OBSERVATION DATE'] == 'Before Date'
    df_rad = df_rad[mask]  # [mask_prescreen]
    df_rad['CR_DATE'] = pd.to_datetime(df_rad['CR_DATE'], format='%Y%m%d')

    # remove useless rows
    df_rad = df_rad[~(df_rad['CONTENU_CR'].str.match('Examen du', na=False))]
    df_rad.rename({'CR_DATE': date}, axis=1, inplace=True)

    df_rad = df_rad.merge(df_targets, on=None, left_on='Nip',
                          right_on='nip').drop('Nip', axis=1)
    df_rad['patient_id'] = df_rad['id']
    df_rad.drop('id', axis=1, inplace=True)

    folder = Folder(key1, key2, ['CONTENU_CR'], date, n_jobs=-1)
    rad_folded = folder.fold(df_rad)
    rad_folded.dropna(inplace=True)
    rad_folded.drop_duplicates(subset=['value'], inplace=True)

    # concatenate the first `n_reports` reports per patient
    group_dict = {key2: 'first',
                  'feature': 'first',
                  date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    rad_folded = rad_folded.groupby(key1, as_index=False).agg(group_dict)

    # remove tags that would block parsing
    rad_folded['value'] = rad_folded.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles',
                'conclusion', 'lesion(s) non cible(s)',
                'nouvelle(s) lesion(s)', 'resultats a l etage thoracique',
                'en fenetre osseuse', 'a l etage abdomino pelvien',
                'conclusion :')
    parser = ReportsParser(headers='b', is_html=False, col_name='value',
                           sections=sections, n_jobs=-1)
    rad_folded['value'] = parser.transform(rad_folded)

    rad_folded = rad_folded[rad_folded['value'] != '']
    rad_folded['feature'] = ['rad'] * rad_folded.shape[0]

    return rad_folded
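# Hedged usage sketch: the export path and connection string are
# hypothetical placeholders. The returned frame is folded to the same
# columns as the other fetchers, so outputs can be combined downstream.
if __name__ == '__main__':
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:password@localhost/vcare')
    rad = fetch_and_fold('data/radiology_export.xlsx', engine,
                         'patient_target', n_reports=3)
    print(rad.groupby('feature').size())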
def parse_cr(path, engine, targets, n_reports):
    """ Fetch and parse reports, keeping the first `n_reports` per patient
    """
    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports, e.g. path = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(path)
    df = df[df['CR NAT'] == 'CR']
    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # remove useless reports
    df = df[~(df['value'].str.match('Examen du', na=False))]

    # filter by date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    # normalize nip, e.g. '123456' + 'AB' -> '1234-56 AB'
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)

    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # keep only the first `n_reports` reports per patient
    group_dict = {'date': 'first',
                  'DATE_SIGN_OK': 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    df = df.groupby('nip', as_index=False).agg(group_dict)

    df = df.merge(df_targets, on='nip')

    # remove tags that would block parsing
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles',
                'conclusion', 'lesions non cibles', 'nouvelles lesions',
                'lesion s non cible s', 'nouvelle s lesion s',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino pelvien', 'conclusion :', '')
    parser = ReportsParser(headers='b', is_html=False, col_name='value',
                           sections=sections, n_jobs=-1)
    df['value'] = parser.transform(df)

    # drop empty rows
    df = df[df['value'] != '']
    df['feature'] = ['rad'] * df.shape[0]
    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    return df
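# Hedged usage sketch: dump the parsed reports to csv for later
# vectorisation; the connection string and output path are hypothetical.
if __name__ == '__main__':
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:password@localhost/vcare')
    df_cr = parse_cr('data/cr_sfditep_2012.xlsx', engine,
                     'patient_target', n_reports=5)
    df_cr.to_csv('data/cr_parsed.csv', sep=';', encoding='utf-8',
                 index=False)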