Example #1
def fetch_and_fold(table, engine, targets, n_reports):
    """ function to fetch reports from vcare database

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # data used to train the model
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    # NB: `to_datetime` does not accept both `format` and `unit`
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    df_reports = sql2df(engine, table)\
        .loc[:, ['original_date', 'patient_id', 'report']]

    mask = [report is not None for report in df_reports['report']]

    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
    df_reports = df_reports.loc[mask]

    # joining features df with complete patient information
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
                                  right_on='id').drop('id', axis=1)
    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]

    # folding frames so that they have the same columns
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
    reports_folded = folder.fold(df_reports)

    reports_folded.dropna(inplace=True)
    reports_folded.drop_duplicates(subset=['value'], inplace=True)

    # taking only first `n_reports` reports
    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    reports_folded = reports_folded.groupby(key1, as_index=False)\
        .agg(group_dict)

    # parsing text reports
    # NB: `sections` is defined here but not passed to the parser below
    # (sections=None), so no section filtering is applied
    sections = ['examens complementaire', 'hopital de jour',
                'examen du patient']

    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
                           col_name='value')

    reports_folded['value'] = parser.transform(reports_folded)

    return reports_folded
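
A hypothetical call to the function above; the connection string and table
names are placeholders, not taken from the source:

    from sqlalchemy import create_engine

    # any SQLAlchemy engine pointing at the vcare database works here
    engine = create_engine('postgresql://user:password@localhost/vcare')
    reports_folded = fetch_and_fold(table='event_reports', engine=engine,
                                    targets='targets', n_reports=5)
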
Example #2
    def test_pipeline(self):
        """
        Tests transform_and_label on custom ReportsParser transformer
        """
        df = self.setUp()
        df = df[df['feature'] == 'feat2']
        df.index = pd.RangeIndex(len(df.index))

        # the Pipeline class itself is passed, not an instance; the `steps`
        # below are presumably used by `transform_and_label` to build it
        pipeline = Pipeline

        df_res = transform_and_label(df,
                                     'key1',
                                     'key2',
                                     'date',
                                     'feature',
                                     'value',
                                     pipeline,
                                     steps=[('parser',
                                             ReportsParser(headers=None)),
                                            ('tfidf', TfidfVectorizer())])

        expected = {
            'key1': ['1', '2', '2'],
            'key2': ['a1', 'a2', 'a2'],
            'feature': ['feat2_0', 'feat2_1', 'feat2_2'],
            'value': [1, 1, 1],
            'date': ['2018-06-14', '2018-05-22', '2017-03-01']
        }

        df_expected = pd.DataFrame(expected)

        assert_array_equal(df_expected, df_res.values)
        assert_list_equal(list(df_expected.columns), list(df_res.columns))
Example #3
    def test_remove_section(self):
        x = self.setUp()

        # trailing comma needed: ('title 1') is just a string, not a tuple
        parsed_x = ReportsParser(sections=('title 1',)).transform(x)
        res_text = parsed_x.values[0]

        expected_text = 'this is div 1 text 1 0 text 1 1 this is a span'

        assert_equal(res_text, expected_text)
Example #4
    def test_transform(self):
        x = self.setUp()

        parsed_x = ReportsParser(strategy='strings', norm=False).transform(x)
        res_text = parsed_x.values[0]

        expected_text = 'this is div 0 text 0 0 text 0 1 ' \
                        'this is div 1 text 1 0 text 1 1 this is a span'

        assert_equal(res_text, expected_text)
Example #5
    def test_transform2(self):
        x = self.setUp(False)

        parsed_x = ReportsParser(strategy='strings',
                                 headers='b',
                                 is_html=False).transform(x)
        res_text = parsed_x.values[0]

        expected_text = 'this is a text some other text conclusion text'

        assert_equal(res_text, expected_text)
Example #6
    def test_remove_section2(self):
        x = self.setUp(False)

        parsed_x = ReportsParser(sections=('other bold title',
                                           'last bold title'),
                                 headers='b',
                                 is_html=False).transform(x)
        res_text = parsed_x.values[0]

        expected_text = 'some other text conclusion text'

        assert_equal(res_text, expected_text)
Example #7
    def test_transform_tokens(self):
        x = self.setUp()
        parsed_x = ReportsParser(strategy='tokens', norm=True).transform(x)

        res_tokens = parsed_x.values[0]

        expected_tokens = [
            'this', 'is', 'div', '0', 'text', '0', '0', 'text', '0', '1',
            'this', 'is', 'div', '1', 'text', '1', '0', 'text', '1', '1',
            'this', 'is', 'a', 'span'
        ]

        assert_list_equal(res_tokens, expected_tokens)
Example #8
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from clintk.text_parser.parser import ReportsParser
from clintk.text2vec.transformers import AverageWords2Vector, Text2Vector
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

sns.set()

# load data
path = 'data/reports.csv'
df = pd.read_csv(path, sep=';', encoding='utf-8').head(1000)

# parse the reports
sections = ['hopital de jour', 'examen du patient', 'examens complementaires']
parser = ReportsParser(strategy='tokens', sections=sections)
X = parser.transform(df)

###################### vectorization using word2vec aggregation
# text2vec = AverageWords2Vector(sg=1, min_count=5).fit(X)
# x_w2v = text2vec.transform(X)
text2vec = Text2Vector()
text2vec.fit(X)
x_w2v = text2vec.transform(X)

# plot the result using t-sne reduction
X_embedded = TSNE().fit_transform(x_w2v)
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=10)
plt.show()

###################### vectorization using TF-IDF
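# A minimal sketch of what the truncated TF-IDF section likely contained,
# reusing the CountVectorizer/TfidfTransformer/Pipeline imports above; the
# pipeline and its parameters are illustrative, not from the source.
docs = [' '.join(tokens) for tokens in X]

tfidf_pipeline = Pipeline([('count', CountVectorizer()),
                           ('tfidf', TfidfTransformer())])
x_tfidf = tfidf_pipeline.fit_transform(docs)

# same t-SNE visualisation as above, this time on the TF-IDF vectors
X_embedded = TSNE().fit_transform(x_tfidf.toarray())
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=10)
plt.show()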
Example #9
def fetch_and_fold(path, engine, targets, n_reports):
    """ function to fetch reports from simbad data

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """

    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    df = pd.read_excel(path)

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)

    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # taking only consultation reports
    df = df[df['CR NAT'] == 'CC']

    # keep only reports whose date matches the sign-off date
    mask = (df['date'] == df['DATE_SIGN_OK'])
    df = df[mask]

    # df_rh = df[df['CR NAT'] == 'RH']

    # dropping missing values and duplicated reports
    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # concatenating the first `n_reports` reports for each patient
    group_dict = {
        'date': 'first',
        'DATE_SIGN_OK': 'last',
        'value': lambda g: ' '.join(g[:n_reports])
    }
    df = df.groupby('nip', as_index=False).agg(group_dict)

    # # filter uninformative reports and taking the first
    # df_rh = df_rh[~(df_rh['value'].str.match('Examen du', na=False))]
    # df_rh.dropna(inplace=True)
    # df_rh.drop_duplicates(subset=['value'], inplace=True)
    #
    # # taking only the first reports
    # df_rh = df_rh.groupby('nip', as_index=False).agg(group_dict)
    #
    # df = pd.concat([df_cc, df_rh], ignore_index=True)

    # removing useless tags (blocks parsing)
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    # filter date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    df = df.merge(df_targets, on='nip')

    parser = ReportsParser(headers='b',
                           is_html=False,
                           norm=False,
                           n_jobs=-1,
                           col_name='value')

    df['value'] = parser.transform(df)

    df['feature'] = ['report'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]
    df = df[df['value'] != '']
    df.drop_duplicates(inplace=True)

    return df
Example #10
"""
This script is intended to show the usage of word2vec for
our use-case
"""
import pandas as pd
import matplotlib.pyplot as plt

from gensim.models import word2vec
from sklearn.manifold import TSNE
from clintk.text_parser.parser import ReportsParser

path = '../data/reports.csv'

df = pd.read_csv(path, sep=';').head(1000)
parser = ReportsParser(strategy='tokens')
X = parser.transform(df)

# NB: gensim < 4 API; `size` was renamed to `vector_size` in gensim 4
w2v = word2vec.Word2Vec(X, size=128, window=8, sample=0.1)

# index through `wv`; indexing the model directly is deprecated
word_vectors = w2v.wv[w2v.wv.vocab]
words = w2v.wv.vocab

words_embedded = TSNE().fit_transform(word_vectors)
plt.scatter(words_embedded[:, 0], words_embedded[:, 1], s=10)

for i, word in enumerate(words):
    plt.annotate(word, xy=(words_embedded[i, 0], words_embedded[i, 1]))
plt.show()
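
Once trained, the embedding can also be queried directly; a small sketch (the
query token 'patient' is illustrative and must exist in the learned vocabulary):

    # nearest neighbours of a token in the embedding space
    print(w2v.wv.most_similar('patient', topn=5))
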
Example #11
def fetch_and_fold(path, engine, targets, n_reports):
    """ function to fetch radiology reports from vcare database

       Parameters
       ----------
       For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, [key2, 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    # NB: `read_excel` takes no `sep` argument; keep the first 8 columns
    df_rad = pd.read_excel(path, usecols=range(8), parse_dates=[1, 2, 5])

    # filter SESSION and keep prescreen values
    mask = df_rad['SESSION'].isin(['SCA ', 'IRM '])

    # mask_prescreen = df_rad['OBSERVATION DATE'] == 'Before Date'
    df_rad = df_rad[mask]  # [mask_prescreen]

    df_rad['CR_DATE'] = pd.to_datetime(df_rad['CR_DATE'], format='%Y%m%d')

    # remove useless rows
    df_rad = df_rad[~(df_rad['CONTENU_CR'].str.match('Examen du', na=False))]

    df_rad.rename({'CR_DATE': date}, axis=1, inplace=True)

    df_rad = df_rad.merge(df_targets, on=None, left_on='Nip',
                          right_on='nip').drop('Nip', axis=1)

    df_rad['patient_id'] = df_rad['id']
    df_rad.drop('id', axis=1, inplace=True)

    folder = Folder(key1, key2, ['CONTENU_CR'], date, n_jobs=-1)

    rad_folded = folder.fold(df_rad)

    rad_folded.dropna(inplace=True)
    rad_folded.drop_duplicates(subset=['value'], inplace=True)
    # concatenating the first `n_reports` reports per patient
    group_dict = {
        key2: 'first',
        'feature': 'first',
        date: 'last',
        'value': lambda g: ' '.join(g[:n_reports])
    }

    rad_folded = rad_folded.groupby(key1, as_index=False).agg(group_dict)

    # removing useless tags (blocks parsing)
    rad_folded['value'] = rad_folded.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesion(s) non cible(s)', 'nouvelles(s) lesion(s)',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :')

    parser = ReportsParser(headers='b',
                           is_html=False,
                           col_name='value',
                           sections=sections,
                           n_jobs=-1)

    rad_folded['value'] = parser.transform(rad_folded)

    rad_folded = rad_folded[rad_folded['value'] != '']
    rad_folded['feature'] = ['rad'] * rad_folded.shape[0]

    return rad_folded
Example #12
def parse_cr(path, engine, targets, n_reports):
    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    # e.g. path = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(path)

    df = df[df['CR NAT'] == 'CR']
    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # remove useless reports
    df = df[~(df['value'].str.match('Examen du', na=False))]

    # filter by date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)

    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first reports
    group_dict = {
        'date': 'first',
        'DATE_SIGN_OK': 'last',
        'value': lambda g: ' '.join(g[:n_reports])
    }
    df = df.groupby('nip', as_index=False).agg(group_dict)

    df = df.merge(df_targets, on='nip')

    # removing useless tags (blocks parsing)
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesions non cibles', 'nouvelles lesions',
                'lesion s non cible s', 'nouvelle s lesion s',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :', '')
    parser = ReportsParser(headers='b',
                           is_html=False,
                           col_name='value',
                           sections=sections,
                           n_jobs=-1)

    df['value'] = parser.transform(df)

    # dropping empty rows
    df = df[df['value'] != '']

    df['feature'] = ['rad'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    return df