Example #1
def get_preprocessed_patients(sample_size=25, rebuild_cache=False):
    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'
    
    # Build cache
    if not os.path.isfile(cache_file) or rebuild_cache:
        patients_out = []
        delta_efs_out = []
        patient_nums = range(906)
        for i in patient_nums:
            if i%100 == 0:
                logger.info(str(i) + '/' + str(patient_nums[-1]))
            patient_data = get_data([i])[0]
            if patient_data is not None:
                ef_delta = get_ef_delta(patient_data)
                if ef_delta is not None:
                    patients_out.append(patient_data['NEW_EMPI'])
                    delta_efs_out.append(ef_delta)
        with open(cache_file, 'w') as cache:
            cache_obj = {
                'patients': patients_out,
                'delta_efs': delta_efs_out
            }
            json.dump(cache_obj, cache)

    # Load from cache
    with open(cache_file, 'r') as f:
        cached = json.load(f)
    n = min(sample_size, len(cached['patients']))
    return cached['patients'][:n], cached['delta_efs'][:n]
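As a quick orientation, here is a minimal calling sketch for the function above; the module name `preprocessing` is an assumption (the snippet does not say where the function lives), and the hard-coded cache path must be writable.

# Hypothetical driver; only get_preprocessed_patients itself comes from the source.
from preprocessing import get_preprocessed_patients  # assumed module name

# The first call walks all 906 patients and builds the JSON cache; later calls
# just read it. Pass rebuild_cache=True to force a rebuild.
patients, delta_efs = get_preprocessed_patients(sample_size=50)
for empi, delta in zip(patients, delta_efs):
    print('%s: %.1f' % (empi, delta))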
Example #2
def __iter__(self):
    for i in self.patient_list:
        p = get_data([i])[0]
        self.status.write(p['NEW_EMPI'] + '\n')
        for category in categories:
            if category in p:
                for idx, doc in enumerate(p[category]):
                    tag = p['NEW_EMPI'] + '_' + category + '_' + str(idx) + '\n'
                    yield LabeledSentence(words=doc['free_text'].split(), tags=[tag])
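For context, this `__iter__` streams gensim `LabeledSentence` objects (one per free-text document, tagged with EMPI, category and index), which is the shape a Doc2Vec corpus expects. A rough sketch of the surrounding class follows; the constructor and the training call are assumptions, not part of the source.

import sys

class PatientSentences(object):
    """Hypothetical wrapper; only the __iter__ method above is from the source."""

    def __init__(self, patient_list, status=sys.stderr):
        self.patient_list = patient_list  # patient indices handed to get_data()
        self.status = status              # progress log, one EMPI per line

    # ... the __iter__ shown above would live here ...

# A streaming corpus like this lets Doc2Vec re-iterate over the notes on every
# epoch without holding them all in memory, e.g. with the older gensim API that
# still ships LabeledSentence:
#   from gensim.models import Doc2Vec
#   model = Doc2Vec(PatientSentences(range(100)), size=100, min_count=5, workers=4)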
Example #3
def preprocess(i):
    print "\nPreprocessing Medications - " + str(i)
    p = get_data([i])[0]

    for (i, m) in enumerate(p['Med']):
        (name, rxclasses) = get_rx_classes(m['Medication'], include_name=True)
        m['RXNORM_NAME'] = name
        m['RXNORM_CLASSES'] = rxclasses

    save(p)
Example #4
def remove_medication_preprocessing(num_patients):
    for i in range(num_patients):
        print "Removing Medication Preprocessing - " + str(i)
        p = get_data([i])[0]

        for m in p['Med']:
            del m['RXNORM_NAME']
            del m['RXNORM_CLASSES']

        save(p)
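One caveat with the loop above: `del` raises `KeyError` for any patient whose medications were never annotated. A defensive variant, sketched here purely as an illustration, swaps `del` for `dict.pop` with a default.

def remove_rx_annotations(p):
    """KeyError-tolerant version of the inner loop above (illustrative only)."""
    for m in p.get('Med', []):
        m.pop('RXNORM_NAME', None)
        m.pop('RXNORM_CLASSES', None)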
Example #5
def __iter__(self):
    for i in self.patient_list:
        p = get_data([i])[0]
        self.status.write(p['NEW_EMPI'] + '\n')
        for category in categories:
            if category in p:
                for idx, doc in enumerate(p[category]):
                    tag = p['NEW_EMPI'] + '_' + category + '_' + str(
                        idx) + '\n'
                    yield LabeledSentence(words=doc['free_text'].split(),
                                          tags=[tag])
Example #6
def jsonify_text(person_id):
    person = loader.get_data([person_id])[0]
    for key in person.keys():
        if lp.is_note_doc(key):
            for i in range(len(person[key])):
                doc = person[key][i]
                data = lp.parse_note_header(doc, key)
                data['free_text'] = doc
                person[key][i] = data
    with open('./data/patients/FAKE_EMPI_' + str(person_id) + '.json', 'w') as outfile:
        json.dump(person, outfile)
        print 'JSONIFIED PERSON ' + str(person_id)
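A small companion sketch for reading one of the per-patient files this writes; the patient id is a placeholder, and the structure check simply mirrors what `jsonify_text` stores (parsed header fields plus the raw note under 'free_text').

import json

person_id = 0  # placeholder id; assumes jsonify_text(0) has already run
with open('./data/patients/FAKE_EMPI_' + str(person_id) + '.json') as infile:
    person = json.load(infile)

# Every note document is now a dict of parsed header fields plus 'free_text'.
for key, docs in person.items():
    if isinstance(docs, list) and docs and isinstance(docs[0], dict) \
            and 'free_text' in docs[0]:
        print('%s: %d notes' % (key, len(docs)))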
Example #7
def preprocess_medications(num_patients):
    for i in range(num_patients):
        print "\nPreprocessing Medications - " + str(i) + " - progress: ",
        p = get_data([i])[0]

        for (i, m) in enumerate(p['Med']):
            if i%100 == 0:
                print ", " + str(i) + '/' + str(len(p['Med'])),
            (name, rxclasses) = get_rx_classes(m['Medication'], include_name=True)
            m['RXNORM_NAME'] = name
            m['RXNORM_CLASSES'] = rxclasses

        save(p)
Example #8
def preprocess_medications(num_patients):
    for i in range(num_patients):
        print "\nPreprocessing Medications - " + str(i) + " - progress: ",
        p = get_data([i])[0]

        for (i, m) in enumerate(p['Med']):
            if i % 100 == 0:
                print ", " + str(i) + '/' + str(len(p['Med'])),
            (name, rxclasses) = get_rx_classes(m['Medication'],
                                               include_name=True)
            m['RXNORM_NAME'] = name
            m['RXNORM_CLASSES'] = rxclasses

        save(p)
Example #9
def load_and_write_data(artifact: Artifact, key: str, location: str):
    """Loads data and writes it to the artifact if not already present.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    location
        The location associated with the data to load and the artifact to
        write to.

    """
    if key in artifact:
        logger.debug(f'Data for {key} already in artifact.  Skipping...')
    else:
        logger.debug(f'Loading data for {key} for location {location}.')
        data = loader.get_data(key, location)
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key, data)
    return artifact.load(key)
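A minimal call-site sketch, assuming the `Artifact` here is vivarium's HDF-backed artifact; the key and location strings are placeholders.

from vivarium.framework.artifact import Artifact  # assumed to be this Artifact

art = Artifact('/tmp/example_artifact.hdf')
data = load_and_write_data(art, 'population.structure', 'Ethiopia')

# A second call is cheap: the key is already in the artifact, so the loader is
# skipped and the stored copy is simply returned via artifact.load(key).
data_again = load_and_write_data(art, 'population.structure', 'Ethiopia')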
Example #10
def generate_train_test(levels):
    mask, lats, lons = get_mask_data()
    sat_data = get_data(lats.ravel(), lons.ravel(), levels)
    sat_layers = [
        exposure.equalize_adapthist(item[0].reshape(lats.shape).T,
                                    clip_limit=0.03) for item in sat_data
    ]
    m, n = mask.shape
    image_num = 0
    for x in range(0, m - SLIDING_WINDOW_SIZE, SLIDING_INCREMENT):
        for y in range(0, n - SLIDING_WINDOW_SIZE, SLIDING_INCREMENT):
            prepared_layers = map(
                lambda data: data[x:x + SLIDING_WINDOW_SIZE, y:y +
                                  SLIDING_WINDOW_SIZE], sat_layers)
            prepared_mask = mask[x:x + SLIDING_WINDOW_SIZE,
                                 y:y + SLIDING_WINDOW_SIZE]
            layer = np.dstack(prepared_layers)
            imsave(os.path.join(IMAGES_PATH,
                                str(image_num) + '.png'), np.flipud(layer))
            imsave(os.path.join(LABELS_PATH,
                                str(image_num) + '.png'),
                   np.flipud(prepared_mask))
            image_num += 1
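The number of tiles written is just the product of the two sliding ranges; a small helper, assuming the same semantics as the module-level SLIDING_WINDOW_SIZE and SLIDING_INCREMENT used above, makes the output volume easy to predict.

def count_tiles(m, n, window, step):
    """Tiles produced by the nested sliding-window loops above (sketch only)."""
    rows = len(range(0, m - window, step))
    cols = len(range(0, n - window, step))
    return rows * cols

# e.g. a 1000x1200 mask, 128-pixel windows, 64-pixel stride:
print(count_tiles(1000, 1200, 128, 64))  # 14 * 17 = 238 image/label pairs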
Example #11
def make_test_file(filepath, env):
    '''
    Build the test-data matrix X for the LSTM model.

    Parameters
    ----------
    filepath : string
        filepath of the current test data
    env : Env
        environment/configuration of the system

    Returns
    -------
    X : 3d matrix (batch_size, timestep (=lookback), 1 (=feature))
        matrix X in the shape expected by the LSTM model
    dataX : matrix (feature * timestep)
        the original data X; when system.debug is 1, the label matrix Y is
        returned as well
    '''
    feature = env.get_config("data", "feature", type="list")
    time_slice = env.get_config("data", "time_slice", type="int")
    is_debug = env.get_config("system", "debug", type="int")

    # load data
    dataX = ld.get_data(filepath, feature)
    # simplify data
    # dataX = pp.mean_simplify(dataX, len(feature), time_slice)

    # make LSTM data
    X = make_LSTM_X(dataX, env)
    if is_debug == 1:
        label_path = env.get_config("path", "label_path")
        labeldata = lb.load_label(label_path)
        label = lb.data_labeling(dataX, filepath, labeldata)
        Y = make_LSTM_Y(label, env)
        return X, dataX, Y
    else:
        return X, dataX
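A minimal call sketch; the `Env` construction and the file path are placeholders, since both are project-specific.

# Hypothetical driver for make_test_file; Env construction is an assumption.
env = Env('config.ini')                       # assumed project Env class
X, dataX = make_test_file('data/test_01.csv', env)

# With system.debug set to 1 in the config, labels are returned as well:
#   X, dataX, Y = make_test_file('data/test_01.csv', env)
# X has shape (batch_size, lookback, 1) and can be fed directly to an LSTM.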
Example #12
File: main.py  Project: aroville/sacem_mfcc
import extractor
import loader
from sklearn.ensemble import RandomForestClassifier
from os.path import join
import os
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.base import clone
from time import time

extractor.extract_features_from_yt_audioset()
extractor.extract_features_from_augmented_audioset()
x, y = loader.get_data()

print(len(x))
print(len(x[0]))
print(len(y))

n_components = 8
print('Using PCA on dataset: keeping %s features' % n_components)
pca = PCA(n_components=n_components)
pca.fit(x, y)

x = pca.transform(x)
print('Explained variance ratio: %s' % pca.explained_variance_ratio_)
print('\n\n')

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

fix_params = dict(min_samples_split=3,
Example #13
dead_patients = []
gender = []
age = []
is_cancer = []
consult_time = []
cancer_time = []
cancer_mgh_time = []
cardio_mgh_time = []
noncancer_time = []
utilization = []
utilization_cancer = []
utilization_mgh_cancer = []
utilization_noncancer = []
utilization_mgh_cardio = []
for i in range(num_patients):
    p = get_data([i])[0]
    # Filter to only dead patients
    if p['Vital_status'] == 'Date of Death reported from SS Death Master File':
        if p['Consult_Date'] not in [None, '']:
            dead_patients.append(p['EMPI'])
            gender.append(p['Gender'])

            # Dates
            dob = datetime.strptime(p['Date_of_Birth'], "%m/%d/%Y")
            dod = datetime.strptime(p['Date_Of_Death\r'], "%m/%d/%Y")
            doc = datetime.strptime(p['Consult_Date'], "%m/%d/%Y")
            age.append((dod - dob).days / 365.0)
            timing = dod - doc
            consult_time.append(timing.days)

            # Diagnoses
Example #14
def plot_num_docs(patient_range=range(90)):
    rel_dates = dict()
    keyword_counts = dict()
    keywords = [
        'ef\w+(.+)%', 'ejection fraction:\w*(.+)%', 'ef of (.+)%',
        'ejection fraction of (.+)%', 'ef is (.+)%', 'ef:\w*(.+)%',
        'ejection fraction is (.+)%', 'ef:\w*(.+)%'
    ]
    overall_counts = dict()
    for i in patient_range:
        if i % 25 == 0:
            print i
        data = get_data([i])[0]
        rel_dates = get_doc_rel_dates(data, rel_dates, True)
        #keyword_counts = get_doc_keywords(data, keywords, keyword_counts, True)
        #ef_occurances = get_ef_values(data, car_only = True)
        if False and len(ef_occurances) > 2: #REMOVE FALSE TO SEE PLOTS
            dates, efs = zip(*ef_occurances)
            pl.figure()
            pl.scatter(dates, efs)
            pl.show()
        #for doc in keyword_counts:
        #    s = 0
        #    for key in keyword_counts[doc]:
        #        s += len(keyword_counts[doc][key])
        #    
        #    if not doc in overall_counts:
        #        overall_counts[doc] = [s]
        #    else:
        #        overall_counts[doc] += [s]
    
    # print overall_counts['Car']
    # pl.figure()
    # pl.hist(overall_counts['Car'])
    # pl.show()

    #for keyword in keyword_counts:
    #    print keyword, ": ", str(sum(keyword_counts[keyword]))
    #for doc in keyword_counts:
    #    print doc
    #    for keyword in keyword_counts[doc]:
    #        print "\t", keyword, ": ", str(sum(keyword_counts[doc][keyword]))
    note_deltas = []
    struct_deltas = []
    for doc_type in rel_dates:
        if is_note_doc(doc_type):
            note_deltas += [x.days for x in rel_dates[doc_type]]
        else:
            struct_deltas += [x.days for x in rel_dates[doc_type]]
    for word in keyword_counts:
        keyword_counts[word] = [x.days for x in rel_dates[doc_type]]
    
    bins = 100    
    print
    print "Notes: ", len(note_deltas)
    print "Structs: ", len(struct_deltas)
    pl.figure()
    h = pl.hist([note_deltas, struct_deltas], bins, stacked=True,
                color=['blue', 'red'],
                label=['Number of sentences in\nunstructured notes',
                       'Number of structured entries'])
    pl.legend(loc=2)
    pl.title("Frequency of Occurrences of New Data in Patient")
    pl.xlabel("Days Since Implant Procedure")
    pl.ylabel("Number of Pieces of Information")
    pl.show()


    for word in keyword_counts:
        pl.figure()
        pl.hist(keyword_counts[word], bins, color=['blue'])
        pl.title("Occurrences of " + word + " in corpus at time from procedure")
        pl.show()
Example #15
def get_dataset():
    # get training data and test data from MNIST dataset
    (train_x_from_dataset, train_y_from_dataset,
     test_x_from_dataset, test_y_from_dataset) = mnist.get_data()
    # preprocess for training data
    # get data with label 6 and 9
    index_6 = np.where(train_y_from_dataset == 6)
    index_9 = np.where(train_y_from_dataset == 9)
    # shuffle data
    index = np.concatenate([index_6[0], index_9[0]])
    np.random.seed(1)
    np.random.shuffle(index)
    # get data that we want (data with label 6 and 9)
    train_y = train_y_from_dataset[index]
    train_x = train_x_from_dataset[index]
    # relabel: samples with label 6 become 0, samples with label 9 become 1
    train_y[np.where(train_y == 6)] = 0
    train_y[np.where(train_y == 9)] = 1
    # preprocess for test data
    index_6 = np.where(test_y_from_dataset == 6)
    index_9 = np.where(test_y_from_dataset == 9)
    index = np.concatenate([index_6[0], index_9[0]])
    np.random.shuffle(index)
    test_y = test_y_from_dataset[index]
    test_x = test_x_from_dataset[index]
    test_y[np.where(test_y == 6)] = 0
    test_y[np.where(test_y == 9)] = 1
    return train_x, train_y, test_x, test_y
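A quick sanity check on the returned 6-vs-9 subset; only the `get_dataset()` call itself is from the source, the assertions are illustrative.

import numpy as np

train_x, train_y, test_x, test_y = get_dataset()

assert set(np.unique(train_y)) <= {0, 1}  # labels remapped: 6 -> 0, 9 -> 1
assert set(np.unique(test_y)) <= {0, 1}
assert len(train_x) == len(train_y) and len(test_x) == len(test_y)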
Example #16
File: tables.py  Project: Jdhaimson/NLP_CRT
                after_date = rel_date
                dist_from_thresh = dist
    if before is not None and after is not None:
        return (after - before, before, after, before_date, after_date)
    else:
        return (None, None, None, None, None)

# Collect statistics
has_procedure = 0
has_baseline = 0 
no_baseline = []
has_followup = 0
stats = defaultdict(list) 
total = 1056
for i in range(total - 1):
    p = get_data([i])[0]
    print str(i) + " - " + p['EMPI']

    procedure_date = get_operation_date(p)
    if procedure_date:
        has_procedure += 1
        (ef_delta, baseline_ef, followup_ef, baseline_date, followup_date) = get_ef_delta(p)    
        if not baseline_ef:
            no_baseline.append(p['EMPI'])
        if baseline_ef and baseline_date > -60:
            has_baseline += 1
            if followup_date > 100 and followup_date < 500:
                has_followup += 1
                stats['procedure_date'].append(procedure_date)
                stats['baseline_days'].append(baseline_date)
                stats['followup_days'].append(followup_date)
Example #17
def plot_num_docs(patient_range=range(90)):
    rel_dates = dict()
    keyword_counts = dict()
    keywords = [
        'ef\w+(.+)%', 'ejection fraction:\w*(.+)%', 'ef of (.+)%',
        'ejection fraction of (.+)%', 'ef is (.+)%', 'ef:\w*(.+)%',
        'ejection fraction is (.+)%', 'ef:\w*(.+)%'
    ]
    overall_counts = dict()
    for i in patient_range:
        if i % 25 == 0:
            print i
        data = get_data([i])[0]
        rel_dates = get_doc_rel_dates(data, rel_dates, True)
        #keyword_counts = get_doc_keywords(data, keywords, keyword_counts, True)
        #ef_occurances = get_ef_values(data, car_only = True)
        if False and len(ef_occurances) > 2:  #REMOVE FALSE TO SEE PLOTS
            dates, efs = zip(*ef_occurances)
            pl.figure()
            pl.scatter(dates, efs)
            pl.show()
        #for doc in keyword_counts:
        #    s = 0
        #    for key in keyword_counts[doc]:
        #        s += len(keyword_counts[doc][key])
        #
        #    if not doc in overall_counts:
        #        overall_counts[doc] = [s]
        #    else:
        #        overall_counts[doc] += [s]

# print overall_counts['Car']
# pl.figure()
# pl.hist(overall_counts['Car'])
# pl.show()

#for keyword in keyword_counts:
#    print keyword, ": ", str(sum(keyword_counts[keyword]))
#for doc in keyword_counts:
#    print doc
#    for keyword in keyword_counts[doc]:
#        print "\t", keyword, ": ", str(sum(keyword_counts[doc][keyword]))
    note_deltas = []
    struct_deltas = []
    for doc_type in rel_dates:
        if is_note_doc(doc_type):
            note_deltas += [x.days for x in rel_dates[doc_type]]
        else:
            struct_deltas += [x.days for x in rel_dates[doc_type]]
    for word in keyword_counts:
        keyword_counts[word] = [x.days for x in rel_dates[doc_type]]

    bins = 100
    print
    print "Notes: ", len(note_deltas)
    print "Structs: ", len(struct_deltas)
    pl.figure()
    h = pl.hist([note_deltas, struct_deltas],
                bins,
                stacked=True,
                color=['blue', 'red'],
                label=[
                    'Number of sentences in\nunstructured notes',
                    'Number of structured entries'
                ])
    pl.legend(loc=2)
    pl.title("Frequency of Occurances of New Data in Patient")
    pl.xlabel("Days Since Implant Procedure")
    pl.ylabel("Number of Pieces of Information")
    pl.show()

    for word in keyword_counts:
        pl.figure()
        pl.hist(keyword_counts[word], bins, color=['blue'])
        pl.title("Occurances of " + word + " in corpus at time from procedure")
        pl.show()