예제 #1
0
def read_disease_data(feat_file, fname_file, metadata_file, drop_nans=True, export_nan_worms=False):
    """Read Tierpsy feature summaries and match them to experiment metadata.

    Matches the compiled feature summaries to the metadata, normalises the
    imaging date column, aligns the three bluelight conditions into wide
    format (one row per well) and optionally drops worms whose feature
    vectors contain NaNs.

    Parameters
    ----------
    feat_file : pathlib.Path
        Path to the compiled Tierpsy features summaries CSV
        (must be a Path: its ``.parent`` is used as the NaN-worms save location).
    fname_file : str or pathlib.Path
        Path to the compiled Tierpsy filenames summaries CSV.
    metadata_file : str or pathlib.Path
        Path to the compiled experiment metadata CSV.
    drop_nans : bool, optional
        If True (default), drop worms with NaN features via ``drop_nan_worms``.
    export_nan_worms : bool, optional
        Unused in this function; kept for backward compatibility with existing
        callers. The default is False.

    Returns
    -------
    feat : pandas.DataFrame
        Feature summaries, bluelight conditions aligned as separate columns.
    meta : pandas.DataFrame
        Metadata aligned row-for-row with ``feat``.
    """
    feat, meta = read_hydra_metadata(feat_file,
                                     fname_file,
                                     metadata_file)

    # Normalise the imaging date to a datetime.date for grouping/plotting
    meta['imaging_date_yyyymmdd'] = pd.to_datetime(meta.imaging_date_yyyymmdd,
                                                   format='%Y%m%d').dt.date

    # Sanity check: worm strains and worm genes must map one-to-one
    assert meta.worm_strain.unique().shape[0] == meta.worm_gene.unique().shape[0]

    # 'inner' removes wells that don't have all 3 bluelight conditions
    feat, meta = align_bluelight_conditions(feat,
                                            meta,
                                            how='inner')

    if drop_nans:
        feat, meta = drop_nan_worms(feat, meta, saveto=feat_file.parent)

    return feat, meta
예제 #2
0
    parser.add_argument("--save_dir", help="Directory to save super-plots", default=None)
    parser.add_argument('--feature_list_from_csv', help="Path to saved list of selected features\
                        to plot (CSV)", nargs='+', default=None)
    args = parser.parse_args()

    # Basic sanity checks on the supplied paths
    # NOTE(review): assert is stripped under `python -O`; consider raising instead
    assert Path(args.compiled_metadata_path).exists()
    assert Path(args.results_dir).is_dir()
    # Default save location: <results parent>/Analysis/Superplots
    args.save_dir = (args.save_dir if args.save_dir is not None else 
                     Path(args.results_dir).parent / "Analysis" / "Superplots")

    # Compiled Tierpsy summaries are expected in the results directory
    combined_feats_path = Path(args.results_dir) / "full_features.csv"
    combined_fnames_path = Path(args.results_dir) / "full_filenames.csv"
    
    # NB: leaves the df in a "long format" that seaborn 'likes'   
    features, metadata = read_hydra_metadata(feat_file=combined_feats_path,
                                             fname_file=combined_fnames_path,
                                             meta_file=args.compiled_metadata_path,
                                             add_bluelight=True)

    # Convert metadata column dtypes, ie. stringsAsFactors, no floats, Δ, etc
    metadata = fix_dtypes(metadata)
    # Replace the Greek delta in deletion-strain food names for safe file naming
    metadata['food_type'] = [f.replace("Δ","_") for f in metadata['food_type']]
    
    features, metadata = clean_summary_results(features, metadata)
        
    # Load feature list from file
    if args.feature_list_from_csv is not None:
        # NOTE(review): nargs='+' makes args.feature_list_from_csv a *list* of
        # strings, so Path(args.feature_list_from_csv) raises TypeError here;
        # probably intended args.feature_list_from_csv[0] — confirm with caller
        assert Path(args.feature_list_from_csv).exists()
        
        # First column of the CSV holds the feature names to plot
        feature_list = pd.read_csv(args.feature_list_from_csv)
        feature_list = list(feature_list[feature_list.columns[0]].unique())
    elif args.n_top_feats is not None:
예제 #3
0
    # Split the discovered summary files into features/filenames lists and
    # sort both by window number so they pair up positionally
    feat_files = [f for f in summary_files if 'features' in str(f)]
    feat_files.sort(key=find_window)
    fname_files = [f for f in summary_files if 'filenames' in str(f)]
    fname_files.sort(key=find_window)

    # NOTE(review): asserting a generator expression is always truthy — this
    # check can never fail. Wrap in all(...) to actually verify the pairing.
    assert (find_window(f[0]) == find_window(f[1]) for f in list(zip(feat_files, fname_files)))

    # Read each features/filenames pair, match it to metadata, align the
    # bluelight conditions, then concatenate across all windows
    feat_df = []
    meta_df = []
    for c,f in enumerate(list(zip(feat_files, fname_files))):
        _feat = pd.read_csv(f[0],
                            comment='#')
        _fname = pd.read_csv(f[1],
                             comment='#')
    
        _feat, _meta = read_hydra_metadata(_feat, _fname, meta)
    
        # 'inner' keeps only wells with all bluelight conditions present
        _feat, _meta = align_bluelight_conditions(_feat,
                                                  _meta,
                                                  how='inner')
        # Record which window this summary came from
        _meta['window'] = find_window(f[0])
        meta_df.append(_meta)
        feat_df.append(_feat)
    
    # Features and metadata must stay row-aligned after concatenation
    assert pd.concat(meta_df).shape[0] == pd.concat(feat_df).shape[0]

    meta = pd.concat(meta_df)
    meta.reset_index(drop=True, inplace=True)
    feat = pd.concat(feat_df)
    feat.reset_index(drop=True, inplace=True)
    
예제 #4
0
    # Compile experiment metadata from the auxiliary files directory unless a
    # compiled metadata CSV already exists
    if not metadata_path.exists():
        metadata_df = compile_metadata(args.aux_dir,
                                       imaging_dates=args.imaging_dates)
    else:
        metadata_df = pd.read_csv(metadata_path, index_col=False)

    # add well annotations to metadata
    annotated_metadata_path = Path(
        str(metadata_path).replace('.csv', '_annotated.csv'))
    if not annotated_metadata_path.exists():
        metadata_df = update_metadata_with_wells_annotations(
            Path(args.aux_dir), saveto=annotated_metadata_path)

    # read metadata + features summaries
    # (reads from the annotated metadata *file*, so the metadata_df built
    # above only served to ensure the annotated file exists on disk)
    features_df, metadata_df = read_hydra_metadata(
        feat_file=args.features_file,
        fname_file=args.filenames_file,
        meta_file=annotated_metadata_path)

    # align bluelight conditions (as separate feature columns)
    features_df, metadata_df = align_bluelight_conditions(
        features_df,
        metadata_df,
        merge_on_cols=['date_yyyymmdd', 'imaging_plate_id', 'well_name'])

    ### clean data

    # remove rows with missing strain information (n=10)
    metadata_df = metadata_df[~metadata_df[args.strain_colname].isna()]
    # keep the features table row-aligned with the filtered metadata
    features_df = features_df.reindex(metadata_df.index)

    # subset for Tierpsy features only
)
# Path to mode-of-action annotations for all compounds in the strain screens
moa_file = '/Users/em812/Data/Drugs/StrainScreens/AllCompoundsMoA.csv'

# Well-level QC flag columns set during annotation
bad_well_cols = [
    'is_bad_well_from_gui', 'is_bad_well_misplaced_plate',
    'is_bad_well_ledfailure'
]
#%% Read data
feat = pd.read_csv(feat_file, comment='#')
fname = pd.read_csv(fname_file, comment='#')

meta = pd.read_csv(metadata_file, index_col=None)
# A missing drug_type means no compound was added to the well
meta.loc[meta['drug_type'].isna(), 'drug_type'] = 'NoCompound'

# Match metadata to feature summaries
feat, meta = read_hydra_metadata(feat, fname, meta)
feat, meta = align_bluelight_conditions(feat, meta, how='outer')

# Only the metadata is needed below; free the (large) features table
del feat

meta_colnames = list(meta.columns)
print(meta_colnames)
#%% Choose the videos
# Keep only N2s
meta = meta[meta['worm_strain'] == 'N2']

# Remove wells missing bluelight conditions
imgstore_cols = [col for col in meta.columns if 'imgstore_name' in col]
miss = meta[imgstore_cols].isna().any(axis=1)
meta = meta.loc[~miss, :]
        ]

        # Compile feature summaries for matched features/filename summaries
        compile_features_summaries.compile_tierpsy_summaries(
            feat_files=feat_files,
            compiled_feat_file=combined_feats_path,
            compiled_fname_file=combined_fnames_path,
            fname_files=fname_files)

        # Read features/filename summaries
        feature_summaries = pd.read_csv(combined_feats_path, comment='#')
        filename_summaries = pd.read_csv(combined_fnames_path, comment='#')

        # Match feature summaries to metadata (bluelight not aligned here)
        features, metadata = hydra_metadata.read_hydra_metadata(
            feature_summaries,
            filename_summaries,
            metadata,
            add_bluelight=False)

        # Impute/drop NaNs and optionally remove size-related features
        features, metadata = clean_features_summaries(
            features,
            metadata,
            featurelist=None,
            imputeNaN=True,
            nan_threshold=nan_threshold,
            filter_size_related_feats=filter_size_related_feats)
        # Join metadata + results
        fullresults = metadata.join(features)

        # Save full results to file
        fullresults.to_csv(full_results_path, index=False)
예제 #7
0
def process_feature_summaries(metadata_path,
                              results_dir,
                              compile_day_summaries=True,
                              imaging_dates=None,
                              align_bluelight=True,
                              window_summaries=False,
                              n_wells=96):
    """ Compile feature summary results and join with metadata to produce
        combined full feature summary results
        
        Parameters
        ----------
        metadata_path : str, Path
            Path to the compiled experiment metadata CSV
        results_dir : str, Path
            Path to 'Results' directory, containing Tierpsy feature summaries files
        compile_day_summaries : bool
            Compile from Tierpsy feature summaries for each experiment day
        imaging_dates : list of str, None
            List of imaging dates to compile Tierpsy feature summaries from. If None, will use 
            'date_yyyymmdd' column of metadata
        align_bluelight : bool
            Align bluelight conditions (convert to wide format)
        window_summaries : bool
            Compile from windowed features summaries files
        n_wells : int
            Number of wells per plate (forwarded to the window summaries compiler)
        
        Returns
        -------
        features, metadata
        
    """

    from tierpsytools.read_data.compile_features_summaries import compile_tierpsy_summaries
    from tierpsytools.read_data.hydra_metadata import read_hydra_metadata, align_bluelight_conditions
    from preprocessing.compile_window_summaries import find_window_summaries, compile_window_summaries

    # Compiled output filenames differ for windowed vs whole-video summaries
    combined_feats_path = Path(results_dir) / ("full_features.csv"
                                               if not window_summaries else
                                               "full_window_features.csv")
    combined_fnames_path = Path(results_dir) / ("full_filenames.csv"
                                                if not window_summaries else
                                                "full_window_filenames.csv")

    # Plain boolean `and` (np.logical_and is for arrays, not two scalars)
    if combined_feats_path.is_file() and combined_fnames_path.is_file():
        print("Found existing full feature summaries")
    else:
        print("Compiling feature summary results")
        if window_summaries:
            print("\nFinding window summaries files..")
            fname_files, feat_files = find_window_summaries(
                results_dir=results_dir, dates=imaging_dates)

            # compile window summaries files
            print("\nCompiling window summaries..")
            compiled_filenames, compiled_features = compile_window_summaries(
                fname_files=fname_files,
                feat_files=feat_files,
                compiled_fnames_path=combined_fnames_path,
                compiled_feats_path=combined_feats_path,
                results_dir=Path(results_dir),
                window_list=None,
                n_wells=n_wells)
        else:
            if compile_day_summaries:
                if imaging_dates is not None:
                    assert isinstance(imaging_dates, list)
                    feat_files = []
                    fname_files = []
                    # search each imaging-date subdirectory separately
                    for date in imaging_dates:
                        date_dir = Path(results_dir) / date
                        feat_files.extend(
                            list(
                                Path(date_dir).rglob('features_summary*.csv')))
                        fname_files.extend(
                            list(
                                Path(date_dir).rglob(
                                    'filenames_summary*.csv')))
                else:
                    feat_files = list(
                        Path(results_dir).rglob('features_summary*.csv'))
                    fname_files = [
                        Path(str(f).replace("/features_", "/filenames_"))
                        for f in feat_files
                    ]
            else:
                feat_files = list(
                    Path(results_dir).glob('features_summary*.csv'))
                fname_files = list(
                    Path(results_dir).glob('filenames_summary*.csv'))

            # Keep only features files for which matching filenames_summaries exist
            # NOTE(review): np.unique never yields None, so the `fn is not None`
            # filters are no-ops and zip() silently truncates to the shorter
            # (sorted, deduplicated) list — this does NOT verify pairing.
            # Kept as-is to preserve behaviour; confirm intent before tightening.
            feat_files = [
                ft for ft, fn in zip(np.unique(feat_files),
                                     np.unique(fname_files)) if fn is not None
            ]
            fname_files = [
                fn for fn in np.unique(fname_files) if fn is not None
            ]

            # Exclude windowed summaries from whole-video compilation
            feat_files = [ft for ft in feat_files if 'window' not in str(ft)]
            fname_files = [fn for fn in fname_files if 'window' not in str(fn)]

            # Compile feature summaries for matched features/filename summaries
            compile_tierpsy_summaries(feat_files=feat_files,
                                      fname_files=fname_files,
                                      compiled_feat_file=combined_feats_path,
                                      compiled_fname_file=combined_fnames_path)

    # Read metadata + record column order (ID-like columns kept as strings)
    metadata = pd.read_csv(metadata_path,
                           dtype={
                               "comments": str,
                               "source_plate_id": str,
                               "imaging_run_number": str
                           })
    meta_col_order = metadata.columns.tolist()

    # Non-feature columns emitted by Tierpsy that identify each summary row
    feat_id_cols = ['file_id', 'n_skeletons', 'well_name', 'is_good_well']

    # if there are no well annotations in metadata, omit 'is_good_well' from feat_id_cols
    if 'is_good_well' not in meta_col_order:
        feat_id_cols = [f for f in feat_id_cols if f != 'is_good_well']
    if window_summaries:
        feat_id_cols.append('window')

    # Read features summaries + metadata and add bluelight column if aligning bluelight video results
    features, metadata = read_hydra_metadata(combined_feats_path,
                                             combined_fnames_path,
                                             metadata_path,
                                             feat_id_cols=feat_id_cols,
                                             add_bluelight=align_bluelight)

    if align_bluelight:
        # Wide format: one row per well; 'outer' keeps wells that are missing
        # one of the bluelight conditions
        features, metadata = align_bluelight_conditions(
            feat=features,
            meta=metadata,
            how='outer',
            merge_on_cols=[
                'date_yyyymmdd', 'imaging_run_number', 'imaging_plate_id',
                'well_name'
            ])
        # 'imgstore_name' is split into per-condition columns by the alignment
        meta_col_order.remove('imgstore_name')

    assert set(features.index) == set(metadata.index)

    # record new columns
    assert len(set(meta_col_order) - set(metadata.columns)
               ) == 0  # ensure no old columns were dropped
    new_cols = list(set(metadata.columns) - set(meta_col_order))
    meta_col_order.extend(new_cols)

    return features, metadata[meta_col_order]
    # One window number per windowed features file (order matches window_feats)
    window_list = [find_window(p) for p in window_feats]
    
    for w in tqdm(window_list):
        
        compiled_feat_path = window_dir / 'features_summary_compiled_window_{}.csv'.format(str(w))
        compiled_file_path = window_dir / 'filenames_summary_compiled_window_{}.csv'.format(str(w))
        
        # features/filenames summaries belonging to this window only
        w_feat = [f for f in window_feats if find_window(f) == w]
        w_file = [f for f in window_files if find_window(f) == w]
        
        # assert that only one windows file exists for that window
        assert len(w_feat) == 1 and len(w_file) == 1
        
        # compile features/filenames summaries for window
        compile_tierpsy_summaries(feat_files = w_feat, 
                                  fname_files = w_file,
                                  compiled_feat_file = compiled_feat_path,
                                  compiled_fname_file = compiled_file_path)
        
    # Re-discover the per-window compiled files that were just written
    compiled_window_feats, compiled_window_files = find_window_summaries(window_dir, pattern='*_compiled_window_*.csv')

    features_path = Path(args.project_dir) / 
    metadata_path = Path()
    
    # load metadata
    features, metadata = read_hydra_metadata()
        
    toc = time()
    print("Done in %.1f seconds" % (toc - tic))

예제 #9
0
#%%

if __name__ == "__main__":
    print("Running: %s" % os.path.basename(sys.argv[0]))
    tic = time.time()  # start timer for the whole run

    # Read in syngenta screen compiled metadata, features & filenames summaries
    metadata = pd.read_csv(metadata_path)
    featSums = pd.read_csv(featSums_path, comment='#')  #dtype={'':str})
    fileSums = pd.read_csv(fileSums_path, comment='#')

    # Align metadata and feature summaries
    feats, metadata = tt_hm.read_hydra_metadata(feat=featSums,
                                                fname=fileSums,
                                                meta=metadata,
                                                add_bluelight=True,
                                                feat_id_cols=feat_id_cols)
    print(metadata.bluelight.unique())

    #    feats, metadata = tt_hm.align_bluelight_conditions(feats, metadata, how='outer',
    #                                                       return_separate_feat_dfs = False)

    # Drop bad wells from metadata + feature summaries
    feats, metadata = dropBadWells(feats, metadata)

    # Add columns for time washed off food, 'time_washed'
    metadata = addTimeWashed(metadata)

    # Calculate duration spent in M9 buffer, 'duration_in_M9'
    # NB: duration_in_M9 = middle_wormsorter_time - time_washed