def get_annotations(variants):
    """Fetch DNA variant annotations from the Mission Bio annotations API.

    Parameters
    ----------
    variants : list-like of str
        Variant ids using ':' / '/' separators; these are normalized to
        '-' before querying the API.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Variant', 'Gene', 'Function', 'Protein', 'Coding Impact',
        'ClinVar', 'DANN' — one row per input variant, in input order.
    """
    interface.status('Fetching DNA annotations')

    # The API expects dash-separated variant ids.
    renamed_variants = np.array(
        [var.replace(':', '-').replace('/', '-') for var in variants],
        dtype='str')
    url = ('https://api.missionbio.io/annotations/v1/variants?ids='
           + ','.join(renamed_variants))

    # Bound the request so a stalled server cannot hang the UI forever.
    r = requests.get(url=url, timeout=30)
    data = [d['annotations'] for d in r.json()]

    function = [', '.join(d['function']['value']) for d in data]
    gene = [d['gene']['value'] for d in data]
    protein = [d['protein']['value'] for d in data]
    coding_impact = [d['protein_coding_impact']['value'] for d in data]
    clinvar = [', '.join(d['clinvar']['value']) for d in data]

    # Missing DANN scores come back as '' — treat them as 0 before the
    # float conversion.
    dann = np.array([d['impact']['value'] for d in data])
    dann[dann == ''] = 0
    dann = np.round(dann.astype(float), 2)

    annot_types = [
        'Gene', 'Function', 'Protein', 'Coding Impact', 'ClinVar', 'DANN'
    ]
    df = pd.DataFrame([gene, function, protein, coding_impact, clinvar, dann],
                      index=annot_types).T
    df['Variant'] = variants
    df = df[['Variant'] + annot_types]
    return df
def run(sample, name, should_save):
    """Copy analysis results onto the original assays and optionally save.

    Metadata and row attributes accumulated on the working DNA/protein
    assays are mirrored onto the originals; when `should_save` is set the
    sample is written to DFT.ROOT / 'h5/analyzed/<name>.h5'.
    """
    pairs = ((sample.dna, sample._original_dna),
             (sample.protein, sample._original_protein))
    for working, original in pairs:
        if working is None:
            continue
        # Mirror everything so it survives a save/reload cycle.
        for key in working.metadata:
            original.add_metadata(key, working.metadata[key])
        for key in working.row_attrs:
            original.add_row_attr(key, working.row_attrs[key])

    if not should_save:
        return

    interface.status('Saving h5 file.')
    if name == '':
        interface.error('Please provide a name to save by.')
    elif name.endswith('.h5'):
        # Strip the extension; it is re-added when building the path.
        name = name[:-3]

    target = DFT.ROOT / f'h5/analyzed/{name}.h5'
    try:
        # Remove any previous save under the same name.
        os.remove(target)
    except FileNotFoundError:
        pass

    snapshot = sample[:]
    set_defaults(snapshot)
    mio.save(snapshot, target)
    interface.status('Saved.')
    interface.rerun()
def cluster(assay, method_func, description, **kwargs):
    """Cluster `assay` via `method_func`, skipping redundant re-runs.

    The assay metadata records a description of the last clustering; when
    it matches `description` and the assay is already marked clustered,
    nothing is done. A 'similarity' kwarg, if supplied, is consumed here
    (not forwarded to `method_func`) and used for post-cluster cleanup.
    """
    # pop() replaces the original `in`-check + `del` pair — one lookup,
    # identical semantics.
    similarity = kwargs.pop('similarity', None)

    already_done = (
        DFT.CLUSTER_DESCRIPTION in assay.metadata
        and assay.metadata[DFT.CLUSTER_DESCRIPTION] == description
        and assay.metadata[DFT.CLUSTERED])
    if not already_done:
        interface.status(f'Clustering {assay.name.replace("_", " ")}')
        method_func(**kwargs)
        if similarity is not None:
            assay.cluster_cleanup(AF_MISSING, similarity)
        # Record what was run so the next call can detect a no-op.
        assay.add_metadata(DFT.CLUSTER_DESCRIPTION, description)
        assay.add_metadata(DFT.CLUSTERED, True)
def download(link):
    """Download an h5 file from S3 and return its local path.

    Parameters
    ----------
    link : str
        An s3 URI of the form 's3://bucket/path/to/file.h5'.

    Returns
    -------
    str
        Path of the downloaded file under DFT.ROOT / 'h5/downloads'.
    """
    interface.status('Downloading from s3.')
    s3 = boto3.client('s3')

    # Split 's3://bucket/key/parts...' into bucket and object key.
    parts = link.replace('s3://', '').split('/')
    bucket, file = parts[0], '/'.join(parts[1:])

    # BUG FIX: the destination path previously contained a literal
    # placeholder instead of the file's basename, so every download
    # collided on one bogus filename. Use the key's basename.
    basename = file.split('/')[-1]
    filename = str(DFT.ROOT / f'h5/downloads/{basename}')

    try:
        s3.download_file(bucket, file, filename)
    except Exception as e:
        interface.status('Done.')
        interface.error(f'Could not find the given h5 file. {e}')

    return filename
def run(sample, name):
    """Persist `sample` as DFT.ROOT / 'h5/analyzed/<name>.h5'."""
    interface.status('Saving h5 file.')

    if name == '':
        interface.error('Please provide a name to save by.')
    elif name.endswith('.h5'):
        # Keep the bare name; the extension is re-added below.
        name = name[:-3]

    path = DFT.ROOT / f'h5/analyzed/{name}.h5'
    try:
        # Replace any earlier save under the same name.
        os.remove(path)
    except FileNotFoundError:
        pass

    snapshot = sample[:]
    set_defaults(snapshot)
    mio.save(snapshot, path)
    interface.status('Saved.')
    interface.rerun()
def load(path, load_raw, apply_filter):
    """Read an h5 sample, tidy protein antibody ids, and set defaults."""
    interface.status('Reading h5 file.')
    sample = mio.load(path, apply_filter=apply_filter, raw=load_raw)

    if sample.protein is not None:
        # Assumes antibody ids are space-separated with the short name in
        # the third token — TODO confirm against the h5 producer. When the
        # split fails, fall back to the assay's own ids unchanged.
        raw_names = sample.protein.col_attrs['id']
        try:
            cleaned = np.array([name.split(' ')[2] for name in raw_names])
        except IndexError:
            cleaned = sample.protein.ids()
        sample.protein.add_col_attr('id', cleaned)
        if sample.protein_raw is not None:
            # Keep the raw assay's ids in sync with the filtered one.
            sample.protein_raw.add_col_attr('id', cleaned)

    init_defaults(sample)
    return sample
def preprocess_protein(sample, clicked, drop_abs):
    """Rebuild the protein assay when first initialized or when the user
    clicked process with a changed antibody drop-list.

    Drops the requested antibodies and adds CLR/asinh/NSP normalized
    layers, then records the drop-list for change detection next time.
    """
    initializing = sample.protein.metadata[DFT.INITIALIZE]
    drops_changed = (set(sample.protein.metadata[DFT.DROP_IDS])
                     != set(drop_abs))
    if not (initializing or (drops_changed and clicked)):
        return

    interface.status('Processing protein assay.')
    sample.reset('protein')
    sample.protein.add_metadata(DFT.ALL_IDS, sample.protein.ids())

    if drop_abs:
        protein = sample.protein.drop(drop_abs)
    else:
        protein = sample.protein[:, :]

    # Compute each normalization and keep it as its own named layer.
    for norm in (DFT.CLR, DFT.ASINH, DFT.NSP):
        protein.normalize_reads(norm)
        protein.add_layer(norm, protein.layers[NORMALIZED_READS])

    sample.protein = protein
    sample.protein.add_metadata(DFT.DROP_IDS, drop_abs)

    # A rebuilt assay invalidates downstream prepare/cluster results.
    if not sample.protein.metadata[DFT.INITIALIZE]:
        sample.protein.add_metadata(DFT.PREPPED, False)
        sample.protein.add_metadata(DFT.CLUSTERED, False)
def preprocess_dna(sample, clicked, drop_vars, keep_vars, dp, gq, af, std):
    """Filter the DNA assay down to the variants of interest.

    Re-runs when the assay is first initialized, or when the user clicked
    process after changing the thresholds (dp, gq, af, std), the dropped
    variants, or the kept variants. A non-empty `keep_vars` bypasses
    threshold filtering entirely.
    """
    # Any change in thresholds or drop/keep lists requires reprocessing.
    args_changed = (
        list(sample.dna.metadata[DFT.PREPROCESS_ARGS]) != [dp, gq, af, std]
        or set(sample.dna.metadata[DFT.DROP_IDS]) != set(drop_vars)
        or set(sample.dna.metadata[DFT.KEEP_IDS]) != set(keep_vars))
    if sample.dna.metadata[DFT.INITIALIZE] or (args_changed and clicked):
        interface.status('Processing DNA assay.')
        sample.reset('dna')
        if len(keep_vars) == 0:
            # No explicit keep-list: select variants by quality thresholds.
            dna_vars = sample.dna.filter_variants(min_dp=dp,
                                                  min_gq=gq,
                                                  min_vaf=af,
                                                  min_std=std)
            sample.dna.add_metadata(DFT.ALL_IDS, sample.dna.ids())
            if len(drop_vars) > 0:
                sample.dna = sample.dna.drop(drop_vars)
        else:
            # Explicit keep-list overrides the threshold filters.
            dna_vars = keep_vars
        if len(dna_vars) == 0:
            interface.status('Done.')
            interface.error(
                'No variants found. Adjust the filters and process again. Make sure "Filter" is deselected in the Files section.'
            )
        sample.dna = sample.dna[:, dna_vars]
        # Record the inputs so the next call can detect changes.
        sample.dna.add_metadata(DFT.PREPROCESS_ARGS, [dp, gq, af, std])
        sample.dna.add_metadata(DFT.DROP_IDS, drop_vars)
        sample.dna.add_metadata(DFT.KEEP_IDS, keep_vars)
        if not sample.dna.metadata[DFT.INITIALIZE]:
            # Invalidate downstream prepare/cluster steps.
            sample.dna.add_metadata(DFT.PREPPED, False)
            sample.dna.add_metadata(DFT.CLUSTERED, False)
def prepare(assay, scale_attribute, pca_attribute, umap_attribute, pca_comps):
    """Run scale -> PCA -> UMAP on the assay, skipping unchanged stages.

    Each stage stores a human-readable description of its input attribute
    in the assay metadata; a stage re-runs when that description differs
    from last time, when its output layer/attr is missing, or when the
    assay is not marked PREPPED.
    """
    interface.status(f'Preparing {assay.name.replace("_", " ")} data.')

    # --- scaling ---
    attr = scale_attribute
    if SCALED_LABEL not in assay.layers or assay.metadata[
            DFT.SCALE_ATTR] != attr or not assay.metadata[DFT.PREPPED]:
        assay.scale_data(scale_attribute)
        assay.add_metadata(DFT.SCALE_ATTR, attr)

    # --- PCA ---
    attr = f'{pca_attribute}'
    if pca_attribute == SCALED_LABEL:
        # Expand the label so the description names the underlying layer.
        attr = f'scaled {scale_attribute}'
    if PCA_LABEL not in assay.row_attrs or assay.metadata[
            DFT.PCA_ATTR] != attr or not assay.metadata[DFT.PREPPED]:
        assay.run_pca(pca_attribute, components=pca_comps)
        assay.add_metadata(DFT.PCA_ATTR, attr)

    # --- UMAP ---
    attr = f'{umap_attribute}'
    if umap_attribute == SCALED_LABEL:
        attr = f'scaled {scale_attribute}'
    if umap_attribute == PCA_LABEL:
        # Chain the description through PCA's own input.
        if pca_attribute == SCALED_LABEL:
            attr = f'PCA of scaled {scale_attribute}'
        else:
            attr = f'PCA of {pca_attribute}'
    if UMAP_LABEL not in assay.row_attrs or assay.metadata[
            DFT.UMAP_ATTR] != attr or not assay.metadata[DFT.PREPPED]:
        # Fixed random_state keeps the embedding reproducible across runs.
        assay.run_umap(attribute=umap_attribute, random_state=42)
        assay.add_metadata(DFT.UMAP_ATTR, attr)

    if not assay.metadata[DFT.INITIALIZE]:
        # Newly prepared data invalidates clustering; mark prep complete.
        assay.add_metadata(DFT.CLUSTERED, False)
        assay.add_metadata(DFT.PREPPED, True)
def status():
    """HTTP endpoint: log the request body and report the current status."""
    body = request.get_data()
    print(body)
    # Flush so the log line is visible immediately even when buffered.
    sys.stdout.flush()
    return jsonify(status=interface.status())
# Streamlit entry point: wires the Mosaic GUI pipeline together (v0.1.2).
import streamlit as st

import interface
import defaults as DFT
from tasks import (load, preprocess, prepare, cluster, customize, save,
                   visual)

st.set_page_config(page_title='Mosaic', layout='wide')
interface.init()
interface.subheader('GUI for Mosaic built using Streamlit')
interface.status('v0.1.2')

# Pipeline stages run in order; each consumes the previous stage's output.
sample, should_save, save_name = load.run()
current_assay, available_assays = preprocess.run(sample)
prepare.run(current_assay, available_assays)
cluster.run(current_assay, available_assays)
customize.run(current_assay)
save.run(sample, save_name, should_save)
visual.run(sample, current_assay)

# After one full pass, mark every assay as no longer freshly loaded so
# the tasks stop re-initializing on the next rerun.
for a in available_assays:
    a.add_metadata(DFT.INITIALIZE, False)
def make_python_call_string(title, *args, font=subfont): python_loc = "/Users/rpurp/.pyenv/shims/python" # TODO just python 3? command = "{} | bash={} ".format(title, python_loc) for i, arg in enumerate(args, 1): command += 'param{}="{}" '.format(i, arg) command += "terminal=false refresh=true" command += font return command print(interface.status()) print('---') print('Track' + titlefont) for subject in interface.get_subjects(): print( make_python_call_string(subject, script, "-t", subject, font=trackfont)) print(make_python_call_string("mark", script, "-m")) print(make_python_call_string("cancel", script, "-c")) print(make_python_call_string("end", script, "-e")) print('---') now = datetime.datetime.now()
# Streamlit entry point for the Mosaic GUI (v0.4.1 pipeline layout).
import streamlit as st

import interface
from tasks import (load, preprocess, prepare, cluster, customize, save,
                   visual)

st.set_page_config(page_title='Mosaic', layout='wide')
interface.init()
interface.subheader('GUI for Mosaic built using Streamlit')
interface.status('v0.4.1')

# Pipeline: load -> preprocess -> prepare -> cluster -> customize -> visual.
sample, should_save, save_name = load.run()
current_assay, available_assays = preprocess.run(sample)
prepare.run(current_assay, available_assays)
cluster.run(current_assay, available_assays)
sample_kept, current_assay_kept = customize.run(sample, current_assay)
visual_type = visual.run(sample_kept, current_assay_kept)
if should_save:
    save.run(sample_kept, save_name)
# Record the chosen visual and initialization flags for the next rerun.
save.store_metadata(sample, current_assay, visual_type, available_assays)
def render(sample, assay):
    """Render the visual-type selector and the argument widgets for the
    currently chosen plot kind.

    Returns
    -------
    (plot_columns, kind, kwargs)
        The Streamlit column(s) to draw into, the selected plot kind, and
        the keyword arguments collected from the widgets for that kind.
    """
    interface.status('Creating visuals.')

    category, kind = assay.metadata[DFT.VISUAL_TYPE]
    options = DFT.VISUALS[category][1]
    column_sizes = DFT.VISUALS[category][0]
    columns = st.beta_columns(column_sizes)

    # Category dropdown; switching categories resets the kind to the
    # category's first option and triggers a rerun.
    with columns[0]:
        new_category = st.selectbox("", list(DFT.VISUALS.keys()))
        if new_category != category:
            assay.add_metadata(DFT.VISUAL_TYPE,
                               [new_category, DFT.VISUALS[new_category][1][0]])
            interface.rerun()

    # One button per plot kind within the category.
    for i in range(len(options)):
        with columns[i + 1]:
            st.markdown(f"<p style='margin-bottom:33px'></p>",
                        unsafe_allow_html=True)
            clicked = st.button(options[i], key=f'visual-{options[i]}')
            if clicked:
                kind = options[i]
                assay.add_metadata(DFT.VISUAL_TYPE, [category, kind])

    # Choose the column layout: per-kind override or the default split.
    if kind in DFT.LAYOUT:
        columns = st.beta_columns(DFT.LAYOUT[kind])
        args_container = columns[0]
        plot_columns = columns[1:]
    else:
        columns = st.beta_columns([0.75, 0.1, 2])
        args_container = columns[0]
        plot_columns = columns[2]

    with args_container:
        kwargs = {}
        analyte_map = {'protein': 'Protein', 'dna': 'DNA'}
        if kind == DFT.SIGNATURES:
            kwargs['layer'] = st.selectbox('Layer', DFT.LAYERS[assay.name])
            kwargs['attribute'] = st.selectbox(
                'Signature', ['Median', 'Standard deviation', 'p-value'])
        elif kind == DFT.HEATMAP:
            kwargs['attribute'] = st.selectbox('Attribute',
                                               DFT.LAYERS[assay.name],
                                               key='Visualization Attribute')
            kwargs['splitby'] = st.selectbox('Split by',
                                             DFT.SPLITBY[assay.name])
            kwargs['orderby'] = st.selectbox('Order by',
                                             DFT.LAYERS[assay.name],
                                             key='Visualization Orderby')
            kwargs['cluster'] = st.checkbox('Cluster within labels', True)
            kwargs['convolve'] = st.slider('Smoothing', 0, 100)
        elif kind == DFT.SCATTERPLOT:
            kwargs['attribute'] = st.selectbox('Attribute', DFT.ATTRS_2D)
            kwargs['colorby'] = st.selectbox('Color by',
                                             DFT.COLORBY[assay.name])
            if kwargs['colorby'] not in DFT.SPLITBY[assay.name] + ['density']:
                features = st.multiselect(
                    'Features', list(assay.ids()),
                    list(assay.ids())[:min(len(assay.ids()), 4)])
                if len(features) != 0:
                    kwargs['features'] = features
        elif kind == DFT.FEATURE_SCATTER:
            kwargs['layer'] = st.selectbox('Layer', DFT.LAYERS[assay.name])
            feature1 = st.selectbox('Feature 1', list(assay.ids()), index=0)
            # BUG FIX: label previously read 'Feature 1' for both widgets.
            feature2 = st.selectbox('Feature 2', list(assay.ids()), index=2)
            kwargs['ids'] = [feature1, feature2]
            kwargs['colorby'] = st.selectbox('Color by',
                                             DFT.COLORBY[assay.name])
        elif kind == DFT.VIOLINPLOT:
            kwargs['attribute'] = st.selectbox('Attribute',
                                               DFT.LAYERS[assay.name])
            kwargs['splitby'] = st.selectbox('Split by',
                                             DFT.SPLITBY[assay.name])
            kwargs['points'] = st.checkbox('Box and points', False)
            features = st.multiselect(
                'Features', list(assay.ids()),
                list(assay.ids())[:min(len(assay.ids()), 4)])
            if len(features) != 0:
                kwargs['features'] = features
        elif kind == DFT.RIDGEPLOT:
            kwargs['attribute'] = st.selectbox('Attribute',
                                               DFT.LAYERS[assay.name])
            kwargs['splitby'] = st.selectbox('Split by',
                                             DFT.SPLITBY[assay.name])
            features = st.multiselect(
                'Features', list(assay.ids()),
                list(assay.ids())[:min(len(assay.ids()), 4)])
            if len(features) != 0:
                kwargs['features'] = features
        elif kind == DFT.STRIPPLOT:
            kwargs['attribute'] = st.selectbox('Attribute',
                                               DFT.LAYERS[assay.name])
            kwargs['colorby'] = st.selectbox('Colorby',
                                             DFT.LAYERS[assay.name])
            features = st.multiselect(
                'Features', list(assay.ids()),
                list(assay.ids())[:min(len(assay.ids()), 4)])
            if len(features) != 0:
                kwargs['features'] = features
        elif kind == DFT.DNA_PROTEIN_PLOT:
            kwargs['analyte'] = st.selectbox(
                'Analyte', ['protein'], format_func=lambda a: analyte_map[a])
            kwargs['dna_features'] = st.multiselect('DNA features',
                                                    list(sample.dna.ids()),
                                                    sample.dna.ids()[:4])
            kwargs['protein_features'] = st.multiselect(
                'Protein features', list(sample.protein.ids()),
                sample.protein.ids()[:4])
        elif kind == DFT.DNA_PROTEIN_HEATMAP:
            kwargs['clusterby'] = st.selectbox(
                'Cluster by', ['dna', 'protein'],
                format_func=lambda a: analyte_map[a])
            kwargs['sortby'] = st.selectbox(
                'Sort by', ['dna', 'protein'],
                format_func=lambda a: analyte_map[a])
            kwargs['dna_features'] = st.multiselect('DNA features',
                                                    list(sample.dna.ids()),
                                                    sample.dna.ids())
            kwargs['protein_features'] = st.multiselect(
                'Protein features', list(sample.protein.ids()),
                sample.protein.ids())
        elif kind == DFT.METRICS:
            st.header('')
            interface.info(
                '<b>Some values might be missing in case the raw<br> files are not loaded.</b> These metrics can be<br> pasted into the metrics sheet as is.'
            )
        elif kind == DFT.READ_DEPTH:
            if assay.name == PROTEIN_ASSAY:
                kwargs['layer'] = st.selectbox('Layer',
                                               DFT.LAYERS[assay.name])
                kwargs['colorby'] = st.selectbox('Color by',
                                                 ['density', None])
                kwargs['features'] = st.multiselect(
                    'Features', list(assay.ids()),
                    list(assay.ids())[:min(len(assay.ids()), 4)])
            else:
                st.header('')
                interface.info('<b>Only applicable for the protein assay</b>')
        elif kind == DFT.ASSAY_SCATTER:
            # Drawing needs the raw protein counts to be loaded.
            kwargs['draw'] = sample.protein_raw is not None
            if not kwargs['draw']:
                interface.info('<b>Raw files needed for this plot.</b>')
        elif kind == DFT.DOWNLOAD:
            kwargs['item'] = st.selectbox('Object to Download',
                                          DFT.DOWNLOAD_ITEMS)
            kwargs['download'] = st.button('Download', key='download_button')

    return plot_columns, kind, kwargs
def run(sample, assay):
    """Render the visual-selection UI, then draw the chosen plot."""
    plot_columns, kind, plot_kwargs = render(sample, assay)
    visual(sample, assay, kind, plot_columns, plot_kwargs)
    interface.status('Done.')