def show_heatmap():
    """Render the heatmap page and, on POST, the heatmap points.

    Any non-POST request renders ``heatmap.html`` with the names stored in the
    ``cell_lines`` set of the database.

    A POST request reads the multi-valued ``cell_lines`` form field and the
    whitespace-separated ``selected_genes`` field, loads the stored table of
    each selected cell line, outer-joins the tables on ``gene_id``, keeps only
    the requested genes and emits one point per (gene, cell line) pair.

    Selected cell lines that have no stored table are skipped instead of
    crashing the request; if none could be loaded the page is rendered with an
    empty ``plot_series``.

    Returns:
        The rendered ``heatmap.html`` template.
    """
    # Function-local import, as in the other views of this file.
    from crispr_analysis import get_db
    rdb = get_db()
    # Set members are decoded from UTF-8 before being handed to the template.
    cell_lines = [name.decode('utf-8') for name in rdb.smembers('cell_lines')]

    if request.method != 'POST':
        return render_template('heatmap.html', cell_lines=cell_lines)

    selected_cell_lines = request.form.getlist('cell_lines')
    # A missing field is treated like an empty gene list.
    selected_genes = (request.form.get('selected_genes') or '').split()

    joint_df = None
    loaded_cell_lines = set()
    for cell_line in selected_cell_lines:
        packed = rdb.get(cell_line)
        if not packed:
            # Nothing stored under this name: leave it out of the plot.
            continue
        # Copy so the column assignments below act on an independent frame
        # rather than on a slice of the unpacked table.
        df = pd.read_msgpack(packed)[['gene_id', 'fc', 'pval', 'inc_ess']].copy()
        if cell_line == 'WT':
            df['inc_ess'] = 'n/a'
        # The conversions were previously computed and thrown away.
        df['fc'] = df['fc'].astype(float)
        df['pval'] = df['pval'].astype(float)
        df.columns = [
            'gene_id',
            '{}_fc'.format(cell_line),
            '{}_pval'.format(cell_line),
            '{}_inc_ess'.format(cell_line),
        ]
        if joint_df is None:
            joint_df = df
        else:
            joint_df = pd.merge(joint_df, df, how='outer', on='gene_id')
        loaded_cell_lines.add(cell_line)

    if joint_df is None:
        return render_template('heatmap.html',
                               cell_lines=cell_lines,
                               selected_cell_lines=selected_cell_lines,
                               selected_genes=selected_genes,
                               plot_series=[])

    # Renumber the rows 0..n-1: the row number is used as the y coordinate.
    joint_df = joint_df.loc[joint_df['gene_id'].isin(selected_genes)].reset_index(drop=True)
    joint_df = joint_df.round(decimals=3)

    plot_series = []
    for y, row in joint_df.iterrows():
        # x is the position in the *selected* list, so skipped cell lines
        # leave an empty column rather than shifting the others.
        for x, cell_line in enumerate(selected_cell_lines):
            if cell_line not in loaded_cell_lines:
                continue
            fold_change = row['{}_fc'.format(cell_line)]
            plot_series.append({
                'x': x,
                'y': y,
                'cell_line': cell_line,
                'gene_id': row['gene_id'],
                'pval': row['{}_pval'.format(cell_line)],
                'fc': fold_change,
                'value': fold_change,
                'inc_ess': row['{}_inc_ess'.format(cell_line)],
            })

    # Report the genes in the order of the joined table so that the labels
    # match the y coordinates above.
    selected_genes = joint_df['gene_id'].tolist()
    return render_template('heatmap.html',
                           cell_lines=cell_lines,
                           selected_cell_lines=selected_cell_lines,
                           selected_genes=selected_genes,
                           plot_series=plot_series)
def _counts_header_error(full_df, required, filename):
    """Return the error text for a counts file lacking ``required`` columns, or ''."""
    if set(required).issubset(set(full_df.columns)):
        return ''
    error = 'ERROR: incorrect header! \n'
    error += 'Required columns: {}\n'.format(', '.join(required))
    error += "Given: {}\n".format(', '.join(full_df.columns))
    error += 'Filename: {}\n'.format(filename)
    error += "Please read the instructions once again!!"
    return error


def _notify_import_progress(email, notify, index, total, final_sent):
    """Email a progress note every 100 genes and once for the final stretch.

    Returns the updated ``final_sent`` flag (True once the last stretch of
    fewer than 100 remaining genes has been reached).
    """
    if index + 100 < total:
        if index % 100 == 0 and notify and email:
            message = 'writing {} to {} out of {} genes'.format(
                index, index + 100, total)
            send_email(email, message=message)
        return final_sent
    if not final_sent and notify and email:
        send_email(
            email,
            'writing {} to {} out of {} genes'.format(index, total, total))
    return True


def _tag_counts(rows, label, treatment):
    """Return ``rows`` as gene_id/cell_line/treatment/norm_counts rows tagged with ``label``."""
    tagged = rows[['gene_id', 'cell_line', 'sample', 'norm_counts']].copy()
    tagged.columns = ['gene_id', 'cell_line', 'treatment', 'norm_counts']
    tagged['cell_line'] = label
    tagged['treatment'] = treatment
    return tagged


def _import_fold_changes(rdb, input_file):
    """Store one fold-change table per cell line and return the success text."""
    df = pd.read_csv(input_file, sep='\t')
    genes = df['gene_id'].tolist()

    # A cell line is any column-name prefix in front of '_fc'.
    cell_lines = []
    for column in df.columns:
        if '_fc' in column:
            cell_line = column.split('_fc')[0]
            if cell_line not in cell_lines:
                cell_lines.append(cell_line)

    # todo: unicode
    for cell_line in cell_lines:
        inc_ess = 'increasedEssential_{}'.format(cell_line)
        if inc_ess not in df.columns:
            df[inc_ess] = 'n/a'
        # Copy the per-cell-line columns so that renaming them does not act
        # on a slice of the full table.
        current_df = df[[
            'gene_id',
            '{}_fc'.format(cell_line),
            '{}_pval'.format(cell_line),
            inc_ess,
        ]].copy()
        current_df.columns = ['gene_id', 'fc', 'pval', 'inc_ess']
        rdb.set(cell_line, current_df.to_msgpack(encoding='utf-8'))

    # SADD needs at least one member, so skip it for an empty file.
    if cell_lines:
        rdb.sadd('cell_lines', *set(cell_lines))
    if genes:
        rdb.sadd('genes', *set(genes))

    return "Data successfully imported. \nFile:{} \nCell lines: {} \nGenes:{}".format(
        input_file.filename, len(cell_lines), len(genes))


def _import_counts_without_drugs(rdb, full_df, filename, email, notify):
    """Store the normalized counts of every gene under '<gene>_counts'; return the success text."""
    genes = full_df['gene_id'].unique()
    cell_lines = full_df['cell_line'].unique()
    columns = ['gene_id', 'cell_line', 'treatment', 'norm_counts']
    total = len(genes)
    final_sent = False
    for i, gene in enumerate(genes):
        gene_df = full_df.loc[full_df['gene_id'] == gene, columns]
        final_sent = _notify_import_progress(email, notify, i, total, final_sent)
        rdb.set('{}_counts'.format(gene), gene_df.to_msgpack())
    return "Data successfully imported. \nFile: {}\nCell lines: {} \nGenes:{}".format(
        filename, len(cell_lines), total)


def _import_counts_with_drugs(rdb, full_df, filename, email, notify):
    """Append day0vs7 / day7vs21 / day0vs21 comparisons to the stored counts.

    Returns ``(success, error)``; exactly one of the two is None. The error
    case is a gene with no stored '<gene>_counts' entry. Genes processed
    before such a gene have already been written.
    """
    cell_lines = full_df['cell_line'].unique()
    genes = full_df['gene_id'].unique()
    # Every sample name is tried as a drug; those without day-21 rows are
    # skipped below.
    drugs = full_df['sample'].unique()
    total = len(genes)
    final_sent = False

    for i, gene in enumerate(genes):
        key = '{}_counts'.format(gene)
        msg_pack = rdb.get(key)
        if msg_pack is None:
            error = "Can't import {}. Did you import Normalized Counts WITHOUT drugs?\nCould not find gene {}".format(
                filename, gene)
            return None, error

        # Collect all pieces and concatenate once per gene instead of
        # re-copying the growing frame on every append.
        frames = [pd.read_msgpack(msg_pack)]
        gene_rows = full_df.loc[full_df['gene_id'] == gene]
        for cell_line in cell_lines:
            line_rows = gene_rows.loc[gene_rows['cell_line'] == cell_line]
            # Day 0 and day 7 rows are shared by all drugs of the cell line.
            day0_df = line_rows.loc[line_rows['day'] == 0]
            day7_df = line_rows.loc[line_rows['day'] == 7]
            for drug in drugs:
                day21_df = line_rows.loc[(line_rows['sample'] == drug)
                                         & (line_rows['day'] == 21)]
                # Empty e.g. for sample = 'before1' or 'before2'.
                if day21_df.empty:
                    continue
                # The day0vs21 baseline is the day-0 rows. It used to be
                # copied from a variable that had already been overwritten
                # with the day-7 rows.
                comparisons = (
                    ('day0vs7', day0_df, day7_df),
                    ('day7vs21', day7_df, day21_df),
                    ('day0vs21', day0_df, day21_df),
                )
                for suffix, before, after in comparisons:
                    label = '{}_{}_{}'.format(cell_line, drug, suffix)
                    frames.append(_tag_counts(before, label, 0))
                    frames.append(_tag_counts(after, label, 1))

        existing_df = pd.concat(frames, ignore_index=True)
        final_sent = _notify_import_progress(email, notify, i, total, final_sent)
        rdb.set(key, existing_df.to_msgpack())

    success = "Data succesfully imported.\nFile: {} \nCell lines: {}\nGenes: {}".format(
        filename, str(cell_lines), total)
    return success, None


def import_data():
    """Render the data-import page and, on POST, run the requested import.

    Form fields read on POST: ``data_type`` ('fold_changes', 'norm_counts' or
    'flush_db'), ``with_drugs`` ('without_drugs' or 'with_drugs', used for
    'norm_counts'), the uploaded ``input_file`` (tab-separated), ``email`` and
    ``notify`` ('true' enables email notifications).

    Any exception raised while importing is caught; its traceback is emailed
    (when notifications are on) and shown on the page. An unrecognised
    ``data_type`` / ``with_drugs`` combination falls through to the plain
    page.

    Returns:
        The rendered ``data_import.html`` template.
    """
    # Function-local import, as in the other views of this file.
    from crispr_analysis import get_db
    rdb = get_db()

    if request.method != 'POST':
        return render_template('data_import.html')

    data_type = request.form.get('data_type')
    with_drugs = request.form.get('with_drugs')
    input_file = request.files.get('input_file')
    if data_type != 'flush_db' and input_file is None:
        return render_template('data_import.html',
                               selected_data_type=data_type,
                               with_drugs=with_drugs,
                               error="Please select file!")

    email = request.form.get('email')
    notify = request.form.get('notify') == 'true'

    try:
        if data_type == "fold_changes":
            # The with/without drugs choice does not matter here.
            success = _import_fold_changes(rdb, input_file)
            if notify and email:
                send_email(email, message=success)
            return render_template('data_import.html', success=success)

        elif data_type == "norm_counts":
            if with_drugs == 'without_drugs':
                full_df = pd.read_csv(input_file, sep='\t')
                error = _counts_header_error(
                    full_df,
                    ['gene_id', 'cell_line', 'treatment', 'norm_counts'],
                    input_file.filename)
                if error:
                    return render_template(
                        'data_import.html',
                        selected_data_type=data_type,
                        with_drugs=with_drugs,
                        error=error,
                    )
                success = _import_counts_without_drugs(
                    rdb, full_df, input_file.filename, email, notify)
                if notify and email:
                    send_email(email, message=success)
                return render_template('data_import.html', success=success)

            elif with_drugs == 'with_drugs':
                full_df = pd.read_csv(input_file, sep='\t')
                error = _counts_header_error(
                    full_df,
                    ['cell_line', 'sample', 'gene_id', 'day', 'norm_counts'],
                    input_file.filename)
                if error:
                    return render_template(
                        'data_import.html',
                        selected_data_type=data_type,
                        with_drugs=with_drugs,
                        error=error,
                    )
                success, error = _import_counts_with_drugs(
                    rdb, full_df, input_file.filename, email, notify)
                if error:
                    if notify and email:
                        send_email(email, message=error)
                    return render_template('data_import.html', error=error)
                if notify and email:
                    send_email(email, success)
                return render_template(
                    'data_import.html',
                    selected_data_type=data_type,
                    with_drugs=with_drugs,
                    success=success,
                )

        elif data_type == "flush_db":
            rdb.flushall()
            if notify and email:
                send_email(email, 'Database succesfully flushed')
            return render_template('data_import.html',
                                   success='Database successfully flushed')
    except Exception:
        # Top-level boundary of the import: report the failure to the user
        # (and by email) instead of letting the request fail.
        import traceback
        details = traceback.format_exc()
        if notify and email:
            send_email(email, details)
        return render_template('data_import.html',
                               error='Import failed, error:\n {}'.format(details))

    return render_template('data_import.html')
def _boxplot_summary(counts):
    """Return [min, q1, median, q3, max] of ``counts`` rounded to 3 decimals.

    Statistics that are NaN (e.g. for an empty series) are returned as None so
    that they serialize as JSON ``null`` instead of the invalid token ``NaN``.
    """
    q1, median, q3 = counts.quantile([0.25, 0.5, 0.75]).round(decimals=3).tolist()
    summary = [np.round(counts.min(), 3), q1, median, q3, np.round(counts.max(), 3)]
    return [None if pd.isnull(value) else float(value) for value in summary]


def get_norm_counts(gene, cell_lines):
    """Return the box-plot series of normalized counts for one gene as JSON.

    Args:
        gene: gene identifier; the counts are read from the '<gene>_counts'
            database key.
        cell_lines: comma-separated cell line names.

    Returns:
        A JSON response with ``data`` (three series: 'Before Treatment',
        'After Treatment' and 'Outliers') and ``errors`` (one message per
        requested cell line without counts, or a single message when the gene
        has no counts at all).

    Values further than three standard deviations from the cell line's mean
    are reported as outlier points and excluded from the boxes.
    """
    # this import has to be here!!
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = cell_lines.split(',')

    msgpack = rdb.get('{}_counts'.format(gene))
    if msgpack is None:
        return jsonify({
            'data': [],
            'errors': ['No counts for gene {} found'.format(gene)]
        })
    gene_df = pd.read_msgpack(msgpack)

    series_before = []
    series_after = []
    outliers = []
    error_messages = []

    for i, cell_line in enumerate(cell_lines):
        df = gene_df.loc[gene_df['cell_line'] == cell_line]
        if df.empty:
            # NOTE(review): no box is appended for this cell line, so later
            # boxes move one position to the left while outlier points keep
            # x = i. Confirm how the client lays out the categories.
            error_messages.append(
                'No counts for gene: {} and cell line: {}. '.format(gene, cell_line))
            continue

        deviation = np.abs(df.norm_counts - df.norm_counts.mean())
        limit = 3 * df.norm_counts.std()
        if pd.isnull(limit):
            # The standard deviation is undefined (e.g. a single
            # observation). Comparing against NaN would put the rows in
            # neither group, so treat all of them as inliers.
            limit = np.inf

        # keep only the ones that are within +3 to -3 standard deviations
        without_outliers = df[deviation <= limit]
        only_outliers = df[deviation > limit]

        before = without_outliers.loc[without_outliers['treatment'] == 0]
        after = without_outliers.loc[without_outliers['treatment'] == 1]
        series_before.append(_boxplot_summary(before['norm_counts']))
        series_after.append(_boxplot_summary(after['norm_counts']))

        for _, row in only_outliers.iterrows():
            if int(row['treatment']) == 0:
                treatment = 'Before Treatment'
                color = '#7cb5ec'
            else:
                treatment = 'After Treatment'
                color = 'black'
            outliers.append({
                'x': i,
                'y': round(row['norm_counts'], 3),
                'treatment': treatment,
                'cell_line': cell_line,
                'color': color,
            })

    counts_series = [
        {
            'name': 'Before Treatment',
            'data': series_before,
            'color': '#7cb5ec',
        },
        {
            'name': 'After Treatment',
            'data': series_after,
            'color': 'black',
        },
        {
            'name': 'Outliers',
            'type': 'scatter',
            'data': outliers,
            'color': 'black',
            'tooltip': {
                'pointFormat': '<br>cell line: {point.cell_line}<br>norm. counts: {point.y}<br>treatment: {point.treatment}',
            },
        },
    ]

    return jsonify({
        'data': counts_series,
        'errors': error_messages,
    })
def compare_cell_lines():
    """Render the comparison page: one x-axis cell line against several y-axis ones.

    GET renders ``compare.html`` with the stored cell line names.

    POST reads the axis selection (``x_axis``, ``y_axis_multiple``), the
    display options (``how_to_plot``, ``show_data_table``, ``apply_filters``)
    and the filter thresholds (``x_fc_max``, ``x_pval``,
    ``x_pval_less_or_greater`` and the ``y_`` equivalents). It builds one
    scatter series per y-axis cell line (fold change on both axes, joined on
    ``gene_id``) and, optionally, a data table that is also exported as
    tab-separated text.

    Returns:
        The rendered ``compare.html`` template (None for other HTTP methods).
    """
    # Function-local import, as in the other views of this file.
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = [name.decode('utf-8') for name in rdb.smembers('cell_lines')]

    if request.method == 'GET':
        return render_template('compare.html', cell_lines=cell_lines)
    if request.method != 'POST':
        return None

    x_axis = request.form.get('x_axis')
    y_axis_multiple = request.form.getlist('y_axis_multiple')
    # here it doesn't matter how to plot, the data will be the same
    how_to_plot = request.form.get('how_to_plot')
    show_data_table = request.form.get('show_data_table') is not None

    # filters
    apply_filters = request.form.get('apply_filters') is not None
    x_fc_max = float(request.form.get('x_fc_max'))
    x_pval = float(request.form.get('x_pval'))
    x_pval_less_or_greater = request.form.get('x_pval_less_or_greater')
    y_fc_max = float(request.form.get('y_fc_max'))
    y_pval = float(request.form.get('y_pval'))
    y_pval_less_or_greater = request.form.get('y_pval_less_or_greater')

    x_source_df = pd.read_msgpack(rdb.get(x_axis))
    # Copy so that renaming the columns does not act on a slice.
    x_axis_df = x_source_df[['gene_id', 'fc', 'pval']].copy()
    x_axis_df.columns = ['gene_id', 'x', 'x_pval']
    x_axis_df = x_axis_df.round(decimals=3)
    # NOTE(review): the x-axis thresholds are applied even when
    # apply_filters is off (the guard is commented out in the original).
    # Kept as is; confirm that this is intended.
    x_axis_df = x_axis_df.loc[x_axis_df['x'] <= x_fc_max]
    if x_pval_less_or_greater == 'greater':
        x_axis_df = x_axis_df.loc[x_axis_df['x_pval'] >= x_pval]
    else:
        x_axis_df = x_axis_df.loc[x_axis_df['x_pval'] <= x_pval]

    joint_df = None  # filtered y-axis values, used for plotting
    full_df = None   # unfiltered y-axis values, used for the data table
    for cell_line in y_axis_multiple:
        df = pd.read_msgpack(rdb.get(cell_line))[['gene_id', 'fc', 'pval', 'inc_ess']].copy()
        # The conversions were previously computed and thrown away.
        df['fc'] = df['fc'].astype(float)
        df['pval'] = df['pval'].astype(float)

        fc_column = '{}_fc'.format(cell_line)
        pval_column = '{}_pval'.format(cell_line)
        inc_ess_column = '{}_inc_ess'.format(cell_line)

        if show_data_table:
            # to fill the data table correctly: WT has no inc_ess column there
            if cell_line != 'WT':
                to_merge = df.round(decimals=3)
                to_merge.columns = ['gene_id', fc_column, pval_column, inc_ess_column]
            else:
                to_merge = df[['gene_id', 'fc', 'pval']].round(decimals=3)
                to_merge.columns = ['gene_id', fc_column, pval_column]
            if full_df is None:
                full_df = to_merge
            else:
                full_df = pd.merge(full_df, to_merge, how='outer', on='gene_id')

        if apply_filters:
            df = df.loc[df['fc'] <= y_fc_max]
            if y_pval_less_or_greater == 'greater':
                df = df.loc[df['pval'] >= y_pval]
            else:
                df = df.loc[df['pval'] <= y_pval]

        df = df.round(decimals=3)
        df.columns = ['gene_id', fc_column, pval_column, inc_ess_column]
        if joint_df is None:
            joint_df = df
        else:
            joint_df = pd.merge(joint_df, df, how='outer', on='gene_id')

    plot_series = []
    for cell_line in y_axis_multiple:
        df = joint_df[['gene_id',
                       '{}_fc'.format(cell_line),
                       '{}_pval'.format(cell_line),
                       '{}_inc_ess'.format(cell_line)]].copy()
        df.columns = ['gene_id', 'y', 'y_pval', 'inc_ess']
        # Only genes with values on both axes can be drawn.
        df = pd.merge(df, x_axis_df, how='outer', on='gene_id').dropna()
        plot_series.append({
            'name': cell_line,
            'data': list(df.T.to_dict().values()),
            'turboThreshold': len(df),
            'series_length': len(df),
        })

    data_table = None
    # joint_df is None when no y-axis cell line was submitted.
    if show_data_table and joint_df is not None:
        genes = joint_df['gene_id'].tolist()
        data_table_df = full_df[full_df['gene_id'].isin(genes)]

        x_table_df = x_axis_df.copy()
        x_table_df.columns = ['gene_id',
                              '{}_fc'.format(x_axis),
                              '{}_pval'.format(x_axis)]
        if 'WT' not in x_axis:
            # x_axis_df only has three columns, so the inc_ess values are
            # taken from the unfiltered source table. The assignment aligns
            # on the row index, which the filtering above preserved.
            # Previously a fourth column *name* was assigned to the
            # three-column frame, which raised a length-mismatch error.
            x_table_df['{}_inc_ess'.format(x_axis)] = x_source_df['inc_ess']

        data_table_df = pd.merge(data_table_df, x_table_df, how='inner', on='gene_id')
        data_table_df.insert(0, '#', range(1, len(data_table_df) + 1))
        data_table = {
            'header': data_table_df.columns,
            'rows': data_table_df.values.tolist(),
            'csv': data_table_df.to_csv(sep='\t', index=False)
        }

    return render_template('compare.html',
                           cell_lines=cell_lines,
                           x_axis=x_axis,
                           y_axis_multiple=y_axis_multiple,
                           plot_series=plot_series,
                           how_to_plot=how_to_plot,
                           data_table=data_table,
                           apply_filters=apply_filters,
                           selected_filter={
                               'x_fc_max': x_fc_max,
                               'x_pval': x_pval,
                               'x_pval_less_or_greater': x_pval_less_or_greater,
                               'y_fc_max': y_fc_max,
                               'y_pval': y_pval,
                               'y_pval_less_or_greater': y_pval_less_or_greater,
                           })
def get_log_plots():
    """Render the log plot page: log2(fold change) against -log10(p-value).

    GET renders ``log_plots.html`` with the stored cell line names.

    POST reads ``cell_line`` and the thresholds ``left``, ``right`` (fold
    change) and ``bottom`` (p-value) from the form. The genes are split into
    three series:

    * blue: ``fc <= left`` and ``pval <= bottom``
    * red: ``fc >= right`` and ``pval <= bottom``
    * grey: everything else

    A p-value of 0 has no logarithm; in the blue and red series such points
    are drawn at the largest finite -log10(p-value) plus 0.5. In the grey
    series they are dropped, as before.

    Returns:
        The rendered ``log_plots.html`` template (None for other HTTP
        methods).

    Raises:
        ValueError: from ``math.log2`` / ``math.log10`` when a threshold or a
            stored fold change is not positive, or a p-value is negative.
    """
    # Function-local import, as in the other views of this file.
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = [name.decode('utf-8') for name in rdb.smembers('cell_lines')]

    if request.method == 'GET':
        return render_template('log_plots.html', cell_lines=cell_lines)
    if request.method != 'POST':
        return None

    cell_line = request.form['cell_line']
    # Copy so that the new columns are added to an independent frame and not
    # to a slice of the unpacked table.
    df = pd.read_msgpack(rdb.get(cell_line))[['gene_id', 'fc', 'pval']].copy()

    # dash lines
    left = float(request.form.get('left'))
    right = float(request.form.get('right'))
    bottom = float(request.form.get('bottom'))
    left_line = round(math.log2(left), 2)
    right_line = round(math.log2(right), 2)
    bottom_line = -round(math.log10(bottom), 2)

    genes = df['gene_id'].tolist()

    df['log_2_fc'] = df['fc'].apply(math.log2)
    df['minus_log_10_pval'] = df['pval'].apply(
        lambda pval: -math.log10(pval) if pval != 0 else np.nan)
    df = df.round(decimals=3)
    # np.round also copes with max() returning a plain float.
    max_val = np.round(df['minus_log_10_pval'].max(), 3)
    df.columns = ['gene_id', 'fc', 'pval', 'x', 'y']

    # Row masks replace the former element-wise df.isin(frame) comparison.
    significant = df['pval'] <= bottom
    is_left = (df['fc'] <= left) & significant
    is_right = (df['fc'] >= right) & significant

    left_df = df.loc[is_left]
    right_df = df.loc[is_right]
    bottom_df = df.loc[~(is_left | is_right)].dropna()

    if pd.notnull(max_val):
        # Fill only the y coordinate. A bare fillna() also overwrote missing
        # values in the other columns with this plotting height.
        above_max = {'y': max_val + 0.5}
        left_df = left_df.fillna(above_max)
        right_df = right_df.fillna(above_max)

    def make_series(points, size, color):
        """Build one scatter series dictionary for the template."""
        return {
            'name': cell_line,
            'data': list(points.T.to_dict().values()),
            'turboThreshold': size,
            'color': color,
            'marker': {
                'symbol': 'circle',
                'radius': 5,
            },
        }

    plot_series = [
        make_series(bottom_df.dropna(), len(bottom_df), 'grey'),
        make_series(left_df.dropna(), len(left_df), 'blue'),
        make_series(right_df.dropna(), len(right_df), 'red'),
    ]

    return render_template('log_plots.html',
                           cell_lines=cell_lines,
                           selected_cell_line=cell_line,
                           plot_series=plot_series,
                           genes=genes,
                           right=right_line,
                           left=left_line,
                           bottom=bottom_line,
                           selected_thresholds={
                               'left': left,
                               'right': right,
                               'bottom': bottom,
                           })
def show_scatter_plot():
    """Render the main scatter plot page.

    GET renders ``main.html`` with the stored cell line names.

    POST reads the selected ``cell_lines``, the ``increased_essentiality``,
    ``show_data_table`` and ``apply_filters`` check boxes and the ``wt_*`` /
    ``other_*`` thresholds from the form. The tables of all non-WT cell lines
    are outer-joined on ``gene_id``; if WT is selected, the result is
    inner-joined with the WT table. One series per loaded cell line is passed
    to the template, plus an optional data table that is also exported as
    tab-separated text.

    Selected cell lines without stored data are skipped. Selecting only WT, or
    only unknown cell lines, renders the page instead of raising.

    Returns:
        The rendered ``main.html`` template (None for other HTTP methods).
    """
    # Function-local import, as in the other views of this file.
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = [name.decode('utf-8') for name in rdb.smembers('cell_lines')]

    if request.method == 'GET':
        return render_template('main.html', cell_lines=cell_lines)
    if request.method != 'POST':
        return None

    selected_cell_lines = request.form.getlist('cell_lines')
    increased_essentiality = request.form.get('increased_essentiality') is not None
    show_data_table = request.form.get('show_data_table') is not None

    # filters
    apply_filters = request.form.get('apply_filters') is not None
    wt_fc_max = float(request.form.get('wt_fc_max'))
    wt_fc_min = float(request.form.get('wt_fc_min'))
    wt_pval = float(request.form.get('wt_pval'))
    wt_pval_less_or_greater = request.form.get('wt_pval_less_or_greater')
    other_fc_max = float(request.form.get('other_fc_max'))
    other_fc_min = float(request.form.get('other_fc_min'))
    other_pval = float(request.form.get('other_pval'))
    other_pval_less_or_greater = request.form.get('other_pval_less_or_greater')

    joint_df = None  # filtered values of the non-WT cell lines
    wt_df = None     # filtered WT values
    full_df = None   # unfiltered values, used for the data table
    loaded_cell_lines = []
    for cell_line in selected_cell_lines:
        data = rdb.get(cell_line)
        if not data:
            continue
        # Copy so the assignments below act on an independent frame.
        df = pd.read_msgpack(data)[['gene_id', 'fc', 'pval', 'inc_ess']].copy()
        # The conversions were previously computed and thrown away.
        df['fc'] = df['fc'].astype(float)
        df['pval'] = df['pval'].astype(float)

        fc_column = '{}_fc'.format(cell_line)
        pval_column = '{}_pval'.format(cell_line)
        inc_ess_column = '{}_inc_ess'.format(cell_line)

        if show_data_table:
            # to fill the data table correctly: WT has no inc_ess column there
            if cell_line != 'WT':
                to_merge = df.round(decimals=3)
                to_merge.columns = ['gene_id', fc_column, pval_column, inc_ess_column]
            else:
                to_merge = df[['gene_id', 'fc', 'pval']].round(decimals=3)
                to_merge.columns = ['gene_id', fc_column, pval_column]
            if full_df is None:
                full_df = to_merge
            else:
                full_df = pd.merge(full_df, to_merge, how='outer', on='gene_id')

        if apply_filters:
            if cell_line == 'WT':
                fc_min, fc_max = wt_fc_min, wt_fc_max
                pval_limit, direction = wt_pval, wt_pval_less_or_greater
            else:
                fc_min, fc_max = other_fc_min, other_fc_max
                pval_limit, direction = other_pval, other_pval_less_or_greater
            df = df.loc[(df['fc'] >= fc_min) & (df['fc'] <= fc_max)]
            if direction == 'greater':
                df = df.loc[df['pval'] >= pval_limit]
            else:
                df = df.loc[df['pval'] <= pval_limit]

        df = df.round(decimals=3)
        df.columns = ['gene_id', fc_column, pval_column, inc_ess_column]
        if cell_line == 'WT':
            df['WT_inc_ess'] = 'n/a'
            wt_df = df
        elif joint_df is None:
            joint_df = df
        else:
            joint_df = pd.merge(joint_df, df, how='outer', on='gene_id')
        loaded_cell_lines.append(cell_line)

    if wt_df is not None:
        if joint_df is None:
            # Only WT was selected: there is nothing to join it with.
            joint_df = wt_df
        else:
            joint_df = pd.merge(joint_df, wt_df, how='inner', on='gene_id')
    if joint_df is None:
        # Nothing could be loaded at all.
        joint_df = pd.DataFrame(columns=['gene_id'])

    if increased_essentiality:
        # Keep the genes flagged 'yes' in at least one non-WT cell line.
        # With no non-WT cell line nothing can be flagged, so nothing is
        # kept, as before.
        inc_ess_columns = ['{}_inc_ess'.format(cell_line)
                           for cell_line in loaded_cell_lines if cell_line != 'WT']
        if inc_ess_columns:
            flagged = joint_df[inc_ess_columns].isin(['yes']).any(axis=1)
            joint_df = joint_df.loc[flagged]
        else:
            joint_df = joint_df.iloc[0:0]

    genes = list(joint_df['gene_id'])

    plot_series = []
    for cell_line in loaded_cell_lines:
        columns = ['{}_fc'.format(cell_line),
                   '{}_pval'.format(cell_line),
                   '{}_inc_ess'.format(cell_line)]
        df = joint_df[['gene_id'] + columns].copy()
        series_length = len(df.dropna())
        # do not dropna!!! It aligns all genes to the left.
        # Missing values are sent as the string "null". Per the original
        # author, highcharts.js was patched (error 14 check removed) to
        # accept them.
        df = df.fillna("null")
        df.columns = ['name', 'y', 'pval', 'inc_ess']
        plot_series.append({
            'name': cell_line,
            'turboThreshold': len(df),
            'series_length': series_length,
            'data': list(df.T.to_dict().values()),
        })

    data_table = None
    # full_df is None when no selected cell line could be loaded.
    if show_data_table and full_df is not None:
        data_table_df = full_df[full_df['gene_id'].isin(genes)].copy()
        data_table_df.insert(0, '#', range(1, len(data_table_df) + 1))
        data_table = {
            'header': data_table_df.columns,
            'rows': data_table_df.values.tolist(),
            'csv': data_table_df.to_csv(sep='\t', index=False)
        }

    # data for normalized counts
    counts_series = {}

    return render_template('main.html',
                           cell_lines=cell_lines,
                           genes=genes,
                           plot_series=plot_series,
                           selected_cell_lines=selected_cell_lines,
                           data_table=data_table,
                           counts_series=counts_series,
                           increased_essentiality=increased_essentiality,
                           apply_filters=apply_filters,
                           selected_filters={
                               'wt_fc_max': wt_fc_max,
                               'wt_fc_min': wt_fc_min,
                               'wt_pval': wt_pval,
                               'wt_pval_less_or_greater': wt_pval_less_or_greater,
                               'other_fc_max': other_fc_max,
                               'other_fc_min': other_fc_min,
                               'other_pval': other_pval,
                               'other_pval_less_or_greater': other_pval_less_or_greater,
                           })