Пример #1
0
def show_heatmap():
    """Render the heatmap page and, on POST, the fold-change heatmap data.

    The known cell lines are read from the ``cell_lines`` set of the store
    returned by ``get_db()`` and always passed to ``heatmap.html``.

    On POST the form fields ``cell_lines`` (multi-valued) and
    ``selected_genes`` (whitespace-separated gene ids) select what to plot.
    For every selected cell line the stored frame is loaded, its columns are
    prefixed with the cell line name, and all frames are outer-merged on
    ``gene_id``.  One point per (gene, cell line) is emitted, where ``x`` is
    the cell line's position in the submitted selection and ``y`` is the row
    index of the gene in the merged frame.

    Returns:
        The rendered ``heatmap.html`` template.
    """
    from crispr_analysis import get_db
    rdb = get_db()

    # Set members are decoded from UTF-8 before being handed to the template.
    cell_lines = [
        cell_line.decode('utf-8') for cell_line in rdb.smembers('cell_lines')
    ]

    if request.method != 'POST':
        return render_template('heatmap.html', cell_lines=cell_lines)

    selected_cell_lines = request.form.getlist('cell_lines')
    # A missing field is treated like an empty gene list instead of crashing.
    selected_genes = request.form.get('selected_genes', '').strip().split()

    joint_df = None
    loaded_cell_lines = set()
    for cell_line in selected_cell_lines:
        data = rdb.get(cell_line)
        if not data:
            # Nothing stored under this name: skip it rather than fail.
            continue
        # Copy so the column assignments below do not write into a slice.
        frame = pd.read_msgpack(data)[['gene_id', 'fc', 'pval', 'inc_ess']].copy()
        if cell_line == 'WT':
            frame['inc_ess'] = 'n/a'
        # astype() returns a new Series, so the result has to be assigned.
        frame['fc'] = frame['fc'].astype(float)
        frame['pval'] = frame['pval'].astype(float)
        frame.columns = [
            'gene_id',
            '{}_fc'.format(cell_line),
            '{}_pval'.format(cell_line),
            '{}_inc_ess'.format(cell_line),
        ]
        loaded_cell_lines.add(cell_line)
        if joint_df is None:
            joint_df = frame
        else:
            joint_df = pd.merge(joint_df, frame, how='outer', on='gene_id')

    if joint_df is None:
        # No selection, or none of the selected cell lines has stored data.
        return render_template('heatmap.html', cell_lines=cell_lines,
                               selected_cell_lines=selected_cell_lines,
                               selected_genes=[], plot_series=[])

    joint_df = joint_df.loc[joint_df['gene_id'].isin(selected_genes)]
    joint_df = joint_df.reset_index(drop=True).round(decimals=3)

    plot_series = []
    for y, row in joint_df.iterrows():
        for x, cell_line in enumerate(selected_cell_lines):
            if cell_line not in loaded_cell_lines:
                continue
            fold_change = row['{}_fc'.format(cell_line)]
            plot_series.append({
                'x': x,
                'y': y,
                'cell_line': cell_line,
                'gene_id': row['gene_id'],
                'pval': row['{}_pval'.format(cell_line)],
                'fc': fold_change,
                'value': fold_change,
                'inc_ess': row['{}_inc_ess'.format(cell_line)],
            })

    # Report the genes in the order they appear in the merged frame so the
    # labels match the ``y`` indices above.
    selected_genes = joint_df['gene_id'].tolist()

    return render_template('heatmap.html', cell_lines=cell_lines,
                           selected_cell_lines=selected_cell_lines,
                           selected_genes=selected_genes,
                           plot_series=plot_series)
def import_data():
    """Handle the data-import form: load an uploaded table or flush the store.

    Non-POST requests just render ``data_import.html``.  A POST dispatches on
    the ``data_type`` form field:

    * ``fold_changes`` -- stores one msgpack frame per cell line.
    * ``norm_counts`` -- stores one ``<gene>_counts`` frame per gene; the
      ``with_drugs`` field selects the plain (``without_drugs``) or the
      day 0/7/21 (``with_drugs``) input layout.
    * ``flush_db`` -- calls ``flushall()`` on the store.

    When the ``notify`` field equals ``'true'`` and ``email`` is non-empty,
    progress, success and failure messages are also passed to ``send_email``.
    Any exception raised during the import is caught and its traceback is
    rendered as the page's error message.

    NOTE(review): ``to_msgpack``/``read_msgpack`` were removed in pandas 1.0,
    so this code needs an older pandas -- confirm the pinned version.

    Returns:
        The rendered ``data_import.html`` template.
    """
    from crispr_analysis import get_db
    rdb = get_db()

    if request.method == 'POST':
        data_type = request.form.get('data_type')
        with_drugs = request.form.get('with_drugs')
        input_file = request.files.get('input_file')
        if data_type != 'flush_db' and input_file is None:
            return render_template('data_import.html',
                                   selected_data_type=data_type,
                                   with_drugs=with_drugs,
                                   error="Please select file!")
        email = request.form.get('email')
        notify = request.form.get('notify') == 'true'
        # Address to notify, or None when notifications are off / no address.
        recipient = email if notify and email else None
        try:
            if data_type == "fold_changes":
                return _import_fold_changes(rdb, input_file, recipient)
            elif data_type == "norm_counts":
                if with_drugs == 'without_drugs':
                    return _import_counts_without_drugs(
                        rdb, input_file, data_type, with_drugs, recipient)
                elif with_drugs == 'with_drugs':
                    return _import_counts_with_drugs(
                        rdb, input_file, data_type, with_drugs, recipient)
            elif data_type == "flush_db":
                rdb.flushall()
                if recipient:
                    send_email(recipient, 'Database succesfully flushed')
                return render_template('data_import.html',
                                       success='Database successfully flushed')
        except Exception:
            import traceback
            details = traceback.format_exc()
            if recipient:
                send_email(recipient, details)
            return render_template(
                'data_import.html',
                error='Import failed, error:\n {}'.format(details))
    return render_template('data_import.html')


def _header_error(full_df, required, filename):
    """Return an error text if ``full_df`` lacks a required column, else ''."""
    if set(required).issubset(set(full_df.columns)):
        return ''
    error = 'ERROR: incorrect header! \n'
    error += 'Required columns: {}\n'.format(', '.join(required))
    error += "Given: {}\n".format(', '.join(full_df.columns))
    error += 'Filename: {}\n'.format(filename)
    error += "Please read the instructions once again!!"
    return error


def _notify_progress(recipient, index, total, tail_announced):
    """E-mail a progress line every 100 genes and once for the final batch.

    Returns the updated ``tail_announced`` flag, which becomes True as soon
    as the final (shorter than 100 genes) batch has been reached.
    """
    if index + 100 < total:
        if index % 100 == 0 and recipient:
            message = 'writing {} to {} out of {} genes'.format(
                index, index + 100, total)
            send_email(recipient, message=message)
        return tail_announced
    if not tail_announced:
        if recipient:
            # Positional call kept as in the original code; the signature of
            # send_email is not visible here.
            send_email(
                recipient,
                'writing {} to {} out of {} genes'.format(index, total, total))
        return True
    return tail_announced


def _comparison_rows(day_df, label, treatment):
    """Reshape one day's rows into the stored counts layout.

    The ``sample`` column is renamed to ``treatment`` and overwritten with the
    given flag (0 = before, 1 = after); ``cell_line`` is replaced by ``label``.
    """
    rows = day_df[['gene_id', 'cell_line', 'sample', 'norm_counts']].copy()
    rows.columns = ['gene_id', 'cell_line', 'treatment', 'norm_counts']
    rows['cell_line'] = label
    rows['treatment'] = treatment
    return rows


def _import_fold_changes(rdb, input_file, recipient):
    """Store one ``gene_id/fc/pval/inc_ess`` frame per cell line."""
    # The same layout is used with and without drugs.
    df = pd.read_csv(input_file, sep='\t')
    genes = df['gene_id'].tolist()

    # Cell line names are the part of each column name before '_fc'.
    cell_lines = []
    for column in df.columns:
        if '_fc' in column:
            cell_line = column.split('_fc')[0]
            if cell_line not in cell_lines:
                cell_lines.append(cell_line)  # todo: unicode

    stored_columns = ['gene_id', 'fc', 'pval', 'inc_ess']
    for cell_line in cell_lines:
        inc_ess = 'increasedEssential_{}'.format(cell_line)
        if inc_ess not in df.columns:
            df[inc_ess] = 'n/a'
        # Select only the columns of this cell line and give them the
        # generic names the readers expect.
        current_df = df[[
            'gene_id', '{}_fc'.format(cell_line),
            '{}_pval'.format(cell_line), inc_ess
        ]].copy()
        current_df.columns = stored_columns
        rdb.set(cell_line, current_df.to_msgpack(encoding='utf-8'))

    rdb.sadd('cell_lines', *set(cell_lines))
    rdb.sadd('genes', *set(genes))
    success = "Data successfully imported. \nFile:{} \nCell lines: {} \nGenes:{}".format(
        input_file.filename, len(cell_lines), len(genes))
    if recipient:
        send_email(recipient, message=success)
    return render_template('data_import.html', success=success)


def _import_counts_without_drugs(rdb, input_file, data_type, with_drugs,
                                 recipient):
    """Store the normalized counts of every gene under ``<gene>_counts``."""
    full_df = pd.read_csv(input_file, sep='\t')
    required = ['gene_id', 'cell_line', 'treatment', 'norm_counts']
    error = _header_error(full_df, required, input_file.filename)
    if error:
        return render_template(
            'data_import.html',
            selected_data_type=data_type,
            with_drugs=with_drugs,
            error=error,
        )

    genes = full_df['gene_id'].unique()
    cell_lines = full_df['cell_line'].unique()
    tail_announced = False
    for i, gene in enumerate(genes):
        gene_df = full_df.loc[full_df['gene_id'] == gene, required]
        tail_announced = _notify_progress(recipient, i, len(genes),
                                          tail_announced)
        rdb.set('{}_counts'.format(gene), gene_df.to_msgpack())

    success = "Data successfully imported. \nFile: {}\nCell lines: {} \nGenes:{}".format(
        input_file.filename, len(cell_lines), len(genes))
    if recipient:
        send_email(recipient, message=success)
    return render_template('data_import.html', success=success)


def _import_counts_with_drugs(rdb, input_file, data_type, with_drugs,
                              recipient):
    """Append day 0/7/21 comparison rows to every gene's stored counts.

    For each gene, cell line and sample that has day-21 rows, three
    before/after comparisons are appended to the frame already stored under
    ``<gene>_counts``: day0vs7, day7vs21 and day0vs21.  The import stops with
    an error as soon as a gene has no stored frame; genes processed before
    that point have already been written.
    """
    full_df = pd.read_csv(input_file, sep='\t')
    required = ['cell_line', 'sample', 'gene_id', 'day', 'norm_counts']
    error = _header_error(full_df, required, input_file.filename)
    if error:
        return render_template(
            'data_import.html',
            selected_data_type=data_type,
            with_drugs=with_drugs,
            error=error,
        )

    cell_lines = full_df['cell_line'].unique()
    genes = full_df['gene_id'].unique()
    drugs = full_df['sample'].unique()  # only matched against day-21 rows

    tail_announced = False
    for i, gene in enumerate(genes):
        key = '{}_counts'.format(gene)
        msg_pack = rdb.get(key)
        if msg_pack is None:
            error = "Can't import {}. Did you import Normalized Counts WITHOUT drugs?\nCould not find gene {}".format(
                input_file.filename, gene)
            if recipient:
                send_email(recipient, message=error)
            return render_template('data_import.html', error=error)

        # Collect all pieces and concatenate once; appending frame by frame
        # copies the growing frame on every step.
        frames = [pd.read_msgpack(msg_pack)]
        gene_rows = full_df.loc[full_df['gene_id'] == gene]
        for cell_line in cell_lines:
            line_rows = gene_rows.loc[gene_rows['cell_line'] == cell_line]
            # Every comparison needs the day 0, day 7 and day 21 rows.
            day0_df = line_rows.loc[line_rows['day'] == 0]
            day7_df = line_rows.loc[line_rows['day'] == 7]
            day21_rows = line_rows.loc[line_rows['day'] == 21]
            for drug in drugs:
                day21_df = day21_rows.loc[day21_rows['sample'] == drug]
                # Samples that only exist before day 21 have no day-21 rows.
                if day21_df.empty:
                    continue
                prefix = '{}_{}'.format(cell_line, drug)
                frames.extend([
                    _comparison_rows(day0_df, prefix + '_day0vs7', 0),
                    _comparison_rows(day7_df, prefix + '_day0vs7', 1),
                    _comparison_rows(day7_df, prefix + '_day7vs21', 0),
                    _comparison_rows(day21_df, prefix + '_day7vs21', 1),
                    # The baseline of day0vs21 is day 0 (it used to be
                    # copied from the day-7 rows by mistake).
                    _comparison_rows(day0_df, prefix + '_day0vs21', 0),
                    _comparison_rows(day21_df, prefix + '_day0vs21', 1),
                ])
        existing_df = pd.concat(frames, ignore_index=True)

        tail_announced = _notify_progress(recipient, i, len(genes),
                                          tail_announced)
        # Written inside the loop so that every gene is persisted, not only
        # the last one.
        rdb.set(key, existing_df.to_msgpack())

    success = "Data succesfully imported.\nFile: {} \nCell lines: {}\nGenes: {}".format(
        input_file.filename, str(cell_lines), len(genes))
    if recipient:
        send_email(recipient, success)
    return render_template(
        'data_import.html',
        selected_data_type=data_type,
        with_drugs=with_drugs,
        success=success,
    )
Пример #3
0
def get_norm_counts(gene, cell_lines):
    """Return box-plot data of the normalized counts of one gene as JSON.

    Args:
        gene: Gene id; the counts are read from the key ``<gene>_counts``.
        cell_lines: Comma-separated cell line names.

    Returns:
        A ``jsonify`` response with two keys.  ``data`` holds three series:
        "Before Treatment" and "After Treatment" (one
        ``[min, q1, median, q3, max]`` entry per cell line that has counts,
        computed on rows within three standard deviations of the cell line's
        mean) and "Outliers" (one point per row outside that range).
        ``errors`` lists messages for cell lines or treatment groups without
        counts.  A treatment group without rows yields a ``null`` entry so
        the two box series keep the same length.
    """
    # this import has to be here!!
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = cell_lines.split(',')
    msgpack = rdb.get('{}_counts'.format(gene))
    if msgpack is None:
        return jsonify({
            'data': [],
            'errors': ['No counts for gene {} found'.format(gene)]
        })
    gene_df = pd.read_msgpack(msgpack)

    series_before = []
    series_after = []
    outliers = []
    error_messages = []
    for i, cell_line in enumerate(cell_lines):
        df = gene_df.loc[gene_df['cell_line'] == cell_line]
        if df.empty:
            # NOTE(review): skipped cell lines get no box entry, while the
            # outlier points below use the position in the requested list --
            # confirm how the client aligns the two.
            error_messages.append('No counts for gene: {} and cell line: {}. '.format(gene, cell_line))
            continue

        # Keep only rows within +3 to -3 standard deviations of the mean.
        counts = df['norm_counts']
        deviation = np.abs(counts - counts.mean())
        limit = 3 * counts.std()
        if pd.isnull(limit):
            # A single row has no standard deviation; comparing against NaN
            # would drop the row from both subsets, so treat it as regular.
            without_outliers = df
            only_outliers = df.iloc[0:0]
        else:
            without_outliers = df[deviation <= limit]
            only_outliers = df[deviation > limit]

        groups = (
            (0, 'before', series_before),
            (1, 'after', series_after),
        )
        for flag, label, series in groups:
            selected = without_outliers.loc[without_outliers['treatment'] == flag]
            summary = _box_summary(selected['norm_counts'])
            if summary is None:
                error_messages.append(
                    'No {} treatment counts for gene: {} and cell line: {}. '.format(
                        label, gene, cell_line))
            series.append(summary)

        for _, row in only_outliers.iterrows():
            treatment = 'Before Treatment' if int(row['treatment']) == 0 else 'After Treatment'
            outliers.append({
                'x': i,
                'y': round(float(row['norm_counts']), 3),
                'treatment': treatment,
                'cell_line': cell_line,
                'color': 'black' if treatment == 'After Treatment' else '#7cb5ec'
            })

    counts_series = [{
        'name': 'Before Treatment',
        'data': series_before,
        'color': '#7cb5ec',
    }, {
        'name': 'After Treatment',
        'data': series_after,
        'color': 'black',
    }, {
        'name': 'Outliers',
        'type': 'scatter',
        'data': outliers,
        'color': 'black',
        'tooltip': {
            'pointFormat': '<br>cell line: {point.cell_line}<br>norm. counts: {point.y}<br>treatment: {point.treatment}',
        }
    }]
    return jsonify({
        'data': counts_series,
        'errors': error_messages,
    })


def _box_summary(values):
    """Return ``[min, q1, median, q3, max]`` rounded to 3 decimals, or None.

    None is returned when ``values`` holds no non-missing number, because the
    five-number summary is undefined then.
    """
    values = values.dropna()
    if values.empty:
        return None
    q1, median, q3 = values.quantile([0.25, 0.5, 0.75]).round(decimals=3).tolist()
    # float() turns numpy scalars into plain numbers for JSON encoding.
    return [
        round(float(values.min()), 3),
        q1,
        median,
        q3,
        round(float(values.max()), 3),
    ]
Пример #4
0
def compare_cell_lines():
    """Render the cell line comparison page (scatter data and data table).

    GET renders ``compare.html`` with the known cell lines.  POST reads the
    x-axis cell line (``x_axis``) and the y-axis cell lines
    (``y_axis_multiple``) from the form, loads their stored frames, optionally
    filters them by maximum fold change and p-value threshold, and builds one
    plot series per y-axis cell line.  Each series holds the genes that have
    complete values on both axes.  When ``show_data_table`` is set, a table
    with the unfiltered y-axis values and the x-axis values of the plotted
    genes is built as well.

    Raises:
        TypeError, ValueError: If one of the numeric filter fields
            (``x_fc_max``, ``x_pval``, ``y_fc_max``, ``y_pval``) is missing or
            not a number; they are parsed even when filters are not applied
            because they are echoed back to the template.

    Returns:
        The rendered ``compare.html`` template.
    """
    from crispr_analysis import get_db
    rdb = get_db()

    cell_lines = [
        cell_line.decode('utf-8') for cell_line in rdb.smembers('cell_lines')
    ]

    if request.method == 'GET':
        return render_template('compare.html', cell_lines=cell_lines)

    if request.method == 'POST':
        x_axis = request.form.get('x_axis')
        y_axis_multiple = request.form.getlist('y_axis_multiple')

        # here it doesn't matter how to plot, the data will be the same
        how_to_plot = request.form.get('how_to_plot')
        show_data_table = request.form.get('show_data_table') is not None

        # filters
        apply_filters = request.form.get('apply_filters') is not None
        x_fc_max = float(request.form.get('x_fc_max'))
        x_pval = float(request.form.get('x_pval'))
        x_pval_less_or_greater = request.form.get('x_pval_less_or_greater')
        y_fc_max = float(request.form.get('y_fc_max'))
        y_pval = float(request.form.get('y_pval'))
        y_pval_less_or_greater = request.form.get('y_pval_less_or_greater')

        x_stored_df = pd.read_msgpack(rdb.get(x_axis))
        # The data table shows inc_ess for every non-WT cell line, so carry
        # that column along for the x axis when the table needs it.
        include_x_inc_ess = show_data_table and 'WT' not in x_axis
        x_source_columns = ['gene_id', 'fc', 'pval']
        x_plot_columns = ['gene_id', 'x', 'x_pval']
        if include_x_inc_ess:
            x_source_columns.append('inc_ess')
        x_full_df = x_stored_df[x_source_columns].round(decimals=3)
        x_full_df.columns = x_plot_columns + (
            ['x_inc_ess'] if include_x_inc_ess else [])

        if apply_filters:
            x_full_df = x_full_df.loc[x_full_df['x'] <= x_fc_max]
            if x_pval_less_or_greater == 'greater':
                x_full_df = x_full_df.loc[x_full_df['x_pval'] >= x_pval]
            else:
                x_full_df = x_full_df.loc[x_full_df['x_pval'] <= x_pval]
        # Only these three columns take part in the plot merge below.
        x_axis_df = x_full_df[x_plot_columns]

        joint_df = None  # filtered y-axis values, used for plotting
        full_df = None  # unfiltered y-axis values, used for the data table
        for cell_line in y_axis_multiple:
            df = pd.read_msgpack(rdb.get(cell_line))
            df = df[['gene_id', 'fc', 'pval', 'inc_ess']].copy()
            # astype() returns a new Series, so the result has to be assigned.
            df['fc'] = df['fc'].astype(float)
            df['pval'] = df['pval'].astype(float)

            fc_column = '{}_fc'.format(cell_line)
            pval_column = '{}_pval'.format(cell_line)
            inc_ess_column = '{}_inc_ess'.format(cell_line)

            if show_data_table:
                # WT has no inc_ess column in the data table.
                if cell_line == 'WT':
                    to_merge = df[['gene_id', 'fc', 'pval']].round(decimals=3)
                    to_merge.columns = ['gene_id', fc_column, pval_column]
                else:
                    to_merge = df.round(decimals=3)
                    to_merge.columns = [
                        'gene_id', fc_column, pval_column, inc_ess_column
                    ]
                if full_df is None:
                    full_df = to_merge
                else:
                    full_df = pd.merge(full_df, to_merge, how='outer',
                                       on='gene_id')

            if apply_filters:
                df = df.loc[df['fc'] <= y_fc_max]
                if y_pval_less_or_greater == 'greater':
                    df = df.loc[df['pval'] >= y_pval]
                else:
                    df = df.loc[df['pval'] <= y_pval]

            df = df.round(decimals=3)
            df.columns = ['gene_id', fc_column, pval_column, inc_ess_column]
            if joint_df is None:
                joint_df = df
            else:
                joint_df = pd.merge(joint_df, df, how='outer', on='gene_id')

        plot_series = []
        for cell_line in y_axis_multiple:
            df = joint_df[[
                'gene_id', '{}_fc'.format(cell_line),
                '{}_pval'.format(cell_line), '{}_inc_ess'.format(cell_line)
            ]].copy()
            df.columns = ['gene_id', 'y', 'y_pval', 'inc_ess']
            # Keep only genes that have complete values on both axes.
            df = pd.merge(df, x_axis_df, how='outer', on='gene_id').dropna()
            plot_series.append({
                'name': cell_line,
                'data': list(df.T.to_dict().values()),
                'turboThreshold': len(df),
                'series_length': len(df)
            })

        data_table = None
        # joint_df is None when no y-axis cell line was submitted.
        if show_data_table and joint_df is not None:
            genes = joint_df['gene_id'].tolist()
            data_table_df = full_df[full_df['gene_id'].isin(genes)]

            # Rename a copy so the number of names always matches the number
            # of columns (three for WT, four otherwise).
            x_table_df = x_full_df.copy()
            x_columns = ['gene_id', '{}_fc'.format(x_axis), '{}_pval'.format(x_axis)]
            if include_x_inc_ess:
                x_columns.append('{}_inc_ess'.format(x_axis))
            x_table_df.columns = x_columns
            data_table_df = pd.merge(data_table_df, x_table_df, how='inner', on='gene_id')

            data_table_df.insert(0, '#', range(1, len(data_table_df) + 1))
            data_table = {
                'header': data_table_df.columns,
                'rows': data_table_df.values.tolist(),
                'csv': data_table_df.to_csv(sep='\t', index=False)
            }

        return render_template('compare.html', cell_lines=cell_lines, x_axis=x_axis, y_axis_multiple=y_axis_multiple,
                               plot_series=plot_series, how_to_plot=how_to_plot, data_table=data_table, apply_filters=apply_filters,
                               selected_filter={
                                    'x_fc_max': x_fc_max,
                                    'x_pval': x_pval,
                                    'x_pval_less_or_greater': x_pval_less_or_greater,
                                    'y_fc_max': y_fc_max,
                                    'y_pval': y_pval,
                                    'y_pval_less_or_greater': y_pval_less_or_greater,
                               })
Пример #5
0
def get_log_plots():
    """Render the log-plot page: log2 fold change against -log10 p-value.

    GET renders ``log_plots.html`` with the known cell lines.  POST loads the
    frame of the cell line named by the ``cell_line`` form field and splits
    its genes into three series using the ``left``, ``right`` and ``bottom``
    form thresholds:

    * blue -- ``fc <= left`` and ``pval <= bottom``,
    * red -- ``fc >= right`` and ``pval <= bottom``,
    * grey -- every other gene with complete values.

    Blue/red genes whose p-value is 0 have no finite ``-log10`` value; they
    are drawn 0.5 above the largest finite value.  Genes with a non-positive
    fold change have no ``log2`` value and are left out of the plot.

    Raises:
        TypeError, ValueError: If a threshold field is missing, not a number,
            or not positive (the thresholds are log-transformed for the
            dashed guide lines).

    Returns:
        The rendered ``log_plots.html`` template.
    """
    from crispr_analysis import get_db
    rdb = get_db()
    cell_lines = [
        cell_line.decode('utf-8') for cell_line in rdb.smembers('cell_lines')
    ]

    if request.method == 'GET':
        return render_template('log_plots.html', cell_lines=cell_lines)

    if request.method == 'POST':
        cell_line = request.form['cell_line']
        # Copy so that adding columns below does not write into a slice.
        df = pd.read_msgpack(rdb.get(cell_line))[['gene_id', 'fc', 'pval']].copy()

        #  dash lines
        left = float(request.form.get('left'))
        right = float(request.form.get('right'))
        bottom = float(request.form.get('bottom'))

        left_line = round(math.log2(left), 2)
        right_line = round(math.log2(right), 2)
        bottom_line = -round(math.log10(bottom), 2)

        genes = df['gene_id'].tolist()
        # Logarithms are only defined for positive input; everything else
        # becomes NaN instead of raising.
        df['x'] = np.log2(df['fc'].where(df['fc'] > 0))
        df['y'] = -np.log10(df['pval'].where(df['pval'] > 0))
        df = df.round(decimals=3)

        is_left = (df['fc'] <= left) & (df['pval'] <= bottom)
        is_right = (df['fc'] >= right) & (df['pval'] <= bottom)
        left_df = df.loc[is_left]
        right_df = df.loc[is_right]
        bottom_df = df.loc[~(is_left | is_right)].dropna()

        max_val = df['y'].max()
        if not pd.isnull(max_val):
            # Only the y value is replaced; rows missing anything else are
            # dropped when the series data is built.
            capped_y = {'y': round(float(max_val), 3) + 0.5}
            left_df = left_df.fillna(capped_y)
            right_df = right_df.fillna(capped_y)

        def build_series(frame, color):
            """Return the chart series dict for one group of genes."""
            return {
                'name': cell_line,
                'data': list(frame.dropna().T.to_dict().values()),
                'turboThreshold': len(frame),
                'color': color,
                'marker': {
                    'symbol': 'circle',
                    'radius': 5,
                },
            }

        plot_series = [
            build_series(bottom_df, 'grey'),
            build_series(left_df, 'blue'),
            build_series(right_df, 'red'),
        ]
        return render_template('log_plots.html',
                               cell_lines=cell_lines,
                               selected_cell_line=cell_line,
                               plot_series=plot_series,
                               genes=genes,
                               right=right_line,
                               left=left_line,
                               bottom=bottom_line,
                               selected_thresholds={
                                   'left': left,
                                   'right': right,
                                   'bottom': bottom,
                               })
Пример #6
0
def show_scatter_plot():
    """Render the main scatter-plot page for the selected cell lines.

    On GET the empty ``main.html`` form is rendered. On POST every selected
    cell line is fetched from the store returned by ``get_db()`` (msgpack
    encoded data frames), optionally filtered by fold-change / p-value
    thresholds and by increased essentiality, and turned into one plot series
    per cell line plus an optional data table.

    Form fields read on POST: ``cell_lines`` (list), the checkboxes
    ``increased_essentiality``, ``show_data_table`` and ``apply_filters``, and
    the threshold fields ``wt_fc_max``, ``wt_fc_min``, ``wt_pval``,
    ``wt_pval_less_or_greater``, ``other_fc_max``, ``other_fc_min``,
    ``other_pval`` and ``other_pval_less_or_greater``.

    Returns:
        The rendered ``main.html`` template for GET and POST requests, and
        ``None`` for any other request method.

    Raises:
        TypeError, ValueError: if one of the numeric threshold fields is
            missing from the form or is not a valid float.
    """
    from crispr_analysis import get_db
    rdb = get_db()

    # The store hands back bytes, so the names are decoded by hand.
    cell_lines = [cell_line.decode('utf-8') for cell_line in rdb.smembers('cell_lines')]

    if request.method == 'GET':
        return render_template('main.html', cell_lines=cell_lines)

    if request.method != 'POST':
        return None

    form = request.form
    selected_cell_lines = form.getlist('cell_lines')
    increased_essentiality = form.get('increased_essentiality') is not None
    show_data_table = form.get('show_data_table') is not None
    apply_filters = form.get('apply_filters') is not None

    # Thresholds are parsed even when filtering is off, because they are
    # echoed back to the template to re-populate the form.
    selected_filters = {
        'wt_fc_max': float(form.get('wt_fc_max')),
        'wt_fc_min': float(form.get('wt_fc_min')),
        'wt_pval': float(form.get('wt_pval')),
        'wt_pval_less_or_greater': form.get('wt_pval_less_or_greater'),
        'other_fc_max': float(form.get('other_fc_max')),
        'other_fc_min': float(form.get('other_fc_min')),
        'other_pval': float(form.get('other_pval')),
        'other_pval_less_or_greater': form.get('other_pval_less_or_greater'),
    }

    joint_df = None   # outer join of all non-WT cell lines
    wt_df = None      # WT is kept apart and inner-joined at the end
    full_df = None    # unfiltered values, used only for the data table
    # Cell lines without stored data are skipped; later stages must only
    # touch the lines whose columns really exist in the joined frame.
    loaded_cell_lines = []
    for cell_line in selected_cell_lines:
        data = rdb.get(cell_line)
        if not data:
            continue
        loaded_cell_lines.append(cell_line)
        is_wt = cell_line == 'WT'

        df = pd.read_msgpack(data)[['gene_id', 'fc', 'pval', 'inc_ess']]
        # astype() returns a new object; the result has to be stored for the
        # conversion to take effect.
        df = df.assign(fc=df['fc'].astype(float), pval=df['pval'].astype(float))

        if show_data_table:
            # The data table shows unfiltered values; WT has no inc_ess column.
            to_merge = df[['gene_id', 'fc', 'pval']] if is_wt else df.copy()
            to_merge = to_merge.round(decimals=3)
            columns = ['gene_id', '{}_fc'.format(cell_line), '{}_pval'.format(cell_line)]
            if not is_wt:
                columns.append('{}_inc_ess'.format(cell_line))
            to_merge.columns = columns
            full_df = to_merge if full_df is None else pd.merge(full_df, to_merge, how='outer', on='gene_id')

        if apply_filters:
            prefix = 'wt' if is_wt else 'other'
            df = _filter_by_thresholds(
                df,
                selected_filters[prefix + '_fc_min'],
                selected_filters[prefix + '_fc_max'],
                selected_filters[prefix + '_pval'],
                selected_filters[prefix + '_pval_less_or_greater'],
            )
        df = df.round(decimals=3)
        df.columns = ['gene_id', '{}_fc'.format(cell_line), '{}_pval'.format(cell_line),
                      '{}_inc_ess'.format(cell_line)]

        if is_wt:
            df['WT_inc_ess'] = 'n/a'
            wt_df = df
        else:
            joint_df = df.copy() if joint_df is None else pd.merge(joint_df, df, how='outer', on='gene_id')

    if joint_df is None:
        # Only WT (or nothing at all) was loaded: there is nothing to join with.
        joint_df = wt_df if wt_df is not None else pd.DataFrame(columns=['gene_id'])
    elif wt_df is not None:
        joint_df = pd.merge(joint_df, wt_df, how='inner', on='gene_id')

    if increased_essentiality:
        # Keep only genes flagged 'yes' in at least one non-WT cell line.
        inc_ess_columns = ['{}_inc_ess'.format(cell_line)
                           for cell_line in loaded_cell_lines if cell_line != 'WT']
        if inc_ess_columns:
            keep = joint_df[inc_ess_columns].isin(['yes']).any(axis=1)
            joint_df = joint_df[keep]
        else:
            # No non-WT line means no gene can qualify.
            joint_df = joint_df.iloc[0:0]

    genes = joint_df['gene_id'].tolist()

    plot_series = []
    for cell_line in loaded_cell_lines:
        columns = ['{}_fc'.format(cell_line), '{}_pval'.format(cell_line),
                   '{}_inc_ess'.format(cell_line)]
        df = joint_df[['gene_id'] + columns].copy()

        # All points are shown for every series; per-cell-line masking of
        # non-essential points is intentionally not applied.
        series_length = len(df.dropna())
        # Do not dropna here: dropping rows would align all genes to the left.
        # NOTE: the "null" strings rely on a patched highcharts.js in which the
        # error-14 check ("&&a.error(14,!0)") was removed.
        df = df.fillna("null")
        df.columns = ['name', 'y', 'pval', 'inc_ess']
        plot_series.append({
            'name': cell_line,
            'turboThreshold': len(df),
            'series_length': series_length,
            'data': list(df.T.to_dict().values()),
        })

    data_table = None
    if show_data_table and full_df is not None:
        # Copy before insert() so the numbering column is not written into a
        # filtered view of full_df.
        data_table_df = full_df[full_df['gene_id'].isin(genes)].copy()
        data_table_df.insert(0, '#', range(1, len(data_table_df) + 1))
        data_table = {
            'header': data_table_df.columns,
            'rows': data_table_df.values.tolist(),
            'csv': data_table_df.to_csv(sep='\t', index=False)
        }

    # data for normalized counts
    counts_series = {}
    return render_template('main.html', cell_lines=cell_lines, genes=genes, plot_series=plot_series,
                           selected_cell_lines=selected_cell_lines, data_table=data_table,
                           counts_series=counts_series,
                           increased_essentiality=increased_essentiality, apply_filters=apply_filters,
                           selected_filters=selected_filters)


def _filter_by_thresholds(df, fc_min, fc_max, pval, pval_direction):
    """Keep rows with fc_min <= fc <= fc_max and pval on the requested side of the cutoff."""
    df = df.loc[(df['fc'] >= fc_min) & (df['fc'] <= fc_max)]
    if pval_direction == 'greater':
        return df.loc[df['pval'] >= pval]
    return df.loc[df['pval'] <= pval]