Пример #1
0
def summary_stat_table(table, factors):
    """Summarise performance statistics per distinct combination of factors.

    For every distinct combination of values found in the ``factors``
    columns of ``table``, the mean and the sample standard deviation
    (``ddof=1``) are computed for the 'auc', 'mean_precision' and
    'balanced_accuracy' columns, plus every column whose name starts with
    'fscore'.

    Returns a new table whose columns are the factors followed by a
    ``mean_*`` / ``std_*`` pair per measure, passed through
    ``sorted(reverse='mean_auc')``.
    """
    fscore_cols = [name for name in table.header if name.startswith('fscore')]
    # (source column, suffix used in the output header); 'mean_precision'
    # is reported under the shorter 'ap' suffix
    measures = [('auc', 'auc'), ('mean_precision', 'ap'),
                ('balanced_accuracy', 'balanced_accuracy')]
    measures.extend((name, name) for name in fscore_cols)

    header = list(factors)
    for _, suffix in measures:
        header += [f'mean_{suffix}', f'std_{suffix}']

    rows = []
    for combination in tqdm(table.distinct_values(factors), ncols=80):
        # keep only the records whose factor values equal this combination
        subset = table.filtered(
            lambda x: tuple(x) == tuple(combination), columns=factors)
        record = list(combination)
        for column, _ in measures:
            values = numpy.array(subset.tolist(column))
            record.extend([values.mean(), values.std(ddof=1)])
        rows.append(record)

    summary = LoadTable(header=header, rows=rows)
    return summary.sorted(reverse='mean_auc')
Пример #2
0
def get_combined_counts(table, positions):
    """Return a table of mutated ('M') and unmutated ('R') counts per state.

    Parameters
    ----------
    table
        Passed through to ``reduced_one_position`` /
        ``reduced_multiple_positions``.
    positions
        Either a single position given as a string, or a series of
        positions.

    Returns
    -------
    A table with a 'mut' column, one base column per position ('base' for
    a single position, 'base1', 'base2', ... otherwise) and a 'count'
    column, sorted by all columns except 'count'.
    """
    bases = 'ACGT'
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also treated as a single position
    if isinstance(positions, str):
        counts = reduced_one_position(table, positions)
        # iterating the string yields the single-character states
        states = bases
        header = ['mut', 'base', 'count']
    else:
        counts = reduced_multiple_positions(table, *positions)
        num_positions = len(positions)
        # every combination of bases across the positions, as tuples
        states = product(bases, repeat=num_positions)
        header = (['mut'] +
                  ['base%d' % (i + 1) for i in range(num_positions)] +
                  ['count'])

    mut_counts = counts['M']
    unmut_counts = counts['R']

    combined = []
    for state in states:
        combined.append(['R'] + list(state) + [unmut_counts[state]])
        combined.append(['M'] + list(state) + [mut_counts[state]])

    counts_table = LoadTable(header=header, rows=combined)
    return counts_table.sorted(columns=header[:-1])
Пример #3
0
def get_grouped_combined_counts(table, position, group_label):
    """Wrap motif_count.get_combined_counts for groups.

    ``table`` is split on the distinct values of the ``group_label``
    column. ``motif_count.get_combined_counts`` is applied to each subset
    with ``position``, and the per-group results are stacked into a single
    table that has ``group_label`` as its first column.

    Returns the combined table sorted by ``group_label`` then 'mut'.
    """
    group_cats = table.distinct_values(group_label)
    all_data = []
    header = None
    for category in group_cats:
        subtable = table.filtered(lambda x: x == category, columns=group_label)
        counts = motif_count.get_combined_counts(subtable, position)
        if header is None:
            # group column first, followed by the columns of the counts table
            header = [group_label] + list(counts.header)

        # tag every row of this group's counts with the group category
        counts = counts.with_new_column(group_label, lambda x: category,
                                        columns=counts.header[0])
        all_data.extend(counts.tolist(header))

    counts = LoadTable(header=header, rows=all_data)
    # sorted() hands back the sorted table, so the result must be kept;
    # previously it was discarded and the unsorted table was returned
    counts = counts.sorted(columns=[group_label, 'mut'])
    return counts
Пример #4
0
def display_available_dbs(account, release=None):
    """Display the available Ensembl databases at the nominated host.

    Collects the "core" and "compara" database names returned by
    ``get_db_name`` for ``account`` / ``release`` and returns a table with
    the columns "Release", "Db Name", "Species" and "Common Name", sorted
    by release then db name. Compara databases show "-" for both species
    columns; a database without a species shows None for the common name.
    """
    db_list = get_db_name(account=account, db_type="core", release=release)
    db_list += get_db_name(account=account, db_type="compara", release=release)
    rows = []
    for db_name in db_list:
        species_name = db_name.species
        # reset on every iteration: otherwise a db without a species would
        # raise UnboundLocalError or reuse the previous row's common name
        common_name = None
        if species_name:
            common_name = Species.get_common_name(species_name,
                                                  level="ignore")

        if "compara" in db_name.name:
            species_name = common_name = "-"

        rows.append([db_name.release, db_name.name, species_name, common_name])

    table = LoadTable(header=["Release", "Db Name", "Species", "Common Name"],
                      rows=rows,
                      space=2)
    table = table.sorted(["Release", "Db Name"])
    table.legend = (
        "Values of 'None' indicate cogent does not have a value for that database name."
    )
    return table
Пример #5
0
def collate(base_path, output_path, exclude_paths, overwrite):
    """Collate all classifier performance stats into a single tsv file.

    Parameters
    ----------
    base_path
        Directory searched (via ``find``) for files matching
        "*performance.json*".
    output_path
        Directory that receives "collated.tsv.gz", "collated.log" and
        "summary_statistics.tsv.gz".
    exclude_paths
        None, or a comma separated string; the pieces are handed to
        ``skip_path`` to decide which files are ignored.
    overwrite
        If false and "collated.tsv.gz" already exists, nothing is done
        and the process exits with status 0.

    Returns None. When no matching files are found, a message is printed
    and nothing is written.
    """
    # NOTE(review): kept as the very first statement, before any other
    # local is bound -- presumably the logger records the caller's
    # arguments at this point; confirm against the LOGGER implementation.
    LOGGER.log_args()
    outpath = os.path.join(output_path, "collated.tsv.gz")
    logfile_path = os.path.join(output_path, "collated.log")
    if os.path.exists(outpath) and not overwrite:
        # sys.exit rather than the site-provided ``exit`` helper, which is
        # intended for interactive use and is not always available
        import sys

        click.secho(f"Skipping. {outpath} exists. "
                    "Use overwrite to force.",
                    fg='green')
        sys.exit(0)

    # NOTE(review): base_path is interpolated unquoted into the command;
    # paths containing spaces or shell metacharacters may misbehave
    # depending on how exec_command runs it -- confirm.
    stat_fns = exec_command(f'find {base_path} -name "*performance.json*"')
    stat_fns = stat_fns.splitlines()
    if not stat_fns:
        msg = f'No files matching "*performance.json*" in {base_path}'
        click.secho(msg, fg='red')
        return

    LOGGER.log_file_path = logfile_path

    records = []
    keys = set()
    exclude_paths = [] if exclude_paths is None else exclude_paths.split(',')
    num_skipped = 0
    for fn in tqdm(stat_fns, ncols=80):
        if skip_path(exclude_paths, fn):
            num_skipped += 1
            LOGGER.log_message(fn, label="SKIPPED FILE")
            continue

        LOGGER.input_file(fn)
        data = load_json(fn)
        report = data['classification_report']
        labels = report['labels']
        fscores = report['f-score']
        row = {
            "stat_path": fn,
            "classifier_path": data["classifier_path"],
            "auc": data["auc"],
            "algorithm": data["classifier_label"],
            "mean_precision": data["mean_precision"],
            f"fscore({labels[0]})": fscores[0],
            f"fscore({labels[1]})": fscores[1],
            'balanced_accuracy': data['balanced_accuracy'],
        }
        row.update(data["feature_params"])
        keys.update(row.keys())
        records.append(row)

    # the union of keys over all records defines the columns; a record
    # lacking a key gets None in that column
    columns = sorted(keys)
    rows = [[record.get(column) for column in columns] for record in records]
    table = LoadTable(header=columns, rows=rows)
    table = table.sorted(reverse="auc")
    table = table.with_new_column(
        "name",
        lambda x: model_name_from_features(*x),
        columns=["flank_size", "feature_dim", "usegc", "proximal"])
    table = table.with_new_column("size",
                                  sample_size_from_path,
                                  columns="classifier_path")
    table.write(outpath)
    LOGGER.output_file(outpath)

    # make summary statistics via grouping by factors
    factors = [
        "algorithm", "name", "flank_size", "feature_dim", "proximal", "usegc",
        "size"
    ]
    summary = summary_stat_table(table, factors=factors)
    outpath = os.path.join(output_path, "summary_statistics.tsv.gz")
    summary.write(outpath)
    LOGGER.output_file(outpath)
    if num_skipped:
        click.secho(
            f"Skipped {num_skipped} files that matched exclude_paths",
            fg='red')