def create_new_header(infile, mappings, outfile):
    """Write a copy of a BigWig file whose chromosomes carry UCSC names.

    ``mappings`` translates chromosome names found in ``infile`` into their
    UCSC counterparts. Chromosomes without a translation are left out of
    ``outfile`` and reported. When no translation is needed, or none is
    possible, ``infile`` is renamed to ``outfile`` and the process exits
    with status 0.
    """
    with pyBigWig.open(infile) as source:
        sizes = source.chroms()

        # Every name is already a mapping target, i.e. already UCSC. This is
        # normal behavior, so move the input to the output name and stop.
        if set(sizes).issubset(mappings.values()):
            os.rename(infile, outfile)
            sys.exit(0)

        header = []
        for name, size in sizes.items():
            if name in mappings:
                header.append((mappings[name], size))

        if len(header) == 0:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            print(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        unmapped = 0
        with pyBigWig.open(outfile, 'w') as target:
            target.addHeader(header)
            for name, size in sizes.items():
                intervals = source.intervals(name, 0, size)
                if name not in mappings:
                    # Unmapped sequences are only counted and reported.
                    unmapped += 1
                    print('UCSC chromosome/conting mapping for {} is missing'.format(name))
                elif intervals:
                    starts = [interval[0] for interval in intervals]
                    stops = [interval[1] for interval in intervals]
                    scores = [interval[2] for interval in intervals]
                    target.addEntries([mappings[name]] * len(intervals),
                                      starts,
                                      ends=stops,
                                      values=scores)

        if unmapped > 0:
            print(warning("UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                          "This sequence(s) will not be included in the bigWig file.".format(unmapped)))
예제 #2
0
def main():
    """Plot ERCC spike-in QC figures for every sample given on the command line.

    Samples whose ERCC spike-ins all have zero expression are skipped. Their
    warnings are printed individually only if at least one other sample was
    plotted; otherwise a single summary warning is printed instead.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    expected = get_expected(args.spikeins_mix, log2=True)

    skipped_messages = []
    any_sample_plotted = False

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        zero_part = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        nonzero_part = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)

        merged_zero = merge_expected_measured(expected, zero_part)
        merged_nonzero = merge_expected_measured(expected, nonzero_part)

        # Restrict to ERCC spike-ins; without any non-zero ones there is
        # nothing to plot for this sample.
        ercc_nonzero = merged_nonzero.iloc[merged_nonzero.index.str.startswith('ERCC'), :]
        if ercc_nonzero.empty:
            skipped_messages.append('All ERCC spike-ins have zero expression in sample {}'.format(sample_name))
            continue

        any_sample_plotted = True
        ercc_expected = expected.iloc[expected.index.str.startswith('ERCC')]
        ercc_zero = merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :]
        plot_histogram_scatter(
            expected=ercc_expected,
            zero=ercc_zero,
            nonzero=ercc_nonzero,
            spikein_type='ERCC',
            sample_name=sample_name,
            exp_type=exp_type,
        )

    if not any_sample_plotted:
        # Every sample lacks expressed spike-ins: one summary warning replaces
        # the per-sample ones.
        print(warning('All ERCC spike-ins in all samples have zero expression.'))
        return

    for message in skipped_messages:
        print(warning(message))
예제 #3
0
def main():
    """Map input feature IDs to the target database and run the enrichment processor.

    Reads feature IDs (one per line) from ``args.feature_ids``, checks that
    the knowledge base knows them, translates them from ``args.source_db`` to
    ``args.target_db`` when the two differ, and passes the resulting IDs to
    the external ``processor`` executable. The processor's standard output is
    written to ``terms.json``.

    Exits with status 1 when no features are fetched or no mapping is found.
    """
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db, species=args.species, feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        # ``exit()`` is a helper injected by the ``site`` module and is not
        # guaranteed to exist; raising SystemExit is always available.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            raise SystemExit(1)

        # Set lookup keeps the membership test constant-time per mapping.
        requested = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id not in requested:
                continue
            if m.source_id in mappings:
                # Only the first target returned for a source ID is kept.
                print(warning("Mapping {} returned multiple times.".format(m)))
            else:
                mappings[m.source_id] = m.target_id

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = list(mappings.values())

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        # Make sure the IDs are on disk before the child process reads them.
        input_genes.flush()
        process = Popen(['processor', str(args.pval), str(args.min_genes), args.obo, args.gaf, input_genes.name],
                        stdout=PIPE,
                        stderr=DEVNULL
                        )
        # stderr is discarded, so only stdout carries data.
        out, _ = process.communicate()

        with open('terms.json', 'w') as f:
            f.write(out.decode("UTF-8"))
예제 #4
0
def iterate_snpeff_file(file_handle, filename):
    """Yield one record per ALT allele from rows produced by SnpSift extractFields.

    A row may list several comma-separated ALT values together with matching
    AF (and, when present, AD) values. Such rows are expanded into one copy
    per allele. Rows whose ALT/AF/AD counts disagree are reported and skipped.
    ``filename`` is used only in that report.
    """
    for record in file_handle:
        alt_alleles = record["ALT"].strip().split(",")
        frequencies = record["AF"].strip().split(",")

        # Lofreq output has no AD column. In that case an all-empty stand-in
        # of the right length (REF slot + one slot per ALT) is used, so the
        # consistency check below still works.
        placeholder = ",".join("" for _ in range(len(alt_alleles) + 1))
        depth_fields = record.get("GEN[0].AD", placeholder).strip().split(",")
        # The leading entry is the REF allele depth; only ALT depths are kept.
        depths = depth_fields[1:]

        if len(alt_alleles) != len(frequencies) or len(frequencies) != len(depths):
            print(
                warning(
                    "Inconsistency for entry {} in file {}. Skipping this entry."
                    .format(record, os.path.basename(filename))))
            continue

        if len(depths) == 1:
            # Single allele: the original row object is annotated and reused.
            record["AD"] = depths[0]
            yield record
            continue

        for allele, frequency, depth in zip(alt_alleles, frequencies, depths):
            single = copy.deepcopy(record)
            single["ALT"] = allele
            single["AF"] = frequency
            if depth:
                single["AD"] = depth
            yield single
def iterate_snpeff_file(file_handle, filename):
    """Split SnpSift extractFields rows into one entry per ALT allele.

    Rows with several comma-separated ALT values (and matching AF/AD values)
    are expanded into one copy per allele. Rows with a single allele are
    annotated in place. Rows with mismatched ALT/AF/AD counts are reported,
    naming ``filename``, and dropped.
    """
    for entry in file_handle:
        alleles = entry['ALT'].strip().split(',')
        allele_freqs = entry['AF'].strip().split(',')

        # When the AD column is missing (Lofreq data), fall back to a string
        # of empty fields: one for REF plus one per ALT allele.
        empty_ad = ','.join('' for _ in range(len(alleles) + 1))
        raw_ad = entry.get('GEN[0].AD', empty_ad)
        # Skip the first field, which belongs to the REF allele.
        allele_depths = raw_ad.strip().split(',')[1:]

        counts_agree = len(alleles) == len(allele_freqs) == len(allele_depths)
        if not counts_agree:
            print(warning('Inconsistency for entry {} in file {}. Skipping this entry.'.format(
                entry, os.path.basename(filename))))
            continue

        if len(allele_depths) != 1:
            for allele, freq, depth in zip(alleles, allele_freqs, allele_depths):
                expanded = copy.deepcopy(entry)
                expanded['ALT'] = allele
                expanded['AF'] = freq
                if depth:
                    expanded['AD'] = depth
                yield expanded
        else:
            entry['AD'] = allele_depths[0]
            yield entry
 def test_string(self):
     """``warning`` wraps a plain string into a ``process_log`` command."""
     message = 'Some warning'
     self.assertEqual(
         warning(message),
         {
             'type': 'COMMAND',
             'type_data': 'process_log',
             'data': {'warning': message},
         },
     )
예제 #7
0
def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=[]):
    """Run PCA on an expression table and package the result.

    ``expressions`` is transposed before fitting, so its columns are the
    observations and its rows the features. With fewer than two rows or two
    columns no PCA is fitted and zero-filled placeholders are returned.
    Labels in ``gene_labels`` that are absent from the table's index are
    listed under ``skipped_gene_labels``.
    """
    if not gene_labels:
        gene_labels = expressions.index
    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    n_rows, n_columns = expressions.shape
    if n_rows < 2 or n_columns < 2:
        # Too little data to fit: one origin point per column.
        coordinates = [[0.0, 0.0] for _ in range(n_columns)]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]
    else:
        pca = PCA(n_components=n_components, whiten=True)
        projected = pca.fit_transform(expressions.transpose())

        coordinates = []
        for point in projected:
            if len(point) > 1:
                coordinates.append(point[:2].tolist())
            else:
                # Single component: pad the second coordinate with zero.
                coordinates.append([point[0], 0.0])

        all_components = [component_top_factors(component, gene_labels) for component in pca.components_]

        ratios = pca.explained_variance_ratio_
        if np.isnan(ratios).any():
            all_explained_variance_ratios = [0.0] * len(ratios)
        else:
            all_explained_variance_ratios = ratios.tolist()

    if expressions.empty:
        print(warning('Gene selection and filtering resulted in no genes. Please select different samples or genes.'))

    return {
        'coordinates': coordinates,
        'all_components': all_components,
        'all_explained_variance_ratios': all_explained_variance_ratios,
        'skipped_gene_labels': skipped_gene_labels,
        'warning': None
    }
예제 #8
0
def main():
    """Plot ERCC spike-in QC figures for every sample given on the command line.

    A sample whose ERCC spike-ins all have zero expression is skipped with a
    warning; every other sample gets a histogram-scatter figure.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    expected = get_expected(args.spikeins_mix, log2=True)

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        zero_part = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        nonzero_part = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)

        merged_zero = merge_expected_measured(expected, zero_part)
        merged_nonzero = merge_expected_measured(expected, nonzero_part)

        # Keep ERCC spike-ins only; no non-zero ones means nothing to plot.
        ercc_nonzero = merged_nonzero.iloc[merged_nonzero.index.str.startswith('ERCC'), :]
        if ercc_nonzero.empty:
            print(warning('All ERCC spike-ins have zero expression in sample {}'.format(sample_name)))
            continue

        ercc_expected = expected.iloc[expected.index.str.startswith('ERCC')]
        ercc_zero = merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :]
        plot_histogram_scatter(
            expected=ercc_expected,
            zero=ercc_zero,
            nonzero=ercc_nonzero,
            spikein_type='ERCC',
            sample_name=sample_name,
            exp_type=exp_type,
        )
예제 #9
0
def get_clustering(expressions,
                   distance_metric='euclidean',
                   linkage_method='average',
                   ordering_method=None,
                   n_keep=None,
                   n_trials=1000):
    """Compute linkage, order, and produce a dendrogram.

    The columns of ``expressions`` are the items being clustered. With fewer
    than two columns an empty linkage array and a trivial leaf order are
    returned without computing anything.

    :param expressions: table whose columns are clustered
    :param distance_metric: metric passed to ``pdist``
    :param linkage_method: method passed to ``linkage``
    :param ordering_method: optional leaf ordering: ``'knn'``, ``'optimal'``
        or ``'sa'``
    :param n_keep: forwarded to ``optimal``
    :param n_trials: forwarded to ``simulated_annealing``
    :return: tuple ``(linkage, dendrogram)``
    :raises ValueError: if distances, linkage or dendrogram cannot be
        computed, or ``ordering_method`` is unknown
    """
    if len(expressions.columns) < 2:
        return np.array([]), {'leaves': list(range(len(expressions.columns)))}

    try:
        distances = pdist(np.transpose(np.array(expressions)),
                          metric=distance_metric)
    except Exception as err:
        msg = 'Cannot compute distances between samples.'
        print(error(msg))
        raise ValueError(msg) from err

    if np.isnan(distances).any():
        distances = np.nan_to_num(distances, copy=False)
        # The warning object has to be printed to be emitted; building it
        # without printing silently drops it.
        print(
            warning(
                'Distances between some samples were undefined and were set to zero.'
            ))

    try:
        link = linkage(y=distances, method=linkage_method)
    except Exception as err:
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg) from err

    if ordering_method:
        if ordering_method == 'knn':
            link = knn(link, distances)
        elif ordering_method == 'optimal':
            link = optimal(link, distances, n_keep)
        elif ordering_method == 'sa':
            link = simulated_annealing(link, distances, n_trials)
        else:
            msg = 'Unknown ordering method {}'.format(ordering_method)
            print(error(msg))
            raise ValueError(msg)

    try:
        dend = dendrogram(link, no_plot=True)
    except Exception as err:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg) from err

    return link, dend
예제 #10
0
def create_new_header(infile, mappings, outfile):
    """Rewrite a BigWig file so that its chromosomes use UCSC names.

    Chromosome names from ``infile`` are translated through ``mappings``.
    Entries of translated chromosomes are copied to ``outfile``; chromosomes
    without a translation are dropped and reported. If all names are already
    UCSC, or none can be translated, ``infile`` is renamed to ``outfile``
    and the process exits with status 0.
    """
    with pyBigWig.open(infile) as source:
        chrom_sizes = source.chroms()

        already_ucsc = set(chrom_sizes).issubset(mappings.values())
        if already_ucsc:
            # Nothing to translate; this is normal behavior, hence status 0.
            os.rename(infile, outfile)
            sys.exit(0)

        translated = [name for name in chrom_sizes if name in mappings]
        header = [(mappings[name], chrom_sizes[name]) for name in translated]

        if not header:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            send_message(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        missing_count = 0
        with pyBigWig.open(outfile, "w") as target:
            target.addHeader(header)
            for name, size in chrom_sizes.items():
                intervals = source.intervals(name, 0, size)
                if name not in mappings:
                    missing_count += 1
                    print("UCSC chromosome/conting mapping for {} is missing".format(name))
                    continue
                if not intervals:
                    # Mapped chromosome without any intervals: nothing to copy.
                    continue
                target.addEntries(
                    [mappings[name]] * len(intervals),
                    [item[0] for item in intervals],
                    ends=[item[1] for item in intervals],
                    values=[item[2] for item in intervals],
                )

        if missing_count > 0:
            summary = (
                "UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                "This sequence(s) will not be included in the bigWig file."
            ).format(missing_count)
            send_message(warning(summary))
예제 #11
0
def validate_inputs(args):
    """Validate inputs.

    Sends a warning when the samples do not all share one expression type.

    :raises AssertionError: if the numbers of sample names, expression files
        and expression types differ
    """
    # Validate that all expression types are equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        # Sorted so the message does not depend on set iteration order.
        msg = msg.format(", ".join(sorted(exp_type_set)))
        send_message(warning(msg))

    # Validate that same number of sample names, expression files and
    # expression types are given. Raised explicitly (instead of ``assert``)
    # so the check still runs when Python is started with ``-O``.
    if not (len(args.sample_names) == len(args.sample_exps) == len(args.exp_types)):
        raise AssertionError(
            "The numbers of sample names ({}), expression files ({}) and expression types ({}) differ.".format(
                len(args.sample_names), len(args.sample_exps), len(args.exp_types)))
예제 #12
0
def validate_inputs(args):
    """Validate inputs.

    Prints a warning when the samples do not all share one expression type.

    :raises AssertionError: if the numbers of sample names, expression files
        and expression types differ
    """
    # Validate that all expression types are equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        # Sorted so the message does not depend on set iteration order.
        msg = msg.format(', '.join(sorted(exp_type_set)))
        print(warning(msg))

    # Validate that same number of sample names, expression files and
    # expression types are given. Raised explicitly (instead of ``assert``)
    # so the check still runs when Python is started with ``-O``.
    if not (len(args.sample_names) == len(args.sample_exps) == len(args.exp_types)):
        raise AssertionError(
            'The numbers of sample names ({}), expression files ({}) and expression types ({}) differ.'.format(
                len(args.sample_names), len(args.sample_exps), len(args.exp_types)))
def parse_mappings(species, infile, outfile):
    """Collect the chromosome name mappings available for ``species``.

    Every mapping file registered for the species in ``MAPPINGS_FILES`` is
    parsed and merged into one dictionary. For a species without registered
    files a warning is printed, ``infile`` is renamed to ``outfile`` and the
    process exits with status 0.
    """
    if species not in MAPPINGS_FILES:
        # Unsupported species is not an error: hand the input back unchanged
        # under the output name.
        print(warning('Chromosome mappings for Species "{}" are not supported.'.format(species)))
        os.rename(infile, outfile)
        sys.exit(0)

    merged = {}
    for basename in MAPPINGS_FILES[species]:
        merged.update(parse_mapping_file(os.path.join(MAPPINGS_DIR, basename)))
    return merged
예제 #14
0
def parse_mappings(species, infile, outfile):
    """Build the chromosome name mapping dictionary for ``species``.

    All mapping files listed for the species in ``MAPPINGS_FILES`` are parsed
    and combined. If the species has no listed files, a warning is sent,
    ``infile`` is renamed to ``outfile`` and the process exits with status 0.
    """
    supported = species in MAPPINGS_FILES
    if not supported:
        # Not an error condition: the BigWig file is returned as-is under the
        # output name, together with a warning.
        message = 'Chromosome mappings for Species "{}" are not supported.'.format(species)
        send_message(warning(message))
        os.rename(infile, outfile)
        sys.exit(0)

    combined = {}
    for basename in MAPPINGS_FILES[species]:
        path = os.path.join(MAPPINGS_DIR, basename)
        combined.update(parse_mapping_file(path))
    return combined
예제 #15
0
def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=[]):
    """Compute PCA.

    ``expressions`` is transposed before fitting, so its columns are the
    observations (one coordinate pair is produced per column) and its rows
    the features. With fewer than two rows or two columns no PCA is fitted
    and zero-filled placeholders are returned.

    :param expressions: expression table
    :param n_components: number of principal components to compute
    :param gene_labels: labels to report on; defaults to the table's index
    :return: dict with ``coordinates``, ``all_components``,
        ``all_explained_variance_ratios``, ``skipped_gene_labels`` and
        ``warning`` keys
    """
    if not gene_labels:
        gene_labels = expressions.index
    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    if expressions.shape[0] < 2 or expressions.shape[1] < 2:
        # One placeholder per column, matching the fitted branch, which
        # yields one point per row of the transposed table.
        # ``len(expressions)`` would count rows instead.
        coordinates = [[0.0, 0.0] for _ in range(expressions.shape[1])]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]
    else:
        pca = PCA(n_components=n_components, whiten=True)
        pca_expressions = pca.fit_transform(expressions.transpose())

        coordinates = [
            t[:2].tolist() if len(t) > 1 else [t[0], 0.0]
            for t in pca_expressions
        ]
        all_components = [
            component_top_factors(component, gene_labels)
            for component in pca.components_
        ]
        if np.isnan(pca.explained_variance_ratio_).any():
            all_explained_variance_ratios = [
                0.0 for _ in pca.explained_variance_ratio_
            ]
        else:
            all_explained_variance_ratios = pca.explained_variance_ratio_.tolist()

    result = {
        'coordinates': coordinates,
        'all_components': all_components,
        'all_explained_variance_ratios': all_explained_variance_ratios,
        'skipped_gene_labels': skipped_gene_labels,
        'warning': None
    }

    if expressions.empty:
        print(
            warning(
                'Gene selection and filtering resulted in no genes. Please select different samples or genes.'
            ))

    return result
예제 #16
0
def main():
    """Invoke when run directly as a program.

    Builds gene sets from the differential expression file and saves each
    non-empty set as ``<prefix>_<name>.tab.gz`` inside ``args.out_dir``. A
    warning is sent for every empty set instead.
    """
    args = parse_arguments()
    gene_sets = create_gene_sets(args.dge_file, args.logfc, args.fdr)

    fname_prefix = generate_name(args.analysis_name, args.tool, args.logfc,
                                 args.fdr)

    out_dir = Path(args.out_dir)
    # Create the directory (and any missing parents) without a separate,
    # race-prone existence check.
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, data in gene_sets.items():
        if data.empty:
            send_message(
                warning(
                    f"No {name}-regulated genes. Gene set was not created."))
        else:
            save_genes(data, out_dir / f"{fname_prefix}_{name}.tab.gz")
예제 #17
0
def main():
    """Plot ERCC spike-in QC figures for every sample given on the command line.

    For each sample the measured expressions are merged with the expected
    spike-in values. A sample without any non-zero ERCC spike-in is skipped
    with a warning; every other sample gets a histogram-scatter figure.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    expected = get_expected(args.spikeins_mix, log2=True)

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        zeros = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        nonzeros = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)

        merged_zero = merge_expected_measured(expected, zeros)
        merged_nonzero = merge_expected_measured(expected, nonzeros)

        # Only ERCC spike-ins are of interest here.
        nonzero_ercc = merged_nonzero.iloc[merged_nonzero.index.str.startswith('ERCC'), :]
        if nonzero_ercc.empty:
            message = 'All ERCC spike-ins have zero expression in sample {}'.format(sample_name)
            print(warning(message))
            continue

        plot_histogram_scatter(
            expected=expected.iloc[expected.index.str.startswith('ERCC')],
            zero=merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :],
            nonzero=nonzero_ercc,
            spikein_type='ERCC',
            sample_name=sample_name,
            exp_type=exp_type,
        )
예제 #18
0
def main():
    """Invoke when run directly as a program.

    Reads gene identifiers (one per line) from ``args.geneset_file``, removes
    blank lines and duplicates, and writes the sorted set both as JSON
    (``args.output_json``) and as a gzip-compressed, newline-separated list
    (``args.output_file``). A warning is sent when duplicates were removed.
    """
    args = parse_arguments()

    # Plain text mode already applies universal-newline handling. The legacy
    # "U" flag was removed in Python 3.11 and makes ``open`` raise ValueError.
    with open(args.geneset_file, "r") as infile:
        # skip empty lines in input gene set file
        genes = [line.strip() for line in infile if line.strip()]

    geneset = sorted(set(genes))

    if len(genes) != len(geneset):
        send_message(warning("Removed duplicated genes."))

    with open(args.output_json, "w") as json_out:
        json.dump({"genes": geneset},
                  json_out,
                  separators=(",", ":"),
                  allow_nan=False)

    with gzip.open(args.output_file, "w") as file_out:
        file_out.write("\n".join(geneset).encode("utf-8"))
예제 #19
0
# Command line: the VCF file to inspect and the summary file to extend.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_file', help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument('summary', help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    # Report the failure through the process log, then abort with the
    # underlying cause.
    proc_error = 'Input VCF file does not exist or could not be correctly opened.'
    print(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    if 'reference' in header_records:
        fasta_name = os.path.basename(header_records['reference'])
    else:
        # The header does not name the reference: leave it blank and warn.
        fasta_name = ''
        print(warning('Reference sequence (FASTA) name could not be recognized from the VCF header.'))

    out_file.write('\nReference (genome) sequence:\n{}\n'.format(fasta_name))
    out_file.write('\nSamples:\n{}'.format('\n'.join(list(vcf_header.samples))))
def main():
    """Invoke when run directly as a program.

    Parses the main expression file, annotates every feature ID with its
    name fetched from the knowledge base, optionally merges additional
    normalized expression files, and writes the combined table to
    ``<output_name>.txt.gz`` (tab-separated) and ``<output_name>.json``.

    Exits with status 1 when the additional expression files and types do
    not match in number, or when the expression types are not unique.
    """
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(
                error(
                    'The number of additional expression files must match the number of specified '
                    'expressions types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(
                error(
                    'The union of the main expression type ({}) and additional normalized expression types {} '
                    'does not contain unique items.'.format(
                        args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [
        input_features[i:i + CHUNK_SIZE]
        for i in range(0, len(input_features), CHUNK_SIZE)
    ]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Count exactly the input feature IDs that have no entry in the mapping.
    # Counting every null cell of the frame would also include missing
    # expression values and overstate the number.
    unmapped = sum(1 for f_id in input_features if f_id not in feature_dict)
    if unmapped:
        print(
            warning(
                '{} feature(s) could not be mapped to the associated feature symbols.'
                .format(unmapped)))

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions,
                                      args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type

    # Replace NaN values with empty string. Done as a fresh assignment rather
    # than in place on the column selection, which avoids chained-assignment
    # warnings.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz',
              header=True,
              index=False,
              sep='\t',
              compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
예제 #21
0
 def warning(self, *args):
     """Report a warning made of ``args`` joined by single spaces."""
     text = ' '.join(map(str, args))
     # TODO: Use the protocol to report progress.
     print(resolwe_runtime_utils.warning(text))
예제 #22
0
import pandas as pd
from resolwe_runtime_utils import send_message, warning

parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument(
    "-bed", "--bed_file", required=True, help="All splice junctions in BED12 format"
)
parser.add_argument(
    "-sj", "--novel_sj", required=True, help="Table of annotated novel splice junctions"
)

if __name__ == "__main__":

    args = parser.parse_args()
    bed_file = args.bed_file

    # An empty BED file cannot be parsed; pass it through under the output
    # name and finish successfully.
    if not os.path.getsize(bed_file):
        send_message(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    bed = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
    novel_sj = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)

    # Keep BED rows whose fourth column (index 3) is listed in the "name"
    # column of the novel junction table.
    bed_novel_sj = bed[bed[3].isin(novel_sj["name"])]
    bed_novel_sj.to_csv("novel_sj.bed", sep="\t", index=False, header=False)
예제 #23
0
# Fragment of a clustering script: ``matrix``, ``args`` and ``distance_map``
# are defined before this point.
# NOTE(review): the axis usage below implies rows are samples and columns are
# genes - confirm against the code that builds ``matrix``.
matrix = np.array(matrix)

matrix_sum = np.sum(matrix, axis=0)  # sum of expressions for each gene
# Column indices whose summed expression is below 0.1.
genes_zero = np.where(matrix_sum < 0.1)[0]

if args.filter:
    # Drop the (near-)zero columns before clustering.
    matrix = np.delete(matrix, genes_zero, axis=1)

    if matrix.shape[1] == 0:
        raise ValueError("Expressions of all selected genes are 0")

distance = distance_map[args.dstfunc.lower()]
cluster = linkage(matrix, method=args.linkage.lower(), metric=distance)

# Column 2 of the linkage result is summed as the total merge distance
# (presumably SciPy's linkage matrix layout - verify against the import).
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample IDs are keyed by 1-based position in ``args.sampleids``.
sample_ids = {}
for i, sample_id in enumerate(args.sampleids):
    sample_ids[i + 1] = {'id': sample_id}

output = {'cluster': {'linkage': cluster.tolist(),
                      'samples_names': sample_ids,
                      'order': dend['leaves']}}

# Compact JSON (no whitespace after separators) written to standard output.
print(json.dumps(output, separators=(',', ':')))
예제 #24
0
def main():
    """Compute sample hierarchical clustering.

    Checks that the sample and gene selection is consistent, loads the
    expressions shared by all samples, applies the requested transformations,
    optionally drops samples with constant expression, warns about excluded
    samples and genes, and writes the sample IDs, linkage and leaf order to
    ``args.output``.

    NOTE(review): the flow after each ``set_error`` call assumes that
    ``set_error`` terminates the program - confirm against its definition.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = (
            "Select at least two samples to compute hierarchical clustering of samples."
        )
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = (
            "Select at least two genes to compute hierarchical clustering of samples with "
            "correlation distance metric or use Euclidean distance metric.")
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        if not args.gene_labels:
            msg = (
                "The selected samples contain only one common gene ({}). At least two common "
                "genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select a different set of samples or use Euclidean "
                "distance metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        else:
            msg = (
                "Only one of the selected genes ({}) is present in all samples but at least two "
                "such genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select more genes or use Euclidean distance "
                "metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = (
                "All of the selected samples have constant expression across genes. Hierarchical "
                "clustering of samples cannot be computed.")
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                "Only one of the selected samples ({}) has a non-constant expression across "
                "genes. However, hierarchical clustering of samples cannot be computed with "
                "just one sample.".format(sample_name))
            set_error(msg)
        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        # At most three names are listed; the suffix marks a truncated list.
        suffix = "" if len(removed) <= 3 else ", ..."
        if removed:
            msg = (
                "{} of the selected samples ({}) have constant expression across genes. "
                "Those samples are excluded from the computation of hierarchical clustering of "
                "samples with correlation distance "
                "metric.".format(len(removed),
                                 ", ".join(removed[:3]) + suffix))
            send_message(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source,
                                        args.species)
        # Only the first three excluded genes are named. Append the
        # truncation marker so a longer list is not presented as complete.
        suffix = "" if len(excluded) <= 3 else ", ..."
        excluded_listing = ", ".join(excluded_names) + suffix
    if len(excluded) == 1:
        if not args.gene_labels:
            msg = (
                "Gene {} is present in some but not all of the selected samples. This "
                "gene is excluded from the computation of hierarchical clustering of "
                "samples.".format(excluded_listing))
        else:
            msg = (
                "{} of the selected genes ({}) is missing in at least one of the selected "
                "samples. This gene is excluded from the computation of hierarchical "
                "clustering of samples.".format(len(excluded),
                                                excluded_listing))
        send_message(warning(msg))
    if len(excluded) > 1:
        if not args.gene_labels:
            msg = (
                "{} genes ({}) are present in some but not all of the selected samples. Those "
                "genes are excluded from the computation of hierarchical clustering of "
                "samples.".format(len(excluded), excluded_listing))
        else:
            msg = (
                "{} of the selected genes ({}) are missing in at least one of the selected "
                "samples. Those genes are excluded from the computation of hierarchical "
                "clustering of samples.".format(len(excluded),
                                                excluded_listing))
        send_message(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the samples that took part in the clustering.
    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        "sample_ids":
        {i: {
            "id": sample_id
        }
         for i, sample_id in enumerate(sample_ids)},
        "linkage": linkage.tolist(),
        "order": dendrogram["leaves"],
    }
    output_json(result, args.output)
예제 #25
0
import pandas as pd
from resolwe_runtime_utils import warning

# Command-line interface: both input paths are mandatory.
parser = argparse.ArgumentParser(description=__doc__)
for short_flag, long_flag, help_text in (
    ("-bed", "--bed_file", "All splice junctions in BED12 format"),
    ("-sj", "--novel_sj", "Table of annotated novel splice junctions"),
):
    parser.add_argument(short_flag, long_flag, required=True, help=help_text)

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    if not os.path.getsize(bed_file):
        # A zero-byte BED file is passed through unchanged as the output.
        print(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    # Both tables are read with every column typed as a string.
    junctions = pd.read_csv(bed_file, delimiter="\t", header=None, dtype=str)
    annotated = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)

    # Keep the BED rows whose column 3 occurs in the "name" column of the
    # novel splice junction table.
    is_novel = junctions[3].isin(annotated["name"])
    junctions[is_novel].to_csv("novel_sj.bed", sep="\t", index=False, header=False)
"""Filter novel splice junctions in BED12 format."""

import argparse
import os
import sys

import pandas as pd

from resolwe_runtime_utils import warning

parser = argparse.ArgumentParser(description=__doc__)

# Every option of this script is mandatory.
mandatory = {'required': True}
parser.add_argument('-bed', '--bed_file', help="All splice junctions in BED12 format", **mandatory)
parser.add_argument('-sj', '--novel_sj', help="Table of annotated novel splice junctions", **mandatory)

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    if os.path.getsize(bed_file) > 0:
        # Load both tab-separated tables as strings; the BED file has no header row.
        all_junctions = pd.read_csv(bed_file, delimiter='\t', header=None, dtype=str)
        novel_table = pd.read_csv(args.novel_sj, delimiter='\t', dtype=str)

        # Select the BED rows whose column 3 is listed in the table's "name" column.
        selected = all_junctions[all_junctions[3].isin(novel_table["name"])]
        selected.to_csv('novel_sj.bed', sep='\t', index=False, header=False)
    else:
        # Nothing to filter: warn and reuse the empty input as the output file.
        print(warning('Bed file has no entries.'))
        os.rename(bed_file, 'novel_sj.bed')
        sys.exit(0)
예제 #27
0
 def warning(self, *args):
     """Print a warning report built from the space-joined arguments."""
     text = ' '.join(map(str, args))
     # TODO: Use the protocol to report progress.
     print(resolwe_runtime_utils.warning(text))
 def test_string(self):
     # The report for a plain string is compared with its exact JSON text.
     expected = '{"proc.warning": "Some warning"}'
     self.assertEqual(warning('Some warning'), expected)
예제 #29
0
def main():
    """Compute sample hierarchical clustering.

    The steps, in order: check that the argument lists are consistent, load
    the expressions with ``get_expressions``, apply ``transform``, optionally
    drop samples via ``remove_const_samples``, warn about left-out samples and
    genes, run ``get_clustering`` and pass the result (sample IDs, linkage and
    leaf order) to ``output_json``.

    Invalid input is reported through ``set_error``; left-out samples and
    genes are reported with ``print(warning(...))``.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = 'The number of sample files does not match the number of sample IDs.'
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = 'Select at least two samples to compute hierarchical clustering of samples.'
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != 'euclidean':
        msg = ('Select at least two genes to compute hierarchical clustering of samples with '
               'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files, gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        only_gene = get_gene_names(list(expressions.index), args.source, args.species)[0]
        if not args.gene_labels:
            msg = ('The selected samples contain only one common gene ({}). At least two common '
                   'genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select a different set of samples or use Euclidean '
                   'distance metric.'.format(only_gene))
        else:
            msg = ('Only one of the selected genes ({}) is present in all samples but at least two '
                   'such genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select more genes or use Euclidean distance '
                   'metric.'.format(only_gene))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches`` holds one flag per selected sample; a false flag marks a
        # sample that was removed from ``expressions``.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = ('All of the selected samples have constant expression across genes. Hierarchical '
                   'clustering of samples cannot be computed.')
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [name for i, name in enumerate(args.sample_names) if matches[i]][0]
            msg = ('Only one of the selected samples ({}) has a non-constant expression across '
                   'genes. However, hierarchical clustering of samples cannot be computed with '
                   'just one sample.'.format(sample_name))
            set_error(msg)
        removed = [name for i, name in enumerate(args.sample_names) if not matches[i]]
        suffix = '' if len(removed) <= 3 else ', ...'
        if removed:
            msg = ('{} of the selected samples ({}) have constant expression across genes. '
                   'Those samples are excluded from the computation of hierarchical clustering of '
                   'samples with correlation distance '
                   'metric.'.format(len(removed), ', '.join(removed[:3]) + suffix))
            print(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        # Only the first three excluded genes are looked up and listed; the
        # suffix signals that the list was cut short.
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = ', '.join(get_gene_names(excluded[:3], args.source, args.species)) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # This message has a single placeholder: the gene name.
                msg = ('Gene {} is present in some but not all of the selected samples. This '
                       'gene is excluded from the computation of hierarchical clustering of '
                       'samples.'.format(excluded_names))
            else:
                msg = ('{} of the selected genes ({}) is missing in at least one of the selected '
                       'samples. This gene is excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), excluded_names))
        else:
            if not args.gene_labels:
                msg = ('{} genes ({}) are present in some but not all of the selected samples. Those '
                       'genes are excluded from the computation of hierarchical clustering of '
                       'samples.'.format(len(excluded), excluded_names))
            else:
                msg = ('{} of the selected genes ({}) are missing in at least one of the selected '
                       'samples. Those genes are excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), excluded_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order
    )

    # Report only the samples that were kept, renumbered from zero.
    sample_ids = [sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]]
    result = {
        'sample_ids': {i: {'id': sample_id} for i, sample_id in enumerate(sample_ids)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
 def test_string(self):
     # A plain string is expected back as a "proc.warning" JSON object.
     report = warning('Some warning')
     self.assertEqual(report, '{"proc.warning": "Some warning"}')
예제 #31
0
# With filtering enabled, drop the columns listed in ``genes_zero`` and fail
# if that leaves no column at all.
if args.filter:
    matrix = np.delete(matrix, genes_zero, axis=1)
    if not matrix.shape[1]:
        msg = "Expressions of selected genes are 0. Please select additional genes."
        print(error(msg))
        raise ValueError(msg)

distance = distance_map[args.dstfunc]
cluster = linkage(matrix, method=args.linkage, metric=distance)

# Warn when the third linkage column sums to (almost) nothing.
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample IDs are keyed by their 1-based position on the command line.
sample_ids = {
    position: {'id': int(raw_id)}
    for position, raw_id in enumerate(args.sampleids, start=1)
}

cluster_info = {
    'linkage': cluster.tolist(),
    'samples_names': sample_ids,
    'order': dend['leaves'],
}
output = {'cluster': cluster_info}
예제 #32
0
def main():
    """Invoke when run directly as a program.

    Read feature IDs (one per line) from ``args.feature_ids``, check that the
    knowledge base returns features for them, translate them to
    ``args.target_db`` IDs when the source and target databases differ, run
    the external ``processor`` executable on the resulting IDs and write its
    standard output to ``terms.json``.

    Raises:
        SystemExit: with status 1 when no features are fetched or when the
            mapping query returns nothing.
    """
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        # ``exit()`` is an interactive helper injected by ``site``; raise the
        # exception it wraps so that the script does not depend on it.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            raise SystemExit(1)

        # Set lookup keeps the membership test constant-time per mapping.
        requested = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id not in requested:
                continue
            if m.source_id in mappings:
                # Only the first target ID seen for a source ID is kept.
                print(warning("Mapping {} returned multiple times.".format(m)))
            else:
                mappings[m.source_id] = m.target_id

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    # The IDs are handed to the executable through a temporary file that is
    # removed as soon as the ``with`` block ends.
    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen(
            [
                'processor',
                str(args.pval),
                str(args.min_genes),
                args.obo,
                args.gaf,
                input_genes.name,
            ],
            stdout=PIPE,
            stderr=DEVNULL,
        )
        # stderr is discarded (DEVNULL), so only stdout carries data.
        out, _ = process.communicate()

        with open('terms.json', 'w') as f:
            f.write(out.decode("UTF-8"))
예제 #33
0
def main():
    """Compute gene hierarchical clustering.

    The steps, in order: check that the argument lists are consistent, load
    the expressions with ``get_expressions``, apply ``transform``, optionally
    drop genes via ``remove_const_genes``, warn about left-out genes, run
    ``get_clustering`` and pass the result (gene symbols, linkage and leaf
    order) to ``output_json``.

    Invalid input is reported through ``set_error``; left-out genes are
    reported with ``print(warning(...))``.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.gene_labels) == 1:
        msg = 'Select at least two genes to compute hierarchical clustering of genes.'
        set_error(msg)

    if len(args.sample_files) == 1 and args.distance_metric != 'euclidean':
        msg = (
            'Select at least two samples to compute hierarchical clustering of genes with '
            'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        only_gene = get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]
        if not args.gene_labels:
            msg = (
                'The selected samples contain only one common gene ({}). At least two common '
                'genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select a different set of samples or use Euclidean '
                'distance metric.'.format(only_gene))
        else:
            msg = (
                'Only one of the selected genes ({}) is present in all samples but at least two '
                'such genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select more genes or use Euclidean distance '
                'metric.'.format(only_gene))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # Remember the genes present before filtering: the removed ones can
        # no longer be read from ``expressions.index`` afterwards.
        # NOTE(review): assumes ``remove_const_genes`` returns one flag per
        # input gene, in input order - confirm against its definition.
        genes_before = list(expressions.index)
        expressions, matches = remove_const_genes(expressions)
        if len(expressions.index) == 0:
            msg = (
                'All of the selected genes have constant expression across samples. '
                'Hierarchical clustering of genes cannot be computed.')
            set_error(msg)
        if len(expressions.index) == 1:
            gene_names = get_gene_names(list(expressions.index), args.source,
                                        args.species)
            msg = (
                'Only one of the selected genes ({}) has a non-constant expression across '
                'samples. However, hierarchical clustering of genes cannot be computed with '
                'just one gene.'.format(gene_names[0]))
            set_error(msg)
        removed = [
            gene for gene, kept in zip(genes_before, matches) if not kept
        ]
        suffix = '' if len(removed) <= 3 else ', ...'
        if removed:
            removed_names = get_gene_names(removed[:3], args.source,
                                           args.species)
            msg = (
                '{} of the selected genes ({}) have constant expression across samples. '
                'Those genes are excluded from the computation of hierarchical clustering of '
                'genes with correlation distance '
                'metric.'.format(len(removed),
                                 ', '.join(removed_names) + suffix))
            print(warning(msg))
    else:
        matches = [True] * len(expressions.index)

    if excluded:
        # Only the first three excluded genes are looked up and listed; the
        # suffix signals that the list was cut short.
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = ', '.join(
            get_gene_names(excluded[:3], args.source, args.species)) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # This message has a single placeholder: the gene name.
                msg = (
                    'Gene {} is present in some but not all of the selected samples. This '
                    'gene is excluded from the computation of hierarchical clustering of '
                    'genes.'.format(excluded_names))
            else:
                msg = (
                    '{} of the selected genes ({}) is missing in at least one of the selected '
                    'samples. This gene is excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), excluded_names))
        else:
            if not args.gene_labels:
                msg = (
                    '{} genes ({}) are present in some but not all of the selected samples. Those '
                    'genes are excluded from the computation of hierarchical clustering of '
                    'genes.'.format(len(excluded), excluded_names))
            else:
                msg = (
                    '{} of the selected genes ({}) are missing in at least one of the selected '
                    'samples. Those genes are excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), excluded_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(expressions,
                                         distance_metric=get_distance_metric(
                                             args.distance_metric),
                                         linkage_method=args.linkage_method,
                                         order=args.order)

    # Genes are reported in the row order of the clustered expressions.
    result = {
        'gene_symbols':
        {i: {
            'gene': gene
        }
         for i, gene in enumerate(expressions.index)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
def main():
    """Invoke when run directly as a program.

    Parse the main expression file, look up a name for every feature ID in
    the knowledge base (in chunks of ``CHUNK_SIZE`` IDs), merge any additional
    normalized expression files on ``FEATURE_ID`` and write the combined table
    to ``<output_name>.txt.gz`` (tab-separated, gzip) and
    ``<output_name>.json`` (under the ``genes`` key).

    Exits with status 1 when the additional expression files and their types
    differ in number, or when the expression types are not unique.
    """
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(error('The number of additional expression files must match the number of specified '
                        'expressions types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(error('The union of the main expression type ({}) and additional normalized expression types {} '
                        'does not contain unique items.'.format(args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [input_features[i:i + CHUNK_SIZE] for i in range(0, len(input_features), CHUNK_SIZE)]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db, species=args.species, feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Count the unmapped feature IDs directly. Counting every null cell of the
    # data frame would also pick up missing values in the expression column.
    unmapped = [f_id for f_id in input_features if f_id not in feature_dict]
    if unmapped:
        print(warning('{} feature(s) could not be mapped to the associated feature symbols.'.format(
            len(unmapped)))
        )

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions, args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type

    # Replace NaN values with empty string. The fill is applied to the
    # selection's result rather than in place on a sliced frame.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz', header=True, index=False, sep='\t', compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
예제 #35
0
# Per-column totals of the expression matrix (one column per gene); columns
# whose total is below 0.1 are the candidates for removal.
matrix_sum = np.sum(matrix, axis=0)
genes_zero = np.where(matrix_sum < 0.1)[0]

if args.filter:
    matrix = np.delete(matrix, genes_zero, axis=1)
    if not matrix.shape[1]:
        # Every column was removed, so there is nothing left to cluster.
        msg = "Expressions of selected genes are 0. Please select additional genes."
        print(error(msg))
        raise ValueError(msg)

distance = distance_map[args.dstfunc]
cluster = linkage(matrix, method=args.linkage, metric=distance)

# Warn when the third linkage column sums to (almost) nothing.
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample IDs are keyed by their 1-based position on the command line.
sample_ids = {
    position: {'id': int(raw_id)}
    for position, raw_id in enumerate(args.sampleids, start=1)
}

output = {
    'cluster': {
        'linkage': cluster.tolist(),
        'samples_names': sample_ids,
        'order': dend['leaves'],
    },
}

# Compact separators keep the printed JSON free of extra whitespace.
print(json.dumps(output, separators=(',', ':')))
예제 #36
0
import pandas as pd

from resolwe_runtime_utils import warning

parser = argparse.ArgumentParser(description=__doc__)

# (option strings, help text) for the two mandatory inputs.
for option_strings, description in [
    (('-bed', '--bed_file'), "All splice junctions in BED12 format"),
    (('-sj', '--novel_sj'), "Table of annotated novel splice junctions"),
]:
    parser.add_argument(*option_strings, required=True, help=description)

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    bed_is_empty = os.path.getsize(bed_file) == 0
    if bed_is_empty:
        # Warn, hand the empty input over as the result and stop successfully.
        print(warning('Bed file has no entries.'))
        os.rename(bed_file, 'novel_sj.bed')
        sys.exit(0)

    read_options = {'delimiter': '\t', 'dtype': str}
    bed_table = pd.read_csv(bed_file, header=None, **read_options)
    sj_table = pd.read_csv(args.novel_sj, **read_options)

    # Rows survive when their column 3 appears in the "name" column.
    keep_rows = bed_table[3].isin(sj_table["name"])
    bed_table[keep_rows].to_csv('novel_sj.bed', sep='\t', index=False, header=False)