def main():
    """Run hierarchical clustering of samples and emit the result.

    Parses the command-line arguments, loads the expressions with
    ``get_expressions``, applies ``transform``, clusters the data with
    ``get_clustering`` and hands the assembled result dictionary to
    ``output_json`` together with ``args.output``.
    """
    args = parse_args()

    data = get_expressions(fnames=args.sample_files, gene_set=args.genes)
    data = transform(data, log2=args.log2, normalization=args.normalization)

    all_zero_genes = get_zero_genes(data)
    all_zero_samples = get_zero_samples(data)

    metric = get_distance_metric(args.distance)
    linkage, dendrogram = get_clustering(
        data,
        distance_metric=metric,
        linkage_method=args.linkage,
        ordering_method=args.ordering,
        n_keep=args.n_keep,
        n_trials=args.n_trials,
    )

    # Requested genes that do not appear in the index of the expressions.
    requested_genes = set(args.genes)
    present_genes = set(data.index)
    missing_genes = list(requested_genes.difference(present_genes))

    # Map each position in ``args.sampleids`` to a small record with the ID.
    sample_records = {}
    for position, sample_id in enumerate(args.sampleids):
        sample_records[position] = {'id': sample_id}

    # ``get_zero_samples`` results are used as indices into ``args.sampleids``.
    zero_sample_ids = []
    for sample in all_zero_samples:
        zero_sample_ids.append(args.sampleids[sample])

    result = {
        'linkage': linkage.tolist(),
        'sample_ids': sample_records,
        'order': dendrogram['leaves'],
        'zero_gene_symbols': all_zero_genes,
        'missing_gene_symbols': missing_genes,
        'zero_sample_ids': zero_sample_ids,
    }
    output_json(result, args.output)
def main():
    """Compute sample hierarchical clustering.

    Steps, in order:

    1. Parse the arguments and check that the numbers of sample files,
       sample IDs and sample names agree and that enough samples and genes
       were selected for the chosen distance metric.
    2. Load the expressions with ``get_expressions`` (which also returns the
       genes excluded from the computation) and apply ``transform``.
    3. If ``args.remove_const`` is set, drop samples with constant
       expression via ``remove_const_samples`` and warn about them.
    4. Warn about excluded genes, naming at most three of them and
       appending ``", ..."`` when more than three were excluded.
    5. Cluster the samples and pass the result dictionary (``sample_ids``,
       ``linkage``, ``order``) to ``output_json`` with ``args.output``.

    Problems are reported through ``set_error``; warnings are sent with
    ``send_message(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below read as if set_error() does not return
    # (raises or exits) -- confirm against its definition.
    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)
    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)
    if len(args.sample_files) < 2:
        msg = (
            "Select at least two samples to compute hierarchical clustering of samples."
        )
        set_error(msg)
    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = (
            "Select at least two genes to compute hierarchical clustering of samples with "
            "correlation distance metric or use Euclidean distance metric."
        )
        set_error(msg)

    expressions, excluded = get_expressions(
        fnames=args.sample_files, gene_set=args.gene_labels
    )

    # len() is used on the index on purpose: the truth value of an index
    # object is not a reliable emptiness test.
    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)
    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        gene_name = get_gene_names(
            list(expressions.index), args.source, args.species
        )[0]
        if not args.gene_labels:
            msg = (
                "The selected samples contain only one common gene ({}). At least two common "
                "genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select a different set of samples or use Euclidean "
                "distance metric.".format(gene_name)
            )
        else:
            msg = (
                "Only one of the selected genes ({}) is present in all samples but at least two "
                "such genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select more genes or use Euclidean distance "
                "metric.".format(gene_name)
            )
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches`` is indexed by sample position: truthy for kept samples.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = (
                "All of the selected samples have constant expression across genes. Hierarchical "
                "clustering of samples cannot be computed."
            )
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                "Only one of the selected samples ({}) has a non-constant expression across "
                "genes. However, hierarchical clustering of samples cannot be computed with "
                "just one sample.".format(sample_name)
            )
            set_error(msg)
        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        if removed:
            # Name at most three samples; the ellipsis marks a truncated list.
            removed_suffix = "" if len(removed) <= 3 else ", ..."
            msg = (
                "{} of the selected samples ({}) have constant expression across genes. "
                "Those samples are excluded from the computation of hierarchical clustering of "
                "samples with correlation distance "
                "metric.".format(len(removed), ", ".join(removed[:3]) + removed_suffix)
            )
            send_message(warning(msg))
    else:
        # Nothing was filtered, so every sample is kept.
        matches = [True] * len(args.sample_files)

    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        # Only the first three excluded genes are named. The ellipsis tells
        # the reader the list is truncated (it was previously computed but
        # never added to the message).
        suffix = "" if len(excluded) <= 3 else ", ..."
        listed = ", ".join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                msg = (
                    "Gene {} is present in some but not all of the selected samples. This "
                    "gene is excluded from the computation of hierarchical clustering of "
                    "samples.".format(listed)
                )
            else:
                msg = (
                    "{} of the selected genes ({}) is missing in at least one of the selected "
                    "samples. This gene is excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded), listed)
                )
        else:
            if not args.gene_labels:
                msg = (
                    "{} genes ({}) are present in some but not all of the selected samples. Those "
                    "genes are excluded from the computation of hierarchical clustering of "
                    "samples.".format(len(excluded), listed)
                )
            else:
                msg = (
                    "{} of the selected genes ({}) are missing in at least one of the selected "
                    "samples. Those genes are excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded), listed)
                )
        send_message(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the samples that took part in the clustering.
    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        "sample_ids": {i: {"id": sample_id} for i, sample_id in enumerate(sample_ids)},
        "linkage": linkage.tolist(),
        "order": dendrogram["leaves"],
    }
    output_json(result, args.output)
def main():
    """Compute gene hierarchical clustering.

    Steps, in order:

    1. Parse the arguments and check that the number of sample files matches
       the number of sample names and that enough genes and samples were
       selected for the chosen distance metric.
    2. Load the expressions with ``get_expressions`` (which also returns the
       genes excluded from the computation) and apply ``transform``.
    3. If ``args.remove_const`` is set, drop genes with constant expression
       via ``remove_const_genes`` and warn about them.
    4. Warn about excluded genes, naming at most three of them and
       appending ``', ...'`` when more than three were excluded.
    5. Cluster the genes and pass the result dictionary (``gene_symbols``,
       ``linkage``, ``order``) to ``output_json`` with ``args.output``.

    Problems are reported through ``set_error``; warnings are printed with
    ``print(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below read as if set_error() does not return
    # (raises or exits) -- confirm against its definition.
    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)
    if len(args.gene_labels) == 1:
        msg = 'Select at least two genes to compute hierarchical clustering of genes.'
        set_error(msg)
    if len(args.sample_files) == 1 and args.distance_metric != 'euclidean':
        msg = (
            'Select at least two samples to compute hierarchical clustering of genes with '
            'correlation distance metric or use Euclidean distance metric.'
        )
        set_error(msg)

    expressions, excluded = get_expressions(
        fnames=args.sample_files, gene_set=args.gene_labels
    )

    # len() is used on the index on purpose: the truth value of an index
    # object is not a reliable emptiness test.
    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)
    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        gene_name = get_gene_names(
            list(expressions.index), args.source, args.species
        )[0]
        if not args.gene_labels:
            msg = (
                'The selected samples contain only one common gene ({}). At least two common '
                'genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select a different set of samples or use Euclidean '
                'distance metric.'.format(gene_name)
            )
        else:
            msg = (
                'Only one of the selected genes ({}) is present in all samples but at least two '
                'such genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select more genes or use Euclidean distance '
                'metric.'.format(gene_name)
            )
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # Remember the genes present before filtering. ``expressions`` is
        # rebound to the filtered data below, so its index can no longer be
        # paired with ``matches`` afterwards.
        # NOTE(review): assumes ``matches`` is a positional mask over the
        # pre-filter genes (truthy = kept), mirroring how the sample
        # clustering pairs it with the full sample list -- confirm.
        genes_before = list(expressions.index)
        expressions, matches = remove_const_genes(expressions)
        if len(expressions.index) == 0:
            msg = (
                'All of the selected genes have constant expression across samples. '
                'Hierarchical clustering of genes cannot be computed.'
            )
            set_error(msg)
        if len(expressions.index) == 1:
            gene_names = get_gene_names(
                list(expressions.index), args.source, args.species
            )
            msg = (
                'Only one of the selected genes ({}) has a non-constant expression across '
                'samples. However, hierarchical clustering of genes cannot be computed with '
                'just one gene.'.format(gene_names[0])
            )
            set_error(msg)
        removed = [gene for i, gene in enumerate(genes_before) if not matches[i]]
        if removed:
            # Name at most three genes; the ellipsis marks a truncated list.
            removed_suffix = '' if len(removed) <= 3 else ', ...'
            removed_names = get_gene_names(removed[:3], args.source, args.species)
            msg = (
                '{} of the selected genes ({}) have constant expression across samples. '
                'Those genes are excluded from the computation of hierarchical clustering of '
                'genes with correlation distance '
                'metric.'.format(len(removed), ', '.join(removed_names) + removed_suffix)
            )
            print(warning(msg))

    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        # Only the first three excluded genes are named. The ellipsis tells
        # the reader the list is truncated (it was previously computed but
        # never added to the message).
        suffix = '' if len(excluded) <= 3 else ', ...'
        listed = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # The template has a single placeholder, which must receive
                # the gene name (not the count of excluded genes).
                msg = (
                    'Gene {} is present in some but not all of the selected samples. This '
                    'gene is excluded from the computation of hierarchical clustering of '
                    'genes.'.format(listed)
                )
            else:
                msg = (
                    '{} of the selected genes ({}) is missing in at least one of the selected '
                    'samples. This gene is excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), listed)
                )
        else:
            if not args.gene_labels:
                msg = (
                    '{} genes ({}) are present in some but not all of the selected samples. Those '
                    'genes are excluded from the computation of hierarchical clustering of '
                    'genes.'.format(len(excluded), listed)
                )
            else:
                msg = (
                    '{} of the selected genes ({}) are missing in at least one of the selected '
                    'samples. Those genes are excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), listed)
                )
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    result = {
        'gene_symbols': {i: {'gene': gene} for i, gene in enumerate(expressions.index)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
def main():
    """Compute sample hierarchical clustering.

    Steps, in order:

    1. Parse the arguments and check that the numbers of sample files,
       sample IDs and sample names agree and that enough samples and genes
       were selected for the chosen distance metric.
    2. Load the expressions with ``get_expressions`` (which also returns the
       genes excluded from the computation) and apply ``transform``.
    3. If ``args.remove_const`` is set, drop samples with constant
       expression via ``remove_const_samples`` and warn about them.
    4. Warn about excluded genes, naming at most three of them and
       appending ``', ...'`` when more than three were excluded.
    5. Cluster the samples and pass the result dictionary (``sample_ids``,
       ``linkage``, ``order``) to ``output_json`` with ``args.output``.

    Problems are reported through ``set_error``; warnings are printed with
    ``print(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below read as if set_error() does not return
    # (raises or exits) -- confirm against its definition.
    if len(args.sample_files) != len(args.sample_ids):
        msg = 'The number of sample files does not match the number of sample IDs.'
        set_error(msg)
    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)
    if len(args.sample_files) < 2:
        msg = 'Select at least two samples to compute hierarchical clustering of samples.'
        set_error(msg)
    if len(args.gene_labels) == 1 and args.distance_metric != 'euclidean':
        msg = (
            'Select at least two genes to compute hierarchical clustering of samples with '
            'correlation distance metric or use Euclidean distance metric.'
        )
        set_error(msg)

    expressions, excluded = get_expressions(
        fnames=args.sample_files, gene_set=args.gene_labels
    )

    # len() is used on the index on purpose: the truth value of an index
    # object is not a reliable emptiness test.
    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)
    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        gene_name = get_gene_names(
            list(expressions.index), args.source, args.species
        )[0]
        if not args.gene_labels:
            msg = (
                'The selected samples contain only one common gene ({}). At least two common '
                'genes are required to compute hierarchical clustering of samples with '
                'correlation distance metric. Select a different set of samples or use Euclidean '
                'distance metric.'.format(gene_name)
            )
        else:
            msg = (
                'Only one of the selected genes ({}) is present in all samples but at least two '
                'such genes are required to compute hierarchical clustering of samples with '
                'correlation distance metric. Select more genes or use Euclidean distance '
                'metric.'.format(gene_name)
            )
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches`` is indexed by sample position: truthy for kept samples.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = (
                'All of the selected samples have constant expression across genes. Hierarchical '
                'clustering of samples cannot be computed.'
            )
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                'Only one of the selected samples ({}) has a non-constant expression across '
                'genes. However, hierarchical clustering of samples cannot be computed with '
                'just one sample.'.format(sample_name)
            )
            set_error(msg)
        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        if removed:
            # Name at most three samples; the ellipsis marks a truncated list.
            removed_suffix = '' if len(removed) <= 3 else ', ...'
            msg = (
                '{} of the selected samples ({}) have constant expression across genes. '
                'Those samples are excluded from the computation of hierarchical clustering of '
                'samples with correlation distance '
                'metric.'.format(len(removed), ', '.join(removed[:3]) + removed_suffix)
            )
            print(warning(msg))
    else:
        # Nothing was filtered, so every sample is kept.
        matches = [True] * len(args.sample_files)

    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        # Only the first three excluded genes are named. The ellipsis tells
        # the reader the list is truncated (it was previously computed but
        # never added to the message).
        suffix = '' if len(excluded) <= 3 else ', ...'
        listed = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # The template has a single placeholder, which must receive
                # the gene name (not the count of excluded genes).
                msg = (
                    'Gene {} is present in some but not all of the selected samples. This '
                    'gene is excluded from the computation of hierarchical clustering of '
                    'samples.'.format(listed)
                )
            else:
                msg = (
                    '{} of the selected genes ({}) is missing in at least one of the selected '
                    'samples. This gene is excluded from the computation of hierarchical '
                    'clustering of samples.'.format(len(excluded), listed)
                )
        else:
            if not args.gene_labels:
                msg = (
                    '{} genes ({}) are present in some but not all of the selected samples. Those '
                    'genes are excluded from the computation of hierarchical clustering of '
                    'samples.'.format(len(excluded), listed)
                )
            else:
                msg = (
                    '{} of the selected genes ({}) are missing in at least one of the selected '
                    'samples. Those genes are excluded from the computation of hierarchical '
                    'clustering of samples.'.format(len(excluded), listed)
                )
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the samples that took part in the clustering.
    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        'sample_ids': {i: {'id': sample_id} for i, sample_id in enumerate(sample_ids)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)