def prepare_geo_rnaseq(resource, name=None):
    """Run the ``Prepare GEO - RNA-Seq`` process on the resource.

    The samples of ``resource`` (a single collection or a list of
    samples) are resolved with ``get_samples``. The ids of each
    sample's reads and expression objects are collected and handed to
    the ``prepare-geo-rnaseq`` process.

    :param resource: resource on which prepare_geo_rnaseq will be run
    :param str name: name of the prepare GEO tarball and table; when
        it is falsy, the name returned by ``get_name_collection`` is
        used instead
    :return: the object returned by ``get_or_run``
    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids, expression_ids, seen_collections = [], [], set()
    for item in samples:
        read_ids.append(item.get_reads().id)
        expression_ids.append(item.get_expression().id)
        seen_collections.add(get_resource_collection(item))

    auto_name, collection = get_name_collection(seen_collections, resolwe)

    geo = resolwe.get_or_run(
        slug='prepare-geo-rnaseq',
        input={
            'reads': read_ids,
            'expressions': expression_ids,
            'name': name or auto_name,
        },
    )

    # Attach the result only when a collection was resolved.
    if collection:
        collection.add_data(geo)

    return geo
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    For every sample of every given resource the `MACS 1.4`_ process is
    run with the sample's ``bam`` as ``treatment``. The ``gsize`` input
    is derived with ``gsize_organism`` from the organism stored in the
    sample descriptor. When ``use_background`` is ``True``, the ``bam``
    of the sample's background is passed as ``control``.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: a single resource or a list of resources
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param float p_value: p-value used in the process (only passed
        when it is not ``None``)
    :return: list of objects returned by ``get_or_run``
    :raises KeyError: if the organism lookup for a sample fails
    """
    resources = resource if isinstance(resource, list) else [resource]

    # One dict is shared by all runs; per-sample keys are overwritten below.
    inputs = {}
    if p_value is not None:
        inputs['pvalue'] = p_value

    results = []
    for item in resources:
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(item)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(item):
            inputs['treatment'] = sample.get_bam().id

            try:
                organism = sample.descriptor['sample']['organism']
                inputs['gsize'] = gsize_organism(organism)
            except KeyError:
                raise KeyError('{} is not annotated'.format(sample))

            if use_background:
                skip_background = is_background(sample) and not is_sample(item)
                if skip_background:
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                control_sample = sample.get_background(**background_filter)
                inputs['control'] = control_sample.get_bam().id

            macs_obj = sample.resolwe.get_or_run(slug='macs14', input=inputs)
            sample.add_data(macs_obj)
            results.append(macs_obj)

    return results
def test_get_resource_collection(self):
    """Check ``get_resource_collection`` for each kind of resource."""

    def make_collection(pk):
        # The id passed to the constructor is overridden when the object
        # is initialized, so it is assigned again afterwards.
        made = Collection(id=pk, resolwe=MagicMock())
        made.id = pk
        return made

    # A collection resolves to its own id.
    self.assertEqual(get_resource_collection(make_collection(1)), 1)

    # A relation resolves to the id of its hydrated collection.
    relation = Relation(id=1, resolwe=MagicMock())
    relation._hydrated_collection = make_collection(2)
    self.assertEqual(get_resource_collection(relation), 2)

    # Data and sample objects with exactly one collection resolve to it.
    data = Data(id=1, resolwe=MagicMock())
    data._collections = [make_collection(3)]
    self.assertEqual(get_resource_collection(data), 3)

    sample = Sample(id=1, resolwe=MagicMock())
    sample._collections = [make_collection(4)]
    self.assertEqual(get_resource_collection(sample), 4)

    # With two collections the result is ``None`` by default and a
    # ``LookupError`` when ``fail_silently`` is switched off.
    sample = Sample(id=1, resolwe=MagicMock())
    sample._collections = [make_collection(5), make_collection(6)]
    self.assertEqual(get_resource_collection(sample), None)
    with self.assertRaises(LookupError):
        get_resource_collection(sample, fail_silently=False)
def prepare_geo_chipseq(resource, name=None):
    """Run the ``Prepare GEO - ChIP-Seq`` process on the resource.

    The samples of ``resource`` (a single collection or a list of
    samples) are resolved with ``get_samples``. Reads are collected from
    every sample. Every non-background sample must have exactly one
    ``macs`` object, and its background (if any) must be among the
    resolved samples.

    :param resource: resource on which prepare_geo_chipseq will be run
    :param str name: name of the prepare GEO tarball and table; when
        it is falsy, the name returned by ``get_name_collection`` is
        used instead
    :return: the object returned by ``get_or_run``
    :raises ValueError: if a sample has no or several ``macs`` objects,
        or if its background is not part of the resolved samples
    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids = []
    macs_ids = []
    collection_ids = set()

    for sample in samples:
        read_ids.append(sample.get_reads().id)

        # Background samples contribute their reads only.
        if sample.is_background:
            continue

        peaks = sample.get_macs()
        if not peaks:
            raise ValueError("Sample {} has no `macs` data object!".format(sample))
        if len(peaks) != 1:
            raise ValueError(
                "Sample {} has more than one `macs` data objects!".format(sample)
            )
        macs_ids.append(peaks[0].id)

        if sample.background and sample.background not in samples:
            raise ValueError(
                "Background of the sample {} cannot be found in the resource you provided: "
                "{}!".format(sample, resource)
            )

        collection_ids.add(get_resource_collection(sample))

    auto_name, collection = get_name_collection(collection_ids, resolwe)

    geo = resolwe.get_or_run(
        slug='prepare-geo-chipseq',
        input={
            'reads': read_ids,
            'macs': macs_ids,
            'name': name or auto_name,
        },
    )

    if collection:
        collection.add_data(geo)

    return geo
def prepare_geo_chipseq(resource, name=None):
    """Run ``Prepare GEO - ChIP-Seq`` on a collection or a list of samples.

    Read ids are gathered from all samples resolved by ``get_samples``.
    ``macs`` ids are gathered from the non-background samples only, and
    each of those must own exactly one ``macs`` object.

    :param resource: resource on which prepare_geo_chipseq will be run
    :param str name: name of the prepare GEO tarball and table (falls
        back to the automatically determined name when falsy)
    :return: the object returned by ``get_or_run``
    :raises ValueError: when a sample has zero or multiple ``macs``
        objects, or when its background is missing from the samples
    """
    all_samples = get_samples(resource)
    resolwe = get_resolwe(*all_samples)

    gathered_reads, gathered_macs = [], []
    collections_seen = set()

    for current in all_samples:
        gathered_reads.append(current.get_reads().id)

        if not current.is_background:
            macs_objects = current.get_macs()

            if not macs_objects:
                message = "Sample {} has no `macs` data object!"
                raise ValueError(message.format(current))
            elif len(macs_objects) != 1:
                message = "Sample {} has more than one `macs` data objects!"
                raise ValueError(message.format(current))

            gathered_macs.append(macs_objects[0].id)

            # A sample with a background needs that background in the resource.
            if current.background and current.background not in all_samples:
                message = (
                    "Background of the sample {} cannot be found in the resource you provided: "
                    "{}!"
                )
                raise ValueError(message.format(current, resource))

            collections_seen.add(get_resource_collection(current))

    auto_name, collection = get_name_collection(collections_seen, resolwe)

    process_inputs = {
        'reads': gathered_reads,
        'macs': gathered_macs,
        'name': name or auto_name,
    }
    geo = resolwe.get_or_run(slug='prepare-geo-chipseq', input=process_inputs)

    if collection:
        collection.add_data(geo)
    return geo
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    For every sample of every given resource the `MACS 1.4`_ process is
    run with the sample's primary ``bam`` (falling back to ``bam``) as
    ``treatment``. When ``use_background`` is ``True``, the primary
    ``bam`` of the sample's background is passed as ``control``.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: a single resource or a list of resources
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param float p_value: p-value used in the process (only passed
        when it is not ``None``)
    :return: list of objects returned by ``get_or_run``
    """
    resources = resource if isinstance(resource, list) else [resource]

    # One dict is shared by all runs; per-sample keys are overwritten below.
    inputs = {}
    if p_value is not None:
        inputs['pvalue'] = p_value

    results = []
    for item in resources:
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(item)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(item):
            treatment_bam = sample.get_primary_bam(fallback_to_bam=True)
            inputs['treatment'] = treatment_bam.id

            if use_background:
                if sample.is_background and not is_sample(item):
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                control_sample = sample.get_background(**background_filter)
                control_bam = control_sample.get_primary_bam(fallback_to_bam=True)
                inputs['control'] = control_bam.id

            macs_obj = sample.resolwe.get_or_run(slug='macs14', input=inputs)
            sample.add_data(macs_obj)
            results.append(macs_obj)

    return results
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for selected cuffquants.

    This method runs the `Cuffnorm`_ process on the cuffquant objects of
    all samples of ``resource``, with ``annotation`` and ``use_ercc``
    parameters specified in arguments.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
        (only passed to the process when it is not ``None``)
    :return: the object returned by ``get_or_run``
    """
    # The previous implementation also built a ``relation_filter`` from the
    # resource's collection here, but never used it; that dead lookup is gone.
    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    inputs = {
        'cuffquant': [get_data_id(sample.get_cuffquant()) for sample in samples],
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    # Attach the result to the collection the resource stands for, if any.
    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None,
             threads=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other
    parameters defaults: multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled,
    threads=1. Parameter genome is optional.

    The way the function works depends on the resource. If it is run
    on a collection, it will perform cuffdiff on every 'compare'
    relation labeled 'case-control' in the selected collection. If it
    is run on a list of samples (not necessarily in the same
    collection) it will run cuffdiff on all 'compare' relations labeled
    'case-control' containing all of the given samples but will discard
    those samples in a relation that are not in the list of samples.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples are compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :param int threads: use this many processor threads
    :raises ValueError: if a relation contains a position other than
        'case' or 'control' for one of the given samples
    """
    # NOTE(review): the visible body ends inside the relation loop below;
    # no process is run and nothing is returned. The rest of the function
    # appears to be missing from this view -- confirm against the full file.
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    # Optional parameters are only forwarded when explicitly given.
    if genome is not None:
        # NOTE(review): genome is stored as passed, while annotation goes
        # through get_data_id -- confirm this is intended.
        inputs['genome'] = genome
        input_objects.append(genome)

    if multi_read_correct is not None:
        inputs['multi_read_correct'] = multi_read_correct

    if fdr is not None:
        inputs['fdr'] = fdr

    if library_type is not None:
        inputs['library_type'] = library_type

    if library_normalization is not None:
        inputs['library_normalization'] = library_normalization

    if dispersion_method is not None:
        inputs['dispersion_method'] = dispersion_method

    if threads is not None:
        inputs['threads'] = threads

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    # Filter relations by collection when one can be determined, otherwise
    # by the ids of the given samples.
    collection_id = get_resource_collection(resource)
    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(type='compare', label='case-control', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []

        for sample, position in zip(relation.samples, relation.positions):
            # Samples of the relation that were not requested are ignored.
            if sample.id not in sample_ids:
                continue

            if position == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif position == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Position different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id))
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run Cuffnorm_ for selected cuffquants.

    This method runs the `Cuffnorm`_ process on the cuffquant objects of
    all samples of ``resource``. Each sample must belong to exactly one
    ``group`` relation labeled ``replicates``; samples of the same
    relation get the same replicate index (assigned in order of first
    appearance) and the relation ids are passed as ``labels``.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
        (only passed to the process when it is not ``None``)
    :param int threads: number of threads passed to the process (only
        passed when it is not ``None``)
    :return: the object returned by ``get_or_run``
    :raises LookupError: if a sample does not have exactly one matching
        ``replicates`` relation
    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquants = [get_data_id(sample.get_cuffquant()) for sample in samples]

    labels = []
    replicates = []
    group_index = {}  # relation id -> replicate index (as string)
    for sample in samples:
        matches = resolwe.relation.filter(
            type='group',
            label='replicates',
            entity=[sample.id],
            **relation_filter
        )
        if len(matches) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))
        relation_id = matches[0].id

        if relation_id not in group_index:
            group_index[relation_id] = str(len(group_index))
        replicates.append(group_index[relation_id])

        label = str(relation_id)
        if label not in labels:
            labels.append(label)

    inputs = {
        'cuffquant': cuffquants,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
        'labels': labels,
    }
    for key, value in (('useERCC', use_ercc), ('threads', threads)):
        if value is not None:
            inputs[key] = value

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Optional parameters are only passed to the process
    when they are not ``None``; otherwise the process defaults apply
    (documented as: library_type=fr-unstranded,
    multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled).

    The way the function works depends on the resource. If a collection
    can be determined for the resource, cuffdiff is performed for every
    'compare' relation in that collection. Otherwise 'compare' relations
    are filtered by the ids of the given samples. In both cases, samples
    of a relation that are not among the given samples are discarded,
    and relations left without a 'case' or without a 'control' sample
    are skipped.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples are compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :return: list of objects returned by ``get_or_run``, one per
        processed relation
    :raises ValueError: if a given sample has a partition label other
        than 'case' or 'control', or if no relation could be processed
    """
    base_inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        # Resolve the id the same way as for ``annotation``.
        base_inputs['genome'] = get_data_id(genome)
        input_objects.append(genome)

    optional_inputs = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
    )
    for key, value in optional_inputs:
        if value is not None:
            base_inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    wanted_ids = set(sample_ids)

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    # Filter relations by collection when one can be determined, otherwise
    # by the ids of the given samples.
    collection_id = get_resource_collection(resource)
    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(type='compare', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []

        for partition in relation.partitions:
            entity_id = partition['entity']
            # Skip unwanted samples before fetching them, so no request is
            # spent on samples that would be discarded anyway.
            if entity_id not in wanted_ids:
                continue

            sample = resolwe.sample.get(entity_id)
            label = partition['label']

            if label == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif label == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )

        # Both groups are required for a comparison.
        if not case or not control:
            continue

        # Build a fresh dict per run so inputs of earlier runs are never
        # mutated afterwards.
        run_inputs = dict(base_inputs, case=case, control=control)
        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=run_inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if not cuffdiff_objects:
        if not relations:
            raise ValueError("No relation containing all of the given samples was found")
        raise ValueError(
            "No suitable relation was found (given samples all have either 'case' label "
            "or 'control' label)"
        )

    return cuffdiff_objects
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    This method runs the `ROSE2`_ process once for each ``macs`` object
    of each sample, ranking by the sample's ``bam``. To run the process
    only on a subset of those objects, list them in ``beds`` (a single
    object is wrapped in a list automatically). If ``use_background``
    is ``True``, the ``bam`` of the background sample is passed to the
    process as the control.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: a single resource or a list of resources
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on; if
        ``None``, the process is run for all of them
    :return: list of objects returned by ``get_or_run``
    """
    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for item in resources:
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(item)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(item):
            inputs = {'rankby': sample.get_bam().id}
            for key, value in (('tss', tss), ('stitch', stitch)):
                if value is not None:
                    inputs[key] = value

            if use_background:
                if sample.is_background and not is_sample(item):
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                control_sample = sample.get_background(**background_filter)
                inputs['control'] = control_sample.get_bam().id

            candidates = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                requested = beds if isinstance(beds, list) else [beds]
                wanted_ids = [get_data_id(entry) for entry in requested]
                candidates = candidates.filter(id__in=wanted_ids)

            for bed in candidates:
                inputs['input'] = bed.id

                rose = sample.resolwe.get_or_run(slug='rose2', input=inputs)
                sample.add_data(rose)
                results.append(rose)

    return results
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for selected cuffquants.

    This method runs the `Cuffnorm`_ process on the cuffquant objects of
    all samples of ``resource``. Each sample must belong to exactly one
    ``group`` relation; samples of the same relation get the same
    replicate index, assigned in order of first appearance.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
        (only passed to the process when it is not ``None``)
    :return: the object returned by ``get_or_run``
    :raises LookupError: if a sample does not have exactly one matching
        group relation
    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquants = [get_data_id(sample.get_cuffquant()) for sample in samples]

    replicates = []
    group_index = {}  # relation id -> replicate index (as string)
    for sample in samples:
        matches = resolwe.relation.filter(
            type='group',
            entity=[sample.id],
            **relation_filter
        )
        if len(matches) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )
        relation_id = matches[0].id

        if relation_id not in group_index:
            group_index[relation_id] = str(len(group_index))
        replicates.append(group_index[relation_id])

    inputs = {
        'cuffquant': cuffquants,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj