def test_run_workflow_with_dataset_collection(self):
    """Run the workflow on a 2-element dataset list plus a single dataset.

    Verifies that the single output is a list-type HDCA with two elements,
    created in the same history the inputs live in.
    """
    first = self.hist.paste_content(FOO_DATA)
    second = self.hist.paste_content(FOO_DATA_2)
    description = dataset_collections.CollectionDescription(
        name="MyDatasetList",
        elements=[
            dataset_collections.HistoryDatasetElement(name="sample1", id=first.id),
            dataset_collections.HistoryDatasetElement(name="sample2", id=second.id),
        ],
    )
    hdca = self.hist.create_dataset_collection(description)
    inputs = {
        "Input Dataset Collection": hdca,
        "Input 2": first,
    }
    outputs, out_hist = self.wf.run(inputs, self.hist, wait=True)
    self.assertEqual(len(outputs), 1)
    out_collection = outputs[0]
    self.assertIsInstance(
        out_collection, wrappers.HistoryDatasetCollectionAssociation)
    self.assertEqual(out_collection.collection_type, 'list')
    self.assertEqual(len(out_collection.elements), 2)
    self.assertEqual(out_hist.id, self.hist.id)
def test_create_list_in_history(self):
    """Create a flat 'list' collection from three datasets and check its layout."""
    history_id = self.gi.histories.create_history(
        name="TestDSListCreate")["id"]
    # Upload three test datasets; element names are sample1..sample3 in order.
    dataset_ids = [self._test_dataset(history_id) for _ in range(3)]
    description = collections.CollectionDescription(
        name="MyDatasetList",
        elements=[
            collections.HistoryDatasetElement(name="sample%d" % (index + 1),
                                              id=ds_id)
            for index, ds_id in enumerate(dataset_ids)
        ],
    )
    collection_response = self.gi.histories.create_dataset_collection(
        history_id=history_id,
        collection_description=description)
    self.assertEqual(collection_response["name"], "MyDatasetList")
    self.assertEqual(collection_response["collection_type"], "list")
    elements = collection_response["elements"]
    self.assertEqual(len(elements), 3)
    self.assertEqual(elements[0]["element_index"], 0)
    # Elements must come back in upload order.
    for element, ds_id in zip(elements, dataset_ids):
        self.assertEqual(element["object"]["id"], ds_id)
    self.assertEqual(elements[2]["element_identifier"], "sample3")
def build_list(self):
    """
    Build forward/reverse pairs from the fastq files uploaded to SNVPhyl and
    create a 'list:paired' dataset collection in the working history.

    Files are matched on the common prefix before an ``_R1``/``_R2`` (or
    lowercase) marker in their names. If the R1 and R2 counts differ, a
    warning is printed and only the files that can be paired are used.

    :return:
    """
    # show_history can fail transiently; retry until it succeeds.
    while True:
        try:
            contents = self.gi.histories.show_history(self.history_id, contents=True)
            break
        except (ConnectionError, requests.exceptions.ConnectionError):
            self.wait_for_problem()
    # create a list of galaxy items, for all the fastq files that are found
    fastqs = [
        item for item in contents
        if item["history_content_type"] == "dataset"
        and item["extension"] == "fastq"
    ]
    # Split into R1/R2 groups, rewriting each item's name to the shared
    # prefix so forward and reverse reads can be matched by name below.
    r1s = []
    r2s = []
    for fastq in fastqs:
        result1 = re.findall(r"(.+)_[Rr]1", fastq["name"], flags=0)
        result2 = re.findall(r"(.+)_[Rr]2", fastq["name"], flags=0)
        if len(result1) >= 1:
            fastq["name"] = result1[0]
            r1s.append(fastq)
        if len(result2) >= 1:
            fastq["name"] = result2[0]
            r2s.append(fastq)
    if len(r1s) != len(r2s):
        self.t.time_print("[WARNING] There are different amounts of R1 and R2 files,"
                          " will only use ones that can be paired.")
    # Index reverse reads by sample name (first occurrence wins, matching the
    # old nested scan) so each forward read pairs with one dict lookup
    # instead of an O(n^2) inner loop.
    r2_by_name = {}
    for compare in r2s:
        r2_by_name.setdefault(compare["name"], compare)
    pairs = []
    done = set()
    for sequence in r1s:
        name = sequence["name"]
        compare = r2_by_name.get(name)
        if compare is not None and name not in done:
            # Pair them
            elements = [
                collections.HistoryDatasetElement(name="forward", id=sequence["id"]),
                collections.HistoryDatasetElement(name="reverse", id=compare["id"])
            ]
            done.add(name)
            pairs.append(collections.CollectionElement(name,
                                                       type="paired",
                                                       elements=elements))
    collection_description = collections.CollectionDescription("pair_list",
                                                               type="list:paired",
                                                               elements=pairs)
    # Collection creation can also fail transiently; retry as above.
    while True:
        try:
            self.gi.histories.create_dataset_collection(self.history_id,
                                                        collection_description)
            break
        except (ConnectionError, requests.exceptions.ConnectionError):
            self.wait_for_problem()
def test_create_list_of_paired_datasets_in_history(self):
    """Create a 'list:paired' collection of two pairs and verify its structure."""
    history_id = self.gi.histories.create_history(
        name="TestDSListCreate")["id"]
    dataset1_id = self._test_dataset(history_id)
    dataset2_id = self._test_dataset(history_id)
    dataset3_id = self._test_dataset(history_id)
    dataset4_id = self._test_dataset(history_id)

    def _pair(name, forward_id, reverse_id):
        # Build one 'paired' element from a forward/reverse dataset id pair.
        return collections.CollectionElement(
            name=name,
            type="paired",
            elements=[
                collections.HistoryDatasetElement(name="forward", id=forward_id),
                collections.HistoryDatasetElement(name="reverse", id=reverse_id),
            ])

    description = collections.CollectionDescription(
        name="MyListOfPairedDatasets",
        type="list:paired",
        elements=[
            _pair("sample1", dataset1_id, dataset2_id),
            _pair("sample2", dataset3_id, dataset4_id),
        ])
    collection_response = self.gi.histories.create_dataset_collection(
        history_id=history_id,
        collection_description=description)

    self.assertEqual(collection_response["name"], "MyListOfPairedDatasets")
    self.assertEqual(collection_response["collection_type"], "list:paired")
    elements = collection_response["elements"]
    self.assertEqual(len(elements), 2)

    # First pair: check the forward element end-to-end.
    self.assertEqual(elements[0]["element_index"], 0)
    created_pair1 = elements[0]["object"]
    self.assertEqual(created_pair1["collection_type"], "paired")
    self.assertEqual(len(created_pair1["elements"]), 2)
    forward_element1 = created_pair1["elements"][0]
    self.assertEqual(forward_element1["element_identifier"], "forward")
    self.assertEqual(forward_element1["element_index"], 0)
    forward_dataset1 = forward_element1["object"]
    self.assertEqual(forward_dataset1["id"], dataset1_id)

    # Second pair: check the reverse element end-to-end.
    self.assertEqual(elements[1]["element_index"], 1)
    created_pair2 = elements[1]["object"]
    self.assertEqual(created_pair2["collection_type"], "paired")
    self.assertEqual(len(created_pair2["elements"]), 2)
    reverse_element2 = created_pair2["elements"][1]
    reverse_dataset2 = reverse_element2["object"]
    self.assertEqual(reverse_element2["element_identifier"], "reverse")
    self.assertEqual(reverse_element2["element_index"], 1)
    self.assertEqual(reverse_dataset2["id"], dataset4_id)
def _create_collection_description(self):
    """Paste two datasets into the history and store a list description for them.

    Sets ``self.dataset1``, ``self.dataset2`` and ``self.collection_description``.
    """
    self.dataset1 = self.hist.paste_content(FOO_DATA)
    self.dataset2 = self.hist.paste_content(FOO_DATA_2)
    samples = [("sample1", self.dataset1), ("sample2", self.dataset2)]
    self.collection_description = dataset_collections.CollectionDescription(
        name="MyDatasetList",
        elements=[
            dataset_collections.HistoryDatasetElement(name=label, id=dataset.id)
            for label, dataset in samples
        ],
    )
def _create_pair_in_history(self, history_id):
    """Upload two test datasets into *history_id* and wrap them as a 'paired' collection.

    Returns the raw collection-creation response dict from the Galaxy API.
    """
    forward_id = self._test_dataset(history_id)
    reverse_id = self._test_dataset(history_id)
    description = collections.CollectionDescription(
        name="MyTestPair",
        type="paired",
        elements=[
            collections.HistoryDatasetElement(name="forward", id=forward_id),
            collections.HistoryDatasetElement(name="reverse", id=reverse_id),
        ])
    return self.gi.histories.create_dataset_collection(
        history_id=history_id,
        collection_description=description)
def create_dataset_collection(self, gi, outputhist, name="DatasetList"):
    """
    Make a dataset collection with the datasets listed in self.dataset_collection

    Args:
        gi (GalaxyInstance): The current instance of Galaxy being used
        outputhist (History): The history in which to create the dataset collection
        name (str): The name of the new dataset collection

    Returns:
        dataset_collection (HistoryDatasetCollectionAssociation): The new
        dataset collection object

    Raises:
        ValueError: if the configured collection type is not 'list' or
        'list:paired', or if a 'list:paired' collection is requested for an
        odd number of imported datasets.
    """
    self.logger.info("Dataset collection name: '%s'" % name)
    datasets = self.import_datasets('dataset_collection', gi, outputhist)
    collection_type = self.dataset_collection['type']

    if collection_type == 'list':
        collection_elements = [
            collections.HistoryDatasetElement(name=dataset.name, id=dataset.id)
            for dataset in datasets
        ]
    elif collection_type == 'list:paired':
        # Datasets are consumed in (forward, reverse) order, so an odd count
        # means the last forward read has no mate — fail early with a clear
        # message instead of an IndexError mid-build.
        if len(datasets) % 2 != 0:
            self.logger.error(
                "A 'list:paired' collection needs an even number of datasets")
            raise ValueError(
                "A 'list:paired' collection needs an even number of datasets")
        collection_elements = [
            collections.CollectionElement(
                name=forward.name,
                type='paired',
                elements=[
                    collections.HistoryDatasetElement(
                        name='forward', id=forward.id),
                    collections.HistoryDatasetElement(
                        name='reverse', id=reverse.id),
                ])
            # Pair consecutive datasets: (0,1), (2,3), ...
            for forward, reverse in zip(datasets[::2], datasets[1::2])
        ]
    else:
        self.logger.error(
            "Dataset collection type must be 'list' or 'list:paired'")
        raise ValueError(
            "Dataset collection type must be 'list' or 'list:paired'")

    collection_description = collections.CollectionDescription(
        name=name,
        type=collection_type,
        elements=collection_elements)
    return outputhist.create_dataset_collection(collection_description)
def test_run_workflow_with_dataset_collection(self):
    """Invoke the workflow on a dataset list plus a single dataset.

    Checks history content counts before and after the invocation and the
    list-type output collection of the final step.
    """
    ds_first = self.hist.paste_content(FOO_DATA)
    ds_second = self.hist.paste_content(FOO_DATA_2)
    description = dataset_collections.CollectionDescription(
        name="MyDatasetList",
        elements=[
            dataset_collections.HistoryDatasetElement(name="sample1", id=ds_first.id),
            dataset_collections.HistoryDatasetElement(name="sample2", id=ds_second.id),
        ])
    hdca = self.hist.create_dataset_collection(description)
    # Two pasted datasets + the collection itself.
    self.assertEqual(len(self.hist.content_infos), 3)
    invocation = self.wf.invoke({"0": hdca, "1": ds_first}, history=self.hist)
    invocation.wait()
    self.hist.refresh()
    self.assertEqual(len(self.hist.content_infos), 6)
    final_step = invocation.sorted_steps_by()[-1]
    out_hdca = final_step.get_output_collections()['out_file1']
    self.assertEqual(out_hdca.collection_type, 'list')
    self.assertEqual(len(out_hdca.elements), 2)
    self.assertEqual(out_hdca.container.id, self.hist.id)
def build_list(self):
    """Pair the history's fastq files and create a 'list:paired' collection.

    Forward and reverse reads are matched on the prefix before an
    ``_R1``/``_R2`` (or lowercase) marker; unpairable files are skipped
    with a warning when the R1/R2 counts differ.
    """
    # Retry transient connection failures on the history listing.
    while True:
        try:
            contents = self.gi.histories.show_history(self.history_id,
                                                      contents=True)
            break
        except (ConnectionError, requests.exceptions.ConnectionError):
            self.wait_for_problem()
    # get fastq files
    fastqs = [
        item for item in contents
        if item["history_content_type"] == "dataset"
        and item["extension"] == "fastq"
    ]
    # pair fastq files: rewrite each name to the shared prefix so forward
    # and reverse reads can be matched by name
    r1s = []
    r2s = []
    for fastq in fastqs:
        result1 = re.findall(r"(.+)_[Rr]1", fastq["name"], flags=0)
        result2 = re.findall(r"(.+)_[Rr]2", fastq["name"], flags=0)
        if len(result1) >= 1:
            fastq["name"] = result1[0]
            r1s.append(fastq)
        if len(result2) >= 1:
            fastq["name"] = result2[0]
            r2s.append(fastq)
    if len(r1s) != len(r2s):
        self.t.time_print(
            "[WARNING] There are different amounts of R1 and R2 files,"
            " will only use ones that can be paired.")
    # Index reverse reads by name (first occurrence wins, matching the old
    # nested scan) so pairing is O(n) instead of O(n^2).
    r2_by_name = {}
    for compare in r2s:
        r2_by_name.setdefault(compare["name"], compare)
    pairs = []
    done = set()
    for sequence in r1s:
        name = sequence["name"]
        compare = r2_by_name.get(name)
        if compare is not None and name not in done:
            # Pair them
            elements = [
                collections.HistoryDatasetElement(name="forward",
                                                  id=sequence["id"]),
                collections.HistoryDatasetElement(name="reverse",
                                                  id=compare["id"])
            ]
            done.add(name)
            pairs.append(
                collections.CollectionElement(name,
                                              type="paired",
                                              elements=elements))
    collection_description = collections.CollectionDescription(
        "pair_list", type="list:paired", elements=pairs)
    # Retry transient connection failures on collection creation too.
    while True:
        try:
            self.gi.histories.create_dataset_collection(
                self.history_id, collection_description)
            break
        except (ConnectionError, requests.exceptions.ConnectionError):
            self.wait_for_problem()
# Split the elements of the failed collection into the datasets that
# finished cleanly (state 'ok' and non-empty) and the rest, then create
# one new collection in the history for each group.
failedCollection = gi.histories.show_dataset_collection(
    historyId, collectionId)
okDatasets = [
    d for d in failedCollection['elements']
    if d['object']['state'] == 'ok' and d['object']['file_size'] > 0
]
notOkDatasets = [
    d for d in failedCollection['elements']
    if d['object']['state'] != 'ok' or d['object']['file_size'] == 0
]
okCollectionName = failedCollection['name'] + " (ok)"
notOkCollectionName = failedCollection['name'] + " (not ok)"
for newName, group in ((okCollectionName, okDatasets),
                       (notOkCollectionName, notOkDatasets)):
    gi.histories.create_dataset_collection(
        history_id=historyId,
        collection_description=collections.CollectionDescription(
            name=newName,
            elements=[
                collections.HistoryDatasetElement(d['object']['name'],
                                                  d['object']['id'])
                for d in group
            ]))
# Locate the target collection in the history by its displayed id ('hid'),
# then split its elements into ok / not-ok groups and recreate each group
# as a new collection in the same history.
historyId = historyMatches[0]['id']
historyContents = gi.histories.show_history(historyId, contents=True,
                                            deleted=False, visible=True,
                                            details=False)
matchingCollections = [entry for entry in historyContents
                       if entry['hid'] == collectionHistoryId]
if not matchingCollections:
    print("Error: no collections matching that id found.")
    exit(1)
if len(matchingCollections) > 1:
    print("Error: more than one collection matching that id found (WTF?)")
    exit(1)
collectionId = matchingCollections[0]['id']
failedCollection = gi.histories.show_dataset_collection(historyId,
                                                        collectionId)
# A dataset counts as ok only if it finished and is non-empty.
okDatasets = [d for d in failedCollection['elements']
              if d['object']['state'] == 'ok'
              and d['object']['file_size'] > 0]
notOkDatasets = [d for d in failedCollection['elements']
                 if d['object']['state'] != 'ok'
                 or d['object']['file_size'] == 0]
okCollectionName = failedCollection['name'] + " (ok)"
notOkCollectionName = failedCollection['name'] + " (not ok)"
for newName, group in ((okCollectionName, okDatasets),
                       (notOkCollectionName, notOkDatasets)):
    gi.histories.create_dataset_collection(
        history_id=historyId,
        collection_description=collections.CollectionDescription(
            name=newName,
            elements=[collections.HistoryDatasetElement(d['object']['name'],
                                                        d['object']['id'])
                      for d in group]))