def gather_chunks(chunks, output_file, nproc=1):
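    """Merge per-barcode-pair chunk datastores into a single datastore JSON.

    Input files are grouped by the barcode-pair field embedded in their file
    names (the third-from-last dot-separated field) and merged in parallel
    with a multiprocessing pool of nproc workers.
    """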
    if len(chunks) == 1:
        datastore = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        datastore.write_json(output_file)
        return len(datastore.files)
    file_names_by_bc = defaultdict(list)
    datastore_files_by_bc = {}
    for file_name in chunks:
        log.info("Reading datastore from %s", file_name)
        datastore = DataStore.load_from_json(op.realpath(file_name))
        for ds_file in datastore.files.values():
            ds_file_name = op.realpath(ds_file.path)
            base_name = op.basename(ds_file_name)
            fields = base_name.split(".")
            bc_pair = fields[-3]
            file_names_by_bc[bc_pair].append(ds_file_name)
            datastore_files_by_bc[bc_pair] = ds_file
    log.info("Found %d unique barcode pairs", len(file_names_by_bc))
    _results = []
    pool = multiprocessing.Pool(nproc)
    for bc_pair, file_names in file_names_by_bc.items():
        _results.append(
            pool.apply_async(_merge_chunks,
                             (file_names, datastore_files_by_bc[bc_pair])))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    datastore_out = DataStore(datastore_files)
    log.info("Writing datastore to %s", output_file)
    datastore_out.write_json(output_file)
    return len(datastore_files)
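
A minimal invocation sketch for the gather step above; the chunk paths, output
path and process count are placeholders, and gather_chunks is assumed to be
imported from the module that defines it.

# Hypothetical call: merge per-chunk datastore JSONs into one gathered datastore.
chunk_files = ["lima-chunk-1.datastore.json", "lima-chunk-2.datastore.json"]
n_written = gather_chunks(chunk_files, "gathered.datastore.json", nproc=4)
log.info("Gathered %d datastore files", n_written)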
Example #2
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType, ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
Example #3
def _make_datastore(subreads):
    files = [
        DataStoreFile(uuid.uuid4(), "barcoding.tasks.lima-out-0",
                      FileTypes.DS_SUBREADS.file_type_id, subreads)
    ]
    ds = DataStore(files)
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    return ds_path
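
As a quick sanity check (a sketch only; the subreadset path is a placeholder),
the JSON written by the helper above can be read back with
DataStore.load_from_json and its entries inspected:

# Round-trip the datastore JSON produced by _make_datastore.
ds_path = _make_datastore("movie.subreadset.xml")  # placeholder input
ds = DataStore.load_from_json(ds_path)
for file_uuid, ds_file in ds.files.items():
    log.info("datastore file %s -> %s", file_uuid, ds_file.path)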
Example #4
def dataset_to_datastore(dataset_file,
                         datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    dsmd = get_dataset_metadata(dataset_file)
    ds_file = DataStoreFile(dsmd.uuid, source_id, dsmd.metatype, dataset_file)
    ds_out = DataStore([ds_file])
    ds_out.write_json(datastore_file)
    return 0
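
A hedged example of using the helper above to wrap a single dataset XML in a
one-file datastore; the file names and source_id are placeholders.

# Hypothetical call: describe an existing dataset XML as a one-entry datastore.
dataset_to_datastore("movie.subreadset.xml",
                     "movie.datastore.json",
                     source_id="pbcoretools.tasks.example-out-0")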
Example #5
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    consolidate_f=lambda ds: ds.consolidate):
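    """Consolidate the BAM resources of a dataset into at most n_files BAM
    files (when consolidate is True) and write a datastore JSON describing
    the resulting BAM and BAI index files."""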
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore, since consolidated
                    # dataset may contain multiple indices pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
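
A sketch of driving the function above with the default consolidate_f (the
dataset's own consolidate method); the dataset and output paths are
placeholders, and the surrounding module's helpers (bam_of_dataset, Constants,
get_reads_name) are assumed to be available.

# Hypothetical invocation: collapse a multi-BAM dataset into a single BAM and
# record the resulting resources in a datastore JSON.
run_consolidate("mapped.alignmentset.xml",
                "consolidated.alignmentset.xml",
                "consolidated.datastore.json",
                consolidate=True,
                n_files=1)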
Example #6
def test_datastore_paths(self):
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile), False, "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    self.assertEqual(list(ds.files.values())[0].path, tmpfile)
Example #7
def run_dev_txt_to_datastore(rtc):

    p = os.path.dirname(rtc.task.output_files[0])

    sleep_multiplier = rtc.task.options[
        'pbsmrtpipe.task_options.sleep_multiplier']
    t_sleep = sleep_multiplier * random.random()
    log.info("Sleeping for %.1f seconds", t_sleep)
    time.sleep(t_sleep)

    from pbcore.io import SubreadSet

    num_subreadsets = rtc.task.options[
        'pbsmrtpipe.task_options.num_subreadsets']

    sset = SubreadSet(rtc.task.input_files[0])
    add_parent = True
    if len(sset.metadata.provenance) > 0:
        log.warning("Not adding provenance since input already has a parent")
        add_parent = False

    def to_f(x):
        source_id = "out-1"
        sset_out = sset.copy()
        sset_out.newUuid(random=True)
        if add_parent:
            sset_out.metadata.addParentDataSet(sset.uuid,
                                               sset.datasetType,
                                               createdBy="AnalysisJob",
                                               timeStampedName="")
        file_name = "file-{x:03d}.subreadset.xml".format(x=x)
        out_path = os.path.join(p, file_name)
        sset_out.write(out_path)
        sset_uuid = sset_out.uniqueId
        name = "subreadset-{}".format(x)
        dsf = DataStoreFile(sset_uuid,
                            source_id,
                            FileTypes.DS_SUBREADS.file_type_id,
                            file_name,
                            name=name,
                            description="{} Example Description".format(name))
        return dsf

    files = [to_f(i + 1) for i in range(num_subreadsets)]
    ds = DataStore(files)
    ds.write_json(rtc.task.output_files[0])
    return 0
Example #8
def gather_datastore(input_files, output_file, skip_empty=True):
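    """Combine the file entries of several datastore JSONs into one datastore."""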
    ds = DataStore([])
    for i_fn in input_files:
        for uuid, f in DataStore.load_from_json(i_fn).files.items():
            ds.add(f)
    ds.write_json(output_file)
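
A minimal sketch of the merge helper above; the input and output paths are
placeholders.

# Hypothetical call: merge the files from several chunked datastores into one.
gather_datastore(["chunk-1.datastore.json", "chunk-2.datastore.json"],
                 "merged.datastore.json")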
Example #9
def write_and_initialize_data_store_json(file_name, ds_files):
    ds = DataStore(ds_files)
    ds.write_json(file_name)
    return ds
Example #10
def test_failure_no_inputs(self):
    ds = DataStore([])
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    with self.assertRaises(ValueError) as err:
        report = run_to_report(ds_path, self.barcodes, self.subreads)