def gather_chunks(chunks, output_file, nproc=1):
    """
    Merge per-chunk datastore JSON files into a single datastore, combining
    chunk outputs that belong to the same barcode pair.

    :param chunks: list of paths to chunk datastore JSON files
    :param output_file: path of the merged datastore JSON to write
    :param nproc: number of worker processes used to merge barcode groups
    :return: number of files in the output datastore
    """
    # Trivial case: a single chunk is re-written as-is.
    if len(chunks) == 1:
        datastore = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        datastore.write_json(output_file)
        return len(datastore.files)
    file_names_by_bc = defaultdict(list)
    datastore_files_by_bc = {}
    for file_name in chunks:
        log.info("Reading datastore from %s", file_name)
        datastore = DataStore.load_from_json(op.realpath(file_name))
        for ds_file in datastore.files.values():
            ds_file_name = op.realpath(ds_file.path)
            base_name = op.basename(ds_file_name)
            fields = base_name.split(".")
            # The barcode-pair label is assumed to be the third-to-last
            # dot-separated field of the file name, e.g.
            # "movie.bc1001--bc1001.subreadset.xml" -> "bc1001--bc1001".
            # TODO(review): confirm all inputs follow this naming scheme.
            bc_pair = fields[-3]
            file_names_by_bc[bc_pair].append(ds_file_name)
            # Keep one representative DataStoreFile per barcode pair;
            # the last one seen wins.
            datastore_files_by_bc[bc_pair] = ds_file
    log.info("Found %d unique barcode pairs", len(file_names_by_bc))
    _results = []
    # Merge each barcode pair's chunk files in a separate worker process.
    pool = multiprocessing.Pool(nproc)
    for bc_pair, file_names in file_names_by_bc.items():
        _results.append(
            pool.apply_async(_merge_chunks,
                             (file_names, datastore_files_by_bc[bc_pair])))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    datastore_out = DataStore(datastore_files)
    log.info("Writing datastore to %s", output_file)
    datastore_out.write_json(output_file)
    return len(datastore_files)
def to_reports(subreads, output_dir):
    """
    Generate the per-module statistics reports for a SubreadSet and collect
    the resulting report JSON files into a DataStore.

    Modules that raise InvalidStatsError (missing required statistics) are
    logged and skipped rather than failing the whole run.
    """
    log.info("Loading {f}".format(f=subreads))
    sset = SubreadSet(subreads)
    sset.loadStats()
    report_modules = [
        ("filter_stats_xml", filter_stats_xml),
        ("adapter_xml", adapter_xml),
        ("loading_xml", loading_xml),
        ("control", control),
    ]
    ds_files = []
    for base, module in report_modules:
        source_id = getattr(module, "Constants").TOOL_ID
        make_report = getattr(module, "to_report_impl")
        try:
            report_dir = os.path.join(output_dir, base)
            os.mkdir(report_dir)
            json_path = os.path.join(report_dir, "{b}.json".format(b=base))
            report = make_report(sset, report_dir)
            log.info("Writing {f}".format(f=json_path))
            report.write_json(json_path)
        except InvalidStatsError:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
        else:
            ds_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=source_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=json_path,
                is_chunked=False,
                name=base))
    return DataStore(ds_files)
def run_consolidate(dataset_file, output_file, datastore_file, consolidate,
                    n_files, task_id=Constants.TOOL_ID):
    """
    Optionally consolidate a dataset's external BAM resources, write the
    re-UUID'd dataset XML, and emit a datastore JSON listing the BAM/BAI
    resources.

    :param dataset_file: input dataset XML path
    :param output_file: output dataset XML path
    :param datastore_file: output datastore JSON path
    :param consolidate: if True, merge external resources into new BAM(s)
    :param n_files: maximum number of BAM files after consolidation
    :param task_id: prefix used for the sourceId of emitted DataStoreFiles
    :return: 0 (shell-style success code)
    """
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            # A dataset with exactly one external file is left untouched.
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam)
                    datastore_files.append(ds_file)
                    # Also expose each BAM's BAI index files.
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        # New UUID so the written dataset is distinct from its input.
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def mock_update_barcoded_sample_metadata(base_dir,
                                         datastore_file,
                                         input_reads,
                                         barcode_set,
                                         use_barcode_uuids=True):
    """
    Function to mimic the actual update function, without actually reading
    any barcoding information from the datasets.  Instead, the barcodes
    defined in the input dataset will be applied sequentially.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file, None)
    # Map barcode name -> its index within the barcode set.
    barcode_ids = {name: i for i, name in enumerate(barcode_names)}
    bc_pairs = []
    ds_files = {}
    for bc_label in barcode_uuids_d.keys():
        # Labels have the form "<forward>--<reverse>".
        bc_fw_label, bc_rev_label = bc_label.split("--")
        bc_pairs.append((barcode_ids[bc_fw_label], barcode_ids[bc_rev_label]))
        suffix = ".{l}.subreadset.xml".format(l=bc_label)
        # Associate the datastore file whose path carries this label with
        # the barcode pair just appended.
        for ds_file in update_files:
            if ds_file.path.endswith(suffix):
                ds_files[bc_pairs[-1]] = ds_file
    new_files = []
    # Every file to update must have a matching barcode pair.
    assert len(bc_pairs) >= len(update_files)
    for bc_pair in bc_pairs:
        ds_file = ds_files[bc_pair]
        new_files.append(
            _mock_update_barcoded_sample_metadata(base_dir,
                                                  ds_file,
                                                  barcode_names,
                                                  parent_info,
                                                  use_barcode_uuids,
                                                  bc_pair,
                                                  bio_samples_d,
                                                  barcode_uuids_d))
    return DataStore(new_files)
def _make_datastore(subreads):
    """Write a single-entry datastore JSON wrapping *subreads* and return
    the path of the freshly created temporary JSON file."""
    ds_file = DataStoreFile(uuid.uuid4(),
                            "barcoding.tasks.lima-out-0",
                            FileTypes.DS_SUBREADS.file_type_id,
                            subreads)
    tmp_json = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    DataStore([ds_file]).write_json(tmp_json)
    return tmp_json
def dataset_to_datastore(dataset_file, datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    metadata = get_dataset_metadata(dataset_file)
    entry = DataStoreFile(metadata.uuid, source_id, metadata.metatype,
                          dataset_file)
    DataStore([entry]).write_json(datastore_file)
    return 0
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    """
    Optionally consolidate a dataset's BAM resources into at most *n_files*
    files, write the re-UUID'd dataset XML, and emit a datastore JSON
    listing the BAM and BAI resources for display.

    :param dataset_file: input dataset XML path
    :param output_file: output dataset XML path
    :param datastore_file: output datastore JSON path
    :param consolidate: if True, merge external resources into new BAM(s)
    :param n_files: maximum number of BAM files after consolidation
    :param consolidate_f: hook returning the bound consolidate method to
        call (overridable for testing/alternate dataset types)
    :return: 0 (shell-style success code)
    """
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file, numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore,
                    # since consolidated dataset may contain multiple indices
                    # pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES and
                                index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        # New UUID so the written dataset is distinct from its input.
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def test_datastore_paths(self):
    """Paths in the written datastore JSON should be relative, and should
    resolve back to absolute paths when the JSON is loaded."""
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()),
                        "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile),
                        False,
                        "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    # BUG FIX: dict.values() returns a non-indexable view on Python 3, so
    # the original `ds.files.values()[0]` raised TypeError; materialize
    # the view before indexing.
    self.assertEqual(list(ds.files.values())[0].path, tmpfile)
def run_dev_txt_to_datastore(rtc):
    """
    Dev/demo task: sleep for a random, option-scaled interval, then write N
    copies of the input SubreadSet (each with a fresh UUID and, when
    possible, a parent-provenance record) and a datastore JSON listing them.

    :param rtc: resolved tool contract providing input/output files and
        the sleep_multiplier / num_subreadsets options
    :return: 0 (shell-style success code)
    """
    p = os.path.dirname(rtc.task.output_files[0])
    sleep_multiplier = rtc.task.options[
        'pbsmrtpipe.task_options.sleep_multiplier']
    t_sleep = sleep_multiplier * random.random()
    log.info("Sleeping for %.1f seconds", t_sleep)
    time.sleep(t_sleep)
    from pbcore.io import SubreadSet
    num_subreadsets = rtc.task.options[
        'pbsmrtpipe.task_options.num_subreadsets']
    sset = SubreadSet(rtc.task.input_files[0])
    add_parent = True
    if len(sset.metadata.provenance) > 0:
        # FIX: log.warn is a deprecated alias; use log.warning
        log.warning("Not adding provenance since input already has a parent")
        add_parent = False

    def to_f(x):
        # Build the x-th renamed copy of the input SubreadSet plus the
        # DataStoreFile record describing it.
        source_id = "out-1"
        sset_out = sset.copy()
        sset_out.newUuid(random=True)
        if add_parent:
            sset_out.metadata.addParentDataSet(sset.uuid,
                                               sset.datasetType,
                                               createdBy="AnalysisJob",
                                               timeStampedName="")
        file_name = "file-{x:03d}.subreadset.xml".format(x=x)
        out_path = os.path.join(p, file_name)
        sset_out.write(out_path)
        sset_uuid = sset_out.uniqueId
        name = "subreadset-{}".format(x)
        dsf = DataStoreFile(sset_uuid,
                            source_id,
                            FileTypes.DS_SUBREADS.file_type_id,
                            file_name,
                            name=name,
                            description="{} Example Description".format(name))
        return dsf

    # BUG FIX: xrange does not exist on Python 3; range is equivalent here.
    files = [to_f(i + 1) for i in range(num_subreadsets)]
    ds = DataStore(files)
    ds.write_json(rtc.task.output_files[0])
    return 0
def _run_auto_ccs_outputs_barcoded(datastore_in, datastore_out,
                                   nproc=Constants.MAX_NPROC):
    """Export BAM/FASTX files for every barcoded CCS dataset listed in the
    input datastore, bundle the per-barcode FASTX outputs into combined
    tarballs, and write the resulting datastore JSON."""
    base_dir = op.dirname(datastore_out)
    ccs_files = []
    for ds_file in DataStore.load_from_json(datastore_in).files.values():
        # FIXME use a better file_id
        if (ds_file.file_type_id == FileTypes.DS_CCS.file_type_id and
                ds_file.file_id == "barcoding.tasks.lima-0"):
            ccs_files.append(ds_file.path)
            log.info("Exporting %s", ds_file.path)
    log.info("Exporting %d CCS datasets", len(ccs_files))
    export_args = [(f, base_dir) for f in ccs_files]
    output_files = list(itertools.chain.from_iterable(
        pool_map(__run_ccs_bam_fastq_exports, export_args, nproc)))
    output_files.extend([
        _create_zipped_fastq(output_files, "all_barcodes.fastq.tar.gz"),
        _create_zipped_fasta(output_files, "all_barcodes.fasta.tar.gz")
    ])
    DataStore(output_files).write_json(datastore_out)
    return 0
def run_args(args):
    """Dispatch on args.mode to export FASTA/FASTQ files or consolidate the
    BAM for a ConsensusReadSet, writing the collected outputs to a
    datastore JSON."""
    datastore_out = op.abspath(args.datastore_out)
    base_dir = op.dirname(datastore_out)
    datastore_files = []
    with ConsensusReadSet(args.dataset_file, strict=True) as ds:
        bam_file_name, file_prefix = get_prefix_and_bam_file_name(
            ds, is_barcoded=False)
        if args.mode in ("fasta", "fastq"):
            # Both FASTX modes share the same export call; only the file
            # type and source IDs differ.
            if args.mode == "fasta":
                fastx_type, fastx_ids = FileTypes.FASTA, Constants.FASTA_FILE_IDS
            else:
                fastx_type, fastx_ids = FileTypes.FASTQ, Constants.FASTQ_FILE_IDS
            datastore_files.extend(to_fastx_files(
                fastx_type, ds, args.dataset_file, fastx_ids, base_dir,
                file_prefix, args.min_rq, no_zip=args.no_zip))
        elif args.mode == "consolidate":
            # Only consolidate when no BAM file name was resolved.
            if bam_file_name is None:
                datastore_files.append(
                    consolidate_bam(base_dir, file_prefix, ds,
                                    min_rq=args.min_rq))
    DataStore(datastore_files).write_json(datastore_out)
    return 0
def run_args(args):
    """Collect the optional output files named in FILE_IDS_AND_NAMES into a
    datastore JSON, appending the sample name to each label when one can be
    determined."""
    sample_name = None
    if args.all_samples:
        sample_name = "All Samples"
    elif not args.single_sample:
        # Neither flag set: read the sample name from the BAM read group.
        bam = openDataFile(args.samples_file)
        sample_name = bam.readGroupTable[0].SampleName
        log.info("Sample name is {}".format(sample_name))
    ds_files = []
    for file_id, file_type, label in FILE_IDS_AND_NAMES:
        path = getattr(args, file_id)
        if path is None:
            log.info("Skipping {}".format(file_id))
            continue
        assert path is not None and op.exists(path)
        display_label = label
        if sample_name:
            display_label = label + " ({})".format(sample_name)
        ds_files.append(
            to_datastore_file(path, file_id, file_type, display_label))
    DataStore(ds_files).write_json(args.datastore)
    return 0
def update_barcoded_sample_metadata(
        base_dir,
        datastore_file,
        input_reads,
        barcode_set,
        isoseq_mode=False,
        use_barcode_uuids=True,
        nproc=1,
        min_score_filter=Constants.BARCODE_QUALITY_GREATER_THAN):
    """
    Given a datastore JSON of SubreadSets produced by barcoding, apply the
    following updates to each:
    1. Include only the BioSample(s) corresponding to its barcode
    2. Add the BioSample name to the dataset name
    3. Add a ParentDataSet record in the Provenance section.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file)
    # Update each barcoded dataset in a separate worker process.
    pool = multiprocessing.Pool(nproc)
    _results = []
    for ds_file in update_files:
        _results.append(
            pool.apply_async(_update_barcoded_sample_metadata,
                             (base_dir, ds_file, barcode_names, parent_info,
                              isoseq_mode, use_barcode_uuids, bio_samples_d,
                              barcode_uuids_d, min_score_filter)))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    # copy over the un-barcoded reads BAM
    dstore = DataStore.load_from_json(datastore_file)
    files = dstore.files.values()
    for f in files:
        if f.file_id != "barcoding.tasks.lima-0":
            datastore_files.append(f)
    return DataStore(datastore_files)
def test_failure_no_inputs(self):
    """run_to_report must raise ValueError when given an empty datastore."""
    empty_store = DataStore([])
    store_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    empty_store.write_json(store_path)
    with self.assertRaises(ValueError):
        run_to_report(store_path, self.barcodes, self.subreads)
def gather_datastore(input_files, output_file, skip_empty=True):
    """
    Combine the files from multiple datastore JSONs into a single datastore
    JSON.

    :param input_files: paths of datastore JSON files to merge
    :param output_file: path of the combined datastore JSON to write
    :param skip_empty: NOTE(review): currently unused — kept for interface
        compatibility; either honor it or drop it from callers
    """
    ds_out = DataStore([])
    for file_name in input_files:
        # FIX: iterate values directly — the UUID keys were unused, and the
        # original loop variable shadowed the stdlib `uuid` module.
        for ds_file in DataStore.load_from_json(file_name).files.values():
            ds_out.add(ds_file)
    ds_out.write_json(output_file)
def _to_datastore(dx):
    """Wrap the raw service entries in *dx* as a DataStore."""
    # Friction to get around service endpoint not returning a list of files
    return DataStore([_to_ds_file(entry) for entry in dx])
def write_and_initialize_data_store_json(file_name, ds_files):
    """Build a DataStore from *ds_files*, persist it to *file_name*, and
    return the in-memory instance."""
    datastore = DataStore(ds_files)
    datastore.write_json(file_name)
    return datastore