Example #1
 def test_factory_function(self):
     bam = data.getBam()
     aln = data.getXml(8)
     ref = data.getXml(9)
     sub = data.getXml(10)
     inTypes = [bam, aln, ref, sub]
     expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
     for infn, exp in zip(inTypes, expTypes):
         # TODO enable this for all when simulated subread files can be
         # pbi'd
         if exp in [DataSet, ReferenceSet, AlignmentSet]:
             ds = openDataSet(infn, strict=True)
         else:
             ds = openDataSet(infn)
         self.assertEqual(type(ds), exp)
Example #2
 def test_datset_name(self):
     ssfn = data.getXml(7)
     ofn = tempfile.NamedTemporaryFile(suffix=".xml").name
     run_filter_dataset(ssfn, ofn, "0", "None")
     ds = openDataSet(ofn)
     assert ds.name.endswith("(filtered)")
     assert "filtered" in ds.tags
Example #3
 def __init__(self, alignment_file):
     self.alignment_file = alignment_file
     self.dataset_uuids = []
     if alignment_file.endswith('.xml'):
         log.debug('Importing alignments from dataset XML')
         alignment_set = openDataSet(alignment_file)
         if not isinstance(alignment_set,
                           (AlignmentSet, ConsensusAlignmentSet)):
             raise TypeError("Dataset type %s not allowed here" %
                             type(alignment_set).__name__)
         self.alignment_file_list = alignment_set.toExternalFiles()
         self.dataset_uuids.append(alignment_set.uuid)
         movies = []
         for x in self.alignment_file_list:
             if not os.path.exists(x):
                 raise IOError(
                     "Unable to find DataSet external resource {x}".format(x=x))
             movies.extend(_movienames_from_bam(x))
         self.movies = sorted(list(set(movies)))
     elif _is_sam_or_bam_file(alignment_file):
         self.alignment_file_list = [alignment_file]
         self.movies = _movienames_from_bam(alignment_file)
     else:
         raise ValueError("Unsupported alignment file type '{x}'".format(
             x=alignment_file))
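The helper _movienames_from_bam is referenced but not shown in this example. A minimal sketch of what such a helper might look like, assuming the movie names can be taken from the BAM read groups (the MovieName field matches the readGroupInfo(...).MovieName usage seen in Example #11 below); this is an illustration, not the project's actual implementation:

from pbcore.io import BamReader

def _movienames_from_bam(bam_file):
    # Hypothetical sketch: collect the movie names recorded in the BAM's read groups.
    with BamReader(bam_file) as reader:
        return sorted({rg.MovieName for rg in reader.readGroupTable})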
Example #4
File: cli.py Project: lpp1985/lpp_Script
def import_local_dataset(sal, path):
    """:type sal: ServiceAccessLayer"""
    # XXX basic validation of external resources
    try:
        from pbcore.io import openDataSet, ReadSet, HdfSubreadSet
    except ImportError:
        log.warn("Can't import pbcore, skipping dataset sanity check")
    else:
        ds = openDataSet(path, strict=True)
        if isinstance(ds, ReadSet) and not isinstance(ds, HdfSubreadSet):
            if len(ds) > 0:
                log.info("checking BAM file integrity")
                for rr in ds.resourceReaders():
                    try:
                        _ = rr[-1]
                    except Exception as e:
                        log.exception("Import failed because the underlying "+
                                      "data appear to be corrupted.  Run "+
                                      "'pbvalidate' on the dataset for more "+
                                      "thorough checking.")
                        return 1
            else:
                log.warn("Empty dataset - will import anyway")

    # this will raise if the import wasn't successful
    _ = sal.run_import_local_dataset(path)
    log.info("Successfully import dataset from {f}".format(f=path))
    return 0
Example #5
 def test_dataset_io(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     rc = bamSieve.filter_reads(input_bam=DS2,
                                output_bam=ofn,
                                whitelist="8")
     self.assertEqual(rc, 0)
     with openDataSet(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
     # make sure paths are absolute
     tmpdir = tempfile.mkdtemp()
     ofn2 = op.join(tmpdir, op.basename(ofn))
     shutil.copyfile(ofn, ofn2)
     with openDataSet(ofn2, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
Example #6
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file, producing
    one consolidated XML"""
    dset = openDataSet(args.infile)
    dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not
                     args.noTmp))
    dset.write(args.xmlfile)
Example #7
def loadStatsXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    dset.loadStats(args.statsfile)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
Example #8
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType, ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
Example #9
 def run_after(self, rtc, output_dir):
     n_actual = n_expected = 0
     with openDataSet(self.INPUT_FILES[0]) as ds:
         n_expected = len([ rec for rec in ds ])
     with self.READER_CLASS(rtc.task.output_files[0]) as f:
         n_actual = len([ rec for rec in f ])
     self.assertEqual(n_actual, n_expected)
Example #10
def summarizeXml(args):
    dset = openDataSet(args.infile, strict=args.strict)

    # check to see if there was an error updating the dataset length:
    numFlag = ""
    if dset.numRecords == 0:
        dset.updateCounts()
        if not dset._countsUpdated:
            numFlag = " Unable to update counts!"
    print("DataSet Type          : {f}".format(f=dset.datasetType))
    print("Name                  : {f}".format(f=dset.name))
    print("Id                    : {f}".format(f=dset.uuid))
    print("Number of records     : {r}{f}".format(r=dset.numRecords,
                                                  f=numFlag))
    print("Total number of bases : {r}{f}".format(r=dset.totalLength,
                                                  f=numFlag))
    print("# of Resources        : {r}".format(r=len(dset.toExternalFiles())))
    print("Filters               : {r}".format(
        r=str(dset.filters) if dset.filters else "None"))
    show_sample_names_if_defined(dset)
    if args.show_chemistry:
        print("Sequencing Chemistry  : {c}".format(
            c=", ".join(dset.sequencingChemistry)))
    for fname in dset.toExternalFiles():
        print(fname)
    return 0
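The attributes printed by summarizeXml can also be inspected directly; a minimal sketch, using a hypothetical dataset path:

from pbcore.io import openDataSet

dset = openDataSet("movie.subreadset.xml")  # hypothetical input
print("{t}: {n} records, {b} bases".format(
    t=dset.datasetType, n=dset.numRecords, b=dset.totalLength))
for fname in dset.toExternalFiles():
    print(fname)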
Example #11
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        for er in ds.externalResources:
            if er.barcodes != barcodes:
                raise ValueError(
                    "Mismatch between external resource "
                    + "barcodes and input BarcodeSet: "
                    + "{a} != {b}".format(a=er.barcodes, b=barcodes)
                )
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z, q) in enumerate(zip(rr.pbi.bcForward, rr.pbi.holeNumber, rr.pbi.qId)):
                movie = rr.readGroupInfo(q).MovieName
                zmws_by_barcode[b].add((movie, z))
                reads_by_zmw[(movie, z)].append((rr, i))
        with BarcodeSet(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for (movie, zmw) in zmws:
                    for rr, i_read in reads_by_zmw[(movie, zmw)]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        barcode_id = "{f}--{r}".format(f=rr.pbi.bcForward[i_read], r=rr.pbi.bcReverse[i_read])
                        yield barcode_id, barcode, ["n"] * qlen
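A short usage sketch for the iterator above, tallying reads and bases per barcode label; the input paths are hypothetical:

from collections import Counter

read_counts = Counter()
base_counts = Counter()
# _labels_reads_iterator yields (barcode_id, barcode, per-base label list) per read
for barcode_id, barcode, labels in _labels_reads_iterator(
        "movie.subreadset.xml", "barcodes.barcodeset.xml"):
    read_counts[barcode_id] += 1
    base_counts[barcode_id] += len(labels)
for barcode_id in sorted(read_counts):
    print(barcode_id, read_counts[barcode_id], base_counts[barcode_id])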
Example #12
File: cli.py Project: knyquist/pbcommand
def import_local_dataset(sal, path):
    """:type sal: ServiceAccessLayer"""
    # XXX basic validation of external resources
    try:
        from pbcore.io import openDataSet, ReadSet, HdfSubreadSet
    except ImportError:
        log.warn("Can't import pbcore, skipping dataset sanity check")
    else:
        ds = openDataSet(path, strict=True)
        if isinstance(ds, ReadSet) and not isinstance(ds, HdfSubreadSet):
            if len(ds) > 0:
                log.info("checking BAM file integrity")
                for rr in ds.resourceReaders():
                    try:
                        _ = rr[-1]
                    except Exception as e:
                        log.exception("Import failed because the underlying " +
                                      "data appear to be corrupted.  Run " +
                                      "'pbvalidate' on the dataset for more " +
                                      "thorough checking.")
                        return 1
            else:
                log.warn("Empty dataset - will import anyway")

    # this will raise if the import wasn't successful
    _ = sal.run_import_local_dataset(path)
    log.info("Successfully import dataset from {f}".format(f=path))
    return 0
Example #13
def _example_main( input_file, output_file, **kwargs ):
    """
    This func should be imported from your python package.

    This should have *no* dependency on the pbcommand IO, such as the RTC/TC models.
    """

    # This is just for test purposes
    log.info("Running example main with {i} {o} kw:{k}".format(i=input_file,
                                                               o=output_file,
                                                               k=kwargs))

    # Try to open SubreadSet with pbcore
    log.info( "Attempting to open SubreadSet input with pbcore.io.BamIO" )
    dset = openDataSet( input_file )
    nreads = len( dset )

    # write mock output files, otherwise the End-to-End test will fail when
    # run within testkit
    log.info( "Attempting to write simple information from the SubreadSet to CSV output" )
    with open( output_file, 'wb' ) as csvfile:
        writer = csv.writer( csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL )
        writer.writerow( ['foo', 'bar'] )
        writer.writerow( ['baz', 'waz'] )
        writer.writerow( ['nreads', str(nreads) ] )
        #f.write( "MOCK TEST DATA" )
    return 0
Example #14
 def __init__(self, alignment_file):
     self.alignment_file = alignment_file
     self.dataset_uuids = []
     if alignment_file.endswith('.xml'):
         log.debug('Importing alignments from dataset XML')
         alignment_set = openDataSet(alignment_file)
         if not isinstance(alignment_set,
                           (AlignmentSet, ConsensusAlignmentSet)):
             raise TypeError("Dataset type %s not allowed here" %
                             type(alignment_set).__name__)
         self.alignment_file_list = alignment_set.toExternalFiles()
         self.dataset_uuids.append(alignment_set.uuid)
         movies = []
         for x in self.alignment_file_list:
             if not os.path.exists(x):
                 raise IOError(
                     "Unable to find DataSet external resource {x}".format(x=x))
             movies.extend(_movienames_from_bam(x))
         self.movies = sorted(list(set(movies)))
     elif _is_sam_or_bam_file(alignment_file):
         self.alignment_file_list = [alignment_file]
         self.movies = _movienames_from_bam(alignment_file)
     else:
         raise ValueError("Unsupported alignment file type '{x}'".format(
             x=alignment_file))
Example #15
def _example_main(input_file, output_file, **kwargs):
    """
    This func should be imported from your python package.

    This should have *no* dependency on the pbcommand IO, such as the RTC/TC models.
    """

    # This is just for test purposes
    log.info("Running example main with {i} {o} kw:{k}".format(i=input_file,
                                                               o=output_file,
                                                               k=kwargs))

    # Open input CSV. Store absolute path of each alignment set.
    dset_paths = _get_dset_paths(input_file)

    dsets_kpis = {}
    for f in dset_paths:
        dset = openDataSet(f)
        subsampled_dset = _subsample_alignments(dset)
        dsets_kpis[f] = _getKPIs(dset, subsampled_dset)

    pickle.dump(dsets_kpis, open(output_file, 'wb'))

    # save a simple plot
    traces = []
    titles = []
    max_rl = 0
    for key in dsets_kpis.keys():
        rl = dsets_kpis[key]['readlength']
        acc = dsets_kpis[key]['accuracy']
        if max(rl) > max_rl:
            max_rl = max(rl)
        trace = Scatter(x=rl, y=acc, mode='markers')
        traces.append(trace)
        titles.append(str(key))
    rows = len(traces)
    fig = plotly.tools.make_subplots(rows=rows,
                                     cols=1,
                                     subplot_titles=tuple(titles))
    fig['layout']['font']['size'] = 8
    fig['layout'].update(showlegend=False)
    for row, trace in enumerate(traces):
        fig.append_trace(trace, row + 1,
                         1)  # convert from zero-based to one-based indexing
        fig['layout']['xaxis' + str(row + 1)]['tickfont'].update(size=20)
        fig['layout']['yaxis' + str(row + 1)]['tickfont'].update(size=20)
        fig['layout']['xaxis' + str(row + 1)].update(range=[0, max_rl])

    fig['layout']['yaxis' + str(rows / 2 + 1)].update(title='accuracy')
    fig['layout']['yaxis' + str(rows / 2 + 1)]['titlefont'].update(size=20)
    fig['layout']['xaxis' + str(rows)].update(title='readlength (bases)')
    fig['layout']['xaxis' + str(rows)]['titlefont'].update(size=20)

    plot(fig, filename='test-plot.html', show_link=False, auto_open=False)
    phantomjs_driver.set_window_size(1200, 800)
    phantomjs_driver.get('test-plot.html')
    phantomjs_driver.save_screenshot('test-plot.png')

    return 0
Example #16
def _example_main(input_file, output_file, **kwargs):
    """
    This func should be imported from your python package.

    This should have *no* dependency on the pbcommand IO, such as the RTC/TC models.
    """

    # This is just for test purposes
    log.info("Running example main with {i} {o} kw:{k}".format(i=input_file,
                                                               o=output_file,
                                                               k=kwargs))

    # Open dset CSV. Store absolute path of each alignment set.
    dset_paths = _get_dset_paths(input_file[0])

    # Open plots CSV. Store names of plots to produce.
    plots_to_generate = _get_plots_to_generate(input_file[1])

    dsets_kpis = {}
    for f in dset_paths:
        dset = openDataSet(dset_paths[f]['aset'])
        subsampled_dset = _subsample_alignments(dset)
        dsets_kpis[f] = _getKPIs(dset, subsampled_dset)

    figures = []
    # figure tuple has form (plot_group_id, plot_id, figure)
    if 'accuracy_vs_readlength' in plots_to_generate:
        figures.append(('accuracy', 'accuracy_vs_readlength', accuracy_plots._plot_accuracy_vs_readlength(dsets_kpis)))
    if 'accuracy' in plots_to_generate:
        figures.append(('accuracy', 'accuracy', accuracy_plots._plot_accuracy_distribution(dsets_kpis)))
    if 'accuracy_boxplot' in plots_to_generate:
        figures.append(('accuracy', 'accuracy_boxplot', accuracy_plots._plot_accuracy_boxplots(dsets_kpis)))

    all_plots = {} # dictionary of plots. keys are groups
    for plot_group, plot_id, fig in figures:
        if plot_group not in all_plots.keys():
            all_plots[plot_group] = []
        plot(fig, filename='{i}.html'.format(i=plot_id), show_link=False, auto_open=False)
        phantomjs_driver.set_window_size(1920, 1080)
        phantomjs_driver.get('{i}.html'.format(i=plot_id))
        phantomjs_driver.save_screenshot('{i}.png'.format(i=plot_id))
        phantomjs_driver.get('{i}.html'.format(i=plot_id))
        phantomjs_driver.save_screenshot('{i}_thumb.png'.format(i=plot_id))
        os.remove('{i}.html'.format(i=plot_id))
        plot_path = '{i}.png'.format(i=plot_id)
        thumb_path = '{i}_thumb.png'.format(i=plot_id)
        all_plots[plot_group].append(Plot(plot_id, plot_path, thumbnail=thumb_path))

    plot_groups = []
    for plot_group_title in all_plots.keys():
        plot_group = PlotGroup( plot_group_title, plots=all_plots[plot_group_title])
        plot_groups.append(plot_group) 

    report = Report('mh_toy', tables=(), plotgroups=plot_groups, attributes=())
    report.write_json( output_file )

    phantomjs_driver.quit()

    return 0
Example #17
def copyToXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.outdir
    if os.path.isdir(args.outdir):
        outfn = _swapPath(args.outdir, args.infile)
    dss.copyTo(os.path.split(outfn)[0])
    dss.write(outfn, relPaths=args.relative)
    return 0
Example #18
 def _run_reheader_dataset_bams(self, ds_file):
     with openDataSet(ds_file) as ds:
         ds_out = reheader_dataset_bams(ds, os.getcwd(),
                                        self.BIOSAMPLE_NAME,
                                        self.LIBRARY_NAME)
         self._validate_dataset(ds_out)
         self._validate_records(ds, ds_out)
         return ds_out
Example #19
def loadMetadataXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    dset.loadMetadata(args.metadata)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
    return 0
Example #20
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file, producing
    one consolidated XML"""
    dset = openDataSet(args.infile)
    dset.consolidate(args.datafile,
                     numFiles=args.numFiles,
                     useTmp=(not args.noTmp))
    dset.write(args.xmlfile)
Example #21
 def _verify():
     with openDataSet(ofn, strict=False) as ds_out:
         ext_res = ds_out.externalResources[0]
         for bam_file in [ext_res.bam, ext_res.scraps]:
             with BamReader(bam_file) as bam:
                 zmws = set([rec.HoleNumber for rec in bam])
                 self.assertEqual(len(zmws), 1)
                 self.assertTrue(74056024 in zmws)
Example #22
 def test_dataset_io(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     rc = bamSieve.filter_reads(
         input_bam=DS2,
         output_bam=ofn,
         whitelist="8")
     self.assertEqual(rc, 0)
     with openDataSet(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
     # make sure paths are absolute
     tmpdir = tempfile.mkdtemp()
     ofn2 = op.join(tmpdir, op.basename(ofn))
     shutil.copyfile(ofn, ofn2)
     with openDataSet(ofn2, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
Example #23
def loadMetadataXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    dset.loadMetadata(args.metadata)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
    return 0
Example #24
 def _verify():
     with openDataSet(ofn, strict=False) as ds_out:
         ext_res = ds_out.externalResources[0]
         for bam_file in [ext_res.bam, ext_res.scraps]:
             with BamReader(bam_file) as bam:
                 zmws = set([rec.HoleNumber for rec in bam])
                 self.assertEqual(len(zmws), 1)
                 self.assertTrue(74056024 in zmws)
Example #25
def WriteWhitelistedDataSets(dsFile, mappings):
    prefix = GetPrefix(dsFile)
    for seqId, subreads in mappings.iteritems():
        sset = openDataSet(dsFile)
        sset.filters.addRequirement(qname=[('=', subreadId)
                                           for subreadId in subreads])
        #sset.filters.addRequirement(qname=[('=', sorted(subreads))])
        sset.write(prefix + "." + seqId + ".subreadset.xml")
Example #26
def copyToXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.outdir
    if os.path.isdir(args.outdir):
        outfn = _swapPath(args.outdir, args.infile)
    dss.copyTo(os.path.split(outfn)[0])
    dss.write(outfn, relPaths=args.relative)
    return 0
Example #27
def summarize_coverage(aln_set, aln_summ_gff, ref_set=None,
                       num_regions=Constants.NUM_REGIONS,
                       region_size=Constants.REGION_SIZE,
                       force_num_regions=Constants.FORCE_NUM_REGIONS,
                       max_region_size=Constants.MAX_REGION_SIZE):
    """
    Main point of entry
    """

    if ref_set:
        untruncator = get_name_untruncator(ref_set)
    else:
        # this dict is always used with get(x, x), so when it's empty it will
        # just preserve the original name
        untruncator = {}

    #readers = enumerate_readers(args.alignment_file)
    readers = openDataSet(aln_set).resourceReaders()
    gff_writer = GffIO.GffWriter(aln_summ_gff)

    # First write the metadata. Names of references, command line used, things
    # like that
    metadata_lines = get_metadata_lines(readers, untruncator)
    for metadata_line in metadata_lines:
        gff_writer.writeHeader(metadata_line)
    log.debug("Wrote {n} header lines to {f}"
              .format(n=len(metadata_lines), f=aln_summ_gff))

    # Build lists of intervals for each reference
    interval_lists = build_interval_lists(readers)
    log.debug("Finished creating interval lists for {n} references"
              .format(n=len(interval_lists)))

    # Create a function that gets region size from the reference length by
    # freezing the constant parameters
    get_region_size_frozen = functools.partial(
        get_region_size, num_refs=len(interval_lists),
        region_size=region_size, num_regions=num_regions,
        force_num_regions=force_num_regions,
        max_region_size=max_region_size)

    # Create Gff records and write them
    for ref_group_id in sorted(interval_lists):
        log.debug("Generating coverage GFF records for refGroupID {r}"
                  .format(r=ref_group_id))

        gff_generator = generate_gff_records(
            interval_lists[ref_group_id], readers,
            ref_group_id, get_region_size_frozen,
            untruncator)

        try:

            for gff_record in gff_generator:
                gff_writer.writeRecord(gff_record)

        except ValueError as e:
            log.warn(e)
Example #28
 def run_and_validate(args, ds_sizes):
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     final_args = base_args + args + ["--outdir", outdir, tmp_ds]
     self._check_cmd(" ".join(final_args))
     dss = [
         openDataSet(op.join(outdir, fn))
         for fn in sorted(os.listdir(outdir))
     ]
     assert [len(ds) for ds in dss] == ds_sizes
Example #29
def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(f=__file__, v=__version__))
    dataset_uuids = [openDataSet(rtc.task.input_files[0]).uuid, BarcodeSet(rtc.task.input_files[1]).uuid]
    report = run_to_report(
        reads=rtc.task.input_files[0], barcodes=rtc.task.input_files[1], subreads=True, dataset_uuids=dataset_uuids
    )
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0
Example #30
    def test_filter_more(self):
        ssfn = data.getXml(7)
        ofn = tempfile.NamedTemporaryFile(suffix=".xml").name

        # zm=[3,4,5] condition
        run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5] AND length >= 1000")
        ds = openDataSet(ofn)
        assert str(ds.filters) == "( zm = [3,4,5] AND length >= 1000 )"

        # zm=[3,4,5] condition
        run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5]; length >= 1000")
        ds = openDataSet(ofn)
        assert str(ds.filters) == "( zm = [3,4,5] AND length >= 1000 )"

        # zm=[3,4,5] condition by itself
        run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5]")
        ds = openDataSet(ofn)
        assert str(ds.filters) == '( zm = [3,4,5] )'
Example #31
def run(in_file, out_file, filterstr):
    dataSet = openDataSet(in_file)
    filters = dict(parse_filter_list(filterstr.split(',')))
    log.info("Adding {} filters to {}: {}".format(len(filters), in_file,
                                                  out_file, repr(filters)))
    dataSet.filters.addFilter(**filters)
    log.info("Added. Writing new dataset {}".format(repr(out_file)))
    #dataSet.updateCounts() # just in case # no, not needed
    dataSet.write(out_file, validate=False)  # to avoid warnings
Example #32
def run_consolidate(dataset_file, output_file, consolidate, n_files):
    with openDataSet(dataset_file) as ds_in:
        # XXX shouldn't the file count check be done elsewhere?
        if consolidate and len(ds_in.toExternalFiles()) != 1:
            new_resource_file = op.splitext(output_file)[0] + ".bam" # .fasta?
            ds_in.consolidate(new_resource_file, numFiles=n_files)
        ds_in.newUuid()
        ds_in.write(output_file)
    return 0
Example #33
 def run_after(self, output_file, n_expected, expected_filter_str):
     n_actual = self._get_counts(output_file)
     assert self._get_filters(output_file) == expected_filter_str
     assert n_actual == n_expected
     ds = openDataSet(output_file)
     assert len(ds.metadata.provenance) == 0
     assert ds.name.endswith("(filtered)")
     assert "filtered" in ds.tags
     return ds
Example #34
def run(in_file, out_file, filterstr):
    dataSet = openDataSet(in_file)
    filters = dict(parse_filter_list(filterstr.split(',')))
    log.info("Adding {} filters to {}: {}".format(
        len(filters), in_file, out_file, repr(filters)))
    dataSet.filters.addFilter(**filters)
    log.info("Added. Writing new dataset {}".format(repr(out_file)))
    #dataSet.updateCounts() # just in case # no, not needed
    dataSet.write(out_file, validate=False) # to avoid warnings
Example #35
 def test_dataset_io(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     rc = bamSieve.filter_reads(
         input_bam=DS2,
         output_bam=ofn,
         whitelist="8")
     self.assertEqual(rc, 0)
     with openDataSet(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
Example #36
def mergeXml(args):
    dss = [openDataSet(infn, strict=args.strict) for infn in args.infiles]
    allds = reduce(lambda ds1, ds2: ds1 + ds2, dss)
    if not allds is None:
        allds.updateCounts()
        allds.write(args.outfile)
    else:
        raise InvalidDataSetIOError("Merge failed, likely due to "
                                    "conflicting Filters")
    return 0
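mergeXml builds on DataSet addition; the same merge can be done directly with hypothetical input paths. Per the error handling above, the sum is None when the inputs have conflicting filters:

from pbcore.io import openDataSet

ds1 = openDataSet("movie1.subreadset.xml")  # hypothetical inputs
ds2 = openDataSet("movie2.subreadset.xml")
merged = ds1 + ds2  # the same operation mergeXml applies via reduce()
if merged is None:
    raise ValueError("Merge failed, likely due to conflicting Filters")
merged.updateCounts()
merged.write("merged.subreadset.xml")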
Example #37
def filterXml(args):
    if args.infile.endswith('xml'):
        dataSet = openDataSet(args.infile, strict=args.strict)
        filters = parse_filter_list(args.filters)
        dataSet.filters.addRequirement(**filters)
        dataSet.updateCounts()
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
Example #38
def loadStatsXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    if len(dset.externalResources) > 1:
        log.info("More than one ExternalResource found, adding the "
                 "sts.xml nested external resource to the first one")
    dset.externalResources[0].sts = args.statsfile
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
Example #39
def absolutizeXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.infile
    if args.outdir:
        if os.path.isdir(args.outdir):
            outfn = _swapPath(args.outdir, args.infile)
        else:
            outfn = args.outdir
    dss.write(outfn, relPaths=False)
    return 0
Example #40
def absolutizeXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.infile
    if args.outdir:
        if os.path.isdir(args.outdir):
            outfn = _swapPath(args.outdir, args.infile)
        else:
            outfn = args.outdir
    dss.write(outfn, relPaths=False)
    return 0
Example #41
def mergeXml(args):
    dss = [openDataSet(infn, strict=args.strict) for infn in args.infiles]
    allds = reduce(lambda ds1, ds2: ds1 + ds2, dss)
    if not allds is None:
        allds.updateCounts()
        allds.write(args.outfile)
    else:
        raise InvalidDataSetIOError("Merge failed, likely due to "
                                    "conflicting Filters")
    return 0
Example #42
def ReadAdaptersFromScraps(bam, windows):
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files to read
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except:
                continue
            handles.append(handle)

    adps = defaultdict(list)
    for handle in handles:
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            # Skip records without alignments that passed QC
            try:
                qS, qE, _, _, _, _, _ = windows[hn]
            except:
                continue
            # Skip records for ZMWs other than the one selected for its alignment
            if record.qStart not in [qS, qE] and record.qEnd not in [qS, qE]:
                continue
            # If we made it this far, record the position and type of adapter
            seq = record.peer.seq
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac < MIN_T:
                adps[hn].append((record.qStart, "TC6"))
            else:
                adps[hn].append((record.qStart, "POLYA"))

    # Convert our counts into a T/F depending on whether there are polyAs
    results = {}
    for hn, adpData in adps.iteritems():
        if len(adpData) != 2:
            print "ERROR! ERROR! {0} adps for hn #{1}".format(
                len(adpData), hn)
        # Using the strand, sort the adps left-to-right (by alignment)
        _, _, _, _, _, strand, _ = windows[hn]
        if strand == 0:
            adpData = sorted(adpData)
        else:
            adpData = sorted(adpData, reverse=True)
        # Now ordered we can record both ADP types and locations
        leftTc6 = "T" if adpData[0][1] == "TC6" else "F"
        rightTc6 = "T" if adpData[1][1] == "TC6" else "F"
        leftPolyA = "T" if adpData[0][1] == "POLYA" else "F"
        rightPolyA = "T" if adpData[1][1] == "POLYA" else "F"
        results[hn] = (leftTc6, rightTc6, leftPolyA, rightPolyA)
    return results
Example #43
def summarizeXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    print "DataSet Type          : {f}".format(f=dset.datasetType)
    print "Name                  : {f}".format(f=dset.name)
    print "Id                    : {f}".format(f=dset.uuid)
    print "Number of records     : {r}".format(r=dset.numRecords)
    print "Total number of bases : {r}".format(r=dset.totalLength)
    print "# of Resources        : {r}".format(r=len(dset.toExternalFiles()))
    for fname in dset.toExternalFiles():
        print fname
    return 0
Example #44
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name, tmp_dir=None):
    assert isinstance(program_name, basestring)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name) 
        if not barcode_mode:
            tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            assert os.path.isfile(tmp_out), tmp_out
            if output_file_name.endswith(".gz"):
                log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
                shutil.copyfile(tmp_out, output_file_name)
            else:
                _unzip_fastx(tmp_out, output_file_name)
            os.remove(tmp_out)
        else:
            suffix = "{f}.gz".format(f=base_ext)
            tmp_out_dir = op.dirname(tmp_out_prefix)
            tc_out_dir = op.dirname(output_file_name)
            barcoded_file_names = []
            # find the barcoded FASTX files and unzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in os.listdir(tmp_out_dir):
                fn = op.join(tmp_out_dir, fn)
                if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    suffix2 = ".{f}_{r}.{t}".format(
                        f=bc_fwd_rev[0], r=bc_fwd_rev[1], t=base_ext)
                    assert fn == tmp_out_prefix + suffix2 + ".gz"
                    fn_out = re.sub(".gz$", suffix2, output_file_name)
                    fastx_out = op.join(tc_out_dir, fn_out)
                    _unzip_fastx(fn, fastx_out)
                    barcoded_file_names.append(fn_out)
                    os.remove(fn)
            assert len(barcoded_file_names) > 0
            return archive_files(barcoded_file_names, output_file_name)
    return 0
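The _unzip_fastx helper is not shown in this example. A plausible sketch, assuming it simply stream-decompresses a gzipped FASTA/FASTQ file to the requested path (a guess for illustration, not the project's actual implementation):

import gzip
import shutil

def _unzip_fastx(gzip_file_name, fastx_file_name):
    # Hypothetical helper: decompress a .gz FASTX file to its final destination.
    with gzip.open(gzip_file_name, "rb") as f_in:
        with open(fastx_file_name, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)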
Example #45
 def run_after(self, rtc, output_dir):
     with openDataSet(rtc.task.output_files[0]) as f:
         f.assertIndexed()
         self.assertEqual(len(f.toExternalFiles()), 1)
         # test for bug 33778
         qnames = set()
         for rec in f:
             qnames.add(rec.qName)
         self.assertEqual(len(qnames), len(f))
     ds = DataStore.load_from_json(rtc.task.output_files[1])
     self.assertEqual(len(ds.files), 2)
Example #46
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore, since consolidated
                    # dataset may contain multiple indices pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
Example #47
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name, tmp_dir=None):
    assert isinstance(program_name, basestring)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name) 
        if not barcode_mode:
            tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            assert os.path.isfile(tmp_out), tmp_out
            if output_file_name.endswith(".gz"):
                log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
                shutil.copyfile(tmp_out, output_file_name)
            else:
                _unzip_fastx(tmp_out, output_file_name)
            os.remove(tmp_out)
        else:
            suffix = "{f}.gz".format(f=base_ext)
            tmp_out_dir = op.dirname(tmp_out_prefix)
            tc_out_dir = op.dirname(output_file_name)
            barcoded_file_names = []
            # find the barcoded FASTX files and unzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in os.listdir(tmp_out_dir):
                fn = op.join(tmp_out_dir, fn)
                if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    suffix2 = ".{f}_{r}.{t}".format(
                        f=bc_fwd_rev[0], r=bc_fwd_rev[1], t=base_ext)
                    assert fn == tmp_out_prefix + suffix2 + ".gz"
                    fn_out = re.sub(".gz$", suffix2, output_file_name)
                    fastx_out = op.join(tc_out_dir, fn_out)
                    _unzip_fastx(fn, fastx_out)
                    barcoded_file_names.append(fn_out)
                    os.remove(fn)
            assert len(barcoded_file_names) > 0
            return archive_files(barcoded_file_names, output_file_name)
    return 0
Example #48
def loadStatsXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    if len(dset.externalResources) > 1:
        log.info("More than one ExternalResource found, adding the "
                 "sts.xml nested external resource to the first one")
    dset.externalResources[0].sts = args.statsfile
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
    return 0
Example #49
def filterXml(args):
    if args.infile.endswith('xml'):
        dataSet = openDataSet(args.infile, strict=args.strict)
        filters = parse_filter_list(args.filters)
        dataSet.filters.addRequirement(**filters)
        dataSet.updateCounts()
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
    return 0
Example #50
def run_filter_dataset(in_file, out_file, read_length, other_filters,
                       downsample_factor=0,
                       min_rq=None):
    rlen = sanitize_read_length(read_length)
    filters = {}
    if other_filters and other_filters != "None":
        filters = parse_filter_list([str(other_filters)])
        log.info("{i} other filters will be added".format(i=len(filters)))
    tags = set()
    if rlen or min_rq is not None or len(filters) > 0 or not downsample_factor in [0, 1]:
        dataSet = openDataSet(in_file)
        orig_uuid = dataSet.uuid
        dataSet.updateCounts()  # just in case
        combine_filters(dataSet, filters)
        tags.update({t.strip() for t in dataSet.tags.strip().split(",")})
        if rlen:
            combine_filters(dataSet, {'length': [('>=', rlen)]})
        if min_rq is not None and min_rq > 0:
            combine_filters(dataSet, {'rq': [('>=', min_rq)]})
        if not downsample_factor in [0, 1]:
            combine_filters(dataSet, {'zm': [("==", "0", downsample_factor)]})
            tags.add("downsampled")
        dataSet.updateCounts()
        # XXX note we do *not* set a new UUID in case we want to keep a parent-
        # child relationship to the input dataset.  since the filtered dataset
        # will not be imported back into SMRT Link it is ok to keep the
        # original UUID
        dataSet.uuid = orig_uuid
    else:
        # if we're not actually changing anything, don't load indices
        dataSet = openDataSet(in_file, skipCounts=True)
    tags.add("filtered")
    dataSet.tags = ",".join(list(tags))
    if not "(filtered)" in dataSet.name:
        dataSet.name = dataSet.name + " (filtered)"
    if len(dataSet.metadata.provenance) > 0:
        log.warning("Removing existing provenance record: %s",
                    dataSet.metadata.provenance)
        dataSet.metadata.provenance = None
    dataSet.write(out_file)
    return 0
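A usage sketch for run_filter_dataset with hypothetical paths and filter values; per the logic above, a downsample_factor other than 0 or 1 adds a zm-based downsampling filter and the "downsampled" tag:

rc = run_filter_dataset(
    in_file="movie.subreadset.xml",            # hypothetical input
    out_file="movie.filtered.subreadset.xml",
    read_length="1000",                        # becomes a length >= 1000 requirement
    other_filters="rq >= 0.9",                 # hypothetical extra filter expression
    downsample_factor=10)
assert rc == 0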
Example #51
def _get_rtc_dataset_uuids(report_file, use_outputs=False):
    rtc_path = op.join(op.dirname(report_file), "resolved-tool-contract.json")
    rtc = load_resolved_tool_contract_from(rtc_path)
    all_files = rtc.task.input_files
    if use_outputs:
        all_files = rtc.task.output_files
    ds_uuids = set()
    for file_name in all_files:
        if file_name.endswith(".xml"):
            with openDataSet(file_name) as ds:
                ds_uuids.add(ds.uuid)
    return ds_uuids
Example #52
def _get_rtc_dataset_uuids(report_file, use_outputs=False):
    rtc_path = op.join(op.dirname(report_file), "resolved-tool-contract.json")
    rtc = load_resolved_tool_contract_from(rtc_path)
    all_files = rtc.task.input_files
    if use_outputs:
        all_files = rtc.task.output_files
    ds_uuids = set()
    for file_name in all_files:
        if file_name.endswith(".xml"):
            with openDataSet(file_name) as ds:
                ds_uuids.add(ds.uuid)
    return ds_uuids
Example #53
 def _check_outputs(self, dataset_file):
     assert op.isfile(self.output_bam)
     assert op.isfile(self.output_bam + ".bai")
     assert op.isfile(self.output_bam + ".pbi")
     with openDataSet(dataset_file) as f:
         f.assertIndexed()
         assert len(f.toExternalFiles()) == 1
         # test for bug 33778
         qnames = set()
         for rec in f:
             qnames.add(rec.qName)
         assert len(qnames) == len(f)
Example #54
 def test_split(self):
     ds1 = openDataSet(data.getXml(12))
     self.assertTrue(ds1.numExternalResources > 1)
     dss = ds1.split()
     self.assertTrue(len(dss) == ds1.numExternalResources)
     self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
     self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
     self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
     dss = ds1.split(chunks=1)
     self.assertTrue(len(dss) == 1)
     self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
     self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
     self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
     dss = ds1.split(chunks=2)
     self.assertTrue(len(dss) == 2)
     self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
     self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
     self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
     dss = ds1.split(chunks=2, ignoreSubDatasets=True)
     self.assertTrue(len(dss) == 2)
     self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
     self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
     self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
     self.assertFalse(dss[0].uuid == dss[1].uuid)
     self.assertTrue(dss[0].name == dss[1].name)
     # Let's try merging and splitting on subdatasets
     ds1 = openDataSet(data.getXml(8))
     self.assertEquals(ds1.totalLength, 123588)
     ds1tl = ds1.totalLength
     ds2 = openDataSet(data.getXml(11))
     self.assertEquals(ds2.totalLength, 117086)
     ds2tl = ds2.totalLength
     dss = ds1 + ds2
     self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
     ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
                       key=lambda x: x.totalLength,
                       reverse=True)
     self.assertTrue(ds1.totalLength == ds1tl)
     self.assertTrue(ds2.totalLength == ds2tl)
Example #55
 def __init__(self, file_name):
     self.file_name = file_name
     self._is_fasta = False
     self.ext = op.splitext(file_name)[1].upper()
     if self.ext in [".FA", ".FASTA"]:
         self._dataset = FastaReader(file_name)
         self._is_fasta = True
     elif self.ext == ".BAM":
         self._dataset = openDataFile(file_name)
     else: # either contigset.xml or consensusreadset.xml
         assert self.ext == ".XML"
         self._dataset = openDataSet(file_name)
         if isinstance(self._dataset, ContigSet):
             self._is_fasta = True
Example #56
def run_filter_dataset(in_file, out_file, read_length, other_filters):
    dataSet = openDataSet(in_file)
    if other_filters and other_filters != "None":
        filters = parse_filter_list(str(other_filters).split(','))
        dataSet.filters.addFilter(**filters)
        log.info("{i} other filters added".format(i=len(filters)))
    rlen = sanitize_read_length(read_length)
    if rlen:
        dataSet.filters.addRequirement(
            length=[('>=', rlen)])
    if rlen or other_filters:
        dataSet.updateCounts()
    dataSet.write(out_file)
    return 0
Example #57
def splitXml(args):
    log.debug("Starting split")
    dataSet = openDataSet(args.infile, strict=args.strict)
    chunks = len(args.outfiles)
    if args.chunks:
        chunks = args.chunks
    if isinstance(dataSet, ContigSet):
        dss = dataSet.split(chunks)
    else:
        dss = dataSet.split(chunks=chunks,
                            ignoreSubDatasets=(not args.subdatasets),
                            contigs=args.contigs,
                            maxChunks=args.maxChunks,
                            breakContigs=args.breakContigs,
                            targetSize=args.targetSize,
                            zmws=args.zmws,
                            barcodes=args.barcodes,
                            byRecords=(not args.byRefLength),
                            updateCounts=(not args.noCounts))
    log.debug("Splitting into {i} chunks".format(i=len(dss)))
    infix = 'chunk{i}'
    nSuf = -2 if re.search(r".+\.\w+set\.xml", args.infile) else -1
    if not args.outfiles:
        if not args.outdir:
            args.outfiles = ['.'.join(args.infile.split('.')[:nSuf] +
                                      [infix.format(i=chNum)] +
                                      args.infile.split('.')[nSuf:])
                             for chNum in range(len(dss))]
        else:
            args.outfiles = ['.'.join(args.infile.split('.')[:nSuf] +
                                      [infix.format(i=chNum)] +
                                      args.infile.split('.')[nSuf:])
                             for chNum in range(len(dss))]
            args.outfiles = [os.path.join(args.outdir,
                                          os.path.basename(outfn))
                             for outfn in args.outfiles]
            num = len(dss)
            end = ''
            if num > 5:
                num = 5
                end = '...'
            log.debug("Emitting {f} {e}".format(
                f=', '.join(args.outfiles[:num]),
                e=end))
    log.debug("Finished splitting, now writing")
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")
    return 0
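The default output names in splitXml insert a chunk infix ahead of the dataset suffix; a small standalone illustration of that string manipulation, using a hypothetical input name:

infile = "movie.subreadset.xml"  # hypothetical input
infix = 'chunk{i}'
nSuf = -2  # ".subreadset.xml" matches the r".+\.\w+set\.xml" pattern above
outfiles = ['.'.join(infile.split('.')[:nSuf] +
                     [infix.format(i=chNum)] +
                     infile.split('.')[nSuf:])
            for chNum in range(3)]
# -> ['movie.chunk0.subreadset.xml', 'movie.chunk1.subreadset.xml',
#     'movie.chunk2.subreadset.xml']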
Example #58
def filterXml(args):
    if args.infile.endswith("xml"):
        dataSet = openDataSet(args.infile, strict=args.strict)
        filters = defaultdict(list)
        separators = ["<=", ">=", "!=", "==", ">", "<", "="]
        for filt in args.filters:
            for sep in separators:
                if sep in filt:
                    param, condition = filt.split(sep)
                    condition = (sep, condition)
                    filters[param].append(condition)
                    break
        dataSet.filters.addRequirement(**filters)
        dataSet.updateCounts()
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
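For clarity, a standalone illustration of the separator-splitting loop above, with hypothetical filter strings:

from collections import defaultdict

filters = defaultdict(list)
separators = ["<=", ">=", "!=", "==", ">", "<", "="]
for filt in ["length>=1000", "rq>0.9"]:  # hypothetical filter strings
    for sep in separators:
        if sep in filt:
            param, condition = filt.split(sep)
            filters[param].append((sep, condition))
            break
# filters == {'length': [('>=', '1000')], 'rq': [('>', '0.9')]}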