Example #1
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- output chunk.json
      max_nchunks -- maximum number of chunks to create
    """
    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
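All of the scatter examples in this list follow the same round trip: build PipelineChunk objects keyed by '$chunk.' datum keys, serialize them with write_pipeline_chunks, and read them back with load_pipeline_chunks_from_json. The sketch below shows that round trip in isolation; it is only a sketch, and the import paths, the file name argument, and the '$chunk.fastq_id' key are assumptions for illustration rather than part of any specific task.

# Round-trip sketch (assumed import paths; the chunk key and file names are
# placeholders, not tied to a particular task).
from pbcommand.models import PipelineChunk
from pbcommand.pb_io.common import (load_pipeline_chunks_from_json,
                                    write_pipeline_chunks)


def roundtrip_chunks(chunk_json_path):
    # build two chunks, each carrying one datum under a '$chunk.' key
    chunks = [
        PipelineChunk("example_chunk_0", **{"$chunk.fastq_id": "part0.fastq"}),
        PipelineChunk("example_chunk_1", **{"$chunk.fastq_id": "part1.fastq"}),
    ]
    write_pipeline_chunks(chunks, chunk_json_path, "created by a round-trip sketch")

    # loading the file back yields PipelineChunk objects carrying the same data
    loaded = load_pipeline_chunks_from_json(chunk_json_path)
    return [c.chunk_d["$chunk.fastq_id"] for c in loaded]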
Example #2
def run_main(chunk_json, contigset_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Allow a bare chunk key without the '$chunk.' prefix
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning("Chunk key did not start with '$chunk.'; using '%s'", chunk_key)

    fasta_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked consensus isoforms files are %s.", (', '.join(fasta_files)))

    out_fa = CombinedFiles(combined_dir=op.dirname(contigset_output)).all_consensus_isoforms_fa
    log.info("Combining chunked files into %s.", out_fa)
    combine_consensus_isoforms(split_indices=range(len(fasta_files)),
                               split_files=fasta_files,
                               combined_consensus_isoforms_fa=out_fa)

    log.info("Writing contigset %s", contigset_output)
    assert contigset_output.endswith('xml')
    as_contigset(out_fa, contigset_output)

    #cs = ContigSet(*fasta_files)
    #cs.newUuid()
    #cs.write(contigset_output)
    return 0
Example #3
def _run_main(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = []
    for chunk in chunks:
        if chunk_key in chunk.chunk_keys:
            chunked_files.append(chunk.chunk_d[chunk_key])
        else:
            raise KeyError("Unable to find chunk key '{i}' in {p}".format(i=chunk_key, p=chunk))
    return gather_kinetics_h5_byref(chunked_files, output_file)
Example #4
 def run_after(self, rtc, output_dir):
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     for chunk in chunks:
         d = chunk.chunk_d
         # the cluster pickle (file index 2) is chunked, the rest not
         self.assertNotEqual(d["$chunk.pickle_id"], self.INPUT_FILES[2])
         self.assertEqual(d["$chunk.subreadset_id"], self.INPUT_FILES[0])
         self.assertEqual(d["$chunk.contigset_id"], self.INPUT_FILES[1])
         self.assertEqual(d["$chunk.nfl_pickle_id"], self.INPUT_FILES[3])
Example #5
def _run_main(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = []
    for chunk in chunks:
        if chunk_key in chunk.chunk_keys:
            chunked_files.append(chunk.chunk_d[chunk_key])
        else:
            raise KeyError("Unable to find chunk key '{i}' in {p}".format(
                i=chunk_key, p=chunk))
    return gather_kinetics_h5_byref(chunked_files, output_file)
Example #6
def __gather_runner(func, chunk_input_json, output_file, chunk_key, **kwargs):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)

    # Allow a bare chunk key without the '$chunk.' prefix
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning("Chunk key did not start with '$chunk.'; using '{c}'".format(c=chunk_key))

    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = func(chunked_files, output_file, **kwargs)
    return 0
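__gather_runner above factors the chunk-key handling out of the per-type gather functions; the gather examples earlier in this list call gather_kinetics_h5_byref directly with the same (chunked_files, output_file) signature that func is expected to have. A hypothetical wrapper (the name run_gather_kinetics_h5 is not from the source) might look like:

# Hypothetical entry point: delegates to __gather_runner with a concrete
# gather function taken from the earlier examples.
def run_gather_kinetics_h5(chunk_input_json, output_file, chunk_key):
    return __gather_runner(gather_kinetics_h5_byref, chunk_input_json,
                           output_file, chunk_key)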
Example #7
 def run_after(self, rtc, output_dir):
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     windows = []
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEYS[0]]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as ds:
             windows.append(ds.refWindows)
     self.assertEqual(windows, [[('lambda_NEB3011', 0, 24251)],
                                [('lambda_NEB3011', 24251, 48502)]])
Example #8
def __gather_runner(func, chunk_input_json, output_file, chunk_key, **kwargs):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)

    # Allow a bare chunk key without the '$chunk.' prefix
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning(
            "Chunk key did not start with '$chunk.'; using '{c}'".format(c=chunk_key))

    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = func(chunked_files, output_file, **kwargs)
    return 0
Example #9
 def run_after(self, rtc, output_dir):
     gathered_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(self.INPUT_FILES[0])
     n_rec = 0
     with self.READER_CLASS(gathered_file, **self.READER_KWARGS) as f:
         n_rec = len([r for r in f])
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEY]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
             n_rec_chunked += len([r for r in cs])
     self.assertEqual(n_rec_chunked, n_rec)
Example #10
 def run_after(self, rtc, output_dir):
     gathered_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(self.INPUT_FILES[0])
     n_rec = 0
     with self.READER_CLASS(gathered_file, **self.READER_KWARGS) as f:
         n_rec = len([r for r in f])
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEY]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
             n_rec_chunked += len([r for r in cs])
     self.assertEqual(n_rec_chunked, n_rec)
Example #11
 def run_after(self, rtc, output_dir):
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     windows = []
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEYS[0]]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as ds:
             windows.append(ds.refWindows)
     self.assertEqual(windows, [
         [('lambda_NEB3011', 0, 24251)],
         [('lambda_NEB3011', 24251, 48502)]
     ])
Example #12
 def run_after(self, rtc, output_dir):
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     n_rec = 0
     with ContigSet(self.INPUT_FILES[0]) as f:
         n_rec = len(f)
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         with ContigSet(d['$chunk.contigset_id']) as cs:
             n_rec_chunked += len([r for r in cs])
         self._check_unchunked_files(d)
     self.assertEqual(n_rec_chunked, n_rec)
Example #13
def chunk_contigset(in_file, n_chunks, out_dir, out_chunk_json):
    """
    Chunk the input contigset into n_chunks under out_dir, write chunk
    info to out_chunk_json, and return the chunked files.
    """
    log.info("Splitting %s into %s chunks", in_file, str(n_chunks))
    CU.write_contigset_chunks_to_file(out_chunk_json, in_file, n_chunks,
                                      out_dir, "scattered-nfl", "contigset.xml")

    out_chunks = load_pipeline_chunks_from_json(out_chunk_json)
    chunked_files = get_datum_from_chunks_by_chunk_key(out_chunks, '$chunk.contigset_id')
    log.info("Splitted files are %s\n", ("\n".join(chunked_files)))

    # Return chunked files from out_chunk_json
    return chunked_files
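As the docstring notes, chunk_contigset both writes the chunk JSON and returns the per-chunk contigset paths, so a caller can use the return value directly. A minimal, hypothetical caller is sketched below; the function name, paths, and the default chunk count are placeholders, not from the source.

# Hypothetical caller of chunk_contigset; names and n_chunks are placeholders.
def split_contigset_example(in_contigset, out_dir, out_chunk_json, n_chunks=8):
    chunked_files = chunk_contigset(in_contigset, n_chunks, out_dir, out_chunk_json)
    for i, path in enumerate(chunked_files):
        log.info("chunk %d: %s", i, path)
    return chunked_files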
Example #14
def run_main(chunk_json, sam_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Allow a bare chunk key without the '$chunk.' prefix
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning("Chunk key did not start with '$chunk.'; using '%s'", chunk_key)

    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", (', '.join(sam_files)))

    log.info("Concatenate chunked SAM files to %s.", sam_output)
    concatenate_sam(sam_files, sam_output)

    return 0
Example #15
 def run_after(self, rtc, output_dir):
     unchunked = self.INPUT_FILES[0]
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     if self.NCHUNKS_EXPECTED is not None:
         self.assertEqual(len(chunks), self.NCHUNKS_EXPECTED)
     n_rec = 0
     with self.READER_CLASS(unchunked, **self.READER_KWARGS) as f:
         n_rec = len([rec for rec in f])
         self.assertTrue(n_rec > 0)
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEYS[0]]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
             n_rec_chunk = len([rec for rec in cs])
             self.assertTrue(n_rec_chunk > 0)
             n_rec_chunked += n_rec_chunk
     self.assertEqual(n_rec_chunked, n_rec)
Example #16
 def run_after(self, rtc, output_dir):
     unchunked = self.INPUT_FILES[0]
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     if self.NCHUNKS_EXPECTED is not None:
         self.assertEqual(len(chunks), self.NCHUNKS_EXPECTED)
     n_rec = 0
     with self.READER_CLASS(unchunked, **self.READER_KWARGS) as f:
         n_rec = len([rec for rec in f])
         self.assertTrue(n_rec > 0)
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         chunked = d[self.CHUNK_KEYS[0]]
         with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
             n_rec_chunk = len([rec for rec in cs])
             self.assertTrue(n_rec_chunk > 0)
             n_rec_chunked += n_rec_chunk
     self.assertEqual(n_rec_chunked, n_rec)
Example #17
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- output chunk.json
      max_nchunks -- maximum number of chunks to create
    """
    # Check size of fastq_file before scattering, so that a meaningful
    # error message can be displayed instead of 'float division by zero'
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example #18
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- output chunk.json
      max_nchunks -- maximum number of chunks to create
    """
    # Check size of fastq_file before scattering, so that a meaningful
    # error message can be displayed instead of 'float division by zero'
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks, "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {Constants.CHUNK_KEYS[0]: fastq_file,
             Constants.CHUNK_KEYS[1]: gmap_ref_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example #19
def run_main(chunk_json, sam_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Allow a bare chunk key without the '$chunk.' prefix
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning("Chunk key did not start with '$chunk.'; using '%s'", chunk_key)

    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", (', '.join(sam_files)))

    log.info("Concatenate chunked SAM files to %s.", sam_output)

    # concatenate sam files
    unsorted_sam_output = sam_output + ".unsorted.sam"
    concatenate_sam(sam_files, unsorted_sam_output)

    # then sort
    sort_sam(unsorted_sam_output, sam_output)

    # remove intermediate file
    rmpath(unsorted_sam_output)
    return 0
Example #20
def _validate_chunk_json_file(path):
    # loading raises if the chunk JSON cannot be parsed
    load_pipeline_chunks_from_json(path)
    return path
Example #21
def run(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = combine_nfl_pickles(chunked_files, output_file)
    return 0
Example #22
def run(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = combine_nfl_pickles(chunked_files, output_file)
    return 0
Example #23
def _validate_chunk_json_file(path):
    # loading raises if the chunk JSON cannot be parsed
    load_pipeline_chunks_from_json(path)
    return path