Example #1
File: cli.py Project: rodipm/OS
    def add_command(self, run_time, number):
        run_time = int(run_time)
        number = int(number)
        finish_events = {
            "disco": DiskFinishedEvent,
            "leitora1": LeitoraUmFinishedEvent,
            "leitora2": LeitoraDoisFinishedEvent,
            "impressora1": ImpressoraUmFinishedEvent,
            "impressora2": ImpressoraDoisFinishedEvent
        }

        for _ in range(number):
            io = {
                "disco": None,
                "leitora1": None,
                "leitora2": None,
                "impressora1": None,
                "impressora2": None
            }

            last_start_cycles = [1]

            for dev in io.keys():
                io_requests = []
                has_device = random.random() < 0.9

                if not has_device:
                    continue

                number_requests = random.randint(1, 5)

                for i in range(number_requests):
                    io_cycles = random.randint(*io_config[dev])
                    start_cycle = 1

                    try:
                        start_cycle = random.randint(
                            last_start_cycles[-1],
                            i * run_time // number_requests - io_cycles)
                        if start_cycle in last_start_cycles:
                            continue

                        last_start_cycles.append(start_cycle)
                    except ValueError:
                        continue

                    io_requests.append((start_cycle, io_cycles))

                if len(io_requests):
                    io[dev] = Device(dev, io_requests, finish_events[dev])

            job_priority = random.choice(list(JobPriority))
            job_size = random.randint(10, 70)

            new_job = Job(self.job_ids, run_time, job_priority, io, job_size)

            self.job_ids += 1

            self.os.add_job(new_job)
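
The loop above draws each device's I/O burst length with random.randint(*io_config[dev]), so it depends on a module-level io_config mapping that is not shown. A minimal sketch of the shape that call assumes, with placeholder ranges rather than the project's real values:

io_config = {
    "disco": (50, 200),        # (min_cycles, max_cycles) per I/O request
    "leitora1": (10, 40),
    "leitora2": (10, 40),
    "impressora1": (20, 80),
    "impressora2": (20, 80),
}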
Example #2
 def run(self):
     for job_config in self.job_configs:
         assert job_config.submit_time >= self.env.now
         yield self.env.timeout(job_config.submit_time - self.env.now)
         job = Job(self.env, job_config)
         # print('a task arrived at time %f' % self.env.now)
         self.cluster.add_job(job)
     self.destroyed = True
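
This generator is written against simpy's cooperative event loop (env.now, env.timeout). A minimal sketch of how such a submitter might be driven, where the JobSubmitter name and constructor signature are assumptions (only run() is shown above):

import simpy

env = simpy.Environment()
submitter = JobSubmitter(env, job_configs, cluster)  # hypothetical constructor
env.process(submitter.run())  # register the generator as a simpy process
env.run()                     # advance simulated time until all jobs are submitted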
Example #3
    def run_arriba(self):
        """
        """

        jobs = []
        for sample in self.samples:
            if len(sample.readsets) > 1:
                raise Exception("Error: only one read set per sample allowed")
            if sample.readsets[0].bam:  # .bam input
                fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                         sample.name)
                bam = sample.readsets[0].bam
                left_fastq = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz",
                                            bam)))
                right_fastq = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz",
                                            bam)))
            elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                    ".")[-1] == "gz":
                left_fastq = sample.readsets[0].fastq1
                right_fastq = sample.readsets[0].fastq2
            else:
                raise Exception(
                    "Error: only .bam and .fastq.gz inputs allowed")
            output_dir = os.path.join("fusions", "arriba", sample.name)
            # JOBS
            chgdir_job = Job(command="cd " + output_dir)
            back_to_outdir_job = Job(command="cd " + self._output_dir)
            # CONCAT
            job = concat_jobs([
                Job(command="mkdir -p " + output_dir), chgdir_job,
                arriba.run(left_fastq,
                           right_fastq,
                           self._output_dir,
                           output_dir,
                           keep_bam=self.args.keep_bams), back_to_outdir_job
            ],
                              name="run_arriba." + sample.name)

            job.samples = [sample]
            jobs.append(job)

        return jobs
Example #4
    def gunzip_fastq(self):
        """
        Gunzip .fastq.gz files or symlink if already uncompressed
        """
        jobs = []
        for readset in self.readsets:
            out_dir = os.path.join("fusions", "gunzip_fastq",
                                   readset.sample.name)
            # Find input readset FASTQs first from previous trimmomatic job,
            # then from original FASTQs in the readset sheet
            if readset.run_type == "PAIRED_END":
                candidate_input_files = []
                if readset.fastq1 and readset.fastq2:
                    candidate_input_files.append(
                        [readset.fastq1, readset.fastq2])
                if readset.bam:
                    picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                              readset.sample.name)
                    candidate_input_files.append([
                        os.path.join(
                            picard_dir,
                            os.path.basename(
                                re.sub(r"\.bam$", ".pair1.fastq.gz",
                                       readset.bam))),
                        os.path.join(
                            picard_dir,
                            os.path.basename(
                                re.sub(r"\.bam$", ".pair2.fastq.gz",
                                       readset.bam)))
                    ])
                if readset.cram:
                    picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                              readset.sample.name)
                    candidate_input_files.append([
                        os.path.join(
                            picard_dir,
                            os.path.basename(readset.cram) +
                            ".pair1.fastq.gz"),
                        os.path.join(
                            picard_dir,
                            os.path.basename(readset.cram) + ".pair2.fastq.gz")
                    ])
                [fastq1,
                 fastq2] = self.select_input_files(candidate_input_files)
            else:
                raise Exception("Error: run type \"" + readset.run_type +
                                "\" is invalid for readset \"" + readset.name +
                                "\" (should be PAIRED_END)!")
            gunzip1_job = gunzip.gunzip_fastq(fastq1, out_dir)
            gunzip2_job = gunzip.gunzip_fastq(fastq2, out_dir)
            job = concat_jobs(
                [Job(command="mkdir -p " + out_dir), gunzip1_job, gunzip2_job],
                name="gunzip_fastq." + readset.sample.name + "." +
                readset.name)

            jobs.append(job)

        return jobs
Example #5
    def __init__(self, guid, remote_address, pubkey_xml):
        self.guid = guid
        self.address = remote_address
        self.data = None
        self.checkin_time = None
        self.crypto = ECDHE(pubkey_xml)
        self.jobs = Queue()

        self.add_job(Job(command=('checkin', '')))
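
add_job itself is not shown; since self.jobs is a Queue, it is presumably a thin wrapper along these lines (an assumption, not the project's confirmed code):

def add_job(self, job):
    # enqueue the job for the agent to pick up on its next checkin
    self.jobs.put(job)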
Example #6
    def chimerascan(self):
        """
        Run chimerascan to call gene fusions
        """
        jobs = []
        for sample in self.samples:
            fastq1, fastq2 = self.select_input_fastq(sample)
            out_dir = os.path.join("fusions", "chimerascan", sample.name)
            chimerascan_job = chimerascan.run(fastq1, fastq2, out_dir)
            job = concat_jobs([
                Job(command="mkdir -p " + out_dir),
                Job(command="rm -r " + out_dir), chimerascan_job
            ],
                              name="chimerascan." + sample.name)

            jobs.append(job)

        return jobs
Example #7
    def ericscript(self):
        """
        Run EricScript to call gene fusions
        """
        jobs = []
        for sample in self.samples:
            fastq1, fastq2 = self.select_input_fastq(sample)
            out_dir = os.path.join("fusions", "ericscript", sample.name)
            ericscript_job = ericscript.ericscript(
                fastq1, fastq2, out_dir, keep_bam=self.args.keep_bams)
            job = concat_jobs([
                Job(command="mkdir -p " + out_dir),
                Job(command="rm -r " + out_dir), ericscript_job
            ],
                              name="ericscript." + sample.name)

            jobs.append(job)

        return jobs
Example #8
def main(hub_id, dataset_id, version):
    conn = psql.connect('')
    queue = pq.PQ(conn=conn)['jobs']

    queue.put(
        Job(1, 'verify_partitions', {
            'hub_id': hub_id,
            'dataset_id': dataset_id,
            'version': version,
        }).__dict__)
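
The producer serializes the job with __dict__, and the matching consumer (Example #24 below) rebuilds it with Job(**job_entry.data) and reads job.backend_id. One Job shape consistent with both sides, offered as a sketch (the first field's name is a guess):

class Job:
    def __init__(self, priority, backend_id, payload):
        # attribute names must survive the __dict__ round-trip through the
        # queue, since they become the consumer's keyword arguments
        self.priority = priority
        self.backend_id = backend_id
        self.payload = payload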
Example #9
 def _build_jobs(self, response, data, epoch, now, source, ignoreempty = False, discardiffull = False):
   decoded = data.decode("utf_8")
   if len(decoded) == 0 and ignoreempty:
     self.core.log(self, "Got empty %s response\n" % source, 500)
     return
   decoded = json.loads(decoded)
   data = unhexlify(decoded["result"]["data"].encode("ascii"))
   target = unhexlify(decoded["result"]["target"].encode("ascii"))
   try:
     identifier = int(decoded["result"]["identifier"])
   except (KeyError, TypeError, ValueError):
     identifier = None
   if identifier != self.lastidentifier:
     self._cancel_jobs()
     self.lastidentifier = identifier
   self.blockchain.check_job(Job(self.core, self, 0, data, target, True, identifier))
   roll_ntime = 1
   expiry = 60
   isp2pool = False
   headers = response.getheaders()
   for h in headers:
     if h[0].lower() == "x-is-p2pool" and h[1].lower() == "true": isp2pool = True
     elif h[0].lower() == "x-roll-ntime" and h[1] and h[1].lower() != "n":
       roll_ntime = 60
       parts = h[1].split("=", 1)
       if parts[0].strip().lower() == "expire":
        try:
          roll_ntime = int(parts[1])
        except (IndexError, ValueError):
          pass
       expiry = roll_ntime
   if isp2pool: expiry = 60
   self.stats.supports_rollntime = roll_ntime > 1
   if epoch != self.jobepoch:
     self.core.log(self, "Discarding %d jobs from %s response because request was issued before flush\n" % (roll_ntime, source), 500)
     with self.stats.lock: self.stats.jobsreceived += roll_ntime
     return
   if self.core.workqueue.count > self.core.workqueue.target * (1 if discardiffull else 5):
     self.core.log(self, "Discarding %d jobs from %s response because work buffer is full\n" % (roll_ntime, source), 500)
     with self.stats.lock: self.stats.jobsreceived += roll_ntime
     return
   expiry += now - self.settings.expirymargin
   midstate = Job.calculate_midstate(data)
   prefix = data[:68]
   timebase = struct.unpack(">I", data[68:72])[0]
   suffix = data[72:]
   return [Job(self.core, self, expiry, prefix + struct.pack(">I", timebase + i) + suffix, target, midstate, identifier) for i in range(roll_ntime)]
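
The tail of this method implements "ntime rolling": the work data is split around the 32-bit big-endian timestamp at bytes 68..72, and one job is produced per permitted increment. The same transformation as a standalone sketch:

import struct

def roll_ntime(data, i):
    # bytes 68..72 of the work data hold the big-endian ntime field
    timebase = struct.unpack(">I", data[68:72])[0]
    return data[:68] + struct.pack(">I", timebase + i) + data[72:]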
Example #10
    def integrate(self):
        """
        Run Integrate to call gene fusions
        """
        jobs = []
        for sample in self.samples:
            input_dir = os.path.join("fusions", "tophat2", sample.name)
            accepted_bam = os.path.join(self.output_dir, input_dir,
                                        "accepted_hits.bam")
            unmapped_bam = os.path.join(self.output_dir, input_dir,
                                        "unmapped.bam")

            out_dir = os.path.join("fusions", "integrate", sample.name)
            integrate_job = integrate.integrate(accepted_bam, unmapped_bam,
                                                out_dir)
            job = concat_jobs([
                Job(command="mkdir -p " + out_dir),
                Job(command="cd " + out_dir), integrate_job,
                Job(command="cd -")
            ],
                              name="integrate." + sample.name)
            jobs.append(job)
        return jobs
Example #11
    def sleep(self, guid: str, interval: int):
        """
        Set the checkin interval for an agent

        Usage: sleep <guid> <interval> [-h]

        Arguments:
            guid  filter by session's guid
            interval  checkin interval in milliseconds
        """

        for session in self.sessions:
            if session == guid:
                session.add_job(Job(command=('sleep', int(interval))))
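
The loop compares a session object directly to a guid string (session == guid), which only matches if the session class overrides equality. A plausible sketch of that override, assuming the session stores its guid (the snippet does not show this):

class Session:
    def __eq__(self, other):
        # allow comparison against another session or a raw guid string
        if isinstance(other, Session):
            return self.guid == other.guid
        return self.guid == other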
Example #12
    def fusionmap(self):
        """
        Run FusionMap to call gene fusions
        """
        jobs = []
        for sample in self.samples:
            # add pipeline top output dir as input to bfx fusionmap script
            # self._output_dir assigned from command line args in pipeline.py
            top_dir = self._output_dir

            fastq1, fastq2 = self.select_input_fastq(sample)
            out_dir = os.path.join("fusions", "fusionmap", sample.name)
            fusionmap_job = fusionmap.fusionmap(fastq1, fastq2, out_dir,
                                                top_dir)
            job = concat_jobs([
                Job(command="mkdir -p " + out_dir), fusionmap_job,
                Job(command="ls " + out_dir + "/02_RNA*")
            ],
                              name="fusionmap." + sample.name)

            jobs.append(job)

        return jobs
Example #13
 def get_jobs(self):
     jobs = []
     for task in self.tasks:
         for job_number in range(0, task.get_number_of_jobs(self.H)):
             start = task.phase + task.period * job_number
             end = task.deadline + task.period * job_number + task.phase
             job = Job(task=task,
                       name=job_number + 1,
                       release=start,
                       deadline=end,
                       ex_time=task.ex_time,
                       status=1)
             jobs.append(job)
     return jobs
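
get_jobs expands each periodic task into its job instances over the hyperperiod self.H. A minimal Task satisfying the attributes used above, where get_number_of_jobs is assumed to mean the number of whole periods fitting in the hyperperiod:

class Task:
    def __init__(self, phase, period, deadline, ex_time):
        self.phase = phase        # release offset of the first job
        self.period = period      # time between consecutive releases
        self.deadline = deadline  # deadline relative to the period start
        self.ex_time = ex_time    # worst-case execution time

    def get_number_of_jobs(self, H):
        # assumption: one job per period within the hyperperiod H
        return H // self.period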
Example #14
    def run_star_seqr(self):
        """
        RNA Fusion Detection and Quantification using STAR
        https://github.com/ExpressionAnalysis/STAR-SEQR
        """

        jobs = []
        for sample in self.samples:
            if len(sample.readsets) > 1:
                raise Exception("Error: only one read set per sample allowed")
            if sample.readsets[0].bam:  # .bam input
                fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                         sample.name)
                bam = sample.readsets[0].bam
                # fastq1 = os.path.join(out_dir, os.path.basename(re.sub("\.bam$", ".pair1.fastq.gz", out_bam)))
                # fastq2 = os.path.join(out_dir, os.path.basename(re.sub("\.bam$", ".pair2.fastq.gz", out_bam)))
                left_fastq = os.path.join(
                    fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz",
                                            bam)))
                right_fastq = os.path.join(
                    fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz",
                                            bam)))
            elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                    ".")[-1] == "gz":
                # print(sample.readsets[0].fastq2)
                # print(sample.readsets[0].fastq2.split(".")[-1])
                left_fastq = sample.readsets[0].fastq1
                right_fastq = sample.readsets[0].fastq2
            else:
                raise Exception(
                    "Error: only .bam and .fastq.gz inputs allowed")
            output_dir = os.path.join("fusions", "star_seqr", sample.name)

            job = concat_jobs([
                Job(command="mkdir -p " + output_dir),
                star_seqr.run(left_fastq,
                              right_fastq,
                              output_dir,
                              sample.name,
                              keep_bam=self.args.keep_bams)
            ],
                              name="run_star_seqr." + sample.name)

            job.samples = [sample]
            jobs.append(job)

        return jobs
Example #15
    def delete_fastqs(self):
        """
        Delete fastqs when all callers' jobs are finished
        """
        jobs = []
        for sample in self.samples:
            defuse_result = os.path.join("fusions", "defuse", sample.name,
                                         "results.filtered.tsv")
            fusionmap_result = os.path.join("fusions", "fusionmap",
                                            sample.name,
                                            "02_RNA.FusionReport.txt")
            ericscript_result = os.path.join("fusions", "ericscript",
                                             sample.name,
                                             "fusion.results.filtered.tsv")
            integrate_result = os.path.join("fusions", "integrate",
                                            sample.name, "breakpoints.cov.tsv")
            star_seqr_result = os.path.join("fusions", "star_seqr",
                                            sample.name,
                                            "out_STAR-SEQR_candidates.txt")
            arriba_result = os.path.join("fusions", "arriba", sample.name,
                                         "fusions.tsv")
            star_fusion_result = os.path.join(
                "fusions", "star_fusion", sample.name,
                "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
            cicero_result = os.path.join("fusions", "cicero", sample.name,
                                         "final_fusions.txt")

            # result_file_list = [defuse_result, fusionmap_result, ericscript_result, integrate_result,
            #                     star_seqr_result, arriba_result, star_fusion_result]
            result_file_list = [defuse_result, fusionmap_result]
            del_job = delete_fastqs.delete_fastqs(sample.name,
                                                  result_file_list)
            job = concat_jobs([Job(command="mkdir -p delete_fastqs"), del_job],
                              name="delete_fastqs." + sample.name)
            # job = concat_jobs([
            #    Job(command="mkdir -p delete_fastqs")
            # ], name="delete_fastqs." + sample.name)
            job.input_files = [
                defuse_result, fusionmap_result, ericscript_result,
                integrate_result, star_seqr_result, arriba_result,
                star_fusion_result, cicero_result
            ]
            jobs.append(job)
            # DELETE BAMS JOB (one across all samples)
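        # NOTE: result_file_list below still refers to the last sample's list,
        # since the name is rebound on every iteration of the loop above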
        del_bams_job = concat_jobs(
            [delete_fastqs.delete_bams(result_file_list, self._output_dir)],
            name="delete_bams")
        jobs.append(del_bams_job)
        return jobs
Example #16
 def tophat2(self):
     """
     Run Tophat2 for Integrate. Determines accepted hits and unmapped reads, and outputs 
     corresponding .bam files required as input files for integrate step.
     """
     jobs = []
     for sample in self.samples:
         fastq1, fastq2 = self.select_input_fastq(sample)
         out_dir = os.path.join(self.output_dir, "fusions", "tophat2",
                                sample.name)
         tophat2_job = tophat2.tophat2(fastq1, fastq2, out_dir)
         job = concat_jobs(
             [Job(command="mkdir -p " + out_dir), tophat2_job],
             name="tophat2." + sample.name)
         jobs.append(job)
     return jobs
Example #17
    def MetaFusion_clinical(self):
        """
        Run MetaFusion.IsoHunter.clinical
        """
        jobs = []
        out_dir_abspath = self._output_dir
        metafusion_outdir = os.path.join("fusions", "metafusion_clinical")
        metafusion_job = metafusion_clinical.run_metafusion_clinical(
            out_dir_abspath, self.args.database)
        job = concat_jobs(
            [Job(command="mkdir -p " + metafusion_outdir), metafusion_job],
            name="MetaFusion.clinical")

        jobs.append(job)

        return jobs
Example #18
    def MetaFusion_IsoHunter(self):
        """
        Run MetaFusion.IsoHunter
        """
        jobs = []
        out_dir_abspath = self._output_dir
        isohunter_outdir = os.path.join("fusions", "metafusion_isohunter")
        metafusion_job = metafusion_isohunter.run_isohunter_singularity(
            out_dir_abspath)
        job = concat_jobs(
            [Job(command="mkdir -p " + isohunter_outdir), metafusion_job],
            name="MetaFusion.IsoHunter")

        jobs.append(job)

        return jobs
Example #19
    def run(self, guids: List[str]):
        """
        Run a module

        Usage:
            run <guids>...
            run -h | --help

        Arguments:
            guids    session guids to run modules on

        Options:
            -h, --help   Show this help message
        """
        job = Job(self.selected)
        for guid in guids:
            ipc_server.publish(NEW_JOB, (guid, job.encode()))
Example #20
def main():
    if len(sys.argv) < 3:
        print('usage: %s input_dir output_dir' % sys.argv[0])
        return

    conf = DefaultConfigure()
    job = Job(conf)
    job.set_splliter(LineSplitter)
    job.set_mapper(WordCountMapper)
    job.set_mapper_num(4)
    job.set_reducer(WordCountReducer)
    job.set_reducer_num(1)

    job.add_input_path(sys.argv[1])
    job.set_output_path(sys.argv[2])

    print(job.run())
Example #21
    def wrapper():

        if Agent.get('agent_status') == 'disabled':
            return jsonify({'status': 'disabled'})
        Agent.set('agent_status', 'busy')

        log.info(f'processing request: \n'
                 f'{[{k:v} for k,v in request.args.items()]}\n'
                 f'role: {api.__name__}')

        try:
            job = Job(request)
            job.set('role', api.__name__)
            log.info(f'job object created with id: {job.job_id}')
            return api(job)
        except Exception as e:
            log.info(f'error in job processing: {e}', report=True)
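
wrapper closes over a name api, so it is presumably the inner function of a decorator applied to endpoint handlers. A hedged sketch of the enclosing decorator (its name is an assumption; only the inner function appears above):

from functools import wraps

def agent_api(api):
    @wraps(api)
    def wrapper():
        ...  # body as shown in the example above
    return wrapper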
Example #22
    def MetaFusion(self):
        """
        Run MetaFusion
        """
        jobs = []
        cff_dir_abspath = os.path.join(self._output_dir, "fusions", "cff")
        out_dir_abspath = os.path.join(self._output_dir, "fusions",
                                       "metafusion")
        metafusion_job = metafusion.run_metafusion_singularity(out_dir_abspath)
        # metafusion_job.name = "MetaFusion"
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir_abspath), metafusion_job],
            name="MetaFusion")

        jobs.append(job)

        return jobs
Example #23
    def run(self, guids: List[str]):
        """
        Run a module

        Usage:
            run <guids>...
            run -h | --help

        Arguments:
            guids    session guids to run modules on

        Options:
            -h, --help   Show this help message
        """

        for guid in guids:
            self.prompt_session.contexts[1].add_job(
                (guid, Job(module=self.selected)))
Example #24
def main():
    logging.configure()

    conn = psql.connect('')
    queue = pq.PQ(conn=conn)['jobs']

    backends = load_backends(conn.cursor())

    for job_entry in queue:
        if job_entry is None:
            time.sleep(2)
            continue

        job = Job(**job_entry.data)
        backend = backends[job.backend_id]

        run_job(conn.cursor(), backend, job)
        conn.commit()
Example #25
    def defuse(self):
        """
        Run Defuse to call gene fusions
        """
        jobs = []
        for sample in self.samples:
            fastq1, fastq2 = self.select_input_fastq(sample)
            out_dir = os.path.join("fusions", "defuse", sample.name)
            defuse_job = defuse.defuse(fastq1,
                                       fastq2,
                                       out_dir,
                                       keep_bam=self.args.keep_bams)
            job = concat_jobs([Job(command="mkdir -p " + out_dir), defuse_job],
                              name="defuse." + sample.name)

            jobs.append(job)

        return jobs
Example #26
    def __init__(self, guid, remote_address, pubkey_xml):
        self.guid = guid
        self.address = remote_address
        self.data = None
        self.checkin_time = None
        self.crypto = ECDHE(pubkey_xml)
        self.jobs = Queue()

        self.logger = logging.getLogger(str(guid))
        self.logger.propagate = False
        self.logger.setLevel(logging.DEBUG)

        formatter = logging.Formatter('%(asctime)s - %(message)s')
        fh = logging.FileHandler(f"./logs/{guid}.log", encoding='UTF-8')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)

        self.logger.addHandler(fh)

        self.add_job(Job(command=('checkin', '')))
Example #27
    def star_fusion(self):
        """
        Run STAR-Fusion to call gene fusions
        """
        jobs = []
        CTAT_resource_lib = "/hpf/largeprojects/ccmbio/mapostolides/validate_fusion/test_star_star-fusion/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play/ctat_genome_lib_build_dir"
        for sample in self.samples:
            fastq1, fastq2 = self.select_input_fastq(sample)
            out_dir = os.path.join("fusions", "star_fusion", sample.name)
            # star_fusion_job = star_fusion.star_fusion(fastq1, fastq2, out_dir, CTAT_resource_lib)
            star_fusion_job = star_fusion.star_fusion(
                fastq1,
                fastq2,
                CTAT_resource_lib,
                out_dir,
                keep_bam=self.args.keep_bams)
            job = concat_jobs(
                [Job(command="mkdir -p " + out_dir), star_fusion_job],
                name="star_fusion." + sample.name)

            jobs.append(job)

        return jobs
Example #28
    def fusion_stats(self):
        """
        Outputs count files and plots about the detected gene fusions.
        """
        jobs = []
        cff_dir = os.path.join("fusions", "cff")
        out_dir = os.path.join("fusions", "fusion_stats")
        sampleinfo_file = os.path.relpath(self.args.sampleinfo.name,
                                          self.output_dir)

        fusion_stats_job = fusion_stats.fusion_stats(cff_dir, out_dir,
                                                     sampleinfo_file)
        category_table_job = fusion_stats.generate_category_count_table(
            cff_dir, out_dir)
        category_barplot_job = fusion_stats.generate_categories_barplot(
            fusion_stats_dir=out_dir)
        job = concat_jobs([
            Job(command="mkdir -p " + out_dir), fusion_stats_job,
            category_table_job, category_barplot_job
        ],
                          name="fusion_stats")

        jobs.append(job)
        return jobs
Example #29
    def convert_fusion_results_to_cff(self):
        """
        Convert fusion results of all 4 gene fusion callers to cff format
        """
        jobs = []
        out_dir = os.path.join("fusions", "cff")
        job_list = [Job(command="mkdir -p " + out_dir)]
        sampleinfo_file = os.path.relpath(self.args.sampleinfo.name,
                                          self.output_dir)

        for sample in self.samples:

            # Define result files
            # output_file = os.path.join(output_dir, prefix + "_STAR-SEQR", prefix  + "_STAR-SEQR_candidates.txt")
            # star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
            #                                 "out_STAR-SEQR", "out_STAR-SEQR_candidates.txt")
            star_seqr_result = os.path.join("fusions", "star_seqr",
                                            sample.name,
                                            "out_STAR-SEQR_candidates.txt")
            # print >> sys.stderr, star_seqr_result
            arriba_result = os.path.join("fusions", "arriba", sample.name,
                                         "fusions.tsv")
            # star_fusion_result = os.path.join("fusions", "star_fusion",
            #                                   sample.name, "star-fusion.fusion_predictions.abridged.tsv")
            star_fusion_result = os.path.join(
                "fusions", "star_fusion", sample.name,
                "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
            defuse_result = os.path.join("fusions", "defuse", sample.name,
                                         "results.filtered.tsv")
            fusionmap_result = os.path.join("fusions", "fusionmap",
                                            sample.name,
                                            "02_RNA.FusionReport.txt")
            ericscript_result = os.path.join("fusions", "ericscript",
                                             sample.name,
                                             "fusion.results.filtered.tsv")
            integrate_result = os.path.join("fusions", "integrate",
                                            sample.name, "breakpoints.cov.tsv")
            cicero_result = os.path.join("fusions", "cicero", sample.name,
                                         "final_fusions.txt")
            # Build tool_results list based on self.tool_list
            result_file_dict = {
                "star_seqr": star_seqr_result,
                "arriba": arriba_result,
                "star_fusion": star_fusion_result,
                "defuse": defuse_result,
                "fusionmap": fusionmap_result,
                "ericscript": ericscript_result,
                "integrate": integrate_result,
                "cicero": cicero_result
            }
            tool_results = [(key, result_file_dict[key])
                            for key in result_file_dict.keys()
                            if key in self.tool_list]
            # tool_results = [("star_seqr",star_seqr_result), ("arriba", arriba_result),
            #                 ("star_fusion", star_fusion_result), ("defuse", defuse_result),
            #                 ("fusionmap", fusionmap_result), ("ericscript", ericscript_result),
            #                 ("integrate", integrate_result)]
            # tool_results = [("arriba", arriba_result), ("star_fusion", star_fusion_result),
            #                 ("defuse", defuse_result), ("fusionmap", fusionmap_result),
            #                 ("ericscript", ericscript_result), ("integrate", integrate_result)]
            # determine sample_type
            """
            sample_type = ""
            for contrast in self.contrasts:
                if sample in contrast.controls:
                    sample_type = "Normal"
                elif sample in contrast.treatments:
                    sample_type = "Tumor"
                if sample_type:
                    disease_name = contrast.name
                    break    
            if not sample_type:
                raise Exception("Error: sample " + sample.name + " not found in design file " + self.args.design.name)
            """
            # convert caller output files to common fusion format(cff)
            for tool, result_file in tool_results:
                job = cff_conversion.cff_convert(sample.name, result_file,
                                                 sampleinfo_file, tool,
                                                 out_dir)
                job.command = job.command.strip()
                job_list.append(job)
        job = concat_jobs(job_list, name="cff_conversion")
        jobs.append(job)
        return jobs
Example #30
    def run_cicero(self):
        """
        Fusion detection specializing in internal tandem duplication (ITD)
        https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02043-x
        https://github.com/stjude/Cicero

        This software runs as a docker application. However, this can also be installed manually.
        As of May 2021, versions 0.2.0, 0.3.0 and 1.4.2 are available as modules on the HPF.

        Also runs RNApeg, a complementary tool to generate the junctions file for use by CICERO.
        Available on the HPF via RNApeg/20210226 and runs as a singularity container.
        """
        jobs = []

        for sample in self.samples:
            # Get fastq files
            if len(sample.readsets) > 1:
                raise Exception("Error: only one read set per sample allowed")
            if sample.readsets[0].bam:  # .bam input
                fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                         sample.name)
                bam = sample.readsets[0].bam
                fq1 = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz",
                                            bam)))
                fq2 = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz",
                                            bam)))
            elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                    ".")[-1] == "gz":
                fq1 = sample.readsets[0].fastq1
                fq2 = sample.readsets[0].fastq2
            else:
                raise Exception(
                    "Error: only .bam and .fastq.gz inputs allowed")

            # Directories
            tmp_dir = "/localhd/${PBS_JOBID}"  # The variable should be unevaluated in the qsub script
            trim_dir = os.path.join(tmp_dir, "trimmomatic")
            align_dir = os.path.join(tmp_dir, "star")
            cicero_dir = os.path.join(tmp_dir, "cicero")
            rnapeg_dir = os.path.join(tmp_dir, "rnapeg")
            output_dir = os.path.join("fusions", "cicero", sample.name)

            # Files
            fq1_trimmed = os.path.join(
                trim_dir, "".join([sample.name, ".trimmed.R1.fq.gz"]))
            fq2_trimmed = os.path.join(
                trim_dir, "".join([sample.name, ".trimmed.R2.fq.gz"]))
            fq1_dropped = os.path.join(
                trim_dir, "".join([sample.name, ".filtered.R1.fq.gz"]))
            fq2_dropped = os.path.join(
                trim_dir, "".join([sample.name, ".filtered.R2.fq.gz"]))
            trim_log = os.path.join(trim_dir,
                                    "".join([sample.name, ".trim.log"]))
            star_bam = os.path.join(align_dir, "Aligned.sortedByCoord.out.bam")
            dedup_bam = os.path.join(align_dir,
                                     "Aligned.sortedByCoord.dedup.bam")
            dedup_metrics = os.path.join(
                align_dir, "Aligned.sortedByCoord.dedup.metrics")
            symlink_bam = os.path.join(cicero_dir, sample.name + ".bam")
            junction_file = os.path.join(
                rnapeg_dir, sample.name + ".bam.junctions.tab.shifted.tab")

            # Jobs
            trim = trimmomatic.trimmomatic(
                fq1, fq2, fq1_trimmed, fq1_dropped, fq2_trimmed, fq2_dropped,
                None, None,
                config.param("trimmomatic", "adapter_fasta",
                             required=False), trim_log)
            align = star.align(fq1_trimmed,
                               fq2_trimmed,
                               align_dir,
                               config.param("run_cicero", "genome_build"),
                               rg_id=sample.name,
                               rg_library=sample.name,
                               rg_sample=sample.name,
                               rg_platform="ILLUMINA",
                               sort_bam=True)
            index = samtools.index(star_bam)
            # Also indexes for us! idx_file=re.sub(r"\.bam$", ".bai", dedup_bam)
            dedup = picard.mark_duplicates([star_bam], dedup_bam,
                                           dedup_metrics)
            # RNApeg
            rna_peg = Job(
                input_files=[dedup_bam],
                output_files=[junction_file],
                module_entries=[("run_cicero", "module_rnapeg")],
                name="RNApeg",
                command="""ln -s \\\n{idx_file} \\\n{new_idx_file} && \\
ln -s {bamfile} \\\n{new_bamfile} && \\
singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd -B {outpath}:/results \\
$(which rnapeg.sif) RNApeg.sh -b {new_bamfile} \\\n   -f {ref} \\\n   -r {reflat}"""
                .format(bamfile=dedup_bam,
                        ref=config.param("run_cicero",
                                         "reference",
                                         required=True),
                        reflat=config.param("run_cicero",
                                            "reflat",
                                            required=True),
                        outpath=rnapeg_dir,
                        idx_file=re.sub(r"\.bam$", ".bai", dedup_bam),
                        new_bamfile=symlink_bam,
                        new_idx_file=symlink_bam + ".bai"))
            # Cicero
            cicero = Job(
                input_files=[dedup_bam, junction_file],
                output_files=[
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "final_fusions.txt")
                ],
                module_entries=[("run_cicero", "module_cicero")],
                name="run_cicero" + sample.name,
                command=
                """singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd \\
                         $CICERO_PATH/CICERO_1.4.2.sif \\
Cicero.sh -n {threads} -b {bamfile} \\\n -g {genome} \\\n -r {reference} \\\n  -j {junction} -o {out_dir}"""
                .format(threads=config.param("run_cicero",
                                             "threads",
                                             required=True),
                        bamfile=symlink_bam,
                        genome=config.param("run_cicero",
                                            "genome",
                                            required=True),
                        reference=config.param("run_cicero",
                                               "cicero_data",
                                               required=True),
                        junction=junction_file,
                        out_dir=cicero_dir))
            save_out = Job(
                input_files=[
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "final_fusions.txt")
                ],
                output_files=[os.path.join(output_dir, "final_fusions.txt")],
                name="save_cicero_results" + sample.name,
                command="""mv {files_to_keep} {target_dir}""".format(
                    files_to_keep=" ".join([
                        junction_file,
                        os.path.join(cicero_dir, "0*.{err,log}"),  # Logs
                        os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                     "*.{txt,frame.tab,html}")  #
                        # Result files
                    ]),
                    target_dir=output_dir)
            )  # the files in /localhd/ should be removed automatically upon job end

            job_mkdir = Job(
                command="mkdir -p {trim} {align} {cicero} {output} {rnapeg}".
                format(trim=trim_dir,
                       align=align_dir,
                       cicero=cicero_dir,
                       output=output_dir,
                       rnapeg=rnapeg_dir))
            combined_job = concat_jobs([
                job_mkdir, trim, align, index, dedup, rna_peg, cicero, save_out
            ],
                                       name="run_cicero." + sample.name)
            # Replace input and output specification
            combined_job._output_files = [
                os.path.join(output_dir, "final_fusions.txt")
            ]
            combined_job.input_files = [fq1, fq2]
            jobs.append(combined_job)
        return jobs