예제 #1
0
 def setUp(self):
     """Build the NGS and annotation sequence-file fixtures.

     Sets ``self.ann_dir`` and ``self.size`` to the test annotation and
     chromosome-size paths, wraps the two sample BAM files as 'NGS'
     ``SeqFile`` objects, wraps the result of ``bed2bam`` as an 'ANN'
     ``SeqFile``, and finally passes the NGS list to ``load_seqinfo``.
     """
     self.ann_dir = "tests/data/ann.bed12"
     self.size = "tests/data/chrom.sizes"

     # One NGS SeqFile per sample, in the same order as the sample names.
     ngs_files = []
     for sample in ('Bract', 'Cotyledon'):
         bam_path = os.path.join('tests/data', "%s.sort.bam" % sample)
         ngs_files.append(bam_path)
     self.ngs = [SeqFile(bam_path, 'NGS') for bam_path in ngs_files]

     # "/tmp/igia" is passed as bed2bam's third argument
     # (presumably its working/output directory -- confirm in bed2bam).
     ann_bam = bed2bam(self.ann_dir, self.size, "/tmp/igia")
     self.ann = SeqFile(ann_bam, 'ANN')

     load_seqinfo(self.ngs)
예제 #2
0
 def setUp(self):
     """Build sequence files, two covered intervals and their introns.

     Creates the 'NGS' and 'TGS' ``SeqFile`` fixtures, then for one
     reverse-strand and one forward-strand ``Interval`` on Chr01 builds
     coverage from the NGS files and stores the result of
     ``JunctionGraphNgs.identify_intron`` (``self.rev_intron`` /
     ``self.fwd_intron``). Finally opens the test genome as ``self.genome``.
     """
     ngs_paths = []
     for sample in ('Bract', 'Cotyledon'):
         ngs_paths.append(os.path.join('tests/data', "%s.sort.bam" % sample))
     self.ngs = [SeqFile(path, 'NGS') for path in ngs_paths]
     load_seqinfo(self.ngs)

     self.tgs = [SeqFile('tests/data/all_fixed_star.sort.bam', 'TGS')]

     # Reverse-strand region.
     self.ival_rev = Interval("Chr01", 93000, 96500, "-")
     self.ival_rev.build_cov(self.ngs)
     rev_graph = JunctionGraphNgs(self.ival_rev)
     self.rev_intron = rev_graph.identify_intron(self.ngs)

     # Forward-strand region.
     self.ival_fwd = Interval("Chr01", 485500, 493000, "+")
     self.ival_fwd.build_cov(self.ngs)
     fwd_graph = JunctionGraphNgs(self.ival_fwd)
     self.fwd_intron = fwd_graph.identify_intron(self.ngs)

     genome_path = os.path.join("tests", "data", "genome.fa")
     self.genome = GenomeFile(genome_path)
예제 #3
0
def main(args):
    """Main entry point allowing external calls.

    Parses the command line, loads the NGS/TGS/annotation inputs, builds the
    linkage over all of them, and then, for every linkage region, identifies
    elements and transcripts and writes them through ``OutputHandle``.

    When ``args.time_out`` is set, each region is guarded by a SIGALRM
    alarm of that many seconds. A region that raises ``TimeOutError`` is
    reported on stdout and appended to ``igia_debug_timeout.log`` in the
    output directory, and processing continues with the next region.

    Args:
      args ([str]): command line parameter list
    """
    args = parse_args(args)
    setup_logging(args.loglevel)
    check_paraclu(args)

    _logger.debug("Starting IGIA ...")

    ngs_obj_list = [SeqFile(x, "NGS") for x in args.ngs_file]
    tgs_obj_list = [SeqFile(x, "TGS") for x in args.tgs_file]
    ext_tss_list = load_txs(args.tss)
    ext_tes_list = load_txs(args.tes)

    out_dir = args.out_dir
    ann = load_ann(args.ann, args.size, out_dir, "ANN")

    # Update Global variables.
    GVAR.RULE = args.rule
    GVAR.TXS_DIFF = args.dtxs
    GVAR.SPLICED_INTRON_PIR_CUTOFF = args.pir
    f_genome = args.f_genome
    paraclu_path = args.paraclu_path

    load_seqinfo(ngs_obj_list)
    _logger.info("Start building linkage ...")
    bam_list = ngs_obj_list + tgs_obj_list
    if ann is not None:
        bam_list += [ann]
    linkage = find_linkage(bam_list)
    _logger.info("Finish building linkage")

    use_timeout = args.time_out is not None
    cluster_indx = 0
    with OutputHandle(out_dir) as f_out:
        for chrom, start, end in linkage.iterlinkage():
            try:
                try:
                    if use_timeout:
                        signal.signal(signal.SIGALRM, time_out_handler)
                        signal.alarm(args.time_out)
                    _logger.debug(
                        "Start identifying elements in {0}:{1}-{2}".format(
                            chrom, start, end))
                    gene_cluster_list = identify_element(
                        chrom, start, end, ngs_obj_list, tgs_obj_list,
                        ext_tss_list, ext_tes_list, ann, f_genome,
                        paraclu_path)
                    _logger.debug(
                        "Finish identifying elements in {0}:{1}-{2}".format(
                            chrom, start, end))

                    for gene_cluster in gene_cluster_list:  # list of gene cluster without any common exon
                        if not gene_cluster.has_element():
                            continue
                        cluster_indx += 1
                        cluster_name = "c_{0}".format(cluster_indx)
                        gene_cluster.write_element2bed6(
                            *f_out.element_handles(), cluster_name)

                        _logger.debug(
                            "Start identifying transcript for {0}".format(
                                gene_cluster))
                        trans = identify_transcript(gene_cluster, ann)
                        trans.write2bed12(cluster_name,
                                          *f_out.isoform_handles())
                        _logger.debug(
                            "Finish identifying transcript for {0}".format(
                                gene_cluster))
                finally:
                    # Always cancel the pending alarm, including when a
                    # non-timeout exception escapes; otherwise SIGALRM would
                    # fire later outside the TimeOutError handler. The
                    # cancellation is nested inside the outer ``try`` so an
                    # alarm firing just before it is still caught below.
                    if use_timeout:
                        signal.alarm(0)
            except TimeOutError:
                print("TimeOut: {0}\t{1}\t{2}\n".format(chrom, start, end))
                with open(os.path.join(args.out_dir, "igia_debug_timeout.log"),
                          "a") as f:
                    f.write("TimeOut ({0}s): {1}\t{2}\t{3}\n".format(
                        args.time_out, chrom, start, end))

    _logger.info("End")
예제 #4
0
def main(args):
    """Main entry point allowing external calls (MPI mode).

    Rank 0 parses the command line, picks a temporary directory name inside
    the output directory and broadcasts the parsed arguments. Every rank
    then opens the input files. Rank 0 acts as the ``LBMaster`` for two
    phases (linkage scan, then transcript identification) while the other
    ranks run the matching worker. Finally rank 0 concatenates the per-node
    result files from ``<tmp_dir>/node_<rank>`` into the output directory
    and removes those per-node directories.

    Args:
      args ([str]): command line parameter list

    Raises:
      ValueError: if an annotation file is given without a chrom size file.
        Raised on every rank so that no rank is left waiting in a broadcast.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    rank_size = comm.Get_size()

    if rank == 0:
        args = parse_args(args)
        check_paraclu(args)
        out_dir = args.out_dir
        rand_seq = str(hash(time.time() * 100000 + random.random()))
        tmp_dir = os.path.join(out_dir, rand_seq)
        args.tmp_dir = tmp_dir
        setup_logging(args.loglevel)
        _logger.info("Starting IGIA in MPI mode ...")
    else:
        args = None

    # After this broadcast every rank holds the same parsed arguments.
    args = comm.bcast(args, root=0)

    out_dir = args.out_dir
    tmp_dir = args.tmp_dir

    ngs_obj_list = [SeqFile(x, "NGS") for x in args.ngs_file]
    tgs_obj_list = [SeqFile(x, "TGS") for x in args.tgs_file]
    ext_tss_list = load_txs(args.tss)
    ext_tes_list = load_txs(args.tes)
    f_ann = args.ann
    size = args.size
    f_genome = args.f_genome
    paraclu_path = args.paraclu_path

    ann = None
    if f_ann:
        if rank == 0:
            _logger.info("Loading annotation transcripts ...")
        # Validate on every rank (the arguments are identical everywhere):
        # raising on rank 0 only would leave the other ranks blocked in the
        # broadcast below.
        if not size:
            raise ValueError("Error: missing Chrom Size file")

        # Only rank 0 converts the annotation; the resulting path is shared.
        ann_bam = bed2bam(f_ann, size, tmp_dir) if rank == 0 else None
        ann_bam = comm.bcast(ann_bam, root=0)
        ann = SeqFile(ann_bam, "ANN")

    # Update Global variables
    GVAR.RULE = args.rule
    GVAR.TXS_DIFF = args.dtxs
    GVAR.SPLICED_INTRON_PIR_CUTOFF = args.pir

    load_seqinfo(ngs_obj_list)

    _logger.info("Start building linkage ... ")
    bam_list = ngs_obj_list + tgs_obj_list
    if ann is not None:
        bam_list += [ann]

    # Scatter linkage scan tasks
    if rank == 0:
        chrom_size_list = bam_list[0].chromsize()
        scan_linkage_infos = [(chrom_size, seq_obj_indx)
                              for chrom_size in chrom_size_list
                              for seq_obj_indx in range(len(bam_list))]
        master = LBMaster(comm,
                          GVAR.MAX_QUEUE_LEN,
                          master_tag=0,
                          worker_tag=1,
                          sleep=GVAR.SLEEP_TIME)
        linkages = master.do(scan_linkage_infos)
        linkage = Linkage()
        for sub_linkage in linkages:
            linkage.add_linkage(sub_linkage)
    else:
        worker = LBScanLinkageWorker(comm, master_tag=0, worker_tag=1)
        worker.do(bam_list)
        linkage = None

    _logger.info("Finish building linkage ... ")

    # Collective hand-shake between the two phases: every rank must receive
    # the "do" token from rank 0 before continuing.
    if rank == 0:
        data = "do"
    else:
        data = None
    data = comm.bcast(data, root=0)
    assert data == "do"

    # Identify transcripts
    if rank == 0:
        _logger.info("Start identifying transcripts")
        linkage_region_list = list(linkage.iterlinkage())
        master = LBMaster(comm,
                          GVAR.MAX_QUEUE_LEN,
                          master_tag=2,
                          worker_tag=3,
                          sleep=GVAR.SLEEP_TIME)
        master.do(linkage_region_list)
    else:
        worker = LBIdentifyIsoWorker(comm, master_tag=2, worker_tag=3)
        worker.do(tmp_dir, ngs_obj_list, tgs_obj_list, ext_tss_list,
                  ext_tes_list, ann, f_genome, paraclu_path)

    # Merge results
    if rank == 0:
        _logger.info("Finish identifying transcripts")
        outfiles = [
            "intron.bed6", "internal_exon.bed6", "tss_exon.bed6",
            "tes_exon.bed6", "isoF.bed12", "isoA.bed12", "isoR.bed12",
            "isoM.bed12", "isoC.bed12", "isoP.bed12"
        ]
        res_dirs = [
            os.path.join(tmp_dir, "node_{0}".format(x))
            for x in range(1, rank_size)
        ]
        # Commands are passed as argument lists (no shell) so that paths
        # containing spaces or shell metacharacters are handled literally.
        for name in outfiles:
            node_files = [os.path.join(res_dir, name) for res_dir in res_dirs]
            with open(os.path.join(out_dir, name), "wb") as merged:
                subprocess.call(["cat", "--"] + node_files, stdout=merged)
        for res_dir in res_dirs:
            subprocess.call(["rm", "-r", "--", res_dir])

    _logger.info("End")