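#Imports needed to make this snippet self-contained. The tobias.* module
#paths below are assumptions based on the TOBIAS package layout and may
#differ between versions; motif_scanning, process_TFBS and file_writer are
#expected to be defined elsewhere in this module.
import os
import sys
import argparse
import itertools
import multiprocessing as mp

import numpy as np
import pysam

from tobias.parsers import add_tfbscan_arguments  #assumed module path
from tobias.utils.utilities import (check_required, check_files,  #assumed module path
                                    make_directory, monitor_progress)
from tobias.utils.sequences import get_gc_content  #assumed module path
from tobias.utils.regions import RegionList, OneRegion  #assumed module path
from tobias.utils.motifs import MotifList, OneMotif  #assumed module path
from tobias.utils.logger import TobiasLogger  #assumed module path

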
def run_tfbscan(args):

    ###### Check input arguments ######
    check_required(args, ["motifs", "fasta"])
    check_files([args.motifs, args.fasta, args.regions])  #Check if files exist

    ##Test input
    if args.outdir is not None and args.outfile is not None:  #Error - both set
        sys.exit("ERROR: Please choose either --outdir or --outfile")
    elif args.outfile is None:  #Default: separate output files per TF
        args.outdir = "tfbscan_output/" if args.outdir is None else args.outdir
        make_directory(args.outdir)  #Check and create output directory
    else:  #args.outfile is set and args.outdir is not: one joined file
        check_files([args.outfile], "w")

    ###### Create logger and write argument overview ######
    logger = TobiasLogger("TFBScan", args.verbosity)
    logger.begin()
    parser = add_tfbscan_arguments(argparse.ArgumentParser())

    logger.arguments_overview(parser, args)

    if args.outfile is not None:
        logger.output_files([args.outfile])

    ######## Read sequences from file and estimate background gc ########

    logger.info("Handling input files")
    logger.info("Reading sequences from fasta")

    fastafile = pysam.FastaFile(args.fasta)
    fasta_chrom_info = dict(zip(fastafile.references, fastafile.lengths))
    fastafile.close()
    logger.stats("- Found {0} sequences in fasta".format(
        len(fasta_chrom_info)))

    #Create regions available in fasta
    logger.info("Setting up regions")
    fasta_regions = RegionList([
        OneRegion([header, 0, fasta_chrom_info[header]])
        for header in fasta_chrom_info
    ])

    #If subset, setup regions
    if args.regions:
        regions = RegionList().from_bed(args.regions)

    else:  #set up regions from fasta references
        regions = fasta_regions
        regions = regions.apply_method(OneRegion.split_region, 1000000)
        regions = regions.apply_method(OneRegion.extend_reg, 50)  #extend to overlap at junctions

    #Clip regions at chromosome boundaries
    regions = regions.apply_method(OneRegion.check_boundary, fasta_chrom_info,
                                   "cut")
    if len(regions) == 0:
        logger.error("No regions found.")
        sys.exit(1)  #exit non-zero on error
    logger.info("- Total of {0} regions (after splitting)".format(
        len(regions)))

    #Background gc
    if args.gc is None:
        logger.info(
            "Estimating GC content from fasta (set --gc to skip this step)")
        args.gc = get_gc_content(regions, args.fasta)
        logger.info("- GC content: {0}".format(round(args.gc, 5)))

    bg = np.array([(1 - args.gc) / 2.0, args.gc / 2.0, args.gc / 2.0,
                   (1 - args.gc) / 2.0])
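    #Sanity check on the symmetric background model: bg always sums to 1;
    #e.g. args.gc == 0.4 gives bg == [0.3, 0.2, 0.2, 0.3] for A/C/G/T.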

    #Split regions
    region_chunks = regions.chunks(args.split)
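    #chunks() semantics assumed from usage below: the RegionList is split into
    #args.split roughly equal sublists, each becoming one scanning task.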

    #################### Read motifs from file ####################

    logger.info("Reading motifs from file")

    motif_list = MotifList().from_file(args.motifs)
    logger.stats("- Found {0} motifs".format(len(motif_list)))

    logger.debug("Getting motifs ready")
    motif_list.bg = bg

    reverse_motifs = [motif.get_reverse() for motif in motif_list]
    motif_list.extend(reverse_motifs)
    for motif in motif_list:  #now with reverse motifs as well
        motif.set_prefix(args.naming)
        motif.bg = bg
        motif.get_pssm()

    motif_names = list(set([motif.prefix for motif in motif_list]))
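    #set() collapses duplicates: the forward and reverse version of a motif
    #share a prefix, so each TF maps to a single name (and output file).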

    #Calculate scanning-threshold for each motif
    pool = mp.Pool(processes=args.cores)
    outlist = pool.starmap(OneMotif.get_threshold,
                           itertools.product(motif_list, [args.pvalue]))
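    #itertools.product pairs each motif with the single p-value, so this is
    #equivalent to [OneMotif.get_threshold(m, args.pvalue) for m in motif_list],
    #spread across args.cores processes.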
    motif_list = MotifList(outlist)

    pool.close()
    pool.join()

    #################### Find TFBS in regions #####################

    logger.comment("")
    logger.info("Scanning for TFBS with all motifs")

    manager = mp.Manager()

    if args.outdir is not None:
        writer_cores = max(1, int(args.cores * 0.1))
        worker_cores = max(1, args.cores - writer_cores)

    elif args.outfile is not None:  #Write to one file
        writer_cores = 1
        worker_cores = max(1, args.cores - writer_cores)
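    #Core split heuristic: with --outdir, ~10% of cores (at least one) are
    #reserved for writing; with --outfile, a single writer serializes all hits.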

    #Setup pools
    logger.debug("Writer cores: {0}".format(writer_cores))
    logger.debug("Worker cores: {0}".format(worker_cores))
    worker_pool = mp.Pool(processes=worker_cores, maxtasksperchild=1)
    writer_pool = mp.Pool(processes=writer_cores)

    #Setup bed-writers based on --outdir or --outfile
    temp_files = []
    qs = {}
    TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)]
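    #Round-robin split: e.g. with writer_cores == 2 and names [A, B, C, D],
    #the chunks are [A, C] and [B, D], one chunk of TFs per writer process.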
    for TF_names_sub in TF_names_chunks:

        #Skip over any empty chunks
        if len(TF_names_sub) == 0:
            continue

        logger.debug("Creating writer queue for {0}".format(TF_names_sub))

        if args.outdir is not None:
            files = [
                os.path.join(args.outdir, TF + ".tmp") for TF in TF_names_sub
            ]
            temp_files.extend(files)
        elif args.outfile is not None:
            files = [args.outfile + ".tmp"
                     for TF in TF_names_sub]  #write to the same file for all
            temp_files.append(files[0])

        q = manager.Queue()

        TF2files = dict(zip(TF_names_sub, files))
        logger.debug("TF2files dict: {0}".format(TF2files))
        writer_pool.apply_async(file_writer, args=(q, TF2files, args))
        for TF in TF_names_sub:
            qs[TF] = q
    writer_pool.close()  #no more jobs applied to writer_pool
    args.qs = qs  #qs is a dict

    #Setup scanning tasks: one task per chunk of regions
    task_list = [
        worker_pool.apply_async(motif_scanning, (chunk, args, motif_list))
        for chunk in region_chunks
    ]
    monitor_progress(task_list, logger)
    results = [task.get() for task in task_list]  #re-raises any worker exceptions

    #Signal writers to stop with a (None, None) sentinel per queue, then wait
    for TF in qs:
        qs[TF].put((None, None))

    writer_pool.join()

    #Process each file output and write out
    logger.comment("")
    logger.info("Processing results from scanning")
    logger.debug("Running processing for files: {0}".format(temp_files))
    task_list = [
        worker_pool.apply_async(process_TFBS, (file, args))
        for file in temp_files
    ]
    worker_pool.close()
    monitor_progress(task_list, logger)
    results = [task.get() for task in task_list]  #re-raises any worker exceptions
    worker_pool.terminate()

    logger.debug("Joining multiprocessing pools")
    worker_pool.join()
    writer_pool.join()

    logger.end()


#----------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser = add_tfbscan_arguments(parser)

    #Show help and exit before parsing if no arguments were given
    if len(sys.argv[1:]) == 0:
        parser.print_help()
        sys.exit()

    args = parser.parse_args()

    run_tfbscan(args)
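

#A minimal invocation sketch (file names hypothetical; flags inferred from the
#argument checks above, see add_tfbscan_arguments for the full interface):
#  python tfbscan.py --motifs motifs.jaspar --fasta genome.fa --outdir tfbscan_output/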