Пример #1
0
def download_genome_seq(genome,
                        output_dir):
    """
    Download genome sequence files from UCSC.

    Fetches every file under ``<UCSC_GOLDENPATH_FTP>/<genome>/chromosomes/``
    into ``<output_dir>/genome``, deletes the downloaded ``*.fa.gz``
    files whose name contains an underscore, and uncompresses the
    remaining ``*.gz`` files with ``gunzip``.

    Nothing is downloaded when the target directory already exists
    and contains at least one entry.

    Side effect: the working directory of the process is changed to
    the target directory.

    Returns None.  Exits with status 1 when the downloaded files
    cannot be uncompressed.
    """
    # Local imports: the original block did not rely on these modules,
    # so they are not assumed to be imported at module level.
    import subprocess
    import sys

    print("Downloading genome sequence files for %s" % (genome))
    print("  - Output dir: %s" % (output_dir))
    output_dir = os.path.join(output_dir, "genome")
    if os.path.isdir(output_dir) and os.listdir(output_dir):
        print("Directory %s exists and contains files; "
              "skipping download of genome..." % (output_dir))
        return None
    utils.make_dir(output_dir)
    # Change to output directory (presumably so that wget saves the
    # files there -- confirm against download_utils.wget)
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP,
                                         genome)
    # Fetch all chromosome sequence files
    download_utils.wget(os.path.join(genome_url, "*"))
    # Remove files with an underscore in their name
    # (e.g. chr13_random.fa.gz)
    for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
        if "_" in os.path.basename(fname):
            print("Deleting: %s" % (fname))
            os.remove(fname)
    ##
    ## Uncompress the files
    ##
    print("Uncompressing files...")
    # Expand the pattern in Python and pass the files as an argument
    # list, so paths containing spaces or shell metacharacters work.
    gz_fnames = sorted(glob.glob(os.path.join(output_dir, "*.gz")))
    t1 = time.time()
    # gunzip without file arguments would read from stdin, so an
    # empty file list is reported as a failure instead of being run.
    ret_val = 1
    if gz_fnames:
        try:
            ret_val = subprocess.call(["gunzip"] + gz_fnames)
        except OSError:
            # gunzip could not be started (e.g. it is not installed)
            ret_val = 1
    if ret_val != 0:
        print("Error: Cannot uncompress files in %s" % (output_dir))
        sys.exit(1)
    t2 = time.time()
    print("Uncompressing took %.2f minutes" % ((t2 - t1)/60.))
Пример #2
0
def _run_command(args, stdout=None):
    """
    Run an external command without going through the shell.

    args is the argument list (program name first); stdout, when
    given, is an open file object that receives the command's
    standard output.

    Returns the exit status of the command, or 1 when the program
    could not be started.
    """
    import subprocess
    try:
        return subprocess.call(args, stdout=stdout)
    except OSError:
        return 1


def download_genome_seq(genome,
                        output_dir):
    """
    Download genome sequence files from UCSC.

    Works in ``<output_dir>/genome`` (created if needed) and performs
    up to three steps:

    1. If that directory is empty, fetch every file under
       ``<UCSC_GOLDENPATH_FTP>/<genome>/chromosomes/``, delete the
       downloaded ``*.fa.gz`` files whose name contains an underscore
       and uncompress the remaining ``*.gz`` files with ``gunzip``.
       If the directory already contains files, this step is skipped.
    2. If ``<genome>.fa`` does not exist in the directory, concatenate
       all ``*.fa`` files (in sorted filename order) into it.
    3. If ``<genome>.fa`` was just created, or its ``.fai`` index is
       missing, index it with ``samtools faidx``.

    Side effect: the working directory of the process is changed to
    the target directory.

    Returns None.  Exits with status 1 when uncompressing,
    concatenating or indexing fails; a partially written genome or
    index file is removed first so a later run redoes the step.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print("  - Output dir: %s" % (output_dir))
    output_dir = utils.pathify(os.path.join(output_dir, "genome"))
    utils.make_dir(output_dir)
    dir_files = os.listdir(output_dir)
    # Change to output directory (presumably so that wget saves the
    # files there -- confirm against download_utils.wget)
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP,
                                         genome)
    # Fetch all chromosome sequence files
    if dir_files:
        print("Directory %s exists and contains files; "
              "skipping download of genome..."
              % (output_dir))
    else:
        download_utils.wget(os.path.join(genome_url, "*"))
        # Remove files with an underscore in their name
        for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
            if "_" in os.path.basename(fname):
                print("Deleting: %s" % (fname))
                os.remove(fname)
        ##
        ## Uncompress the files
        ##
        print("Uncompressing files...")
        # Expand the pattern in Python and pass an argument list, so
        # paths with spaces or shell metacharacters are handled.
        gz_fnames = sorted(glob.glob(os.path.join(output_dir, "*.gz")))
        uncompress_cmd = ["gunzip"] + gz_fnames
        print("  - Uncompress cmd: %s" % (" ".join(uncompress_cmd)))
        t1 = time.time()
        # gunzip without file arguments would read from stdin, so an
        # empty file list is reported as a failure instead of being run.
        if not gz_fnames or _run_command(uncompress_cmd) != 0:
            print("Error: Cannot uncompress files in %s" % (output_dir))
            sys.exit(1)
        t2 = time.time()
        print("Uncompressing took %.2f minutes" % ((t2 - t1)/60.))
    # Create a single genome FASTA file by concatenating the
    # chromosomes together
    genome_output_fname = \
        os.path.join(output_dir, "%s.fa" % (genome))
    # samtools faidx writes its index next to the FASTA file with an
    # extra ".fai" suffix
    index_fname = "%s.fai" % (genome_output_fname)
    need_concat = not os.path.isfile(genome_output_fname)
    # Index again when a previous run produced the genome file but
    # did not get as far as a complete index.
    need_index = need_concat or not os.path.isfile(index_fname)
    if need_index:
        t1 = time.time()
        if need_concat:
            print("Concatenating genome chromosomes into one file...")
            print("  - Output file: %s" % (genome_output_fname))
            # Sorted for a deterministic, locale-independent order; the
            # output file itself is never one of the inputs.
            chrom_fnames = sorted(
                fname
                for fname in glob.glob(os.path.join(output_dir, "*.fa"))
                if fname != genome_output_fname)
            concat_chrom_cmd = ["cat"] + chrom_fnames
            print("  - Concat cmd: %s > %s" % (" ".join(concat_chrom_cmd),
                                               genome_output_fname))
            # cat without file arguments would read from stdin, so an
            # empty input list counts as a failure.
            ret_val = 1
            if chrom_fnames:
                with open(genome_output_fname, "wb") as genome_out:
                    ret_val = _run_command(concat_chrom_cmd,
                                           stdout=genome_out)
            if ret_val != 0:
                # Do not leave a partial genome file behind: its mere
                # existence would make the next run skip this step.
                if os.path.isfile(genome_output_fname):
                    os.remove(genome_output_fname)
                print("Error: Could not concatenate genome chromosomes.")
                sys.exit(1)
        # Create an index for resulting genome file
        print("Indexing genome file...")
        samtools_index_cmd = ["samtools", "faidx", genome_output_fname]
        print("  - Index cmd: %s" % (" ".join(samtools_index_cmd)))
        if _run_command(samtools_index_cmd) != 0:
            # Drop a possibly incomplete index so the next run retries
            if os.path.isfile(index_fname):
                os.remove(index_fname)
            print("Error: Could not index genome file.")
            sys.exit(1)
        t2 = time.time()
        print("Concatenation and indexing took %.2f minutes"
              % ((t2 - t1)/60.))