def main():
    """Command-line entry point for the Pfam / SignalP transcript analysis.

    Parses the command line, checks that the external programs
    (``pfam_scan.pl``, ``hmmscan``, ``signalp``) are on PATH and that the
    Pfam directory, genome FASTA file and GTF file are usable, creates the
    output directory when it does not exist yet, and then dispatches on
    ``--mode``:

    * ``full``      -> ``full_transcript_analysis``
    * ``orf``       -> ``orf_analysis`` on all ORFs
    * ``first_orf`` -> ``orf_analysis`` restricted to the first ORF

    Returns:
        The return value of the selected analysis function.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # Both paths are mandatory inputs without a default.  Declaring them as
    # required turns a missing option into a normal usage error instead of a
    # TypeError from os.path.exists(None) further down.
    parser.add_argument('--pfam', dest='pfam_dir', required=True)
    parser.add_argument('--genome-fasta', dest='genome_fasta_file',
                        required=True)
    parser.add_argument('--min-orf-length',
                        dest='min_orf_length',
                        type=int,
                        default=30)
    parser.add_argument('-o', '--output-dir', dest='output_dir', default='out')
    parser.add_argument('-p',
                        '--num-processes',
                        dest='num_processes',
                        type=int,
                        default=1)
    parser.add_argument('--mode',
                        dest='mode',
                        choices=['orf', 'first_orf', 'full'],
                        default='orf')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    pfam_dir = args.pfam_dir
    genome_fasta_file = args.genome_fasta_file
    gtf_file = args.gtf_file
    output_dir = args.output_dir
    # external programs the analyses shell out to
    for program in ('pfam_scan.pl', 'hmmscan', 'signalp'):
        if which(program) is None:
            parser.error("'%s' not found in PATH" % program)
    # input files
    if not check_pfam_dir(pfam_dir):
        parser.error("Required Pfam-A and Pfam-B files not found at '%s'" %
                     (pfam_dir))
    if not os.path.exists(genome_fasta_file):
        parser.error("Genome FASTA file '%s' not found" % (genome_fasta_file))
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # an existing output directory is reused as-is
    if not os.path.exists(output_dir):
        logging.info("Creating output directory '%s'", output_dir)
        os.makedirs(output_dir)
    if args.mode == 'full':
        return full_transcript_analysis(gtf_file, genome_fasta_file, pfam_dir,
                                        output_dir, args.num_processes)
    first_orf_only = (args.mode == 'first_orf')
    return orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                        args.min_orf_length, first_orf_only,
                        args.num_processes)
def main():
    """Command-line entry point for the Pfam / SignalP transcript analysis.

    Parses the command line, checks that the external programs
    (``pfam_scan.pl``, ``hmmscan``, ``signalp``) are on PATH and that the
    Pfam directory, genome FASTA file and GTF file are usable, creates the
    output directory when it does not exist yet, and then runs either
    ``full_transcript_analysis`` (``--mode full``) or ``orf_analysis``
    (``--mode orf`` and ``--mode first_orf``).

    Returns:
        The return value of the selected analysis function.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # --pfam and --genome-fasta have no default; marking them required makes
    # a missing option a usage error rather than a TypeError raised by
    # os.path.exists(None) in the checks below.
    parser.add_argument('--pfam', dest='pfam_dir', required=True)
    parser.add_argument('--genome-fasta', dest='genome_fasta_file', required=True)
    parser.add_argument('--min-orf-length', dest='min_orf_length', type=int, default=30)
    parser.add_argument('-o', '--output-dir', dest='output_dir', default='out')
    parser.add_argument('-p', '--num-processes', dest='num_processes', type=int, default=1)
    parser.add_argument('--mode', dest='mode', choices=['orf', 'first_orf', 'full'], default='orf')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # get args
    pfam_dir = args.pfam_dir
    genome_fasta_file = args.genome_fasta_file
    gtf_file = args.gtf_file
    min_orf_length = args.min_orf_length
    num_processes = args.num_processes
    output_dir = args.output_dir
    mode = args.mode
    # check that the required external programs are available
    if which('pfam_scan.pl') is None:
        parser.error("'pfam_scan.pl' not found in PATH")
    if which('hmmscan') is None:
        parser.error("'hmmscan' not found in PATH")
    if which('signalp') is None:
        parser.error("'signalp' not found in PATH")
    # check the input files
    if not check_pfam_dir(pfam_dir):
        parser.error("Required Pfam-A and Pfam-B files not found at '%s'" % (pfam_dir))
    if not os.path.exists(genome_fasta_file):
        parser.error("Genome FASTA file '%s' not found" % (genome_fasta_file))
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # create output dir (an existing directory is reused)
    if not os.path.exists(output_dir):
        logging.info("Creating output directory '%s'", output_dir)
        os.makedirs(output_dir)
    if mode == 'full':
        return full_transcript_analysis(gtf_file, genome_fasta_file,
                                        pfam_dir, output_dir, num_processes)
    # both remaining modes run the ORF analysis; 'first_orf' limits it to
    # the first ORF only
    first_orf_only = (mode == 'first_orf')
    return orf_analysis(gtf_file, genome_fasta_file, pfam_dir,
                        output_dir, min_orf_length, first_orf_only,
                        num_processes)
def main():
    """Report GWAS SNP overlap of an assembly plus a shuffled null distribution.

    Steps:
      1. Write one BED interval per GTF locus; the file is passed on to the
         shuffle workers.
      2. Pad every interval of the assembly BED file by ``--flank`` bases,
         clipped to the chromosome bounds.
      3. Intersect the padded assembly with the GWAS SNPs and with the SNP
         universe (``bedtools intersect``) and count the distinct rsIDs.
      4. Run ``--shuffs`` tasks through ``shuffle_imap`` on a process pool and
         print every result line to stdout.

    Temporary files are written to ``GWAS_TMPS``, which is removed at the end.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the assembly
        (the overlap fraction would be undefined).
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest='assembly_bed',
                        default=intergenic_assembly_bed,
                        help='Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest='snps',
                        default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl',
                        default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom',
                        default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf',
                        default=gtf_file,
                        help='GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas',
                        default=gwas_bed,
                        help='GWAS bed file file used for intersection')
    # type=int lets argparse reject non-numeric values with a usage error
    parser.add_argument("--shuffs", dest='shuffs', type=int,
                        default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', type=int,
                        default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', type=int,
                        default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()

    logging.info("Output is printed to stdout, to save use '>' <filename>")

    # check command line parameters; validate the file that is actually used
    # (--chrom), not only the built-in default
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')

    def count_overlapping_snps(snp_file, extra_args=()):
        """Intersect snp_file with the padded assembly; count distinct rsIDs."""
        cmd = ['bedtools', 'intersect',
               '-a', snp_file,
               '-b', assembly_flank,
               '-wa',
               '-wb']
        cmd.extend(extra_args)
        with open(intersect_file, 'w') as fileh:
            retcode = subprocess.call(cmd, stdout=fileh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d', retcode)
        with open(intersect_file) as fileh:
            return len(set(line.strip().split('\t')[RSIDCOL] for line in fileh))

    logging.info('Parsing GTF file')
    with open(args.gtf) as gtf_fh, open(locus_intervals_file, 'w') as f:
        for locus_id, locus_transcripts in enumerate(parse_gtf(gtf_fh)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            f.write('\t'.join(map(str, [locus_chrom, locus_start,
                                        locus_end, locus_id])) + '\n')

    # read chrom file to make sure flanks added do not enter chrom ends;
    # lengths must be integers, otherwise min() below compares int with str
    # and the clip at the chromosome end never takes effect
    chrom_length = {}
    with open(args.chrom) as fh:
        for line in fh:
            if not line.strip():
                continue
            fields = line.strip().split('\t')
            chrom_length[fields[0]] = int(fields[1])

    # apply flank to the bed file
    with open(args.assembly_bed) as bed_in, open(assembly_flank, 'w') as bed_out:
        for line in bed_in:
            fields = line.strip().split('\t')
            chrom = fields[0]
            fields[1] = max(0, int(fields[1]) - args.flank)
            fields[2] = min(chrom_length[chrom], int(fields[2]) + args.flank)
            bed_out.write('\t'.join(map(str, fields)) + '\n')

    # GWAS snps: number of GWAS snps overlapping the real assembly
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)

    # snp universe: number of all snps overlapping the real assembly
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps, ['-sorted'])

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'overlap fraction is undefined')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes', gwas_overlap)
    logging.info('%d snps (from "snp universe") overlap compendia genes', snp_overlap)
    logging.info('Frac: %f', frac_real)

    # loop the shuffle to generate a distribution of nulls for number of snps
    # hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = args.shuffs
    shuff_args = (args.snps,
                  args.gwas,
                  args.excl,
                  locus_intervals_file,
                  args.chrom,
                  args.gtf,
                  frac_real,
                  gwas_overlap,
                  snp_overlap,
                  NUM_SHUFFS,
                  prefix,
                  args.flank)
    header = [
              'gwas_shuff',
              'snp_shuff',
              'frac_shuff',
              'gwas_real',
              'snp_real',
              'frac_real',
              'OR'
              ]
    print('\t'.join(header))
    # every task is the shuffle index followed by the shared arguments
    tasks = [(i,) + shuff_args for i in range(NUM_SHUFFS)]
    for line in pool.imap_unordered(shuffle_imap, tasks):
        print(line)
    pool.close()
    pool.join()

    shutil.rmtree(prefix)

    return 0
Exemplo n.º 4
0
def main():
    """Shuffle the gene positions of a GTF file with ``bedtools shuffle``.

    Positional arguments: GTF file, exclusion file, chromosome sizes file and
    an output prefix.  The function

      1. writes one BED interval per gene to ``<prefix>.gene_intervals.bed``,
      2. shuffles those intervals into ``<prefix>.gene_intervals.shuffle.bed``,
      3. moves every transcript (and its exons) by the offset of its gene and
         prints the result of ``write_bed`` for it to stdout,
      4. calls ``sort_gtf`` on ``<prefix>.shuffle.gtf``.

    Returns:
        0 on success, 1 when ``bedtools shuffle`` exits with a non-zero status.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    if not os.path.exists(gtf_file):
        parser.error('GTF file %s not found' % (gtf_file))
    if not os.path.exists(excl_file):
        parser.error('exclusion file %s not found' % (excl_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gtf_file) as gtf_fh, open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(gtf_fh):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts",
                          locus_chrom, locus_start, locus_end,
                          len(locus_transcripts))
            for g in get_gene_intervals(locus_transcripts):
                f.write('\t'.join(map(str, [g.chrom, g.start, g.end, g.gene_id])) + '\n')
    # randomly shuffle genes
    logging.info("Shuffling genes")
    shuffle_cmd = ['bedtools', 'shuffle',
                   '-excl', excl_file,
                   '-i', gene_intervals_file,
                   '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        retcode = subprocess.call(shuffle_cmd, stdout=fileh)
    if retcode != 0:
        # without this check a failed run only shows up as every gene being
        # reported as "could not be shuffled"
        logging.error("bedtools shuffle exited with status %d", retcode)
        return 1
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            shuffle_gene_map[fields[3]] = (fields[0], int(fields[1]), int(fields[2]))
    # reposition transcripts
    logging.info("Repositioning transcripts")
    # NOTE(review): the shuffled GTF file is created here but nothing is
    # written to it; the repositioned transcripts go to stdout in the format
    # produced by write_bed.  Confirm that sorting an empty GTF is intended.
    with open(gtf_file) as gtf_fh, open(shuffled_gtf_file, 'w'):
        for locus_transcripts in parse_gtf(gtf_fh):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be shuffled',
                                    gene_id, orig_chrom, orig_start, orig_end)
                    continue
                new_chrom, new_start, _ = shuffle_gene_map[gene_id]
                # reposition transcript: keep the offset relative to the
                # start of its gene
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                fields = write_bed(t.chrom, t.attrs['transcript_id'], t.strand, 1000, t.exons)
                print('\t'.join(fields))
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
    return 0
def main():
    """Report GWAS and random SNP overlap of a GTF plus a shuffled null.

    Steps:
      1. Convert the GTF transcripts to BED with ``write_bed`` and write one
         BED interval per locus; the locus file is passed on to the shuffle
         workers.
      2. Pad every BED interval by ``--flank`` bases, clipped to the
         chromosome bounds, and sort the result by chromosome and start.
      3. Intersect the sorted file with the GWAS SNPs, the random SNPs and
         the SNP universe (``bedtools intersect``, with ``-split`` when
         ``--exon`` is given) and count the distinct rsIDs.
      4. Run ``--shuffs`` tasks through ``shuffle_imap`` on a process pool and
         print every result line to stdout.

    Temporary files are written to ``GWAS_TMPS``, which is removed at the end.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the assembly
        (the overlap fractions would be undefined).
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--rand_snp", dest='rand_snp',
                        default=rand_snps_file,
                        help='Bed file of random snps to use as negative control for analyses')
    parser.add_argument("--snps", dest='snps',
                        default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl',
                        default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom',
                        default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf',
                        help='GTF file used to generate shuffle')
    parser.add_argument("--gwas", dest='gwas',
                        default=gwas_bed_file,
                        help='GWAS bed file file used for intersection')
    # type=int lets argparse reject non-numeric values with a usage error
    parser.add_argument("--shuffs", dest='shuffs', type=int,
                        default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', type=int,
                        default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', type=int,
                        default=0,
                        help='number of flanking bases to add to bed files')
    parser.add_argument("--exon", dest="exon",
                        action="store_true", default=False,
                        help="Perform analysis looking only at exonic overlap")
    args = parser.parse_args()

    logging.info('Output is printed to stdout')
    if args.exon:
        logging.info('Looking at exonic overlap only')
    # check command line parameters; validate the files that are actually
    # used (--chrom, --gtf), not only the built-in defaults
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))
    # --gtf has no default, so it has to be given explicitly
    if args.gtf is None or not os.path.exists(args.gtf):
        parser.error('GTF file %s not found (use --gtf)' % (args.gtf))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    # keep the sorted file inside the temporary directory so that it is
    # removed together with the other intermediate files
    assembly_flank_sorted = os.path.join(prefix, 'flank.sorted.bed')
    assembly_bed = os.path.join(prefix, 'assembly.bed')

    def count_overlapping_snps(snp_file):
        """Intersect snp_file with the sorted assembly; count distinct rsIDs."""
        cmd = ['bedtools', 'intersect',
               '-a', snp_file,
               '-b', assembly_flank_sorted,
               '-sorted',
               '-wa',
               '-wb']
        if args.exon:
            cmd.append('-split')
        with open(intersect_file, 'w') as fileh:
            retcode = subprocess.call(cmd, stdout=fileh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d', retcode)
        with open(intersect_file) as fileh:
            return len(set(line.strip().split('\t')[RSIDCOL] for line in fileh))

    # read chrom file to make sure flanks added do not enter chrom ends.
    # The values are kept as read (strings) because the mapping is handed to
    # write_bed unchanged; they are converted where they are compared below.
    chrom_length = {}
    with open(args.chrom) as fh:
        for line in fh:
            if not line.strip():
                continue
            fields = line.strip().split('\t')
            chrom_length[fields[0]] = fields[1]

    # convert GTF file to BED for initial intersections and get locus intervals
    logging.info('Parsing GTF: converting to BED and obtaining locus intervals')
    with open(args.gtf) as gtf_fh, \
            open(assembly_bed, 'w') as f2, \
            open(locus_intervals_file, 'w') as f:
        for j, locus_transcripts in enumerate(parse_gtf(gtf_fh)):
            if (j % 2500) == 0:
                logging.debug('Finished %d/%d loci', j, 35000)
            for t in locus_transcripts:
                name = t.attrs['transcript_id']
                # NOTE(review): write_bed already receives args.flank and the
                # flank is added again below -- confirm this is intended.
                fields = write_bed(t.chrom, name, t.strand, 1000, t.exons,
                                   args.flank, chrom_length)
                f2.write('\t'.join(fields) + '\n')
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            f.write('\t'.join(map(str, [locus_chrom, locus_start,
                                        locus_end, j])) + '\n')

    # apply flank to the bed file
    with open(assembly_bed) as bed_in, open(assembly_flank, 'w') as bed_out:
        for line in bed_in:
            fields = line.strip().split('\t')
            # int() is required: comparing the int end with the string read
            # from the chrom sizes file would never clip at the chromosome end
            chr_len = int(chrom_length[fields[0]])
            fields[1] = max(0, int(fields[1]) - args.flank)
            fields[2] = min(chr_len, int(fields[2]) + args.flank)
            bed_out.write('\t'.join(map(str, fields)) + '\n')

    # "bedtools intersect -sorted" needs input sorted by chromosome and start
    args_sort = ['sort',
                 '-k1,1',
                 '-k2,2n',
                 assembly_flank]
    with open(assembly_flank_sorted, 'w') as fileh:
        subprocess.call(args_sort, stdout=fileh)

    # GWAS snps
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)

    # Random snps
    logging.info('Intersecting assembly with random snps')
    rand_overlap = count_overlapping_snps(args.rand_snp)

    # snp universe
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps)

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'overlap fractions are undefined')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    frac_rand = float(rand_overlap) / snp_overlap
    if args.exon:
        logging.info('%d GWAS snps overlap compendia exons', gwas_overlap)
        logging.info('%d random snps overlap compendia exons', rand_overlap)
        logging.info('%d total snps overlap compendia exons', snp_overlap)
    else:
        logging.info('%d GWAS snps overlap compendia genes', gwas_overlap)
        logging.info('%d random snps overlap compendia genes', rand_overlap)
        logging.info('%d total snps overlap compendia genes', snp_overlap)
    logging.info('Frac_gwas: %f', frac_real)
    logging.info('Frac_rand: %f', frac_rand)

    # loop the shuffle to generate a distribution of nulls for number of snps
    # hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = args.shuffs
    shuff_args = (args.snps,
                  args.gwas,
                  args.rand_snp,
                  args.excl,
                  locus_intervals_file,
                  args.chrom,
                  args.gtf,
                  frac_real,
                  frac_rand,
                  gwas_overlap,
                  rand_overlap,
                  snp_overlap,
                  NUM_SHUFFS,
                  prefix,
                  args.flank,
                  args.exon)
    header = [
              'gwas_shuff_overlap',
              'rand_snp_shuff_overlap',
              'all_snp_shuff_overlap',
              'frac_gwas_shuff',
              'frac_rand_shuff',
              'gwas_overlap',
              'rand_snp_overlap',
              'all_snp_overlap',
              'frac_gwas',
              'frac_rand',
              'OR_gwas',
              'OR_rand'
              ]

    print('\t'.join(header))
    logging.info("Beginning shuffles")
    # every task is the shuffle index followed by the shared arguments
    tasks = [(i,) + shuff_args for i in range(NUM_SHUFFS)]
    for line in pool.imap_unordered(shuffle_imap, tasks):
        print(line)
    pool.close()
    pool.join()

    shutil.rmtree(prefix)

    return 0
Exemplo n.º 6
0
def main():
    """Report the GWAS SNP overlap of an assembly for a single flank size.

    Pads every interval of the assembly BED file by ``--flank`` bases
    (clipped to the chromosome bounds), intersects the result with the GWAS
    SNPs and with the SNP universe (``bedtools intersect``), counts the
    distinct rsIDs and prints one tab-separated line to stdout:
    flank, GWAS overlap, SNP universe overlap, GWAS fraction.

    Temporary files are written to ``GWAS_TMPS``, which is removed at the end.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the assembly
        (the overlap fraction would be undefined).
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest='assembly_bed',
                        default=intergenic_assembly_bed,
                        help='Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest='snps',
                        default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl',
                        default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom',
                        default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf',
                        default=gtf_file,
                        help='GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas',
                        default=gwas_bed,
                        help='GWAS bed file file used for intersection')
    # type=int lets argparse reject non-numeric values with a usage error
    parser.add_argument("--flank", dest='flank', type=int,
                        default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()

    logging.info("Output is printed to stdout, to save use '>' <filename>")

    # check command line parameters; validate the file that is actually used
    # (--chrom), not only the built-in default
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    gene_intervals_file = os.path.join(prefix, 'gene_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')

    def count_overlapping_snps(snp_file, extra_args=()):
        """Intersect snp_file with the padded assembly; count distinct rsIDs."""
        cmd = ['bedtools', 'intersect',
               '-a', snp_file,
               '-b', assembly_flank,
               '-wa',
               '-wb']
        cmd.extend(extra_args)
        with open(intersect_file, 'w') as fileh:
            retcode = subprocess.call(cmd, stdout=fileh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d', retcode)
        with open(intersect_file) as fileh:
            return len(set(line.strip().split('\t')[RSIDCOL] for line in fileh))

    logging.info('Parsing GTF file')
    with open(args.gtf) as gtf_fh, open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(gtf_fh):
            for g in get_gene_intervals(locus_transcripts):
                f.write('\t'.join(map(str, [g.chrom, g.start, g.end, g.gene_id])) + '\n')

    # read chrom file to make sure flanks added do not enter chrom ends;
    # lengths must be integers, otherwise min() below compares int with str
    # and the clip at the chromosome end never takes effect
    chrom_length = {}
    with open(args.chrom) as fh:
        for line in fh:
            if not line.strip():
                continue
            fields = line.strip().split('\t')
            chrom_length[fields[0]] = int(fields[1])

    # apply flank to the bed file
    with open(args.assembly_bed) as bed_in, open(assembly_flank, 'w') as bed_out:
        for line in bed_in:
            fields = line.strip().split('\t')
            chrom = fields[0]
            fields[1] = max(0, int(fields[1]) - args.flank)
            fields[2] = min(chrom_length[chrom], int(fields[2]) + args.flank)
            bed_out.write('\t'.join(map(str, fields)) + '\n')

    # GWAS snps: number of GWAS snps overlapping the real assembly
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)

    # snp universe: number of all snps overlapping the real assembly
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps, ['-sorted'])

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'overlap fraction is undefined')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes', gwas_overlap)
    logging.info('%d snps (from "snp universe") overlap compendia genes', snp_overlap)
    logging.info('Frac: %f', frac_real)

    print('\t'.join(map(str, [args.flank, gwas_overlap, snp_overlap, frac_real])))

    shutil.rmtree(prefix)

    return 0
Exemplo n.º 7
0
def main():
    """Shuffle the gene positions of a GTF file with ``bedtools shuffle``.

    Positional arguments: GTF file, exclusion file, chromosome sizes file and
    an output prefix.  The function

      1. writes one BED interval per gene to ``<prefix>.gene_intervals.bed``,
      2. shuffles those intervals into ``<prefix>.gene_intervals.shuffle.bed``,
      3. moves every transcript (and its exons) by the offset of its gene and
         prints the result of ``write_bed`` for it to stdout,
      4. calls ``sort_gtf`` on ``<prefix>.shuffle.gtf``.

    Returns:
        0 on success, 1 when ``bedtools shuffle`` exits with a non-zero status.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    if not os.path.exists(gtf_file):
        parser.error('GTF file %s not found' % (gtf_file))
    if not os.path.exists(excl_file):
        parser.error('exclusion file %s not found' % (excl_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gtf_file) as gtf_fh, open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(gtf_fh):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts",
                locus_chrom, locus_start, locus_end, len(locus_transcripts))
            for g in get_gene_intervals(locus_transcripts):
                f.write('\t'.join(
                    map(str, [g.chrom, g.start, g.end, g.gene_id])) + '\n')
    # randomly shuffle genes
    logging.info("Shuffling genes")
    shuffle_cmd = [
        'bedtools', 'shuffle', '-excl', excl_file, '-i', gene_intervals_file,
        '-g', chrom_sizes_file
    ]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        retcode = subprocess.call(shuffle_cmd, stdout=fileh)
    if retcode != 0:
        # without this check a failed run only shows up as every gene being
        # reported as "could not be shuffled"
        logging.error("bedtools shuffle exited with status %d", retcode)
        return 1
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            shuffle_gene_map[fields[3]] = (fields[0], int(fields[1]),
                                           int(fields[2]))
    # reposition transcripts
    logging.info("Repositioning transcripts")
    # NOTE(review): the shuffled GTF file is created here but nothing is
    # written to it; the repositioned transcripts go to stdout in the format
    # produced by write_bed.  Confirm that sorting an empty GTF is intended.
    with open(gtf_file) as gtf_fh, open(shuffled_gtf_file, 'w'):
        for locus_transcripts in parse_gtf(gtf_fh):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning(
                        'Gene %s [%s:%d-%d] could not be shuffled',
                        gene_id, orig_chrom, orig_start, orig_end)
                    continue
                new_chrom, new_start, _ = shuffle_gene_map[gene_id]
                # reposition transcript: keep the offset relative to the
                # start of its gene
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)

                fields = write_bed(t.chrom, t.attrs['transcript_id'], t.strand,
                                   1000, t.exons)
                print('\t'.join(fields))
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
    return 0