# Build the output file name. When a prefix is supplied it is joined to the
# base name with a hyphen; otherwise the base name is used as-is.
prefix = (args.prefix + "-") if args.prefix else ""
outfilename = prefix + "cnvs.csv"

with open(outfilename, "w") as outfile:
    # Report the run header and the I/O settings through the MQ helpers.
    MQ.runTime("# CNV VCF parsing", outfile)
    MQ.PWS("# VCF File: " + args.input, outfile)
    MQ.PWS("# Samples File: " + args.samples, outfile)
    # args.prefix is optional (see the falsy check above); fall back to an
    # empty string so a missing prefix cannot break the concatenation.
    MQ.PWS("# Output prefix: " + (args.prefix or ""), outfile)
    MQ.PWS("# Output file: " + outfilename, outfile)
    MQ.PWS("# ----------", outfile)

    # Read the sample info.
    MQ.PWS("# " + MQ.getDateTime() + " Reading sample info...", outfile)
    samples = cnvlib.readSamples(args.samples)
    MQ.PWS("# " + MQ.getDateTime() + " Samples read: " + str(len(samples)),
           outfile)

    # Read the VCF file. readVCF returns four values; only the variants and
    # the headers are used in the visible part of this script.
    MQ.PWS("# " + MQ.getDateTime() + " Reading VCF...", outfile)
    vcf, vcf_headers, vcf_format, human_flag = cnvlib.readVCF(args.input)
    MQ.PWS("# " + MQ.getDateTime() + " Variants read: " + str(len(vcf)),
           outfile)

    # Normalize the VCF headers: drop every "NHP-" substring, then rename the
    # header "39239A" to "39239" if it is present.
    MQ.PWS("# " + MQ.getDateTime() + " Fixing VCF headers...", outfile)
    vcf_headers = [h.replace("NHP-", "") for h in vcf_headers]
    if "39239A" in vcf_headers:
        vcf_headers[vcf_headers.index("39239A")] = "39239"
# I/O options and info: report every annotation input file to the log.
MQ.PWS("# Gene file: " + gene_file, logfile)
MQ.PWS("# 10kb up file: " + gene_up_file, logfile)
MQ.PWS("# 10kb down file: " + gene_down_file, logfile)
MQ.PWS("# Transcript file: " + transcript_file, logfile)
MQ.PWS("# Exon file: " + exon_file, logfile)
MQ.PWS("# ----------------", logfile)

# Categories for features, CNVs, and overlaps.
feature_types = [
    "gene",
    "gene-10kb-up",
    "gene-10kb-down",
    "transcript",
    "exon",
]
cnv_types = ["del", "dup"]
overlap_types = ["full", "partial"]

# Read the CNVs. The CNV identifier is the 4th tab-separated column of each
# line, and its 4th colon-separated field is parsed as the CNV length. Only
# CNVs strictly shorter than max_cnv_len are kept.
MQ.PWS("# " + MQ.getDateTime() + " Reading CNVs...", logfile)
cnvs = []
# Use a context manager so the CNV file handle is closed once reading is done
# (the bare open() in the loop header left it open).
with open(cnvs_file) as cnv_stream:
    for line in cnv_stream:
        cnv = line.strip().split("\t")[3]
        cnv_len = int(cnv.split(":")[3])
        if cnv_len < max_cnv_len:
            cnvs.append(cnv)
num_cnvs = len(cnvs)
MQ.PWS("# CNVs read: " + str(num_cnvs), logfile)

# Count overlaps between the retained CNVs and the gene features.
MQ.PWS("\n# " + MQ.getDateTime() + " Counting gene overlaps...", logfile)
countOverlaps(cnvs, gene_file, "GENES")

MQ.PWS(
    "\n# " + MQ.getDateTime() + " Counting 10kb upstream gene overlaps...",
    logfile)
# Report I/O information. Labels are padded with MQ.spacedOut to io_pad.
MQ.runTime("# Fisher's test for GO enrichment", outfile)
MQ.PWS(MQ.spacedOut("# Query file:", io_pad) + queryfile, outfile)
MQ.PWS(MQ.spacedOut("# Background file:", io_pad) + bgfile, outfile)
MQ.PWS(
    MQ.spacedOut("# Alpha (p-value threshold):", io_pad) + str(alpha),
    outfile)
MQ.PWS(
    MQ.spacedOut("# Multiple test correction method:", io_pad) +
    correction_str, outfile)
if correction == "None":
    MQ.PWS("# --> WARNING: Not correcting for multiple tests!", outfile)
MQ.PWS(MQ.spacedOut("# Output file:", io_pad) + outfilename, outfile)
MQ.PWS("# ----------", outfile)

# Get the count of query GO terms and the unique list of features.
# Every non-comment line adds one to query_go_count; the first tab-separated
# column is the feature ID, recorded once in query_genes in first-seen order.
MQ.PWS("# " + MQ.getDateTime() + " Counting total query GO terms...", outfile)
query_genes = []
query_go_count = 0
# Companion set for O(1) duplicate checks; query_genes stays a list so its
# type and ordering are unchanged for later code.
seen_query_genes = set()
# The context manager closes the query file once it has been read.
with open(queryfile) as query_stream:
    for line in query_stream:
        if line.startswith("#"):
            continue
        line = line.strip().split("\t")
        gid = line[0]
        if gid not in seen_query_genes:
            seen_query_genes.add(gid)
            query_genes.append(gid)
        query_go_count += 1

MQ.PWS("# " + MQ.getDateTime() + " Counting total background GO terms...",
       outfile)
# The input files: a GO term database from Ensembl (with identical lines
# removed with bash's sort | uniq commands) and a file containing transcript
# IDs that overlap CNVs (from 06_gene_count_bed.py).
go_species_prefix = "go/" + species
gofile = go_species_prefix + "-go-terms-uniq.txt"
transcripts_file = go_species_prefix + "-noalu-transcripts-" + mode + ".txt"

# The output files: one table for the query set and one for the background.
cnv_mode_prefix = go_species_prefix + "-cnv-" + mode
queryoutfile = cnv_mode_prefix + "-go-query.tab"
bgoutfile = cnv_mode_prefix + "-go-bg.tab"

# Report the I/O settings, one MQ.PWS call per line, in a fixed order.
for header_line in (
        "# Go file : " + gofile,
        "# Transcripts file : " + transcripts_file,
        "# Mode : " + mode,
        "# Query out : " + queryoutfile,
        "# BG out : " + bgoutfile,
        "# ----------------",
):
    MQ.PWS(header_line)

MQ.PWS("# " + MQ.getDateTime() + " Getting annotation info...")
transcript_go, go_accs = {}, {}
first = True
for line in gzip.open(gofile):
    if first:
        # Skip the first line of the file (presumably a column header).
        first = False
        continue
    # gzip.open yields bytes here: decode, then split into tab-separated
    # fields.
    line = line.decode().strip().split("\t")
    # The 5th column is compared against "MT"; such records are ignored.
    chrome = line[4]
    if chrome == "MT":
        continue
dumpfile = "../cafe-data/dump.out.mlemur-blast.txt.I30" overlap_file = "bed/" + species + "-cnvs-filtered-noalu.csv" gtf_file = "gtf/" + species + "-chromes.gtf" overlap_file = "bed/" + species + "-cnvs-to-transcripts.bed" outfilename = "../cafe-data/" + species + "-cafe-genes" + ".csv" # File names. MQ.PWS("# Log file: " + logfilename, logfile) MQ.PWS("# GTF file: " + gtf_file, logfile) MQ.PWS("# Overlap file:" + overlap_file, logfile) MQ.PWS("# Dump file: " + dumpfile, logfile) MQ.PWS("# Output file: " + outfilename, logfile) MQ.PWS("# ----------------", logfile) # I/O options and info. MQ.PWS("# " + MQ.getDateTime() + " Reading features from GTF...", logfile) ttp = defaultdict(str) overlaps = {} for line in open(gtf_file): if line[0] == "#": continue line = line.strip().split("\t") feature_type, chrome, start, end, feature_info = line[2], line[0], int( line[3]), int(line[4]), line[8] if feature_type == "CDS": tid = re.findall('ENS' + regstr + 'T[\d]+', feature_info)[0] pid = re.findall('ENS' + regstr + 'P[\d]+', feature_info)[0] if pid not in ttp[tid]: ttp[tid] = pid overlaps[tid] = {