def main(argv):
    """Print the per-sample data of every site that is not a first exon.

    Reads the JSON-lines file named by ``sys.argv[1]`` (one site per line)
    and, for each site whose ``exon_number`` is not 1, writes
    ``site['samples']`` to stdout as one JSON line. Progress messages go to
    stderr.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong.
    """
    # Parse args. Compare with "!=": "is not" tests object identity, which
    # only happens to work for small ints on CPython.
    if len(sys.argv) != 2:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <level1_tss_rna_chip.json>\n")
        sys.exit(2)

    # Set args
    data_fn = sys.argv[1]
    progress = FileProgress(data_fn, "Percent Complete: ")

    # Main loop
    with open(data_fn) as json_file:
        for line in json_file:
            site = json.loads(line)

            # Selection criterion: keep everything except first exons.
            # (A filter on transcripts tagged "CCDS" used to be sketched
            # here as disabled code.)
            if site['exon_number'] != 1:
                print(json.dumps(site['samples']))

            progress.update()

    sys.stderr.write("\nAll done!\n")
def main(argv):
    """Attach per-sample gene RPKM values to every TSS site record.

    Command-line arguments (read from ``sys.argv``):
        1. JSON-lines file of sites; each site has ``gene_id`` and a
           ``samples`` mapping.
        2. Tab-separated gene RPKM table whose first row is a header
           (sample names from column 1 on) and whose other rows are
           ``gene<TAB>value...``.

    For every site, each sample named in the header gets a ``gene_rpkm``
    entry (the sample is created if the site does not have it yet) and the
    site is written to stdout as one JSON line.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong. A site whose
    gene is absent from the RPKM table raises ``KeyError``.
    """
    # parse args ("!=" instead of the identity test "is not")
    if len(sys.argv) != 3:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <all_tss_rna.json> <57epigenomes.RPKM.all> \n")
        sys.exit(2)
    json_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    progress1 = FileProgress(rna_fn, "Part 1/2: ")
    progress2 = FileProgress(json_fn, "Part 2/2: ")

    # Read gene RPKM into memory
    gene_dict = {}
    header = []
    with open(rna_fn) as csv_file:
        for line in csv_file:
            row = line.strip('\n').split("\t")
            # The header row is recognised by the progress counter still
            # being 0 -- presumably nothing has been reported yet at that
            # point; verify against FileProgress.
            if progress1.count == 0:
                header = row[1:]
            else:
                gene_dict[row[0]] = row[1:]
            progress1.update()
    sys.stderr.write("\nFirst part done.\n")

    # Now read through json file and append
    with open(json_fn) as json_file:
        for line in json_file:
            site = json.loads(line,
                              object_pairs_hook=collections.OrderedDict)
            for column, sample_name in enumerate(header):
                value = gene_dict[site['gene_id']][column]
                site['samples'].setdefault(sample_name, {})['gene_rpkm'] = value
            print(json.dumps(site))
            progress2.update()

    sys.stderr.write("\nAll done!\n")
'transcript_type']: results['transcript_type'][transcript['attribute'] ['transcript_type']] += 1 else: results['transcript_type'][transcript['attribute'] ['transcript_type']] = 1 if transcript['source'] in results['source']: results['source'][transcript['source']] += 1 else: results['source'][transcript['source']] = 1 search = [ 'exon_number', 'exon_total', 'splice_count', 'splice_before', 'coverage_count', 'tss_mapped', 'tss_total', 'transcript_total' ] for item in search: if str(site[item]) in results[item]: results[item][str(site[item])] += 1 else: results[item][str(site[item])] = 1 if not site['gene_id'] in gene_dict: gene_dict[site['gene_id']] = True progress.update() counter += 1 print json.dumps(results, indent=2) print "Number of unique genes: " + str(len(gene_dict)) print "Number of tss sites: " + str(counter) sys.stderr.write("\nAll Done!\n")
def main(argv):
    """Emit one JSON record per RNA-seq exon that holds a transcript start site.

    Command-line arguments (read from ``sys.argv``):
        1. all_tss.json -- JSON-lines gene file; each gene has ``gene_id``
           and a ``transcripts`` mapping whose values carry ``tss``,
           ``start``, ``end``, ``exons`` and ``introns``.
        2. exon RPKM table -- tab-separated; column 0 is parsed as
           ``seqname:start-end<strand``, column 1 is the gene id and the
           remaining columns are per-sample values. The first row is the
           header naming the samples.
        3. chromosome_order.json -- maps a seqname to its sort rank.
        4. granularity -- integer slack applied on both sides of an exon
           when deciding whether a transcript's TSS falls on it.

    For every gene the exons are numbered in transcription order, and every
    exon that at least one transcript start maps to is printed to stdout as
    a JSON line with splice/coverage counts and per-sample ``rpkm``,
    ``max_rpkm`` and ``delta_rpkm``. ``delta_rpkm`` is the exon's RPKM minus
    the RPKM of the previous (upstream) accepted site of the same gene, or
    the RPKM itself when there is no upstream site.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong.
    """
    # parse args ("!=" instead of the identity test "is not")
    if len(sys.argv) != 5:
        sys.stderr.write(
            "invalid usage: python " + sys.argv[0] +
            " <all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json> <granularity>\n")
        sys.exit(2)
    tss_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    chromosomes_fn = sys.argv[3]
    granularity = int(sys.argv[4])
    progress1 = FileProgress(rna_fn, "Part 1/2: ")

    # load expected chromosome order from json into a dictionary
    with open(chromosomes_fn) as chromosomes_file:
        chromosomes = json.load(chromosomes_file)

    # Load JSON GTF file into memory
    gene_dict = {}
    with open(tss_fn, 'rb') as json_file:
        for line in json_file:
            gene = json.loads(line)
            gene_dict[gene['gene_id']] = gene
    sys.stderr.write("Loaded " + str(len(gene_dict)) +
                     " genes into memory.\n")

    # Read RNA-seq data into memory, grouped by gene.
    gene_rna_dict = {}
    sample_names = {}  # sample name -> column index within row[2:]
    with open(rna_fn) as rna_f:
        for line in rna_f:
            # The header row is recognised by the progress counter still
            # being 0 -- verify against FileProgress.
            if progress1.count == 0:
                row = line.strip('\n').split('\t')
                for i in range(2, len(row)):
                    sample_names[row[i]] = i - 2
            else:
                # Data rows are stripped of trailing tabs as well: there is
                # some weird formatting in the RPKM file.
                row = line.strip('\t\n').split('\t')
                gene = row[1]
                if gene in gene_dict:
                    start = int(row[0].split(':')[1].split('-')[0])
                    end = int(row[0].split('-')[1].split('<')[0])
                    strand = int(row[0].split('<')[1])
                    if gene not in gene_rna_dict:
                        gene_rna_dict[gene] = []
                    gene_rna_dict[gene].append({
                        'gene': gene,
                        'seqname': row[0].split(':')[0],
                        'start': start,
                        'end': end,
                        'strand': ('+' if strand == 1 else '-'),
                        'samples': row[2:],
                        'tss': (start if strand == 1 else end),
                    })
                    # NOTE(review): placement of this check (per accepted
                    # row) was inferred from the statement order -- confirm.
                    assert len(sample_names) == len(row[2:])
            progress1.update()

    # Main loop of genes
    progress2 = FileProgress(None, "Part 2/2: ", len(gene_rna_dict))
    sys.stderr.write("\nLoaded " + str(len(gene_rna_dict)) +
                     " mRNA exons into memory.\n")
    for genes in sorted(gene_rna_dict.values(),
                        key=lambda k: (chromosomes[k[0]['seqname']],
                                       k[0]['tss'])):
        gene = genes[0]['gene']
        if gene in gene_dict:
            # Number the exons in transcription order: ascending start on
            # the plus strand, descending on the minus strand.
            genes.sort(key=lambda x: x['start'])
            if genes[0]['strand'] == '+':
                transcription_order = genes
            else:
                transcription_order = list(reversed(genes))
            for number, numbered_exon in enumerate(transcription_order, 1):
                numbered_exon['exon_number'] = number

            # Find the exon whose samples contain the gene's highest value;
            # its sample row is reported as max_rpkm for every site.
            max_exon = None
            max_rpkm = 0
            for exon in genes:
                for sample_rpkm in exon['samples']:
                    if float(sample_rpkm) >= max_rpkm:
                        max_exon = exon['samples']
                        max_rpkm = float(sample_rpkm)
            assert max_exon is not None

            # Iterate through exons within this gene
            printlist = []
            for exon in genes:
                samples = exon['samples']

                # Assign all transcripts that map to this exon.
                # NOTE(review): the nesting of the checks below (per
                # transcript vs. per intron) was inferred from the statement
                # order -- confirm against the intended layout.
                exon_transcripts = []
                splice_count = 0
                splice_before = 0
                coverage_count = 0
                for transcript in gene_dict[gene]['transcripts'].values():
                    if (transcript['tss'] > exon['start'] - granularity and
                            transcript['tss'] < exon['end'] + granularity):
                        exon_transcripts.append(transcript)

                    # NOTE(review): these comparisons assume each intron is
                    # a (low, high) pair -- verify for minus-strand
                    # transcripts.
                    for intron in transcript['introns']:
                        if exon['start'] > intron[1] or exon['end'] < intron[0]:
                            # not spliced out
                            pass
                        else:
                            splice_count += 1
                        if exon['strand'] == '+':
                            if intron[1] < exon['start']:
                                splice_before += 1
                        else:
                            if intron[0] > exon['end']:
                                splice_before += 1

                    if exon['strand'] == '+':
                        if transcript['end'] < exon['start']:
                            splice_before += 1
                    else:
                        # NOTE(review): both conditions can only hold when
                        # the transcript's start exceeds its end; kept as
                        # found -- confirm the intended test.
                        if transcript['end'] < exon['start']:
                            if transcript['start'] > exon['end']:
                                splice_before += 1

                    for transcript_exon in transcript['exons']:
                        if (exon['start'] > transcript_exon[1] or
                                exon['end'] < transcript_exon[0]):
                            # not covered by exon
                            pass
                        else:
                            coverage_count += 1

                # If a transcript mapped to one of the exons
                if len(exon_transcripts) > 0:
                    # Save this transcript
                    d = collections.OrderedDict()
                    d['seqname'] = exon['seqname']
                    d['location'] = exon['tss']
                    d['strand'] = (1 if exon['strand'] == '+' else -1)
                    d['gene_id'] = gene
                    d['exon_number'] = exon['exon_number']
                    d['exon_total'] = len(genes)
                    d['splice_count'] = splice_count
                    d['splice_before'] = splice_before
                    d['coverage_count'] = coverage_count
                    d['tss_mapped'] = len(exon_transcripts)
                    d['tss_total'] = 0  # filled in once all sites are known
                    d['transcript_total'] = len(gene_dict[gene]['transcripts'])
                    # Deep copy: keys are popped from these below and the
                    # originals are still needed for the next exons.
                    d['transcripts'] = copy.deepcopy(exon_transcripts)
                    d['samples'] = {}
                    for sample_name, column in sample_names.items():
                        d['samples'][sample_name] = {
                            'rpkm': float(samples[column]),
                            'max_rpkm': float(max_exon[column]),
                        }
                    printlist.append(d)

            # Iterate through the accepted sites again and compute the
            # delta_rpkm to the previous (upstream) transcript start site.
            # printlist is in ascending genomic order, so "upstream" is the
            # preceding entry on the plus strand and the following entry on
            # the minus strand.
            last_index = len(printlist) - 1
            for k, d in enumerate(printlist):
                d['tss_total'] = len(printlist)

                # d['strand'] holds 1 / -1 (not '+' / '-').
                upstream_index = k - 1 if d['strand'] == 1 else k + 1
                if 0 <= upstream_index <= last_index:
                    upstream = printlist[upstream_index]
                else:
                    # First site in transcription order: nothing upstream.
                    upstream = None

                for sample_name in sample_names:
                    sample = d['samples'][sample_name]
                    if upstream is None:
                        sample['delta_rpkm'] = sample['rpkm']
                    else:
                        sample['delta_rpkm'] = (
                            sample['rpkm'] -
                            upstream['samples'][sample_name]['rpkm'])

                # Delete redundant information to reduce size of the file
                for this_transcript in d['transcripts']:
                    this_transcript.pop("exons")
                    this_transcript.pop("introns")
                    this_transcript.pop("length")
                    this_transcript.pop("score")
                    this_transcript.pop("frame")
                    this_transcript.pop("feature")
                    this_transcript.pop("strand")
                    this_transcript.pop("seqname")
                    this_transcript.pop("start")
                    this_transcript.pop("end")

                print(json.dumps(d))
        progress2.update()

    sys.stderr.write("\nAll done!\n")
# Load metadata into memory. It's going to be big. chrom = {} with open(meta, 'rb') as json_file: for line in json_file: site = json.loads(line) seqname = site.pop('seqname') tss = site.pop('tss') site.pop('strand') site.pop('gene_id') site.pop('transcripts') if seqname in chrom: chrom[seqname][tss] = site else: chrom[seqname] = { tss : site} progress1.update() # Now go through data file and update sys.stderr.write("\nNow working through data file!\n") with open(data) as json_file: for line in json_file: site = json.loads(line) meta = chrom[site['seqname']][site['tss']] # Site-level features features = ['exon_number', 'exon_total', 'splice_count', 'coverage_count', 'tss_mapped', 'tss_total', 'transcript_total'] for feature in features: site[feature] = meta[feature] # Sample-level features
def main(argv):
    """Train random-forest classifiers on histone-mark signal around TSSs.

    Command-line arguments (read from ``sys.argv``):
        1. JSON-lines file; each line is a mapping whose values are
           per-sample dicts holding one signal list per mark plus
           ``gene_rpkm`` and ``delta_rpkm``.
        2. left and 3. right -- integer slice bounds of the signal window
           that is summed (via ``listsum``) into one feature per mark.
        4. cutoff (optional) -- integer label threshold; defaults to the
           median label.

    Samples lacking any of the core marks are discarded. The label is
    ``delta_rpkm`` clamped at 0, and the binary class is 1 when the label
    is below the cutoff. The datapoints are shuffled, then a
    ``RandomForestClassifier`` is fitted for each combination of
    ``n_estimators`` in (100, 150) and ``max_depth`` in (4, 6); feature
    importances and the mean cross-validation score are printed to stdout.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong.
    """
    if len(sys.argv) not in (4, 5):
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <all_level1.json> <left> <right> [cutoff]\n")
        sys.exit(2)

    keys = [  # core marks
        "H3K4me1",
        "H3K4me3",
        "H3K27me3",
        "H3K36me3",
        "H3K9me3",
        "H2A.Z",
        "H3K4me2",
        "H3K27ac",
        "H4K20me1",
        "H3K9ac",
        "DNase",
        "H3K79me2",
    ]
    ranges = (int(sys.argv[2]), int(sys.argv[3]))

    # Load file
    datapoint_list = []
    data_fn = sys.argv[1]
    progress = FileProgress(data_fn, "Reading file: ")
    with open(data_fn) as json_file:
        for line in json_file:
            tss_dict = json.loads(line)
            for sample in tss_dict.values():
                # A sample is usable only if every core mark is present.
                if any(mark not in sample for mark in keys):
                    continue

                # Compute the feature vector: each mark's signal list is
                # replaced by the sum over the requested window.
                for mark in keys:
                    sample[mark] = listsum(
                        sample[mark][ranges[0]:ranges[1]])

                # Compute the label
                sample['gene_rpkm'] = float(sample['gene_rpkm'])
                if sample['delta_rpkm'] < 0:
                    sample['delta_rpkm'] = 0
                sample['label'] = sample['delta_rpkm']
                datapoint_list.append(sample)
            progress.update()
    sys.stderr.write("\nFinished reading file\n")
    print("Label Method: rpkm")

    # For regression, create the feature matrix and the label vector
    X = [[datapoint[mark] for mark in keys] for datapoint in datapoint_list]
    Y_R = [datapoint['label'] for datapoint in datapoint_list]

    # Classify feature labels into binary space
    if len(sys.argv) == 5:
        label_cutoff = int(sys.argv[4])
    else:
        label_cutoff = np.median(Y_R)
    print("marks: " + ", ".join(keys))
    print("window: " + str(ranges[0]) + ", " + str(ranges[1]))
    print("label cutoff: " + str(label_cutoff))
    Y_C = [int(float(datapoint['label']) < label_cutoff)
           for datapoint in datapoint_list]
    print("mean label: " + str(np.mean(Y_R)))
    print("median label: " + str(np.median(Y_R)))
    print("number of datapoints: " + str(len(Y_C)))

    # Permutate the datapoints. All three vectors are reordered together so
    # that row i of X, Y_C and Y_R keeps describing the same datapoint.
    perms = np.random.permutation(len(X))
    X = [X[j] for j in perms]
    Y_C = [Y_C[j] for j in perms]
    Y_R = [Y_R[j] for j in perms]

    # Regression!
    print("len Y_C: " + str(len(Y_C)))
    print("len X: " + str(len(X)))
    print("Starting Random Forests:")
    for n_estimators in [100, 150]:
        for depth in [4, 6]:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_depth=depth,
                                         min_samples_split=10,
                                         random_state=0)
            clf.fit(X, Y_C)
            print("n_estimators, depth: " + str(n_estimators) + ", " +
                  str(depth))
            feature_importances = clf.feature_importances_
            print("feature_importances: ")
            for mark, importance in zip(keys, feature_importances):
                print("\t" + mark + ":\t" + str(importance))
            scores = cross_val_score(clf, X, Y_C)
            print("RandomForest mean cross validation score: " +
                  str(scores.mean()))
            print("#" * 75 + "\n")
def main(argv):
    """Print RNA-seq exons that carry a transcript start site, tagged by type.

    Command-line arguments (read from ``sys.argv``):
        1. all_tss.json -- JSON-lines gene file; each gene has ``gene_id``,
           ``strand`` and a ``transcripts`` mapping whose values carry
           ``tss`` and ``introns``.
        2. exon RPKM table -- tab-separated with a header row; column 0 is
           parsed as ``seqname:start-end<strand`` and column 1 is the gene
           id.
        3. chromosome_order.json -- maps a seqname to its sort rank.
        4. granularity -- integer slack applied on both sides of an exon
           when deciding whether a transcript's TSS falls on it.

    The RPKM table is sorted by gene (via ``unix_sort``) so each gene's
    exons are contiguous. The first exon in transcription order of each gene
    is tagged ``leading`` and every other exon ``cassette``. The rows are
    then ordered by chromosome rank and TSS position, and exons that at
    least one transcript start maps to are printed to stdout as JSON lines.

    NOTE(review): a ``break`` follows the first printed record, so at most
    one record is emitted. This looks like a debugging leftover; it is kept
    as found -- confirm before removing.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong.
    """
    # parse args ("!=" instead of the identity test "is not")
    if len(sys.argv) != 5:
        sys.stderr.write(
            "invalid usage: python " + sys.argv[0] +
            " <all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json> <granularity>\n"
        )
        sys.exit(2)
    tss_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    chromosomes_fn = sys.argv[3]
    granularity = int(sys.argv[4])
    progress1 = FileProgress(rna_fn, "Part 1/2: ")
    progress2 = FileProgress(rna_fn, "Part 2/2: ")

    # load expected chromosome order from json into a dictionary
    with open(chromosomes_fn) as chromosomes_file:
        chromosomes = json.load(chromosomes_file)

    # Sort RNA file by gene id, so they are confirmed to be in order
    rna_f = unix_sort(rna_fn, "-k2,2 -k1,1", header=True)

    # Load JSON GTF file into memory
    gene_dict = {}
    with open(tss_fn, 'rb') as json_file:
        for line in json_file:
            gene = json.loads(line)
            gene_dict[gene['gene_id']] = gene
    sys.stderr.write("Loaded " + str(len(gene_dict)) +
                     " genes into memory.\n")

    # Read RNA-seq data into memory, inserting the exon type ('leading' or
    # 'cassette') as a new column 2 of every kept row.
    rna_data = []
    samples = []
    previous_gene = None
    rna_file = csv.reader(rna_f, delimiter='\t')
    for row in rna_file:
        # The header row is recognised by the progress counter still being
        # 0 -- verify against FileProgress.
        if progress1.count == 0:
            samples = row
        else:
            gene = row[1]
            if gene in gene_dict:
                if previous_gene != gene:
                    if previous_gene is not None:
                        # The previous exon was the last exon of the
                        # previous gene; on the minus strand that is the
                        # first one in transcription order.
                        if gene_dict[previous_gene]['strand'] == '-':
                            rna_data[-1][2] = 'leading'
                    # The current exon is the first exon of this gene
                    if gene_dict[gene]['strand'] == '+':
                        row.insert(2, 'leading')
                    else:
                        row.insert(2, 'cassette')
                else:
                    row.insert(2, 'cassette')
                rna_data.append(row)
                previous_gene = gene
        progress1.update()

    # The loop above only retags a minus-strand gene's last exon when the
    # next gene begins, so the final gene of the file needs the same step.
    if (previous_gene is not None and
            gene_dict[previous_gene]['strand'] == '-'):
        rna_data[-1][2] = 'leading'

    def tss_sort_key(row):
        """Order rows by chromosome rank, then by TSS coordinate."""
        location = row[0]
        if int(location.split('<')[1]) == 1:
            position = int(location.split(':')[1].split('-')[0])  # start
        else:
            position = int(location.split('-')[1].split('<')[0])  # end
        return (chromosomes[location.split(':')[0]], position)

    # Now sort the RNA-seq data that is in memory
    sys.stderr.write("\nBeginning to sort loaded RNA-seq file\n")
    rna_data.sort(key=tss_sort_key)
    sys.stderr.write("Finished sorting loaded RNA-seq file\n")

    # Now print the exons that have transcript start sites
    for row in rna_data:
        gene = row[1]
        if gene in gene_dict:
            seqname = row[0].split(':')[0]
            start = int(row[0].split(':')[1].split('-')[0])
            end = int(row[0].split('-')[1].split('<')[0])
            strand = int(row[0].split('<')[1])
            tss = (start if strand == 1 else end)

            # Assign all transcripts that map to this exon.
            # NOTE(review): the intron loop is taken to run for every
            # transcript of the gene, not only the mapped ones -- the
            # nesting was inferred from the statement order; confirm.
            exon_transcripts = []
            splice_count = 0
            for transcript in gene_dict[gene]['transcripts'].values():
                if (transcript['tss'] > start - granularity and
                        transcript['tss'] < end + granularity):
                    exon_transcripts.append(transcript)
                for intron in transcript['introns']:
                    if start > intron[1] or end < intron[0]:
                        # not spliced out
                        pass
                    else:
                        splice_count += 1

            # If a transcript mapped to one of the exons
            if len(exon_transcripts) > 0:
                # Print this transcript
                d = collections.OrderedDict()
                d['seqname'] = seqname
                d['tss'] = tss
                d['strand'] = ('+' if strand == 1 else '-')
                d['gene_id'] = gene
                d['tss_type'] = row[2]
                d['splice_count'] = splice_count
                d['transcripts'] = exon_transcripts
                d['samples'] = {}
                # NOTE(review): data rows gained a column at index 2 but
                # the header did not, so samples[i] and row[i] may be offset
                # by one column -- verify against the RPKM file layout.
                for i in range(3, len(samples)):
                    sample_name = samples[i]
                    if sample_name not in d['samples']:
                        d['samples'][sample_name] = {}
                    d['samples'][sample_name]['rpkm'] = row[i]
                print(json.dumps(d))
                # NOTE(review): stops after the first printed record; see
                # the docstring.
                break
        progress2.update()

    sys.stderr.write("\nAll done!\n")
def main(argv):
    """Annotate every transcript of a JSON-lines gene file with TSS and introns.

    Reads the file named by ``sys.argv[1]`` (one gene per line). For each
    transcript of each gene:

    * ``exons`` is replaced by a list of ``(start, end)`` pairs in
      transcription order (ascending start on ``+``, descending end
      otherwise);
    * ``tss`` is set to ``start`` on the ``+`` strand and ``end`` otherwise;
    * ``length`` is set to ``end - start``;
    * ``introns`` lists the gap between each pair of consecutive exons. On
      the ``+`` strand a gap is ``(previous end + 1, next start - 1)``;
      otherwise it is ``(previous start - 1, next end + 1)``, i.e. the
      higher coordinate comes first.

    Each gene is then written to stdout as one JSON line with a fixed key
    order and a ``gene_id`` stripped of its version suffix.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong.
    """
    # "!=" instead of the identity test "is not"
    if len(sys.argv) != 2:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <genes.json>\n")
        sys.exit(2)

    # Initialize Variables and report progress in file
    genes_fn = sys.argv[1]
    progress = FileProgress(genes_fn, "Percent Complete: ")

    # Load GTF file in json format
    with open(genes_fn) as json_file:
        for line in json_file:
            gene = json.loads(line)
            for transcript in gene['transcripts'].values():
                on_plus_strand = transcript['strand'] == "+"

                exon_list = [(exon['start'], exon['end'])
                             for exon in transcript['exons']]
                # Pop before re-adding so the key lands where it did before
                # in insertion-ordered dicts.
                transcript.pop("exons", None)
                transcript['exons'] = exon_list

                if on_plus_strand:
                    transcript['tss'] = transcript['start']
                    exon_list.sort(key=lambda tup: tup[0])
                else:
                    transcript['tss'] = transcript['end']
                    exon_list.sort(key=lambda tup: tup[1], reverse=True)
                transcript['length'] = (int(transcript['end']) -
                                        int(transcript['start']))

                # Generate a list of introns: one per pair of exons that
                # are consecutive in transcription order.
                intron_list = []
                for upstream, downstream in zip(exon_list, exon_list[1:]):
                    if on_plus_strand:
                        intron_list.append((upstream[1] + 1,
                                            downstream[0] - 1))
                    else:
                        intron_list.append((upstream[0] - 1,
                                            downstream[1] + 1))
                transcript['introns'] = intron_list

            # Reorder the gene dictionary so it is easier to sort in the
            # future
            d = collections.OrderedDict()
            d['gene_id'] = gene['attribute']['gene_id'].split('.')[0]
            d['seqname'] = gene['seqname']
            d['source'] = gene['source']
            d['start'] = gene['start']
            d['end'] = gene['end']
            d['strand'] = gene['strand']
            d['attribute'] = gene['attribute']
            d['transcripts'] = gene['transcripts']
            print(json.dumps(d))
            progress.update()

    sys.stderr.write("\nAll Done\n")
def main(argv):
    """Merge normalised ChIP signal into RNA-seq TSS records.

    Command-line arguments (read from ``sys.argv``):
        1. JSON-lines file of TSS sites, each with ``seqname``,
           ``location`` and a ``samples`` mapping.
        2. Tab-separated ChIP file with five columns per row:
           seqname, tss, sample, mark, and a Python-literal list of values.
        3. JSON file of read counts indexed as ``[sample][mark]``.

    Both inputs are sorted with ``unix_sort``. All sites are loaded into
    memory, then the ChIP rows are streamed: each value is divided by
    (read count / 1,000,000) and stored under
    ``site['samples'][sample][mark]``. A site is written to stdout as one
    JSON line once the ChIP stream moves on to a different site. Sites
    without any ChIP row are not written.

    Args:
        argv: Accepted for the conventional ``main(sys.argv)`` call shape;
            the arguments themselves are read from ``sys.argv``.

    Exits with status 2 when the number of arguments is wrong. A ChIP row
    that refers to a site missing from the RNA file raises ``KeyError``.
    """
    # parse args ("!=" instead of the identity test "is not")
    if len(sys.argv) != 4:
        sys.stderr.write(
            "invalid usage: python " + sys.argv[0] +
            " <nosplice_tss_rna.json> <nosplice_tss_chip.tsv> <experiment_read_counts.json>\n"
        )
        sys.exit(2)
    rna_fn = sys.argv[1]
    chip_fn = sys.argv[2]
    reads_fn = sys.argv[3]
    progress = FileProgress(chip_fn, "Percent Complete: ")

    # Load read count normalization file
    with open(reads_fn) as experiment_read_counts_file:
        read_counts = json.load(experiment_read_counts_file)

    # Sort both files lexically by chromosome, then by position
    sys.stderr.write('Beginning to sort both files (may take a while).\n')
    rna_f = unix_sort(rna_fn, "-k2,2 -k4,4", header=False, save=True)
    chip_f = unix_sort(chip_fn, "-t $'\t' -k1,2", header=False, save=True)
    sys.stderr.write('Finished sorting.\n')

    # Read RNA-seq data into memory: seqname -> {str(tss) -> site}
    sys.stderr.write('Reading the RNA-seq data into memory.\n')
    sites = {}
    for line in rna_f:
        site = json.loads(line, object_pairs_hook=collections.OrderedDict)
        seqname = site['seqname']
        # Keys are strings because the ChIP file supplies the position as
        # text; the duplicate check must use the same form.
        tss_key = str(site['location'])
        chrom_sites = sites.setdefault(seqname, {})
        if tss_key in chrom_sites:
            # Report on stderr: stdout carries the JSON-lines output. The
            # first record for a position is kept.
            sys.stderr.write("Error! Duplicate site " + seqname + ":" +
                             tss_key + "\n")
        else:
            chrom_sites[tss_key] = site
    rna_f.close()

    # Begin looping through chip file
    sys.stderr.write('Beginning to read the ChIP data.\n')
    previous_seqname = None
    previous_tss = None
    for line in chip_f:
        chip_row = line.strip("\n").split("\t")
        if len(chip_row) != 5:
            continue
        seqname, tss, sample, mark = chip_row[:4]
        # SECURITY NOTE(review): eval() executes whatever the ChIP file
        # contains. Acceptable only for trusted, locally generated input;
        # consider ast.literal_eval or json.loads if the column is a plain
        # list literal.
        rpm = eval(chip_row[4])

        # The stream has moved on to another site: the previous one is
        # complete, so write it out and release it.
        if previous_tss is not None and (previous_seqname != seqname or
                                         previous_tss != tss):
            print(json.dumps(sites[previous_seqname][previous_tss]))
            sites[previous_seqname].pop(previous_tss, None)

        tss_site = sites[seqname][tss]
        correction = float(read_counts[sample][mark]) / 1000000
        normalised = [x / correction for x in rpm]
        tss_site['samples'].setdefault(sample, {})[mark] = normalised

        previous_seqname = seqname
        previous_tss = tss
        progress.update()

    # Flush the last site (nothing to flush if no ChIP row was read).
    if previous_tss is not None:
        print(json.dumps(sites[previous_seqname][previous_tss]))
    chip_f.close()

    sys.stderr.write("\nAll done!\n")