def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))
    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))
def __call__(self, track, slice=None):
    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        return np.mean(lengths_transcripts)
    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        return np.mean(lengths_genes)
def __call__(self, track, slice=None):
    classes = ["antisense", "antisense_upstream", "antisense_downstream",
               "sense_upstream", "sense_downstream", "intergenic",
               "sense_intronic", "antisense_intronic"]

    coding_set = {}
    for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
        coding_set[gtf.transcript_id] = gtf.source

    result = {"noncoding": {}, "coding": collections.defaultdict(int)}
    total_nc = float(self.getValue(
        "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
    for c in classes:
        result["noncoding"][c] = (float(self.getValue(
            """SELECT COUNT(*) FROM lncrna_final_class as a,
               %s_cpc_result as b
               WHERE a.class = '%s'
               AND b.C_NC = 'noncoding'
               AND a.transcript_id = b.transcript_id""" % (track, c))) / total_nc) * 100

    total_c = len(coding_set)
    # fetch the coding transcript ids once; the query does not depend on the class
    ids = self.getValues(
        "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
    for c in classes:
        for i in ids:
            if coding_set.get(i) == c:
                result["coding"][c] += 1
    for x, y in result["coding"].iteritems():
        result["coding"][x] = (float(y) / total_c) * 100
    return result
def main():
    # Input files.
    # GENCODE = args['-g']
    GENCODE = "/cs/zbio/jrosensk/ccle_fastq/hg19_reference/hg19.ensGene.gtf"
    # alternatives: "Homo_sapiens.GRCh38.103.gtf.gz", "gencode.v29.annotation.gtf.gz"

    # Output file prefix.
    GENE_LENGTHS = "coding_lengths.hg19.tsv"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)
    # ccle_transcript_tpm = pd.read_csv("CCLE_expression.csv", nrows=3)

    # Select just exons (optionally restrict to protein-coding genes), and the
    # columns that we want to use.
    idx = (gc.feature == 'exon')  # & (gc.gene_biotype == 'protein_coding')
    exon = gc[idx][['seqname', 'start', 'end', 'gene_id', 'gene_name']]

    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    # Create a new DataFrame with gene lengths and Ensembl ID (version stripped).
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame({'length': lengths,
                        'Ensembl_gene_identifier': ensembl_no_version},
                       index=lengths.index)

    # m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = ldf[['Ensembl_gene_identifier', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        m1.to_csv(GENE_LENGTHS, sep="\t", index=False)
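# `count_bp` is called above (and in the later variants of this script) but is
# not defined in this snippet. A minimal sketch, assuming each group passed by
# groupby().apply() holds one gene's exons with integer, 1-based inclusive
# `start`/`end` columns already sorted by start (as done above): merge
# overlapping or adjacent exons, then sum the merged interval lengths so bases
# shared by several transcripts count only once. The name matches the call
# sites; the body is an assumption, not the original implementation.
def count_bp(df):
    """Return the number of distinct exonic bases covered by a gene's exons."""
    merged = []  # list of [start, end] intervals, kept mutually disjoint
    for start, end in zip(df.start, df.end):
        if merged and start <= merged[-1][1] + 1:
            # overlaps or abuts the previous interval: extend it
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return sum(e - s + 1 for s, e in merged)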
def getReferenceLincRNA(self, reference_gtf):
    # collect distinct lincRNA gene ids; a set avoids the linear membership test
    lincs = set()
    for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
        if entry.source == "lincRNA":
            lincs.add(entry.gene_id)
    return len(lincs)
def __call__(self, track):
    length = {}
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
        length[transcript[0].transcript_id] = sum(
            [gtf.end - gtf.start for gtf in transcript])

    score = {}
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    for data in cc.execute(
            "SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
        score[data[0]] = data[1]

    result = {"length": [], "score": []}
    for transcript, value in length.iteritems():
        # skip transcripts without a CPC score rather than raising KeyError
        if transcript not in score:
            continue
        result["length"].append(np.log10(value))
        result["score"].append(score[transcript])
    return result
def main(args):
    p = GTF.dataframe(args['gtf'])  # GTF.dataframe returns a pandas DataFrame
    with open(args['out'], 'w') as wo:
        for i in range(len(p)):
            wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\n".format(
                p['seqname'][i], p['start'][i], p['end'][i],
                p['gene_id'][i], p['gene_name'][i],
                p['gene_biotype'][i], p['strand'][i]))
def main(args):
    # Input files.
    GENCODE = args['-g']
    NCBI_ENSEMBL = args['-n']

    # Output file prefix.
    GENE_LENGTHS = args['-o'] or "ncbi_ensembl_coding_lengths.txt.gz"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)

    # Select just exons of protein coding genes, and columns that we want to use.
    idx = (gc.feature == 'exon') & (gc.transcript_type == 'protein_coding')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name']]

    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    with log("Reading NCBI mapping of Entrez GeneID "
             "to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)):
        g2e = pd.read_table(NCBI_ENSEMBL,
                            compression="gzip",
                            header=None,
                            names=['tax_id', 'GeneID',
                                   'Ensembl_gene_identifier',
                                   'RNA_nucleotide_accession.version',
                                   'Ensembl_rna_identifier',
                                   'protein_accession.version',
                                   'Ensembl_protein_identifier'])

    # Create a new DataFrame with gene lengths and EnsemblID.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame({'length': lengths,
                        'Ensembl_gene_identifier': ensembl_no_version},
                       index=lengths.index)

    # Merge so we have EntrezGeneID with length.
    m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = m1[['Ensembl_gene_identifier', 'GeneID', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        # open in text mode so DataFrame.to_csv can write strings
        with gzip.open(GENE_LENGTHS, "wt") as out:
            m1.to_csv(out, sep="\t", index=False)
def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.iteritems():
        counts.append(len(transcripts))
    return counts
def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.iteritems():
        counts.append(len(transcripts))
    count, lower, dx, _ = scipy.stats.cumfreq(counts, numbins=40,
                                              defaultreallimits=(1, 15))
    x = np.arange(count.size) * dx + lower
    return odict((("transcript number", x),
                  ("cumulative frequency", count / len(counts))))
def __call__(self, track, slice=None):
    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_transcripts, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency", counts / len(lengths_transcripts))))
    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_genes, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency", counts / len(lengths_genes))))
def main(args):
    with log("Reading the Fasta file: {}".format(args.fastaFile)):
        records = list(SeqIO.parse(args.fastaFile, "fasta"))

    with log("Reading the wiggle file: {}".format(args.wigFile)):
        # pd.DataFrame.from_csv is deprecated; use pd.read_csv
        wig = pd.read_csv(args.wigFile, header=0, sep=" ", index_col=None)
    wig['CpG'] = ["CpG"] * (wig.size // 2)
    wig.to_csv(args.outWigFile, header=True, index=None, sep=' ', mode='a')

    seqStr = dict()
    with log("Reading the fasta file: {}".format(args.fastaFile)):
        seqHandle = open(args.fastaFile, "r")
        for record in SeqIO.parse(seqHandle, "fasta"):
            seqStr[record.id] = record.seq
    print(seqStr)

    with log("Reading the Gencode annotation file: {}".format(args.gffFile)):
        gc = GTF.dataframe(args.gffFile)

    # Select just exons, and the columns that we want to use.
    idx = (gc.feature == 'exon')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'ID', 'Parent']]
    exon['ID'] = exon['ID'].map(lambda x: re.sub(r'-mRNA.*', '', x))

    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by gene ID.
    groups = exon.groupby('ID')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)
    print(type(lengths))

    with log("Writing output file: {}".format(args.outFile)):
        lengths.to_csv(args.outFile, sep="\t", encoding="utf-8", index=True)
def main(GENCODE):
    gc = GTF.dataframe(GENCODE)
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    idx = (gc.feature == "transcript") & (gc.transcript_type == "lincRNA")
    lincRNA = gc.loc[idx, ["seqname", "start", "end", "gene_id", "gene_name", "strand"]]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=["seqname", "start", "end"], inplace=True)
    lincRNA.to_csv("lincRNA.bed", sep="\t", header=False, index=False)

    idx = (gc.feature == "gene") & (gc.gene_type == "lincRNA")
    lincRNA = gc.loc[idx, ["seqname", "start", "end", "gene_id", "gene_name", "strand"]]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=["seqname", "start", "end"], inplace=True)
    lincRNA.to_csv("lincRNA_genes.bed", sep="\t", header=False, index=False)
def count_lowlevel_in_hightlevel(filename, low_level_name, high_level_name):
    """
    Count how many sub-features fall inside each high-level feature.

    :param filename: GTF file to process.
    :param low_level_name: feature name in the GTF file, such as "exon" or "transcript".
    :param high_level_name: feature name in the GTF file, such as "transcript" or "gene".
    :return: nothing; counts are written directly to an output file.
    """
    occurrence = 0
    with open('{} number in each {}'.format(low_level_name, high_level_name), 'w') as f:
        for idx, item_with_bool in enumerate(lookahead(GTF.lines(filename))):
            if item_with_bool[0]['feature'] == high_level_name:
                # skip the very first high-level record: nothing accumulated yet
                if idx != 0 and idx != 1:
                    f.write(str(occurrence) + '\n')
                occurrence = 0
            elif item_with_bool[0]['feature'] == low_level_name:
                occurrence += 1
            elif not item_with_bool[1]:
                # last line of the file: flush the final count
                f.write(str(occurrence) + '\n')
            else:
                continue
Input file(s):  *_toTranscriptome_cov.txt
Output file(s): *_toTranscriptome_cov.average.txt
Tools: GTF.py to load GTF files
Process line-by-line with Python
"""
import glob
import os
import sys

sys.path.insert(0, '/12TBLVM/Data/MinhTri/6_SCRIPTS')
import GTF

print('Loading gtf file.')
(GeneDict, TransScDict) = GTF.dataframe(
    "/12TBLVM/Data/hg19-2/GENCODE/gencode.v22.annotation.gtf")

print('Listing files to be processed:')
cases = set()
for file in glob.glob('*_toTranscriptome_cov.txt'):
    cases.add(file.split('.')[0])
    print('\t', file)

for entry in cases:
    summary = {}

    # Read file
    original_file = open(entry + '.txt', 'r')
    print('Reading input file:', original_file.name)
    total_cov = 0
    counter = 0
def main(argv):
    # Get the path of the file to process
    input_file = argv[0]
    temp_folder = argv[1]
    username = argv[2]
    experiment = argv[3]
    species = argv[4]

    config = json.load(open('../configuration.json'))
    temp_token = username + '_' + str(uuid.uuid4())

    # Create the csv files that hold the node information
    chromosome_csv = open(temp_folder + temp_token + '_chromosome.csv', 'w')
    gene_csv = open(temp_folder + temp_token + '_gene.csv', 'w')
    transcript_csv = open(temp_folder + temp_token + '_transcript.csv', 'w')
    exon_csv = open(temp_folder + temp_token + '_exon.csv', 'w')

    # Create the csv files that hold the relationship information
    contains_csv = open(temp_folder + temp_token + '_contains.csv', 'w')
    in_chromosome_csv = open(temp_folder + temp_token + '_in_chromosome.csv', 'w')
    has_transcript_csv = open(temp_folder + temp_token + '_has_transcript.csv', 'w')
    has_exon_csv = open(temp_folder + temp_token + '_has_exon.csv', 'w')

    # Initialize the writers for all the files
    # ---- nodes
    chromosomeWriter = csv.writer(chromosome_csv, delimiter=',')
    geneWriter = csv.writer(gene_csv, delimiter=',')
    transcriptWriter = csv.writer(transcript_csv, delimiter=',')
    exonWriter = csv.writer(exon_csv, delimiter=',')
    # ---- relationships
    containsWriter = csv.writer(contains_csv, delimiter=',')
    inChromosomeWriter = csv.writer(in_chromosome_csv, delimiter=',')
    hasTranscriptWriter = csv.writer(has_transcript_csv, delimiter=',')
    hasExonWriter = csv.writer(has_exon_csv, delimiter=',')

    # Build the file headers
    # ---- nodes
    chromosome_header = ["chromosome"]
    gene_header = ["gene_id"]
    transcript_header = ["transcript_id", "reference_id", "cov", "FPKM",
                         "TPM", "start", "end"]
    exon_header = ["exon_id", "exon_number", "start", "end", "cov"]
    # ---- relationships
    contains_header = ["name", "gene_id"]
    in_chromosome_header = ["gene_id", "chromosome"]
    has_transcript_header = ["gene_id", "strand", "transcript_id"]
    has_exon_header = ["transcript_id", "exon_id"]

    # Write the headers to their respective files
    # ---- nodes
    chromosomeWriter.writerow(chromosome_header)
    geneWriter.writerow(gene_header)
    transcriptWriter.writerow(transcript_header)
    exonWriter.writerow(exon_header)
    # ---- relationships
    containsWriter.writerow(contains_header)
    inChromosomeWriter.writerow(in_chromosome_header)
    hasTranscriptWriter.writerow(has_transcript_header)
    hasExonWriter.writerow(has_exon_header)

    # Initialize the data structures needed for parsing
    # (to optimize loading the data into the database)
    # ---- nodes
    chromosomes = set()
    genes_dict = {}
    transcripts_dict = {}
    # ---- relationships
    contains_dict = {}
    in_chromosome_dict = {}
    has_transcript_dict = {}

    print 'Starting parsing procedure for file ' + input_file
    properties = {"name": os.path.basename(input_file),
                  "extension": os.path.splitext(input_file)[1]}

    # Connect to Neo4j
    driver = GraphDatabase.driver("bolt://" + config["neo4j"]["address"],
                                  auth=basic_auth(config["neo4j"]["username"],
                                                  config["neo4j"]["password"]))

    # Initialize the indexes
    session = driver.session()
    statements = [
        "CREATE INDEX ON :File(name);",
        "CREATE INDEX ON :Species(species);",
        "CREATE INDEX ON :Gene(gene_id);",
        "CREATE INDEX ON :Chromosome(chromosome);",
        "CREATE INDEX ON :Transcript(transcript_id);",
        "CREATE INDEX ON :Exon(exon_id);"
    ]
    for statement in statements:
        session.run(statement)
    session.close()

    print 'Parsing file...'
    # initialize a counter to do partial loads into the database for very large files
    row_count = 0
    for line in GTF.lines(input_file):
        row_count += 1

        # store the chromosome
        chromosomes.add(line["seqname"])

        # store the gene (if not already present)
        if line["gene_id"] not in genes_dict:
            genes_dict[line["gene_id"]] = [
                line[attr] if attr in line else "None" for attr in gene_header
            ]
        # store the (file)-[contains]->(gene) relationship (if it does not exist)
        if properties["name"] + ':' + line["gene_id"] not in contains_dict:
            contains_dict[properties["name"] + ':' + line["gene_id"]] = [
                properties["name"], line["gene_id"]
            ]
        # store the (gene)-[in]->(chromosome) relationship (if it does not exist)
        if line["gene_id"] + ':' + line["seqname"] not in in_chromosome_dict:
            in_chromosome_dict[line["gene_id"] + ':' + line["seqname"]] = [
                line["gene_id"], line["seqname"]
            ]

        # depending on the feature at hand (transcript, exon), store the row's
        # information appropriately
        if line['feature'] == 'transcript':
            # store the transcript (if not already present)
            if line["transcript_id"] not in transcripts_dict:
                transcripts_dict[line["transcript_id"]] = [
                    line[attr] if attr in line else "None"
                    for attr in transcript_header
                ]
            # store the (gene)-[has]->(transcript) relationship (if it does not exist)
            if line["gene_id"] + ':' + line["transcript_id"] not in has_transcript_dict:
                has_transcript_dict[line["gene_id"] + ':' + line["transcript_id"]] = [
                    line[attr] for attr in has_transcript_header
                ]
        elif line['feature'] == 'exon':
            # define an ID for the exon (needed to populate the db)
            exon_id = line["exon_number"] + ':' + line["transcript_id"]
            # write the exon to the csv file
            exonWriter.writerow([exon_id] + [
                line[attr] if attr in line else "None"
                for attr in exon_header[1:]
            ])
            # write the (transcript)-[has]->(exon) relationship to the csv file
            hasExonWriter.writerow([line["transcript_id"], exon_id])

        if not (row_count % 15000):
            print str(row_count) + " scanned"

    # write out the dicts built above to the csv files
    for chrom in list(chromosomes):
        chromosomeWriter.writerow([chrom])
    for gene in genes_dict.keys():
        geneWriter.writerow(genes_dict[gene])
    for transcript in transcripts_dict.keys():
        transcriptWriter.writerow(transcripts_dict[transcript])
    for entry in contains_dict.keys():
        containsWriter.writerow(contains_dict[entry])
    for entry in in_chromosome_dict.keys():
        inChromosomeWriter.writerow(in_chromosome_dict[entry])
    for entry in has_transcript_dict.keys():
        hasTranscriptWriter.writerow(has_transcript_dict[entry])

    # finish writing the csv files
    # ---- nodes
    chromosome_csv.close()
    gene_csv.close()
    transcript_csv.close()
    exon_csv.close()
    # ---- relationships
    contains_csv.close()
    in_chromosome_csv.close()
    has_transcript_csv.close()
    has_exon_csv.close()

    print 'Populating Database...'
    session = driver.session()
    prova = [
        "MERGE (u:User { username:{username} })",
        "MERGE (e:Experiment { name:{experiment} })",
        "MERGE (s:Species {species: {species} })",
        "MERGE (f:File { name:{properties}.name }) ON CREATE SET f += {properties}",
        "MERGE (u)-[:Created]->(e)",
        "MERGE (e)-[:For_Species]->(s)",
        "MERGE (e)-[:Composed_By]->(f)"
    ]
    # Associate the file with the user
    session.run(" ".join(prova),
                {"username": username, "experiment": experiment,
                 "species": species, "properties": properties})
    session.close()

    populateDB(driver, temp_folder + temp_token)
    print 'Done.'
def lookahead(it):
    # Yield (value, has_more) pairs so the consumer can detect the last item.
    it = iter(it)
    last = next(it)
    for val in it:
        # Report the *previous* value (more to come).
        yield last, True
        last = val
    # Report the last value.
    yield last, False


def processing_count(filenames):
    count_lowlevel_in_hightlevel(filenames, 'transcript', 'gene')
    count_lowlevel_in_hightlevel(filenames, 'exon', 'transcript')


if __name__ == '__main__':
    # Below is a demo for using function lookahead():
    # for i, has_more in lookahead(range(3)):
    #     print(i, has_more)
    whole_gtf = GTF.dataframe(sys.argv[1])
    processing_count(sys.argv[1])
    whole_gtf['length'] = whole_gtf['end'].astype('int') - \
        whole_gtf['start'].astype('int') + 1
    whole_gtf = whole_gtf.loc[:, ['gene_biotype', 'feature', 'length']]
    whole_gtf.to_csv("whole_gtf", sep='\t', index=False)

    # Below is an example of using the ggplot package in Python:
    # p = ggplot(aes(x='length'), data=a) + geom_histogram() \
    #     + facet_grid(x='gene_biotype', y='feature') \
    #     + xlim(0, 50000) + scale_y_log(10) + ylim(1, 1e3)
    # ggplot.save(p, "f**k.tiff", width=55, height=50, dpi=300)
    biotype_count_as_features = whole_gtf.groupby(['gene_biotype', 'feature']).size()
    biotype_count_as_features.to_csv("biotype_count_as_features", sep='\t')
def main(args):
    # --------------------------------------------------------------------------
    # READING GTF with GTF.py
    # --------------------------------------------------------------------------
    try:
        gtf_file = args['gtf']
        p = GTF.dataframe(gtf_file)  # GTF.dataframe returns a pandas DataFrame
    except Exception as e:
        logger.error("ERROR: in reading GTF\n; {}".format(e))
        exit(1)

    # --------------------------------------------------------------------------
    # INIT VARIABLES
    # --------------------------------------------------------------------------
    _Expected_Eight_Columns = ['seqname', 'source', 'feature', 'start', 'end',
                               'score', 'strand', 'frame']
    _Expected_Other_Columns = [
        'gene_id', 'gene_name', 'gene_version', 'gene_source', 'gene_biotype',
        'transcript_id', 'transcript_version', 'transcript_name',
        'transcript_source', 'transcript_biotype', 'tag',
        'transcript_support_level', 'exon_number', 'exon_id', 'exon_version',
        'protein_id', 'protein_version'
    ]

    logger.debug(p.index)
    logger.debug(p.columns)
    logger.info("-" * 50)
    logger.info("Printing the fields (aka columns) needed in GTF to make the BED:")
    logger.info(_Expected_Eight_Columns + _Expected_Other_Columns)
    logger.info("-" * 50)

    # --------------------------------------------------------------------------
    # CHECK FIELDS
    # --------------------------------------------------------------------------
    mandatory_columns_count = 0
    mandatory_columns_list = []
    other_fields_list = []
    for col in p.columns:
        if col in _Expected_Eight_Columns:
            mandatory_columns_count += 1
            mandatory_columns_list.append(col)
        other_fields_list.append(col)

    if mandatory_columns_count != len(_Expected_Eight_Columns):
        raise ValueError("MISSING MANDATORY COLUMNS in GTF: {}".format(';'.join(
            [str(col) for col in _Expected_Eight_Columns
             if col not in mandatory_columns_list])))
    logger.info("Mandatory Expected Fields Check out OK")

    logger.info("Testing if any other missing field exists or are extra "
                "or with different expected names ...")
    unexpected_fields_list = []
    expected_fields_list = []
    for field in other_fields_list:
        if field not in _Expected_Other_Columns and field not in _Expected_Eight_Columns:
            unexpected_fields_list.append(field)
        else:
            expected_fields_list.append(field)
    logger.warning("The Following Fields are NOT added to BED; Check if they "
                   "should be used and if so, check if they might be named "
                   "differently: {}".format(unexpected_fields_list))

    # --------------------------------------------------------------------------
    # UPDATE GTF IF NEEDED
    # --------------------------------------------------------------------------
    read_GTF_again = False
    for field in _Expected_Other_Columns + _Expected_Eight_Columns:
        if field not in expected_fields_list:
            if not read_GTF_again:
                # copy the GTF once, then add every missing field to the copy
                shutil.copy(gtf_file, gtf_file + "upd.gtf")
                gtf_file = gtf_file + "upd.gtf"
            add_field_to_GTF(field, gtf_file)
            read_GTF_again = True
    if read_GTF_again:
        logger.info("processing GTF2BED for << {} >> updated GTF...".format(gtf_file))
        p = GTF.dataframe(gtf_file)

    # --------------------------------------------------------------------------
    # writing to output file
    # --------------------------------------------------------------------------
    with open(args['out'], 'w') as wo:
        logger.info("writing out BED file ... {}".format(args['out']))
{}".format(args['out'])) ## writing HEADER line to output file wo.write("\t".join( ['##seqname', 'start', 'end', 'gene_id__gene_name', 'score', 'strand', 'frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'source', 'feature', 'transcript_version', 'transcript_name', ' transcript_source', 'transcript_biotype', 'tag', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version']) + "\n") ## writing VALUE lines to output file try: for i in range(len(p)): wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(p['seqname'][i], int(p['start'][i]) - 1, p['end'][i], p['gene_id'][i], p['gene_name'][i], p['score'][i], p['strand'][i], p['frame'][i], p['gene_version'][i], p['gene_source'][i], p['gene_biotype'][i], p['transcript_id'][i], p['source'][i], p['feature'][i], p['transcript_version'][i], p['transcript_name'][i], p['transcript_source'][i], p['transcript_biotype'][i], p['tag'][i], p['transcript_support_level'][i], p['exon_number'][i], p['exon_id'][i], p['exon_version'][i], p['protein_id'][i], p['protein_version'][i] ) ) except IOError as IOE: logger.error("ERROR: in Writing data\n; {}".format(IOE)) exit(2) except ValueError as VE: logger.error("ERROR: in Writing data\n; {}".format(VE)) exit(2) except Exception as E: logger.error("ERROR: in Writing data\n; {}".format(E)) exit(1)
def main(args): with log("Reading compare Gencode annotation file: {}".format(args.compGffFile)): gc = GTF.dictionary(args.compGffFile,"ID") compGeneInfo = gc['gene'] #-------------------------------------------------- # gene['Name'] = gene['ID'].map(lambda x: re.sub(r':maker.*','',x)) #-------------------------------------------------- with log("Reading reference Gencode annotation file: {}".format(args.refGffFile)): gc = GTF.dataframe(args.refGffFile) # Select just genes of protein coding genes, and columns that we want to use. idx = (gc.feature == 'gene') gene = gc.ix[idx, ['seqname','start','end','Name']] #-------------------------------------------------- # print(gene) #-------------------------------------------------- # Convert columns to proper types. gene.start = gene.start.astype(int) gene.end = gene.end.astype(int) for geneID in gene['Name']: if geneID in compGeneInfo: # gene annotated in both species, read coordinates projecting information in maf mafFile = args.mafPath + "/" + geneID + ".maf" if not os.path.exists(mafFile): continue with log("Reading the Maf file: {}".format(mafFile)): with open(mafFile) as maf: out_files = dict() geneCoords = dict() for block in bx.align.maf.Reader(maf): ref_comp = block.components[0] refSpecies, refChrom = ref_comp.src.split('.')[:2] if refSpecies not in geneCoords: geneCoords[refSpecies] = nested_dict(2,str) geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end geneCoords[refSpecies]['refInfo']['chr'] = refChrom for comp in block.components[1:]: comp_species, compChrom = comp.src.split('.')[:2] if comp_species not in geneCoords: geneCoords[comp_species] = nested_dict(2,str) geneCoords[comp_species][compChrom]['start'] = comp.start geneCoords[comp_species][compChrom]['end'] = int(comp.end) if compChrom not in geneCoords[comp_species]: geneCoords[comp_species][compChrom]['start'] = comp.start geneCoords[comp_species][compChrom]['end'] = int(comp.end) if comp_species not in out_files: bedfile = "%s/%s.%s.bed" % (args.mafPath, geneID, comp_species ) f = open( bedfile , "w" ) out_files[comp_species] = f pid = block_pid( ref_comp, comp ) if pid: #-------------------------------------------------- # print("%s\t%s" % (comp.end, geneCoords[comp_species][compChrom])) #-------------------------------------------------- if geneCoords[refSpecies]['refInfo']['start'] > ref_comp.forward_strand_start: geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start if geneCoords[refSpecies]['refInfo']['end'] < ref_comp.forward_strand_end: geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end if geneCoords[comp_species][compChrom]['start'] > comp.start: geneCoords[comp_species][compChrom]['start'] = comp.start if geneCoords[comp_species][compChrom]['end'] <= int(comp.end): geneCoords[comp_species][compChrom]['end'] = int(comp.end) out_files[comp_species].write( "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" % ( refChrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end, \ compChrom, comp.start, comp.end, comp.strand, pid ) ) for f in out_files.values(): f.close() if args.compSpecies in geneCoords: for chrom in geneCoords[args.compSpecies]: if chrom in compGeneInfo[geneID]: annoStart = int(compGeneInfo[geneID][chrom]['start']) annoEnd = int(compGeneInfo[geneID][chrom]['end']) compStart = int(geneCoords[args.compSpecies][chrom]['start']) compEnd = int(geneCoords[args.compSpecies][chrom]['end']) if compEnd > annoEnd and compStart < annoEnd or 
                                print("Matched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d\t%s\t%s\t%s: %s - %s" %
                                      (geneID, args.compSpecies, chrom,
                                       annoStart, annoEnd, compStart, compEnd,
                                       compGeneInfo[geneID][chrom]['ID'],
                                       args.refSpecies,
                                       geneCoords[args.refSpecies]['refInfo']['chr'],
                                       geneCoords[args.refSpecies]['refInfo']['start'],
                                       geneCoords[args.refSpecies]['refInfo']['end']))
                            else:
                                eprint("unMatched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d" %
                                       (geneID, args.compSpecies, chrom,
                                        annoStart, annoEnd, compStart, compEnd))
                        else:
                            eprint("Error Chrom\t%s\t%s\t%s\tmapped: %s - %s" %
                                   (geneID, args.compSpecies, chrom,
                                    geneCoords[args.compSpecies][chrom]['start'],
                                    geneCoords[args.compSpecies][chrom]['end']))
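# `block_pid` is called above but not defined in this snippet. A hedged sketch
# of one plausible definition: the percent identity between the reference and
# comparison MAF components, i.e. the fraction of gap-free aligned columns in
# which both sequences carry the same base. Returning None when nothing aligns
# matches the `if pid:` guard above, which skips such blocks. The body is an
# assumption, not the original implementation.
def block_pid(ref_comp, comp):
    matches, aligned = 0, 0
    for a, b in zip(ref_comp.text.upper(), comp.text.upper()):
        if a == '-' or b == '-':
            continue  # skip columns gapped in either sequence
        aligned += 1
        if a == b:
            matches += 1
    return float(matches) / aligned if aligned else None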
def main(argv=sys.argv):

    parser = optparse.OptionParser(
        version="%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    if as_gtf:
        gff = GTF.Entry()
    else:
        gff = GFF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):
        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.mFields and len(bed.mFields) >= 3:
            gff.strand = bed.mFields[2]
        else:
            gff.strand = "."
        if bed.mFields and len(bed.mFields) >= 2:
            gff.score = bed.mFields[1]

        if as_gtf:
            if bed.mFields:
                gff.gene_id = bed.mFields[0]
                gff.transcript_id = bed.mFields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.mFields:
                gff.source = bed.mFields[0]

        options.stdout.write(str(gff) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
def main():
    with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, \
            open(cdsFName, "w") as cdsFile, open(utr3FName, "w") as utr3File, \
            open(exonFName, "w") as exonFile, open(intronFName, "w") as intronFile, \
            open(codingExonFName, "w") as codingExonFile, \
            open(codingIntronFName, "w") as codingIntronFile, \
            open(noncodingExonFName, "w") as noncodingExonFile, \
            open(noncodingIntronFName, "w") as noncodingIntronFile:

        def writeOutput(gene):
            if useBlocks:
                # output all region primitives on the same line by specifying
                # nBlocks and lists inside the BED output
                if gene.coding:
                    # blockBedFormat is one line by definition
                    if gene.utr5Len > 0:
                        utr5File.write(gene.blockBedFormat(region="5utr") + "\n")
                    if gene.utr5startLen > 0:
                        utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n")
                    if gene.cdsLen > 0:
                        cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
                    if gene.utr3Len > 0:
                        utr3File.write(gene.blockBedFormat(region="3utr") + "\n")
                    if gene.exonsLen > 0:
                        exonFile.write(gene.blockBedFormat(region="exons") + "\n")
                        codingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
                    if gene.intronsLen > 0:
                        intronFile.write(gene.blockBedFormat(region="introns") + "\n")
                        codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
                else:
                    # noncoding transcripts just have exons and introns
                    if gene.exonsLen > 0:
                        exonFile.write(gene.blockBedFormat(region="exons") + "\n")
                        noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
                    if gene.intronsLen > 0:
                        intronFile.write(gene.blockBedFormat(region="introns") + "\n")
                        noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
            else:
                # output one line per region primitive instead of combining
                # regions via blocks
                if gene.coding:
                    for entry in gene.bedFormat(region="5utr"):
                        utr5File.write(entry + "\n")
                    for entry in gene.bedFormat(region="5utr_start"):
                        utr5StartFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="cds"):
                        cdsFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="3utr"):
                        utr3File.write(entry + "\n")
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        codingExonFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        codingIntronFile.write(entry + "\n")
                else:
                    # noncoding transcripts just have exons and introns
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        noncodingExonFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        noncodingIntronFile.write(entry + "\n")

        if args.ucsc:
            with open(args.input, "r") as genesFile:
                genesRead = 0
                for line in genesFile:
                    # all of the knowngenes parsing and metadata construction is
                    # done inside UCSCKnownGene.py, especially the createGene method
                    gene = createUCSCTranscript(line)
                    genesRead += 1
                    writeOutput(gene)
                    if not genesRead % 2500:
                        print "Processed %d entries..." % genesRead

        elif args.gtf:
            # first parse the entire file into a dictionary of lists; the issue
            # here is that lines for various transcripts may be interleaved, so
            # can either create lots of SNFGene objects, or a giant dict.
            # opted for giant dict.
            txDict = defaultdict(list)
            genesRead = 0
            print "Building GTF dictionary..."
            for line in GTF.lines(args.input):
                txDict[line["transcript_id"]].append(line)
                genesRead += 1
                if not genesRead % 100000:
                    print "Processed %d lines..." % genesRead
            print "Dictionary built."
            # now create a SNFGene object for each transcript and output it
            genesRead = 0
            for key in txDict:
                tx = createGTFTranscript(txDict[key])
                writeOutput(tx)
                genesRead += 1
                if not genesRead % 2500:
                    print "Processed %d entries..." % genesRead

            print "Processed %d entries." % genesRead

    # BTD Edit: making unique regions and linking to gene name
    # --------------------------------------------------------
    # Output files created above:
    #   utr5FName = args.output + "_5utr.bed"
    #   utr5StartFName = args.output + "_5utr_start.bed"
    #   cdsFName = args.output + "_cds.bed"
    #   utr3FName = args.output + "_3utr.bed"
    #   exonFName = args.output + "_exons.bed"
    #   intronFName = args.output + "_introns.bed"
    #   codingExonFName = args.output + "_codingexons.bed"
    #   codingIntronFName = args.output + "_codingintrons.bed"
    #     note that these are introns from coding genes, not necessarily
    #     introns that make it to mRNA
    #   noncodingExonFName = args.output + "_noncodingexons.bed"
    #   noncodingIntronFName = args.output + "_noncodingintrons.bed"

    # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
    print "Getting gene ID"
    idToName = {}
    if args.ucsc:
        with open(args.input, 'r') as knownGeneFile:
            reader = csv.reader(knownGeneFile, 'textdialect')
            for row in reader:
                idToName[row[0]] = row[-1]

    # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
    print "Getting unique UTRs"

    def getUniqUTR(uniqFN, utrFN):
        with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
            already = set()
            reader = csv.reader(utr, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                if tuple(row[6:]) in already:
                    continue  # repeat
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    row[3] = id + '__' + geneName
                else:
                    row[3] = id
                already.add(tuple(row[6:]))
                writer.writerow(row)

    uniq5UTR = args.output + "_uniq_5utr.bed"
    getUniqUTR(uniq5UTR, utr5FName)

    uniq3UTR = args.output + '_uniq_3utr.bed'
    getUniqUTR(uniq3UTR, utr3FName)

    uniq5SUTR = args.output + '_uniq_5utr_start.bed'
    getUniqUTR(uniq5SUTR, utr5StartFName)

    # 3. Get unique exons + numbering. Do it 3x for all, coding, and noncoding
    print "Getting unique exons"

    def getUniqExons(uniqFN, exonFN):
        with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
            already = set()
            reader = csv.reader(exons, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate exon starts and lengths
                exonLengths = row[10].split(',')
                if exonLengths[-1] == '':
                    exonLengths = exonLengths[:-1]
                exonLengths = [int(x) for x in exonLengths]
                exonStarts = row[11].split(',')
                if exonStarts[-1] == '':
                    exonStarts = exonStarts[:-1]
                exonStarts = [int(x) for x in exonStarts]

                # calculate exons
                exonList = []
                for i in range(len(exonStarts)):
                    absStart = start + exonStarts[i]
                    exonList.append([absStart, absStart + exonLengths[i]])
                if strand == '-':
                    exonList = exonList[::-1]  # flip exon order

                # making BED6
                for i in range(len(exonList)):
                    exonNum = i + 1
                    exonNumInfo = str(exonNum) + 'of' + str(len(exonList))
                    exon = exonList[i]
                    outputRow = [chrom, exon[0], exon[1]]

                    # unique
                    if tuple(outputRow) in already:
                        continue
                    already.add(tuple(outputRow))
                    outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
                    writer.writerow(outputRow)

    uniqExons = args.output + '_uniq_exons.bed'
    getUniqExons(uniqExons, exonFName)

    uniqExons = args.output + '_uniq_codingexons.bed'
    getUniqExons(uniqExons, codingExonFName)

    uniqExons = args.output + '_uniq_noncodingexons.bed'
    getUniqExons(uniqExons, noncodingExonFName)

    # 4. Get unique introns + numbering, plus unique 5'SS and 3'SS.
    # 5'SS is the first base of the intron, 3'SS is the last base of the intron
    print "Getting unique introns and 5' and 3' SS"

    def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
        with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
                open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
            alreadyIntron = set()
            already5 = set()
            already3 = set()
            reader = csv.reader(introns, 'textdialect')
            intronWriter = csv.writer(uniqIntron, 'textdialect')
            fiveWriter = csv.writer(uniq5, 'textdialect')
            threeWriter = csv.writer(uniq3, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate intron starts and lengths
                intronLengths = row[10].split(',')
                if intronLengths[-1] == '':
                    intronLengths = intronLengths[:-1]
                intronLengths = [int(x) for x in intronLengths]
                intronStarts = row[11].split(',')
                if intronStarts[-1] == '':
                    intronStarts = intronStarts[:-1]
                intronStarts = [int(x) for x in intronStarts]

                # calculate introns
                intronList = []
                for i in range(len(intronStarts)):
                    absStart = start + intronStarts[i]
                    intronList.append([absStart, absStart + intronLengths[i]])
                if strand == '-':
                    intronList = intronList[::-1]  # flip intron order

                # making BED6
                for i in range(len(intronList)):
                    intronNum = i + 1
                    intronNumInfo = str(intronNum) + 'of' + str(len(intronList))
                    intron = intronList[i]
                    outputRow = [chrom, intron[0], intron[1]]

                    # unique introns
                    if tuple(outputRow) in alreadyIntron:
                        continue
                    alreadyIntron.add(tuple(outputRow))
                    outputRow.extend([geneIDInfo + '__intron__' + intronNumInfo, 0, strand])
                    intronWriter.writerow(outputRow)

                    # unique splice sites
                    if strand == '+':
                        fiveSS = [chrom, intron[0], intron[0] + 1]
                        threeSS = [chrom, intron[1] - 1, intron[1]]
                    else:
                        threeSS = [chrom, intron[0], intron[0] + 1]
                        fiveSS = [chrom, intron[1] - 1, intron[1]]

                    if tuple(fiveSS) not in already5:
                        already5.add(tuple(fiveSS))
                        fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand])
                        fiveWriter.writerow(fiveSS)
                    if tuple(threeSS) not in already3:
                        already3.add(tuple(threeSS))
                        threeSS.extend([geneIDInfo + '__3ss__' + intronNumInfo, 0, strand])
                        threeWriter.writerow(threeSS)

    uniqIntrons = args.output + '_uniq_introns.bed'
    uniq5 = args.output + '_uniq_5ss.bed'
    uniq3 = args.output + '_uniq_3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

    uniqIntrons = args.output + '_uniq_codingintrons.bed'
    uniq5 = args.output + '_uniq_coding5ss.bed'
    uniq3 = args.output + '_uniq_coding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

    uniqIntrons = args.output + '_uniq_noncodingintrons.bed'
    uniq5 = args.output + '_uniq_noncoding5ss.bed'
    uniq3 = args.output + '_uniq_noncoding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

    # 5. unique TSS/TES
    print "Getting unique TSS and TES"

    def getUniqTSSAndTES(tssFN, tesFN, cdsFN):
        with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, \
                open(cdsFN, 'r') as cds:
            alreadyTSS = set()
            alreadyTES = set()
            reader = csv.reader(cds, 'textdialect')
            tssWriter = csv.writer(uniqTSS, 'textdialect')
            tesWriter = csv.writer(uniqTES, 'textdialect')
            for row in reader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])
                if strand == '+':
                    startRow = [chrom, start, start + 1]
                    endRow = [chrom, end - 1, end]
                else:
                    startRow = [chrom, end - 1, end]
                    endRow = [chrom, start, start + 1]

                if tuple(startRow) not in alreadyTSS:
                    alreadyTSS.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    tssWriter.writerow(startRow)
                # check against alreadyTES here (the original tested alreadyTSS,
                # which meant TES rows were never deduplicated)
                if tuple(endRow) not in alreadyTES:
                    alreadyTES.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    tesWriter.writerow(endRow)

    uniqTSS = args.output + '_uniq_tss.bed'
    uniqTES = args.output + '_uniq_tes.bed'
    getUniqTSSAndTES(uniqTSS, uniqTES, cdsFName)

    # sort everything
    print "Sorting BED files"
    for fn in glob.glob("*.bed"):
        os.system("sort -k1,1 -k2,2n %s -o %s" % (fn, fn))
                    writeOutput(gene)
                    if not genesRead % 2500:
                        print "Processed %d entries..." % genesRead

        elif args.gtf:
            # first parse the entire file into a dictionary of lists; lines for
            # various transcripts may be interleaved, so can either create lots
            # of objects, or a giant dict. opted for giant dict.
            txDict = defaultdict(list)
            print "Building GTF dictionary..."
            for line in GTF.lines(args.input):
                # only want to read in lines corresponding to these features
                if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]:
                    txDict[line["transcript_id"]].append(line)
                genesRead += 1
                if not genesRead % 25000:
                    print "\tProcessed %d lines..." % genesRead
            print "Dictionary built."

            print "Writing transcript properties."
            genesRead = 0
            # now create a Transcript object for each transcript and output it
param_1 = sys.argv[1]  # gtf file
param_2 = sys.argv[2]  # promoter region length
param_3 = sys.argv[3]  # list of genes to calculate
param_4 = sys.argv[4]  # output file
param_5 = sys.argv[5]  # per_transcript or per_gene
param_6 = sys.argv[6]  # human genome reference fasta

promoter_length = int(param_2)
file_gene_filter = param_3
file_output = param_4

print "Read GTF file into memory"
result = GTF.dataframe(param_1)

print "Read gene list"
gene_filter_list = ReadFilterGene(file_gene_filter)

print "Calculate promoter region to bed format"
if param_5 == "per_gene":
    promoter = CalculateAllPromoterRegions(result, promoter_length, gene_filter_list)
    promoter = filterOverlaps(promoter)
else:
    promoter = CalculateAllPromoterRegions2(result, promoter_length, gene_filter_list)

print "Writing temporary bed file"
temp_file = file_output + ".tmp.txt"
printToFile(promoter, temp_file)
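# `CalculateAllPromoterRegions` and its helpers are defined elsewhere in this
# script. As a sketch of the per-gene idea only (a hypothetical helper, not the
# original code): take the `promoter_length` bases upstream of each gene's TSS,
# strand-aware, and emit BED-style 0-based half-open coordinates. `start`/`end`
# are assumed to be the 1-based inclusive coordinates from GTF.dataframe.
def promoter_region(seqname, start, end, strand, promoter_length):
    start, end = int(start), int(end)
    if strand == '+':
        tss = start - 1  # 0-based TSS
        return seqname, max(0, tss - promoter_length), tss
    tss = end            # 0-based position just past the gene on the - strand
    return seqname, tss, tss + promoter_length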
def main():
    with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, \
            open(cdsFName, "w") as cdsFile, open(utr3FName, "w") as utr3File, \
            open(exonFName, "w") as exonFile, open(intronFName, "w") as intronFile, \
            open(codingExonFName, "w") as codingExonFile, \
            open(codingIntronFName, "w") as codingIntronFile, \
            open(noncodingExonFName, "w") as noncodingExonFile, \
            open(noncodingIntronFName, "w") as noncodingIntronFile:

        def writeOutput(gene):
            if useBlocks:
                # output all region primitives on the same line by specifying
                # nBlocks and lists inside the BED output
                if gene.coding:
                    # blockBedFormat is one line by definition
                    if gene.utr5Len > 0:
                        utr5File.write(gene.blockBedFormat(region="5utr") + "\n")
                    if gene.utr5startLen > 0:
                        utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n")
                    if gene.cdsLen > 0:
                        cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
                    if gene.utr3Len > 0:
                        utr3File.write(gene.blockBedFormat(region="3utr") + "\n")
                    if gene.exonsLen > 0:
                        exonFile.write(gene.blockBedFormat(region="exons") + "\n")
                        codingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
                    if gene.intronsLen > 0:
                        intronFile.write(gene.blockBedFormat(region="introns") + "\n")
                        codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
                else:
                    # noncoding transcripts just have exons and introns
                    if gene.exonsLen > 0:
                        exonFile.write(gene.blockBedFormat(region="exons") + "\n")
                        noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
                    if gene.intronsLen > 0:
                        intronFile.write(gene.blockBedFormat(region="introns") + "\n")
                        noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
            else:
                # output one line per region primitive instead of combining
                # regions via blocks
                if gene.coding:
                    for entry in gene.bedFormat(region="5utr"):
                        utr5File.write(entry + "\n")
                    for entry in gene.bedFormat(region="5utr_start"):
                        utr5StartFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="cds"):
                        cdsFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="3utr"):
                        utr3File.write(entry + "\n")
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        codingExonFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        codingIntronFile.write(entry + "\n")
                else:
                    # noncoding transcripts just have exons and introns
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        noncodingExonFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        noncodingIntronFile.write(entry + "\n")

        if args.ucsc:
            with open(args.input, "r") as genesFile:
                genesRead = 0
                for line in genesFile:
                    # all of the knowngenes parsing and metadata construction is
                    # done inside UCSCKnownGene.py, especially the createGene method
                    gene = createUCSCTranscript(line)
                    genesRead += 1
                    writeOutput(gene)
                    if not genesRead % 2500:
                        print "Processed %d entries..." % genesRead

        elif args.gtf:
            # first parse the entire file into a dictionary of lists; the issue
            # here is that lines for various transcripts may be interleaved, so
            # can either create lots of SNFGene objects, or a giant dict.
            # opted for giant dict.
            txDict = defaultdict(list)
            genesRead = 0
            print "Building GTF dictionary..."
            for line in GTF.lines(args.input):
                txDict[line["transcript_id"]].append(line)
                genesRead += 1
                if not genesRead % 100000:
                    print "Processed %d lines..." % genesRead
            print "Dictionary built."
            # now create a SNFGene object for each transcript and output it
            genesRead = 0
            for key in txDict:
                tx = createGTFTranscript(txDict[key])
                writeOutput(tx)
                genesRead += 1
                if not genesRead % 2500:
                    print "Processed %d entries..." % genesRead

            print "Processed %d entries." % genesRead

    # BTD Edit: making unique regions and linking to gene name
    # --------------------------------------------------------
    # Output files created above:
    #   utr5FName = args.output + "_5utr.bed"
    #   utr5StartFName = args.output + "_5utr_start.bed"
    #   cdsFName = args.output + "_cds.bed"
    #   utr3FName = args.output + "_3utr.bed"
    #   exonFName = args.output + "_exons.bed"
    #   intronFName = args.output + "_introns.bed"
    #   codingExonFName = args.output + "_codingexons.bed"
    #   codingIntronFName = args.output + "_codingintrons.bed"
    #     note that these are introns from coding genes, not necessarily
    #     introns that make it to mRNA
    #   noncodingExonFName = args.output + "_noncodingexons.bed"
    #   noncodingIntronFName = args.output + "_noncodingintrons.bed"

    # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
    print "Getting gene ID"
    idToName = {}
    if args.ucsc:
        with open(args.input, 'r') as knownGeneFile:
            reader = csv.reader(knownGeneFile, 'textdialect')
            for row in reader:
                idToName[row[0]] = row[-1]

    # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
    print "Getting unique UTRs"

    def getUniqUTR(uniqFN, utrFN):
        with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
            already = set()
            reader = csv.reader(utr, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                if tuple(row[6:]) in already:
                    continue  # repeat
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    row[3] = id + '__' + geneName
                else:
                    row[3] = id
                already.add(tuple(row[6:]))
                writer.writerow(row)

    uniq5UTR = args.output + "_uniq_5utr.bed"
    getUniqUTR(uniq5UTR, utr5FName)

    uniq3UTR = args.output + '_uniq_3utr.bed'
    getUniqUTR(uniq3UTR, utr3FName)

    uniq5SUTR = args.output + '_uniq_5utr_start.bed'
    getUniqUTR(uniq5SUTR, utr5StartFName)

    # 3. Get unique exons + numbering. Do it 3x for all, coding, and noncoding
    print "Getting unique exons"

    def getUniqExons(uniqFN, exonFN):
        with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
            already = set()
            reader = csv.reader(exons, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate exon starts and lengths
                exonLengths = row[10].split(',')
                if exonLengths[-1] == '':
                    exonLengths = exonLengths[:-1]
                exonLengths = [int(x) for x in exonLengths]
                exonStarts = row[11].split(',')
                if exonStarts[-1] == '':
                    exonStarts = exonStarts[:-1]
                exonStarts = [int(x) for x in exonStarts]

                # calculate exons
                exonList = []
                for i in range(len(exonStarts)):
                    absStart = start + exonStarts[i]
                    exonList.append([absStart, absStart + exonLengths[i]])
                if strand == '-':
                    exonList = exonList[::-1]  # flip exon order

                # making BED6
                for i in range(len(exonList)):
                    exonNum = i + 1
                    exonNumInfo = str(exonNum) + 'of' + str(len(exonList))
                    exon = exonList[i]
                    outputRow = [chrom, exon[0], exon[1]]

                    # unique
                    if tuple(outputRow) in already:
                        continue
                    already.add(tuple(outputRow))
                    outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
                    writer.writerow(outputRow)

    uniqExons = args.output + '_uniq_exons.bed'
    getUniqExons(uniqExons, exonFName)

    uniqExons = args.output + '_uniq_codingexons.bed'
    getUniqExons(uniqExons, codingExonFName)

    uniqExons = args.output + '_uniq_noncodingexons.bed'
    getUniqExons(uniqExons, noncodingExonFName)

    # 4. Get unique introns + numbering, plus unique 5'SS and 3'SS.
    # 5'SS is the first base of the intron, 3'SS is the last base of the intron
    print "Getting unique introns and 5' and 3' SS"

    def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
        with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
                open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
            alreadyIntron = set()
            already5 = set()
            already3 = set()
            reader = csv.reader(introns, 'textdialect')
            intronWriter = csv.writer(uniqIntron, 'textdialect')
            fiveWriter = csv.writer(uniq5, 'textdialect')
            threeWriter = csv.writer(uniq3, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate intron starts and lengths
                intronLengths = row[10].split(',')
                if intronLengths[-1] == '':
                    intronLengths = intronLengths[:-1]
                intronLengths = [int(x) for x in intronLengths]
                intronStarts = row[11].split(',')
                if intronStarts[-1] == '':
                    intronStarts = intronStarts[:-1]
                intronStarts = [int(x) for x in intronStarts]

                # calculate introns
                intronList = []
                for i in range(len(intronStarts)):
                    absStart = start + intronStarts[i]
                    intronList.append([absStart, absStart + intronLengths[i]])
                if strand == '-':
                    intronList = intronList[::-1]  # flip intron order

                # making BED6
                for i in range(len(intronList)):
                    intronNum = i + 1
                    intronNumInfo = str(intronNum) + 'of' + str(len(intronList))
                    intron = intronList[i]
                    outputRow = [chrom, intron[0], intron[1]]

                    # unique introns
                    if tuple(outputRow) in alreadyIntron:
                        continue
                    alreadyIntron.add(tuple(outputRow))
                    outputRow.extend([geneIDInfo + '__intron__' + intronNumInfo, 0, strand])
                    intronWriter.writerow(outputRow)

                    # unique splice sites
                    if strand == '+':
                        fiveSS = [chrom, intron[0], intron[0] + 1]
                        threeSS = [chrom, intron[1] - 1, intron[1]]
                    else:
                        threeSS = [chrom, intron[0], intron[0] + 1]
                        fiveSS = [chrom, intron[1] - 1, intron[1]]

                    if tuple(fiveSS) not in already5:
                        already5.add(tuple(fiveSS))
                        fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand])
                        fiveWriter.writerow(fiveSS)
                    if tuple(threeSS) not in already3:
                        already3.add(tuple(threeSS))
                        threeSS.extend([geneIDInfo + '__3ss__' + intronNumInfo, 0, strand])
                        threeWriter.writerow(threeSS)

    uniqIntrons = args.output + '_uniq_introns.bed'
    uniq5 = args.output + '_uniq_5ss.bed'
    uniq3 = args.output + '_uniq_3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

    uniqIntrons = args.output + '_uniq_codingintrons.bed'
    uniq5 = args.output + '_uniq_coding5ss.bed'
    uniq3 = args.output + '_uniq_coding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

    uniqIntrons = args.output + '_uniq_noncodingintrons.bed'
    uniq5 = args.output + '_uniq_noncoding5ss.bed'
    uniq3 = args.output + '_uniq_noncoding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

    # 5. unique cdsStart, cdsEnd
    print "Getting unique cdsStart and cdsEnd"

    def getUniqCDSStartEnd(startFN, endFN, cdsFN):
        with open(startFN, 'w') as uniqStart, open(endFN, 'w') as uniqEnd, \
                open(cdsFN, 'r') as cds:
            alreadyStart = set()
            alreadyEnd = set()
            reader = csv.reader(cds, 'textdialect')
            startWriter = csv.writer(uniqStart, 'textdialect')
            endWriter = csv.writer(uniqEnd, 'textdialect')
            for row in reader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])
                if strand == '+':
                    startRow = [chrom, start, start + 1]
                    endRow = [chrom, end - 1, end]
                else:
                    startRow = [chrom, end - 1, end]
                    endRow = [chrom, start, start + 1]

                if tuple(startRow) not in alreadyStart:
                    alreadyStart.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    startWriter.writerow(startRow)
                if tuple(endRow) not in alreadyEnd:
                    alreadyEnd.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    endWriter.writerow(endRow)

    uniqCDSStart = args.output + '_uniq_cdsStart.bed'
    uniqCDSEnd = args.output + '_uniq_cdsEnd.bed'
    getUniqCDSStartEnd(uniqCDSStart, uniqCDSEnd, cdsFName)

    # 6. Unique TSS, TES
    print "Getting unique TSS and TES"

    def getUniqTSSAndTES(tssFN, tesFN, fiveFN, threeFN):
        with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, \
                open(fiveFN, 'r') as fiveUTR, open(threeFN, 'r') as threeUTR:
            alreadyTSS = set()
            fiveReader = csv.reader(fiveUTR, 'textdialect')
            tssWriter = csv.writer(uniqTSS, 'textdialect')
            for row in fiveReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])
                if strand == '+':
                    startRow = [chrom, start, start + 1]
                else:
                    startRow = [chrom, end - 1, end]
                if tuple(startRow) not in alreadyTSS:
                    alreadyTSS.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    tssWriter.writerow(startRow)

            alreadyTES = set()
            threeReader = csv.reader(threeUTR, 'textdialect')
            tesWriter = csv.writer(uniqTES, 'textdialect')
            for row in threeReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except KeyError:
                    geneName = id
                if geneName != id:
                    geneIDInfo = id + '__' + geneName
                else:
                    geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])
                if strand == '-':
                    endRow = [chrom, start, start + 1]
                else:
                    endRow = [chrom, end - 1, end]
                if tuple(endRow) not in alreadyTES:
                    alreadyTES.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    tesWriter.writerow(endRow)

    uniqTSS = args.output + '_uniq_tss.bed'
    uniqTES = args.output + '_uniq_tes.bed'
    getUniqTSSAndTES(uniqTSS, uniqTES, utr5FName, utr3FName)

    # sort everything
    print "Sorting BED files"
    for fn in glob.glob("*.bed"):
        os.system("sort -k1,1 -k2,2n %s -o %s" % (fn, fn))