def main(gencode="/cs/zbio/jrosensk/ccle_fastq/hg19_reference/hg19.ensGene.gtf",
         gene_lengths="coding_lengths.hg19.tsv"):
    """Write a per-gene length table computed from the exon records of a GTF.

    Args:
        gencode: Path of the GTF annotation to read. Defaults to the hg19
            Ensembl gene annotation this script was written for.
        gene_lengths: Path of the tab-separated output file.

    The output has two columns, ``Ensembl_gene_identifier`` (the ``gene_id``
    with everything from the first "." onwards removed) and ``length`` (the
    value ``count_bp`` returns for that gene's exon rows; ``count_bp`` is
    defined elsewhere in this file).
    """
    with log("Reading the Gencode annotation file: {}".format(gencode)):
        gc = GTF.dataframe(gencode)

    # Keep every exon record; no biotype filter is applied here.
    # .loc + .copy() gives an independent frame, so the column assignments
    # below never write into a view of ``gc``.
    is_exon = gc.feature == 'exon'
    exon = gc.loc[
        is_exon, ['seqname', 'start', 'end', 'gene_id', 'gene_name']].copy()

    # Coordinates may be read as strings; make them integers before sorting.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # One group per gene identifier (version suffix still attached).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    # Pair each length with the version-less gene identifier.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)

    m1 = ldf[['Ensembl_gene_identifier', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(gene_lengths)):
        m1.to_csv(gene_lengths, sep="\t", index=False)
def main(args):
    """Convert a GTF file into a simple tab-separated BED-like file.

    Args:
        args: Mapping with the keys ``'gtf'`` (input GTF path) and ``'out'``
            (output path).

    Each output line holds: seqname, start, end, ``gene_id___gene_name``,
    gene_biotype, strand. Coordinates are written exactly as they appear in
    the frame returned by ``GTF.dataframe``.
    """
    p = GTF.dataframe(args['gtf'])  # GTF.dataframe returns a pandas DataFrame

    columns = ('seqname', 'start', 'end', 'gene_id', 'gene_name',
               'gene_biotype', 'strand')
    with open(args['out'], 'w') as wo:
        # Walk the columns in lockstep by position. This avoids one label
        # lookup per cell and does not depend on the frame having a default
        # 0..n-1 index.
        for record in zip(*(p[name] for name in columns)):
            wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\n".format(*record))
def main(args):
    """Compute per-gene exonic lengths and attach Entrez GeneIDs.

    Args:
        args: Mapping with the keys ``'-g'`` (Gencode GTF path), ``'-n'``
            (gzip-compressed, header-less NCBI gene-to-Ensembl table) and
            ``'-o'`` (output path; a falsy value selects
            ``"ncbi_ensembl_coding_lengths.txt.gz"``).

    The output is a gzip-compressed, tab-separated table with the columns
    ``Ensembl_gene_identifier``, ``GeneID`` and ``length``.
    """
    # Input files.
    GENCODE = args['-g']
    NCBI_ENSEMBL = args['-n']

    # Output file.
    GENE_LENGTHS = args['-o'] or "ncbi_ensembl_coding_lengths.txt.gz"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)

    # Select just exons of protein coding transcripts, and the columns we use.
    # (.ix has been removed from pandas; .loc is the label-based equivalent.)
    idx = (gc.feature == 'exon') & (gc.transcript_type == 'protein_coding')
    exon = gc.loc[
        idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name']].copy()

    # Convert columns to proper types.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)

    # Sort in place (DataFrame.sort has been removed; sort_values replaces it).
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    with log("Reading NCBI mapping of Entrez GeneID "
             "to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)):
        g2e = pd.read_table(NCBI_ENSEMBL,
                            compression="gzip",
                            header=None,
                            names=[
                                'tax_id', 'GeneID', 'Ensembl_gene_identifier',
                                'RNA_nucleotide_accession.version',
                                'Ensembl_rna_identifier',
                                'protein_accession.version',
                                'Ensembl_protein_identifier'
                            ])

    # Create a new DataFrame with gene lengths and version-less Ensembl IDs.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)

    # Merge so we have the Entrez GeneID next to each length.
    m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = m1[['Ensembl_gene_identifier', 'GeneID', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        # Let pandas do the gzip compression: handing to_csv a binary gzip
        # handle fails on Python 3 with older pandas because it writes text.
        m1.to_csv(GENE_LENGTHS, sep="\t", index=False, compression="gzip")
def main(args):
    """Tag a wig table with a CpG column and write per-ID exon lengths.

    Args:
        args: Namespace providing ``fastaFile``, ``wigFile``, ``outWigFile``,
            ``gffFile`` and ``outFile``.

    Steps:
        1. Parse the FASTA file into a dict keyed by record id and print it.
        2. Read the space-separated wig table, add a constant ``CpG`` column
           and append the result to ``outWigFile``.
        3. Read the GFF, keep exon rows, strip the ``-mRNA...`` suffix from
           their ``ID`` and write ``count_bp`` of each ID group to
           ``outFile`` (tab-separated).
    """
    # Parse the FASTA once, up front, so an unreadable file is reported before
    # anything is appended to the wig output. The handle is closed by `with`.
    seqStr = dict()
    with log("Reading the fasta file: {}".format(args.fastaFile)):
        with open(args.fastaFile) as seqHandle:
            for record in SeqIO.parse(seqHandle, "fasta"):
                seqStr[record.id] = record.seq
    print(seqStr)

    with log("Reading the wig file: {}".format(args.wigFile)):
        # DataFrame.from_csv has been removed from pandas; read_csv with the
        # same options is the replacement.
        wig = pd.read_csv(args.wigFile, header=0, sep=" ", index_col=None)

    # A scalar is broadcast to every row. This avoids computing a row count
    # from wig.size / 2, which is a float on Python 3.
    wig['CpG'] = "CpG"
    wig.to_csv(args.outWigFile, header=True, index=None, sep=' ', mode='a')

    with log("Reading the Gencode annotation file: {}".format(args.gffFile)):
        gc = GTF.dataframe(args.gffFile)

    # Select the exon rows and the columns we use.
    idx = (gc.feature == 'exon')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'ID', 'Parent']].copy()
    exon['ID'] = exon['ID'].map(lambda x: re.sub(r'-mRNA.*', '', x))

    # Convert columns to proper types.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the suffix-stripped ID.
    groups = exon.groupby('ID')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)
    print(type(lengths))

    with log("Writing output file: {}".format(args.outFile)):
        lengths.to_csv(args.outFile, sep="\t", encoding="utf-8", index=True)
def main(GENCODE):
    """Export lincRNA transcripts and lincRNA genes from a GTF as BED files.

    Args:
        GENCODE: Path of the Gencode GTF annotation.

    Writes two header-less, tab-separated files into the working directory:
    ``lincRNA.bed`` (rows whose feature is "transcript" and whose
    ``transcript_type`` is "lincRNA") and ``lincRNA_genes.bed`` (rows whose
    feature is "gene" and whose ``gene_type`` is "lincRNA"). Both carry the
    columns seqname, start, end, gene_id, gene_name, strand, sorted by
    seqname, start, end.
    """
    gc = GTF.dataframe(GENCODE)

    # Drop the ".<digits>" version suffix from the gene identifiers.
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    columns = ["seqname", "start", "end", "gene_id", "gene_name", "strand"]
    # (feature value, column holding the biotype, output file)
    targets = (
        ("transcript", "transcript_type", "lincRNA.bed"),
        ("gene", "gene_type", "lincRNA_genes.bed"),
    )

    for feature, type_column, bed_path in targets:
        idx = (gc.feature == feature) & (gc[type_column] == "lincRNA")
        # .ix has been removed from pandas; .loc is the label-based
        # equivalent. The copy keeps the assignments below off ``gc``.
        lincRNA = gc.loc[idx, columns].copy()
        lincRNA["start"] = lincRNA["start"].astype(int)
        lincRNA["end"] = lincRNA["end"].astype(int)
        lincRNA.sort_values(by=["seqname", "start", "end"], inplace=True)
        lincRNA.to_csv(bed_path, sep="\t", header=False, index=False)
def main(GENCODE):
    """Export lincRNA transcripts and lincRNA genes from a GTF as BED files.

    Args:
        GENCODE: Path of the Gencode GTF annotation.

    Writes ``lincRNA.bed`` (transcript rows with ``transcript_type`` equal to
    'lincRNA') and ``lincRNA_genes.bed`` (gene rows with ``gene_type`` equal
    to 'lincRNA'). Both are tab-separated without header or index and hold
    seqname, start, end, gene_id, gene_name, strand, sorted by position.
    """
    gc = GTF.dataframe(GENCODE)

    # Strip the '.<digits>' version suffix from the gene identifiers.
    gc.gene_id = gc.gene_id.replace(to_replace=r'\.[0-9]+', value='', regex=True)

    def write_bed(idx, bed_path):
        """Write the rows selected by the boolean mask ``idx`` to ``bed_path``."""
        # .ix has been removed from pandas; .loc is the label-based
        # equivalent. The copy keeps the assignments below off ``gc``.
        lincRNA = gc.loc[
            idx,
            ['seqname', 'start', 'end', 'gene_id', 'gene_name', 'strand']].copy()
        lincRNA['start'] = lincRNA['start'].astype(int)
        lincRNA['end'] = lincRNA['end'].astype(int)
        lincRNA.sort_values(by=['seqname', 'start', 'end'], inplace=True)
        lincRNA.to_csv(bed_path, sep='\t', header=False, index=False)

    write_bed((gc.feature == 'transcript') & (gc.transcript_type == 'lincRNA'),
              'lincRNA.bed')
    write_bed((gc.feature == 'gene') & (gc.gene_type == 'lincRNA'),
              'lincRNA_genes.bed')
*_toTranscriptome_cov.txt Output file(s): *_toTranscriptome_cov.average.txt Tools: GTF.py to load GTF files Process line-by-line with Python """ import glob import os import sys sys.path.insert(0, '/12TBLVM/Data/MinhTri/6_SCRIPTS') import GTF print('Loading gtf file.') (GeneDict, TransScDict) = GTF.dataframe("/12TBLVM/Data/hg19-2/GENCODE/gencode.v22.annotation.gtf") print('Listing files to be processed:') cases = set() for file in glob.glob('*_toTranscriptome_cov.txt'): cases.add(file.split('.')[0]) print('\t', file) for entry in cases: summary = {} # Read file original_file = open(entry + '.txt', 'r') print('Reading input file:', original_file.name) total_cov = 0 counter = 0
for val in it: # Report the *previous* value (more to come). yield last, True last = val # Report the last value. yield last, False def processing_count(filenames): count_lowlevel_in_hightlevel(filenames, 'transcript', 'gene') count_lowlevel_in_hightlevel(filenames, 'exon', 'transcript') if __name__ == '__main__': # Below is a demo for using function lookahead(): # for i, has_more in lookahead(range(3)): # print(i, has_more) whole_gtf = GTF.dataframe(sys.argv[1]) processing_count(sys.argv[1]) whole_gtf['length'] = whole_gtf['end'].astype( 'int') - whole_gtf['start'].astype('int') + 1 whole_gtf = whole_gtf.loc[:, ['gene_biotype', 'feature', 'length']] whole_gtf.to_csv("whole_gtf", sep='\t', index=False) # Below is a example for using ggplot package in Python: # p = ggplot(aes(x='length'), data=a) + geom_histogram() + facet_grid(x='gene_biotype', y='feature') \ # + xlim(0,50000) + scale_y_log(10) + ylim(1, 1e3) # ggplot.save(p, "f**k.tiff", width=55, height=50, dpi=300) biotype_count_as_features = whole_gtf.groupby(['gene_biotype', 'feature']).size() biotype_count_as_features.to_csv("biotype_count_as_features", sep='\t')
# Command-line driver: compute promoter regions for a list of genes and write
# them to a temporary BED file.
# Six positional arguments are required; fail with a usage line instead of a
# bare IndexError when any is missing.
if len(sys.argv) < 7:
    sys.exit("usage: {} <gtf> <promoter_length> <gene_list> <output> "
             "<per_transcript|per_gene> <genome_fasta>".format(sys.argv[0]))

param_1 = sys.argv[1]  # gtf file
param_2 = sys.argv[2]  # promoter region length
param_3 = sys.argv[3]  # list gene to calculate
param_4 = sys.argv[4]  # output file
param_5 = sys.argv[5]  # per_transcript or per_gene
param_6 = sys.argv[6]  # human genome reference fasta

promoter_length = int(param_2)
file_gene_filter = param_3
file_output = param_4

# print(...) with a single string argument prints the same text on Python 2
# and is valid syntax on Python 3.
print("Read GTF file into memory")
result = GTF.dataframe(param_1)

print("Read gene list")
gene_filter_list = ReadFilterGene(file_gene_filter)

print("Calculate promoter region to bed format")
if param_5 == "per_gene":
    promoter = CalculateAllPromoterRegions(result, promoter_length, gene_filter_list)
    # Only the per-gene mode is passed through filterOverlaps.
    promoter = filterOverlaps(promoter)
else:
    promoter = CalculateAllPromoterRegions2(result, promoter_length, gene_filter_list)

print("Writing temporary bed file")
temp_file = file_output + ".tmp.txt"
printToFile(promoter, temp_file)
def main(args):
    """Project reference genes onto other species through per-gene MAF files.

    Args:
        args: Namespace providing ``compGffFile``, ``refGffFile``,
            ``mafPath``, ``compSpecies`` and ``refSpecies``.

    For every reference gene (``Name`` of a 'gene' row in ``refGffFile``) that
    is also a key of the comparison annotation and has a
    ``<mafPath>/<gene>.maf`` file:

    * each aligned component with a truthy ``block_pid`` is written to
      ``<mafPath>/<gene>.<species>.bed``;
    * the start/end extent of those components is tracked per species and
      chromosome;
    * the extent found for ``args.compSpecies`` is compared with the annotated
      gene coordinates. "Matched" lines go to stdout; "unMatched" and
      "Error Chrom" lines are reported through ``eprint``.
    """
    with log("Reading compare Gencode annotation file: {}".format(args.compGffFile)):
        gc = GTF.dictionary(args.compGffFile, "ID")
    compGeneInfo = gc['gene']

    with log("Reading reference Gencode annotation file: {}".format(args.refGffFile)):
        gc = GTF.dataframe(args.refGffFile)

    # Select the gene rows and the columns we use.
    # (.ix has been removed from pandas; .loc is the label-based equivalent.)
    idx = (gc.feature == 'gene')
    gene = gc.loc[idx, ['seqname', 'start', 'end', 'Name']].copy()

    # Convert columns to proper types.
    gene['start'] = gene['start'].astype(int)
    gene['end'] = gene['end'].astype(int)

    for geneID in gene['Name']:
        # Only genes annotated in both species can be compared.
        if geneID not in compGeneInfo:
            continue
        mafFile = args.mafPath + "/" + geneID + ".maf"
        if not os.path.exists(mafFile):
            continue

        out_files = dict()   # species -> open BED handle
        geneCoords = dict()  # species -> {chromosome or 'refInfo' -> extent}
        with log("Reading the Maf file: {}".format(mafFile)):
            try:
                with open(mafFile) as maf:
                    for block in bx.align.maf.Reader(maf):
                        ref_comp = block.components[0]
                        refSpecies, refChrom = ref_comp.src.split('.')[:2]
                        if refSpecies not in geneCoords:
                            # First block for this reference: seed its extent.
                            geneCoords[refSpecies] = nested_dict(2, str)
                            geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
                            geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
                            geneCoords[refSpecies]['refInfo']['chr'] = refChrom

                        for comp in block.components[1:]:
                            comp_species, compChrom = comp.src.split('.')[:2]
                            if comp_species not in geneCoords:
                                geneCoords[comp_species] = nested_dict(2, str)
                            if compChrom not in geneCoords[comp_species]:
                                # First sighting of this chromosome: seed its
                                # extent from the current component.
                                geneCoords[comp_species][compChrom]['start'] = comp.start
                                geneCoords[comp_species][compChrom]['end'] = int(comp.end)

                            if comp_species not in out_files:
                                bedfile = "%s/%s.%s.bed" % (args.mafPath, geneID, comp_species)
                                out_files[comp_species] = open(bedfile, "w")

                            pid = block_pid(ref_comp, comp)
                            if not pid:
                                continue

                            # Widen the tracked extents to cover this block.
                            refInfo = geneCoords[refSpecies]['refInfo']
                            if refInfo['start'] > ref_comp.forward_strand_start:
                                refInfo['start'] = ref_comp.forward_strand_start
                            if refInfo['end'] < ref_comp.forward_strand_end:
                                refInfo['end'] = ref_comp.forward_strand_end

                            compInfo = geneCoords[comp_species][compChrom]
                            if compInfo['start'] > comp.start:
                                compInfo['start'] = comp.start
                            if compInfo['end'] <= int(comp.end):
                                compInfo['end'] = int(comp.end)

                            out_files[comp_species].write(
                                "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" % (
                                    refChrom, ref_comp.forward_strand_start,
                                    ref_comp.forward_strand_end,
                                    compChrom, comp.start, comp.end,
                                    comp.strand, pid))
            finally:
                # Close every BED handle even if reading the MAF failed midway.
                for f in out_files.values():
                    f.close()

        if args.compSpecies not in geneCoords:
            continue

        for chrom in geneCoords[args.compSpecies]:
            if chrom not in compGeneInfo[geneID]:
                eprint("Error Chrom\t%s\t%s\t%s\tmapped: %s - %s" %
                       (geneID, args.compSpecies, chrom,
                        geneCoords[args.compSpecies][chrom]['start'],
                        geneCoords[args.compSpecies][chrom]['end']))
                continue

            annoStart = int(compGeneInfo[geneID][chrom]['start'])
            annoEnd = int(compGeneInfo[geneID][chrom]['end'])
            compStart = int(geneCoords[args.compSpecies][chrom]['start'])
            compEnd = int(geneCoords[args.compSpecies][chrom]['end'])

            # Overlap test between the mapped extent and the annotation.
            # NOTE(review): for annoStart <= annoEnd the first clause is
            # implied by the second; kept verbatim to preserve behaviour.
            if compEnd > annoEnd and compStart < annoEnd or compEnd > annoStart and compStart < annoEnd:
                print("Matched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d\t%s\t%s\t%s: %s - %s" %
                      (geneID, args.compSpecies, chrom, annoStart, annoEnd,
                       compStart, compEnd, compGeneInfo[geneID][chrom]['ID'],
                       args.refSpecies,
                       geneCoords[args.refSpecies]['refInfo']['chr'],
                       geneCoords[args.refSpecies]['refInfo']['start'],
                       geneCoords[args.refSpecies]['refInfo']['end']))
            else:
                eprint("unMatched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d" %
                       (geneID, args.compSpecies, chrom, annoStart, annoEnd,
                        compStart, compEnd))
def main(args):
    """Convert a GTF file into a wide, tab-separated BED-like table.

    Args:
        args: Mapping with the keys ``'gtf'`` (input GTF path) and ``'out'``
            (output path).

    Raises:
        ValueError: If any of the eight mandatory GTF columns is missing.

    Exits with status 1 if the GTF cannot be read or an unexpected error
    occurs while writing, and with status 2 on an IOError or ValueError while
    writing. When expected attribute columns are absent, the GTF is copied to
    ``<gtf>upd.gtf``, each missing field is added through
    ``add_field_to_GTF`` and the copy is re-read. In the output, ``start`` is
    the GTF start minus one, and gene_id and gene_name are joined by "___"
    into one column.
    """
    import sys  # local import: sys.exit does not depend on the `site` module

    # --------------------------------------------------------------------------
    # READING GTF with GTF.py
    # --------------------------------------------------------------------------
    try:
        gtf_file = args['gtf']
        p = GTF.dataframe(gtf_file)  # GTF.dataframe returns a pandas DataFrame
    except Exception as e:
        logger.error("ERROR: in reading GTF\n; {}".format(e))
        sys.exit(1)

    # --------------------------------------------------------------------------
    # INIT VARIABLES
    # --------------------------------------------------------------------------
    _Expected_Eight_Columns = ['seqname', 'source', 'feature', 'start', 'end',
                               'score', 'strand', 'frame']
    _Expected_Other_Columns = [
        'gene_id', 'gene_name', 'gene_version', 'gene_source', 'gene_biotype',
        'transcript_id', 'transcript_version', 'transcript_name',
        'transcript_source', 'transcript_biotype', 'tag',
        'transcript_support_level', 'exon_number', 'exon_id', 'exon_version',
        'protein_id', 'protein_version'
    ]
    logger.debug(p.index)
    logger.debug(p.columns)
    logger.info("-"*50)
    logger.info("Printing the fields (aka columns) needed in GTF to make the BED:")
    logger.info(_Expected_Eight_Columns + _Expected_Other_Columns)
    logger.info("-"*50)

    # --------------------------------------------------------------------------
    # CHECK FIELDS
    # --------------------------------------------------------------------------
    present_columns = list(p.columns)
    missing_mandatory = [col for col in _Expected_Eight_Columns
                         if col not in present_columns]
    if missing_mandatory:
        raise ValueError("MISSING MANDATORY COLUMNS in GTF: {} ".format(
            ';'.join(str(col) for col in missing_mandatory)))
    logger.info("Mandatory Expected Fields Check out OK")
    logger.info("Testing if any other missing field exists or are extra or with different expected names ...")

    # Split the columns that are present into known and unknown ones.
    known_fields = set(_Expected_Other_Columns) | set(_Expected_Eight_Columns)
    unexpected_fields_list = [f for f in present_columns if f not in known_fields]
    expected_fields_list = [f for f in present_columns if f in known_fields]
    logger.warning("The Following Fields are NOT added to BED; Check if they should be used and if so, check if they might be named differently: {} ".format(unexpected_fields_list))

    # --------------------------------------------------------------------------
    # UPDATE GTF IF NEEDED
    # --------------------------------------------------------------------------
    missing_fields = [field
                      for field in _Expected_Other_Columns + _Expected_Eight_Columns
                      if field not in expected_fields_list]
    if missing_fields:
        # Work on a copy so the caller's GTF is never modified.
        shutil.copy(gtf_file, gtf_file+"upd.gtf")
        gtf_file = gtf_file+"upd.gtf"
        for field in missing_fields:
            add_field_to_GTF(field, gtf_file)
        logger.info("processing GTF2BED for << {} >> updated GTF...".format(gtf_file))
        p = GTF.dataframe(gtf_file)

    # --------------------------------------------------------------------------
    # writing to output file
    # --------------------------------------------------------------------------
    # Columns written after the leading seqname/start/end/gene_id___gene_name.
    # The header and the data rows are both derived from this one list so the
    # two cannot drift apart. (The previous hand-written header carried a
    # stray leading space in 'transcript_source'.)
    trailing_columns = [
        'score', 'strand', 'frame', 'gene_version', 'gene_source',
        'gene_biotype', 'transcript_id', 'source', 'feature',
        'transcript_version', 'transcript_name', 'transcript_source',
        'transcript_biotype', 'tag', 'transcript_support_level',
        'exon_number', 'exon_id', 'exon_version', 'protein_id',
        'protein_version'
    ]
    leading_columns = ['seqname', 'start', 'end', 'gene_id', 'gene_name']
    header = ['##seqname', 'start', 'end', 'gene_id__gene_name'] + trailing_columns

    with open(args['out'], 'w') as wo:
        logger.info("writing out BED file ... {}".format(args['out']))
        # writing HEADER line to output file
        wo.write("\t".join(header) + "\n")
        # writing VALUE lines to output file
        try:
            series = [p[name] for name in leading_columns + trailing_columns]
            # Walk all columns in lockstep by position instead of doing one
            # label lookup per cell.
            for record in zip(*series):
                seqname, start, end, gene_id, gene_name = record[:5]
                # BED starts are 0-based, so shift the GTF start down by one.
                fields = [seqname, int(start) - 1, end,
                          "{}___{}".format(gene_id, gene_name)]
                fields.extend(record[5:])
                wo.write("\t".join("{}".format(value) for value in fields) + "\n")
        except (IOError, ValueError) as err:
            logger.error("ERROR: in Writing data\n; {}".format(err))
            sys.exit(2)
        except Exception as err:
            logger.error("ERROR: in Writing data\n; {}".format(err))
            sys.exit(1)