Пример #1
0
def main(gencode="/cs/zbio/jrosensk/ccle_fastq/hg19_reference/hg19.ensGene.gtf",
         gene_lengths="coding_lengths.hg19.tsv"):
    """Write a per-gene exonic length table computed from a GTF annotation.

    Every ``exon`` record of the annotation is used (no biotype filter is
    applied), grouped by ``gene_id`` and reduced with ``count_bp``.

    Args:
        gencode: Path of the GTF annotation to read. Defaults to the hg19
            Ensembl annotation this script was originally hard-wired to.
        gene_lengths: Path of the tab-separated output file.

    The output has two columns, ``Ensembl_gene_identifier`` (the ``gene_id``
    with everything after the first ``.`` removed) and ``length``.
    """
    with log("Reading the Gencode annotation file: {}".format(gencode)):
        gc = GTF.dataframe(gencode)

    # Keep only exon rows and the columns needed downstream.  ``.copy()``
    # gives an independent frame so the dtype conversions below do not write
    # into a view of ``gc``.
    columns = ['seqname', 'start', 'end', 'gene_id', 'gene_name']
    exon = gc.loc[gc.feature == 'exon', columns].copy()

    # Coordinates must be integers before sorting and length computation.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)
    exon = exon.sort_values(['seqname', 'start', 'end'])

    with log("Calculating coding region (exonic) length for each gene..."):
        # count_bp is defined elsewhere; presumably it returns the number of
        # bases covered by one gene's exons -- confirm against its definition.
        lengths = exon.groupby('gene_id').apply(count_bp)

    # Pair each length with its gene identifier stripped of the version part.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)
    m1 = ldf[['Ensembl_gene_identifier', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(gene_lengths)):
        m1.to_csv(gene_lengths, sep="\t", index=False)
Пример #2
0
def main(args):
    """Dump selected GTF columns to a tab-separated, BED-like text file.

    Args:
        args: Mapping with ``'gtf'`` (input annotation path) and ``'out'``
            (output file path).

    Each output line holds ``seqname``, ``start``, ``end``,
    ``gene_id___gene_name``, ``gene_biotype`` and ``strand``.
    """
    # GTF.dataframe returns a pandas DataFrame.
    p = GTF.dataframe(args['gtf'])
    columns = ('seqname', 'start', 'end', 'gene_id', 'gene_name',
               'gene_biotype', 'strand')
    with open(args['out'], 'w') as wo:
        # Walk the columns in lockstep instead of doing a label lookup per
        # cell: faster, and independent of how the frame is indexed.
        for row in zip(*(p[name] for name in columns)):
            wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\n".format(*row))
Пример #3
0
def main(args):
    """Compute protein-coding exonic length per gene and attach Entrez GeneIDs.

    Args:
        args: Mapping of command-line options. ``'-g'`` is the GTF annotation
            path, ``'-n'`` the gzip-compressed NCBI table mapping Entrez
            GeneID to Ensembl gene identifier, and ``'-o'`` the output path
            (``ncbi_ensembl_coding_lengths.txt.gz`` when empty).

    The gzip-compressed, tab-separated output has the columns
    ``Ensembl_gene_identifier``, ``GeneID`` and ``length``.
    """
    # Input files.
    GENCODE = args['-g']
    NCBI_ENSEMBL = args['-n']

    # Output file.
    GENE_LENGTHS = args['-o'] or "ncbi_ensembl_coding_lengths.txt.gz"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)

    # Select just exons of protein coding transcripts, and the columns we use.
    # ``.loc`` replaces the removed ``.ix`` indexer; ``.copy()`` makes the
    # conversions below operate on an independent frame.
    idx = (gc.feature == 'exon') & (gc.transcript_type == 'protein_coding')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name']].copy()

    # Convert coordinates to integers.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)

    # ``DataFrame.sort`` no longer exists; ``sort_values`` is its replacement.
    exon = exon.sort_values(['seqname', 'start', 'end'])

    # Group the rows by the Ensembl gene identifier (with version numbers.)
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        # count_bp is defined elsewhere; presumably it returns the number of
        # bases covered by one gene's exons -- confirm against its definition.
        lengths = groups.apply(count_bp)

    with log("Reading NCBI mapping of Entrez GeneID "
             "to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)):
        g2e = pd.read_table(NCBI_ENSEMBL,
                            compression="gzip",
                            header=None,
                            names=[
                                'tax_id', 'GeneID', 'Ensembl_gene_identifier',
                                'RNA_nucleotide_accession.version',
                                'Ensembl_rna_identifier',
                                'protein_accession.version',
                                'Ensembl_protein_identifier'
                            ])

    # Create a new DataFrame with gene lengths and version-less Ensembl IDs.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)

    # Merge so we have EntrezGeneID with length.
    m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = m1[['Ensembl_gene_identifier', 'GeneID', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        # Let pandas do the gzip compression: writing text through a handle
        # opened with gzip.open(..., "wb") fails on Python 3.
        m1.to_csv(GENE_LENGTHS, sep="\t", index=False, compression="gzip")
Пример #4
0
def main(args):
    """Tag a wig table, load FASTA sequences and write per-ID exon lengths.

    Args:
        args: Namespace providing ``wigFile``, ``outWigFile``, ``fastaFile``,
            ``gffFile`` and ``outFile``.

    Steps:
        1. Read the space-separated ``wigFile``, add a constant ``CpG``
           column and append the table to ``outWigFile``.
        2. Read ``fastaFile`` into a dict of record id -> sequence and print it.
        3. Read ``gffFile``, keep exon rows, strip ``-mRNA...`` from their
           ``ID`` and write ``count_bp`` per ID to ``outFile``.
    """
    with log("Reading the Gencode annotation file: {}".format(args.wigFile)):
        # DataFrame.from_csv was removed from pandas; read_csv is the
        # supported replacement.
        wig = pd.read_csv(args.wigFile, header=0, sep=" ", index_col=None)
        # A scalar is broadcast to every row, so no row count (and no
        # float-producing division) is needed.
        wig['CpG'] = "CpG"

    wig.to_csv(args.outWigFile, header=True, index=False, sep=' ', mode='a')

    seqStr = dict()
    with log("Reading the fasta file: {}".format(args.fastaFile)):
        # The FASTA file is parsed once, inside a context manager so the
        # handle is always closed ("rU" mode is gone in Python 3.11).
        with open(args.fastaFile) as seqHandle:
            for record in SeqIO.parse(seqHandle, "fasta"):
                seqStr[record.id] = record.seq
    print(seqStr)

    with log("Reading the Gencode annotation file: {}".format(args.gffFile)):
        gc = GTF.dataframe(args.gffFile)

    # Select just exon rows and the columns we use.  ``.loc`` replaces the
    # removed ``.ix`` indexer; ``.copy()`` decouples the result from ``gc``.
    idx = (gc.feature == 'exon')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'ID', 'Parent']].copy()
    exon['ID'] = exon['ID'].map(lambda x: re.sub(r'-mRNA.*', '', x))

    # Convert coordinates to integers.
    exon['start'] = exon['start'].astype(int)
    exon['end'] = exon['end'].astype(int)

    exon = exon.sort_values(['seqname', 'start', 'end'])

    # Group the rows by the trimmed ID.
    groups = exon.groupby('ID')

    with log("Calculating coding region (exonic) length for each gene..."):
        # count_bp is defined elsewhere; presumably it returns the number of
        # bases covered by one group's exons -- confirm against its definition.
        lengths = groups.apply(count_bp)

    print(type(lengths))
    with log("Writing output file: {}".format(args.outFile)):
        lengths.to_csv(args.outFile, sep="\t", encoding="utf-8", index=True)
Пример #5
0
def main(GENCODE):
    """Write lincRNA transcripts and lincRNA genes of a GTF as two BED-like files.

    Args:
        GENCODE: Path of the GTF annotation to read.

    Outputs (tab-separated, no header, sorted by seqname/start/end):
        ``lincRNA.bed``: rows with feature ``transcript`` whose
            ``transcript_type`` is ``lincRNA``.
        ``lincRNA_genes.bed``: rows with feature ``gene`` whose ``gene_type``
            is ``lincRNA``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every ".<digits>" run from the gene ids (drops version suffixes).
    gc["gene_id"] = gc["gene_id"].replace(to_replace=r"\.[0-9]+", value="", regex=True)

    columns = ["seqname", "start", "end", "gene_id", "gene_name", "strand"]
    # (feature value, column holding the biotype, output file)
    outputs = (
        ("transcript", "transcript_type", "lincRNA.bed"),
        ("gene", "gene_type", "lincRNA_genes.bed"),
    )
    for feature, type_column, path in outputs:
        idx = (gc["feature"] == feature) & (gc[type_column] == "lincRNA")
        # ``.loc`` replaces the removed ``.ix`` indexer; ``.copy()`` keeps the
        # conversions below from writing into a view of ``gc``.
        lincRNA = gc.loc[idx, columns].copy()
        lincRNA["start"] = lincRNA["start"].astype(int)
        lincRNA["end"] = lincRNA["end"].astype(int)
        lincRNA = lincRNA.sort_values(by=["seqname", "start", "end"])
        lincRNA.to_csv(path, sep="\t", header=False, index=False)
Пример #6
0
def main(GENCODE):
    """Export lincRNA transcript rows and lincRNA gene rows of a GTF file.

    Args:
        GENCODE: Path of the GTF annotation to read.

    Two tab-separated files without header are written, each sorted by
    seqname, start and end: ``lincRNA.bed`` (transcript rows whose
    ``transcript_type`` is ``lincRNA``) and ``lincRNA_genes.bed`` (gene rows
    whose ``gene_type`` is ``lincRNA``).
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every ".<digits>" run from the gene ids (drops version suffixes).
    gc['gene_id'] = gc['gene_id'].replace(to_replace=r'\.[0-9]+',
                                          value='',
                                          regex=True)

    def write_bed(mask, path):
        # Select, type-convert, sort and dump the rows flagged by ``mask``.
        # ``.loc`` replaces the removed ``.ix`` indexer.
        subset = gc.loc[
            mask, ['seqname', 'start', 'end', 'gene_id', 'gene_name', 'strand']
        ].copy()
        subset['start'] = subset['start'].astype(int)
        subset['end'] = subset['end'].astype(int)
        subset = subset.sort_values(by=['seqname', 'start', 'end'])
        subset.to_csv(path, sep='\t', header=False, index=False)

    write_bed((gc.feature == 'transcript') & (gc.transcript_type == 'lincRNA'),
              'lincRNA.bed')
    write_bed((gc.feature == 'gene') & (gc.gene_type == 'lincRNA'),
              'lincRNA_genes.bed')
Пример #7
0
	*_toTranscriptome_cov.txt
Output file(s): 
	*_toTranscriptome_cov.average.txt
Tools:
	GTF.py to load GTF files
	Process line-by-line with Python
"""

import glob
import os
import sys
sys.path.insert(0, '/12TBLVM/Data/MinhTri/6_SCRIPTS')
import GTF

# Load the annotation once. In this project GTF.dataframe returns a pair,
# unpacked here as (GeneDict, TransScDict).
print('Loading gtf file.')
(GeneDict, TransScDict) = GTF.dataframe("/12TBLVM/Data/hg19-2/GENCODE/gencode.v22.annotation.gtf")

# Collect the coverage files of the current directory. Only the part of each
# file name before its first '.' is kept; a set removes duplicates.
print('Listing files to be processed:')
cases = set()
for file in glob.glob('*_toTranscriptome_cov.txt'):
	cases.add(file.split('.')[0])
	print('\t', file)

# Process every case; the input name is rebuilt as "<case>.txt".
# NOTE(review): a file name with an extra '.' before ".txt" would be rebuilt
# incorrectly here -- confirm such names cannot occur.
for entry in cases:
	summary = {}
	
	# Read file
	original_file = open(entry + '.txt', 'r')
	print('Reading input file:', original_file.name)
	# Accumulators, reset for each case.
	total_cov = 0
	counter = 0
Пример #8
0
    for val in it:
        # Report the *previous* value (more to come).
        yield last, True
        last = val
    # Report the last value.
    yield last, False


def processing_count(filenames):
    """Run count_lowlevel_in_hightlevel for both feature pairs.

    The helper is called first for ('transcript', 'gene') and then for
    ('exon', 'transcript'), each time with ``filenames`` passed through
    unchanged.
    """
    level_pairs = (('transcript', 'gene'), ('exon', 'transcript'))
    for lower, higher in level_pairs:
        count_lowlevel_in_hightlevel(filenames, lower, higher)


if __name__ == '__main__':
    # Script entry point: sys.argv[1] is the GTF file to analyse.
    #
    # Demo of the lookahead() helper:
    # for i, has_more in lookahead(range(3)):
    #     print(i, has_more)
    whole_gtf = GTF.dataframe(sys.argv[1])
    processing_count(sys.argv[1])
    # Inclusive feature length: end - start + 1.
    whole_gtf['length'] = whole_gtf['end'].astype(
        'int') - whole_gtf['start'].astype('int') + 1
    # Keep only the three columns that are written out and grouped below.
    whole_gtf = whole_gtf.loc[:, ['gene_biotype', 'feature', 'length']]
    whole_gtf.to_csv("whole_gtf", sep='\t', index=False)
    # Example of plotting the lengths with the ggplot package in Python:
    # p = ggplot(aes(x='length'), data=a) + geom_histogram() + facet_grid(x='gene_biotype', y='feature') \
    # + xlim(0,50000) + scale_y_log(10) + ylim(1, 1e3)
    # ggplot.save(p, "plot.tiff", width=55, height=50, dpi=300)
    #
    # Number of rows per (gene_biotype, feature) combination.
    biotype_count_as_features = whole_gtf.groupby(['gene_biotype',
                                                   'feature']).size()
    biotype_count_as_features.to_csv("biotype_count_as_features", sep='\t')
Пример #9
0
# Positional command-line arguments.
param_1= sys.argv[1] # gtf file
param_2= sys.argv[2] # promoter region length
param_3= sys.argv[3] # list gene to calculate
param_4= sys.argv[4] # output file
param_5= sys.argv[5] # per_transcript or per_gene
param_6= sys.argv[6] # human genome reference fasta


promoter_length = int(param_2)
file_gene_filter = param_3
file_output = param_4


# print(...) with one string argument prints the same text under Python 2
# and is valid syntax under Python 3, unlike the bare print statement.
print("Read GTF file into memory")
result = GTF.dataframe(param_1)

print("Read gene list")
gene_filter_list = ReadFilterGene(file_gene_filter)

print("Calculate promoter region to bed format")
if param_5=="per_gene":
    # Per-gene mode additionally passes the regions through filterOverlaps.
    promoter = CalculateAllPromoterRegions(result,promoter_length,gene_filter_list)
    promoter = filterOverlaps(promoter)
else:
    # Any other value of param_5 selects the second calculation routine.
    promoter = CalculateAllPromoterRegions2(result,promoter_length,gene_filter_list)

print("Writing temporary bed file")
temp_file = file_output+".tmp.txt"
printToFile(promoter,temp_file)
Пример #10
0
def main(args):
    """Project reference genes onto a compared species through per-gene MAF files.

    Args:
        args: Namespace providing ``compGffFile``, ``refGffFile``, ``mafPath``,
            ``compSpecies`` and ``refSpecies``.

    For every reference gene ``Name`` that is also a key of the compared
    annotation and has a ``<mafPath>/<Name>.maf`` file:
        * one ``<mafPath>/<Name>.<species>.bed`` file is written per compared
          species, with one line per alignment block for which ``block_pid``
          returns a truthy value;
        * the extreme start/end coordinates seen in those blocks are tracked
          per species and chromosome;
        * the tracked span of ``args.compSpecies`` is compared with the
          annotated gene and reported as "Matched" on stdout, or as
          "unMatched" / "Error Chrom" through ``eprint``.
    """
    with log("Reading compare Gencode annotation file: {}".format(args.compGffFile)):
        gc = GTF.dictionary(args.compGffFile,"ID")
    compGeneInfo = gc['gene']

    with log("Reading reference Gencode annotation file: {}".format(args.refGffFile)):
        gc = GTF.dataframe(args.refGffFile)

    # Select just gene rows and the columns we use.  ``.loc`` replaces the
    # removed ``.ix`` indexer; ``.copy()`` decouples the result from ``gc``.
    idx = (gc.feature == 'gene')
    gene = gc.loc[idx, ['seqname','start','end','Name']].copy()
    # Convert coordinates to integers.
    gene['start'] = gene['start'].astype(int)
    gene['end'] = gene['end'].astype(int)

    for geneID in gene['Name']:
        # Only genes annotated in both species are considered.
        if geneID not in compGeneInfo:
            continue
        # Read the coordinate projection for this gene from its MAF file.
        mafFile = args.mafPath + "/" + geneID + ".maf"
        if not os.path.exists(mafFile):
            continue
        with log("Reading the Maf file: {}".format(mafFile)):
            with open(mafFile) as maf:
                out_files = dict()
                geneCoords = dict()
                try:
                    for block in bx.align.maf.Reader(maf):
                        # The first component of a block is the reference.
                        ref_comp = block.components[0]
                        refSpecies, refChrom = ref_comp.src.split('.')[:2]
                        if refSpecies not in geneCoords:
                            geneCoords[refSpecies] = nested_dict(2,str)
                            geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
                            geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
                            geneCoords[refSpecies]['refInfo']['chr'] = refChrom
                        for comp in block.components[1:]:
                            comp_species, compChrom = comp.src.split('.')[:2]
                            if comp_species not in geneCoords:
                                geneCoords[comp_species] = nested_dict(2,str)
                                geneCoords[comp_species][compChrom]['start'] = comp.start
                                geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                            if compChrom not in geneCoords[comp_species]:
                                geneCoords[comp_species][compChrom]['start'] = comp.start
                                geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                            if comp_species not in out_files:
                                # One BED file per compared species, opened lazily.
                                bedfile = "%s/%s.%s.bed" % (args.mafPath, geneID, comp_species )
                                out_files[comp_species] = open( bedfile , "w" )
                            pid = block_pid( ref_comp, comp )

                            if pid:
                                # Widen the tracked spans to include this block.
                                if geneCoords[refSpecies]['refInfo']['start'] > ref_comp.forward_strand_start:
                                    geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
                                if geneCoords[refSpecies]['refInfo']['end'] < ref_comp.forward_strand_end:
                                    geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
                                if geneCoords[comp_species][compChrom]['start'] > comp.start:
                                    geneCoords[comp_species][compChrom]['start'] = comp.start
                                if geneCoords[comp_species][compChrom]['end'] <= int(comp.end):
                                    geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                                out_files[comp_species].write( "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" %
                                                ( refChrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end, \
                                                compChrom, comp.start, comp.end, comp.strand, pid ) )
                finally:
                    # Close the BED files even when reading the MAF fails.
                    for f in out_files.values():
                        f.close()
        if args.compSpecies in geneCoords:
            for chrom in geneCoords[args.compSpecies]:
                if chrom in compGeneInfo[geneID]:
                    annoStart = int(compGeneInfo[geneID][chrom]['start'])
                    annoEnd = int(compGeneInfo[geneID][chrom]['end'])
                    compStart = int(geneCoords[args.compSpecies][chrom]['start'])
                    compEnd = int(geneCoords[args.compSpecies][chrom]['end'])
                    # NOTE(review): when annoStart <= annoEnd the first clause
                    # is implied by the second, so this is an interval-overlap
                    # test -- confirm that is the intent.
                    if compEnd > annoEnd and compStart < annoEnd or compEnd > annoStart and compStart < annoEnd :
                        print("Matched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d\t%s\t%s\t%s: %s - %s" % \
                                (geneID, args.compSpecies, chrom, annoStart,annoEnd, compStart, compEnd, compGeneInfo[geneID][chrom]['ID'], \
                                args.refSpecies,geneCoords[args.refSpecies]['refInfo']['chr'],  geneCoords[args.refSpecies]['refInfo']['start'], geneCoords[args.refSpecies]['refInfo']['end'] ))
                    else:
                        eprint("unMatched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d" % \
                                (geneID, args.compSpecies, chrom, annoStart, annoEnd, compStart, compEnd))
                else:
                    eprint("Error Chrom\t%s\t%s\t%s\tmapped: %s - %s" % \
                            (geneID, args.compSpecies, chrom, geneCoords[args.compSpecies][chrom]['start'],geneCoords[args.compSpecies][chrom]['end']))
def main(args):
	"""Convert a GTF file into a tab-separated, BED-like table.

	Args:
		args: Mapping with ``'gtf'`` (input GTF path) and ``'out'`` (output
			file path).

	Behaviour:
		* The eight mandatory GTF columns must be present, otherwise
		  ``ValueError`` is raised.
		* When any expected attribute column is missing, the GTF is copied
		  to ``<gtf>upd.gtf``, ``add_field_to_GTF`` is called on the copy for
		  each missing field, and the copy is re-read.
		* The output starts with a ``##`` header line; each data line has the
		  start coordinate decremented by one and ``gene_id`` / ``gene_name``
		  joined with ``___``.

	The process exits with status 1 when the GTF cannot be read or an
	unexpected error occurs while writing, and with status 2 on an
	``IOError`` or ``ValueError`` while writing.
	"""
	# sys.exit is used instead of the site-provided exit(), which is not
	# guaranteed to exist (e.g. when Python runs with -S).
	import sys

	# --------------------------------------------------------------------------
	# READING GTF with GTF.py
	# --------------------------------------------------------------------------
	try:
		gtf_file = args['gtf']
		p = GTF.dataframe(gtf_file)  # GTF.dataframe returns a pandas DataFrame
	except Exception as e:
		logger.error("ERROR: in reading GTF\n; {}".format(e))
		sys.exit(1)

	# --------------------------------------------------------------------------
	# INIT VARIABLES
	# --------------------------------------------------------------------------
	_Expected_Eight_Columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
	_Expected_Other_Columns = [ 'gene_id', 'gene_name', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version' ]
	all_expected_columns = _Expected_Other_Columns + _Expected_Eight_Columns

	logger.debug(p.index)
	logger.debug(p.columns)
	logger.info("-"*50)
	logger.info("Printing the fields (aka columns) needed in GTF to make the BED:")
	logger.info(_Expected_Eight_Columns + _Expected_Other_Columns)
	logger.info("-"*50)

	# --------------------------------------------------------------------------
	# CHECK FIELDS
	# --------------------------------------------------------------------------
	present_columns = list(p.columns)
	# Checking for absent names directly (rather than counting matches) keeps
	# the test correct even if a column name appears twice.
	missing_mandatory = [col for col in _Expected_Eight_Columns if col not in present_columns]
	if missing_mandatory:
		raise ValueError("MISSING MANDATORY COLUMNS in GTF: {} ".format(';'.join(str(col) for col in missing_mandatory)))
	logger.info("Mandatory Expected Fields Check out OK")
	logger.info("Testing if any other missing field exists or are extra or with different expected names ...")

	unexpected_fields_list = [field for field in present_columns if field not in all_expected_columns]
	expected_fields_list = [field for field in present_columns if field in all_expected_columns]

	logger.warning("The Following Fields are NOT added to BED; Check if they should be used and if so, check if they might be named differently: {} ".format(unexpected_fields_list))

	# --------------------------------------------------------------------------
	# UPDATE GTF IF NEEDED
	# --------------------------------------------------------------------------
	missing_fields = [field for field in all_expected_columns if field not in expected_fields_list]
	if missing_fields:
		# Work on a copy so the user's GTF is never modified.
		shutil.copy(gtf_file, gtf_file+"upd.gtf")
		gtf_file = gtf_file+"upd.gtf"
		for field in missing_fields:
			add_field_to_GTF(field, gtf_file)
		logger.info("processing GTF2BED for << {} >> updated GTF...".format(gtf_file))
		p = GTF.dataframe(gtf_file)

	# --------------------------------------------------------------------------
	# writing to output file
	# --------------------------------------------------------------------------
	# Columns written verbatim after the four leading BED-style fields.
	trailing_columns = ['score', 'strand', 'frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'source', 'feature', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag',
		'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version']
	# NOTE(review): the header label uses two underscores while the values are
	# joined with three -- kept as in the original output.
	header_fields = ['##seqname', 'start', 'end', 'gene_id__gene_name'] + trailing_columns
	with open(args['out'], 'w') as wo:
		logger.info("writing out BED file ... {}".format(args['out']))
		# HEADER line
		wo.write("\t".join(header_fields) + "\n")
		# VALUE lines
		try:
			leading_columns = ['seqname', 'start', 'end', 'gene_id', 'gene_name']
			# Iterate all columns in lockstep: one pass over the data instead
			# of a label lookup per cell.
			series = [p[name] for name in leading_columns + trailing_columns]
			for row in zip(*series):
				seqname, start, end, gene_id, gene_name = row[:5]
				# The start coordinate is shifted down by one (GTF is 1-based,
				# BED starts are 0-based).
				fields = [seqname, int(start) - 1, end, "{}___{}".format(gene_id, gene_name)]
				fields.extend(row[5:])
				wo.write("\t".join("{}".format(value) for value in fields) + "\n")
		except (IOError, ValueError) as err:
			logger.error("ERROR: in Writing data\n; {}".format(err))
			sys.exit(2)
		except Exception as err:
			logger.error("ERROR: in Writing data\n; {}".format(err))
			sys.exit(1)