Example #1
File: CPC.py Project: pombredanne/cgat
    def __call__(self, track, slice=None):
        
        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict((("transcript", np.mean(c_transcript)), ("gene", np.mean(c_gene))))
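These tracker snippets never show their imports; a minimal sketch of what examples #1-#3 and #10-#12 appear to rely on (the import paths are assumptions inferred from usage and vary across cgat versions):

import collections
import numpy as np
import GTF      # cgat GTF parsing module
import IOTools  # cgat file helpers; IOTools.openFile opens plain or gzipped files

# `odict` is an ordered mapping; in the cgat trackers it is typically an alias
# for collections.OrderedDict.
odict = collections.OrderedDict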
Example #2
 def __call__(self, track, slice=None):
     
     if slice == "transcript":
         lengths_transcripts = []
         for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
             length = sum([gtf.end - gtf.start for gtf in transcript])
             lengths_transcripts.append(length)
         return np.mean(lengths_transcripts)
     
     elif slice == "gene":
         lengths_genes = []
         for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
             length = sum([gtf.end - gtf.start for gtf in gene])
             lengths_genes.append(length)
         return np.mean(lengths_genes)
Example #3
File: CPC.py Project: pombredanne/cgat
    def __call__(self, track, slice=None):

        classes = ["antisense"
              , "antisense_upstream"
              , "antisense_downstream"
              , "sense_upstream"
              , "sense_downstream"
              , "intergenic" 
              , "sense_intronic" 
              , "antisense_intronic"]

        coding_set = {}
        for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding":collections.defaultdict(int)}
        total_nc = float(self.getValue("SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
        for c in classes:
            result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id""" % (track,c)))/total_nc)*100

        
        total_c = len(coding_set)
        # The query does not depend on the class, so run it once rather than
        # once per class.
        ids = self.getValues("SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
        for c in classes:
            for i in ids:
                if coding_set.get(i) == c:
                    result["coding"][c] += 1
            
        for x, y in result["coding"].iteritems():
            result["coding"][x] = (float(y)/total_c)*100
            
        return result
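The getValue/getValues helpers above come from the cgat tracker base class and are not shown in these snippets; a rough sketch of the behaviour the code assumes (single-value and single-column queries against the project's sqlite csvdb, with %(track)s-style placeholders filled in before execution), for illustration only:

import sqlite3

def getValue(statement, **params):
    # Hypothetical stand-in for the tracker method: run a query that returns a
    # single value; %(name)s placeholders are filled from keyword arguments.
    db = sqlite3.connect("csvdb")
    try:
        return db.execute(statement % params).fetchone()[0]
    finally:
        db.close()

def getValues(statement, **params):
    # Hypothetical stand-in: return the first column of every result row.
    db = sqlite3.connect("csvdb")
    try:
        return [row[0] for row in db.execute(statement % params)]
    finally:
        db.close()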
Example #4
def main():
    # Input files.
    # GENCODE = args['-g']
    GENCODE = "/cs/zbio/jrosensk/ccle_fastq/hg19_reference/hg19.ensGene.gtf"  #"Homo_sapiens.GRCh38.103.gtf.gz"#"gencode.v29.annotation.gtf.gz"

    # Output file prefix.
    GENE_LENGTHS = "coding_lengths.hg19.tsv"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)

    # ccle_transcript_tpm = pd.read_csv("CCLE_expression.csv", nrows=3)

    # Select just exons (the protein_coding filter is commented out below), and the columns that we want to use.
    idx = (gc.feature == 'exon')  #& (gc.gene_biotype == 'protein_coding')
    # idx2 = (gc.feature == 'gene') & (gc.gene_biotype == 'protein_coding')
    # trial = gc[idx2]
    exon = gc[idx][['seqname', 'start', 'end', 'gene_id', 'gene_name']]

    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    # with log("Reading NCBI mapping of Entrez GeneID "\
    #          "to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)):
    #     g2e = pd.read_table(NCBI_ENSEMBL,
    #                         compression="gzip",
    #                         header=None,
    #                         names=['tax_id', 'GeneID',
    #                                'Ensembl_gene_identifier',
    #                                'RNA_nucleotide_accession.version',
    #                                'Ensembl_rna_identifier',
    #                                'protein_accession.version',
    #                                'Ensembl_protein_identifier'])

    # Create a new DataFrame with gene lengths and EnsemblID.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)

    # Merge so we have EntrezGeneID with length.
    # m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = ldf[['Ensembl_gene_identifier', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        m1.to_csv(GENE_LENGTHS, sep="\t", index=False)
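count_bp is applied to each gene's group of exon rows but is never defined in these snippets; a minimal sketch, assuming it should count unique exonic bases by merging overlapping intervals (GTF coordinates are 1-based and inclusive):

def count_bp(df):
    # Hypothetical reconstruction of the helper used with groups.apply():
    # merge overlapping [start, end] exon intervals, then sum their lengths.
    intervals = sorted(zip(df.start, df.end))
    total = 0
    cur_start, cur_end = intervals[0]
    for start, end in intervals[1:]:
        if start <= cur_end + 1:   # overlapping (or directly adjacent) exons
            cur_end = max(cur_end, end)
        else:
            total += cur_end - cur_start + 1
            cur_start, cur_end = start, end
    total += cur_end - cur_start + 1
    return total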
Example #5
    def getReferenceLincRNA(self, reference_gtf):

        lincs = []
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                if entry.gene_id not in lincs:
                    lincs.append(entry.gene_id)
        return len(lincs)
Example #6
File: CPC.py Project: pombredanne/cgat
 def __call__(self, track):
     
     length = {}
     for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
         length[transcript[0].transcript_id] = sum([gtf.end - gtf.start for gtf in transcript])
     
     score = {}
     dbh = sqlite3.connect("csvdb")
     cc = dbh.cursor()
     for data in cc.execute("SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
         score[data[0]] = data[1]
 
     result = {"length": [], "score": []}
     for transcript, value in length.iteritems():
         result["length"].append(np.log10(length[transcript]))
         result["score"].append(score[transcript])
     return result
Example #7
    def getReferenceLincRNA(self, reference_gtf):

        lincs = []
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                if entry.gene_id not in lincs:
                    lincs.append(entry.gene_id)
        return len(lincs)
Example #8
def main(args):
    p = GTF.dataframe(
        args['gtf'])  ## GTF.dataframe returns a pandas DataFrame
    with open(args['out'], 'w') as wo:
        for i in range(len(p)):
            wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\n".format(
                p['seqname'][i], p['start'][i], p['end'][i], p['gene_id'][i],
                p['gene_name'][i], p['gene_biotype'][i], p['strand'][i]))
Example #9
def main(args):
    # Input files.
    GENCODE = args['-g']
    NCBI_ENSEMBL = args['-n']

    # Output file prefix.
    GENE_LENGTHS = args['-o'] or "ncbi_ensembl_coding_lengths.txt.gz"

    with log("Reading the Gencode annotation file: {}".format(GENCODE)):
        gc = GTF.dataframe(GENCODE)

    # Select just exons of protein coding genes, and columns that we want to use.
    idx = (gc.feature == 'exon') & (gc.transcript_type == 'protein_coding')
    exon = gc.ix[idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name']]

    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('gene_id')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    with log("Reading NCBI mapping of Entrez GeneID "\
             "to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)):
        g2e = pd.read_table(NCBI_ENSEMBL,
                            compression="gzip",
                            header=None,
                            names=[
                                'tax_id', 'GeneID', 'Ensembl_gene_identifier',
                                'RNA_nucleotide_accession.version',
                                'Ensembl_rna_identifier',
                                'protein_accession.version',
                                'Ensembl_protein_identifier'
                            ])

    # Create a new DataFrame with gene lengths and EnsemblID.
    ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0])
    ldf = pd.DataFrame(
        {
            'length': lengths,
            'Ensembl_gene_identifier': ensembl_no_version
        },
        index=lengths.index)

    # Merge so we have EntrezGeneID with length.
    m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier')
    m1 = m1[['Ensembl_gene_identifier', 'GeneID', 'length']].drop_duplicates()

    with log("Writing output file: {}".format(GENE_LENGTHS)):
        with gzip.open(GENE_LENGTHS, "wb") as out:
            m1.to_csv(out, sep="\t", index=False)
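The log() context manager used in examples #4, #9, #13 and #21 is also not shown; a minimal sketch, assuming it simply announces a step and reports the elapsed time around the block:

import contextlib
import time

@contextlib.contextmanager
def log(message):
    # Hypothetical helper: print the message, run the block, report timing.
    print(message)
    t0 = time.time()
    yield
    print("  done in {:.1f}s".format(time.time() - t0))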
Example #10
 def __call__(self, track, slice=None):
     
     transcript_counts = collections.defaultdict( set )
     counts = []
     for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
         transcript_counts[gtf.gene_id].add(gtf.transcript_id)
     for gene, transcripts in transcript_counts.iteritems():
         counts.append(len(transcripts))
     return counts
Example #11
 def __call__(self, track, slice=None):
     
     transcript_counts = collections.defaultdict( set )
     counts = []
     for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
         transcript_counts[gtf.gene_id].add(gtf.transcript_id)
     for gene, transcripts in transcript_counts.iteritems():
         counts.append(len(transcripts))
     count, lower, dx, _ = scipy.stats.cumfreq(counts, numbins=40, defaultreallimits=(1,15))
     x = np.arange(count.size) * dx + lower
     return odict( (("transcript number", x), ("cumulative frequency", count/len(counts))) )
Example #12
    def __call__(self, track, slice=None):
        
        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_transcripts, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_transcripts))) )

        
        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_genes, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_genes))) )
Example #13
def main(args):

    with log("Reading the Fasta  file: {}".format(args.fastaFile)):
        records = list(SeqIO.parse(args.fastaFile, "fasta"))

    with log("Reading the Gencode annotation file: {}".format(args.wigFile)):
        wig = pd.DataFrame.from_csv(args.wigFile,
                                    header=0,
                                    sep=" ",
                                    index_col=None)
        wig['CpG'] = ["CpG"] * (wig.size / 2)
        #--------------------------------------------------
        # for raw in range(0,wig.size):
        #--------------------------------------------------

    wig.to_csv(args.outWigFile, header=True, index=None, sep=' ', mode='a')

    seqStr = dict()
    with log("Reading the fasta file: {}".format(args.fastaFile)):
        seqHandle = open(args.fastaFile, "r")
        for record in SeqIO.parse(seqHandle, "fasta"):
            seqStr[record.id] = record.seq
    print(seqStr)

    with log("Reading the Gencode annotation file: {}".format(args.gffFile)):
        gc = GTF.dataframe(args.gffFile)
        #--------------------------------------------------
        # print(gc[1:10])
        #--------------------------------------------------

    # Select just exons of protein coding genes, and columns that we want to use.
    idx = (gc.feature == 'exon')
    exon = gc.loc[idx, ['seqname', 'start', 'end', 'ID', 'Parent']]
    exon['ID'] = exon['ID'].map(lambda x: re.sub(r'-mRNA.*', '', x))
    # Convert columns to proper types.
    exon.start = exon.start.astype(int)
    exon.end = exon.end.astype(int)

    # Sort in place.
    exon.sort_values(['seqname', 'start', 'end'], inplace=True)

    # Group the rows by the Ensembl gene identifier (with version numbers).
    groups = exon.groupby('ID')

    with log("Calculating coding region (exonic) length for each gene..."):
        lengths = groups.apply(count_bp)

    print(type(lengths))
    with log("Writing output file: {}".format(args.outFile)):
        lengths.to_csv(args.outFile, sep="\t", encoding="utf-8", index=True)
Example #14
def main(GENCODE):
    gc = GTF.dataframe(GENCODE)
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    idx = (gc.feature == "transcript") & (gc.transcript_type == "lincRNA")
    lincRNA = gc.loc[idx, ["seqname", "start", "end", "gene_id", "gene_name", "strand"]]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=["seqname", "start", "end"], inplace=True)
    lincRNA.to_csv("lincRNA.bed", sep="\t", header=False, index=False)

    idx = (gc.feature == "gene") & (gc.gene_type == "lincRNA")
    lincRNA = gc.loc[idx, ["seqname", "start", "end", "gene_id", "gene_name", "strand"]]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=["seqname", "start", "end"], inplace=True)
    lincRNA.to_csv("lincRNA_genes.bed", sep="\t", header=False, index=False)
Example #15
def main(GENCODE):
    gc = GTF.dataframe(GENCODE)
    gc.gene_id = gc.gene_id.replace(to_replace=r'\.[0-9]+',
                                    value='',
                                    regex=True)

    idx = (gc.feature == 'transcript') & (gc.transcript_type == 'lincRNA')
    lincRNA = gc.loc[
        idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name', 'strand']]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=['seqname', 'start', 'end'], inplace=True)
    lincRNA.to_csv('lincRNA.bed', sep='\t', header=False, index=False)

    idx = (gc.feature == 'gene') & (gc.gene_type == 'lincRNA')
    lincRNA = gc.loc[
        idx, ['seqname', 'start', 'end', 'gene_id', 'gene_name', 'strand']]
    lincRNA.start = lincRNA.start.astype(int)
    lincRNA.end = lincRNA.end.astype(int)
    lincRNA.sort_values(by=['seqname', 'start', 'end'], inplace=True)
    lincRNA.to_csv('lincRNA_genes.bed', sep='\t', header=False, index=False)
Example #16
def count_lowlevel_in_hightlevel(filename, low_level_name, high_level_name):
    """
    To count how many sub-features in a highlevel feature.
    :param filename: File used to be processed.
    :param low_level_name: Feature names in GTF file. Such as "exon", "transcript".
    :param high_level_name: Feature names in GTF file. Such as "exon", "transcript", "gene".
    :return: No return, but output to file directly.
    """
    occurrence = 0
    with open('{} number in each {}'.format(low_level_name, high_level_name),
              'w') as f:
        for idx, item_with_bool in enumerate(lookahead(GTF.lines(filename))):
            if item_with_bool[0]['feature'] == high_level_name:
                if idx != 0 and idx != 1:
                    f.write(str(occurrence) + '\n')
                occurrence = 0
            elif item_with_bool[0]['feature'] == low_level_name:
                occurrence += 1
            elif not item_with_bool[1]:
                f.write(str(occurrence) + '\n')
            else:
                continue
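A usage sketch for the counter above (the filename is hypothetical; it assumes the GTF is ordered so that each high-level feature line precedes its sub-features, as annotation files conventionally are):

# Writes a file named 'exon number in each transcript', one count per transcript.
count_lowlevel_in_hightlevel("annotation.gtf", "exon", "transcript")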
Example #17
Input file(s):
	*_toTranscriptome_cov.txt
Output file(s): 
	*_toTranscriptome_cov.average.txt
Tools:
	GTF.py to load GTF files
	Process line-by-line with Python
"""

import glob
import os
import sys
sys.path.insert(0, '/12TBLVM/Data/MinhTri/6_SCRIPTS')
import GTF

print('Loading gtf file.')
(GeneDict, TransScDict) = GTF.dataframe("/12TBLVM/Data/hg19-2/GENCODE/gencode.v22.annotation.gtf")

print('Listing files to be processed:')
cases = set()
for file in glob.glob('*_toTranscriptome_cov.txt'):
	cases.add(file.split('.')[0])
	print('\t', file)

for entry in cases:
	summary = {}
	
	# Read file
	original_file = open(entry + '.txt', 'r')
	print('Reading input file:', original_file.name)
	total_cov = 0
	counter = 0
Example #18
def main(argv):

    # Get the path of the file to process
    input_file = argv[0]
    temp_folder = argv[1]
    username = argv[2]
    experiment = argv[3]
    species = argv[4]

    config = json.load(open('../configuration.json'))

    temp_token = username + '_' + str(uuid.uuid4())

    # Create the CSV files that store node information
    chromosome_csv = open(temp_folder + temp_token + '_chromosome.csv', 'w')
    gene_csv = open(temp_folder + temp_token + '_gene.csv', 'w')
    transcript_csv = open(temp_folder + temp_token + '_transcript.csv', 'w')
    exon_csv = open(temp_folder + temp_token + '_exon.csv', 'w')

    # Create the CSV files that store relationship information
    contains_csv = open(temp_folder + temp_token + '_contains.csv', 'w')
    in_chromosome_csv = open(temp_folder + temp_token + '_in_chromosome.csv',
                             'w')
    has_transcript_csv = open(temp_folder + temp_token + '_has_transcript.csv',
                              'w')
    has_exon_csv = open(temp_folder + temp_token + '_has_exon.csv', 'w')

    # Initialize the writers for all the files

    # ---- nodes
    chromosomeWriter = csv.writer(chromosome_csv, delimiter=',')
    geneWriter = csv.writer(gene_csv, delimiter=',')
    transcriptWriter = csv.writer(transcript_csv, delimiter=',')
    exonWriter = csv.writer(exon_csv, delimiter=',')

    # ---- relationships
    containsWriter = csv.writer(contains_csv, delimiter=',')
    inChromosomeWriter = csv.writer(in_chromosome_csv, delimiter=',')
    hasTranscriptWriter = csv.writer(has_transcript_csv, delimiter=',')
    hasExonWriter = csv.writer(has_exon_csv, delimiter=',')

    # Build the file headers

    # ---- nodes
    chromosome_header = ["chromosome"]
    gene_header = ["gene_id"]
    transcript_header = [
        "transcript_id", "reference_id", "cov", "FPKM", "TPM", "start", "end"
    ]
    exon_header = ["exon_id", "exon_number", "start", "end", "cov"]

    # ---- relationships
    contains_header = ["name", "gene_id"]
    in_chromosome_header = ["gene_id", "chromosome"]
    has_transcript_header = ["gene_id", "strand", "transcript_id"]
    has_exon_header = ["transcript_id", "exon_id"]

    # Write the headers to their respective files

    # ---- nodes
    chromosomeWriter.writerow(chromosome_header)
    geneWriter.writerow(gene_header)
    transcriptWriter.writerow(transcript_header)
    exonWriter.writerow(exon_header)

    # ---- relationships
    containsWriter.writerow(contains_header)
    inChromosomeWriter.writerow(in_chromosome_header)
    hasTranscriptWriter.writerow(has_transcript_header)
    hasExonWriter.writerow(has_exon_header)

    # Initialize the data structures needed for parsing (to optimize loading the data into the database)

    # ---- nodes
    chromosomes = set()
    genes_dict = {}
    transcripts_dict = {}

    # ---- relationships
    contains_dict = {}
    in_chromosome_dict = {}
    has_transcript_dict = {}

    print 'Starting parsing procedure for file ' + input_file
    properties = {
        "name": os.path.basename(input_file),
        "extension": os.path.splitext(input_file)[1]
    }

    # Connect to Neo4j
    driver = GraphDatabase.driver("bolt://" + config["neo4j"]["address"],
                                  auth=basic_auth(config["neo4j"]["username"],
                                                  config["neo4j"]["password"]))

    # Initialize the indexes
    session = driver.session()

    statements = [
        "CREATE INDEX ON :File(name);", "CREATE INDEX ON :Species(species);",
        "CREATE INDEX ON :Gene(gene_id);",
        "CREATE INDEX ON :Chromosome(chromosome);",
        "CREATE INDEX ON :Transcript(transcript_id);",
        "CREATE INDEX ON :Exon(exon_id);"
    ]

    for statement in statements:
        session.run(statement)

    session.close()

    print 'Parsing file...'

    # initialize a counter so that very large files can be loaded into the database in partial batches
    row_count = 0

    for line in GTF.lines(input_file):
        row_count += 1

        # record the chromosome
        chromosomes.add(line["seqname"])

        # record the gene (if not already present)
        if not genes_dict.has_key(line["gene_id"]):
            genes_dict[line["gene_id"]] = [
                line[attr] if line.has_key(attr) else "None"
                for attr in gene_header
            ]

        # record the (file)-[contains]->(gene) relationship (if it does not already exist)
        if not contains_dict.has_key(properties["name"] + ':' +
                                     line["gene_id"]):
            contains_dict[properties["name"] + ':' + line["gene_id"]] = [
                properties["name"], line["gene_id"]
            ]

        # record the (gene)-[in_chromosome]->(chromosome) relationship (if it does not already exist)
        if not in_chromosome_dict.has_key(line["gene_id"] + ':' +
                                          line["seqname"]):
            in_chromosome_dict[line["gene_id"] + ':' + line["seqname"]] = [
                line["gene_id"], line["seqname"]
            ]

        # depending on the feature type (transcript or exon), record the row's information appropriately
        if line['feature'] == 'transcript':

            # record the transcript (if not already present)
            if not transcripts_dict.has_key(line["transcript_id"]):
                transcripts_dict[line["transcript_id"]] = [
                    line[attr] if line.has_key(attr) else "None"
                    for attr in transcript_header
                ]

            # record the (gene)-[has_transcript]->(transcript) relationship (if it does not already exist)
            if not has_transcript_dict.has_key(line["gene_id"] + ':' +
                                               line["transcript_id"]):
                has_transcript_dict[line["gene_id"] + ':' +
                                    line["transcript_id"]] = [
                                        line[attr]
                                        for attr in has_transcript_header
                                    ]

        elif line['feature'] == 'exon':
            # define an ID for the exon (needed to populate the database)
            exon_id = line["exon_number"] + ':' + line["transcript_id"]

            # record the exon in the CSV file
            exonWriter.writerow([exon_id] + [
                line[attr] if line.has_key(attr) else "None"
                for attr in exon_header[1:]
            ])

            # record the (transcript)-[has_exon]->(exon) relationship in the CSV file
            hasExonWriter.writerow([line["transcript_id"], exon_id])

        if not (row_count % 15000):
            print str(row_count) + " scanned"

    # write out the CSV files for the dicts built above
    for chrom in list(chromosomes):
        chromosomeWriter.writerow([chrom])

    for gene in genes_dict.keys():
        geneWriter.writerow(genes_dict[gene])

    for transcript in transcripts_dict.keys():
        transcriptWriter.writerow(transcripts_dict[transcript])

    for entry in contains_dict.keys():
        containsWriter.writerow(contains_dict[entry])

    for entry in in_chromosome_dict.keys():
        inChromosomeWriter.writerow(in_chromosome_dict[entry])

    for entry in has_transcript_dict.keys():
        hasTranscriptWriter.writerow(has_transcript_dict[entry])

    # finish writing the CSV files

    # ---- nodes
    chromosome_csv.close()
    gene_csv.close()
    transcript_csv.close()
    exon_csv.close()

    # ---- relationships
    contains_csv.close()
    in_chromosome_csv.close()
    has_transcript_csv.close()
    has_exon_csv.close()

    print 'Populating Database...'
    session = driver.session()

    prova = [
        "MERGE (u:User { username:{username} })",
        "MERGE (e:Experiment { name:{experiment} })",
        "MERGE (s:Species {species: {species} })",
        "MERGE (f:File { name:{properties}.name }) ON CREATE SET f += {properties}",
        "MERGE (u)-[:Created]->(e)", "MERGE (e)-[:For_Species]->(s)",
        "MERGE (e)-[:Composed_By]->(f)"
    ]

    # Associate the file with the user
    session.run(
        " ".join(prova), {
            "username": username,
            "experiment": experiment,
            "species": species,
            "properties": properties
        })

    session.close()

    populateDB(driver, temp_folder + temp_token)

    print 'Done.'
Example #19
def lookahead(it):
    # Header reconstructed from the usage below: yields (value, has_more)
    # pairs so the caller can tell when it is on the last item.
    it = iter(it)
    last = next(it)
    for val in it:
        # Report the *previous* value (more to come).
        yield last, True
        last = val
    # Report the last value.
    yield last, False


def processing_count(filenames):
    count_lowlevel_in_hightlevel(filenames, 'transcript', 'gene')
    count_lowlevel_in_hightlevel(filenames, 'exon', 'transcript')


if __name__ == '__main__':
    # Below is a demo for using function lookahead():
    # for i, has_more in lookahead(range(3)):
    #     print(i, has_more)
    whole_gtf = GTF.dataframe(sys.argv[1])
    processing_count(sys.argv[1])
    whole_gtf['length'] = whole_gtf['end'].astype(
        'int') - whole_gtf['start'].astype('int') + 1
    whole_gtf = whole_gtf.loc[:, ['gene_biotype', 'feature', 'length']]
    whole_gtf.to_csv("whole_gtf", sep='\t', index=False)
    # Below is a example for using ggplot package in Python:
    # p = ggplot(aes(x='length'), data=a) + geom_histogram() + facet_grid(x='gene_biotype', y='feature') \
    # + xlim(0,50000) + scale_y_log(10) + ylim(1, 1e3)
    # ggplot.save(p, "f**k.tiff", width=55, height=50, dpi=300)
    biotype_count_as_features = whole_gtf.groupby(['gene_biotype',
                                                   'feature']).size()
    biotype_count_as_features.to_csv("biotype_count_as_features", sep='\t')
Example #20
def main(args):

	# --------------------------------------------------------------------------
	# READING GTF with GTF.py
	# --------------------------------------------------------------------------
	try:
		gtf_file = args['gtf']
		p = GTF.dataframe(gtf_file)  ## GTF.dataframe returns a pandas DataFrame
	except Exception as e:
		logger.error("ERROR: in reading GTF\n; {}".format(e))
		exit(1)

	# --------------------------------------------------------------------------
	# INIT VARIABLES
	# --------------------------------------------------------------------------
	_Expected_Eight_Columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
	_Expected_Other_Columns = [ 'gene_id', 'gene_name', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version' ]

	logger.debug(p.index)
	logger.debug(p.columns)
	logger.info("-"*50)
	logger.info("Printing the fields (aka columns) needed in GTF to make the BED:")
	logger.info(_Expected_Eight_Columns + _Expected_Other_Columns)
	logger.info("-"*50)

	# --------------------------------------------------------------------------
	# CHECK FIELDS
	# --------------------------------------------------------------------------
	mandatory_columns_count = 0
	mandatory_columns_list = []
	other_fields_list = []
	for col in p.columns:
		if col in _Expected_Eight_Columns:
			mandatory_columns_count +=1
			mandatory_columns_list.append(col)
		other_fields_list.append(col)
	if mandatory_columns_count != len(_Expected_Eight_Columns):
		raise ValueError("MISSING MANDATORY COLUMNS in GTF: {}".format(';'.join([str(col) for col in _Expected_Eight_Columns if col not in mandatory_columns_list])))
	logger.info("Mandatory Expected Fields Check out OK")
	logger.info("Testing if any other missing field exists or are extra or with different expected names ...")

	unexpected_fields_list = []
	expected_fields_list = []
	for field in other_fields_list:
		if field not in _Expected_Other_Columns and field not in _Expected_Eight_Columns:
			unexpected_fields_list.append(field)
		else:
			expected_fields_list.append(field)

	logger.warning("The Following Fields are NOT added to BED; Check if they should be used and if so, check if they might be named differently: {} ".format(unexpected_fields_list))

	# --------------------------------------------------------------------------
	# UPDATE GTF IF NEEDED
	# --------------------------------------------------------------------------
	read_GTF_again=False
	for field in _Expected_Other_Columns + _Expected_Eight_Columns:
		if field not in expected_fields_list:
			if not read_GTF_again:
				shutil.copy(gtf_file, gtf_file+"upd.gtf")
				gtf_file = gtf_file+"upd.gtf"
			add_field_to_GTF(field, gtf_file)
			read_GTF_again=True

	if read_GTF_again:
		logger.info("processing GTF2BED for << {} >> updated GTF...".format(gtf_file))
		p = GTF.dataframe(gtf_file)

	# --------------------------------------------------------------------------
	# writing to output file
	# --------------------------------------------------------------------------
	with open(args['out'], 'w') as wo:
		logger.info("writing out BED file ... {}".format(args['out']))
		## writing HEADER line to output file
		wo.write("\t".join(
			['##seqname', 'start', 'end', 'gene_id__gene_name', 'score', 'strand', 'frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'source', 'feature', 'transcript_version', 'transcript_name', ' transcript_source', 'transcript_biotype', 'tag',
			 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version']) + "\n")
		## writing VALUE lines to output file
		try:
			for i in range(len(p)):
				wo.write("{}\t{}\t{}\t{}___{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(p['seqname'][i], int(p['start'][i]) - 1, p['end'][i], p['gene_id'][i], p['gene_name'][i], p['score'][i], p['strand'][i], p['frame'][i], p['gene_version'][i], p['gene_source'][i], p['gene_biotype'][i], p['transcript_id'][i], p['source'][i], p['feature'][i], p['transcript_version'][i], p['transcript_name'][i], p['transcript_source'][i], p['transcript_biotype'][i], p['tag'][i], p['transcript_support_level'][i], p['exon_number'][i], p['exon_id'][i], p['exon_version'][i], p['protein_id'][i], p['protein_version'][i] )
				)
		except IOError as IOE:
			logger.error("ERROR: in Writing data\n; {}".format(IOE))
			exit(2)
		except ValueError as VE:
			logger.error("ERROR: in Writing data\n; {}".format(VE))
			exit(2)
		except Exception as E:
			logger.error("ERROR: in Writing data\n; {}".format(E))
			exit(1)
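add_field_to_GTF() is called above but never shown; a minimal sketch, assuming its job is to append a placeholder attribute to every feature line so that the subsequent GTF.dataframe() call yields the missing column:

import fileinput

def add_field_to_GTF(field, gtf_file):
    # Hypothetical reconstruction: rewrite the file in place, appending
    # `field "NA";` to the attribute column of every non-comment line.
    for line in fileinput.input(gtf_file, inplace=True):
        line = line.rstrip("\n")
        if line and not line.startswith("#"):
            line += ' {} "NA";'.format(field)
        print(line)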
Example #21
def main(args):
    with log("Reading compare Gencode annotation file: {}".format(args.compGffFile)):
        gc = GTF.dictionary(args.compGffFile,"ID")
    compGeneInfo = gc['gene']
    #--------------------------------------------------
    # gene['Name'] = gene['ID'].map(lambda x: re.sub(r':maker.*','',x))
    #--------------------------------------------------

    with log("Reading reference Gencode annotation file: {}".format(args.refGffFile)):
        gc = GTF.dataframe(args.refGffFile)

    # Select just gene features, and the columns that we want to use.
    idx = (gc.feature == 'gene')
    gene = gc.loc[idx, ['seqname', 'start', 'end', 'Name']]
    #--------------------------------------------------
    # print(gene)
    #--------------------------------------------------
    # Convert columns to proper types.
    gene.start = gene.start.astype(int)
    gene.end = gene.end.astype(int)

    for geneID in gene['Name']:
        if geneID in compGeneInfo:
            # gene annotated in both species, read coordinates projecting information in maf
            mafFile = args.mafPath + "/" + geneID + ".maf"
            if not os.path.exists(mafFile):
                continue
            with log("Reading the Maf file: {}".format(mafFile)):
                with open(mafFile) as maf:
                    out_files = dict()
                    geneCoords = dict()
                    for block in bx.align.maf.Reader(maf):
                        ref_comp = block.components[0]
                        refSpecies, refChrom = ref_comp.src.split('.')[:2]
                        if refSpecies not in geneCoords:
                            geneCoords[refSpecies] = nested_dict(2,str)
                            geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
                            geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
                            geneCoords[refSpecies]['refInfo']['chr'] = refChrom
                        for comp in block.components[1:]:
                            comp_species, compChrom = comp.src.split('.')[:2]
                            if comp_species not in geneCoords:
                                geneCoords[comp_species] = nested_dict(2,str)
                                geneCoords[comp_species][compChrom]['start'] = comp.start
                                geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                            if compChrom not in geneCoords[comp_species]:
                                geneCoords[comp_species][compChrom]['start'] = comp.start
                                geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                            if comp_species not in out_files:
                                bedfile = "%s/%s.%s.bed" % (args.mafPath, geneID, comp_species )
                                f = open( bedfile , "w" )
                                out_files[comp_species] = f
                            pid = block_pid( ref_comp, comp )

                            if pid:
                                #--------------------------------------------------
                                # print("%s\t%s" % (comp.end, geneCoords[comp_species][compChrom]))
                                #--------------------------------------------------
                                if geneCoords[refSpecies]['refInfo']['start'] > ref_comp.forward_strand_start:
                                    geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
                                if geneCoords[refSpecies]['refInfo']['end'] < ref_comp.forward_strand_end:
                                    geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
                                if geneCoords[comp_species][compChrom]['start'] > comp.start:
                                    geneCoords[comp_species][compChrom]['start'] = comp.start
                                if geneCoords[comp_species][compChrom]['end'] <= int(comp.end):
                                    geneCoords[comp_species][compChrom]['end'] = int(comp.end)
                                out_files[comp_species].write( "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" %
                                                ( refChrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end, \
                                                compChrom, comp.start, comp.end, comp.strand, pid ) )

                    for f in out_files.values():
                        f.close()
            if args.compSpecies in geneCoords:
                for chrom in geneCoords[args.compSpecies]:
                    if chrom in compGeneInfo[geneID]:
                        annoStart = int(compGeneInfo[geneID][chrom]['start'])
                        annoEnd = int(compGeneInfo[geneID][chrom]['end'])
                        compStart = int(geneCoords[args.compSpecies][chrom]['start'])
                        compEnd = int(geneCoords[args.compSpecies][chrom]['end'])
                        if (compEnd > annoEnd and compStart < annoEnd) or (compEnd > annoStart and compStart < annoEnd):
                            print("Matched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d\t%s\t%s\t%s: %s - %s" % \
                                    (geneID, args.compSpecies, chrom, annoStart,annoEnd, compStart, compEnd, compGeneInfo[geneID][chrom]['ID'], \
                                    args.refSpecies,geneCoords[args.refSpecies]['refInfo']['chr'],  geneCoords[args.refSpecies]['refInfo']['start'], geneCoords[args.refSpecies]['refInfo']['end'] ))
                        else:
                            eprint("unMatched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d" % \
                                    (geneID, args.compSpecies, chrom, annoStart, annoEnd, compStart, compEnd))
                    else:
                        eprint("Error Chrom\t%s\t%s\t%s\tmapped: %s - %s" % \
                                (geneID, args.compSpecies, chrom, geneCoords[args.compSpecies][chrom]['start'],geneCoords[args.compSpecies][chrom]['end']))
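Several helpers in example #21 are defined elsewhere: nested_dict(2, str) matches the constructor of the nested_dict package (a two-level defaultdict of str), and eprint and block_pid are small utilities. A hedged sketch of the latter two, assuming block_pid computes percent identity between the reference and a compared alignment component:

from __future__ import print_function
import sys

def eprint(*args, **kwargs):
    # Print to stderr instead of stdout.
    print(*args, file=sys.stderr, **kwargs)

def block_pid(ref_comp, comp):
    # Hypothetical reconstruction: fraction of gap-free aligned columns that
    # match between the two components of a maf block.
    matches, aligned = 0, 0
    for a, b in zip(ref_comp.text.upper(), comp.text.upper()):
        if a == '-' or b == '-':
            continue
        aligned += 1
        if a == b:
            matches += 1
    return float(matches) / aligned if aligned else None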
Example #22
def main( argv = sys.argv ):

    parser = optparse.OptionParser( version = "%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf."  )

    parser.set_defaults( as_gtf = False,
                         id_format = "%08i",
                         test = None )
    
    (options, args) = E.Start( parser, add_pipe_options = True )

    as_gtf = options.as_gtf
    id_format = options.id_format

    if as_gtf:
        gff = GTF.Entry()
    else:
        gff = GFF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator( options.stdin ):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start 
        gff.end = bed.end
        if bed.mFields and len(bed.mFields) >= 3:
            gff.strand = bed.mFields[2]
        else: 
            gff.strand = "."

        if bed.mFields and len(bed.mFields) >= 2:
            gff.score = bed.mFields[1]
        
        
        if as_gtf:
            if bed.mFields:
                gff.gene_id = bed.mFields[0]
                gff.transcript_id = bed.mFields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id            
        else:
            if bed.mFields:
                gff.source = bed.mFields[0]
            
        options.stdout.write( str(gff) + "\n" )

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped) )

    E.Stop()
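Example #22 depends on cgat's Bed module; a rough sketch of the iterator contract the code above assumes (an entry with contig/start/end attributes plus mFields holding columns four onward), for illustration only:

class BedEntry(object):
    # Hypothetical stand-in for a cgat Bed entry: columns 1-3 become
    # contig/start/end, and mFields holds the remaining columns
    # (name, score, strand, ...).
    def __init__(self, line):
        fields = line.rstrip("\n").split("\t")
        self.contig = fields[0]
        self.start = int(fields[1])
        self.end = int(fields[2])
        self.mFields = fields[3:]

def bed_iterator(infile):
    # Rough equivalent of Bed.iterator: skip comment and track lines.
    for line in infile:
        if line.startswith("#") or line.startswith("track"):
            continue
        yield BedEntry(line)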
Example #23
def main():
	with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \
			open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \
			open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \
			open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile:

		def writeOutput(gene):
			if (useBlocks): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output
				if(gene.coding):
					#blockBedFormat is one line by definition
					if (gene.utr5Len > 0): utr5File.write(gene.blockBedFormat(region="5utr") + "\n")
					if (gene.utr5startLen > 0): utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n")
					if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
					if (gene.utr3Len > 0): utr3File.write(gene.blockBedFormat(region="3utr") + "\n")
				
					if (gene.exonsLen > 0):
						exonFile.write(gene.blockBedFormat(region="exons") + "\n")
						codingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
					
					if (gene.intronsLen > 0):
						intronFile.write(gene.blockBedFormat(region="introns") + "\n")
						codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
						
				else: # noncoding transcripts just have exons and introns
					if (gene.exonsLen > 0):
						exonFile.write(gene.blockBedFormat(region="exons") + "\n")
						noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n")

					if (gene.intronsLen > 0):
						intronFile.write(gene.blockBedFormat(region="introns") + "\n")
						noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")

			else: # output one line per region primitive instead of combining regions via blocks
				if(gene.coding):
					for entry in gene.bedFormat(region="5utr"):
						utr5File.write(entry + "\n")
					for entry in gene.bedFormat(region="5utr_start"):
						utr5StartFile.write(entry + "\n")
					for entry in gene.bedFormat(region="cds"):
						cdsFile.write(entry + "\n")
					for entry in gene.bedFormat(region="3utr"):
						utr3File.write(entry + "\n")

					for entry in gene.bedFormat(region="exons"):
						exonFile.write(entry + "\n")
						codingExonFile.write(entry + "\n")

					for entry in gene.bedFormat(region="introns"):
						intronFile.write(entry + "\n")
						codingIntronFile.write(entry + "\n")

				else: # noncoding transcripts just have exons and introns
					for entry in gene.bedFormat(region="exons"):
						exonFile.write(entry + "\n")
						noncodingExonFile.write(entry + "\n")

					for entry in gene.bedFormat(region="introns"):
						intronFile.write(entry + "\n")
						noncodingIntronFile.write(entry + "\n")


		if (args.ucsc): 
			with open(args.input, "r") as genesFile: 
				genesRead = 0

				for line in genesFile:
					# all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method

					gene = createUCSCTranscript(line) 
					genesRead += 1

					writeOutput(gene)

					if (not genesRead % 2500):
						print "Processed %d entries..." %  genesRead

					
		elif (args.gtf): 
				
				# first parse the entire file into a dictionary of lists

			txDict = defaultdict(list)
			genesRead = 0  # initialized here as well; only the UCSC branch sets it above

			print "Building GTF dictionary..." 

			# the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. 
			for line in GTF.lines(args.input): 

				txDict[line["transcript_id"]].append(line)
				genesRead += 1

				if (not genesRead % 100000):
					print "Processed %d lines..." %  genesRead

			print "Dictionary built." 

			# now create a SNFGene object for each transcript and output it 
			genesRead = 0
			for key in txDict: 

				#print key

				tx = createGTFTranscript(txDict[key])

				#print tx 
				writeOutput(tx)
				genesRead += 1
				
				if (not genesRead % 2500):
					print "Processed %d entries..." %  genesRead


	print "Processed %d entries." %  genesRead

	# BTD Edit: making unique regions and linking to gene name
	# --------------------------------------------------------
	# utr5FName = args.output  + "_5utr.bed"
	# utr5StartFName = args.output  + "_5utr_start.bed"
	# cdsFName = args.output  + "_cds.bed"
	# utr3FName = args.output  + "_3utr.bed"
	# exonFName = args.output  + "_exons.bed"
	# intronFName = args.output  + "_introns.bed"
	# codingExonFName = args.output  + "_codingexons.bed"
	# codingIntronFName = args.output  + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA 
	# noncodingExonFName = args.output  + "_noncodingexons.bed" 
	# noncodingIntronFName = args.output  + "_noncodingintrons.bed" 

	# 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
	print "Getting gene ID"
	idToName = {}
	if args.ucsc:
		with open(args.input, 'r') as knownGeneFile:
			reader = csv.reader(knownGeneFile, 'textdialect')
			for row in reader:
				idToName[row[0]] = row[-1]
			
	# 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
	print "Getting unique UTRs"
	def getUniqUTR(uniqFN, utrFN):
		with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
			already = set()
			reader = csv.reader(utr, 'textdialect')
			writer = csv.writer(uniq, 'textdialect')
			for row in reader:
				if tuple(row[6:]) in already: continue #repeat
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: row[3] = id + '__' + geneName
				else: row[3] = id
				already.add(tuple(row[6:]))
				writer.writerow(row)
				
	uniq5UTR = args.output  + "_uniq_5utr.bed"
	getUniqUTR(uniq5UTR, utr5FName)

	uniq3UTR = args.output  + '_uniq_3utr.bed'
	getUniqUTR(uniq3UTR, utr3FName)

	uniq5SUTR = args.output  + '_uniq_5utr_start.bed'
	getUniqUTR(uniq5SUTR, utr5StartFName)
		
	# 3. Get unique exons + num. Do it 3x for all, coding, and noncoding
	print "Getting unique exons"
	def getUniqExons(uniqFN, exonFN):
		with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
			already = set()
			reader = csv.reader(exons, 'textdialect')
			writer = csv.writer(uniq, 'textdialect')
			for row in reader:
				# gene ID info
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				start, end = int(row[1]), int(row[2])
				strand = row[5]

				# calculate exon starts and lengths
				exonLengths = row[10].split(',')
				if exonLengths[-1] == '': exonLengths = exonLengths[:-1]
				exonLengths = [int(x) for x in exonLengths]
				exonStarts = row[11].split(',')
				if exonStarts[-1] == '': exonStarts = exonStarts[:-1]
				exonStarts = [int(x) for x in exonStarts]
				
				# calculate exons
				exons = []
				for i in range(len(exonStarts)):
					absStart = start + exonStarts[i]
					exons.append([absStart, absStart + exonLengths[i]])
				if strand == '-': exons = exons[::-1] #flip exon order
				
				# making BED6
				for i in range(len(exons)):
					exonNum = i + 1
					exonNumInfo = str(exonNum) + 'of' + str(len(exons))
					exon = exons[i]
					outputRow = [chrom, exon[0], exon[1]]
					
					# unique
					if tuple(outputRow) in already: continue
					already.add(tuple(outputRow))            
					outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
					writer.writerow(outputRow)
				
	uniqExons = args.output  + '_uniq_exons.bed'
	getUniqExons(uniqExons, exonFName)

	uniqExons = args.output  + '_uniq_codingexons.bed'
	getUniqExons(uniqExons, codingExonFName)

	uniqExons = args.output  + '_uniq_noncodingexons.bed'
	getUniqExons(uniqExons, noncodingExonFName)            

	# 4. Get unique introns + num. unique 5'SS, 3'SS. 
	# 5'SS is first base of intron, 3'SS is last base of intron
	print "Getting unique introns and 5' and 3' SS"
	def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
		with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
			open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
			alreadyIntron = set()
			already5 = set()
			already3 = set()
			
			reader = csv.reader(introns, 'textdialect')
			intronWriter = csv.writer(uniqIntron, 'textdialect')
			fiveWriter = csv.writer(uniq5, 'textdialect')
			threeWriter = csv.writer(uniq3, 'textdialect')
			
			for row in reader:
				# gene ID info
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				start, end = int(row[1]), int(row[2])
				strand = row[5]

				# calculate intron starts and lengths
				intronLengths = row[10].split(',')
				if intronLengths[-1] == '': intronLengths = intronLengths[:-1]
				intronLengths = [int(x) for x in intronLengths]
				intronStarts = row[11].split(',')
				if intronStarts[-1] == '': intronStarts = intronStarts[:-1]
				intronStarts = [int(x) for x in intronStarts]
				
				# calculate introns
				introns = []
				for i in range(len(intronStarts)):
					absStart = start + intronStarts[i]
					introns.append([absStart, absStart + intronLengths[i]])
				if strand == '-': introns = introns[::-1] #flip intron order
				
				# making BED6
				for i in range(len(introns)):
					intronNum = i + 1
					intronNumInfo = str(intronNum) + 'of' + str(len(introns))
					intron = introns[i]
					outputRow = [chrom, intron[0], intron[1]]
					
					# unique introns
					if tuple(outputRow) in alreadyIntron: continue
					alreadyIntron.add(tuple(outputRow))
					outputRow.extend([geneIDInfo+ '__intron__' + intronNumInfo, 0, strand])
					intronWriter.writerow(outputRow)
					
					# unique splice sites
					if strand == '+':
						fiveSS = [chrom, intron[0], intron[0] + 1]
						threeSS = [chrom, intron[1] - 1, intron[1]]
					else:
						threeSS = [chrom, intron[0], intron[0] + 1]
						fiveSS = [chrom, intron[1] - 1, intron[1]]
					if tuple(fiveSS) not in already5:
						already5.add(tuple(fiveSS))
						fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand])
						fiveWriter.writerow(fiveSS)
					if tuple(threeSS) not in already3:
						already3.add(tuple(threeSS))
						threeSS.extend([geneIDInfo+ '__3ss__' + intronNumInfo, 0, strand])
						threeWriter.writerow(threeSS)

	uniqIntrons = args.output  + '_uniq_introns.bed'
	uniq5 = args.output  + '_uniq_5ss.bed'
	uniq3 = args.output  + '_uniq_3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

	uniqIntrons = args.output  + '_uniq_codingintrons.bed'
	uniq5 = args.output  + '_uniq_coding5ss.bed'
	uniq3 = args.output  + '_uniq_coding3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

	uniqIntrons = args.output  + '_uniq_noncodingintrons.bed'
	uniq5 = args.output  + '_uniq_noncoding5ss.bed'
	uniq3 = args.output  + '_uniq_noncoding3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

	# 5. unique TSS/TES
	print "Getting unique TSS and TES"
	def getUniqTSSAndTES(tssFN, tesFN, cdsFN):
		with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open(cdsFN, 'r') as cds:
			alreadyTSS = set()
			alreadyTES = set()
			reader = csv.reader(cds, 'textdialect')
			tssWriter = csv.writer(uniqTSS, 'textdialect')
			tesWriter = csv.writer(uniqTES, 'textdialect')
			for row in reader:
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				strand = row[5]
				start, end = int(row[1]), int(row[2])
				
				if strand == '+':
					startRow = [chrom, start, start + 1]
					endRow = [chrom, end - 1, end]
				else:
					startRow = [chrom, end - 1, end]
					endRow = [chrom, start, start + 1]
				if tuple(startRow) not in alreadyTSS:
					alreadyTSS.add(tuple(startRow))
					startRow.extend([geneIDInfo, 0, strand])
					tssWriter.writerow(startRow)
				if tuple(endRow) not in alreadyTES:
					alreadyTES.add(tuple(endRow))
					endRow.extend([geneIDInfo, 0, strand])
					tesWriter.writerow(endRow)            
				
	uniqTSS = args.output  + '_uniq_tss.bed'
	uniqTES = args.output  + '_uniq_tes.bed'
	getUniqTSSAndTES(uniqTSS, uniqTES, cdsFName)


	# sort everything
	print "Sorting BED files"
	for fn in glob.glob("*.bed"):
		os.system("sort -k1,1 -k2,2n %s -o %s"%(fn, fn))
Example #24
                writeOutput(gene)

                if (not genesRead % 2500):
                    print "Processed %d entries..." % genesRead

    elif (args.gtf):

        # first parse the entire file into a dictionary of lists

        txDict = defaultdict(list)

        print "Building GTF dictionary..."

        # the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict.
        for line in GTF.lines(args.input):
            # only want to read in lines corresponding to these features
            if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]:
                txDict[line["transcript_id"]].append(line)
                genesRead += 1

                if (not genesRead % 25000):
                    print "\tProcessed %d lines..." % genesRead

        print "Dictionary built."

        print "Writing transcript properties."
        genesRead = 0

        # now create a Transcript object for each transcript and output it
Example #25
param_1 = sys.argv[1]  # gtf file
param_2 = sys.argv[2]  # promoter region length
param_3 = sys.argv[3]  # list of genes to calculate
param_4 = sys.argv[4]  # output file
param_5 = sys.argv[5]  # per_transcript or per_gene
param_6 = sys.argv[6]  # human genome reference fasta


promoter_length = int(param_2)
file_gene_filter = param_3
file_output = param_4


print "Read GTF file into memory"
result = GTF.dataframe(param_1)

print "Read gene list"
gene_filter_list = ReadFilterGene(file_gene_filter)

print "Calculate promoter region to bed format"
if param_5=="per_gene":
    promoter = CalculateAllPromoterRegions(result,promoter_length,gene_filter_list)
    promoter = filterOverlaps(promoter)
else:
    promoter = CalculateAllPromoterRegions2(result,promoter_length,gene_filter_list)

print "Writing temporary bed file"
temp_file = file_output+".tmp.txt"
printToFile(promoter,temp_file)
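ReadFilterGene, CalculateAllPromoterRegions, filterOverlaps, and printToFile are defined elsewhere in this script; a minimal sketch of the two simplest helpers, assuming the gene list holds one identifier per line and each promoter row is a BED-like tuple:

def ReadFilterGene(path):
    # Hypothetical reconstruction: one gene identifier per line, blanks skipped.
    with open(path) as fh:
        return set(line.strip() for line in fh if line.strip())

def printToFile(rows, path):
    # Hypothetical reconstruction: write tab-separated rows, one per line.
    with open(path, "w") as out:
        for row in rows:
            out.write("\t".join(str(field) for field in row) + "\n")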
Example #26
def main():
    with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \
      open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \
      open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \
      open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile:

        def writeOutput(gene):
            if (
                    useBlocks
            ):  # output all region primitives on the same line by specifying nBlocks and lists inside the BED output
                if (gene.coding):
                    #blockBedFormat is one line by definition
                    if (gene.utr5Len > 0):
                        utr5File.write(
                            gene.blockBedFormat(region="5utr") + "\n")
                    if (gene.utr5startLen > 0):
                        utr5StartFile.write(
                            gene.blockBedFormat(region="5utr_start") + "\n")
                    if (gene.cdsLen > 0):
                        cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
                    if (gene.utr3Len > 0):
                        utr3File.write(
                            gene.blockBedFormat(region="3utr") + "\n")

                    if (gene.exonsLen > 0):
                        exonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")
                        codingExonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")

                    if (gene.intronsLen > 0):
                        intronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")
                        codingIntronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")

                else:  # noncoding transcripts just have exons and introns
                    if (gene.exonsLen > 0):
                        exonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")
                        noncodingExonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")

                    if (gene.intronsLen > 0):
                        intronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")
                        noncodingIntronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")

            else:  # output one line per region primitive instead of combining regions via blocks
                if (gene.coding):
                    for entry in gene.bedFormat(region="5utr"):
                        utr5File.write(entry + "\n")
                    for entry in gene.bedFormat(region="5utr_start"):
                        utr5StartFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="cds"):
                        cdsFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="3utr"):
                        utr3File.write(entry + "\n")

                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        codingExonFile.write(entry + "\n")

                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        codingIntronFile.write(entry + "\n")

                else:  # noncoding transcripts just have exons and introns
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        noncodingExonFile.write(entry + "\n")

                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        noncodingIntronFile.write(entry + "\n")
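
        # For reference: blockBedFormat emits one BED12 line per region
        # (assuming the standard UCSC field order: chrom start end name
        # score strand thickStart thickEnd itemRgb blockCount blockSizes
        # blockStarts), while bedFormat yields one BED6 line per primitive.
        # Illustrative BED12 line (not real data):
        #   chr1  1000  5000  NM_0001  0  +  1000  5000  0  2  400,600,  0,3400,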

        if (args.ucsc):
            with open(args.input, "r") as genesFile:
                genesRead = 0

                for line in genesFile:
                    # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method

                    gene = createUCSCTranscript(line)
                    genesRead += 1

                    writeOutput(gene)

                    if (not genesRead % 2500):
                        print "Processed %d entries..." % genesRead

        elif (args.gtf):

            # first parse the entire file into a dictionary of lists

            txDict = defaultdict(list)

            print "Building GTF dictionary..."

            # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict.
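            # each line from GTF.lines is a dict of GTF fields plus parsed
            # attributes, e.g. (illustrative):
            #   {"seqname": "chr1", "feature": "exon", "start": "11869",
            #    "end": "12227", "strand": "+", "gene_id": "ENSG00000223972",
            #    "transcript_id": "ENST00000456328", ...}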
            for line in GTF.lines(args.input):

                txDict[line["transcript_id"]].append(line)
                genesRead += 1

                if (not genesRead % 100000):
                    print "Processed %d lines..." % genesRead

            print "Dictionary built."

            # now create a SNFGene object for each transcript and output it
            genesRead = 0
            for key in txDict:

                #print key

                tx = createGTFTranscript(txDict[key])

                #print tx
                writeOutput(tx)
                genesRead += 1

                if (not genesRead % 2500):
                    print "Processed %d entries..." % genesRead

    print "Processed %d entries." % genesRead

    # BTD Edit: making unique regions and linking to gene name
    # --------------------------------------------------------
    # utr5FName = args.output  + "_5utr.bed"
    # utr5StartFName = args.output  + "_5utr_start.bed"
    # cdsFName = args.output  + "_cds.bed"
    # utr3FName = args.output  + "_3utr.bed"
    # exonFName = args.output  + "_exons.bed"
    # intronFName = args.output  + "_introns.bed"
    # codingExonFName = args.output  + "_codingexons.bed"
    # codingIntronFName = args.output  + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA
    # noncodingExonFName = args.output  + "_noncodingexons.bed"
    # noncodingIntronFName = args.output  + "_noncodingintrons.bed"
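
    # The 'textdialect' csv dialect used throughout the steps below is
    # registered in the full script; a plausible registration (an assumption:
    # tab-delimited, no quoting) would be
    #   csv.register_dialect('textdialect', delimiter='\t', quoting=csv.QUOTE_NONE)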

    # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
    print "Getting gene ID"
    idToName = {}
    if args.ucsc:
        with open(args.input, 'r') as knownGeneFile:
            reader = csv.reader(knownGeneFile, 'textdialect')
            for row in reader:
                idToName[row[0]] = row[-1]

    # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
    print "Getting unique UTRs"

    def getUniqUTR(uniqFN, utrFN):
        with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
            already = set()
            reader = csv.reader(utr, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                if tuple(row[6:]) in already: continue  #repeat
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: row[3] = id + '__' + geneName
                else: row[3] = id
                already.add(tuple(row[6:]))
                writer.writerow(row)

    uniq5UTR = args.output + "_uniq_5utr.bed"
    getUniqUTR(uniq5UTR, utr5FName)

    uniq3UTR = args.output + '_uniq_3utr.bed'
    getUniqUTR(uniq3UTR, utr3FName)

    uniq5SUTR = args.output + '_uniq_5utr_start.bed'
    getUniqUTR(uniq5SUTR, utr5StartFName)

    # 3. Get unique exons + num. Do it 3x for all, coding, and noncoding
    print "Getting unique exons"

    def getUniqExons(uniqFN, exonFN):
        with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
            already = set()
            reader = csv.reader(exons, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate exon starts and lengths
                exonLengths = row[10].split(',')
                if exonLengths[-1] == '': exonLengths = exonLengths[:-1]
                exonLengths = [int(x) for x in exonLengths]
                exonStarts = row[11].split(',')
                if exonStarts[-1] == '': exonStarts = exonStarts[:-1]
                exonStarts = [int(x) for x in exonStarts]

                # calculate exons (a distinct name avoids shadowing the open
                # file handle `exons` from the with-statement above)
                exonList = []
                for i in range(len(exonStarts)):
                    absStart = start + exonStarts[i]
                    exonList.append([absStart, absStart + exonLengths[i]])
                if strand == '-': exonList = exonList[::-1]  # flip exon order

                # making BED6
                for i in range(len(exonList)):
                    exonNum = i + 1
                    exonNumInfo = str(exonNum) + 'of' + str(len(exonList))
                    exon = exonList[i]
                    outputRow = [chrom, exon[0], exon[1]]

                    # unique
                    if tuple(outputRow) in already: continue
                    already.add(tuple(outputRow))
                    outputRow.extend(
                        [geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
                    writer.writerow(outputRow)
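
    # worked example (illustrative): start = 1000, blockSizes = "400,600,",
    # blockStarts = "0,3400," gives exons [1000, 1400) and [4400, 5000); on
    # the - strand the list is reversed so exon 1 of 2 is the rightmost block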

    uniqExons = args.output + '_uniq_exons.bed'
    getUniqExons(uniqExons, exonFName)

    uniqExons = args.output + '_uniq_codingexons.bed'
    getUniqExons(uniqExons, codingExonFName)

    uniqExons = args.output + '_uniq_noncodingexons.bed'
    getUniqExons(uniqExons, noncodingExonFName)

    # 4. Get unique introns + num. unique 5'SS, 3'SS.
    # 5'SS is first base of intron, 3'SS is last base of intron
    print "Getting unique introns and 5' and 3' SS"

    def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
        with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
         open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
            alreadyIntron = set()
            already5 = set()
            already3 = set()

            reader = csv.reader(introns, 'textdialect')
            intronWriter = csv.writer(uniqIntron, 'textdialect')
            fiveWriter = csv.writer(uniq5, 'textdialect')
            threeWriter = csv.writer(uniq3, 'textdialect')

            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate intron starts and lengths
                intronLengths = row[10].split(',')
                if intronLengths[-1] == '': intronLengths = intronLengths[:-1]
                intronLengths = [int(x) for x in intronLengths]
                intronStarts = row[11].split(',')
                if intronStarts[-1] == '': intronStarts = intronStarts[:-1]
                intronStarts = [int(x) for x in intronStarts]

                # calculate introns (a distinct name avoids shadowing the open
                # file handle `introns` from the with-statement above)
                intronList = []
                for i in range(len(intronStarts)):
                    absStart = start + intronStarts[i]
                    intronList.append([absStart, absStart + intronLengths[i]])
                if strand == '-': intronList = intronList[::-1]  # flip intron order

                # making BED6
                for i in range(len(intronList)):
                    intronNum = i + 1
                    intronNumInfo = str(intronNum) + 'of' + str(len(intronList))
                    intron = intronList[i]
                    outputRow = [chrom, intron[0], intron[1]]

                    # unique introns
                    if tuple(outputRow) in alreadyIntron: continue
                    alreadyIntron.add(tuple(outputRow))
                    outputRow.extend(
                        [geneIDInfo + '__intron__' + intronNumInfo, 0, strand])
                    intronWriter.writerow(outputRow)

                    # unique splice sites
                    if strand == '+':
                        fiveSS = [chrom, intron[0], intron[0] + 1]
                        threeSS = [chrom, intron[1] - 1, intron[1]]
                    else:
                        threeSS = [chrom, intron[0], intron[0] + 1]
                        fiveSS = [chrom, intron[1] - 1, intron[1]]
                    if tuple(fiveSS) not in already5:
                        already5.add(tuple(fiveSS))
                        fiveSS.extend([
                            geneIDInfo + '__5ss__' + intronNumInfo, 0, strand
                        ])
                        fiveWriter.writerow(fiveSS)
                    if tuple(threeSS) not in already3:
                        already3.add(tuple(threeSS))
                        threeSS.extend([
                            geneIDInfo + '__3ss__' + intronNumInfo, 0, strand
                        ])
                        threeWriter.writerow(threeSS)

    uniqIntrons = args.output + '_uniq_introns.bed'
    uniq5 = args.output + '_uniq_5ss.bed'
    uniq3 = args.output + '_uniq_3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

    uniqIntrons = args.output + '_uniq_codingintrons.bed'
    uniq5 = args.output + '_uniq_coding5ss.bed'
    uniq3 = args.output + '_uniq_coding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

    uniqIntrons = args.output + '_uniq_noncodingintrons.bed'
    uniq5 = args.output + '_uniq_noncoding5ss.bed'
    uniq3 = args.output + '_uniq_noncoding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

    # 5. unique cdsStart, cdsEnd
    print "Getting unique cdsStart and cdsEnd"

    def getUniqCDSStartEnd(startFN, endFN, cdsFN):
        with open(startFN, 'w') as uniqStart, open(endFN, 'w') as uniqEnd, \
             open(cdsFN, 'r') as cds:
            alreadyStart = set()
            alreadyEnd = set()
            reader = csv.reader(cds, 'textdialect')
            startWriter = csv.writer(uniqStart, 'textdialect')
            endWriter = csv.writer(uniqEnd, 'textdialect')
            for row in reader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '+':
                    startRow = [chrom, start, start + 1]
                    endRow = [chrom, end - 1, end]
                else:
                    startRow = [chrom, end - 1, end]
                    endRow = [chrom, start, start + 1]
                if tuple(startRow) not in alreadyStart:
                    alreadyStart.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    startWriter.writerow(startRow)
                if tuple(endRow) not in alreadyEnd:
                    alreadyEnd.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    endWriter.writerow(endRow)

    uniqCDSStart = args.output + '_uniq_cdsStart.bed'
    uniqCDSEnd = args.output + '_uniq_cdsEnd.bed'
    getUniqCDSStartEnd(uniqCDSStart, uniqCDSEnd, cdsFName)

    # 6. unique TSS, TES
    print "Getting unique TSS and TES"

    def getUniqTSSAndTES(tssFN, tesFN, fiveFN, threeFN):
        with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, \
             open(fiveFN, 'r') as fiveUTR, open(threeFN, 'r') as threeUTR:
            alreadyTSS = set()
            fiveReader = csv.reader(fiveUTR, 'textdialect')
            tssWriter = csv.writer(uniqTSS, 'textdialect')
            for row in fiveReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '+':
                    startRow = [chrom, start, start + 1]
                else:
                    startRow = [chrom, end - 1, end]
                if tuple(startRow) not in alreadyTSS:
                    alreadyTSS.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    tssWriter.writerow(startRow)

            alreadyTES = set()
            threeReader = csv.reader(threeUTR, 'textdialect')
            tesWriter = csv.writer(uniqTES, 'textdialect')
            for row in threeReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                geneName = idToName.get(id, id)  # fall back to the raw ID
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '-':
                    endRow = [chrom, start, start + 1]
                else:
                    endRow = [chrom, end - 1, end]
                if tuple(endRow) not in alreadyTES:
                    alreadyTES.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    tesWriter.writerow(endRow)

    uniqTSS = args.output + '_uniq_tss.bed'
    uniqTES = args.output + '_uniq_tes.bed'

    getUniqTSSAndTES(uniqTSS, uniqTES, utr5FName, utr3FName)

    # sort everything
    print "Sorting BED files"
    for fn in glob.glob("*.bed"):
        os.system("sort -k1,1 -k2,2n %s -o %s" % (fn, fn))
                writeOutput(gene)

                if (not genesRead % 2500):
                    print "Processed %d entries..." %  genesRead

                
    elif (args.gtf):

        # first parse the entire file into a dictionary of lists

        txDict = defaultdict(list)

        print "Building GTF dictionary..." 

        # the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict. 
        for line in GTF.lines(args.input): 
            # only want to read in lines corresponding to these features
            if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]:
                txDict[line["transcript_id"]].append(line)
                genesRead += 1

                if (not genesRead % 25000):
                    print "\tProcessed %d lines..." %  genesRead

        print "Dictionary built." 

        print "Writing transcript properties."
        genesRead = 0
        
        # now create a Transcript object for each transcript and output it