def count_lowlevel_in_hightlevel(filename, low_level_name, high_level_name):
    """Count how many low-level sub-features fall inside each high-level feature.

    Walks the GTF file in order and, each time a new *high_level_name* record
    starts, writes the number of *low_level_name* records seen since the
    previous one.

    :param filename: GTF file to be processed.
    :param low_level_name: Feature name to count, e.g. "exon", "transcript".
    :param high_level_name: Enclosing feature name, e.g. "transcript", "gene".
    :return: None; one count per line is written to a file named
        "<low_level_name> number in each <high_level_name>".
    """
    occurrence = 0
    with open('{} number in each {}'.format(low_level_name, high_level_name), 'w') as f:
        # lookahead() pairs each item with a flag that is falsy on the last item.
        for idx, (line, has_more) in enumerate(lookahead(GTF.lines(filename))):
            feature = line['feature']
            if feature == high_level_name:
                # A new high-level record starts: flush the previous group's
                # count.  Skip the first records so no spurious 0 is emitted
                # before any group has been seen.
                if idx != 0 and idx != 1:
                    f.write(str(occurrence) + '\n')
                    occurrence = 0
            elif feature == low_level_name:
                occurrence += 1
            # BUG FIX: the original flushed the final group only when the last
            # GTF line was neither a high- nor a low-level feature (the flush
            # lived behind an `elif ... == False`), so a file ending in e.g. an
            # exon line silently dropped the last count.
            if not has_more and feature != high_level_name:
                f.write(str(occurrence) + '\n')
def main(): with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \ open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \ open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \ open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile: def writeOutput(gene): if (useBlocks): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output if(gene.coding): #blockBedFormat is one line by definition if (gene.utr5Len > 0): utr5File.write(gene.blockBedFormat(region="5utr") + "\n") if (gene.utr5startLen > 0): utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n") if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n") if (gene.utr3Len > 0): utr3File.write(gene.blockBedFormat(region="3utr") + "\n") if (gene.exonsLen > 0): exonFile.write(gene.blockBedFormat(region="exons") + "\n") codingExonFile.write(gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write(gene.blockBedFormat(region="introns") + "\n") codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n") else: # noncoding transcripts just have exons and introns if (gene.exonsLen > 0): exonFile.write(gene.blockBedFormat(region="exons") + "\n") noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write(gene.blockBedFormat(region="introns") + "\n") noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n") else: # output one line per region primitive instead of combining regions via blocks if(gene.coding): for entry in gene.bedFormat(region="5utr"): utr5File.write(entry + "\n") for entry in gene.bedFormat(region="5utr_start"): utr5StartFile.write(entry + "\n") for entry in gene.bedFormat(region="cds"): cdsFile.write(entry + "\n") for entry in 
gene.bedFormat(region="3utr"): utr3File.write(entry + "\n") for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") codingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") codingIntronFile.write(entry + "\n") else: # noncoding transcripts just have exons and introns for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") noncodingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") noncodingIntronFile.write(entry + "\n") if (args.ucsc): with open(args.input, "r") as genesFile: genesRead = 0 for line in genesFile: # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method gene = createUCSCTranscript(line) genesRead += 1 writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 100000): print "Processed %d lines..." % genesRead print "Dictionary built." # now create a SNFGene object for each transcript and output it genesRead = 0 for key in txDict: #print key tx = createGTFTranscript(txDict[key]) #print tx writeOutput(tx) genesRead += 1 if (not genesRead % 2500): print "Processed %d entries..." % genesRead print "Processed %d entries." 
% genesRead # BTD Edit: making unique regions and linking to gene name # -------------------------------------------------------- # utr5FName = args.output + "_5utr.bed" # utr5StartFName = args.output + "_5utr_start.bed" # cdsFName = args.output + "_cds.bed" # utr3FName = args.output + "_3utr.bed" # exonFName = args.output + "_exons.bed" # intronFName = args.output + "_introns.bed" # codingExonFName = args.output + "_codingexons.bed" # codingIntronFName = args.output + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA # noncodingExonFName = args.output + "_noncodingexons.bed" # noncodingIntronFName = args.output + "_noncodingintrons.bed" # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1) print "Getting gene ID" idToName = {} if args.ucsc: with open(args.input, 'r') as knownGeneFile: reader = csv.reader(knownGeneFile, 'textdialect') for row in reader: idToName[row[0]] = row[-1] # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR print "Getting unique UTRs" def getUniqUTR(uniqFN, utrFN): with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr: already = set() reader = csv.reader(utr, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: if tuple(row[6:]) in already: continue #repeat geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: row[3] = id + '__' + geneName else: row[3] = id already.add(tuple(row[6:])) writer.writerow(row) uniq5UTR = args.output + "_uniq_5utr.bed" getUniqUTR(uniq5UTR, utr5FName) uniq3UTR = args.output + '_uniq_3utr.bed' getUniqUTR(uniq3UTR, utr3FName) uniq5SUTR = args.output + '_uniq_5utr_start.bed' getUniqUTR(uniq5SUTR, utr5StartFName) # 3. Get unique exons + num. 
Do it 3x for all, coding, and noncoding print "Getting unique exons" def getUniqExons(uniqFN, exonFN): with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons: already = set() reader = csv.reader(exons, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate exon starts and lengths exonLengths = row[10].split(',') if exonLengths[-1] == '': exonLengths = exonLengths[:-1] exonLengths = [int(x) for x in exonLengths] exonStarts = row[11].split(',') if exonStarts[-1] == '': exonStarts = exonStarts[:-1] exonStarts = [int(x) for x in exonStarts] # calculate exons exons = [] for i in range(len(exonStarts)): absStart = start + exonStarts[i] exons.append([absStart, absStart + exonLengths[i]]) if strand == '-': exons = exons[::-1] #flip exon order # making BED6 for i in range(len(exons)): exonNum = i + 1 exonNumInfo = str(exonNum) + 'of' + str(len(exons)) exon = exons[i] outputRow = [chrom, exon[0], exon[1]] # unique if tuple(outputRow) in already: continue already.add(tuple(outputRow)) outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand]) writer.writerow(outputRow) uniqExons = args.output + '_uniq_exons.bed' getUniqExons(uniqExons, exonFName) uniqExons = args.output + '_uniq_codingexons.bed' getUniqExons(uniqExons, codingExonFName) uniqExons = args.output + '_uniq_noncodingexons.bed' getUniqExons(uniqExons, noncodingExonFName) # 4. Get unique introns + num. unique 5'SS, 3'SS. 
# 5'SS is first base of intron, 3'SS is last base of intron print "Getting unique introns and 5' and 3' SS" def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN): with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \ open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns: alreadyIntron = set() already5 = set() already3 = set() reader = csv.reader(introns, 'textdialect') intronWriter = csv.writer(uniqIntron, 'textdialect') fiveWriter = csv.writer(uniq5, 'textdialect') threeWriter = csv.writer(uniq3, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate intron starts and lengths intronLengths = row[10].split(',') if intronLengths[-1] == '': intronLengths = intronLengths[:-1] intronLengths = [int(x) for x in intronLengths] intronStarts = row[11].split(',') if intronStarts[-1] == '': intronStarts = intronStarts[:-1] intronStarts = [int(x) for x in intronStarts] # calculate introns introns = [] for i in range(len(intronStarts)): absStart = start + intronStarts[i] introns.append([absStart, absStart + intronLengths[i]]) if strand == '-': introns = introns[::-1] #flip intron order # making BED6 for i in range(len(introns)): intronNum = i + 1 intronNumInfo = str(intronNum) + 'of' + str(len(introns)) intron = introns[i] outputRow = [chrom, intron[0], intron[1]] # unique introns if tuple(outputRow) in alreadyIntron: continue alreadyIntron.add(tuple(outputRow)) outputRow.extend([geneIDInfo+ '__intron__' + intronNumInfo, 0, strand]) intronWriter.writerow(outputRow) # unique splice sites if strand == '+': fiveSS = [chrom, intron[0], intron[0] + 1] threeSS = [chrom, intron[1] - 1, intron[1]] else: threeSS = [chrom, intron[0], intron[0] + 1] fiveSS = [chrom, 
intron[1] - 1, intron[1]] if tuple(fiveSS) not in already5: already5.add(tuple(fiveSS)) fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand]) fiveWriter.writerow(fiveSS) if tuple(threeSS) not in already3: already3.add(tuple(threeSS)) threeSS.extend([geneIDInfo+ '__3ss__' + intronNumInfo, 0, strand]) threeWriter.writerow(threeSS) uniqIntrons = args.output + '_uniq_introns.bed' uniq5 = args.output + '_uniq_5ss.bed' uniq3 = args.output + '_uniq_3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName) uniqIntrons = args.output + '_uniq_codingintrons.bed' uniq5 = args.output + '_uniq_coding5ss.bed' uniq3 = args.output + '_uniq_coding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName) uniqIntrons = args.output + '_uniq_noncodingintrons.bed' uniq5 = args.output + '_uniq_noncoding5ss.bed' uniq3 = args.output + '_uniq_noncoding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName) # 5. unique TSS/TES print "Getting unique TSS and TES" def getUniqTSSAndTES(tssFN, tesFN, cdsFN): with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open(cdsFN, 'r') as cds: alreadyTSS = set() alreadyTES = set() reader = csv.reader(cds, 'textdialect') tssWriter = csv.writer(uniqTSS, 'textdialect') tesWriter = csv.writer(uniqTES, 'textdialect') for row in reader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] endRow = [chrom, end - 1, end] else: startRow = [chrom, end - 1, end] endRow = [chrom, start, start + 1] if tuple(startRow) not in alreadyTSS: alreadyTSS.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) tssWriter.writerow(startRow) if tuple(endRow) not in alreadyTSS: alreadyTES.add(tuple(endRow)) 
endRow.extend([geneIDInfo, 0, strand]) tesWriter.writerow(endRow) uniqTSS = args.output + '_uniq_tss.bed' uniqTES = args.output + '_uniq_tes.bed' getUniqTSSAndTES(uniqTSS, uniqTES, cdsFName) # sort everything print "Sorting BED files" for fn in glob.glob("*.bed"): os.system("sort -k1,1 -k2,2n %s -o %s"%(fn, fn))
writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): # only want to read in lines corresponding to these features if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]: txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 25000): print "\tProcessed %d lines..." % genesRead print "Dictionary built." print "Writing transcript properties." genesRead = 0 # now create a Transcript object for each transcript and output it
def main(): with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \ open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \ open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \ open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile: def writeOutput(gene): if ( useBlocks ): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output if (gene.coding): #blockBedFormat is one line by definition if (gene.utr5Len > 0): utr5File.write( gene.blockBedFormat(region="5utr") + "\n") if (gene.utr5startLen > 0): utr5StartFile.write( gene.blockBedFormat(region="5utr_start") + "\n") if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n") if (gene.utr3Len > 0): utr3File.write( gene.blockBedFormat(region="3utr") + "\n") if (gene.exonsLen > 0): exonFile.write( gene.blockBedFormat(region="exons") + "\n") codingExonFile.write( gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write( gene.blockBedFormat(region="introns") + "\n") codingIntronFile.write( gene.blockBedFormat(region="introns") + "\n") else: # noncoding transcripts just have exons and introns if (gene.exonsLen > 0): exonFile.write( gene.blockBedFormat(region="exons") + "\n") noncodingExonFile.write( gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write( gene.blockBedFormat(region="introns") + "\n") noncodingIntronFile.write( gene.blockBedFormat(region="introns") + "\n") else: # output one line per region primitive instead of combining regions via blocks if (gene.coding): for entry in gene.bedFormat(region="5utr"): utr5File.write(entry + "\n") for entry in gene.bedFormat(region="5utr_start"): utr5StartFile.write(entry + "\n") for entry in gene.bedFormat(region="cds"): cdsFile.write(entry + 
"\n") for entry in gene.bedFormat(region="3utr"): utr3File.write(entry + "\n") for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") codingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") codingIntronFile.write(entry + "\n") else: # noncoding transcripts just have exons and introns for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") noncodingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") noncodingIntronFile.write(entry + "\n") if (args.ucsc): with open(args.input, "r") as genesFile: genesRead = 0 for line in genesFile: # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method gene = createUCSCTranscript(line) genesRead += 1 writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 100000): print "Processed %d lines..." % genesRead print "Dictionary built." # now create a SNFGene object for each transcript and output it genesRead = 0 for key in txDict: #print key tx = createGTFTranscript(txDict[key]) #print tx writeOutput(tx) genesRead += 1 if (not genesRead % 2500): print "Processed %d entries..." % genesRead print "Processed %d entries." 
% genesRead # BTD Edit: making unique regions and linking to gene name # -------------------------------------------------------- # utr5FName = args.output + "_5utr.bed" # utr5StartFName = args.output + "_5utr_start.bed" # cdsFName = args.output + "_cds.bed" # utr3FName = args.output + "_3utr.bed" # exonFName = args.output + "_exons.bed" # intronFName = args.output + "_introns.bed" # codingExonFName = args.output + "_codingexons.bed" # codingIntronFName = args.output + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA # noncodingExonFName = args.output + "_noncodingexons.bed" # noncodingIntronFName = args.output + "_noncodingintrons.bed" # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1) print "Getting gene ID" idToName = {} if args.ucsc: with open(args.input, 'r') as knownGeneFile: reader = csv.reader(knownGeneFile, 'textdialect') for row in reader: idToName[row[0]] = row[-1] # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR print "Getting unique UTRs" def getUniqUTR(uniqFN, utrFN): with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr: already = set() reader = csv.reader(utr, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: if tuple(row[6:]) in already: continue #repeat geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: row[3] = id + '__' + geneName else: row[3] = id already.add(tuple(row[6:])) writer.writerow(row) uniq5UTR = args.output + "_uniq_5utr.bed" getUniqUTR(uniq5UTR, utr5FName) uniq3UTR = args.output + '_uniq_3utr.bed' getUniqUTR(uniq3UTR, utr3FName) uniq5SUTR = args.output + '_uniq_5utr_start.bed' getUniqUTR(uniq5SUTR, utr5StartFName) # 3. Get unique exons + num. 
Do it 3x for all, coding, and noncoding print "Getting unique exons" def getUniqExons(uniqFN, exonFN): with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons: already = set() reader = csv.reader(exons, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate exon starts and lengths exonLengths = row[10].split(',') if exonLengths[-1] == '': exonLengths = exonLengths[:-1] exonLengths = [int(x) for x in exonLengths] exonStarts = row[11].split(',') if exonStarts[-1] == '': exonStarts = exonStarts[:-1] exonStarts = [int(x) for x in exonStarts] # calculate exons exons = [] for i in range(len(exonStarts)): absStart = start + exonStarts[i] exons.append([absStart, absStart + exonLengths[i]]) if strand == '-': exons = exons[::-1] #flip exon order # making BED6 for i in range(len(exons)): exonNum = i + 1 exonNumInfo = str(exonNum) + 'of' + str(len(exons)) exon = exons[i] outputRow = [chrom, exon[0], exon[1]] # unique if tuple(outputRow) in already: continue already.add(tuple(outputRow)) outputRow.extend( [geneIDInfo + '__exon__' + exonNumInfo, 0, strand]) writer.writerow(outputRow) uniqExons = args.output + '_uniq_exons.bed' getUniqExons(uniqExons, exonFName) uniqExons = args.output + '_uniq_codingexons.bed' getUniqExons(uniqExons, codingExonFName) uniqExons = args.output + '_uniq_noncodingexons.bed' getUniqExons(uniqExons, noncodingExonFName) # 4. Get unique introns + num. unique 5'SS, 3'SS. 
# 5'SS is first base of intron, 3'SS is last base of intron print "Getting unique introns and 5' and 3' SS" def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN): with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \ open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns: alreadyIntron = set() already5 = set() already3 = set() reader = csv.reader(introns, 'textdialect') intronWriter = csv.writer(uniqIntron, 'textdialect') fiveWriter = csv.writer(uniq5, 'textdialect') threeWriter = csv.writer(uniq3, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate intron starts and lengths intronLengths = row[10].split(',') if intronLengths[-1] == '': intronLengths = intronLengths[:-1] intronLengths = [int(x) for x in intronLengths] intronStarts = row[11].split(',') if intronStarts[-1] == '': intronStarts = intronStarts[:-1] intronStarts = [int(x) for x in intronStarts] # calculate introns introns = [] for i in range(len(intronStarts)): absStart = start + intronStarts[i] introns.append([absStart, absStart + intronLengths[i]]) if strand == '-': introns = introns[::-1] #flip intron order # making BED6 for i in range(len(introns)): intronNum = i + 1 intronNumInfo = str(intronNum) + 'of' + str(len(introns)) intron = introns[i] outputRow = [chrom, intron[0], intron[1]] # unique introns if tuple(outputRow) in alreadyIntron: continue alreadyIntron.add(tuple(outputRow)) outputRow.extend( [geneIDInfo + '__intron__' + intronNumInfo, 0, strand]) intronWriter.writerow(outputRow) # unique splice sites if strand == '+': fiveSS = [chrom, intron[0], intron[0] + 1] threeSS = [chrom, intron[1] - 1, intron[1]] else: threeSS = [chrom, intron[0], intron[0] + 1] fiveSS = 
[chrom, intron[1] - 1, intron[1]] if tuple(fiveSS) not in already5: already5.add(tuple(fiveSS)) fiveSS.extend([ geneIDInfo + '__5ss__' + intronNumInfo, 0, strand ]) fiveWriter.writerow(fiveSS) if tuple(threeSS) not in already3: already3.add(tuple(threeSS)) threeSS.extend([ geneIDInfo + '__3ss__' + intronNumInfo, 0, strand ]) threeWriter.writerow(threeSS) uniqIntrons = args.output + '_uniq_introns.bed' uniq5 = args.output + '_uniq_5ss.bed' uniq3 = args.output + '_uniq_3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName) uniqIntrons = args.output + '_uniq_codingintrons.bed' uniq5 = args.output + '_uniq_coding5ss.bed' uniq3 = args.output + '_uniq_coding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName) uniqIntrons = args.output + '_uniq_noncodingintrons.bed' uniq5 = args.output + '_uniq_noncoding5ss.bed' uniq3 = args.output + '_uniq_noncoding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName) # 5. unique cdsStart, cdsEnd print "Getting unique cdsStart and cdsEnd" def getUniqCDSStartEnd(startFN, endFN, cdsFN): with open(startFN, 'w') as uniqStart, open(endFN, 'w') as uniqEnd, open(cdsFN, 'r') as cds: alreadyStart = set() alreadyEnd = set() reader = csv.reader(cds, 'textdialect') startWriter = csv.writer(uniqStart, 'textdialect') endWriter = csv.writer(uniqEnd, 'textdialect') for row in reader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] endRow = [chrom, end - 1, end] else: startRow = [chrom, end - 1, end] endRow = [chrom, start, start + 1] if tuple(startRow) not in alreadyStart: alreadyStart.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) startWriter.writerow(startRow) if tuple(endRow) not in alreadyEnd: 
alreadyEnd.add(tuple(endRow)) endRow.extend([geneIDInfo, 0, strand]) endWriter.writerow(endRow) uniqCDSStart = args.output + '_uniq_cdsStart.bed' uniqCDSEnd = args.output + '_uniq_cdsEnd.bed' getUniqCDSStartEnd(uniqCDSStart, uniqCDSEnd, cdsFName) # 6. unique TSS, TES print "Getting unique TSS and TES" def getUniqTSSAndTES(tssFN, tesFN, fiveFN, threeFN): with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open( fiveFN, 'r') as fiveUTR, open(threeFN, 'r') as threeUTR: alreadyTSS = set() fiveReader = csv.reader(fiveUTR, 'textdialect') tssWriter = csv.writer(uniqTSS, 'textdialect') for row in fiveReader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] else: startRow = [chrom, end - 1, end] if tuple(startRow) not in alreadyTSS: alreadyTSS.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) tssWriter.writerow(startRow) alreadyTES = set() threeReader = csv.reader(threeUTR, 'textdialect') tesWriter = csv.writer(uniqTES, 'textdialect') for row in threeReader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '-': endRow = [chrom, start, start + 1] else: endRow = [chrom, end - 1, end] if tuple(endRow) not in alreadyTES: alreadyTES.add(tuple(endRow)) endRow.extend([geneIDInfo, 0, strand]) tesWriter.writerow(endRow) uniqTSS = args.output + '_uniq_tss.bed' uniqTES = args.output + '_uniq_tes.bed' getUniqTSSAndTES(uniqTSS, uniqTES, utr5FName, utr3FName) # sort everything print "Sorting BED files" for fn in glob.glob("*.bed"): os.system("sort -k1,1 
-k2,2n %s -o %s" % (fn, fn))
def main(argv): # Ottengo la stringa relativa al file da processare input_file = argv[0] temp_folder = argv[1] username = argv[2] experiment = argv[3] species = argv[4] config = json.load(open('../configuration.json')) temp_token = username + '_' + str(uuid.uuid4()) # Creo i csv per memorizzare le informazioni sui nodi chromosome_csv = open(temp_folder + temp_token + '_chromosome.csv', 'w') gene_csv = open(temp_folder + temp_token + '_gene.csv', 'w') transcript_csv = open(temp_folder + temp_token + '_transcript.csv', 'w') exon_csv = open(temp_folder + temp_token + '_exon.csv', 'w') # Creo i csv per memorizzare le informazioni sulle relazioni contains_csv = open(temp_folder + temp_token + '_contains.csv', 'w') in_chromosome_csv = open(temp_folder + temp_token + '_in_chromosome.csv', 'w') has_transcript_csv = open(temp_folder + temp_token + '_has_transcript.csv', 'w') has_exon_csv = open(temp_folder + temp_token + '_has_exon.csv', 'w') # Inizializzo i writer per tutti i file # ---- nodi chromosomeWriter = csv.writer(chromosome_csv, delimiter=',') geneWriter = csv.writer(gene_csv, delimiter=',') transcriptWriter = csv.writer(transcript_csv, delimiter=',') exonWriter = csv.writer(exon_csv, delimiter=',') # ---- relazioni containsWriter = csv.writer(contains_csv, delimiter=',') inChromosomeWriter = csv.writer(in_chromosome_csv, delimiter=',') hasTranscriptWriter = csv.writer(has_transcript_csv, delimiter=',') hasExonWriter = csv.writer(has_exon_csv, delimiter=',') # Cotruisco gli header dei file # ---- nodi chromosome_header = ["chromosome"] gene_header = ["gene_id"] transcript_header = [ "transcript_id", "reference_id", "cov", "FPKM", "TPM", "start", "end" ] exon_header = ["exon_id", "exon_number", "start", "end", "cov"] # ---- relazioni contains_header = ["name", "gene_id"] in_chromosome_header = ["gene_id", "chromosome"] has_transcript_header = ["gene_id", "strand", "transcript_id"] has_exon_header = ["transcript_id", "exon_id"] # Scrivo gli header nei rispettivi 
file # ---- nodi chromosomeWriter.writerow(chromosome_header) geneWriter.writerow(gene_header) transcriptWriter.writerow(transcript_header) exonWriter.writerow(exon_header) # ---- relazioni containsWriter.writerow(contains_header) inChromosomeWriter.writerow(in_chromosome_header) hasTranscriptWriter.writerow(has_transcript_header) hasExonWriter.writerow(has_exon_header) # Inizializzo le strutture dati necessarie al parsing (per ottimizzare il caricamento dei dati su database) # ---- nodi chromosomes = set() genes_dict = {} transcripts_dict = {} # ---- relazioni contains_dict = {} in_chromosome_dict = {} has_transcript_dict = {} print 'Starting parsing procedure for file ' + input_file properties = { "name": os.path.basename(input_file), "extension": os.path.splitext(input_file)[1] } # Connessione a Neo4j driver = GraphDatabase.driver("bolt://" + config["neo4j"]["address"], auth=basic_auth(config["neo4j"]["username"], config["neo4j"]["password"])) # Inizializzazione degli indici session = driver.session() statements = [ "CREATE INDEX ON :File(name);", "CREATE INDEX ON :Species(species);", "CREATE INDEX ON :Gene(gene_id);", "CREATE INDEX ON :Chromosome(chromosome);", "CREATE INDEX ON :Transcript(transcript_id);", "CREATE INDEX ON :Exon(exon_id);" ] for statement in statements: session.run(statement) session.close() print 'Parsing file...' 
# inizializzo un contatore per fare un load parziale del file su database per file troppo grandi row_count = 0 for line in GTF.lines(input_file): row_count += 1 # memorizzo il cromosoma chromosomes.add(line["seqname"]) # memorizzo il gene (se non presente) if not genes_dict.has_key(line["gene_id"]): genes_dict[line["gene_id"]] = [ line[attr] if line.has_key(attr) else "None" for attr in gene_header ] # memorizzo la relazione (file)-[contiene]->(gene) (se non esiste) if not contains_dict.has_key(properties["name"] + ':' + line["gene_id"]): contains_dict[properties["name"] + ':' + line["gene_id"]] = [ properties["name"], line["gene_id"] ] # memorizzo la relazione (gene)-[contenuto in]->(cromosoma) (se non esiste) if not in_chromosome_dict.has_key(line["gene_id"] + ':' + line["seqname"]): in_chromosome_dict[line["gene_id"] + ':' + line["seqname"]] = [ line["gene_id"], line["seqname"] ] # a seconda della feature considerata (transcript, exon) memorizzo opportunamente le informazioni della riga if line['feature'] == 'transcript': # memorizzo il trascritto (se non presente) if not transcripts_dict.has_key(line["transcript_id"]): transcripts_dict[line["transcript_id"]] = [ line[attr] if line.has_key(attr) else "None" for attr in transcript_header ] # memorizzo la relazione (gene)-[contiente]->(trascritto) (se non esiste) if not has_transcript_dict.has_key(line["gene_id"] + ':' + line["transcript_id"]): has_transcript_dict[line["gene_id"] + ':' + line["transcript_id"]] = [ line[attr] for attr in has_transcript_header ] elif line['feature'] == 'exon': #definisco un ID per l'esone (necessario per il popolamento su db) exon_id = line["exon_number"] + ':' + line["transcript_id"] # memorizzo l'esone nel file csv exonWriter.writerow([exon_id] + [ line[attr] if line.has_key(attr) else "None" for attr in exon_header[1:] ]) #memorizzo la relazione (trascritto)-[contiene]->(esone) nel file csv hasExonWriter.writerow([line["transcript_id"], exon_id]) if not (row_count % 15000): print 
str(row_count) + " scanned" # scrivo i file csv dei dict creati in precedenza for chrom in list(chromosomes): chromosomeWriter.writerow([chrom]) for gene in genes_dict.keys(): geneWriter.writerow(genes_dict[gene]) for transcript in transcripts_dict.keys(): transcriptWriter.writerow(transcripts_dict[transcript]) for entry in contains_dict.keys(): containsWriter.writerow(contains_dict[entry]) for entry in in_chromosome_dict.keys(): inChromosomeWriter.writerow(in_chromosome_dict[entry]) for entry in has_transcript_dict.keys(): hasTranscriptWriter.writerow(has_transcript_dict[entry]) # termino la scrittura dei file csv # ---- nodi chromosome_csv.close() gene_csv.close() transcript_csv.close() exon_csv.close() # ---- relazioni contains_csv.close() in_chromosome_csv.close() has_transcript_csv.close() has_exon_csv.close() print 'Populating Database...' session = driver.session() prova = [ "MERGE (u:User { username:{username} })", "MERGE (e:Experiment { name:{experiment} })", "MERGE (s:Species {species: {species} })", "MERGE (f:File { name:{properties}.name }) ON CREATE SET f += {properties}", "MERGE (u)-[:Created]->(e)", "MERGE (e)-[:For_Species]->(s)", "MERGE (e)-[:Composed_By]->(f)" ] # Associo il file all'utente session.run( " ".join(prova), { "username": username, "experiment": experiment, "species": species, "properties": properties }) session.close() populateDB(driver, temp_folder + temp_token) print 'Done.'