Example #1
def count_lowlevel_in_hightlevel(filename, low_level_name, high_level_name):
    """
    To count how many sub-features in a highlevel feature.
    :param filename: File used to be processed.
    :param low_level_name: Feature names in GTF file. Such as "exon", "transcript".
    :param high_level_name: Feature names in GTF file. Such as "exon", "transcript", "gene".
    :return: No return, but output to file directly.
    """
    occurrence = 0
    with open('{} number in each {}'.format(low_level_name, high_level_name),
              'w') as f:
        for idx, item_with_bool in enumerate(lookahead(GTF.lines(filename))):
            if item_with_bool[0]['feature'] == high_level_name:
                if idx != 0 and idx != 1:
                    f.write(str(occurrence) + '\n')
                occurrence = 0
            elif item_with_bool[0]['feature'] == low_level_name:
                occurrence += 1
            elif not item_with_bool[1]:
                f.write(str(occurrence) + '\n')
            else:
                continue
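
# Note: the function above depends on two helpers assumed to come from the
# surrounding project and not shown here: GTF.lines(), assumed to yield one dict
# per GTF record (with keys such as "feature"), and lookahead(), which pairs each
# item with a flag saying whether more items follow. A minimal sketch of such a
# lookahead() helper (an assumption, not the original implementation):
def lookahead(iterable):
    """Yield (item, has_more) tuples; has_more is False only for the last item."""
    it = iter(iterable)
    try:
        prev = next(it)
    except StopIteration:
        return
    for item in it:
        yield (prev, True)
        prev = item
    yield (prev, False)
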
def main():
	with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \
			open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \
			open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \
			open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile:

		def writeOutput(gene):
			if (useBlocks): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output
				if(gene.coding):
					#blockBedFormat is one line by definition
					if (gene.utr5Len > 0): utr5File.write(gene.blockBedFormat(region="5utr") + "\n")
					if (gene.utr5startLen > 0): utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n")
					if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
					if (gene.utr3Len > 0): utr3File.write(gene.blockBedFormat(region="3utr") + "\n")
				
					if (gene.exonsLen > 0):
						exonFile.write(gene.blockBedFormat(region="exons") + "\n")
						codingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
					
					if (gene.intronsLen > 0):
						intronFile.write(gene.blockBedFormat(region="introns") + "\n")
						codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
						
				else: # noncoding transcripts just have exons and introns
					if (gene.exonsLen > 0):
						exonFile.write(gene.blockBedFormat(region="exons") + "\n")
						noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n")

					if (gene.intronsLen > 0):
						intronFile.write(gene.blockBedFormat(region="introns") + "\n")
						noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")

			else: # output one line per region primitive instead of combining regions via blocks
				if(gene.coding):
					for entry in gene.bedFormat(region="5utr"):
						utr5File.write(entry + "\n")
					for entry in gene.bedFormat(region="5utr_start"):
						utr5StartFile.write(entry + "\n")
					for entry in gene.bedFormat(region="cds"):
						cdsFile.write(entry + "\n")
					for entry in gene.bedFormat(region="3utr"):
						utr3File.write(entry + "\n")

					for entry in gene.bedFormat(region="exons"):
						exonFile.write(entry + "\n")
						codingExonFile.write(entry + "\n")

					for entry in gene.bedFormat(region="introns"):
						intronFile.write(entry + "\n")
						codingIntronFile.write(entry + "\n")

				else: # noncoding transcripts just have exons and introns
					for entry in gene.bedFormat(region="exons"):
						exonFile.write(entry + "\n")
						noncodingExonFile.write(entry + "\n")

					for entry in gene.bedFormat(region="introns"):
						intronFile.write(entry + "\n")
						noncodingIntronFile.write(entry + "\n")


		if (args.ucsc): 
			with open(args.input, "r") as genesFile: 
				genesRead = 0

				for line in genesFile:
					# all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method

					gene = createUCSCTranscript(line) 
					genesRead += 1

					writeOutput(gene)

					if (not genesRead % 2500):
						print "Processed %d entries..." %  genesRead

					
		elif (args.gtf):

			# first parse the entire file into a dictionary of lists
			txDict = defaultdict(list)
			genesRead = 0

			print "Building GTF dictionary..." 

			# the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. 
			for line in GTF.lines(args.input): 

				txDict[line["transcript_id"]].append(line)
				genesRead += 1

				if (not genesRead % 100000):
					print "Processed %d lines..." %  genesRead

			print "Dictionary built." 

			# now create a SNFGene object for each transcript and output it 
			genesRead = 0
			for key in txDict: 

				#print key

				tx = createGTFTranscript(txDict[key])

				#print tx 
				writeOutput(tx)
				genesRead += 1
				
				if (not genesRead % 2500):
					print "Processed %d entries..." %  genesRead


	print "Processed %d entries." %  genesRead

	# BTD Edit: making unique regions and linking to gene name
	# --------------------------------------------------------
	# utr5FName = args.output  + "_5utr.bed"
	# utr5StartFName = args.output  + "_5utr_start.bed"
	# cdsFName = args.output  + "_cds.bed"
	# utr3FName = args.output  + "_3utr.bed"
	# exonFName = args.output  + "_exons.bed"
	# intronFName = args.output  + "_introns.bed"
	# codingExonFName = args.output  + "_codingexons.bed"
	# codingIntronFName = args.output  + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA 
	# noncodingExonFName = args.output  + "_noncodingexons.bed" 
	# noncodingIntronFName = args.output  + "_noncodingintrons.bed" 
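	# Note: the 'textdialect' csv dialect used below is assumed to be registered
	# elsewhere in the original script, e.g. (a sketch, not the original code):
	#   csv.register_dialect('textdialect', delimiter='\t', quoting=csv.QUOTE_NONE)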

	# 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
	print "Getting gene ID"
	idToName = {}
	if args.ucsc:
		with open(args.input, 'r') as knownGeneFile:
			reader = csv.reader(knownGeneFile, 'textdialect')
			for row in reader:
				idToName[row[0]] = row[-1]
			
	# 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
	print "Getting unique UTRs"
	def getUniqUTR(uniqFN, utrFN):
		with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
			already = set()
			reader = csv.reader(utr, 'textdialect')
			writer = csv.writer(uniq, 'textdialect')
			for row in reader:
				if tuple(row[6:]) in already: continue #repeat
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: row[3] = id + '__' + geneName
				else: row[3] = id
				already.add(tuple(row[6:]))
				writer.writerow(row)
				
	uniq5UTR = args.output  + "_uniq_5utr.bed"
	getUniqUTR(uniq5UTR, utr5FName)

	uniq3UTR = args.output  + '_uniq_3utr.bed'
	getUniqUTR(uniq3UTR, utr3FName)

	uniq5SUTR = args.output  + '_uniq_5utr_start.bed'
	getUniqUTR(uniq5SUTR, utr5StartFName)
		
	# 3. Get unique exons + num. Do it 3x for all, coding, and noncoding
	print "Getting unique exons"
	def getUniqExons(uniqFN, exonFN):
		with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
			already = set()
			reader = csv.reader(exons, 'textdialect')
			writer = csv.writer(uniq, 'textdialect')
			for row in reader:
				# gene ID info
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				start, end = int(row[1]), int(row[2])
				strand = row[5]

				# calculate exon starts and lengths
				exonLengths = row[10].split(',')
				if exonLengths[-1] == '': exonLengths = exonLengths[:-1]
				exonLengths = [int(x) for x in exonLengths]
				exonStarts = row[11].split(',')
				if exonStarts[-1] == '': exonStarts = exonStarts[:-1]
				exonStarts = [int(x) for x in exonStarts]
				
				# calculate exons
				exons = []
				for i in range(len(exonStarts)):
					absStart = start + exonStarts[i]
					exons.append([absStart, absStart + exonLengths[i]])
				if strand == '-': exons = exons[::-1] #flip exon order
				
				# making BED6
				for i in range(len(exons)):
					exonNum = i + 1
					exonNumInfo = str(exonNum) + 'of' + str(len(exons))
					exon = exons[i]
					outputRow = [chrom, exon[0], exon[1]]
					
					# unique
					if tuple(outputRow) in already: continue
					already.add(tuple(outputRow))            
					outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
					writer.writerow(outputRow)
				
	uniqExons = args.output  + '_uniq_exons.bed'
	getUniqExons(uniqExons, exonFName)

	uniqExons = args.output  + '_uniq_codingexons.bed'
	getUniqExons(uniqExons, codingExonFName)

	uniqExons = args.output  + '_uniq_noncodingexons.bed'
	getUniqExons(uniqExons, noncodingExonFName)            

	# 4. Get unique introns + num. unique 5'SS, 3'SS. 
	# 5'SS is first base of intron, 3'SS is last base of intron
	print "Getting unique introns and 5' and 3' SS"
	def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
		with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
			open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
			alreadyIntron = set()
			already5 = set()
			already3 = set()
			
			reader = csv.reader(introns, 'textdialect')
			intronWriter = csv.writer(uniqIntron, 'textdialect')
			fiveWriter = csv.writer(uniq5, 'textdialect')
			threeWriter = csv.writer(uniq3, 'textdialect')
			
			for row in reader:
				# gene ID info
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				start, end = int(row[1]), int(row[2])
				strand = row[5]

				# calculate intron starts and lengths
				intronLengths = row[10].split(',')
				if intronLengths[-1] == '': intronLengths = intronLengths[:-1]
				intronLengths = [int(x) for x in intronLengths]
				intronStarts = row[11].split(',')
				if intronStarts[-1] == '': intronStarts = intronStarts[:-1]
				intronStarts = [int(x) for x in intronStarts]
				
				# calculate introns
				introns = []
				for i in range(len(intronStarts)):
					absStart = start + intronStarts[i]
					introns.append([absStart, absStart + intronLengths[i]])
				if strand == '-': introns = introns[::-1] #flip intron order
				
				# making BED6
				for i in range(len(introns)):
					intronNum = i + 1
					intronNumInfo = str(intronNum) + 'of' + str(len(introns))
					intron = introns[i]
					outputRow = [chrom, intron[0], intron[1]]
					
					# unique introns
					if tuple(outputRow) in alreadyIntron: continue
					alreadyIntron.add(tuple(outputRow))
					outputRow.extend([geneIDInfo+ '__intron__' + intronNumInfo, 0, strand])
					intronWriter.writerow(outputRow)
					
					# unique splice sites
					if strand == '+':
						fiveSS = [chrom, intron[0], intron[0] + 1]
						threeSS = [chrom, intron[1] - 1, intron[1]]
					else:
						threeSS = [chrom, intron[0], intron[0] + 1]
						fiveSS = [chrom, intron[1] - 1, intron[1]]
					if tuple(fiveSS) not in already5:
						already5.add(tuple(fiveSS))
						fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand])
						fiveWriter.writerow(fiveSS)
					if tuple(threeSS) not in already3:
						already3.add(tuple(threeSS))
						threeSS.extend([geneIDInfo+ '__3ss__' + intronNumInfo, 0, strand])
						threeWriter.writerow(threeSS)

	uniqIntrons = args.output  + '_uniq_introns.bed'
	uniq5 = args.output  + '_uniq_5ss.bed'
	uniq3 = args.output  + '_uniq_3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

	uniqIntrons = args.output  + '_uniq_codingintrons.bed'
	uniq5 = args.output  + '_uniq_coding5ss.bed'
	uniq3 = args.output  + '_uniq_coding3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

	uniqIntrons = args.output  + '_uniq_noncodingintrons.bed'
	uniq5 = args.output  + '_uniq_noncoding5ss.bed'
	uniq3 = args.output  + '_uniq_noncoding3ss.bed'
	getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

	# 5. unique TSS/TES
	print "Getting unique TSS and TES"
	def getUniqTSSAndTES(tssFN, tesFN, cdsFN):
		with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open(cdsFN, 'r') as cds:
			alreadyTSS = set()
			alreadyTES = set()
			reader = csv.reader(cds, 'textdialect')
			tssWriter = csv.writer(uniqTSS, 'textdialect')
			tesWriter = csv.writer(uniqTES, 'textdialect')
			for row in reader:
				geneIDInfo = row[3]
				id = geneIDInfo.split('__')[0]
				try: geneName = idToName[id]
				except: geneName = id
				if geneName != id: geneIDInfo = id + '__' + geneName
				else: geneIDInfo = id
				
				# chrom, start, stop, strand
				chrom = row[0]
				strand = row[5]
				start, end = int(row[1]), int(row[2])
				
				if strand == '+':
					startRow = [chrom, start, start + 1]
					endRow = [chrom, end - 1, end]
				else:
					startRow = [chrom, end - 1, end]
					endRow = [chrom, start, start + 1]
				if tuple(startRow) not in alreadyTSS:
					alreadyTSS.add(tuple(startRow))
					startRow.extend([geneIDInfo, 0, strand])
					tssWriter.writerow(startRow)
				if tuple(endRow) not in alreadyTES:
					alreadyTES.add(tuple(endRow))
					endRow.extend([geneIDInfo, 0, strand])
					tesWriter.writerow(endRow)            
				
	uniqTSS = args.output  + '_uniq_tss.bed'
	uniqTES = args.output  + '_uniq_tes.bed'
	getUniqTSSAndTES(uniqTSS, uniqTES, cdsFName)


	# sort everything
	print "Sorting BED files"
	for fn in glob.glob("*.bed"):
		os.system("sort -k1,1 -k2,2n %s -o %s"%(fn, fn))
Example #3
                writeOutput(gene)

                if (not genesRead % 2500):
                    print "Processed %d entries..." % genesRead

    elif (args.gtf):

        # first parse the entire file into a dictionary of lists

        txDict = defaultdict(list)

        print "Building GTF dictionary..."

        # the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict.
        for line in GTF.lines(args.input):
            # only want to read in lines corresponding to these features
            if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]:
                txDict[line["transcript_id"]].append(line)
                genesRead += 1

                if (not genesRead % 25000):
                    print "\tProcessed %d lines..." % genesRead

        print "Dictionary built."

        print "Writing transcript properties."
        genesRead = 0

        # now create a Transcript object for each transcript and output it
def main():
    with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \
      open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \
      open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \
      open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile:

        def writeOutput(gene):
            if (useBlocks):  # output all region primitives on the same line by specifying nBlocks and lists inside the BED output
                if (gene.coding):
                    #blockBedFormat is one line by definition
                    if (gene.utr5Len > 0):
                        utr5File.write(
                            gene.blockBedFormat(region="5utr") + "\n")
                    if (gene.utr5startLen > 0):
                        utr5StartFile.write(
                            gene.blockBedFormat(region="5utr_start") + "\n")
                    if (gene.cdsLen > 0):
                        cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
                    if (gene.utr3Len > 0):
                        utr3File.write(
                            gene.blockBedFormat(region="3utr") + "\n")

                    if (gene.exonsLen > 0):
                        exonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")
                        codingExonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")

                    if (gene.intronsLen > 0):
                        intronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")
                        codingIntronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")

                else:  # noncoding transcripts just have exons and introns
                    if (gene.exonsLen > 0):
                        exonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")
                        noncodingExonFile.write(
                            gene.blockBedFormat(region="exons") + "\n")

                    if (gene.intronsLen > 0):
                        intronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")
                        noncodingIntronFile.write(
                            gene.blockBedFormat(region="introns") + "\n")

            else:  # output one line per region primitive instead of combining regions via blocks
                if (gene.coding):
                    for entry in gene.bedFormat(region="5utr"):
                        utr5File.write(entry + "\n")
                    for entry in gene.bedFormat(region="5utr_start"):
                        utr5StartFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="cds"):
                        cdsFile.write(entry + "\n")
                    for entry in gene.bedFormat(region="3utr"):
                        utr3File.write(entry + "\n")

                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        codingExonFile.write(entry + "\n")

                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        codingIntronFile.write(entry + "\n")

                else:  # noncoding transcripts just have exons and introns
                    for entry in gene.bedFormat(region="exons"):
                        exonFile.write(entry + "\n")
                        noncodingExonFile.write(entry + "\n")

                    for entry in gene.bedFormat(region="introns"):
                        intronFile.write(entry + "\n")
                        noncodingIntronFile.write(entry + "\n")

        if (args.ucsc):
            with open(args.input, "r") as genesFile:
                genesRead = 0

                for line in genesFile:
                    # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method

                    gene = createUCSCTranscript(line)
                    genesRead += 1

                    writeOutput(gene)

                    if (not genesRead % 2500):
                        print "Processed %d entries..." % genesRead

        elif (args.gtf):

            # first parse the entire file into a dictionary of lists

            txDict = defaultdict(list)
            genesRead = 0

            print "Building GTF dictionary..."

            # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict.
            for line in GTF.lines(args.input):

                txDict[line["transcript_id"]].append(line)
                genesRead += 1

                if (not genesRead % 100000):
                    print "Processed %d lines..." % genesRead

            print "Dictionary built."

            # now create a SNFGene object for each transcript and output it
            genesRead = 0
            for key in txDict:

                #print key

                tx = createGTFTranscript(txDict[key])

                #print tx
                writeOutput(tx)
                genesRead += 1

                if (not genesRead % 2500):
                    print "Processed %d entries..." % genesRead

    print "Processed %d entries." % genesRead

    # BTD Edit: making unique regions and linking to gene name
    # --------------------------------------------------------
    # utr5FName = args.output  + "_5utr.bed"
    # utr5StartFName = args.output  + "_5utr_start.bed"
    # cdsFName = args.output  + "_cds.bed"
    # utr3FName = args.output  + "_3utr.bed"
    # exonFName = args.output  + "_exons.bed"
    # intronFName = args.output  + "_introns.bed"
    # codingExonFName = args.output  + "_codingexons.bed"
    # codingIntronFName = args.output  + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA
    # noncodingExonFName = args.output  + "_noncodingexons.bed"
    # noncodingIntronFName = args.output  + "_noncodingintrons.bed"
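    # As in Example #1, the 'textdialect' csv dialect used by the readers and
    # writers below is assumed to be registered elsewhere in the original script.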

    # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1)
    print "Getting gene ID"
    idToName = {}
    if args.ucsc:
        with open(args.input, 'r') as knownGeneFile:
            reader = csv.reader(knownGeneFile, 'textdialect')
            for row in reader:
                idToName[row[0]] = row[-1]

    # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR
    print "Getting unique UTRs"

    def getUniqUTR(uniqFN, utrFN):
        with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr:
            already = set()
            reader = csv.reader(utr, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                if tuple(row[6:]) in already: continue  #repeat
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: row[3] = id + '__' + geneName
                else: row[3] = id
                already.add(tuple(row[6:]))
                writer.writerow(row)

    uniq5UTR = args.output + "_uniq_5utr.bed"
    getUniqUTR(uniq5UTR, utr5FName)

    uniq3UTR = args.output + '_uniq_3utr.bed'
    getUniqUTR(uniq3UTR, utr3FName)

    uniq5SUTR = args.output + '_uniq_5utr_start.bed'
    getUniqUTR(uniq5SUTR, utr5StartFName)

    # 3. Get unique exons + num. Do it 3x for all, coding, and noncoding
    print "Getting unique exons"

    def getUniqExons(uniqFN, exonFN):
        with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons:
            already = set()
            reader = csv.reader(exons, 'textdialect')
            writer = csv.writer(uniq, 'textdialect')
            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate exon starts and lengths
                exonLengths = row[10].split(',')
                if exonLengths[-1] == '': exonLengths = exonLengths[:-1]
                exonLengths = [int(x) for x in exonLengths]
                exonStarts = row[11].split(',')
                if exonStarts[-1] == '': exonStarts = exonStarts[:-1]
                exonStarts = [int(x) for x in exonStarts]

                # calculate exons
                exons = []
                for i in range(len(exonStarts)):
                    absStart = start + exonStarts[i]
                    exons.append([absStart, absStart + exonLengths[i]])
                if strand == '-': exons = exons[::-1]  #flip exon order

                # making BED6
                for i in range(len(exons)):
                    exonNum = i + 1
                    exonNumInfo = str(exonNum) + 'of' + str(len(exons))
                    exon = exons[i]
                    outputRow = [chrom, exon[0], exon[1]]

                    # unique
                    if tuple(outputRow) in already: continue
                    already.add(tuple(outputRow))
                    outputRow.extend(
                        [geneIDInfo + '__exon__' + exonNumInfo, 0, strand])
                    writer.writerow(outputRow)

    uniqExons = args.output + '_uniq_exons.bed'
    getUniqExons(uniqExons, exonFName)

    uniqExons = args.output + '_uniq_codingexons.bed'
    getUniqExons(uniqExons, codingExonFName)

    uniqExons = args.output + '_uniq_noncodingexons.bed'
    getUniqExons(uniqExons, noncodingExonFName)

    # 4. Get unique introns + num. unique 5'SS, 3'SS.
    # 5'SS is first base of intron, 3'SS is last base of intron
    print "Getting unique introns and 5' and 3' SS"

    def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN):
        with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \
         open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns:
            alreadyIntron = set()
            already5 = set()
            already3 = set()

            reader = csv.reader(introns, 'textdialect')
            intronWriter = csv.writer(uniqIntron, 'textdialect')
            fiveWriter = csv.writer(uniq5, 'textdialect')
            threeWriter = csv.writer(uniq3, 'textdialect')

            for row in reader:
                # gene ID info
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                start, end = int(row[1]), int(row[2])
                strand = row[5]

                # calculate intron starts and lengths
                intronLengths = row[10].split(',')
                if intronLengths[-1] == '': intronLengths = intronLengths[:-1]
                intronLengths = [int(x) for x in intronLengths]
                intronStarts = row[11].split(',')
                if intronStarts[-1] == '': intronStarts = intronStarts[:-1]
                intronStarts = [int(x) for x in intronStarts]

                # calculate introns
                introns = []
                for i in range(len(intronStarts)):
                    absStart = start + intronStarts[i]
                    introns.append([absStart, absStart + intronLengths[i]])
                if strand == '-': introns = introns[::-1]  #flip intron order

                # making BED6
                for i in range(len(introns)):
                    intronNum = i + 1
                    intronNumInfo = str(intronNum) + 'of' + str(len(introns))
                    intron = introns[i]
                    outputRow = [chrom, intron[0], intron[1]]

                    # unique introns
                    if tuple(outputRow) in alreadyIntron: continue
                    alreadyIntron.add(tuple(outputRow))
                    outputRow.extend(
                        [geneIDInfo + '__intron__' + intronNumInfo, 0, strand])
                    intronWriter.writerow(outputRow)

                    # unique splice sites
                    if strand == '+':
                        fiveSS = [chrom, intron[0], intron[0] + 1]
                        threeSS = [chrom, intron[1] - 1, intron[1]]
                    else:
                        threeSS = [chrom, intron[0], intron[0] + 1]
                        fiveSS = [chrom, intron[1] - 1, intron[1]]
                    if tuple(fiveSS) not in already5:
                        already5.add(tuple(fiveSS))
                        fiveSS.extend([
                            geneIDInfo + '__5ss__' + intronNumInfo, 0, strand
                        ])
                        fiveWriter.writerow(fiveSS)
                    if tuple(threeSS) not in already3:
                        already3.add(tuple(threeSS))
                        threeSS.extend([
                            geneIDInfo + '__3ss__' + intronNumInfo, 0, strand
                        ])
                        threeWriter.writerow(threeSS)

    uniqIntrons = args.output + '_uniq_introns.bed'
    uniq5 = args.output + '_uniq_5ss.bed'
    uniq3 = args.output + '_uniq_3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName)

    uniqIntrons = args.output + '_uniq_codingintrons.bed'
    uniq5 = args.output + '_uniq_coding5ss.bed'
    uniq3 = args.output + '_uniq_coding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName)

    uniqIntrons = args.output + '_uniq_noncodingintrons.bed'
    uniq5 = args.output + '_uniq_noncoding5ss.bed'
    uniq3 = args.output + '_uniq_noncoding3ss.bed'
    getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName)

    # 5. unique cdsStart, cdsEnd
    print "Getting unique cdsStart and cdsEnd"

    def getUniqCDSStartEnd(startFN, endFN, cdsFN):
        with open(startFN, 'w') as uniqStart, open(endFN, 'w') as uniqEnd, \
                open(cdsFN, 'r') as cds:
            alreadyStart = set()
            alreadyEnd = set()
            reader = csv.reader(cds, 'textdialect')
            startWriter = csv.writer(uniqStart, 'textdialect')
            endWriter = csv.writer(uniqEnd, 'textdialect')
            for row in reader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '+':
                    startRow = [chrom, start, start + 1]
                    endRow = [chrom, end - 1, end]
                else:
                    startRow = [chrom, end - 1, end]
                    endRow = [chrom, start, start + 1]
                if tuple(startRow) not in alreadyStart:
                    alreadyStart.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    startWriter.writerow(startRow)
                if tuple(endRow) not in alreadyEnd:
                    alreadyEnd.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    endWriter.writerow(endRow)

    uniqCDSStart = args.output + '_uniq_cdsStart.bed'
    uniqCDSEnd = args.output + '_uniq_cdsEnd.bed'
    getUniqCDSStartEnd(uniqCDSStart, uniqCDSEnd, cdsFName)

    # 6. unique TSS, TES
    print "Getting unique TSS and TES"

    def getUniqTSSAndTES(tssFN, tesFN, fiveFN, threeFN):
        with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open(
                fiveFN, 'r') as fiveUTR, open(threeFN, 'r') as threeUTR:
            alreadyTSS = set()
            fiveReader = csv.reader(fiveUTR, 'textdialect')
            tssWriter = csv.writer(uniqTSS, 'textdialect')
            for row in fiveReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '+':
                    startRow = [chrom, start, start + 1]
                else:
                    startRow = [chrom, end - 1, end]
                if tuple(startRow) not in alreadyTSS:
                    alreadyTSS.add(tuple(startRow))
                    startRow.extend([geneIDInfo, 0, strand])
                    tssWriter.writerow(startRow)

            alreadyTES = set()
            threeReader = csv.reader(threeUTR, 'textdialect')
            tesWriter = csv.writer(uniqTES, 'textdialect')
            for row in threeReader:
                geneIDInfo = row[3]
                id = geneIDInfo.split('__')[0]
                try:
                    geneName = idToName[id]
                except:
                    geneName = id
                if geneName != id: geneIDInfo = id + '__' + geneName
                else: geneIDInfo = id

                # chrom, start, stop, strand
                chrom = row[0]
                strand = row[5]
                start, end = int(row[1]), int(row[2])

                if strand == '-':
                    endRow = [chrom, start, start + 1]
                else:
                    endRow = [chrom, end - 1, end]
                if tuple(endRow) not in alreadyTES:
                    alreadyTES.add(tuple(endRow))
                    endRow.extend([geneIDInfo, 0, strand])
                    tesWriter.writerow(endRow)

    uniqTSS = args.output + '_uniq_tss.bed'
    uniqTES = args.output + '_uniq_tes.bed'

    getUniqTSSAndTES(uniqTSS, uniqTES, utr5FName, utr3FName)

    # sort everything
    print "Sorting BED files"
    for fn in glob.glob("*.bed"):
        os.system("sort -k1,1 -k2,2n %s -o %s" % (fn, fn))
Example #6
def main(argv):

    # Get the input file path and the other command-line arguments
    input_file = argv[0]
    temp_folder = argv[1]
    username = argv[2]
    experiment = argv[3]
    species = argv[4]

    config = json.load(open('../configuration.json'))

    temp_token = username + '_' + str(uuid.uuid4())

    # Create the CSV files that store node information
    chromosome_csv = open(temp_folder + temp_token + '_chromosome.csv', 'w')
    gene_csv = open(temp_folder + temp_token + '_gene.csv', 'w')
    transcript_csv = open(temp_folder + temp_token + '_transcript.csv', 'w')
    exon_csv = open(temp_folder + temp_token + '_exon.csv', 'w')

    # Create the CSV files that store relationship information
    contains_csv = open(temp_folder + temp_token + '_contains.csv', 'w')
    in_chromosome_csv = open(temp_folder + temp_token + '_in_chromosome.csv',
                             'w')
    has_transcript_csv = open(temp_folder + temp_token + '_has_transcript.csv',
                              'w')
    has_exon_csv = open(temp_folder + temp_token + '_has_exon.csv', 'w')

    # Initialize the writers for all files

    # ---- nodes
    chromosomeWriter = csv.writer(chromosome_csv, delimiter=',')
    geneWriter = csv.writer(gene_csv, delimiter=',')
    transcriptWriter = csv.writer(transcript_csv, delimiter=',')
    exonWriter = csv.writer(exon_csv, delimiter=',')

    # ---- relationships
    containsWriter = csv.writer(contains_csv, delimiter=',')
    inChromosomeWriter = csv.writer(in_chromosome_csv, delimiter=',')
    hasTranscriptWriter = csv.writer(has_transcript_csv, delimiter=',')
    hasExonWriter = csv.writer(has_exon_csv, delimiter=',')

    # Build the file headers

    # ---- nodes
    chromosome_header = ["chromosome"]
    gene_header = ["gene_id"]
    transcript_header = [
        "transcript_id", "reference_id", "cov", "FPKM", "TPM", "start", "end"
    ]
    exon_header = ["exon_id", "exon_number", "start", "end", "cov"]

    # ---- relationships
    contains_header = ["name", "gene_id"]
    in_chromosome_header = ["gene_id", "chromosome"]
    has_transcript_header = ["gene_id", "strand", "transcript_id"]
    has_exon_header = ["transcript_id", "exon_id"]

    # Write the headers to the corresponding files

    # ---- nodes
    chromosomeWriter.writerow(chromosome_header)
    geneWriter.writerow(gene_header)
    transcriptWriter.writerow(transcript_header)
    exonWriter.writerow(exon_header)

    # ---- relationships
    containsWriter.writerow(contains_header)
    inChromosomeWriter.writerow(in_chromosome_header)
    hasTranscriptWriter.writerow(has_transcript_header)
    hasExonWriter.writerow(has_exon_header)

    # Initialize the data structures needed for parsing (to speed up loading the data into the database)

    # ---- nodes
    chromosomes = set()
    genes_dict = {}
    transcripts_dict = {}

    # ---- relationships
    contains_dict = {}
    in_chromosome_dict = {}
    has_transcript_dict = {}

    print 'Starting parsing procedure for file ' + input_file
    properties = {
        "name": os.path.basename(input_file),
        "extension": os.path.splitext(input_file)[1]
    }

    # Connect to Neo4j
    driver = GraphDatabase.driver("bolt://" + config["neo4j"]["address"],
                                  auth=basic_auth(config["neo4j"]["username"],
                                                  config["neo4j"]["password"]))

    # Initialize the indexes
    session = driver.session()

    statements = [
        "CREATE INDEX ON :File(name);", "CREATE INDEX ON :Species(species);",
        "CREATE INDEX ON :Gene(gene_id);",
        "CREATE INDEX ON :Chromosome(chromosome);",
        "CREATE INDEX ON :Transcript(transcript_id);",
        "CREATE INDEX ON :Exon(exon_id);"
    ]

    for statement in statements:
        session.run(statement)

    session.close()

    print 'Parsing file...'

    # initialize a counter so that very large files can be loaded into the database in partial batches
    row_count = 0

    for line in GTF.lines(input_file):
        row_count += 1

        # record the chromosome
        chromosomes.add(line["seqname"])

        # record the gene (if not already present)
        if not genes_dict.has_key(line["gene_id"]):
            genes_dict[line["gene_id"]] = [
                line[attr] if line.has_key(attr) else "None"
                for attr in gene_header
            ]

        # record the (file)-[contains]->(gene) relationship (if it does not already exist)
        if not contains_dict.has_key(properties["name"] + ':' +
                                     line["gene_id"]):
            contains_dict[properties["name"] + ':' + line["gene_id"]] = [
                properties["name"], line["gene_id"]
            ]

        # record the (gene)-[in_chromosome]->(chromosome) relationship (if it does not already exist)
        if not in_chromosome_dict.has_key(line["gene_id"] + ':' +
                                          line["seqname"]):
            in_chromosome_dict[line["gene_id"] + ':' + line["seqname"]] = [
                line["gene_id"], line["seqname"]
            ]

        # depending on the feature type (transcript, exon), store the information from the line accordingly
        if line['feature'] == 'transcript':

            # record the transcript (if not already present)
            if not transcripts_dict.has_key(line["transcript_id"]):
                transcripts_dict[line["transcript_id"]] = [
                    line[attr] if line.has_key(attr) else "None"
                    for attr in transcript_header
                ]

            # record the (gene)-[has_transcript]->(transcript) relationship (if it does not already exist)
            if not has_transcript_dict.has_key(line["gene_id"] + ':' +
                                               line["transcript_id"]):
                has_transcript_dict[line["gene_id"] + ':' +
                                    line["transcript_id"]] = [
                                        line[attr]
                                        for attr in has_transcript_header
                                    ]

        elif line['feature'] == 'exon':
            # define an ID for the exon (needed to populate the database)
            exon_id = line["exon_number"] + ':' + line["transcript_id"]

            # write the exon to the CSV file
            exonWriter.writerow([exon_id] + [
                line[attr] if line.has_key(attr) else "None"
                for attr in exon_header[1:]
            ])

            # write the (transcript)-[has_exon]->(exon) relationship to the CSV file
            hasExonWriter.writerow([line["transcript_id"], exon_id])

        if not (row_count % 15000):
            print str(row_count) + " scanned"

    # write the previously built dicts out to the CSV files
    for chrom in list(chromosomes):
        chromosomeWriter.writerow([chrom])

    for gene in genes_dict.keys():
        geneWriter.writerow(genes_dict[gene])

    for transcript in transcripts_dict.keys():
        transcriptWriter.writerow(transcripts_dict[transcript])

    for entry in contains_dict.keys():
        containsWriter.writerow(contains_dict[entry])

    for entry in in_chromosome_dict.keys():
        inChromosomeWriter.writerow(in_chromosome_dict[entry])

    for entry in has_transcript_dict.keys():
        hasTranscriptWriter.writerow(has_transcript_dict[entry])

    # finish writing the CSV files

    # ---- nodes
    chromosome_csv.close()
    gene_csv.close()
    transcript_csv.close()
    exon_csv.close()

    # ---- relationships
    contains_csv.close()
    in_chromosome_csv.close()
    has_transcript_csv.close()
    has_exon_csv.close()

    print 'Populating Database...'
    session = driver.session()

    prova = [
        "MERGE (u:User { username:{username} })",
        "MERGE (e:Experiment { name:{experiment} })",
        "MERGE (s:Species {species: {species} })",
        "MERGE (f:File { name:{properties}.name }) ON CREATE SET f += {properties}",
        "MERGE (u)-[:Created]->(e)", "MERGE (e)-[:For_Species]->(s)",
        "MERGE (e)-[:Composed_By]->(f)"
    ]

    # Associate the file with the user
    session.run(
        " ".join(prova), {
            "username": username,
            "experiment": experiment,
            "species": species,
            "properties": properties
        })

    session.close()

    populateDB(driver, temp_folder + temp_token)

    print 'Done.'
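
# The snippet above assumes the usual imports (sys, os, csv, json, uuid, the GTF
# parser module, and the neo4j driver's GraphDatabase/basic_auth), plus a
# populateDB() helper defined elsewhere in the project. A minimal entry point for
# running it as a script might look like this (an assumption, not part of the
# original example):
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])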