def count_lowlevel_in_hightlevel(filename, low_level_name, high_level_name):
    """Count how many low-level sub-features fall inside each high-level feature.

    Walks the GTF file in order and, each time a new *high_level_name* record
    starts, writes the number of *low_level_name* records seen since the
    previous one.

    :param filename: GTF file to be processed.
    :param low_level_name: Feature name to count, e.g. "exon", "transcript".
    :param high_level_name: Enclosing feature name, e.g. "transcript", "gene".
    :return: None; one count per line is written to a file named
        "<low_level_name> number in each <high_level_name>".
    """
    occurrence = 0
    with open('{} number in each {}'.format(low_level_name, high_level_name), 'w') as f:
        # lookahead() pairs each item with a flag that is falsy on the last item.
        for idx, (line, has_more) in enumerate(lookahead(GTF.lines(filename))):
            feature = line['feature']
            if feature == high_level_name:
                # A new high-level record starts: flush the previous group's
                # count.  Skip the first records so no spurious 0 is emitted
                # before any group has been seen.
                if idx != 0 and idx != 1:
                    f.write(str(occurrence) + '\n')
                    occurrence = 0
            elif feature == low_level_name:
                occurrence += 1
            # BUG FIX: the original flushed the final group only when the last
            # GTF line was neither a high- nor a low-level feature (the flush
            # lived behind an `elif ... == False`), so a file ending in e.g. an
            # exon line silently dropped the last count.
            if not has_more and feature != high_level_name:
                f.write(str(occurrence) + '\n')
def main(): with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \ open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \ open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \ open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile: def writeOutput(gene): if (useBlocks): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output if(gene.coding): #blockBedFormat is one line by definition if (gene.utr5Len > 0): utr5File.write(gene.blockBedFormat(region="5utr") + "\n") if (gene.utr5startLen > 0): utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n") if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n") if (gene.utr3Len > 0): utr3File.write(gene.blockBedFormat(region="3utr") + "\n") if (gene.exonsLen > 0): exonFile.write(gene.blockBedFormat(region="exons") + "\n") codingExonFile.write(gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write(gene.blockBedFormat(region="introns") + "\n") codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n") else: # noncoding transcripts just have exons and introns if (gene.exonsLen > 0): exonFile.write(gene.blockBedFormat(region="exons") + "\n") noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write(gene.blockBedFormat(region="introns") + "\n") noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n") else: # output one line per region primitive instead of combining regions via blocks if(gene.coding): for entry in gene.bedFormat(region="5utr"): utr5File.write(entry + "\n") for entry in gene.bedFormat(region="5utr_start"): utr5StartFile.write(entry + "\n") for entry in gene.bedFormat(region="cds"): cdsFile.write(entry + "\n") for entry in 
gene.bedFormat(region="3utr"): utr3File.write(entry + "\n") for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") codingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") codingIntronFile.write(entry + "\n") else: # noncoding transcripts just have exons and introns for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") noncodingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") noncodingIntronFile.write(entry + "\n") if (args.ucsc): with open(args.input, "r") as genesFile: genesRead = 0 for line in genesFile: # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method gene = createUCSCTranscript(line) genesRead += 1 writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 100000): print "Processed %d lines..." % genesRead print "Dictionary built." # now create a SNFGene object for each transcript and output it genesRead = 0 for key in txDict: #print key tx = createGTFTranscript(txDict[key]) #print tx writeOutput(tx) genesRead += 1 if (not genesRead % 2500): print "Processed %d entries..." % genesRead print "Processed %d entries." 
% genesRead # BTD Edit: making unique regions and linking to gene name # -------------------------------------------------------- # utr5FName = args.output + "_5utr.bed" # utr5StartFName = args.output + "_5utr_start.bed" # cdsFName = args.output + "_cds.bed" # utr3FName = args.output + "_3utr.bed" # exonFName = args.output + "_exons.bed" # intronFName = args.output + "_introns.bed" # codingExonFName = args.output + "_codingexons.bed" # codingIntronFName = args.output + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA # noncodingExonFName = args.output + "_noncodingexons.bed" # noncodingIntronFName = args.output + "_noncodingintrons.bed" # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1) print "Getting gene ID" idToName = {} if args.ucsc: with open(args.input, 'r') as knownGeneFile: reader = csv.reader(knownGeneFile, 'textdialect') for row in reader: idToName[row[0]] = row[-1] # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR print "Getting unique UTRs" def getUniqUTR(uniqFN, utrFN): with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr: already = set() reader = csv.reader(utr, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: if tuple(row[6:]) in already: continue #repeat geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: row[3] = id + '__' + geneName else: row[3] = id already.add(tuple(row[6:])) writer.writerow(row) uniq5UTR = args.output + "_uniq_5utr.bed" getUniqUTR(uniq5UTR, utr5FName) uniq3UTR = args.output + '_uniq_3utr.bed' getUniqUTR(uniq3UTR, utr3FName) uniq5SUTR = args.output + '_uniq_5utr_start.bed' getUniqUTR(uniq5SUTR, utr5StartFName) # 3. Get unique exons + num. 
Do it 3x for all, coding, and noncoding print "Getting unique exons" def getUniqExons(uniqFN, exonFN): with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons: already = set() reader = csv.reader(exons, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate exon starts and lengths exonLengths = row[10].split(',') if exonLengths[-1] == '': exonLengths = exonLengths[:-1] exonLengths = [int(x) for x in exonLengths] exonStarts = row[11].split(',') if exonStarts[-1] == '': exonStarts = exonStarts[:-1] exonStarts = [int(x) for x in exonStarts] # calculate exons exons = [] for i in range(len(exonStarts)): absStart = start + exonStarts[i] exons.append([absStart, absStart + exonLengths[i]]) if strand == '-': exons = exons[::-1] #flip exon order # making BED6 for i in range(len(exons)): exonNum = i + 1 exonNumInfo = str(exonNum) + 'of' + str(len(exons)) exon = exons[i] outputRow = [chrom, exon[0], exon[1]] # unique if tuple(outputRow) in already: continue already.add(tuple(outputRow)) outputRow.extend([geneIDInfo + '__exon__' + exonNumInfo, 0, strand]) writer.writerow(outputRow) uniqExons = args.output + '_uniq_exons.bed' getUniqExons(uniqExons, exonFName) uniqExons = args.output + '_uniq_codingexons.bed' getUniqExons(uniqExons, codingExonFName) uniqExons = args.output + '_uniq_noncodingexons.bed' getUniqExons(uniqExons, noncodingExonFName) # 4. Get unique introns + num. unique 5'SS, 3'SS. 
# 5'SS is first base of intron, 3'SS is last base of intron print "Getting unique introns and 5' and 3' SS" def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN): with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \ open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns: alreadyIntron = set() already5 = set() already3 = set() reader = csv.reader(introns, 'textdialect') intronWriter = csv.writer(uniqIntron, 'textdialect') fiveWriter = csv.writer(uniq5, 'textdialect') threeWriter = csv.writer(uniq3, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate intron starts and lengths intronLengths = row[10].split(',') if intronLengths[-1] == '': intronLengths = intronLengths[:-1] intronLengths = [int(x) for x in intronLengths] intronStarts = row[11].split(',') if intronStarts[-1] == '': intronStarts = intronStarts[:-1] intronStarts = [int(x) for x in intronStarts] # calculate introns introns = [] for i in range(len(intronStarts)): absStart = start + intronStarts[i] introns.append([absStart, absStart + intronLengths[i]]) if strand == '-': introns = introns[::-1] #flip intron order # making BED6 for i in range(len(introns)): intronNum = i + 1 intronNumInfo = str(intronNum) + 'of' + str(len(introns)) intron = introns[i] outputRow = [chrom, intron[0], intron[1]] # unique introns if tuple(outputRow) in alreadyIntron: continue alreadyIntron.add(tuple(outputRow)) outputRow.extend([geneIDInfo+ '__intron__' + intronNumInfo, 0, strand]) intronWriter.writerow(outputRow) # unique splice sites if strand == '+': fiveSS = [chrom, intron[0], intron[0] + 1] threeSS = [chrom, intron[1] - 1, intron[1]] else: threeSS = [chrom, intron[0], intron[0] + 1] fiveSS = [chrom, 
intron[1] - 1, intron[1]] if tuple(fiveSS) not in already5: already5.add(tuple(fiveSS)) fiveSS.extend([geneIDInfo + '__5ss__' + intronNumInfo, 0, strand]) fiveWriter.writerow(fiveSS) if tuple(threeSS) not in already3: already3.add(tuple(threeSS)) threeSS.extend([geneIDInfo+ '__3ss__' + intronNumInfo, 0, strand]) threeWriter.writerow(threeSS) uniqIntrons = args.output + '_uniq_introns.bed' uniq5 = args.output + '_uniq_5ss.bed' uniq3 = args.output + '_uniq_3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName) uniqIntrons = args.output + '_uniq_codingintrons.bed' uniq5 = args.output + '_uniq_coding5ss.bed' uniq3 = args.output + '_uniq_coding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName) uniqIntrons = args.output + '_uniq_noncodingintrons.bed' uniq5 = args.output + '_uniq_noncoding5ss.bed' uniq3 = args.output + '_uniq_noncoding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName) # 5. unique TSS/TES print "Getting unique TSS and TES" def getUniqTSSAndTES(tssFN, tesFN, cdsFN): with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open(cdsFN, 'r') as cds: alreadyTSS = set() alreadyTES = set() reader = csv.reader(cds, 'textdialect') tssWriter = csv.writer(uniqTSS, 'textdialect') tesWriter = csv.writer(uniqTES, 'textdialect') for row in reader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] endRow = [chrom, end - 1, end] else: startRow = [chrom, end - 1, end] endRow = [chrom, start, start + 1] if tuple(startRow) not in alreadyTSS: alreadyTSS.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) tssWriter.writerow(startRow) if tuple(endRow) not in alreadyTSS: alreadyTES.add(tuple(endRow)) 
endRow.extend([geneIDInfo, 0, strand]) tesWriter.writerow(endRow) uniqTSS = args.output + '_uniq_tss.bed' uniqTES = args.output + '_uniq_tes.bed' getUniqTSSAndTES(uniqTSS, uniqTES, cdsFName) # sort everything print "Sorting BED files" for fn in glob.glob("*.bed"): os.system("sort -k1,1 -k2,2n %s -o %s"%(fn, fn))
writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): # only want to read in lines corresponding to these features if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]: txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 25000): print "\tProcessed %d lines..." % genesRead print "Dictionary built." print "Writing transcript properties." genesRead = 0 # now create a Transcript object for each transcript and output it
def main(): with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \ open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \ open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \ open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile: def writeOutput(gene): if ( useBlocks ): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output if (gene.coding): #blockBedFormat is one line by definition if (gene.utr5Len > 0): utr5File.write( gene.blockBedFormat(region="5utr") + "\n") if (gene.utr5startLen > 0): utr5StartFile.write( gene.blockBedFormat(region="5utr_start") + "\n") if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n") if (gene.utr3Len > 0): utr3File.write( gene.blockBedFormat(region="3utr") + "\n") if (gene.exonsLen > 0): exonFile.write( gene.blockBedFormat(region="exons") + "\n") codingExonFile.write( gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write( gene.blockBedFormat(region="introns") + "\n") codingIntronFile.write( gene.blockBedFormat(region="introns") + "\n") else: # noncoding transcripts just have exons and introns if (gene.exonsLen > 0): exonFile.write( gene.blockBedFormat(region="exons") + "\n") noncodingExonFile.write( gene.blockBedFormat(region="exons") + "\n") if (gene.intronsLen > 0): intronFile.write( gene.blockBedFormat(region="introns") + "\n") noncodingIntronFile.write( gene.blockBedFormat(region="introns") + "\n") else: # output one line per region primitive instead of combining regions via blocks if (gene.coding): for entry in gene.bedFormat(region="5utr"): utr5File.write(entry + "\n") for entry in gene.bedFormat(region="5utr_start"): utr5StartFile.write(entry + "\n") for entry in gene.bedFormat(region="cds"): cdsFile.write(entry + 
"\n") for entry in gene.bedFormat(region="3utr"): utr3File.write(entry + "\n") for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") codingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") codingIntronFile.write(entry + "\n") else: # noncoding transcripts just have exons and introns for entry in gene.bedFormat(region="exons"): exonFile.write(entry + "\n") noncodingExonFile.write(entry + "\n") for entry in gene.bedFormat(region="introns"): intronFile.write(entry + "\n") noncodingIntronFile.write(entry + "\n") if (args.ucsc): with open(args.input, "r") as genesFile: genesRead = 0 for line in genesFile: # all of the knowngenes parsing and metadata construction is done inside UCSCKnownGene.py, especially the createGene method gene = createUCSCTranscript(line) genesRead += 1 writeOutput(gene) if (not genesRead % 2500): print "Processed %d entries..." % genesRead elif (args.gtf): # first parse the entire file into a dictionary of lists txDict = defaultdict(list) print "Building GTF dictionary..." # the issue here is that lines for various transcripts may be interleaved, so can either create lots of SNFGene objects, or a giant dict. opted for giant dict. for line in GTF.lines(args.input): txDict[line["transcript_id"]].append(line) genesRead += 1 if (not genesRead % 100000): print "Processed %d lines..." % genesRead print "Dictionary built." # now create a SNFGene object for each transcript and output it genesRead = 0 for key in txDict: #print key tx = createGTFTranscript(txDict[key]) #print tx writeOutput(tx) genesRead += 1 if (not genesRead % 2500): print "Processed %d entries..." % genesRead print "Processed %d entries." 
% genesRead # BTD Edit: making unique regions and linking to gene name # -------------------------------------------------------- # utr5FName = args.output + "_5utr.bed" # utr5StartFName = args.output + "_5utr_start.bed" # cdsFName = args.output + "_cds.bed" # utr3FName = args.output + "_3utr.bed" # exonFName = args.output + "_exons.bed" # intronFName = args.output + "_introns.bed" # codingExonFName = args.output + "_codingexons.bed" # codingIntronFName = args.output + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA # noncodingExonFName = args.output + "_noncodingexons.bed" # noncodingIntronFName = args.output + "_noncodingintrons.bed" # 1. Get gene ID (NM_123, ENSG123) --> gene name (Abcd1) print "Getting gene ID" idToName = {} if args.ucsc: with open(args.input, 'r') as knownGeneFile: reader = csv.reader(knownGeneFile, 'textdialect') for row in reader: idToName[row[0]] = row[-1] # 2. Get unique 5'UTR, 5'Start UTR, and 3'UTR print "Getting unique UTRs" def getUniqUTR(uniqFN, utrFN): with open(uniqFN, 'w') as uniq, open(utrFN, 'r') as utr: already = set() reader = csv.reader(utr, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: if tuple(row[6:]) in already: continue #repeat geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: row[3] = id + '__' + geneName else: row[3] = id already.add(tuple(row[6:])) writer.writerow(row) uniq5UTR = args.output + "_uniq_5utr.bed" getUniqUTR(uniq5UTR, utr5FName) uniq3UTR = args.output + '_uniq_3utr.bed' getUniqUTR(uniq3UTR, utr3FName) uniq5SUTR = args.output + '_uniq_5utr_start.bed' getUniqUTR(uniq5SUTR, utr5StartFName) # 3. Get unique exons + num. 
Do it 3x for all, coding, and noncoding print "Getting unique exons" def getUniqExons(uniqFN, exonFN): with open(uniqFN, 'w') as uniq, open(exonFN, 'r') as exons: already = set() reader = csv.reader(exons, 'textdialect') writer = csv.writer(uniq, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate exon starts and lengths exonLengths = row[10].split(',') if exonLengths[-1] == '': exonLengths = exonLengths[:-1] exonLengths = [int(x) for x in exonLengths] exonStarts = row[11].split(',') if exonStarts[-1] == '': exonStarts = exonStarts[:-1] exonStarts = [int(x) for x in exonStarts] # calculate exons exons = [] for i in range(len(exonStarts)): absStart = start + exonStarts[i] exons.append([absStart, absStart + exonLengths[i]]) if strand == '-': exons = exons[::-1] #flip exon order # making BED6 for i in range(len(exons)): exonNum = i + 1 exonNumInfo = str(exonNum) + 'of' + str(len(exons)) exon = exons[i] outputRow = [chrom, exon[0], exon[1]] # unique if tuple(outputRow) in already: continue already.add(tuple(outputRow)) outputRow.extend( [geneIDInfo + '__exon__' + exonNumInfo, 0, strand]) writer.writerow(outputRow) uniqExons = args.output + '_uniq_exons.bed' getUniqExons(uniqExons, exonFName) uniqExons = args.output + '_uniq_codingexons.bed' getUniqExons(uniqExons, codingExonFName) uniqExons = args.output + '_uniq_noncodingexons.bed' getUniqExons(uniqExons, noncodingExonFName) # 4. Get unique introns + num. unique 5'SS, 3'SS. 
# 5'SS is first base of intron, 3'SS is last base of intron print "Getting unique introns and 5' and 3' SS" def getUniqIntronsAndSS(uniqIntronFN, uniq5SSFN, uniq3SSFN, intronFN): with open(uniqIntronFN, 'w') as uniqIntron, open(uniq5SSFN, 'w') as uniq5, \ open(uniq3SSFN, 'w') as uniq3, open(intronFN, 'r') as introns: alreadyIntron = set() already5 = set() already3 = set() reader = csv.reader(introns, 'textdialect') intronWriter = csv.writer(uniqIntron, 'textdialect') fiveWriter = csv.writer(uniq5, 'textdialect') threeWriter = csv.writer(uniq3, 'textdialect') for row in reader: # gene ID info geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] start, end = int(row[1]), int(row[2]) strand = row[5] # calculate intron starts and lengths intronLengths = row[10].split(',') if intronLengths[-1] == '': intronLengths = intronLengths[:-1] intronLengths = [int(x) for x in intronLengths] intronStarts = row[11].split(',') if intronStarts[-1] == '': intronStarts = intronStarts[:-1] intronStarts = [int(x) for x in intronStarts] # calculate introns introns = [] for i in range(len(intronStarts)): absStart = start + intronStarts[i] introns.append([absStart, absStart + intronLengths[i]]) if strand == '-': introns = introns[::-1] #flip intron order # making BED6 for i in range(len(introns)): intronNum = i + 1 intronNumInfo = str(intronNum) + 'of' + str(len(introns)) intron = introns[i] outputRow = [chrom, intron[0], intron[1]] # unique introns if tuple(outputRow) in alreadyIntron: continue alreadyIntron.add(tuple(outputRow)) outputRow.extend( [geneIDInfo + '__intron__' + intronNumInfo, 0, strand]) intronWriter.writerow(outputRow) # unique splice sites if strand == '+': fiveSS = [chrom, intron[0], intron[0] + 1] threeSS = [chrom, intron[1] - 1, intron[1]] else: threeSS = [chrom, intron[0], intron[0] + 1] fiveSS = 
[chrom, intron[1] - 1, intron[1]] if tuple(fiveSS) not in already5: already5.add(tuple(fiveSS)) fiveSS.extend([ geneIDInfo + '__5ss__' + intronNumInfo, 0, strand ]) fiveWriter.writerow(fiveSS) if tuple(threeSS) not in already3: already3.add(tuple(threeSS)) threeSS.extend([ geneIDInfo + '__3ss__' + intronNumInfo, 0, strand ]) threeWriter.writerow(threeSS) uniqIntrons = args.output + '_uniq_introns.bed' uniq5 = args.output + '_uniq_5ss.bed' uniq3 = args.output + '_uniq_3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, intronFName) uniqIntrons = args.output + '_uniq_codingintrons.bed' uniq5 = args.output + '_uniq_coding5ss.bed' uniq3 = args.output + '_uniq_coding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, codingIntronFName) uniqIntrons = args.output + '_uniq_noncodingintrons.bed' uniq5 = args.output + '_uniq_noncoding5ss.bed' uniq3 = args.output + '_uniq_noncoding3ss.bed' getUniqIntronsAndSS(uniqIntrons, uniq5, uniq3, noncodingIntronFName) # 5. unique cdsStart, cdsEnd print "Getting unique cdsStart and cdsEnd" def getUniqCDSStartEnd(startFN, endFN, cdsFN): with open(startFN, 'w') as uniqStart, open(endFN, 'w') as uniqEnd, open(cdsFN, 'r') as cds: alreadyStart = set() alreadyEnd = set() reader = csv.reader(cds, 'textdialect') startWriter = csv.writer(uniqStart, 'textdialect') endWriter = csv.writer(uniqEnd, 'textdialect') for row in reader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] endRow = [chrom, end - 1, end] else: startRow = [chrom, end - 1, end] endRow = [chrom, start, start + 1] if tuple(startRow) not in alreadyStart: alreadyStart.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) startWriter.writerow(startRow) if tuple(endRow) not in alreadyEnd: 
alreadyEnd.add(tuple(endRow)) endRow.extend([geneIDInfo, 0, strand]) endWriter.writerow(endRow) uniqCDSStart = args.output + '_uniq_cdsStart.bed' uniqCDSEnd = args.output + '_uniq_cdsEnd.bed' getUniqCDSStartEnd(uniqCDSStart, uniqCDSEnd, cdsFName) # 6. unique TSS, TES print "Getting unique TSS and TES" def getUniqTSSAndTES(tssFN, tesFN, fiveFN, threeFN): with open(tssFN, 'w') as uniqTSS, open(tesFN, 'w') as uniqTES, open( fiveFN, 'r') as fiveUTR, open(threeFN, 'r') as threeUTR: alreadyTSS = set() fiveReader = csv.reader(fiveUTR, 'textdialect') tssWriter = csv.writer(uniqTSS, 'textdialect') for row in fiveReader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '+': startRow = [chrom, start, start + 1] else: startRow = [chrom, end - 1, end] if tuple(startRow) not in alreadyTSS: alreadyTSS.add(tuple(startRow)) startRow.extend([geneIDInfo, 0, strand]) tssWriter.writerow(startRow) alreadyTES = set() threeReader = csv.reader(threeUTR, 'textdialect') tesWriter = csv.writer(uniqTES, 'textdialect') for row in threeReader: geneIDInfo = row[3] id = geneIDInfo.split('__')[0] try: geneName = idToName[id] except: geneName = id if geneName != id: geneIDInfo = id + '__' + geneName else: geneIDInfo = id # chrom, start, stop, strand chrom = row[0] strand = row[5] start, end = int(row[1]), int(row[2]) if strand == '-': endRow = [chrom, start, start + 1] else: endRow = [chrom, end - 1, end] if tuple(endRow) not in alreadyTES: alreadyTES.add(tuple(endRow)) endRow.extend([geneIDInfo, 0, strand]) tesWriter.writerow(endRow) uniqTSS = args.output + '_uniq_tss.bed' uniqTES = args.output + '_uniq_tes.bed' getUniqTSSAndTES(uniqTSS, uniqTES, utr5FName, utr3FName) # sort everything print "Sorting BED files" for fn in glob.glob("*.bed"): os.system("sort -k1,1 
-k2,2n %s -o %s" % (fn, fn))
def main(argv): # Ottengo la stringa relativa al file da processare input_file = argv[0] temp_folder = argv[1] username = argv[2] experiment = argv[3] species = argv[4] config = json.load(open('../configuration.json')) temp_token = username + '_' + str(uuid.uuid4()) # Creo i csv per memorizzare le informazioni sui nodi chromosome_csv = open(temp_folder + temp_token + '_chromosome.csv', 'w') gene_csv = open(temp_folder + temp_token + '_gene.csv', 'w') transcript_csv = open(temp_folder + temp_token + '_transcript.csv', 'w') exon_csv = open(temp_folder + temp_token + '_exon.csv', 'w') # Creo i csv per memorizzare le informazioni sulle relazioni contains_csv = open(temp_folder + temp_token + '_contains.csv', 'w') in_chromosome_csv = open(temp_folder + temp_token + '_in_chromosome.csv', 'w') has_transcript_csv = open(temp_folder + temp_token + '_has_transcript.csv', 'w') has_exon_csv = open(temp_folder + temp_token + '_has_exon.csv', 'w') # Inizializzo i writer per tutti i file # ---- nodi chromosomeWriter = csv.writer(chromosome_csv, delimiter=',') geneWriter = csv.writer(gene_csv, delimiter=',') transcriptWriter = csv.writer(transcript_csv, delimiter=',') exonWriter = csv.writer(exon_csv, delimiter=',') # ---- relazioni containsWriter = csv.writer(contains_csv, delimiter=',') inChromosomeWriter = csv.writer(in_chromosome_csv, delimiter=',') hasTranscriptWriter = csv.writer(has_transcript_csv, delimiter=',') hasExonWriter = csv.writer(has_exon_csv, delimiter=',') # Cotruisco gli header dei file # ---- nodi chromosome_header = ["chromosome"] gene_header = ["gene_id"] transcript_header = [ "transcript_id", "reference_id", "cov", "FPKM", "TPM", "start", "end" ] exon_header = ["exon_id", "exon_number", "start", "end", "cov"] # ---- relazioni contains_header = ["name", "gene_id"] in_chromosome_header = ["gene_id", "chromosome"] has_transcript_header = ["gene_id", "strand", "transcript_id"] has_exon_header = ["transcript_id", "exon_id"] # Scrivo gli header nei rispettivi 
file # ---- nodi chromosomeWriter.writerow(chromosome_header) geneWriter.writerow(gene_header) transcriptWriter.writerow(transcript_header) exonWriter.writerow(exon_header) # ---- relazioni containsWriter.writerow(contains_header) inChromosomeWriter.writerow(in_chromosome_header) hasTranscriptWriter.writerow(has_transcript_header) hasExonWriter.writerow(has_exon_header) # Inizializzo le strutture dati necessarie al parsing (per ottimizzare il caricamento dei dati su database) # ---- nodi chromosomes = set() genes_dict = {} transcripts_dict = {} # ---- relazioni contains_dict = {} in_chromosome_dict = {} has_transcript_dict = {} print 'Starting parsing procedure for file ' + input_file properties = { "name": os.path.basename(input_file), "extension": os.path.splitext(input_file)[1] } # Connessione a Neo4j driver = GraphDatabase.driver("bolt://" + config["neo4j"]["address"], auth=basic_auth(config["neo4j"]["username"], config["neo4j"]["password"])) # Inizializzazione degli indici session = driver.session() statements = [ "CREATE INDEX ON :File(name);", "CREATE INDEX ON :Species(species);", "CREATE INDEX ON :Gene(gene_id);", "CREATE INDEX ON :Chromosome(chromosome);", "CREATE INDEX ON :Transcript(transcript_id);", "CREATE INDEX ON :Exon(exon_id);" ] for statement in statements: session.run(statement) session.close() print 'Parsing file...' 
# inizializzo un contatore per fare un load parziale del file su database per file troppo grandi row_count = 0 for line in GTF.lines(input_file): row_count += 1 # memorizzo il cromosoma chromosomes.add(line["seqname"]) # memorizzo il gene (se non presente) if not genes_dict.has_key(line["gene_id"]): genes_dict[line["gene_id"]] = [ line[attr] if line.has_key(attr) else "None" for attr in gene_header ] # memorizzo la relazione (file)-[contiene]->(gene) (se non esiste) if not contains_dict.has_key(properties["name"] + ':' + line["gene_id"]): contains_dict[properties["name"] + ':' + line["gene_id"]] = [ properties["name"], line["gene_id"] ] # memorizzo la relazione (gene)-[contenuto in]->(cromosoma) (se non esiste) if not in_chromosome_dict.has_key(line["gene_id"] + ':' + line["seqname"]): in_chromosome_dict[line["gene_id"] + ':' + line["seqname"]] = [ line["gene_id"], line["seqname"] ] # a seconda della feature considerata (transcript, exon) memorizzo opportunamente le informazioni della riga if line['feature'] == 'transcript': # memorizzo il trascritto (se non presente) if not transcripts_dict.has_key(line["transcript_id"]): transcripts_dict[line["transcript_id"]] = [ line[attr] if line.has_key(attr) else "None" for attr in transcript_header ] # memorizzo la relazione (gene)-[contiente]->(trascritto) (se non esiste) if not has_transcript_dict.has_key(line["gene_id"] + ':' + line["transcript_id"]): has_transcript_dict[line["gene_id"] + ':' + line["transcript_id"]] = [ line[attr] for attr in has_transcript_header ] elif line['feature'] == 'exon': #definisco un ID per l'esone (necessario per il popolamento su db) exon_id = line["exon_number"] + ':' + line["transcript_id"] # memorizzo l'esone nel file csv exonWriter.writerow([exon_id] + [ line[attr] if line.has_key(attr) else "None" for attr in exon_header[1:] ]) #memorizzo la relazione (trascritto)-[contiene]->(esone) nel file csv hasExonWriter.writerow([line["transcript_id"], exon_id]) if not (row_count % 15000): print 
str(row_count) + " scanned" # scrivo i file csv dei dict creati in precedenza for chrom in list(chromosomes): chromosomeWriter.writerow([chrom]) for gene in genes_dict.keys(): geneWriter.writerow(genes_dict[gene]) for transcript in transcripts_dict.keys(): transcriptWriter.writerow(transcripts_dict[transcript]) for entry in contains_dict.keys(): containsWriter.writerow(contains_dict[entry]) for entry in in_chromosome_dict.keys(): inChromosomeWriter.writerow(in_chromosome_dict[entry]) for entry in has_transcript_dict.keys(): hasTranscriptWriter.writerow(has_transcript_dict[entry]) # termino la scrittura dei file csv # ---- nodi chromosome_csv.close() gene_csv.close() transcript_csv.close() exon_csv.close() # ---- relazioni contains_csv.close() in_chromosome_csv.close() has_transcript_csv.close() has_exon_csv.close() print 'Populating Database...' session = driver.session() prova = [ "MERGE (u:User { username:{username} })", "MERGE (e:Experiment { name:{experiment} })", "MERGE (s:Species {species: {species} })", "MERGE (f:File { name:{properties}.name }) ON CREATE SET f += {properties}", "MERGE (u)-[:Created]->(e)", "MERGE (e)-[:For_Species]->(s)", "MERGE (e)-[:Composed_By]->(f)" ] # Associo il file all'utente session.run( " ".join(prova), { "username": username, "experiment": experiment, "species": species, "properties": properties }) session.close() populateDB(driver, temp_folder + temp_token) print 'Done.'