Exemplo n.º 1
0
def histoneFeaturedVector(path, percentage, genome, splitGenome, typeOfFill, percentageOfOverPosition, histoneMarks_by_type):
	"""Mark the genome bins covered by the histone-mark features found under path.

	Every sub-folder matched by ``path + "*/"`` is handed to ``intersect``
	together with ``percentage``; each returned feature is read as
	``(chromosome, start, end, ...)``. The genome is cut into bins of
	``splitGenome * 1000`` positions and every bin a feature overlaps is
	marked in a vector created by ``get_vector(genome, splitGenome)``.

	Args:
		path: Folder prefix; one sub-folder per histone mark is expected.
		percentage: Passed unchanged to ``intersect``.
		genome: Passed unchanged to ``get_vector``.
		splitGenome: Bin size in kilobases.
		typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping
			feature, ``"binary"`` sets the bin to 1. Any other value leaves
			the vectors untouched.
		percentageOfOverPosition: Minimum percentage of the feature length
			that must lie inside the first (respectively last) bin for that
			bin to be marked. Bins strictly between the first and the last
			one are always marked.
		histoneMarks_by_type: ``"together"`` accumulates every mark into the
			single key ``"histone_marks"``; ``"unique"`` keeps one entry per
			sub-folder, named after the folder without ``-human``.

	Returns:
		dict mapping mark name to the structure returned by ``get_vector``
		(indexed by chromosome, then by bin number).
	"""
	hmFolders = glob(path+"*/")
	hmDict = {}
	if histoneMarks_by_type == "together":
		hmDict["histone_marks"] = get_vector(genome, splitGenome)
	if histoneMarks_by_type == "unique":
		for hmFolder in hmFolders:
			hmDict[hmFolder.split("/")[-2].replace("-human","")] = get_vector(genome, splitGenome)

	binSize = splitGenome*1000
	binSizeFloat = splitGenome*1000.  # float version keeps the percentages fractional

	for hmFolder in hmFolders:
		if histoneMarks_by_type == "together":
			currentHM = "histone_marks"
		else:
			currentHM = hmFolder.split("/")[-2].replace("-human","")

		for feature in intersect(hmFolder,percentage):
			chrom, start, stop = feature[0], feature[1], feature[2]
			init = int(start/binSize)
			end = int(stop/binSize)
			length = stop-start

			if length == 0:
				# A zero-length feature sits wholly inside one bin; the
				# formulas below would divide by zero.
				percent_init_occupancy = percent_end_occupancy = 100.
			else:
				# share of the feature lying in its first bin
				percent_init_occupancy = (((init+1)*binSizeFloat-start)*100)/length
				# share of the feature lying in its last bin
				percent_end_occupancy = ((stop-end*binSizeFloat)*100)/length

			for i in range(init, end+1):
				# Edge bins are skipped when too little of the feature falls
				# in them (the original used "==" here, so nothing was ever
				# skipped).
				if i == init and percent_init_occupancy < percentageOfOverPosition:
					continue
				if i == end and percent_end_occupancy < percentageOfOverPosition:
					continue

				binVector = hmDict[currentHM][chrom]
				if typeOfFill == "decimal":
					binVector[i] += 1
				elif typeOfFill == "binary":
					binVector[i] = 1
				hmDict[currentHM][chrom] = binVector

	return hmDict
Exemplo n.º 2
0
Arquivo: GTF.py Projeto: Cold7/tesis
def geneVector(genome, splitGenome, GTF, typeOfFill, percentageOfOverPosition):
	"""Mark the genome bins covered by the gene records of a GTF file.

	The genome is cut into bins of ``splitGenome * 1000`` positions. A GTF
	record is used when its line does not contain ``spikein<TAB>exon`` and
	either its feature column (3rd) is ``"gene"`` or the first quoted value
	of its attribute column (9th) does not contain ``"ENSG"``.

	Args:
		genome: Passed unchanged to ``get_vector``.
		splitGenome: Bin size in kilobases.
		GTF: Path of the tab-separated GTF file to read.
		typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping
			record, ``"binary"`` sets the bin to 1. Any other value leaves
			the vector untouched.
		percentageOfOverPosition: Minimum percentage of the record length
			that must lie inside the first (respectively last) bin for that
			bin to be marked. Bins in between are always marked.

	Returns:
		The structure created by ``get_vector`` (indexed by the chromosome
		name found in the first GTF column, then by bin number).
	"""
	#getting gene vector for each chr (its a dict)
	vector = get_vector(genome, splitGenome)
	binSize = splitGenome*1000
	binSizeFloat = splitGenome*1000.  # float version keeps the percentages fractional

	# "with" guarantees the handle is closed, which the original never did
	with open(GTF,"r") as gtf:
		for line in gtf:
			# header/comment and blank lines have no attribute column to parse
			if line.startswith("#") or not line.strip():
				continue
			splittedLine = line.split("\t")
			geneID = splittedLine[8].split("\"")[1]
			if "spikein	exon" in line:
				continue
			if not (splittedLine[2] == "gene" or "ENSG" not in geneID):
				continue

			chrom = splittedLine[0]
			#getting gene coords
			c1 = float(splittedLine[3])
			c2 = float(splittedLine[4])
			init = int(c1/binSize)
			end = int(c2/binSize)
			length = c2-c1

			if length == 0:
				# start == end: the record sits wholly inside one bin and
				# the formulas below would divide by zero.
				percent_init_occupancy = percent_end_occupancy = 100.
			else:
				# share of the record lying in its first bin
				percent_init_occupancy = (((init+1)*binSizeFloat-c1)*100)/length
				# share of the record lying in its last bin
				percent_end_occupancy = ((c2-end*binSizeFloat)*100)/length

			for i in range(init, end+1):
				# Edge bins are skipped when too little of the record falls
				# in them (the original used "==" here, so nothing was ever
				# skipped).
				if i == init and percent_init_occupancy < percentageOfOverPosition:
					continue
				if i == end and percent_end_occupancy < percentageOfOverPosition:
					continue

				binVector = vector[chrom]
				if typeOfFill == "decimal":
					binVector[i] += 1
				elif typeOfFill == "binary":
					binVector[i] = 1
				vector[chrom] = binVector

	return vector
Exemplo n.º 3
0
def getFeaturedVector(path, percentage, genome, splitGenome, typeOfFill, percentageOfOverPosition):
	"""Mark the genome bins covered by the features that ``intersect`` returns for path.

	Each feature is read as ``(chromosome, start, end, ...)``. The genome is
	cut into bins of ``splitGenome * 1000`` positions and every bin a feature
	overlaps is marked in the vector created by
	``get_vector(genome, splitGenome)``.

	Args:
		path: Passed unchanged to ``intersect``.
		percentage: Passed unchanged to ``intersect``.
		genome: Passed unchanged to ``get_vector``.
		splitGenome: Bin size in kilobases.
		typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping
			feature, ``"binary"`` sets the bin to 1. Any other value leaves
			the vector untouched.
		percentageOfOverPosition: Minimum percentage of the feature length
			that must lie inside the first (respectively last) bin for that
			bin to be marked. Bins in between are always marked.

	Returns:
		The structure created by ``get_vector`` (indexed by chromosome, then
		by bin number).
	"""
	#getting the current features to use
	features = intersect(path,percentage)
	#getting dictionary of zeros for each portion of the genome
	dictZeroVectors = get_vector(genome, splitGenome)

	binSize = splitGenome*1000
	binSizeFloat = splitGenome*1000.  # float version keeps the percentages fractional

	for feature in features:
		chrom, start, stop = feature[0], feature[1], feature[2]
		init = int(start/binSize)
		end = int(stop/binSize)
		length = stop-start

		if length == 0:
			# A zero-length feature sits wholly inside one bin; the formulas
			# below would divide by zero.
			percent_init_occupancy = percent_end_occupancy = 100.
		else:
			# share of the feature lying in its first bin
			percent_init_occupancy = (((init+1)*binSizeFloat-start)*100)/length
			# share of the feature lying in its last bin
			percent_end_occupancy = ((stop-end*binSizeFloat)*100)/length

		for i in range(init, end+1):
			# Edge bins are skipped when too little of the feature falls in
			# them (the original used "==" here, so nothing was ever skipped).
			if i == init and percent_init_occupancy < percentageOfOverPosition:
				continue
			if i == end and percent_end_occupancy < percentageOfOverPosition:
				continue

			binVector = dictZeroVectors[chrom]
			if typeOfFill == "decimal":
				binVector[i] += 1
			elif typeOfFill == "binary":
				binVector[i] = 1
			dictZeroVectors[chrom] = binVector

	return dictZeroVectors
Exemplo n.º 4
0
Arquivo: FIMO.py Projeto: Cold7/tesis
def FIMOVector(fimopath, genome, splitGenome, typeOfFill,
               percentageOfOverPosition, fimo_filter):
    """Mark the genome bins covered by FIMO motif hits.

    The number of sites of every motif is read from
    ``./dat/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme`` (the value after
    ``nsites= `` on the lines following each ``MOTIF`` line). Then every
    ``fimo.txt`` found in the sub-folders of ``fimopath`` is scanned; a hit is
    used when the motif named in its first column has at least
    ``fimo_filter`` sites. Columns 3, 4 and 5 of a hit are read as
    chromosome, start and end.

    Args:
        fimopath: Folder whose sub-folders each contain a ``fimo.txt``.
        genome: Passed unchanged to ``get_vector``.
        splitGenome: Bin size in kilobases.
        typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping hit,
            ``"binary"`` sets the bin to 1. Any other value leaves the vector
            untouched.
        percentageOfOverPosition: Minimum percentage of the hit length that
            must lie inside the first (respectively last) bin for that bin to
            be marked. Bins in between are always marked.
        fimo_filter: Minimum ``nsites`` a motif needs for its hits to count.

    Returns:
        The structure created by ``get_vector`` (indexed by chromosome, then
        by bin number).

    Raises:
        KeyError: If a hit names a motif that is absent from the MEME file.
    """
    fimoDict = {}  # motif name -> nsites
    currentMotif = ""
    # "with" actually closes the file; the original wrote "fimoMotifs.close"
    # without calling it.
    with open("./dat/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme",
              "r") as fimoMotifs:
        for line in fimoMotifs:
            # strip only the line terminator so a final line without one is
            # not truncated
            line = line.rstrip("\r\n")
            if "MOTIF" in line:
                currentMotif = line.replace("MOTIF ", "")
            if "nsites=" in line:
                fimoDict[currentMotif] = int(line.split("nsites= ")[-1])

    bin_size = splitGenome * 1000
    bin_size_float = splitGenome * 1000.  # keeps the percentages fractional

    folders = glob(fimopath + "/*/")
    dictZeroVectors = get_vector(genome, splitGenome)
    for folder in folders:
        with open(folder + "fimo.txt", "r") as fimo_file:
            for line in fimo_file:
                if line.startswith("#") or not line.strip():
                    continue
                splt = line.rstrip("\r\n").split("\t")
                if fimoDict[splt[0]] < fimo_filter:
                    continue

                chrom = splt[2]
                start = int(splt[3])
                stop = int(splt[4])
                init = int(start / bin_size)
                end = int(stop / bin_size)
                length = stop - start

                if length == 0:
                    # A zero-length hit sits wholly inside one bin; the
                    # formulas below would divide by zero.
                    percent_init_occupancy = percent_end_occupancy = 100.
                else:
                    # share of the hit lying in its first bin
                    percent_init_occupancy = (
                        ((init + 1) * bin_size_float - start) * 100) / length
                    # share of the hit lying in its last bin
                    percent_end_occupancy = (
                        (stop - end * bin_size_float) * 100) / length

                for i in range(init, end + 1):
                    # Edge bins are skipped when too little of the hit falls
                    # in them (the original used "==" here, so nothing was
                    # ever skipped).
                    if (i == init and
                            percent_init_occupancy < percentageOfOverPosition):
                        continue
                    if (i == end and
                            percent_end_occupancy < percentageOfOverPosition):
                        continue

                    bin_vector = dictZeroVectors[chrom]
                    if typeOfFill == "decimal":
                        bin_vector[i] += 1
                    elif typeOfFill == "binary":
                        # the original wrote index 1 instead of the current bin
                        bin_vector[i] = 1
                    dictZeroVectors[chrom] = bin_vector

    return dictZeroVectors
Exemplo n.º 5
0
Arquivo: TF.py Projeto: Cold7/tesis
def tfFeaturedVector(
    path, percentage, genome, splitGenome, typeOfFill, percentageOfOverPosition
):
    """Mark the genome bins covered by transcription-factor features.

    The accepted factor names are read from ``./dat/TFs.csv``: lines starting
    with ``#`` are ignored, and the second column is kept when the fourth
    column equals ``"Yes"``. Every sub-folder matched by ``path + "*/"``
    whose name (without ``-human``) is an accepted factor is handed to
    ``intersect`` together with ``percentage``; each returned feature is read
    as ``(chromosome, start, end, ...)``.

    Args:
        path: Folder prefix; one sub-folder per transcription factor is
            expected.
        percentage: Passed unchanged to ``intersect``.
        genome: Passed unchanged to ``get_vector``.
        splitGenome: Bin size in kilobases.
        typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping
            feature, ``"binary"`` sets the bin to 1. Any other value leaves
            the vector untouched.
        percentageOfOverPosition: Minimum percentage of the feature length
            that must lie inside the first (respectively last) bin for that
            bin to be marked. Bins in between are always marked.

    Returns:
        The structure created by ``get_vector`` (indexed by chromosome, then
        by bin number), shared by all accepted factors.
    """
    #reading the list of TFs
    tf_names = set()  # only membership is tested, so a set is enough
    with open("./dat/TFs.csv", "r") as tfFile:
        for line in tfFile:
            if line[0] == "#":  #the current line is a comment
                continue
            columns = line.split(",")
            if columns[3] == "Yes":
                tf_names.add(columns[1])

    tfFolders = glob(path + "*/")
    dictZeroVectors = get_vector(genome, splitGenome)

    bin_size = splitGenome * 1000
    bin_size_float = splitGenome * 1000.  # keeps the percentages fractional

    for tfFolder in tfFolders:
        currentTF = tfFolder.split("/")[-2].replace("-human", "")
        if currentTF not in tf_names:
            continue

        #getting the current features to use
        for feature in intersect(tfFolder, percentage):
            chrom, start, stop = feature[0], feature[1], feature[2]
            init = int(start / bin_size)
            end = int(stop / bin_size)
            length = stop - start

            if length == 0:
                # A zero-length feature sits wholly inside one bin; the
                # formulas below would divide by zero.
                percent_init_occupancy = percent_end_occupancy = 100.
            else:
                # share of the feature lying in its first bin
                percent_init_occupancy = (
                    ((init + 1) * bin_size_float - start) * 100) / length
                # share of the feature lying in its last bin
                percent_end_occupancy = (
                    (stop - end * bin_size_float) * 100) / length

            for i in range(init, end + 1):
                # Edge bins are skipped when too little of the feature falls
                # in them (the original used "==" here, so nothing was ever
                # skipped).
                if (i == init and
                        percent_init_occupancy < percentageOfOverPosition):
                    continue
                if (i == end and
                        percent_end_occupancy < percentageOfOverPosition):
                    continue

                bin_vector = dictZeroVectors[chrom]
                if typeOfFill == "decimal":
                    bin_vector[i] += 1
                elif typeOfFill == "binary":
                    # the original wrote index 1 instead of the current bin
                    bin_vector[i] = 1
                dictZeroVectors[chrom] = bin_vector

    return dictZeroVectors
Exemplo n.º 6
0
def transcriptVector(genome, gtf, RNAseqFolder, splitGenome, typeOfFill,
                     percentageOfOverPosition):
    """Mark the genome bins covered by the transcripts quantified in RNA-seq tables.

    Every ``*.tsv`` file in ``RNAseqFolder`` is read, skipping its first
    line. Column 1 of each row is the transcript id and column 7 is parsed as
    its FPKM. Rows whose transcript id is a key of ``transcriptDict(gtf)``
    are placed on the genome using that entry, read as
    ``[start, end, chromosome, ...]``.

    Args:
        genome: Passed unchanged to ``get_vector``.
        gtf: Passed unchanged to ``transcriptDict``.
        RNAseqFolder: Folder containing the ``.tsv`` quantification files.
        splitGenome: Bin size in kilobases.
        typeOfFill: ``"decimal"`` adds 1 to a bin for every overlapping row,
            ``"binary"`` sets the bin to 1, ``"average"`` stores the mean FPKM
            of the rows that marked the bin. Any other value leaves the
            vector untouched.
        percentageOfOverPosition: Minimum percentage of the transcript length
            that must lie inside the first (respectively last) bin for that
            bin to be marked. Bins in between are always marked.

    Returns:
        The structure created by ``get_vector`` (indexed by chromosome, then
        by bin number).
    """
    # chr -> per-bin values (counts, flags or FPKM sums)
    vector = get_vector(genome, splitGenome)
    # chr -> how many FPKM values were added to each bin ("average" only)
    vectorAux = get_vector(genome, splitGenome)
    # transcript -> [init, end, chr, gene]
    transDict = transcriptDict(gtf)

    bin_size = splitGenome * 1000
    bin_size_float = splitGenome * 1000.  # keeps the percentages fractional

    #looking for tsv files where fpkm is annoted
    for tsv in glob(RNAseqFolder + "/*.tsv"):
        with open(tsv, "r") as tsvFile:
            next(tsvFile, None)  # skip the first (header) line
            for line in tsvFile:
                fields = line.split("\t")
                transcriptID = fields[0]
                fpkm = float(fields[6])
                if transcriptID not in transDict:
                    continue

                coords = transDict[transcriptID]
                chrom = coords[2]
                c1 = float(coords[0])
                c2 = float(coords[1])
                init = int(c1 / bin_size)
                end = int(c2 / bin_size)
                length = c2 - c1

                if length == 0:
                    # start == end: the transcript sits wholly inside one
                    # bin and the formulas below would divide by zero.
                    percent_init_occupancy = percent_end_occupancy = 100.
                else:
                    # share of the transcript lying in its first bin
                    percent_init_occupancy = (
                        ((init + 1) * bin_size_float - c1) * 100) / length
                    # share of the transcript lying in its last bin
                    percent_end_occupancy = (
                        (c2 - end * bin_size_float) * 100) / length

                for i in range(init, end + 1):
                    # Edge bins are skipped when too little of the transcript
                    # falls in them (the original used "==" here, so nothing
                    # was ever skipped).
                    if (i == init and
                            percent_init_occupancy < percentageOfOverPosition):
                        continue
                    if (i == end and
                            percent_end_occupancy < percentageOfOverPosition):
                        continue

                    bin_vector = vector[chrom]
                    if typeOfFill == "decimal":
                        bin_vector[i] += 1
                    elif typeOfFill == "binary":
                        bin_vector[i] = 1
                    elif typeOfFill == "average":
                        # remember how many values went into the sum so it
                        # can be turned into a mean afterwards
                        vectorAux[chrom][i] += 1
                        bin_vector[i] += fpkm
                    vector[chrom] = bin_vector

    # all files are read: turn the FPKM sums into means
    if typeOfFill == "average":
        for chrom, sums in vector.items():
            for i, total in enumerate(sums):
                if total != 0:
                    vector[chrom][i] /= vectorAux[chrom][i]

    return vector