def fromCancerList(n, value):
    """Return candidate proteins that also appear in ``numlist.cancerlist(n)[1]``.

    A candidate is any protein (first whitespace-separated column) whose DA
    score (third column) is strictly greater than ``value`` in at least one
    ``epex_outputs/epexchosen_<disease>.txt`` file.

    Args:
        n: Forwarded unchanged to ``numlist.cancerlist``.
        value: Cutoff; a row qualifies when its third-column score is
            ``> value``.

    Returns:
        A list of protein identifiers, without duplicates, in order of first
        qualifying appearance across the disease files.

    Raises:
        OSError: If one of the per-disease files cannot be opened.
        IndexError, ValueError: If a non-blank row has fewer than three
            columns or a non-numeric third column.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import os

    cutoff = value
    # Only the keys (disease codes) are used below; the numbers are
    # presumably per-disease case counts -- TODO confirm.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433,
    }

    # All proteins with DA > cutoff in any file (not filtered for idiosync).
    # The list keeps first-seen order; the set makes the duplicate check O(1).
    candidates = []
    seen = set()
    for disease in casedict:
        # os.path.join replaces the literal "epex_outputs\e..." whose "\e" is
        # an invalid escape sequence and only worked as a path on Windows.
        filename = os.path.join("epex_outputs",
                                "epexchosen_{}.txt".format(disease))
        with open(filename) as fin:
            for line in fin:
                fields = line.split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                protein = fields[0]
                if float(fields[2]) > cutoff and protein not in seen:
                    seen.add(protein)
                    candidates.append(protein)

    nProteins = numlist.cancerlist(n)
    return [protein for protein in candidates if protein in nProteins[1]]
def idiosync(num):
    """Group the genes of ``cancerlist(num)[1]`` by the keys they occur under.

    For every gene in ``cancerlist(num)[1]``, each mapping returned by
    ``countproteins()`` is scanned; whenever the gene is a key of one of the
    mapping's inner mappings, the corresponding outer key is recorded.  Genes
    sharing the same tuple of outer keys end up in the same list.

    Args:
        num: Forwarded unchanged to ``cancerlist``.

    Returns:
        A dict mapping a tuple of outer keys (in scan order; the empty tuple
        for genes found nowhere) to the list of genes that produced it.
    """
    cancerdictlist = countproteins()
    genelistnum = cancerlist(num)

    returndict = {}
    for genel in genelistnum[1]:
        # `cancer_dict` avoids shadowing the builtin `dict`; the membership
        # test replaces a manual scan over the inner mapping's keys.
        tupl = tuple(
            key
            for cancer_dict in cancerdictlist
            for key, inner in cancer_dict.items()
            if genel in inner
        )
        returndict.setdefault(tupl, []).append(genel)
    return returndict
def numCompareTcga(nin, value):
    """Report, per selected protein, how often it is DA across the TCGA files.

    A protein is selected when its DA score (third whitespace-separated
    column) is strictly greater than ``value`` in at least one
    ``epex_outputs/epexchosen_<disease>.txt`` file and it also appears in
    ``numlist.cancerlist(nin)[1]``.

    Args:
        nin: Forwarded unchanged to ``numlist.cancerlist``.
        value: Cutoff; a row counts as DA when its score is ``> value``.

    Returns:
        A dict ``{protein: (percent, n_da, n_rows)}`` where ``n_da`` is the
        number of rows for the protein with score above the cutoff, ``n_rows``
        is the number of rows for the protein in all files regardless of
        score, and ``percent`` is ``n_da / n_rows * 100`` rounded to one
        decimal place.

    Raises:
        OSError: If one of the per-disease files cannot be opened.
        IndexError, ValueError: If a non-blank row has fewer than three
            columns or a non-numeric third column.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import os

    n = nin
    cutoff = value
    # Only the keys (disease codes) are used below; the numbers are
    # presumably per-disease case counts -- TODO confirm.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433,
    }

    # Read and parse every file exactly once, closing each handle.
    rows = []  # (protein, score) for every non-blank row, in file order
    for disease in casedict:
        # os.path.join replaces the literal "epex_outputs\e..." whose "\e" is
        # an invalid escape sequence and only worked as a path on Windows.
        filename = os.path.join("epex_outputs",
                                "epexchosen_{}.txt".format(disease))
        with open(filename) as fin:
            for line in fin:
                fields = line.split()
                if fields:
                    rows.append((fields[0], float(fields[2])))

    # 1) Proteins with DA > cutoff in any file that are also in the
    #    numlist.cancerlist(n) gene list.
    aboveCutoff = {protein for protein, score in rows if score > cutoff}
    nProteins = numlist.cancerlist(n)
    selected = {protein for protein in aboveCutoff if protein in nProteins[1]}

    # 2) For each selected protein, calculate the percent of its rows that
    #    are DA, reported as (%, # DA rows, # rows overall).
    daCount = Counter(
        protein for protein, score in rows
        if score > cutoff and protein in selected
    )
    tcgaProteinCount = Counter(protein for protein, _ in rows)

    # daCount keeps first-seen order, so the result is ordered by each
    # protein's first DA row.
    return {
        protein: (
            round((count / tcgaProteinCount[protein]) * 100, 1),
            count,
            tcgaProteinCount[protein],
        )
        for protein, count in daCount.items()
    }
listCandidateProteins = [ ] #list of all proteins in all files with DA > cutoff (not filtered for idiosync) for disease in diseaseList: filename = "epex_outputs\epexchosen_{}.txt".format(disease) fin = open(filename) readfile = fin.readlines() for i in range(len(readfile)): readfile[i] = readfile[i].split() if float(readfile[i][2]) > cutoff and readfile[i][ 0] not in listCandidateProteins: # scorelist.append(round(float(readfile[i][2]),5)) # listCandidateProteins.append((readfile[i][0],readfile[i][2])) #[(protein, DAscore), (protein, DAscore),(protein, DAscore)...] listCandidateProteins.append(readfile[i][0]) fromCancerList = [] nProteins = numlist.cancerlist(n) for protein in listCandidateProteins: if protein in nProteins[1]: fromCancerList.append(protein) freq_dict = {} for disease in diseaseList: filename = "epex_outputs\epexchosen_{}.txt".format(disease) fin = open(filename) readfile = fin.readlines() for i in range(len(readfile)): readfile[i] = readfile[i].split() if readfile[i][0] in fromCancerList: if readfile[i][0] not in freq_dict: freq_dict[readfile[i][0]] = 1 else:
def relFreqDict(num, value):
    """Return a z-score of the row frequency of each selected protein.

    A protein is selected when its DA score (third whitespace-separated
    column) is strictly greater than ``value`` in at least one
    ``epex_outputs/epexchosen_<disease>.txt`` file and it also appears in
    ``numlist.cancerlist(num)[1]``.  Its frequency is the number of rows, over
    all files and regardless of score, that name it.  The z-score is
    ``(frequency - mean) / sample standard deviation`` over the selected
    proteins, rounded to two decimal places.

    Args:
        num: Forwarded unchanged to ``numlist.cancerlist``.
        value: Cutoff used to select proteins (score ``> value``).

    Returns:
        A dict ``{protein: z_score}``.  It is empty when no protein is
        selected.  When the standard deviation is undefined or zero (fewer
        than two proteins, or all frequencies equal) every z-score is ``0.0``
        instead of raising.

    Raises:
        OSError: If one of the per-disease files cannot be opened.
        IndexError, ValueError: If a non-blank row has fewer than three
            columns or a non-numeric third column.
    """
    # Function-scope imports so the block does not depend on file-level ones.
    import os
    import statistics

    n = num
    cutoff = value
    # Only the keys (disease codes) are used below; the numbers are
    # presumably per-disease case counts -- TODO confirm.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433,
    }

    # Read and parse every file exactly once, closing each handle.
    rows = []  # (protein, score) for every non-blank row, in file order
    for disease in casedict:
        # os.path.join replaces the literal "epex_outputs\e..." whose "\e" is
        # an invalid escape sequence and only worked as a path on Windows.
        filename = os.path.join("epex_outputs",
                                "epexchosen_{}.txt".format(disease))
        with open(filename) as fin:
            for line in fin:
                fields = line.split()
                if fields:
                    rows.append((fields[0], float(fields[2])))

    # 1. Select proteins with DA > cutoff that are in the cancer gene list,
    #    then count how many rows mention each of them.
    aboveCutoff = {protein for protein, score in rows if score > cutoff}
    nProteins = numlist.cancerlist(n)
    selected = {protein for protein in aboveCutoff if protein in nProteins[1]}

    freq_dict = {}
    for protein, _ in rows:
        if protein in selected:
            freq_dict[protein] = freq_dict.get(protein, 0) + 1

    if not freq_dict:
        # Nothing selected: the mean would divide by zero.
        return {}

    # 2. Compute mean, standard deviation, and the z-score for each protein.
    freq_list = list(freq_dict.values())
    mean = sum(freq_list) / len(freq_list)
    # statistics.stdev needs at least two data points.
    standardDev = statistics.stdev(freq_list) if len(freq_list) > 1 else 0.0
    if standardDev == 0:
        # Every frequency equals the mean, so no protein deviates from it.
        return {protein: 0.0 for protein in freq_dict}

    return {
        protein: round((freq - mean) / standardDev, 2)
        for protein, freq in freq_dict.items()
    }