def dfSummary(cls, thisObservation, mc=10):
    """Summarize kmers for one observation into a DataFrame.

    The keys of the first summary record define the column set; every
    record (the first one included) is then appended as a row.

    :param thisObservation: observation passed to ``cls.summarizeKmers``
    :param mc: forwarded to ``summarizeKmers`` (default 10)
    :return: DataFrame with one row per summary record
    """
    summary = DataFrame()
    haveHeader = False
    for record in cls.summarizeKmers(thisObservation, mc):
        if not haveHeader:
            # First record seen: its keys become the column header.
            summary.addColumns(list(record))
            haveHeader = True
        summary.addRow(DataRow.fromDict(record))
    return summary
"genes") indf = DataFrame.parseFromFile(args.de.name, skipChar='#', replacements={ "None": None, "": None, "NA": None }) inHeaders = indf.getHeader() outdf = DataFrame() outdf.addColumns([ 'elem_id', 'population_size', 'success_population', 'sample_size', 'success_samples', 'sample_success_fraction', 'pval', 'adj_pval', 'direction', 'genes' ]) for direction in ["UP", "DOWN", "ANY"]: significantGenes = set() measuredGenes = set() for row in indf: geneID = row[args.gene_symbol].upper() if geneID in sym2approvSym: geneID = sym2approvSym[geneID]
def fetch(self,
          fromEntity,
          elements,
          toEntities=None,
          error_on_empty_result=True):
    """Fetch identifier conversions for *elements* from the remote service.

    Sends one POST request and parses the tab-separated response into a
    DataFrame whose columns are ``toEntities + [fromEntity]``.  Response
    rows whose last field holds comma-separated alternatives are fanned
    out into one output row per alternative.

    :param fromEntity: source identity namespace (checked via
        ``_must_accept``).
    :param elements: identifiers to convert; sorted before the request.
    :param toEntities: target identity namespaces (checked via
        ``_must_provide``); defaults to UNIPROT, GENE_SYMBOL and
        ORDERED_LOCUS.  Fix: the original used a mutable default list
        shared across calls — a fresh list is now built per call.
    :param error_on_empty_result: raise on an empty response body.
    :return: DataFrame with the converted identifiers; empty response
        fields become None.
    :raises StoreException: when the service returns no text, or empty
        text while *error_on_empty_result* is set.
    """
    if toEntities is None:
        toEntities = [
            GeneIdentity.UNIPROT, GeneIdentity.GENE_SYMBOL,
            GeneIdentity.ORDERED_LOCUS
        ]

    self._must_accept(fromEntity)
    self._must_provide(toEntities)

    elements = sorted(elements)
    reqParams = self._make_params(fromEntity, elements, toEntities)

    # Log the request: short parameter lists verbatim, long ones by size.
    for x in reqParams:
        lenReqParams = len(reqParams[x])
        if lenReqParams < 100:
            print(str(x) + " " + str(reqParams[x]))
        else:
            print(str(x) + " " + str(lenReqParams) + " elements")

    resp = self._request(RequestMethod.POST, "", reqParams)

    if resp.text is None:  # fix: was "== None"; None wants an identity check
        print(json.dumps(reqParams))
        raise StoreException("Could not retrieve elements")

    if len(resp.text) == 0 and error_on_empty_result:
        raise StoreException("Empty result")

    convData = DataFrame()
    dfCols = toEntities + [fromEntity]
    convData.addColumns(dfCols)

    def addLineToReturn(lineData):
        # Map positional fields onto dfCols; empty strings become None.
        modLine = {}
        for c, x in zip(dfCols, lineData):
            if x == '':
                modLine[c] = None
            else:
                modLine[c] = x
        convData.addRow(DataRow.fromDict(modLine))

    bFirstLine = True
    for line in resp.text.split('\n'):
        if bFirstLine:
            # Skip the header line of the TSV response.
            bFirstLine = False
            continue
        if len(line) == 0:
            continue

        aline = line.split('\t')
        if len(aline) == 0:
            continue

        # Drop the trailing response field before mapping onto dfCols.
        # NOTE(review): presumably an echo of the query — confirm against
        # the service's response format.
        aline = aline[:-1]

        if ',' in aline[-1]:
            # Last remaining field holds several comma-separated values:
            # emit one output line per value, also splitting any other
            # space-separated field that has a matching element count.
            elems = aline[-1].split(',')
            elemCount = len(elems)
            for i in range(0, elemCount):
                modLine = []
                for elem in aline[:-1]:
                    aelem = elem.split(' ')
                    if len(aelem) != elemCount:
                        modLine.append(elem)
                    else:
                        modLine.append(aelem[i])
                modLine.append(elems[i])
                addLineToReturn(modLine)
        else:
            addLineToReturn(aline)

    return convData
# NOTE(review): fragment — x, fInts, defInts, interactions and the
# *Interactions dicts are defined outside this view; the leading loop
# presumably sits inside an enclosing loop over x.
for mirna in fInts:
    # Check whether any default interaction accepts this miRNA.
    mirnaFound = False
    for defMirna in defInts:
        if defMirna.accept(mirna):
            mirnaFound = True
            break
    if mirnaFound == False:
        # Not covered by the default set: record it as additional.
        additionalInteractions[x].add(miRNA(mirna))

# Result tables for interactions missing from / linked to the original
# network; identical column layouts.
missingDF = DataFrame()
missingDF.addColumns([
    'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
    'MIRECORD', 'MIRTARBASE', 'MIRWALK'
])

linkedDF = DataFrame()
linkedDF.addColumns([
    'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
    'MIRECORD', 'MIRTARBASE', 'MIRWALK'
])

totalMissing = 0
print("Missing miRNAs")
for x in missingInteractions:
    print(x, len(missingInteractions[x]), len(interactions[x]),
          missingInteractions[x])
    totalMissing += len(missingInteractions[x])
[x[1] - pseudoCount for x in sample2genecount[sample]])
# NOTE(review): fragment — the call wrapping the comprehension above and
# sample/pseudoCount/totalCount/makeplot/defile/args/dfGroups come from
# outside this view; indentation is reconstructed.
print(sample, totalCount)
sample2stats[sample] = {"sample": sample, "totalCount": totalCount}

sample2stats = makeplot(sample2genecount, defile.name, sample2stats,
                        args.output[fidx])

# Union of stat keys over all samples, preserving first-seen order, to
# use as output columns.
columns = list()
for sample in sample2stats:
    for x in sample2stats[sample]:
        if not x in columns:
            columns.append(x)

outdf = DataFrame()
outdf.addColumns(columns)

for sample in sample2stats:
    dr = DataRow.fromDict(sample2stats[sample])
    outdf.addRow(dr)

print(outdf)

if dfGroups != None:
    # Collect every gene seen in any sample's gene-count list.
    allGenes = set()
    for sample in sample2genecount:
        for x in sample2genecount[sample]:
            allGenes.add(x[0])
# NOTE(review): fragment — df2SpecialCols, the *Col2New maps, the column
# name lists, args and indf1 are defined outside this view.
for x in df2SpecialCols:
    # Inject prefix2 as the second "_"-separated token of the name.
    xn = x.split("_")
    xn.insert(1, args.prefix2)
    xn = "_".join(xn)
    df2Col2New[x] = xn
    print("Sp2", x, xn)

df1NewCols = [df1Col2New[x] for x in df1Col2New]
df2NewCols = [df2Col2New[x] for x in df2Col2New]

outdf = DataFrame()

if args.prefix_counts:
    # Prefix per-frame unique/sample columns so the two inputs stay
    # distinguishable in the merged header.
    outdf.addColumns(
        df12CommonCols +
        [args.prefix1 + "_" + x for x in df1UniqueCols + df1SampleCols] +
        [args.prefix2 + "_" + x for x in df2UniqueCols + df2SampleCols] +
        df1NewCols + df2NewCols)
else:
    outdf.addColumns(df12CommonCols + df1UniqueCols + df2UniqueCols +
                     df1NewCols + df2NewCols)

for x in outdf.getHeader():
    print("O", x)

id2dataDf = {}
for row in indf1:
    data = {}
    # Copy the shared columns for this row.
    for x in df12CommonCols:
        data[x] = row[x]
# NOTE(review): fragment — row, sample, geneLength, rowDict, outdf and
# the sample2* totals are defined outside this view; the leading ifs are
# presumably inside per-row/per-sample loops.
if args.fpkm:
    #print(curGeneID, geneLength)
    # FPKM: counts / (library total * gene length) * 1e9
    fpkmValue = row[sample] / (sample2total[sample] * geneLength) * pow(10, 9)
    rowDict[sample + ".FPKM"] = fpkmValue

if args.tpm:
    # TPM: counts / (gene length * per-sample ratio) * 1e6
    tpmValue = row[sample] / (geneLength * sample2ratio[sample]) * pow(10, 6)
    rowDict[sample + ".TPM"] = tpmValue

allRowUpdates.append(rowDict)

# Union of all update keys that are not already featureCounts columns.
allCols = set()
for x in allRowUpdates:
    for y in x:
        if not y in featureCountsColumns:
            allCols.add(y)

outdf.addColumns(featureCountsColumns)
outdf.addColumns(sorted(allCols), default=0, ignoreDuplicates=True)
outdf.updateRowIndexed("Geneid",
                       allRowUpdates,
                       ignoreMissingCols=True,
                       addIfNotFound=True)
outdf.export(args.output.name, exType=ExportTYPE.TSV)
# NOTE(review): fragment — stage/mirna/cellpair and the stage* maps are
# defined outside this view; the leading print presumably sits inside
# nested loops.
print(stage, mirna, cellpair[0], cellpair[1],
      mirnaCellPairs[mirna][cellpair], stageMirnaCellPairs[cellpair],
      stageMir2CellEvidence[stage][mirna].get(cellpair[0]),
      stageMir2CellEvidence[stage][mirna].get(cellpair[1]))

cellgraph = networkx.Graph()

# Collect every node that appears in a supported edge.
allnodes = set()
for edge in edge2support:
    allnodes.add(edge[0])
    allnodes.add(edge[1])

for node in allnodes:
    # Node label is "<node[1]> (<node[0]>)"; node size scales with
    # stageCellCount (presumably a per-stage cell count — verify).
    cellgraph.add_node(node[1] + " (" + node[0] + ")",
                       size=20 + stageCellCount[node])

cellCommunicatorDF = DataFrame()
cellCommunicatorDF.addColumns(["miRNA", "cells"])

mirna2cells = defaultdict(set)
for edge in edge2support:
    # Edge label joins the supporting entries for this pair.
    cellgraph.add_edge(
        edge[0][1] + " (" + edge[0][0] + ")",
        edge[1][1] + " (" + edge[1][0] + ")",
        label=", ".join(edge2support.get(edge, [])))

    mirnas = edge2support.get(edge, [])
    for mirna in mirnas:
        mirna2cells[mirna].add(edge[0][1] + " (" + edge[0][0] + ")")
        mirna2cells[mirna].add(edge[1][1] + " (" + edge[1][0] + ")")
pubmedID = article['PubmedData']['ArticleIdList'][0] if len(
    article['PubmedData']['ArticleIdList']) > 0 else "-1"
# NOTE(review): fragment — the enclosing function (providing article and
# pmid2title) and ntd are defined outside this view; the chunk also ends
# mid dict-literal.  Flat indentation below is a reconstruction.
pubID = int(pubmedID)

artInfo = article['MedlineCitation']['Article']
articleTitle = artInfo['ArticleTitle']
articleJournal = artInfo['Journal'][
    'Title'] if 'Journal' in artInfo else ''

pmid2title[pubID] = articleTitle

return pmid2title

res = DataFrame()
res.addColumns(["SET", "PMID_ID", "PMID_TITLE", 'Common'])

print(ntd)
print("NTD", len(ntd))

pmidt = getPMIDTitles(ntd)

for x in sorted([x for x in pmidt]):
    # One output record per PMID; the ID is rendered as an HTML link to
    # PubMed.
    dataDict = {
        'SET': 'NTinfect',
        'PMID_ID':
        "<a href='https://www.ncbi.nlm.nih.gov/pubmed/" + str(x) +
        "' target='_blank'>" + str(x) + "</a>",
        'PMID_TITLE': pmidt[x],
foundAcceptedInteractions[x].add(mirna)
# NOTE(review): fragment — x, mirna, fInts, defInts, interactions and
# the *Interactions dicts come from outside this view; the leading lines
# presumably sit inside an enclosing loop over x.

for mirna in fInts:
    # Check whether any default interaction accepts this miRNA.
    mirnaFound = False
    for defMirna in defInts:
        if defMirna.accept(mirna):
            mirnaFound = True
            break
    if mirnaFound == False:
        additionalInteractions[x].add(miRNA(mirna))

# Result tables; 'Weber' column presumably refers to a reference
# network — confirm against the rest of the script.
missingDF = DataFrame()
missingDF.addColumns(
    ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

linkedDF = DataFrame()
linkedDF.addColumns(
    ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

totalMissing = 0
print("Missing miRNAs")
for x in missingInteractions:
    print(x, len(missingInteractions[x]), len(interactions[x]),
          missingInteractions[x])
    totalMissing += len(missingInteractions[x])

    selInts = missingInteractions[x]
print(datetime.datetime.now(), "Loading ncit")
# NOTE(review): fragment — the if-branch these leading lines sit in (and
# args, easyPMIDFinder, powerset) are defined outside this view; the
# indentation around the dangling "else:" is reconstructed.
ncitPMIDs = easyPMIDFinder(args.pmidBase + "/ncit.pmid")
dbs2pmids["NCIT"] = ncitPMIDs

# Cache the collected PMID sets to disk ...
with open("/mnt/d/pmidsindims.pickle", 'wb') as fout:
    pickle.dump(dbs2pmids, fout)
else:
    # ... or load them from the cached pickle instead of recomputing.
    # NOTE(review): pickle.load is only safe because this file is
    # produced by the branch above, not untrusted input.
    with open("/mnt/d/pmidsindims.pickle", 'rb') as fout:
        dbs2pmids = pickle.load(fout)

outdf = DataFrame()
outdf.addColumns(["Subset", "Number of PMIDs"])

allDims = [x for x in dbs2pmids]
allPowerSets = powerset(sorted(allDims))

# Union of PMIDs over all dimensions.
allPMIDs = set()
for x in dbs2pmids:
    allPMIDs = allPMIDs.union(dbs2pmids[x])

for pset in allPowerSets:
    if len(pset) == 0:
        continue
]
# NOTE(review): fragment — opens by closing a list started outside this
# view and ends inside the networkRestrictions dict literal.
networks['targetMirsCholEfflux'] = targetMirsCholEfflux

# SMC proliferation / migration
targetMirsSMCProlif = [
    'miR-24', 'miR-26a', 'miR-31', 'miR-146a', 'miR-155', 'miR-208',
    'miR-221', 'miR-222', 'miR-7d', 'let-7d', 'miR-1', 'miR-10a', 'miR-21',
    'miR-29', 'miR-100', 'miR-132', 'miR-133', 'miR-143', 'miR-145',
    'miR-195', 'miR-204', 'miR-424', 'miR-638', 'miR-663'
]
networks['targetMirsSMCProlif'] = targetMirsSMCProlif

# Per-network summary table.
summaryDF = DataFrame()
summaryDF.addColumns(
    ["Network", "Accepted miRNAs", 'Additional miRNAs', "Missing miRNAs"])

networkGraphs = {}
makeStory = []

allNetworks = [x for x in networks]
print(allNetworks)
#exit()

ignoreNetworks = []

networkRestrictions = {
    'targetMirsECA': {
        "cells": [{
            "group": "cells",
'name': 'atherosclerosis'
    # NOTE(review): fragment — starts inside the networkRestrictions
    # dict literal opened outside this view.
    }]
},
'andreou_table1_athero': {
    'sentences': "false",
    "disease": [{
        'group': 'disease',
        'termid': 'DOID:1936',
        'name': 'atherosclerosis'
    }]
},
}

restrictDF = DataFrame()
restrictDF.addColumns(["Network", "Cells", "Disease", "Other"], "")

for x in networkRestrictions:
    restricts = networkRestrictions[x]

    # One row per network; defaultdict(str) yields "" for unset cells.
    networkDRdict = defaultdict(str)
    networkDRdict["Network"] = x.replace("_", " ")

    diseaseElems = []
    cellElems = []
    otherElems = []

    for restrictType in restricts:
        if restrictType == "sentences":
'miR-29', 'miR-100', 'miR-132', 'miR-133', 'miR-143', 'miR-145',
# NOTE(review): fragment — continues a list opened outside this view and
# ends inside the networkRestrictions dict literal.
'miR-195', 'miR-204', 'miR-424', 'miR-638', 'miR-663']
networks['targetMirsSMCProlif'] = targetMirsSMCProlif

# Per-network summary table.
summaryDF = DataFrame()
summaryDF.addColumns(["Network", "miRNAs", 'Target Genes'])

networkGraphs = {}
makeStory = []

allNetworks = [x for x in networks]
print(allNetworks)
#exit()

ignoreNetworks = []

networkRestrictions = {
    'targetMirsECA': {
homDB.finalize() homDB.save_to_file(fileLocation + "combed") """
# NOTE(review): the text before the closing triple-quote above is part
# of a string literal opened outside this view (apparently commented-out
# code) and is kept verbatim.
for orgname in homDB.get_all_organisms():
    genomDB.loadGenome(orgname)

allorgs = list(homDB.get_all_organisms())

# Hand-picked organism subset; nmc is its complement.  Meaning of "mc"
# is not visible here — TODO confirm.
mc = [
    '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1', '3_N1-029C1',
    '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
]
nmc = [x for x in allorgs if not x in mc]  # and not x.startswith("15")

allData = DataFrame()
allData.addColumns(allorgs)

homClusterIDs = []
for homid in homDB.homologies:
    val = homDB.get_homology_cluster(homid)

    # Find the longest member sequence in this homology cluster.
    maxlength = 0
    for org in val:
        geneid = val[org]
        seq = genomDB.get_sequence(org, geneid)
        if len(seq) > maxlength:
            maxlength = len(seq)
xn = "_".join(xn) df2Col2New[x] = xn print("S2", x, xn) df1NewCols = [df1Col2New[x] for x in df1Col2New] df2NewCols = [df2Col2New[x] for x in df2Col2New] outdf = DataFrame() if args.prefix_counts: if len(curPrefix) > 0: curPrefix += "_" outdf.addColumns( df12CommonCols + [curPrefix + x for x in df1UniqueCols] + [args.prefixes[didx] + "_" + x for x in df2UniqueCols] + df1NewCols + df2NewCols) else: outdf.addColumns(df12CommonCols + df1UniqueCols + df2UniqueCols + df1NewCols + df2NewCols) for x in outdf.getHeader(): print("O", x) id2dataDf = {} for row in curDF: data = {} for x in df12CommonCols: data[x] = row[x]
allorgs = list(homDB.get_all_organisms())
# NOTE(review): fragment — homDB and genomDB are set up outside this
# view.

extra = ['AE001439', 'CP009259']
# Hand-picked organism subset; nmc excludes mc, extra and any organism
# whose name starts with "6_".  Meaning of "mc" is not visible here —
# TODO confirm.
mc = [
    '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1', '3_N1-029C1',
    '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
]
nmc = [
    x for x in allorgs
    if not x in mc and not x in extra and not x.startswith("6_")
]  # and not x.startswith("15")

allData = DataFrame()
allData.addColumns(allorgs)

homClusterIDs = []
for homid in homDB.homologies:
    val = homDB.get_homology_cluster(homid)

    # Find the longest member sequence in this homology cluster.
    maxlength = 0
    for org in val:
        geneid = val[org]
        seq = genomDB.get_sequence(org, geneid)
        if len(seq) > maxlength:
            maxlength = len(seq)