def __init__(self, filename):
    """Load MIMAT/MI/ORGMIR/ORGMI relations from *filename* and build lookup maps."""
    # one-to-one lookups keyed by mature accession (MIMAT)
    self.mimat2mi = {}
    self.mimat2orgmi = {}
    self.mimat2orgmir = {}
    # one-to-many lookups
    self.orgmir2mimat = defaultdict(set)
    self.mi2orgmi = defaultdict(set)
    self.mi2orgmir = defaultdict(set)
    self.orgmi2mi = defaultdict(set)
    self.orgmir2mi = defaultdict(set)

    # keep IDs as text; they must not be coerced to numbers
    relationRows = DataFrame.parseFromFile(filename, bConvertTextToNumber=False)

    for relation in relationRows:
        mimatID = relation['MIMAT']
        orgmirID = relation['ORGMIR']
        miID = relation['MI']
        orgmiID = relation['ORGMI']

        self.mimat2mi[mimatID] = miID
        self.mimat2orgmir[mimatID] = orgmirID
        self.mimat2orgmi[mimatID] = orgmiID

        self.orgmir2mimat[orgmirID].add(mimatID)
        self.mi2orgmi[miID].add(orgmiID)
        self.mi2orgmir[miID].add(orgmirID)
        self.orgmir2mi[orgmirID].add(miID)
        self.orgmi2mi[orgmiID].add(miID)
def dfSummary(cls, thisObservation, mc=10):
    """Build a DataFrame from the k-mer summary rows of *thisObservation*.

    Column headers are taken from the keys of the first summary row;
    every row (including the first) is appended as a DataRow.
    """
    resultDF = DataFrame()

    for rowIdx, summary in enumerate(cls.summarizeKmers(thisObservation, mc)):
        if rowIdx == 0:
            # the first row's keys define the column layout
            resultDF.addColumns(list(summary))
        resultDF.addRow(DataRow.fromDict(summary))

    return resultDF
# NOTE(review): script fragment — the trailing `else:` branch is truncated here.
pseudoCount = 1

# collect the unique sample names over all groups, preserving first-seen order
allSamples = []
for x in args.groups:
    for y in x:
        if not y in allSamples:
            allSamples.append(y)
print(allSamples)

for fidx, defile in enumerate(args.counts):
    # parse the count table; map common NA spellings to None
    indf = DataFrame.parseFromFile(defile.name,
                                   skipChar='#',
                                   replacements={
                                       "None": None,
                                       "": None,
                                       "NA": None
                                   })

    dfCols = indf.getHeader()

    allDerivedSamples = []
    replaceSamples = {}
    # match the requested samples against the table's header columns
    for sample in allSamples:
        if sample in dfCols:
            replaceSamples[sample] = sample
            allDerivedSamples.append(sample)
        else:
            # (body truncated in this view)
# NOTE(review): fragment — this `else:` belongs to an `if` above this view,
# inside a loop over networks (x) and their miRNAs.
        else:
            foundAcceptedInteractions[x].add(mirna)

    for mirna in fInts:
        # does any accepted miRNA definition match this found miRNA?
        mirnaFound = False
        for defMirna in defInts:
            if defMirna.accept(mirna):
                mirnaFound = True
                break

        if mirnaFound == False:
            # present in the data but not in the reference network
            additionalInteractions[x].add(miRNA(mirna))

missingDF = DataFrame()
missingDF.addColumns([
    'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
    'MIRECORD', 'MIRTARBASE', 'MIRWALK'
])

linkedDF = DataFrame()
linkedDF.addColumns([
    'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
    'MIRECORD', 'MIRTARBASE', 'MIRWALK'
])

totalMissing = 0
print("Missing miRNAs")
for x in missingInteractions:
    # NOTE(review): truncated mid-call — further print arguments follow beyond this view
    print(x, len(missingInteractions[x]), len(interactions[x]),
import json
from collections import Counter
from collections import defaultdict
from porestat.utils.DataFrame import DataFrame, ExportTYPE

base = "/mnt/c/ownCloud/data/bcn/"
# NOTE(review): the first assignment is dead — the .jgf path immediately
# overwrites the .sif path; presumably toggled by hand during development.
inputFile = base + "all/CV-IPN-Endothelial cell activation1.0.sif"
inputFile = base + "manual/CV-IPN-Endothelial cell activation1.0.jgf"

allNodesLabels = {}
allEdges = []
uniqueNodes = set()

if inputFile.endswith('.sif'):
    # SIF: three columns per interaction line
    interactions = DataFrame.parseFromFile(inputFile,
                                           ['source', 'interaction', 'target'])

    # first pass: collect the unique node names
    for row in interactions:
        src = row['source']
        dst = row['target']
        if src is None or dst is None:
            continue
        uniqueNodes.add(src)
        uniqueNodes.add(dst)

    # second pass over the same rows (truncated in this view)
    for row in interactions:
        src = row['source']
        dst = row['target']
# NOTE(review): fragment — begins inside a loop that fills edge2support
# (edges are (stage, cell) tuple pairs supported by shared miRNAs).
            edge2support[cellpair].add(mirna)
            print(stage, mirna, cellpair[0], cellpair[1],
                  mirnaCellPairs[mirna][cellpair],
                  stageMirnaCellPairs[cellpair],
                  stageMir2CellEvidence[stage][mirna].get(cellpair[0]),
                  stageMir2CellEvidence[stage][mirna].get(cellpair[1]))

cellgraph = networkx.Graph()

# nodes are (stage, cell) tuples, rendered as "cell (stage)", sized by
# how often the cell occurs in the stage
allnodes = set()
for edge in edge2support:
    allnodes.add(edge[0])
    allnodes.add(edge[1])

for node in allnodes:
    cellgraph.add_node(node[1] + " (" + node[0] + ")",
                       size=20 + stageCellCount[node])

cellCommunicatorDF = DataFrame()
cellCommunicatorDF.addColumns(["miRNA", "cells"])

mirna2cells = defaultdict(set)
for edge in edge2support:
    # edge label lists the supporting miRNAs
    cellgraph.add_edge(
        edge[0][1] + " (" + edge[0][0] + ")",
        edge[1][1] + " (" + edge[1][0] + ")",
        label=", ".join(edge2support.get(edge, [])))

    mirnas = edge2support.get(edge, [])
    for mirna in mirnas:
        mirna2cells[mirna].add(edge[0][1] + " (" + edge[0][0] + ")")
        mirna2cells[mirna].add(edge[1][1] + " (" + edge[1][0] + ")")
from Bio import Entrez
from porestat.utils.DataFrame import DataFrame
from utils.idutils import miRExploreDir

# Collect the unique publication titles from the miR2Disease dump; each
# title is normalized to end with a period (matching PubMed title format).
dbData = DataFrame.parseFromFile(
    miRExploreDir + "/miR2Disease/AllEntries.txt",
    ['mirna', 'disease', 'effect', 'measurement', 'year', 'title'],
    bConvertTextToNumber=False)

pmidTitleIdx = dbData.getColumnIndex('title')

allTitles = []
for row in dbData:
    title = row['title']
    if title is None:  # fixed: was `== None` (PEP 8 E711)
        continue

    title = title.strip()
    if not title:  # fixed: an all-whitespace title used to crash on title[-1]
        continue

    if not title.endswith('.'):  # fixed idiom: was `not title[-1] == '.'`
        title += "."

    allTitles.append(title)

# de-duplicate (order is irrelevant for the search below)
allTitles = list(set(allTitles))
print(len(allTitles))

titlesSearch = []
# NOTE(review): fragment — the first line closes a parser.add_argument(...)
# call whose start is above this view.
                    help='alignment files')
parser.add_argument('-o',
                    '--output',
                    type=str,
                    required=False,
                    help="output base")

args = parser.parse_args()

# default the output base to the summary file's own name
if args.output == None:
    args.output = args.summary.name

indf = DataFrame.parseFromFile(args.summary.name,
                               skipChar='#',
                               replacements={
                                   "None": None,
                                   "": None,
                                   "NA": None
                               })

allStatus = []
allCols = indf.getHeader()
allCols.remove("Status")

for row in indf:
    allStatus.append(row["Status"])

sampleData = defaultdict(lambda: dict())
# second pass over rows (truncated in this view)
for row in indf:
from collections import Counter
from collections import defaultdict
from porestat.utils.DataFrame import DataFrame, ExportTYPE

interactions = DataFrame.parseFromFile(
    "/home/users/joppich/ownCloud/data/chemokines_sfb/chemokine_interactions.tsv"
)

#interactions.export("/home/users/joppich/ownCloud/data/chemokines_sfb/chemokine_interactions.html", ExportTYPE.HTML)

uniqueEdges = set()

# chemokines / receptors of interest
# NOTE(review): truncated — the list literal is not closed in this view
chemList = [
    'CXCR2', 'CCL9', 'CXCL5', 'CXCL1', 'CXCL13', 'CXCL7', 'CCL2', 'CXCL9',
    'CCL3', 'CXCL10', 'CCL22', 'CCR5', 'CCR7', 'CCL7', 'CCL4', 'CXCR4',
    'CX3CL1',
import gzip
import os
from collections import defaultdict
from porestat.utils.DataFrame import DataFrame
from utils.idutils import miRExploreDir

# Collect every UniProt accession referenced by HGNC; the sorted list is
# later used to subset the InterPro mapping (see the zgrep note below).
hgncData = DataFrame.parseFromFile(miRExploreDir + "/hgnc.tsv")

allUniprotIDs = set()
for row in hgncData:
    uniprotVals = row['UniProt ID(supplied by UniProt)']
    if uniprotVals is None:  # fixed: was `== None` (PEP 8 E711)
        continue

    # a single cell may hold several comma-separated accessions
    uniprotVals = uniprotVals.strip()
    uniprotIDs = uniprotVals.split(', ')
    for x in uniprotIDs:
        allUniprotIDs.add(x)

print(len(allUniprotIDs))

allUniprotIDs = sorted(allUniprotIDs)

uniprot2ipr = defaultdict(set)
neededUniprotIDs = miRExploreDir + "/interpro/relevant.uniprot.list"

# zgrep -f relevant.uniprot.list > relevant.uniprot.ipr.list
# NOTE(review): fragment — this `else:` belongs to an `if` above this view,
# inside a loop over networks (x) and their miRNAs.
        else:
            foundAcceptedInteractions[x].add(mirna)

    for mirna in fInts:
        # does any accepted miRNA definition match this found miRNA?
        mirnaFound = False
        for defMirna in defInts:
            if defMirna.accept(mirna):
                mirnaFound = True
                break

        if mirnaFound == False:
            # present in the data but not in the Weber reference
            additionalInteractions[x].add(miRNA(mirna))

missingDF = DataFrame()
missingDF.addColumns(
    ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

linkedDF = DataFrame()
linkedDF.addColumns(
    ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

totalMissing = 0
print("Missing miRNAs")
for x in missingInteractions:
    print(x, len(missingInteractions[x]), len(interactions[x]),
          missingInteractions[x])
    totalMissing += len(missingInteractions[x])
# NOTE(review): fragment — begins inside a helper (getPMIDTitles, judging by
# the call below) that maps PubMed IDs to article titles.
    for article in record['PubmedArticle']:
        # the first ArticleId is taken as the PMID; "-1" marks a missing one
        pubmedID = article['PubmedData']['ArticleIdList'][0] if len(
            article['PubmedData']['ArticleIdList']) > 0 else "-1"
        pubID = int(pubmedID)

        artInfo = article['MedlineCitation']['Article']
        articleTitle = artInfo['ArticleTitle']
        articleJournal = artInfo['Journal'][
            'Title'] if 'Journal' in artInfo else ''

        pmid2title[pubID] = articleTitle

    return pmid2title


res = DataFrame()
res.addColumns(["SET", "PMID_ID", "PMID_TITLE", 'Common'])

print(ntd)
print("NTD", len(ntd))

pmidt = getPMIDTitles(ntd)
for x in sorted([x for x in pmidt]):
    # PMID rendered as an HTML link into PubMed
    dataDict = {
        'SET':
        'NTinfect',
        'PMID_ID':
        "<a href='https://www.ncbi.nlm.nih.gov/pubmed/" + str(x) +
        "' target='_blank'>" + str(x) + "</a>",
        # NOTE(review): truncated mid-dict — the title value follows beyond this view
        'PMID_TITLE':
# NOTE(review): fragment — begins inside the branch that (re)builds dbs2pmids.
    dbs2pmids["CELLS"] = cellPMIDs

    print(datetime.datetime.now(), "Loading ncit")
    ncitPMIDs = easyPMIDFinder(args.pmidBase + "/ncit.pmid")
    dbs2pmids["NCIT"] = ncitPMIDs

    # cache the collected PMID sets so later runs can skip the loading above
    with open("/mnt/d/pmidsindims.pickle", 'wb') as fout:
        pickle.dump(dbs2pmids, fout)

else:
    # NOTE(review): the variable is named fout but this handle is opened for reading
    with open("/mnt/d/pmidsindims.pickle", 'rb') as fout:
        dbs2pmids = pickle.load(fout)

outdf = DataFrame()
outdf.addColumns(["Subset", "Number of PMIDs"])

allDims = [x for x in dbs2pmids]
allPowerSets = powerset(sorted(allDims))

# union of all subsets' PMIDs
allPMIDs = set()
for x in dbs2pmids:
    allPMIDs = allPMIDs.union(dbs2pmids[x])

# count PMIDs per non-empty subset combination (truncated in this view)
for pset in allPowerSets:
    if len(pset) == 0:
        continue
from collections import Counter
from neo4j.v1 import GraphDatabase, basic_auth
from porestat.utils.DataFrame import DataFrame
from utils.idutils import ltype2label, makeDBGeneID, dataDir
from database.Neo4JInterface import neo4jInterface

# Rebuild the GENE nodes in Neo4j from the HGNC export: wipe existing
# HAS_GENE relationships and GENE nodes, then re-import per-gene records.
hgncGenes = DataFrame.parseFromFile(dataDir +
                                    "/miRExplore/hgnc_ensembl_entrez.tsv",
                                    bConvertTextToNumber=False)

allStatus = Counter()

db = neo4jInterface(simulate=False)
db.createUniquenessConstraint('GENE', 'id')

# clear out any previous import before re-creating the nodes
db.deleteRelationship('n', None, None, 'm', None, None, ['HAS_GENE'], None,
                      'r')
db.deleteNode(["GENE"], None)

# NOTE(review): loop body continues beyond this view
for gene in hgncGenes:
    hgncID = gene['HGNC ID']
    hgncSym = gene['Approved Symbol']
    hgncName = gene['Approved Name']
    hgncEnsembl = gene['Ensembl ID(supplied by Ensembl)']
    hgncEntrez = gene['Entrez Gene ID(supplied by NCBI)']
    hgncStatus = gene['Status']
    hgncLocusType = gene['Locus Type']
# NOTE(review): fragment — begins inside a list literal started above this view.
    'miR-145', 'miR-155', 'miR-302a', 'miR-758', 'miR-223', 'miR-378'
]
networks['targetMirsCholEfflux'] = targetMirsCholEfflux

# SMC proliferation / migration
targetMirsSMCProlif = [
    'miR-24', 'miR-26a', 'miR-31', 'miR-146a', 'miR-155', 'miR-208',
    'miR-221', 'miR-222', 'miR-7d', 'let-7d', 'miR-1', 'miR-10a', 'miR-21',
    'miR-29', 'miR-100', 'miR-132', 'miR-133', 'miR-143', 'miR-145',
    'miR-195', 'miR-204', 'miR-424', 'miR-638', 'miR-663'
]
networks['targetMirsSMCProlif'] = targetMirsSMCProlif

# per-network counts of accepted / additional / missing miRNAs
summaryDF = DataFrame()
summaryDF.addColumns(
    ["Network", "Accepted miRNAs", 'Additional miRNAs', "Missing miRNAs"])

networkGraphs = {}
makeStory = []

allNetworks = [x for x in networks]
print(allNetworks)
#exit()

ignoreNetworks = []

# NOTE(review): truncated — the restrictions dict is not closed in this view
networkRestrictions = {
    'targetMirsECA': {
from collections import defaultdict
import editdistance
from nertoolkit.geneontology.GeneOntology import GeneOntology
from porestat.utils.DataFrame import DataFrame
from utils.idutils import miRExploreDir

# Build the set of upper-cased disease names from the miR2Disease dump.
# The literal string 'None' marks a missing entry and is skipped.
dbData = DataFrame.parseFromFile(miRExploreDir +
                                 "/miR2Disease/mirna_disease.tsv",
                                 bConvertTextToNumber=False)

allDiseases = set()
for dbRow in dbData:
    diseaseName = dbRow['disease']
    if diseaseName != 'None':
        allDiseases.add(diseaseName.upper())

print(len(allDiseases))

# Disease Ontology, used below to map free-text names onto DOID terms
diseaseObo = GeneOntology(miRExploreDir + "/doid.obo")

disease2obo = defaultdict(set)
""" find perfect matches """
# NOTE(review): fragment — the first line closes a with-open(...) started
# above this view (reading the hgnc2sym2ens2uniprot mapping file).
          "hgnc2sym2ens2uniprot") as fin:
    for line in fin:
        line = line.strip().split("\t")
        sym = line[0]
        approvSym = line[3]
        # map any known symbol to its approved HGNC symbol
        sym2approvSym[sym] = approvSym

availSets = {}
setDF = DataFrame.parseFromFile(args.sets,
                                skipChar='#',
                                replacements={
                                    "None": None,
                                    "": None,
                                    "NA": None
                                })

allSetGenes = set()
for row in setDF:
    # set_id -> (description, member genes translated to approved symbols);
    # genes without an approved symbol are silently dropped
    availSets[row['set_id']] = (row["set_descr"],
                                set([
                                    sym2approvSym[x.strip()]
                                    for x in row["genes"].split(";")
                                    if x.strip() in sym2approvSym
                                ]))
    allSetGenes = allSetGenes.union(availSets[row['set_id']][1])

# NOTE(review): truncated mid-call — further print arguments follow beyond this view
print("Got", len(availSets), "sets with a total of", len(allSetGenes),
def fetch(self,
          fromEntity,
          elements,
          toEntities=[
              GeneIdentity.UNIPROT, GeneIdentity.GENE_SYMBOL,
              GeneIdentity.ORDERED_LOCUS
          ],
          error_on_empty_result=True):
    """Query the ID-mapping service and return the response as a DataFrame.

    :param fromEntity: identity type of the input *elements*
    :param elements: IDs to convert; sorted before building the request
    :param toEntities: identity types to map to (become result columns)
    :param error_on_empty_result: raise StoreException on an empty response body
    :return: DataFrame with columns toEntities + [fromEntity]
    :raises StoreException: when the response text is missing or
                            (optionally) empty

    NOTE(review): the mutable default for toEntities is shared across
    calls; safe only as long as callers never mutate it.
    """
    self._must_accept(fromEntity)
    self._must_provide(toEntities)

    elements = sorted(elements)

    reqParams = self._make_params(fromEntity, elements, toEntities)

    # log the request; large parameter lists are only summarized
    for x in reqParams:
        lenReqParams = len(reqParams[x])
        if lenReqParams < 100:
            print(str(x) + " " + str(reqParams[x]))
        else:
            print(str(x) + " " + str(lenReqParams) + " elements")

    resp = self._request(RequestMethod.POST, "", reqParams)

    if (resp.text == None):
        print(json.dumps(reqParams))
        raise StoreException("Could not retrieve elements")

    if len(resp.text) == 0 and error_on_empty_result:
        raise StoreException("Empty result")

    convData = DataFrame()
    dfCols = toEntities + [fromEntity]
    convData.addColumns(dfCols)

    def addLineToReturn(lineData):
        # append one parsed line as a row; empty fields become None
        modLine = {}
        for c, x in zip(dfCols, lineData):
            if x == '':
                modLine[c] = None
            else:
                modLine[c] = x
        convData.addRow(DataRow.fromDict(modLine))

    # the response is TSV with a header line, which is skipped
    bFirstLine = True
    for line in resp.text.split('\n'):
        if bFirstLine:
            bFirstLine = False
            continue

        if len(line) == 0:
            continue

        aline = line.split('\t')
        if len(aline) == 0:
            continue

        # drop the trailing field of each line
        aline = aline[:-1]

        if ',' in aline[-1]:
            # the last column holds several comma-separated results:
            # emit one output row per result, also splitting any other
            # space-separated field that has the same multiplicity
            elems = aline[-1].split(',')
            elemCount = len(elems)

            for i in range(0, elemCount):
                modLine = []
                for elem in aline[:-1]:
                    aelem = elem.split(' ')
                    if len(aelem) != elemCount:
                        modLine.append(elem)
                    else:
                        modLine.append(aelem[i])
                modLine.append(elems[i])
                addLineToReturn(modLine)
        else:
            addLineToReturn(aline)

    return convData
# NOTE(review): fragment — the statements above the closing triple-quote
# appear to be part of a block string (commented-out code) opened above
# this view; the code after it mirrors the live version elsewhere.
homDB.homologies[combid] = elems

homDB.finalize()
homDB.save_to_file(fileLocation + "combed")
"""

# load every organism's genome so sequences can be fetched below
for orgname in homDB.get_all_organisms():
    genomDB.loadGenome(orgname)

allorgs = list(homDB.get_all_organisms())

# mc: the manually curated organisms; nmc: all remaining ones
mc = ['4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1',
      '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1']
nmc = [x for x in allorgs if not x in mc]  # and not x.startswith("15")

allData = DataFrame()
allData.addColumns(allorgs)

homClusterIDs = []
# find the longest member sequence of each homology cluster
# NOTE(review): loop body continues beyond this view
for homid in homDB.homologies:
    val = homDB.get_homology_cluster(homid)

    maxlength = 0
    for org in val:
        geneid = val[org]
        seq = genomDB.get_sequence(org, geneid)
#parser.add_argument('-c', '--cutoff', type=float, help='alignment files', default=0.05) parser.add_argument('-minfc', '--min-foldchange', type=float, default=1.0, required=False) parser.add_argument('-minpval', '--min-pvalue', type=float, default=0.05, required=False) args = parser.parse_args() for fidx, defile in enumerate(args.de): indf = DataFrame.parseFromFile(defile.name) availMethods = set() headername2idx = {} indfHeader = indf.getHeader() genesymname = None if "gene_symbol" in indfHeader: genesymname = "gene_symbol" elif "Geneid" in indfHeader: genesymname = "Geneid" else: genesymname = "id"
from urllib import request
from porestat.utils.DataFrame import DataFrame

# Download the GenBank flat file for every ENA entry that has protein
# information, into ../../../genomes/<seqID>.gb .
allgenomes = DataFrame.parseFromFile("../../../ena_bacteria_list.csv")
print(allgenomes.getHeader())

# debug pass: show the proteins column for every row
for row in allgenomes:
    protInfo = row['proteins']
    print(protInfo)

downloadLocation = "../../../genomes/"  # hoisted: loop-invariant

for row in allgenomes:
    protInfo = row['proteins']
    # skip rows without usable protein info; fixed: was `== None` (PEP 8
    # E711), and the redundant len()/len(strip()) double-check is collapsed
    if protInfo is None or not protInfo.strip() or protInfo in ('n/a', 'None'):
        continue

    downloadFile = row['seqID'] + ".gb"
    print(downloadFile)

    # NOTE(review): the URL has no '?' before the query string
    # ("...view/<id>&display=txt...") — confirm the ENA endpoint accepts this
    request.urlretrieve(
        "http://www.ebi.ac.uk/ena/data/view/" + row['seqID'] +
        "&display=txt&expanded=true", downloadLocation + "/" + downloadFile)
from collections import defaultdict
from porestat.utils.DataFrame import DataFrame
from database.MIRFamily import MIRFamilyDB
from neo4j.v1 import GraphDatabase, basic_auth
from database.Neo4JInterface import neo4jInterface
from database.ORGMIRs import ORGMIRDB
from synonymes.mirnaID import miRNA, miRNAPART
from utils.idutils import dataDir

# Rebuild miRBase-derived miRNA nodes/relations in Neo4j: load the raw
# mirbase table and the family definitions, then clear previous relations.
mirbase = DataFrame.parseFromFile(dataDir + "/miRExplore/mirnas_mirbase.csv",
                                  bConvertTextToNumber=False)

filename = dataDir + "/miRExplore/miFam.dat"
familyDB = MIRFamilyDB(filename)

print(mirbase.getHeader())

db = neo4jInterface(simulate=False, printQueries=False)

# drop relations from any earlier import before re-creating them
db.deleteRelationship('n', None, None, 'm', None, None, ['IS_ORG_MI'], None,
                      'r')
db.deleteRelationship('n', None, None, 'm', None, None, ['IS_ORG_MIR'], None,
                      'r')
db.deleteRelationship('n', None, None, 'm', None, None, ['IS_ORG_MIRNA'],
                      None, 'r')
db.deleteRelationship('n', ['MIRNA'], None, 'm', ['MIRNA_PRE'], None,
                      ['MIRNA_MATURE_OF'], None, 'r')
# NOTE(review): truncated mid-call — remaining arguments follow beyond this view
db.deleteRelationship('n', ['MIRNA'], None, 'm', ['MIRNA_FAMILY'], None,
# NOTE(review): fragment — part of an argparse-based script merging DE tables.
parser.add_argument("-p", "--prefixes", nargs='+', type=str, required=True)
parser.add_argument('-s2', '--samples', nargs='+', type=str, default=[])
parser.add_argument('-pc',
                    '--prefix-counts',
                    dest="prefix_counts",
                    action='store_true',
                    default=False,
                    help="run FC part")

args = parser.parse_args()

# the first DE table serves as the base frame the others are merged into
curDF = DataFrame.parseFromFile(args.de[0].name,
                                skipChar='#',
                                replacements={
                                    "None": None,
                                    "": None,
                                    "NA": None
                                })

for didx, deTable in enumerate(args.de):
    if didx == 0:
        # already loaded as curDF
        continue

    # NOTE(review): truncated mid-call — the replacements dict is not closed here
    indf2 = DataFrame.parseFromFile(deTable.name,
                                    skipChar='#',
                                    replacements={
                                        "None": None,
                                        "": None,
                                        "NA": None
# NOTE(review): fragment — part of an argparse-based script over count tables.
parser.add_argument('-p', '--pval', type=float, default=0.05)
parser.add_argument('-t', '--tools', nargs='+')
parser.add_argument('-o',
                    '--output',
                    type=argparse.FileType("w"),
                    required=True)
#parser.add_argument('-g', '--gene', type=str, required=True, help="gene id column name")

args = parser.parse_args()

for fidx, defile in enumerate(args.counts):
    indf = DataFrame.parseFromFile(defile.name,
                                   skipChar='#',
                                   replacements = {
                                       "None": None,
                                       "": None,
                                       "NA": None
                                   })

    inHeaders = indf.getHeader()

    # gene-column validation kept disabled together with the -g option above
    #if not args.gene in inHeaders:
    #    print("Unknown gene id column", args.gene)
    #    print(inHeaders)
    #    exit(-1)

    allconditions = []
    # NOTE(review): truncated — the loop body continues beyond this view
    for conditions in args.conditions:
        for condition in conditions:
# NOTE(review): fragment — the first lines close a parser.add_argument(...)
# call started above this view.
                    action='store_true',
                    default=False)

args = parser.parse_args()

enhancedData = loadEnhancement(args.enhanced)
geneLengths = loadGeneLengths(args.lengths)

# rRNA removal needs the enhancement annotation to identify rRNA genes.
# NOTE(review): ArgumentParser().error() prints usage of a fresh parser and
# exits by itself — the surrounding `raise` never sees a value; confirm intent.
if args.norrna and enhancedData == None:
    raise argparse.ArgumentParser().error(
        "removal of rRNA requires --enhanced!")

indf = DataFrame.parseFromFile(args.fc.name,
                               skipChar='#',
                               replacements={
                                   "None": None,
                                   "": None,
                                   "NA": None
                               })

allheaders = indf.getHeader()

# split featureCounts' fixed annotation columns from the sample columns
featureCountsColumns = [
    y for y in ["Geneid", "Chr", "Start", "End", "Strand", "Length"]
    if y in allheaders
]
sampleHeaders = [x for x in allheaders if not x in featureCountsColumns]

outdf = DataFrame()
sample2total = Counter()
from collections import Counter, defaultdict
from porestat.utils.DataFrame import DataFrame
from utils.idutils import ltype2label, makeDBGeneID, mirtarbase_exp_type, mirtarbase_function_label, speciesName2TaxID, \
    dataDir
from database.Neo4JInterface import neo4jInterface
from utils.parallel import MapReduce

# Re-import miRTarBase evidences into Neo4j: wipe all MIRTARBASE-related
# relations and nodes first, then (below this view) re-create them from the CSV.
mirtarbaseEvidences = DataFrame.parseFromFile(dataDir +
                                              "/miRExplore/miRTarBase.csv",
                                              bConvertTextToNumber=False)
print(mirtarbaseEvidences.getHeader())

experimentTypes = Counter()
supportTypes = Counter()
referencesWithComma = Counter()

db = neo4jInterface(simulate=False, printQueries=False)

# clear any previous import
db.deleteRelationship('n', ['GENE'], None, 'm', ['MIRTARBASE'], None,
                      ['GENE_MENTION'], None, 'r')
db.deleteRelationship('n', ['MIRTARBASE'], None, 'm', ['MIRNA'], None,
                      ['MIRNA_MENTION'], None, 'r')
db.deleteRelationship('n', ['MIRTARBASE'], None, 'm', ['PUBMED'], None,
                      ['MIRTARBASE_LITERATURE_SUPPORT'], None, 'r')
db.deleteRelationship('n', ['MIRTARBASE_SUPPORT'], None, 'm', ['MIRTARBASE'],
                      None, ['MIRTARBASE_FUNCTIONAL_SUPPORT'], None, 'r')
db.deleteRelationship('n', ['MIRTARBASE_EXPERIMENT'], None, 'm',
                      ['MIRTARBASE'], None,
                      ['MIRTARBASE_EXPERIMENT_SUPPORT'], None, 'r')
db.deleteRelationship('n', ['MIRTARBASE'], None, 'm', ['TAX'], None,
                      ['ORGANISM_SUPPORT'], None, 'r')
db.deleteNode(["MIRTARBASE"], None)
db.deleteNode(["MIRTARBASE_SUPPORT"], None)
db.deleteNode(["MIRTARBASE_EXPERIMENT"], None)
db.createUniquenessConstraint('MIRTARBASE', 'id')

# NOTE(review): dead branch — `if False` never runs; looks like a manual toggle
if False:
    db.close()
# NOTE(review): fragment — part of an argparse-based script comparing two DE tables.
parser.add_argument('-s2', '--samples', nargs='+', type=str, default=[])
parser.add_argument('-pc',
                    '--prefix-counts',
                    dest="prefix_counts",
                    action='store_true',
                    default=False,
                    help="run FC part")

args = parser.parse_args()

# load both DE tables with common NA spellings mapped to None
indf1 = DataFrame.parseFromFile(args.de1.name,
                                skipChar='#',
                                replacements={
                                    "None": None,
                                    "": None,
                                    "NA": None
                                })
indf2 = DataFrame.parseFromFile(args.de2.name,
                                skipChar='#',
                                replacements={
                                    "None": None,
                                    "": None,
                                    "NA": None
                                })

allSamples = args.samples
print("all samples", allSamples)
# NOTE(review): fragment — the first lines complete a parser.add_argument(...)
# call (apparently the -o/--output option) started above this view.
                    nargs='+',
                    type=str,
                    required=False,
                    help="output base")
parser.add_argument('-p', '--pathname', action="store_true", default=False)

args = parser.parse_args()

# default the output names to the input count files' own names
if args.output == None:
    args.output = [counts.name for counts in args.counts]

for fidx, defile in enumerate(args.counts):
    indf = DataFrame.parseFromFile(defile.name,
                                   skipChar='#',
                                   replacements={
                                       "None": None,
                                       "": None,
                                       "NA": None
                                   })

    inHeaders = indf.getHeader()

    # gene-column validation kept disabled
    #if not args.gene in inHeaders:
    #    print("Unknown gene id column", args.gene)
    #    print(inHeaders)
    #    exit(-1)

    # NOTE(review): truncated — the loop body continues beyond this view
    for conditions in args.conditions:
        for condition in conditions:
# NOTE(review): fragment — begins mid-script; the trailing `if` is truncated.
# load every organism's genome so sequences can be fetched below
for orgname in homDB.get_all_organisms():
    genomDB.loadGenome(orgname)

allorgs = list(homDB.get_all_organisms())

extra = ['AE001439', 'CP009259']
# mc: manually curated organisms; nmc: everything else except extras and 6_*
mc = [
    '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1', '3_N1-029C1',
    '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
]
nmc = [
    x for x in allorgs
    if not x in mc and not x in extra and not x.startswith("6_")
]  # and not x.startswith("15")

allData = DataFrame()
allData.addColumns(allorgs)

homClusterIDs = []
# find the longest member sequence of each homology cluster
for homid in homDB.homologies:
    val = homDB.get_homology_cluster(homid)

    maxlength = 0
    for org in val:
        geneid = val[org]
        seq = genomDB.get_sequence(org, geneid)

        if len(seq) > maxlength: