from collections import Counter from synonymes.SynfileMap import SynfileMap from textmining.SyngrepHitFile import SyngrepHitFile from utils.idutils import dataDir, loadExludeWords resultBase = dataDir + "/miRExplore/textmine/results/" indexFoundSyns = Counter() excludedSyns = loadExludeWords() checkResultsFor = 'disease' analyseFiles = 100 maxFiles = 892 checkSynsMap = SynfileMap(resultBase + "/" + checkResultsFor + "/synfile.map") checkSynsMap.loadSynFiles(('/home/users/joppich/ownCloud/data/', dataDir)) for splitFileID in range(maxFiles, maxFiles - analyseFiles - 1, -1): fileID = "{:>4}".format(splitFileID).replace(" ", "0") print(fileID) indexFile = resultBase + "/" + checkResultsFor + "/medline17n" + fileID + ".index" foundHits = SyngrepHitFile(indexFile, checkSynsMap) for doc in foundHits: docHits = foundHits.getHitsForDocument(doc) for hit in docHits:
# NOTE(review): this fragment was reconstructed from whitespace-collapsed source. The statements
# up to ``vAllSyns.append(newSyn)`` presumably belong to a per-term loop whose header is not
# visible here -- confirm their indentation against the surrounding file.

# Copy every synonym declared on the ontology term onto the new entry.
if oboSyns is not None:
    for x in oboSyns:
        newSyn.addSyn(x.syn)

allOrgs = list(ncitTerm2Sym.org_term2symbol)

# Everything after the first ':' of the term ID is the key used in the per-organism tables.
# It does not depend on the organism, so it is computed once instead of once per organism.
ncitID = oboID[oboID.index(":") + 1:]

for org in allOrgs:
    orgTerm2Sym = ncitTerm2Sym.org_term2symbol[org]

    # Add every symbol this organism's table lists for the term.
    if ncitID in orgTerm2Sym:
        orgSyms = orgTerm2Sym[ncitID]

        for sym in orgSyms:
            newSyn.addSyn(sym)

vAllSyns.append(newSyn)

globalKeywordExcludes = loadExludeWords()

# No exclude list is handed over here (second argument is None); the loaded
# globalKeywordExcludes is currently unused by this call.
vPrintSyns = handleCommonExcludeWords(vAllSyns, None, mostCommonCount=100, maxCommonCount=5) #globalKeywordExcludes
#printToFile(vPrintSyns, dataDir + "/miRExplore/textmine/synonyms/ncit.syn")
printToFile(vPrintSyns, "/mnt/d/dev/data/pmid_jun2020/synonyms/ncit.syn", codec='utf8')
# NOTE(review): this fragment was reconstructed from whitespace-collapsed source. The statements
# up to ``vAllSyns.append(newSyn)`` presumably belong to a per-term loop whose header is not
# visible here -- confirm their indentation against the surrounding file.

oboRels = oboNode.is_a

# Start a synonym entry for the term, seeded with its primary name.
newSyn = Synonym(oboID)
newSyn.addSyn(oboName)

# Splitting on a single blank means consecutive blanks produce empty tokens.
aName = oboName.split(' ')

# Names of two to four tokens whose last token is "cell" (any case) additionally get an
# acronym made of the upper-cased first character of each token.
if 1 < len(aName) < 5:
    acro = ""

    if aName[-1].upper() == 'CELL':
        # Empty tokens are skipped: taking [0] of an empty string would raise IndexError.
        acro = "".join(x[0].upper() for x in aName if x)
        newSyn.addSyn(acro)

# Copy every synonym declared on the ontology term onto the new entry.
if oboSyns is not None:
    for x in oboSyns:
        newSyn.addSyn(x.syn)

#print(str(taxID) + " " + str(newSyn))
vAllSyns.append(newSyn)

globalKeywordExcludes = loadExludeWords(cell_co=False)

vPrintSyns = handleCommonExcludeWords(vAllSyns, globalKeywordExcludes, mostCommonCount=200, maxCommonCount=5)
printToFile(vPrintSyns, "/mnt/d/dev/data/pmid_jun2020/synonyms/model_anatomy.syn")
removeSyns = [] for synword in syn.syns: if len(synword) == 1: removeSyns.append(synword) if len(removeSyns) > 0: print(syn.id, removeSyns) syn.removeSyn(removeSyns) #exWords = loadExludeWords(common=True, generic=True, disease=False, taxnames=False, cell_co=False) exWords = loadExludeWords(cell_co=False, common=False, generic=True, syngrep=False) vPrintSyns = handleCommonExcludeWords( vAllSyns, exWords, mostCommonCount=500, maxCommonCount=7, addAlphaBeta=True, addHyphenGene=True, removeSyn=lambda synonym: synonym.id.startswith( 'MIR') and not synonym.id.endswith('HG')) printToFile(vPrintSyns, "/mnt/d/dev/data/pmid_jun2020/synonyms/mgi.syn", codec="utf8") """
filepath = sys.argv[1] fileObo = GeneOntology(filepath) namespace2syn = defaultdict(set) allowedTaxIDs = set([str(speciesName2TaxID[x]) for x in speciesName2TaxID]) allNodes = [] for cellID in fileObo.dTerms: oboNode = fileObo.dTerms[cellID] allNodes.append(oboNode) globalKeywordExcludes = loadExludeWords(common=False, cell_co=False, disease=False, generic=False) for x in globalKeywordExcludes: if 'membrane' in globalKeywordExcludes[x]: print("Membrane: " + x) synSet = set() for node in allNodes: newSyn = Synonym(node.id) newSyn.addSyn(node.name) if node.synonym != None: for x in node.synonym: if x == None: