def calculateRelevance(searchresults, recommendation, majorterms):
    """Score each search result against the recommendation text and MeSH terms.

    For every result this adds word-match scores to
    ``searchresults[res]["grade"]`` (IDF-weighted when ``useIDF`` is on,
    otherwise fixed title/abstract weights), records matched words in
    ``["keywords"]``, and stores an inverted average MeSH tree distance in
    ``["meshdistance"]``.

    Args:
        searchresults: dict keyed by result id; each value must provide
            "grade", "keywords", "meshterms", "meshdistance" (and usually
            "title"/"abstract") entries.
        recommendation: iterable of cleaned words from the recommendation.
        majorterms: MeSH terms extracted from the recommendation.

    Returns:
        The same ``searchresults`` dict, mutated in place.
    """
    # Load the precomputed inverse-document-frequency table from disk.
    with open("idflib.json", "r") as idflib:
        idf = json.loads(idflib.read())
    useIDF = True
    for res in searchresults.keys():
        try:
            titletokens = parserecommendation.cleanString(
                searchresults[res]["title"])
            abstracttokens = parserecommendation.cleanString(
                searchresults[res]["abstract"])
        except KeyError:
            # Result lacks a title or abstract: nothing to match words against.
            titletokens = []
            abstracttokens = []
        # BUG FIX: the original called titletokens.append(abstracttokens)
        # inside the word loop, which nests the abstract token *list* as one
        # element (a word string can never equal a list, so abstract words
        # never matched) and grows titletokens on every iteration. Combine
        # the two token lists once, up front.
        alltokens = titletokens + abstracttokens
        for word in recommendation:
            if useIDF:
                if (word in alltokens) and (word in idf):
                    searchresults[res]["grade"] += idf[word] / len(
                        recommendation)
                    searchresults[res]["keywords"].append(word)
            else:
                # compare words from recommendation to title
                if word in titletokens:
                    searchresults[res]["grade"] += (
                        10. / len(titletokens)) * titletokens.count(word)
                    searchresults[res]["keywords"].append(word)
                # compare words from recommendation to abstract
                if word in abstracttokens:
                    searchresults[res]["grade"] += (
                        8. / len(abstracttokens)) * abstracttokens.count(word)
                    searchresults[res]["keywords"].append(word)
        # Calculate MeSH distance: for each result term, average its distance
        # to the recommendation's major terms (same-subtree terms only).
        for term2 in searchresults[res]["meshterms"]:
            d = 0.
            termcount = 0
            for term1 in majorterms:
                termdist = mb.getDistance(term1, term2)
                # getDistance returns -1 when the terms share no subtree
                if termdist > -1:
                    d += termdist
                    termcount += 1
            if termcount > 0:
                searchresults[res]["meshdistance"] += d / termcount
        if len(searchresults[res]["meshterms"]) > 0:
            searchresults[res]["meshdistance"] /= len(
                searchresults[res]["meshterms"])
        # Invert so that *smaller* tree distances yield *larger* scores.
        if searchresults[res]["meshdistance"] > 0:
            searchresults[res][
                "meshdistance"] = 1 / searchresults[res]["meshdistance"]
    return searchresults
def calculateRelevance(searchresults, recommendation, majorterms):
    """Score each search result against the recommendation text and MeSH terms.

    Adds word-match scores to ``searchresults[res]["grade"]`` (IDF-weighted
    when ``useIDF`` is on, otherwise fixed title/abstract weights), records
    matched words in ``["keywords"]``, and stores an inverted average MeSH
    tree distance in ``["meshdistance"]``.

    Args:
        searchresults: dict keyed by result id; each value must provide
            "grade", "keywords", "meshterms", "meshdistance" (and usually
            "title"/"abstract") entries.
        recommendation: iterable of cleaned words from the recommendation.
        majorterms: MeSH terms extracted from the recommendation.

    Returns:
        The same ``searchresults`` dict, mutated in place.
    """
    # Load the precomputed inverse-document-frequency table from disk.
    with open("idflib.json", "r") as idflib:
        idf = json.loads(idflib.read())
    useIDF = True
    for res in searchresults.keys():
        try:
            titletokens = parserecommendation.cleanString(searchresults[res]["title"])
            abstracttokens = parserecommendation.cleanString(searchresults[res]["abstract"])
        except KeyError:
            # Result lacks a title or abstract: nothing to match words against.
            titletokens = []
            abstracttokens = []
        # BUG FIX: the original called titletokens.append(abstracttokens)
        # inside the word loop, which nests the abstract token *list* as one
        # element (a word string can never equal a list, so abstract words
        # never matched) and grows titletokens on every iteration. Combine
        # the two token lists once, up front.
        alltokens = titletokens + abstracttokens
        for word in recommendation:
            if useIDF:
                if (word in alltokens) and (word in idf):
                    searchresults[res]["grade"] += idf[word] / len(recommendation)
                    searchresults[res]["keywords"].append(word)
            else:
                # compare words from recommendation to title
                if word in titletokens:
                    searchresults[res]["grade"] += (10. / len(titletokens)) * titletokens.count(word)
                    searchresults[res]["keywords"].append(word)
                # compare words from recommendation to abstract
                if word in abstracttokens:
                    searchresults[res]["grade"] += (8. / len(abstracttokens)) * abstracttokens.count(word)
                    searchresults[res]["keywords"].append(word)
        # Calculate MeSH distance: for each result term, average its distance
        # to the recommendation's major terms (same-subtree terms only).
        for term2 in searchresults[res]["meshterms"]:
            d = 0.
            termcount = 0
            for term1 in majorterms:
                termdist = mb.getDistance(term1, term2)
                # getDistance returns -1 when the terms share no subtree
                if termdist > -1:
                    d += termdist
                    termcount += 1
            if termcount > 0:
                searchresults[res]["meshdistance"] += d / termcount
        if len(searchresults[res]["meshterms"]) > 0:
            searchresults[res]["meshdistance"] /= len(searchresults[res]["meshterms"])
        # Invert so that *smaller* tree distances yield *larger* scores.
        if searchresults[res]["meshdistance"] > 0:
            searchresults[res]["meshdistance"] = 1 / searchresults[res]["meshdistance"]
    return searchresults
def builddflib(filelist): doccount = {} idf = {} errcount = 0 nr_of_documents = 0 for f in filelist: if not os.path.isfile(f): id = f[12:-4] print "Getting articledata for PMID " + id resp, content = h.request(baseURL + "efetch.fcgi?db=pubmed&retmode=xml&id=" + id + creds) out = open(f, "w") out.write(content) out.close() doc = open(f, "r") articledata = xmltodict.parse(doc) try: base = articledata["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"]["Article"]["Abstract"] abstract = parseabstract(base) if abstract == "": errcount += 1 else: nr_of_documents += 1 words = cleanString(abstract) # word counts in current file for word in set(words): doccount[word] = doccount.setdefault(word, 0) + 1 except KeyError: errcount += 1 doc.close() # calculate idf idf = {} for word in doccount.keys(): idf[word] = math.log(nr_of_documents/doccount[word]) print "# processed succesfully: " + str(nr_of_documents) print "# errors: " + str(errcount) return idf
out.write("<h1>Results</h1>")
# FIX: heading read "Rsecommendation" (typo); the sibling copy of this
# script confirms "recommendation" was intended.
out.write("<h3>Recommendation</h3>")
out.write(recommendation)
out.write("<h3>Input papers</h3>")
# One link per input PMID back to its PubMed page.
for id in evidence:
    out.write("<a href=\"" + pubURL + id + "\">" + id + "</a><br />")

related = []

#-------------------------------------------------#
# PART 1: processing text from the recommendation #
#-------------------------------------------------#

# clean up the recommendation string
recommendation = parserecommendation.cleanString(recommendation)
# send to PubMed to see if terms are recognized
recommendationterms = parserecommendation.getMeSHTerms(recommendation)
# remove duplicate terms
recommendationterms = list(set(recommendationterms))

# create list of 'major terms' for evaluation
majorterms = []
# extract MeSH terms: each term is an OR-joined query string; keep only the
# quoted MeSH heading from tokens that mention MeSH.
for term in recommendationterms:
    for token in term.split(" OR "):
        if "MeSH" in token:
            majorterms.append(token.split("\"")[1])
# Emit the results header and one PubMed link per input paper.
out.write("<h1>Results</h1>")
out.write("<h3>recommendation</h3>")
out.write(recommendation)
out.write("<h3>Input papers</h3>")
for id in evidence:
    out.write("<a href=\"" + pubURL + id + "\">" + id + "</a><br />")

related = []

#-------------------------------------------------#
# PART 1: processing text from the recommendation #
#-------------------------------------------------#

# Normalize the free-text recommendation, ask PubMed which terms it
# recognizes, and drop duplicates.
recommendation = parserecommendation.cleanString(recommendation)
recommendationterms = parserecommendation.getMeSHTerms(recommendation)
recommendationterms = list(set(recommendationterms))

# Collect the 'major terms' used later for evaluation: from every OR-joined
# query string, keep the quoted MeSH heading of each token mentioning MeSH.
majorterms = []
for term in recommendationterms:
    meshtokens = [tok for tok in term.split(" OR ") if "MeSH" in tok]
    for token in meshtokens:
        majorterms.append(token.split("\"")[1])
<body>""") out.write("<h1>Results</h1>") out.write("<h3>Rsecommendation</h3>") out.write(recommendation) out.write("<h3>Input papers</h3>") for id in evidence: out.write("<a href=\"" + pubURL + id + "\">" + id + "</a><br />") related = [] #-------------------------------------------------# # PART 1: processing text from the recommendation # #-------------------------------------------------# # clean up the recommendation string recommendation = parserecommendation.cleanString(recommendation) # send to PubMed to see if terms are recognized recommendationterms = parserecommendation.getMeSHTerms(recommendation) # remove duplicate terms recommendationterms = list(set(recommendationterms)) # create list of 'major terms' for evaluation majorterms = [] # extract MeSH terms for term in recommendationterms: for token in term.split(" OR "): if "MeSH" in token: majorterms.append(token.split("\"")[1])