Exemplos de cleanString em Python, exemplos de parserecommendation.cleanString em Python

Exemplo n.º 1

0

Exibir arquivo

def calculateRelevance(searchresults, recommendation, majorterms):
    """Grade every search result against a recommendation, in place.

    For each entry of ``searchresults`` the function adds to its ``"grade"``,
    appends matched words to its ``"keywords"`` and updates its
    ``"meshdistance"``; the same (mutated) mapping is returned.

    Args:
        searchresults: mapping of result id -> dict. Each dict is read for
            ``"title"``, ``"abstract"`` and ``"meshterms"`` and must already
            contain ``"grade"``, ``"keywords"`` and ``"meshdistance"``.
        recommendation: sequence of words to look for (presumably the output
            of ``parserecommendation.cleanString`` -- confirm with caller).
        majorterms: MeSH terms compared against each result's ``"meshterms"``
            through ``mb.getDistance``.

    Returns:
        ``searchresults``, updated in place.
    """
    with open("idflib.json", "r") as idflib:
        idf = json.load(idflib)
    useIDF = True

    for res in searchresults:
        result = searchresults[res]
        try:
            titletokens = parserecommendation.cleanString(result["title"])
            abstracttokens = parserecommendation.cleanString(
                result["abstract"])
        except KeyError:
            # A result lacking title or abstract contributes no text matches.
            titletokens = []
            abstracttokens = []

        if useIDF:
            # Pool title and abstract tokens once per result. The previous
            # code appended the abstract list as a single nested element on
            # every word, so abstract tokens were never actually matched.
            alltokens = set(titletokens)
            alltokens.update(abstracttokens)
            for word in recommendation:
                if word in alltokens and word in idf:
                    result["grade"] += idf[word] / len(recommendation)
                    result["keywords"].append(word)
        else:
            for word in recommendation:
                # compare words from recommendation to title
                if word in titletokens:
                    result["grade"] += (
                        10. / len(titletokens)) * titletokens.count(word)
                    result["keywords"].append(word)
                # compare words from recommendation to abstract
                if word in abstracttokens:
                    result["grade"] += (
                        8. / len(abstracttokens)) * abstracttokens.count(word)
                    result["keywords"].append(word)

        # calculate MeSH distance -- done per result; previously this section
        # sat outside the loop, so it only touched the last result and raised
        # NameError when searchresults was empty.
        meshterms = result["meshterms"]
        for term2 in meshterms:
            d = 0.
            termcount = 0
            for term1 in majorterms:
                termdist = mb.getDistance(term1, term2)
                # only distances greater than -1 are counted
                # (original note: "check if term is from the same subtree")
                if termdist > -1:
                    d += termdist
                    termcount += 1
            if termcount > 0:
                result["meshdistance"] += d / termcount
        if len(meshterms) > 0:
            # average over the result's MeSH terms, then invert so that a
            # smaller average distance yields a larger value
            result["meshdistance"] /= len(meshterms)
            if result["meshdistance"] > 0:
                result["meshdistance"] = 1 / result["meshdistance"]

    return searchresults

Exemplo n.º 2

0

Exibir arquivo

Arquivo: rank.py Projeto: angelzou/guidelineupdate

def calculateRelevance(searchresults, recommendation, majorterms):
    """Score each search result for relevance to a recommendation.

    The per-result dicts inside ``searchresults`` are mutated: ``"grade"`` is
    increased for matching words, matched words are appended to
    ``"keywords"``, and ``"meshdistance"`` is updated from the result's
    ``"meshterms"``.

    Args:
        searchresults: dict keyed by result id; every value is a dict that is
            read for ``"title"``, ``"abstract"``, ``"meshterms"`` and must
            already hold ``"grade"``, ``"keywords"`` and ``"meshdistance"``.
        recommendation: sequence of words searched for in each result
            (assumed to be already-cleaned tokens -- TODO confirm).
        majorterms: MeSH terms passed as first argument to
            ``mb.getDistance`` against each result MeSH term.

    Returns:
        The same ``searchresults`` object after mutation.
    """
    with open("idflib.json", "r") as idflib:
        idf = json.load(idflib)
    useIDF = True

    for res in searchresults:
        entry = searchresults[res]
        try:
            titletokens = parserecommendation.cleanString(entry["title"])
            abstracttokens = parserecommendation.cleanString(entry["abstract"])
        except KeyError:
            # missing title/abstract: treat the result as having no text
            titletokens = []
            abstracttokens = []

        if useIDF:
            # Merge title and abstract tokens a single time. The old
            # ``titletokens.append(abstracttokens)`` ran once per word and
            # nested the abstract list as one element, so abstract words
            # could never match.
            combined = set(titletokens)
            combined.update(abstracttokens)
            for word in recommendation:
                if word in combined and word in idf:
                    entry["grade"] += idf[word] / len(recommendation)
                    entry["keywords"].append(word)
        else:
            for word in recommendation:
                # compare words from recommendation to title
                if word in titletokens:
                    entry["grade"] += (10. / len(titletokens)) * titletokens.count(word)
                    entry["keywords"].append(word)
                # compare words from recommendation to abstract
                if word in abstracttokens:
                    entry["grade"] += (8. / len(abstracttokens)) * abstracttokens.count(word)
                    entry["keywords"].append(word)

        # calculate MeSH distance for THIS result. It used to be dedented out
        # of the loop, which processed only the last result and failed with
        # NameError on an empty ``searchresults``.
        meshterms = entry["meshterms"]
        for term2 in meshterms:
            d = 0.
            termcount = 0
            for term1 in majorterms:
                termdist = mb.getDistance(term1, term2)
                # distances of -1 or below are skipped
                # (original note: "check if term is from the same subtree")
                if termdist > -1:
                    d += termdist
                    termcount += 1
            if termcount > 0:
                entry["meshdistance"] += d / termcount
        if len(meshterms) > 0:
            # mean over the MeSH terms, inverted when positive
            entry["meshdistance"] /= len(meshterms)
            if entry["meshdistance"] > 0:
                entry["meshdistance"] = 1 / entry["meshdistance"]

    return searchresults

Exemplo n.º 3

0

Exibir arquivo

def builddflib(filelist):
    """Build an inverse-document-frequency table from article XML files.

    For every path in ``filelist`` the file is downloaded first if it does
    not exist yet, then parsed with ``xmltodict``. The abstract is extracted
    with ``parseabstract`` and tokenised with ``cleanString``; each distinct
    word is counted once per document.

    Args:
        filelist: iterable of file paths. For missing files the PMID is taken
            from ``f[12:-4]`` (i.e. the path is assumed to have a 12-character
            prefix and a 4-character extension -- TODO confirm with caller).

    Returns:
        dict mapping word -> ``log(nr_of_documents / document_count)``.
    """
    doccount = {}
    errcount = 0
    nr_of_documents = 0

    for f in filelist:
        if not os.path.isfile(f):
            pmid = f[12:-4]
            print("Getting articledata for PMID " + pmid)
            resp, content = h.request(
                baseURL + "efetch.fcgi?db=pubmed&retmode=xml&id=" + pmid + creds)
            # ``with`` guarantees the handle is closed even if write() fails
            with open(f, "w") as out:
                out.write(content)

        # The file is only needed while parsing; close it right afterwards
        # (and also when parsing raises).
        with open(f, "r") as doc:
            articledata = xmltodict.parse(doc)

        try:
            base = articledata["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"]["Article"]["Abstract"]
            abstract = parseabstract(base)
            if abstract == "":
                errcount += 1
            else:
                nr_of_documents += 1
                words = cleanString(abstract)
                # count each word at most once per document
                for word in set(words):
                    doccount[word] = doccount.get(word, 0) + 1
        except KeyError:
            # article without the expected abstract structure
            errcount += 1

    # calculate idf; force true division so that the ratio is not truncated
    # to an integer under Python 2 before taking the logarithm
    idf = {}
    for word in doccount:
        idf[word] = math.log(float(nr_of_documents) / doccount[word])

    print("# processed succesfully: " + str(nr_of_documents))
    print("# errors: " + str(errcount))

    return idf

Exemplo n.º 4

0

Exibir arquivo

Arquivo: run.py Projeto: roelofreinders/guidelineupdate

# Write the report header: title, the recommendation text as given, and one
# link per input paper id (pubURL + id). The heading previously read
# "Rsecommendation" (typo).
out.write("<h1>Results</h1>")
out.write("<h3>Recommendation</h3>")
out.write(recommendation)
out.write("<h3>Input papers</h3>")
for id in evidence:
    out.write('<a href="' + pubURL + id + '">' + id + "</a><br />")


related = []

#-------------------------------------------------#
# PART 1: processing text from the recommendation #
#-------------------------------------------------#

# clean up the recommendation string (from here on ``recommendation`` holds
# the cleaned value returned by cleanString, not the raw text)
recommendation = parserecommendation.cleanString(recommendation)

# send to PubMed to see if terms are recognized
recommendationterms = parserecommendation.getMeSHTerms(recommendation)

# remove duplicate terms (order is not preserved by the set round-trip)
recommendationterms = list(set(recommendationterms))

# create list of 'major terms' for evaluation
majorterms = []

# extract MeSH terms: each term is split on " OR "; for every piece that
# mentions "MeSH" the text following its first double quote is kept
for term in recommendationterms:
    for token in term.split(" OR "):
        if "MeSH" in token:
            majorterms.append(token.split('"')[1])

Exemplo n.º 5

0

Exibir arquivo

# Report header: title, the recommendation as received, and a link for each
# input paper id.
out.write("<h1>Results</h1>")
out.write("<h3>recommendation</h3>")
out.write(recommendation)
out.write("<h3>Input papers</h3>")
for id in evidence:
    out.write('<a href="' + pubURL + id + '">' + id + "</a><br />")


related = []

#-------------------------------------------------#
# PART 1: processing text from the recommendation #
#-------------------------------------------------#

# Clean the recommendation string; the name is rebound to the cleaned value.
recommendation = parserecommendation.cleanString(recommendation)

# Ask PubMed which terms it recognises.
recommendationterms = parserecommendation.getMeSHTerms(recommendation)

# Drop duplicate terms.
recommendationterms = list(set(recommendationterms))

# 'Major terms' used later for evaluation.
majorterms = []

# Pull the quoted name out of every " OR "-separated piece mentioning "MeSH".
for term in recommendationterms:
    for token in term.split(" OR "):
        if "MeSH" not in token:
            continue
        majorterms.append(token.split('"')[1])

Exemplo n.º 6

0

Exibir arquivo

Arquivo: run.py Projeto: roelofreinders/guidelineupdate

<body>""")
# Report header: page title, the recommendation text, then a link to every
# input paper (pubURL + id). Heading typo "Rsecommendation" corrected.
out.write("<h1>Results</h1>")
out.write("<h3>Recommendation</h3>")
out.write(recommendation)
out.write("<h3>Input papers</h3>")
for id in evidence:
    out.write('<a href="' + pubURL + id + '">' + id + "</a><br />")

related = []

#-------------------------------------------------#
# PART 1: processing text from the recommendation #
#-------------------------------------------------#

# clean up the recommendation string; ``recommendation`` is rebound to the
# value returned by cleanString
recommendation = parserecommendation.cleanString(recommendation)

# send to PubMed to see if terms are recognized
recommendationterms = parserecommendation.getMeSHTerms(recommendation)

# remove duplicate terms (the set round-trip does not keep the order)
recommendationterms = list(set(recommendationterms))

# create list of 'major terms' for evaluation
majorterms = []

# extract MeSH terms: split each term on " OR " and, for the pieces that
# contain "MeSH", keep the text after the first double quote
for term in recommendationterms:
    for token in term.split(" OR "):
        if "MeSH" in token:
            majorterms.append(token.split('"')[1])