def make_page_classifier(self):
    """Build a Bayesian classifier that judges whether a page (Response
    object) looks like a source of papers (e.g. a personal publications
    page) rather than a syllabus, conference site, etc.

    Returns the configured BinaryNaiveBayes instance.
    """

    def many_doc_links(r):
        # Two or more hrefs pointing at pdf/doc/docx files is strong
        # evidence for a papers page.
        links = re.findall(r'href=[^>]+\.(?:pdf|docx?)\b', r.text,
                           re.IGNORECASE)
        return len(links) > 1

    def mentions_syllabus(r):
        # NOTE(review): `textlower` is presumably the lower-cased page
        # text cached on the Response object — confirm against its class.
        return 'syllabus' in r.textlower

    def conference_keywords(r):
        hits = (r.textlower.count('schedule')
                + r.textlower.count('break')
                + r.textlower.count('dinner'))
        return hits > 2

    def author_in_url(r):
        # Author's surname appearing in the URL suggests a personal page.
        surname = r.authorname.split()[-1].lower()
        return surname in r.url.lower()

    classifier = BinaryNaiveBayes(prior_yes=0.6)
    classifier.likelihood(
        "contains at least 2 links to '.pdf' or '.doc'",
        many_doc_links,
        p_ifyes=0.99, p_ifno=0.2)
    classifier.likelihood(
        "contains 'syllabus'",
        mentions_syllabus,
        p_ifyes=0.1, p_ifno=0.2)
    classifier.likelihood(
        "contains conference keywords",
        conference_keywords,
        p_ifyes=0.01, p_ifno=0.2)
    classifier.likelihood(
        "author name in url",
        author_in_url,
        p_ifyes=0.6, p_ifno=0.1)
    return classifier
import re
from statistics import median
from scipy.stats import nbinom

#import sys, os.path
#curpath = os.path.abspath(os.path.dirname(__file__))
#libpath = os.path.join(curpath, os.path.pardir)
#sys.path.insert(0, libpath)
from opp.subjectivebayes import BinaryNaiveBayes
from opp.debug import debug, debuglevel

"""
classifier to evaluate whether a pdf/word document is a paper (or book
etc.), as opposed to a handout, a cv, lecture slides etc.
"""

classifier = BinaryNaiveBayes(prior_yes=0.6)


def bad_url(doc):
    """Match URL fragments typical of teaching material and talks."""
    return re.search(r'\bcours|\blecture|\btalk|handout|teaching',
                     doc.url.lower())

classifier.likelihood('bad url', bad_url, p_ifyes=0.05, p_ifno=0.2)


def bad_anchortext(doc):
    """Match anchor texts that point at navigation or slides/handouts."""
    return re.search(r'^site\s*map$|^home|page\b|\bslides\b|handout',
                     doc.link.anchortext.lower())

classifier.likelihood('bad anchortext', bad_anchortext,
                      p_ifyes=0.005, p_ifno=0.3)


def good_linkcontext(doc):
    """Match link-context words that suggest a draft/forthcoming paper."""
    return re.search(r'penultimate|draft|forthcoming',
                     doc.link.context.lower())

classifier.likelihood('good link context', good_linkcontext,
                      p_ifyes=0.2, p_ifno=0.05)
        # (tail of the in_context factory whose `def` lines precede this
        # chunk — the inner check tests for `string` in the link context)
        return string in doc.link.context.lower()
    return check

def in_beginning(regex):
    # Factory: build a feature test that matches `regex`
    # (case-insensitively) against the start of the document text.
    reg = re.compile(regex, re.I)
    def check(doc):
        if not doc.content:
            # no extracted text: feature is undecidable for this doc
            return Ellipsis
        # only look at the first 5000 characters
        beginning = doc.content[:5000]
        return reg.search(beginning)
    return check

# =========================================================================

# Filter for whole books. `length` is a feature helper defined earlier
# in this file (presumably the document's word count — verify there);
# the negative-binomial likelihoods model plausible word-count
# distributions for books vs. non-books.
bookfilter = BinaryNaiveBayes(prior_yes=0.2)

bookfilter.likelihood('numwords', length,
    p_ifyes=nbinom(7, 0.0001),
    p_ifno=nbinom(1, 0.0001))

# TODO: add more features? "Acknowledgements" section? Occurrences of
# "this book" TOC? Index? ...

# =========================================================================

# Filter for book chapters, analogous word-count model with different
# negative-binomial parameters.
chapterfilter = BinaryNaiveBayes(prior_yes=0.2)

chapterfilter.likelihood('numwords', length,
    p_ifyes=nbinom(2, 0.0002),
    p_ifno=nbinom(3, 0.0002))

# (call continues beyond this chunk)
chapterfilter.likelihood('"chapter" occurs in link context',
    in_context('chapter'),
def test_basic():
    """A single always-true feature on an even prior.

    Posterior = 0.3*0.5 / (0.3*0.5 + 0.1*0.5) = 0.75.
    """
    nb = BinaryNaiveBayes(prior_yes=0.5)
    nb.likelihood('', lambda x: True, p_ifyes=0.3, p_ifno=0.1)
    posterior = nb.test(0)
    assert 0.749 < posterior < 0.751
def test_medical():
    """Classic base-rate case: a rare condition (prior 0.0001) with a
    highly reliable test still gives a posterior under 1%.

    Posterior = 0.99*0.0001 / (0.99*0.0001 + 0.01*0.9999) ~ 0.0098.
    """
    nb = BinaryNaiveBayes(prior_yes=0.0001)
    nb.likelihood('', lambda x: True, p_ifyes=0.99, p_ifno=0.01)
    posterior = nb.test(0)
    assert 0.0097 < posterior < 0.0099