import os

import EventScorer
# TFIDF, PriorityQueue, Crawler, downloadRawDocs, getSeedURLs, getTokenizedDocs
# and eventFC are assumed to be provided elsewhere in the project.


def testBaseFC(seedUrls, pLimit):
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def testBaseFC(seedUrls, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # Write the seedUrls to a file so downloadRawDocs/getSeedURLs can read them
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()
    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    '''
    f = open("base-logData.txt","w")
    furl = open("base-Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8") + "," + str(p.estimatedScore) + "\n")
        ftext = open("base-webpages/" + str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    bres = evaluator.evaluateFC(crawler.relevantPages)
    writeEvaluation(bres, "base-evaluateData.txt")
    print sum(bres)
    print len(bres)
    '''
    return crawler.relevantPages
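# A minimal sketch of the crawlParams dict that baseFC expects, inferred from the
# keys read above ('seedURLs', 'model', 'No_Keywords') and from the options built by
# the test functions below. Which of these keys Crawler itself consumes, and the
# value of 'No_Keywords', are assumptions for illustration only.
#
# crawlParams = {
#     "seedURLs": getSeedURLs("addurls.txt"),   # list of seed URL strings
#     "model": getTokenizedDocs(docs),          # tokenized docs used to build the TF-IDF model
#     "No_Keywords": 20,                        # hypothetical number of keywords for buildModel
#     "num_pages": 1000,                        # crawl page limit
#     "pageScoreThreshold": 0.5,
#     "urlScoreThreshold": 0.4,
# }
# relevantPages = baseFC(crawlParams)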
def testEventFC(seedUrls, pLimit, eventTree):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "w")
    f.write(seedUrls)
    f.close()
    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = open(eventFile, "w")
    fw.write(eventTree)
    fw.close()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def baseFC_OneTargetVector(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()
    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def testEventFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def testEventFC(seedUrls, pLimit, eventTree):
    #print 'GIVEN TREE:'
    #print eventTree
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)
    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = os.open(eventFile, os.O_CREAT | os.O_RDWR)
    os.write(fw, eventTree.lower())
    os.close(fw)
    mytfidf = TFIDF()  # appears to work fine (called then exited)
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    cleandocs = getTokenizedDocs(docs)
    #print 'cleandocs'
    #print cleandocs
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
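# Hedged usage sketch for testEventFC: seedUrls is treated as the raw text written to
# addurls.txt (one URL per line) and eventTree as the serialized event description
# written to event-details.txt. The concrete values and the eventTree format below are
# hypothetical; only the call signature comes from the function above.
#
# seedUrls = "http://example.com/storm-report-1\nhttp://example.com/storm-report-2\n"
# eventTree = "typhoon; location: philippines; date: november 2013"
# testEventFC(seedUrls, 100, eventTree)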
def test():
    mytfidf = TFIDF()
    docs = downloadRawDocs("typhoon_haiyan_SEED_URLs.txt")
    seedURLs = getSeedURLs("typhoon_haiyan_SEED_URLs.txt")
    pagesLimit = 1000
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageScoreThreshold,
               "urlScoreThreshold": urlScoreThreshold,
               "seeds": seedURLs}
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    baseFC(mytfidf, options)
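# A minimal entry-point sketch (an assumption, not part of the original module):
# running the file directly exercises the Typhoon Haiyan baseline crawl in test(),
# which expects typhoon_haiyan_SEED_URLs.txt to be present in the working directory.
if __name__ == '__main__':
    test()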