def testBaseFC(seedUrls, pLimit):
    """Run the baseline focused crawler on caller-supplied seed URLs.

    The seed text is saved to ``addurls.txt``; that file is passed to
    ``downloadRawDocs`` and ``getSeedURLs``; a TFIDF model is built from the
    tokenized documents; finally ``baseFC`` is called with the model and the
    crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    mytfidf = TFIDF()
    # Not used by the baseline crawl; still constructed in case the
    # EventScorer constructor has side effects -- TODO confirm and drop.
    myEventScorer = EventScorer.EventScorer()

    # Write the seed URLs to a fresh file.  The context manager guarantees
    # the handle is closed even if the write fails.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": 0.4,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }

    mytfidf.buildModel(getTokenizedDocs(docs))
    # NOTE(review): the baseFC definitions visible in this file take a single
    # crawlParams dict -- confirm which baseFC this two-argument call targets.
    baseFC(mytfidf, options)
示例#2
0
def testBaseFC(seedUrls, pLimit):
    """Run the baseline focused crawler on caller-supplied seed URLs (CGI).

    Emits a plain-text Content-Type header, saves the seed text to
    ``addurls.txt``, passes that file to ``downloadRawDocs`` and
    ``getSeedURLs``, builds a TFIDF model from the tokenized documents and
    calls ``baseFC`` with the model and the crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    # Not used by the baseline crawl; still constructed in case the
    # EventScorer constructor has side effects -- TODO confirm and drop.
    myEventScorer = EventScorer.EventScorer()

    # Persist the seed URLs so the file-based helpers below can read them.
    # Without this step ``seedFile`` is never bound in this function and the
    # ``seedUrls`` argument goes unused.
    # NOTE(review): mirrors the other testBaseFC variants in this file;
    # confirm that no module-level ``seedFile`` was intended instead.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": 0.5,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }

    mytfidf.buildModel(getTokenizedDocs(docs))
    # NOTE(review): the baseFC definitions visible in this file take a single
    # crawlParams dict -- confirm which baseFC this two-argument call targets.
    baseFC(mytfidf, options)
def baseFC(crawlParams):
    """Run the baseline crawl described by ``crawlParams``.

    Seeds a PriorityQueue from ``crawlParams['seedURLs']``, builds a TFIDF
    model from ``crawlParams['model']`` and ``crawlParams['No_Keywords']``,
    then constructs a Crawler from the same dict and runs it.

    Args:
        crawlParams: Dict providing ``'seedURLs'``, ``'model'`` and
            ``'No_Keywords'``.  It is modified in place: ``'priorityQueue'``
            and ``'scorer'`` entries are added before the Crawler is built.

    Returns:
        The crawler's ``relevantPages`` attribute after ``crawl()`` returns.
    """
    # One 4-tuple queue entry per seed URL, with the URL in position 1.
    queueEntries = [(-1, url, -1, "") for url in crawlParams['seedURLs']]
    crawlParams["priorityQueue"] = PriorityQueue(queueEntries)

    scorer = TFIDF()
    scorer.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    crawlParams['scorer'] = scorer

    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def testEventFC(seedUrls, pLimit, eventTree):
    """Run the event-model focused crawler on caller-supplied input (CGI).

    Emits a plain-text Content-Type header, saves the seed text to
    ``addurls.txt`` and the event tree to ``event-details.txt``, builds a
    TFIDF model from the tokenized seed documents and calls ``eventFC`` with
    the event scorer, the model and the crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
        eventTree: Text written verbatim to the event-details file.
    """
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seed URLs to a fresh file.  The file has just been removed,
    # so it must be opened in a creating mode ("w"); "rw" is not a valid
    # mode for that.  The context manager closes the handle on any outcome.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    # Write the event tree to a fresh file in the same way.
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, 'w') as eventHandle:
        eventHandle.write(eventTree)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    pageScoreThreshold = 0.5
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }
    mytfidf.buildModel(getTokenizedDocs(docs))
    eventFC(myEventScorer, mytfidf, options)
def testBaseFC(seedUrls, pLimit):
    """Run the baseline focused crawler on caller-supplied seed URLs (CGI).

    Emits a plain-text Content-Type header, saves the seed text to
    ``addurls.txt``, passes that file to ``downloadRawDocs`` and
    ``getSeedURLs``, builds a TFIDF model from the tokenized documents and
    calls ``baseFC`` with the model and the crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    # Not used by the baseline crawl; still constructed in case the
    # EventScorer constructor has side effects -- TODO confirm and drop.
    myEventScorer = EventScorer.EventScorer()

    # Persist the seed URLs so the file-based helpers below can read them.
    # Without this step ``seedFile`` is never bound in this function and the
    # ``seedUrls`` argument goes unused.
    # NOTE(review): mirrors the other testBaseFC variants in this file;
    # confirm that no module-level ``seedFile`` was intended instead.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": 0.5,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }

    mytfidf.buildModel(getTokenizedDocs(docs))
    # NOTE(review): the baseFC definitions visible in this file take a single
    # crawlParams dict -- confirm which baseFC this two-argument call targets.
    baseFC(mytfidf, options)
示例#6
0
def baseFC_OneTargetVector(crawlParams):
    """Crawl from the configured seeds using one TFIDF model as the scorer.

    Args:
        crawlParams: Dict providing ``'seedURLs'``, ``'model'`` and
            ``'No_Keywords'``.  It is modified in place: ``'priorityQueue'``
            and ``'scorer'`` entries are added before the Crawler is built.

    Returns:
        The crawler's ``relevantPages`` attribute after ``crawl()`` returns.
    """
    # Each seed URL becomes a 4-tuple queue entry with the URL in slot 1.
    initialEntries = []
    for seed in crawlParams['seedURLs']:
        initialEntries.append((-1, seed, -1, ""))
    crawlParams["priorityQueue"] = PriorityQueue(initialEntries)

    targetModel = TFIDF()
    targetModel.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    crawlParams['scorer'] = targetModel

    focusedCrawler = Crawler(crawlParams)
    focusedCrawler.crawl()
    return focusedCrawler.relevantPages
def baseFC(crawlParams):
    """Run the baseline crawl described by ``crawlParams``.

    Seeds a PriorityQueue from ``crawlParams['seedURLs']``, builds a TFIDF
    model from ``crawlParams['model']`` and ``crawlParams['No_Keywords']``,
    then constructs a Crawler from the same dict and runs it.

    Args:
        crawlParams: Dict providing ``'seedURLs'``, ``'model'`` and
            ``'No_Keywords'``.  It is modified in place: ``'priorityQueue'``
            and ``'scorer'`` entries are added before the Crawler is built.

    Returns:
        The crawler's ``relevantPages`` attribute after ``crawl()`` returns.
    """
    # One 4-tuple queue entry per seed URL, with the URL in position 1.
    queueEntries = [(-1, url, -1, "") for url in crawlParams['seedURLs']]
    crawlParams["priorityQueue"] = PriorityQueue(queueEntries)

    scorer = TFIDF()
    scorer.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    crawlParams['scorer'] = scorer

    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def testEventFC(seedFile, pLimit):
    """Run the event-model focused crawler from an existing seed file (CGI).

    Emits a plain-text Content-Type header, passes ``seedFile`` to
    ``downloadRawDocs`` and ``getSeedURLs``, builds a TFIDF model from the
    tokenized documents and calls ``eventFC`` with the event scorer, the
    model and the crawl options.

    Args:
        seedFile: Seed file handed to ``downloadRawDocs`` and ``getSeedURLs``.
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    print('Content-Type: text/plain\n\n')
    tfidfModel = TFIDF()
    eventScorer = EventScorer.EventScorer()
    rawDocs = downloadRawDocs(seedFile)
    seeds = getSeedURLs(seedFile)
    print(seeds)

    pageThreshold = 0.5
    # set threshold so scorer knows when to print tree to file
    eventScorer.set_threshold(pageThreshold)
    crawlOptions = {
        "num_pages": pLimit,
        "pageScoreThreshold": pageThreshold,
        "urlScoreThreshold": 0.4,
        "seeds": seeds,
    }
    tfidfModel.buildModel(getTokenizedDocs(rawDocs))
    eventFC(eventScorer, tfidfModel, crawlOptions)
示例#9
0
def testEventFC(seedUrls, pLimit, eventTree):
    """Run the event-model focused crawler on caller-supplied input.

    Saves the seed text to ``addurls.txt`` and the lower-cased event tree to
    ``event-details.txt``, builds a TFIDF model from the tokenized seed
    documents and calls ``eventFC`` with the event scorer, the model and the
    crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
        eventTree: Text written, lower-cased, to the event-details file.
    """
    # Write the seed URLs to a fresh file.  The context manager guarantees
    # the handle is closed even if the write fails.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    # Write the event tree to a fresh file in the same way.
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, 'w') as eventHandle:
        eventHandle.write(eventTree.lower())

    mytfidf = TFIDF()
    # Constructed only after the event-details file has been written.
    myEventScorer = EventScorer.EventScorer()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)

    pageScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }
    mytfidf.buildModel(getTokenizedDocs(docs))
    eventFC(myEventScorer, mytfidf, options)
def testEventFC(seedFile, pLimit):
    """Drive ``eventFC`` from an existing seed file, printing a CGI header.

    Args:
        seedFile: Seed file handed to ``downloadRawDocs`` and ``getSeedURLs``.
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    print('Content-Type: text/plain\n\n')
    model = TFIDF()
    scorer = EventScorer.EventScorer()
    downloaded = downloadRawDocs(seedFile)
    seedList = getSeedURLs(seedFile)
    print(seedList)

    thresholds = {"page": 0.5, "url": 0.4}
    # set threshold so scorer knows when to print tree to file
    scorer.set_threshold(thresholds["page"])
    options = dict(
        num_pages=pLimit,
        pageScoreThreshold=thresholds["page"],
        urlScoreThreshold=thresholds["url"],
        seeds=seedList,
    )
    tokenized = getTokenizedDocs(downloaded)
    model.buildModel(tokenized)
    eventFC(scorer, model, options)
def testEventFC(seedUrls, pLimit, eventTree):
    """Run the event-model focused crawler on caller-supplied input.

    Saves the seed text to ``addurls.txt`` and the lower-cased event tree to
    ``event-details.txt``, builds a TFIDF model from the tokenized seed
    documents and calls ``eventFC`` with the event scorer, the model and the
    crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
        eventTree: Text written, lower-cased, to the event-details file.
    """
    # Write the seed URLs to a fresh file.  The context manager guarantees
    # the handle is closed even if the write fails.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    # Write the event tree to a fresh file in the same way.
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, 'w') as eventHandle:
        eventHandle.write(eventTree.lower())

    mytfidf = TFIDF()
    # Constructed only after the event-details file has been written.
    myEventScorer = EventScorer.EventScorer()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)

    pageScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }
    mytfidf.buildModel(getTokenizedDocs(docs))
    eventFC(myEventScorer, mytfidf, options)
示例#12
0
def testBaseFC(seedUrls, pLimit):
    """Run the baseline focused crawler on caller-supplied seed URLs.

    The seed text is saved to ``addurls.txt``; that file is passed to
    ``downloadRawDocs`` and ``getSeedURLs``; a TFIDF model is built from the
    tokenized documents; finally ``baseFC`` is called with the model and the
    crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
    """
    mytfidf = TFIDF()
    # Not used by the baseline crawl; still constructed in case the
    # EventScorer constructor has side effects -- TODO confirm and drop.
    myEventScorer = EventScorer.EventScorer()

    # Write the seed URLs to a fresh file.  The context manager guarantees
    # the handle is closed even if the write fails.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": 0.4,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }

    mytfidf.buildModel(getTokenizedDocs(docs))
    # NOTE(review): the baseFC definitions visible in this file take a single
    # crawlParams dict -- confirm which baseFC this two-argument call targets.
    baseFC(mytfidf, options)
示例#13
0
def testEventFC(seedUrls, pLimit, eventTree):
    """Run the event-model focused crawler on caller-supplied input (CGI).

    Emits a plain-text Content-Type header, saves the seed text to
    ``addurls.txt`` and the event tree to ``event-details.txt``, builds a
    TFIDF model from the tokenized seed documents and calls ``eventFC`` with
    the event scorer, the model and the crawl options.

    Args:
        seedUrls: Text written verbatim to the seed file (presumably one URL
            per line -- confirm against ``getSeedURLs``).
        pLimit: Value stored under the ``"num_pages"`` option.
        eventTree: Text written verbatim to the event-details file.
    """
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seed URLs to a fresh file.  The file has just been removed,
    # so it must be opened in a creating mode ("w"); "rw" is not a valid
    # mode for that.  The context manager closes the handle on any outcome.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as seedHandle:
        seedHandle.write(seedUrls)

    # Write the event tree to a fresh file in the same way.
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, 'w') as eventHandle:
        eventHandle.write(eventTree)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    pageScoreThreshold = 0.5
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": 0.4,
        "seeds": seedURLs,
    }
    mytfidf.buildModel(getTokenizedDocs(docs))
    eventFC(myEventScorer, mytfidf, options)
示例#14
0
def test():
    """Run the baseline crawler against the bundled Typhoon Haiyan seeds.

    Passes ``typhoon_haiyan_SEED_URLs.txt`` to ``downloadRawDocs`` and
    ``getSeedURLs``, builds a TFIDF model from the tokenized documents and
    calls ``baseFC`` with the model and a 1000-page crawl configuration.
    """
    seedPath = "typhoon_haiyan_SEED_URLs.txt"

    model = TFIDF()
    rawDocs = downloadRawDocs(seedPath)
    crawlOptions = {
        "seeds": getSeedURLs(seedPath),
        "num_pages": 1000,
        "pageScoreThreshold": 0.5,
        "urlScoreThreshold": 0.4,
    }

    # Every tokenized seed document is used as a positive example.
    positives = getTokenizedDocs(rawDocs)
    model.buildModel(positives)

    # NOTE(review): the baseFC definitions visible in this file take a single
    # crawlParams dict -- confirm which baseFC this two-argument call targets.
    baseFC(model, crawlOptions)