Example #1
File: app.py Project: Zain-1998/CNIR1
def past_news():
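    # Replay the crawler for every keyword previously stored in the search_history table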
    keywords = search_history.query.all()
    if keywords:
        for i in keywords:
            print("result for ", i.search_keywords)
            news = i.search_keywords
            print(crawl.crawler(news))
Example #2
def server(THREAD_NUM,START_URLS,FETCH_TIME,KEY_WORD,IGNORE_KEY_WORD,DOWNLOAD_MODE,DEPTH,FETCH_COUNT,FETCH_MODE,STORAGE_MODEL,SIMILARITY,FOCUSKEYWORD):

    global REFUSE_COUNT
    global QUEUE_URLNODE
    global QUEUE_HTMLNODE

    # initialize the URL node queue
    start_urls = START_URLS
    start_nodes = init_urlnode(start_urls)
    for i in start_nodes:
        QUEUE_URLNODE.put(i)
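    # one (task queue, thread id) pair per worker; the scheduling loop below feeds URL nodes into these queues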
    my_tuple_list = []
    for i in xrange(THREAD_NUM):
        my_tuple_list.append((Queue.Queue(),str(i)))

    # start the fetch threads
    threads_list = []
    for i in xrange(THREAD_NUM):
        threads_list.append(threading.Thread(target = single_thread,args = (my_tuple_list[i],QUEUE_HTMLNODE,DOWNLOAD_MODE,start_urls)))
    for i in threads_list:
        i.setDaemon(True)
        i.start()

    # start the database storage thread
    #print (KEY_WORD,QUEUE_COMPLETE_NODE,QUEUE_SMART_NODE,STORAGE_MODEL)
    db_engine = threading.Thread(target = engine_db,args = (KEY_WORD,QUEUE_COMPLETE_NODE,QUEUE_SMART_NODE,STORAGE_MODEL))
    db_engine.setDaemon(True)
    db_engine.start()

    # URL node queue scheduling
    while server_exit_conditions(FETCH_TIME,THREAD_NUM,FETCH_COUNT):
        for i in my_tuple_list:
            if QUEUE_URLNODE.qsize() > 0  and i[0].qsize() < 1:
                QUEUE_URLNODE = fetch_mode(QUEUE_URLNODE,FETCH_MODE)
                node = QUEUE_URLNODE.get()
                i[0].put(node)

        if QUEUE_HTMLNODE.qsize() > 0:
            html_node = QUEUE_HTMLNODE.get()

            nodelist = crawler(html_node)

            for i in nodelist:
                if i.depth <= DEPTH and SIMILARITY == 0:#SIMILARITY
                    if url_filter_similarity(i.url,KEY_WORD,IGNORE_KEY_WORD,FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 1 or STORAGE_MODEL == 2:
                            QUEUE_SMART_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1

                elif i.depth <= DEPTH and SIMILARITY == 1:
                    if url_filter_no_similarity(i.url,KEY_WORD,IGNORE_KEY_WORD,FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 0 or STORAGE_MODEL == 2:
                            QUEUE_COMPLETE_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                else:
                    REFUSE_COUNT += 1
Example #3
def server(THREAD_NUM,START_URLS,FETCH_TIME,KEY_WORD,IGNORE_KEY_WORD,DOWNLOAD_MODE,DEPTH,FETCH_COUNT,FETCH_MODE,STORAGE_MODEL,SIMILARITY,FOCUSKEYWORD):
    
    global REFUSE_COUNT
    global QUEUE_URLNODE
    global QUEUE_HTMLNODE

    # initialize the URL node queue
    start_urls = START_URLS
    start_nodes = init_urlnode(start_urls)
    for i in start_nodes:
        QUEUE_URLNODE.put(i)
    my_tuple_list = []
    for i in xrange(THREAD_NUM):
        my_tuple_list.append((Queue.Queue(),str(i)))

    # start the fetch threads
    threads_list = []
    for i in xrange(THREAD_NUM):
        threads_list.append(threading.Thread(target = single_thread,args = (my_tuple_list[i],QUEUE_HTMLNODE,DOWNLOAD_MODE)))
    for i in threads_list:
        i.setDaemon(True)
        i.start()

    # start the database storage thread
    db_engine = threading.Thread(target = engine_db,args = (KEY_WORD,QUEUE_COMPLETE_NODE,QUEUE_SMART_NODE,STORAGE_MODEL))
    db_engine.setDaemon(True)
    db_engine.start()

    # URL node queue scheduling
    while server_exit_conditions(FETCH_TIME,THREAD_NUM,FETCH_COUNT):
        for i in my_tuple_list:
            if QUEUE_URLNODE.qsize() > 0  and i[0].qsize() < 1:
                QUEUE_URLNODE = fetch_mode(QUEUE_URLNODE,FETCH_MODE)
                node = QUEUE_URLNODE.get()
                i[0].put(node)

        if QUEUE_HTMLNODE.qsize() > 0:
            html_node = QUEUE_HTMLNODE.get()

            nodelist = crawler(html_node)
            
            for i in nodelist:
                if i.depth <= DEPTH and SIMILARITY == 0:#SIMILARITY
                    if url_filter_similarity(i.url,KEY_WORD,IGNORE_KEY_WORD,FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 1 or STORAGE_MODEL == 2:
                            QUEUE_SMART_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1

                elif i.depth <= DEPTH and SIMILARITY == 1:
                    if url_filter_no_similarity(i.url,KEY_WORD,IGNORE_KEY_WORD,FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 0 or STORAGE_MODEL == 2:
                            QUEUE_COMPLETE_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                else:
                    REFUSE_COUNT += 1
Example #4
File: main.py Project: serereuk/crawl
from crawl import crawler
import pickle

keyword = str(input("Enter a keyword\n"))
startdate = str(input("Start date (format: 2018-07-11)\n"))
finishdate = str(input("End date (format: 2018-07-12)\n"))

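# crawl Twitter for the keyword between the two dates, then pickle the result to Result.txt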
result = crawler().twitter(keyword, startdate, finishdate)
with open("Result.txt", "wb") as f:
    pickle.dump(result, f)
Example #5
import sys 
import crawl
import classifier
import patternify

"""usage: python run.py http://cnn.com"""

crawler = crawl.crawler() #get a crawler object

urls = crawler.crawl(sys.argv[1], 500) #get URLs, number of URLs to crawl

classifier.trainSVM() #train preliminary classifier using the "content" and "notcontent" files

ones, zeros = classifier.testSVM(urls) #classify using preliminary classifier

patterns = patternify.getPatterns(ones) #get patterns from classified "ones" (content links)

classifier.trainSVM(patterns) #train secondary classifier with pattern features

ones, zeros = classifier.testSVM(urls, patterns) #classify using secondary classifier
Example #6
def _getlinks(self, starturl):
    crawler = crawl.crawler()
    links = crawler.crawl(starturl, 200)
    return links
Example #7
import seed
import save
import crawl

seedURLs = [
    'https://en.wikipedia.org/wiki/Snake',
    'https://en.wikipedia.org/wiki/Reptile'
]

seedQ = seed.getSeedURLsQ(seedURLs)

relatedTerms = seed.getRelatedTerms()

pageLimit = 500

save.createDirectory('Assignment 2')

save.changeDirectory('Assignment 2')

crawl.createSSL()

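# crawl up to pageLimit pages starting from the seed queue, using relatedTerms to guide which links are kept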
savedPages = crawl.crawler(seedURLs, seedQ, relatedTerms, pageLimit)

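# write the dictionary of crawled pages to _CRAWLED_URLS_.txt in the working directory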
save.saveFile('_CRAWLED_URLS_', '.txt', save.dictToSave(savedPages))