Example #1
# Author: Yunqiao Zhang
# Email: [email protected]

from scheduler import CrawlerScheduler
import share
import time
from page_intializer import PageInitializer

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    for line in open("test.txt", 'r'):
        line = line.strip()
        cs.add_request(line)

    time.sleep(40)  # crude wait: give the crawler threads time to fetch

    print 'size:', share.GPR.qsize()
    # Get content out
    n = 0

    pi = PageInitializer(share.GPR)
    while True:
        url, content = pi.get_page()
        if url is None:
            # No page came back: treat the queue as drained and stop,
            # instead of looping forever writing empty files.
            break
        with open(str(n) + '.html', 'w') as fp:
            fp.write(content)
        n += 1
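
The drain loop above trusts get_page() to signal exhaustion. Since share.GPR is used with qsize() and get(), it is presumably a standard Queue.Queue, so the same drain can be sketched against the stdlib alone. A minimal, self-contained sketch; the queued (url, content) pairs below are stand-ins for the crawler's real records, not part of the original code:

from Queue import Queue, Empty

q = Queue()
for i in range(3):
    q.put(('http://example.com/%d' % i, '<html>page %d</html>' % i))

n = 0
while True:
    try:
        # Wait briefly for a result, then give up: this bounds the drain
        # even when the producer has already finished.
        url, content = q.get(timeout=2)
    except Empty:
        break
    with open(str(n) + '.html', 'w') as fp:
        fp.write(content)
    n += 1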
Example #2
# Author: Yunqiao Zhang
# Email: [email protected]

from scheduler import CrawlerScheduler
from Queue import Empty
import share
import time

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    for line in open("test.txt", 'r'):
        line = line.strip()
        cs.add_request(line)

    time.sleep(40)  # crude wait: give the crawler threads time to fetch

    print 'size:', share.GPR.qsize()
    # Get content out
    n = 0
    while True:
        try:
            r = share.GPR.get(block=False)
        except Empty:
            # A non-blocking get raises Queue.Empty once the queue is
            # drained; break so cs.join() below is actually reached.
            break
        print r[0], r[1], r[2], r[3], r[4]
        with open(str(n) + '.html', 'w') as fp:
            fp.write(r[6])
        n += 1

    cs.join()
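
The fixed 40-second sleep is a guess: results that arrive later are lost, and shorter crawls waste time. A common alternative is a sentinel value that the producer enqueues when it has finished, so the consumer can block on get() and stop deterministically. A minimal sketch with a stand-in producer thread; the real CrawlerScheduler would have to enqueue the sentinel itself, which is an assumption, not something these examples do:

import threading
from Queue import Queue

q = Queue()

def producer():
    # Stand-in for the crawler: push a few results, then the sentinel.
    for i in range(3):
        q.put('result %d' % i)
    q.put(None)  # sentinel: no more results will ever arrive

t = threading.Thread(target=producer)
t.start()

while True:
    r = q.get()  # blocks until an item arrives; no timed sleep needed
    if r is None:
        break
    print r

t.join()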
Example #3
# Author: Yunqiao Zhang
# Email: [email protected]

from scheduler import CrawlerScheduler
import share
import time
from page_intializer import PageInitializer

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    # for line in open("chan.txt", 'r'):
    #     line = line.strip()
    #     cs.add_request(line)
    sitedata = {}
    # chan.txt is assumed to alternate title and URL lines; drop blanks.
    lines = [line.strip() for line in open("chan.txt", 'r')]
    lines = filter(None, lines)
    while lines:
        # pop() consumes the list from the end, pairing each URL with
        # the title line that precedes it in the file.
        url = lines.pop()
        title = lines.pop()
        sitedata[url] = title
        cs.add_request(url)

    # time.sleep(40)

    print 'size:', share.GPR.qsize()
    # Get content out
    n = 0

    pi = PageInitializer(share.GPR)