import Queue, csv
import queueMethod as qm

dictPub = {}         # pages crawled in the current session
dictPubSaved = {}    # pages already crawled in earlier sessions
dictQueue = {}       # URLs that have already been queued
myqueue = Queue.Queue()

# Restore the crawl state saved by the previous run.
qm.loadQueue(myqueue)
qm.loadDict(dictPubSaved, 'dictPub.csv')

namedictPubSaved = 'dictPub.csv'
namedictQueue = 'dictQueue.csv'
directo = 'C:\\Users\\walter\\Desktop\\'

# Re-read the saved publication dictionary manually: only the first column
# (the URL) matters, so each key is simply marked with 0.
csvfile = open(directo + namedictPubSaved, 'rb')
spamreader = csv.reader(csvfile, dialect='excel')
for line in spamreader:
    dictPubSaved[line[0]] = 0
csvfile.close()

# Do the same for the saved queue dictionary.
csvfile = open(directo + namedictQueue, 'rb')
spamreader = csv.reader(csvfile, dialect='excel')
for line in spamreader:
    dictQueue[line[0]] = 0
csvfile.close()
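# --- Hedged sketch, not part of the original project ---
# The loader above only reads column 0 of each CSV row, so the dictionaries
# were presumably saved one key per row. A minimal saver along those lines
# (the function name and default path are assumptions, not queueMethod's API):
import csv

def saveDict(d, filename, directo='C:\\Users\\walter\\Desktop\\'):
    # 'wb' mirrors the 'rb' mode used by the Python 2 csv reader above.
    csvfile = open(directo + filename, 'wb')
    writer = csv.writer(csvfile, dialect='excel')
    for key in d:
        writer.writerow([key])   # one URL per row; column 0 is what the loader reads back
    csvfile.close()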
# LoadAndContinueCrawl.py
'''Load the saved queue and visited-page dictionaries, then restart the crawl.

@version: 0.4.151001
@author: maajor <mailto:[email protected]>
'''
import Queue, issuuPagePub, csv
import queueMethod as qm

dictPub = {}         # pages crawled in the current session
dictPubSaved = {}    # pages already crawled in earlier sessions
dictQueue = {}       # URLs that have already been queued
myqueue = Queue.Queue()

# Restore the crawl state saved by the previous run.
qm.loadQueue(myqueue)
qm.loadDict(dictPubSaved, 'dictPub.csv')
qm.loadDict(dictQueue, 'dictQueue.csv')

count = 0
while count < 4000:
    if myqueue.qsize() == 0:
        break
    thisurl = myqueue.get()
    # Skip URLs already crawled, either in an earlier run or in this one.
    if thisurl in dictPubSaved or thisurl in dictPub:
        print thisurl + " passed"
        continue
    thisPage = issuuPagePub.issuuPagePub(thisurl)
    dictPub[thisurl] = thisPage.getInformation()
    count += 1
    print str(count) + " : " + str(thisPage)
    print "Queue current size " + str(myqueue.qsize())
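# --- Hedged sketch, not the actual queueMethod module ---
# Both scripts rely on queueMethod.loadQueue / loadDict. Judging from how they
# are called above, and from the manual CSV loop in the first script, they
# plausibly look something like this; the directory default and the queue CSV
# file name are assumptions for illustration only.
import csv

def loadDict(d, filename, directo='C:\\Users\\walter\\Desktop\\'):
    # Mark every URL in column 0 as already seen.
    csvfile = open(directo + filename, 'rb')
    for line in csv.reader(csvfile, dialect='excel'):
        d[line[0]] = 0
    csvfile.close()

def loadQueue(q, filename='queue.csv', directo='C:\\Users\\walter\\Desktop\\'):
    # Refill the crawl queue with the URLs saved by the previous run.
    csvfile = open(directo + filename, 'rb')
    for line in csv.reader(csvfile, dialect='excel'):
        q.put(line[0])
    csvfile.close()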