Exemplo n.º 1
0
# Rebuild crawl state from disk: the URL queue plus the "already crawled"
# and "currently queued" URL dictionaries, persisted as CSV snapshots.
import Queue, csv
import queueMethod as qm

dictPub = {}        # publications fetched this session (populated elsewhere)
dictPubSaved = {}   # URLs crawled in previous sessions
dictQueue = {}      # URLs known to be sitting in the saved queue
myqueue = Queue.Queue()
qm.loadQueue(myqueue)

qm.loadDict(dictPubSaved, 'dictPub.csv')

namedictPubSaved = 'dictPub.csv'
namedictQueue = 'dictQueue.csv'

# NOTE(review): hard-coded per-user Windows path — adjust for the local machine.
directo = 'C:\Users\walter\Desktop\\'

def _load_url_keys(path, target):
    # Mark every URL (first CSV column) as seen; the value 0 is a dummy.
    # 'with open(...)' replaces the deprecated file() builtin and guarantees
    # the handle is closed even if a malformed row raises.
    with open(path, 'rb') as csvfile:
        for line in csv.reader(csvfile, dialect='excel'):
            target[line[0]] = 0

_load_url_keys(directo + namedictPubSaved, dictPubSaved)
_load_url_keys(directo + namedictQueue, dictQueue)
Exemplo n.º 2
0
#LoadAndContinueCrawl.py
'''load saved queue and restart crawl
@version0.4.151001
@author:maajor{<mailto:[email protected]>} 
'''

import Queue, issuuPagePub, csv
import queueMethod as qm

dictPub = {}
dictPubSaved = {}
dictQueue = {}
myqueue = Queue.Queue()
qm.loadQueue(myqueue)
qm.loadDict(dictPubSaved, 'dictPub.csv')
qm.loadDict(dictQueue, 'dictQueue.csv')

count = 0

while count < 4000:
    if myqueue.qsize() == 0:
        break
    thisurl = myqueue.get()
    if dictPubSaved.has_key(thisurl) or dictPub.has_key(thisurl):
        print thisurl + " passed"
        continue
    thisPage = issuuPagePub.issuuPagePub(thisurl)
    dictPub[thisurl] = thisPage.getInformation()
    count += 1
    print str(count) + " : " + str(thisPage)
    print "Queue current size " + str(myqueue.qsize())
Exemplo n.º 3
0
# Rebuild crawl state from disk: the URL queue plus the "already crawled"
# and "currently queued" URL dictionaries, persisted as CSV snapshots.
import Queue, csv
import queueMethod as qm

dictPub = {}        # publications fetched this session (populated elsewhere)
dictPubSaved = {}   # URLs crawled in previous sessions
dictQueue = {}      # URLs known to be sitting in the saved queue
myqueue = Queue.Queue()
qm.loadQueue(myqueue)

qm.loadDict(dictPubSaved, 'dictPub.csv')

namedictPubSaved = 'dictPub.csv'
namedictQueue = 'dictQueue.csv'

# NOTE(review): hard-coded per-user Windows path — adjust for the local machine.
directo = 'C:\Users\walter\Desktop\\'

def _read_first_column(path, target):
    # Record each URL (first CSV column) with a dummy value of 0.
    # 'with open(...)' replaces the deprecated file() builtin and closes
    # the handle even when an exception escapes the loop.
    with open(path, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, dialect='excel')
        for line in spamreader:
            target[line[0]] = 0

_read_first_column(directo + namedictPubSaved, dictPubSaved)
_read_first_column(directo + namedictQueue, dictQueue)
#LoadAndContinueCrawl.py
'''load saved queue and restart crawl
@version0.4.151001
@author:maajor{<mailto:[email protected]>} 
'''

import Queue, issuuPagePub, csv
import queueMethod as qm

dictPub = {}
dictPubSaved = {}
dictQueue = {}
myqueue = Queue.Queue()
qm.loadQueue(myqueue)
qm.loadDict(dictPubSaved, 'dictPub.csv')
qm.loadDict(dictQueue, 'dictQueue.csv')

count = 0

while count < 4000:
    if myqueue.qsize() == 0:
        break
    thisurl = myqueue.get()
    if dictPubSaved.has_key(thisurl) or dictPub.has_key(thisurl):
        print thisurl + " passed"
        continue
    thisPage = issuuPagePub.issuuPagePub(thisurl)
    dictPub[thisurl] = thisPage.getInformation()
    count += 1
    print str(count) + " : " + str(thisPage) 
    print "Queue current size "+ str(myqueue.qsize())