Example #1
def run_crawler(current_pages, min_images):
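    # BFS over pages starting from current_pages: fetch each page's links,
    # queue up to 10 unseen links per page, and collect their images until
    # at least min_images have been gathered.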
    global all_time_visited
    q = Queue.Queue()
    visited = set()
    for page in current_pages:
        q.put(page)
        visited.add(page)
    images = []
    while len(images) < min_images:
        page = q.get()
        visited.add(page)
        all_time_visited.add(page)
        links = Fetcher.fetch_links(page)
        if links is None:
            continue

        for l in links[:10]:
            print(l)
            signal.alarm(10)
            try:
                if l not in visited and l not in all_time_visited:
                    q.put(l)
                    images.extend(get_images(l))
                    print("Current number of images: %d" % len(images))
            except TimeoutException:
                continue  # skip this link if fetching its images takes more than 10 seconds
            else:
                # Reset the alarm
                signal.alarm(0)
    print('run_crawler returning:', q.queue, images)
    return list(q.queue), images
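The loop above arms signal.alarm(10) and catches TimeoutException, but the handler that raises it is outside the excerpt; a minimal sketch of the plumbing it appears to rely on (TimeoutException comes from the snippet, the handler itself is an assumption):

import signal

class TimeoutException(Exception):
    pass

def alarm_handler(signum, frame):
    # Assumed handler: turn SIGALRM into the exception the crawl loop catches.
    raise TimeoutException()

# SIGALRM is Unix-only; install the handler once before calling run_crawler().
signal.signal(signal.SIGALRM, alarm_handler)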
Example #2
File: crawler.py Project: jjoo172/CS144
G = nx.DiGraph()
G_undir = nx.Graph()
root = "http://www.caltech.edu/"

bfs_queue.put(root)
G.add_node(root)
G_undir.add_node(root)

depth = 0

outcount = {}

while not bfs_queue.empty() and depth < 2001:
    cur_link = bfs_queue.get()
    links = fetcher.fetch_links(cur_link)

    if links is not None:
        # iterate through children
        for link in links:
            if "caltech.edu" in link and link not in visited_links:
                time.sleep(0.5)

                # update graph and visited_links array
                bfs_queue.put(link)
                visited_links.append(link)

                # add to dictionary, as well as how many hyperlinks it has
                try:
                    hyperlinks = fetcher.fetch_links(link)
                    outcount[link] = len(hyperlinks)
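The excerpt breaks off inside the final try block and never shows where bfs_queue and visited_links are created; a minimal setup sketch, assuming Python 2's Queue module and a plain list of visited pages (fetcher is the module name the snippet already uses; the import form is an assumption):

import time
import Queue                # Python 2 name of the queue module
import networkx as nx
import fetcher              # assumed local module exposing fetch_links(url)

bfs_queue = Queue.Queue()   # BFS frontier of pages still to expand
visited_links = []          # pages already queued, checked before re-adding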
Example #3
	print current

	# redraw the in-place progress bar (40-char bar; the /50 and /20 scaling
	# suggests a crawl target of roughly 2000 pages)
	sys.stdout.write('\r')
	sys.stdout.write("[%-40s] %d%%" % ('=' * (count / 50), count / 20))
	sys.stdout.write('\r')
	sys.stdout.flush()

	if current in dictionary_out:
		continue  # already crawled before

	G.add_node(current)

	try:
		addin = fetcher.fetch_links(current)  # the new outgoing urls
	except Exception:
		print "*** fetch_links failed (e.g. 404), skipping this page ***\n"
		continue

	try:
		# screen the outgoing urls
		for item in addin:
			if "jstor" in item or "ieeexplore" in item or "onlinelibrary" in item or ".pdf" in item:
				continue
			if "caltech.edu" not in item:
				continue
			if item in dictionary_in:
				dictionary_in[item] += 1
			else:
				dictionary_in[item] = 1
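The in-place progress bar above is easier to reuse pulled into a helper; a sketch under the assumption, implied by the /50 and /20 scaling, that the crawl targets about 2000 pages:

import sys

def print_progress(count, total=2000, width=40):
    # Redraw a fixed-width progress bar on the current terminal line.
    filled = width * count // total
    percent = 100.0 * count / total
    sys.stdout.write("\r[%s] %.1f%%" % (('=' * filled).ljust(width), percent))
    sys.stdout.flush()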
Example #4
File: crawler.py Project: bslawski/CS145
def crawlerRun(threadID, sleeptime):

    global poolLock, urlPool, urlFound, activeThreads, poolOpen, proxyLock, proxyInd, proxyList, poolLen, foundLen

    #        print "Thread " + str(threadID) + " Started"
    sys.stdout.flush()

    MAX_RESULTS = 100000  # result cap (was 20010): finite execution rather than crawling everything reachable from the seeds
    POOL_LIMIT = 100000

    output = 10

    # file to save structure in
    try:
        f = open('structure.' + str(threadID) + '.dat', 'w')
    except IOError:
        print "Unable to open " + "structure." + str(threadID) + \
              ".dat for writing.  Thread " + str(threadID) + " exiting."
        sys.stdout.flush()
        exit()

    nlookups = 150
    myLink = None
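    # Main crawl loop: take one user from the shared pool, fetch its follower
    # list through the current proxy, record the result, and feed any unseen
    # followers back into the pool until MAX_RESULTS users have been crawled.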

    while foundLen < MAX_RESULTS:
        # Change the proxy periodically to avoid hitting the lookup limit
        if nlookups >= 20:

            #                    print "\t\t\t\tThread " + str(threadID) + " aquiring new proxy."
            sys.stdout.flush()
            proxyLock.acquire()
            myLink = changeURL()
            nlookups = 0
            myIP = myLink[0]
            myPort = myLink[1]
            proxyLock.release()
#                    print "\t\t\t\tUsing " + myIP + ":" + str(myPort)

        while poolLen == 0:
            #                    print "\t\t\t\tThread " + str(threadID) + " unable to retrieve user from pool." \
            #                          + " Pausing for " + str(sleeptime) + " sec."
            #                    sys.stdout.flush()
            time.sleep(sleeptime)
            poolLen = len(urlPool)

        poolLock.acquire()
        if poolLen > 0:
            user = urlPool.pop(0)  # fetch next page (FIFO -> Breadth First)
            poolLock.release()
        else:
            # nothing to pop; release the lock before retrying the loop
            poolLock.release()
            continue
        followers = fetch_links(user, myIP, myPort)
        nlookups += 1

        if followers is None:
            poolLock.acquire()
            urlPool.insert(0, user)
            poolLock.release()
            proxyLock.acquire()
            myLink = changeURL()
            proxyLock.release()
            #                    print '\t\t\t\tProfile ' + str(user) + ' is busy.  Absorbing back into pool.'
            sys.stdout.flush()
            continue

        try:
            int(followers[0])
        except ValueError:
            poolLock.acquire()
            urlPool.append(user)
            poolLock.release()
            #                    print followers
            proxyLock.acquire()
            myLink = changeURL()
            proxyLock.release()
            continue

        urlFound.append(user)
        foundLen += 1

        if followers and followers[0] != '':
            new_pages = []
            # Add unencountered pages to queue
            for ids in followers:
                if not (ids in urlPool or ids in urlFound):
                    new_pages.append(ids)

            writeUser(f, user, followers)
            if poolOpen:
                poolLock.acquire()
                urlPool.extend(new_pages)  # add pages to queue
                poolLock.release()


        # foundLen = len(urlFound)

        # Print progress
        if (foundLen % output) == 0 and foundLen < MAX_RESULTS:
            poolLen = len(urlPool)
            print "Progress: %d pages crawled.  %d users in pool." % (foundLen, poolLen)
            sys.stdout.flush()

        # Closes url pool if max size is reached.  Prevents slow-down of crawl
        if poolOpen: poolLen = len(urlPool)
        if poolLen > POOL_LIMIT and poolOpen:
            print "\t\t\t\tMax URL Pool size reached! Closing Pool..."
            sys.stdout.flush()
            poolOpen = False

    f.close()

    # Output results
    print "Thread " + str(threadID) + " Finished! " + \
                 str(activeThreads - 1) + " Threads Running."
    sys.stdout.flush()
    activeThreads -= 1
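crawlerRun is written to run in several threads against the shared globals above; a minimal launch sketch (start_crawlers is a hypothetical helper, and it assumes urlPool, the locks, and the other globals have already been initialized):

import threading

def start_crawlers(n_threads, sleeptime=1.0):
    # Spawn one crawler thread per ID and wait for all of them to finish.
    threads = []
    for tid in range(n_threads):
        t = threading.Thread(target=crawlerRun, args=(tid, sleeptime))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()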
Example #5
File: crawler.py Project: bslawski/CS145
    urlFound = []

    if len(sys.argv[1:]) != 2:
        print usage
        exit()

    try:
        seedID = sys.argv[1]
        nThreads = int(sys.argv[2])
    except ValueError:
        print usage
        exit()

    # Tests the seedID
    print "Beginning crawl at user ID " + seedID
    followers = fetch_links(seedID, proxyList[0][0], proxyList[0][1])  # first proxy's IP and port
    #    print followers
    if followers is None:
        print usage
        print "Unable to open seedID.  Twitter may be busy.\n\n"
        exit()

    try:
        f = open('structure.' + str(nThreads) + '.dat', 'w')
        writeUser(f, seedID, followers)
        f.close()
    except IOError:
        print "Unable to open " + "structure." + str(nThreads) + \
              ".dat for writing.  Crawler exiting."
        sys.stdout.flush()
        exit()
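From the argument parsing above, the script is invoked as roughly: python crawler.py <seedID> <nThreads>. A hypothetical continuation (not shown in the excerpt) would seed the shared pool with the verified ID and hand off to the worker threads:

    # Assumed continuation: seed the pool, then start the workers
    # (start_crawlers is the hypothetical helper sketched after Example #4).
    urlPool = [seedID]
    poolLen = len(urlPool)
    start_crawlers(nThreads)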