def load_components(self, count=0):
    if count >= 2:
        # If the loader fails two consecutive times, exit the engine
        custom_logger().log_message("Unable to load the engine. Exiting.",
                                    logger_handler.log_level_CRITICAL)
        print("Unable to load the engine. Exiting.")
        sys.exit(0)
    graph_obj = pagerank().unpickle_graph()
    indexer_obj = indexer().unpickle_indexer()
    if graph_obj is not None and indexer_obj is not None:
        lda_obj = lda()
        lda_obj.load_model()
        return graph_obj, indexer_obj, lda_obj
    if count > 0:
        custom_logger().log_message(
            "Unexpected error while trying to launch engine. Trying again.",
            logger_handler.log_level_ERROR)
        print("Unexpected error while trying to launch engine. Trying again.")
    # Prompt the user to reconstruct the models again
    val = input("Processed information not found. Do you want to crawl again? (y/n) ")
    val = val.lower()
    if val == 'y' or val == 'yes':
        try:
            print("Launching web crawler. This operation will take a while to complete.")
            crawler()
            print("Web crawler operations completed! Launching components.")
            return self.load_components(count + 1)
        except Exception as e:
            custom_logger().log_message(
                "Unexpected error while crawling the web. Exiting.\n" + str(e),
                logger_handler.log_level_CRITICAL)
            print("Unexpected error while crawling the web. Exiting.")
            sys.exit(0)
    else:
        custom_logger().log_message(
            "Unable to load the search engine without the required components. Exiting.",
            logger_handler.log_level_CRITICAL)
        print("Unable to load the search engine without the required components. Exiting.")
        sys.exit(0)
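# A minimal, self-contained sketch of the retry-with-counter pattern that
# load_components implements above: try a loader a bounded number of times,
# then give up. The generic loader argument is hypothetical, not part of the
# original engine code.
import sys

def load_with_retries(loader, max_attempts=2):
    for attempt in range(max_attempts):
        result = loader()
        if result is not None:
            return result
        print("Load attempt %d failed. Trying again." % (attempt + 1))
    print("Unable to load after %d attempts. Exiting." % max_attempts)
    sys.exit(1)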
def populateData(start_date, end_date):
    events = crawler(start_date, end_date)
    events = processGeoInfo(events)
    # Populate data into db; skip events whose fields are missing or malformed
    for e in events:
        try:
            add_activity(db_handler, e['Name'], e['CreatorId'], e['Location'],
                         e['latlon']['lat'], e['latlon']['lng'],
                         e['Date'], e['Time'], e['Description'], e['Tag'])
        except Exception:
            continue
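# A hedged alternative sketch for the insert loop in populateData: catch the
# exception explicitly and log which event was skipped instead of silently
# continuing. add_activity, db_handler and the event-dict layout are assumed
# to match the function above.
import logging

def insert_events(events):
    for e in events:
        try:
            add_activity(db_handler, e['Name'], e['CreatorId'], e['Location'],
                         e['latlon']['lat'], e['latlon']['lng'],
                         e['Date'], e['Time'], e['Description'], e['Tag'])
        except Exception as exc:
            logging.warning("Skipping event %r: %s", e.get('Name'), exc)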
def facebook_crawl():
    for i in range(20):
        try:
            c = crawler(0)
            print 'Crawler object created'
            print ' CRAWL_STARTING'
            c.start()
        except Exception as e:
            print e
        try:
            del c
        except NameError:
            # c was never assigned because crawler(0) raised
            pass
        time.sleep(20)
def work_thread():
    # print('enter', threading.current_thread().name)
    global detail_urls
    while detail_urls:
        url = detail_urls.get_random_item()
        try:
            # Random delay of up to 6 seconds to avoid hammering the target
            sleep_time = random.random() * 6
            time.sleep(sleep_time)
            cra = crawler()
            # print('crawler url:%s' % url)
            flag, res = cra.get_by_proxy(url=url, call_back=get_pagedetail_callback)
            if flag:
                print(threading.current_thread().name, 'get detail done')
                after_process(url, res)
            else:
                print(threading.current_thread().name, 'get detail failed')
        except Exception as e:
            print(threading.current_thread().name, 'get detail failed, error ending')
            print(threading.current_thread().name, e)
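# A minimal sketch of how work_thread above might be driven: spawn a few
# worker threads and wait for them to finish. Assumes detail_urls is a
# shared, thread-safe collection; the worker count is arbitrary.
import threading

def start_workers(num_workers=4):
    threads = [threading.Thread(target=work_thread, name='worker-%d' % i)
               for i in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()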
def run_crawler(database_file, url_file, crawler_id, number_processes):
    bot = crawler(database_file, url_file, crawler_id, number_processes)
    bot.crawl(depth=1)
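# Example invocation, assuming the same argument meanings as the signature
# above (the file names here are placeholders):
run_crawler('crawler.db', 'urls.txt', crawler_id=0, number_processes=4)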
import MySQLdb
from crawler import *

if __name__ == '__main__':
    crawl = crawler()
    rep = 'Y'
    while rep == 'y' or rep == 'Y':
        print ""
        print "-----------------------------------------------------------------------------"
        print "1. Add New Website"
        print "2. Delete Website"
        print "3. Parse News (Just for Demo)"
        print "4. Parse All News from existing xml(s)"
        print "-----------------------------------------------------------------------------"
        choice = int(raw_input("\t\t\t\tEnter Your Choice\n-----------------------------------------------------------------------------\n"))
        print "-----------------------------------------------------------------------------"
        if choice == 1:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawl.add_web(link)
        elif choice == 2:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawl.delete_web(link)
        elif choice == 3:
            crawl.parse_one('http://rss.cnn.com/rss/edition_sport.rss')
import socket
import sys
from crawler import *
import string

server_address = ('localhost', 10003)

# Create a TCP/IP socket
socks = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Connect the socket to the port where the server is listening
print >> sys.stderr, 'connecting to %s port %s' % server_address
socks.connect(server_address)

x = 1
data = 'aditya'
while x == 1:
    # Read a 2-byte ASCII length prefix, then the payload itself
    l = socks.recv(2)
    length = int(l)
    data = socks.recv(length)
    urls = crawler(data)
    length = len(urls)
    # Reply with a single-byte length followed by the crawled urls
    socks.send(chr(length))
    socks.send(urls)
    print data
    x = 2
socks.close()
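# A hedged sketch of what the matching server side of the length-prefixed
# protocol above could look like: send a 2-character ASCII length, then the
# payload, then read the client's single-byte reply length. Python 2 syntax
# to match the client; the payload and single-connection behaviour are
# assumptions, not taken from the original project.
import socket

def serve_once(address=('localhost', 10003), payload='aditya'):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.bind(address)
    server.listen(1)
    conn, _ = server.accept()
    conn.send('%2d' % len(payload))   # 2-byte length prefix
    conn.send(payload)                # payload of exactly that length
    reply_len = ord(conn.recv(1))     # client replies with chr(len(urls))
    print conn.recv(reply_len)
    conn.close()
    server.close()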
def test_emptyUrlList(self):
    self.bot = crawler(None, "empty.txt ")
    self.bot.crawl(depth=1)
    self.assertFalse(self.bot == -1, 'this text file exists!')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from crawler import *
import json, sys

# example : crawler(['-b', 'PublicServan', '-i', '100', '101'])
argvs = sys.argv[1:]
filename = crawler(argvs)
# Read the crawled articles and write their contents out line by line
with open(filename, 'r') as src, open('ptt.kcm', 'w', encoding='utf-8') as f:
    for article in json.load(src)['articles']:
        f.write(article.get('content', '') + '\n')
def setUp(self):
    self.con = lite.connect("dbFile.db")
    self.bot = crawler(self.con, "urls.txt")
    self.bot.crawl(depth=1)
    self.bot.generate_page_ranks(self.bot._links)
    self.curs = self.con.cursor()
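# A matching tearDown sketch (an assumption; not shown in the original
# tests) that releases the cursor and connection opened in setUp:
def tearDown(self):
    self.curs.close()
    self.con.close()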
from crawler import *
from pagerank import page_rank
import redis
import time

# Get crawler object and crawl on urls found in urls.txt
# (renamed from `crawler` to `bot` to avoid shadowing the imported class)
bot = crawler(None, 'urls.txt')
start = time.time()
bot.crawl()
print "Elapsed Time: %s" % (time.time() - start)

# Get the data structures generated by the crawler
lexicon = bot.get_lexicon()
inverted_index = bot.get_inverted_index()
resolved_inverted_index = bot.get_resolved_inverted_index()
document_index = bot.get_document_index()

# Run pagerank on the links generated by the crawler
pagerank = page_rank(bot._links)

# Store data on persistent storage i.e. Redis
rdb = redis.Redis()
rdb.flushdb()
all_words = ''
for word in lexicon:
    rdb.set('lexicon:' + str(word), lexicon[word])
    all_words = all_words + str(word) + " , "
rdb.set('all_words', all_words.strip(' , '))
for word_id in inverted_index:
    rdb.set('inverted_index:' + str(word_id), inverted_index[word_id])
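# A hedged sketch of reading the lexicon back out of Redis, mirroring the
# 'all_words' and 'lexicon:<word>' keys written above (assumes the stored
# values round-trip as strings):
def load_lexicon(rdb):
    words = rdb.get('all_words').split(' , ')
    return dict((w, rdb.get('lexicon:' + w)) for w in words)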
def course_info():
    # Build a request object and pull the important fields from the query result
    req = request.get_json(force=True)
    query_result = req.get('queryResult')
    # debug use
    # print(query_result)
    action = query_result.get('action')
    params = query_result.get('parameters')
    intents = query_result.get('intent').get('displayName')
    # if intents != 'askKB':
    CourseId = params.get('CourseId')
    # CourseId = re.sub(r'([a-zA-Z]{4}) - ([0-9]) - ([0-9]{3})', r'\1\2\3', CourseId)

    # Different intents get different feedback
    if intents == "courseInfo":
        info = get_data(CourseId)
        message = "\n".join(i + ': ' + str(info[i]) for i in info.keys())
        # debug use
        # print("message:", message)
    elif intents == 'courseInfo - custom':
        message = 'For more information about this course, click handbook link: ' + crawler(CourseId, 'handbook Link')
    elif intents == "Overview":
        message = crawler(CourseId, intents)
    elif intents == 'Timetable':
        if check_t(CourseId):
            message = 'You can click this link to see timetable: ' + crawler(CourseId, 'timetable Link')
        else:
            message = 'This course is not offered in this term!'
    elif intents == "Lecturer":
        if check_t(CourseId):
            message = "The Lecturer of this course is: " + crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'
    elif intents == 'Census':
        if check_t(CourseId):
            message = 'The Census Date is: ' + crawler(CourseId, 'timetable')['Census Date'] + ", which is the last day of dropping this course."
        else:
            message = 'This course is not offered in this term!'
    elif intents == 'Enrols':
        if check_t(CourseId):
            message = crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'
    elif intents == 'Status':
        if check_t(CourseId):
            message = "The status of this course is: " + crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'
    elif intents == 'Faculty':
        faculty = crawler(CourseId, intents)
        school = crawler(CourseId, 'School')
        message = "Faculty: " + faculty + ", School: " + school
    elif intents == 'OfferTerms':
        message = "This course is offered in " + crawler(CourseId, 'Offering Terms')
    elif intents == 'Prerequisite':
        prerequisite = crawler(CourseId, intents)
        if prerequisite:
            message = 'The Prerequisites of this course are: ' + prerequisite
        else:
            message = 'There is no prerequisite for this course.'
    elif intents == 'Study Level':
        message = "The Study Level of this course is: " + crawler(CourseId, intents)
    elif intents == 'Unit of Credit':
        message = "Unit of Credit for this course is: " + crawler(CourseId, intents)
    elif intents == 'askKB':
        message = manager.find(params['Concept'])
    return {'fulfillmentText': message}
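# course_info reads like a Dialogflow fulfillment handler; a minimal Flask
# wiring sketch under that assumption (the route path is hypothetical):
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/webhook', methods=['POST'])
def webhook():
    return jsonify(course_info())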
def test_urllistDoesNotExist(self):
    self.bot = crawler(None, "bad.txt ")
    self.bot.crawl(depth=1)
    self.assertFalse(self.bot == -1, 'this text file exists!')
import MySQLdb
from crawler import *

if __name__ == '__main__':
    crawlerObj = crawler()
    rep = 'Y'
    while rep == 'y' or rep == 'Y':
        print ""
        print "-----------------------------------------------------------------------------"
        print "1. Add New Website"
        print "2. Delete Website"
        print "3. Parse News (Just for Demo)"
        print "4. Parse All News from existing xml(s)"
        print "-----------------------------------------------------------------------------"
        choice = int(raw_input("\t\t\t\tEnter Your Choice\n-----------------------------------------------------------------------------\n"))
        print "-----------------------------------------------------------------------------"
        if choice == 1:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawlerObj.add_web(link)
        elif choice == 2:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawlerObj.delete_web(link)
        elif choice == 3:
            crawlerObj.parse_one('http://rss.cnn.com/rss/edition_sport.rss')
# CSC326 Lab 3
# Python script for running the crawler and the PageRank algorithm;
# pretty-prints PageRank scores in greatest-to-least sorted order.
# Note: this script assumes that urls.txt (containing a list of urls to be
# crawled) and pagerank.py are in the current directory.
# See crawler.py for function implementations.
from crawler import *
import pprint

if __name__ == "__main__":
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)
    pprint.pprint(bot.get_sorted_pagerank())
    pprint.pprint(bot.get_sorted_pagerank_url())
import threading
from queue import Queue
from parsing import *
from demo import *
from general import *
from crawler import *
from scrape import *

PROJECT_NAME = 'TheNewBoston'
HOMEPAGE = 'https://www.patreon.com/thenewboston'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.text'
CRAWLED_FILE = PROJECT_NAME + '/crawled.text'
NUMBER_OF_THREADS = 8
queue = Queue()
crawler(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)

# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        crawler.crawl_page(threading.current_thread().name, url)
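# The snippet above stops before anything is put on the queue; a hedged
# sketch of the usual job-creation step in this style of code, assuming a
# file_to_set helper in the general module that reads a file into a set of
# links (an assumption, since general is star-imported above):
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)

create_workers()
create_jobs()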