from multiprocessing import Process, Queue

import CrawlerManager
import Parser
import ProxyManager
import VerifyManager

# NOTE: the configuration constants (MAXSIZE_OF_QUEUE, HOST, DATABASE, USER,
# PASSWORD, PORT and the various *_TIME / *_TIMEOUT values) as well as the
# helpers log_writer and modify_launcher are assumed to be defined in, or
# imported from, the project's settings/utility modules. A hedged sketch of
# modify_launcher follows main() below.


def main():
    """Entry point of the whole system."""
    getter_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    appender_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    usage_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    verify_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)

    pm = ProxyManager.ProxyManager(getter_queue=getter_queue,
                                   appender_queue=appender_queue,
                                   usage_queue=usage_queue,
                                   verify_queue=verify_queue,
                                   host=HOST,
                                   database=DATABASE,
                                   pwd=PASSWORD,
                                   user=USER,
                                   port=PORT,
                                   multiple_timeout=MULTIPLE_TIMEOUT_WHILE_NO_PROXY,
                                   hia_amount=MAX_QUANTITY_OF_HIA_PROXY_SELECT,
                                   rarely_amount=MAX_QUANTITY_OF_RARELY_USED_PROXY_SELECT,
                                   rarely_time=INTERVAL_TIME_OF_RARELY_USED_PROXY)
    vm = VerifyManager.VerifyManager(verify_queue=verify_queue,
                                     feedback_queue=usage_queue,
                                     sleep_time=SLEEPING_TIME_FOR_VERIFY_MANAGER,
                                     interval_time=INTERVAL_TIME_OF_VERIFY)
    cm = CrawlerManager.CrawlerManager(url_getter=Parser.xici_url_construction,
                                       data_parse=Parser.xici_parse,
                                       data_queue=appender_queue,
                                       feedback_queue=usage_queue,
                                       proxy_queue=getter_queue,
                                       maxsize_queue=MAXSIZE_OF_QUEUE,
                                       interval_time=SLEEPING_TIME_FOR_CRAWLER_MANAGER,
                                       base_time=BASE_TIMEOUT_WHILE_WAITING_PROXY,
                                       multiple_time=MULTIPLE_TIMEOUT_WHILE_WAITING_PROXY)
    sm = Process(target=modify_launcher, args=(getter_queue, 20))

    try:
        log_writer('pm start')
        pm.start()  # proxy manager start
        log_writer('vm start')
        vm.start()  # verify manager start
        log_writer('cm start')
        cm.start()  # crawler manager start
        log_writer('squid modifier start')
        sm.start()  # squid modifier start

        # The system controller -- only the 'exit' command is supported so far.
        while True:
            order = input()
            if order == 'exit':
                break
    finally:
        # Make sure every child process is torn down on exit or error.
        if pm.is_alive():
            pm.terminate()
        if vm.is_alive():
            vm.terminate()
        if cm.is_alive():
            cm.terminate()
        if sm.is_alive():
            sm.terminate()
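
# What modify_launcher actually does is not shown in this file. Below is a
# minimal, hypothetical sketch of the Squid-modifier process, assuming proxies
# arrive on getter_queue as 'host:port' strings and that Squid forwards
# traffic through cache_peer entries. The config path, batch handling, and
# refresh interval are illustrative assumptions, not the project's real code.

import subprocess
import time


def modify_launcher(getter_queue, amount):
    """Hypothetical sketch: periodically rewrite Squid's cache_peer list."""
    while True:
        proxies = []
        # Drain up to `amount` fresh proxies without blocking.
        while len(proxies) < amount and not getter_queue.empty():
            proxies.append(getter_queue.get())
        if proxies:
            peers = '\n'.join(
                'cache_peer %s parent %s 0 no-query proxy-only round-robin'
                % tuple(p.split(':')) for p in proxies)
            # Assumed include path; adjust to the local Squid layout.
            with open('/etc/squid/conf.d/peers.conf', 'w') as f:
                f.write(peers + '\n')
            # Reload Squid so the new peer list takes effect.
            subprocess.call(['squid', '-k', 'reconfigure'])
        time.sleep(60)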
# NOTE: this script targets Python 2 (urllib2, print statements, raw_input).
import urllib2, urllib, re, time, sys, os, os.path, math, random, json, multiprocessing, requests
from bs4 import BeautifulSoup

import Keywords, MultiProcessor, ProxyManager, ProxyFetcher, TaskLogic, Worker

# main
if __name__ == '__main__':
    # change keywords_path, nothing else
    keywords_path = "/Users/fabianschneider/Desktop/writing/keywords-amz/legacy/"

    # logic go
    kw = Keywords.Keywords()

    # fetch current proxies
    ttt = time.time()
    print '\n\n\n> Preparing proxies...'
    proxy = ProxyManager.ProxyManager()
    proxy_list = ProxyFetcher.get_list(proxy)  # renamed from `list` to avoid shadowing the builtin
    proxy.add_all(proxy_list)
    ttte = time.time()
    print '> > Completed. %i proxies available (took %is).\n\n\n' % (len(proxy.list), (ttte - ttt))

    # bases
    baseline = raw_input("What's the baseline product?\n=> ")
    depth = int(raw_input("How deep do you wanna crawl? Rec. 1-2\n=> "))
    amount = int(raw_input(
        "How many crawlers do you wanna enslave? Rec. 8ish\n"
        "If you don't know what you're doing, please enter 0 to let the code figure it out.\n=> "
    ))

    time_start = time.time()
    print 'Alright cool. Getting to work.\n\n\n'
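
    # A hedged sketch of the "enter 0" branch -- the actual auto-sizing logic
    # presumably lives in MultiProcessor or further down this script. The
    # heuristic below (twice the CPU count, capped by the number of available
    # proxies) is an assumption for illustration only.
    if amount == 0:
        amount = min(2 * multiprocessing.cpu_count(), max(1, len(proxy.list)))
        print '> Auto-selected %i crawlers.' % amount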