Пример #1
0
def main():
    """
    Entry point of the whole proxy system.

    Wires four inter-process queues into the proxy, verify, crawler and
    squid-modifier workers, starts them all, then blocks on stdin until
    the operator types ``exit``.  On the way out every worker that is
    still alive is terminated *and joined*, so no zombie child process
    is left behind (the original code terminated without joining).
    :return: None
    """
    getter_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    appender_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    usage_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    verify_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)

    # Proxy manager: owns the DB connection and moves proxies between queues.
    pm = ProxyManager.ProxyManager(getter_queue=getter_queue,
                                   appender_queue=appender_queue,
                                   usage_queue=usage_queue,
                                   verify_queue=verify_queue,
                                   host=HOST,
                                   database=DATABASE,
                                   pwd=PASSWORD,
                                   user=USER,
                                   port=PORT,
                                   multiple_timeout=MULTIPLE_TIMEOUT_WHILE_NO_PROXY,
                                   hia_amount=MAX_QUANTITY_OF_HIA_PROXY_SELECT,
                                   rarely_amount=MAX_QUANTITY_OF_RARELY_USED_PROXY_SELECT,
                                   rarely_time=INTERVAL_TIME_OF_RARELY_USED_PROXY)

    # Verify manager: re-checks proxies and reports results on usage_queue.
    vm = VerifyManager.VerifyManager(verify_queue=verify_queue,
                                     feedback_queue=usage_queue,
                                     sleep_time=SLEEPING_TIME_FOR_VERIFY_MANAGER,
                                     interval_time=INTERVAL_TIME_OF_VERIFY)

    # Crawler manager: scrapes new proxies (xici site) into appender_queue.
    cm = CrawlerManager.CrawlerManager(url_getter=Parser.xici_url_construction,
                                       data_parse=Parser.xici_parse,
                                       data_queue=appender_queue,
                                       feedback_queue=usage_queue,
                                       proxy_queue=getter_queue,
                                       maxsize_queue=MAXSIZE_OF_QUEUE,
                                       interval_time=SLEEPING_TIME_FOR_CRAWLER_MANAGER,
                                       base_time=BASE_TIMEOUT_WHILE_WAITING_PROXY,
                                       multiple_time=MULTIPLE_TIMEOUT_WHILE_WAITING_PROXY)
    # Squid modifier: rewrites the squid config from getter_queue every 20s.
    sm = Process(target=modify_launcher, args=(getter_queue, 20))

    # Start order matters (pm first, sm last); shutdown follows the same order.
    workers = (('pm start', pm),
               ('vm start', vm),
               ('cm start', cm),
               ('squid modifier start', sm))
    try:
        for message, worker in workers:
            log_writer(message)
            worker.start()
        # the system controller --only func exit now
        while True:
            if input() == 'exit':
                break
    finally:
        for _, worker in workers:
            # is_alive() is False for workers that were never started, so a
            # partial start-up failure is handled safely as well.
            if worker.is_alive():
                worker.terminate()
                # join() reaps the terminated child -- without it the OS keeps
                # a zombie entry for each worker until the parent exits.
                worker.join()
Пример #2
0
import urllib2, urllib, re, time, sys, os, os.path, math, random, json, multiprocessing, requests
from bs4 import BeautifulSoup
import Keywords, MultiProcessor, ProxyManager, ProxyFetcher, TaskLogic, Worker

# main
if __name__ == '__main__':
    # NOTE(review): this script is Python 2 (print statements, raw_input,
    # urllib2 import) -- do not run under Python 3 without porting.
    # change keywords_path, nothing else
    # NOTE(review): hard-coded absolute user path; adjust per machine.
    keywords_path = "/Users/fabianschneider/Desktop/writing/keywords-amz/legacy/"

    # logic go
    kw = Keywords.Keywords()

    # fetch current proxies
    # Time the proxy-preparation phase for the progress message below.
    ttt = time.time()
    print '\n\n\n> Preparing proxies...'
    proxy = ProxyManager.ProxyManager()
    # NOTE(review): 'list' shadows the builtin of the same name for the rest
    # of this scope -- consider renaming (left unchanged here).
    list = ProxyFetcher.get_list(proxy)
    proxy.add_all(list)
    ttte = time.time()
    print '> > Completed. %i proxies available (took %is).\n\n\n' % (len(
        proxy.list), (ttte - ttt))

    # bases
    # Interactive configuration: seed product, crawl depth, and worker count
    # (entering 0 lets the code choose the worker count itself).
    baseline = raw_input("What's the baseline product?\n=> ")
    depth = int(raw_input("How deep do you wanna crawl? Rec. 1-2\n=> "))
    amount = int(
        raw_input(
            "How many crawlers do you wanna enslave? Rec. 8ish\nIf you don't know what you're doing, please enter 0 to let the code figure it out.\n=> "
        ))
    # Overall start time for the crawl (presumably reported when the crawl
    # finishes, in code past the end of this view).
    time_start = time.time()
    print 'Alright cool. Getting to work.\n\n\n'