def main():
    # logdir and port are read from the command line.
    global num_values
    db_name = "tensorview.db"
    parser = argparse.ArgumentParser(
        description="Parses relevant parameters for tensorview")
    parser.add_argument('--port', dest='port', type=int, default=6886,
                        help="Port number to open server in")
    parser.add_argument(
        '--logdir', dest='dir',
        help="Log directory to obtain tensorflow event files from")
    parser.add_argument(
        '-n', dest='num', type=int, default=100,
        help="Metric parameters are aggregated from N most recent iterations")
    args = parser.parse_args()
    port = args.port
    logdir = args.dir
    num_values = args.num

    # Rebuild the database if a log directory was provided.
    rebuild = (logdir is not None)
    experiments = None
    if rebuild:
        experiments = Crawler().crawl(logdir)
    Database.initialize_database(db_name, experiments, rebuild)
    app.run(debug=True, port=port, use_reloader=False)
def test_crawler(self):
    # Start from a clean slate: remove artifacts left by previous runs.
    self.config.crawler.linksA.unlink(missing_ok=True)
    self.config.crawler.linksB.unlink(missing_ok=True)
    self.config.crawler.doc_id_mapping.unlink(missing_ok=True)
    self.config.crawler.doc_raw.unlink(missing_ok=True)
    self.config.crawler.bloom_filter.unlink(missing_ok=True)
    self.config.counter.unlink(missing_ok=True)

    crawler = Crawler(self.config)
    crawler.fetch()

    with open(self.config.crawler.doc_id_mapping, "r") as f:
        lines = f.read().splitlines()
        [id, u] = lines[0].split("\t")
        self.assertEqual(id, "1")
        self.assertEqual(u, self.config.crawler.seed_url)

    with open(self.config.crawler.linksB, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) > 10)

    with open(self.config.crawler.linksA, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) == 0)

    with open(self.config.crawler.doc_raw, "r") as f:
        lines = f.read().splitlines()
        [id, size, content] = lines[0].split("\t")
        self.assertEqual("1", id)
        self.assertTrue(len(content) > 0)

    crawler.fetch()

    with open(self.config.crawler.linksA, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) > 10)

    crawler.bloom_filter.dump()
def test_file_links(self):
    self.start_server(TestCrawler.FILE_LINKS_HTML)
    crawler = Crawler(TestCrawler.SERVER)
    crawler.crawl()
    expected = '''[
  {
    "assets": [
      "http://127.0.0.1:5000/source.pdf",
      "http://127.0.0.1:5000/source.txt"
    ],
    "url": "http://127.0.0.1:5000/"
  },
  {
    "assets": [],
    "url": "http://127.0.0.1:5000/source.pdf"
  },
  {
    "assets": [],
    "url": "http://127.0.0.1:5000/source.txt"
  }
]'''
    self.assertMultiLineEqual(expected, crawler.assets_json())
def test_crawl_max_depth(self):
    dbschema = "test_crawl_max_depth"
    self.setup_db(dbschema)
    session = self.Session()

    url = 'http://zero.webappsecurity.com/'
    url_list = ['http://zero.webappsecurity.com/']
    link = Link(url)
    session.add(link)
    session.commit()

    max_depth = 1
    c = Crawler(0, 'crawler', 'abc123', 'localhost', 0, dbschema, max_depth)
    result = c.crawl()

    link_list = session.query(Link)
    session.close()

    self.assertEqual(result, 0)
    self.assertEqual(len([l for l in link_list]), len(url_list))
    for link in link_list:
        self.assertTrue(link.url in url_list)
        self.assertEqual(link.status, Status.visited)
def run(nombre_producto, bool_teleg, modo_headless, seg_dormidos,
        prec_min=0, prec_max=20000, num_max_productos=50):
    # Seconds to sleep between searches (3600 s = 1 hour).
    segundos_dormidos = seg_dormidos
    options = Options()
    # Run headless unless the user passed 'n'.
    options.headless = modo_headless != 'n'
    crawl = Crawler(options)
    crawl.run(nombre_producto, bool_teleg, prec_min, prec_max,
              num_max_productos, sleep_time=segundos_dormidos)
def test_crawl_product_found(self):
    dbschema = "test_crawl_product_found"
    self.setup_db(dbschema)
    session = self.Session()

    url = 'http://www.epocacosmeticos.com.br/212-vip-rose-eau-de-parfum-carolina-herrera-perfume-feminino/p'
    title = 'Perfume 212 VIP Rosé Carolina Herrera Feminino - Época Cosméticos'
    name = '212 VIP Rosé Carolina Herrera - Perfume Feminino - Eau de Parfum'
    link = Link(url)
    session.add(link)
    session.commit()

    c = Crawler(0, 'crawler', 'abc123', 'localhost', 0, dbschema, 1)
    result = c.crawl()

    product_list = session.query(Product)
    session.close()

    self.assertEqual(result, 0)
    for product in product_list:
        self.assertEqual(product.url, url)
        self.assertEqual(product.title, title)
        self.assertEqual(product.name, name)
def bdworker(words, fromdate):
    crawler = Crawler()
    bd_index = 0
    length = 0
    filename = "output/baidu_" + fromdate + "_" + words + "_" + time.strftime(
        '%Y年%m月%d日%H时%M分%S秒', time.localtime(time.time())) + '.xls'
    # Page through Baidu results, 20 entries at a time, until no new results arrive.
    while True:
        length = len(crawler.realResults)
        crawler.bdrun(words, fromdate, bd_index)
        if crawler.bd_result == False:
            break
        if len(crawler.realResults) == length:
            break
        bd_index += 20
        print len(crawler.realResults)
    saveToFile(crawler, filename)
def Foo(page_list):
    url = 'https://piyao.kepuchina.cn/rumor/rumorlist'
    parser = KepuzhongguoParser()
    title_parser = TitleParser()
    ua = UserAgent()
    headers = {
        "Origin": url,
        'User-Agent': ua.random,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        # "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "text/html; charset=utf-8",
    }
    crawler = Crawler()
    title_list = []
    for page in page_list:
        # Fetch the list page, then fetch the title of every article it links to.
        data = {'type': 0, 'keyword': 0, 'page': page}
        url_list = crawler.crawling_ke(url, rule_model=parser, data=data, headers=headers)
        for url_1 in url_list:
            title = crawler.crawling_ke(url_1, rule_model=title_parser, headers=headers)
            title_list.append(title)
    return title_list
def add_doc(request):
    if request.method == "POST":
        urls_for_indexing = []
        uv = URLValidator(schemes=['http', 'https'])

        # URLs pasted into the form, separated by ", ".
        urls_from_form = request.POST.get('url')
        if urls_from_form:
            list_urls = urls_from_form.split(", ")
            for url in list_urls:
                try:
                    uv(url)
                except ValidationError:
                    continue
                urls_for_indexing.append(url)

        # URLs uploaded as a file, one per line.
        file_with_urls = request.FILES.get('file_url')
        if file_with_urls:
            for url in file_with_urls:
                url = url.strip()
                try:
                    uv(url)
                except ValidationError:
                    continue
                urls_for_indexing.append(url)

        if len(urls_for_indexing):
            crawler = Crawler(urls_for_indexing, width=20, deep=3)
            crawler.craaawl()
            text = 'Finished'
        else:
            text = 'Invalid URL'
    else:
        text = ''
    return render(request, 'search/add_doc.html', {'text': text})
def run(self):
    # Generate crawler processes.
    print 'Start crawlers processes...'
    self.__crawlers = [
        Crawler(i, self.GPR, self.__crawlers_queue)
        for i in range(NUMBER_OF_CRAWLERS)
    ]
    print 'Done.'

    self.__crawlers_record = []
    for i in range(NUMBER_OF_CRAWLERS):
        # Put crawlers into the available queue.
        self.__crawlers_queue.put(i)
        # Start crawler job.
        self.__crawlers[i].start()
        self.__crawlers_record.append(datetime.datetime.now())
    print 'Ready to accept request.'

    # Wait for requests and dispatch each one to a crawler process.
    while True:
        req = self.__request_queue.get()
        done = False
        while not done:
            # Get an available crawler process id.
            id = None
            try:
                id = self.__crawlers_queue.get(timeout=MAX_WAITING_TIME)
                # Reset this crawler's last crawl time.
                self.__crawlers_record[id] = datetime.datetime.now()
            except Q.Empty:
                # No crawler is available; check on the crawler processes.
                self._check_crawlers()
            if id is not None:
                self.__crawlers[id].add_request(req)
                done = True
def angiecr(url, fixed, dictionary, custom_set, minus, mayus, numbers, special,
            size, nthreads):
    cr = Crawler()
    if url is not None:
        cr.set_url(url)
        cr.set_size(size)
        cr.set_isFixed(fixed == 1)
        if nthreads != 1:
            cr.set_nthreads(nthreads)
        if dictionary is not None:
            # Dictionary attack.
            cr.set_custom_dict_path(dictionary)
            cr.read_custom_dict()
            cr.begin_crawl(1)
        elif custom_set is not None:
            # Brute force with a user-provided character set.
            cr.set_custom_set_path(custom_set)
            cr.read_custom_set()
            cr.begin_crawl(0)
        elif minus != 0 or mayus != 0 or numbers != 0 or special != 0:
            # Brute force with the selected built-in character sets.
            cr.select_sets(minus == 1, mayus == 1, numbers == 1, special == 1)
            cr.begin_crawl(0)
        else:
            print(
                "Error: Un ataque de fuerza bruta precisa de un set de caracteres. "
                "Selecciona minus, mayus, numbers o especial. También puedes construir "
                "un custom set o realizar un ataque de diccionario (--help)."
            )
    else:
        print(
            'Error: falta la url introducida. Para mas información ejecutar con --help'
        )
def getCommunityInfo():
    locationMapping = LocationMapping()
    locationMappingToInt = LocationMappingToInt()
    crawler = Crawler(locationMapping, locationMappingToInt)
    houseInfoParser = HouseInfoParser()
    dao = DAO()

    lianjiaSiteName = 'http://sh.lianjia.com'
    lianjiaXiaoqu = 'xiaoqu'
    count = 0
    # Walk the first 100 listing pages.
    for i in xrange(100):
        crawUrl = lianjiaSiteName + '/' + lianjiaXiaoqu + '/d' + repr(i + 1) + 'rs'
        communityResp = crawler.crawCommunity(crawUrl)
        if len(communityResp):
            communityInfo = houseInfoParser.parseCommunityHttpResponse(communityResp)
            if not len(communityInfo):
                break
            for j in xrange(len(communityInfo)):
                # Skip communities that are already stored.
                querySQL = "SELECT * FROM community where community_name='%s'" % communityInfo[j].strip()
                if dao.queryForExistence(querySQL):
                    continue
                count = count + 1
                insertSQL = "INSERT INTO community(community_name, community_code) VALUES ('%s', %d)" % (
                    communityInfo[j].strip(), count)
                dao.insert(insertSQL)
    dao.close()
async def echo(message: types.Message):
    # old style:
    # await bot.send_message(message.chat.id, message.text)
    crawler = Crawler()
    soup = crawler.get_soup(WORLDOMETER)

    # Scrape the per-country table into a list of rows.
    data = []
    table = soup.select_one('table#main_table_countries_today')
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for table_row in rows:
        columns = table_row.findAll('td')
        output_row = []
        for column in columns:
            output_row.append(column.text.strip().lower())
        data.append(output_row)

    print(message.from_user.full_name)
    print(message.text)
    try:
        result = data[index_2d(data, message.text)[0]]
        await message.answer(
            'Latest Stat from WORLDOMETER for country %s \n'
            'Total Cases : %s \n'
            'New Cases : %s \n'
            'Total Deaths : %s \n'
            'New Deaths : %s \n'
            'Total Recovered : %s \n'
            'Active Cases : %s \n'
            'Serious : %s \n'
            'Total Case per 1M Population : %s \n' %
            (result[0], result[1], result[2], result[3], result[4],
             result[5], result[6], result[7], result[8]))
        # await message.answer('\n'.join(data[index_2d(data, message.text)[0]]))
    except Exception:
        await message.answer(
            'please try other command or type country you want to get info')
def atualizar(bot, update, args):
    chat_id = update.message.chat_id
    if len(args) == 0:
        msg = "Matte kudasai!\nVocê não digitou sua senha.\nDigite /atualizar [Senha BU]."
        bot.send_photo(chat_id=chat_id, photo='http://static.tumblr.com/mxarhwc/kjylpbc32/yui_k-on.png.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
        return
    elif len(args[0]) < 4 or len(args[0]) > 6:
        msg = "A senha precisa ter de 4 a 6 digitos.\nDigite /atualizar [Senha BU]."
        bot.send_photo(chat_id=chat_id, photo='https://i.kym-cdn.com/photos/images/newsfeed/000/189/032/1319151441001.png')
        bot.send_message(chat_id=chat_id, text=msg)
        return
    elif not verificarNumeros(args[0]):
        msg = "Subaru-kun, a senha deve possuir apenas números."
        bot.send_photo(chat_id=chat_id, photo='https://img.fireden.net/v/image/1484/98/1484987967315.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
        return

    dataBase = Db()
    if dataBase.haveSelectUser(chat_id):
        matricula = dataBase.getMatricula(chat_id)
        crawler = Crawler()
        if crawler.crawler(matricula, args[0]):
            msg = "Atualizado!"
            main_menu_keyboard = [[telegram.KeyboardButton('/livros')]]
            reply_kb_markup = telegram.ReplyKeyboardMarkup(main_menu_keyboard,
                                                           resize_keyboard=True,
                                                           one_time_keyboard=True)
            bot.send_photo(chat_id=chat_id, photo='http://orig02.deviantart.net/cfa7/f/2012/259/9/e/mami_tomoe_render_by_moeblueberry1771-d5evnl7.png')
            bot.send_message(chat_id=chat_id, text=msg, reply_markup=reply_kb_markup)
        else:
            msg = "What isn't remembered never happened.\nMemory is merely a record.\nYou just need to re-write that record.\nMatricula ou senha inválida"
            bot.send_photo(chat_id=chat_id, photo='https://wired-7.org/lain/src/1558910195719.jpg')
            bot.send_message(chat_id=chat_id, text=msg)
    else:
        msg = "Is this a usuário não cadastrado?\nDigite /cadastrar [Matricula]."
        bot.send_photo(chat_id=chat_id, photo='https://assets3.thrillist.com/v1/image/2762016/size/tmg-article_default_mobile.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
def eventFC(crawlParams):
    seedURLs = crawlParams["seedURLs"]
    # Seed the frontier; scores are stored negated, so -1 sorts first.
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue

    eventModel = EventModel(crawlParams['No_Keywords'])
    eventModel.buildEventModel(crawlParams['model'], crawlParams['eventType'],
                               minTopicTermFreq=20,
                               minLocTermFreq=crawlParams['minLocTermFreq'],
                               minDateTermFreq=20)
    crawlParams['scorer'] = eventModel

    crawler = Crawler(crawlParams)
    qu = crawler.crawl()

    # Persist the returned queue as "score,url" lines.
    quS = '\n'.join([str(-1 * s[0]) + "," + s[1] for s in qu])
    with open('queueEvent.txt', 'w') as fw:
        fw.write(quS.encode('utf8'))
    return crawler.relevantPages
def __init__(self, job_params):
    self.task = job_params
    try:
        if self.task['crawl'] is True:
            c = Crawler(self.task)
            c.crawl()
        elif self.task['report'] is True:
            # crawtext.py report <project> [((--email=<email>| -e <email>) -u <user> -p <passwd>)| (-o <outfile> |--o=<outfile>)]
            Report(self.task)
        elif self.task['export'] is True:
            # crawtext.py export [results|sources|logs|queue] <project> [(-o <outfile> |--o=<outfile>)] [-t <type> | --type=<type>]
            Export(self.task)
        # elif self.task['extract'] is True:
        #     new method for extract every url
    except KeyError:
        print self.task["project"]
        print "Project %s not configured properly" % str(self.task["project"])
        s = Scheduler(self.task)
        s.delete()
        print "deleting project"
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        # Scroll until the lazy-loaded product grid is fully rendered.
        crawler.infinite_scroll()
        images = crawler.find_elements('img.grid-product__image')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        if gender == 'women':
            # The women's grid links model shots; switch to the product-only images.
            srcs = map(lambda src: src.replace('model', 'prod'), srcs)
        download_images(srcs, os.path.join('hollister', gender, type))

    # Men
    # get_photos_from_url('men', 'top', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=0&rows=90&filtered=true')
    # get_photos_from_url('men', 'top-1', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=90&rows=90&filtered=true')
    # get_photos_from_url('men', 'top-2', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=180&rows=90&filtered=true')

    # Women
    # get_photos_from_url('women', 'top', 'https://www.hollisterco.com/shop/wd/girls-tops/?search-field=&sort=newest&start=0&rows=90&filtered=true')
    get_photos_from_url(
        'women', 'top-1',
        'https://www.hollisterco.com/shop/wd/girls-tops/?search-field=&sort=newest&start=90&rows=90&filtered=true'
    )
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        images = crawler.find_elements('span.back > img')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        download_images(srcs, os.path.join('lebeige', gender, type))

    # Women
    get_photos_from_url(
        'women', 'shirts',
        'http://www.ssfshop.com/LEBEIGE/ssfshop/list?dspCtgryNo=SFMA41A02&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
    get_photos_from_url(
        'women', 'tshirts',
        'http://www.ssfshop.com/LEBEIGE/T-shirts/list?dspCtgryNo=SFMA41A01&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
    get_photos_from_url(
        'women', 'knitwear',
        'http://www.ssfshop.com/LEBEIGE/Knitwear/list?dspCtgryNo=SFMA41A03&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        images = crawler.find_elements(
            'a.product-tiles-grid-item-link '
            '> div.product-tiles-grid-item-image-wrapper '
            '> div.product-tiles-grid-item-image '
            '> img')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        download_images(srcs, os.path.join('gucci', gender, type))

    # Men
    # get_photos_from_url('men', 'shirts', 'https://www.gucci.com/kr/ko/ca/men/mens-ready-to-wear/mens-shirts-c-men-readytowear-shirts')

    # Women
    # get_photos_from_url('women', 'shirts', 'https://www.gucci.com/kr/ko/ca/women/womens-ready-to-wear/womens-tops-shirts-c-women-readytowear-tops-and-shirts')
    get_photos_from_url(
        'women', 'sweatshirts',
        'https://www.gucci.com/kr/ko/ca/women/womens-ready-to-wear/womens-sweatshirts-t-shirts-c-women-ready-to-wear-new-sweatshirts'
    )
def main(token_id, screen_name_path, wpath):
    start_time = datetime.now()
    print 'Started at:', start_time

    crawler = Crawler(token_id=token_id)
    screen_names = get_screen_names(screen_name_path)
    count = 0
    with open(wpath, 'a') as wf:
        for sn in screen_names:
            try:
                user = crawler.get_user(sn)
                wf.write(json.dumps(user) + '\n')
                count += 1
                print count, 'users got.'
            # except tweepy.RateLimitError:
            #     print 'Exceeds rate limit, waiting...'
            #     time.sleep(120)
            except KeyboardInterrupt:
                end_time = datetime.now()
                print 'Ended at:', end_time
                print 'Duration:', (end_time - start_time).total_seconds() / 3600., 'hours'
                exit()
            except Exception, ex:
                print ex
from multiprocessing import Pool
from pathlib import Path

import click
import pandas as pd
from tqdm import tqdm

from crawler import Crawler

crawler = Crawler(image_dir='./images',
                  data_dir='./data',
                  list_dir='./lists',
                  page_dir='./pages')


def crawl_page(thread, test):
    urls_pending, saved_pages = crawler.check_pending_pages(test)
    print("CRAWLING PAGE")
    print("=" * 60)
    print(f"downloaded {len(saved_pages)} pages")
    print(f"downloading remaining {len(urls_pending)} pages")
    print("=" * 60)

    # crawling
    if thread > 1:
        with Pool(thread) as p:
            _ = list(
                tqdm(p.imap_unordered(crawler.crawl_page, urls_pending),
                     total=len(urls_pending)))
    else:
        for url in tqdm(urls_pending):
            _ = crawler.crawl_page(url)
def test_issue(self):
    crawler = Crawler(self.example_issue)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Issue)
def __init__(self):
    self.redis = redisclient()
    # The other methods depend on these, so initialize them here as well.
    self.crawler = Crawler()
def test_repo(self):
    crawler = Crawler(self.example_repo)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Repository)
def test_wiki(self):
    crawler = Crawler(self.example_wiki)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Wiki)
def add_to_queue(i, q):
    # Worker: pull URLs off the queue and warm them.
    while True:
        url = q.get()
        crawler.warm_url(url)
        q.task_done()


# create threads
threads = int(os.environ.get('THREADS', 5))
threads = threads if (threads > 0 and threads < 10) else 5
for i in range(threads):
    worker = threading.Thread(target=add_to_queue, args=(i, QUEUE))
    worker.daemon = True
    worker.start()

# start crawling
started = time.time()
crawler = Crawler()
if args.sitemap is not None:
    offset = args.offset if (args.offset is not None) else 0
    crawler.sitemap_crawler(args.sitemap, args.count, offset)
else:
    crawler.google_crawler('ga:%i' % args.id, args.count)

# multithreaded cache warmer
delay = float(os.environ.get('DELAY', 500))
for url in crawler.urls:
    QUEUE.put(url)
    time.sleep(delay / 1000.0)

# finish the queue/threads
try:
    term = threading.Thread(target=QUEUE.join)
import threading
from queue import Queue

from util import getDomainName
from util import initFolder
from crawler import Crawler

NUM_SPIDERS = 12
DEPTH = 7
HOMEPAGE = 'https://vnexpress.net/'
DOMAIN_NAME = getDomainName(HOMEPAGE)

initFolder(".")
Crawler(DOMAIN_NAME, HOMEPAGE, DEPTH)

q = Queue()


def work():
    while True:
        url = q.get()
        Crawler.crawlPage(threading.currentThread().name, url)
        q.task_done()


# Create spider threads (will be terminated when main exits)
def createCrawlers():
    for spider in range(NUM_SPIDERS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each queued link is a new job
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self, url, targetPatterns):
    self.url = url
    self.targetPatterns = targetPatterns
    self.crawler = Crawler(self.url)
    self.results = []
import asyncio

from crawler import Crawler
from settings import RPS, START_URL

if __name__ == '__main__':
    # with pool ~14s (best score was 11.5)
    # without pool ~50s
    # sync 78s
    asyncio.run(Crawler(start_url=START_URL, rps=RPS).main())