Example #1
 def getImagesFromBaidu(self, query):
     baidu_crawler = BaiduImageCrawler()
     baidu_crawler.crawl(keyword=query,
                         offset=0,
                         max_num=self.num_of_images,
                         min_size=(self.min_width, self.min_height),
                         max_size=None)
Example #2
def test_baidu():
    img_dir = osp.join(test_dir, 'baidu')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(
        downloader_threads=2, storage={'root_dir': img_dir})
    baidu_crawler.crawl('cat', filters=search_filters, max_num=5)
    shutil.rmtree(img_dir)
Example #3
def test_baidu():
    img_dir = osp.join(test_dir, 'baidu')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(downloader_threads=2,
                                      storage={'root_dir': img_dir})
    baidu_crawler.crawl('cat', filters=search_filters, max_num=5)
    shutil.rmtree(img_dir)
Example #4
def exe_crawl(arg):
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameGoogleDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    google_crawler.crawl(keyword=f'{arg.keyword}',
                         filters=filters,
                         offset=0,
                         max_num=arg.max,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}',
                       filters=filters,
                       offset=0,
                       max_num=arg.max)

    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
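The PrefixNameGoogleDownloader, PrefixNameBingDownloader, and PrefixNameBaiduDownloader classes used above are not shown in the snippet. A minimal sketch of what one of them might look like, following the get_filename override pattern from the icrawler documentation (the class body and the 'google_' prefix are assumptions, not the original code):

from icrawler import ImageDownloader

class PrefixNameGoogleDownloader(ImageDownloader):
    # Hypothetical: prepend the engine name so that files crawled from
    # different engines can share a directory without name clashes.
    def get_filename(self, task, default_ext):
        filename = super(PrefixNameGoogleDownloader, self).get_filename(
            task, default_ext)
        return 'google_' + filename

The Bing and Baidu variants would differ only in the prefix string.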
Example #5
def crawl(
        folder: str,
        search: str,
        maxnum: int,
        crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images"""
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f'    -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})

            google_crawler.crawl(keyword=search,
                                 offset=0,
                                 max_num=maxnum,
                                 min_size=(200, 200),
                                 max_size=None,
                                 file_idx_offset=0)

        if c == 'BING':
            bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                            log_level=logging.CRITICAL,
                                            downloader_threads=4,
                                            storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search,
                               filters=None,
                               offset=0,
                               max_num=maxnum,
                               file_idx_offset='auto')

        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                              log_level=logging.CRITICAL,
                                              storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search,
                                offset=0,
                                max_num=maxnum,
                                min_size=(200, 200),
                                max_size=None,
                                file_idx_offset='auto')

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
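Examples #5 and #25 pass a CustomDownloader whose class-level registry is read at the end, but the class itself is not included. A minimal sketch under the assumption that registry maps each saved filename to its source URL (the original project's mapping may differ):

from icrawler import ImageDownloader

class CustomDownloader(ImageDownloader):
    # Hypothetical: class-level dict shared by all downloader threads.
    registry = {}

    def get_filename(self, task, default_ext):
        filename = super(CustomDownloader, self).get_filename(task, default_ext)
        CustomDownloader.registry[filename] = task['file_url']
        return filename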
Example #6
def baidu_crawl(keyword):
    baidu_crawler = BaiduImageCrawler(
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=32,
        storage={'root_dir': './crawl_img'})
    baidu_crawler.crawl(keyword=keyword, offset=0, max_num=50,
                        min_size=(400, 400), max_size=None)
Example #7
def test_baidu(logo):
    baidu_crawler = BaiduImageCrawler(
        downloader_cls=MyImageDownloader,
        downloader_threads=4,
        storage={'root_dir': os.path.join(root, logo, 'baidu')},
        log_level=logging.INFO,
        filename=os.path.join(root, logo, 'baidu.txt'))
    baidu_crawler.crawl(logo, max_num=args.maxnum)
Example #8
def start_crawler(path: Path, search_text, num_images):

    crawler = BaiduImageCrawler(feeder_threads=2,
                                parser_threads=2,
                                downloader_threads=4,
                                storage={'root_dir': path})

    crawler.crawl(keyword=search_text, min_size=(64, 64), max_num=num_images)
Example #9
 def doBaidu(self):
     for keyword in self.keywords:
         baidu_storage = {'root_dir': '%s/baidu-%s/' % (self.dest_dir, keyword)}
         if os.path.exists(baidu_storage['root_dir']):
             continue
         baidu_crawler = BaiduImageCrawler(parser_threads=self.thread_parser,
                                           downloader_threads=self.thread_downloader,
                                           storage=baidu_storage)
         baidu_crawler.crawl(keyword=keyword,
                             max_num=100000)
Example #10
def crawl_baidu(folder: str, search: str, maxnum: int, num_threads: int):
    baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                      log_level=logging.CRITICAL,
                                      downloader_threads=num_threads,
                                      storage={'root_dir': folder})
    baidu_crawler.crawl(keyword=search,
                        offset=0,
                        max_num=maxnum,
                        min_size=(200, 200),
                        max_size=None,
                        file_idx_offset='auto')
Example #11
def test_baidu(dir, keyword):
    keyword = keyword.replace(': flickr.com', '')
    print('Starting Baidu crawler for', keyword)
    baidu_crawler = BaiduImageCrawler(
        # parser_threads=16,
        # downloader_threads=16,
        downloader_cls=Base64NameDownloader,
        storage={'root_dir': dir},
        log_level=logging.DEBUG)
    baidu_crawler.crawl(keyword=keyword,
                        offset=0,
                        max_num=1000,
                        min_size=None,
                        max_size=None)
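Base64NameDownloader is referenced above but not defined in the snippet. A plausible sketch, assuming the intent is to derive a stable filename from the image URL itself so that repeated crawls overwrite rather than duplicate (the truncation length is an arbitrary choice here):

import base64

from icrawler import ImageDownloader

class Base64NameDownloader(ImageDownloader):
    # Hypothetical: name each file after the URL-safe base64 encoding of
    # its source URL, truncated to keep filenames short.
    def get_filename(self, task, default_ext):
        encoded = base64.urlsafe_b64encode(
            task['file_url'].encode('utf-8')).decode('ascii')
        return '{}.{}'.format(encoded[:32], default_ext)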
Example #12
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords,
                             offset=0,
                             max_num=amount,
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)

    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords,
                           offset=0,
                           max_num=amount,
                           min_size=None,
                           max_size=None)

    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords,
                            offset=0,
                            max_num=amount,
                            min_size=None,
                            max_size=None)

    else:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords,
                             offset=0,
                             max_num=amount,
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords,
                           offset=0,
                           max_num=amount,
                           min_size=None,
                           max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords,
                            offset=0,
                            max_num=amount,
                            min_size=None,
                            max_size=None)
Example #13
def crawel_auto(search_word, get_num, dir_name):
    print("Googleのクローリングを開始しました。")
    # Google
    googleCrawler = GoogleImageCrawler(storage={"root_dir": f'{dir_name}/google'})
    googleCrawler.crawl(keyword=search_word, max_num=get_num)

    print("Baiduのクローリングを開始しました。")
    #Baidu
    baiduCrawler = BaiduImageCrawler(storage={"root_dir": f'{dir_name}/baidu'})
    baiduCrawler.crawl(keyword=search_word, max_num=get_num)

    print("Bingのクローリングを開始しました。")
    Bing
    bingCrawler = BingImageCrawler(storage={"root_dir": f'{dir_name}/bing'})
    bingCrawler.crawl(keyword=search_word, max_num=get_num)
Example #14
def Crawl_Image(key_word, raw_folder=RAW_FOLDER):
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=6,
        storage={'root_dir': raw_folder + key_word})

    google_crawler.crawl(keyword=key_word, offset=0, max_num=1000,
                         min_size=None, max_size=None, file_idx_offset=0)
    bing_crawler = BingImageCrawler(downloader_threads=6, storage={'root_dir': raw_folder + key_word})
    bing_crawler.crawl(keyword=key_word, filters={'type': 'photo'}, offset=0, max_num=1000)

    baidu_crawler = BaiduImageCrawler(storage={'root_dir': raw_folder + key_word})
    baidu_crawler.crawl(keyword=key_word, offset=0, max_num=1000,
                        min_size=None, max_size=None)
Example #15
def image_crawler(baidu_path, bing_path, number_of_image, image_key_words):
    baidu_storage = {'root_dir': baidu_path}
    bing_storage = {'root_dir': bing_path}

    baidu_crawler = BaiduImageCrawler(parser_threads=8,
                                      downloader_threads=8,
                                      storage=baidu_storage)

    bingcrawler = BingImageCrawler(parser_threads=8,
                                   downloader_threads=8,
                                   storage=bing_storage)
    baidu_crawler.crawl(keyword=image_key_words,
                        max_num=number_of_image,
                        min_size=(200, 200))
    bingcrawler.crawl(keyword=image_key_words,
                      max_num=number_of_image,
                      min_size=(200, 200))
Example #16
def baidu_bing_crwal(key_words=['中国人'], max_nums=[1000], save_root=r'./'):
    # the default keyword '中国人' means 'Chinese person'
    assert len(key_words) == len(max_nums), "key_words and max_nums must have the same length"
    # crawl with both search engines together
    save_root1 = os.path.join(save_root, 'baidu')
    # Baidu crawler
    for i in range(len(key_words)):
        print('-' * 20)
        image_save_root = os.path.join(save_root1, str(i))

        if not os.path.exists(image_save_root):
            os.makedirs(image_save_root)

        storage = {'root_dir': image_save_root}
        crawler = BaiduImageCrawler(storage=storage)
        crawler.crawl(key_words[i], max_num=max_nums[i])

    # Bing crawler
    save_root2 = os.path.join(save_root, 'bing')
    for i in range(len(key_words)):
        print('-' * 20)
        image_save_root = os.path.join(save_root2, str(i))

        if not os.path.exists(image_save_root):
            os.makedirs(image_save_root)

        storage = {'root_dir': image_save_root}
        crawler = BingImageCrawler(storage=storage)

        crawler.crawl(key_words[i], max_num=max_nums[i])
    return
Example #17
def ICL(key='kutkop', qty=100, out_dir='/content/images'):
    '''ICL('kutkop', 100, '/content/images')'''
    
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': out_dir })
 
    filters = dict(
        size='medium',
        # color='orange',
        # license='commercial,modify',
        # date=((2017, 1, 1), (2017, 11, 30)),
    )
    google_crawler.crawl(
        keyword=key, 
        filters=filters, 
        offset=0, 
        max_num=qty,
        min_size=(400,400), 
        max_size=None, 
        file_idx_offset=0,
        )
    bing_crawler = BingImageCrawler(
        downloader_threads=4,
        storage={'root_dir': out_dir }
        )
    bing_crawler.crawl(
        keyword=key, 
        filters=None, 
        offset=0, 
        max_num=qty
        )
    baidu_crawler = BaiduImageCrawler(
        storage={'root_dir': out_dir }
        )
    baidu_crawler.crawl(
        keyword=key, 
        offset=0, 
        max_num=qty,
        min_size=(200,200), 
        max_size=None
        )
Example #18
def download_img(key, num):

    if not os.path.exists("./bing/" + key):
        os.makedirs("./bing/" + key)
    if not os.path.exists("./baidu/" + key):
        os.makedirs("./baidu/" + key)

    bing_storage = {'root_dir': './bing/' + key}
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage=bing_storage)
    bing_crawler.crawl(keyword=key, max_num=num)

    # Baidu image crawler
    baidu_storage = {'root_dir': './baidu/' + key}
    baidu_crawler = BaiduImageCrawler(parser_threads=2,
                                      downloader_threads=4,
                                      storage=baidu_storage)
    baidu_crawler.crawl(keyword=key, max_num=num)
Example #19
def Query(query, verb, google=True, google_year=1, bing=True, baidu=True):
    SAVE_DIR = os.path.join(ROOT_DIR, verb)
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # SAVE_DIR = os.path.join(ROOT_DIR, query)
    if google:
        google_path = os.path.join(SAVE_DIR, 'Google')
        if not os.path.exists(google_path):
            os.makedirs(google_path)
        google_crawler = GoogleImageCrawler(feeder_threads=1,
                                            parser_threads=1,
                                            downloader_threads=4,
                                            storage={'root_dir': google_path})
        now_year = 2018
        for past_year in range(google_year):
            from_year = now_year - past_year
            filters = dict(license='noncommercial,modify',
                           date=((from_year, 1, 1), (from_year, 12, 30)))
            google_crawler.crawl(keyword=query,
                                 filters=filters,
                                 max_num=1000,
                                 file_idx_offset='auto')

    if bing:
        bing_crawler = BingImageCrawler(
            downloader_threads=4,
            storage={'root_dir': os.path.join(SAVE_DIR, 'Bing')})
        filters_bing = dict(
            # size='large',
            # color='orange',
            license='noncommercial,modify')
        bing_crawler.crawl(keyword=query,
                           filters=filters_bing,
                           offset=0,
                           max_num=1000)

    if baidu:
        baidu_crawler = BaiduImageCrawler(
            storage={'root_dir': os.path.join(SAVE_DIR, 'Baidu')})
        baidu_crawler.crawl(keyword=query, offset=0, max_num=1000)
Example #20
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)

    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    else:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
Example #21
def init_crawler(path, crawler=None, nthreads=4):
    assert crawler is not None, 'crawler must not be None.'
    if crawler in ['google']:
        m_crawler = GoogleImageCrawler(downloader_threads=nthreads,
                                       storage={'root_dir': path},
                                       log_level=logging.INFO)
    elif crawler in ['bing']:
        m_crawler = BingImageCrawler(storage={'root_dir': path},
                                     log_level=logging.INFO)
    elif crawler in ['baidu']:
        m_crawler = BaiduImageCrawler(downloader_threads=nthreads,
                                      storage={'root_dir': path})
    return m_crawler
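A usage sketch for the factory above (the path, engine name, and keyword are illustrative, not from the original project):

crawler = init_crawler('./images/cat', crawler='baidu', nthreads=4)
crawler.crawl(keyword='cat', max_num=20)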
Example #22
def crawl(storage_dir, age_range, age_interval, num_img_per_class):
    y = [x for x in range(age_range[0], age_range[1] + 1, age_interval)]
    if y[0] == 0:
        y[0] = 1
    image_number = [num_img_per_class for i in range(4)]
    # two crawlers, one each for the Google and Baidu search engines
    google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4)
    baidu_crawler = BaiduImageCrawler(parser_threads=2, downloader_threads=4)
    for c in y:
        keyword_en = str(c) + " year old"  # English keyword
        keyword_chn = str(c) + "岁"  # Chinese keyword ("岁" means "years old")
        # set the sub-directory for this age group
        storage = storage_dir + str(c)
        google_crawler.storage.root_dir = storage
        baidu_crawler.storage.root_dir = storage
        # crawl
        google_crawler.crawl(keyword=keyword_en,
                             file_idx_offset=0,
                             max_num=image_number[0],
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
        google_crawler.crawl(keyword=keyword_chn,
                             file_idx_offset=image_number[0],
                             max_num=image_number[1],
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
        baidu_crawler.crawl(keyword=keyword_en,
                            file_idx_offset=sum(image_number[0:2]),
                            max_num=image_number[2],
                            min_size=None,
                            max_size=None)
        baidu_crawler.crawl(keyword=keyword_chn,
                            file_idx_offset=sum(image_number[0:3]),
                            max_num=image_number[3],
                            min_size=None,
                            max_size=None)
Example #23
def test_baidu():
    img_dir = osp.join(test_dir, 'baidu')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(downloader_threads=2,
                                      storage={'root_dir': img_dir})
    baidu_crawler.crawl('color manga face', max_num=500)
Example #24
File: crawl.py  Project: JM-221/webimg
def test_baidu():
    print('start testing BaiduImageCrawler')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(downloader_threads=4,
                                      storage={'root_dir': 'images/baidu'})
    baidu_crawler.crawl('cat', filters=search_filters, max_num=10)
Example #25
def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: [List[str]] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images"""
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f"    -> {c}")
        if c == "GOOGLE":
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )

            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )

        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )

        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print(
                    "Error: Flickr crawler requires FLICKR_API_KEY environment variable"
                    " to be set with your non-secret API key.")
                exit(-1)

            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
Example #26
def test_baidu():
    print('start testing BaiduImageCrawler')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(
        downloader_threads=30, storage={'root_dir': 'images/demon_slayer'})
    baidu_crawler.crawl('tanjiro', filters=search_filters, max_num=50)
Example #27
for keyword in keywords:

    save_path = 'D:/Korean Celeb Data/' + keyword

    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': save_path + '/google'})

    filters = dict(type="face")

    google_crawler.crawl(keyword=keyword,
                         filters=filters,
                         offset=0,
                         max_num=1000,
                         min_size=(200, 200),
                         max_size=None,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(downloader_threads=4,
                                    storage={'root_dir': save_path + '/bing'})
    bing_crawler.crawl(keyword=keyword, filters=None, offset=0, max_num=1000)

    baidu_crawler = BaiduImageCrawler(
        storage={'root_dir': save_path + '/baidu'})
    baidu_crawler.crawl(keyword=keyword,
                        offset=0,
                        max_num=1000,
                        min_size=(200, 200),
                        max_size=None)
Example #28
from icrawler.builtin import BaiduImageCrawler
keyword = input("输入你想找图片的标签:")
google_storage = {'root_dir': r'./data'}
google_crawler = BaiduImageCrawler(parser_threads=4,
                                   downloader_threads=4,
                                   storage=google_storage)
google_crawler.crawl(keyword=keyword, max_num=1000)
Example #29
def test_baidu():
    baidu_crawler = BaiduImageCrawler(downloader_threads=4,
                                      storage={'root_dir': 'images/baidu'})
    baidu_crawler.crawl('bird', max_num=10)
Example #30
File: crawl.py  Project: hellock/icrawler
def test_baidu():
    print('start testing BaiduImageCrawler')
    search_filters = dict(size='large', color='blue')
    baidu_crawler = BaiduImageCrawler(
        downloader_threads=4, storage={'root_dir': 'images/baidu'})
    baidu_crawler.crawl('cat', filters=search_filters, max_num=10)
Example #31
#!/usr/bin/env python3
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler
import sys
import time

keywords = sys.argv[1]
print('crawling search engines for images with description %s...' % keywords)
time.sleep(2)

google_crawler = GoogleImageCrawler(parser_threads=4,
                                    downloader_threads=8,
                                    storage={'root_dir': 'qrbooty/google'})
bing_crawler = BingImageCrawler(parser_threads=4,
                                downloader_threads=8,
                                storage={'root_dir': 'qrbooty/bing'})
baidu_crawler = BaiduImageCrawler(parser_threads=4,
                                  downloader_threads=8,
                                  storage={'root_dir': 'qrbooty/baidu'})

google_crawler.crawl(keyword=keywords, offset=0, max_num=1000)
bing_crawler.crawl(keyword=keywords, offset=0, max_num=1000)
baidu_crawler.crawl(keyword=keywords, offset=0, max_num=1000)

print('qrcrawler done.\n')
Example #32

# Google image crawler
google_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/google'}
google_crawler = GoogleImageCrawler(parser_threads=4,
                                   downloader_threads=4,
                                   storage=google_storage)
google_crawler.crawl(keyword='beauty',
                     max_num=10)


# Bing image crawler
bing_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/bing'}
bing_crawler = BingImageCrawler(parser_threads=2,
                                downloader_threads=4,
                                storage=bing_storage)
bing_crawler.crawl(keyword='beauty',
                   max_num=10)


# Baidu image crawler
baidu_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/baidu'}

baidu_crawler = BaiduImageCrawler(parser_threads=2,
                                  downloader_threads=4,
                                  storage=baidu_storage)
baidu_crawler.crawl(keyword='美女',  # '美女' means 'beauty'
                    max_num=10)


Example #33
    google_crawler.crawl(keyword=(query), max_num=(num), file_idx_offset=0)

elif engine in ('Bing', 'bing'):
    bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader,
                                    feeder_threads=1,
                                    parser_threads=1,
                                    downloader_threads=4,
                                    storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    bing_crawler.crawl(keyword=(query), filters=None, offset=0, max_num=(num))

elif engine in ('Baidu', 'baidu'):
    baidu_crawler = BaiduImageCrawler(downloader_cls=MyImageDownloader,
                                      feeder_threads=1,
                                      parser_threads=1,
                                      downloader_threads=4,
                                      storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    baidu_crawler.crawl(keyword=(query),
                        offset=0,
                        max_num=(num),
                        min_size=(200, 200),
                        max_size=None)

elif engine in ('Greedy', 'greedy'):
    greedy_crawler = GreedyImageCrawler(downloader_cls=MyImageDownloader,
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=4,