def baidu_bing_crwal(key_words=['中国人'], max_nums=[1000], save_root=r'./'):
    """Crawl images for each keyword from both Baidu and Bing.

    For each keyword index ``i`` the results are stored under
    ``<save_root>/<engine>/<i>``.

    Args:
        key_words: list of search keywords (never mutated, so the shared
            default list is safe here).
        max_nums: per-keyword download caps; must match ``key_words`` in length.
        save_root: root directory for all downloads.
    """
    assert len(key_words) == len(max_nums), "关键词和数量必须一致"

    def _crawl_engine(crawler_cls, engine_dir):
        # One sub-directory per keyword index under save_root/engine_dir.
        engine_root = os.path.join(save_root, engine_dir)
        for i in range(len(key_words)):
            print('-' * 20)
            image_save_root = os.path.join(engine_root, str(i))
            if not os.path.exists(image_save_root):
                os.makedirs(image_save_root)
            crawler = crawler_cls(storage={'root_dir': image_save_root})
            crawler.crawl(key_words[i], max_num=max_nums[i])

    # The two original loops were identical apart from the crawler class
    # and target directory; run both engines through one helper.
    _crawl_engine(BaiduImageCrawler, 'baidu')  # Baidu crawl
    _crawl_engine(BingImageCrawler, 'bing')    # Bing crawl
    return
def test_baidu():
    """Smoke-test BaiduImageCrawler: fetch a few filtered images, then clean up."""
    img_dir = osp.join(test_dir, 'baidu')
    crawler = BaiduImageCrawler(
        downloader_threads=2,
        storage={'root_dir': img_dir})
    crawler.crawl('cat', filters=dict(size='large', color='blue'), max_num=5)
    # Remove the downloads so repeated runs start fresh.
    shutil.rmtree(img_dir)
def exe_crawl(arg):
    """Crawl Google, Bing and Baidu for ``arg.keyword`` into per-engine dirs.

    Expects ``arg`` to expose ``dict`` (base dir), ``keyword``, ``license``
    and ``max`` attributes (argparse-style namespace).
    """
    lic_filters = dict(license=f'{arg.license}')
    # Google: honours the licence filter.
    crawler_google = GoogleImageCrawler(
        downloader_cls=PrefixNameGoogleDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    crawler_google.crawl(keyword=f'{arg.keyword}', filters=lic_filters,
                         offset=0, max_num=arg.max, file_idx_offset=0)
    # Bing: same licence filter.
    crawler_bing = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    crawler_bing.crawl(keyword=f'{arg.keyword}', filters=lic_filters,
                       offset=0, max_num=arg.max)
    # Baidu: crawled without a filter argument.
    crawler_baidu = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    crawler_baidu.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
def getImg(keywords='', dirpath='', amount=0, source=4):
    """Download up to *amount* images matching *keywords* into *dirpath*.

    Args:
        keywords: search phrase.
        dirpath: destination directory for all engines.
        amount: maximum number of images per engine.
        source: 1 = Google only, 2 = Bing only, 3 = Baidu only,
            any other value = all three in sequence.
    """
    # The original duplicated each engine's code in the "all engines"
    # branch; nested helpers keep each engine's call in one place.
    def _google():
        print('\n--- 開始從「Google 圖片」下載---\n')
        crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,
                                     storage={'root_dir': dirpath})
        crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                      date_min=None, date_max=None, min_size=(200, 200),
                      max_size=None)

    def _bing():
        print('\n--- 開始從「Microsoft Bing」下載---\n')
        crawler = BingImageCrawler(downloader_threads=4,
                                   storage={'root_dir': dirpath})
        crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                      min_size=None, max_size=None)

    def _baidu():
        print('\n--- 開始從「百度」下載---\n')
        crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                      min_size=None, max_size=None)

    if source == 1:
        _google()
    elif source == 2:
        _bing()
    elif source == 3:
        _baidu()
    else:
        _google()
        _bing()
        _baidu()
def test_baidu():
    """Download five large blue 'cat' images from Baidu, then remove them."""
    target = osp.join(test_dir, 'baidu')
    opts = dict(size='large', color='blue')
    crawler = BaiduImageCrawler(downloader_threads=2,
                                storage={'root_dir': target})
    crawler.crawl('cat', filters=opts, max_num=5)
    shutil.rmtree(target)
def getImg(keywords='', dirpath='', amount=0, source=4):
    """Fetch up to *amount* images matching *keywords* into *dirpath*.

    *source* selects the engine: 1 Google, 2 Bing, 3 Baidu; any other
    value runs all three in sequence.
    """
    def _from_google():
        print('\n--- 開始從「Google 圖片」下載---\n')
        GoogleImageCrawler(
            parser_threads=2, downloader_threads=4,
            storage={'root_dir': dirpath}).crawl(
                keyword=keywords, offset=0, max_num=amount, date_min=None,
                date_max=None, min_size=(200, 200), max_size=None)

    def _from_bing():
        print('\n--- 開始從「Microsoft Bing」下載---\n')
        BingImageCrawler(
            downloader_threads=4, storage={'root_dir': dirpath}).crawl(
                keyword=keywords, offset=0, max_num=amount,
                min_size=None, max_size=None)

    def _from_baidu():
        print('\n--- 開始從「百度」下載---\n')
        BaiduImageCrawler(storage={'root_dir': dirpath}).crawl(
            keyword=keywords, offset=0, max_num=amount,
            min_size=None, max_size=None)

    if source == 1:
        _from_google()
    elif source == 2:
        _from_bing()
    elif source == 3:
        _from_baidu()
    else:
        # Default: run every engine, in the same order as the branches.
        _from_google()
        _from_bing()
        _from_baidu()
def getImagesFromBaidu(self, query):
    """Crawl Baidu Images for *query*, bounded by the instance's count and size limits."""
    crawler = BaiduImageCrawler()
    crawler.crawl(
        keyword=query,
        offset=0,
        max_num=self.num_of_images,
        # minimum acceptable dimensions come from the instance config
        min_size=(self.min_width, self.min_height),
        max_size=None)
def crawl(
        folder: str,
        search: str,
        maxnum: int,
        crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images.

    Args:
        folder: destination directory (created if missing).
        search: search phrase passed to every engine.
        maxnum: per-engine download cap, clamped to 1000.
        crawlers: engines to run, any of 'GOOGLE', 'BING', 'BAIDU'.
            The shared default list is safe: it is only iterated,
            never mutated.

    Returns:
        The ``CustomDownloader.registry`` entries whose key is not None.
    """
    # Fixed: annotation was `[List[str]]` (a list containing a type),
    # and an unused `sources = {}` local has been removed.
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000
    for c in crawlers:
        print(f' -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})
            google_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                 min_size=(200, 200), max_size=None,
                                 file_idx_offset=0)
        if c == 'BING':
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={'root_dir': folder})
            # 'auto' continues numbering after the previous engine's files
            bing_crawler.crawl(keyword=search, filters=None, offset=0,
                               max_num=maxnum, file_idx_offset='auto')
        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                min_size=(200, 200), max_size=None,
                                file_idx_offset='auto')
    # None keys mark failed/unregistered downloads; drop them.
    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
def baidu_crawl(keyword):
    """Grab up to 50 images (min 400x400) for *keyword* into ./crawl_img."""
    crawler = BaiduImageCrawler(
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=32,
        storage={'root_dir': './crawl_img'})
    crawler.crawl(keyword=keyword, offset=0, max_num=50,
                  min_size=(400,400), max_size=None)
def test_baidu(logo):
    """Crawl Baidu for *logo* images, logging file names under <root>/<logo>/."""
    out_dir = os.path.join(root, logo, 'baidu')
    crawler = BaiduImageCrawler(
        downloader_cls=MyImageDownloader,
        downloader_threads=4,
        storage={'root_dir': out_dir},
        log_level=logging.INFO,
        # the custom downloader records names into this file
        filename=os.path.join(root, logo, 'baidu.txt'))
    crawler.crawl(logo, max_num=args.maxnum)
def start_crawler(path: Path, search_text, num_images):
    """Download up to *num_images* images (min 64x64) for *search_text* into *path*."""
    baidu = BaiduImageCrawler(
        feeder_threads=2,
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': path})
    baidu.crawl(keyword=search_text, min_size=(64, 64), max_num=num_images)
def doBaidu(self):
    """Crawl Baidu once per keyword, skipping keywords already downloaded."""
    for kw in self.keywords:
        target = {'root_dir': '%s/baidu-%s/' % (self.dest_dir, kw)}
        # An existing directory means this keyword was crawled before.
        if os.path.exists(target['root_dir']):
            continue
        crawler = BaiduImageCrawler(
            parser_threads=self.thread_parser,
            downloader_threads=self.thread_downloader,
            storage=target)
        crawler.crawl(keyword=kw, max_num=100000)
def crawl_baidu(folder: str, search: str, maxnum: int, num_threads: int):
    """Crawl Baidu for *search*, appending results (min 200x200) to *folder*."""
    crawler = BaiduImageCrawler(
        downloader_cls=CustomDownloader,
        log_level=logging.CRITICAL,
        downloader_threads=num_threads,
        storage={'root_dir': folder})
    # file_idx_offset='auto' continues numbering after existing files
    crawler.crawl(keyword=search,
                  offset=0,
                  max_num=maxnum,
                  min_size=(200, 200),
                  max_size=None,
                  file_idx_offset='auto')
def test_baidu(dir, keyword):
    """Crawl up to 1000 Baidu images for *keyword* into *dir*.

    Any ': flickr.com' suffix is stripped from the keyword first.
    """
    keyword = keyword.replace(': flickr.com', '')
    print('启用百度爬虫', keyword)
    crawler = BaiduImageCrawler(
        downloader_cls=Base64NameDownloader,
        storage={'root_dir': dir},
        log_level=logging.DEBUG)
    crawler.crawl(keyword=keyword,
                  offset=0,
                  max_num=1000,
                  min_size=None,
                  max_size=None)
def crawel_auto(search_word, get_num, dir_name):
    """Crawl Google, Baidu and Bing for *search_word*.

    Results go to ``<dir_name>/google``, ``<dir_name>/baidu`` and
    ``<dir_name>/bing`` respectively, up to *get_num* images each.
    """
    print("Googleのクローリングを開始しました。")
    # Google
    googleCrawler = GoogleImageCrawler(storage={"root_dir": f'{dir_name}/google'})
    googleCrawler.crawl(keyword=search_word, max_num=get_num)
    print("Baiduのクローリングを開始しました。")
    # Baidu
    baiduCrawler = BaiduImageCrawler(storage={"root_dir": f'{dir_name}/baidu'})
    baiduCrawler.crawl(keyword=search_word, max_num=get_num)
    print("Bingのクローリングを開始しました。")
    # Bing -- the original had a bare `Bing` token here (a comment missing
    # its `#`), which raised NameError before the Bing crawl could run.
    bingCrawler = BingImageCrawler(storage={"root_dir": f'{dir_name}/bing'})
    bingCrawler.crawl(keyword=search_word, max_num=get_num)
def Crawl_Image(key_word, raw_folder=RAW_FOLDER):
    """Pull up to 1000 images per engine (Google, Bing, Baidu) for *key_word*.

    All three engines share the destination ``raw_folder + key_word``.
    """
    dest_root = raw_folder + key_word
    google = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=6,
        storage={'root_dir': dest_root})
    google.crawl(keyword=key_word, offset=0, max_num=1000,
                 min_size=None, max_size=None, file_idx_offset=0)
    bing = BingImageCrawler(downloader_threads=6,
                            storage={'root_dir': dest_root})
    # Bing is restricted to photographs.
    bing.crawl(keyword=key_word, filters={'type': 'photo'},
               offset=0, max_num=1000)
    baidu = BaiduImageCrawler(storage={'root_dir': dest_root})
    baidu.crawl(keyword=key_word, offset=0, max_num=1000,
                min_size=None, max_size=None)
def image_crawler(baidu_path, bing_path, number_of_image, image_key_words):
    """Fetch *number_of_image* images (min 200x200) from Baidu and Bing.

    Baidu results go to *baidu_path*, Bing results to *bing_path*.
    """
    baidu = BaiduImageCrawler(parser_threads=8,
                              downloader_threads=8,
                              storage={'root_dir': baidu_path})
    bing = BingImageCrawler(parser_threads=8,
                            downloader_threads=8,
                            storage={'root_dir': bing_path})
    baidu.crawl(keyword=image_key_words,
                max_num=number_of_image,
                min_size=(200, 200))
    bing.crawl(keyword=image_key_words,
               max_num=number_of_image,
               min_size=(200, 200))
def ICL(key='kutkop', qty=100, out_dir='/content/imgages'):
    '''ICL('kutkop', 100, '/content/images')'''
    # Google first, restricted to medium-sized results (>= 400x400).
    g_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': out_dir})
    g_crawler.crawl(keyword=key,
                    filters=dict(size='medium'),
                    offset=0,
                    max_num=qty,
                    min_size=(400,400),
                    max_size=None,
                    file_idx_offset=0)
    # Bing, unfiltered.
    b_crawler = BingImageCrawler(
        downloader_threads=4,
        storage={'root_dir': out_dir})
    b_crawler.crawl(keyword=key, filters=None, offset=0, max_num=qty)
    # Baidu, with a 200x200 minimum.
    bd_crawler = BaiduImageCrawler(storage={'root_dir': out_dir})
    bd_crawler.crawl(keyword=key,
                     offset=0,
                     max_num=qty,
                     min_size=(200,200),
                     max_size=None)
def download_img(key, num):
    """Download *num* images for *key* from Bing and Baidu.

    Results are saved under ``./bing/<key>`` and ``./baidu/<key>``.
    """
    # exist_ok avoids the racy exists()-then-makedirs() pattern
    os.makedirs("./bing/" + key, exist_ok=True)
    os.makedirs("./baidu/" + key, exist_ok=True)
    # Bing image crawler
    bing_storage = {'root_dir': './bing/' + key}
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage=bing_storage)
    bing_crawler.crawl(keyword=key, max_num=num)
    # Baidu image crawler (the original comment here wrongly said "Google")
    baidu_storage = {'root_dir': './baidu/' + key}
    baidu_crawler = BaiduImageCrawler(parser_threads=2,
                                      downloader_threads=4,
                                      storage=baidu_storage)
    baidu_crawler.crawl(keyword=key, max_num=num)
def Query(query, verb, google=True, google_year=1, bing=True, baidu=True,
          now_year=2018):
    """Crawl Google/Bing/Baidu for *query*, saving under ``ROOT_DIR/<verb>``.

    Args:
        query: search phrase.
        verb: sub-directory name under ROOT_DIR.
        google: whether to crawl Google (one pass per year).
        google_year: how many years back from *now_year* to query Google.
        bing: whether to crawl Bing.
        baidu: whether to crawl Baidu.
        now_year: most recent year for the Google date filter. Was a
            hard-coded local ``now_year = 2018``; now a parameter with the
            same default for backward compatibility.
    """
    SAVE_DIR = os.path.join(ROOT_DIR, verb)
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if google:
        google_path = os.path.join(SAVE_DIR, 'Google')
        if not os.path.exists(google_path):
            os.makedirs(google_path)
        google_crawler = GoogleImageCrawler(feeder_threads=1,
                                            parser_threads=1,
                                            downloader_threads=4,
                                            storage={'root_dir': google_path})
        for past_year in range(google_year):
            from_year = now_year - past_year
            # Restrict each pass to a single calendar year so that
            # repeated crawls spread over time.
            filters = dict(license='noncommercial,modify',
                           date=((from_year, 1, 1), (from_year, 12, 30)))
            google_crawler.crawl(keyword=query, filters=filters,
                                 max_num=1000, file_idx_offset='auto')
    if bing:
        bing_crawler = BingImageCrawler(
            downloader_threads=4,
            storage={'root_dir': os.path.join(SAVE_DIR, 'Bing')})
        filters_bing = dict(license='noncommercial,modify')
        bing_crawler.crawl(keyword=query, filters=filters_bing, offset=0,
                           max_num=1000)
    if baidu:
        baidu_crawler = BaiduImageCrawler(
            storage={'root_dir': os.path.join(SAVE_DIR, 'Baidu')})
        baidu_crawler.crawl(keyword=query, offset=0, max_num=1000)
def crawl(storage_dir, age_range, age_interval, num_img_per_class):
    """Collect age-labelled images from Google and Baidu.

    Each age class gets its own sub-directory; every class is queried
    with an English and a Chinese keyword on both engines, with file
    index offsets chained so the four batches never overwrite each other.
    """
    ages = list(range(age_range[0], age_range[1] + 1, age_interval))
    if ages[0] == 0:
        # age 0 is not a meaningful query; bump it to 1
        ages[0] = ages[0] + 1
    per_batch = [num_img_per_class for _ in range(4)]
    # two different crawlers: Google and Baidu search engines
    google = GoogleImageCrawler(parser_threads=2, downloader_threads=4)
    baidu = BaiduImageCrawler(parser_threads=2, downloader_threads=4)
    for age in ages:
        en_kw = str(age) + " year old"  # English keywords
        cn_kw = str(age) + "岁"  # Chinese keywords
        # redirect both crawlers into this age's sub-directory
        sub_dir = storage_dir + str(age)
        google.storage.root_dir = sub_dir
        baidu.storage.root_dir = sub_dir
        # crawl: offsets accumulate so file indices stay disjoint
        google.crawl(keyword=en_kw, file_idx_offset=0,
                     max_num=per_batch[0], date_min=None, date_max=None,
                     min_size=(200, 200), max_size=None)
        google.crawl(keyword=cn_kw, file_idx_offset=per_batch[0],
                     max_num=per_batch[1], date_min=None, date_max=None,
                     min_size=(200, 200), max_size=None)
        baidu.crawl(keyword=en_kw, file_idx_offset=sum(per_batch[0:2]),
                    max_num=per_batch[2], min_size=None, max_size=None)
        baidu.crawl(keyword=cn_kw, file_idx_offset=sum(per_batch[0:3]),
                    max_num=per_batch[3], min_size=None, max_size=None)
def test_baidu():
    """Quick demo: pull 50 large blue 'tanjiro' images from Baidu."""
    print('start testing BaiduImageCrawler')
    crawler = BaiduImageCrawler(
        downloader_threads=30,
        storage={'root_dir': 'images/demon_slayer'})
    crawler.crawl('tanjiro',
                  filters=dict(size='large', color='blue'),
                  max_num=50)
def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: List[str] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images.

    Args:
        folder: destination directory (created if missing).
        search: search phrase passed to every engine.
        maxnum: per-engine download cap, clamped to 1000.
        crawlers: engines to run, any of 'GOOGLE', 'BING', 'BAIDU',
            'FLICKR'. The shared default list is safe: it is only
            iterated, never mutated.

    Returns:
        The ``CustomDownloader.registry`` entries whose key is not None.

    Note:
        The FLICKR engine requires the FLICKR_API_KEY environment
        variable; without it the process exits with status -1.
    """
    # Fixed: the annotation was `[List[str]]` (a list *containing* a
    # type object), which is not a valid type hint.
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000
    for c in crawlers:
        print(f" -> {c}")
        if c == "GOOGLE":
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )
        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            # 'auto' continues numbering after the previous engine's files
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )
        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )
        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print(
                    "Error: Flickr crawler requires FLICKR_API_KEY environment variable"
                    " to be set with your non-secret API key.")
                exit(-1)
            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )
    # None keys mark failed/unregistered downloads; drop them.
    return {
        k: v for k, v in CustomDownloader.registry.items() if k is not None
    }
CloudTypesList_ENG = open('ColudListBaidu.txt', 'r') ## this one is Baidu crawller for cloudTypesName in CloudTypesList_CHINESE: cloud_type = cloudTypesName.strip('\n') imageDir = image_path + "\\" + cloud_type if not os.path.isdir(imageDir): os.mkdir(imageDir) print("imageDir--------------" + imageDir) baidu_crawler = BaiduImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': imageDir}) baidu_crawler.crawl(keyword=cloud_type, max_num=1000, file_idx_offset='auto') ## this one is GOOGLE crawller for cloudTypesName in CloudTypesList_ENG: cloud_type = cloudTypesName.strip('\n') imageDir = image_path + "\\" + cloud_type if not os.path.isdir(imageDir): os.mkdir(imageDir) print("imageDir--------------" + imageDir) google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': imageDir}) google_crawler.crawl(keyword=cloud_type,
#!/usr/bin/env python3
"""Crawl Google, Bing and Baidu for images matching the CLI keyword."""
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler
import sys
import time

keywords = sys.argv[1]
print('crawling search engines for images with description %s...' % keywords)
time.sleep(2)  # brief pause so the user can abort before crawling starts
# Same thread settings for every engine; each writes to its own directory.
engines = [
    GoogleImageCrawler(parser_threads=4, downloader_threads=8,
                       storage={'root_dir': 'qrbooty/google'}),
    BingImageCrawler(parser_threads=4, downloader_threads=8,
                     storage={'root_dir': 'qrbooty/bing'}),
    BaiduImageCrawler(parser_threads=4, downloader_threads=8,
                      storage={'root_dir': 'qrbooty/baidu'}),
]
for engine in engines:
    engine.crawl(keyword=keywords, offset=0, max_num=1000)
print('qrcrawler done.\n')
# One directory tree per celebrity keyword: google/, bing/ and baidu/.
for keyword in keywords:
    save_path = 'D:/Korean Celeb Data/' + keyword
    # Google: face-filtered, minimum 200x200.
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': save_path + '/google'})
    google_crawler.crawl(keyword=keyword,
                         filters=dict(type="face"),
                         offset=0,
                         max_num=1000,
                         min_size=(200, 200),
                         max_size=None,
                         file_idx_offset=0)
    # Bing: unfiltered.
    bing_crawler = BingImageCrawler(
        downloader_threads=4,
        storage={'root_dir': save_path + '/bing'})
    bing_crawler.crawl(keyword=keyword, filters=None, offset=0, max_num=1000)
    # Baidu: minimum 200x200.
    baidu_crawler = BaiduImageCrawler(
        storage={'root_dir': save_path + '/baidu'})
    baidu_crawler.crawl(keyword=keyword,
                        offset=0,
                        max_num=1000,
                        min_size=(200, 200),
                        max_size=None)
from icrawler.builtin import BaiduImageCrawler

keyword = input("输入你想找图片的标签:")
# Renamed from google_storage/google_crawler: the crawler used here is
# Baidu, so the old names were misleading.
baidu_storage = {'root_dir': r'./data'}
baidu_crawler = BaiduImageCrawler(parser_threads=4,
                                  downloader_threads=4,
                                  storage=baidu_storage)
baidu_crawler.crawl(keyword=keyword, max_num=1000)
""" #谷歌图片爬虫 google_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/google'} google_crawler = GoogleImageCrawler(parser_threads=4, downloader_threads=4, storage=google_storage) google_crawler.crawl(keyword='beauty', max_num=10) #必应图片爬虫 bing_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/bing'} bing_crawler = BingImageCrawler(parser_threads=2, downloader_threads=4, storage=bing_storage) bing_crawler.crawl(keyword='beauty', max_num=10) #百度图片爬虫 baidu_storage = {'root_dir': '/Users/cl/Desktop/icrawlerLearn/baidu'} baidu_crawler = BaiduImageCrawler(parser_threads=2, downloader_threads=4, storage=baidu_storage) baidu_crawler.crawl(keyword='美女', max_num=10)
def test_baidu():
    """Smoke test: ten large blue 'cat' images from Baidu."""
    print('start testing BaiduImageCrawler')
    opts = dict(size='large', color='blue')
    crawler = BaiduImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/baidu'})
    crawler.crawl('cat', filters=opts, max_num=10)
storage={'root_dir': 'matches'}) # log_level=logging.INFO, # extra_downloader_args={'log_file': 'meta.txt'}) bing_crawler.crawl(keyword=(query), filters=None, offset=0, max_num=(num)) elif engine in ('Baidu', 'baidu'): baidu_crawler = BaiduImageCrawler(downloader_cls=MyImageDownloader, feeder_threads=1, parser_threads=1, downloader_threads=4, storage={'root_dir': 'matches'}) # log_level=logging.INFO, # extra_downloader_args={'log_file': 'meta.txt'}) baidu_crawler.crawl(keyword=(query), offset=0, max_num=(num), min_size=(200, 200), max_size=None) elif engine in ('Greedy', 'greedy'): greedy_crawler = GreedyImageCrawler(downloader_cls=MyImageDownloader, feeder_threads=1, parser_threads=1, downloader_threads=4, storage={'root_dir': 'matches'}) greedy_crawler.crawl(domains=(url), max_num=(num), min_size=None, max_size=None) #For Flickr
def test_baidu():
    """Exercise BaiduImageCrawler with a small filtered query."""
    print('start testing BaiduImageCrawler')
    BaiduImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/baidu'},
    ).crawl('cat', filters=dict(size='large', color='blue'), max_num=10)
def test_baidu():
    """Download up to 500 'color manga face' images from Baidu into test_dir/baidu."""
    img_dir = osp.join(test_dir, 'baidu')
    crawler = BaiduImageCrawler(downloader_threads=2,
                                storage={'root_dir': img_dir})
    # NOTE(review): the original built size/color search filters but never
    # passed them to crawl(); that dead local has been removed and the
    # crawl stays intentionally unfiltered. If filtering was intended,
    # add filters=dict(size='large', color='blue') to the call below.
    crawler.crawl('color manga face', max_num=500)
def test_baidu():
    """Fetch ten unfiltered 'bird' images from Baidu."""
    crawler = BaiduImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/baidu'})
    crawler.crawl('bird', max_num=10)