def exe_crawl(arg):
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameGoogleDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    google_crawler.crawl(keyword=f'{arg.keyword}', filters=filters,
                         offset=0, max_num=arg.max, file_idx_offset=0)

    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}', filters=filters,
                       offset=0, max_num=arg.max)

    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
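# Usage sketch for exe_crawl. Assumption: `arg` only needs the attributes the
# function reads (dict, keyword, license, max); the argparse flags below are
# hypothetical, not taken from the original code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dict', default='downloads', help='output root directory')
parser.add_argument('--keyword', required=True)
parser.add_argument('--license', default='commercial')
parser.add_argument('--max', type=int, default=100)
exe_crawl(parser.parse_args())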
def crawl_book(book, tag, add):
    num = book.split('.')[0]
    title = tag[num]['title']
    entities = json.load(open(os.path.join(ENTITY_DIR, book), 'r'))
    crawled = set()
    for cpt in entities:
        for entity in entities[cpt]['cpt_key'][:2]:
            entity = entity[0]
            outpath = os.path.join(IMAGE_DIR, num, entity)
            # Crawl when the entity is new and has no image folder yet, or
            # when `add` is set and the existing folder is empty.
            if (entity not in crawled and not os.path.exists(outpath)) \
                    or (add and len(os.listdir(outpath)) == 0):
                title = title.replace('"', '').replace("'", '')
                keyword = f"illustration {entity} {title}"
                # Earlier approach, kept for reference: shell out to bbid.py
                # with `-s "illustration {entity} {title}"` (or
                # `-s "{entity} in {title}"` when `add` is set) and `--limit 4`.
                bing_crawler = BingImageCrawler(storage={'root_dir': outpath})
                bing_crawler.crawl(keyword=keyword, max_num=2)
                crawled.add(entity)  # avoid re-crawling the same entity
    print(f'==book {num}, \'{title}\' finished.==')
def pring():
    # Build the new folder path from the two entry widgets.
    new_path = os.path.join(folder_path.get(), folder_name.get())
    if not os.path.exists(new_path):
        os.mkdir(new_path)
        crawler = BingImageCrawler(storage={"root_dir": new_path})
        if combo_1.get() != "None":
            filters = dict(size=combo_1.get())
        else:
            filters = None
        crawler.crawl(keyword=picture_name.get(), filters=filters,
                      offset=0, max_num=int(picture_num.get()))
        res = messagebox.askokcancel('Finished!',
                                     'Reset input, but check folder?')
        folder_name.delete(0, "end")
        picture_name.delete(0, "end")
        picture_num.delete(0, "end")
        if res:
            tkinter.filedialog.askopenfilename(initialdir=new_path)
    else:
        messagebox.showinfo(
            'Failed',
            'A folder with the same name already exists\n' + new_path)
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
    else:
        # Default: crawl all three sources in turn.
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
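# Usage example for getImg: source=2 selects Bing only; any value other than
# 1, 2, or 3 (including the default 4) crawls all three engines.
getImg(keywords='cat', dirpath='./cats', amount=30, source=2)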
def getImagesFromBing(self, query):
    bing_crawler = BingImageCrawler(downloader_threads=4)
    bing_crawler.crawl(keyword=query, offset=0, max_num=self.num_of_images,
                       min_size=(self.min_width, self.min_height),
                       max_size=None)
def main():
    parser = argparse.ArgumentParser(
        description="Simple tool to download images via Bing image search.")
    parser.add_argument("-t", "--test", action='store_true')
    parser.add_argument("-p", "--path", type=str, default=os.getcwd(),
                        help="Path to output images")
    parser.add_argument("-n", "--num", type=int, default=100,
                        help="How many images to download per keyword.")
    parser.add_argument("--threads", type=int, default=4,
                        help="Number of downloader threads.")
    parser.add_argument("-o", "--offset", type=int, default=0, help="Offset.")
    parser.add_argument("--min", type=int, default=None,
                        help="Minimum edge length of the images, in pixels.")
    parser.add_argument("--max", type=int, default=None,
                        help="Maximum edge length of the images, in pixels.")
    args = parser.parse_args()

    # icrawler expects (width, height) tuples for size limits.
    min_size = (args.min, args.min) if args.min else None
    max_size = (args.max, args.max) if args.max else None

    t0 = time.time()
    check_dir(args.path)
    for kwd in search_for:
        subdir = os.path.join(args.path, kwd)
        check_dir(subdir)
        print(" Item name = ", kwd)
        if len(search_for[kwd]) == 0:
            # Thread counts belong in the crawler constructor, and storage
            # takes a dict, not a bare path.
            bing_crawler = BingImageCrawler(feeder_threads=1,
                                            parser_threads=1,
                                            downloader_threads=args.threads,
                                            storage={'root_dir': subdir})
            bing_crawler.crawl(keyword=kwd, offset=args.offset,
                               max_num=args.num,
                               min_size=min_size, max_size=max_size)
        else:
            for j in range(0, len(search_for[kwd])):
                print(" : %s" % search_for[kwd][j])
                ssubdir = os.path.join(subdir, search_for[kwd][j])
                check_dir(ssubdir)
                pure_keyword = '%20' + search_for[kwd][j]
                pure_keyword = kwd + pure_keyword.replace(' ', '%20')
                bing_crawler = BingImageCrawler(storage={'root_dir': ssubdir})
                bing_crawler.crawl(keyword=pure_keyword, offset=args.offset,
                                   max_num=args.num,
                                   min_size=min_size, max_size=max_size)
def crawl(folder: str,
          search: str,
          maxnum: int,
          crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images."""
    print('(1) Crawling ...')
    # Prepare output folder.
    os.makedirs(folder, exist_ok=True)
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f' -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})
            google_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                 min_size=(200, 200), max_size=None,
                                 file_idx_offset=0)
        if c == 'BING':
            bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                            log_level=logging.CRITICAL,
                                            downloader_threads=4,
                                            storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search, filters=None, offset=0,
                               max_num=maxnum, file_idx_offset='auto')
        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                              log_level=logging.CRITICAL,
                                              storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                min_size=(200, 200), max_size=None,
                                file_idx_offset='auto')

    # Map downloaded file names to their source URLs, as recorded by
    # CustomDownloader.
    return {k: v for k, v in CustomDownloader.registry.items()
            if k is not None}
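# Usage sketch (assumption: CustomDownloader is a downloader subclass that
# records {filename: source_url} pairs in a class-level `registry` dict, as
# implied by the return statement above).
name_to_url = crawl('downloads/cats', 'cat', maxnum=50, crawlers=['BING'])
for filename, url in name_to_url.items():
    print(filename, '<-', url)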
def crawl_it(name_to_search):
    # Despite the 'images/google' output folder, this crawler queries Bing.
    bing_crawler = BingImageCrawler(feeder_threads=1,
                                    parser_threads=2,
                                    downloader_threads=4,
                                    downloader_cls=MyImageDownloader,
                                    storage={'root_dir': 'images/google'})
    bing_crawler.crawl(keyword=name_to_search, max_num=20, file_idx_offset=0)
def test_bing(logo):
    bing_crawler = BingImageCrawler(
        downloader_cls=MyImageDownloader,
        downloader_threads=4,
        storage={'root_dir': os.path.join(root, logo, 'bing')},
        log_level=logging.INFO,
        # `filename` is not a stock icrawler constructor argument; presumably
        # it is consumed by MyImageDownloader.
        filename=os.path.join(root, logo, 'bing.txt'))
    bing_crawler.crawl(logo, max_num=args.maxnum)
def search(names):
    names = list(set(names))  # de-duplicate
    for name in names:
        profile_dirpath = os.path.join(PHOTO_DIRPATH, name)
        os.makedirs(profile_dirpath, exist_ok=True)
        crawler = BingImageCrawler(storage={"root_dir": profile_dirpath})
        crawler.crawl(keyword=name, max_num=100)
        time.sleep(WAITING_TIME_SEARCH)
def download_data(dataset_name, classes, num_images):
    for c in classes:
        bing_crawler = BingImageCrawler(
            downloader_threads=6,
            storage={'root_dir': f'../static/datasets/{dataset_name}/{c}'})
        # Request twice the target count, presumably to compensate for
        # failed downloads.
        bing_crawler.crawl(keyword=c, filters=None, offset=0,
                           max_num=num_images * 2)
def fetchAndCropFace(keyword, max_num):
    input_file_path = f"./training_data/original/{keyword}/"
    output_file_path = f"./training_data/cropped_face/{keyword}/"

    echo("crawl images")
    os.makedirs(input_file_path, exist_ok=True)
    crawler = BingImageCrawler(storage={"root_dir": input_file_path})
    # crawler = GoogleImageCrawler(storage={"root_dir": input_file_path})
    crawler.crawl(keyword=keyword, max_num=int(max_num))
    echo("original images in: " + input_file_path)

    echo("cropped face images in: " + output_file_path)
    os.makedirs(output_file_path, exist_ok=True)

    input_files = os.listdir(input_file_path)
    echo("fetched images count: " + str(len(input_files)))

    cascade = cv2.CascadeClassifier("./haarcascade_frontalface_alt.xml")
    windowName = 'window'
    cv2.namedWindow(windowName, cv2.WINDOW_KEEPRATIO | cv2.WINDOW_NORMAL)
    cv2.resizeWindow(windowName, 500, 500)

    for input_file in input_files:
        input_image = cv2.imread(input_file_path + input_file)
        if input_image is None:
            continue
        height, width, _ = input_image.shape
        cv2.imshow(windowName, input_image)
        cv2.waitKey(50)

        echo("detect face")
        face_rects = cascade.detectMultiScale(input_image, scaleFactor=1.1,
                                              minNeighbors=10,
                                              minSize=(10, 10))
        if len(face_rects) == 0:
            echo("detect face failure")
            continue
        echo("detect face success")

        # Keep only the first detected face.
        face_rect = face_rects[0]
        # Crop just the face region and save it.
        output_image = crop(input_image, face_rect)
        cv2.imwrite(output_file_path + input_file, output_image)
        # Draw a rectangle around the face region and display it.
        marked_input_image = drawRect(input_image, face_rect, (0, 255, 0))
        cv2.imshow(windowName, marked_input_image)
        cv2.waitKey(50)

    cv2.destroyAllWindows()
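# fetchAndCropFace relies on crop() and drawRect() helpers that are not shown
# in the snippet. A minimal sketch of what they might look like (hypothetical,
# assuming OpenCV's (x, y, w, h) rectangle convention):
def crop(image, rect):
    x, y, w, h = rect
    return image[y:y + h, x:x + w]

def drawRect(image, rect, color):
    x, y, w, h = rect
    return cv2.rectangle(image.copy(), (x, y), (x + w, y + h), color, 2)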
def test_bing():
    img_dir = osp.join(test_dir, 'bing2')
    bing_crawler = BingImageCrawler(downloader_threads=2,
                                    storage={'root_dir': img_dir},
                                    log_level=logging.INFO)
    search_filters = dict(type='photo', license='commercial')
    bing_crawler.crawl('manga face color', max_num=1000,
                       filters=search_filters)
def download_images(keyword):
    crawler = BingImageCrawler(parser_threads=5,
                               downloader_threads=5,
                               storage={
                                   'backend': 'FileSystem',
                                   'root_dir': 'images'
                               })
    crawler.crawl(keyword=keyword, max_num=max_images,
                  filters={'size': 'medium'})
def crawl_bing(folder: str, search: str, maxnum: int, num_threads: int):
    bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                    log_level=logging.CRITICAL,
                                    downloader_threads=num_threads,
                                    storage={'root_dir': folder})
    bing_crawler.crawl(keyword=search, filters=None, offset=0,
                       max_num=maxnum, file_idx_offset='auto')
def get_images(keyword):
    bing_crawler = BingImageCrawler(downloader_cls=CustomLinkPrinter)
    # Reset the URL list collected by the downloader before crawling.
    bing_crawler.downloader.file_urls = []
    bing_crawler.crawl(keyword=keyword, max_num=50)
    file_urls = bing_crawler.downloader.file_urls
    print(file_urls)
    make_hash_df(file_urls, keyword)
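# get_images assumes a CustomLinkPrinter downloader that records image URLs
# instead of downloading files. A minimal sketch of such a class, assuming
# icrawler's ImageDownloader.download(task, ...) hook; the original
# implementation is not shown in the snippet:
from icrawler import ImageDownloader

class CustomLinkPrinter(ImageDownloader):
    file_urls = []

    def download(self, task, default_ext, timeout=5, max_retry=3, **kwargs):
        # Record the URL and mark the task done without fetching the file.
        self.file_urls.append(task['file_url'])
        task['success'] = True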
def doBing(self):
    for keyword in self.keywords:
        bing_storage = {'root_dir': '%s/bing-%s/' % (self.dest_dir, keyword)}
        # Skip keywords that already have a download directory.
        if os.path.exists(bing_storage['root_dir']):
            continue
        bing_crawler = BingImageCrawler(
            parser_threads=self.thread_parser,
            downloader_threads=self.thread_downloader,
            storage=bing_storage)
        # Bing serves far fewer results than this in practice; the large
        # max_num effectively means "take everything available".
        bing_crawler.crawl(keyword=keyword, max_num=100000)
def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(downloader_threads=2,
                                    storage={'root_dir': 'images/bing'},
                                    log_level=logging.INFO)
    search_filters = dict(type='photo', license='commercial', layout='wide',
                          size='large', date='pastmonth')
    bing_crawler.crawl('fatcat', max_num=10, filters=search_filters)
def crawl_images(keyword: str, max_num_images: int, save_dir: str,
                 feeder_threads: int, parser_threads: int,
                 downloader_threads: int):
    crawler = BingImageCrawler(
        feeder_threads=feeder_threads,
        parser_threads=parser_threads,
        downloader_threads=downloader_threads,
        log_level=logging.ERROR,
        storage={"root_dir": save_dir},
    )
    crawler.crawl(keyword=keyword, max_num=max_num_images)
def test_bing():
    img_dir = osp.join(test_dir, 'bing')
    bing_crawler = BingImageCrawler(downloader_threads=2,
                                    storage={'root_dir': img_dir},
                                    log_level=logging.INFO)
    search_filters = dict(type='photo', license='commercial', layout='wide',
                          size='large', date='pastmonth')
    bing_crawler.crawl('cat', max_num=5, filters=search_filters)
    shutil.rmtree(img_dir)
def download(keyword):
    fd = dict(color="white", size="medium", layout="tall")
    bing_crawler = BingImageCrawler(parser_threads=5,
                                    downloader_threads=5,
                                    storage={'root_dir': 'imgs'})
    # Disables TLS certificate verification for all requests made by this
    # crawler; only do this if you understand the security implications.
    bing_crawler.session.verify = False
    bing_crawler.crawl(keyword=keyword, max_num=10, min_size=(10, 10),
                       max_size=None, filters=fd)
def main():
    argv = sys.argv
    if len(argv) < 4:
        usage()
        exit(1)
    if not os.path.isdir(argv[1]):
        os.makedirs(argv[1])
    crawler = BingImageCrawler(storage={"root_dir": argv[1]})
    crawler.crawl(keyword=argv[2], max_num=int(argv[3]))
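# Example invocation (hypothetical script name):
#   python bing_crawl.py ./out "shiba inu" 25
# downloads up to 25 Bing results for "shiba inu" into ./out.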
def main(city):
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        # Use the city name as the directory name.
        storage={'root_dir': 'images/' + city},
        log_level=logging.INFO)
    search_filters = dict(type='photo', license='commercial', layout='wide',
                          size='large', date='pastmonth')
    # Apply the Bing search filters defined above.
    bing_crawler.crawl(city, max_num=20, filters=search_filters)
def download_images(celebs_file, save_dir):
    celebrities_list = []
    with open(celebs_file) as f:
        for line in f:
            celebrities_list.append(line.strip())

    for celeb in celebrities_list:
        images_dir = os.path.join(save_dir, celeb.lower().replace(" ", "_"))
        os.makedirs(images_dir, exist_ok=True)
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': images_dir})
        bing_crawler.crawl(keyword=celeb, filters=None, offset=0,
                           max_num=1000)
def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(downloader_threads=2,
                                    storage={'root_dir': 'images/bing'},
                                    log_level=logging.INFO)
    search_filters = dict(type='photo', license='commercial', layout='wide',
                          size='large', date='pastmonth')
    bing_crawler.crawl('cat', max_num=10, filters=search_filters)
def test_bing(dir, keyword):
    keyword = keyword.replace(': flickr.com', '')
    print('Starting Bing crawler:', keyword)
    bing_crawler = BingImageCrawler(
        # parser_threads=16,
        downloader_cls=Base64NameDownloader,
        downloader_threads=16,
        storage={'root_dir': dir},
        log_level=logging.DEBUG)
    bing_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                       min_size=None, max_size=None)
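# Base64NameDownloader is not shown in the snippet. A plausible sketch, which
# names each file after the base64 encoding of its URL path so repeated crawls
# produce stable, collision-free file names. It overrides icrawler's
# ImageDownloader.get_filename hook; the details are assumptions:
import base64
from urllib.parse import urlparse
from icrawler import ImageDownloader

class Base64NameDownloader(ImageDownloader):
    def get_filename(self, task, default_ext):
        url_path = urlparse(task['file_url']).path
        extension = url_path.rsplit('.', 1)[-1] if '.' in url_path else default_ext
        encoded = base64.urlsafe_b64encode(url_path.encode('utf-8')).decode('ascii')
        return f'{encoded}.{extension}'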
def crawl_auto(search_word, get_num, dir_name):
    print("Started crawling Google.")
    # Google
    googleCrawler = GoogleImageCrawler(storage={"root_dir": f'{dir_name}/google'})
    googleCrawler.crawl(keyword=search_word, max_num=get_num)

    print("Started crawling Baidu.")
    # Baidu
    baiduCrawler = BaiduImageCrawler(storage={"root_dir": f'{dir_name}/baidu'})
    baiduCrawler.crawl(keyword=search_word, max_num=get_num)

    print("Started crawling Bing.")
    # Bing
    bingCrawler = BingImageCrawler(storage={"root_dir": f'{dir_name}/bing'})
    bingCrawler.crawl(keyword=search_word, max_num=get_num)
def download_image(file_directory, search_query):
    # Web-scraper function; Google did not work here, so Bing is used instead.
    print(file_directory)
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': file_directory})
    picture_count = 1
    for items in search_query:
        print(f"Downloading a new image of {items}")
        # max_num grows by one per keyword. This appears to rely on icrawler
        # skipping files that already exist, so each crawl adds exactly one
        # new image to the shared folder.
        bing_crawler.crawl(keyword=items, filters=None, max_num=picture_count)
        picture_count += 1
def Crawl_Image(key_word, raw_folder=RAW_FOLDER):
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=6,
        storage={'root_dir': raw_folder + key_word})
    google_crawler.crawl(keyword=key_word, offset=0, max_num=1000,
                         min_size=None, max_size=None, file_idx_offset=0)

    bing_crawler = BingImageCrawler(downloader_threads=6,
                                    storage={'root_dir': raw_folder + key_word})
    bing_crawler.crawl(keyword=key_word, filters={'type': 'photo'},
                       offset=0, max_num=1000)

    baidu_crawler = BaiduImageCrawler(storage={'root_dir': raw_folder + key_word})
    baidu_crawler.crawl(keyword=key_word, offset=0, max_num=1000,
                        min_size=None, max_size=None)
def image_crawler(baidu_path, bing_path, number_of_image, image_key_words):
    baidu_storage = {'root_dir': baidu_path}
    bing_storage = {'root_dir': bing_path}
    baidu_crawler = BaiduImageCrawler(parser_threads=8,
                                      downloader_threads=8,
                                      storage=baidu_storage)
    bing_crawler = BingImageCrawler(parser_threads=8,
                                    downloader_threads=8,
                                    storage=bing_storage)
    baidu_crawler.crawl(keyword=image_key_words, max_num=number_of_image,
                        min_size=(200, 200))
    bing_crawler.crawl(keyword=image_key_words, max_num=number_of_image,
                       min_size=(200, 200))
def download(key='cat', max_num=100, storage_dir=None,
             store_to_array={'convert': False, 'return': False},
             flag=0, array_name=None, pad=(28, 28), slash='\\'):
    if storage_dir is None:
        raise ValueError("storage_dir must not be None")
    if not isinstance(max_num, int):
        raise ValueError(f'max_num must be an int, got {type(max_num)}')
    if max_num <= 0:
        raise ValueError("max_num must be greater than 0")
    if not isinstance(key, str):
        raise ValueError(f'key must be a str, got {type(key)}')

    crawler = BingImageCrawler(storage={'root_dir': storage_dir})
    crawler.crawl(keyword=key, max_num=max_num)

    if store_to_array['convert']:
        array = convert_to_matrix(storage_dir, flag, pad, slash)
        if array_name is not None:
            save(array, array_name)
        if store_to_array['return']:
            return np.array(array)
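# Usage sketch for download(); convert_to_matrix() and save() are helpers
# from the surrounding project and are not defined here.
images = download(key='dog', max_num=25, storage_dir='dogs',
                  store_to_array={'convert': True, 'return': True},
                  pad=(28, 28), slash='/')
print(images.shape)  # e.g. (N, 28, 28), depending on convert_to_matrix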