def crawl(crawl_list, work_list):
    for i in range(len(crawl_list)):
        class_name = crawl_list[i]
        print("Now fetching class: %s" % class_name)
        output_dir = os.path.join(output_path, class_name)
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        flickr_crawler = FlickrImageCrawler(
            '',  # put your Flickr API key here
            feeder_threads=2,
            parser_threads=10,
            downloader_threads=5,
            parser_cls=MyFlickrParser,
            downloader_cls=MyImageDownloader,
            storage={'root_dir': output_dir},
            log_level=logging.ERROR)
        # Time counter
        prev_time = float("-inf")
        curr_time = float("-inf")
        # Crawl 28 consecutive upload-date windows of width `delta`, newest
        # first. Use a separate loop variable so the outer class index `i`
        # is not shadowed.
        for j in range(28):
            curr_time = time()
            elapsed = curr_time - prev_time
            print("Now at iteration %d. Elapsed time: %.5fs." % (j, elapsed))
            prev_time = curr_time
            flickr_crawler.crawl(max_num=4000,
                                 text=class_name,
                                 sort='relevance',
                                 per_page=500,
                                 min_upload_date=TODAY - (j + 1) * delta,
                                 max_upload_date=TODAY - j * delta,
                                 extras='url_n,url_z,original_format,path_alias')
        work_list.append(class_name)
        if i >= len(crawl_list) - 1:
            work_list.append('end')
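# MyFlickrParser and MyImageDownloader are not defined in this snippet. A
# minimal sketch of what the custom downloader might look like, using only
# icrawler's documented ImageDownloader.get_filename hook; the naming scheme
# below is purely illustrative (the parser subclass is not sketched here):
from icrawler import ImageDownloader

class MyImageDownloader(ImageDownloader):
    def get_filename(self, task, default_ext):
        # Reuse icrawler's sequential naming, but prefix the files so
        # downloads from different crawls are easy to tell apart.
        filename = super(MyImageDownloader, self).get_filename(task, default_ext)
        return 'flickr_' + filename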
def test_flickr():
    # apikey=None makes icrawler fall back to the FLICKR_APIKEY
    # environment variable.
    flickr_crawler = FlickrImageCrawler(apikey=None,
                                        downloader_threads=4,
                                        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(max_num=10,
                         tags='family,child',
                         tag_mode='all',
                         group_id='68012010@N00')
def start_flickr_crawler(path: Path, search_text: str, num_images: int, apikey: str):
    """Kicks off a Flickr download. Requires an apikey."""
    assert apikey is not None, \
        "Flickr requires an apikey: 'https://www.flickr.com/services/api/misc.api_keys.html'"
    crawler = FlickrImageCrawler(apikey,
                                 feeder_threads=2,
                                 parser_threads=2,
                                 downloader_threads=8,
                                 storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=num_images, tag_mode='all')
def search_image(pet, target, savePath):
    emotion = target.strip('\n')
    imageDir = savePath + os.sep + emotion
    print(imageDir)
    searchName = emotion + ' ' + pet
    print(searchName)
    flickr_crawler = FlickrImageCrawler(Flicker_API,
                                        storage={'root_dir': imageDir})
    flickr_crawler.crawl(max_num=1000, tags=searchName, text=searchName)
    google_crawler = GoogleImageCrawler(storage={'root_dir': imageDir})
    google_crawler.crawl(keyword=searchName, max_num=1000)
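# Note: both crawlers above write into the same imageDir and both number files
# from 000001 by default, so the Google download can overwrite the Flickr one.
# A sketch of the fix, using the file_idx_offset parameter the search-engine
# crawlers accept (see the 'auto' usage in a later snippet):
# google_crawler.crawl(keyword=searchName, max_num=1000, file_idx_offset='auto')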
def test_flickr():
    print('start testing FlickrImageCrawler')
    flickr_crawler = FlickrImageCrawler(apikey=None,
                                        parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(max_num=10,
                         tags='family,child',
                         tag_mode='all',
                         group_id='68012010@N00')
def start_flickr_crawler(path: Path, search_text: str, n_images: int, apikey: str):
    if apikey is None:
        print("Flickr requires an apikey: "
              "'https://www.flickr.com/services/api/misc.api_keys.html'")
        exit()
    crawler = FlickrImageCrawler(apikey,
                                 feeder_threads=2,
                                 parser_threads=2,
                                 downloader_threads=8,
                                 storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=n_images, tag_mode='all')
def get_images(img_path):
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    flickr_crawler = FlickrImageCrawler('b040ad4b6a95ddaa8ad86f0762ebc828',
                                        downloader_cls=MyImageDownloader,
                                        downloader_threads=5,
                                        storage={'root_dir': img_path})
    flickr_crawler.crawl(max_num=500,
                         tags='cathedral',
                         extras='description',
                         group_id='92229480@N00',  # Umbria
                         min_upload_date=date(2005, 5, 1))
def crawl_images(tags, index, batch_size):
    # https://www.flickr.com/services/apps/create/noncommercial/?
    # key: 6ea696a89e485ce8b39cd052cc1dbd01
    # c3acc5f2a23734b4
    # Flickr returns 100 results per page by default, so convert the batch
    # index into a 1-based starting page.
    i = int(batch_size / 100) * index + 1
    print('index:', index, ', i:', i)
    subdir_name = tags.replace(',', '_')
    subdir = os.path.join(ROOT_DIR, '{}_{}'.format(subdir_name, index))
    print(subdir)
    set_sub_dir(subdir)
    flickr_crawler = FlickrImageCrawler(apikey='6ea696a89e485ce8b39cd052cc1dbd01',
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=8,
                                        downloader_cls=FaceImageDownloader,
                                        storage={'root_dir': subdir})
    flickr_crawler.crawl(max_num=batch_size,
                         tags=tags,
                         page=i,
                         size_preference=['large 2048', 'large 1600', 'large'],
                         min_upload_date=date(2010, 5, 25))
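# FaceImageDownloader is not defined in this snippet. A minimal sketch of one
# plausible implementation, assuming icrawler's ImageDownloader.keep_file hook
# and an OpenCV Haar cascade; the face-filtering logic is an assumption about
# what the class was intended to do:
import cv2
import numpy as np
from icrawler import ImageDownloader

class FaceImageDownloader(ImageDownloader):
    _cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    def keep_file(self, task, response, min_size=None, max_size=None):
        # Run icrawler's usual size checks first.
        if not super().keep_file(task, response, min_size, max_size):
            return False
        # Decode the downloaded bytes and keep the image only if it
        # contains at least one detectable face.
        img = cv2.imdecode(
            np.frombuffer(response.content, dtype=np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            return False
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return len(self._cascade.detectMultiScale(gray)) > 0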
""" AUTHOR : Min Xue PURPOSE : To collect bouquet of flowers images from Flickr """ import os import sys from datetime import date from icrawler.builtin import FlickrImageCrawler image_path = '/Users/xuemin/Desktop/FlickrImageCollection/result' API_KEY = '13ef101ff4bac39647acb5531d8d0a3c' FlowerBreedList = open('List1.txt','rt') for nameList in FlowerBreedList: name = nameList.strip('\n') imageDir = image_path + '/' + name searchName = name flickr_crawler = FlickrImageCrawler(API_KEY, storage={'root_dir': imageDir}) flickr_crawler.crawl(max_num = 500, tags = searchName, text = searchName) print("Collection is done")
import os
import pathlib
import shutil
from datetime import date

from icrawler.builtin import (BaiduImageCrawler, BingImageCrawler,
                              FlickrImageCrawler, GoogleImageCrawler)

flickr = FlickrImageCrawler('061beed3b1f1e2f1c95874ea769f7536',
                            downloader_threads=4,
                            storage={'root_dir': 'bike'})
flickr.crawl(group_id="60503902@N00",
             tags='bike',
             max_num=100,
             min_upload_date=date(2017, 1, 1))

flickr = FlickrImageCrawler('061beed3b1f1e2f1c95874ea769f7536',
                            downloader_threads=4,
                            storage={'root_dir': 'car'})
flickr.crawl(group_id="2309748@N20",
             tags='car',
             max_num=100,
             min_upload_date=date(2017, 1, 1))

flickr = FlickrImageCrawler('061beed3b1f1e2f1c95874ea769f7536',
                            downloader_threads=4,
import sys
from datetime import date

from icrawler.builtin import FlickrImageCrawler

image_path = sys.argv[1]
API_KEY = '6443fc1e493dab0e64443981f7364370'
flowerWebSiteFile1 = "flowerNameSet1.html"
flowerWebSiteFile2 = "flowerNameSet2.html"


def getFlowerName(htmlFile):
    flowerNameList = []
    with open(htmlFile, "r", encoding="UTF-8") as f:
        for line in f:
            if "alt=" in line:
                # Slice out the alt attribute value, which has the form
                # '<flower name> image'.
                info = line[(line.index("alt=") + 5):]
                flowerName = info[:(info.index("image") - 1)]
                flowerNameList.append(flowerName)
    return flowerNameList


flowerList1 = getFlowerName(flowerWebSiteFile1)
flowerList2 = getFlowerName(flowerWebSiteFile2)
flowerList = flowerList1 + flowerList2
for name in flowerList:
    imageDir = image_path + '/' + name
    flickr_crawler = FlickrImageCrawler(API_KEY,
                                        storage={'root_dir': imageDir})
    flickr_crawler.crawl(max_num=4000, tags=name)
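# The string slicing in getFlowerName breaks if attribute order or quoting
# changes. A sketch of the same extraction with the standard-library
# HTMLParser, assuming alt text of the form '<flower name> image' as above:
from html.parser import HTMLParser


class AltNameCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.names = []

    def handle_starttag(self, tag, attrs):
        # Read the alt attribute regardless of where it appears in the tag.
        alt = dict(attrs).get('alt', '') or ''
        if alt.endswith(' image'):
            self.names.append(alt[:-len(' image')])

# Usage: collector = AltNameCollector()
#        collector.feed(open(flowerWebSiteFile1, encoding='UTF-8').read())
#        collector.names then replaces getFlowerName(flowerWebSiteFile1).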
def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: List[str] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images"""
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f"    -> {c}")
        if c == "GOOGLE":
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )
        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )
        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )
        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print("Error: Flickr crawler requires FLICKR_API_KEY environment"
                      " variable to be set with your non-secret API key.")
                exit(-1)
            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

    return {k: v for k, v in CustomDownloader.registry.items() if k is not None}
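# CustomDownloader and GoogleParser are external to this snippet. A minimal
# sketch of a downloader whose class-level registry maps saved filenames to
# their source URLs, matching the CustomDownloader.registry lookup above;
# this is an assumption about the class's intent, not its actual code:
from typing import Dict
from icrawler import ImageDownloader

class CustomDownloader(ImageDownloader):
    registry: Dict[str, str] = {}  # filename -> source URL

    def get_filename(self, task, default_ext):
        # Record where each saved file came from before returning the name.
        filename = super().get_filename(task, default_ext)
        CustomDownloader.registry[filename] = task['file_url']
        return filename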
# Secret:
# 69e361b8ccd8e185
Flickr_API_Key = 'bff26669f752de80bcc10f69c3d6fb92'

CloudTypesList = open('CloudTypesList.txt', 'r')
for cloudTypesName in CloudTypesList:
    cloud_type = cloudTypesName.strip('\n')
    imageDir = image_path + "\\" + cloud_type
    print("imageDir--------------" + imageDir)

    # Flickr crawling
    flickr_crawler = FlickrImageCrawler(Flickr_API_Key,
                                        parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': imageDir})
    flickr_crawler.crawl(text=cloud_type, max_num=1000, tags=cloud_type)

    # Google crawling
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': imageDir})
    google_crawler.crawl(keyword=cloud_type,
                         max_num=1000,
                         file_idx_offset='auto')

    # Bing crawling
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': imageDir})
if not os.path.isdir(argv[1]):
    os.makedirs(argv[1])

#crawler = GoogleImageCrawler(storage = {"root_dir" : argv[1]})
crawler = GoogleImageCrawler(storage={'root_dir': f'{argv[1]}/google'})
crawler.crawl(keyword=argv[2], max_num=10000, min_size=(200, 200), max_size=None)

#bing_crawler = BingImageCrawler(storage = {"root_dir" : argv[1]})
bing_crawler = BingImageCrawler(storage={'root_dir': f'{argv[1]}/bing'})
bing_crawler.crawl(keyword=argv[2], max_num=10000, min_size=(200, 200), max_size=None)

#baidu_crawler = BaiduImageCrawler(storage = {"root_dir" : argv[1]})
baidu_crawler = BaiduImageCrawler(storage={'root_dir': f'{argv[1]}/baidu'})
baidu_crawler.crawl(keyword=argv[2], max_num=10000, min_size=(200, 200), max_size=None)

# Unlike the search-engine crawlers, FlickrImageCrawler needs an API key
# (apikey=None falls back to the FLICKR_APIKEY environment variable) and
# takes Flickr search parameters such as text=, not keyword= or min_size=.
flickr_crawler = FlickrImageCrawler(apikey=None,
                                    storage={'root_dir': f'{argv[1]}/flickr'})
flickr_crawler.crawl(text=argv[2], max_num=10000)
from datetime import date

from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import GoogleImageCrawler

#google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,
#                                    storage={'root_dir': 'your_image_dir'})
#google_crawler.crawl(keyword='stairs', max_num=1000, date_min=None,
#                     date_max=None, min_size=(200, 200), max_size=None)

flickr_crawler = FlickrImageCrawler('9ec17606b35b36e913892f9c40b14374',
                                    storage={'root_dir': 'flickr'})
flickr_crawler.crawl(max_num=1000,
                     tags='stairs',
                     min_upload_date=date(1900, 5, 1))

'''
#from icrawler.examples import BingImageCrawler
#from icrawler.examples import BaiduImageCrawler
#from icrawler.examples import GoogleImageCrawler

#google_crawler = GoogleImageCrawler('/home/chris/Desktop/stairs/ic/google')
#google_crawler.crawl(keyword='stairs outside', offset=100, max_num=1000,
#                     date_min=None, date_max=None, feeder_thr_num=1,
#                     parser_thr_num=1, downloader_thr_num=4,
#                     min_size=(200, 200), max_size=None)

#bing_crawler = BingImageCrawler('bing')
#bing_crawler.crawl(keyword='stairs outside', offset=0, max_num=2000,
#                   feeder_thr_num=1, parser_thr_num=1, downloader_thr_num=4,
#                   min_size=None, max_size=None)

#baidu_crawler = BaiduImageCrawler('/home/chris/Desktop/stairs/ic/baidu')
#baidu_crawler.crawl(keyword='stairs outside', offset=0, max_num=2000,
#                    feeder_thr_num=1, parser_thr_num=1, downloader_thr_num=4,
#                    min_size=None, max_size=None)
google_crawler = GoogleImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': 'your_image_dir'})
google_crawler.crawl(keyword='yadea brand logo',
                     max_num=1000,
                     min_size=(200, 200),
                     max_size=None)

bing_crawler = BingImageCrawler(parser_threads=2,
                                downloader_threads=4,
                                storage={'root_dir': 'your_image_dir'})
bing_crawler.crawl(keyword='yadea brand logo',
                   max_num=1000,
                   min_size=(200, 200),
                   max_size=None)

baidu_crawler = BaiduImageCrawler(parser_threads=2,
                                  downloader_threads=4,
                                  storage={'root_dir': 'your_image_dir'})
baidu_crawler.crawl(keyword='yadea brand logo',
                    max_num=1000,
                    min_size=(200, 200),
                    max_size=None)

# FlickrImageCrawler needs an API key (apikey=None falls back to the
# FLICKR_APIKEY environment variable) and searches with text=/tags=,
# not keyword= or min_size=.
flickr_crawler = FlickrImageCrawler(apikey=None,
                                    parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': 'your_image_dir'})
flickr_crawler.crawl(text='yadea brand logo', max_num=1000)