def test_greedy():
    img_dir = osp.join(test_dir, 'greedy')
    greedy_crawler = GreedyImageCrawler(
        parser_threads=2, storage={'root_dir': img_dir})
    greedy_crawler.crawl(
        'http://www.bbc.com/news', max_num=5, min_size=(100, 100))
    shutil.rmtree(img_dir)
def test_greedy():
    print('start testing GreedyImageCrawler')
    greedy_crawler = GreedyImageCrawler(
        parser_threads=4, storage={'root_dir': 'images/greedy'})
    greedy_crawler.crawl(
        'http://www.bbc.com/news', max_num=10, min_size=(100, 100))
# (fragment from a larger dispatch on `engine`; the opening of the Baidu
# branch is truncated, so the BaiduImageCrawler(...) constructor line below
# is assumed from the surviving keyword arguments)
    baidu_crawler = BaiduImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': 'matches'})
        # log_level=logging.INFO,
        # extra_downloader_args={'log_file': 'meta.txt'})
    baidu_crawler.crawl(keyword=query, offset=0, max_num=num,
                        min_size=(200, 200), max_size=None)
elif engine in ('Greedy', 'greedy'):
    greedy_crawler = GreedyImageCrawler(
        downloader_cls=MyImageDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': 'matches'})
    greedy_crawler.crawl(domains=url, max_num=num,
                         min_size=None, max_size=None)
    # For Flickr
    # from datetime import date
    # from icrawler.builtin import FlickrImageCrawler
    # flickr_crawler = FlickrImageCrawler('your_apikey',
    #                                     storage={'root_dir': 'your_image_dir'})
    # flickr_crawler.crawl(max_num=1000, tags='child,baby',
    #                      group_id='68012010@N00',
    #                      min_upload_date=date(2015, 5, 1))
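The branch above passes downloader_cls=MyImageDownloader, which is not defined anywhere in these snippets. A minimal sketch of what such a class could look like, assuming the intent is only to tag saved filenames (the class name and prefix are illustrative, not the original project's code):

from icrawler import ImageDownloader


class MyImageDownloader(ImageDownloader):
    """Hypothetical custom downloader; icrawler crawlers accept a replacement
    downloader class via the downloader_cls argument. Only get_filename is
    overridden here."""

    def get_filename(self, task, default_ext):
        # Reuse icrawler's default numbered filename and add a prefix so the
        # files from this crawl are easy to identify (the prefix is an
        # assumption, not the original behaviour).
        filename = super(MyImageDownloader, self).get_filename(task, default_ext)
        return 'match_' + filename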
""" BING CRAWLER WITH ICRAWLER PACKAGE. """ from icrawler.builtin import BingImageCrawler, GreedyImageCrawler search_term = 'tsutsugamushi' """ bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': search_term + ' crawled images'}) bing_crawler.crawl(keyword=search_term, filters=None, offset=0, max_num=1000) """ # search_url = 'https://bbc.com' greedy_crawler = GreedyImageCrawler( storage={ 'root_dir': 'greedy_bing_eng_url ' + search_term + ' crawled images' }) greedy_crawler.crawl(domains=search_url, max_num=1000, min_size=None, max_size=None)
def image_crawler():
    table = dynamodb.Table('Images')
    target = request.form.get('target')
    num = request.form.get('num')
    num = int(num)
    radio = request.form.get('gridRadios')

    if radio == 'Greedy':
        url = target
        url = str(url)
        greedy_crawler = GreedyImageCrawler(
            storage={'root_dir': 'downloaded_pictures'})
        greedy_crawler.crawl(domains=url, max_num=num,
                             min_size=(200, 200), max_size=None)
        # NOTE: file_names is never assigned in this snippet; presumably it
        # should hold the files written to 'downloaded_pictures'
        # (e.g. os.listdir('downloaded_pictures')).
        print(file_names)
        for file_name in file_names:
            response = table.put_item(Item={
                'username': session['username'],
                'imagename': file_name,
            })

    if radio == 'Instagram':
        looter = InstaLooter(directory="/tmp/", profile=target)
        looter.download_pictures(media_count=num)
        counter = 0
        for media in looter.medias():
            print(media)
            if counter < num:
                if media['is_video']:
                    continue
                    # url = looter.get_post_info(media['code'])['video_url']
                else:
                    counter = counter + 1
                    url = media['display_src']
                s3 = boto3.client('s3')
                fp = io.BytesIO(urlopen(url).read())
                s3.upload_fileobj(fp, 'ece1779project', media['id'] + '.jpg')
                response = table.put_item(
                    Item={
                        'username': session['username'],
                        'imagename': media['id'] + '.jpg',
                    })
            else:
                break

    if radio == 'Google':
        google_crawler = GoogleImageCrawler(
            parser_threads=2,
            downloader_threads=4,
            storage={'root_dir': 'downloaded_pictures'})
        google_crawler.crawl(keyword=target, max_num=num,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
        for file_name in file_names:
            response = table.put_item(Item={
                'username': session['username'],
                'imagename': file_name,
            })

    return render_template("/imagecrawler/form.html")
def getImagesFromDomain(self, query, domain_url, num_pics):
    # NOTE: query and num_pics are not used here; the crawl size and minimum
    # dimensions come from attributes set on the instance instead.
    greedy_crawler = GreedyImageCrawler()
    greedy_crawler.crawl(domains=domain_url, max_num=self.num_of_images,
                         min_size=(self.min_width, self.min_height),
                         max_size=None)
# coding: utf-8

# In[ ]:

from icrawler.builtin import GreedyImageCrawler

greedy_crawler = GreedyImageCrawler(parser_threads=2, downloader_threads=2,
                                    storage={'root_dir': 'data'})
greedy_crawler.crawl(domains='www.***.com', max_num=1000,
                     min_size=None, max_size=None)

# In[ ]:

from icrawler.builtin import BaiduImageCrawler

baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'data'})
# the keyword '猫' is Chinese for 'cat'
baidu_crawler.crawl(keyword='猫', offset=0, max_num=1000,
                    min_size=None, max_size=None)

# In[ ]:

from icrawler.builtin import GoogleImageCrawler

Google_Crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=2,
                                    storage={'root_dir': 'data'})
Google_Crawler.crawl(keyword='flower', max_num=1000,
                     date_min=None, date_max=None,
                     min_size=(160, 160), max_size=None)