Example #1
def crawl(crawl_list, work_list):
    for i, class_name in enumerate(crawl_list):
        print("Now fetching class: %s" % class_name)
        output_dir = os.path.join(output_path, class_name)
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        flickr_crawler = FlickrImageCrawler('',    # put your Flickr API key here
                                            feeder_threads=2, parser_threads=10, downloader_threads=5,
                                            parser_cls=MyFlickrParser,
                                            downloader_cls=MyImageDownloader,
                                            storage={'root_dir': output_dir},
                                            log_level=logging.ERROR)

        # Time counter
        prev_time = float("-inf")
        curr_time = float("-inf")
        # Use a separate loop variable so the outer index `i` (checked below) is not clobbered
        for j in range(28):
            curr_time = time()
            elapsed = curr_time - prev_time
            print(
                "Now at iteration %d. Elapsed time: %.5fs." % (j, elapsed))
            prev_time = curr_time
            flickr_crawler.crawl(max_num=4000, text=class_name, sort='relevance', per_page=500,
                                 min_upload_date=TODAY - (j+1) * delta, max_upload_date=TODAY - j * delta,
                                 extras='url_n,url_z,original_format,path_alias')

        work_list.append(class_name)
        if i >= len(crawl_list) - 1:
            work_list.append('end')
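The loop above windows each query by upload date, so a single text search can collect far more images than one Flickr search would return. TODAY, delta, output_path, MyFlickrParser, and MyImageDownloader are defined elsewhere in that project; a minimal sketch of how the two date values might look (an assumption, not shown in the original):

# Assumed definitions for the module-level names used above; the 30-day
# window width is illustrative, not taken from the original project.
from datetime import date, timedelta

TODAY = date.today()
delta = timedelta(days=30)  # each of the 28 iterations covers one 30-day upload window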
Example #2
def test_flickr():
    flickr_crawler = FlickrImageCrawler(apikey=None,
                                        downloader_threads=4,
                                        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(max_num=10,
                         tags='family,child',
                         tag_mode='all',
                         group_id='68012010@N00')
Example #3
def start_flickr_crawler(path: Path, search_text: str, num_images: int, apikey: str):
    """Kicks off a Flickr download. Requires an apikey."""
    assert apikey is not None, "Flickr requires an apikey: 'https://www.flickr.com/services/api/misc.api_keys.html'"
    crawler = FlickrImageCrawler(
            apikey,
            feeder_threads=2,
            parser_threads=2,
            downloader_threads=8,
            storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=num_images, tag_mode='all')
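A hypothetical call, with a placeholder key and made-up arguments:

# Hypothetical usage; the directory, search text, and key are placeholders.
from pathlib import Path

start_flickr_crawler(Path('images/dogs'), 'golden retriever',
                     num_images=200, apikey='YOUR_FLICKR_API_KEY')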
Example #4
def search_image(pet, target, savePath):
    emotion = target.strip('\n')
    imageDir = os.path.join(savePath, emotion)
    print(imageDir)
    searchName = emotion + ' ' + pet
    print(searchName)
    flickr_crawler = FlickrImageCrawler(Flicker_API, storage={'root_dir': imageDir})
    flickr_crawler.crawl(max_num=1000, tags=searchName, text=searchName)
    google_crawler = GoogleImageCrawler(storage={'root_dir': imageDir})
    google_crawler.crawl(keyword=searchName, max_num=1000)
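The target argument arrives with a trailing newline (hence the strip('\n')), which suggests it is read straight from a file of labels. A hypothetical driver under that assumption:

# Hypothetical driver; 'emotions.txt' (one label per line) and the paths are placeholders.
with open('emotions.txt') as labels:
    for line in labels:
        search_image('cat', line, '/data/pet_images')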
Example #5
File: crawl.py Project: hellock/icrawler
def test_flickr():
    print('start testing FlickrImageCrawler')
    flickr_crawler = FlickrImageCrawler(
        apikey=None,
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(
        max_num=10,
        tags='family,child',
        tag_mode='all',
        group_id='68012010@N00')
Example #6
def start_flickr_crawler(path: Path, search_text: str, n_images: int,
                         apikey: str):
    if apikey is None:
        print(
            "Flickr requires an apikey: 'https://www.flickr.com/services/api/misc.api_keys.html'"
        )
        exit()
    crawler = FlickrImageCrawler(apikey,
                                 feeder_threads=2,
                                 parser_threads=2,
                                 downloader_threads=8,
                                 storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=n_images, tag_mode='all')
Example #7
def get_images(img_path):
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    flickr_crawler = FlickrImageCrawler('b040ad4b6a95ddaa8ad86f0762ebc828',
                                        downloader_cls=MyImageDownloader,
                                        downloader_threads=5,
                                        storage={'root_dir': img_path})
    flickr_crawler.crawl(
        max_num=500,
        tags='cathedral',
        extras='description',
        group_id='92229480@N00',  # Umbria
        min_upload_date=date(2005, 5, 1))
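MyImageDownloader is defined elsewhere in that project. A custom downloader subclasses icrawler's ImageDownloader, typically overriding the get_filename hook; a minimal sketch (the prefixing logic is invented):

from icrawler import ImageDownloader

class MyImageDownloader(ImageDownloader):
    """Illustrative stand-in: keeps icrawler's default numbering but
    prefixes every saved file, e.g. 000001.jpg -> flickr_000001.jpg."""

    def get_filename(self, task, default_ext):
        filename = super().get_filename(task, default_ext)
        return 'flickr_' + filename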
Example #8
def crawl_images(tags, index, batch_size):
    # https://www.flickr.com/services/apps/create/noncommercial/?
    # key: 6ea696a89e485ce8b39cd052cc1dbd01
    # secret: c3acc5f2a23734b4

    i = int(batch_size / 100) * index + 1
    print('index:', index, ', i:', i)
    subdir_name = tags.replace(',', '_')
    subdir = os.path.join(ROOT_DIR, '{}_{}'.format(subdir_name, index))
    print(subdir)
    set_sub_dir(subdir)

    flickr_crawler = FlickrImageCrawler(apikey='6ea696a89e485ce8b39cd052cc1dbd01',
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=8,
                                        downloader_cls=FaceImageDownloader,
                                        storage={'root_dir': subdir})
    flickr_crawler.crawl(max_num=batch_size, tags=tags, page=i,
                         size_preference=['large 2048', 'large 1600', 'large'],
                         min_upload_date=date(2010, 5, 25))
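The expression int(batch_size / 100) * index + 1 turns a batch index into a Flickr page offset, assuming the API default of 100 results per page: with batch_size=500, batch 0 starts at page 1, batch 1 at page 6, and so on. A hypothetical driver (the tags are placeholders):

# Hypothetical driver: three non-overlapping batches of 500 images each
# (pages 1-5, 6-10, 11-15 at Flickr's default 100 results per page).
for idx in range(3):
    crawl_images('smiling,face', index=idx, batch_size=500)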
Example #9
"""
 AUTHOR : Min Xue
 PURPOSE : To collect bouquet of flowers images from Flickr
"""

import os
import sys
from datetime import date
from icrawler.builtin import FlickrImageCrawler

image_path = '/Users/xuemin/Desktop/FlickrImageCollection/result'
API_KEY = '13ef101ff4bac39647acb5531d8d0a3c'

with open('List1.txt', 'rt') as FlowerBreedList:
    for nameList in FlowerBreedList:
        name = nameList.strip('\n')
        imageDir = os.path.join(image_path, name)
        flickr_crawler = FlickrImageCrawler(API_KEY, storage={'root_dir': imageDir})
        flickr_crawler.crawl(max_num=500, tags=name, text=name)

print("Collection is done")
Example #10
import os
import pathlib
import shutil
from datetime import date

from icrawler.builtin import (BaiduImageCrawler, BingImageCrawler,
                              FlickrImageCrawler, GoogleImageCrawler)

flickr = FlickrImageCrawler(
    '061beed3b1f1e2f1c95874ea769f7536',
    downloader_threads=4,
    storage={'root_dir': 'bike'})
flickr.crawl(
    group_id="60503902@N00",
    tags='bike',
    max_num=100,
    min_upload_date=date(2017, 1, 1))

flickr = FlickrImageCrawler(
    '061beed3b1f1e2f1c95874ea769f7536',
    downloader_threads=4,
    storage={'root_dir': 'car'})
flickr.crawl(
    group_id="2309748@N20",
    tags='car',
    max_num=100,
    min_upload_date=date(2017, 1, 1))
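The original snippet breaks off mid-way through a third, identically-shaped crawler block; the two complete blocks differ only in category name and group ID, so they could equally be written as a loop:

# Equivalent loop over the (root_dir, group_id) pairs used above.
from datetime import date
from icrawler.builtin import FlickrImageCrawler

for root_dir, group_id in [('bike', '60503902@N00'), ('car', '2309748@N20')]:
    crawler = FlickrImageCrawler('061beed3b1f1e2f1c95874ea769f7536',
                                 downloader_threads=4,
                                 storage={'root_dir': root_dir})
    crawler.crawl(group_id=group_id, tags=root_dir, max_num=100,
                  min_upload_date=date(2017, 1, 1))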

Example #11
import sys
from datetime import date
from icrawler.builtin import FlickrImageCrawler

image_path = sys.argv[1]
API_KEY = '6443fc1e493dab0e64443981f7364370'

flowerWebSiteFile1 = "flowerNameSet1.html"
flowerWebSiteFile2 = "flowerNameSet2.html"


def getFlowerName(htmlFile):
    flowerNameList = []
    with open(htmlFile, "r", encoding="UTF-8") as f:
        for line in f:
            if "alt=" in line:
                info = line[(line.index("alt=") + 5):]
                flowerName = info[:(info.index("image") - 1)]
                flowerNameList.append(flowerName)
    return flowerNameList
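A worked sample of the line format getFlowerName assumes (the HTML line is hypothetical): index("alt=") + 5 skips past alt=" including the opening quote, and the slice stops one character before the literal word "image".

# Hypothetical input line; the parser yields the text between alt=" and " image".
sample = '<img alt="rose image" src="rose.jpg">'
info = sample[(sample.index("alt=") + 5):]  # 'rose image" src="rose.jpg">'
print(info[:(info.index("image") - 1)])     # prints: rose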


flowerList1 = getFlowerName(flowerWebSiteFile1)
flowerList2 = getFlowerName(flowerWebSiteFile2)
flowerList = flowerList1 + flowerList2

for name in flowerList:
    imageDir = image_path + '/' + name
    flickr_crawler = FlickrImageCrawler(API_KEY,
                                        storage={'root_dir': imageDir})
    flickr_crawler.crawl(max_num=4000, tags=name)
Example #12
def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: List[str] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images"""
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f"    -> {c}")
        if c == "GOOGLE":
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )

            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )

        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )

        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print(
                    "Error: Flickr crawler requires FLICKR_API_KEY environment variable"
                    " to be set with your non-secret API key.")
                exit(-1)

            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
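A hypothetical invocation (folder and search term are placeholders; the FLICKR branch additionally needs the FLICKR_API_KEY environment variable set):

# Hypothetical usage; returns CustomDownloader's registry of downloaded items.
results = crawl('images/tabby_cat', 'tabby cat', maxnum=100,
                crawlers=['GOOGLE', 'BING'])
print(len(results), 'entries in the download registry')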
Example #13
# Secret:
# 69e361b8ccd8e185

Flickr_API_Key = 'bff26669f752de80bcc10f69c3d6fb92'

CloudTypesList = open('CloudTypesList.txt', 'r')

for cloudTypesName in CloudTypesList:
    cloud_type = cloudTypesName.strip('\n')
    imageDir = image_path + "\\" + cloud_type
    print("imageDir--------------" + imageDir)

    # Flickr crawling
    flickr_crawler = FlickrImageCrawler(Flickr_API_Key,
                                        parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': imageDir})
    flickr_crawler.crawl(text=cloud_type, max_num=1000, tags=cloud_type)

    # Google crawling
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': imageDir})
    google_crawler.crawl(keyword=cloud_type,
                         max_num=1000,
                         file_idx_offset='auto')

    # Bing crawling
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': imageDir})
Example #14
if not os.path.isdir(argv[1]):
    os.makedirs(argv[1])

crawler = GoogleImageCrawler(storage={'root_dir': f'{argv[1]}/google'})
crawler.crawl(keyword=argv[2],
              max_num=10000,
              min_size=(200, 200),
              max_size=None)

bing_crawler = BingImageCrawler(storage={'root_dir': f'{argv[1]}/bing'})
bing_crawler.crawl(keyword=argv[2],
                   max_num=10000,
                   min_size=(200, 200),
                   max_size=None)

baidu_crawler = BaiduImageCrawler(storage={'root_dir': f'{argv[1]}/baidu'})
baidu_crawler.crawl(keyword=argv[2],
                    max_num=10000,
                    min_size=(200, 200),
                    max_size=None)

# Note: FlickrImageCrawler also needs a Flickr API key as its first argument,
# and its crawl() takes Flickr search parameters such as text= rather than keyword=.
flickr_crawler = FlickrImageCrawler(storage={'root_dir': f'{argv[1]}/flickr'})
flickr_crawler.crawl(text=argv[2],
                     max_num=10000,
                     min_size=(200, 200),
                     max_size=None)
Example #15
from datetime import date
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import GoogleImageCrawler

#google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': 'your_image_dir'})
#google_crawler.crawl(keyword='stairs', max_num=1000, date_min=None, date_max=None, min_size=(200,200), max_size=None)

flickr_crawler = FlickrImageCrawler('9ec17606b35b36e913892f9c40b14374',
                                    storage={'root_dir': 'flickr'})
flickr_crawler.crawl(max_num=1000,
                     tags='stairs',
                     min_upload_date=date(1900, 5, 1))
#from icrawler.examples import BingImageCrawler
#from icrawler.examples import BaiduImageCrawler
#from icrawler.examples import GoogleImageCrawler

#google_crawler = GoogleImageCrawler('/home/chris/Desktop/stairs/ic/google')
#google_crawler.crawl(keyword='stairs outside', offset=100, max_num=1000,
#                     date_min=None, date_max=None, feeder_thr_num=1,
#                     parser_thr_num=1, downloader_thr_num=4,
#                     min_size=(200,200), max_size=None)

#bing_crawler = BingImageCrawler('bing')
#bing_crawler.crawl(keyword='stairs outside', offset=0, max_num=2000,
#                   feeder_thr_num=1, parser_thr_num=1, downloader_thr_num=4,
#                   min_size=None, max_size=None)
#baidu_crawler = BaiduImageCrawler('/home/chris/Desktop/stairs/ic/baidu')
#baidu_crawler.crawl(keyword='stairs outside', offset=0, max_num=2000,
#                    feeder_thr_num=1, parser_thr_num=1, downloader_thr_num=4,
#                    min_size=None, max_size=None)
Example #16
google_crawler = GoogleImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': 'your_image_dir'})
google_crawler.crawl(keyword='yadea brand logo',
                     max_num=1000,
                     min_size=(200, 200),
                     max_size=None)

bing_crawler = BingImageCrawler(parser_threads=2,
                                downloader_threads=4,
                                storage={'root_dir': 'your_image_dir'})
bing_crawler.crawl(keyword='yadea brand logo',
                   max_num=1000,
                   min_size=(200, 200),
                   max_size=None)

baidu_crawler = BaiduImageCrawler(parser_threads=2,
                                  downloader_threads=4,
                                  storage={'root_dir': 'your_image_dir'})
baidu_crawler.crawl(keyword='yadea brand logo',
                    max_num=1000,
                    min_size=(200, 200),
                    max_size=None)

# Note: FlickrImageCrawler also expects a Flickr API key as its first argument,
# and its crawl() takes Flickr search parameters such as text= rather than keyword=.
flickr_crawler = FlickrImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': 'your_image_dir'})
flickr_crawler.crawl(text='yadea brand logo',
                     max_num=1000,
                     min_size=(200, 200),
                     max_size=None)