Example #1
    def claw(self, keyword, size=10):
        start = time.time()

        google_crawler = GoogleImageCrawler(storage={'root_dir': '/home/zluo/food/' + keyword})
        google_crawler.crawl(keyword=keyword + ' dishes', max_num=size)
        end = time.time()
        print(end - start)
Example #2
def get_image(name, file_path, data_count, sample_filter=None):
    crawler = GoogleImageCrawler(storage={"root_dir": file_path + "/train" + "/" + name})

    # Default filters (note: crawl() below receives sample_filter, not this dict)
    filters = dict(
        size="large",
        type="photo"
    )

    # Run the crawl
    crawler.crawl(keyword=name, filters=sample_filter, max_num=data_count)

    # Create the val directory
    if os.path.isdir(file_path + "/val" + "/" + name):
        shutil.rmtree(file_path + "/val" + "/" + name)

    os.makedirs(file_path + "/val" + "/" + name)

    # Build the list of downloaded files
    filelist = glob.glob(file_path + "/train" + "/" + name + "/*")
    # Keep 20% of the training data as validation data
    ratio = 0.2
    val_files = random.sample(filelist, int(len(filelist) * ratio))

    for line in val_files:
        shutil.move(line, file_path + "/val" + "/" + name)
Example #3
def crawl_image(keyword, max_num):
    try:
        crawler = GoogleImageCrawler(storage={'root_dir': 'images'})
        crawler.crawl(keyword=keyword, max_num=int(max_num))
        var.set('Crawling is done.')
    except ValueError:
        var.set('Maximum number of images \n must be an integer.')
Example #4
def _download(query, dir, amount_to_crawl):
    """
    Download ``amount_to_crawl`` images from Google Image Search for query ``query`` and save them in directory ``dir``
    :param query: Search query for Google Image Search
    :param dir: Directory to save the results in
    :param amount_to_crawl:  Number of pictures to crawl
    """
    intlen = len(str(amount_to_crawl))
    google_crawler = GoogleImageCrawler(feeder_threads=10,
                                        parser_threads=1,
                                        log_level=100,
                                        downloader_threads=10,
                                        storage={'root_dir': dir})
    end_date = date.today()
    amount_crawled = 0
    while amount_crawled < amount_to_crawl > 0:  # chained: amount_crawled < amount_to_crawl and amount_to_crawl > 0
        crawling = min(max_per_iteration, amount_to_crawl - amount_crawled)
        date_filter = _get_date_filter(end_date)
        log(
            f'{get_progress((amount_crawled + crawling) / amount_to_crawl)} '
            f'crawling images '
            f'{amount_crawled:0{intlen}d} - {(amount_crawled + crawling):0{intlen}d} / {amount_to_crawl} '
            f'for \'{query}\''
            f' in daterange {date_filter}',
            end='\r')
        google_crawler.crawl(keyword=query,
                             filters={'date': date_filter},
                             max_num=crawling,
                             file_idx_offset='auto')
        amount_crawled += crawling
        end_date += relativedelta(years=-1)
    print('')
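Example #4 above calls several helpers (max_per_iteration, _get_date_filter, get_progress, log) that are defined elsewhere in its module. A minimal sketch of plausible definitions, purely an assumption to make the snippet self-contained:

# Hypothetical helpers assumed by Example #4 -- not part of the original snippet.
from datetime import date
from dateutil.relativedelta import relativedelta

max_per_iteration = 100  # assumed cap on images requested per crawl() call

def _get_date_filter(end_date):
    """Return ((y, m, d), (y, m, d)) covering the year ending at end_date,
    matching icrawler's filters={'date': ...} format."""
    start_date = end_date + relativedelta(years=-1)
    return ((start_date.year, start_date.month, start_date.day),
            (end_date.year, end_date.month, end_date.day))

def get_progress(fraction):
    """Render a crude ten-slot textual progress bar, e.g. '[####      ]'."""
    filled = int(fraction * 10)
    return '[' + '#' * filled + ' ' * (10 - filled) + ']'

def log(*args, **kwargs):
    """The original may wrap logging; plain print is a safe stand-in."""
    print(*args, **kwargs)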
Example #5
def image_downloader(keywords=[], dir_path="dataset", maximages=100):
    
    for i, item in enumerate(keywords):

        dir_path_keyword = os.path.join(dir_path,item)

        # If the directory does not exist, create it
        if not os.path.exists(dir_path_keyword):
            os.makedirs(dir_path_keyword)
        
        print('\n\tWriting to directory: ' + str(dir_path_keyword))
        print('\tDownloading images for keyword: ' + str(item) + '\n')

        google_crawler = GoogleImageCrawler(
            feeder_threads=1,
            parser_threads=2,
            downloader_threads=4,
            storage={'root_dir': dir_path_keyword})
        filters = dict(
            type='photo'
            # size='large',
            # color='orange',
            # license='commercial,modify',
            # date=((2017, 1, 1), (2017, 11, 30))
            )
        google_crawler.crawl(keyword=item, filters=filters, max_num=maximages, file_idx_offset=0)

    return True
Example #6
    def photo(self, photoDir, WORDS):
        # Creates an instance of GoogleImageCrawler (icrawler > builtin >
        # google.py) and passes the MyImageDownloader class as the downloader
        # class instead of the standard library's ImageDownloader class
        google_crawler = GoogleImageCrawler(
            downloader_cls=MyImageDownloader,
            parser_threads=2,
            downloader_threads=4,
            # stores the files where the user indicates in the script argument
            storage={'root_dir': photoDir})
        # This is the key statement: it assigns the phone's name_ to the
        # beginning of the filename of the downloaded photo
        google_crawler.downloader.prefix_name = self.name_
        # Set session.verify = False as a workaround for an exception from
        # requests, found here: https://github.com/hellock/icrawler/issues/40
        google_crawler.session.verify = False
        # Get a random word from the list of WORDS passed
        word = random.choice(WORDS)
        # Actual call to the crawl method to scrape Google Images
        google_crawler.crawl(keyword=word, max_num=1)
        # Print the location that was passed by the script to the function
        print(
            textwrap.dedent("""
            File has been downloaded to:
                {}""".format(photoDir)))
Example #7
def exe_crawl(arg):
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameGoogleDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    google_crawler.crawl(keyword=f'{arg.keyword}',
                         filters=filters,
                         offset=0,
                         max_num=arg.max,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}',
                       filters=filters,
                       offset=0,
                       max_num=arg.max)

    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='This script downloads images from Google Images.')
    parser.add_argument(
        '-k',
        '--keyword',
        type=str,
        nargs='+',
        help='The keywords of images that users want to download.')
    parser.add_argument(
        '-d',
        '--dir',
        type=str,
        help='The dir that is used to save images. Default: ./image.',
        default='./image')
    parser.add_argument(
        '-n',
        '--num',
        type=int,
        help='The maximum number of images to download for each keyword. '
             'Ideally a multiple of 60. Default: 60.',
        default=60)
    args = parser.parse_args()
    max_num = args.num
    save_dir = args.dir
    keywords = args.keyword

    for keyword in keywords:
        print('Starting to download the images for keyword: ' + keyword)
        google_storage = {'root_dir': save_dir + '/' + keyword}
        google_crawler = GoogleImageCrawler(parser_threads=4,
                                            downloader_threads=4,
                                            storage=google_storage)
        google_crawler.crawl(keyword=keyword, max_num=max_num)
Example #9
def getPoliticianImage():

    # image path : first argument
    image_path = sys.argv[1]

    # excel path : second argument
    excel_path = sys.argv[2]

    # max number of image : third argument
    max_num_image = int(sys.argv[3])

    excelFile = xlrd.open_workbook(excel_path)
    politician_key_value_list = excelFile.sheet_by_index(0)

    for i in range(0, politician_key_value_list.nrows):

        # folder name (politician english name)
        folder_name = politician_key_value_list.cell_value(i, 1)

        # directory where the images will be saved
        total_path = image_path + '/' + folder_name

        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=8,
                                            storage={'root_dir': total_path})

        google_crawler.crawl(keyword=politician_key_value_list.cell_value(i, 0),
                             offset=0,
                             max_num=max_num_image,
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
Example #10
def imagecrawl(searchwords, imagenum, title):
    """
    searchwords: 検索クエリ str型.
    imagenum: 画像の枚数 int型.
    title: 作品タイトル train/validation 配下のディレクトリを指定 str型.
    """

    # Check for and create the directories
    train_dir = './train/' + title + '/'  # directory under train
    dircheck(train_dir)
    valid_dir = './validation/' + title + '/'  # directory under validation
    dircheck(valid_dir)

    max_idx = max_file_idx(train_dir, valid_dir)

    # Download the images with the crawler
    crawler = GoogleImageCrawler(storage={"root_dir": "tmp"})
    crawler.crawl(keyword=searchwords,
                  max_num=imagenum,
                  file_idx_offset=max_idx)

    # Split the downloaded files into train and validation and move them
    image_list = glob.glob('./tmp/*')
    random.shuffle(image_list)
    train_list, valid_list = np.split(np.array(image_list),
                                      [int(len(image_list) * 0.8)])
    train_list = list(train_list)
    valid_list = list(valid_list)

    for i in train_list:
        shutil.move(i, train_dir)
    for i in valid_list:
        shutil.move(i, valid_dir)
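Example #10 above relies on two helpers, dircheck and max_file_idx, that are not shown. Hypothetical implementations (an assumption, not the original code):

# Hypothetical helpers assumed by Example #10.
import os
import glob

def dircheck(path):
    """Create the directory if it does not already exist."""
    os.makedirs(path, exist_ok=True)

def max_file_idx(*dirs):
    """Return the highest numeric filename stem across the given directories,
    so the next crawl can continue the numbering via file_idx_offset."""
    idx = 0
    for d in dirs:
        for f in glob.glob(os.path.join(d, '*')):
            stem = os.path.splitext(os.path.basename(f))[0]
            if stem.isdigit():
                idx = max(idx, int(stem))
    return idx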
Example #11
def craw_image(key, i):
    save_dir = '/Users/fanyang/python/finalproject/recipeimage'  # change to your save dir
    crawler = GoogleImageCrawler(feeder_threads=1,
                                 parser_threads=2,
                                 downloader_threads=4,
                                 storage={'root_dir': save_dir + '/' + str(i)})
    crawler.crawl(keyword=key, max_num=3)
Example #12
def send_nudes(entities):
    """
    Sends nudes
    """
    print('--- Running send_nudes')
    # Get the working directory and point at its img subfolder
    path = os.getcwd()
    path = os.path.join(path, 'img')

    # Choose random keyword
    keywrd = ['jesus', 'jesus staring', 'jesus wallpaper']

    # Google image search for 3 images
    google_crawler = GoogleImageCrawler(storage={'root_dir': path})
    google_crawler.crawl(keyword=rand_choice(keywrd), max_num=3)
    file_list = os.listdir(path=path)
    path_list = []

    # Choose random image to display
    for file in file_list:
        path_list.append(os.path.join(path, file))
    image = rand_choice(path_list)

    # Display image
    img = Image.open(image)
    img.show()

    # Delete downloaded images
    for file in path_list:
        os.remove(file)

    text_resp = 'SENDING NUDES'
    return text_resp
Example #13
def pring():
    # new folder name
    new_path = folder_path.get() + "\\" + folder_name.get()
    if not os.path.exists(new_path):
        # make new folder
        os.mkdir(new_path)
        # print("create new folder")
        # main program
        crawler = GoogleImageCrawler(storage={"root_dir": new_path})
        if combo_1.get() != "None":
            filters = dict(size=combo_1.get())
        else:
            filters = None
        crawler.crawl(keyword=picture_name.get(),
                      filters=filters,
                      offset=0,
                      max_num=int(picture_num.get()))
        res = messagebox.askokcancel('finished!!!',
                                     'Reset input, but check folder?')
        folder_name.delete(0, "end")
        picture_name.delete(0, "end")
        picture_num.delete(0, "end")
        if res:
            tkinter.filedialog.askopenfilename(initialdir=new_path)
    else:
        messagebox.showinfo(
            'failed...',
            'A folder with the same name already exists\n' + new_path)
Example #14
def crawl_item(keyword, rootdir, max_num=500, language='vi'):
    '''
        max_num applies to each crawl over a different date range,
        so the total number of crawled images is max_num * (len(date) - 1)
    '''
    global google_crawler
    storage = {'root_dir': rootdir}
    print('Starting to crawl {}'.format(keyword))
    # change the storage dir
    google_crawler = GoogleImageCrawler(
                        feeder_threads=1,
                        parser_threads=1,
                        downloader_threads=4,
                        storage=storage)
    for i in range(len(date)-1):
        try:
            google_crawler.crawl(
                keyword=keyword,
                filters={'date': (date[i], date[i+1])},
                max_num=max_num,
                file_idx_offset='auto',
                language=language)
        except Exception as err:
            print(err)
        time.sleep(0.5)
    return
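Example #14 iterates over a module-level date list defined elsewhere; consecutive pairs become the {'date': (start, end)} filter for each crawl. A plausible shape for it (an assumption):

# Assumed shape of the global 'date' list used in Example #14.
date = [
    (2015, 1, 1),
    (2016, 1, 1),
    (2017, 1, 1),
    (2018, 1, 1),
]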
Example #15
def download_images(keyword, directory, quantity):

    os.chdir(directory)

    if keyword not in os.listdir():

        os.mkdir(keyword)

    os.chdir(keyword)

    google_crawler = GoogleImageCrawler(directory + '/' + keyword)
    google_crawler.crawl(
        keyword=keyword,
        offset=0,
        max_num=quantity,
        date_min=None,
        date_max=None,
        feeder_thr_num=1,
        parser_thr_num=1,
        downloader_thr_num=4,
        #min_size=(200,200), max_size=None)
        min_size=None,
        max_size=None)

    # prepend the keyword to the beginning of each filename
    command = 'perl-rename \'s/(.*)/%s_$1/\' *' % (keyword)
    os.system(command)
Example #16
def my_crawl(name):
    '''
    Uses GoogleImageCrawler to crawl Google Images and download results for the given keyword
    :param name: search keyword, also prepended to each downloaded filename
    :return:
    '''
    class PrefixNameDownloader(ImageDownloader):
        def get_filename(self, task, default_ext):
            filename = super(PrefixNameDownloader,
                             self).get_filename(task, default_ext)
            return name + filename

    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=2,
        downloader_cls=PrefixNameDownloader,
        downloader_threads=4,
        storage={
            'root_dir': '/Volumes/USB STICK/image database/images/google3'
        })
    filters = dict(size='=512x512',
                   license='commercial,modify',
                   date=((2017, 1, 1), (2017, 11, 30)))
    google_crawler.crawl(keyword=name + ' filetype:jpg',
                         filters=filters,
                         max_num=500,
                         file_idx_offset=0)
Example #17
def crawl(
        folder: str,
        search: str,
        maxnum: int,
        crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images"""
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    sources = {}
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f'    -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})

            google_crawler.crawl(keyword=search,
                                 offset=0,
                                 max_num=maxnum,
                                 min_size=(200, 200),
                                 max_size=None,
                                 file_idx_offset=0)

        if c == 'BING':
            bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                            log_level=logging.CRITICAL,
                                            downloader_threads=4,
                                            storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search,
                               filters=None,
                               offset=0,
                               max_num=maxnum,
                               file_idx_offset='auto')

        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                              log_level=logging.CRITICAL,
                                              storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search,
                                offset=0,
                                max_num=maxnum,
                                min_size=(200, 200),
                                max_size=None,
                                file_idx_offset='auto')

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
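Example #17 depends on a CustomDownloader whose class-level registry maps saved filenames to their source URLs. A minimal sketch of such a class (assumed, not the original):

# Minimal sketch of the CustomDownloader assumed by Example #17: it records
# the source URL of every file it names in a class-level registry.
from icrawler import ImageDownloader

class CustomDownloader(ImageDownloader):
    registry = {}

    def get_filename(self, task, default_ext):
        filename = super(CustomDownloader, self).get_filename(task, default_ext)
        CustomDownloader.registry[filename] = task['file_url']
        return filename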
Example #18
def test_google(logo):
    google_crawler = GoogleImageCrawler(
        downloader_cls=MyImageDownloader,
        downloader_threads=4,
        storage={'root_dir': os.path.join(root, logo, 'google')},
        log_level=logging.INFO,
        filename=os.path.join(root, logo, 'google.txt'))
    google_crawler.crawl(logo, max_num=args.maxnum)
Example #19
def crawl_images(image_dir, concept_keyword, N=10):
    google_crawler = GoogleImageCrawler(
        storage={'root_dir': os.path.join(image_dir, concept_keyword+"_before")},
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
    )
    google_crawler.crawl(keyword=concept_keyword, max_num=N)
Example #20
def test_google():
    google_crawler = GoogleImageCrawler(downloader_threads=4,
                                        storage={'root_dir': 'images/google'},
                                        log_level=logging.INFO)
    google_crawler.crawl('tesla',
                         max_num=10,
                         date_min=date(2016, 2, 1),
                         date_max=date(2016, 3, 15))
Example #21
def CrawlByName(name, numPictures, savedir):
    # Crawls numPictures images from Google Images for the given search term
    print('Start Crawling...')
    capture = StringIO()
    sys.stderr = capture
    google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': str(savedir)})
    google_crawler.crawl(keyword=str(name), max_num=numPictures, date_min=None, date_max=None)
    return capture.getvalue()  # Return output with image links (needed later for cross-validation)
Example #22
def getGoogleImage(keyword, dir, max):
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=10,
                                        storage={'root_dir': dir})
    google_crawler.crawl(keyword=keyword,
                         offset=0,
                         max_num=max,
                         min_size=(100, 100),
                         max_size=None)
Example #23
def download_image(fish_name, max_num):
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=54,
        storage={'root_dir': 'fish_image/'+fish_name})

    google_crawler.crawl(keyword=fish_name, filters=None, offset=0, max_num=max_num,
                         min_size=None, max_size=None, file_idx_offset=0)
Example #24
def test_google():
    print('start testing GoogleImageCrawler')
    google_crawler = GoogleImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/google'},
        log_level=logging.INFO)
    search_filters = dict(
        size='large',
        color='orange',
        license='commercial,modify',
        date=(None, (2017, 11, 30)))
    google_crawler.crawl('cat', filters=search_filters, max_num=10)
Example #25
def test_google():
    img_dir = osp.join(test_dir, 'google')
    google_crawler = GoogleImageCrawler(
        downloader_threads=2,
        storage={'root_dir': img_dir},
        log_level=logging.INFO)
    search_filters = dict(
        size='large',
        color='orange',
        license='commercial,modify',
        date=(None, (2017, 11, 30)))
    google_crawler.crawl('cat', filters=search_filters, max_num=5)
    shutil.rmtree(img_dir)
Example #26
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)

    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    else:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4, storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
Example #27
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,
                                    storage={'root_dir': 'C:/Users/Saurabh/Desktop/'})
google_crawler.crawl(keyword='sandwich', offset=0, max_num=100,
                     date_min=None, date_max=None,
                     min_size=(200,200), max_size=None)
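This last snippet uses the older crawl signature, where date_min and date_max were direct arguments. In icrawler 0.6+ the date range moved into the filters argument (as Examples #24 and #25 show); a rough modern equivalent, keeping the original path and counts, with an illustrative date range:

# Rough icrawler 0.6+ equivalent of the snippet above; the date range now goes
# through 'filters' instead of date_min/date_max (the dates here are illustrative).
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,
                                    storage={'root_dir': 'C:/Users/Saurabh/Desktop/'})
google_crawler.crawl(keyword='sandwich', offset=0, max_num=100,
                     min_size=(200, 200), max_size=None,
                     filters={'date': ((2017, 1, 1), (2017, 11, 30))})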