def save_cse_google_images(query, save_dir='crawled/API_googled/', min_width=80, min_height=80, verbose=True):
    """Crawl image results for ``query`` through the Google Custom Search API.

    Pages of ten image results are requested at start indices 1, 11, ..., 81.
    Each page is handed to ``save_images`` together with the shared ``ranks``
    list.  Afterwards ``purify`` is run on the query directory and ``ranks``
    is written to ``ranks.csv`` inside it.  An existing ``ranks.csv`` marks
    the query as already crawled, so a repeated call returns immediately.

    Crawling is best-effort: an exception raised while fetching or saving a
    page is reported (only when ``verbose``) and the next page is tried.

    Args:
        query: Search phrase; spaces become underscores in the directory name.
        save_dir: Parent directory.  It is concatenated directly with the
            sub-directory name, so it must end with a path separator.
        min_width: Passed through to ``save_images``.
        min_height: Passed through to ``save_images``.
        verbose: When true, print progress and error messages.

    Returns:
        None.
    """
    sub_dir = query.replace(' ', '_')
    # Plain concatenation (not os.path.join) on purpose: save_images() receives
    # save_dir and sub_dir separately and presumably combines them the same
    # way -- TODO confirm before switching to os.path.join.
    target_dir = save_dir + sub_dir
    if not os.path.exists(target_dir):
        # makedirs (unlike mkdir) also creates missing parent directories, so
        # a first run with the nested default save_dir does not fail.
        os.makedirs(target_dir)
    elif os.path.isfile(os.path.join(target_dir, 'ranks.csv')):
        if verbose:
            print('%s was already crawled, skipping...' % query)
        return
    # Random 1-5 second pause, used to not flood the nameserver.
    time.sleep(random.randint(1, 5))
    # Initialize the Google Custom Search Engine API client.
    service = google_build("customsearch", "v1", developerKey=DEV_KEY)
    collection = service.cse()
    ranks = []
    if verbose:
        print('Exact query: %s' % query)
    # The API allows at most 100 results per query; this loop asks for the
    # first 90 in pages of 10.
    for i in range(1, 90, 10):
        if verbose:
            print('Saving %s index %s ...' % (query, str(i)))
        try:
            # sending request via google API
            result = collection.list(q=query, cx=CX_KEY, searchType='image', num=10, start=i).execute()
            items = result.get('items')
            if not items:
                # A page without 'items' means the results are exhausted;
                # stop instead of spending further requests on empty pages.
                if verbose:
                    print('No more results for %s, stopping.' % query)
                break
            images = [(item['link'], int(item['image']['height']), int(item['image']['width'])) for item in items]
            if verbose:
                print('Total images saving: %s ...' % str(len(images)))
            save_images(i, images, ranks, save_dir, sub_dir, min_width, min_height, verbose=verbose)
        except Exception as e:
            # Deliberate best-effort: one failing page must not abort the crawl.
            if verbose:
                print('Error: %s' % str(e))
    purify(target_dir + '/', verbose=verbose)
    if verbose:
        print('Saving %s ...' % query)
    rdf = pd.DataFrame(ranks, columns=['File name', 'Score'])
    rdf.to_csv(target_dir + '/ranks.csv', index=False)
# Example #2
# 0
def save_google_images(query, save_dir='crawled/google/', top_index=64, min_width=80, min_height=80, verbose=True):
    """Crawl image results for ``query`` through the Google AJAX image search.

    Four variants of the query are tried: the plain query, the query with
    ``&imgtype=face`` appended, and the query followed by ``person`` and by
    ``photos``.  For every variant, pages of eight results are requested at
    start offsets 0, 8, ... below ``top_index``.  Each successful page is
    handed to ``save_images`` together with the shared ``ranks`` list.
    Afterwards ``purify`` is run on the query directory and ``ranks`` is
    written to ``ranks.csv`` inside it.  An existing ``ranks.csv`` marks the
    query as already crawled, so a repeated call returns immediately.

    Crawling is best-effort: an exception raised while fetching or saving a
    page is printed (regardless of ``verbose``) and the next page is tried.

    Args:
        query: Search phrase.  Only spaces are escaped (as ``%20``) when the
            request URL is built.
        save_dir: Parent directory.  It is concatenated directly with the
            sub-directory name, so it must end with a path separator.
        top_index: Exclusive upper bound for the result start offset; values
            above 64 are capped at 64, the maximum the endpoint allows.
        min_width: Passed through to ``save_images``.
        min_height: Passed through to ``save_images``.
        verbose: When true, print progress messages.

    Returns:
        None.
    """
    if verbose:
        print('Querying %s ...' % query)
    encoded = query.replace(' ', '%20')
    query_list = [encoded,
                  encoded + '&imgtype=face',
                  encoded + '%20person',
                  encoded + '%20photos']
    sub_dir = query.replace(' ', '_')
    # Plain concatenation (not os.path.join) on purpose: save_images() receives
    # save_dir and sub_dir separately and presumably combines them the same
    # way -- TODO confirm before switching to os.path.join.
    target_dir = save_dir + sub_dir
    if not os.path.exists(target_dir):
        # makedirs (unlike mkdir) also creates missing parent directories, so
        # a first run with the nested default save_dir does not fail.
        os.makedirs(target_dir)
    elif os.path.isfile(os.path.join(target_dir, 'ranks.csv')):
        if verbose:
            print('%s was already crawled, skipping...' % query)
        return
    fetcher = urllib2.build_opener()
    ranks = []
    # maximum 64 allowed; honour a smaller caller-supplied top_index.
    stop = min(top_index, 64)
    for q in query_list:
        if verbose:
            print('Exact query: %s' % q)
        for i in range(0, stop, 8):
            if verbose:
                print('Saving %s index %s ...' % (query, str(i)))
            try:
                searchUrl = "http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + q + "&rsz=8&start=" + str(i)
                f = fetcher.open(searchUrl)
                try:
                    data = json.load(f)
                finally:
                    # Release the connection even when the body is not valid JSON.
                    f.close()
                if data['responseStatus'] == 200:
                    images = [(link['unescapedUrl'], int(link['height']), int(link['width'])) for link in data['responseData']['results']]
                    if verbose:
                        print('Total images saving: %s ...' % str(len(images)))
                    save_images(i, images, ranks, save_dir, sub_dir, min_width, min_height, verbose=verbose)
            except Exception as e:
                # Deliberate best-effort: one failing page must not abort the crawl.
                print('Error: %s' % str(e))
    purify(target_dir + '/', verbose=verbose)
    if verbose:
        print('Saving %s ...' % query)
    rdf = pd.DataFrame(ranks, columns=['File name', 'Score'])
    rdf.to_csv(target_dir + '/ranks.csv', index=False)