示例#1
0
def run(total_hours=24, hourly_limit=300, shuffle=False, termlist=None):
    """Query Google and Baidu image search for each term, paced over a time budget.

    For every term the English text is sent to ``query_google`` and the Chinese
    text to ``query_baidu``; up to ``MAX_PICTURES_PER`` URLs per engine are
    kept. Results are buffered and flushed with ``update_results`` every 25
    requests and once more after the loop.

    Args:
        total_hours: Time budget in hours. Together with ``hourly_limit`` it
            caps how many terms are queried.
        hourly_limit: Maximum number of requests per hour.
        shuffle: If true, the term indices are shuffled before querying.
        termlist: Table of terms accessed through ``.index`` and ``.loc`` with
            ``english``, ``chinese`` and (optionally) ``label`` fields --
            presumably a pandas DataFrame, TODO confirm. Loaded with
            ``space.load_termlist()`` when omitted.

    Returns:
        tuple: ``(google_img_count, baidu_img_count, total_requests)``.
        ``total_requests`` is the planned number of requests, even when the
        loop stopped early.
    """
    import os
    import traceback

    if termlist is None:
        termlist = space.load_termlist()

    term_count = len(termlist)
    total_requests = max(0, min(int(total_hours * hourly_limit), term_count))
    # Both divisions below used to raise ZeroDivisionError: one for
    # hourly_limit == 0, the other for an empty termlist / zero time budget.
    if hourly_limit > 0:
        total_time = 60 * 60 * min(total_hours, term_count / hourly_limit)
    else:
        total_time = 0
    wait_time = total_time / total_requests if total_requests > 0 else 0
    daily_max_requests = hourly_limit * 24

    # Best effort: an already existing directory is fine, and any other
    # failure is reported instead of being swallowed silently.
    try:
        os.makedirs('search_results', exist_ok=True)
    except OSError as e:
        print("could not make directory", e)

    # not sure if shuffle is needed, if so try shuffling index
    term_indices = list(termlist.index)
    if shuffle:
        print("shuffling termlist")
        random.shuffle(term_indices)
    if term_count > daily_max_requests:
        print("Warning: termlist length is", term_count,
              "while max daily requests will be", daily_max_requests)
    if term_count > total_requests:
        print(
            f"Warning: only querying {total_requests} of {len(termlist)} total terms (not enough time specified)"
        )
    space.write_logs(
        f"querying {total_requests} terms for a minimum of {printable_time(seconds=total_time)}",
        verbose=True)

    google_img_count = 0
    baidu_img_count = 0
    google_fails = []
    baidu_fails = []
    results = ResultSetList()

    start_ts = time.time()
    for i in range(total_requests):
        start_iter_ts = time.time()
        # Pre-bind so the handler below never hits an unbound or stale name
        # when pop() itself is what failed.
        term_idx = None
        try:
            term_idx = term_indices.pop()
            row = termlist.loc[term_idx]
            english_term = row.english
            chinese_term = row.chinese
        except Exception as e:
            print("out of terms", term_idx, str(e))
            break
        try:
            label = row.label
        except Exception:
            # Term lists without a label field fall back to a generic label.
            label = "automated_scraper"
        result = ResultSet(english_term, chinese_term, label)
        print(
            f'request {i}, term idx {term_idx}: "{result.combined_term()}", (label: {label})'
        )
        if not english_term:
            print("\tskipping Google for term (English term not present)")
        else:
            try:
                urls = query_google(english_term)
                result.add(urls[:MAX_PICTURES_PER], GOOGLE)
            except Exception as e:
                google_fails.append(e)
                print("\tGoogle fail")
        if not chinese_term:
            print("\tskipping Baidu for term (Chinese term not present)")
        else:
            try:
                urls = query_baidu(chinese_term)
                result.add(urls[:MAX_PICTURES_PER], BAIDU)
            except Exception as e:
                baidu_fails.append(e)
                print("\tBaidu fail")
        results.add(result)

        # account for the time the calls took
        took = time.time() - start_iter_ts
        # add in random jitter of up to +/- one second
        time_noise = random.random() * 2 - 1

        # cache results. this is a backup and not meant to be a reliable data store
        if i % 25 == 24:
            try:
                google_written, baidu_written = _flush_results(results)
                google_img_count += google_written
                baidu_img_count += baidu_written
            except Exception as e:
                print(
                    "failed to write search results; waiting until next attempt:",
                    e)
                print(str(traceback.format_exc()))
        time.sleep(max(0, wait_time - took + time_noise))

    if results.length > 0:
        try:
            google_written, baidu_written = _flush_results(results)
        except Exception:
            print(traceback.format_exc())
            print("Failed to update search results, waiting 1 minute")
            time.sleep(60)
            # The retry is deliberately unguarded: a second failure propagates.
            google_written, baidu_written = _flush_results(results)
        google_img_count += google_written
        baidu_img_count += baidu_written

    # NOTE(review): this line reads the literal "google" key while every other
    # access uses the GOOGLE constant -- confirm GOOGLE == "google".
    space.write_logs(
        f'wrote {results.wrote["google"]} google images and {results.wrote[BAIDU]} baidu images',
        verbose=True)
    if len(baidu_fails) > 0 or len(google_fails) > 0:
        space.write_error(
            f"Baidu failures: {len(baidu_fails)}, Google failures: {len(google_fails)}"
        )
    print("took", printable_time(seconds=time.time() - start_ts))
    return (google_img_count, baidu_img_count, total_requests)


def _flush_results(results):
    """Persist the buffered results, clear the buffer and report what was written.

    Calls ``update_results``, then ``results.clear()``, then reads the
    ``wrote`` counters -- the same order the previously inlined code used.

    NOTE(review): ``run`` adds these counters to its totals after every flush,
    which assumes ``wrote`` holds per-flush counts -- TODO confirm.

    Returns:
        tuple: ``(results.wrote[GOOGLE], results.wrote[BAIDU])``.
    """
    update_results(results)
    results.clear()
    return results.wrote[GOOGLE], results.wrote[BAIDU]


def save_search_results(results):
    """Post every search in ``results`` plus its images; return a term -> id map.

    Given a set of searches that the scraper has created, post each
    individually to the /createSearch endpoint.
    spaces_interface.write_search_results should have created a new list in
    each result object of the Digital Ocean URLs.

    NOTE(review): this body was previously stranded as unreachable code after
    ``run``'s ``return``; the signature is reconstructed from the
    ``save_search_results(results)`` call in the ``__main__`` block -- confirm.

    Args:
        results: Collection exposing ``length`` and ``iterterm()``, the latter
            yielding ``(term, result)`` pairs.

    Returns:
        dict: Maps ``result.combined_term()`` to the posted ``search_id``.

    Raises:
        Exception: If ``post_search`` returns a falsy value for a term.
    """
    search_term_to_id = {}
    print(f"saving {results.length} search terms")
    for term, result in results.iterterm():
        # NOTE(review): hard-coded address, presumably the client IP -- confirm.
        post_result = post_search(result, '192.168.0.1')
        if not post_result:
            raise Exception("failed to post result for term " + term)
        search_id = post_result["search_id"]
        for engine in (GOOGLE, BAIDU):
            original_urls = result.urls[engine]
            datalake_urls = result.get_datalake_urls(engine)
            if len(original_urls) != len(datalake_urls):
                # The lists cannot be paired up, so only the datalake URLs go out.
                post_images(search_id, engine, datalake_urls)
            else:
                post_images(search_id, engine, datalake_urls, original_urls)
        search_term_to_id[result.combined_term()] = search_id
    return search_term_to_id

if __name__ == "__main__":
    # Manual smoke test: build a single-term result list holding two Google
    # URLs with their datalake counterparts and pass it to save_search_results.
    from results import ResultSet, ResultSetList
    import time

    source_urls = ['google.com', 'bunnies.io']
    datalake_urls = ['datalake.com/google.com', 'datalake.com/bunnies.io']

    result = ResultSet('bunny', '')
    result.add(source_urls, GOOGLE)
    result.set_datalake_urls(datalake_urls, GOOGLE)

    results = ResultSetList()
    results.add(result)
    save_search_results(results)
示例#3
0
 def __init__(self):
     """Initialize the instance with a new ``ResultSet`` stored on ``self.results``."""
     self.results = ResultSet()  # list of dictionaries containing results
示例#4
0
class Connector(object):
    """Base class for web-service connectors: build a URL, fetch it, parse the reply.

    Subclasses must implement ``parse_results``. ``get_url_params`` collects
    URL parameters from ``Input`` instances declared as class attributes.
    """

    # Class-level throttle defaults so ``fetch`` works even when
    # ``set_throttle`` was never called on the instance.
    delay = 0
    max_requests = 1e16
    made_requests = 0

    def __init__(self):
        self.results = ResultSet()  # list of dictionaries containing results

    def set_auth_type(self, auth_type):
        """Set the authorization type for given web service

        Allowed types:

        anon: web service only provides anonymous access
        login: user must be logged in to use the web service
        mixed: web service allows both anonymous and user-based access

        Currently a no-op: ``auth_type`` is not stored.
        """
        pass

    def set_throttle(self, limit=None, units=None):
        """Set the request rate for the given service

        units: {'requests per second', 'requests per day', etc.}

        Both arguments are currently ignored: the call resets the delay to 0,
        the request cap to 1e16 and the request counter to 0.
        """
        self.delay = 0
        self.max_requests = 1e16
        self.made_requests = 0

    def throttle(f):
        """Decorator that throttles web service requests made by method ``f``.

        The wrapper sleeps ``self.delay`` seconds before each call, counts
        calls that return without raising, and raises ``Exception`` once
        ``self.made_requests`` reaches ``self.max_requests``. The wrapped
        method's return value is passed through.
        """
        import functools

        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            if self.made_requests >= self.max_requests:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise Exception('maximum request limit reached')
            time.sleep(self.delay)
            result = f(self, *args, **kwargs)
            self.made_requests += 1
            return result

        return wrapper

    def set_url(self, url_str):
        """Store ``url_str`` wrapped in a ``Template`` for later substitution."""
        self.url = Template(url_str)

    def set_parser(self, output_format):
        """sets the object variable to the correct output stream parser

        Unknown formats fall back to an identity function.
        """
        self.output_parser = output_parsers.get(output_format, lambda x: x)

    def get_url_params(self):
        """Return ``{attribute name: value}`` for every ``Input`` on this class.

        Only the instance's own class ``__dict__`` is inspected, so ``Input``
        attributes inherited from a parent class are not included.
        """
        return {
            attr_name: attr_value.value
            for attr_name, attr_value in self.__class__.__dict__.items()
            if isinstance(attr_value, Input)
        }

    @throttle
    def fetch(self, config):
        """Apply ``config`` to the inputs, request the URL and parse the response.

        Each ``config`` key names an attribute whose ``value`` is overwritten.
        The response text is run through ``self.output_parser`` and handed to
        ``self.parse_results``.
        """
        # items() instead of the Python-2-only iteritems().
        for key, val in config.items():
            getattr(self, key).value = val

        url = self.url.substitute(self.get_url_params())
        r = requests.get(url)

        results = self.output_parser(r.text)
        self.parse_results(results)

    def fetchmany(self, config_list):
        """Call ``fetch`` once for every config in ``config_list``."""
        for config in config_list:
            self.fetch(config)

    def parse_results(self, results=None):
        """Handle parsed output; subclasses must override.

        Accepts the ``results`` argument that ``fetch`` passes, so the base
        class raises ``NotImplementedError`` rather than ``TypeError``.
        """
        raise NotImplementedError

    def reset(self):
        """Clear the accumulated results via ``self.results.clear_results()``."""
        self.results.clear_results()
示例#5
0
 def __init__(self):
     """Initialize the instance with a new ``ResultSet`` stored on ``self.results``."""
     self.results = ResultSet()  # list of dictionaries containing results
示例#6
0
class Connector(object):
    """Base class for web-service connectors: build a URL, fetch it, parse the reply.

    Subclasses must implement ``parse_results``. ``get_url_params`` collects
    URL parameters from ``Input`` instances declared as class attributes.
    """

    # Class-level throttle defaults so ``fetch`` works even when
    # ``set_throttle`` was never called on the instance.
    delay = 0
    max_requests = 1e16
    made_requests = 0

    def __init__(self):
        self.results = ResultSet()  # list of dictionaries containing results

    def set_auth_type(self, auth_type):
        """Set the authorization type for given web service

        Allowed types:

        anon: web service only provides anonymous access
        login: user must be logged in to use the web service
        mixed: web service allows both anonymous and user-based access

        Currently a no-op: ``auth_type`` is not stored.
        """
        pass

    def set_throttle(self, limit=None, units=None):
        """Set the request rate for the given service

        units: {'requests per second', 'requests per day', etc.}

        Both arguments are currently ignored: the call resets the delay to 0,
        the request cap to 1e16 and the request counter to 0.
        """
        self.delay = 0
        self.max_requests = 1e16
        self.made_requests = 0

    def throttle(f):
        """Decorator that throttles web service requests made by method ``f``.

        The wrapper sleeps ``self.delay`` seconds before each call, counts
        calls that return without raising, and raises ``Exception`` once
        ``self.made_requests`` reaches ``self.max_requests``. The wrapped
        method's return value is passed through.
        """
        import functools

        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            if self.made_requests >= self.max_requests:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise Exception("maximum request limit reached")
            time.sleep(self.delay)
            result = f(self, *args, **kwargs)
            self.made_requests += 1
            return result

        return wrapper

    def set_url(self, url_str):
        """Store ``url_str`` wrapped in a ``Template`` for later substitution."""
        self.url = Template(url_str)

    def set_parser(self, output_format):
        """sets the object variable to the correct output stream parser

        Unknown formats fall back to an identity function.
        """
        self.output_parser = output_parsers.get(output_format, lambda x: x)

    def get_url_params(self):
        """Return ``{attribute name: value}`` for every ``Input`` on this class.

        Only the instance's own class ``__dict__`` is inspected, so ``Input``
        attributes inherited from a parent class are not included.
        """
        return {
            attr_name: attr_value.value
            for attr_name, attr_value in self.__class__.__dict__.items()
            if isinstance(attr_value, Input)
        }

    @throttle
    def fetch(self, config):
        """Apply ``config`` to the inputs, request the URL and parse the response.

        Each ``config`` key names an attribute whose ``value`` is overwritten.
        The response text is run through ``self.output_parser`` and handed to
        ``self.parse_results``.
        """
        # items() instead of the Python-2-only iteritems().
        for key, val in config.items():
            getattr(self, key).value = val

        url = self.url.substitute(self.get_url_params())
        r = requests.get(url)

        results = self.output_parser(r.text)
        self.parse_results(results)

    def fetchmany(self, config_list):
        """Call ``fetch`` once for every config in ``config_list``."""
        for config in config_list:
            self.fetch(config)

    def parse_results(self, results=None):
        """Handle parsed output; subclasses must override.

        Accepts the ``results`` argument that ``fetch`` passes, so the base
        class raises ``NotImplementedError`` rather than ``TypeError``.
        """
        raise NotImplementedError

    def reset(self):
        """Clear the accumulated results via ``self.results.clear_results()``."""
        self.results.clear_results()