Example #1
 def parse(self):
     raw_entities = []
     for placemark in self.xpath(self.root, './/ns:Placemark'):
         raw_entities.append(self.placemark_to_entity(placemark))
     entities = utils.parallelize(self.augment_entity, [(e,) for e in raw_entities])
     name = tostring(self.xpath(self.root, 'ns:Document/ns:name')[0])
     # TODO: Parse the latlngs into a Bounds object for the trip plan.
     # Right now this is happening in the javascript as a hack.
     return data.TripPlan(name=name, entities=entities)
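Most of the examples on this page call utils.parallelize(fn, arg_tuples) and rely on getting the results back in input order. The helper itself is never shown; the following is only a minimal sketch of a compatible implementation, assuming a thread pool and an illustrative num_workers default, not the projects' actual code.

from multiprocessing.pool import ThreadPool

def parallelize(fn, args_list, num_workers=10):
    # Assumed behaviour: apply fn(*args) to every tuple in args_list
    # concurrently and return the results in input order.
    if not args_list:
        return []
    pool = ThreadPool(processes=min(num_workers, len(args_list)))
    try:
        return pool.map(lambda args: fn(*args), args_list)
    finally:
        pool.close()
        pool.join()

Under this sketch, the call above would run self.augment_entity(e) for each raw entity on its own worker thread.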
Example #2
def scrape_entities_from_page_source(url, page_source):
    if scrape_logic.is_url_handleable(url):
        return scrape_entities_from_url(url, page_source)
    else:
        urls = extract_urls_from_page_source(url, page_source)
        handleable_urls = set(u for u in urls if scrape_logic.is_url_handleable(u, allow_expansion=False))
        entity_lists = utils.parallelize(scrape_entities_from_url,
            [(u, None, True, None, False) for u in handleable_urls])
        return utils.flatten(entity_lists)
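utils.flatten above is also undefined on this page; it collapses the per-URL entity lists into a single list. A plausible sketch, assuming it simply chains the sublists and skips empty or None results from failed scrapes:

import itertools

def flatten(list_of_lists):
    # Assumed behaviour: chain the sublists, dropping None/empty entries.
    return list(itertools.chain.from_iterable(l for l in list_of_lists if l))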
Example #3
def augment_trip_plan(raw_trip_plan):
    location_latlng = raw_trip_plan.location_latlng.to_json_obj() if raw_trip_plan.location_latlng else None
    entities = utils.parallelize(
        utils.retryable(augment_entity, retries=3),
        [(e, location_latlng) for e in raw_trip_plan.entities])
    trip_plan = raw_trip_plan.copy()
    for i, entity in enumerate(entities):
        # If there's an RPC error, some of these may come back as None.
        # So as a fallback make sure we at least save the incoming entity.
        # TODO: Return an error message here so the user can be notified
        # that not all entities were saved.
        if not entity:
            entities[i] = raw_trip_plan.entities[i]
    trip_plan.entities = entities
    return trip_plan
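The worker here is wrapped in utils.retryable(augment_entity, retries=3) before being handed to parallelize; retryable is not shown either. A hedged sketch, assuming it retries on any exception and returns None once the retries are exhausted, which would explain the None fallback handled in the loop above:

import functools

def retryable(fn, retries=3):
    # Assumed behaviour: retry fn up to `retries` times; give up by
    # returning None so the caller can fall back to the raw entity.
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        for attempt in range(retries):
            try:
                return fn(*args, **kwargs)
            except Exception:
                if attempt == retries - 1:
                    return None
    return wrapper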
Example #4
def build_scrapers(url, client_page_source=None, force_fetch_page=False, allow_expansion=True, for_guide=False):
    page_source_tree = html_parsing.parse_tree_from_string(client_page_source) if client_page_source else None
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)

    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3), [(req,) for req in reqs])
            for url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(url, tree, for_guide)
                scraped_pages.append(scraper)
            break
    return scraped_pages
Example #5
 def stopreadingCollaborator(self):
     print("=== Stoping (readers) %s collaborators ===" %
           self.__collab_type)
     utils.parallelize(utils.stopreading_collab, self.__collab_type,
                       self.__addresses)
Example #6
 def startCollaborator(self):
     print("=== Starting %s collaborators ===" % self.__collab_type)
     utils.parallelize(utils.start_collab, self.__collab_type,
                       self.__addresses)
Example #7
 def createCollaborator(self):
     print("=== Creating %s collaborators ===" % self.__collab_type)
     utils.parallelize(utils.create_collab, self.__collab_type,
                       self.__addresses)
Example #8
 def make_photo_urls(photo_objs):
     return utils.parallelize(resolve_photo_url,
         [(obj['photo_reference'], obj['width'], obj['height']) for obj in photo_objs])
Example #9
def prepare():
    # TODO equally distribute larger languages
    parallelize(_prepare, LANGUAGES, n_workers=4)
Example #10
def reencode_dir(input_dir, num_pools):
    raw_movie_files = glob('%s/*.mp4' % input_dir)
    mkdir_p('%s/%s' % (input_dir, DONE_DIR))
    mkdir_p('%s/%s' % (input_dir, OUTPUT_DIR))
    parallelize(reencode_single_file, raw_movie_files, num_pools=num_pools)
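Examples 9 and 10 use a different calling convention: a flat iterable of single arguments plus a worker-count keyword (n_workers / num_pools) instead of a list of argument tuples. A process-pool sketch under that assumption; the keyword name and default are illustrative only:

import multiprocessing as mp

def parallelize(fn, items, n_workers=None):
    # Assumed behaviour: map fn over the items with a pool of worker
    # processes and return the results in input order.
    pool = mp.Pool(processes=n_workers or mp.cpu_count())
    try:
        return pool.map(fn, items)
    finally:
        pool.close()
        pool.join()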
Example #11
    def refresh(self):
        try:
            if self.ready:
                status, dlq, ulq, shd = r = utils.parallelize(
                    self.client.get_status,
                    self.client.show_dl,
                    self.client.show_ul,
                    self.client.show_shared
                    )
                for i in r:
                    if isinstance(i, BaseException):
                        raise i

                # Queue merge (one file can be in two queues)
                downloads = {}
                for queue in (dlq, ulq, shd):
                    is_downloading = queue is dlq
                    for download in queue.itervalues():
                        dhash = download.partfile_hash.encode("hex")
                        if dhash in downloads:
                            downloads[dhash].update(download)
                            downloads[dhash]["is_downloading"] |= is_downloading
                        else:
                            downloads[dhash] = download
                            downloads[dhash]["is_downloading"] = is_downloading

                downloads_changed = downloads != self._data #frozen_cmp(downloads, self._data) != 0

                self._status.update(status)
                self._data = downloads

                if downloads_changed:
                    # Download updates
                    for dhash, download in downloads.iteritems():
                        if dhash in self._downloads:
                            self.outdated_downloads.add(self._downloads[dhash])
                        else:
                            self._downloads[dhash] = Download(self, download, None)
                            self.emit("download_new", self._downloads[dhash])

                    # Removing deleted downloads
                    unowned = self.manager is not self
                    for dhash in frozenset(downloads).symmetric_difference(self._downloads):
                        if self._downloads[dhash].finished:
                            self.outdated_downloads.add(self._downloads[dhash])
                        else:
                            self.emit("download_remove", self._downloads[dhash])
                            if unowned:
                                self.manager.remove(self._downloads[dhash])
                            del self._downloads[dhash]

                self._status_cache = ("",)
            else:
                self._status_cache = ("backend not ready",)
        except ec.ConnectionFailedError as e:
            # The aMule daemon is prone to failure
            logger.exception(e)
            self._sync_numfails += 1
            if not self._sync_restarting_daemon and (
              self._connecting_to_kad or
              self._sync_numfails > self._sync_max_numfails
              ):
                # If the connection failed while connecting to Kad, or
                # sync has failed more than _sync_max_numfails times
                self.ready = False # Revert ready state
                self._sync_restarting_daemon = True
                logger.debug("Max ConnectionFailedError achieved, restarting daemon.")
                self.start_daemon()
                self._sync_restarting_daemon = False
        except BaseException as e:
            # Unexpected errors shouldn't ever happen
            logger.exception(e)
        else:
            # Once the daemon starts to respond it shouldn't fail anymore,
            # so we lower the fail tolerance
            if not self._sync_worked_once:
                self._sync_worked_once = True
                self._sync_max_numfails = 1
            self._sync_numfails = 0
        BackendBase.refresh(self)
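Example 11 relies on yet another variant: utils.parallelize receives several zero-argument callables and returns one result per callable, with any raised exception returned as a value instead of propagated (hence the isinstance(i, BaseException) check followed by raise i). A thread-based sketch consistent with that behaviour, not the backend's actual implementation:

import threading

def parallelize(*callables):
    # Assumed behaviour: run each callable on its own thread and collect
    # either its return value or the exception it raised, in call order.
    results = [None] * len(callables)

    def run(index, fn):
        try:
            results[index] = fn()
        except BaseException as e:
            results[index] = e

    threads = [threading.Thread(target=run, args=(i, fn))
               for i, fn in enumerate(callables)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results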
Example #12
def process_file(fname, nprocesses=mp.cpu_count()):
    """Process a file using multiprocessing."""
    with open(fname, 'r', encoding='iso-8859-1') as text:
        strands = utils.spagettify(text, nprocesses)
        result = utils.parallelize(build_vocab, strands)
        return result
Example #13
def scrape_entities_from_url(url, page_source=None, force_fetch_page=False,
        max_results=None, allow_expansion=True, for_guide=False):
    scrapers = scrape_logic.build_scrapers(url, page_source, force_fetch_page, for_guide=for_guide)
    scrapers = scrapers[:max_results] if max_results else scrapers
    return utils.parallelize(entity_from_scraper, [(scr, url) for scr in scrapers])
Example #14
 def get_raw_entities(self):
     path = urlparse.urlparse(self.url).path
     links = self.root.xpath(".//div[@class='content']//a/@href")
     entity_links = [urlparse.urljoin(self.url, l.strip()) for l in links if l.startswith(path)]
     return utils.parallelize(self.scrape_entity_page, [(l,) for l in entity_links])