def __init__(self, uri):
    super(XgesProjectDescriptor, self).__init__()

    self._uri = uri
    self._xml_path = utils.url2path(uri)
    self._root = ET.parse(self._xml_path)
    self._duration = None
def new_from_uri(uri, verbose=False, full=False):
    media_path = utils.url2path(uri)
    descriptor_path = "%s.%s" % (
        media_path, GstValidateMediaDescriptor.MEDIA_INFO_EXT)
    args = GstValidateMediaDescriptor.DISCOVERER_COMMAND.split(" ")
    args.append(uri)
    args.extend(["--output-file", descriptor_path])
    if full:
        args.extend(["--full"])

    if verbose:
        printc("Generating media info for %s\n"
               "    Command: '%s'" % (media_path, ' '.join(args)),
               Colors.OKBLUE)
    try:
        # os.devnull must be opened for writing so the child's stderr
        # can actually be discarded into it.
        subprocess.check_output(args, stderr=open(os.devnull, 'w'))
    except subprocess.CalledProcessError as e:
        if verbose:
            printc("Result: Failed", Colors.FAIL)
        else:
            loggable.warning("GstValidateMediaDescriptor",
                             "Exception: %s" % e)
        return None

    if verbose:
        printc("Result: Passed", Colors.OKGREEN)

    return GstValidateMediaDescriptor(descriptor_path)
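# Hypothetical usage of new_from_uri (a sketch, not from the original source;
# the URI is illustrative, and the discoverer tool behind DISCOVERER_COMMAND
# must be installed for the call to succeed):
#
#     descriptor = GstValidateMediaDescriptor.new_from_uri(
#         "file:///tmp/sample.ogv", verbose=True)
#     if descriptor is None:
#         print("media-info generation failed")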
def set_settings(self, options, args, reporter):
    TestsManager.set_settings(self, options, args, reporter)
    try:
        os.makedirs(utils.url2path(options.dest)[0])
    except OSError:
        pass
def set_settings(self, options, args, reporter):
    TestsManager.set_settings(self, options, args, reporter)
    self._scenarios.config = self.options
    try:
        os.makedirs(utils.url2path(options.dest)[0])
    except OSError:
        pass
def set_settings(self, options, args, reporter): """Configures the manager based on the specified options.""" TestsManager.set_settings(self, options, args, reporter) PitiviTestsManager._scenarios.config = self.options try: os.makedirs(utils.url2path(options.dest)[0]) except OSError: pass
def test_url2path(self):
    """Test that url2path converts a URL into a path."""
    test_url = "http://baidu.com/1_2"
    result = utils.url2path(test_url)
    self.assertEqual(result, "http:\\\\baidu_com\\1_2")
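# The assertion above implies url2path rewrites '.' to '_' and '/' to '\'.
# A minimal self-contained sketch of such a helper (an assumption based on
# the test data, not the project's actual implementation):
def url2path_sketch(url):
    # "http://baidu.com/1_2" -> "http:\\baidu_com\1_2"
    return url.replace('.', '_').replace('/', '\\')

assert url2path_sketch("http://baidu.com/1_2") == "http:\\\\baidu_com\\1_2"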
def __init__(self, classname, options, reporter, project_uri,
             scenario=None, combination=None):
    super(GESTest, self).__init__(GES_LAUNCH_COMMAND, classname, options,
                                  reporter, scenario=scenario)

    self.project_uri = project_uri
    self.duration = find_xges_duration(utils.url2path(project_uri))
    if self.duration is not None:
        self.duration = self.duration / utils.GST_SECOND
    else:
        self.duration = 2 * 60
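# find_xges_duration returns GStreamer clock time in nanoseconds, so the
# division by utils.GST_SECOND converts it to seconds (GST_SECOND is 10 ** 9
# in GStreamer). A quick self-contained check:
GST_SECOND = 10 ** 9
assert (90 * GST_SECOND) / GST_SECOND == 90  # a 90-second project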
def handle(self, task): """Fetching routine Args: task Task object Return: None """ history_lock = lock.Lock.retrive_lock() history_lock.acquire() if task.url in self.__history: logging.info("Url has been fetched: {url}".format(url=task.url)) history_lock.release() return self.__history.append(task.url) history_lock.release() if task.depth > self.__max_dep: raise SpiderException( "Not a valid task: {task}".format(task=str(task))) time.sleep(self.__frequency) fetched_page = page.Page(task.url, 5) try: fetched_page.hydrate() except urllib2.HTTPError as e: logging.error("HTTP ERROR {url}: {error}".format(url=task.url, error=str(e))) return except urllib2.URLError as e: logging.error("Url ERROR {url}: {error}".format(url=task.url, error=str(e))) return if task.depth < self.__max_dep: self.__add_task(fetched_page, task.depth) imgs = fetched_page.get_resource_url(self.__image_suffix) if len(imgs) == 0: return path = os.path.join(utils.realpath(self.__output_dir), utils.url2path(fetched_page.url)) try: if not os.path.isfile(path): output_file = open(path, "w") else: output_file = open(path, "a") except IOError as e: logging.error("Can't open file {path}: {error}".format(path=path, error=e[1])) return for img in imgs: url = self.__fix_up_url(img, fetched_page) output_file.write(url + "\n") output_file.close()
def handle(self, task): """Fetching routine Args: task Task object Return: None """ history_lock = lock.Lock.retrive_lock() history_lock.acquire() if task.url in self.__history: logging.info("Url has been fetched: {url}".format(url=task.url)) history_lock.release() return self.__history.append(task.url) history_lock.release() if task.depth > self.__max_dep: raise SpiderException("Not a valid task: {task}".format(task=str(task))) time.sleep(self.__frequency) fetched_page = page.Page(task.url, 5) try: fetched_page.hydrate() except urllib2.HTTPError as e: logging.error("HTTP ERROR {url}: {error}" .format(url=task.url, error=str(e))) return except urllib2.URLError as e: logging.error("Url ERROR {url}: {error}" .format(url=task.url, error=str(e))) return if task.depth < self.__max_dep: self.__add_task(fetched_page, task.depth) imgs = fetched_page.get_resource_url(self.__image_suffix) if len(imgs) == 0: return path = os.path.join(utils.realpath(self.__output_dir), utils.url2path(fetched_page.url)) try: if not os.path.isfile(path): output_file = open(path, "w") else: output_file = open(path, "a") except IOError as e: logging.error("Can't open file {path}: {error}" .format(path=path, error=e[1])) return for img in imgs: url = self.__fix_up_url(img, fetched_page) output_file.write(url + "\n") output_file.close()
def save_video_temp(self, url):
    try:
        if self.cache.get(collection_name='video_name', key=url):
            self.text(url)
            return
    except KeyError:
        print('not in cache')
    res = requests.get(url, stream=True)
    if res.status_code == 404:
        print('not found')
        raise http_404_exception  # exception object defined elsewhere
    with open('{}.mp4'.format(url2path(url)), 'ab') as f:
        # Stream the body to disk in 1 KiB chunks, with a progress bar.
        for chunk in tqdm(res.iter_content(chunk_size=1024)):
            f.write(chunk)
    print('Saved successfully: {}'.format(url))
    return url
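# A minimal self-contained sketch of the streamed-download pattern used in
# save_video_temp, assuming only that requests and tqdm are installed; the
# function name and arguments are illustrative:
import requests
from tqdm import tqdm

def download_stream(url, filename):
    res = requests.get(url, stream=True)
    res.raise_for_status()  # raise on 404/5xx instead of a custom exception
    with open(filename, 'wb') as f:
        for chunk in tqdm(res.iter_content(chunk_size=1024)):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)

# Example: download_stream("https://example.com/video.mp4", "video.mp4")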
def set_sample_paths(self):
    if not self.options.paths:
        if self.options.disable_recurse:
            return
        paths = [os.path.dirname(utils.url2path(self.project_uri))]
    else:
        paths = self.options.paths

    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        # We always want paths separator to be cut with '/' for ges-launch
        path = path.replace("\\", "/")
        if not self.options.disable_recurse:
            self.add_arguments("--sample-path-recurse", quote_uri(path))
        else:
            self.add_arguments("--sample-path", quote_uri(path))
def get_htmls(self, workers=20):
    if isinstance(self.cache, DiskCache):
        html_path = os.path.join(dir_path, 'html')
        if not os.path.exists(html_path):
            os.mkdir(html_path)
    if isinstance(self.cache, MongoCache):
        self.cache.collection_name = 'html'
    page_num = self.max_page
    download_list = []
    for i in range(page_num):
        url = self.page_url.format(i)
        cache_url = url2path(url)
        # Check whether this page is already cached.
        cache_file = '{}.html'.format(cache_url)
        try:
            if self.cache[cache_url]:
                print('Loaded from cache: {}'.format(cache_file))
        except KeyError:
            download_list.append(url)
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(self.get_html, sorted(download_list))
    return len(list(res))
def get_html(self, url):
    time.sleep(random.random())
    html = requests.get(url, headers=self.headers).text
    self.text(url)
    self.cache[url2path(url)] = html
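# get_htmls/get_html above use url2path(url) as a filesystem-safe cache key.
# A minimal self-contained sketch of that lookup-or-download pattern, with a
# plain dict standing in for the cache (DiskCache/MongoCache are assumed to
# expose the same mapping-style interface):
def fetch_cached(url, fetch, cache):
    key = url.replace('.', '_').replace('/', '\\')  # assumed url2path mapping
    try:
        html = cache[key]
        print('Loaded from cache: {}'.format(key))
    except KeyError:
        html = fetch(url)
        cache[key] = html
    return html

# Example: fetch_cached("http://example.com/0", lambda u: "<html/>", {})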