def media_downloaded(self, response, request, info):
    try:
        item = response.meta['item']
        item['vlog'](response.body)
        match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)', response.body)
        try:
            if not match:
                item.start(self.STATE_TYPE2)
                log("NIU TYPE2 for {}".format(item['raw_url']))
                return
            date = match.group(1)
            mend = response.body[match.end():]
            # Prefer the 200k rendition; fall back to the 500k one
            match = re.search(r"([\w]+)\.mp4,200000", mend)
            if not match:
                match = re.search(r"([\w]+)\.mp4,500000", mend)
            vid_name = match.group(1)
            item['playlist_url'] = ("http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"
                                    + date + "," + vid_name + ",.mp4.csmil/master.m3u8")
            item.finish(self.STATE_ID)
            return item
        except Exception as e:
            log("Y-dl playlist extraction failed %s: %s" % (item['raw_url'], str(e)), ERROR)
            item['vlog'](response.body)
            return item
    except Exception as e:
        format_exc(self, "media_downloaded", e)

def open_spider(self, spider):
    self._top_dir = os.path.join(config_out, spider.allowed_domains[0])
    try:
        dir_util.mkpath(self._top_dir)
    except Exception as e:
        log("Can't create output directory %s : %s" % (self._top_dir, str(e)), ERROR)

def get_media_requests(self, item, info):
    if NO_VIDEO:
        return
    try:
        for video_url in item['video_urls']:
            self._spider.start_state(item['raw_url'], self.STATE_ID)
            log("VideoDirect downloading %s" % video_url, DEBUG)
            request = scrapy.Request(
                url=video_url,
                method="GET",
                headers={
                    "Accept": "*/*",
                    "User-Agent": "Mozilla",
                },
                meta={"item": item, "video_url": video_url},
                dont_filter=True,
            )
            yield request
    except Exception as e:
        format_exc(self, "get_media_requests", e)

def init_chrome_driver(timeout=30):
    chrome_options = Options()
    chrome_options.add_argument("--disable-bundled-ppapi-flash")
    chrome_options.add_argument("--disable-plugins-discovery")
    chrome_options.add_argument("--disable-webaudio")
    chrome_options.add_argument("--mute-audio")
    # Disable Flash plugins and image loading to speed up page loads
    prefs = {}
    prefs["plugins.plugins_disabled"] = ["Adobe Flash Player", "Shockwave Flash"]
    prefs["profile.managed_default_content_settings.images"] = 2
    chrome_options.add_experimental_option("prefs", prefs)
    path = Config.value(SECTION_COMMON, "chromedriver_path")
    if path:
        log("Chromedriver path: %s" % path, INFO)
        driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    else:
        driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.wait = WebDriverWait(driver, timeout)
    return driver

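# Usage sketch (illustrative only; assumes selenium and a matching
# chromedriver binary are installed and Config has been loaded):
#
#   driver = init_chrome_driver(timeout=60)
#   try:
#       driver.get("http://example.com")
#       html = driver.page_source
#   finally:
#       driver.quit()
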
def save(self):
    try:
        with open(self._file_path, "wb") as f:
            for hsh in self._hashes:
                f.write(Index._long_string(hsh))
    except Exception as e:
        log("Saving %s failed: %s" % (self._file_path, str(e)))

def _call(arglst):
    try:
        subprocess.check_call(arglst, stdout=None)
    except subprocess.CalledProcessError as e:
        return e.returncode
    except Exception as e:
        log("subprocess.check_call failed %s" % str(e), ERROR)
        return 2
    return 0

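# Usage sketch (hypothetical arguments; returns the child's exit code,
# 2 on unexpected failure, 0 on success):
#
#   rc = _call(["ln", "-s", "-f", "/tmp/source", "/tmp/link"])
#   if rc:
#       log("link failed with code %i" % rc, ERROR)
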
def open_spider(self, spider):
    MediaPipelineEx.open_spider(self, spider)
    spider._object_cleaner.add_command(_clean_if_need)
    global _driver
    if not _driver:
        _driver = init_chrome_driver(TIMEOUT)
        log("Starting Chromedriver login")
        do_login(self, _driver)

def process_request(self, request, spider):
    if ("webdriver" in request.meta
            and request.meta["webdriver"] == "do_use"):
        log("Selenium requesting {}".format(request.url))
        try:
            spider.driver.get(request.url)
        except TimeoutException as we:
            log("Web driver: timeout at {}".format(request.url), logging.ERROR)
        return HtmlResponse(spider.driver.current_url,
                            body=spider.driver.page_source,
                            encoding="utf-8",
                            request=request)

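# To route a request through this middleware, set the meta flag when the
# request is built (sketch; the callback name is hypothetical):
#
#   yield scrapy.Request(url, callback=self.parse_article,
#                        meta={"webdriver": "do_use"})
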
def process_item(self, item, spider):
    try:
        if self.NAME in spider.disabled_pipelines:
            return item
        selector = item['raw_html']
        url = item['raw_url']
        if not url or not selector:
            msg = "Invalid item %s" % str(item)
            log(msg, ERROR)
        # Title: prefer the article-specific xpath, fall back to <head><title>
        item['title'] = selector.xpath(self._title_x).extract_first()
        if not item['title']:
            item['title'] = selector.xpath("//head/title/text()").extract_first()
        if not item['title']:
            log("No title %s" % url, ERROR)
            raise DropItem()
        item['title'] = item['title'].strip()
        log("RawExtractor got title %s" % item['title'], DEBUG)
        item['title'] = self.encode_strip(item['title'])
        # Body: concatenate text paragraphs; fall back to the abstract if empty
        body = ""
        for p in selector.xpath(self._text_paragraph_x).extract():
            body += " " + p
        if body:
            body = body.strip()
        elif self._abstract_paragraph_x:
            body = selector.xpath(self._abstract_paragraph_x).extract_first()
        if not body:
            log("No article text %s" % url, DEBUG)
            body = ""
        item['text'] = body.encode("ascii", "replace").strip(" -\n")
        item['pictures'] = selector.xpath(self._picture_x).extract()
        try:
            if self._time_format_in:
                dt_obj_localized = extract_dt_obj(selector, self._time_x, self._time_format_in)
            else:
                # Assume ISO time with timezone when no input format is configured
                iso_s = selector.xpath(self._time_x).extract_first()
                dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
            item['time'] = time_utils.format_utc_from_localized(dt_obj_localized, self._time_format_out)
        except Exception as e:
            log("No time for %s %s" % (url, str(e)), DEBUG)
            item['time'] = ""
        return self._extract_more(item, spider)
    except Exception as e:
        if type(e) == DropItem:
            raise
        format_exc(self, "process_item", e)

def _request_failed(self, failure):
    # TODO: distinguish HttpError responses (spidermiddlewares.HttpError)
    # and log the status code separately
    log("Failed: %s" % str(failure), ERROR)

def add(self, byte_str):
    byte_str = Index._strip_url(byte_str)
    try:
        with open(self._log_file_path, "a") as f:
            f.write(byte_str + "\n")
    except Exception as e:
        log("Index log write failed %s" % str(e), ERROR)
    crc = self._crc_fun(byte_str)
    self._hashes.update([crc])

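# The fixed-width serialization helpers Index._long_string / Index._string_long
# are referenced above (save / __init__) but not shown in this collection.
# A minimal sketch, assuming CRC_LEN == 8 and big-endian unsigned 64-bit
# encoding (an assumption, not the confirmed on-disk format):

import struct

def _long_string_sketch(value):
    # int -> 8-byte string, big-endian
    return struct.pack(">Q", value)

def _string_long_sketch(raw):
    # 8-byte string -> int, big-endian
    return struct.unpack(">Q", raw)[0]
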
def media_downloaded(self, response, request, info):
    try:
        item = response.meta['item']
        (vpath, vname) = os.path.split(response.meta['video_url'])
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(response.body)
        self._spider.finalize_state(item['raw_url'], self.STATE_ID)
        log("VideoDirect download complete %s for %s"
            % (request.url, item['raw_url']), WARNING)
    except Exception as e:
        format_exc(self, "media_downloaded", e)

def _run_item(self, response):
    try:
        url = response.request.url
        if url not in self._links:
            log("Response url doesn't match: %s" % url, INFO)
        item = self._item_class(self)
        item['raw_url'] = url
        response = self._prepare_response(response)
        item['raw_html'] = response.selector
        return item
    except Exception as e:
        format_exc(self, "_run_item", e)

def media_failed(self, failure, request, info):
    try:
        item = request.meta['item']
        log("VideoDirect download failed %s for %s: %s"
            % (request.url, item['raw_url'], str(info)), ERROR)
        # DEBUG: leave a FAIL-prefixed marker file containing the failed URL
        video_url = request.meta['video_url']
        (vpath, vname) = os.path.split(video_url)
        vname = "FAIL" + vname
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(video_url)
    except Exception as e:
        format_exc(self, "media_failed", e)

def __str__(self):
    try:
        started_str = ""
        for s in self.started:
            started_str += s + " "
        finished_str = ""
        for s in self.finished:
            finished_str += s + " "
        return "{:<30} -> {:<30}".format(started_str, finished_str)
    except TypeError:
        # Raised when a state name is not a string
        log("__str__ TypeError formatting state indicators", ERROR)
        raise

def media_downloaded(self, response, request, info): try: match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body) item = response.meta['item'] if match: item['ooyala_urls'].append(match.group(1)) return item else: item.start(OOYALA_JS_ID ) log("Ooyala0: type 2 %s" % item['raw_url'], WARNING) item['vlog'](response.body) return item #log("No ooyala match %s %s" % (request.url, item['raw_url'])) except Exception as e: format_exc(self, "media_downloaded", e)
def media_downloaded(self, response, request, info): try: match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body) item = response.meta['item'] if match: item['ooyala_urls'].append(match.group(1)) return item else: item.start(OOYALA_JS_ID) log("Ooyala0: type 2 %s" % item['raw_url'], WARNING) item['vlog'](response.body) return item #log("No ooyala match %s %s" % (request.url, item['raw_url'])) except Exception as e: format_exc(self, "media_downloaded", e)
def phantom_login(self):
    global _webdriver
    if _webdriver:
        # Log in once per application run, then reuse the driver
        self.driver = _webdriver
        log("Reusing logged in webdriver", DEBUG)
    else:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.loadImages"] = "false"
        dcap["phantomjs.page.settings.resourceTimeout"] = "120000"
        _webdriver = webdriver.PhantomJS(
            executable_path=Config.value(CONFIG_SECTION, "phantomjs_path"),
            desired_capabilities=dcap)
        self.driver = _webdriver
        log("Starting PhantomJS login")
        do_login(self, self.driver)
        self.driver.implicitly_wait(PAGE_TIMEOUT)
        self.driver.set_page_load_timeout(PAGE_TIMEOUT)
        getLogger("selenium.webdriver").setLevel(INFO)

def handle_downloaded(inst, response, vid_name = ""): try: item = response.meta['item'] url = item['raw_url'] if not vid_name: vid_name = Ooyala1Pipeline.__vid_name(inst, response.meta['number']) with open(os.path.join(item['path'], vid_name + ".mp4"), "wb") as vid_f: vid_f.write(response.body) log("Finished downloading video %s for %s %s (%i)" % (vid_name, item['title'], item['raw_url'], len(response.body))) if len(response.body) < 5000: item['vlog'](response.body) #caller either should have id or finish by itself if hasattr(inst, 'STATE_ID'): item.finish(inst.STATE_ID) except Exception as e: format_exc(inst, "handle_downloaded", e)
def _extract_more(self, item, spider):
    try:
        selector = item['raw_html']
        if ("/video" in item['raw_url']
                or selector.xpath("//article/descendant::div[re:test(@class, 'inline-player')]")):
            item['skip_video'] = False
            log("Video elements found in article %s" % item['raw_url'], DEBUG)
        else:
            item['skip_video'] = True
            log("No video elements in article %s" % item['raw_url'], DEBUG)
        # Collect tweet status ids embedded in the article
        item['twitter_data'] = [
            re.search(r'status/(\d+)', twit_lnk).group(1)
            for twit_lnk in selector.xpath(
                "//blockquote[re:test(@class,'twitter-tweet')]"
                "/a[re:test(@href, 'status/(\d+)')]/@href").extract()
        ]
        return item
    except Exception as e:
        format_exc(self, "_extract_more", e)

def yield_requests(inst, item, urls_field):
    try:
        assert item[urls_field], "Should be called for a filled item"
        for i, vid in enumerate(item[urls_field], 1):
            log("Started downloading video %s for %s"
                % (Ooyala1Pipeline.__vid_name(inst, i), item['raw_url']))
            yield scrapy.Request(
                url=vid,
                method="GET",
                headers={
                    "Accept": "*/*",
                    "User-Agent": "Mozilla",
                },
                meta={'item': item, 'number': i},
            )
    except Exception as e:
        format_exc(inst, "yield_requests", e)

def __init__(self, **kw):
    try:
        scrapy.Spider.__init__(self, **kw)
        self.video_processor = None
        # Attributes to be overridden by subclasses
        self._index = NotImplementedError()
        self.disabled_pipelines = []
        self._page_count = 0
        self._dates = []
        self._links = {}
        self._video_msg = {}
        self._existent = {}
        self._next_page_url_interrupted = ""
        self._retry_count = 0
        self._lnk_pos = 0
        self._total_count = 0
        self._object_cleaner = None
        dispatcher.connect(self._spider_idle, scrapy.signals.spider_idle)
        if kw.get('no_index', False):
            self._index = None
        else:
            self._index = Index(self.BASE_DOMAIN)
        self.__first_page = kw.get('first_page', False)
        self.start_url = kw.get('start_url')
        self._object_cleaner = kw.get('object_cleaner')
        if "/" == self.start_url[0]:
            # Relative start URL: prepend the base URL
            self.start_url = self.BASE_URL + self.start_url
        log("\n\nSTART: %s" % self.start_url, INFO)
        self.logidx("\nLog for %s started %s"
                    % (self.start_url, time.strftime("%b %d %H:%M:%S %Y")))
        self._per_url_regex_xpath = ()
        self._debug_url = ""
    except Exception as e:
        format_exc(self, "__init__", e)

def __init__(self, domain): """Loads data to memory. Creates index directory if needed and raises DistutilsFileError if failed. Raises IdexFingerpringException domain - index storage ID """ try: POLYNOMIAL = 0x1AABBCCDDFFEEDDCC # must be 65 bit long self._debug = 0 self._hashes = set() self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0) # When run from the unit test, index directory path will be tweaked in Config file_path = Config.value(mirror0.SECTION_COMMON, "index_directory") dir_util.mkpath(file_path) file_name = domain + ".crc64" self._file_path = os.path.join(file_path, file_name) with open(self._file_path, "a+b") as f: data = f.read() if len(data) % CRC_LEN: raise IndexFingerprintException("%s is corrupt!" % file_name) count = len(data) / CRC_LEN for i in range(0, count): string_val = data[i * CRC_LEN:(i + 1) * CRC_LEN] int_val = Index._string_long(string_val) self._hashes.update([int_val]) log("Read %i hashes from %s" % (count, file_name)) file_name = domain + ".log" self._log_file_path = os.path.join(file_path, file_name) # Rewrite through centralized logging with open(self._log_file_path, "a") as f: f.write("\n\nSTARTED %s\n" % time.strftime("%d %b %Y %H:%M:%S")) except IndexFingerprintException as e: format_exc(self, "__init__", e) log(self._file_path, ERROR) raise
def process_item(self, item, spider): if self.NAME in spider.disabled_pipelines: return item log("TextImage pipeline {0}".format(item['raw_url'])) try: spider.start_state(item['raw_url'], TextImagePipeline.STATE_ID) text_path = os.path.join(item['path'], ARTICLE_FILE) if item['text']: with open(text_path, "w") as f: f.write(item['text']) except Exception as e: log("Error writing article text %s : %s" % (item['raw_url'], str(e)), ERROR) picture_timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30) if item['pictures']: log("Downloading images for {0}".format(item['raw_url']), DEBUG) i = 0 for img in item['pictures']: try: (foo, ext) = os.path.splitext(img) img_name = "%02i" % i + (ext if ext else "") img_path = os.path.join(item['path'], img_name) if "/" == img[0]: img = os.path.join(spider.BASE_URL, img[1:]) if not NO_PICTURES: fileobj = urllib2.urlopen(img, timeout=picture_timeout) with open(img_path, "wb") as f: f.write(fileobj.read()) i += 1 except Exception as e: log("Error writing article image %s : %s" % (img, str(e)), ERROR) if i: log("%i images retrieved for %s" % (i, item['title']), DEBUG) spider.finalize_state(item['raw_url'], TextImagePipeline.STATE_ID) return item
def __init__(self, domain): """Loads data to memory. Creates index directory if needed and raises DistutilsFileError if failed. Raises IdexFingerpringException domain - index storage ID """ try: POLYNOMIAL = 0x1AABBCCDDFFEEDDCC # must be 65 bit long self._debug = 0 self._hashes = set() self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0) # When run from the unit test, index directory path will be tweaked in Config file_path =Config.value(mirror0.SECTION_COMMON, "index_directory") dir_util.mkpath(file_path) file_name = domain + ".crc64" self._file_path = os.path.join(file_path, file_name) with open(self._file_path, "a+b") as f: data = f.read() if len(data) % CRC_LEN: raise IndexFingerprintException("%s is corrupt!" % file_name) count = len(data) / CRC_LEN for i in range(0, count): string_val = data[i*CRC_LEN : (i + 1)*CRC_LEN] int_val = Index._string_long(string_val) self._hashes.update([int_val]) log("Read %i hashes from %s" % (count, file_name)) file_name = domain + ".log" self._log_file_path = os.path.join(file_path, file_name) # Rewrite through centralized logging with open(self._log_file_path, "a") as f: f.write("\n\nSTARTED %s\n" % time.strftime("%d %b %Y %H:%M:%S")) except IndexFingerprintException as e: format_exc(self, "__init__", e) log(self._file_path, ERROR) raise
def get_media_requests(self, item, info): item['playlist_url'] = "" if item['ooyala_id']: try: item.start(self.STATE_ID) url = "http://player.ooyala.com/player_api/v1/metadata/embed_code/{}/{}".format(Niux.request_code, item['ooyala_id']) log("Preparing youtube-dl playlist %s " % (item['raw_url']), DEBUG) request = scrapy.Request( url=url, method="GET", headers={ "Accept" : "*/*", "User-Agent" : "Mozilla", }, meta={"item":item}, dont_filter=True, ) yield request except Exception as e: format_exc(self, "get_media_requests", e)
def _spider_idle(self, spider):
    """Collect more links, starting from the place previously stopped"""
    try:
        log("Spider {0} idle start".format(self.name), DEBUG)
        if self.video_processor:
            self.video_processor.wait_all_finished(self)
        if self._links or self._existent:
            # Should complete all requests before going further
            self._index_successful()
            for link, states in self._links.viewitems():
                self.logidx("%s %s" % (str(states), link))
            lost = sum(1 for lnk, result in self._links.viewitems()
                       if "?" == result)
            ok = sum(1 for lnk, result in self._links.viewitems()
                     if not type(result) is str and self._is_successful(result))
            log("Lost links: %i, OK: %i" % (lost, ok), WARNING)
            self._links.clear()
        if self.__first_page:
            return
        if self._next_page_url_interrupted:
            log("Idle, start collecting links")
            self.logidx("Requesting {0}".format(self._next_page_url_interrupted))
            req = Spider._request(self._next_page_url_interrupted,
                                  self._collect_next_page_links)
            self._next_page_url_interrupted = ""
            self.crawler.engine.crawl(req, spider)
    except Exception as e:
        format_exc(self, "_spider_idle", e)

def _request_next_page_links(self, next_url, webdriver):
    if len(self._links) >= LINKS_BATCH or not next_url:
        # Request articles from the collected links
        requests = []
        for url in self._links:
            requests.append(Spider._request(
                url_=url,
                callback_=self._run_item,
                errback_=self._request_failed,
                dont_filter_=True,
                meta_={"webdriver": webdriver},
            ))
            self._lnk_pos += 1
        self._next_page_url_interrupted = next_url
        # Scrapy sends them in the reverse order
        requests.reverse()
        log(json.dumps(self._links, separators=("\n", " ")), DEBUG)
        log("Requesting articles")
        return requests
    if next_url:
        return Spider._request(url_=next_url, callback_=self._collect_next_page_links)

def __init__(self, *, domain): """Loads data to memory. Creates index directory if needed and raises DistutilsFileError, OSError Raises IdexFingerpringException domain - index storage ID """ POLYNOMIAL = 0x1AABBCCDDFFEEDDCC # must be 65 bit long self._hashes = set() self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0) try: # When run from the unit test, index directory path will be tweaked in Config file_path = Config.value(mirror0.SECTION_COMMON, "index_directory") mkpath(file_path) file_name = domain + ".crc64" self._file_path = os.path.join(file_path, file_name) with open(self._file_path, "rb") as f: data = f.read() except (OSError, DistutilsFileError) as e: format_exc(self, "Index init failed", e) raise if len(data) % CRC_LEN: msg = "{} is corrupt!".format(self._file_path) log(msg, ERROR) raise IndexFingerprintException(msg) count = len(data) // CRC_LEN for i in range(0, count): string_val = data[i * CRC_LEN:(i + 1) * CRC_LEN] int_val = _bytes_to_long(string_val) self._hashes.update([int_val]) log("Read {} hashes from {}".format(count, file_name)) file_name = domain + ".log" self._log_file_path = os.path.join(file_path, file_name) self.index_log("\n\nSTARTED {}\n".format( time.strftime("%d %b %Y %H:%M:%S")))
def process_item(self, item, spider):
    if (NO_VIDEO
            or ('skip_video' in item and item['skip_video'])
            or not item[self.download_url_field]):
        log("StreamPipeline skipping {}".format(item['raw_url']))
        return item
    spider.video_processor = self
    try:
        # Check if already downloaded (or tried to) and link to the previously saved path
        if self._no_duplicates:
            video_fname = self.get_video_filename(item)
            if video_fname:
                if video_fname in self.__downloaded_files:
                    ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                    ln_from = os.path.join(item['path'], video_fname)
                    rln = self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                    url = item["raw_url"]
                    log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                    spider.start_state(url, self.STATE_ID)
                    spider.finalize_state(url, self.STATE_ID)
                    return item  # do not download
                else:
                    # Remember the file name immediately, before results arrive,
                    # to exclude duplicates of not-yet-finished videos.
                    self.__downloaded_files[video_fname] = item['path']
        logfile_path = item['vlog'].file_path
        logfile = open(logfile_path, "w", 0)  # unbuffered (Python 2)
        timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
        data_dir = item['path']
        cmdline = "youtube-dl --no-warnings "
        if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
            cmdline += "--hls-prefer-native "
        cmdline += "--no-part --socket-timeout {0} ".format(timeout)
        cmdline += "-o '%s" % data_dir
        cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
        cmdline += item[self.download_url_field]
        logfile.write(cmdline + "\n")
        self.__vcounter += 1
        log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)
        self._sub_proc.append(
            (subprocess.Popen([cmdline],
                              stdout=logfile.fileno(),
                              stderr=logfile.fileno(),
                              shell=True),
             logfile, logfile_path, item["raw_url"]),
        )
    except Exception as e:
        format_exc(self, "process_item", e)
    return item

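# For illustration, with data_dir "/data/out/article", timeout 30, no HLS
# override and __vcounter == 3 (all hypothetical values), the assembled
# command line passed to the shell is:
#
#   youtube-dl --no-warnings --no-part --socket-timeout 30 \
#       -o '/data/out/article/%(title)s-3.%(ext)s' <download url>
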
def __init__(self, *, domain): """Loads data to memory. Creates index directory if needed and raises DistutilsFileError, OSError Raises IdexFingerpringException domain - index storage ID """ POLYNOMIAL = 0x1AABBCCDDFFEEDDCC # must be 65 bit long self._hashes = set() self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0) try: # When run from the unit test, index directory path will be tweaked in Config file_path =Config.value(mirror0.SECTION_COMMON, "index_directory") mkpath(file_path) file_name = domain + ".crc64" self._file_path = os.path.join(file_path, file_name) with open(self._file_path, "rb") as f: data = f.read() except (OSError, DistutilsFileError) as e: format_exc(self, "Index init failed", e) raise if len(data) % CRC_LEN: msg = "{} is corrupt!".format(self._file_path) log(msg, ERROR) raise IndexFingerprintException(msg) count = len(data) // CRC_LEN for i in range(0, count): string_val = data[i*CRC_LEN : (i + 1)*CRC_LEN] int_val = _bytes_to_long(string_val) self._hashes.update([int_val]) log("Read {} hashes from {}".format(count, file_name)) file_name = domain + ".log" self._log_file_path = os.path.join(file_path, file_name) self.index_log("\n\nSTARTED {}\n".format(time.strftime("%d %b %Y %H:%M:%S")))
def get_media_requests(self, item, info): item['playlist_url'] = "" if item['ooyala_id']: try: item.start(self.STATE_ID) url = "http://player.ooyala.com/player_api/v1/metadata/embed_code/{}/{}".format( Niux.request_code, item['ooyala_id']) log("Preparing youtube-dl playlist %s " % (item['raw_url']), DEBUG) request = scrapy.Request( url=url, method="GET", headers={ "Accept": "*/*", "User-Agent": "Mozilla", }, meta={"item": item}, dont_filter=True, ) yield request except Exception as e: format_exc(self, "get_media_requests", e)
def get_media_requests(self, item, info):
    item['ooyala_urls'] = []
    if NO_VIDEO:
        return
    try:
        for id in item['ooyala_video_ids']:
            url = ("http://player.ooyala.com/player_api/v1/metadata/embed_code/"
                   "89a379a0e1e94feca5bb87c46a8b2d5e/" + id)
            log("Ooyala 0 requesting %s" % item['raw_url'], DEBUG)
            request = scrapy.Request(
                url=url,
                method="GET",
                headers={
                    "Accept": "*/*",
                    "User-Agent": "Mozilla",
                },
                meta={"item": item},
                dont_filter=True,
            )
            yield request
    except Exception as e:
        format_exc(self, "get_media_requests", e)

def _extract_more(self, item, spider): try: log("NewsExtractor start %s" % item['title'], DEBUG) selector = item['raw_html'] item['ooyala_id'] = selector.xpath("//div[re:test(@class, 'vms module')]/@vms-embedcode").extract_first() if item['ooyala_id']: log("Matched %s" % item['raw_url'], DEBUG) else: item['ooyala_id'] = "" log("Not matched %s" % item['raw_url'], DEBUG) return item except Exception as e: format_exc(self, "_extract_more", e)
def _extract_more(self, item, spider): try: log("NewsExtractor start %s" % item['title'], DEBUG) selector = item['raw_html'] item['ooyala_id'] = selector.xpath( "//div[re:test(@class, 'vms module')]/@vms-embedcode" ).extract_first() if item['ooyala_id']: log("Matched %s" % item['raw_url'], DEBUG) else: item['ooyala_id'] = "" log("Not matched %s" % item['raw_url'], DEBUG) return item except Exception as e: format_exc(self, "_extract_more", e)
def get_media_requests(self, item, info):
    item_url = item['raw_url']
    try:
        data_cnt = len(item['twitter_data'])
        if data_cnt:
            log("%s extracting for %s" % (self.__class__.__name__, item['raw_url']), DEBUG)
            for data_num in range(0, data_cnt):
                prev_response = item['twitter_data'][data_num]
                item['twitter_data'][data_num] = None
                if not prev_response:
                    log("Stopped num %i for %s" % (data_num, item_url), INFO)
                else:
                    err_msg = ""
                    try:
                        next_link = self.extract_next_link(prev_response)
                    except Exception as e:
                        err_msg = str(e)
                        next_link = None
                    if next_link:
                        self.start_state_if_needed(item, data_num)
                        log("%i requesting %s for %s" % (data_num, next_link, item_url), DEBUG)
                        yield scrapy.Request(
                            url=next_link,
                            method="GET",
                            headers={
                                "Accept": "*/*",
                                "User-Agent": "Mozilla",
                            },
                            meta={'item': item, 'data_num': data_num},
                        )
                    else:
                        item['vlog'](("data_num %i\n" % data_num) + prev_response.body)
                        log("Extraction failed num %i: %s for %s"
                            % (data_num, err_msg, item_url), DEBUG)
    except Exception as e:
        format_exc(self, "get_media_requests %s" % item_url, e)

def _extract_more(self, item, spider): try: log("AflExtractor start %s" % item['title'], DEBUG) selector = item['raw_html'] item['ooyala_video_ids'] = selector.xpath("//div[re:test(@class, 'ooyala-player')]/@data-content-id").extract() if item['ooyala_video_ids']: log("Matched %s" % item['raw_url'], DEBUG) else: log("Not matched %s" % item['raw_url'], DEBUG) #with open(self._idx_file, "a") as f: # f.write(item['raw_text'] + "\n\n") if self._need_twitter: item['twitter_data'] = \ [re.search(r'status/(\d+)', twit_lnk).group(1) for twit_lnk in selector.xpath("//blockquote[@class='twitter-video']/a[re:test(@href, 'status/\d+')]/@href").extract()] return item except Exception as e: format_exc(self, "_extract_more", e)
def process_item(self, item, spider): try: log("FSCreator start %s" % item['title'], DEBUG) #log("fs for %s" % item['title']) item_dir = os.path.join(self._top_dir, self.__class__.getItemDir(item, spider)) if os.path.isdir(item_dir): log("Article path exists, overwriting: %s" % item_dir, DEBUG) try: dir_util.mkpath(item_dir) except Exception as e: log("Can't create article directory %s : %s" % (item_dir, str(e)), ERROR) item['path'] = item_dir if not self.__vlog_dir: self.__vlog_dir = os.path.join(Config.value(SECTION_COMMON, "log_directory"), spider.name + "_streaming") shutil.rmtree(self.__vlog_dir, True) try: os.mkdir(self.__vlog_dir) except OSError as e: pass self.__need_clean = False logfile_path = os.path.join(self.__vlog_dir, item['title'] + ".log") class VideoLog: def __init__(self): self.logfile_path = None vlog = VideoLog() vlog.file_path = logfile_path vlog.__call__ = functools.partial(FSCreatorPipeline.append_file, logfile_path) item['vlog'] = vlog return self._create_more(item, spider) except Exception as e: if type(e) == DropItem: raise else: format_exc(self, "process_item", e)