Example #1
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)',
                           response.body)
         try:
             item = response.meta['item']
             item['vlog'](response.body)
             if not match:
                 item.start(self.STATE_TYPE2)
                 log("NIU TYPE2 for {}".format(item['raw_url']))
                 return
             date = match.group(1)
             mend = response.body[match.end():]
             match = re.search(r"(\w+)\.mp4,200000", mend)
             if not match:
                 # fall back to the 500k rendition when no 200k rendition is listed
                 match = re.search(r"(\w+)\.mp4,500000", mend)
             vid_name = match.group(1)
             item['playlist_url'] = "http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"\
                  + date + "," + vid_name + ",.mp4.csmil/master.m3u8"
             item.finish(self.STATE_ID)
             return item
         except Exception as e:
             log(
                 "Y-dl playlist extraction failed %s: %s" %
                 (item['raw_url'], str(e)), ERROR)
             item['vlog'](response.body)
             return item
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #2
    def _spider_idle(self, spider):
        """Collect more links, starting from the place previously stopped"""
        try:
            log("Spider {0} idle start".format(self.name), DEBUG)
            if self.video_processor:
                self.video_processor.wait_all_finished(self)
            if self._links or self._existent:
                #should complete all requests before going further
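                # persist links that finished successfully to the on-disk index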
                self._index_successful()
                for link, states in self._links.viewitems():
                    self.logidx("%s %s" % (str(states), link))

                lost = sum(1 for lnk, result in self._links.viewitems() if "?" == result)
                ok = sum(1 for lnk, result in self._links.viewitems()
                         if not isinstance(result, str) and self._is_successful(result))
                log("Lost links: %i, OK: %i" % (lost, ok), WARNING)
                self._links.clear()

                if self.__first_page:
                    return

                if self._next_page_url_interrupted:
                    log("Idle, start collecting links")
                    self.logidx("Requesting {0}".format(self._next_page_url_interrupted))
                    req = Spider._request(self._next_page_url_interrupted, self._collect_next_page_links)
                    self._next_page_url_interrupted = ""
                    self.crawler.engine.crawl(req, spider)
        except Exception as e:
            format_exc(self, "_spider_idle", e)
Example #3
 def handle_failed(inst, failure, request):
     try:
         format_exc(inst, "media_failed", failure)
         item = request.meta['item']
         item['vlog']("Ooyala1: " + str(failure))
     except Exception as e:
         format_exc(inst, "handle_failed", e)
Example #4
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            self.less_vid = kw.get('less_vid', False)

        except Exception as e:
            format_exc(self, "__init__", e)
Example #5
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            #super(self.__class__, self).__init__(kw)

        except Exception as e:
            format_exc(self, "__init__", e)
Example #6
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)', response.body)
         try:
             item = response.meta['item']
             item['vlog'](response.body)
             if not match:
                 item.start(self.STATE_TYPE2)
                 log("NIU TYPE2 for {}".format(item['raw_url']))
                 return
             date = match.group(1)
             mend = response.body[match.end():]
             match = re.search(r"(\w+)\.mp4,200000", mend)
             if not match:
                 # fall back to the 500k rendition when no 200k rendition is listed
                 match = re.search(r"(\w+)\.mp4,500000", mend)
             vid_name = match.group(1)
             item['playlist_url'] = "http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"\
                  + date + "," + vid_name + ",.mp4.csmil/master.m3u8"
             item.finish(self.STATE_ID)
             return item
         except Exception as e:
             log("Y-dl playlist extraction failed %s: %s" % (item['raw_url'], str(e)), ERROR)
             item['vlog'](response.body)
             return item
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #7
def do_login(inst, driver):
    try:
        driver.implicitly_wait(60)
        driver.set_page_load_timeout(60)
        log("Opening login page", DEBUG)
        try:
            driver.get("http://www.heraldsun.com.au/login")
        except TimeoutException:
            log("Login page timeout, continuing", INFO)
        cls_name = 'ncenvoy-identity ncenvoy-identity-login'
        xpath = "//iframe[@class='{}']".format(cls_name)
        el = driver.find_element_by_xpath(xpath)
        fname = el.get_attribute("name")
        log("Switching to frame {}".format(fname))
        driver.switch_to.frame(fname)
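        # the credential fields live inside the iframe, so the lookups below run in its context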
        driver.find_element_by_id("cam_password").send_keys("infillpaper01")
        driver.find_element_by_id("cam_username").send_keys("*****@*****.**")
        driver.find_element_by_class_name("button-submit").click()

        """
        xpath = '//p[contains(text(), "Thank you")]' 
        driver.wait.until(EC.presence_of_element_located(
              (By.XPATH, xpath)))
        """
        log("Login submitted")
    except TimeoutException:
        log("Web driver: timeout at login", ERROR)
        raise
    except WebDriverException as we:
        log("Web driver: %s" % we.msg, ERROR)
        raise
    except Exception as e:
        format_exc(inst, "__init__", e)
        raise
Example #8
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            self.less_vid = kw.get('less_vid', False)

        except Exception as e:
            format_exc(self, "__init__", e)
Example #9
def do_login(inst, driver):
    try:
        driver.implicitly_wait(60)
        driver.set_page_load_timeout(60)
        log("Opening login page", DEBUG)
        try:
            driver.get("http://www.heraldsun.com.au/login")
        except TimeoutException:
            log("Login page timeout, continuing", INFO)
        cls_name = 'ncenvoy-identity ncenvoy-identity-login'
        xpath = "//iframe[@class='{}']".format(cls_name)
        el = driver.find_element_by_xpath(xpath)
        fname = el.get_attribute("name")
        log("Switching to frame {}".format(fname))
        driver.switch_to.frame(fname)
        driver.find_element_by_id("cam_password").send_keys("infillpaper01")
        driver.find_element_by_id("cam_username").send_keys(
            "*****@*****.**")
        driver.find_element_by_class_name("button-submit").click()
        """
        xpath = '//p[contains(text(), "Thank you")]' 
        driver.wait.until(EC.presence_of_element_located(
              (By.XPATH, xpath)))
        """
        log("Login submitted")
    except TimeoutException:
        log("Web driver: timeout at login", ERROR)
        raise
    except WebDriverException as we:
        log("Web driver: %s" % we.msg, ERROR)
        raise
    except Exception as e:
        format_exc(inst, "__init__", e)
        raise
Example #10
    def get_media_requests(self, item, info):

        if NO_VIDEO:
            return

        try:
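            # one download request per video URL; the item rides along in request.meta for the pipeline callbacks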
            for video_url in item['video_urls']:
                self._spider.start_state(item['raw_url'], self.STATE_ID)
                log("VideoDirect downloading %s " % (video_url), DEBUG)
                request = scrapy.Request(
                    url=video_url,
                    #"http://aaa[1]b(2).ru",
                    method="GET",
                    headers={
                        "Accept": "*/*",
                        "User-Agent": "Mozilla",
                    },
                    meta={
                        "item": item,
                        "video_url": video_url
                    },  #"download_timeout":600,
                    dont_filter=True,
                )
                yield request
        except Exception as e:
            format_exc(self, "get_media_requests", e)
Example #11
 def media_downloaded(self, response, request, info):
     item = response.meta['item']
     try:
         data_num = response.meta['data_num']
         item['twitter_data'][data_num] = response
     except Exception as e:
         format_exc(self, "media_downloaded %s" % item['raw_url'], e)
Example #12
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            #super(self.__class__, self).__init__(kw)

        except Exception as e:
            format_exc(self, "__init__", e)
Example #13
 def media_downloaded(self, response, request, info):
     try:
         data_num = response.meta['data_num']
         state_id = TwitterPipeline1.STATE_ID % data_num
         Ooyala1Pipeline.handle_downloaded(self, response, state_id) 
         response.meta['item'].finish(state_id)
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #14
 def start_requests(self):
     try:
         yield self._request(
             url_=self.start_url,
             callback_=self._collect_next_page_links,
             )
     except Exception as e:
         format_exc(self, "start_requests", e)
Example #15
    def process_item(self, item, spider):
        if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
            log("StreamPipeline skipping {}".format(item['raw_url']))
            return item

        spider.video_processor = self

        try:
            #check if already downloaded (or tried to) and link to previously saved path
            if self._no_duplicates:
                video_fname = self.get_video_filename(item)
                if video_fname:
                    if video_fname in self.__downloaded_files:
                        ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                        #if os.path.isfile(ln_to):
                        ln_from = os.path.join(item['path'], video_fname)
                        rln = self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                        url = item["raw_url"]
                        log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                        spider.start_state(url, self.STATE_ID)
                        spider.finalize_state(url, self.STATE_ID)
                        return item  # do not download again
                    else:
                        # remember the file name immediately instead of waiting for the result,
                        # so duplicates of not-yet-finished downloads are also excluded
                        self.__downloaded_files[video_fname] = item['path']
                        #print "added {0}".format(video_fname)

            logfile_path = item['vlog'].file_path
            logfile = open(logfile_path, "w", 0)

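            # assemble the youtube-dl command line; __vcounter keeps output file names unique within a run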
            timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
            data_dir = item['path']
            cmdline = "youtube-dl --no-warnings "
            if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
                cmdline += "--hls-prefer-native "
            cmdline += "--no-part --socket-timeout {0} ".format(timeout)
            cmdline += "-o '%s" % data_dir
            cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
            cmdline += item[self.download_url_field]
            logfile.write(cmdline + "\n")
            self.__vcounter += 1

            log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)

            self._sub_proc.append(
                (subprocess.Popen([cmdline], stdout=logfile.fileno(), stderr=logfile.fileno(), shell=True), #stderr=subprocess.STDOUT,
                 logfile,
                 logfile_path,
                 item["raw_url"],),
                )

            #for key, value in logging.Logger.manager.loggerDict.iteritems():

        except Exception as e:
            format_exc(self, "porcess_item", e)

        return item
Example #16
 def start_requests(self):
     try:
         """if scraping from videos page, getting video links directly from the start page top section"""
         yield self._request(
             url_=self.start_url,
             callback_=(self._collect_next_page_links if self.NORMAL == self.mode else self._run_item),
             )
     except Exception as e:
         format_exc(self, "start_requests", e)
Example #17
    def _links_from_response(self, response):
        try:
            links = []  # stub: no links are collected here

            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #18
    def process_item(self, item, spider):
        try:
            if self.NAME in spider.disabled_pipelines:
                return item

            selector = item['raw_html']
            url = item['raw_url']
            if not url or not selector:
                msg = "Invalid item %s" % str(item)
                log(msg, ERROR)
            #abstr = response.xpath("//p[@class='article-abstract']/text()").extract_first()
            item['title'] = selector.xpath(self._title_x).extract_first()
            if not item['title']:
                item['title'] = selector.xpath(
                    "//head/title/text()").extract_first()
            if not item['title']:
                log("No title %s" % url, ERROR)
                raise DropItem()
            item['title'] = item['title'].strip()
            log("RawExtractor got title %s" % item['title'], DEBUG)

            item['title'] = self.encode_strip(item['title'])
            body = ""
            for p in selector.xpath(self._text_paragraph_x).extract():
                body += " " + p
            if body:
                body = body.strip()
            elif self._abstract_paragraph_x:
                body = selector.xpath(
                    self._abstract_paragraph_x).extract_first()
            if not body:
                log("No article text %s" % url, DEBUG)
                body = ""
            item['text'] = body.encode("ascii", "replace").strip(" -\n")

            item['pictures'] = selector.xpath(self._picture_x).extract()

            try:
                if self._time_format_in:
                    dt_obj_localized = extract_dt_obj(selector, self._time_x,
                                                      self._time_format_in)
                else:
                    """Assuming ISO time with timezone on empty format list"""
                    iso_s = selector.xpath(self._time_x).extract_first()
                    dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
                item['time'] = time_utils.format_utc_from_localized(
                    dt_obj_localized, self._time_format_out)
            except Exception as e:
                log("No time for %s %s" % (url, str(e)), DEBUG)
                item['time'] = ""

            return self._extract_more(item, spider)

        except Exception as e:
            if isinstance(e, DropItem):
                raise
            format_exc(self, "process_item", e)
Example #19
    def _links_from_response(self, response):
        try:
            links = []  # stub: no links are collected here

            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #20
    def get_video_filename(self, item):
        try:
            cmdline = "youtube-dl --no-warnings --get-filename -o '%(title)s.%(ext)s' "
            cmdline += item[self.download_url_field]
            process = subprocess.Popen([cmdline], stdout=subprocess.PIPE, stderr=None, shell=True)
            out_err_tpl = process.communicate()
            return out_err_tpl[0].strip()

        except Exception as e:
            format_exc(self, "get_video_filename", e)
Example #21
 def save(self):
     """ Writes all data to disc
     """
     try:
         with open(self._file_path, "wb") as out_f:
             for hsh in self._hashes:
                 out_f.write(_long_to_bytes(hsh))
     except OSError as e:
         format_exc(self, "Saving {} failed".format(self._file_path), e)
         raise
Example #22
 def index_log(self, message):
     """ URLs added in text form are saved to separate file for human reference
     message: string
     """
     try:
         with open(self._log_file_path, "a") as f:
             f.write(message)
     except OSError as e:
         format_exc(self, "Index log write failed {}".format(self._log_file_path), e)
         raise
Example #23
 def save(self):
     """ Writes all data to disc
     """
     try:
         with open(self._file_path, "wb") as out_f:
             for hsh in self._hashes:
                 out_f.write(_long_to_bytes(hsh))
     except OSError as e:
         format_exc(self, "Saving {} failed".format(self._file_path), e)
         raise
Example #24
 def start_requests(self):
     try:
         """if scraping from videos page, getting video links directly from the start page top section"""
         yield self._request(
             url_=self.start_url,
             callback_=(self._collect_next_page_links
                        if self.NORMAL == self.mode else self._run_item),
         )
     except Exception as e:
         format_exc(self, "start_requests", e)
Example #25
 def index_log(self, message):
     """ URLs added in text form are saved to separate file for human reference
     message: string
     """
     try:
         with open(self._log_file_path, "a") as f:
             f.write(message)
     except OSError as e:
         format_exc(self,
                    "Index log write failed {}".format(self._log_file_path), e)
         raise
Example #26
    def _links_from_response(self, response):
        response = self._prepare_response(response)
        try:
            links = response.xpath(
                "//div[re:test(@class, 'list-item')][not(ancestor::*[re:test(@class, 'double-col')])]"
                "/div[re:test(@class, 'inner')]/h4/a/@href").extract()
            if not links:
                links = response.xpath(
                    "//div[re:test(@class, 'list-item')]/div[re:test(@class, 'inner')]/h4/a/@href"
                ).extract()
            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #27
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            self._per_url_regex_xpath = (
                (r"video/sport/afl", "//a[@class='vms-list-item module']/@href"),
                ("sport/afl^", '//div[@class="story-block "]/a[@class="thumb-link"]/@href'),  # main page
                ("", '//div[@class="story-block "]/h4[@class="heading"]/a/@href'),  # more-stories, clubs
                )

        except Exception as e:
            format_exc(self, "__init__", e)
Example #28
    def media_downloaded(self, response, request, info):
        try:
            item = response.meta['item']
            (vpath, vname) = os.path.split(response.meta['video_url'])
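            # save the response body under the remote file's base name, inside the item's output directory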
            with open(os.path.join(item['path'], vname), "wb") as f: 
                f.write(response.body)

            self._spider.finalize_state(item['raw_url'], self.STATE_ID)
            log("VideoDirect download complete %s for %s" % (request.url, item['raw_url']), WARNING)
        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #29
 def _index_successful(self):
     try:
         self._links.update(self._existent)
         self._existent.clear()
         for link, state in self._links.viewitems():
             if not isinstance(state, str) and self._is_successful(state):
                 if self._index:
                     self._index.add(link)
         if self._index:
             self._index.save()
     except Exception as e:
         format_exc(self, "_index_successful", e)
Example #30
    def process_item(self, item, spider):
        try:
            if self.NAME in spider.disabled_pipelines:
                return item 

            selector = item['raw_html']
            url = item['raw_url']
            if not url or not selector:
                msg = "Invalid item %s" % str(item)
                log(msg, ERROR)
            #abstr = response.xpath("//p[@class='article-abstract']/text()").extract_first()
            item['title'] = selector.xpath(self._title_x).extract_first()
            if not item['title']:
                item['title'] = selector.xpath("//head/title/text()").extract_first()
            if not item['title']:
                log("No title %s" % url, ERROR)
                raise DropItem()
            item['title'] = item['title'].strip()
            log("RawExtractor got title %s" % item['title'], DEBUG)

            item['title'] = self.encode_strip(item['title'])
            body = ""
            for p in selector.xpath(self._text_paragraph_x).extract():
                body += " " + p
            if body:
                body = body.strip()
            elif self._abstract_paragraph_x:
                body = selector.xpath(self._abstract_paragraph_x).extract_first()
            if not body:
                log("No article text %s" % url, DEBUG)
                body = ""
            item['text'] = body.encode("ascii", "replace").strip(" -\n")
            
            item['pictures'] = selector.xpath(self._picture_x).extract()

            try:
                if self._time_format_in:
                    dt_obj_localized = extract_dt_obj(selector, self._time_x, self._time_format_in)
                else:
                    """Assuming ISO time with timezone on empty format list"""
                    iso_s = selector.xpath(self._time_x).extract_first()
                    dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
                item['time'] = time_utils.format_utc_from_localized(dt_obj_localized, self._time_format_out) 
            except Exception as e:
                log("No time for %s %s" % (url, str(e)), DEBUG)
                item['time'] = ""

            return self._extract_more(item, spider)
            
        except Exception as e:
            if isinstance(e, DropItem):
                raise
            format_exc(self, "process_item", e)
Example #31
    def _links_from_response(self, response):
        try:
            if self.TITLE_PAGE == self.start_url:
                xpath_s = "//section[@class='breaking-news']/header/h1[re:test(text(), 'fantasy')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section[@class='breaking-news']/header/h1[re:test(text(), 'women')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section/header/h1/a[text()='More AFL News']/../../../div/div/div/div/div/article/header/div/h1[@itemprop='name headline']/a/@href"
            else:
                xpath_s = "//article/header[re:test(@class, 'article')]/div/h1/a/@href"

            links = response.xpath(xpath_s).extract()
            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #32
    def _links_from_response(self, response):
        try:
            if self.TITLE_PAGE == self.start_url:
                xpath_s = "//section[@class='breaking-news']/header/h1[re:test(text(), 'fantasy')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section[@class='breaking-news']/header/h1[re:test(text(), 'women')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section/header/h1/a[text()='More AFL News']/../../../div/div/div/div/div/article/header/div/h1[@itemprop='name headline']/a/@href"
            else:
                xpath_s = "//article/header[re:test(@class, 'article')]/div/h1/a/@href"
                
            links = response.xpath(xpath_s).extract()
            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #33
    def media_downloaded(self, response, request, info):
        try:
            item = response.meta['item']
            (vpath, vname) = os.path.split(response.meta['video_url'])
            with open(os.path.join(item['path'], vname), "wb") as f:
                f.write(response.body)

            self._spider.finalize_state(item['raw_url'], self.STATE_ID)
            log(
                "VideoDirect download complete %s for %s" %
                (request.url, item['raw_url']), WARNING)
        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #34
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)

            if self.start_url.startswith(self.VIDEO_PATH):
                self.mode = self.VIDEO
                self.disabled_pipelines = [mirror0.generic_spider.text_image_pipeline.TextImagePipeline.NAME,
                                           mirror0.generic_spider.raw_extractor_pipeline.RawExtractorPipeline.NAME,]
            else:
                self.mode = self.NORMAL

        except Exception as e:
            format_exc(self, "__init__", e)
Example #35
    def get_video_filename(self, item):
        try:
            cmdline = "youtube-dl --no-warnings --get-filename -o '%(title)s.%(ext)s' "
            cmdline += item[self.download_url_field]
            process = subprocess.Popen([cmdline],
                                       stdout=subprocess.PIPE,
                                       stderr=None,
                                       shell=True)
            out_err_tpl = process.communicate()
            return out_err_tpl[0].strip()

        except Exception as e:
            format_exc(self, "get_video_filename", e)
Example #36
 def _run_item(self, response):
     try:
         url = response.request.url
         if url not in self._links:
             log("Response url doesn't match: %s" % url, INFO)
         item = self._item_class(self)
         item['raw_url'] = url
         response = self._prepare_response(response)
         item['raw_html'] = response.selector
         #item['raw_text'] = response.body
         return item
     except Exception as e:
         format_exc(self, "_run_item", e)
Example #37
    def _run_item(self, response):
        try:
            item = super(WatodaySpider, self)._run_item(response)

            if self.start_url.endswith(self.TITLE_PAGE):
                upath = url_path(response.request.url)
                if upath.startswith("video"):
                    item['out_dir'] = "video"
                else:
                    item['out_dir'] = "title-page"

            return item
        except Exception as e:
            format_exc(self, "_run_item", e)
Example #38
    def media_failed(self, failure, request, info):
        try:
            item = request.meta['item']
            log("VideoDirect download failed %s for %s: %s" % (request.url, item['raw_url'], str(info)), ERROR)

            """DEBUG"""
            video_url = request.meta['video_url']
            (vpath, vname) = os.path.split(video_url)
            vname = "FAIL" + vname
            with open(os.path.join(item['path'], vname), "wb") as f: 
                f.write(video_url)

        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #39
    def _run_item(self, response):
        try:
            item = super(WatodaySpider, self)._run_item(response)

            if self.start_url.endswith(self.TITLE_PAGE):
                upath = url_path(response.request.url)
                if upath.startswith("video"):
                    item['out_dir'] = "video"
                else:
                    item['out_dir'] = "title-page"

            return item
        except Exception as e:
            format_exc(self, "_run_item", e)
Example #40
 def _extract_more(self, item, spider):
     try:
         log("NewsExtractor start %s" % item['title'], DEBUG)
         selector = item['raw_html']
         item['ooyala_id'] = selector.xpath("//div[re:test(@class, 'vms module')]/@vms-embedcode").extract_first()
         if item['ooyala_id']:
             log("Matched %s" % item['raw_url'], DEBUG)
         else:
             item['ooyala_id'] = ""
             log("Not matched %s" % item['raw_url'], DEBUG)

         return item
     except Exception as e:
         format_exc(self, "_extract_more", e)
Example #41
 def wait_all_finished(self, spider):
     try:
         log("Waiting for video processes to complete...")
         i = len(self._sub_proc)
         for process, logfile, logfile_path, url in self._sub_proc:
             print "Left %i" % (i)
             if self._no_duplicates:
                 tail_proc = subprocess.Popen(["tail", "-f", "-n", "1", logfile_path], stdout=None, stderr=None)
             process.communicate()
             #process.wait()
             if self._no_duplicates:
                 tail_proc.terminate()
             logfile.close()
             # downloaded successfully
             if 0 == process.returncode:
                 spider.start_state(url, self.STATE_ID)
                 spider.finalize_state(url, self.STATE_ID)
             # return code 1 can mean either no video on the page or an unsupported page
             elif 1 == process.returncode:
                 # check_call accepts a list of arguments only
                 grepr = StreamPipeline._call(["grep", "ERROR.*Unsupported", logfile_path])
                 if 0 == grepr:
                     spider.start_state(url, self.STATE_NOVID)
                     spider.finalize_state(url, self.STATE_NOVID)
                 else:
                     grepr = StreamPipeline._call(["grep", "ERROR.*content ID", logfile_path])
                     spider.start_state(url, self.STATE_ID)
                     if 0 == grepr:
                         log("Content id error %s" % url, DEBUG)
                     else:
                         grepr = StreamPipeline._call(["grep", "ERROR.*timed out", logfile_path])
                         if 0 == grepr:
                             log("Ydl video timed out {0}".format(url), WARNING)
                         elif 1 == grepr:
                             log("Log state is not known %s" % (url), WARNING)
                         else:
                             log("Grep return code %i %s" % (grepr, url), WARNING)
             elif -2 == process.returncode:
                 # interrupted by user
                 spider.start_state(url, self.STATE_ID)
             else:
                 # a state that is started but never finalized marks the download as failed
                 spider.start_state(url, self.STATE_ID)
                 log("Youtube-dl return code %i %s" % (process.returncode, url), WARNING)
             i -= 1
         self._sub_proc[:] = []
     except Exception as e:
         format_exc(self, "wait_all_finished", e)
         raise
Example #42
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body)
         item = response.meta['item']
         if match:
             item['ooyala_urls'].append(match.group(1))
             return item
         else:
             item.start(OOYALA_JS_ID)
             log("Ooyala0: type 2 %s" % item['raw_url'], WARNING)
             item['vlog'](response.body)
             return item
             #log("No ooyala match %s %s" % (request.url, item['raw_url']))
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #43
    def media_failed(self, failure, request, info):
        try:
            item = request.meta['item']
            log(
                "VideoDirect download failed %s for %s: %s" %
                (request.url, item['raw_url'], str(failure)), ERROR)
            """DEBUG: write a FAIL-prefixed marker file containing the failed video URL"""
            video_url = request.meta['video_url']
            (vpath, vname) = os.path.split(video_url)
            vname = "FAIL" + vname
            with open(os.path.join(item['path'], vname), "wb") as f:
                f.write(video_url)

        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #44
    def _extract_more(self, item, spider):
        try:
            selector = item['raw_html']

            if spider.NORMAL == spider.mode:
                item['video_urls'] = selector.xpath("//video/source[@type='video/mp4']/@src").extract()
            elif spider.VIDEO == spider.mode:
                pass
            else:
                assert False, "Wrong mode value"

            #['https://snappytv-a.akamaihd.net/video/928000/603p603/2016-06-04T12-05-17.467Z--35.797.mp4?token=1467913795_5faf17e8319b2988a149bfba6a686f40']
            return item
        except Exception as e:
            format_exc(self, "_extract_more", e)
Example #45
    def _links_from_response(self, response):
        try:
            if self.start_url.endswith(self.TITLE_PAGE):
                links = response.xpath(
                    "//article[re:test(@class, 'story')]/descendant::h3[@class='story__headline']/a/@href"
                ).extract()
                links = [lnk for lnk in links if self.TITLE_PAGE in lnk or self.VIDEO_PATH in lnk]
            else:
                links = response.xpath(
                    "//article[@class='story has-wof']/div[@class='story__wof']/h3[@class='story__headline']/a/@href"
                    " | //article[re:test(@class, 'has-wof')]/h3[@class='story__headline']/a/@href"
                ).extract()
            return links

        except Exception as e:
            format_exc(self, "_links_from_response", e)
            return None
Example #46
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body)
         item = response.meta['item']
         if match:
             item['ooyala_urls'].append(match.group(1))
             return item
         else:
             item.start(OOYALA_JS_ID)
             log("Ooyala0: type 2 %s" % item['raw_url'], WARNING)
             item['vlog'](response.body)
             return item
             #log("No ooyala match %s %s" % (request.url, item['raw_url']))
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #47
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)
            self._per_url_regex_xpath = (
                (r"video/sport/afl",
                 "//a[@class='vms-list-item module']/@href"),
                ("sport/afl^",
                 '//div[@class="story-block "]/a[@class="thumb-link"]/@href'
                 ),  #main page
                ("",
                 '//div[@class="story-block "]/h4[@class="heading"]/a/@href'
                 ),  #more-stories, clubs 
            )

        except Exception as e:
            format_exc(self, "__init__", e)
Example #48
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)

            self.less_vid = kw.get('less_vid', False)
            # in this spider videos are downloaded by the framework itself,
            # so there is no need to wait for additional processes
            self.video_processor = self

            self._per_url_regex_xpath = (
                ("nabchallenge" , "//h4[@class='partial--finals-video__caption']/a/@href | //h3[re:test(text(), 'News')]/parent::div/parent::div/following::div/div/div[re:test(@class, 'list-item')]/div[re:test(@class, 'inner')]/h4/a/@href"),
            )


        except Exception as e:
            format_exc(self, "__init__", e)
Example #49
    def _extract_more(self, item, spider):
        try:
            log("NewsExtractor start %s" % item['title'], DEBUG)
            selector = item['raw_html']
            item['ooyala_id'] = selector.xpath(
                "//div[re:test(@class, 'vms module')]/@vms-embedcode"
            ).extract_first()
            if item['ooyala_id']:
                log("Matched %s" % item['raw_url'], DEBUG)
            else:
                item['ooyala_id'] = ""
                log("Not matched %s" % item['raw_url'], DEBUG)

            return item
        except Exception as e:
            format_exc(self, "_extract_more", e)
Example #50
    def __init__(self, **kw):
        try:
            mirror0.generic_spider.Spider.__init__(self, **kw)

            if self.start_url.startswith(self.VIDEO_PATH):
                self.mode = self.VIDEO
                self.disabled_pipelines = [
                    mirror0.generic_spider.text_image_pipeline.
                    TextImagePipeline.NAME,
                    mirror0.generic_spider.raw_extractor_pipeline.
                    RawExtractorPipeline.NAME,
                ]
            else:
                self.mode = self.NORMAL

        except Exception as e:
            format_exc(self, "__init__", e)
Example #51
    def process_item(self, item, spider):
        f = None
        try:
            META_FILE = "meta.dat"
            file_path = os.path.join(item['path'], META_FILE)
            f = open(file_path, "w")
            f.write("url=%s\n" % item['raw_url'])
            f.write("publishedUTC=%s\n" % item['time'])

            if "ooyala_video_ids" in item and item['ooyala_video_ids']:
                f.write("data-content-id=%s" % json.dumps(
                    item['ooyala_video_ids'], separators=(", ", " ")))
        except Exception as e:
            format_exc(self, "process_item", e)
        finally:
            # f may still be None if open() itself failed
            if f:
                f.close()

        return item
Example #52
    def _run_item(self, response):
        try:
            if self.mode == self.NORMAL:
                item = super(FoxsportsSpider, self)._run_item(response)
                if self.TITLE_PAGE == self.start_url:
                    item['out_dir'] = "title_page"
                yield item
            elif self.mode == self.VIDEO:
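                # video mode: build items directly from the listing page's itemprop metadata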
                response = self._prepare_response(response)
                for sel_item in response.selector.xpath(
                        "//li[re:test(@class,'fiso-video-mosaic')]"):
                    url = sel_item.xpath(
                        "./descendant::meta[@itemprop='contentURL']/@content"
                    ).extract_first()
                    if url:
                        debug_link_regex = ""
                        try:
                            debug_link_regex = Config.value(
                                mirror0.SECTION_COMMON, "debug_link_regex")
                        except Exception:
                            pass

                        if debug_link_regex:
                            if not re.search(debug_link_regex, url):
                                continue
                        title = sel_item.xpath(
                            "./descendant::meta[@itemprop='headline name']/@content"
                        ).extract_first()
                        time = sel_item.xpath(
                            "./descendant::meta[@itemprop='uploadDate']/@content"
                        ).extract_first()
                        item = self._item_class(self)
                        item['video_urls'] = [url]
                        item['title'] = RawExtractorPipeline.encode_strip(
                            title)
                        item['raw_url'] = url
                        item['time'] = time
                        self._links[url] = "?"

                        #"""DEBUG"""
                        yield item
                else:
                    assert "Wrong mode value"

        except Exception as e:
            format_exc(self, "_run_item", e)