Example #1
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)',
                           response.body)
         try:
             item = response.meta['item']
             item['vlog'](response.body)
             if not match:
                 item.start(self.STATE_TYPE2)
                 log("NIU TYPE2 for {}".format(item['raw_url']))
                 return
             date = match.group(1)
             mend = response.body[match.end():]
             match = re.search(r"([\w]+).mp4,200000", mend)
             if not match:
                 match = re.search(r"([\w]+).mp4,500000", mend)
             vid_name = match.group(1)
             item['playlist_url'] = "http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"\
                  + date + "," + vid_name + ",.mp4.csmil/master.m3u8"
             item.finish(self.STATE_ID)
             return item
         except Exception as e:
             log(
                 "Y-dl playlist extraction failed %s: %s" %
                 (item['raw_url'], str(e)), ERROR)
             item['vlog'](response.body)
             return item
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #2
 def open_spider(self, spider):
     self._top_dir = os.path.join(config_out, spider.allowed_domains[0])
     try:
         dir_util.mkpath(self._top_dir)
     except Exception as e:
         log("Can't create output directory %s : %s" % (self._top_dir, str(e)), ERROR)
Example #3
    def get_media_requests(self, item, info):

        if NO_VIDEO:
            return

        try:
            for video_url in item['video_urls']:
                self._spider.start_state(item['raw_url'], self.STATE_ID)
                log("VideoDirect downloading %s " % (video_url), DEBUG)
                request = scrapy.Request(
                    url=video_url,
                    #"http://aaa[1]b(2).ru",
                    method="GET",
                    headers={
                        "Accept": "*/*",
                        "User-Agent": "Mozilla",
                    },
                    meta={
                        "item": item,
                        "video_url": video_url
                    },  #"download_timeout":600,
                    dont_filter=True,
                )
                yield request
        except Exception as e:
            format_exc(self, "get_media_requests", e)
Example #4
def init_chrome_driver(timeout=30):
    chrome_options = Options()
    chrome_options.add_argument("--disable-bundled-ppapi-flash")
    chrome_options.add_argument("--disable-plugins-discovery")
    chrome_options.add_argument("--disable-webaudio")
    chrome_options.add_argument("--mute-audio")
    #chrome_options.add_argument("--no-startup-window")
    prefs = {}
    prefs["plugins.plugins_disabled"] = [
        "Adobe Flash Player", "Shockwave Flash"
    ]
    prefs["profile.managed_default_content_settings.images"] = 2
    #prefs["profile.managed_default_content_settings.media_stream"] = 2
    chrome_options.add_experimental_option("prefs", prefs)

    path = Config.value(SECTION_COMMON, "chromedriver_path")
    if path:
        log("Chromedriver path: %s" % path, INFO)
        driver = webdriver.Chrome(executable_path=path,
                                  chrome_options=chrome_options)
    else:
        driver = webdriver.Chrome(chrome_options=chrome_options)

    driver.wait = WebDriverWait(driver, timeout)
    return driver
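A typical call site for the helper above; a sketch assuming standard Selenium imports (the URL and locator are illustrative, not from this project):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    driver = init_chrome_driver(timeout=60)
    try:
        driver.get("http://example.com/")
        # driver.wait is the WebDriverWait attached by init_chrome_driver()
        body = driver.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    finally:
        driver.quit()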
Example #5
 def save(self):
     try:
         f = open(self._file_path, "wb")
         for hsh in self._hashes:
             f.write(Index._long_string(hsh))
         f.close()
     except Exception as e:
         log("Saving %s failed: %s" % (self._file_path, str(e)))
Example #6
 def _call(arglst):
     try:
         subprocess.check_call(arglst, stdout=None)
     except subprocess.CalledProcessError as e:
         return e.returncode
     except Exception as e:
         log("subporcess.check_call failed %s" % str(e), ERROR)
         return 2
     return 0
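_call folds every outcome into a shell-style exit code: 0 on success, the child's own return code on CalledProcessError, 2 on anything else. A usage sketch (the command line is an illustrative assumption; log and ERROR are the project's own logging helpers used throughout these examples):

    rc = _call(["ln", "-s", "-f", "/tmp/src", "/tmp/dst"])
    if rc:
        log("link command failed with code %i" % rc, ERROR)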
Example #7
 def open_spider(self, spider):
     #super(self.Nwjs, self).open_spider(spider)
     MediaPipelineEx.open_spider(self, spider)
     spider._object_cleaner.add_command(_clean_if_need)
     global _driver
     if not _driver:
         _driver = init_chrome_driver(TIMEOUT)
         log("Starting Chromedriver login")
         do_login(self, _driver)
Example #8
    def process_item(self, item, spider):
        try:
            if self.NAME in spider.disabled_pipelines:
                return item

            selector = item['raw_html']
            url = item['raw_url']
            if not url or not selector:
                msg = "Invalid item %s" % str(item)
                log(msg, ERROR)
                raise DropItem()
            #abstr = response.xpath("//p[@class='article-abstract']/text()").extract_first()
            item['title'] = selector.xpath(self._title_x).extract_first()
            if not item['title']:
                item['title'] = selector.xpath(
                    "//head/title/text()").extract_first()
            if not item['title']:
                log("No title %s" % url, ERROR)
                raise DropItem()
            item['title'] = item['title'].strip()
            log("RawExtractor got title %s" % item['title'], DEBUG)

            item['title'] = self.encode_strip(item['title'])
            body = ""
            for p in selector.xpath(self._text_paragraph_x).extract():
                body += " " + p
            if body:
                body = body.strip()
            elif self._abstract_paragraph_x:
                body = selector.xpath(
                    self._abstract_paragraph_x).extract_first()
            if not body:
                log("No article text %s" % url, DEBUG)
                body = ""
            item['text'] = body.encode("ascii", "replace").strip(" -\n")

            item['pictures'] = selector.xpath(self._picture_x).extract()

            try:
                if self._time_format_in:
                    dt_obj_localized = extract_dt_obj(selector, self._time_x,
                                                      self._time_format_in)
                else:
                    """Assuming ISO time with timezone on empty format list"""
                    iso_s = selector.xpath(self._time_x).extract_first()
                    dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
                item['time'] = time_utils.format_utc_from_localized(
                    dt_obj_localized, self._time_format_out)
            except Exception as e:
                log("No time for %s %s" % (url, str(e)), DEBUG)
                item['time'] = ""

            return self._extract_more(item, spider)

        except Exception as e:
            if isinstance(e, DropItem):
                raise
            format_exc(self, "process_item", e)
Example #9
    def _request_failed(self, failure):
        log("Failed: %s" % str(failure), ERROR)

        """ HttpError ?
        if isinstance(failure.value, spidermiddlewares.HttpError):
            response = failure.value.response
            log("Code ", str(response.status))
        else:
            log("Failed miserably: %s" % str(failure))"""
        log("Failed: %s" % str(failure))
Example #10
    def add(self, byte_str):
        byte_str = Index._strip_url(byte_str)

        try:
            f = open(self._log_file_path, "a")
            f.write(byte_str + "\n")
        except Exception as e:
            log("Index log write failed %s" % str(e), ERROR)

        crc = self._crc_fun(byte_str)
        self._hashes.update([crc])
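add() only records a hash; the matching membership test is not shown in these examples. A hypothetical counterpart, assuming the same normalization and CRC function:

    def seen(self, byte_str):
        # Hypothetical lookup: same strip + CRC as add(), then a set test.
        byte_str = Index._strip_url(byte_str)
        return self._crc_fun(byte_str) in self._hashes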
Example #11
 def _run_item(self, response):
     try:
         url = response.request.url
         if url not in self._links:
             log("Response url doesn't match: %s" % url, INFO)
         item = self._item_class(self)
         item['raw_url'] = url
         response = self._prepare_response(response)
         item['raw_html'] = response.selector
         #item['raw_text'] = response.body
         return item
     except Exception as e:
         format_exc(self, "_run_item", e)
Example #12
    def media_downloaded(self, response, request, info):
        try:
            item = response.meta['item']
            (vpath, vname) = os.path.split(response.meta['video_url'])
            with open(os.path.join(item['path'], vname), "wb") as f:
                f.write(response.body)

            self._spider.finalize_state(item['raw_url'], self.STATE_ID)
            log(
                "VideoDirect download complete %s for %s" %
                (request.url, item['raw_url']), WARNING)
        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #13
    def process_request(self, request, spider):
        if ("webdriver" in request.meta
                and request.meta["webdriver"] == "do_use"):
            log("Selenium requesting {}".format(request.url))
            try:
                spider.driver.get(request.url)
            except TimeoutException:
                log("Web driver: timeout at {}".format(request.url),
                    logging.ERROR)

            return HtmlResponse(spider.driver.current_url,
                                body=spider.driver.page_source,
                                encoding="utf-8",
                                request=request)
Example #14
    def __str__(self):
        obj_states = self
        try:
            started_str = str()
            for s in obj_states.started:
                started_str = started_str + s + " " 

            finished_str = str()
            for s in obj_states.finished:
                finished_str = finished_str + s + " "

            return "{:<30} -> {:<30}".format(started_str, finished_str)
        except TypeError:
            log("format_string_indicators TypeError", ERROR)
            raise
Example #15
    def media_failed(self, failure, request, info):
        try:
            item = request.meta['item']
            log(
                "VideoDirect download failed %s for %s: %s" %
                (request.url, item['raw_url'], str(info)), ERROR)
            """DEBUG"""
            video_url = request.meta['video_url']
            (vpath, vname) = os.path.split(video_url)
            vname = "FAIL" + vname
            with open(os.path.join(item['path'], vname), "wb") as f:
                f.write(video_url)

        except Exception as e:
            format_exc(self, "media_downloaded", e)
Example #16
 def media_downloaded(self, response, request, info):
     try:
         match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body)
         item = response.meta['item']
         if match:
             item['ooyala_urls'].append(match.group(1))
             return item
         else:
             item.start(OOYALA_JS_ID)
             log("Ooyala0: type 2 %s" % item['raw_url'], WARNING)
             item['vlog'](response.body)
             return item
             #log("No ooyala match %s %s" % (request.url, item['raw_url']))
     except Exception as e:
         format_exc(self, "media_downloaded", e)
Example #17
 def handle_downloaded(inst, response, vid_name = ""):
     try:
         item = response.meta['item']
         url = item['raw_url']
         if not vid_name:
             vid_name = Ooyala1Pipeline.__vid_name(inst, response.meta['number'])
         with open(os.path.join(item['path'], vid_name + ".mp4"), "wb") as vid_f:
             vid_f.write(response.body)
         log("Finished downloading video %s for %s %s (%i)" % (vid_name, item['title'], item['raw_url'], len(response.body)))
         if len(response.body) < 5000:
             item['vlog'](response.body)
         #caller either should have id or finish by itself
         if hasattr(inst, 'STATE_ID'):
             item.finish(inst.STATE_ID)
     except Exception as e:
         format_exc(inst, "handle_downloaded", e)
Example #18
    def _extract_more(self, item, spider):
        try:
            selector = item['raw_html']
            if "/video" in item['raw_url'] or selector.xpath("//article/descendant::div[re:test(@class, 'inline-player')]"):
                item['skip_video'] = False
                log("Video elements found in article %s" % item['raw_url'], DEBUG)
            else:
                item['skip_video'] = True
                log("No video elements in article %s" % item['raw_url'], DEBUG)

            item['twitter_data'] = \
                [re.search(r'status/(\d+)', twit_lnk).group(1)
                    for twit_lnk in selector.xpath("//blockquote[re:test(@class,'twitter-tweet')]/a[re:test(@href, 'status/(\d+)')]/@href").extract()]

            return item
        except Exception as e:
            format_exc(self, "_extract_more", e)
Example #19
 def phantom_login(self):
     global _webdriver
     if _webdriver:  #login once per application run
         self.driver = _webdriver
         log("Reusing logged in webdriver", DEBUG)
     else:
         dcap = dict(DesiredCapabilities.PHANTOMJS)
         dcap["phantomjs.page.settings.loadImages"] = "false"
         dcap["phantomjs.page.settings.resourceTimeout"] = "120000"
         _webdriver = webdriver.PhantomJS(
             executable_path=Config.value(CONFIG_SECTION, "phantomjs_path"),
             desired_capabilities=dcap)  #init_chrome_driver()#
         self.driver = _webdriver
         log("Starting PhantomJS login")
         do_login(self, self.driver)
         self.driver.implicitly_wait(PAGE_TIMEOUT)
         self.driver.set_page_load_timeout(PAGE_TIMEOUT)
         getLogger("selenium.webdriver").setLevel(INFO)
Example #20
 def yield_requests(inst, item, urls_field):
     try:
         assert item[urls_field], "Should be called for a filled item"
         i = 1
         for vid in item[urls_field]:
             log("Started downloading video %s for %s" % (Ooyala1Pipeline.__vid_name(inst, i), item['raw_url']))
             yield scrapy.Request(
                 url=vid,
                 method="GET",
                 headers={
                     "Accept" : "*/*",
                     "User-Agent" : "Mozilla",
                 },
                 meta={'item':item, 'number':i},
             )
             i += 1
     except Exception as e:
         format_exc(inst, "yield_requests", e)
Example #21
    def __init__(self, **kw):
        try:
            scrapy.Spider.__init__(self, **kw)

            self.video_processor = None

            """attributes to be overridden"""
            self._index = NotImplementedError()
            self.disabled_pipelines = []

            self._page_count = 0
            self._dates = []
            self._links = {} 
            self._video_msg = {}
            self._existent = {}
            self._next_page_url_interrupted = ""
            self._retry_count = 0
            self._lnk_pos = 0
            self._total_count = 0
            self._object_cleaner = None

            dispatcher.connect(self._spider_idle, scrapy.signals.spider_idle)

            if kw.get('no_index', False):
                self._index = None
            else:
                self._index = Index(self.BASE_DOMAIN)

            self.__first_page = kw.get('first_page', False)

            self.start_url = kw.get('start_url')

            self._object_cleaner = kw.get('object_cleaner') 

            if "/" == self.start_url[0]:
                self.start_url = self.BASE_URL + self.start_url
            log("\n\nSTART: %s" % self.start_url, INFO)
            self.logidx("\nLog for %s started %s" % (self.start_url, time.strftime("%b %d %H:%M:%S %Y")))

            self._per_url_regex_xpath = () 
            self._debug_url = ""

        except Exception as e:
            format_exc(self, "__init__", e)
Example #22
    def __init__(self, domain):
        """Loads data to memory. Creates index directory if needed and raises DistutilsFileError if failed.
        Raises IndexFingerprintException

        domain - index storage ID
        """
        try:
            POLYNOMIAL = 0x1AABBCCDDFFEEDDCC  # must be 65 bit long

            self._debug = 0

            self._hashes = set()
            self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0)

            # When run from the unit test, index directory path will be tweaked in Config
            file_path = Config.value(mirror0.SECTION_COMMON, "index_directory")
            dir_util.mkpath(file_path)

            file_name = domain + ".crc64"
            self._file_path = os.path.join(file_path, file_name)
            with open(self._file_path, "a+b") as f:
                data = f.read()
                if len(data) % CRC_LEN:
                    raise IndexFingerprintException("%s is corrupt!" %
                                                    file_name)
                count = len(data) / CRC_LEN
                for i in range(0, count):
                    string_val = data[i * CRC_LEN:(i + 1) * CRC_LEN]
                    int_val = Index._string_long(string_val)
                    self._hashes.update([int_val])
                log("Read %i hashes from %s" % (count, file_name))

            file_name = domain + ".log"
            self._log_file_path = os.path.join(file_path, file_name)
            # Rewrite through centralized logging
            with open(self._log_file_path, "a") as f:
                f.write("\n\nSTARTED %s\n" %
                        time.strftime("%d %b %Y %H:%M:%S"))

        except IndexFingerprintException as e:
            format_exc(self, "__init__", e)
            log(self._file_path, ERROR)
            raise
Example #23
    def process_item(self, item, spider):
        if self.NAME in spider.disabled_pipelines:
            return item 

        log("TextImage pipeline {0}".format(item['raw_url']))
        try:
            spider.start_state(item['raw_url'], TextImagePipeline.STATE_ID)
            
            text_path = os.path.join(item['path'], ARTICLE_FILE) 
            if item['text']:
                with open(text_path, "w") as f:
                    f.write(item['text'])
        except Exception as e:
            log("Error writing article text %s : %s" % (item['raw_url'], str(e)), ERROR)

        picture_timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)

        if item['pictures']:
            log("Downloading images for {0}".format(item['raw_url']), DEBUG)
        i = 0
        for img in item['pictures']:
            try:
                (foo, ext) = os.path.splitext(img)
                img_name = "%02i" % i + (ext if ext else "")
                img_path = os.path.join(item['path'], img_name) 
                if "/" == img[0]:
                    img = os.path.join(spider.BASE_URL, img[1:])

                if not NO_PICTURES:
                    fileobj = urllib2.urlopen(img, timeout=picture_timeout)
                    with open(img_path, "wb") as f:
                        f.write(fileobj.read())
                i += 1

            except Exception as e:
                 log("Error writing article image %s : %s" % (img, str(e)), ERROR)     
        if i:
             log("%i images retrieved for %s" % (i, item['title']), DEBUG)
                
        spider.finalize_state(item['raw_url'], TextImagePipeline.STATE_ID)
        return item
Example #24
    def _spider_idle(self, spider):
        """Collect more links, starting from the place previously stopped"""
        try:
            log("Spider {0} idle start".format(self.name), DEBUG)
            if self.video_processor:
                self.video_processor.wait_all_finished(self)
            if self._links or self._existent:
                #should complete all requests before going further
                self._index_successful()
                for link, states in self._links.viewitems():
                    self.logidx("%s %s" % (str(states), link))

                lost = sum(1 for lnk, result in self._links.viewitems() if "?" == result)
                ok = sum(1 for lnk, result in self._links.viewitems() if not type(result) is str and self._is_successful(result))
                log("Lost links: %i, OK: %i" % (lost, ok), WARNING)
                self._links.clear()

                if self.__first_page:
                    return

                if self._next_page_url_interrupted:
                    log("Idle, start collecting links")
                    self.logidx("Requesting {0}".format(self._next_page_url_interrupted))
                    req = Spider._request(self._next_page_url_interrupted, self._collect_next_page_links)
                    self._next_page_url_interrupted = ""
                    self.crawler.engine.crawl(req, spider)
        except Exception as e:
            format_exc(self, "_spider_idle", e)
Example #25
    def _request_next_page_links(self, next_url, webdriver):

        if (len(self._links) >= LINKS_BATCH or not next_url):
            #request articles from collected links
            requests = []
            for url in self._links:
                requests.append(Spider._request(
                    url_=url, 
                    callback_=self._run_item,
                    errback_=self._request_failed,
                    dont_filter_=True,
                    meta_={"webdriver" : webdriver,},))
                self._lnk_pos += 1
            self._next_page_url_interrupted = next_url
            #scrapy sends them in the reverse order
            requests.reverse()
            log(json.dumps(self._links, separators=("\n"," ")), DEBUG)
            log("Requesting articles")
            return requests

        if next_url:
            return Spider._request(url_=next_url, callback_=self._collect_next_page_links)
Example #26
    def __init__(self, *, domain):
        """Loads data to memory. Creates index directory if needed and raises DistutilsFileError, OSError
        Raises IndexFingerprintException
        domain - index storage ID
        """
        POLYNOMIAL = 0x1AABBCCDDFFEEDDCC  # must be 65 bit long

        self._hashes = set()
        self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0)

        try:
            # When run from the unit test, index directory path will be tweaked in Config
            file_path = Config.value(mirror0.SECTION_COMMON, "index_directory")
            mkpath(file_path)

            file_name = domain + ".crc64"
            self._file_path = os.path.join(file_path, file_name)
            with open(self._file_path, "rb") as f:
                data = f.read()
        except (OSError, DistutilsFileError) as e:
            format_exc(self, "Index init failed", e)
            raise

        if len(data) % CRC_LEN:
            msg = "{} is corrupt!".format(self._file_path)
            log(msg, ERROR)
            raise IndexFingerprintException(msg)

        count = len(data) // CRC_LEN
        for i in range(0, count):
            string_val = data[i * CRC_LEN:(i + 1) * CRC_LEN]
            int_val = _bytes_to_long(string_val)
            self._hashes.update([int_val])
        log("Read {} hashes from {}".format(count, file_name))

        file_name = domain + ".log"
        self._log_file_path = os.path.join(file_path, file_name)
        self.index_log("\n\nSTARTED {}\n".format(
            time.strftime("%d %b %Y %H:%M:%S")))
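_bytes_to_long is referenced above but not defined in these examples; one plausible Python 3 implementation, assuming the same big-endian layout as the save/load round trip:

    def _bytes_to_long(data):
        # Interpret CRC_LEN raw bytes as an unsigned big-endian integer.
        return int.from_bytes(data, byteorder="big")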
Example #27
    def process_item(self, item, spider):
        if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
            log("StreamPipeline skipping {}".format(item['raw_url']))
            return item

        spider.video_processor = self

        try:
            #check if already downloaded (or tried to) and link to previously saved path
            if self._no_duplicates:
                video_fname = self.get_video_filename(item)
                if video_fname:
                    if video_fname in self.__downloaded_files:
                        ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                        ln_from = os.path.join(item['path'], video_fname)
                        rln = self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                        url = item["raw_url"]
                        log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                        spider.start_state(url, self.STATE_ID)
                        spider.finalize_state(url, self.STATE_ID)
                        return item  # do not download
                    else:
                        #remember fname immediately, if not done before. don't wanna wait results
                        #making things more complex. want to exclude duplicates of not-yet-finished videos.
                        self.__downloaded_files[video_fname] = item['path']
                        #print "added {0}".format(video_fname)

            logfile_path = item['vlog'].file_path
            logfile = open(logfile_path, "w", 0)

            timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
            data_dir = item['path']
            cmdline = "youtube-dl --no-warnings "
            if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
                cmdline += "--hls-prefer-native "
            cmdline += "--no-part --socket-timeout {0} ".format(timeout)
            cmdline += "-o '%s" % data_dir  
            cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
            cmdline += item[self.download_url_field]
            logfile.write(cmdline + "\n")
            self.__vcounter += 1

            log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)

            self._sub_proc.append(
                (subprocess.Popen([cmdline], stdout=logfile.fileno(), stderr=logfile.fileno(), shell=True), #stderr=subprocess.STDOUT,
                 logfile,
                 logfile_path,
                 item["raw_url"],),
                )


        except Exception as e:
            format_exc(self, "porcess_item", e)

        return item
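Because the pipeline builds one shell string, the output template must be single-quoted by hand. A sketch of the equivalent spawn as an argument list, which avoids the shell and its quoting entirely (an alternative, not the pipeline's actual code; data_dir, vcounter, timeout, video_url and logfile stand in for the pipeline's values):

    argv = ["youtube-dl", "--no-warnings", "--no-part",
            "--socket-timeout", str(timeout),
            "-o", "%s/%%(title)s-%i.%%(ext)s" % (data_dir, vcounter),
            video_url]
    proc = subprocess.Popen(argv, stdout=logfile.fileno(), stderr=logfile.fileno())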
Example #28
    def get_media_requests(self, item, info):
        item['playlist_url'] = ""

        if item['ooyala_id']:
            try:
                item.start(self.STATE_ID)
                url = "http://player.ooyala.com/player_api/v1/metadata/embed_code/{}/{}".format(
                    Niux.request_code, item['ooyala_id'])
                log("Preparing youtube-dl playlist %s " % (item['raw_url']),
                    DEBUG)
                request = scrapy.Request(
                    url=url,
                    method="GET",
                    headers={
                        "Accept": "*/*",
                        "User-Agent": "Mozilla",
                    },
                    meta={"item": item},
                    dont_filter=True,
                )
                yield request
            except Exception as e:
                format_exc(self, "get_media_requests", e)
Example #29
    def get_media_requests(self, item, info):
        item['ooyala_urls'] = []

        if NO_VIDEO:
            return

        try:
            for vid_id in item['ooyala_video_ids']:
                url = "http://player.ooyala.com/player_api/v1/metadata/embed_code/89a379a0e1e94feca5bb87c46a8b2d5e/" + vid_id
                log("Ooyala 0 requesting %s " % (item['raw_url']), DEBUG)
                request = scrapy.Request(
                    url=url,
                    #callback=self._request_done,
                    method="GET",
                    headers={
                        "Accept": "*/*",
                        "User-Agent": "Mozilla",
                    },
                    meta={"item": item},
                    dont_filter=True,
                )
                yield request
        except Exception as e:
            format_exc(self, "get_media_requests", e)
Example #30
    def _extract_more(self, item, spider):
        try:
            log("NewsExtractor start %s" % item['title'], DEBUG)
            selector = item['raw_html']
            item['ooyala_id'] = selector.xpath(
                "//div[re:test(@class, 'vms module')]/@vms-embedcode"
            ).extract_first()
            if item['ooyala_id']:
                log("Matched %s" % item['raw_url'], DEBUG)
            else:
                item['ooyala_id'] = ""
                log("Not matched %s" % item['raw_url'], DEBUG)

            return item
        except Exception as e:
            format_exc(self, "_extract_more", e)
Example #31
    def get_media_requests(self, item, info):
        item_url = item['raw_url']
        try:
            data_cnt = len(item['twitter_data'])
            if data_cnt:
                log("%s extracting for %s" % (self.__class__.__name__, item['raw_url']), DEBUG)
            for data_num in range(0, data_cnt):
                prev_response = item['twitter_data'][data_num]
                item['twitter_data'][data_num] = None 
                if not prev_response:
                    log("Stopped num %i for %s" % (data_num, item_url), INFO)
                else:
                    err_msg = ""
                    try:
                        next_link = self.extract_next_link(prev_response)
                    except Exception as e:
                        err_msg = str(e) 
                        next_link = None
                    if next_link:
                        self.start_state_if_needed(item, data_num)
                        log("%i requesting %s for %s" % (data_num, next_link, item_url), DEBUG)
                        yield scrapy.Request(
                            url=next_link,
                            method="GET",
                            headers={
                                "Accept" : "*/*",
                                "User-Agent" : "Mozilla",
                            },
                            meta={'item':item, 'data_num':data_num},
                        )
                    else:
                        item['vlog'](("data_num %i\n" % data_num) + prev_response.body)
                        log("Extraction failed num %i: %s for %s" % (data_num, err_msg, item_url), DEBUG)

        except Exception as e:
            format_exc(self, "get_media_requests %s" % item_url, e)
Example #32
    def _extract_more(self, item, spider):
        try:
            log("AflExtractor start %s" % item['title'], DEBUG)
            selector = item['raw_html']
            item['ooyala_video_ids'] = selector.xpath("//div[re:test(@class, 'ooyala-player')]/@data-content-id").extract()

            if item['ooyala_video_ids']:
                log("Matched %s" % item['raw_url'], DEBUG)
            else:
                log("Not matched %s" % item['raw_url'], DEBUG)
                #with open(self._idx_file, "a") as f:
                #     f.write(item['raw_text'] + "\n\n")
        
            if self._need_twitter:
                item['twitter_data'] = \
                    [re.search(r'status/(\d+)', twit_lnk).group(1) 
                        for twit_lnk in selector.xpath("//blockquote[@class='twitter-video']/a[re:test(@href, 'status/\d+')]/@href").extract()]

            return item
        except Exception as e:
            format_exc(self, "_extract_more", e)
Example #33
    def process_item(self, item, spider):
        try:
            log("FSCreator start %s" % item['title'], DEBUG)
            #log("fs for %s" % item['title'])
            item_dir = os.path.join(self._top_dir, self.__class__.getItemDir(item, spider))
            if os.path.isdir(item_dir):
                log("Article path exists, overwriting: %s" % item_dir, DEBUG)
            try:
                dir_util.mkpath(item_dir)
            except Exception as e:
                log("Can't create article directory %s : %s" % (item_dir, str(e)), ERROR)
            
            item['path'] = item_dir

            if not self.__vlog_dir:
                self.__vlog_dir = os.path.join(Config.value(SECTION_COMMON, "log_directory"), spider.name + "_streaming")
                shutil.rmtree(self.__vlog_dir, True)
                try:
                    os.mkdir(self.__vlog_dir)
                except OSError as e:
                    pass
                self.__need_clean = False

            logfile_path = os.path.join(self.__vlog_dir, item['title'] + ".log")

            class VideoLog:
                def __init__(self):
                    self.file_path = None

            vlog = VideoLog()
            vlog.file_path = logfile_path
            vlog.__call__ = functools.partial(FSCreatorPipeline.append_file, logfile_path)
            item['vlog'] = vlog

            return self._create_more(item, spider)
        except Exception as e:
            if isinstance(e, DropItem):
                raise
            else:
                format_exc(self, "process_item", e)