def parse_kb(self, response):
    """Yield a firmware item for every "cache-www" link in the article body.

    The markup of the first ``sfdc_richtext`` div is split on the literal
    "=-" and each piece is re-parsed as its own HTML document, so that the
    text searched for a version/date belongs to the same piece as the link.

    Reads ``response.meta["product"]`` and ``self.name``.
    """
    # initial html tokenization to find regions segmented by e.g. "======"
    # or "------"
    article = response.xpath("//div[@class='sfdc_richtext']").extract()[0]

    for chunk in article.split("=-"):
        region = HtmlResponse(
            url=response.url,
            body=chunk and chunk.strip(),
            encoding=response.encoding,
        )

        for anchor in region.xpath("//a"):
            target = anchor.xpath("@href").extract()[0]
            if "cache-www" not in target:
                continue

            region_text = region.xpath("//text()").extract()
            trailing_text = anchor.xpath("following::text()").extract()

            loader = FirmwareLoader(
                item=FirmwareImage(),
                response=response,
                date_fmt=["%b %d, %Y", "%B %d, %Y", "%m/%d/%Y"],
            )

            # Prefer a version found in the text after the link; otherwise
            # fall back to the text of the whole region.
            version = (
                FirmwareLoader.find_version_period(trailing_text)
                or FirmwareLoader.find_version_period(region_text)
            )

            loader.add_value("version", version)
            loader.add_value("date", loader.find_date(region_text))
            loader.add_value("url", target)
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.name)
            yield loader.load_item()
def parse_kb(self, response):
    """Yield a firmware item for every "cache-www" link in the article body.

    The markup of the first ``sfdc_richtext`` div is split on the literal
    "=-" and each piece is re-parsed as its own HTML document, so that the
    text searched for a version/date belongs to the same piece as the link.

    Reads ``response.meta["product"]`` and ``self.name``.
    """
    # initial html tokenization to find regions segmented by e.g. "======"
    # or "------"
    article = response.xpath("//div[@class='sfdc_richtext']").extract()[0]

    for chunk in article.split("=-"):
        region = HtmlResponse(
            url=response.url,
            body=chunk and chunk.strip(),
            encoding=response.encoding,
        )

        for anchor in region.xpath("//a"):
            target = anchor.xpath("@href").extract()[0]
            if "cache-www" not in target:
                continue

            region_text = region.xpath("//text()").extract()
            trailing_text = anchor.xpath("following::text()").extract()

            loader = FirmwareLoader(
                item=FirmwareImage(),
                response=response,
                date_fmt=["%b %d, %Y", "%B %d, %Y", "%m/%d/%Y"],
            )

            # Prefer a version found in the text after the link; otherwise
            # fall back to the text of the whole region.
            version = (
                FirmwareLoader.find_version_period(trailing_text)
                or FirmwareLoader.find_version_period(region_text)
            )

            loader.add_value("version", version)
            loader.add_value("date", loader.find_date(region_text))
            loader.add_value("url", target)
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.name)
            yield loader.load_item()
def parse(self, response):
    """Walk a listing table, following folders and emitting firmware items.

    Rows after the third table row are inspected. A link ending in "/" is
    followed with this method as callback, carrying "build" and "version"
    in the request meta. Any other link that is not a .txt/.md5/.torrent
    file becomes a firmware item.

    Reads ``response.meta["build"]`` / ``response.meta["version"]`` on file
    rows and ``self.name`` for the vendor.
    """
    # The software family depends only on the page URL. "DSMUC" has to be
    # tested before "DSM" because the latter is a substring of the former.
    software = next(
        (name for name in ("DSMUC", "DSM", "VSM", "VSF", "SRM")
         if name in response.url),
        None,
    )
    ignored_suffixes = (".txt", ".md5", ".torrent")

    for row in response.xpath("//table/tr[position() > 3]"):
        if not row.xpath("./td[2]/a"):
            continue

        text = row.xpath("./td[2]/a//text()").extract()[0]
        href = row.xpath("./td[2]/a/@href").extract()[0]
        date = row.xpath("./td[3]//text()").extract()[0]

        if software is None:
            continue  # should not happen :-)

        if href.endswith("/"):
            found = FirmwareLoader.find_version_period([text])
            version = response.meta.get("version", found)
            # A folder name without a version string is taken as the build
            # (minus its trailing character).
            build = None if found else text[0:-1]

            yield Request(
                url=urllib.parse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"build": build, "version": version},
                callback=self.parse,
            )
            continue

        if href.lower().endswith(ignored_suffixes):
            continue

        tokens = os.path.splitext(text)[0].split("_")
        if software in tokens:
            if response.meta["build"] in tokens:
                tokens.remove(response.meta["build"])
            tokens.remove(software)
            product = " ".join(tokens)
        else:
            # usually "synology_x86_ds13_1504
            product = tokens[-2]

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%d-%b-%Y"])
        loader.add_value("build", response.meta["build"])
        loader.add_value("version", response.meta["version"])
        if software == "DSM":
            loader.add_value(
                "mib",
                "https://global.download.synology.com/download/Document/Software/"
                "DeveloperGuide/Firmware/DSM/All/enu/Synology_MIB_File.zip")
        loader.add_value("url", href)
        loader.add_value("date", date)
        loader.add_value("product", product)
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse(self, response):
    """Handle both the license-agreement page and the download listing.

    * Page containing the ``UCagreement`` form: the markup of the
      ``productAndDoc`` div is split on double quotes and every piece that
      mentions "downloads.polycom.com" becomes an item, described by the
      values carried in ``response.meta``.
    * Page containing the ``ContentChannel`` div: each ``<li>`` link is
      skipped (documents, licences, desktop software), followed as another
      HTML page, or emitted as an item.
    """
    if response.xpath("//form[@name='UCagreement']"):
        markup = response.xpath("//div[@id='productAndDoc']").extract()[0]
        for piece in markup.split('"'):
            if "downloads.polycom.com" not in piece:
                continue

            loader = FirmwareLoader(
                item=FirmwareImage(), response=response, date_fmt=["%B %d, %Y"])
            loader.add_value("version", response.meta["version"])
            loader.add_value("url", piece.encode("utf-8"))
            loader.add_value("date", response.meta["date"])
            loader.add_value("description", response.meta["description"])
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.name)
            yield loader.load_item()

    elif response.xpath("//div[@id='ContentChannel']"):
        unwanted = [
            "end user license agreement", "eula", "release notes", "mac os",
            "windows", "guide", "(pdf)", "sample",
        ]

        for entry in response.xpath("//div[@id='ContentChannel']//li"):
            if not entry.xpath("./a"):
                continue

            text = entry.xpath("./a//text()").extract()[0]
            href = entry.xpath("./a/@href").extract()[0].strip()
            date = entry.xpath("./span//text()").extract()
            path = urlparse.urlparse(href).path
            lowered = text.lower()

            if any(word in lowered for word in unwanted) or href.endswith(".pdf"):
                continue

            if path.endswith((".htm", ".html")) or "(html)" in lowered:
                # Another HTML page: follow it and carry the link details
                # along. A product already in meta wins over the link text.
                yield Request(
                    url=urlparse.urljoin(
                        response.url, PolycomSpider.fix_url(href)),
                    meta={
                        "product": response.meta.get("product", text),
                        "date": date,
                        "version": FirmwareLoader.find_version_period([text]),
                        "description": text,
                    },
                    headers={"Referer": response.url},
                    callback=self.parse,
                )
            elif path:
                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%B %d, %Y"])
                loader.add_value(
                    "version", FirmwareLoader.find_version_period([text]))
                loader.add_value("url", href.encode("utf-8"))
                loader.add_value("date", loader.find_date(date))
                loader.add_value("description", text)
                # NOTE(review): no "product" value is added on this path
                # (the call is commented out in the original code).
                loader.add_value("vendor", self.name)
                yield loader.load_item()
def parse_download(self, response):
    """Yield one item per record listed in download category bucket 7.

    The product name is the first text node of the ``prodNavHeaderBody``
    div with " Support & Drivers" removed. Date and version are both
    searched for in the same ``dateVersion`` ``<strong>`` text nodes.
    """
    records = response.xpath(
        "//li[@class='categoryBucket categoryBucketId-7']"
        "//li[@class='record ']")
    stamp_xpath = ".//ul[@class='dateVersion']//strong/text()"

    for record in records:
        heading = response.xpath(
            "//div[@class='prodNavHeaderBody']//text()").extract()[0]
        product = heading.replace(" Support & Drivers", "")

        date = record.xpath(stamp_xpath).extract()
        version = record.xpath(stamp_xpath).extract()

        # Point at the redirect endpoint instead of the download page.
        href = record.xpath(".//a/@href").extract()[0]
        href = href.replace("file-download", "file-redirect")
        text = record.xpath(".//a//text()").extract()[0]

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%b %d, %Y"])
        loader.add_value("url", href)
        loader.add_value("product", product)
        loader.add_value("date", loader.find_date(date))
        loader.add_value("description", text)
        loader.add_value("version", loader.find_version_period(version))
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse_kb(self, response):
    """Yield firmware items from the support-article download paragraphs.

    Each ``<p>`` under ``support-article-downloads`` is split on
    "<br><br>" and every segment is re-parsed as its own HTML document,
    because different firmware versions are not clearly separated in the
    markup. Paragraphs and segments are visited in reverse order so that a
    MIB link is seen before the firmware items that should reference it.

    The URL of an item is the specific link that matched "firmware", not
    every link found in the segment. Reads ``response.meta["product"]`` and
    ``self.name``.
    """
    mib = None

    paragraphs = response.xpath(
        "//div[@id='support-article-downloads']/div/p")
    for paragraph in reversed(paragraphs):
        for segment in reversed(paragraph.extract().split("<br><br>")):
            resp = HtmlResponse(
                url=response.url, body=segment, encoding=response.encoding)

            hrefs = resp.xpath("//a/@href").extract()
            if not hrefs:
                continue

            # The segment text is the same for every link in it.
            text = resp.xpath("//text()").extract()

            for href in hrefs:
                if "MIBs" in href:
                    mib = href
                elif "firmware" in href:
                    item = FirmwareLoader(
                        item=FirmwareImage(), response=resp,
                        date_fmt=["%m/%d/%Y"])
                    item.add_value("date", item.find_date(text))
                    # Use the matching link itself; collecting "//a/@href"
                    # would also pick up unrelated links in the segment.
                    item.add_value("url", href)
                    item.add_value("mib", mib)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    item.add_value(
                        "version", FirmwareLoader.find_version_period(text))
                    yield item.load_item()
def parse_kb(self, response):
    """Yield firmware items from the support-article download paragraphs.

    Each ``<p>`` under ``support-article-downloads`` is split on
    "<br><br>" and every segment is re-parsed as its own HTML document,
    because different firmware versions are not clearly separated in the
    markup. Paragraphs and segments are visited in reverse order so that a
    MIB link is seen before the firmware items that should reference it.

    The URL of an item is the specific link that matched "firmware", not
    every link found in the segment. Reads ``response.meta["product"]`` and
    ``self.name``.
    """
    mib = None

    paragraphs = response.xpath(
        "//div[@id='support-article-downloads']/div/p")
    for paragraph in reversed(paragraphs):
        for segment in reversed(paragraph.extract().split("<br><br>")):
            resp = HtmlResponse(
                url=response.url, body=segment, encoding=response.encoding)

            hrefs = resp.xpath("//a/@href").extract()
            if not hrefs:
                continue

            # The segment text is the same for every link in it.
            text = resp.xpath("//text()").extract()

            for href in hrefs:
                if "MIBs" in href:
                    mib = href
                elif "firmware" in href:
                    item = FirmwareLoader(
                        item=FirmwareImage(), response=resp,
                        date_fmt=["%m/%d/%Y"])
                    item.add_value("date", item.find_date(text))
                    # Use the matching link itself; collecting "//a/@href"
                    # would also pick up unrelated links in the segment.
                    item.add_value("url", href)
                    item.add_value("mib", mib)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    item.add_value(
                        "version", FirmwareLoader.find_version_period(text))
                    yield item.load_item()
def parse_product(self, response):
    """Yield one item per firmware download table on a product page.

    Firmware entries live under ``div_type_20``; source-code entries under
    ``div_type_30`` are searched for the same version string and, when one
    matches, its link is stored in the "sdk" field.
    """
    # types: firmware = 20, gpl source = 30, bios = 3
    firmware_tables = "//div[@id='div_type_20']/div[@id='download-os-answer-table']"
    source_tables = "//div[@id='div_type_30']/div[@id='download-os-answer-table']"
    first_link = "./table//tr[3]//a/@href"

    for entry in response.xpath(firmware_tables):
        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y/%m/%d"])
        version = FirmwareLoader.find_version_period(
            entry.xpath("./p//text()").extract())

        # grab first download link (e.g. DLM instead of global or p2p)
        href = entry.xpath(first_link).extract()[0]

        # attempt to find matching source code entry; the loop is not cut
        # short, so the last matching entry wins
        gpl = None
        if version:
            for source in response.xpath(source_tables):
                heading = "".join(source.xpath("./p//text()").extract())
                if version in heading:
                    gpl = source.xpath(first_link).extract()[0]

        date_cells = entry.xpath("./table//tr[2]/td[1]//text()").extract()
        title_cells = entry.xpath("./table//tr[1]//td[1]//text()").extract()

        loader.add_value("version", version)
        loader.add_value("date", loader.find_date(date_cells))
        loader.add_value("description", " ".join(title_cells))
        loader.add_value("url", href)
        loader.add_value("sdk", gpl)
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse(self, response):
    """Recurse through a table-based directory listing and emit images.

    Folder links are followed with this method as callback. The first
    folder level supplies the product name, and the next level (once a
    product is known) supplies the build, with the word "build" removed.
    Files with a known firmware extension become items.
    """
    image_suffixes = (".bin", ".elf", ".fdt", ".imx", ".chk", ".trx")

    for row in response.xpath("//table//tr"):
        if not row.xpath("./td[2]/a"):
            continue

        text = row.xpath("./td[2]/a/text()").extract()[0]
        href = row.xpath("./td[2]//@href").extract()[0]

        if ".." in href:
            # parent-directory link
            continue

        if href.endswith("/"):
            build = response.meta.get("build", None)
            product = response.meta.get("product", None)

            if not product:
                product = text
            elif not build:
                build = text.replace("build", "")

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"build": build, "product": product},
                callback=self.parse,
            )
        elif href.endswith(image_suffixes):
            name_parts = os.path.splitext(text)[0].split("-")
            date_cells = row.xpath("./td[3]/text()").extract()

            loader = FirmwareLoader(
                item=FirmwareImage(), response=response, date_fmt=["%Y-%m-%d"])
            loader.add_value("build", response.meta["build"])
            loader.add_value("url", href)
            loader.add_value(
                "version", FirmwareLoader.find_version_period(name_parts))
            loader.add_value("date", loader.find_date(date_cells))
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.name)
            yield loader.load_item()
def parse_product(self, response):
    """Yield one item per firmware download table on a product page.

    Firmware entries live under ``div_type_20``; source-code entries under
    ``div_type_30`` are searched for the same version string and, when one
    matches, its link is stored in the "sdk" field.
    """
    # types: firmware = 20, gpl source = 30, bios = 3
    firmware_tables = "//div[@id='div_type_20']/div[@id='download-os-answer-table']"
    source_tables = "//div[@id='div_type_30']/div[@id='download-os-answer-table']"
    first_link = "./table//tr[3]//a/@href"

    for entry in response.xpath(firmware_tables):
        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y/%m/%d"])
        version = FirmwareLoader.find_version_period(
            entry.xpath("./p//text()").extract())

        # grab first download link (e.g. DLM instead of global or p2p)
        href = entry.xpath(first_link).extract()[0]

        # attempt to find matching source code entry; the loop is not cut
        # short, so the last matching entry wins
        gpl = None
        if version:
            for source in response.xpath(source_tables):
                heading = "".join(source.xpath("./p//text()").extract())
                if version in heading:
                    gpl = source.xpath(first_link).extract()[0]

        date_cells = entry.xpath("./table//tr[2]/td[1]//text()").extract()
        title_cells = entry.xpath("./table//tr[1]//td[1]//text()").extract()

        loader.add_value("version", version)
        loader.add_value("date", loader.find_date(date_cells))
        loader.add_value("description", " ".join(title_cells))
        loader.add_value("url", href)
        loader.add_value("sdk", gpl)
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse(self, response):
    """Follow every usable link on the page, handing it to ``parse_url``.

    The link text is searched for a version string, which is passed along
    in the request meta under "version".

    Anchors that have no direct text node or no ``href`` attribute are
    skipped. Indexing the empty result would raise ``IndexError`` and stop
    the remaining links on the page from being followed.
    """
    for link in response.xpath("//a"):
        texts = link.xpath("text()").extract()
        hrefs = link.xpath("@href").extract()
        if not texts or not hrefs:
            # e.g. <a href="/"><em>(root)</em></a> has no direct text node
            continue

        text, href = texts[0], hrefs[0]

        # NOTE(review): a bare string is passed to find_version_period here;
        # confirm that it accepts a string as well as a list of strings.
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"version": FirmwareLoader.find_version_period(text)},
            callback=self.parse_url,
        )
def parse(self, response):
    """Follow every usable link on the page, handing it to ``parse_url``.

    The link text is searched for a version string, which is passed along
    in the request meta under "version".

    Anchors that have no direct text node or no ``href`` attribute are
    skipped. Indexing the empty result would raise ``IndexError`` and stop
    the remaining links on the page from being followed.
    """
    for link in response.xpath("//a"):
        texts = link.xpath("text()").extract()
        hrefs = link.xpath("@href").extract()
        if not texts or not hrefs:
            # e.g. <a href="/"><em>(root)</em></a> has no direct text node
            continue

        text, href = texts[0], hrefs[0]

        # NOTE(review): a bare string is passed to find_version_period here;
        # confirm that it accepts a string as well as a list of strings.
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"version": FirmwareLoader.find_version_period(text)},
            callback=self.parse_url,
        )
def parse(self, response):
    """Follow each link on the page, handing it to ``parse_url``.

    The first direct text node of the anchor is searched for a version
    string, which travels in the request meta under "version". The link
    back to the site root, which has no direct text node, is skipped.
    """
    for anchor in response.xpath("//a"):
        label = anchor.xpath("text()").extract_first()
        target = anchor.xpath("@href").extract_first()

        # <a href="/"><em>(root)</em></a>
        is_root_link = label is None and target == "/"
        if is_root_link:
            continue

        destination = urllib.parse.urljoin(response.url, target)
        version = FirmwareLoader.find_version_period(label)

        yield Request(
            url=destination,
            headers={"Referer": response.url},
            meta={"version": version},
            callback=self.parse_url,
        )
def parse_product(self, response):
    """Yield an item for each "firmware" row of the ``accordion-2`` table.

    The header row is skipped. A row qualifies when the text of the first
    link in its second cell contains "firmware" (case-insensitive). Date
    and URL are read relative to the row through the loader's selector.
    """
    rows = response.xpath("//div[@id='accordion-2']//tr[position() > 1]")

    for row in rows:
        label = row.xpath("./td[2]//a[1]/text()").extract()
        if "firmware" not in "".join(label).lower():
            continue

        loader = FirmwareLoader(
            item=FirmwareImage(),
            response=response,
            selector=row,
            date_fmt=["%Y-%m-%d"],
        )
        loader.add_xpath("date", "td[1]//text()")
        loader.add_value("description", label)
        loader.add_xpath("url", "td[2]//a[1]/@href")
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        loader.add_value(
            "version", FirmwareLoader.find_version_period(label))
        yield loader.load_item()
def parse_product(self, response):
    """Yield an item for every ``.npk`` / ``.lzb`` link on the page.

    Date and version are searched for in the text of the whole page. The
    product is the file name up to its last "-".
    """
    for href in response.xpath("//a/@href").extract():
        if not href.endswith((".npk", ".lzb")):
            continue

        page_text = response.xpath("//text()").extract()
        filename = href.rsplit("/", 1)[-1]

        # NOTE(review): if the file name contains no "-", rfind returns -1
        # and this slice only drops the final character.
        product = filename[0:filename.rfind("-")]

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y-%b-%d"])
        loader.add_value("date", loader.find_date(page_text))
        loader.add_value("url", href)
        loader.add_value("product", product)
        loader.add_value("vendor", self.name)
        loader.add_value(
            "version", FirmwareLoader.find_version_period(page_text))
        yield loader.load_item()
def parse_product(self, response):
    """Yield an item for each "firmware" row of the ``accordion-2`` table.

    The header row is skipped. A row qualifies when the text of the first
    link in its second cell contains "firmware" (case-insensitive). Date
    and URL are read relative to the row through the loader's selector.
    """
    rows = response.xpath("//div[@id='accordion-2']//tr[position() > 1]")

    for row in rows:
        label = row.xpath("./td[2]//a[1]/text()").extract()
        if "firmware" not in "".join(label).lower():
            continue

        loader = FirmwareLoader(
            item=FirmwareImage(),
            response=response,
            selector=row,
            date_fmt=["%Y-%m-%d"],
        )
        loader.add_xpath("date", "td[1]//text()")
        loader.add_value("description", label)
        loader.add_xpath("url", "td[2]//a[1]/@href")
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        loader.add_value(
            "version", FirmwareLoader.find_version_period(label))
        yield loader.load_item()
def parse_product(self, response):
    """Yield an item for every ``.npk`` / ``.lzb`` link on the page.

    Date and version are searched for in the text of the whole page. The
    product is the file name up to its last "-".
    """
    for href in response.xpath("//a/@href").extract():
        if not href.endswith((".npk", ".lzb")):
            continue

        page_text = response.xpath("//text()").extract()
        filename = href.rsplit("/", 1)[-1]

        # NOTE(review): if the file name contains no "-", rfind returns -1
        # and this slice only drops the final character.
        product = filename[0:filename.rfind("-")]

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y-%b-%d"])
        loader.add_value("date", loader.find_date(page_text))
        loader.add_value("url", href)
        loader.add_value("product", product)
        loader.add_value("vendor", self.name)
        loader.add_value(
            "version", FirmwareLoader.find_version_period(page_text))
        yield loader.load_item()
def parse(self, response):
    """Recurse through a directory listing and emit compressed images.

    Folder links are followed with this method as callback. Links ending
    in ".gz" whose href does not contain ".iso" become items. The link text
    (the file name) is split on "-" to derive:

    * version -- the version string found among the parts, extended with
      any BETA / RC markers ("-BETA" plus the following part, "-RCn");
    * product -- the remaining parts, re-joined with "-".

    Anchors without text or href are skipped, and BETA / RC markers are
    handled even when no version string was found or when "BETA" is the
    last part of the name.
    """
    for link in response.xpath("//a"):
        texts = link.xpath(".//text()").extract()
        hrefs = link.xpath(".//@href").extract()
        if not texts or not hrefs:
            # nothing to classify; indexing would raise IndexError
            continue
        text, href = texts[0], hrefs[0]

        if ".." in href:
            continue
        elif href.endswith("/"):
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                callback=self.parse)
        elif href.endswith(".gz") and ".iso" not in href:
            # strip off multiple file extensions
            basename = os.path.splitext(text)[0]
            while ".img" in basename or ".iso" in basename:
                stem, extension = os.path.splitext(basename)
                if not extension:
                    # splitext cannot strip anything more; avoid spinning
                    break
                basename = stem

            parts = basename.split("-")
            version = FirmwareLoader.find_version_period(parts)

            # attempt to parse filename and generate product/version
            # strings
            remove = [version] if version else []
            markers = []
            for index, part in enumerate(parts):
                if "BETA" in part:
                    has_next = index + 1 < len(parts)
                    following = parts[index + 1] if has_next else ""
                    markers.append(part + following)
                    remove.append(part)
                    if has_next:
                        remove.append(following)
                elif "RC" in part:
                    markers.append(part)
                    remove.append(part)
                elif "RELEASE" in part:
                    remove.append(part)

            if markers:
                # version may be None when no version string was found
                version = "-".join(([version] if version else []) + markers)

            product_parts = [part for part in parts if part not in remove]

            item = FirmwareLoader(
                item=FirmwareImage(), response=response,
                date_fmt=["%d-%b-%Y"])
            item.add_value("version", version)
            item.add_value("url", href)
            item.add_value(
                "date",
                item.find_date(link.xpath("following::text()").extract()))
            item.add_value("product", "-".join(product_parts))
            item.add_value("vendor", self.name)
            yield item.load_item()
def parse(self, response):
    """Recurse through a directory listing and emit compressed images.

    Folder links are followed with this method as callback. Links ending
    in ".gz" whose href does not contain ".iso" become items. The link text
    (the file name) is split on "-" to derive:

    * version -- the version string found among the parts, extended with
      any BETA / RC markers ("-BETA" plus the following part, "-RCn");
    * product -- the remaining parts, re-joined with "-".

    Anchors without text or href are skipped, and BETA / RC markers are
    handled even when no version string was found or when "BETA" is the
    last part of the name.
    """
    for link in response.xpath("//a"):
        texts = link.xpath(".//text()").extract()
        hrefs = link.xpath(".//@href").extract()
        if not texts or not hrefs:
            # nothing to classify; indexing would raise IndexError
            continue
        text, href = texts[0], hrefs[0]

        if ".." in href:
            continue
        elif href.endswith("/"):
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                callback=self.parse)
        elif href.endswith(".gz") and ".iso" not in href:
            # strip off multiple file extensions
            basename = os.path.splitext(text)[0]
            while ".img" in basename or ".iso" in basename:
                stem, extension = os.path.splitext(basename)
                if not extension:
                    # splitext cannot strip anything more; avoid spinning
                    break
                basename = stem

            parts = basename.split("-")
            version = FirmwareLoader.find_version_period(parts)

            # attempt to parse filename and generate product/version
            # strings
            remove = [version] if version else []
            markers = []
            for index, part in enumerate(parts):
                if "BETA" in part:
                    has_next = index + 1 < len(parts)
                    following = parts[index + 1] if has_next else ""
                    markers.append(part + following)
                    remove.append(part)
                    if has_next:
                        remove.append(following)
                elif "RC" in part:
                    markers.append(part)
                    remove.append(part)
                elif "RELEASE" in part:
                    remove.append(part)

            if markers:
                # version may be None when no version string was found
                version = "-".join(([version] if version else []) + markers)

            product_parts = [part for part in parts if part not in remove]

            item = FirmwareLoader(
                item=FirmwareImage(), response=response,
                date_fmt=["%d-%b-%Y"])
            item.add_value("version", version)
            item.add_value("url", href)
            item.add_value(
                "date",
                item.find_date(link.xpath("following::text()").extract()))
            item.add_value("product", "-".join(product_parts))
            item.add_value("vendor", self.name)
            yield item.load_item()
def parse(self, response):
    """Recurse through a table-based directory listing and emit images.

    Folder links are followed with this method as callback. The first
    folder level supplies the product name, and the next level (once a
    product is known) supplies the build, with the word "build" removed.
    Files with a known firmware extension become items.
    """
    image_suffixes = (".bin", ".elf", ".fdt", ".imx", ".chk", ".trx")

    for row in response.xpath("//table//tr"):
        if not row.xpath("./td[2]/a"):
            continue

        text = row.xpath("./td[2]/a/text()").extract()[0]
        href = row.xpath("./td[2]//@href").extract()[0]

        if ".." in href:
            # parent-directory link
            continue

        if href.endswith("/"):
            build = response.meta.get("build", None)
            product = response.meta.get("product", None)

            if not product:
                product = text
            elif not build:
                build = text.replace("build", "")

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"build": build, "product": product},
                callback=self.parse,
            )
        elif href.endswith(image_suffixes):
            name_parts = os.path.splitext(text)[0].split("-")
            date_cells = row.xpath("./td[3]/text()").extract()

            loader = FirmwareLoader(
                item=FirmwareImage(), response=response, date_fmt=["%Y-%m-%d"])
            loader.add_value("build", response.meta["build"])
            loader.add_value("url", href)
            loader.add_value(
                "version", FirmwareLoader.find_version_period(name_parts))
            loader.add_value("date", loader.find_date(date_cells))
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.name)
            yield loader.load_item()
def parse_product(self, response):
    """Yield an item from each tab whose caption mentions "firmware".

    Tabs are the ``<li>`` entries of ``normaltab2``; the n-th tab maps to
    the n-th ``<div>`` of ``normalcon2``. Only the first row of each table
    in that div is read (version in cell 2, date in cell 4, link in
    cell 5).
    """
    tab_count = len(response.xpath("//ul[@id='normaltab2']//a"))

    # XPath positions are 1-based.
    for position in range(1, tab_count + 1):
        caption = "".join(response.xpath(
            "//ul[@id='normaltab2']/li[%d]/a//text()" % position).extract())
        if "firmware" not in caption.lower():
            continue

        first_rows = response.xpath(
            "//div[@id='normalcon2']/div[%d]//table/tr[1]" % position)
        for row in first_rows:
            version = row.xpath("./td[2]//text()").extract()
            date = row.xpath("./td[4]//text()").extract()
            href = row.xpath("./td[5]//a/@href").extract()[0]

            loader = FirmwareLoader(
                item=FirmwareImage(), response=response,
                date_fmt=["%Y-%m-%d"])
            loader.add_value(
                "version", FirmwareLoader.find_version_period(version))
            loader.add_value("url", href)
            loader.add_value("date", loader.find_date(date))
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.vendor)
            yield loader.load_item()
def parse_product(self, response):
    """Yield an item from each tab whose caption mentions "firmware".

    Tabs are the ``<li>`` entries of ``normaltab2``; the n-th tab maps to
    the n-th ``<div>`` of ``normalcon2``. Only the first row of each table
    in that div is read (version in cell 2, date in cell 4, link in
    cell 5).
    """
    tab_count = len(response.xpath("//ul[@id='normaltab2']//a"))

    # XPath positions are 1-based.
    for position in range(1, tab_count + 1):
        caption = "".join(response.xpath(
            "//ul[@id='normaltab2']/li[%d]/a//text()" % position).extract())
        if "firmware" not in caption.lower():
            continue

        first_rows = response.xpath(
            "//div[@id='normalcon2']/div[%d]//table/tr[1]" % position)
        for row in first_rows:
            version = row.xpath("./td[2]//text()").extract()
            date = row.xpath("./td[4]//text()").extract()
            href = row.xpath("./td[5]//a/@href").extract()[0]

            loader = FirmwareLoader(
                item=FirmwareImage(), response=response,
                date_fmt=["%Y-%m-%d"])
            loader.add_value(
                "version", FirmwareLoader.find_version_period(version))
            loader.add_value("url", href)
            loader.add_value("date", loader.find_date(date))
            loader.add_value("product", response.meta["product"])
            loader.add_value("vendor", self.vendor)
            yield loader.load_item()
def parse_product(self, response):
    """Yield an item for each table row that offers a download link.

    Rows after the header of the ``main_data_block`` table are read:
    cell 1 holds the description text, cell 2 the edition (searched for
    version and build), cell 4 the date and cell 5 the download link(s).
    Rows without a link in cell 5 are ignored.
    """
    rows = response.xpath(
        "//div[@class='main_data_block']//table/tr[position() > 1]")

    for record in rows:
        text = record.xpath("./td[1]//text()").extract()
        edition = record.xpath("./td[2]//text()").extract()
        date = record.xpath("./td[4]//text()").extract()
        hrefs = record.xpath("./td[5]//a/@href").extract()

        if not hrefs:
            continue

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y/%m/%d"])
        loader.add_value(
            "version", FirmwareLoader.find_version_period(edition))
        loader.add_value("build", FirmwareLoader.find_build(edition))
        loader.add_value("url", hrefs[0])
        loader.add_value("date", loader.find_date(date))
        # The description is taken from the third text node of cell 1.
        loader.add_value("description", text[2].strip())
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse_download(self, response):
    """Yield one item per record listed in download category bucket 7.

    The product name is the first text node of the ``prodNavHeaderBody``
    div with " Support & Drivers" removed. Date and version are both
    searched for in the same ``dateVersion`` ``<strong>`` text nodes.
    """
    records = response.xpath(
        "//li[@class='categoryBucket categoryBucketId-7']"
        "//li[@class='record ']")
    stamp_xpath = ".//ul[@class='dateVersion']//strong/text()"

    for record in records:
        heading = response.xpath(
            "//div[@class='prodNavHeaderBody']//text()").extract()[0]
        product = heading.replace(" Support & Drivers", "")

        date = record.xpath(stamp_xpath).extract()
        version = record.xpath(stamp_xpath).extract()

        # Point at the redirect endpoint instead of the download page.
        href = record.xpath(".//a/@href").extract()[0]
        href = href.replace("file-download", "file-redirect")
        text = record.xpath(".//a//text()").extract()[0]

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%b %d, %Y"])
        loader.add_value("url", href)
        loader.add_value("product", product)
        loader.add_value("date", loader.find_date(date))
        loader.add_value("description", text)
        loader.add_value("version", loader.find_version_period(version))
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse_product(self, response):
    """Yield an item for each table row that offers a download link.

    Rows after the header of the ``main_data_block`` table are read:
    cell 1 holds the description text, cell 2 the edition (searched for
    version and build), cell 4 the date and cell 5 the download link(s).
    Rows without a link in cell 5 are ignored.
    """
    rows = response.xpath(
        "//div[@class='main_data_block']//table/tr[position() > 1]")

    for record in rows:
        text = record.xpath("./td[1]//text()").extract()
        edition = record.xpath("./td[2]//text()").extract()
        date = record.xpath("./td[4]//text()").extract()
        hrefs = record.xpath("./td[5]//a/@href").extract()

        if not hrefs:
            continue

        loader = FirmwareLoader(
            item=FirmwareImage(), response=response, date_fmt=["%Y/%m/%d"])
        loader.add_value(
            "version", FirmwareLoader.find_version_period(edition))
        loader.add_value("build", FirmwareLoader.find_build(edition))
        loader.add_value("url", hrefs[0])
        loader.add_value("date", loader.find_date(date))
        # The description is taken from the third text node of cell 1.
        loader.add_value("description", text[2].strip())
        loader.add_value("product", response.meta["product"])
        loader.add_value("vendor", self.name)
        yield loader.load_item()
def parse_json(self, response):
    """Yield firmware items described by a JSON download listing.

    The body is decoded as JSON; every file of every entry under "item" is
    visited in reverse order. A file counts as firmware when its
    "filetypename" is "firmware" (case-insensitive) or its "isFirmF" flag
    is "1". A file whose name contains "MIB" is remembered and attached to
    the firmware items that are yielded after it.
    """
    mib = None
    listing = json.loads(response.body_as_unicode())

    for entry in reversed(listing["item"]):
        for record in reversed(entry["file"]):
            is_firmware = (
                record["filetypename"].lower() == "firmware"
                or record["isFirmF"] == "1"
            )

            if is_firmware:
                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%m/%d/%y"])
                loader.add_value(
                    "version",
                    FirmwareLoader.find_version_period([record["name"]]))
                loader.add_value("date", record["date"])
                loader.add_value("description", record["name"])
                loader.add_value("url", record["url"])
                loader.add_value("build", response.meta["revision"])
                loader.add_value("product", response.meta["product"])
                loader.add_value("vendor", self.name)
                loader.add_value("mib", mib)
                yield loader.load_item()
            elif "MIB" in record["name"]:
                mib = record["url"]
def parse_product(self, response):
    """Yield an item for every link in the "upgrade software" sections.

    A section is an ``<li>`` of ``tab_conbox`` whose ``<h3>`` text contains
    the marker string below. The first text node of each link is used as
    the description; the version is searched for in the link text.
    """
    for section in response.xpath("//ul[@id='tab_conbox']/li"):
        heading = "".join(section.xpath("./h3//text()").extract())
        # the marker is the Chinese heading for "upgrade software"
        if u"升级软件" not in heading:
            continue

        for anchor in section.xpath(".//dd/a"):
            text = anchor.xpath(".//text()").extract()
            href = anchor.xpath("./@href").extract()[0]
            desc = text[0]

            # reverse text because hw version can come before version
            # e.g. "FH330升级软件(V1.0) V1.0.0.24_CN"
            if len(text) == 1:
                text = text[0].split()
            # NOTE(review): the reversal is applied to multi-node text as
            # well; confirm against the original indentation.
            text.reverse()

            loader = FirmwareLoader(item=FirmwareImage(), response=response)
            loader.add_value(
                "version", FirmwareLoader.find_version_period(text))
            loader.add_value("url", href)
            loader.add_value("product", response.meta["product"])
            loader.add_value("description", desc)
            loader.add_value("vendor", self.vendor)
            yield loader.load_item()
def parse_product(self, response):
    """Yield an item for every link in the "upgrade software" sections.

    A section is an ``<li>`` of ``tab_conbox`` whose ``<h3>`` text contains
    the marker string below. The first text node of each link is used as
    the description; the version is searched for in the link text.
    """
    for section in response.xpath("//ul[@id='tab_conbox']/li"):
        heading = "".join(section.xpath("./h3//text()").extract())
        # the marker is the Chinese heading for "upgrade software"
        if u"升级软件" not in heading:
            continue

        for anchor in section.xpath(".//dd/a"):
            text = anchor.xpath(".//text()").extract()
            href = anchor.xpath("./@href").extract()[0]
            desc = text[0]

            # reverse text because hw version can come before version
            # e.g. "FH330升级软件(V1.0) V1.0.0.24_CN"
            if len(text) == 1:
                text = text[0].split()
            # NOTE(review): the reversal is applied to multi-node text as
            # well; confirm against the original indentation.
            text.reverse()

            loader = FirmwareLoader(item=FirmwareImage(), response=response)
            loader.add_value(
                "version", FirmwareLoader.find_version_period(text))
            loader.add_value("url", href)
            loader.add_value("product", response.meta["product"])
            loader.add_value("description", desc)
            loader.add_value("vendor", self.vendor)
            yield loader.load_item()
def parse_json(self, response):
    """Yield firmware items described by a JSON download listing.

    The body is decoded as JSON; every file of every entry under "item" is
    visited in reverse order. A file counts as firmware when its
    "filetypename" is "firmware" (case-insensitive) or its "isFirmF" flag
    is "1". A file whose name contains "MIB" is remembered and attached to
    the firmware items that are yielded after it.
    """
    mib = None
    listing = json.loads(response.body_as_unicode())

    for entry in reversed(listing["item"]):
        for record in reversed(entry["file"]):
            is_firmware = (
                record["filetypename"].lower() == "firmware"
                or record["isFirmF"] == "1"
            )

            if is_firmware:
                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%m/%d/%y"])
                loader.add_value(
                    "version",
                    FirmwareLoader.find_version_period([record["name"]]))
                loader.add_value("date", record["date"])
                loader.add_value("description", record["name"])
                loader.add_value("url", record["url"])
                loader.add_value("build", response.meta["revision"])
                loader.add_value("product", response.meta["product"])
                loader.add_value("vendor", self.name)
                loader.add_value("mib", mib)
                yield loader.load_item()
            elif "MIB" in record["name"]:
                mib = record["url"]
def parse(self, response):
    """Walk a listing table for "DSM" and "VSFirmware" download trees.

    Rows after the third table row are inspected. Folder links are
    followed with this method as callback, carrying "build" and "version"
    in the request meta; other links that are not .txt/.md5/.torrent files
    become firmware items. The two trees differ in how folder and file
    names are interpreted.
    """
    ignored_suffixes = (".txt", ".md5", ".torrent")

    for row in response.xpath("//table/tr[position() > 3]"):
        if not row.xpath("./td[2]/a"):
            continue

        text = row.xpath("./td[2]/a//text()").extract()[0]
        href = row.xpath("./td[2]/a/@href").extract()[0]
        date = row.xpath("./td[3]//text()").extract()[0]

        is_folder = href.endswith("/")
        is_payload = not href.lower().endswith(ignored_suffixes)

        if "DSM" in response.url:
            if is_folder:
                found = FirmwareLoader.find_version_period([text])
                version = response.meta.get("version", found)
                # A folder name without a version string is taken as the
                # build (minus its trailing character).
                build = None if found else text[0:-1]

                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"build": build, "version": version},
                    callback=self.parse)
            elif is_payload:
                tokens = os.path.splitext(text)[0].split("_")
                if "DSM" in tokens:
                    if response.meta["build"] in tokens:
                        tokens.remove(response.meta["build"])
                    tokens.remove("DSM")
                    product = " ".join(tokens)
                else:
                    product = tokens[-2]

                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%d-%b-%Y"])
                loader.add_value("build", response.meta["build"])
                loader.add_value("version", response.meta["version"])
                loader.add_value(
                    "mib",
                    "http://dedl.synology.com/download/Document/MIBGuide/Synology_MIB_File.zip")
                loader.add_value("url", href)
                loader.add_value("date", date)
                loader.add_value("product", product)
                loader.add_value("vendor", self.name)
                yield loader.load_item()

        elif "VSFirmware" in response.url:
            if is_folder:
                # Folder names here are "<version>-<build>/".
                version, build = text[0:-1].split("-")
                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"build": build, "version": version},
                    callback=self.parse)
            elif is_payload:
                tokens = os.path.splitext(text)[0].split("_")

                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%d-%b-%Y"])
                loader.add_value("build", response.meta["build"])
                loader.add_value("version", response.meta["version"])
                loader.add_value("url", href)
                loader.add_value("date", date)
                loader.add_value("product", tokens[0])
                loader.add_value("vendor", self.name)
                yield loader.load_item()
def parse(self, response):
    """Drive the ASP.NET download-centre form down to firmware links.

    Depending on what the page shows, exactly one of these happens:

    1. The "advanced search" button is present: submit it.
    2. The ``LargeFirmware`` div lists links: yield an item for each
       "firmware" link, attaching the most recently seen "mib" link.
    3. Otherwise the first list box (product, then product family, then
       product category) that has no empty option value is expanded: one
       form post per option not yet recorded in ``self.visited``.
    """
    search_button = (
        "//a[@id='ctl00_ctl00_ctl00_mainContent_localizedContent_"
        "bodyCenter_BasicSearchPanel_btnAdvancedSearch']"
    )

    # choose the "Product Drilldown" button
    if response.xpath(search_button):
        target = NetgearSpider.strip_js(
            response.xpath(search_button + "/@href").extract()[0])
        yield FormRequest.from_response(
            response,
            formname="aspnetForm",
            formdata={"__EVENTTARGET": target},
            headers={"Referer": response.url},
            callback=self.parse)
        return

    # continue iterating through product/model/os selector
    downloads = response.xpath("//div[@id='LargeFirmware']//a")
    if downloads:
        mib = None
        for entry in downloads:
            href = entry.xpath("./@data-durl").extract()
            text = entry.xpath(".//text()").extract()

            # sometimes it is 'href' instead of 'data-durl'
            if not href:
                href = entry.xpath("./@href").extract()

            caption = " ".join(text).lower()
            if "firmware" in caption:
                loader = FirmwareLoader(
                    item=FirmwareImage(), response=response)
                loader.add_value(
                    "version", FirmwareLoader.find_version_period(text))
                loader.add_value("url", href[0])
                loader.add_value("description", text[0])
                loader.add_value("mib", mib)
                loader.add_value("product", response.meta["product"])
                loader.add_value("vendor", self.name)
                yield loader.load_item()
            elif "mib" in caption:
                mib = urlparse.urljoin(response.url, href[0].strip())
        return

    panel = "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$"
    list_boxes = (
        # (control name, whether the option text is the product name)
        ("lbProduct", True),
        ("lbProductFamily", False),
        ("lbProductCategory", False),
    )

    for control, names_product in list_boxes:
        field = panel + control
        options_xpath = "//select[@name='%s']/option" % field

        # A list box that still has an empty option value is passed over.
        if "" in response.xpath(options_xpath + "/@value").extract():
            continue

        for entry in response.xpath(options_xpath):
            rsrc = entry.xpath("./@value").extract()[0]
            text = entry.xpath(".//text()").extract()

            if text and (response.url, rsrc) not in self.visited:
                self.visited.append((response.url, rsrc))

                extra = {"meta": {"product": text[0]}} if names_product else {}
                yield FormRequest.from_response(
                    response,
                    formname="aspnetForm",
                    formdata={
                        "__EVENTTARGET": field,
                        field: rsrc,
                        # NOTE(review): this key ends with a colon; confirm
                        # whether "__ASYNCPOST" was intended.
                        "__ASYNCPOST:": "true",
                    },
                    headers={"Referer": response.url},
                    callback=self.parse,
                    **extra)

        # Only the first qualifying list box is handled per page.
        break
def parse(self, response):
    """Crawl a Synology download listing table and emit firmware items.

    Rows after the third one in any table are inspected.  Links ending in
    ``/`` are followed with this same callback, carrying ``build`` and
    ``version`` in the request meta; other links, except ``.txt``, ``.md5``
    and ``.torrent`` files, become ``FirmwareImage`` items.  Only URLs
    containing ``DSM`` or ``VSFirmware`` are handled.

    Args:
        response: listing page.  ``response.meta`` may carry ``build`` and
            ``version`` set by the parent directory request.

    Yields:
        ``Request`` objects for sub-directories and loaded items for files.
    """
    skipped_suffixes = (".txt", ".md5", ".torrent")

    for entry in response.xpath("//table/tr[position() > 3]"):
        if not entry.xpath("./td[2]/a"):
            continue

        names = entry.xpath("./td[2]/a//text()").extract()
        hrefs = entry.xpath("./td[2]/a/@href").extract()
        dates = entry.xpath("./td[3]//text()").extract()
        if not names or not hrefs:
            # A link without text or target cannot be classified; skip the
            # row rather than abort the whole listing with an IndexError.
            continue
        text = names[0]
        href = hrefs[0]
        date = dates[0] if dates else None

        is_directory = href.endswith('/')
        is_payload = not href.lower().endswith(skipped_suffixes)

        if "DSM" in response.url:
            if is_directory:
                found = FirmwareLoader.find_version_period([text])
                # A version inherited from the parent directory wins over
                # one detected in this directory's name.
                version = response.meta.get("version", found)
                # When the name holds no version it is used as the build
                # identifier, minus its last character.
                build = None if found else text[0:-1]

                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"build": build, "version": version},
                    callback=self.parse)
            elif is_payload:
                build = response.meta.get("build")
                basename = os.path.splitext(text)[0].split("_")

                if "DSM" in basename:
                    if build in basename:
                        basename.remove(build)
                    basename.remove("DSM")
                    product = " ".join(basename)
                elif len(basename) > 1:
                    # Otherwise the product is taken from the
                    # second-to-last underscore-separated field.
                    product = basename[-2]
                else:
                    # No underscore: nothing identifies the product, so it
                    # is left unset instead of raising IndexError.
                    product = None

                item = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%d-%b-%Y"])
                item.add_value("build", build)
                item.add_value("version", response.meta.get("version"))
                item.add_value(
                    "mib",
                    "http://dedl.synology.com/download/Document/MIBGuide/Synology_MIB_File.zip")
                item.add_value("url", href)
                item.add_value("date", date)
                item.add_value("product", product)
                item.add_value("vendor", self.name)
                yield item.load_item()

        elif "VSFirmware" in response.url:
            if is_directory:
                # Directory names are expected to be "<version>-<build>/".
                parts = text[0:-1].split("-")
                if len(parts) != 2:
                    # Anything else is skipped instead of failing the
                    # tuple unpack and losing the rest of the listing.
                    continue
                version, build = parts

                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"build": build, "version": version},
                    callback=self.parse)
            elif is_payload:
                basename = os.path.splitext(text)[0].split("_")

                item = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%d-%b-%Y"])
                item.add_value("build", response.meta.get("build"))
                item.add_value("version", response.meta.get("version"))
                item.add_value("url", href)
                item.add_value("date", date)
                # The first underscore-separated field is used as product.
                item.add_value("product", basename[0])
                item.add_value("vendor", self.name)
                yield item.load_item()
def parse(self, response):
    """Extract Polycom firmware links from agreement and listing pages.

    Two page shapes are handled:

    * a page containing the ``UCagreement`` form: every quoted string inside
      the ``productAndDoc`` div that mentions ``downloads.polycom.com`` is
      emitted as an item, using metadata carried in ``response.meta``;
    * a page containing a ``ContentChannel`` div: each ``<li>`` link is
      skipped (documentation, licences, desktop software), followed as
      another HTML page, or emitted directly as an item.

    Args:
        response: page to inspect.  ``response.meta`` may carry ``version``,
            ``date``, ``description`` and ``product`` from the listing page
            that linked here.

    Yields:
        ``Request`` objects for nested HTML pages and loaded items.
    """
    ignored_labels = [
        "end user license agreement", "eula", "release notes", "mac os",
        "windows", "guide", "(pdf)", "sample"
    ]

    if response.xpath("//form[@name='UCagreement']"):
        containers = response.xpath("//div[@id='productAndDoc']").extract()
        if not containers:
            # Agreement form without the download block: nothing to emit.
            return

        # Attribute values are recovered by splitting the raw HTML on double
        # quotes and keeping the pieces that point at the download host.
        for href in containers[0].split('"'):
            if "downloads.polycom.com" not in href:
                continue

            item = FirmwareLoader(
                item=FirmwareImage(), response=response,
                date_fmt=["%B %d, %Y"])
            item.add_value("version", response.meta.get("version"))
            item.add_value("url", href.encode("utf-8"))
            item.add_value("date", response.meta.get("date"))
            item.add_value("description", response.meta.get("description"))
            item.add_value("product", response.meta.get("product"))
            item.add_value("vendor", self.name)
            yield item.load_item()

    elif response.xpath("//div[@id='ContentChannel']"):
        for entry in response.xpath("//div[@id='ContentChannel']//li"):
            if not entry.xpath("./a"):
                continue

            labels = entry.xpath("./a//text()").extract()
            hrefs = entry.xpath("./a/@href").extract()
            if not labels or not hrefs:
                # A link with no text or no target is skipped instead of
                # aborting the whole listing with an IndexError.
                continue

            text = labels[0]
            href = hrefs[0].strip()
            date = entry.xpath("./span//text()").extract()
            path = urlparse(href).path
            label = text.lower()

            if href.endswith(".pdf") or any(
                    word in label for word in ignored_labels):
                continue

            if path.endswith((".htm", ".html")) or "(html)" in label:
                yield Request(
                    url=urljoin(response.url, PolycomSpider.fix_url(href)),
                    meta={
                        # Keep the product inherited from a parent page;
                        # otherwise this link's text becomes the product.
                        "product": response.meta.get("product", text),
                        "date": date,
                        "version": FirmwareLoader.find_version_period(
                            [text]),
                        "description": text
                    },
                    headers={"Referer": response.url},
                    callback=self.parse)
            elif path:
                item = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%B %d, %Y"])
                item.add_value(
                    "version", FirmwareLoader.find_version_period([text]))
                item.add_value("url", href.encode("utf-8"))
                item.add_value("date", item.find_date(date))
                item.add_value("description", text)
                # NOTE(review): "product" is left unset for direct download
                # links, as in the original -- confirm whether it should be
                # taken from response.meta when present.
                item.add_value("vendor", self.name)
                yield item.load_item()
def parse(self, response):
    """Drill through the Netgear support search form and emit firmware items.

    The callback re-enters itself: every request it yields uses
    ``callback=self.parse``.  Depending on what the page contains it either
    presses the advanced-search button, harvests the ``LargeFirmware`` link
    panel, or posts back one selection from the first usable selector
    (product, then product family, then product category).

    Args:
        response: the page to inspect.  ``response.meta["product"]`` is read
            when firmware links are harvested; it is set by the product
            selector postback below.

    Yields:
        ``FormRequest`` postbacks while drilling down, and loaded
        ``FirmwareImage`` items once firmware links are present.
    """
    search_button = (
        "//a[@id='ctl00_ctl00_ctl00_mainContent_localizedContent_"
        "bodyCenter_BasicSearchPanel_btnAdvancedSearch']")
    selector_prefix = (
        "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$")

    # choose the "Product Drilldown" button
    if response.xpath(search_button):
        # A button without an href leaves nothing to post back, so this
        # lookup is deliberately left unguarded: failing loudly is more
        # useful here than silently ending the crawl.
        href = NetgearSpider.strip_js(
            response.xpath(search_button + "/@href").extract()[0])
        yield FormRequest.from_response(
            response,
            formname="aspnetForm",
            formdata={"__EVENTTARGET": href},
            headers={"Referer": response.url},
            callback=self.parse)
        return

    # continue iterating through product/model/os selector
    firmware_links = response.xpath("//div[@id='LargeFirmware']//a")
    if firmware_links:
        # Only firmware links that come after the MIB link in document
        # order receive the MIB URL; earlier ones are emitted with mib=None.
        mib = None
        for entry in firmware_links:
            href = entry.xpath("./@data-durl").extract()
            text = entry.xpath(".//text()").extract()

            # sometimes it is 'href' instead of 'data-durl'
            if not href:
                href = entry.xpath("./@href").extract()
            if not href:
                # An anchor with no target cannot be downloaded; skip it
                # instead of aborting the whole page with an IndexError.
                continue

            label = " ".join(text).lower()
            if "firmware" in label:
                item = FirmwareLoader(item=FirmwareImage(), response=response)
                item.add_value(
                    "version", FirmwareLoader.find_version_period(text))
                item.add_value("url", href[0])
                # "firmware" was found in the joined text, so text is
                # guaranteed to be non-empty here.
                item.add_value("description", text[0])
                item.add_value("mib", mib)
                item.add_value("product", response.meta.get("product"))
                item.add_value("vendor", self.name)
                yield item.load_item()
            elif "mib" in label:
                mib = urlparse.urljoin(response.url, href[0].strip())
        return

    # Selectors are tried from most to least specific.  The first one whose
    # options contain no empty value is used and the rest are ignored; a
    # selector that is missing from the page also counts as "no empty
    # value", exactly like the original if/elif chain.
    drilldown = (
        ("lbProduct", True),
        ("lbProductFamily", False),
        ("lbProductCategory", False),
    )
    for field, carries_product in drilldown:
        select_name = selector_prefix + field
        option_xpath = "//select[@name='%s']/option" % select_name
        if "" in response.xpath(option_xpath + "/@value").extract():
            continue

        for entry in response.xpath(option_xpath):
            values = entry.xpath("./@value").extract()
            text = entry.xpath(".//text()").extract()
            if not values or not text:
                # Options without a value attribute or a label are skipped.
                continue

            rsrc = values[0]
            key = (response.url, rsrc)
            if key in self.visited:
                continue
            self.visited.append(key)

            yield FormRequest.from_response(
                response,
                formname="aspnetForm",
                formdata={
                    "__EVENTTARGET": select_name,
                    select_name: rsrc,
                    # NOTE(review): this key is spelled with a trailing
                    # colon; kept byte-for-byte -- confirm whether
                    # "__ASYNCPOST" was intended.
                    "__ASYNCPOST:": "true",
                },
                # Only the product selector knows the product name.
                meta={"product": text[0]} if carries_product else None,
                headers={"Referer": response.url},
                callback=self.parse)
        break