Example #1
	def Kuvat(self):
		kuvat = []
	
		
		images = self.soup.find_all("img")
		for image in images:
			x = image.get("alt")
			if x is None or "strip" not in x.lower():
				continue

			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass

			
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			if "://" in image["src"]:
				kuva["src"] = url_fix("{}".format(image["src"]))
			else:
				kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #2
	def Kuvat(self):
		kuvat = []
	
		div = self.soup.find("div", { "class": "comic_group" })
		images = div.find_all("img")
		for image in images:
			
			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass

			
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			if "://" in image["src"]:
				kuva["src"] = url_fix("{}".format(image["src"].strip()))
			else:
				uu = "/".join(self.urli.split("/")[:-1])
				kuva["src"] = url_fix("{}/{}".format(uu, image["src"].strip()))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #3
def test_url_fixing():
    """URL fixing"""
    x = url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    x = url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'
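The two assertions above capture url_fix's contract: spaces and non-ASCII characters get percent-encoded, while sequences that are already percent-encoded are left alone. A minimal sketch of that behavior (assuming a werkzeug release that still ships url_fix; recent werkzeug versions have removed it):

    from werkzeug.urls import url_fix

    url_fix('http://example.com/a b')          # 'http://example.com/a%20b'
    url_fix('http://example.com/?foo=%2f%2f')  # unchanged: 'http://example.com/?foo=%2f%2f'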
Example #4
	def Kuvat(self):
		kuvat = []
		
		#mages = self.soup.find_all("img", { "class": "strip" })
		#for image in images:
		image = self.soup.find(id="comicimg")
		if image:
			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass

			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			kuva["src"] = url_fix(
							"{}".format(image["src"])
						)
			if not "://" in kuva["src"]:
				kuva["src"] = url_fix(
							"{}/{}".format(self.sarjakuva.url, image["src"])
						)
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
			
			kuvat.append(kuva)

		return kuvat
Example #5
    def wiki_image(self, addr, alt, class_='wiki', lineno=0):
        """Create HTML for a wiki image."""

        addr = addr.strip()
        chunk = ''
        if hatta.parser.external_link(addr):
            return html.img(src=url_fix(addr), class_="external", alt=alt)
        if '#' in addr:
            addr, chunk = addr.split('#', 1)
        if addr == '':
            return html.a(name=chunk)
        elif addr.startswith(':'):
            if chunk:
                chunk = '#' + chunk
            alias = self.link_alias(addr[1:])
            href = url_fix(alias + chunk)
            return html.img(src=href, class_="external alias", alt=alt)
        elif addr in self.storage:
            mime = page_mime(addr)
            if mime.startswith('image/'):
                return html.img(src=self.get_download_url(addr),
                                class_=class_,
                                alt=alt)
            else:
                return html.img(href=self.get_download_url(addr), alt=alt)
        else:
            return html.a(html(alt), href=self.get_url(addr))
Example #6
	def Kuvat(self):
		kuvat = []
		div = self.soup.find("noscript")
		image = div.find("img")
			
		kuva = dict(nimi=None, src=None, filetype="png")
		try:
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
		except: pass
		try:
			if image["src"].index("./") == 0:
				image["src"] = image["src"].replace("./", "/")
		except: pass
		image["src"] = image["src"].split("?")[0]
		image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
		image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
		kuva["nimi"] = "{}.{}".format(image["src"].split("/")[-1], kuva["filetype"]) # kuvan nimi = tiedoston nimi
		
		kuva["src"] = url_fix("{}".format(image["src"]))
		if not "://" in image["src"]:
			kuva["src"] = url_fix(
							"{}{}".format(self.sarjakuva.url, image["src"])
						)
		#kuva["filetype"] = u"{}".format(image["src"].split(".")[-1])

		kuvat.append(kuva)
		
		return kuvat
Example #7
	def Kuvat(self):
		kuvat = []
		#div = self.soup.find("div", { "class": "comic-content"})
		
		#images = div.find_all("img")
		#for image in images:
		table = self.soup.find(id="comic")
		if table is None:
			table = self.soup.find("table", { "class": "shadow"} )
		
		images = table.find_all("img")
		for image in images:
			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass
			
			#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
			
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			if "://" in image["src"]:
				kuva["src"] = url_fix("{}".format(image["src"]))
			else:
				kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #8
	def Kuvat(self):
		kuvat = []
		#div = self.soup.find("div", { "class": "comic-content"})
		
		#images = div.find_all("img")
		#for image in images:
		content = self.soup.find("div", { "class": "content"})
		section = content.find("section")
		entry = section.find("div", { "class": "entry"})
		image = entry.find("img")
		

		kuva = dict(nimi=None, src=None, filetype=None)
		try:
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
		except: pass
		try:
			if image["src"].index("./") == 0:
				image["src"] = image["src"].replace("./", "/")
		except: pass

		
		kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
		if "://" in image["src"]:
			kuva["src"] = url_fix("{}".format(image["src"]))
		else:
			kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
		kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

		kuvat.append(kuva)
		
		return kuvat
Example #9
	def Kuvat(self):
		kuvat = []
	
		div = self.soup.find(id="cc")
		images = div.find_all("img")
		for image in images:

			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass

			
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			if "://" in image["src"]:
				kuva["src"] = url_fix("{}".format(image["src"]))
			else:
				kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #10
    def test_url_fixing(self):
        x = urls.url_fix(
            u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
        assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

        x = urls.url_fix('http://example.com/?foo=%2f%2f')
        assert x == 'http://example.com/?foo=%2f%2f'
Example #11
    def test_url_fixing(self):
        x = urls.url_fix(
            u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
        self.assert_line_equal(
            x, 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)')

        x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
        self.assert_equal(x, "http://just.a.test/$-_.+!*'(),")
Example #12
def test_url_fixing_qs():
    x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix('http://acronyms.thefreedictionary.com/'
                     'Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')
    assert x == ('http://acronyms.thefreedictionary.com/'
                 'Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')
Example #13
def test_url_fixing_qs():
    x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix(
        'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
    )
    assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
Example #14
    def test_url_fixing(self):
        x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
        assert x == 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

        x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
        assert x == "http://just.a.test/$-_.+!*'(),"

        x = urls.url_fix('http://example.com/?foo=%2f%2f')
        assert x == 'http://example.com/?foo=%2f%2f'
Example #15
    def test_url_fixing(self):
        x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
        assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

        x = urls.url_fix('http://example.com/?foo=%2f%2f')
        assert x == 'http://example.com/?foo=%2f%2f'

        x = urls.url_fix('http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')
        assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
Example #16
def test_url_fixing():
    x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
    assert x == "http://just.a.test/$-_.+!*'(),"

    x = urls.url_fix('http://höhöhö.at/höhöhö/hähähä')
    assert x == r'http://xn--hhh-snabb.at/h%C3%B6h%C3%B6h%C3%B6/h%C3%A4h%C3%A4h%C3%A4'
Example #17
def test_url_fixing():
    x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
    assert x == "http://just.a.test/$-_.+!*'(),"

    x = urls.url_fix('http://höhöhö.at/höhöhö/hähähä')
    assert x == r'http://xn--hhh-snabb.at/h%C3%B6h%C3%B6h%C3%B6/h%C3%A4h%C3%A4h%C3%A4'
Example #18
    def test_url_fixing_qs(self):
        x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
        self.assert_line_equal(x, 'http://example.com/?foo=%2f%2f')

        x = urls.url_fix(
            'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
        )
        self.assert_equal(
            x,
            'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
        )
Example #19
    def test_url_fixing(self):
        x = urls.url_fix(
            u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
        assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

        x = urls.url_fix('http://example.com/?foo=%2f%2f')
        assert x == 'http://example.com/?foo=%2f%2f'

        x = urls.url_fix(
            'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
        )
        assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
Example #20
def test_url_fixing_qs():
    x = urls.url_fix(b"http://example.com/?foo=%2f%2f")
    assert x == "http://example.com/?foo=%2f%2f"

    x = urls.url_fix(
        "http://acronyms.thefreedictionary.com/"
        "Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation"
    )
    assert x == (
        "http://acronyms.thefreedictionary.com/"
        "Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation"
    )
Example #21
	def Kuvat(self):
		kuvat = []
		#div = self.soup.find("div", { "class": "comic-content"})
		
		#images = div.find_all("img")
		#for image in images:
		
		# image = self.soup.find("img", { "class": "alignnone"})
		# if image is None:
		# 	image = self.soup.find("img", { "class": "aligncenter"})
		
		images = self.soup.find_all("img")
		for image in images:
			
			try:
				width = image.get("width")
				
				if int(width) < 400:
					continue
			except Exception as e: 
				#print e
				continue
			

			
			kuva = dict(nimi=None, src=None, filetype=None)
			try:
				image["src"] = image["src"].split("?")[0]
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass
			
			#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
			
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			
			if "data:image" in image["src"]:
				kuva["src"] = image["src"]
			elif "://" in image["src"]:
				kuva["src"] = url_fix("{}".format(image["src"]))
			else:
				kuva["src"] = url_fix("{}{}".format(self.sarjakuva.url, image["src"]))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #22
def coerce_url(url: str) -> str:
    """
    Coerce URL to valid format

    :param url: URL
    :return: str
    """
    url = url.strip()
    if url.startswith("feed://"):
        return url_fix("http://{0}".format(url[7:]))
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url_fix(url)
    return url_fix("http://{0}".format(url))
Example #23
    def _write(self,
               path,
               iname=None,
               data=None,
               replace=False,
               url=None,
               type=type,
               **kwargs):
        if not (iname or data or url):
            raise Exception('Either iname, data or url needs to be passed')

        data = dumps(data) if isinstance(data, (dict, list)) else data
        data = data.encode('utf-8') if isinstance(data, str) else data

        if url and type == 'Reference':
            kwargs.update({'replace': replace, 'url': url, 'type': type})
            r = self._put(url_fix(urljoin(self.url, path)),
                          params=kwargs,
                          data='')
        else:
            with TemporaryFile(mode='wb+') as f, (
                    open(iname, mode='rb') if iname
                    else BytesIO(data) if data
                    else htopen(url, mode='rb')) as i:
                hasher = sha256()

                b = None
                while b is None or b != b'':
                    b = i.read(100 * 1024)
                    f.write(b)
                    hasher.update(b)

                f.seek(0)

                r = self._put(url_fix(urljoin(self.url, path)),
                              params={
                                  'replace': replace,
                                  'expected_hash': 'SHA256:' + hasher.digest().hex(),
                                  'url': url,
                                  'type': type,
                              },
                              files={path: f})

        if r.status_code not in [200, 204]:
            raise Exception('%d %s' % (r.status_code, r.text))
Example #24
	def Kuvat(self):
		kuvan_nimi = None
		src = None
		
		kuvat = []
		
		centers = self.soup.find_all("center")
		
		for i in centers:
			center = i.find("center")
			image = i.find("img")
			br = i.find("br")
			if center is None and image and br:		
				kuva = dict(nimi=None, src=None)
				image["src"] = image["src"].replace("\n", "")
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
				#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
				kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
				kuva["src"] = url_fix(
								"{}".format(image["src"])
							)
				kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
				
				kuvat.append(kuva)
		
		return kuvat
Example #25
	def Kuvat(self):
		kuvat = []
		ul = self.soup.find("ul", {"class":"latest-blog-posts-list"})
		images = ul.find_all("img")
		for image in images:
			if not "comic" in image["src"]:
				continue

				
			kuva = dict(nimi=None, src=None)
			image["src"] = image["src"].split("?")[0]
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass
			image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
			image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			kuva["src"] = url_fix("{}".format(image["src"]))
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			found = self.sessio.query(Strippi).filter(
						Strippi.sarjakuva_id == self.sarjakuva.id,
						Strippi.url == image["src"]
					).first()

			if not found:
				kuvat.append(kuva)
			
		
		return kuvat
Example #26
	def Kuvat(self):
		kuvat = []
		div = self.soup.find("div", { "class": "comic" })
		image = div.find("img")
		
		kuva = dict(nimi=None, src=None)
		try:
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
		except: pass
		try:
			if image["src"].index("./") == 0:
				image["src"] = image["src"].replace("./", "/")
		except: pass
		image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
		image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))

		kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
		kuva["src"] = url_fix(
						"{}/{}".format(self.sarjakuva.url, image["src"])
					)
		kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

		kuvat.append(kuva)
		
		return kuvat
Example #27
	def Kuvat(self):
		kuvat = []
		
		#mages = self.soup.find_all("img", { "class": "strip" })
		#for image in images:
		div = self.soup.find("noscript")
		image = div.find("img")

		kuva = dict(nimi=None, src=None, filetype=None)
		try:
			image["src"] = image["src"].split("?")[0]
			if image["src"][-1] == "/":
				image["src"] = image["src"][:-1]
				kuva["filetype"] = "jpg"
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
		except: pass
		try:
			if image["src"].index("./") == 0:
				image["src"] = image["src"].replace("./", "/")
		except: pass

		kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
		kuva["src"] = url_fix(
						"{}".format(image["src"])
					)
		if kuva["filetype"] is None:
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
		
		kuvat.append(kuva)
		
		return kuvat
Example #28
	def Kuvat(self):
		kuvat = []
		#div = self.soup.find("div", { "class": "comic-content"})
		
		#images = div.find_all("img")
		#for image in images:
		image = self.soup.find("img", { "class": "img-comic" })
		kuva = dict(nimi=None, src=None, filetype="jpg")
		try:
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
		except: pass
		try:
			if image["src"].index("./") == 0:
				image["src"] = image["src"].replace("./", "/")
		except: pass
		
		image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
		
		kuva["nimi"] = "{}.{}".format(image["src"].split("/")[-1], kuva["filetype"]) # kuvan nimi = tiedoston nimi
		kuva["src"] = url_fix(
						"{}".format(image["src"])
					)
		#kuva["filetype"] = u"{}".format(image["src"].split(".")[-1])

		kuvat.append(kuva)
		
		return kuvat
Example #29
 def fix_target(self):
     if isinstance(self.target.data, str):
         self.target.data = self.target.data.strip()
         pre, sep, _ = self.target.data.partition("//")
         if not sep:
             self.target.data = f"http://{pre}"
         self.target.data = url_fix(self.target.data)
Example #30
    def parse_product(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        p = Product()

        for element, path in self.selectors.viewitems():
            node = soup.select_one(path)

            if not node:
                continue
            if element == 'image':
                p[element] = url_fix(urljoin(response.url, node['src']))
            else:
                p[element] = text(node)

        if 'name' in p and 'number' in p:
            p['url'] = response.url
            p['pricing'], p['discountcode'] = get_prices(soup)
            soup.decompose()
            yield p
        else:
            # Only follow links on non-product pages
            soup.decompose()
            for link in self.link_extractor.extract_links(response):
                yield Request(url=link.url)
Example #31
	def Kuvat(self):
		kuvat = []
		div = self.soup.find(id="wsite-content")
		
		figures = div.find_all("div", {"class": "wsite-image"})
		for figure in figures:
			images = figure.find_all("img")
			for image in images:
				kuva = dict(nimi=None, src=None)
				try:
					if image["src"].index("//") == 0:
						image["src"] = "http:{}".format(image["src"])
				except: pass
				try:
					if image["src"].index("./") == 0:
						image["src"] = image["src"].replace("./", "/")
				except: pass
				image["src"] = image["src"].split("?")[0]
				image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
				image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
				kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
				kuva["src"] = url_fix(
								"{}{}".format(self.sarjakuva.url, image["src"])
							)
				kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

				kuvat.append(kuva)
		
		return kuvat
Example #32
    def __init__(self, content):
        self.content = content.encode('ascii', 'ignore')

        # make safe, but retain spaces
        self._words = wz.url_fix(self.content.lower()).replace('%20', ' ').split(' ')

        self._body_normalized = ' ' + ' '.join(self._words) + ' '
Example #33
	def Kuvat(self):
		kuvat = []
		div = self.soup.find("div", { "class": "comic-content"})
		
		images = div.find_all("img")
		for image in images:
			kuva = dict(nimi=None, src=None)
			try:
				if image["src"].index("//") == 0:
					image["src"] = "http:{}".format(image["src"])
			except: pass
			try:
				if image["src"].index("./") == 0:
					image["src"] = image["src"].replace("./", "/")
			except: pass
			
			image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			kuva["src"] = url_fix(
							"{}{}".format("http://www.interrobangstudios.com/", image["src"])
						)
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #34
def _validate_url(url: str) -> str:
    '''Validate a URL.

    Given a string, return a sanitized URL, or raise InvalidURLError if
    the string is not a valid URL.

    Args:
        url (str): The string to validate as a URL

    Returns:
        str: The sanitized, validated URL

    Raises:
        InvalidURLError: The argument is not a valid URL
    '''
    if not url or not isinstance(url, str): raise InvalidURLError
    # KISS. Can be expanded later if desired.
    valid_schemes = ['http', 'https']
    valid_netloc_pattern = re.compile(r'\w+\.\w+')

    url_tuple = url_parse(url, scheme='http')
    scheme, netloc, path = url_tuple.scheme, url_tuple.netloc, url_tuple.path
    if scheme not in valid_schemes: raise InvalidURLError
    if not re.match(valid_netloc_pattern, netloc) and \
       (netloc or not re.match(valid_netloc_pattern, path)):
        raise InvalidURLError
    return url_fix(url)
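A few illustrative calls against _validate_url (InvalidURLError, url_parse and url_fix come from the surrounding module; outcomes follow from the logic above):

    _validate_url('http://example.com/a b')  # 'http://example.com/a%20b'
    _validate_url('ftp://example.com')       # raises InvalidURLError (scheme not allowed)
    _validate_url('not a url')               # raises InvalidURLError (no host-like part)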
Example #35
	def Kuvat(self):
		kuvat = []
		articles = self.soup.find_all("article")
		for article in articles:
			figures = article.find_all("figure")
			for figure in figures:
				images = figure.find_all("img")
				for image in images:
					kuva = dict(nimi=None, src=None)
					try:
						if image["src"].index("//") == 0:
							image["src"] = "http:{}".format(image["src"])
					except: pass
					try:
						if image["src"].index("./") == 0:
							image["src"] = image["src"].replace("./", "/")
					except: pass
					#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
					kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
					kuva["src"] = url_fix(
									"{}".format(image["src"])
								)
					kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

					kuvat.append(kuva)
		
		return kuvat
Example #36
	def Kuvat(self):
		
		kuvat = []

		# table = self.soup.find("table")
		# table = table.find("tbody")
		# table = self.soup.find("tr").find("td")

		found = self.soup.find_all("img")
		for image in found:
			try:
				if image["src"].index("images/") == 0:
					kuva = dict(nimi=None, src=None)
					# if image["src"].index("//") == 0:
					# 	image["src"] = u"http:{}".format(image["src"])
					#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
					kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
					kuva["src"] = url_fix(
									"{}{}".format(self.sarjakuva.url, image["src"])
								)
					kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

					kuvat.append(kuva)
					
					break

			except Exception as e: pass

		return kuvat
Example #37
	def Kuvat(self):
		kuvat = []
		div = self.soup.find(id="content")
		article = div.find("article")
		divs = article.find_all("div", {"class": "entry-content"})
		for div in divs:
			images = div.find_all("img")
			for image in images:
				kuva = dict(nimi=None, src=None)
				try:
					if image["src"].index("//") == 0:
						image["src"] = "http:{}".format(image["src"])
				except: pass
				try:
					if image["src"].index("./") == 0:
						image["src"] = image["src"].replace("./", "/")
				except: pass
				image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
				image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
				kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
				kuva["src"] = url_fix(
								"{}".format(image["src"])
							)
				kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

				kuvat.append(kuva)
		
		return kuvat
Example #38
	def Kuvat(self):		
		kuvat = []
	# 	articles = self.soup.find_all("article")
	# 	for article in articles:
	# 		figures = article.find_all("figure")
	# 		for figure in figures:
	# 			images = figure.find_all("img")
	# #for image in images:
		
		div = self.soup.find(id="comic-page")
		if div is None:
			div = self.soup.find(id="comic")
			if div is None:
				div = self.soup.find("div", { "class": "comic" })
		

		images = div.find_all("img")
		for image in images:
			kuva = dict(nimi=None, src=None) # a fresh dict per image, so appended entries are not overwritten
			if "?" in image["src"]:
				image["src"] = image["src"].split("?")[0]

			
			if image["src"].index("//") == 0:
				image["src"] = "http:{}".format(image["src"])
			#image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
			kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi
			kuva["src"] = url_fix(
							"{}".format(image["src"])
						)
			kuva["filetype"] = "{}".format(image["src"].split(".")[-1])

			kuvat.append(kuva)
		
		return kuvat
Example #39
    def __init__(self, content):
        self.content = content

        # make safe, but retain spaces
        self._words = wz.url_fix(content).replace('%20', ' ').split(' ')

        self._body_normalized = ' ' + ' '.join(self._words) + ' '
Example #40
 def test_quoting(self):
     self.assert_strict_equal(urls.url_quote(u'\xf6\xe4\xfc'),
                              '%C3%B6%C3%A4%C3%BC')
     self.assert_strict_equal(urls.url_unquote(urls.url_quote(u'#%="\xf6')),
                              u'#%="\xf6')
     self.assert_strict_equal(urls.url_quote_plus('foo bar'), 'foo+bar')
     self.assert_strict_equal(urls.url_unquote_plus('foo+bar'), u'foo bar')
     self.assert_strict_equal(urls.url_quote_plus('foo+bar'), 'foo%2Bbar')
     self.assert_strict_equal(urls.url_unquote_plus('foo%2Bbar'),
                              u'foo+bar')
     self.assert_strict_equal(
         urls.url_encode({
             b'a': None,
             b'b': b'foo bar'
         }), 'b=foo+bar')
     self.assert_strict_equal(
         urls.url_encode({
             u'a': None,
             u'b': u'foo bar'
         }), 'b=foo+bar')
     self.assert_strict_equal(
         urls.url_fix(
             u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)'),
         'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)')
     self.assert_strict_equal(urls.url_quote_plus(42), '42')
     self.assert_strict_equal(urls.url_quote(b'\xff'), '%FF')
Example #41
    def to_python(self, value):
        "Normalize data to a list of strings."

        # Return an empty list if no input was given.
        if not value:
            return []
        return [url_fix(x.strip()) for x in value.split("\n") if x.strip()]
Example #42
	def Loop(self, url=None, sessio=db.session):
		self.sessio = sessio
		self.Init(url)

		kuvat = ["jpg", "jpeg", "gif", "png", "svg"]
		links = self.soup.find_all("a")
		count = 0
		loaded = sessio.query(Strippi.url).filter(
				Strippi.sarjakuva_id==self.sarjakuva.id
			).all()
		loaded = [i.url for i in loaded]
		
		for link in links:
			
			nimi = link["href"]

			src = url_fix(
					"{}{}".format(self.sarjakuva.last_url, nimi)
				)
			filetype = "{}".format(nimi.split(".")[-1])
			
			if src in loaded:
				continue

			count += 1
			if filetype not in kuvat: # not the right kind of image
				continue
			
			self.Save(nimi, src, filetype)

		return None
Example #43
    def __init__(self, path='/', base_url=None, query_string=None,
                 method='GET', input_stream=None, content_type=None,
                 content_length=None, errors_stream=None, multithread=False,
                 multiprocess=False, run_once=False, headers=None, data=None,
                 environ_base=None, environ_overrides=None, charset='utf-8'):
        path_s = make_literal_wrapper(path)
        if query_string is None and path_s('?') in path:
            path, query_string = path.split(path_s('?'), 1)
        self.charset = charset
        self.path = iri_to_uri(path)
        if base_url is not None:
            base_url = url_fix(iri_to_uri(base_url, charset), charset)
        self.base_url = base_url
        if isinstance(query_string, (bytes, str)):
            self.query_string = query_string
        else:
            if query_string is None:
                query_string = MultiDict()
            elif not isinstance(query_string, MultiDict):
                query_string = MultiDict(query_string)
            self.args = query_string
        self.method = method
        if headers is None:
            headers = Headers()
        elif not isinstance(headers, Headers):
            headers = Headers(headers)
        self.headers = headers
        if content_type is not None:
            self.content_type = content_type
        if errors_stream is None:
            errors_stream = sys.stderr
        self.errors_stream = errors_stream
        self.multithread = multithread
        self.multiprocess = multiprocess
        self.run_once = run_once
        self.environ_base = environ_base
        self.environ_overrides = environ_overrides
        self.input_stream = input_stream
        self.content_length = content_length
        self.closed = False

        if data:
            if input_stream is not None:
                raise TypeError('can\'t provide input stream and data')
            if isinstance(data, str):
                data = data.encode(self.charset)
            if isinstance(data, bytes):
                self.input_stream = BytesIO(data)
                if self.content_length is None:
                    self.content_length = len(data)
            else:
                for key, value in _iter_data(data):
                    if (
                        isinstance(value, (tuple, dict)) or
                        hasattr(value, 'read')
                    ):
                        self._add_file_from_data(key, value)
                    else:
                        self.form.setlistdefault(key).append(value)
Example #44
    def get_query(self, url, sleep=0.0, force=False):
        # Get LastFM key and cache duration
        ConfigParam = self.env['ir.config_parameter'].sudo()
        fm_key = ConfigParam.get_param('oomusic.lastfm_key')
        fm_cache = int(ConfigParam.get_param('oomusic.lastfm_cache', 112))
        fm_info = ConfigParam.get_param('oomusic.fm_info', 'auto')
        if not fm_key:
            return '{}'

        url = url_fix(url + '&api_key=' + fm_key +
                      '&format=json').encode('utf-8')
        url_hash = hashlib.sha1(url).hexdigest()

        new_cr = self.pool.cursor()
        Lastfm = self.with_env(self.env(cr=new_cr)).search([('name', '=',
                                                             url_hash)])
        if force or not Lastfm or Lastfm.expiry_date < fields.Datetime.now():
            content = '{}'
            if fm_info == 'manual' and not force:
                Lastfm.env.cr.rollback()
                Lastfm.env.cr.close()
                content = Lastfm.content or content
                return content
            try:
                time.sleep(sleep)
                r = requests.get(url, timeout=3.0)
                if r.status_code == 200:
                    content = r.content.decode('utf-8')
            except:
                _logger.info('Error while fetching URL "%s"',
                             url,
                             exc_info=True)

            expiry_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=fm_cache)
            removal_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=fm_cache + 14)

            # Save in cache
            with self.pool.cursor() as cr:
                new_self = Lastfm.with_env(self.env(cr=cr))
                if not Lastfm:
                    writer = new_self.create
                else:
                    writer = new_self.write
                writer({
                    'name': url_hash,
                    'url': url,
                    'content': content,
                    'expiry_date': expiry_date.strftime(DATETIME_FORMAT),
                    'removal_date': removal_date.strftime(DATETIME_FORMAT),
                })

        else:
            content = Lastfm.content or '{}'

        Lastfm.env.cr.rollback()
        Lastfm.env.cr.close()
        return content
Example #45
def step1(client_id, redirect_url, scope):
    # 1. Send a user to authorize your app
    auth_url = ('https://login.xero.com/identity/connect/authorize?'
                'response_type=code'
                '&client_id=' + client_id +
                '&redirect_uri=' + redirect_url +
                '&scope=' + scope +
                '&state=123')

    webbrowser.open_new(url_fix(auth_url))
Example #46
 def test_quoting(self):
     assert urls.url_quote(u'\xf6\xe4\xfc') == '%C3%B6%C3%A4%C3%BC'
     assert urls.url_unquote(urls.url_quote(u'#%="\xf6')) == u'#%="\xf6'
     assert urls.url_quote_plus('foo bar') == 'foo+bar'
     assert urls.url_unquote_plus('foo+bar') == 'foo bar'
     assert urls.url_encode({'a': None, 'b': 'foo bar'}) == 'b=foo+bar'
     assert urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') == \
            'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
Example #47
    def get_query(self, url, sleep=0.0, force=False):
        ConfigParam = self.env["ir.config_parameter"].sudo()
        sp_cache = int(ConfigParam.get_param("oomusic.spotify_cache", 182))
        ext_info = ConfigParam.get_param("oomusic.ext_info", "auto")

        url = url_fix(url).encode("utf-8")
        url_hash = hashlib.sha1(url).hexdigest()

        Spotify = self.search([("name", "=", url_hash)])
        if force or not Spotify or Spotify.expiry_date < fields.Datetime.now():
            content = "{}"
            if ext_info == "manual" and not force:
                content = Spotify.content or content
                return content
            try:
                time.sleep(sleep)
                headers = {
                    "Authorization": "Bearer {}".format(
                        self.env["oomusic.spotify.token"]._get_token())
                }
                r = requests.get(url, headers=headers, timeout=3.0)
                if r.status_code == 200:
                    content = r.content.decode("utf-8")
                else:
                    _logger.info(
                        "Error while fetching URL '%s'. Error code: %s", url,
                        r.status_code)
                    return content
            except:
                _logger.info("Error while fetching URL '%s'",
                             url,
                             exc_info=True)
                return content

            expiry_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=sp_cache)
            removal_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=sp_cache + 14)

            # Save in cache
            if not Spotify:
                writer = self.create
            else:
                writer = Spotify.write
            writer({
                "name": url_hash,
                "url": url,
                "content": content,
                "expiry_date": expiry_date,
                "removal_date": removal_date,
            })
            self.env.cr.commit()

        else:
            content = Spotify.content or "{}"

        return content
Example #48
    def gallery(self, **kwargs):
        link, doc, doc_abspath = self._get_link(**kwargs)
        if not os.path.isdir(doc_abspath):
            abort(404)

        values = {
            "doc": doc,
            "browse": "/ooshare/browse?token={}&doc={}".format(kwargs["token"], doc),
            "imgs": sorted(
                [{
                    "name": f.name,
                    "url": url_fix("/ooshare/img?token={}&doc={}".format(
                        kwargs["token"], os.path.join(doc, f.name))),
                    "url_thumb": url_fix("/ooshare/img?token={}&doc={}&thumb=1".format(
                        kwargs["token"], os.path.join(doc, f.name))),
                } for f in os.scandir(doc_abspath)
                    if f.is_file()
                    and ft.guess(os.path.join(doc_abspath, f.name))
                    and ft.guess(os.path.join(doc_abspath, f.name)).extension in IMG_EXT],
                key=lambda d: d["name"],
            ),
            "vids": sorted(
                [{
                    "name": f.name,
                    "url": url_fix("/ooshare/vid?token={}&doc={}".format(
                        kwargs["token"], os.path.join(doc, f.name))),
                    "mime": VID_EXT[os.path.splitext(f.name)[1][1:]],
                } for f in os.scandir(doc_abspath)
                    if f.is_file() and os.path.splitext(f.name)[1][1:] in VID_EXT],
                key=lambda d: d["name"],
            ),
        }

        res = request.render("ooshare.gallery", values)
        return res
Example #49
def proxy(url):
    # url must not be unicode; convert it to utf-8 first
    #url = url.encode('utf-8')
    url = url_fix(url)
    try:
        content = urllib2.urlopen(url, timeout=120).read()
        return content
    except:
        return u'error'
Example #50
def coerce_url(url: str, https: bool = True) -> str:
    """
    Coerce URL to valid format

    :param url: URL
    :param https: Force https if no scheme in url
    :return: str
    """
    url = url.strip()
    if url.startswith("feed://"):
        return url_fix("http://{0}".format(url[7:]))
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url_fix(url)
    if https:
        return url_fix("https://{0}".format(url))
    else:
        return url_fix("http://{0}".format(url))
Example #51
    def run(self, url, year, created):
        """Run Celery Task.
        """
        self.job_id = self.request.id
        self.url = url_fix(url)
        time_started = datetime.now()
        self._log.info('Start calculating simhashes.')
        self.download_errors = 0
        if not self.url:
            self._log.error('did not give url parameter')
            return {'status': 'error', 'info': 'URL is required.'}
        if not year:
            self._log.error('did not give year parameter')
            return {'status': 'error', 'info': 'Year is required.'}
        # fetch captures
        self.update_state(
            state='PENDING',
            meta={'info': 'Fetching %s captures for year %s' % (url, year)})
        resp = self.fetch_cdx(url, year)
        if resp.get('status') == 'error':
            return resp
        captures = resp.get('captures')
        total = len(captures)
        self.seen = dict()
        # calculate simhashes in parallel
        i = 0
        final_results = {}
        for res in self.tpool.map(self.get_calc, captures):
            if not res:
                continue
            (timestamp, simhash) = res
            if simhash:
                final_results[timestamp] = simhash
            if i % 10 == 0:
                self.update_state(
                    state='PENDING',
                    meta={'info': 'Processed %d out of %d captures.' % (i, total)})
            i += 1

        self._log.info('%d final results for %s and year %s.',
                       len(final_results), self.url, year)
        if final_results:
            try:
                urlkey = surt(self.url)
                self.redis.hmset(urlkey, final_results)
                self.redis.expire(urlkey, self.simhash_expire)
            except RedisError as exc:
                self._log.error('cannot write simhashes to Redis for URL %s',
                                self.url,
                                exc_info=1)

        duration = (datetime.now() - time_started).seconds
        self._log.info('Simhash calculation finished in %.2fsec.', duration)
        return {'duration': str(duration)}
Example #52
def post_url():
    data = request.get_json()
    url = url_fix(data['url'])
    slug = r.get(url_prefix + url)
    if not slug:
        slug = base62.encode(r.incr('next_url_id'))
        r.hmset(slug_prefix + slug, {'url': url, 'visited': 0})
        r.set(url_prefix + url, slug)
    return jsonify({'url': url, 'slug': slug})
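A hedged client-side sketch for the endpoint above (the route path and port are assumptions; the handler expects a JSON body with a url key):

    import requests

    # Hypothetical: the view is mounted at POST /url on a local dev server.
    resp = requests.post('http://localhost:5000/url',
                         json={'url': 'http://example.com/a b'})
    resp.json()  # {'url': 'http://example.com/a%20b', 'slug': '...'}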
Example #53
    def wiki_link(self, addr, label=None, class_=None, image=None, lineno=0):
        """Create HTML for a wiki link."""

        addr = addr.strip()
        text = escape(label or addr)
        chunk = ''
        if class_ is not None:
            classes = [class_]
        else:
            classes = []
        if hatta.parser.external_link(addr):
            classes.append('external')
            if addr.startswith('mailto:'):
                # Obfuscate e-mails a little bit.
                classes.append('mail')
                text = text.replace('@', '&#64;').replace('.', '&#46;')
                href = escape(addr).replace('@', '%40').replace('.', '%2E')
            else:
                href = escape(url_fix(addr))
        else:
            if '#' in addr:
                addr, chunk = addr.split('#', 1)
                chunk = '#' + url_fix(chunk)
            if addr.startswith(':'):
                alias = self.link_alias(addr[1:])
                href = escape(url_fix(alias) + chunk)
                classes.append('external')
                classes.append('alias')
            elif addr.startswith('+'):
                href = '/'.join(
                    [self.request.script_root, '+' + escape(addr[1:])])
                classes.append('special')
            elif addr == '':
                href = escape(chunk)
                classes.append('anchor')
            else:
                classes.append('wiki')
                href = escape(self.get_url(addr) + chunk)
                if addr not in self.storage:
                    classes.append('nonexistent')
        class_ = escape(' '.join(classes) or '')
        # We need to output HTML on our own to prevent escaping of href
        return '<a href="%s" class="%s" title="%s">%s</a>' % (
            href, class_, escape(addr + chunk), image or text)
Example #54
def request(url):
    """
    default call to the api
    call http://endpoint/v1/{url}

    return the response and the url called (it might have been modified with the normalization)
    """
    norm_url = url_fix(_api_current_root_point + url)  # normalize url
    raw_response = requests.get(norm_url)

    return json.loads(raw_response.text), norm_url, raw_response.status_code
Example #55
 def download_capture(self, ts):
     """Download capture from WBM and update job status.
     Return capture body (probably HTML text)
     """
     try:
         self._log.info('fetching capture %s %s', ts, self.url)
         resp = self.http.request('GET',
                                  '/web/%sid_/%s' % (ts, url_fix(self.url)))
         return resp.data.decode('utf-8', 'ignore')
     except HTTPError as exc:
         self._log.error('cannot fetch capture %s %s (%s)', ts, self.url,
                         exc)
Example #56
 def save(cls, destination_mapping: str) -> SlugMapping:
     """Save a destination mapping as a UrlMapping and return the SlugMapping."""
     record = UrlMapping(
         slug=cls.generate_slug(destination_mapping=destination_mapping),
         destination_url=url_fix(destination_mapping),
     )
     record.save()
     return SlugMapping(
         slug=record.slug,
         destination_url=record.destination_url,
         created_at=record.created_at,
     )
Example #57
    def get_query(self, url, sleep=0.0, force=False):
        # Get LastFM key and cache duration
        ConfigParam = self.env["ir.config_parameter"].sudo()
        fm_key = ConfigParam.get_param("oomusic.lastfm_key")
        fm_cache = int(ConfigParam.get_param("oomusic.lastfm_cache", 112))
        ext_info = ConfigParam.get_param("oomusic.ext_info", "auto")
        if not fm_key:
            return "{}"

        url = url_fix(url + "&api_key=" + fm_key +
                      "&format=json").encode("utf-8")
        url_hash = hashlib.sha1(url).hexdigest()

        Lastfm = self.search([("name", "=", url_hash)])
        if force or not Lastfm or Lastfm.expiry_date < fields.Datetime.now():
            content = "{}"
            if ext_info == "manual" and not force:
                content = Lastfm.content or content
                return content
            try:
                time.sleep(sleep)
                r = requests.get(url, timeout=3.0)
                if r.status_code == 200:
                    content = r.content.decode("utf-8")
            except:
                _logger.info('Error while fetching URL "%s"',
                             url,
                             exc_info=True)

            expiry_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=fm_cache)
            removal_date = datetime.datetime.utcnow() + datetime.timedelta(
                days=fm_cache + 14)

            # Save in cache
            if not Lastfm:
                writer = self.create
            else:
                writer = Lastfm.write
            writer({
                "name": url_hash,
                "url": url,
                "content": content,
                "expiry_date": expiry_date,
                "removal_date": removal_date,
            })
            self.env.cr.commit()

        else:
            content = Lastfm.content or "{}"

        return content
Example #58
def fix_license_and_urls(data_frame):
    """
    Creates license URL and display name and fixes broken URLs from see_also_links and reference_list.

    :param data_frame: DataFrame to perform fixing operation on
    :return: Fixed DataFrame
    """

    # JOB: Transform license object to string
    license_base = 'https://choosealicense.com/licenses/'
    data_frame['license_url'] = data_frame['license'].apply(
        lambda lic: license_base + lic.get('key') if lic else None)
    data_frame['license'] = data_frame['license'].apply(
        lambda lic: lic.get('name') if lic else None)

    # JOB: Fix potentially broken URLs
    data_frame['see_also_links'] = data_frame['see_also_links'].apply(
        lambda ref_list: [url_fix(ref_link) for ref_link in ref_list])
    data_frame['reference_list'] = data_frame['reference_list'].apply(
        lambda ref_list: [url_fix(ref_link) for ref_link in ref_list])

    return data_frame
Example #59
def getMarketItems(url, count, currency, start=0):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://' + url

    url = url_fix(url)
    curr = CURRENCY[currency][0]
    urlextender = '/render/?query=&start=%s&count=%s&currency=%s' % (
        start, count, curr)

    try:
        request = requests.get(url + urlextender)
    except requests.ConnectionError:
        return 'Could not connect. Check URL and make sure you can connect to the internet.', None
    except exceptions.InvalidURL:
        return 'URL is invalid, please check your market URL.', None

    if request.status_code == 404:
        return 'Could not connect to Steam. Retry in a few minutes and check URL.', None
    if len(request.text) < 1000:
        return 'Response from Steam contains no skin data, URL is probably invalid.', None
    if request.url != url + urlextender:
        return 'Page redirected to %s, so no skins were found. Check your market URL.' % request.url, None

    data = request.text.split('"listinginfo":')[1].split(',"assets":')[0]
    try:
        data = json.loads(data, object_pairs_hook=OrderedDict)
    except ValueError:
        return 'Response from Steam contains no skin data, URL is probably invalid.', None

    # assetID => [marketID, inspect link, formatted price]
    datadic = OrderedDict()
    soldcount = 0
    for marketID in data:
        try:
            price = int(data[marketID]['converted_price']) + int(
                data[marketID]['converted_fee'])
            padded = "%03d" % (price, )
            price = padded[0:-2] + '.' + padded[-2:]
        except KeyError:
            price = 'SOLD'
            soldcount += 1
            continue  # Delete this line to keep SOLD ITEMS in the result
        link = data[marketID]['asset']['market_actions'][0]['link']
        assetID = data[marketID]['asset']['id']
        datadic[assetID] = [
            marketID,
            link.replace('%assetid%', assetID).replace('%listingid%',
                                                       marketID), price
        ]

    return datadic, soldcount
Example #60
def handleWord(word):
    g.word = WordDict(word)
    url = 'http://dictionaryapi.com/api/v1/references/collegiate/xml/%s?key=%s' % (
        word, API_KEY)
    xml = urllib2.urlopen(url_fix(url))
    try:
        dom = parse(xml)
    except:
        return g.word
    if dom.getElementsByTagName('entry'):
        handleEntries(dom.getElementsByTagName('entry'))
    else:
        handleSuggestion(dom.getElementsByTagName('suggestion'))
    return g.word