def fetch_albums(url):
    html = urlopen(url)
    # album links on this page
    found = re.findall(r'<td class="Title".*?<a href="/music/url\?q=(/music/album\?id%3D.*?)".*?>(.*?)</a>', html)
    print '# albums:', len(found), urllib.unquote(url)
    for link, title in found:
        link = 'http://www.google.cn' + link.split('&')[0]
        title = unescape(title)
        print urllib.unquote(link), '|', title
    # pager links to further result pages
    found = re.findall(r'<td>.*?<a class="imglink" href="/music/url\?q=(.*?)"', html)
    pages = ['http://www.google.cn' + urllib.unquote(i.split('&')[0]) for i in found]
    cache[url] = True
    for page in pages:
        if page not in cache:
            cache[page] = False
    # recurse into the first page not yet fetched
    another_page = None
    for page, done in cache.iteritems():
        if not done:
            another_page = page
            break
    if another_page:
        fetch_albums(another_page)
def get_m3u8(url):
    with utils.urlopen(url) as response:
        html = response.read().decode()
    regex = r"hlsManifestUrl\":\"([^\"]+)"
    result = re.search(regex, html).group(1)
    return result
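# Usage sketch for get_m3u8 (an assumption, not from the original source):
# utils.urlopen is taken to behave like urllib.request.urlopen, and the
# watch URL below is a placeholder for a currently-live stream.
manifest_url = get_m3u8("https://www.youtube.com/watch?v=VIDEO_ID")
print(manifest_url)  # prints something like https://manifest.googlevideo.com/.../index.m3u8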
def _refresh_credentials(self, refresh_token):
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    body = {
        'refresh_token': refresh_token,
        'client_id': self.CLIENT_ID,
        'client_secret': self.CLIENT_NOT_SO_SECRET,
        'grant_type': 'refresh_token'
    }
    body = urllib.urlencode(body)
    request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)
    try:
        response = utils.urlopen(request)
    except urllib2.HTTPError as e:
        error = json.load(e)
        raise Exception(error.get('error_description') or
                        error.get('error') or
                        str(e))
    data = json.load(response)
    return {
        'access_token': data['access_token'],
        'refresh_token': data.get('refresh_token', refresh_token)
    }
def rescan_root(request, root):
    result = "success"
    for x in WebPath.get_root_nodes():
        if x.url == root:
            spider.pause()
            x.delete()
            spider.add(WebPath.add_root(root))
            spider.unpause()
            break
    else:  # for/else: no existing root matched, so try to add a new one
        try:
            urlopen(root)
            spider.add(WebPath.add_root(url=root))
        except Exception, e:
            logger.debug("don't like %s %s %s", root, e, request.META)
            result = "failure"
def _request_credentials(self, authorization_key):
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    body = {
        'code': authorization_key,
        'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob',
        'client_id': self.CLIENT_ID,
        'client_secret': self.CLIENT_NOT_SO_SECRET,
        'scope': self.SCOPE,
        'grant_type': 'authorization_code'
    }
    body = urllib.urlencode(body)
    request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)
    try:
        response = utils.urlopen(request)
    except urllib2.HTTPError as e:
        error = json.load(e)
        raise Exception(error.get('error_description') or
                        error.get('error') or
                        str(e))
    data = json.load(response)
    return {
        'access_token': data['access_token'],
        'refresh_token': data['refresh_token']
    }
def _request_credentials(self, authorization_key):
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    body = {
        'code': authorization_key,
        'client_id': self.CLIENT_ID,
        'client_secret': self.CLIENT_NOT_SO_SECRET,
        'grant_type': 'authorization_code'
    }
    body = urllib.urlencode(body)
    request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)
    try:
        response = utils.urlopen(request)
    except urllib2.HTTPError as e:
        error = json.load(e)
        raise Exception(error.get('error_description') or
                        error.get('error') or
                        str(e))
    data = json.load(response)
    return {
        'access_token': data['access_token']
    }
def main(parser, args):
    import meyectl
    import utils

    options = parse_options(parser, args)

    meyectl.configure_logging('webhook', options.log_to_file)
    meyectl.configure_tornado()

    logging.debug('hello!')
    logging.debug('method = %s' % options.method)
    logging.debug('url = %s' % options.url)

    headers = {}
    parts = urlparse.urlparse(options.url)
    url = options.url
    data = None

    if options.method == 'POST':
        headers['Content-Type'] = 'text/plain'
        data = ''
    elif options.method == 'POSTf':  # form url-encoded
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        data = parts.query
        url = options.url.split('?')[0]
    elif options.method == 'POSTj':  # json
        headers['Content-Type'] = 'application/json'
        data = urlparse.parse_qs(parts.query)
        data = {k: v[0] for (k, v) in data.iteritems()}
        data = json.dumps(data)
        url = options.url.split('?')[0]
    else:  # GET
        pass

    request = urllib2.Request(url, data, headers=headers)
    try:
        utils.urlopen(request, timeout=settings.REMOTE_REQUEST_TIMEOUT)
        logging.debug('webhook successfully called')
    except Exception as e:
        logging.error('failed to call webhook: %s' % e)

    logging.debug('bye!')
def main(parser, args):
    import meyectl
    import utils

    options = parse_options(parser, args)

    meyectl.configure_logging('webhook', options.log_to_file)
    meyectl.configure_tornado()

    logging.debug('hello!')
    logging.debug('method = %s' % options.method)
    logging.debug('url = %s' % options.url)

    headers = {}
    parts = urllib.parse.urlparse(options.url)
    url = options.url
    data = None

    if options.method == 'POST':
        headers['Content-Type'] = 'text/plain'
        data = ''
    elif options.method == 'POSTf':  # form url-encoded
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        data = parts.query
        url = options.url.split('?')[0]
    elif options.method == 'POSTj':  # json
        headers['Content-Type'] = 'application/json'
        data = urllib.parse.parse_qs(parts.query)
        data = {k: v[0] for (k, v) in data.items()}
        data = json.dumps(data)
        url = options.url.split('?')[0]
    else:  # GET
        pass

    request = urllib.request.Request(url, data, headers=headers)
    try:
        utils.urlopen(request, timeout=settings.REMOTE_REQUEST_TIMEOUT)
        logging.debug('webhook successfully called')
    except Exception as e:
        logging.error('failed to call webhook: %s' % e)

    logging.debug('bye!')
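# Added note on the POSTj branch above (commentary, not from the source):
# the query string of the configured webhook URL is parsed and re-sent as a
# JSON body. For example, a webhook configured as
#     http://host/hook?camera=1&event=motion
# is POSTed to http://host/hook with body {"camera": "1", "event": "motion"}.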
def rescan_root(request, root):
    result = "success"
    for x in WebPath.get_root_nodes():
        if x.url == root:
            spider.pause()
            x.delete()
            spider.add(WebPath.add_root(root))
            spider.unpause()
            break
    else:  # for/else: no existing root matched, so try to add a new one
        try:
            urlopen(root)
            spider.add(WebPath.add_root(url=root))
        except Exception, e:
            print "don't like", root, e
            print request.META
            result = "failure"
def _request(self, url, body=None, headers=None, retry_auth=True):
    if not self._credentials:
        if not self._authorization_key:
            msg = 'missing authorization key'
            self.error(msg)
            raise Exception(msg)

        self.debug('requesting credentials')
        try:
            self._credentials = self._request_credentials(self._authorization_key)
            self.save()
        except Exception as e:
            self.error('failed to obtain credentials: %s' % e)
            raise

    headers = headers or {}
    headers['Authorization'] = 'Bearer %s' % self._credentials['access_token']

    self.debug('requesting %s' % url)
    request = urllib2.Request(url, data=body, headers=headers)
    try:
        response = utils.urlopen(request)
    except urllib2.HTTPError as e:
        if e.code == 401 and retry_auth:  # unauthorized, access token may have expired
            try:
                self.debug('credentials have probably expired, refreshing them')
                self._credentials = self._refresh_credentials(self._credentials['refresh_token'])
                self.save()

                # retry the request with refreshed credentials
                return self._request(url, body, headers, retry_auth=False)
            except Exception:
                self.error('refreshing credentials failed')
                raise
        else:
            try:
                e = json.load(e)
                msg = e['error']['message']
            except Exception:
                msg = str(e)

            self.error('request failed: %s' % msg)
            raise Exception(msg)
    except Exception as e:
        self.error('request failed: %s' % e)
        raise

    return response.read()
def get(url):
    try:
        print url
        gen_log.info("GET: URL[%s]", url)
        return utils.urlopen(url).read()
    except:
        gen_log.info(traceback.format_exc())
        print traceback.format_exc()
        return None
def build_req(video_id):
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    info_req = urllib.request.Request(
        video_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        }
    )
    return utils.urlopen(info_req)
def get_youtube_id(url):
    try:
        return re.search(r'^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*', url).group(1)
    except:
        with utils.urlopen(url) as response:
            html_raw = response.read().decode()
        regex = r'<meta itemprop="videoId" content="(.+?)">'
        result = re.search(regex, html_raw).group(1)
        return result
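# Quick self-check for get_youtube_id (illustrative only; "VIDEO_ID" is a
# placeholder). The regex branch covers the usual URL shapes without any
# network round-trip; the except branch falls back to fetching the page and
# reading the videoId <meta> tag.
assert get_youtube_id("https://youtu.be/VIDEO_ID") == "VIDEO_ID"
assert get_youtube_id("https://www.youtube.com/watch?v=VIDEO_ID") == "VIDEO_ID"
assert get_youtube_id("https://www.youtube.com/embed/VIDEO_ID") == "VIDEO_ID"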
def processItem(self, item):
    hash = item.hash()
    from cache import cacheFolder, cached
    cacheFile = join(cacheFolder, hash)
    try:
        data = urlopen(item.url).read()
        open(cacheFile, "wb").write(data)
        cached(item)
    except URLError:
        item.failed = True
        item.save()
def processItem(self, item):
    hash = item.hash()
    from cache import cacheFolder, cached
    cacheFile = join(cacheFolder, hash)
    try:
        data = urlopen(item.url).read()
        open(cacheFile, "wb").write(data)
        cached(item)
    except URLError:
        item.failed = True
        item.save()
def _request(self, url, body=None, headers=None, retry_auth=True):
    if not self._credentials:
        if not self._authorization_key:
            msg = 'missing authorization key'
            self.error(msg)
            raise Exception(msg)

        self.debug('requesting credentials')
        try:
            self._credentials = self._request_credentials(self._authorization_key)
            self.save()
        except Exception as e:
            self.error('failed to obtain credentials: %s' % e)
            raise

    headers = headers or {}
    headers['Authorization'] = 'Bearer %s' % self._credentials['access_token']

    self.debug('requesting %s' % url)
    request = urllib2.Request(url, data=body, headers=headers)
    try:
        response = utils.urlopen(request)
    except urllib2.HTTPError as e:
        if e.code == 401 and retry_auth:  # unauthorized, access token may have expired
            try:
                self.debug('credentials have probably expired, refreshing them')
                self._credentials = self._refresh_credentials(self._credentials['refresh_token'])
                self.save()

                # retry the request with refreshed credentials
                return self._request(url, body, headers, retry_auth=False)
            except Exception as e:
                self.error('refreshing credentials failed')
                raise
        else:
            try:
                e = json.load(e)
                msg = e['error']['message']
            except Exception:
                msg = str(e)

            self.error('request failed: %s' % msg)
            raise Exception(msg)
    except Exception as e:
        self.error('request failed: %s' % e)
        raise

    return response.read()
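# Added note (not from the source): in both _request variants above, the
# recursive retry passes retry_auth=False, which caps the refresh-and-retry
# cycle at one attempt, so a 401 that persists after a token refresh raises
# instead of recursing forever. A caller on an instance holding valid
# credentials would simply do, e.g.:
#
#     payload = service._request('https://api.example.com/resource')  # hypothetical URL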
def artist(url):
    html = urlopen(url)
    # '所有专辑' = "all albums" section marker on the artist page
    found = re.findall(r'<a href="/music/url\?q=(/music/album\?.*?)&.*?>(.*?)</a>', html.split('所有专辑', 1)[1])
    albums = dict(found)
    artist = trim_title(html)
    print artist, 'albums', len(albums)
    for href, title in sorted(albums.items(), lambda i, j: cmp(i[1], j[1])):
        url = 'http://www.google.cn%s' % urllib.unquote(href)
        print '%s |%s' % (url, unescape(title))
def get_json(video_url, file=None):
    video_id = get_youtube_id(video_url)
    info_req = urllib.request.Request(
        video_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        }
    )
    with utils.urlopen(info_req) as response:
        data = response.read().decode()
    match = re.findall(r'"itag":(\d+),"url":"([^"]+)"', data)
    match = dict(x for x in match)
    best = {
        "video": None,
        "audio": None,
        "metadata": get_youtube_video_info(video_id, data),
        "version": VERSION,
        "createTime": datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()
    }
    for itag in PRIORITY["VIDEO"]:
        itag = str(itag)
        if itag in match:
            best["video"] = {itag: match[itag].replace("\\u0026", "\u0026")}
            break
    for itag in PRIORITY["AUDIO"]:
        itag = str(itag)
        if itag in match:
            best["audio"] = {itag: match[itag].replace("\\u0026", "\u0026")}
            break
    if best["video"] is None or best["audio"] is None:
        if best["video"] is None:
            utils.warn(f" {video_id} got empty video sources.")
        if best["audio"] is None:
            utils.warn(f" {video_id} got empty audio sources.")
        utils.warn(" Printing match...")
        print(match)
    if file is not None:
        with open(file, "w", encoding="utf8") as f:
            json.dump(best, f, indent=4, ensure_ascii=False)
    return best
def processItem(self, current):
    try:
        test = current.root
    except WebPath.DoesNotExist:
        print "skipping, as no more root!"
        current.delete()
        return
    try:
        page = urlopen(current.url)
    except URLError:
        print "fail", current.url
        current.failed = True
        current.save()
        return
    try:
        url = page.geturl()
        soup = BeautifulSoup(page)
        for link in soup.findAll("a"):
            try:
                resolved = urljoin(url, link["href"])
            except KeyError:
                print "skipping due to lack of href", link
                continue
            if len(resolved) < len(url):  # up link, skip
                print "skipping", resolved, url
                continue
            if resolved[-1] == "/":  # directory
                if WebPath.objects.filter(url=resolved).count() == 0:
                    child = current.add_child(url=resolved)
                    self.add(child)
            else:  # file?
                (_, ext) = splitext(resolved)
                ext = ext.lower()
                if ext in known_extensions:
                    if MusicFile.objects.filter(url=resolved).count() == 0:
                        mf = MusicFile(parent=current, url=resolved)
                        mf.save()
                else:
                    print "Can't handle", resolved, ext, len(ext)
        current.checked = True
        current.save()
    except ObjectDoesNotExist:  # we got deleted
        current.delete()
def processItem(self, current):
    try:
        test = current.root
    except WebPath.DoesNotExist:
        logger.debug("skipping, as no more root!")
        current.delete()
        return
    try:
        page = urlopen(current.url)
    except URLError:
        logger.debug("fail %s", current.url)
        current.failed = True
        current.save()
        return
    try:
        url = page.geturl()
        soup = BeautifulSoup(page)
        for link in soup.findAll("a"):
            try:
                resolved = urljoin(url, link["href"])
            except KeyError:
                logger.debug("skipping due to lack of href %s", link)
                continue
            if len(resolved) < len(url):  # up link, skip
                logger.debug("skipping %s %s", resolved, url)
                continue
            if resolved[-1] == "/":  # directory
                if WebPath.objects.filter(url=resolved).count() == 0:
                    child = current.add_child(url=resolved)
                    self.add(child)
            else:  # file?
                (_, ext) = splitext(resolved)
                ext = ext.lower()
                if ext in known_extensions:
                    if MusicFile.objects.filter(url=resolved).count() == 0:
                        mf = MusicFile(parent=current, url=resolved)
                        mf.save()
                else:
                    logger.debug("can't handle %s %s %s", resolved, ext, len(ext))
        current.checked = True
        current.save()
    except ObjectDoesNotExist:  # we got deleted
        current.delete()
def get_response(operation_name, data, encoding, **headers):
    config = get_config_store()
    app_name = config.get("keys", "app_name")
    endpoint = config.get("endpoints", "product")
    http_headers = {"X-EBAY-SOA-OPERATION-NAME": operation_name,
                    "X-EBAY-SOA-SECURITY-APPNAME": app_name,
                    "X-EBAY-SOA-RESPONSE-DATA-FORMAT": encoding}
    http_headers.update(headers)
    # req = urllib2.Request(endpoint, data, http_headers)
    req = Request(endpoint, data, http_headers)
    # res = urllib2.urlopen(req)
    res = urlopen(req)
    data = res.read()
    return data
def get_response(operation_name, data, encoding, **headers):
    config = get_config_store()
    access_token = config.get("auth", "token")
    endpoint = config.get("endpoints", "best_match")
    http_headers = {"X-EBAY-SOA-OPERATION-NAME": operation_name,
                    "X-EBAY-SOA-SECURITY-TOKEN": access_token,
                    "X-EBAY-SOA-RESPONSE-DATA-FORMAT": encoding}
    http_headers.update(headers)
    # req = urllib2.Request(endpoint, data, http_headers)
    req = Request(endpoint, data, http_headers)
    # res = urllib2.urlopen(req)
    res = urlopen(req)
    data = res.read()
    return data
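# Hedged usage sketch for the two get_response variants above: the body must
# already be serialized to match the X-EBAY-SOA-RESPONSE-DATA-FORMAT header.
# The operation name and payload here are illustrative placeholders only.
request_xml = "<findProductsRequest>...</findProductsRequest>"  # placeholder payload
raw = get_response("findProducts", request_xml, "XML")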
def getHtml(url, data, headers):
    req = utils.Request(url, data, headers)
    req.add_header('Referer', 'http://www.porndig.com/videos/')
    print data
    print len(data)
    req.add_header('Content-Length', len(data))
    try:
        response = utils.urlopen(req)
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
            f.close()
        else:
            data = response.read()
        try:
            utils.cj.save(cookiePath)
        except:
            pass
        response.close()
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
    return data
def onlive(video_data):
    data = {
        'title': text.PUSHALERT_TITLE.format(**video_data["metadata"]),
        "message": text.PUSHALERT_MESSAGE.format(**video_data["metadata"]),
        "url": f"https://www.youtube.com/watch?v={video_data['metadata']['id']}"
    }
    if const.PUSHALERT_ICON:
        data["icon"] = const.PUSHALERT_ICON
    else:
        data["icon"] = utils.get_avatar(video_data['metadata']["channelURL"])
    data = urllib.parse.urlencode(data).encode()
    req = urllib.request.Request(url="https://api.pushalert.co/rest/v1/send", data=data)
    req.add_header("Authorization", f"api_key={const.PUSHALERT_API_KEY}")
    try:
        return utils.urlopen(req)
    except:
        return
def onlive(video_data):
    if "thumbnailUrl" in video_data['metadata']:
        image = video_data['metadata']["thumbnailUrl"]
    else:
        image = video_data['metadata']["thumbnail"]
    data = {
        "to": const.FCM_TARGET,
        "validateOnly": False,
        "notification": {
            'title': text.FCM_TITLE.format(**video_data["metadata"]),
            "body": text.FCM_MESSAGE.format(**video_data["metadata"]),
            "click_action": f"https://www.youtube.com/watch?v={video_data['metadata']['id']}",
            "image": image
        }
    }
    if const.FCM_ICON:
        data["notification"]["icon"] = const.FCM_ICON
    else:
        data["notification"]["icon"] = utils.get_avatar(video_data['metadata']["channelURL"])
    data = json.dumps(data).encode()
    req = urllib.request.Request(url="https://fcm.googleapis.com/fcm/send", method="POST", data=data)
    req.add_header('Content-Type', 'application/json')
    req.add_header("Authorization", f"key={const.FCM_API_KEY}")
    try:
        return utils.urlopen(req)
    except:
        return
def gotcha(album_url):
    html = urlopen(album_url)
    name = re.findall(r'<span class="Title">(.*?)</span>', html)[0]
    desc = re.findall(r'<td .*? class="Description">(.*?)</td>', html, re.S)[0]
    # '歌手' = "artist"
    match = re.findall(r'歌手.*?(<a .*?>(.*?)</a>|<span.*?>(.*?)</span>)', desc, re.S)[0]
    singer = match[1] or match[2]
    # '出版时间' = "release date"; 年/月/日 = year/month/day
    y, m, d = re.findall(r'出版时间.*?(\d+)年(\d+)月(\d+)日', desc, re.S)[0]
    pub = datetime.date(*map(int, [y, m, d]))
    # '唱片公司' = "record label"
    company = re.findall(r'唱片公司:(.*?)$', desc, re.S)[0].strip()
    print '-' * 10
    print unescape(singer), unescape(name), pub, unescape(company)
    s = 0
    gid = None
    # '下载' = "download"
    for number, title, script in re.findall(r'<td class="number .*?>(.*?)</td>.*?<td class="Title .*?<a .*?>(.*?)</a>.*?<td class="Icon.*?<a .*?title="下载".*?onclick="(.*?)"', html, re.S):
        if s:
            print datetime.datetime.now(), 'take a rest ..', s
            time.sleep(s)
        m = re.findall(r'download.html(\?id%3D.*?)\\x26', script, re.S)
        if not m:
            print 'no download for this song'
            continue
        q = m[0]
        iframe = "http://www.google.cn/music/top100/musicdownload" + urllib.unquote(q)
        html = urlopen(iframe)
        url = re.findall(r'<a href="/music/top100/url\?q=(.*?)"', html, re.S)[0].split('&', 1)[0]
        url = urllib.unquote(url)
        url_hash = hashlib.md5(url).hexdigest().upper()
        fname = url2fname(url, url_hash)
        print number, unescape(title), url, fname
        b = time.time()
        if DEBUG:
            print 'pretend to fetch audio file ...'
        else:
            urlopen(url, fname)
        d = time.time() - b
        s = max(60*3.1 - d, 30)
        #s = 60
        m = Gotcha(number=number.strip('.'),
                   title=unescape(title),
                   singer=unescape(singer),
                   album=unescape(name),
                   pub=pub,
                   company=unescape(company),
                   url=url,
                   url_hash=url_hash,
                   path=fname.replace(DOWNLOAD_DIR, '').lstrip('/'),
                   start=datetime.datetime.fromtimestamp(b),
                   duration=d,
                   )
        if DEBUG:
            print 'pretend to save ', m
            gid = 0
        else:
            try:
                m.save()
                gid = m.id
            except IntegrityError:
                print >> sys.stderr, 'duplicate'
                s = 0
    return gid
def get_image(url):
    with utils.urlopen(url) as response:
        data = response.read()
    b64 = base64.b64encode(data).decode()
    return f"data:image/jpeg;base64,{b64}"
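# Minimal usage sketch for get_image (assumes the fetched resource really is
# a JPEG, since the data URI hard-codes image/jpeg; the URL is a placeholder):
thumb = get_image("https://example.com/thumb.jpg")
html_img = f'<img src="{thumb}" alt="thumbnail">'  # inlines the image, no extra request at render time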