Exemplo n.º 1
0
def fetch_albums(url):
    html = urlopen(url)

    found = re.findall(r'<td class="Title".*?<a href="/music/url\?q=(/music/album\?id%3D.*?)".*?>(.*?)</a>', html)
    print '# albums:', len(found), urllib.unquote(url)
    for link, title in found:
        link = 'http://www.google.cn'+link.split('&')[0]
        title = unescape(title)
        print urllib.unquote(link), '|', title

    found = re.findall(r'<td>.*?<a class="imglink" href="/music/url\?q=(.*?)"', html)
    pages = [ 'http://www.google.cn'+urllib.unquote(i.split('&amp;')[0]) for i in found ]

    cache[url] = True
    for page in pages:
        if page not in cache:
            cache[page] = False

    another_page = None
    for page, done in cache.iteritems():
        if not done:
            another_page = page
            break

    if another_page:
        fetch_albums(another_page)
Exemplo n.º 2
0
def get_m3u8(url):
    """Download ``url`` and return the HLS manifest URL embedded in the page.

    The manifest URL is the text that follows ``hlsManifestUrl":"`` up to the
    next double quote. If the page contains no such text, ``re.search``
    returns ``None`` and the ``.group`` call raises ``AttributeError``.
    """
    with utils.urlopen(url) as response:
        page = response.read().decode()

    manifest = re.search(r"hlsManifestUrl\":\"([^\"]+)", page)
    return manifest.group(1)
Exemplo n.º 3
0
    def _refresh_credentials(self, refresh_token):
        """Exchange ``refresh_token`` for a fresh access token.

        Posts a form-encoded ``refresh_token`` grant to ``self.TOKEN_URL``.

        Returns a dict with ``access_token`` and ``refresh_token``; when the
        response carries no new refresh token, the one passed in is kept.

        Raises ``Exception`` with the server-provided ``error_description`` or
        ``error`` when the request fails with an HTTP error, falling back to
        the text of the HTTP error itself.
        """
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}

        body = urllib.urlencode({
            'refresh_token': refresh_token,
            'client_id': self.CLIENT_ID,
            'client_secret': self.CLIENT_NOT_SO_SECRET,
            'grant_type': 'refresh_token'
        })

        request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)

        try:
            response = utils.urlopen(request)

        except urllib2.HTTPError as e:
            # The error body is not guaranteed to be a JSON object; a parse
            # failure must not hide the original HTTP error.
            try:
                error = json.load(e)
                msg = error.get('error_description') or error.get('error')

            except Exception:
                msg = None

            raise Exception(msg or str(e))

        data = json.load(response)

        return {
            'access_token': data['access_token'],
            'refresh_token': data.get('refresh_token', refresh_token)
        }
Exemplo n.º 4
0
def rescan_root(request, root):
    """Re-crawl the root whose URL is ``root``, registering it if unknown.

    If a root node with that URL exists, it is deleted and re-added while the
    spider is paused. Otherwise the URL is probed with ``urlopen`` and, when
    that succeeds, added as a new root and queued on the spider.

    Returns ``"success"``, or ``"failure"`` when probing/adding a new root
    raised. (The status used to be computed and then dropped.)
    """
    result = "success"
    for x in WebPath.get_root_nodes():
        if x.url == root:
            spider.pause()
            try:
                x.delete()
                spider.add(WebPath.add_root(root))
            finally:
                # Never leave the spider paused if the swap fails.
                spider.unpause()
            break
    else:
        # No existing root matched: only add the URL if it can be opened.
        try:
            urlopen(root)
            spider.add(WebPath.add_root(url=root))
        except Exception as e:
            logger.debug("don't like %s %s %s", root, e, request.META)
            result = "failure"
    return result
Exemplo n.º 5
0
    def _request_credentials(self, authorization_key):
        """Exchange ``authorization_key`` for access and refresh tokens.

        Posts a form-encoded ``authorization_code`` grant (with the
        out-of-band redirect URI and ``self.SCOPE``) to ``self.TOKEN_URL``.

        Returns a dict with ``access_token`` and ``refresh_token`` taken from
        the JSON response.

        Raises ``Exception`` with the server-provided ``error_description`` or
        ``error`` when the request fails with an HTTP error, falling back to
        the text of the HTTP error itself.
        """
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}

        body = urllib.urlencode({
            'code': authorization_key,
            'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob',
            'client_id': self.CLIENT_ID,
            'client_secret': self.CLIENT_NOT_SO_SECRET,
            'scope': self.SCOPE,
            'grant_type': 'authorization_code'
        })

        request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)

        try:
            response = utils.urlopen(request)

        except urllib2.HTTPError as e:
            # The error body is not guaranteed to be a JSON object; a parse
            # failure must not hide the original HTTP error.
            try:
                error = json.load(e)
                msg = error.get('error_description') or error.get('error')

            except Exception:
                msg = None

            raise Exception(msg or str(e))

        data = json.load(response)

        return {
            'access_token': data['access_token'],
            'refresh_token': data['refresh_token']
        }
Exemplo n.º 6
0
    def _request_credentials(self, authorization_key):
        """Exchange ``authorization_key`` for an access token.

        Posts a form-encoded ``authorization_code`` grant to
        ``self.TOKEN_URL``.

        Returns a dict holding only ``access_token`` from the JSON response.

        Raises ``Exception`` with the server-provided ``error_description`` or
        ``error`` when the request fails with an HTTP error, falling back to
        the text of the HTTP error itself.
        """
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded'
        }

        body = urllib.urlencode({
            'code': authorization_key,
            'client_id': self.CLIENT_ID,
            'client_secret': self.CLIENT_NOT_SO_SECRET,
            'grant_type': 'authorization_code'
        })

        request = urllib2.Request(self.TOKEN_URL, data=body, headers=headers)

        try:
            response = utils.urlopen(request)

        except urllib2.HTTPError as e:
            # The error body is not guaranteed to be a JSON object; a parse
            # failure must not hide the original HTTP error.
            try:
                error = json.load(e)
                msg = error.get('error_description') or error.get('error')

            except Exception:
                msg = None

            raise Exception(msg or str(e))

        data = json.load(response)

        return {
            'access_token': data['access_token']
        }
Exemplo n.º 7
0
def main(parser, args):
    """Call the webhook described by the parsed command-line options.

    ``options.method`` selects how the request is built:

    * ``POST``  - empty ``text/plain`` body, URL used as given;
    * ``POSTf`` - the URL's query string is sent as a form-encoded body;
    * ``POSTj`` - the URL's query string is sent as a JSON object (first
      value of each parameter);
    * anything else - a plain request without a body.

    For ``POSTf`` and ``POSTj`` the query string is stripped from the URL.
    Failures are logged, never raised.
    """
    import meyectl
    import utils

    options = parse_options(parser, args)

    meyectl.configure_logging('webhook', options.log_to_file)
    meyectl.configure_tornado()

    logging.debug('hello!')
    logging.debug('method = %s', options.method)
    logging.debug('url = %s', options.url)

    method = options.method
    target = options.url
    query = urlparse.urlparse(options.url).query
    headers = {}
    payload = None

    if method == 'POST':
        headers['Content-Type'] = 'text/plain'
        payload = ''

    elif method == 'POSTf':
        # form url-encoded: the query string becomes the body
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        payload = query
        target = options.url.split('?')[0]

    elif method == 'POSTj':
        # json: keep only the first value of every query parameter
        headers['Content-Type'] = 'application/json'
        fields = urlparse.parse_qs(query)
        payload = json.dumps(dict((k, v[0]) for k, v in fields.iteritems()))
        target = options.url.split('?')[0]

    request = urllib2.Request(target, payload, headers=headers)
    try:
        utils.urlopen(request, timeout=settings.REMOTE_REQUEST_TIMEOUT)

    except Exception as e:
        logging.error('failed to call webhook: %s' % e)

    else:
        logging.debug('webhook successfully called')

    logging.debug('bye!')
Exemplo n.º 8
0
def main(parser, args):
    """Call the webhook described by the parsed command-line options.

    ``options.method`` selects how the request is built:

    * ``POST``  - empty ``text/plain`` body, URL used as given;
    * ``POSTf`` - the URL's query string is sent as a form-encoded body;
    * ``POSTj`` - the URL's query string is sent as a JSON object (first
      value of each parameter);
    * anything else - a plain GET without a body.

    For ``POSTf`` and ``POSTj`` the query string is stripped from the URL.
    Failures are logged, never raised.

    Request bodies are encoded to ``bytes``: ``urllib.request`` rejects a
    ``str`` body with ``TypeError``, which made every POST variant fail.
    NOTE(review): this assumes ``utils.urlopen`` forwards the request to
    ``urllib.request.urlopen`` - confirm.
    """
    import meyectl
    import utils

    options = parse_options(parser, args)

    meyectl.configure_logging('webhook', options.log_to_file)
    meyectl.configure_tornado()

    logging.debug('hello!')
    logging.debug('method = %s' % options.method)
    logging.debug('url = %s' % options.url)

    headers = {}
    parts = urllib.parse.urlparse(options.url)
    url = options.url
    data = None

    if options.method == 'POST':
        headers['Content-Type'] = 'text/plain'
        data = b''

    elif options.method == 'POSTf':  # form url-encoded
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        data = parts.query.encode()
        url = options.url.split('?')[0]

    elif options.method == 'POSTj':  # json
        headers['Content-Type'] = 'application/json'
        fields = urllib.parse.parse_qs(parts.query)
        # parse_qs yields a list per key; only the first value is sent.
        data = json.dumps({k: v[0] for (k, v) in fields.items()}).encode()
        url = options.url.split('?')[0]

    else:  # GET
        pass

    request = urllib.request.Request(url, data, headers=headers)
    try:
        utils.urlopen(request, timeout=settings.REMOTE_REQUEST_TIMEOUT)
        logging.debug('webhook successfully called')

    except Exception as e:
        logging.error('failed to call webhook: %s' % e)

    logging.debug('bye!')
Exemplo n.º 9
0
def rescan_root(request, root):
    result = "success"
    for x in WebPath.get_root_nodes():
        if x.url == root:
            spider.pause()
            x.delete()
            spider.add(WebPath.add_root(root))
            spider.unpause()
            break
    else:
        try:
            urlopen(root)
            spider.add(WebPath.add_root(url=root))
        except Exception, e:
            print "don't like", root, e
            print request.META
            result = "failure"
Exemplo n.º 10
0
    def _request(self, url, body=None, headers=None, retry_auth=True):
        """Perform an authorized request to ``url`` and return the raw response body.

        Credentials are requested (and saved) first when none are held yet;
        this needs ``self._authorization_key``. The access token is sent as a
        ``Bearer`` ``Authorization`` header.

        On HTTP 401, and only while ``retry_auth`` is true, the credentials
        are refreshed and saved, and the request is retried once.

        Args:
            url: address to request.
            body: optional request payload.
            headers: optional extra headers; the dict is copied, never
                modified.
            retry_auth: whether a 401 may trigger a refresh-and-retry.

        Raises:
            Exception: when the authorization key is missing, when obtaining
                or refreshing credentials fails (original error re-raised),
                or when the request fails. For HTTP errors the message is the
                server's ``error.message`` if the body provides one, else the
                text of the HTTP error.
        """
        if not self._credentials:
            if not self._authorization_key:
                msg = 'missing authorization key'
                self.error(msg)
                raise Exception(msg)

            self.debug('requesting credentials')
            try:
                self._credentials = self._request_credentials(
                    self._authorization_key)
                self.save()

            except Exception as e:
                self.error('failed to obtain credentials: %s' % e)
                raise

        # Copy so the caller's dict never gains an Authorization header.
        headers = dict(headers or {})
        headers[
            'Authorization'] = 'Bearer %s' % self._credentials['access_token']

        self.debug('requesting %s' % url)
        request = urllib2.Request(url, data=body, headers=headers)
        try:
            response = utils.urlopen(request)

        except urllib2.HTTPError as e:
            if e.code == 401 and retry_auth:  # unauthorized, access token may have expired
                try:
                    self.debug(
                        'credentials have probably expired, refreshing them')
                    self._credentials = self._refresh_credentials(
                        self._credentials['refresh_token'])
                    self.save()

                    # retry the request with refreshed credentials
                    return self._request(url, body, headers, retry_auth=False)

                except Exception:
                    self.error('refreshing credentials failed')
                    raise

            else:
                # Keep ``e`` bound to the HTTP error so the fallback message
                # describes it rather than a half-parsed JSON body.
                try:
                    msg = json.load(e)['error']['message']

                except Exception:
                    msg = str(e)

                self.error('request failed: %s' % msg)
                raise Exception(msg)

        except Exception as e:
            self.error('request failed: %s' % e)
            raise

        return response.read()
Exemplo n.º 11
0
def get(url):
    try:
        print url
        gen_log.info("GET: URL[%s]", url)
        return utils.urlopen(url).read()
    except:
        gen_log.info(traceback.format_exc())
        print traceback.format_exc()
        return None
Exemplo n.º 12
0
def build_req(video_id):
    """Open the YouTube watch page of ``video_id`` with a desktop browser User-Agent.

    Returns whatever ``utils.urlopen`` returns for the request.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    }
    request = urllib.request.Request(
        f"https://www.youtube.com/watch?v={video_id}",
        headers=browser_headers,
    )
    return utils.urlopen(request)
Exemplo n.º 13
0
def get_youtube_id(url):
    """Return the YouTube video id that ``url`` refers to.

    The id is first taken from the URL itself (``youtu.be/<id>``,
    ``watch?v=<id>``, ``embed/<id>``, ``v/<id>``, ...). When the URL matches
    none of these shapes, the page is downloaded and the id is read from its
    ``<meta itemprop="videoId">`` tag.

    Raises ``AttributeError`` when the downloaded page has no such tag.
    """
    # An explicit "no match" check replaces the former bare ``except:``, which
    # also swallowed KeyboardInterrupt/SystemExit and unrelated errors.
    match = re.search(r'^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*', url)
    if match is not None:
        return match.group(1)

    with utils.urlopen(url) as response:
        html_raw = response.read().decode()
        regex = r'<meta itemprop="videoId" content="(.+?)">'
        result = re.search(regex, html_raw).group(1)

        return result
Exemplo n.º 14
0
 def processItem(self,item):
     """Download ``item.url`` into the cache folder and mark the item as cached.

     The cache file is named after ``item.hash()``. When the download raises
     ``URLError`` the item is flagged as failed and saved instead.
     """
     digest = item.hash()
     from cache import cacheFolder, cached
     cacheFile = join(cacheFolder, digest)
     try:
         data = urlopen(item.url).read()
         # Close the file deterministically instead of leaking the handle.
         with open(cacheFile, "wb") as out:
             out.write(data)
         cached(item)
     except URLError:
         item.failed = True
         item.save()
Exemplo n.º 15
0
 def processItem(self, item):
     """Download ``item.url`` into the cache folder and mark the item as cached.

     The cache file is named after ``item.hash()``. When the download raises
     ``URLError`` the item is flagged as failed and saved instead.
     """
     digest = item.hash()
     from cache import cacheFolder, cached
     cacheFile = join(cacheFolder, digest)
     try:
         data = urlopen(item.url).read()
         # Close the file deterministically instead of leaking the handle.
         with open(cacheFile, "wb") as out:
             out.write(data)
         cached(item)
     except URLError:
         item.failed = True
         item.save()
Exemplo n.º 16
0
    def _request(self, url, body=None, headers=None, retry_auth=True):
        """Perform an authorized request to ``url`` and return the raw response body.

        Credentials are requested (and saved) first when none are held yet;
        this needs ``self._authorization_key``. The access token is sent as a
        ``Bearer`` ``Authorization`` header.

        On HTTP 401, and only while ``retry_auth`` is true, the credentials
        are refreshed and saved, and the request is retried once.

        Args:
            url: address to request.
            body: optional request payload.
            headers: optional extra headers; the dict is copied, never
                modified.
            retry_auth: whether a 401 may trigger a refresh-and-retry.

        Raises:
            Exception: when the authorization key is missing, when obtaining
                or refreshing credentials fails (original error re-raised),
                or when the request fails. For HTTP errors the message is the
                server's ``error.message`` if the body provides one, else the
                text of the HTTP error.
        """
        if not self._credentials:
            if not self._authorization_key:
                msg = 'missing authorization key'
                self.error(msg)
                raise Exception(msg)

            self.debug('requesting credentials')
            try:
                self._credentials = self._request_credentials(self._authorization_key)
                self.save()

            except Exception as e:
                self.error('failed to obtain credentials: %s' % e)
                raise

        # Copy so the caller's dict never gains an Authorization header.
        headers = dict(headers or {})
        headers['Authorization'] = 'Bearer %s' % self._credentials['access_token']

        self.debug('requesting %s' % url)
        request = urllib2.Request(url, data=body, headers=headers)
        try:
            response = utils.urlopen(request)

        except urllib2.HTTPError as e:
            if e.code == 401 and retry_auth: # unauthorized, access token may have expired
                try:
                    self.debug('credentials have probably expired, refreshing them')
                    self._credentials = self._refresh_credentials(self._credentials['refresh_token'])
                    self.save()

                    # retry the request with refreshed credentials
                    return self._request(url, body, headers, retry_auth=False)

                except Exception:
                    self.error('refreshing credentials failed')
                    raise

            else:
                # Keep ``e`` bound to the HTTP error so the fallback message
                # describes it rather than a half-parsed JSON body.
                try:
                    msg = json.load(e)['error']['message']

                except Exception:
                    msg = str(e)

                self.error('request failed: %s' % msg)
                raise Exception(msg)

        except Exception as e:
            self.error('request failed: %s' % e)
            raise

        return response.read()
Exemplo n.º 17
0
def artist(url):
    html = urlopen(url)
    
    found = re.findall(r'<a href="/music/url\?q=(/music/album\?.*?)&.*?>(.*?)</a>',
                       html.split('所有专辑', 1)[1])
    albums = dict(found)
    artist = trim_title(html)
    print artist, 'albums', len(albums)
    
    for href, title in sorted(albums.items(), lambda i,j: cmp(i[1],j[1])):
        url = 'http://www.google.cn%s' % urllib.unquote(href)
        print '%s |%s' % (url, unescape(title))
Exemplo n.º 18
0
def get_json(video_url, file=None):
    """Collect the preferred video and audio stream URLs of a YouTube page.

    The page at ``video_url`` is downloaded with a desktop browser
    User-Agent and scanned for ``"itag":<n>,"url":"<url>"`` pairs. For video
    and for audio, the first itag of ``PRIORITY["VIDEO"]`` /
    ``PRIORITY["AUDIO"]`` present on the page is kept, with ``\\u0026``
    sequences in its URL turned into ``&``.

    Returns a dict with ``video`` and ``audio`` (``{itag: url}`` or ``None``),
    ``metadata``, ``version`` and a UTC ISO-8601 ``createTime``. A warning
    and the scanned pairs are emitted when either stream is missing. When
    ``file`` is given, the dict is also written there as indented UTF-8 JSON.
    """
    video_id = get_youtube_id(video_url)

    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    }
    info_req = urllib.request.Request(video_url, headers=browser_headers)
    with utils.urlopen(info_req) as response:
        data = response.read().decode()

    # itag (as text) -> stream URL; a later duplicate itag wins.
    match = dict(re.findall(r'"itag":(\d+),"url":"([^"]+)"', data))

    best = {
        "video": None,
        "audio": None,
        "metadata": get_youtube_video_info(video_id, data),
        "version": VERSION,
        "createTime": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    }

    for kind in ("video", "audio"):
        for itag in map(str, PRIORITY[kind.upper()]):
            if itag in match:
                best[kind] = {itag: match[itag].replace("\\u0026", "\u0026")}
                break

    no_video = best["video"] is None
    no_audio = best["audio"] is None
    if no_video:
        utils.warn(f" {video_id} got empty video sources.")
    if no_audio:
        utils.warn(f" {video_id} got empty audio sources.")
    if no_video or no_audio:
        utils.warn(" Printng match...")
        print(match)

    if file is not None:
        with open(file, "w", encoding="utf8") as f:
            json.dump(best, f, indent=4, ensure_ascii=False)
    return best
Exemplo n.º 19
0
    def processItem(self, current):
        """Crawl one path node: fetch its page and register what it links to.

        Links ending in "/" are treated as directories and added as children
        of ``current`` (and queued via ``self.add``) unless a ``WebPath`` with
        that URL already exists. Other links are stored as ``MusicFile`` rows
        when their extension is in ``known_extensions`` and no row with that
        URL exists yet. On success ``current`` is marked as checked.

        ``current`` is deleted when its root no longer exists or when an
        ``ObjectDoesNotExist`` is raised while processing; it is marked as
        failed when its URL cannot be opened.
        """
        try:
            # Only the attribute access matters: it raises when the root is gone.
            test = current.root
        except WebPath.DoesNotExist:
            print "skipping, as no more root!"
            current.delete()
            return

        try:
            page = urlopen(current.url)
        except URLError:
            print "fail", current.url
            current.failed = True
            current.save()
            return

        try:
            # Resolve links against the URL reported by the response.
            url = page.geturl()
            soup = BeautifulSoup(page)
        
            for link in soup.findAll("a"):
                try:
                    resolved = urljoin(url, link["href"])
                except KeyError:
                    print "skipping due to lack of href", link
                    continue
                if len(resolved) < len(url): # shorter than the page URL: treated as an "up" link, skip
                    print "skipping",resolved, url
                    continue
                if resolved[-1] == "/": # directory
                    if WebPath.objects.filter(url=resolved).count() == 0:
                        child = current.add_child(url=resolved)
                        self.add(child)
                else: # file?
                    (_, ext) = splitext(resolved)
                    ext = ext.lower()
                    if ext in known_extensions:
                        if MusicFile.objects.filter(url=resolved).count() == 0:
                            mf = MusicFile(parent=current, url = resolved)
                            mf.save()
                    else:
                        print "Can't handle", resolved, ext, len(ext)

            current.checked = True
            current.save()
        except ObjectDoesNotExist:
            # we got deleted
            current.delete()
Exemplo n.º 20
0
Arquivo: spider.py Projeto: Ferada/nih
    def processItem(self, current):
        """Crawl one path node: fetch its page and register what it links to.

        Links ending in "/" are treated as directories and added as children
        of ``current`` (and queued via ``self.add``) unless a ``WebPath`` with
        that URL already exists. Other links are stored as ``MusicFile`` rows
        when their extension is in ``known_extensions`` and no row with that
        URL exists yet. On success ``current`` is marked as checked.

        ``current`` is deleted when its root no longer exists or when an
        ``ObjectDoesNotExist`` is raised while processing; it is marked as
        failed when its URL cannot be opened.
        """
        try:
            # Only the attribute access matters: it raises when the root is gone.
            current.root
        except WebPath.DoesNotExist:
            logger.debug("skipping, as no more root!")
            current.delete()
            return

        try:
            page = urlopen(current.url)
        except URLError:
            # The message needs a placeholder for its argument; without one
            # the logging module fails to format the record.
            logger.debug("fail %s", current.url)
            current.failed = True
            current.save()
            return

        try:
            # Resolve links against the URL reported by the response.
            url = page.geturl()
            soup = BeautifulSoup(page)

            for link in soup.findAll("a"):
                try:
                    resolved = urljoin(url, link["href"])
                except KeyError:
                    logger.debug("skipping due to lack of href %s", link)
                    continue
                if len(resolved) < len(url):  # up link, skip
                    logger.debug("skipping %s %s", resolved, url)
                    continue
                if resolved[-1] == "/":  # directory
                    if WebPath.objects.filter(url=resolved).count() == 0:
                        child = current.add_child(url=resolved)
                        self.add(child)
                else:  # file?
                    (_, ext) = splitext(resolved)
                    ext = ext.lower()
                    if ext in known_extensions:
                        if MusicFile.objects.filter(url=resolved).count() == 0:
                            mf = MusicFile(parent=current, url=resolved)
                            mf.save()
                    else:
                        logger.debug("can't handle %s %s %s", resolved, ext, len(ext))

            current.checked = True
            current.save()
        except ObjectDoesNotExist:
            # we got deleted
            current.delete()
Exemplo n.º 21
0
def get_response(operation_name, data, encoding, **headers):
    """Send ``data`` to the configured "product" endpoint and return the raw reply.

    The request carries the operation name, the configured application name
    and the desired response format as ``X-EBAY-SOA-*`` headers. Extra
    keyword arguments are added as headers and override the defaults.
    """
    config = get_config_store()
    app_name = config.get("keys", "app_name")
    endpoint = config.get("endpoints", "product")

    defaults = {
        "X-EBAY-SOA-OPERATION-NAME": operation_name,
        "X-EBAY-SOA-SECURITY-APPNAME": app_name,
        "X-EBAY-SOA-RESPONSE-DATA-FORMAT": encoding,
    }
    http_headers = dict(defaults, **headers)

    request = Request(endpoint, data, http_headers)
    return urlopen(request).read()
Exemplo n.º 22
0
def get_response(operation_name, data, encoding, **headers):
    """Send ``data`` to the configured "best_match" endpoint and return the raw reply.

    The request carries the operation name, the configured auth token and
    the desired response format as ``X-EBAY-SOA-*`` headers. Extra keyword
    arguments are added as headers and override the defaults.
    """
    config = get_config_store()
    access_token = config.get("auth", "token")
    endpoint = config.get("endpoints", "best_match")

    defaults = {
        "X-EBAY-SOA-OPERATION-NAME": operation_name,
        "X-EBAY-SOA-SECURITY-TOKEN": access_token,
        "X-EBAY-SOA-RESPONSE-DATA-FORMAT": encoding,
    }
    http_headers = dict(defaults, **headers)

    request = Request(endpoint, data, http_headers)
    return urlopen(request).read()
Exemplo n.º 23
0
def getHtml(url, data, headers):
    """Send ``data`` to ``url`` and return the response body.

    A fixed ``Referer`` header and a ``Content-Length`` header are added to
    the request. A response whose ``Content-Encoding`` is ``gzip`` is
    decompressed before being returned. After a successful read the cookie
    jar ``utils.cj`` is saved to ``cookiePath`` on a best-effort basis.

    On ``urllib2.HTTPError`` the status code and error body are printed and
    nothing is raised.
    NOTE(review): in that case the value returned is the ``data`` argument,
    i.e. the request payload, because the local was never reassigned -
    confirm that callers expect this.
    """
    req = utils.Request(url, data, headers)
    req.add_header('Referer', 'http://www.porndig.com/videos/')
    # Debug output of the outgoing payload and its length.
    print data
    print len(data)
    req.add_header('Content-Length', len(data))
    try:
        response = utils.urlopen(req)
        if response.info().get('Content-Encoding') == 'gzip':
            # Compressed body: wrap it in a file-like buffer and gunzip it.
            buf = StringIO( response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
            f.close()
        else:
            data = response.read()
        # Saving cookies is best-effort: any failure is ignored.
        try: utils.cj.save(cookiePath)
        except: pass
        response.close()
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
    return data
Exemplo n.º 24
0
def onlive(video_data):
    """Send a PushAlert notification announcing that a video went live.

    Title and message come from the ``text.PUSHALERT_*`` templates formatted
    with ``video_data["metadata"]``; the notification links to the video's
    watch page. The icon is ``const.PUSHALERT_ICON`` when set, otherwise the
    channel avatar.

    Returns the response object, or ``None`` when sending failed. Sending is
    best-effort, but only ordinary errors are suppressed: the former bare
    ``except:`` also swallowed ``KeyboardInterrupt`` and ``SystemExit``.
    """
    data = {
        'title': text.PUSHALERT_TITLE.format(**video_data["metadata"]),
        "message": text.PUSHALERT_MESSAGE.format(**video_data["metadata"]),
        "url":
        f"https://www.youtube.com/watch?v={video_data['metadata']['id']}"
    }
    if const.PUSHALERT_ICON:
        data["icon"] = const.PUSHALERT_ICON
    else:
        data["icon"] = utils.get_avatar(video_data['metadata']["channelURL"])

    data = urllib.parse.urlencode(data).encode()

    req = urllib.request.Request(url="https://api.pushalert.co/rest/v1/send",
                                 data=data)
    req.add_header("Authorization", f"api_key={const.PUSHALERT_API_KEY}")

    try:
        return utils.urlopen(req)
    except Exception:
        return None
Exemplo n.º 25
0
def onlive(video_data):
    """Send an FCM notification announcing that a video went live.

    Title and body come from the ``text.FCM_*`` templates formatted with
    ``video_data["metadata"]``; clicking the notification opens the video's
    watch page. The image is the metadata's ``thumbnailUrl`` when present,
    otherwise ``thumbnail``. The icon is ``const.FCM_ICON`` when set,
    otherwise the channel avatar.

    Returns the response object, or ``None`` when sending failed. Sending is
    best-effort, but only ordinary errors are suppressed: the former bare
    ``except:`` also swallowed ``KeyboardInterrupt`` and ``SystemExit``.
    """
    if "thumbnailUrl" in video_data['metadata']:
        image = video_data['metadata']["thumbnailUrl"]
    else:
        image = video_data['metadata']["thumbnail"]

    data = {
        "to": const.FCM_TARGET,
        "validateOnly": False,
        "notification": {
            'title': text.FCM_TITLE.format(**video_data["metadata"]),
            "body": text.FCM_MESSAGE.format(**video_data["metadata"]),
            "click_action":
            f"https://www.youtube.com/watch?v={video_data['metadata']['id']}",
            "image": image
        }
    }

    if const.FCM_ICON:
        data["notification"]["icon"] = const.FCM_ICON
    else:
        data["notification"]["icon"] = utils.get_avatar(
            video_data['metadata']["channelURL"])

    data = json.dumps(data).encode()

    req = urllib.request.Request(url="https://fcm.googleapis.com/fcm/send",
                                 method="POST",
                                 data=data)
    req.add_header('Content-Type', 'application/json')

    req.add_header("Authorization", f"key={const.FCM_API_KEY}")

    try:
        return utils.urlopen(req)
    except Exception:
        return None
Exemplo n.º 26
0
def gotcha(album_url):
    """Download every song of the album at ``album_url`` and record each one.

    The album page is parsed for its title, singer, publication date and
    record company. For every track row with a download link, the download
    page is fetched, the audio URL extracted, the file fetched to a name
    derived from the URL, and a ``Gotcha`` record saved. Between tracks the
    function sleeps to throttle requests.

    With ``DEBUG`` set, neither the audio download nor the save happens.

    Returns the id of the last record saved (``0`` in ``DEBUG`` mode), or
    ``None`` if nothing was saved. An index error is raised when the page
    lacks one of the expected fields.

    ``urlopen`` is a project helper: called with one argument it is expected
    to return the page text, called with two it receives a target file name.
    """
    html = urlopen(album_url)
    name = re.findall(r'<span class="Title">(.*?)</span>', html)[0]
    desc = re.findall(r'<td .*? class="Description">(.*?)</td>', html, re.S)[0]
    # '歌手' = "singer": the name is either inside a link or inside a span.
    match = re.findall(r'歌手.*?(<a .*?>(.*?)</a>|<span.*?>(.*?)</span>)', desc, re.S)[0]
    singer = match[1] or match[2]
    # '出版时间' = "publication time"; the entities are the year/month/day characters.
    y,m,d = re.findall(r'出版时间.*?(\d+)&#24180;(\d+)&#26376;(\d+)&#26085;', desc, re.S)[0]
    pub = datetime.date(*map(int, [y,m,d]))
    # '唱片公司' = "record company".
    company = re.findall(r'唱片公司:(.*?)$', desc, re.S)[0].strip()
    print '-'*10
    print unescape(singer), unescape(name), pub, unescape(company)

    # s: seconds to sleep before handling the next track (0 = no pause).
    s = 0
    gid = None
    # One tuple per track row: number, title and the onclick script of its
    # download ('下载') icon. The rows are extracted before ``html`` is
    # reassigned inside the loop.
    for number, title, script in re.findall(r'<td class="number .*?>(.*?)</td>.*?<td class="Title .*?<a .*?>(.*?)</a>.*?<td class="Icon.*?<a .*?title="下载".*?onclick="(.*?)"', html, re.S):
        if s:
            print datetime.datetime.now(), 'take a rest ..', s
            time.sleep(s)
            
        # ``m`` is reused below: first the regex result, later the Gotcha record.
        m = re.findall(r'download.html(\?id%3D.*?)\\x26', script, re.S)
        if not m:
            print 'no download for this song'
            continue
        q = m[0]
        iframe = "http://www.google.cn/music/top100/musicdownload" + urllib.unquote(q)
        
        html = urlopen(iframe)
        url = re.findall(r'<a href="/music/top100/url\?q=(.*?)"', html, re.S)[0].split('&', 1)[0]
        url = urllib.unquote(url)
        url_hash = hashlib.md5(url).hexdigest().upper()
        fname = url2fname(url, url_hash)
        
        print number, unescape(title), url, fname
        b = time.time()
        if DEBUG:
            print 'pretend to fetch audio file ...'
        else:
            urlopen(url, fname)
        # ``d`` is reused: from here on it is the download duration in seconds.
        d = time.time() - b
        # Wait out the rest of a 186-second slot, but never less than 30 seconds.
        s = max(60*3.1 - d, 30)
        #s = 60

        m = Gotcha(number=number.strip('.'),
                   title=unescape(title),
                   singer=unescape(singer),
                   album=unescape(name),
                   pub=pub,
                   company=unescape(company),
                   url=url,
                   url_hash=url_hash,
                   path=fname.replace(DOWNLOAD_DIR, '').lstrip('/'),
                   start=datetime.datetime.fromtimestamp(b),
                   duration=d,
                   )
        if DEBUG:
            print 'pretend to save ', m
            gid = 0
        else:
            try:
                m.save()
                gid = m.id
            except IntegrityError:
                # Already recorded: skip the pause before the next track.
                print >> sys.stderr, 'duplicate'
                s = 0
    return gid
Exemplo n.º 27
0
def get_image(url):
    """Download ``url`` and return its content as a base64 ``data:`` URI.

    The URI is always labelled ``image/jpeg``, whatever the downloaded
    content actually is.
    """
    with utils.urlopen(url) as response:
        raw = response.read()

    b64 = base64.b64encode(raw).decode()
    return f"data:image/jpeg;base64,{b64}"