def archive_files(yga, subdir=None):
    logger = logging.getLogger(name="archive_files")
    try:
        if subdir:
            file_json = yga.files(sfpath=subdir)
        else:
            file_json = yga.files()
    except Exception:
        logger.error("Couldn't access Files functionality for this group")
        return

    with open('fileinfo.json', 'wb') as f:
        json.dump(file_json['dirEntries'], codecs.getwriter('utf-8')(f),
                  ensure_ascii=False, indent=4)

    n = 0
    sz = len(file_json['dirEntries'])
    for path in file_json['dirEntries']:
        n += 1
        if path['type'] == 0:
            # Regular file
            name = html_unescape(path['fileName'])
            logger.info("Fetching file '%s' (%d/%d)", name, n, sz)
            with open(sanitise_file_name(name), 'wb') as f:
                yga.download_file(path['downloadURL'], f)
        elif path['type'] == 1:
            # Directory
            name = html_unescape(path['fileName'])
            logger.info("Fetching directory '%s' (%d/%d)", name, n, sz)
            with Mkchdir(name):
                pathURI = unquote(path['pathURI'])
                archive_files(yga, subdir=pathURI)

def archive_photos(yga):
    logger = logging.getLogger(name="archive_photos")
    try:
        nb_albums = yga.albums(count=5)['total'] + 1
    except Exception:
        logger.error("Couldn't access Photos functionality for this group")
        return
    albums = yga.albums(count=nb_albums)
    n = 0

    with open('albums.json', 'wb') as f:
        json.dump(albums['albums'], codecs.getwriter('utf-8')(f),
                  ensure_ascii=False, indent=4)

    for a in albums['albums']:
        n += 1
        name = html_unescape(a['albumName'])
        # Yahoo sometimes has an off-by-one error in the album count...
        logger.info("Fetching album '%s' (%d/%d)", name, n, albums['total'])

        folder = "%d-%s" % (a['albumId'], name)
        with Mkchdir(folder):
            photos = yga.albums(a['albumId'])
            pages = int(photos['total'] / 100 + 1)
            p = 0

            for page in range(pages):
                photos = yga.albums(a['albumId'], start=page * 100, count=100)
                with open('photos-%d.json' % page, 'wb') as f:
                    json.dump(photos['photos'], codecs.getwriter('utf-8')(f),
                              ensure_ascii=False, indent=4)

                for photo in photos['photos']:
                    p += 1
                    pname = html_unescape(photo['photoName'])
                    logger.info("Fetching photo '%s' (%d/%d)", pname, p, photos['total'])

                    photoinfo = get_best_photoinfo(photo['photoInfo'])
                    fname = "%d-%s.jpg" % (photo['photoId'], pname)
                    with open(sanitise_file_name(fname), 'wb') as f:
                        for i in range(TRIES):
                            try:
                                yga.download_file(photoinfo['displayURL'], f)
                                break
                            except requests.exceptions.HTTPError as err:
                                logger.error("HTTP error (sleeping before retry, try %d): %s", i, err)
                                time.sleep(HOLDOFF)

def archive_photos(yga):
    logger = logging.getLogger(name="archive_photos")
    try:
        nb_albums = yga.albums(count=5)['total'] + 1
    except Exception:
        logger.error("Couldn't access Photos functionality for this group")
        return
    albums = yga.albums(count=nb_albums)
    n = 0

    with open('albums.json', 'wb') as f:
        json.dump(albums['albums'], codecs.getwriter('utf-8')(f),
                  ensure_ascii=False, indent=4)

    for a in albums['albums']:
        n += 1
        name = html_unescape(a['albumName'])
        # Yahoo sometimes has an off-by-one error in the album count...
        logger.info("Fetching album '%s' (%d/%d)", name, n, albums['total'])

        folder = "%d-%s" % (a['albumId'], name)
        with Mkchdir(folder):
            photos = yga.albums(a['albumId'])
            pages = int(photos['total'] / 100 + 1)
            p = 0

            for page in range(pages):
                photos = yga.albums(a['albumId'], start=page * 100, count=100)
                with open('photos-%d.json' % page, 'wb') as f:
                    json.dump(photos['photos'], codecs.getwriter('utf-8')(f),
                              ensure_ascii=False, indent=4)

                for photo in photos['photos']:
                    p += 1
                    pname = html_unescape(photo['photoName'])
                    fname = sanitise_file_name("%d-%s.jpg" % (photo['photoId'], pname))
                    if file_keep(fname, hacky_vars['file'], "photo: %s" % (fname,)) is False:
                        logger.info("Fetching photo '%s' (%d/%d)", pname, p, photos['total'])
                        with open(fname, 'wb') as f:
                            process_single_photo(yga, photo['photoInfo'], f)
                        set_mtime(fname, photo['creationDate'])

        set_mtime(sanitise_folder_name(folder), a['modificationDate'])

async def work(worker_id: int):
    i = 0
    async with ClientSession(cookie_jar=session.cookie_jar) as worker_session:
        while True:
            request_options = dict(
                url=f'{origin}/owa/?ae=Dialog&t=AddressBook&ctx=1',
                data={
                    **form_data,
                    'hidpg': worker_id + i * num_concurrent
                },
                headers={'Connection': 'Keep-Alive'})

            async with worker_session.post(**request_options) as response:
                data = (await response.content.read()).decode()

            parsed_html_bs: BeautifulSoup = BeautifulSoup(
                markup=html_unescape(data), features='html.parser')

            page_names = {
                td.text.rstrip()
                for td in parsed_html_bs.select(
                    selector='table.lvw > tr:nth-child(n+4) > td:nth-child(3)')
                if td.text.rstrip()
            }

            if len(page_names) == 0:
                break

            all_names.update(page_names)
            i += 1

def cleanup_ywh_redirects_from_html(
    ywh_domain: str,
    html: str,
) -> str:
    """
    Replace YesWeHack redirects with real URLs.

    Args:
        ywh_domain: a base domain of the YWH redirects
        html: an html

    Returns:
        the cleaned html
    """
    redirect_base_re = re.escape(f'{ywh_domain}/redirect?url=')
    pattern = re.compile(f'"(https?://{redirect_base_re}[^ "]*)"')
    redirect_urls = pattern.findall(html)
    for redirect_url in redirect_urls:
        real_url = _extract_real_url_from_redirect(
            redirect_url=html_unescape(redirect_url),
        )
        html = html.replace(
            redirect_url,
            html_escape(real_url or ''),
        )
    return html

def sanitize(text: str):
    """
    Sanitize text removing html encoded parts, double spaces and so on
    """
    if text is None:
        return None
    text = html_unescape(html_unescape(text))
    for key, value in replaces.items():
        text = text.replace(key, value)
    text = re.sub(r'[ \t]+', ' ', text).strip()
    return text

async def get_account_identity_from_ui(
        session: ClientSession,
        origin: str,
        **extra_request_keywords) -> Dict[str, str]:
    """
    :param session: An authenticated `aiohttp.ClientSession`.
    :param origin: The origin part of the URL of the OWA.
    :return:
    """

    # TODO: I would like something like Javascript Promises instead of this pattern.
    json_results = None

    def find_json_results(node, _) -> None:
        if node.type == 'Property' and node.key.value == 'JsonResults':
            nonlocal json_results
            json_results = json_loads(node.value.value)

    async with session.get(url=f'{origin}/ecp/PersonalSettings/HomePage.aspx',
                           **extra_request_keywords) as response:
        data = (await response.content.read()).decode()

    esprima_parse(
        code=pq(html_unescape(data))('script[type="text/javascript"]')[-1].text,
        delegate=find_json_results)

    # TODO: Reconsider return value. Return `json_results` instead?
    return json_results['Output'][0]

def get_last_n_filtered_tweets(twitter_oauth_session, screen_name, n):
    base_URI = f'https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name={screen_name}&count=200'
    all_tweet_texts = []
    last_id = 0
    while len(all_tweet_texts) < n:
        current_tweets = json.loads(
            twitter_oauth_session.request(
                'GET',
                f"{base_URI}&max_id={last_id}" if last_id else base_URI).text)
        if current_tweets[-1]['id'] == last_id:
            break
        for tweet in current_tweets:
            # Skip replies
            if tweet['in_reply_to_status_id']:
                continue
            # Skip retweets
            if tweet['retweeted']:
                continue
            # Skip tweets with any users tagged in them
            if bool(re_tagged_username_pattern.search(tweet['text'])):
                continue
            # Redact all URLs; Twitter automatically converts URLs to the t.co
            # shortened format, so we simply replace those
            tweet['text'] = re_link_pattern.sub('[url redacted]', tweet['text']).strip()
            if tweet['text'] == '[url redacted]':
                continue
            # If it passed all these checks, add it to all_tweet_texts to eventually return
            all_tweet_texts.append(html_unescape(tweet['text']))
        last_id = current_tweets[-1]['id']
    return all_tweet_texts[:n]

def get_item_group_from_feedparser(parser):
    """
    Retrieve all items from feedparser and return item group.

    :type parser: 'feedparser.FeedParserDict'
    :rtype: ItemGroup
    """
    items = list()
    logging.info('Loop for retrieving items.')
    for item in parser.entries:
        try:
            text, img_links = format_description(item.description)
        except AttributeError:
            continue
        if text:
            new_item = Item(title=html_unescape(item.title),
                            date=item.published,
                            link=item.link,
                            text=text,
                            img_links=img_links)
            items.append(new_item)
    return ItemGroup(feed=parser.feed.title, items=items)

class Huya(Plugin):
    _re_url = re.compile(r'https?://(?:www\.)?huya\.com/(?P<channel>[^/]+)')
    _re_stream = re.compile(r'"stream"\s?:\s?"([^"]+)"')
    _schema_data = validate.Schema(
        {
            # 'status': int,
            # 'msg': validate.any(None, str),
            'data': [{
                'gameStreamInfoList': [{
                    'sCdnType': str,
                    'sStreamName': str,
                    'sFlvUrl': str,
                    'sFlvUrlSuffix': str,
                    'sFlvAntiCode': validate.all(str, validate.transform(lambda v: html_unescape(v))),
                    # 'sHlsUrl': str,
                    # 'sHlsUrlSuffix': str,
                    # 'sHlsAntiCode': validate.all(str, validate.transform(lambda v: html_unescape(v))),
                    validate.optional('iIsMultiStream'): int,
                    'iPCPriorityRate': int,
                }]
            }],
            # 'vMultiStreamInfo': [{
            #     'sDisplayName': str,
            #     'iBitRate': int,
            # }],
        },
        validate.get('data'),
        validate.get(0),
        validate.get('gameStreamInfoList'),
    )
    QUALITY_WEIGHTS = {}

    @classmethod
    def can_handle_url(cls, url):
        return cls._re_url.match(url) is not None

    @classmethod
    def stream_weight(cls, key):
        weight = cls.QUALITY_WEIGHTS.get(key)
        if weight:
            return weight, 'huya'
        return Plugin.stream_weight(key)

    def _get_streams(self):
        res = self.session.http.get(self.url)
        data = self._re_stream.search(res.text)
        if not data:
            return

        data = parse_json(base64.b64decode(data.group(1)), schema=self._schema_data)
        for info in data:
            log.trace(f'{info!r}')
            flv_url = f'{info["sFlvUrl"]}/{info["sStreamName"]}.{info["sFlvUrlSuffix"]}?{info["sFlvAntiCode"]}'
            name = f'source_{info["sCdnType"].lower()}'
            self.QUALITY_WEIGHTS[name] = info['iPCPriorityRate']
            yield name, HTTPStream(self.session, flv_url)

        log.debug(f'QUALITY_WEIGHTS: {self.QUALITY_WEIGHTS!r}')

def unescape(text):
    """
    * Unescape HTML special chars
    * @param string text
    * @return string
    """
    return html_unescape(text)

def _get_streams(self):
    streamdata = None
    if self.get_option("email"):
        if self.login(self.get_option("email"), self.get_option("password")):
            log.info("Logged in as {0}".format(self.get_option("email")))
            self.save_cookies(lambda c: "steamMachineAuth" in c.name)

    # Handle steam.tv URLs
    if self.matches[1] is not None:
        # extract the steam ID from the page
        res = self.session.http.get(self.url)
        for div in itertags(res.text, 'div'):
            if div.attributes.get("id") == "webui_config":
                broadcast_data = html_unescape(div.attributes.get("data-broadcast"))
                steamid = parse_json(broadcast_data).get("steamid")
                self.url = self._watch_broadcast_url + steamid

    # extract the steam ID from the URL
    steamid = self.match.group(1)
    res = self.session.http.get(self.url)  # get the page to set some cookies
    sessionid = res.cookies.get('sessionid')

    while streamdata is None or streamdata["success"] in ("waiting", "waiting_for_start"):
        streamdata = self._get_broadcast_stream(steamid, sessionid=sessionid)

        if streamdata["success"] == "ready":
            return DASHStream.parse_manifest(self.session, streamdata["url"])
        elif streamdata["success"] == "unavailable":
            log.error("This stream is currently unavailable")
            return
        else:
            r = streamdata["retry"] / 1000.0
            log.info("Waiting for stream, will retry again in {} seconds...".format(r))
            time.sleep(r)

def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Get the homepage of the url and return its title.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with requests_get(home_url, stream=True, headers=USER_AGENT_HEADER, timeout=15) as r:
        try:
            check_response_headers(r)
        except (
            RequestException,
            StatusCodeError,
            ContentTypeError,
            ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
        m = CHARSET(content)
        html = content.decode(m[1].decode() if m else r.encoding)
        m = TITLE_TAG(html)
        title = html_unescape(m['result']) if m else None
        home_title_list.append(title)

def format_tweet_as_text(tweet):
    tweet = tweet.AsDict()
    assert not any(key[0] == '_' and key[-1] == '_' for key in tweet.keys())

    retweet = tweet.get('retweeted_status')
    if retweet:
        assert not any(key[0] == '_' and key[-1] == '_' for key in retweet.keys())
        retweet['_first_username_'] = tweet['user']['name']
        tweet = retweet

    username = tweet['user']['name']
    tweet['_username_'] = click.style(' ' + username + ' ', fg='white', bg='black')

    created_at = parse_time(tweet['created_at'])
    tweet['_time_ago_'] = click.style(time_ago(created_at), fg='red')

    # Decorate text
    text = html_unescape(tweet['text'])
    urls = tweet.get('urls', []) + tweet.get('media', [])
    url_pairs = [(url['url'], url['expanded_url']) for url in urls]
    text = expand_urls(text, url_pairs)
    mentions = [mention['screen_name'] for mention in tweet.get('user_mentions', [])]
    text = decorate_user_mentions(text, mentions, underline=True)
    hashtags = [hashtag['text'] for hashtag in tweet.get('hashtags', [])]
    text = decorate_hashtags(text, hashtags, underline=True)
    tweet['_aligned_text_'] = align_text(text, margin='\t', skip_first_line=True)

    return _text_formatter.format(FORMAT_RETWEET if retweet else FORMAT_TWEET,
                                  created_at, **tweet)

def find_iframe(self, url):
    self.session.http.headers.update({"User-Agent": useragents.CHROME})
    res = self.session.http.get(self.url)
    for iframe_url in self.iframe_re.findall(res.text):
        if "googletagmanager" not in iframe_url:
            iframe_url = html_unescape(iframe_url)
            return update_scheme(self.url, iframe_url)

def archive_files(yga, subdir=None):
    logger = logging.getLogger(name="archive_files")
    try:
        if subdir:
            file_json = yga.files(sfpath=subdir)
        else:
            file_json = yga.files()
    except Exception:
        logger.error("Couldn't access Files functionality for this group")
        return

    with open('fileinfo.json', 'wb') as f:
        json.dump(file_json['dirEntries'], codecs.getwriter('utf-8')(f),
                  ensure_ascii=False, indent=4)

    n = 0
    sz = len(file_json['dirEntries'])
    for path in file_json['dirEntries']:
        n += 1
        if path['type'] == 0:
            # Regular file
            name = html_unescape(path['fileName'])
            new_name = sanitise_file_name("%d_%s" % (n, name))
            if file_keep(new_name, hacky_vars['file'], ": %s" % (new_name,)) is False:
                logger.info("Fetching file '%s' as '%s' (%d/%d)", name, new_name, n, sz)
                with open(new_name, 'wb') as f:
                    try:
                        yga.download_file(path['downloadURL'], f)
                    except Exception:
                        pass  # Bad size exceptions can sometimes cause issues going from -f to -i.
                set_mtime(new_name, path['createdTime'])
        elif path['type'] == 1:
            # Directory
            name = html_unescape(path['fileName'])
            new_name = "%d_%s" % (n, name)
            logger.info("Fetching directory '%s' as '%s' (%d/%d)",
                        name, sanitise_folder_name(new_name), n, sz)
            with Mkchdir(new_name):  # (new_name sanitised again by Mkchdir)
                pathURI = unquote(path['pathURI'])
                archive_files(yga, subdir=pathURI)
            set_mtime(sanitise_folder_name(new_name), path['createdTime'])

async def quote(message, *_, author: str = ""):
    """Display a random quote, or one from a given author"""
    global dayly_timer, twenty_timer
    now = datetime.now()
    # Refresh the daily quotes once a day
    if now - dayly_timer >= timedelta(days=1):
        dayly_timer = now
        await get_quotes()
    # Allow only one quote every 20 minutes
    if now - twenty_timer < timedelta(seconds=5):
        await thenumberone.send_message(message.channel, "Je manque d'inspiration...")
        return
    twenty_timer = now
    # If an author is specified, select one of their quotes randomly
    if author:
        if author.casefold() in "chuck norris":
            async with thenumberone.http.session.get(
                    "http://www.chucknorrisfacts.fr/api/get?data=tri:alea;nb:3") as resp:
                assert resp.status == 200
                quote = Quote(
                    "Chuck Norris",
                    html_unescape(max(await resp.json(), key=itemgetter("points"))["fact"]))
        else:
            author_quotes = list(
                filter(lambda quote: author.casefold() in quote.author.casefold(), quotes))
            if not author_quotes:
                await thenumberone.send_message(message.channel, "Je ne connais pas cet auteur.")
                return
            quote = choice(author_quotes)
    # If there are dayly_quotes left, use one
    elif dayly_quotes:
        quote = dayly_quotes.pop()
    # Otherwise use our quotes
    else:
        quote = quotes.pop()
        quotes.appendleft(quote)
    await thenumberone.send_message(message.channel, f"{quote.author}: *{quote.text}*")

def val(self):
    '''Unescape HTML entities into corresponding Unicode characters.

    Named (&amp;), decimal (&#38;), and hex (&#x26; and &#X26;) formats
    are supported. Unknown entities are left intact.

    As of Python 2.7 and Python 3.6 the following 252 named entities are
    recognized and unescaped:
        https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py
        https://github.com/python/cpython/blob/3.6/Lib/html/entities.py
    '''
    return html_unescape(self.raw_val)

def extract_next_page_url(archive_page_raw_html, settings):
    """ extracts and returns next page URL from an HTML code if there is one... """
    next_page_url_format_re = settings['NEXT_PAGE_URL_FORMAT_RE']
    code_line = next_page_url_format_re.search(archive_page_raw_html)
    if code_line is not None:
        code_line = code_line.group(0)
        code_line = settings['BEFORE_NEXT_PAGE_URL_RE'].sub(
            settings['before_next_page_url_repl'], code_line)
        code_line = settings['AFTER_NEXT_PAGE_URL_RE'].sub(
            settings['after_next_page_url_repl'], code_line)
        code_line = html_unescape(code_line)
    return code_line

def extract_article_urls_from_page(archive_page_raw_html, settings):
    """ extracts and returns, as a set, the URLs belonging to articles from an HTML code """
    urls = set()
    article_url_format_re = settings['ARTICLE_URL_FORMAT_RE']
    for code_line in article_url_format_re.findall(archive_page_raw_html):
        code_line = settings['BEFORE_ARTICLE_URL_RE'].sub(
            settings['before_article_url_repl'], code_line)
        code_line = settings['AFTER_ARTICLE_URL_RE'].sub(
            settings['after_article_url_repl'], code_line)
        code_line = html_unescape(code_line)
        urls.add(code_line)
    return urls

async def get_trivias():
    if TRIVIA_REQUEST_LOCK.locked():
        await TRIVIA_REQUEST_LOCK
        return

    async with TRIVIA_REQUEST_LOCK:
        async with SLASH_CLIENT.http.get(TRIVIA_URL, params={'amount': 100, 'category': 31}) as response:
            json = await response.json()

        for trivia_data in json['results']:
            trivia = (
                html_unescape(trivia_data['question']),
                html_unescape(trivia_data['correct_answer']),
                [html_unescape(element) for element in trivia_data['incorrect_answers']],
            )
            TRIVIA_QUEUE.append(trivia)

def get_title(self):
    if self.title is None:
        if not self.html_text:
            self.html_text = self._res_text(self.url)
        _og_title_re = re.compile(r'<meta\s*property="og:title"\s*content="(?P<title>[^<>]+)"\s*/?>')
        _title_re = re.compile(r'<title[^<>]*>(?P<title>[^<>]+)</title>')
        m = _og_title_re.search(self.html_text) or _title_re.search(self.html_text)
        if m:
            self.title = re.sub(r'[\s]+', ' ', m.group('title'))
            self.title = re.sub(r'^\s*|\s*$', '', self.title)
            self.title = html_unescape(self.title)
        if self.title is None:
            # fallback if there is no <title>
            self.title = self.url
    return self.title

def _get_streams(self):
    """
    Find the streams for vk.com
    :return:
    """
    self.session.http.headers.update({'User-Agent': useragents.IPHONE_6})

    # If this is a 'videos' catalog URL with a video ID in the GET request,
    # get that instead
    url = self.follow_vk_redirect(self.url)

    m = self._url_re.match(url)
    if not m:
        log.error('URL is not compatible: {0}'.format(url))
        return

    video_id = m.group('video_id')
    log.debug('video ID: {0}'.format(video_id))

    params = {
        'act': 'show_inline',
        'al': '1',
        'video': video_id,
    }
    res = self.session.http.post(self.API_URL, params=params)

    for _i in itertags(res.text, 'iframe'):
        if _i.attributes.get('src'):
            iframe_url = update_scheme(self.url, _i.attributes['src'])
            log.debug('Found iframe: {0}'.format(iframe_url))
            yield from self.session.streams(iframe_url).items()

    for _i in itertags(res.text.replace('\\', ''), 'source'):
        if _i.attributes.get('type') == 'application/vnd.apple.mpegurl':
            video_url = html_unescape(_i.attributes['src'])
            streams = HLSStream.parse_variant_playlist(self.session, video_url)
            if not streams:
                yield 'live', HLSStream(self.session, video_url)
            else:
                yield from streams.items()
        elif _i.attributes.get('type') == 'video/mp4':
            q = 'vod'
            video_url = _i.attributes['src']
            m = self._vod_quality_re.search(video_url)
            if m:
                q = '{0}p'.format(m.group(1))
            yield q, HTTPStream(self.session, video_url)

def url2dict(url: str) -> Dict[str, Any]:
    """Get url and return the result as a dictionary."""
    d = defaultdict(lambda: None)
    # Creating a thread to request homepage title in background
    home_title_list = []  # A mutable variable used to get the thread result
    home_title_thread = Thread(target=get_home_title, args=(url, home_title_list))
    home_title_thread.start()

    html = get_html(url)
    d['url'] = find_url(html, url)
    m = TITLE_TAG(html)
    html_title = html_unescape(m['result']) if m else None
    if html_title:
        d['html_title'] = html_title
    # d['html_title'] is used in waybackmechine.py.
    authors = find_authors(html)
    if authors:
        d['authors'] = authors
    d['issn'] = find_issn(html)
    d['pmid'] = find_pmid(html)
    d['doi'] = find_doi(html)
    d['volume'] = find_volume(html)
    d['issue'] = find_issue(html)
    d['page'] = find_pages(html)
    d['journal'] = find_journal(html)
    if d['journal']:
        d['cite_type'] = 'journal'
    else:
        d['cite_type'] = 'web'
    d['website'] = find_site_name(
        html, html_title, url, authors, home_title_list, home_title_thread)
    d['title'] = find_title(
        html, html_title, url, authors, home_title_list, home_title_thread)
    date = find_date(html, url)
    if date:
        d['date'] = date
        d['year'] = str(date.year)
    lang_match = LANG_SEARCH(html)
    if lang_match is not None:
        d['language'] = lang_match[1]
    else:
        d['language'] = classify(html)[0]
    return d

def test_string_markdown_link(self):
    # markdown2 and markdown escape the email address
    try:
        from html import unescape as html_unescape
    except ImportError:
        from HTMLParser import HTMLParser
        html_unescape = HTMLParser().unescape
    p = StringHTMLProperty(self.client, 'test', '1', None, 'test',
                           u2s(u'A link <*****@*****.**>'))
    m = html_unescape(p.markdown().strip())
    m = self.mangleMarkdown2(m)
    self.assertEqual(
        m,
        u2s(u'<p>A link <a href="mailto:[email protected]">[email protected]</a></p>'))

def _parse_streams(self, res):
    _found_stream_url = False

    for meta in itertags(res.text, "meta"):
        if meta.attributes.get("property") == "og:video:url":
            stream_url = html_unescape(meta.attributes.get("content"))
            if ".mpd" in stream_url:
                for s in DASHStream.parse_manifest(self.session, stream_url).items():
                    yield s
                _found_stream_url = True
            elif ".mp4" in stream_url:
                yield "vod", HTTPStream(self.session, stream_url)
                _found_stream_url = True
            break
    else:
        log.debug("No meta og:video:url")

    if _found_stream_url:
        return

    for match in self._src_re.finditer(res.text):
        stream_url = match.group("url")
        if "\\/" in stream_url:
            # if the URL is json encoded, decode it
            stream_url = parse_json("\"{}\"".format(stream_url))
        if ".mpd" in stream_url:
            yield from DASHStream.parse_manifest(self.session, stream_url).items()
        elif ".mp4" in stream_url:
            yield match.group(1), HTTPStream(self.session, stream_url)
        else:
            log.debug("Non-dash/mp4 stream: {0}".format(stream_url))

    match = self._dash_manifest_re.search(res.text)
    if match:
        # facebook replaces "<" characters with the substring "\\x3C"
        manifest = match.group("manifest").replace("\\/", "/")
        manifest = bytes(unquote_plus(manifest), "utf-8").decode("unicode_escape")
        # Ignore unsupported manifests until DASH SegmentBase support is implemented
        if "SegmentBase" in manifest:
            log.error("Skipped DASH manifest with SegmentBase streams")
        else:
            yield from DASHStream.parse_manifest(self.session, manifest).items()

def parse(self, tree):
    xpath = '//h1[contains(@class, "ArticleTitle")]/text()'
    self.update_item('title', tree.xpath(xpath))

    xpath = '//*[@class="authors__list"]//*[@class="authors__name"]/text()'
    authors = tree.xpath(xpath)
    authors = [html_unescape(author) for author in authors]
    self.update_item('authors', authors)

    xpath = '//*[contains(@class, "KeywordGroup")]//*[contains(@class, "Keyword")]/text()'
    keywords = tree.xpath(xpath)
    self.update_item('keywords', keywords)

    xpath = '//*[@id="article-actions"]//*[contains(@class, "download-article")]/a[1]/@href'
    download_link = tree.xpath(xpath)
    download_link = self.urljoin(download_link[0]) if download_link else ''
    self.update_item('download_link', download_link)

    xpath = '//*[@id="doi-url"]/text()'
    doi_link = tree.xpath(xpath)
    doi_link = doi_link[0] if doi_link else ''
    self.update_item('doi_link', doi_link)

def find_title(
    html: str,
    html_title: str,
    url: str,
    authors: List[Tuple[str, str]],
    home_title: List[str],
    thread: Thread,
) -> Optional[str]:
    """Return (title_string, where_info)."""
    m = TITLE_SEARCH(html)
    if m:
        return parse_title(
            html_unescape(m['result']), url, authors, home_title, thread,
        )[1]
    elif html_title:
        return parse_title(html_title, url, authors, home_title, thread)[1]
    else:
        return None

def stackexchange(auth_token):
    user = requests.get(
        'https://api.stackexchange.com/2.2/me',
        params={
            'site': 'codegolf',
            'filter': '!)RwcIFN1JaCrhVpgyYeR_oO*',  # Constant obtained from SE API explorer
            'access_token': auth_token,
            'key': auth.get('stackexchange.com').get('key')
        }).json().get('items')[0]
    display_name = html_unescape(user.get('display_name'))
    return user.get('user_id'), {
        'name': display_name,
        'avatar': user.get('profile_image'),
        'email': None,
        'identifier': display_name
    }

def _get_streams(self):
    p = urlparse(self.url)
    if "ott.streann.com" != p.netloc:
        self._domain = p.netloc
        res = self.session.http.get(self.url)
        for iframe in itertags(res.text, "iframe"):
            iframe_url = html_unescape(iframe.attributes.get("src"))
            if "ott.streann.com" in iframe_url:
                self.url = iframe_url
                break
        else:
            log.error("Could not find 'ott.streann.com' iframe")
            return

    if not self._domain and self.get_option("url"):
        self._domain = urlparse(self.get_option("url")).netloc

    if self._domain is None:
        log.error("Missing source URL use --streann-url")
        return

    self.session.http.headers.update({"Referer": self.url})
    # Get the query string
    encrypted_data = urlparse(self.url).query
    data = base64.b64decode(encrypted_data)
    # and decrypt it
    passphrase = self.passphrase()
    if passphrase:
        log.debug("Found passphrase")
        params = decrypt_openssl(data, passphrase)
        config = parse_qsd(params.decode("utf8"))
        log.trace(f"config: {config!r}")
        token = self.get_token(**config)
        if not token:
            return
        hls_url = self.stream_url.format(time=self.time,
                                         deviceId=self.device_id,
                                         token=token,
                                         **config)
        log.debug("URL={0}".format(hls_url))
        return HLSStream.parse_variant_playlist(
            self.session, hls_url, acceptable_status=(200, 403, 404, 500))

def url2dict(url: str) -> Dict[str, Any]:
    """Get url and return the result as a dictionary."""
    d = defaultdict(lambda: None)
    # Creating a thread to request homepage title in background
    home_title_list = []  # A mutable variable used to get the thread result
    home_title_thread = Thread(target=get_home_title, args=(url, home_title_list))
    home_title_thread.start()

    html = get_html(url)
    d['url'] = find_url(html, url)
    m = TITLE_TAG(html)
    html_title = html_unescape(m['result']) if m else None
    if html_title:
        d['html_title'] = html_title
    # d['html_title'] is used in waybackmechine.py.
    authors = find_authors(html)
    if authors:
        d['authors'] = authors
    d['issn'] = find_issn(html)
    d['pmid'] = find_pmid(html)
    d['doi'] = find_doi(html)
    d['volume'] = find_volume(html)
    d['issue'] = find_issue(html)
    d['page'] = find_pages(html)
    d['journal'] = find_journal(html)
    if d['journal']:
        d['cite_type'] = 'journal'
    else:
        d['cite_type'] = 'web'
    d['website'] = find_site_name(
        html, html_title, url, authors, home_title_list, home_title_thread)
    d['title'] = find_title(
        html, html_title, url, authors, home_title_list, home_title_thread)
    date = find_date(html, url)
    if date:
        d['date'] = date
        d['year'] = str(date.year)
    d['language'] = classify(html)[0]
    return d

def _get_streams(self):
    if "cdn.bg" in urlparse(self.url).netloc:
        iframe_url = self.url
        h = self.session.get_option("http-headers")
        if h and h.get("Referer"):
            _referer = h.get("Referer")
        else:
            log.error("Missing Referer for iframe URL, use --http-header \"Referer=URL\" ")
            return
    else:
        _referer = self.url
        res = self.session.http.get(self.url)
        m = self._re_frame.search(res.text)
        if m:
            iframe_url = m.group(1)
        else:
            for iframe in itertags(res.text, "iframe"):
                iframe_url = iframe.attributes.get("src")
                if iframe_url and "cdn.bg" in iframe_url:
                    iframe_url = update_scheme("https://", html_unescape(iframe_url), force=False)
                    break
            else:
                return
    log.debug(f"Found iframe: {iframe_url}")

    res = self.session.http.get(iframe_url, headers={"Referer": _referer})
    stream_url = self.stream_schema.validate(res.text)
    if "geoblock" in stream_url:
        log.error("Geo-restricted content")
        return

    return HLSStream.parse_variant_playlist(
        self.session,
        update_scheme(iframe_url, stream_url),
        headers={"Referer": "https://i.cdn.bg/"},
    )

def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Get the homepage of the url and return its title.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with request(home_url, spoof=True, stream=True) as r:
        try:
            check_response_headers(r)
        except (
            RequestException,
            StatusCodeError,
            ContentTypeError,
            ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
        m = CHARSET(content)
        html = content.decode(m[1].decode() if m else r.encoding)
        m = TITLE_TAG(html)
        title = html_unescape(m['result']) if m else None
        home_title_list.append(title)

def render_template(template, **kw):
    return html_unescape(jin_env.get_template(template).render(**kw))

def unescape(string):
    if string is None:
        return None
    return html_unescape(string)