Example #1
def archive_files(yga, subdir=None):
    logger = logging.getLogger(name="archive_files")
    try:
        if subdir:
            file_json = yga.files(sfpath=subdir)
        else:
            file_json = yga.files()
    except Exception:
        logger.error("Couldn't access Files functionality for this group")
        return

    with open('fileinfo.json', 'wb') as f:
        json.dump(file_json['dirEntries'],
                  codecs.getwriter('utf-8')(f),
                  ensure_ascii=False,
                  indent=4)

    n = 0
    sz = len(file_json['dirEntries'])
    for path in file_json['dirEntries']:
        n += 1
        if path['type'] == 0:
            # Regular file
            name = html_unescape(path['fileName'])
            logger.info("Fetching file '%s' (%d/%d)", name, n, sz)
            with open(sanitise_file_name(name), 'wb') as f:
                yga.download_file(path['downloadURL'], f)

        elif path['type'] == 1:
            # Directory
            name = html_unescape(path['fileName'])
            logger.info("Fetching directory '%s' (%d/%d)", name, n, sz)
            with Mkchdir(name):
                pathURI = unquote(path['pathURI'])
                archive_files(yga, subdir=pathURI)
Example #2
def archive_photos(yga):
    logger = logging.getLogger(name="archive_photos")
    try:
        nb_albums = yga.albums(count=5)['total'] + 1
    except Exception:
        logger.error("Couldn't access Photos functionality for this group")
        return
    albums = yga.albums(count=nb_albums)
    n = 0

    with open('albums.json', 'wb') as f:
        json.dump(albums['albums'],
                  codecs.getwriter('utf-8')(f),
                  ensure_ascii=False,
                  indent=4)

    for a in albums['albums']:
        n += 1
        name = html_unescape(a['albumName'])
        # Yahoo sometimes has an off-by-one error in the album count...
        logger.info("Fetching album '%s' (%d/%d)", name, n, albums['total'])

        folder = "%d-%s" % (a['albumId'], name)

        with Mkchdir(folder):
            photos = yga.albums(a['albumId'])
            pages = int(photos['total'] / 100 + 1)
            p = 0

            for page in range(pages):
                photos = yga.albums(a['albumId'], start=page * 100, count=100)
                with open('photos-%d.json' % page, 'wb') as f:
                    json.dump(photos['photos'],
                              codecs.getwriter('utf-8')(f),
                              ensure_ascii=False,
                              indent=4)

                for photo in photos['photos']:
                    p += 1
                    pname = html_unescape(photo['photoName'])
                    logger.info("Fetching photo '%s' (%d/%d)", pname, p,
                                photos['total'])

                    photoinfo = get_best_photoinfo(photo['photoInfo'])
                    fname = "%d-%s.jpg" % (photo['photoId'], pname)
                    with open(sanitise_file_name(fname), 'wb') as f:
                        for i in range(TRIES):
                            try:
                                yga.download_file(photoinfo['displayURL'], f)
                                break
                            except requests.exceptions.HTTPError as err:
                                logger.error(
                                    "HTTP error (sleeping before retry, try %d: %s",
                                    i, err)
                                time.sleep(HOLDOFF)
Example #3
def archive_photos(yga):
    logger = logging.getLogger(name="archive_photos")
    try:
        nb_albums = yga.albums(count=5)['total'] + 1
    except Exception:
        logger.error("Couldn't access Photos functionality for this group")
        return
    albums = yga.albums(count=nb_albums)
    n = 0

    with open('albums.json', 'wb') as f:
        json.dump(albums['albums'],
                  codecs.getwriter('utf-8')(f),
                  ensure_ascii=False,
                  indent=4)

    for a in albums['albums']:
        n += 1
        name = html_unescape(a['albumName'])
        # Yahoo sometimes has an off-by-one error in the album count...
        logger.info("Fetching album '%s' (%d/%d)", name, n, albums['total'])

        folder = "%d-%s" % (a['albumId'], name)

        with Mkchdir(folder):
            photos = yga.albums(a['albumId'])
            pages = int(photos['total'] / 100 + 1)
            p = 0

            for page in range(pages):
                photos = yga.albums(a['albumId'], start=page * 100, count=100)
                with open('photos-%d.json' % page, 'wb') as f:
                    json.dump(photos['photos'],
                              codecs.getwriter('utf-8')(f),
                              ensure_ascii=False,
                              indent=4)

                for photo in photos['photos']:
                    p += 1
                    pname = html_unescape(photo['photoName'])
                    fname = sanitise_file_name("%d-%s.jpg" %
                                               (photo['photoId'], pname))
                    if file_keep(fname, hacky_vars['file'], "photo: %s" %
                                 (fname, )) is False:
                        logger.info("Fetching photo '%s' (%d/%d)", pname, p,
                                    photos['total'])
                        with open(fname, 'wb') as f:
                            process_single_photo(yga, photo['photoInfo'], f)
                        set_mtime(fname, photo['creationDate'])

        set_mtime(sanitise_folder_name(folder), a['modificationDate'])
Example #4
    async def work(worker_id: int):
        i = 0
        async with ClientSession(
                cookie_jar=session.cookie_jar) as worker_session:
            while True:
                request_options = dict(
                    url=f'{origin}/owa/?ae=Dialog&t=AddressBook&ctx=1',
                    data={
                        **form_data, 'hidpg': worker_id + i * num_concurrent
                    },
                    headers={'Connection': 'Keep-Alive'})

                async with worker_session.post(**request_options) as response:
                    data = (await response.content.read()).decode()

                parsed_html_bs: BeautifulSoup = BeautifulSoup(
                    markup=html_unescape(data), features='html.parser')

                page_names = {
                    td.text.rstrip()
                    for td in parsed_html_bs.select(
                        selector=
                        'table.lvw > tr:nth-child(n+4) > td:nth-child(3)')
                    if td.text.rstrip()
                }

                if len(page_names) == 0:
                    break

                all_names.update(page_names)

                i += 1
Example #5
def cleanup_ywh_redirects_from_html(
    ywh_domain: str,
    html: str,
) -> str:
    """
    Replace YesWeHack redirects with real URLs.

    Args:
        ywh_domain: the base domain of the YWH redirects
        html: an HTML string

    Returns:
        the cleaned HTML string
    """
    redirect_base_re = re.escape(f'{ywh_domain}/redirect?url=')

    pattern = re.compile(f'"(https?://{redirect_base_re}[^ "]*)"')
    redirect_urls = pattern.findall(html)
    for redirect_url in redirect_urls:
        real_url = _extract_real_url_from_redirect(
            redirect_url=html_unescape(redirect_url), )
        html = html.replace(
            redirect_url,
            html_escape(real_url or ''),
        )
    return html
Example #6
def sanitize(text: str):
    """
        Sanitize text by removing HTML-encoded parts, double spaces, and so on
    """

    if text is None:
        return None

    text = html_unescape(html_unescape(text))

    for key, value in replaces.items():
        text = text.replace(key, value)

    text = re.sub(r'[ \t]+', ' ', text).strip()

    return text
Example #7
async def get_account_identity_from_ui(
        session: ClientSession, origin: str,
        **extra_request_keywords) -> Dict[str, str]:
    """

    :param session: An authenticated `aiohttp.ClientSession`.
    :param origin: The origin part of the URL of the OWA.
    :return:
    """

    # TODO: I would like something like Javascript Promises instead of this pattern.
    json_results = None

    def find_json_results(node, _) -> None:
        if node.type == 'Property' and node.key.value == 'JsonResults':
            nonlocal json_results
            json_results = json_loads(node.value.value)

    async with session.get(url=f'{origin}/ecp/PersonalSettings/HomePage.aspx',
                           **extra_request_keywords) as response:
        data = (await response.content.read()).decode()
        esprima_parse(code=pq(
            html_unescape(data))('script[type="text/javascript"]')[-1].text,
                      delegate=find_json_results)

    # TODO: Reconsider return value. Return `json_results` instead?
    return json_results['Output'][0]
Example #8
def get_last_n_filtered_tweets(twitter_oauth_session, screen_name, n):
    base_URI = f'https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name={screen_name}&count=200'

    all_tweet_texts = []
    last_id = 0
    while (len(all_tweet_texts) < n):
        current_tweets = json.loads(
            twitter_oauth_session.request(
                'GET',
                f"{base_URI}&max_id={last_id}" if last_id else base_URI).text)

        if current_tweets[-1]['id'] == last_id: break

        for tweet in current_tweets:
            #Skip replies
            if tweet['in_reply_to_status_id']: continue

            #Skip retweets
            if tweet['retweeted']: continue

            #Skip tweets with any users tagged in it
            if bool(re_tagged_username_pattern.search(tweet['text'])): continue

            #Redact all urls, twitter automatically converts urls to the t.co shortened format so we simply replace those
            tweet['text'] = re_link_pattern.sub('[url redacted]',
                                                tweet['text']).strip()
            if tweet['text'] == '[url redacted]': continue

            #If it passed all these checks, we add it to our all_tweet_texts list to eventually return
            all_tweet_texts.append(html_unescape(tweet['text']))
        last_id = current_tweets[-1]['id']

    return all_tweet_texts[:n]
Example #9
def get_item_group_from_feedparser(parser):
    """ Retrieve all items from feedparser and return item group.

    :type parser: 'feedparser.FeedParserDict'

    :rtype: ItemGroup
    """
    items = list()

    logging.info('Loop for retrieving items.')
    for item in parser.entries:
        try:
            text, img_links = format_description(item.description)
        except AttributeError:
            continue

        if text:
            new_item = Item(title=html_unescape(item.title),
                            date=item.published,
                            link=item.link,
                            text=text,
                            img_links=img_links)

            items.append(new_item)

    return ItemGroup(feed=parser.feed.title, items=items)
Example #10
class Huya(Plugin):
    _re_url = re.compile(r'https?://(?:www\.)?huya\.com/(?P<channel>[^/]+)')
    _re_stream = re.compile(r'"stream"\s?:\s?"([^"]+)"')
    _schema_data = validate.Schema(
        {
            # 'status': int,
            # 'msg': validate.any(None, str),
            'data': [{
                'gameStreamInfoList': [{
                    'sCdnType': str,
                    'sStreamName': str,
                    'sFlvUrl': str,
                    'sFlvUrlSuffix': str,
                    'sFlvAntiCode': validate.all(str, validate.transform(lambda v: html_unescape(v))),
                    # 'sHlsUrl': str,
                    # 'sHlsUrlSuffix': str,
                    # 'sHlsAntiCode': validate.all(str, validate.transform(lambda v: html_unescape(v))),
                    validate.optional('iIsMultiStream'): int,
                    'iPCPriorityRate': int,
                }]
            }],
            # 'vMultiStreamInfo': [{
            #    'sDisplayName': str,
            #    'iBitRate': int,
            # }],
        },
        validate.get('data'),
        validate.get(0),
        validate.get('gameStreamInfoList'),
    )
    QUALITY_WEIGHTS = {}

    @classmethod
    def can_handle_url(cls, url):
        return cls._re_url.match(url) is not None

    @classmethod
    def stream_weight(cls, key):
        weight = cls.QUALITY_WEIGHTS.get(key)
        if weight:
            return weight, 'huya'

        return Plugin.stream_weight(key)

    def _get_streams(self):
        res = self.session.http.get(self.url)
        data = self._re_stream.search(res.text)

        if not data:
            return

        data = parse_json(base64.b64decode(data.group(1)), schema=self._schema_data)
        for info in data:
            log.trace(f'{info!r}')
            flv_url = f'{info["sFlvUrl"]}/{info["sStreamName"]}.{info["sFlvUrlSuffix"]}?{info["sFlvAntiCode"]}'
            name = f'source_{info["sCdnType"].lower()}'
            self.QUALITY_WEIGHTS[name] = info['iPCPriorityRate']
            yield name, HTTPStream(self.session, flv_url)

        log.debug(f'QUALITY_WEIGHTS: {self.QUALITY_WEIGHTS!r}')
Example #11
 def unescape(text):
     """
 *  Unescape HTML special chars\n
 *  @param string text\n
 *  @return string
     """
     return html_unescape(text)
Example #12
    def _get_streams(self):
        streamdata = None
        if self.get_option("email"):
            if self.login(self.get_option("email"), self.get_option("password")):
                log.info("Logged in as {0}".format(self.get_option("email")))
                self.save_cookies(lambda c: "steamMachineAuth" in c.name)

        # Handle steam.tv URLs
        if self.matches[1] is not None:
            # extract the steam ID from the page
            res = self.session.http.get(self.url)
            for div in itertags(res.text, 'div'):
                if div.attributes.get("id") == "webui_config":
                    broadcast_data = html_unescape(div.attributes.get("data-broadcast"))
                    steamid = parse_json(broadcast_data).get("steamid")
                    self.url = self._watch_broadcast_url + steamid

        # extract the steam ID from the URL
        steamid = self.match.group(1)
        res = self.session.http.get(self.url)  # get the page to set some cookies
        sessionid = res.cookies.get('sessionid')

        while streamdata is None or streamdata["success"] in ("waiting", "waiting_for_start"):
            streamdata = self._get_broadcast_stream(steamid,
                                                    sessionid=sessionid)

            if streamdata["success"] == "ready":
                return DASHStream.parse_manifest(self.session, streamdata["url"])
            elif streamdata["success"] == "unavailable":
                log.error("This stream is currently unavailable")
                return
            else:
                r = streamdata["retry"] / 1000.0
                log.info("Waiting for stream, will retry again in {} seconds...".format(r))
                time.sleep(r)
Example #13
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Get homepage of the url and return it's title.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with requests_get(home_url,
                      stream=True,
                      headers=USER_AGENT_HEADER,
                      timeout=15) as r:
        try:
            check_response_headers(r)
        except (
                RequestException,
                StatusCodeError,
                ContentTypeError,
                ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
    m = CHARSET(content)
    html = content.decode(m[1].decode() if m else r.encoding)
    m = TITLE_TAG(html)
    title = html_unescape(m['result']) if m else None
    home_title_list.append(title)
Example #14
def format_tweet_as_text(tweet):
    tweet = tweet.AsDict()
    assert not any(key[0] == '_' and key[-1] == '_' for key in tweet.keys())

    retweet = tweet.get('retweeted_status')
    if retweet:
        assert not any(key[0] == '_' and key[-1] == '_' for key in retweet.keys())
        retweet['_first_username_'] = tweet['user']['name']
        tweet = retweet

    username = tweet['user']['name']
    tweet['_username_'] = click.style(' ' + username + ' ',
                                      fg='white', bg='black')

    created_at = parse_time(tweet['created_at'])
    tweet['_time_ago_'] = click.style(time_ago(created_at), fg='red')

    # Decorate text
    text = html_unescape(tweet['text'])

    urls = tweet.get('urls', []) + tweet.get('media', [])
    url_pairs = [(url['url'], url['expanded_url']) for url in urls]
    text = expand_urls(text, url_pairs)

    mentions = [mention['screen_name'] for mention in tweet.get('user_mentions', [])]
    text = decorate_user_mentions(text, mentions, underline=True)

    hashtags = [hashtag['text'] for hashtag in tweet.get('hashtags', [])]
    text = decorate_hashtags(text, hashtags, underline=True)

    tweet['_aligned_text_'] = align_text(text, margin='\t', skip_first_line=True)

    return _text_formatter.format(FORMAT_RETWEET if retweet else FORMAT_TWEET,
                                  created_at,
                                  **tweet)
Example #15
 def find_iframe(self, url):
     self.session.http.headers.update({"User-Agent": useragents.CHROME})
     res = self.session.http.get(self.url)
     for iframe_url in self.iframe_re.findall(res.text):
         if "googletagmanager" not in iframe_url:
             iframe_url = html_unescape(iframe_url)
             return update_scheme(self.url, iframe_url)
Example #16
def archive_files(yga, subdir=None):
    logger = logging.getLogger(name="archive_files")
    try:
        if subdir:
            file_json = yga.files(sfpath=subdir)
        else:
            file_json = yga.files()
    except Exception:
        logger.error("Couldn't access Files functionality for this group")
        return

    with open('fileinfo.json', 'wb') as f:
        json.dump(file_json['dirEntries'],
                  codecs.getwriter('utf-8')(f),
                  ensure_ascii=False,
                  indent=4)

    n = 0
    sz = len(file_json['dirEntries'])
    for path in file_json['dirEntries']:
        n += 1
        if path['type'] == 0:
            # Regular file
            name = html_unescape(path['fileName'])
            new_name = sanitise_file_name("%d_%s" % (n, name))
            if file_keep(new_name, hacky_vars['file'], ": %s" %
                         (new_name, )) is False:
                logger.info("Fetching file '%s' as '%s' (%d/%d)", name,
                            new_name, n, sz)
                with open(new_name, 'wb') as f:
                    try:
                        yga.download_file(path['downloadURL'], f)
                    except Exception:
                        pass  # Bad size exceptions can sometimes cause issues going from -f to -i.
                set_mtime(new_name, path['createdTime'])

        elif path['type'] == 1:
            # Directory
            name = html_unescape(path['fileName'])
            new_name = "%d_%s" % (n, name)
            logger.info("Fetching directory '%s' as '%s' (%d/%d)", name,
                        sanitise_folder_name(new_name), n, sz)
            with Mkchdir(new_name):  # (new_name sanitised again by Mkchdir)
                pathURI = unquote(path['pathURI'])
                archive_files(yga, subdir=pathURI)
            set_mtime(sanitise_folder_name(new_name), path['createdTime'])
Example #17
async def quote(message, *_, author: str = ""):
    """Affiche une citation au hazard ou pour un auteur donné"""

    global dayly_timer, twenty_timer
    now = datetime.now()

    # Update the daily quotes every day
    if now - dayly_timer >= timedelta(days=1):
        dayly_timer = now
        await get_quotes()

    # One quote allowed every 20 minutes
    if now - twenty_timer < timedelta(seconds=5):
        await thenumberone.send_message(message.channel,
                                        "Je manque d'inspiration...")
        return
    twenty_timer = now

    # If an author is specified, select one randomly from that author
    if author:
        if author.casefold() in "chuck norris":
            async with thenumberone.http.session.get(
                    "http://www.chucknorrisfacts.fr/api/get?data=tri:alea;nb:3"
            ) as resp:
                assert resp.status == 200
                quote = Quote(
                    "Chuck Norris",
                    html_unescape(
                        max(await resp.json(),
                            key=itemgetter("points"))["fact"]))

        else:
            author_quotes = list(
                filter(
                    lambda quote: author.casefold() in quote.author.casefold(),
                    quotes))
            if not author_quotes:
                await thenumberone.send_message(
                    message.channel, "Je ne connais pas cet auteur.")
                return
            quote = choice(author_quotes)

    # If there are dayly_quotes left use one
    elif dayly_quotes:
        quote = dayly_quotes.pop()

    # Otherwise use our quotes
    else:
        quote = quotes.pop()
        quotes.appendleft(quote)
    await thenumberone.send_message(message.channel,
                                    f"{quote.author}: *{quote.text}*")
Example #18
    def val(self):
        '''Unescape HTML entities into corresponding Unicode characters.

        Named (&amp;), decimal (&#38;), and hex (&#x26; and &#x0026;) formats
        are supported. Unknown entities are left intact.

        As of Python 2.7 and Python 3.6 the following 252 named entities are
        recognized and unescaped:

            https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py
            https://github.com/python/cpython/blob/3.6/Lib/html/entities.py
        '''
        return html_unescape(self.raw_val)
Example #19
def extract_next_page_url(archive_page_raw_html, settings):
    """
        extracts and returns the next page URL from the HTML code if there is one...
    """
    next_page_url_format_re = settings['NEXT_PAGE_URL_FORMAT_RE']
    code_line = next_page_url_format_re.search(archive_page_raw_html)
    if code_line is not None:
        code_line = code_line.group(0)
        code_line = settings['BEFORE_NEXT_PAGE_URL_RE'].sub(
            settings['before_next_page_url_repl'], code_line)
        code_line = settings['AFTER_NEXT_PAGE_URL_RE'].sub(
            settings['after_next_page_url_repl'], code_line)
        code_line = html_unescape(code_line)
    return code_line
Example #20
def extract_article_urls_from_page(archive_page_raw_html, settings):
    """
        extracts and returns the set of URLs belonging to articles from the HTML code
    """
    urls = set()
    article_url_format_re = settings['ARTICLE_URL_FORMAT_RE']
    for code_line in article_url_format_re.findall(archive_page_raw_html):
        code_line = settings['BEFORE_ARTICLE_URL_RE'].sub(
            settings['before_article_url_repl'], code_line)
        code_line = settings['AFTER_ARTICLE_URL_RE'].sub(
            settings['after_article_url_repl'], code_line)
        code_line = html_unescape(code_line)
        urls.add(code_line)
    return urls
Example #21
async def get_trivias():
    if TRIVIA_REQUEST_LOCK.locked():
        await TRIVIA_REQUEST_LOCK
        return

    async with TRIVIA_REQUEST_LOCK:
        async with SLASH_CLIENT.http.get(TRIVIA_URL,
                                         params={
                                             'amount': 100,
                                             'category': 31
                                         }) as response:
            json = await response.json()

        for trivia_data in json['results']:
            trivia = (
                html_unescape(trivia_data['question']),
                html_unescape(trivia_data['correct_answer']),
                [
                    html_unescape(element)
                    for element in trivia_data['incorrect_answers']
                ],
            )

            TRIVIA_QUEUE.append(trivia)
Example #22
 def get_title(self):
     if self.title is None:
         if not self.html_text:
             self.html_text = self._res_text(self.url)
         _og_title_re = re.compile(r'<meta\s*property="og:title"\s*content="(?P<title>[^<>]+)"\s*/?>')
         _title_re = re.compile(r'<title[^<>]*>(?P<title>[^<>]+)</title>')
         m = _og_title_re.search(self.html_text) or _title_re.search(self.html_text)
         if m:
             self.title = re.sub(r'[\s]+', ' ', m.group('title'))
             self.title = re.sub(r'^\s*|\s*$', '', self.title)
             self.title = html_unescape(self.title)
         if self.title is None:
             # fallback if there is no <title>
             self.title = self.url
     return self.title
Example #23
    def _get_streams(self):
        """
        Find the streams for vk.com
        :return:
        """
        self.session.http.headers.update({'User-Agent': useragents.IPHONE_6})

        # If this is a 'videos' catalog URL
        # with a video ID in the GET request, get that instead
        url = self.follow_vk_redirect(self.url)

        m = self._url_re.match(url)
        if not m:
            log.error('URL is not compatible: {0}'.format(url))
            return

        video_id = m.group('video_id')
        log.debug('video ID: {0}'.format(video_id))

        params = {
            'act': 'show_inline',
            'al': '1',
            'video': video_id,
        }
        res = self.session.http.post(self.API_URL, params=params)

        for _i in itertags(res.text, 'iframe'):
            if _i.attributes.get('src'):
                iframe_url = update_scheme(self.url, _i.attributes['src'])
                log.debug('Found iframe: {0}'.format(iframe_url))
                yield from self.session.streams(iframe_url).items()

        for _i in itertags(res.text.replace('\\', ''), 'source'):
            if _i.attributes.get('type') == 'application/vnd.apple.mpegurl':
                video_url = html_unescape(_i.attributes['src'])
                streams = HLSStream.parse_variant_playlist(
                    self.session, video_url)
                if not streams:
                    yield 'live', HLSStream(self.session, video_url)
                else:
                    yield from streams.items()
            elif _i.attributes.get('type') == 'video/mp4':
                q = 'vod'
                video_url = _i.attributes['src']
                m = self._vod_quality_re.search(video_url)
                if m:
                    q = '{0}p'.format(m.group(1))
                yield q, HTTPStream(self.session, video_url)
Example #24
def url2dict(url: str) -> Dict[str, Any]:
    """Get url and return the result as a dictionary."""
    d = defaultdict(lambda: None)
    # Creating a thread to request homepage title in background
    home_title_list = []  # A mutable variable used to get the thread result
    home_title_thread = Thread(target=get_home_title,
                               args=(url, home_title_list))
    home_title_thread.start()

    html = get_html(url)
    d['url'] = find_url(html, url)
    m = TITLE_TAG(html)
    html_title = html_unescape(m['result']) if m else None
    if html_title:
        d['html_title'] = html_title
    # d['html_title'] is used in waybackmechine.py.
    authors = find_authors(html)
    if authors:
        d['authors'] = authors
    d['issn'] = find_issn(html)
    d['pmid'] = find_pmid(html)
    d['doi'] = find_doi(html)
    d['volume'] = find_volume(html)
    d['issue'] = find_issue(html)
    d['page'] = find_pages(html)
    d['journal'] = find_journal(html)
    if d['journal']:
        d['cite_type'] = 'journal'
    else:
        d['cite_type'] = 'web'
        d['website'] = find_site_name(html, html_title, url, authors,
                                      home_title_list, home_title_thread)
    d['title'] = find_title(html, html_title, url, authors, home_title_list,
                            home_title_thread)
    date = find_date(html, url)
    if date:
        d['date'] = date
        d['year'] = str(date.year)

    lang_match = LANG_SEARCH(html)
    if lang_match is not None:
        d['language'] = lang_match[1]
    else:
        d['language'] = classify(html)[0]

    return d
Example #25
    def test_string_markdown_link(self):
        # markdown2 and markdown escape the email address
        try:
            from html import unescape as html_unescape
        except ImportError:
            from HTMLParser import HTMLParser
            html_unescape = HTMLParser().unescape

        p = StringHTMLProperty(self.client, 'test', '1', None, 'test',
                               u2s(u'A link <*****@*****.**>'))
        m = html_unescape(p.markdown().strip())
        m = self.mangleMarkdown2(m)

        self.assertEqual(
            m,
            u2s(u'<p>A link <a href="mailto:[email protected]">[email protected]</a></p>'
                ))
Example #26
    def _parse_streams(self, res):
        _found_stream_url = False
        for meta in itertags(res.text, "meta"):
            if meta.attributes.get("property") == "og:video:url":
                stream_url = html_unescape(meta.attributes.get("content"))
                if ".mpd" in stream_url:
                    for s in DASHStream.parse_manifest(self.session,
                                                       stream_url).items():
                        yield s
                        _found_stream_url = True
                elif ".mp4" in stream_url:
                    yield "vod", HTTPStream(self.session, stream_url)
                    _found_stream_url = True
                break
        else:
            log.debug("No meta og:video:url")

        if _found_stream_url:
            return

        for match in self._src_re.finditer(res.text):
            stream_url = match.group("url")
            if "\\/" in stream_url:
                # if the URL is json encoded, decode it
                stream_url = parse_json("\"{}\"".format(stream_url))
            if ".mpd" in stream_url:
                yield from DASHStream.parse_manifest(self.session,
                                                     stream_url).items()
            elif ".mp4" in stream_url:
                yield match.group(1), HTTPStream(self.session, stream_url)
            else:
                log.debug("Non-dash/mp4 stream: {0}".format(stream_url))

        match = self._dash_manifest_re.search(res.text)
        if match:
            # facebook replaces "<" characters with the substring "\\x3C"
            manifest = match.group("manifest").replace("\\/", "/")
            manifest = bytes(unquote_plus(manifest),
                             "utf-8").decode("unicode_escape")
            # Ignore unsupported manifests until DASH SegmentBase support is implemented
            if "SegmentBase" in manifest:
                log.error("Skipped DASH manifest with SegmentBase streams")
            else:
                yield from DASHStream.parse_manifest(self.session,
                                                     manifest).items()
Example #27
 def parse(self, tree):
     xpath = '//h1[contains(@class, "ArticleTitle")]/text()'
     self.update_item('title', tree.xpath(xpath))
     xpath = '//*[@class="authors__list"]//*[@class="authors__name"]/text()'
     authors = tree.xpath(xpath)
     authors = [html_unescape(author) for author in authors]
     self.update_item('authors', authors)
     xpath = '//*[contains(@class, "KeywordGroup")]//*[contains(@class, "Keyword")]/text()'
     keywords = tree.xpath(xpath)
     self.update_item('keywords', keywords)
     xpath = '//*[@id="article-actions"]//*[contains(@class, "download-article")]/a[1]/@href'
     download_link = tree.xpath(xpath)
     download_link = self.urljoin(download_link[0]) if download_link else ''
     self.update_item('download_link', download_link)
     xpath = '//*[@id="doi-url"]/text()'
     doi_link = tree.xpath(xpath)
     doi_link = doi_link[0] if doi_link else ''
     self.update_item('doi_link', doi_link)
Example #28
def find_title(
    html: str,
    html_title: str,
    url: str,
    authors: List[Tuple[str, str]],
    home_title: List[str],
    thread: Thread,
) -> Optional[str]:
    """Return (title_string, where_info)."""
    m = TITLE_SEARCH(html)
    if m:
        return parse_title(
            html_unescape(m['result']), url, authors, home_title, thread,
        )[1]
    elif html_title:
        return parse_title(html_title, url, authors, home_title, thread)[1]
    else:
        return None
Example #29
def stackexchange(auth_token):
    user = requests.get(
        'https://api.stackexchange.com/2.2/me',
        params={
            'site': 'codegolf',
            'filter':
            '!)RwcIFN1JaCrhVpgyYeR_oO*',  # Constant obtained from SE API explorer
            'access_token': auth_token,
            'key': auth.get('stackexchange.com').get('key')
        }).json().get('items')[0]

    display_name = html_unescape(user.get('display_name'))

    return user.get('user_id'), {
        'name': display_name,
        'avatar': user.get('profile_image'),
        'email': None,
        'identifier': display_name
    }
Example #30
    def _get_streams(self):
        p = urlparse(self.url)
        if "ott.streann.com" != p.netloc:
            self._domain = p.netloc
            res = self.session.http.get(self.url)
            for iframe in itertags(res.text, "iframe"):
                iframe_url = html_unescape(iframe.attributes.get("src"))
                if "ott.streann.com" in iframe_url:
                    self.url = iframe_url
                    break
            else:
                log.error("Could not find 'ott.streann.com' iframe")
                return

        if not self._domain and self.get_option("url"):
            self._domain = urlparse(self.get_option("url")).netloc

        if self._domain is None:
            log.error("Missing source URL use --streann-url")
            return

        self.session.http.headers.update({"Referer": self.url})
        # Get the query string
        encrypted_data = urlparse(self.url).query
        data = base64.b64decode(encrypted_data)
        # and decrypt it
        passphrase = self.passphrase()
        if passphrase:
            log.debug("Found passphrase")
            params = decrypt_openssl(data, passphrase)
            config = parse_qsd(params.decode("utf8"))
            log.trace(f"config: {config!r}")
            token = self.get_token(**config)
            if not token:
                return
            hls_url = self.stream_url.format(time=self.time,
                                             deviceId=self.device_id,
                                             token=token,
                                             **config)
            log.debug("URL={0}".format(hls_url))
            return HLSStream.parse_variant_playlist(
                self.session, hls_url, acceptable_status=(200, 403, 404, 500))
Example #31
def url2dict(url: str) -> Dict[str, Any]:
    """Get url and return the result as a dictionary."""
    d = defaultdict(lambda: None)
    # Creating a thread to request homepage title in background
    home_title_list = []  # A mutable variable used to get the thread result
    home_title_thread = Thread(
        target=get_home_title, args=(url, home_title_list))
    home_title_thread.start()

    html = get_html(url)
    d['url'] = find_url(html, url)
    m = TITLE_TAG(html)
    html_title = html_unescape(m['result']) if m else None
    if html_title:
        d['html_title'] = html_title
    # d['html_title'] is used in waybackmechine.py.
    authors = find_authors(html)
    if authors:
        d['authors'] = authors
    d['issn'] = find_issn(html)
    d['pmid'] = find_pmid(html)
    d['doi'] = find_doi(html)
    d['volume'] = find_volume(html)
    d['issue'] = find_issue(html)
    d['page'] = find_pages(html)
    d['journal'] = find_journal(html)
    if d['journal']:
        d['cite_type'] = 'journal'
    else:
        d['cite_type'] = 'web'
        d['website'] = find_site_name(
            html, html_title, url, authors, home_title_list, home_title_thread)
    d['title'] = find_title(
        html, html_title, url, authors, home_title_list, home_title_thread)
    date = find_date(html, url)
    if date:
        d['date'] = date
        d['year'] = str(date.year)
    d['language'] = classify(html)[0]
    return d
Example #32
    def _get_streams(self):
        if "cdn.bg" in urlparse(self.url).netloc:
            iframe_url = self.url
            h = self.session.get_option("http-headers")
            if h and h.get("Referer"):
                _referer = h.get("Referer")
            else:
                log.error(
                    "Missing Referer for iframe URL, use --http-header \"Referer=URL\" "
                )
                return
        else:
            _referer = self.url
            res = self.session.http.get(self.url)
            m = self._re_frame.search(res.text)
            if m:
                iframe_url = m.group(1)
            else:
                for iframe in itertags(res.text, "iframe"):
                    iframe_url = iframe.attributes.get("src")
                    if iframe_url and "cdn.bg" in iframe_url:
                        iframe_url = update_scheme("https://",
                                                   html_unescape(iframe_url),
                                                   force=False)
                        break
                else:
                    return
        log.debug(f"Found iframe: {iframe_url}")

        res = self.session.http.get(iframe_url, headers={"Referer": _referer})
        stream_url = self.stream_schema.validate(res.text)
        if "geoblock" in stream_url:
            log.error("Geo-restricted content")
            return

        return HLSStream.parse_variant_playlist(
            self.session,
            update_scheme(iframe_url, stream_url),
            headers={"Referer": "https://i.cdn.bg/"},
        )
Example #33
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Get homepage of the url and return it's title.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with request(
        home_url, spoof=True, stream=True
    ) as r:
        try:
            check_response_headers(r)
        except (
            RequestException, StatusCodeError,
            ContentTypeError, ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
    m = CHARSET(content)
    html = content.decode(m[1].decode() if m else r.encoding)
    m = TITLE_TAG(html)
    title = html_unescape(m['result']) if m else None
    home_title_list.append(title)
Example #34
def render_template(template, **kw):
    return html_unescape(jin_env.get_template(template).render(**kw))
Example #35
def unescape(string):
    if string is None:
        return None
    return html_unescape(string)