Exemplo n.º 1
0
def update_story_has_mathjax(storys=None):
    """Set ``has_mathjax`` on stored stories whose content contains MathJax.

    The ids to visit are resolved from ``storys`` by ``_get_story_ids``.
    Each story is loaded, checked and (when needed) saved inside its own
    transaction; stories without MathJax are left untouched.
    """
    ids = _get_story_ids(storys)
    LOG.info('total %s storys', len(ids))
    progress = tqdm.tqdm(ids, ncols=80, ascii=True)
    for pk in progress:
        with transaction.atomic():
            # Restrict the query to the fields named in only().
            queryset = Story.objects.only('id', 'content', '_version')
            record = queryset.get(pk=pk)
            if not processor.story_has_mathjax(record.content):
                continue
            record.has_mathjax = True
            record.save()
Exemplo n.º 2
0
def test_story_has_mathjax():
    """Check ``story_has_mathjax`` on formula samples and on look-alike text."""
    # Samples that must be detected: `$...$` and backtick-delimited formulas,
    # on their own or embedded in text / HTML.
    positive_samples = (
        r'$x^{y^z}=(1+{\rm e}^x)^{-2xy^w}$',
        r'$f(x,y,z) = 3y^2z \left( 3+\frac{7x+5}{1+y^2} \right)$',
        r'$1 \over 3$',
        r'$\vec{a} \cdot \vec{b}=0$',
        r'<p>这里 $n$ 是特征',
        r'向量 $\vec x$ 的长度,即特征的维数。',
        r'<code>$v_i$</code> 是长度',
        r'为 $k$ 的向量,与特征 id 对应,称为特征的隐向量。',
        r'`sum_(i=1)^n i^3=((n(n+1))/2)^2`',
        r'<code>`sum_(i=1)^n i^3=((n(n+1))/2)^2`</code>',
    )
    # Samples that must NOT be detected: prices, jQuery, shell prompts,
    # formulas broken by a newline or nested `$`, and triple-backtick fences.
    negative_samples = (
        r'$10 aaa $10  $10 aaa $10',
        r'$10 $10  $10 $10',
        r'$10.0',
        r'100$ 100$',
        r'console.log($.fn.jquery); window.$;',
        r'$ === jQuery; typeof($);',
        r"$('p,div'); $('p.red,p.green');",
        r"""
        The model of subscription premium audio content is popular in China,
        where Ximalaya, a unicorn consumer audio platform, has a subscription
        feature for $3 monthly that enables users to access over 4000 e-books
        and over 300 premium audio courses or podcasts. Audio content is also
        available a la carte starting at $0.03 per short, serialized book chapter,
        or anywhere from $10 to $45 for paid audio courses.
        """,
        r"""$ shellcheck test.sh
        In test.sh line 4:
        if[ $# -eq 0 ]""",
        r'$ shellcheck if[ $# -eq 0 ]',
        '$x^\n{y^z}$',
        r'$x^{$y^z}$',
        '`x^\n{y^z}`',
        r'```x^{y^z}```',
    )
    expectations = ((True, positive_samples), (False, negative_samples))
    for expected, samples in expectations:
        for sample in samples:
            detected = bool(story_has_mathjax(sample))
            # The failing sample itself is used as the assertion message.
            assert detected == expected, sample
Exemplo n.º 3
0
def _get_storys(entries: list):
    """Convert feed entries into a list of story dicts.

    At most the first 300 entries are converted. Entries are consumed from
    the end of ``entries``, so the returned list is in reverse order of the
    input and ``entries`` is left empty when the function returns.

    Surplus entries are dropped *before* processing: converting them (HTML
    cleaning, hashing, text extraction) only to discard the result is wasted
    work. The returned stories are identical either way.

    Args:
        entries: list of entry mappings providing the keys ``content``,
            ``description``, ``summary``, ``link``, ``title``, ``id`` and
            ``author``. The list is mutated (emptied).

    Returns:
        A list of dicts with the keys ``has_mathjax``, ``link``, ``content``,
        ``summary``, ``title``, ``content_hash_base64``, ``unique_id``,
        ``author``, ``dt_published`` and ``dt_updated``.
    """
    max_storys = 300  # limit num storys
    # Keep only the entries that can end up in the result.
    del entries[max_storys:]
    storys = []
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may be in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        # MathJax detection runs on the raw content, before HTML cleaning.
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                # An invalid link is not fatal: the story is stored with an empty link.
                LOG.warning('invalid story link %r', link)
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        # Fall back to the processed content when the entry has no summary.
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return storys
Exemplo n.º 4
0
Arquivo: rss.py Projeto: XZYCR7/rssant
def _get_storys(entries: list):
    """Convert feed entries into a list of story dicts.

    At most the first 300 entries are converted. Entries are consumed from
    the end of ``entries``, so the returned list is in reverse order of the
    input and ``entries`` is left empty when the function returns.

    Surplus entries are dropped *before* processing: converting them (HTML
    cleaning, hashing, text extraction) only to discard the result is wasted
    work. The returned stories are identical either way.

    Args:
        entries: list of entry mappings providing the keys ``content``,
            ``description``, ``summary``, ``link``, ``title`` and ``author``
            (plus whatever ``_get_story_unique_id`` and the date helpers
            read). The list is mutated (emptied).

    Returns:
        A list of dicts with the keys ``unique_id``, ``has_mathjax``,
        ``content``, ``summary``, ``link``, ``title``,
        ``content_hash_base64``, ``author``, ``dt_published`` and
        ``dt_updated``.
    """
    max_storys = 300  # limit num storys
    # Keep only the entries that can end up in the result.
    del entries[max_storys:]
    storys = []
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may be in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        # MathJax detection runs on the raw content, before HTML cleaning.
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        # Fall back to the processed content when the entry has no summary.
        summary = data["summary"]
        if not summary:
            summary = content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return storys