Exemplo n.º 1
0
 def testPullReferences(self):
     # With pull=True, find_references() must hand back the <ref> element,
     # and that element must no longer show up in the rendered output.
     parser = WikiMarkup('Something <ref name="foo">Something where</ref>over')
     pulled = parser.find_references(pull=True)
     self.assertEqual(pulled, ['<ref name="foo">Something where</ref>'])
     html = parser.render()
     self.assertEqual('<p>Something over</p>', html)
Exemplo n.º 2
0
 def testFindReferences(self):
     # Without pull, find_references() only reports the <ref> element; the
     # reference text must still be present in the rendered output.
     markup = 'Something <ref name="foo">Something where</ref>over'
     p = WikiMarkup(markup)
     ref = p.find_references()
     self.assertEqual(ref, ['<ref name="foo">Something where</ref>'])
     got = p.render()
     # assertIn replaces the deprecated assert_ alias and, on failure, shows
     # both the expected substring and the actual rendering.
     self.assertIn("Something where", got)
Exemplo n.º 3
0
def formatblob(text, filename=None, language=None):
    """Render ``text`` for display, picking the renderer from ``filename``.

    Dispatch order:
      * ``filename`` is None        -> ``pygmentize(text, None, language)``
      * name ends in .rs or .txt    -> ``restructure(text)``
      * name ends in .mw            -> WikiMarkup rendering with the link
                                       postfix set to '.mw', decoded from UTF-8
      * anything else               -> ``pygmentize(text, filename, language)``

    :param text: the blob content to render.
    :param filename: name of the file the blob came from, or None if unknown.
    :param language: passed through unchanged to ``pygmentize``.
    :returns: whatever the selected renderer returns.
    """
    # Markdown rendering is currently disabled:
    #if markdown and any(filter(lambda ext: filename.endswith(ext), ['.md', '.mkdown', '.txt'])):
    #    return markdown(text)
    if filename is None:
        # No name to dispatch on. ``text`` is the only content argument this
        # function receives, so it is what must be handed to pygmentize.
        return pygmentize(text, filename, language)

    # str.endswith accepts a tuple of suffixes.
    # NOTE(review): '.rs' may have been meant as '.rst' -- confirm before changing.
    if filename.endswith(('.rs', '.txt')):
        return restructure(text)

    if filename.endswith('.mw'):
        wm = WikiMarkup(text)
        wm.set_link_postfix('.mw')
        rendered = wm.render()
        return rendered.decode('utf-8')

    return pygmentize(text, filename, language)
Exemplo n.º 4
0
Arquivo: parse.py Projeto: natano/misc
def main():
    """Analyse a MediaWiki dump read from stdin and write results to ./out.

    Files written (the directory is created if missing):
      * pages.png       -- running page count plus a histogram of revisions
                           with roughly one bin per month
      * tokens.png      -- number of distinct word stems per year
      * YYYY-YYYY.diff  -- for each pair of consecutive years, the stems that
                           disappeared ('-' lines) and appeared ('+' lines)

    Exits via ``sys.exit`` with a message when the dump yields no revisions.
    """
    now = datetime.now()
    page_dates = []
    revision_dates = []

    # year -> set of stems collected for that year across all pages
    corpora = defaultdict(set)

    stemmer = GermanStemmer()

    for page, revisions in MediawikiDump(sys.stdin).iterpages():
        first = revisions[0]
        page_dates.append(first['timestamp'])

        stems = set()
        for year in xrange(first['timestamp'].year, now.year+1):
            revisions_in_year = [r for r in revisions if r['timestamp'].year == year]
            revision_dates.extend(r['timestamp'] for r in revisions_in_year)
            if revisions_in_year:
                stems = set()
                for revision in revisions_in_year:
                    html = WikiMarkup(revision['text'].encode('utf-8')).render()
                    text = clean_html(html.decode('utf-8'))
                    # TODO: remove remaining markup
                    words = WORD_RE.findall(text)
                    stems.update(stemmer.stem(word) for word in words)
            # Deliberately outside the ``if``: a year without revisions of
            # this page reuses the stems of the most recent year that had some.
            corpora[year].update(stems)

    if not revision_dates:
        # Everything below indexes into revision_dates / corpora; fail with a
        # readable message instead of a bare IndexError.
        sys.exit('no revisions found in input; nothing to plot')

    page_dates.sort()
    revision_dates.sort()

    delta = relativedelta(revision_dates[-1], revision_dates[0])
    # Used as the histogram bin count. Days are ignored, so a span shorter
    # than one month would give 0 bins; clamp to at least one.
    months = max(delta.years * 12 + delta.months, 1)

    outdir = os.path.abspath('./out')
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(page_dates, range(1, len(page_dates)+1), '-')
    ax.hist(date2num(revision_dates), months, histtype='step')
    ax.set_xlabel(u'Year')
    ax.legend([u'Total No. of Pages', u'New Revisions per month'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'pages.png'), format='png')

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    timestamps = [datetime(year, 12, 31) for year in sorted(corpora.keys())]
    counts = [len(corpora[t.year]) for t in timestamps]
    ax.plot_date(timestamps, counts, '-')
    ax.set_xlabel(u'Year')
    ax.legend([u'No. of distinct tokens'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'tokens.png'), format='png')

    # Contiguous run of years from the earliest to the latest one seen.
    # range() excludes its stop value, hence the +1: without it the diff
    # between the last two years would never be written.
    years = list(range(min(corpora), max(corpora) + 1))
    year_pairs = zip(years, years[1:])
    for year1, year2 in year_pairs:
        current = corpora.get(year1, set())
        next_ = corpora.get(year2, set())
        filename = '{:04d}-{:04d}.diff'.format(year1, year2)
        with open(os.path.join(outdir, filename), 'w') as f:
            for token in sorted(current - next_):
                f.write(u'-{}\n'.format(token).encode('utf-8'))
            for token in sorted(next_ - current):
                f.write(u'+{}\n'.format(token).encode('utf-8'))
Exemplo n.º 5
0
 def checkMarkup(self, markup, wanted):
     """Render ``markup`` through WikiMarkup and assert the result equals ``wanted``."""
     rendered = WikiMarkup(markup).render()
     self.assertEqual(rendered, wanted)
Exemplo n.º 6
0
 def testLinkPrefixRendering(self):
     # The configured link prefix must appear in front of the page name in
     # the href of the generated anchor.
     parser = WikiMarkup('foobar [[Woo]]')
     parser.set_link_prefix('http://www.google.com/?q=')
     expected = '<p>foobar <a href="http://www.google.com/?q=Woo">Woo</a></p>'
     self.assertEqual(expected, parser.render())
Exemplo n.º 7
0
 def testLinkPostfixRendering(self):
     # The configured link postfix must appear after the page name in the
     # href of the generated anchor.
     parser = WikiMarkup('foobar [[Woo]]')
     parser.set_link_postfix('.mw')
     expected = '<p>foobar <a href="Woo.mw">Woo</a></p>'
     self.assertEqual(expected, parser.render())
Exemplo n.º 8
0
 def render_mediawiki(content):
     """Render ``content`` with WikiMarkup, link postfix '.mw', and return it decoded from UTF-8."""
     parser = WikiMarkup(content)
     parser.set_link_postfix('.mw')
     return parser.render().decode('utf-8')