Example #1
import os
from lxml import etree  # assuming lxml; parse_article() is defined elsewhere in the project
import layout  # local helper: safe_fn() maps a name to a filesystem-safe filename


def main(articles_dir, out_dir):
    articles = [x.strip() for x in os.listdir(articles_dir)]
    tree = None  # guard against returning an unbound name when nothing is rendered
    for name in articles:
        fn = os.path.join(articles_dir, layout.safe_fn(name))
        ofn = os.path.join(out_dir, layout.safe_fn(name))
        if not os.path.exists(ofn):  # only render missing output files
            print fn
            tree = parse_article(fn, articles)
            with open(ofn, 'w') as f:
                f.write('<!DOCTYPE html>\n' + etree.tostring(tree))
    return tree
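layout.safe_fn is not shown in these examples. Since example #1 applies it to names that already came from os.listdir(), it has to be idempotent; a minimal sketch under that assumption (the whitelist and replacement character are guesses, not the project's actual implementation):

import re

def safe_fn(name):
    # Hypothetical: collapse anything outside a conservative whitelist so
    # an article title cannot smuggle in path separators; idempotent, since
    # example #1 feeds os.listdir() results back through it.
    return re.sub(r'[^\w.-]+', '_', name, flags=re.UNICODE)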
Example #2
import os
import urllib2

import fetcher  # local module wrapping the download queue; report_cb is defined elsewhere
import layout   # local helper: safe_fn() maps a name to a filesystem-safe filename


def main(articles_fn, out_dir):
    bot = fetcher.Fetcher(report_cb=report_cb)
    articles = open(articles_fn).readlines()
    for name in articles:
        name = name.strip().decode('utf8')
        fn = os.path.join(out_dir, layout.safe_fn(name))
        if os.path.exists(fn):  # already downloaded
            continue
        url = 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(name.encode('utf8'), safe='')
        bot.add(url, fn)
    bot.run()
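fetcher.Fetcher is also defined elsewhere. From the calls above, the interface it must expose is: add() queues a (url, filename) pair and run() drains the queue, reporting progress through report_cb. A sequential stand-in, purely illustrative (the real module presumably downloads concurrently and handles failures):

import urllib2

class Fetcher(object):
    def __init__(self, report_cb=None):
        self.jobs = []          # queued (url, filename) pairs
        self.report_cb = report_cb

    def add(self, url, fn):
        self.jobs.append((url, fn))

    def run(self):
        for url, fn in self.jobs:
            data = urllib2.urlopen(url).read()
            with open(fn, 'wb') as f:
                f.write(data)
            if self.report_cb:
                self.report_cb(url, fn)  # assumed callback signature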
Example #3
import os

from whoosh.index import create_in

import layout  # local helper: safe_fn() maps a name to a filesystem-safe filename


def main(articles_fn, articles_dir, index_dir):
    ix = create_in(index_dir, schema)
    writer = ix.writer()

    articles = open(articles_fn).readlines()
    for name in articles:
        name = name.strip().decode('utf8')
        fn = os.path.join(articles_dir, layout.safe_fn(name))
        if not os.path.exists(fn):
            print 'not there', fn
            continue
        print 'indexing', fn
        text = get_text(fn)  # defined elsewhere; extracts plain text from the saved page
        writer.add_document(title=name, path=fn, content=text)

    writer.commit(optimize=True)  # optimize did not result in any size improvements
    print '%d docs in index' % ix.doc_count()
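The schema passed to create_in() is defined elsewhere. Given the add_document(title=..., path=..., content=...) call, a Whoosh schema along these lines would fit; storing title and path (an assumption) lets search results link back to the article files without re-reading them:

from whoosh.fields import Schema, TEXT, ID

# Hypothetical schema matching the add_document() call above.
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=TEXT)

Leaving content unstored keeps the index small, which matches the size concern in the commit(optimize=True) comment.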