def main(articles_dir, out_dir):
    # Render every downloaded article in articles_dir into a standalone HTML
    # file in out_dir, skipping articles that have already been rendered.
    articles = [x.strip() for x in os.listdir(articles_dir)]
    for name in articles:
        fn = os.path.join(articles_dir, layout.safe_fn(name.strip()))
        ofn = os.path.join(out_dir, layout.safe_fn(name.strip()))
        if not os.path.exists(ofn):
            print fn
            tree = parse_article(fn, articles)
            open(ofn, 'w').write('<!DOCTYPE html>\n' + etree.tostring(tree))
    return tree  # the most recently parsed tree
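# All three entry points rely on layout.safe_fn() to turn an article title into
# a filename, but the layout module itself is not shown here. A deliberately
# crude, idempotent sketch of what such a helper could look like (the real one
# may encode titles differently):
import re

def safe_fn(name):
    # Map anything outside a conservative whitelist to '_' so the title can be
    # used directly as a filename; lossy, but safe on common filesystems.
    return re.sub(r'[^A-Za-z0-9._-]', '_', name)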
def main(articles_fn, out_dir):
    # Queue the mobile Wikipedia page for every article listed in articles_fn,
    # then fetch them all, skipping pages that are already on disk.
    bot = fetcher.Fetcher(report_cb=report_cb)
    articles = open(articles_fn).readlines()
    for i, name in enumerate(articles):
        name = name.strip().decode('utf8')
        fn = os.path.join(out_dir, layout.safe_fn(name))
        if os.path.exists(fn):
            continue
        url = 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(name.encode('utf8'), safe='')
        bot.add(url, fn)
    bot.run()
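# The fetcher module is not shown. From its use above it needs to accept a
# progress callback, collect (url, filename) pairs via add(), and download
# everything on run(). A minimal sequential stand-in under those assumptions
# (the real Fetcher may be concurrent and handle retries; the report_cb
# signature used here is a guess):
import urllib2

class Fetcher(object):
    def __init__(self, report_cb=None):
        self.report_cb = report_cb
        self.queue = []  # (url, output filename) pairs

    def add(self, url, fn):
        self.queue.append((url, fn))

    def run(self):
        for url, fn in self.queue:
            data = urllib2.urlopen(url).read()
            open(fn, 'w').write(data)
            if self.report_cb:
                self.report_cb(url, fn)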
def main(articles_fn, articles_dir, index_dir):
    # Build a Whoosh full-text index over the downloaded articles.
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    articles = open(articles_fn).readlines()
    for i, name in enumerate(articles):
        name = name.strip().decode('utf8')
        fn = os.path.join(articles_dir, layout.safe_fn(name))
        if not os.path.exists(fn):
            print 'not there', fn
            continue
        else:
            print 'indexing', fn
        text = get_text(fn)
        # print text[:400]
        writer.add_document(title=name, path=fn, content=text)
    writer.commit(optimize=True)  # optimize did not result in any size improvements
    print '%d docs in index' % ix.doc_count()
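# The module-level schema (and the get_text helper) used above are not shown.
# The add_document() call implies fields named title, path and content; a
# plausible Whoosh schema along those lines is sketched below (the actual
# definition may store different fields or use a custom analyzer). create_in
# comes from whoosh.index.
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(
    title=TEXT(stored=True),            # article name, stored so hits can display it
    path=ID(stored=True, unique=True),  # filename of the downloaded article
    content=TEXT,                       # full article text, indexed but not stored
)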