def make_zip_file(output, env, status=None, num_threads=10, imagesize=800):
    """Collect all articles/images of *env* into a ZIP file and return its path.

    The archive is first written to a temporary file in the destination
    directory and only renamed into place once it is complete, so a partially
    written ZIP never appears under the final name.

    :param output: destination path for the ZIP file; if ``None`` a fresh
        temporary ``.zip`` file is created and its path returned
    :param env: environment object providing ``metabook``, ``wiki``,
        ``images`` and ``get_licenses()``
    :param status: optional callback invoked with keyword arguments
        (e.g. ``progress=...``); defaults to a no-op
    :param num_threads: number of fetcher threads; ``0`` disables threading
    :param imagesize: maximum image size passed to the zip creator
    :returns: path of the finished ZIP file
    """
    if status is None:
        # no-op callback so the rest of the code can call it unconditionally
        status = lambda **kwargs: None

    if output is None:
        fd, output = tempfile.mkstemp(suffix='.zip')
        os.close(fd)

    # Build in a sibling temp file and rename at the end so *output* is
    # never observed half-written (rename is atomic on the same filesystem).
    fd, tmpzip = tempfile.mkstemp(suffix='.zip', dir=os.path.dirname(output))
    os.close(fd)
    zf = zipfile.ZipFile(tmpzip, 'w')
    try:
        articles = metabook.get_item_list(env.metabook, filter_type='article')
        if num_threads > 0:
            z = ThreadedZipCreator(zf,
                                   imagesize=imagesize,
                                   num_threads=num_threads,
                                   status=status,
                                   num_articles=len(articles))
        else:
            z = ZipCreator(zf,
                           imagesize=imagesize,
                           status=status,
                           num_articles=len(articles))

        for item in articles:
            # Titles may actually be full article URLs pointing at a foreign
            # wiki; in that case fetch from that wiki instead of env's.
            d = mwapidb.parse_article_url(item['title'].encode('utf-8'))
            if d is not None:
                item['title'] = d['title']
                item['revision'] = d['revision']
                wikidb = mwapidb.WikiDB(api_helper=d['api_helper'])
                imagedb = mwapidb.ImageDB(api_helper=d['api_helper'])
            else:
                wikidb = env.wiki
                imagedb = env.images
            z.addArticle(item['title'],
                         revision=item.get('revision', None),
                         wikidb=wikidb,
                         imagedb=imagedb)

        # 'lic' rather than 'license' to avoid shadowing the builtin
        for lic in env.get_licenses():
            z.parseArticle(title=lic['title'],
                           raw=lic['wikitext'],
                           wikidb=env.wiki,
                           imagedb=env.images)

        z.join()
        z.addObject('metabook.json', json.dumps(env.metabook))
        zf.close()

        if os.path.exists(output):  # Windows: rename fails if target exists
            os.unlink(output)
        os.rename(tmpzip, output)

        if env.images and hasattr(env.images, 'clear'):
            env.images.clear()
        status(progress=100)
        return output
    finally:
        # On any failure the half-built temp archive is removed.
        if os.path.exists(tmpzip):
            utils.safe_unlink(tmpzip)
def make_zip_file(output, env, status=None, num_threads=10, imagesize=800):
    """Write all articles, licenses and images of *env* into a ZIP archive.

    The archive is assembled in a temporary file next to *output* and renamed
    into place only when complete, so callers never see a truncated ZIP.

    :param output: target path; when ``None`` a temporary ``.zip`` is created
    :param env: environment providing ``metabook``, ``wiki``, ``images`` and
        ``get_licenses()``
    :param status: progress callback taking keyword arguments; ``None`` means
        "report nothing"
    :param num_threads: fetcher thread count; ``0`` selects the serial creator
    :param imagesize: maximum image size handed to the zip creator
    :returns: path of the resulting ZIP file
    """
    if status is None:
        status = lambda **kwargs: None

    if output is None:
        fd, output = tempfile.mkstemp(suffix='.zip')
        os.close(fd)

    # Stage into a temp file in the same directory; the final rename is
    # atomic on the same filesystem.
    fd, tmpzip = tempfile.mkstemp(suffix='.zip', dir=os.path.dirname(output))
    os.close(fd)
    zf = zipfile.ZipFile(tmpzip, 'w')
    try:
        articles = metabook.get_item_list(env.metabook, filter_type='article')
        if num_threads > 0:
            z = ThreadedZipCreator(zf,
                                   imagesize=imagesize,
                                   num_threads=num_threads,
                                   status=status,
                                   num_articles=len(articles))
        else:
            z = ZipCreator(zf,
                           imagesize=imagesize,
                           status=status,
                           num_articles=len(articles))

        for item in articles:
            wikidb, imagedb = _resolve_article_dbs(item, env)
            z.addArticle(item['title'],
                         revision=item.get('revision', None),
                         wikidb=wikidb,
                         imagedb=imagedb)

        # 'lic' rather than 'license' to avoid shadowing the builtin
        for lic in env.get_licenses():
            z.parseArticle(title=lic['title'],
                           raw=lic['wikitext'],
                           wikidb=env.wiki,
                           imagedb=env.images)

        z.join()
        z.addObject('metabook.json', json.dumps(env.metabook))
        zf.close()

        if os.path.exists(output):  # Windows: cannot rename onto an existing file
            os.unlink(output)
        os.rename(tmpzip, output)

        if env.images and hasattr(env.images, 'clear'):
            env.images.clear()
        status(progress=100)
        return output
    finally:
        # Clean up the staging file if anything above raised.
        if os.path.exists(tmpzip):
            utils.safe_unlink(tmpzip)


def _resolve_article_dbs(item, env):
    """Return ``(wikidb, imagedb)`` for *item*, mutating it in place.

    If ``item['title']`` is actually an article URL on a foreign wiki, the
    title/revision are rewritten and per-wiki DB objects are created;
    otherwise the environment's default databases are used.
    """
    d = mwapidb.parse_article_url(item['title'].encode('utf-8'))
    if d is not None:
        item['title'] = d['title']
        item['revision'] = d['revision']
        return (mwapidb.WikiDB(api_helper=d['api_helper']),
                mwapidb.ImageDB(api_helper=d['api_helper']))
    return env.wiki, env.images
def make_zip_file(output, env, status=None, num_threads=10, imagesize=800):
    """Build a ZIP archive of *env*'s articles, licenses and node statistics.

    The ZIP is written to a temporary file in the destination directory and
    renamed to *output* only once complete. In addition to the articles a
    ``metabook.json`` and a ``node_stats.json`` object (for later analysis)
    are stored in the archive.

    :param output: destination path; ``None`` creates a temporary ``.zip``
    :param env: environment providing ``metabook``, ``wiki``, ``images`` and
        ``get_licenses()``
    :param status: optional progress callback (keyword arguments); in this
        variant ``None`` is forwarded unchanged to ``ZipCreator``
    :param num_threads: thread count forwarded to ``ZipCreator``
    :param imagesize: maximum image size forwarded to ``ZipCreator``
    :returns: path of the resulting ZIP file
    """
    if output is None:
        fd, output = tempfile.mkstemp(suffix='.zip')
        os.close(fd)

    # Stage into a sibling temp file; the final rename is atomic on the
    # same filesystem, so *output* is never seen half-written.
    fd, tmpzip = tempfile.mkstemp(suffix='.zip', dir=os.path.dirname(output))
    os.close(fd)
    zf = zipfile.ZipFile(tmpzip, 'w')
    try:
        articles = metabook.get_item_list(env.metabook, filter_type='article')
        z = ZipCreator(zf,
                       imagesize=imagesize,
                       num_threads=num_threads,
                       status=status,
                       num_articles=len(articles))

        for item in articles:
            # Titles may be full article URLs on a foreign wiki; if so,
            # rewrite title/revision and fetch via that wiki's API instead.
            d = mwapidb.parse_article_url(item['title'].encode('utf-8'))
            if d is not None:
                item['title'] = d['title']
                item['revision'] = d['revision']
                wikidb = mwapidb.WikiDB(api_helper=d['api_helper'])
                imagedb = mwapidb.ImageDB(api_helper=d['api_helper'])
            else:
                wikidb = env.wiki
                imagedb = env.images
            z.addArticle(item['title'],
                         revision=item.get('revision', None),
                         wikidb=wikidb,
                         imagedb=imagedb)

        # 'lic' rather than 'license' to avoid shadowing the builtin
        for lic in env.get_licenses():
            z.parseArticle(title=lic['title'],
                           raw=lic['wikitext'],
                           wikidb=env.wiki,
                           imagedb=env.images)

        z.join()
        # Deliberately NOT calling z.check(articles): some articles simply
        # cannot be fetched, and PDFs should be generated nevertheless.
        z.addObject('metabook.json', json.dumps(env.metabook))

        # Record chapter count alongside the collected node stats so the
        # archive carries enough data for later analysis.
        z.node_stats["Chapter"] = len(
            metabook.get_item_list(env.metabook, filter_type='chapter'))
        z.addObject('node_stats.json', json.dumps(z.node_stats))
        zf.close()

        if os.path.exists(output):  # Windows: rename fails if target exists
            os.unlink(output)
        os.rename(tmpzip, output)

        if env.images and hasattr(env.images, 'clear'):
            env.images.clear()
        if status is not None:
            status(progress=100)
        return output
    finally:
        # Remove the staging file if anything above raised.
        if os.path.exists(tmpzip):
            utils.safe_unlink(tmpzip)