Exemplo n.º 1
0
        dest='robots',
        default='/lm/data2/scrapers/eng-USA/epg/www.tv.com'
                '/log/robots.zip',
        help='Set the robots for robots.zip file [Default %default]'
    )
    # Output location option; the default and wording come from the help text.
    parser.add_option(
        '--basepath', '-b',
        dest='basepath',
        default='/lm/data2/',
        help='Set the basepath for outputfile location [Default %default]',
    )

    options, args = parser.parse_args()
    log = Logger(options.debug)

    # Scraper instance for gatherproxy.com, identified by the keyword
    # arguments below.
    myScraper = WebScraper(
        scraperType='scrapers',
        topic='urls',
        lang='xxx-XXX',
        name='gatherproxy.com',
        frequency='inc',
    )

    # Pass the robots archive to the scraper only when the option is truthy.
    robots_zip = options.robots
    if robots_zip:
        myScraper.setRobotsTxt(url='http://gatherproxy.com/', zip=robots_zip)

    myScraper.setBasePath(options.basepath)
Exemplo n.º 2
0
        dest='badUrlsFile',
        default='/lm/data2/scrapers/zho-CHN/movies/v.qq.com.movie'
        '/log.inc/v.qq.com.movie.badUrls.lst',
        help=
        'Prints unusable URLs to external file instead of halting the scraper.'
    )

    # '--small' is a boolean switch (store_true) stored as options.run_small;
    # it defaults to False and, per its help text, is meant for debugging.
    parser.add_option(
        '--small',
        action='store_true',
        dest='run_small',
        default=False,
        help='if run spider by small data set, this is for debug.')

    # Parse the command line into option values plus leftover arguments.
    options, args = parser.parse_args()
    # The logger is constructed from the value of the debug option.
    log = Logger(options.debug)
    # Copy the switch into run_small only when it was given.
    # NOTE(review): the enclosing scope is not visible here; unless run_small
    # is declared global or assigned earlier, it is left unset when --small is
    # absent -- confirm against the rest of the function.
    if options.run_small:
        run_small = options.run_small
    if options.html:
        myScraper = HTMLScraper(scraperType=u'scrapers',
                                topic=u'movies',
                                lang=u'zho-CHN',
                                name=u'v.qq.com.movie',
                                frequency=u'versions')
        myScraper.inputDataBall(options.html)
    else:
        myScraper = WebScraper(scraperType=u'scrapers',
                               topic=u'movies',
                               lang=u'zho-CHN',
                               name=u'v.qq.com.movie',