Example #1
def main():
    args = ARGS.parse_args()
    if '://' not in args.url:
        args.url = 'http://' + args.url

    loop = asyncio.get_event_loop()
    host = urlparse(args.url).netloc
    crawler = crawling.Crawler(args.url,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               host=host)

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        import sys
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
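
Example #1 normalizes the command-line URL by prepending a scheme before deriving the host with urlparse. A minimal sketch of that step in isolation; the helper name normalize_url is hypothetical (later examples rely on a fix_url helper whose implementation is not shown here):

from urllib.parse import urlparse

def normalize_url(url):
    # Hypothetical helper mirroring Example #1's inline check: add a default
    # scheme when none is present, then derive the host for same-host crawling.
    if '://' not in url:
        url = 'http://' + url
    return url, urlparse(url).netloc

print(normalize_url('xkcd.com'))  # ('http://xkcd.com', 'xkcd.com')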
Example #2
def main():
  """ Main program.

  Parse arguments, run crawler, print report
  """

  args = ARGS.parse_args()
  if not args.b and not args.f:
    print "Use --help for command line help"
    return

  if args.b:
    bugs = {fix_url(bug) for bug in args.b}
  else:
    bugs = [fix_url(bug) for bug in read(args.f[0])]

  try:
    output = []
    start_time = time.time()

    for bug in bugs:
      result = crawler.download(bug)
      output.append(result)

    total_time = round(time.time() - start_time, 2)
    print "It took %s seconds to download %s bug reports!" % (total_time, len(bugs))

    report(output)
  except KeyboardInterrupt:
    print "Interupted!"
  except crawler.BugNotFound, e:
    print "An error occurred while crawling bug: " + bug
    print e.message
Example #3
def create_spider(raw_url):

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]

    loop = asyncio.get_event_loop()

    crawler = crawling.Crawler(raw_url,
        exclude=None,
        strict=True,
        max_redirect=10,
        max_tries=4,
        max_tasks=10,
    )

    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.

    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
        return False

    finally:
        reporting.report(crawler)
        crawler.close()

    return True
Example #4
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop

        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    if not args.roots:
        r = redis.StrictRedis(host="localhost", port=6379, db=0)
        data = [r.blpop("queue:urls_to_crawl")]
        # data.append(r.blpop('queue:urls_to_crawl'))
        # data.append(r.blpop('queue:urls_to_crawl'))
        roots, scrape_data = init_data(data)
        s = None  # Scraper(scrape_data)
    else:
        roots = {fix_url(root) for root in args.roots}
        s = None

    crawler = crawling.Crawler(
        roots,
        scraper=s,
        data_handler=None,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print("\nInterrupted\n")
    finally:
        reporting.report(crawler)  ########## REPORTING
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
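
Several of these examples (#4, #5, #8, #9, #13, #14, #15) end with the same three-step shutdown that their comments describe as required for actual aiohttp resource cleanup. A self-contained sketch of just that idiom, with a trivial coroutine standing in for crawler.crawl():

import asyncio

async def fake_crawl():
    await asyncio.sleep(0)  # stand-in for the real crawl coroutine

loop = asyncio.new_event_loop()
try:
    loop.run_until_complete(fake_crawl())
finally:
    # Stopping and then running the loop once more gives already-scheduled
    # callbacks (for example, connector cleanup) a chance to run before close().
    loop.stop()
    loop.run_forever()
    loop.close()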
Example #5
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    if not args.roots:
        r = redis.StrictRedis(host='localhost', port=6379, db=0)
        data = [r.blpop('queue:urls_to_crawl')]
        # data.append(r.blpop('queue:urls_to_crawl'))
        # data.append(r.blpop('queue:urls_to_crawl'))
        roots, scrape_data = init_data(data)
        s = None  #Scraper(scrape_data)
    else:
        roots = {fix_url(root) for root in args.roots}
        s = None

    crawler = crawling.Crawler(roots,
                               scraper=s,
                               data_handler=None,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks)
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)  ########## REPORTING
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #6
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        proxy=args.proxy,
        loop=loop,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
        with open("{}.log".format(now), "w") as f:
            reporting.report(crawler, file=f)

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #7
def main():
    "Parse arguments, set up event loop, run crawler and print a report."

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    if args["--quiet"]:
        logging.basicConfig(level=levels[0])
    else:
        logging.basicConfig(level=levels[int(args["--verbose"])])

    # Not sure how to set --strict to True by default with docopt, so this is
    # where we handle strict vs. lenient.
    if args["--lenient"]:
        args["--strict"] = False
    else:
        args["--strict"] = True

    if args["--iocp"]:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args["--select"]:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    # Set comprehension to avoid redundancy.
    roots = {fix_url(root) for root in args["<root>"]}

    # Instantiating the crawler with our arguments.
    crawler = crawling.Crawler(roots,
                               exclude=args["--exclude"],
                               strict=args["--strict"],
                               max_redirect=int(args["--max-redirect"]),
                               max_tries=int(args["--max-tries"]),
                               max_tasks=int(args["--max-tasks"]),
                               max_pool=int(args["--max-pool"])
                               )

    # "And this is where the magic happens."
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()
        loop.close()
Example #8
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO,
              logging.DEBUG]  # ERROR=0, WARN=1; a lower index means a more severe level
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # The branches below choose the event loop implementation (IOCP, select, etc.);
    # the differences are in low-level socket handling and are not covered here.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:  # less efficient
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()  # default event loop

    roots = {fix_url(root) for root in args.roots}  # args.roots is a list

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()  # flush buffered stderr output
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)  # print the crawl results, or write them to a file
        crawler.close()  # aiohttp loop close

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()  # clean up process

        loop.close()  # removes the signal handlers
Example #9
File: crawl.py  Project: chtcvl/crawler
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    # if not args.roots:
    #     print('Use --help for command line help')
    #     return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}
    roots = {'https://sauna.ru/'}
    crawler = crawling.Crawler(
        roots,
        exclude=r'\/(review|news|addFavorite|panorama|comment)',
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        scrape_nonhtml=False,
    )
    try:
        loop.run_until_complete(crawler.dbconnect())
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #10
def main():
    loop = asyncio.get_event_loop()
    roots = ("http://doc.1.com/platform/realname/", )
    crawler = Crawler(roots)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print("\nInterrupted\n")
    finally:
        with open("report.txt", "w+") as f:
            report(crawler, file=f)
        crawler.close()
        loop.stop()
        loop.run_forever()
        loop.close()
Example #11
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(filename='crawl.log', level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(roots,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #12
File: cratchit.py  Project: w8s/cratchit
def generate_reports():
    print '\nGenerating reports for %d team members.' % len(s['members'])

    members = s['members']

    for member in members:
        print "\nReport for %s:" % member
        # get case resolves

        projects, project_cases = reporting.get_resolved_cases(s['config'], member, today, lastweek)

        resolves = len(project_cases)

        print 'Resolves: %d' % resolves

        activity_projects, activity_cases = reporting.get_case_activity(s['config'], member, today, lastweek)

        activity = len(activity_cases)

        print 'Activity: %d' % activity

        member_repo_list, changeset_list = reporting.get_commits(s['config'], member, today, lastweek)

        commits = 0
        for item in changeset_list:
            for key, value in item.iteritems():
                commits += len(value)

        print 'Commits: %d' % commits

        member.add_overview_data({'date' : datetime.now(),
                                  'resolves' : resolves,
                                  'activity' : activity,
                                  'commits'  : commits})

        save_dir = os.path.join(s['config']['home'], "reports", member.username)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        graph_file = graphing.graph(member, save_dir)

        reporting.report(member, save_dir, graph_file, projects, project_cases, activity_projects, activity_cases, member_repo_list, changeset_list)

    s['members'] = members
Example #13
def main():
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())

        loop.stop()
        loop.run_forever()

        loop.close()
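
Unlike most of the examples, #1 and #13 pass crawler.close() to run_until_complete, which implies close() is a coroutine rather than a plain method. A minimal sketch under that assumption, using a hypothetical stand-in class:

import asyncio

class ClosableCrawler:
    # Hypothetical stand-in; a real crawler would hold an aiohttp ClientSession
    # and await session.close() here.
    async def close(self):
        await asyncio.sleep(0)

loop = asyncio.new_event_loop()
loop.run_until_complete(ClosableCrawler().close())
loop.close()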
Example #14
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    global config
    global headers

    config = configparser.ConfigParser()
    config.read('client_app.ini')

    headers = {"User-Agent": config['client']['user-agent']}

    # @todo: figure out what to do with these. Currently just for creating the auth URL
    scopes = [
        'publicData', 'characterContactsRead', 'characterFittingsRead',
        'characterLocationRead'
    ]

    if args.auth:
        id = bytes("{}:{}".format(config['client']['Key'],
                                  config['client']['secret']),
                   encoding="utf-8")
        headers.update({
            "Authorization": b"Basic " + base64.b64encode(id),
            "Content-Type": "application/x-www-form-urlencoded"
        })

        if config['client'].get('refresh', None) and not args.invalid:
            print("Using Refresh token to login")
            # do requests here to get auth/refresh code and stick them in config (save maybe?)
            r = requests.post(
                'https://login.eveonline.com/oauth/token',
                data="grant_type=refresh_token&refresh_token={}".format(
                    config['client']['refresh']),
                headers=headers).json()
            headers.update(
                {"Authorization": "Bearer {}".format(r['access_token'])})
        else:

            def handleLogin(httpd, parts):
                # do requests here to get auth/refresh code and stick them in config (save maybe?)
                r = requests.post(
                    'https://login.eveonline.com/oauth/token',
                    data="grant_type=authorization_code&code={}".format(
                        parts['code'][0]),
                    headers=headers).json()

                config["client"]["refresh"] = r['refresh_token']
                with open('client_app.ini', 'w') as configfile:
                    config.write(configfile)

                headers.update(
                    {"Authorization": "Bearer {}".format(r['access_token'])})
                httpd.stop()

            httpd = StoppableHTTPServer(('', 6789), AuthHandler)
            url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format(
                "+".join(scopes), config['client']['key'])
            print("Please go here to authenticate: \n {}".format(url))
            httpd.serve(handleLogin)

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        headers=headers,
        follow_pages=args.follow_pages,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #15
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    global config
    global headers

    config = configparser.ConfigParser()
    config.read('client_app.ini')

    headers = {
        "User-Agent": config['client']['user-agent']
    }

    # @todo: figure out what to do with these. Currently just for creating the auth URL
    scopes = [
        'publicData',
        'characterContactsRead',
        'characterFittingsRead',
        'characterLocationRead'
    ]

    if args.auth:
        id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8")
        headers.update({
            "Authorization": b"Basic " + base64.b64encode(id),
            "Content-Type": "application/x-www-form-urlencoded"
        })

        if config['client'].get('refresh', None) and not args.invalid:
            print("Using Refresh token to login")
            # do requests here to get auth/refresh code and stick them in config (save maybe?)
            r = requests.post('https://login.eveonline.com/oauth/token',
                              data="grant_type=refresh_token&refresh_token={}".format(config['client']['refresh']),
                              headers=headers).json()
            headers.update({"Authorization": "Bearer {}".format(r['access_token'])})
        else:
            def handleLogin(httpd, parts):
                # do requests here to get auth/refresh code and stick them in config (save maybe?)
                r = requests.post('https://login.eveonline.com/oauth/token',
                                  data="grant_type=authorization_code&code={}".format(parts['code'][0]),
                                  headers=headers).json()

                config["client"]["refresh"] = r['refresh_token']
                with open('client_app.ini', 'w') as configfile:
                    config.write(configfile)

                headers.update({"Authorization": "Bearer {}".format(r['access_token'])})
                httpd.stop()

            httpd = StoppableHTTPServer(('', 6789), AuthHandler)
            url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format("+".join(scopes), config['client']['key'])
            print("Please go here to authenticate: \n {}".format(url))
            httpd.serve(handleLogin)

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels)-1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(roots,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               headers=headers,
                               follow_pages=args.follow_pages,
                               )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
Example #16
    def report(self, path, version="unknown"):
        report = reporting.report(self.windSpeedBins, self.turbulenceBins, version)
        report.report(path, self)
Example #17
#!/usr/bin/env python2.7
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
from os import path

ourdir = path.realpath(path.dirname(__file__))
sys.path.insert(0, ourdir)

import reporting

argv = sys.argv[1:]
start, end = reporting.parse(argv)
cmd = ['ledger'] + argv + reporting.list_datfiles(start, end)

reporting.report(cmd)
Example #18
        )
    else:
        logging.basicConfig(
            level=leveldict[loglevel],
            format=logfmt,
            datefmt=datefmt
        )


    storage = FileManager(datadir, False, False)
    gazette_ia = GazetteIA(storage, access_key, secret_key, loglevel, logfile)

    stats        = Stats()
    if relurls:
        for relurl in relurls:
            handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)
    elif from_stdin:
        for line in sys.stdin:
            relurl = line.strip()
            handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)
    else:        
        for relurl in storage.find_matching_relurls(srcnames, start_ts, end_ts):
            handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)



    if to_addrs:
        msg = stats.get_message(srcnames)
        reporting.report(gmail_user, gmail_pwd, to_addrs, \
                        'Stats for gazette on %s' % datetime.date.today(), msg)
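
Example #18 passes Gmail credentials, recipient addresses, a subject line, and a message body to reporting.report, so this variant apparently emails the stats rather than printing them. That implementation is not shown; a rough, hypothetical sketch of such a mailer built on smtplib might look like:

import smtplib
from email.mime.text import MIMEText

def report(gmail_user, gmail_pwd, to_addrs, subject, msg):
    # Hypothetical sketch only; the real reporting.report used in Example #18
    # is not part of this snippet.
    mail = MIMEText(msg)
    mail['Subject'] = subject
    mail['From'] = gmail_user
    mail['To'] = ', '.join(to_addrs)
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(gmail_user, gmail_pwd)
        server.send_message(mail)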