Example #1
0
def cli(args, dbm):
    """Stream the CSV resources of one or all portals for the current snapshot.

    :param args: parsed CLI namespace; uses ``config``, ``dir``,
        ``portalid`` and ``processors``.
    :param dbm: database manager used to construct the :class:`DBClient`.
    """
    sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    rdf_dir = args.dir

    # Ensure the <dir>/<snapshot>/ output hierarchy exists.
    if not os.path.exists(rdf_dir):
        os.mkdir(rdf_dir)
    sn_dir = os.path.join(rdf_dir, str(sn))
    if not os.path.exists(sn_dir):
        os.mkdir(sn_dir)

    tasks = []
    if args.portalid:
        # BUGFIX: Query.one() raises NoResultFound for an unknown id instead
        # of returning None, which made the "PORTAL NOT IN DB" branch below
        # unreachable; Query.first() returns None as the check expects.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, sn_dir))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, sn_dir))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    # Fan the per-portal work out over a process pool.
    pool = Pool(args.processors)
    for x in pool.imap(streamCSVs, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #2
0
def cli(args, dbm):
    """Compute the change/freshness history for one or all portals.

    :param args: parsed CLI namespace; uses ``snapshot``, ``config``,
        ``portalid`` and ``processors``.
    :param dbm: database manager used to construct the :class:`DBClient`.
    """
    # Allow an explicit snapshot override, otherwise use the current one.
    if args.snapshot:
        sn = args.snapshot
    else:
        sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    tasks = []
    if args.portalid:
        # BUGFIX: Query.one() raises NoResultFound for an unknown id instead
        # of returning None, which made the "PORTAL NOT IN DB" branch below
        # unreachable; Query.first() returns None as the check expects.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn))

    log.info("START FRESHNESS", processors=args.processors, dbConf=dbConf, portals=len(tasks))

    # Fan the per-portal work out over a process pool.
    pool = Pool(args.processors)
    for x in pool.imap(change_history, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #3
0
def cli(args, dbm):
    """Generate schema.org files for one or all portals and build the
    top-level portal sitemap index.

    :param args: parsed CLI namespace; uses ``config``, ``sn``,
        ``directory``, ``portalid`` and ``processors``.
    :param dbm: database manager used to construct the :class:`DBClient`.
    """
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    # Allow an explicit snapshot override, otherwise use the current one.
    if not args.sn:
        sn = getCurrentSnapshot()
    else:
        sn = args.sn

    directory = args.directory

    tasks = []
    if args.portalid:
        # BUGFIX: Query.one() raises NoResultFound for an unknown id instead
        # of returning None, which made the "PORTAL NOT IN DB" branch below
        # unreachable; Query.first() returns None as the check expects.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, directory))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, directory))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    # Fan the per-portal work out over a process pool and collect
    # (portalid, lastmod) pairs for the sitemap index.
    portals = []
    pool = Pool(args.processors)
    for x in pool.imap(generate_schemadotorg_files, tasks):
        pid, lastmod, sn = x[0].id, x[1], x[2]
        portals.append((pid, lastmod))
        log.info("RECEIVED RESULT", portalid=pid)

    create_portal_sitemapindex(portals, directory)
Example #4
0
def cli(args, dbm):
    """Aggregate the format distribution for the current snapshot.

    :param args: parsed CLI namespace; uses ``config``.
    :param dbm: database manager used to construct the :class:`DBClient`.
    """
    snapshot = getCurrentSnapshot()
    # Read the DB config for parity with the other CLI entry points
    # (the value itself is not used in this command).
    dbConf = readDBConfFromFile(args.config)
    client = DBClient(dbm)
    aggregateFormatDist(client, snapshot)
Example #5
0
def cli(args, dbm):
    """Fetch portal metadata over HTTP for one or all portals.

    With ``--repair`` only portals whose current snapshot did not complete
    with status 200 are re-fetched; their stale snapshot and quality rows
    are deleted first so the fetch starts clean.

    :param args: parsed CLI namespace; uses ``config``, ``portalid``,
        ``repair`` and ``processors``.
    :param dbm: database manager used to construct the :class:`DBClient`.
    """
    sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    # Optional local git data directory taken from the YAML config.
    store_local = None
    if args.config:
        with open(args.config) as f:
            # NOTE(review): yaml.load without an explicit Loader can execute
            # arbitrary constructors; the config file is operator-supplied,
            # but consider yaml.safe_load here.
            config = yaml.load(f)
            if 'git' in config and 'datadir' in config['git']:
                store_local = config['git']['datadir']

    tasks = []
    if args.portalid:
        # BUGFIX: Query.one() raises NoResultFound for an unknown id instead
        # of returning None, which made the "PORTAL NOT IN DB" branch below
        # unreachable; Query.first() returns None as the check expects.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, store_local))
    else:
        if args.repair:
            # Portals that already completed (status 200) for this snapshot.
            valid = db.Session.query(PortalSnapshot.portalid).filter(
                PortalSnapshot.snapshot == sn).filter(
                    PortalSnapshot.status == 200).subquery()

            for P in db.Session.query(Portal).filter(Portal.id.notin_(valid)):
                # Drop the incomplete snapshot and its quality rows before
                # scheduling the portal for a fresh fetch.
                PS = db.Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == sn).filter(
                        PortalSnapshot.portalid == P.id)
                PS.delete(synchronize_session=False)
                PSQ = db.Session.query(PortalSnapshotQuality).filter(
                    PortalSnapshotQuality.snapshot == sn).filter(
                        PortalSnapshotQuality.portalid == P.id)
                PSQ.delete(synchronize_session=False)
                tasks.append((P, dbConf, sn, store_local))
        else:
            for P in db.Session.query(Portal):
                tasks.append((P, dbConf, sn, store_local))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    # Fan the per-portal work out over a process pool.
    pool = Pool(args.processors)
    for x in pool.imap(fetchHttp, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #6
0
def start (argv):
    """Top-level CLI entry point (Python 2).

    Builds the argument parser, registers every sub-module as a sub-command,
    configures logging from the YAML config file (or the bundled default),
    connects to the database and dispatches to the chosen sub-command's
    ``cli`` function. Total runtime is logged on exit.

    :param argv: command-line argument list (without the program name).
    """
    print argv
    start= time.time()  # wall-clock start; elapsed time is logged at the end
    pa = argparse.ArgumentParser(description='Open Portal Watch toolset.', prog='odpw')
    

    # -d/-v select the root log level (default WARNING).
    logg=pa.add_argument_group("Logging")
    logg.add_argument(
        '-d', '--debug',
        help="Print lots of debugging statements",
        action="store_const", dest="loglevel", const=logging.DEBUG,
        default=logging.WARNING
    )
    logg.add_argument(
        '-v', '--verbose',
        help="Be verbose",
        action="store_const", dest="loglevel", const=logging.INFO,
        default=logging.WARNING
    )
    
    config=pa.add_argument_group("Config")
    config.add_argument('-c','--config', help="config file", dest='config')
    
    # Each sub-module contributes its own sub-parser and a cli() callback
    # that parse_args() stores in args.func.
    sp = pa.add_subparsers(title='Modules', description="Available sub modules")
    for sm in submodules:
        smpa = sp.add_parser(sm.name(), help=sm.help())
        sm.setupCLI(smpa)
        smpa.set_defaults(func=sm.cli)



    # Collect the top-level names of all loaded modules (debugging aid only;
    # the result is not used below).
    m=set([])
    for k,v in sys.modules.items():
        if v is not None:
            if '.' in k:
                m.add(k.split('.')[0])
            else:
                m.add(k)

    #for i in m:
    #    print i

    args = pa.parse_args(args=argv)
    
        
    db=readDBConfFromFile(args.config)
    if args.config:
        # Prefer the logging section of the user config; otherwise fall back
        # to the bundled logging.yaml shipped with the package.
        try:
            with open(args.config) as f_conf:
                config = yaml.load(f_conf)
                if 'logging' in config:
                    print "setup logging"
                    logging.config.dictConfig(config['logging'])
                else:
                    ##load basic logging
                    logconf = os.path.join(odpw.__path__[0], 'resources/logging', 'logging.yaml')
                    with open(logconf) as f:
                        logging.config.dictConfig(yaml.load(f))




        except Exception as e:
            # Any failure reading the config aborts startup entirely.
            print "Exception during config initialisation",e
            return
    else:
        ##load basic logging
        logconf = os.path.join(odpw.__path__[0], 'resources/logging', 'logging.yaml')
        with open(logconf) as f:
            logging.config.dictConfig(yaml.load(f))
        logging.basicConfig(level=args.loglevel)

    #config the structlog
    config_logging()
    log = structlog.get_logger()
    
    try:
        log.info("CMD ARGS", args=str(args))
    
        # Dispatch to the sub-command selected by the sub-parser.
        dbm = DBManager(**db)
        args.func(args , dbm)
    except OperationalError as e:
        # DB unreachable / bad credentials: log fatally but still fall
        # through to the timing and stats reporting below.
        log.fatal("DB Connection Exception: ", msg=e.message)
    except Exception as e:
        log.fatal("Uncaught exception", exc_info=True)
    end = time.time()
    secs = end - start
    msecs = secs * 1000
    log.info("END MAIN", time_elapsed=msecs)

    Timer.printStats()
    ErrorHandler.printStats()