示例#1
0
def record_history(name, when, errors):
    """
    Record a plugin name, its runtime, and its errors value in table
    'history', creating the table first if it does not exist yet.

    name   - stored in column 'plugin'
    when   - stored in column 'runtime'
    errors - stored in column 'errors'

    The database handle is always closed, even if the existence check, the
    table creation, or the insert raises.
    """
    db = CrawlDBI.DBI(dbtype='crawler')
    try:
        if not db.table_exists(table='history'):
            dbschem.make_table('history')
        db.insert(table='history',
                  fields=['plugin', 'runtime', 'errors'],
                  data=[(name, when, errors)])
    finally:
        # Release the connection on both the success and the failure path.
        db.close()
示例#2
0
def mpra_record_recent(type, start, end, hits):
    """
    Insert one row describing a scan into table 'mpra'.

    type  - stored in column 'type'
    start - stored in column 'start_time' after conversion with int()
    end   - stored in column 'end_time' after conversion with int()
    hits  - stored in column 'hits'

    Column 'scan_time' is set to the current epoch time. Every call adds a new
    row; this routine does not compare against times already stored.

    Raises whatever int() raises if start or end cannot be converted. That
    happens before the table is made or a database handle is opened.
    """
    # Build the row first so bad arguments fail before any database work.
    row = (type, int(time.time()), int(start), int(end), hits)

    dbschem.make_table('mpra')
    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        db.insert(table='mpra',
                  fields=['type', 'scan_time', 'start_time', 'end_time',
                          'hits'],
                  data=[row])
    finally:
        # Do not leak the handle if the insert fails.
        db.close()
示例#3
0
def cvv_ttype_table(argv):
    """ttype_table - create (or drop) table tape_types

    usage: cv ttype_table [-d] {-D|-r /opt/hpss}

    Without the -D/--drop option, create the table tape_types in the mysql
    database. Populate it with information from an HPSS build tree (default is
    /opt/hpss).

    With -D or --drop, drop the table.
    """
    # NOTE(review): the docstring above is kept verbatim because it reads like
    # command help text that may be displayed to users -- confirm before
    # editing it.
    p = optparse.OptionParser()
    p.add_option('-D', '--drop',
                 action='store_true', default=False, dest='drop',
                 help='drop the table')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-r', '--root',
                 action='store', default='', dest='hpssroot',
                 help='where to look for data')
    try:
        # Positional arguments are not used by this command.
        (o, _) = p.parse_args(argv)
    except SystemExit:
        # optparse raises SystemExit for --help and for bad options; return
        # quietly (None) so the caller keeps control.
        return

    if o.debug:
        pdb.set_trace()

    if o.drop:
        # Drop the table and report what dbschem said about it. The
        # parenthesized form prints the same text on Python 2 and is valid
        # syntax on Python 3.
        result = dbschem.drop_table(table="tape_types")
        print(result)
        return

    dbschem.make_table("tape_types")

    # Root precedence: -r/--root, then $HPSS_ROOT, then /opt/hpss. An unset
    # *or empty* value at one level falls through to the next.
    hpssroot = o.hpssroot or os.getenv("HPSS_ROOT") or "/opt/hpss"

    tape_types_populate(hpssroot)
示例#4
0
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we'll store

       (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 0, 1)

    cfg     - configuration object; its 'table_name' option in section
              sectname() names the table written to
    low     - stored in column 'low_nsobj_id'
    high    - stored in column 'high_nsobj_id'
    correct - stored in column 'correct'
    error   - stored in column 'error'

    The table is made (if needed) before the insert, and the database handle
    is closed even if the insert raises.
    """
    tabname = cfg.get(sectname(), 'table_name')

    dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        db.insert(table=tabname,
                  fields=['check_time',
                          'low_nsobj_id',
                          'high_nsobj_id',
                          'correct',
                          'error'],
                  data=[(ts, low, high, correct, error)])
    finally:
        # Always release the connection, including on a failed insert.
        db.close()
示例#5
0
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    One row is written per call, shaped according to the outcome:

       no NSOBJECT ids in the range:   (<time>, <low-id>, <high-id>, 0, 0)
       hit with the right copy count:  (<time>, <hit-id>, <hit-id>, 1, 0)
       hit with the wrong copy count:  (<time>, <hit-id>, <hit-id>, 0, 1)

    The target table name is read from option 'table_name' in config section
    sectname(). The row's time is the current epoch time, truncated to int.
    The same values are also written to the crawler log.

    The database handle is closed whether or not the insert succeeds.
    """
    columns = ['check_time', 'low_nsobj_id', 'high_nsobj_id', 'correct',
               'error']
    tabname = cfg.get(sectname(), 'table_name')

    # make_table's return value is not needed here; only its effect is.
    dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))

    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        db.insert(table=tabname,
                  fields=columns,
                  data=[(ts, low, high, correct, error)])
    finally:
        db.close()
示例#6
0
def cvv_ttype_table(argv):
    """ttype_table - create (or drop) table tape_types

    usage: cv ttype_table [-d] {-D|-r /opt/hpss}

    Without the -D/--drop option, create the table tape_types in the mysql
    database. Populate it with information from an HPSS build tree (default is
    /opt/hpss).

    With -D or --drop, drop the table.
    """
    # NOTE(review): docstring left untouched -- it looks like user-visible
    # help text for the 'cv' command; verify before rewording it.
    p = optparse.OptionParser()
    p.add_option('-D', '--drop',
                 action='store_true', default=False, dest='drop',
                 help='drop the table')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-r', '--root',
                 action='store', default='', dest='hpssroot',
                 help='where to look for data')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        # Raised by optparse on --help or an unrecognized option; the command
        # simply ends and returns None.
        return

    if o.debug:
        pdb.set_trace()

    # Either drop the tape_types table, or create and populate it.
    if o.drop:
        result = dbschem.drop_table(table="tape_types")
        # Function-call form: valid on Python 3, same output on Python 2.
        print(result)
    else:
        dbschem.make_table("tape_types")

        # Choose the HPSS root: command line first, then the HPSS_ROOT
        # environment variable, then the built-in default. An empty string at
        # any step counts as "not given".
        hpssroot = o.hpssroot
        if not hpssroot:
            hpssroot = os.getenv("HPSS_ROOT")
        if not hpssroot:
            hpssroot = "/opt/hpss"

        tape_types_populate(hpssroot)
示例#7
0
def history_load(loadlist, filename):
    """
    Each plugin's sublib has a load_history() routine that knows how to load
    its data to the history file.

    Unfortunately, we do have to know here something special about plugin 'cv'
    to warn the user when a filename was specified without 'cv' in the load
    list or vice versa and when to pass filename to the plugin's load_history()
    method.

    loadlist - comma-separated plugin names (parsed with U.csv_list); 'all' or
               an empty list selects every configured plugin
    filename - passed to the 'cv' plugin's load_history(); may be None

    Names that are not in the configured plugin list are reported and
    skipped. If nothing is left to load, the routine returns before the
    history table is made.
    """
    cfg = CrawlConfig.add_config()
    pluglist = U.csv_list(cfg.get_d('crawler', 'plugins', U.default_plugins()))
    ll = U.csv_list(loadlist)
    if 'all' in ll or not ll:
        # Work on a copy so the configured list itself is never modified.
        ll = copy.deepcopy(pluglist)

    if filename is None and 'cv' in ll:
        print(MSG.history_cv_not_loaded)
        ll.remove('cv')
    elif filename is not None and 'cv' not in ll:
        print(MSG.history_filename_ignored)

    unk_plugs = [x for x in ll if x not in pluglist]
    if unk_plugs:
        print(MSG.unrecognized_plugin_S % ', '.join(unk_plugs))

    # Keep only recognized plugins. This is done eagerly rather than with
    # map(ll.remove, ...): map() is lazy on Python 3, so the removals would
    # never run there and the emptiness check below would be bypassed.
    ll = [x for x in ll if x in pluglist]

    if not ll:
        return

    dbschem.make_table('history')
    for plug in ll:
        print("loading %s..." % plug)
        # Only 'cv' takes the filename argument.
        if plug == 'cv' and filename is not None:
            args = [filename]
        else:
            args = []
        p = CrawlPlugin.CrawlPlugin(name=plug, cfg=cfg)
        p.load_history(*args)
示例#8
0
def history_load(loadlist, filename):
    """
    Load history data for the plugins named in loadlist.

    Each plugin's sublib has a load_history() routine that knows how to load
    its data to the history file; this routine decides which plugins to call.

    Plugin 'cv' is special-cased: it is the only one that receives filename.
    If 'cv' is requested without a filename it is dropped with a message, and
    if a filename is given without 'cv' in the list a message says so.

    loadlist - comma-separated plugin names; 'all' or nothing means every
               plugin configured under [crawler] plugins
    filename - file handed to the 'cv' plugin, or None

    Unrecognized plugin names are reported and removed. The history table is
    only made when at least one plugin remains to be loaded.
    """
    cfg = CrawlConfig.add_config()
    pluglist = U.csv_list(cfg.get_d('crawler', 'plugins', U.default_plugins()))
    ll = U.csv_list(loadlist)
    if 'all' in ll or not ll:
        ll = copy.deepcopy(pluglist)

    if filename is None and 'cv' in ll:
        print(MSG.history_cv_not_loaded)
        ll.remove('cv')
    elif filename is not None and 'cv' not in ll:
        print(MSG.history_filename_ignored)

    unk_plugs = [x for x in ll if x not in pluglist]
    if unk_plugs:
        print(MSG.unrecognized_plugin_S % ', '.join(unk_plugs))
        # Explicit loop instead of map(ll.remove, unk_plugs): map() does not
        # execute its function until iterated on Python 3, so the removals
        # would silently not happen there.
        for unknown in unk_plugs:
            ll.remove(unknown)

    if not ll:
        return

    dbschem.make_table('history')
    # Everything left in ll is a recognized plugin at this point.
    for plug in ll:
        print("loading %s..." % plug)
        args = [filename] if (plug == 'cv' and filename is not None) else []
        p = CrawlPlugin.CrawlPlugin(name=plug, cfg=cfg)
        p.load_history(*args)
示例#9
0
    def ex_nihilo(cls, dataroot='/'):
        """
        Start from scratch. Make table 'checkables' (through
        dbschem.make_table) and bootstrap the queue by adding the root
        director(ies).

        dataroot - a single path string or a list of path strings. Each path
                   is loaded and persisted as a directory ('d') Checkable. A
                   value that is neither a string nor a list adds nothing.

        Notes on the fields of a checkable record:

        Field path is the location of the file or directory in the HPSS
        archive.

        Field type is 'f' for files or 'd' for directories.

        Field cos is the class of service for the file. For directories, cos is
        empty.

        Field cart starts with a null value. When populated from hsi, it may be
        set to the name of a tape cartridge or to ''. Empty files take up no
        space on any cartridge, so for them the field is empty.

        Field checksum is 0 if we have not computed or discovered a checksum
        for the file. Once we know a checksum has been stored for the file, we
        set this to 1.

        Field last_check is the epoch time at which the file was last checked.

        Field fails is the number of times hashcreate and/or hashverify has
        failed on the file.

        Field reported is 0 or 1 indicating whether the item has been reported
        (the original note ended here without saying what is reported).
        """
        dbschem.make_table("checkables")

        # isinstance() rather than type() == ... so that subclasses of str and
        # list are handled the same way as the base types.
        if isinstance(dataroot, str):
            dataroot = [dataroot]

        if isinstance(dataroot, list):
            for root in dataroot:
                # in_db=False / dirty=True mark the entry as new and unsaved
                # before load() and persist() are called on it.
                r = Checkable(path=root, type='d', in_db=False, dirty=True)
                r.load()
                r.persist()
示例#10
0
    def ex_nihilo(cls, dataroot='/'):
        """
        Bootstrap the 'checkables' table from nothing.

        The table is made via dbschem.make_table, then every root directory
        given in dataroot is loaded and persisted as a directory Checkable so
        the queue has somewhere to start.

        dataroot may be one path string or a list of them (default '/'). Any
        other kind of value results in no roots being added.

        Field notes carried over from the original documentation:

          path       - location of the file or directory in the HPSS archive
          type       - 'f' for files, 'd' for directories
          cos        - class of service for the file; empty for directories
          cart       - starts null; when populated from hsi it may be the name
                       of a tape cartridge or ''. Empty files take up no space
                       on any cartridge, so for them the field is empty.
          checksum   - 0 until a checksum is known to be stored for the file,
                       then 1
          last_check - epoch time at which the file was last checked
          fails      - number of times hashcreate and/or hashverify has failed
                       on the file
          reported   - 0 or 1 indicating whether the item has been reported
                       (the original note is incomplete on what is reported)
        """
        dbschem.make_table("checkables")

        # Normalize a lone path to a one-element list. isinstance() is used so
        # str/list subclasses are accepted too.
        roots = [dataroot] if isinstance(dataroot, str) else dataroot
        if not isinstance(roots, list):
            return

        for root in roots:
            r = Checkable(path=root, type='d', in_db=False, dirty=True)
            r.load()
            r.persist()
示例#11
0
def update_stats(cmf):
    """
    Record the values in tuple cmf in the table named by stats_table. If the
    table does not exist, create it.

    cmf - tuple written to columns (matches, failures) of the row with
          rowid = 1

    When make_table reports "Created", that row is first inserted as
    (1, 0, 0) so the update has something to act on. The database handle is
    closed even if the insert or the update raises.
    """
    result = dbschem.make_table(stats_table)
    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        if result == "Created":
            # Fresh table: seed the single stats row.
            db.insert(table=stats_table,
                      fields=["rowid", "matches", "failures"],
                      data=[(1, 0, 0)])

        db.update(table=stats_table,
                  fields=["matches", "failures"],
                  data=[cmf],
                  where="rowid = 1")
    finally:
        db.close()
示例#12
0
def update_stats(cmf):
    """
    Store the (matches, failures) tuple cmf in the stats table.

    The table (named by module-level stats_table) is made if needed. If it
    was just created, a zeroed row with rowid 1 is inserted first. The values
    in cmf then overwrite columns 'matches' and 'failures' of that row.

    The connection is released in a finally clause so that a failing insert
    or update cannot leak it.
    """
    just_created = dbschem.make_table(stats_table) == "Created"

    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        if just_created:
            db.insert(table=stats_table,
                      fields=["rowid", "matches", "failures"],
                      data=[(1, 0, 0)])

        db.update(table=stats_table,
                  fields=["matches", "failures"],
                  data=[cmf],
                  where="rowid = 1")
    finally:
        db.close()
示例#13
0
def get_last_rpt_time(db):
    """
    Return the most recent report_time stored in table 'report', or 0 if
    there is none.

    db - open database handle used for the select

    If make_table had to create the table ('Created' in its result), the
    table is necessarily empty and no query is issued. Otherwise the maximum
    report_time is selected; a None result (no rows) is also treated as 0.
    The value returned is written to the crawler log as well.
    """
    status = dbschem.make_table("report")

    last = 0
    if "Created" not in status:
        rows = db.select(table='report', fields=['max(report_time)'])
        newest = rows[0][0]
        if newest is not None:
            last = newest

    CrawlConfig.log("time of last report: %d" % last)
    return last
示例#14
0
def get_last_rpt_time(db):
    """
    Look up the time of the last report.

    The 'report' table is made if needed. A table that was just created
    ('Created' in the make_table result) has no rows, so 0 is returned
    without querying. Otherwise max(report_time) is selected through db, and
    a None maximum is mapped to 0. The result is logged and returned.
    """
    def _stored_maximum():
        # First column of the first row returned by the aggregate query.
        rows = db.select(table='report',
                         fields=['max(report_time)'])
        return rows[0][0]

    freshly_made = "Created" in dbschem.make_table("report")
    rval = 0 if freshly_made else _stored_maximum()
    if rval is None:
        rval = 0

    CrawlConfig.log("time of last report: %d" % rval)
    return rval
示例#15
0
def lscos_populate():
    """
    If table lscos already exists, we're done. Otherwise, retrieve the lscos
    info from hsi, create the table, and fill the table in.

    We store the min_size and max_size for each COS as text strings containing
    digits because the largest sizes are already within three orders of
    magnitude of a mysql bigint and growing.

    Returns MSG.table_created_S or MSG.table_already_S formatted with the
    table name, depending on whether the table was created by this call.

    The database handle is closed on every path, including when hsi or the
    parsing of its output raises.
    """
    tabname = 'lscos'
    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        st = dbschem.make_table(tabname)
        if "Created" != st:
            return MSG.table_already_S % tabname

        # Raw strings keep the regex escapes (\d, \s) literal without relying
        # on Python passing unknown string escapes through unchanged.
        szrgx = r"(\d+([KMGT]B)?)"
        rgx = (r"\s*(\d+)\s*(([-_a-zA-Z0-9]+\s)+)\s+[UGAN]*\s+(\d+)" +
               r"\s+(ALL)?\s+%s\s+-\s+%s" % (szrgx, szrgx))

        H = hpss.HSI()
        raw = H.lscos()
        H.quit()

        # The rows of interest lie between the first two occurrences of the
        # dashed rule line in the hsi output.
        z = [x.strip() for x in raw.split('\r')]
        rules = [q for q in z if '----------' in q]
        first = z.index(rules[0]) + 1
        second = z[first:].index(rules[0]) + first

        data = []
        for line in z[first:second]:
            # NOTE(review): m is indexed like a tuple of the regex groups
            # (0-based) -- confirm against U.rgxin, including what it returns
            # for a line that does not match.
            m = U.rgxin(rgx, line)
            data.append((m[0],                      # cos
                         m[1].strip(),              # name
                         m[3],                      # copies
                         U.scale(m[5], kb=1024),    # min_size
                         U.scale(m[7], kb=1024)))   # max_size

        db.insert(table=tabname,
                  fields=['cos', 'name', 'copies', 'min_size', 'max_size'],
                  data=data)
        return MSG.table_created_S % tabname
    finally:
        db.close()