Пример #1
0
def cmd_href(NAME, settings):
    """
    Print the ``ref`` of every matching Locator, one per line.

    When NAME is truthy the Locators are looked up with
    ``Locator.search(ref=NAME)``; otherwise ``Locator.all()`` is used.
    If the result is empty, "Nothing" is logged through ``log.std``.

    The session is obtained from ``settings.session_name`` and
    ``settings.dbref`` before any lookup is done.
    """
    # The returned session is not used directly in this function; the call is
    # kept because the lookups below presumably rely on it having been made.
    _session = Locator.get_session(settings.session_name, settings.dbref)

    locators = Locator.search(ref=NAME) if NAME else Locator.all()
    if not locators:
        log.std("Nothing")
    for locator in locators:
        print(locator.ref)
Пример #2
0
def cmd_dlcs_import(opts, settings):
    """
    Import the posts of a dlcs XML export into the database.

    The file named by ``opts.args.FILE`` is parsed with ``dlcs_parse_xml``
    and processed in three stages, each ended by a commit:

    1. For every post, get or create the Locator for its ``href`` and the
       Bookmark pointing at that Locator, and count how often each domain
       and each tag occurs.  Posts whose URL has no network location are
       skipped, as are new posts whose description is already used as a
       Bookmark name.
    2. Create a Domain record for every domain whose frequency is at least
       the domain offset.
    3. Create a Tag record for every tag whose frequency is at least the
       tag offset.

    Offsets come from ``opts.flags.domain_offset`` and
    ``opts.flags.tag_offset``: ``0`` means "use the highest frequency seen",
    ``-1`` means 20% (domains) or 10% (tags) of the highest frequency,
    rounded; any other value is used as given.

    ``settings`` is accepted for interface compatibility but not read here;
    the session is opened as 'default' on ``opts.flags.dbref``.

    Raises AssertionError when a post's domain does not start with a
    lower-case alphanumeric label.

    TODO: built into generic import/export (ie. complete set)  so heuristics can
        update all stats each import.. or find some way to fragment dataset.
    """
    importFile = opts.args.FILE
    # Use a context manager so the file handle is closed right after reading.
    with open(importFile) as import_fh:
        data = dlcs_parse_xml(import_fh.read())
    sa = Locator.get_session('default', opts.flags.dbref)
    tags_stat = {}
    domains_stat = {}

    # first pass: validate, track stats and create Locator records where missing
    for post in data['posts']:
        href = post['href']
        # Parsed once and reused for both the Locator and the Bookmark.
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)

        # validate URL; index 1 of the urlparse result is the network location
        domain = urlparse(href)[1]
        if not domain:
            log.std("Ignored domainless (non-net?) URIRef: %s", href)
            continue
        # NOTE(review): re.match only anchors at the start, so this checks the
        # leading label(s) of the domain, not the whole string.
        assert re.match(r'[a-z0-9]+(\.[a-z0-9]+)*', domain), domain

        # get/init Locator
        lctr = Locator.fetch((Locator.ref == href,), exists=False)
        if lctr:
            if lctr.date_added != dt:
                lctr.date_added = dt
                sa.add(lctr)
        else:
            lctr = Locator(
                    global_id=href,
                    ref=href,
                    date_added=dt
                )
            lctr.init_defaults()
            log.std("new: %s", lctr)
            sa.add(lctr)

        # get/init Bookmark
        bm = Bookmark.fetch((Bookmark.ref_id == lctr.lctr_id,), exists=False)
        if bm:
            if bm.date_added != dt:
                bm.date_added = dt
                sa.add(bm)
            # NOTE(review): bm was fetched by this very ref_id, so this branch
            # looks unreachable; kept as-is pending confirmation.
            if bm.ref_id != lctr.lctr_id:
                bm.ref = lctr
                sa.add(bm)
        else:
            bm = Bookmark.fetch((Bookmark.name == post['description'],), exists=False)
            if bm:
                # Skipping here also leaves this post out of the statistics.
                log.std("Name already exists: %r" % post['description'])
                continue
            bm = Bookmark(
                    ref=lctr,
                    name=post['description'],
                    extended=post['extended'],
                    tags=post['tag'].replace(' ', ', '),
                    date_added=dt
                )
            bm.init_defaults()
            log.std("new: %s", bm)
            sa.add(bm)

        # track domain frequency
        domains_stat[domain] = domains_stat.get(domain, 0) + 1
        # track tag frequency
        for tag in post['tag'].split(' '):
            tags_stat[tag] = tags_stat.get(tag, 0) + 1

    log.std("Checked %i locator references", len(data['posts']))
    sa.commit()

    # Prepare domain stats; an import without usable posts leaves the dict
    # empty, so guard against max() of nothing and division by zero.
    if domains_stat:
        hiDomainFreq = max(domains_stat.values())
        avgDomainFreq = float(sum(domains_stat.values())) / len(domains_stat)
    else:
        hiDomainFreq = avgDomainFreq = 0
    log.std("Found domain usage (max/avg): %i/%i", hiDomainFreq, avgDomainFreq)
    domains = 0
    domainOffset = int(opts.flags.domain_offset)
    if domainOffset == 0:
        # Was `hiFreq`, which is only bound further down for the tag stats and
        # therefore raised UnboundLocalError here.
        domainOffset = hiDomainFreq
    elif domainOffset == -1:
        domainOffset = round(hiDomainFreq * 0.2)
    log.std("Setting domain-offset: %i", domainOffset)

    # get/init Domains
    for domain, freq in domains_stat.items():
        if freq >= domainOffset:
            domains += 1
            domain_record = Domain.fetch((Domain.name == domain,), exists=False)
            if not domain_record:
                domain_record = Domain(name=domain)
                domain_record.init_defaults()
                sa.add(domain_record)
    sa.commit()
    log.std("Checked %i domains", len(domains_stat))
    log.std("Tracking %i domains", domains)

    # Prepare tag stats (same empty-input guard as for domains)
    if tags_stat:
        hiFreq = max(tags_stat.values())
        avgFreq = float(sum(tags_stat.values())) / len(tags_stat)
    else:
        hiFreq = avgFreq = 0
    log.std("Found tag usage (max/avg): %i/%i", hiFreq, avgFreq)
    tagOffset = int(opts.flags.tag_offset)
    if tagOffset == 0:
        tagOffset = hiFreq
    elif tagOffset == -1:
        tagOffset = round(hiFreq * 0.1)
    log.std("Setting tag-offset: %i", tagOffset)

    # get/init Tags
    tags = 0
    for tag, freq in tags_stat.items():
        # Only reported, not rejected: a non-standard tag is still tracked
        # below when it is frequent enough.
        if not re.match('[A-Za-z0-9-]+', tag):
            log.std("Non-std tag %s", tag)
        if freq >= tagOffset:
            tags += 1
            # Existing records are looked up through Node, new ones are
            # created as Tag.
            t = Node.fetch((Node.name == tag,), exists=False)
            if not t:
                t = Tag(name=tag)
                t.init_defaults()
                log.std("new: %s", t)
                sa.add(t)
            # store frequencies
            # TODO tags_freq
    log.std("Checked %i tags", len(tags_stat))
    log.std("Tracking %i tags", tags)
    sa.commit()