Example #1
    def dlcs_post_import(self, prog=None, opts=None, sa=None, *paths):
        from pydelicious import dlcs_parse_xml
        GroupNode = res.iface.IGroupNode(prog)
        # print list of existing later
        grouptags = []
        for p in paths:
            # XXX: enumerates the posts already recorded for this file; the
            # results are currently unused
            for post in self.execute( 'dlcs_post_read', dict( p=p), 'gen-all-key:href' ):
                pass

            data = dlcs_parse_xml(open(p).read())
            for post in data['posts']:
                lctrs = [ d['lctr'] for d in self.assert_locator(
                            sa=sa, href=post['href'], opts=opts) ]
                if not lctrs:
                    continue

                lctr = lctrs.pop()
                bms = [ d['bm'] for d in self.add( sa=sa,
                    href=post['href'],
                    name=post['description'],
                    ext=post['extended'],
                    tags=post['tag'],
                    opts=opts) ]
                if not bms:
                    continue
                bm = bms.pop()
                tags = [ GroupNode.fetch(( GroupNode.name == t, ), _sa=sa )
                        for t in post['tag'].split(' ') ]
                grouptags.extend(t for t in tags if t)
                for tag in tags:
                    if not tag:
                        continue
                    tag.subnodes.append( bm )
                    sa.add( tag )
        for tag in grouptags:
            print 'Tag', tag.name
            for node in tag.subnodes:
                print node.node_id, node.name,
                if hasattr(node, 'ref'):
                    print node.ref
                else:
                    print

        if opts.rsr_auto_commit:
            sa.commit()
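All of the examples on this page consume the same parsed structure: dlcs_parse_xml() returns a dict whose 'posts' entry holds one dict per bookmark, keyed by the attribute names of the del.icio.us export XML. A minimal sketch of the fields the examples rely on (inferred from their usage here; the exact attribute set depends on the export):

from pydelicious import dlcs_parse_xml

data = dlcs_parse_xml(open('delicious-export.xml').read())
for post in data['posts']:
    # Fields used throughout the examples on this page:
    #   href         bookmark URL
    #   description  bookmark title
    #   extended     longer free-text note, possibly empty
    #   tag          space-separated tag list
    #   time         timestamp, e.g. '2008-12-11T19:48:08Z'
    print post['href'], post['tag']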
Example #2
File: dlcs.py Project: yunxingwoo/PCI
def cached_posts(conf, dlcs, noupdate=False):
    """
    Same as cached_tags but for the post list.
    """
    posts_file = conf.get('local-files', 'posts')
    if not exists(posts_file):
        print >>sys.stderr, "cached_posts: Fetching new post list..."
        cache_file(posts_file, dlcs.posts_all(_raw=True))
    else:
        if not noupdate:
            lastupdate = dlcs.posts_update()['update']['time']
            if time.gmtime(getmtime(posts_file)) < lastupdate:
                print >>sys.stderr, "cached_posts: Updating post list..."
                cache_file(posts_file, dlcs.posts_all(_raw=True))
        elif DEBUG:
            print >>sys.stderr, "cached_posts: Forced read from cached file..."
    posts = dlcs_parse_xml(open(posts_file))
    return posts
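A hedged usage sketch for cached_posts: conf is assumed to be a ConfigParser with a [local-files] section, dlcs a pydelicious.DeliciousAPI instance, and cache_file a small helper (sketched after the next example) that writes the raw API response to disk.

import ConfigParser
from pydelicious import DeliciousAPI

conf = ConfigParser.ConfigParser()
conf.read('dlcs.conf')  # assumed to define: [local-files] posts = ~/.dlcs/posts.xml
dlcs = DeliciousAPI('username', 'password')

posts = cached_posts(conf, dlcs)                 # fetch or refresh as needed
posts = cached_posts(conf, dlcs, noupdate=True)  # always use the cached file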
Example #3
File: dlcs.py Project: yunxingwoo/PCI
def cached_tags(conf, dlcs, noupdate=False):
    """
    Make sure the tag list is cached locally. Updates when the file is
    older than the last time the posts where updated (according to
    del.icio.us posts/update, which only notes new posts, not any updates).
    """
    tags_file = conf.get('local-files', 'tags')
    if not exists(tags_file):
        print >>sys.stderr, "cached_tags: Fetching new tag list..."
        cache_file(tags_file, dlcs.tags_get(_raw=True))
    else:
        if not noupdate:
            lastupdate = dlcs.posts_update()['update']['time']
            if time.gmtime(getmtime(tags_file)) < lastupdate:
                print >>sys.stderr, "cached_tags: Updating tag list..."
                cache_file(tags_file, dlcs.tags_get(_raw=True))
        elif DEBUG:
            print >>sys.stderr, "cached_tags: Forced read from cached file..."
    tags = dlcs_parse_xml(open(tags_file))
    return tags
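The cache_file helper these examples assume is not shown on this page; one plausible implementation (a sketch, not necessarily the tool's actual code) simply writes the raw XML string returned by the _raw=True API calls to the cache path:

def cache_file(path, data):
    # Hypothetical helper assumed by cached_posts/cached_tags above:
    # persist the raw XML response so later runs can re-parse it locally.
    f = open(path, 'w')
    try:
        f.write(data)
    finally:
        f.close()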
Example #4
def cmd__dlcs_import(FILE, opts, g):
    """
    Import from (old pre-2009) del.icio.us posts XML export using
    pydelicious library.
    """
    global ctx

    importer = res.bm.BmImporter(ctx.sa_session)
    data = dlcs_parse_xml(open(FILE).read())

    # Validate URL, track tag/domain and create records where missing
    for post in data['posts']:
        href = post['href']
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)
        lctr = importer.init_locator(href, dt)
        if not lctr:
            continue

        tagcsv = unicode(post['tag'].replace(' ', ', '))
        bm = importer.init_bookmark(lctr, dt,
                post['description'], post['extended'], tagcsv )
        if not bm:
            continue

        # commit every x records
        importer.batch_flush(g)

    log.std("Checked %i locator references", len(data['posts']))
    importer.flush(g)

    # proc/fetch/init Domains
    importer.flush_domains(g)

    # proc/fetch/init Tags
    importer.flush_tags(g)
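ISO_8601_DATETIME is a module-level constant in these projects; given the timestamps in a del.icio.us export, it presumably expands to the strptime format below (an assumption, shown here for context):

from datetime import datetime

# Presumed value of the ISO_8601_DATETIME constant used by the import
# commands; del.icio.us exports carry UTC timestamps of this shape.
ISO_8601_DATETIME = '%Y-%m-%dT%H:%M:%SZ'

dt = datetime.strptime('2008-12-11T19:48:08Z', ISO_8601_DATETIME)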
Example #5
def cmd_dlcs_import(opts, settings):
    """
    TODO: build into generic import/export (i.e. the complete set) so heuristics
        can update all stats on each import, or find some way to fragment the dataset.
    """
    importFile = opts.args.FILE
    data = dlcs_parse_xml(open(importFile).read())
    sa = Locator.get_session('default', opts.flags.dbref)
    #sa = model.get_session(opts.flags.dbref, metadata=SqlBase.metadata)
    tags_stat = {}
    domains_stat = {}
    # first pass: validate, track stats and create Locator records where missing
    for post in data['posts']:
        href = post['href']
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)
        # validate URL
        url = urlparse(href)
        domain = url[1]
        if not domain:
            log.std("Ignored domainless (non-net?) URIRef: %s", href)
            continue
        assert re.match(r'[a-z0-9]+(\.[a-z0-9]+)*', domain), domain
        # get/init Locator
        lctr = Locator.fetch((Locator.ref == href,), exists=False)
        if lctr:
            if lctr.date_added != dt:
                lctr.date_added = dt
                sa.add(lctr)
        else:
            lctr = Locator(
                    global_id=href,
                    ref=href,
                    date_added=datetime.strptime(post['time'], ISO_8601_DATETIME)
                )
            lctr.init_defaults()
            log.std("new: %s", lctr)
            sa.add(lctr)
        # get/init Bookmark
        bm = Bookmark.fetch((Bookmark.ref_id == lctr.lctr_id,), exists=False)
        if bm:
            if bm.date_added != dt:
                bm.date_added = dt
                sa.add(bm)
            if bm.ref_id != lctr.lctr_id:
                bm.ref = lctr
                sa.add(bm)
        else:
            bm = Bookmark.fetch((Bookmark.name == post['description'],), exists=False)
            if bm:
                log.std("Name already exists: %r" % post['description'])
                continue
            bm = Bookmark(
                    ref=lctr,
                    name=post['description'],
                    extended=post['extended'],
                    tags=post['tag'].replace(' ', ', '),
                    date_added=datetime.strptime(post['time'], ISO_8601_DATETIME)
                )
            bm.init_defaults()
            log.std("new: %s", bm)
            sa.add(bm)
        # track domain frequency
        if domain in domains_stat:
            domains_stat[domain] += 1
        else:
            domains_stat[domain] = 1
        # track tag frequency
        for tag in post['tag'].split(' '):
            if tag in tags_stat:
                tags_stat[tag] += 1
            else:
                tags_stat[tag] = 1
    log.std("Checked %i locator references", len(data['posts']))
    sa.commit()
    # Prepare domain stats
    avgDomainFreq = sum(domains_stat.values())/(len(domains_stat)*1.0)
    hiDomainFreq = max(domains_stat.values())
    log.std("Found domain usage (max/avg): %i/%i", hiDomainFreq, avgDomainFreq)
    domains = 0
    domainOffset = int(opts.flags.domain_offset)
    if domainOffset == 0:
        domainOffset = hiDomainFreq
    elif domainOffset == -1:
        domainOffset = round(hiDomainFreq * 0.2)
    log.std("Setting domain-offset: %i", domainOffset)
    # get/init Domains
    for domain in domains_stat:
        freq = domains_stat[domain]
        if freq >= domainOffset:
            domains += 1
            domain_record = Domain.fetch((Domain.name == domain,), exists=False)
            if not domain_record:
                domain_record = Domain(name=domain)
                domain_record.init_defaults()
                sa.add(domain_record)
    sa.commit()
    log.std("Checked %i domains", len(domains_stat))
    log.std("Tracking %i domains", domains)
    # Prepare tag stats
    avgFreq = sum(tags_stat.values())/(len(tags_stat)*1.0)
    hiFreq = max(tags_stat.values())
    log.std("Found tag usage (max/avg): %i/%i", hiFreq, avgFreq)
    tagOffset = int(opts.flags.tag_offset)
    if tagOffset == 0:
        tagOffset = hiFreq
    elif tagOffset == -1:
        tagOffset = round(hiFreq * 0.1)
    log.std("Setting tag-offset: %i", tagOffset)
    # get/init Tags
    tags = 0
    for tag in tags_stat:
        freq = tags_stat[tag]
        if not re.match('[A-Za-z0-9-]+', tag):
            log.std("Non-std tag %s", tag)
        if freq >= tagOffset:
            tags += 1
            t = Node.fetch((Node.name == tag,), exists=False)
            if not t:
                t = Tag(name=tag)
                t.init_defaults()
                log.std("new: %s", t)
                sa.add(t)
            # store frequencies
            # TODO tags_freq
    log.std("Checked %i tags", len(tags_stat))
    log.std("Tracking %i tags", tags)
    sa.commit()
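The domain and tag thresholds above follow the same pattern: an offset of 0 keeps only the most frequent value, -1 derives a threshold as a fixed fraction of the maximum (0.2 for domains, 0.1 for tags), and anything else is used directly. Isolated as a sketch (freq_offset is a hypothetical name, not part of the code above):

def freq_offset(stats, offset, fraction):
    # stats maps a name to its occurrence count; returns the names whose
    # frequency clears the threshold derived from `offset` as described.
    hi = max(stats.values())
    if offset == 0:
        offset = hi
    elif offset == -1:
        offset = round(hi * fraction)
    return [name for name, freq in stats.items() if freq >= offset]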
Example #6
    def dlcs_post_read(self, p):
        from pydelicious import dlcs_parse_xml
        data = dlcs_parse_xml(open(p).read())
        for post in data['posts']:
            yield post
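Since dlcs_post_read is a generator, callers can stream posts from an export without building the full list first. A usage sketch (cli stands in for an instance of the class that defines the method):

for post in cli.dlcs_post_read('delicious-export.xml'):
    print post['href'], post['description']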