def dlcs_post_import(self, prog=None, opts=None, sa=None, *paths):
    """Import (pre-2009) del.icio.us XML export files given by `paths`:
    assert a locator per post, add a bookmark, attach the bookmark to the
    group-node of each of its tags, then print the touched tags and their
    subnodes.  Commits only when `opts.rsr_auto_commit` is set."""
    # NOTE(review): this block was reconstructed from whitespace-mangled
    # source; the loop nesting below is the most plausible reading --
    # verify against version control before relying on it.
    from pydelicious import dlcs_parse_xml
    GroupNode = res.iface.IGroupNode(prog)
    # print list of existing later
    grouptags = []
    for p in paths:
        # drain the generator for its side effects only
        for post in self.execute( 'dlcs_post_read', dict( p=p), 'gen-all-key:href' ):
            pass
        data = dlcs_parse_xml(open(p).read())
        for post in data['posts']:
            # existing locators for this URL, if any
            lctrs = [ d['lctr'] for d in self.assert_locator(
                sa=sa, href=post['href'], opts=opts) ]
            self.execute( 'assert_locator', dict(href=post['href']) )
            if not lctrs:
                continue
            # NOTE(review): `bm_dict` is not defined anywhere in this scope;
            # this statement raises NameError if reached -- confirm the
            # intended payload (probably a dict built from `post`).
            bm = self.execute( 'assert_locator', bm_dict, 'first-key:bm' )
            lctr = lctrs.pop()
            bms = [ d['bm'] for d in self.add(
                sa=sa, href=post['href'], name=post['description'],
                ext=post['extended'], tags=post['tag'], opts=opts) ]
            if not bms:
                continue
            bm = bms.pop()
            # fetch an existing group-node per whitespace-separated tag;
            # entries are None-ish when no node exists for that tag
            tags = [ GroupNode.fetch(( GroupNode.name == t, ), _sa=sa )
                for t in post['tag'].split(' ') ]
            [ grouptags.append(t) for t in tags if t ]
            for tag in tags:
                if not tag:
                    continue
                tag.subnodes.append( bm )
                sa.add( tag )
    # report every touched tag and its subnodes (trailing comma keeps the
    # node id/name and the optional ref on one line; Python 2 print)
    for tag in grouptags:
        print 'Tag', tag.name
        for node in tag.subnodes:
            print node.node_id, node.name,
            if hasattr(node, 'ref'):
                print node.ref
            else:
                print
    if opts.rsr_auto_commit:
        sa.commit()
def cached_posts(conf, dlcs, noupdate=False):
    """
    Same as cached_tags but for the post list.

    Fetches the full post list when no local cache exists; otherwise
    refreshes the cache when the file is older than the service's
    posts/update time, unless `noupdate` is set.  Returns the parsed
    post data.
    """
    posts_file = conf.get('local-files', 'posts')
    if not exists(posts_file):
        print >>sys.stderr, "cached_posts: Fetching new post list..."
        cache_file(posts_file, dlcs.posts_all(_raw=True))
    else:
        if not noupdate:
            # BUG FIX: posts_update() returns {'update': {'time': ...}};
            # the original indexed ['time'] directly, which raises KeyError.
            # Now consistent with the other cached_* helpers in this file.
            lastupdate = dlcs.posts_update()['update']['time']
            if time.gmtime(getmtime(posts_file)) < lastupdate:
                print >>sys.stderr, "cached_posts: Updating post list..."
                cache_file(posts_file, dlcs.posts_all(_raw=True))
        elif DEBUG:
            print >>sys.stderr, "cached_posts: Forced read from cached file..."
    posts = dlcs_parse_xml(open(posts_file))
    return posts
def cached_posts(conf, dlcs, noupdate=False):
    """
    Same as cached_tags but for the post list.
    """
    posts_file = conf.get('local-files', 'posts')
    if not exists(posts_file):
        # no cache yet: fetch everything once
        print >> sys.stderr, "cached_posts: Fetching new post list..."
        cache_file(posts_file, dlcs.posts_all(_raw=True))
    elif not noupdate:
        # refresh only when the remote update time is newer than the cache
        remote_update = dlcs.posts_update()['update']['time']
        if time.gmtime(getmtime(posts_file)) < remote_update:
            print >> sys.stderr, "cached_posts: Updating post list..."
            cache_file(posts_file, dlcs.posts_all(_raw=True))
    elif DEBUG:
        print >> sys.stderr, "cached_posts: Forced read from cached file..."
    return dlcs_parse_xml(open(posts_file))
def cached_tags(conf, dlcs, noupdate=False):
    """
    Make sure the tag list is cached locally.
    Updates when the file is older than the last time the posts where updated
    (according to del.icio.us posts/update, which only notes new posts, not
    any updates).
    """
    tags_file = conf.get('local-files', 'tags')
    if not exists(tags_file):
        # first run: no local cache yet
        print >>sys.stderr, "cached_tags: Fetching new tag list..."
        cache_file(tags_file, dlcs.tags_get(_raw=True))
    elif noupdate:
        if DEBUG:
            print >>sys.stderr, "cached_tags: Forced read from cached file..."
    else:
        lastupdate = dlcs.posts_update()['update']['time']
        stale = time.gmtime(getmtime(tags_file)) < lastupdate
        if stale:
            print >>sys.stderr, "cached_tags: Updating tag list..."
            cache_file(tags_file, dlcs.tags_get(_raw=True))
    return dlcs_parse_xml(open(tags_file))
def cached_tags(conf, dlcs, noupdate=False):
    """
    Make sure the tag list is cached locally.
    Updates when the file is older than the last time the posts where updated
    (according to del.icio.us posts/update, which only notes new posts, not
    any updates).
    """
    tags_file = conf.get('local-files', 'tags')
    have_cache = exists(tags_file)
    if not have_cache:
        print >> sys.stderr, "cached_tags: Fetching new tag list..."
        cache_file(tags_file, dlcs.tags_get(_raw=True))
    elif not noupdate:
        # compare cache mtime against the service's last-update stamp
        remote_update = dlcs.posts_update()['update']['time']
        if time.gmtime(getmtime(tags_file)) < remote_update:
            print >> sys.stderr, "cached_tags: Updating tag list..."
            cache_file(tags_file, dlcs.tags_get(_raw=True))
    elif DEBUG:
        print >> sys.stderr, "cached_tags: Forced read from cached file..."
    return dlcs_parse_xml(open(tags_file))
def cmd__dlcs_import(FILE, opts, g):
    """ Import from (old pre-2009) del.icio.us posts XML export using pydelicious library. """
    # NOTE(review): reconstructed from whitespace-mangled source; statement
    # order inside the loop is the most plausible reading.
    global ctx
    importer = res.bm.BmImporter(ctx.sa_session)
    data = dlcs_parse_xml(open(FILE).read())
    # Validate URL, track tag/domain and create records where missing
    for post in data['posts']:
        href = post['href']
        # posts carry an ISO-8601 timestamp under 'time'
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)
        lctr = importer.init_locator(href, dt)
        if not lctr:
            continue
        # tags are space-separated in the export; stored as comma-separated
        tagcsv = unicode(post['tag'].replace(' ', ', '))
        bm = importer.init_bookmark(lctr, dt, post['description'],
            post['extended'], tagcsv )
        if not bm:
            continue
        # commit every x records
        importer.batch_flush(g)
    log.std("Checked %i locator references", len(data['posts']))
    importer.flush(g)
    # proc/fetch/init Domains
    importer.flush_domains(g)
    # proc/fetch/init Tags
    importer.flush_tags(g)
def cmd_dlcs_import(opts, settings):
    """
    Import a del.icio.us posts XML export into the database.

    First pass validates each post's URL and creates/updates Locator and
    Bookmark records while tallying domain and tag frequencies; second pass
    creates Domain and Tag/Node records for entries whose frequency meets
    the configured offsets (``domain-offset`` / ``tag-offset`` flags;
    0 means "only the most frequent", -1 derives an offset from the max).

    TODO: built into generic import/export (ie. complete set) so heuristics
    can update all stats each import.. or find some way to fragment dataset.
    """
    importFile = opts.args.FILE
    data = dlcs_parse_xml(open(importFile).read())
    sa = Locator.get_session('default', opts.flags.dbref)
    #sa = model.get_session(opts.flags.dbref, metadata=SqlBase.metadata)
    tags_stat = {}
    domains_stat = {}
    # first pass: validate, track stats and create Locator records where missing
    for post in data['posts']:
        href = post['href']
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)
        # validate URL: require a network location (domain)
        url = urlparse(href)
        domain = url[1]
        if not domain:
            log.std("Ignored domainless (non-net?) URIRef: %s", href)
            continue
        assert re.match(r'[a-z0-9]+(\.[a-z0-9]+)*', domain), domain
        # get/init Locator
        lctr = Locator.fetch((Locator.ref == href,), exists=False)
        if lctr:
            if lctr.date_added != dt:
                lctr.date_added = dt
                sa.add(lctr)
        else:
            lctr = Locator(global_id=href, ref=href, date_added=dt)
            lctr.init_defaults()
            log.std("new: %s", lctr)
            sa.add(lctr)
        # get/init Bookmark
        bm = Bookmark.fetch((Bookmark.ref_id == lctr.lctr_id,), exists=False)
        if bm:
            if bm.date_added != dt:
                bm.date_added = dt
                sa.add(bm)
            if bm.ref_id != lctr.lctr_id:
                bm.ref = lctr
                sa.add(bm)
        else:
            # a bookmark with the same description but a different locator
            # would violate the unique name constraint; skip it
            bm = Bookmark.fetch((Bookmark.name == post['description'],),
                    exists=False)
            if bm:
                log.std("Name already exists: %r" % post['description'])
                continue
            bm = Bookmark(
                    ref=lctr,
                    name=post['description'],
                    extended=post['extended'],
                    tags=post['tag'].replace(' ', ', '),
                    date_added=dt)
            bm.init_defaults()
            log.std("new: %s", bm)
            sa.add(bm)
        # track domain frequency
        if domain in domains_stat:
            domains_stat[domain] += 1
        else:
            domains_stat[domain] = 1
        # track tag frequency
        for tag in post['tag'].split(' '):
            if tag in tags_stat:
                tags_stat[tag] += 1
            else:
                tags_stat[tag] = 1
    log.std("Checked %i locator references", len(data['posts']))
    sa.commit()

    if not domains_stat:
        # nothing was imported; avoid max()/division on empty stats below
        log.std("No domains or tags to track")
        return

    # Prepare domain stats
    avgDomainFreq = sum(domains_stat.values())/(len(domains_stat)*1.0)
    hiDomainFreq = max(domains_stat.values())
    log.std("Found domain usage (max/avg): %i/%i", hiDomainFreq, avgDomainFreq)
    domains = 0
    domainOffset = int(opts.flags.domain_offset)
    if domainOffset == 0:
        # BUG FIX: was `hiFreq`, which is only defined further down in the
        # tag-stats section (NameError at runtime); the domain maximum is
        # clearly what is meant here.
        domainOffset = hiDomainFreq
    elif domainOffset == -1:
        domainOffset = round(hiDomainFreq * 0.2)
    log.std("Setting domain-offset: %i", domainOffset)
    # get/init Domains
    for domain in domains_stat:
        freq = domains_stat[domain]
        if freq >= domainOffset:
            domains += 1
            domain_record = Domain.fetch((Domain.name == domain,), exists=False)
            if not domain_record:
                domain_record = Domain(name=domain)
                domain_record.init_defaults()
                sa.add(domain_record)
    sa.commit()
    log.std("Checked %i domains", len(domains_stat))
    log.std("Tracking %i domains", domains)

    # Prepare tag stats
    avgFreq = sum(tags_stat.values())/(len(tags_stat)*1.0)
    hiFreq = max(tags_stat.values())
    log.std("Found tag usage (max/avg): %i/%i", hiFreq, avgFreq)
    tagOffset = int(opts.flags.tag_offset)
    if tagOffset == 0:
        tagOffset = hiFreq
    elif tagOffset == -1:
        tagOffset = round(hiFreq * 0.1)
    log.std("Setting tag-offset: %i", tagOffset)
    # get/init Tags
    tags = 0
    for tag in tags_stat:
        freq = tags_stat[tag]
        if not re.match(r'[A-Za-z0-9-]+', tag):
            log.std("Non-std tag %s", tag)
        if freq >= tagOffset:
            tags += 1
            t = Node.fetch((Node.name == tag,), exists=False)
            if not t:
                t = Tag(name=tag)
                t.init_defaults()
                log.std("new: %s", t)
                sa.add(t)
    # store frequencies
    # TODO tags_freq
    log.std("Checked %i tags", len(tags_stat))
    log.std("Tracking %i tags", tags)
    sa.commit()
def dlcs_post_read(self, p):
    """Parse the delicious XML export at path `p` and yield each post dict."""
    from pydelicious import dlcs_parse_xml
    parsed = dlcs_parse_xml(open(p).read())
    for record in parsed['posts']:
        yield record