def smoke_test(habrauser='******'):
    """Simple smoke/acceptance test: crawl one user's info and store it
    in a pickle file.

    Args:
        habrauser: Habrahabr login to crawl. Default keeps the original
            hard-coded placeholder so existing no-arg calls still work.
    """
    user = crawl_user(habrauser)
    print(user)
    ut.data2pickle(user, '../data/users/' + habrauser + '.dat')
def flush_userbase():
    """Persist the full user list, then crawl and pickle every user whose
    per-user data file does not exist yet.

    Relies on a module-level ``users`` iterable of login strings (lines may
    carry trailing newlines) -- TODO confirm where it is populated.
    """
    # Snapshot the whole list first so it is saved even if crawling fails midway.
    ut.data2pickle(users, "../data/allusers.dat")
    for user in users:
        user = user.replace("\n", "")  # logins read from a file keep their newline
        # NOTE(review): the "@" suffix differs from the '<login>.dat' name used by
        # smoke_test -- presumably a marker for batch-crawled users; verify.
        filename = "../data/users/" + user + "@.dat"
        if not os.path.exists(filename):
            print("crawling user: <%s>" % user)
            user_data = cu.crawl_user(user)
            ut.data2pickle(user_data, filename)
def reload_users():
    """Re-pickle every user file in USERS_DIR, wrapping each record in a
    ``so.SmartObject``.

    NOTE(review): the original docstring claimed this builds a friends
    graph / edgelist, but the visible code only rewraps the pickles --
    the graph step presumably lives elsewhere; verify.
    """
    for filename in os.listdir(USERS_DIR):
        path = os.path.join(USERS_DIR, filename)
        user = ut.pickle2data(path)
        # Overwrite the raw record with its SmartObject-wrapped form, in place.
        ut.data2pickle(so.SmartObject(user), path)
def get_comment_tree(postpath):
    """Fetch a Habrahabr post and pickle its comment tree.

    Builds a dict of nested dicts keyed by author name:
        post author
          -> comment author
               -> sub-comment author
                    -> ...
    and stores it under ``<data_dir>/posts/<postpath>.dat``.

    Args:
        postpath: path part of the post URL, e.g. ``'post/12345'``.

    Returns:
        None. Returns early (None) if the page could not be fetched.
    """
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    if not root:
        return None
    author = root.xpath('//div[@class="author"]/a')[0].text
    print(author)

    comment_root_tree = {}

    def dfs_process(node, tree):
        """Recursively descend from *node*, filling the dict-tree *tree*."""
        print(node.get('id'))
        containers = node.xpath('.//div[@id="comments" or @class="reply_comments"]')
        if not containers:
            # Leaf comment: no reply container in the markup. The original
            # indexed [0] unconditionally, which raises IndexError here.
            return
        for comment in containers[0].xpath('./div[@class="comment_item"]'):
            comment_author = comment.xpath('.//a[@class="username"]')[0].text
            print(comment_author)
            child_tree = {}
            dfs_process(comment, child_tree)
            # child_tree is freshly created each iteration, so the original
            # deepcopy was redundant -- store it directly.
            tree[comment_author] = child_tree

    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print('tree:', comment_tree)

    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)