def get_unclassified_posts(posts_chunk, unclassified_hn_posts, chunk_number):
    """Collect HN posts from one chunk that have not been classified yet.

    Posts already present in the ``posts`` collection are refreshed via
    ``update_post``; new posts whose link content can be scraped are
    appended to ``unclassified_hn_posts`` (mutated in place).
    """
    for index, post_id in enumerate(posts_chunk):
        # Global ranking: chunks hold 20 posts, 1-based within the chunk.
        rank = chunk_number * 20 + index + 1
        cached = posts.find_one({'id': post_id})
        fetched = get_hn_post(post_id)
        if cached:
            # NOTE(review): argument order (old, new) here differs from
            # classify_top_posts, which calls update_post(new, old, rank)
            # — confirm which order update_post actually expects.
            update_post(cached, fetched, rank)
            continue
        if not (fetched and 'url' in fetched):
            continue
        content = get_link_content(fetched['url'])
        if not content:
            continue
        print(fetched['url'])
        posted_at = datetime.datetime.fromtimestamp(int(fetched['time']))
        record = {
            'id': post_id,
            'url': fetched['url'],
            'title': fetched['title'],
            'text': content,
            'time': posted_at,
            'score': fetched['score'],
            'username': fetched['by'],
            'ranking': rank,
        }
        if 'descendants' in fetched:
            record['comments'] = fetched['descendants']
        unclassified_hn_posts.append(record)
def get_subreddit_samples(category, subreddit, posts_chunk, samples):
    """Append one ``{'text', 'label'}`` sample per post in *posts_chunk*.

    Self posts contribute their own selftext; link posts contribute the
    scraped content of their URL. Posts yielding no content are skipped.
    ``samples`` is mutated in place.
    """
    for post in posts_chunk:
        print(subreddit + ': ' + post.url)
        # Self posts carry their body inline; otherwise scrape the link.
        body = post.selftext if post.is_self else get_link_content(post.url)
        if body:
            samples.append({'text': body, 'label': category})
def get_subreddit_samples(category, subreddit, posts_chunk, samples):
    """Collect labelled text samples from a chunk of subreddit posts.

    Each post with retrievable content adds a dict with keys ``text``
    and ``label`` (the given *category*) to ``samples``.
    """
    for submission in posts_chunk:
        print(subreddit + ': ' + submission.url)
        if submission.is_self:
            # Text post: the body is stored on the submission itself.
            text = submission.selftext
        else:
            # Link post: fetch and extract the page content.
            text = get_link_content(submission.url)
        if not text:
            continue
        samples.append({'text': text, 'label': category})
def classify_top_posts(max_posts=None, logger=None):
    """Fetch HN top stories, classify new ones with MonkeyLearn, persist to db.

    Args:
        max_posts: optional cap on how many top stories to process.
        logger: optional logger for progress messages.

    Already-cached posts are refreshed via ``update_post``. New posts with
    usable text are classified in a single MonkeyLearn batch; posts without
    text are stored immediately with the 'random' label. Posts no longer on
    the front page are deleted at the end.
    """
    top_posts_ids = firebase.get('/v0/topstories', None)
    new_posts = []
    for i, post_id in enumerate(top_posts_ids):
        ranking = i + 1
        post = get_hn_post(post_id)
        cached_post = db.posts.find_one({'id': post_id})
        if logger:
            # Fixed typo: "Procesing" -> "Processing".
            logger.debug(u'#{} Processing {} ("{}")'.format(
                ranking, post['id'], post['title']))
        if cached_post:
            if logger:
                logger.debug('----> Already classified, updating...')
            update_post(post, cached_post, ranking)
        elif post and 'url' in post:
            text = get_link_content(post['url'])
            post_data = {
                'id': post_id,
                'url': post['url'],
                'title': post['title'],
                'text': text,
                'time': datetime.datetime.fromtimestamp(int(post['time'])),
                'score': post['score'],
                'username': post['by'],
                'ranking': ranking,
            }
            if 'descendants' in post:
                post_data['comments'] = post['descendants']
            if text and text.strip():
                # Usable text: queue for the batch classification below.
                if logger:
                    logger.debug('----> Queuing for classification...')
                new_posts.append(post_data)
            else:
                # Nothing to classify: store directly as 'random'.
                if logger:
                    logger.debug('----> Unclassifiable, inserting as random...')
                post_data['result'] = {'label': 'random', 'probability': '--'}
                post_data['original_result'] = None
                db.posts.insert(post_data)
        if max_posts and ranking >= max_posts:
            break

    if new_posts:
        if logger:
            logger.debug("Classifying {} queued posts with MonkeyLearn".format(
                len(new_posts)))
        ml = MonkeyLearn(MONKEYLEARN_TOKEN)
        result = ml.classifiers.classify(
            MONKEYLEARN_MODULE_ID,
            (p['text'] for p in new_posts)
        ).result
        # Attach classification results; low-confidence predictions fall
        # back to 'random' but keep the original result for reference.
        for i, post in enumerate(new_posts):
            if result[i][0]['probability'] > 0.5:
                post['result'] = result[i][0]
            else:
                post['result'] = {'label': 'random', 'probability': '--'}
                post['original_result'] = result[i][0]
            if logger:
                # BUG FIX: was logger.debug('{} {}',format(...)) — a comma
                # instead of '.format', which called builtin format() with
                # the post title as a format spec (ValueError at runtime).
                logger.debug('{} {}'.format(post['ranking'], post['title']))
            db.posts.insert(post)

    # Drop cached posts that are no longer among the top stories.
    db.posts.delete_many({'id': {'$nin': top_posts_ids}})
def classify_top_posts(max_posts=None, logger=None):
    """Fetch HN top stories, classify new ones with MonkeyLearn, persist to db.

    Args:
        max_posts: optional cap on how many top stories to process.
        logger: optional logger for progress messages.

    Cached posts are refreshed via ``update_post``. New posts with usable
    text are classified in one MonkeyLearn batch; posts without text are
    inserted immediately with the 'random' label. Posts that dropped off
    the front page are deleted at the end.
    """
    top_posts_ids = firebase.get('/v0/topstories', None)
    new_posts = []
    for i, post_id in enumerate(top_posts_ids):
        ranking = i + 1
        post = get_hn_post(post_id)
        cached_post = db.posts.find_one({'id': post_id})
        if logger:
            # Fixed typo: "Procesing" -> "Processing".
            logger.debug(u'#{} Processing {} ("{}")'.format(
                ranking, post['id'], post['title']))
        if cached_post:
            if logger:
                logger.debug('----> Already classified, updating...')
            update_post(post, cached_post, ranking)
        elif post and 'url' in post:
            text = get_link_content(post['url'])
            post_data = {
                'id': post_id,
                'url': post['url'],
                'title': post['title'],
                'text': text,
                'time': datetime.datetime.fromtimestamp(int(post['time'])),
                'score': post['score'],
                'username': post['by'],
                'ranking': ranking,
            }
            if 'descendants' in post:
                post_data['comments'] = post['descendants']
            if text and text.strip():
                # Usable text: queue for the batch classification below.
                if logger:
                    logger.debug('----> Queuing for classification...')
                new_posts.append(post_data)
            else:
                # Nothing to classify: insert directly as 'random'.
                if logger:
                    logger.debug(
                        '----> Unclassifiable, inserting as random...')
                post_data['result'] = {
                    'label': 'random',
                    'probability': '--'
                }
                post_data['original_result'] = None
                db.posts.insert(post_data)
        if max_posts and ranking >= max_posts:
            break

    # Classify queued posts in a single batch request
    if new_posts:
        if logger:
            logger.debug("Classifying {} queued posts with MonkeyLearn".format(
                len(new_posts)))
        ml = MonkeyLearn(MONKEYLEARN_TOKEN)
        result = ml.classifiers.classify(MONKEYLEARN_MODULE_ID,
                                         (p['text'] for p in new_posts)).result
        for i, post in enumerate(new_posts):
            # Low-confidence predictions fall back to 'random' but keep
            # the original result for reference.
            if result[i][0]['probability'] > 0.5:
                post['result'] = result[i][0]
            else:
                post['result'] = {'label': 'random', 'probability': '--'}
                post['original_result'] = result[i][0]
            if logger:
                # BUG FIX: was logger.debug('{} {}', format(...)) — that
                # calls builtin format(ranking, title) with the title as a
                # format spec (ValueError), and '{} {}' is not a %-style
                # logging template anyway. Use str.format explicitly.
                logger.debug('{} {}'.format(post['ranking'], post['title']))
            db.posts.insert(post)

    # Drop cached posts that are no longer among the top stories.
    db.posts.delete_many({'id': {'$nin': top_posts_ids}})