def main():
    """
    Sync the Hacker News front page into the database.

    Fetches the top 30 stories, stores a generated summary for every story
    that has a URL and is not yet in the ``Story`` table, then rebuilds the
    ``Position`` table so that it mirrors the current front-page order.
    """
    client = HackerNews()
    front_page_ids = []

    for entry in client.top_stories(limit=30):
        # Jobs and "Ask" posts carry no external URL; they are ignored.
        if not entry.url:
            continue

        front_page_ids.append(entry.item_id)

        # A story that is already stored keeps its existing summary.
        if Story.query.filter_by(id=entry.item_id).first() is not None:
            continue

        record = Story(
            id=entry.item_id,
            title=entry.title,
            summary=gen_summary(entry.url),
            url=entry.url,
        )
        db.session.add(record)
        db.session.commit()

    # Throw away the previous ranking before writing the new one.
    Position.query.delete()
    for rank, story_id in enumerate(front_page_ids):
        db.session.add(Position(id=story_id, position=rank))
    db.session.commit()
class Hacker(object):
    """
    Neovim plugin that lists Hacker News story titles in a scratch buffer.

    ``HackerNews`` fills the buffer; ``HackerOpen`` opens the story on the
    current line. ``HackerOpen`` indexes ``self.urls``, which is ``None``
    until ``HackerNews`` has run, so calling it first raises ``TypeError``.
    """

    def __init__(self, vim):
        self.vim = vim
        self.hn = HackerNews()
        # title -> url mapping, filled in by fill_buffer()
        self.urls = None

    @neovim.command("Test")
    def test(self):
        """Open a vertical split."""
        self.vim.command("vsplit")

    @neovim.command('HackerNews')
    def fill_buffer(self):
        """Show the titles of the first 30 top stories in a new buffer."""
        titles = []
        title_to_url = {}
        for story_id in self.hn.top_stories()[0:30]:
            entry = self.hn.get_item(story_id)
            titles.append(entry.title)
            # Stories sharing a title overwrite each other here.
            title_to_url[entry.title] = entry.url

        # Open the buffer and configure it as a swap-less scratch buffer.
        for ex_command in (
            "split HackerNews",
            "buffer HackerNews",
            "set buftype=nofile",
            "set bufhidden=hide",
            "setlocal noswapfile",
        ):
            self.vim.command(ex_command)

        self.vim.current.buffer[:] = titles
        self.urls = title_to_url

    @neovim.command('HackerOpen')
    def autocmd_handler(self):
        """Open the URL stored for the title on the current line."""
        target = self.urls[self.vim.current.line]
        webbrowser.open_new_tab(target)
def refresh_posts():
    """
    Tweet the current top 10 Hacker News posts.

    Every tweet holds the story title, a ``Link:`` line when the post has an
    external URL, and a ``Cmts:`` line pointing at the comment thread. A post
    whose tweet raises ``TweepError`` (a duplicate status, according to the
    original author's note) is skipped.
    """
    client = HackerNews()
    for story_ref in client.top_stories(limit=10):
        post = client.get_item(story_ref)

        # Titles longer than 76 characters lose their last word to leave
        # room for the links.
        headline = post.title
        if len(headline) > 76:
            headline = headline.rsplit(' ', 1)[0]
        headline = headline + '\n'

        comments_line = ('Cmts: https://news.ycombinator.com/item?id=%s'
                         % str(post.item_id))

        # Posts without an external link get no "Link:" line.
        if post.url is None:
            tweet = headline + comments_line
        else:
            tweet = headline + ('Link: ' + post.url + '\n') + comments_line

        try:
            api.update_status(tweet)
        except tweepy.error.TweepError:
            continue
def getHNData(verbose=False, limit=100, sub="showstories"):
    """
    Return the lower-cased titles of one Hacker News story listing.

    All requests run inside a HoverPy session whose database lives at
    ``data/hn.<sub>.db``.

    :param verbose: when true, print each title as it is fetched.
    :param limit: ``limit`` value passed to the listing call.
    :param sub: one of ``showstories``, ``askstories``, ``jobstories`` or
        ``topstories``.
    :returns: list of lower-cased story titles.
    :raises KeyError: if ``sub`` is not one of the four listings above.
    """
    from hackernews import HackerNews
    from hackernews import settings
    import hoverpy, time, os

    session_db = "data/hn.%s.db" % sub
    with hoverpy.HoverPy(recordMode="once", dbpath=session_db) as hp:
        # Outside capture mode the client is pointed at the plain-http URL.
        if hp.mode() != "capture":
            settings.supported_api_versions["v0"] = \
                "http://hacker-news.firebaseio.com/v0/"

        client = HackerNews()
        collected = []
        print("GETTING HACKERNEWS %s DATA" % sub)

        listings = {
            "showstories": client.show_stories,
            "askstories": client.ask_stories,
            "jobstories": client.job_stories,
            "topstories": client.top_stories,
        }
        started = time.time()
        for story_id in listings[sub](limit=limit):
            entry = client.get_item(story_id)
            if verbose:
                print(entry.title.lower())
            collected.append(entry.title.lower())

        elapsed_report = (len(collected), time.time() - started)
        print("got %i hackernews titles in %f seconds" % elapsed_report)
        return collected
def get_news(from_num, num_headlines):
    """
    Return the titles of the current top Hacker News stories.

    :param from_num: accepted for interface compatibility; not used here.
    :param num_headlines: ``limit`` passed to ``top_stories``.
    :returns: list of story titles, each with a trailing newline.
    """
    hn = HackerNews()
    # Function-call form: prints the same text on Python 2 and is valid
    # syntax on Python 3, where the print statement is a SyntaxError.
    print("Starting HN")
    return [
        hn.get_item(story_id).title + "\n"
        for story_id in hn.top_stories(limit=num_headlines)
    ]
class TestGetUser(unittest.TestCase):
    """Checks for ``HackerNews.get_user``."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_get_user(self):
        """A known user id yields a populated ``User``."""
        fetched = self.hn.get_user('pg')
        expected_created = datetime.datetime.fromtimestamp(1160418092)
        self.assertIsInstance(fetched, User)
        self.assertEqual(fetched.user_id, 'pg')
        self.assertEqual(fetched.created, expected_created)
        self.assertEqual(repr(fetched), '<hackernews.User: pg>')

    def test_get_invalid_user(self):
        """An unknown user id raises ``InvalidUserID``."""
        with self.assertRaises(InvalidUserID):
            self.hn.get_user('a')

    def test_get_user_expand(self):
        """``expand=True`` turns comments and stories into ``Item`` objects."""
        fetched = self.hn.get_user('avinassh', expand=True)
        self.assertIsInstance(fetched, User)
        self.assertEqual(fetched.user_id, 'avinassh')
        self.assertIsInstance(fetched.comments[0], Item)
        self.assertIsInstance(fetched.stories[0], Item)
def grab_last_n_days(num_days):
    """
    Walk backwards from the newest item id, storing items in batches of 100.

    After each batch the oldest item of the batch is looked up; the walk
    stops once that item's ``time`` is older than ``num_days`` days before
    the start of the run. A summary is printed on the way out, including
    when the loop is interrupted by an exception.

    :param num_days: how many days back to fetch.
    """
    start_time = time.time()
    cutoff_time = start_time - (num_days * 24 * 60 * 60)
    hn = HackerNews()
    max_id = item_id = hn.get_max_item_id()
    stats = collections.Counter()
    step = 100
    try:
        while True:
            item_ids = range(item_id, item_id - step, -1)
            stats.update(add_many_items(item_ids))
            item = hn[item_id - step + 1]
            # A missing item (falsy lookup) must not crash the progress
            # print below; treat it as "now" and keep walking.
            item_time = item.get("time", start_time) if item else start_time
            if item and item_time < cutoff_time:
                break
            item_id -= step
            print(max_id - item_id, item_id, time.ctime(item_time))
    finally:
        count = sum(stats.values())
        spent = time.time() - start_time
        # str.format cannot evaluate expressions or quoted subscripts, so
        # every value is computed first and passed by name.
        rate = count / spent if spent else 0.0
        print(
            "Fetched {count} items in {spent:.2f} seconds "
            "({rate:.1f} items per second)".format(
                count=count, spent=spent, rate=rate))
        print(
            " {new} new items, {updated} updates, {same} same as before"
            .format(new=stats["new"], updated=stats["updated"],
                    same=stats["same"]))
def load_name_to_mongo():
    """
    Build a list of syllables from words in the top 50 HN headlines.

    Only purely alphabetic, mixed-case words (neither all lower-case nor all
    upper-case) are kept; each is hyphenated with Pyphen and split into
    syllables. Single-character syllables are dropped.

    The function only returns the list; nothing is written to a database
    in this block.

    :returns: list of syllable strings, each longer than one character.
    """
    dic = pyphen.Pyphen(lang='en')
    hn = HackerNews()
    top_stories = hn.top_stories(limit=50)

    words = [
        word
        for story in top_stories
        for word in story.title.split(' ')
        if word.isalpha()
    ]
    syllables = [
        syllable
        for word in words
        if not word.islower() and not word.isupper()
        for syllable in dic.inserted(word).split("-")
    ]
    # The length test applies to each syllable. The previous code tested the
    # length of the whole list, which made the filter all-or-nothing.
    return [syllable for syllable in syllables if len(syllable) > 1]
def handler(event, context):
    """
    Return the titles of the top 10 Hacker News stories as a JSON array.

    :param event: not used.
    :param context: not used.
    :returns: JSON-encoded list of title strings.
    """
    client = HackerNews()
    titles = [
        client.get_item(story_id).title
        for story_id in client.top_stories(limit=10)
    ]
    return json.dumps(titles)
def test_can_setup_with_max(self):
    """A custom ``max`` is stored and the output file keeps its default."""
    instance = HackerNews()
    instance.setup(url=self.url, max=50)

    self.assertEqual(instance.url, self.url)
    self.assertEqual(instance.max, 50)
    self.assertEqual(instance.file, './hackernews.txt')
def test_can_setup_with_defaults(self):
    """With only a URL given, ``max`` and ``file`` take their defaults."""
    instance = HackerNews()
    instance.setup(url=self.url)

    self.assertEqual(instance.url, self.url)
    self.assertEqual(instance.max, 25)
    self.assertEqual(instance.file, './hackernews.txt')
def get_hackernews_article():
    """
    Pick a random top story and return its title and, if present, its URL.

    :returns: the title, followed by a newline and the URL when the story
        has one.
    """
    client = HackerNews()
    chosen = random.choice(client.top_stories())
    story = client.get_item(chosen)

    if story.url is None:
        return story.title
    return story.title + "\n" + story.url
def test_setup_hackernews_with_file(self):
    """A custom output file name is stored; ``max`` keeps its default."""
    instance = HackerNews()
    instance.setup(url=self.url, file='hackernews2.txt')

    self.assertEqual(instance.url, self.url)
    self.assertEqual(instance.max, 25)
    self.assertEqual(instance.file, 'hackernews2.txt')
def test_setup_hackernews_max_file(self):
    """Custom URL, ``max`` and file name are all stored by ``setup``."""
    instance = HackerNews()
    instance.setup(url=self.url, max=50, file='hackernews3.txt')

    self.assertEqual(instance.url, self.url)
    self.assertEqual(instance.max, 50)
    self.assertEqual(instance.file, 'hackernews3.txt')
def add_many_items(item_ids):
    """
    Fetch the given items concurrently and pass each to ``update_item``.

    Requests are issued through ``grequests.imap`` with a pool size of 50.
    Responses whose JSON body is empty or null are skipped.

    :param item_ids: iterable of Hacker News item ids.
    :returns: list of the values returned by ``hn.update_item``, in the
        order the responses were yielded.
    """
    client = HackerNews()
    endpoint = "https://hacker-news.firebaseio.com/v0/item/"
    pending = (
        grequests.get(endpoint + str(item_id) + ".json")
        for item_id in item_ids
    )
    # Lazy pipeline: each response is decoded and stored as it arrives.
    payloads = (
        response.json() for response in grequests.imap(pending, size=50)
    )
    return [client.update_item(payload) for payload in payloads if payload]
def getHN_stories(self, article_limit):
    """
    Fetch top stories, requesting 1.5x as many as ``article_limit``.

    :param article_limit: base number of articles wanted.
    :returns: list of items for ``int(article_limit * 1.5)`` top stories.
    """
    # NOTE(review): the 1.5 factor presumably leaves headroom for later
    # filtering by the caller -- confirm.
    client = HackerNews()
    fetch_count = int(article_limit * 1.5)
    return [
        client.get_item(story_id)
        for story_id in client.top_stories(limit=fetch_count)
    ]
class TestGetMaxItem(unittest.TestCase):
    """Checks for ``HackerNews.get_max_item``."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_get_max_item(self):
        """Without ``expand`` the call yields an integer id."""
        newest_id = self.hn.get_max_item()
        self.assertIsInstance(newest_id, int)

    def test_get_max_item_expand(self):
        """With ``expand=True`` the call yields an ``Item``."""
        newest = self.hn.get_max_item(expand=True)
        self.assertIsInstance(newest, Item)
def getNews():
    """
    Return the items for the top 100 Hacker News stories.

    The story ids are collected first; the items are fetched afterwards,
    one request per id.

    :returns: list of items in front-page order.
    """
    client = HackerNews()
    story_ids = list(client.top_stories(limit=100))
    return [client.get_item(story_id) for story_id in story_ids]
def getHackerNewsEntries():
    """
    Build an HTML digest of up to ``num`` Hacker News stories.

    Phase 1 walks the first ``5 * num`` story ids and collects
    ``(title, url)`` pairs until ``num`` stories with both fields have been
    found. Stories that fail to download or lack a field are reported and
    skipped. Phase 2 parses each article for its domain and word count,
    appends a formatted entry to the digest and shortens the URL.

    Relies on the module-level names ``num``, ``url``, ``url2``, ``parser``,
    ``shortener``, ``getHackerNewsIds`` and ``getTimeReadingString``.

    :returns: ``(digest_html, shortened_urls)``.
    """
    listHN = getHackerNewsIds()[0:5 * num]
    if listHN:
        print(listHN[0])

    # Phase 1. Iterating the id list directly bounds the loop: the previous
    # ``while`` version indexed past the end of the list, swallowed the
    # IndexError and never terminated when too few stories were usable.
    title = []
    for item in listHN:
        if len(title) >= num:
            break
        try:
            html = urlopen(url + str(item) + url2).read()
            parsedJson = json.loads(html.decode('utf-8'))
            title.append((parsedJson['title'], parsedJson['url']))
        except Exception:
            print("Error: " + url + str(item) + url2)

    # Phase 2. Entry numbers follow the position in ``title``, so a failed
    # article leaves a gap in the numbering, as before.
    parts = []
    shortenedList = []
    for i, (headline, link) in enumerate(title):
        try:
            details = parser.parse_article(link).json()
            word_count = details['word_count']
            domain = details['domain']
            parts.append(
                "<b>" + str(i + 1) + ") " + headline + "</b>\n"
                + "<a href=\"" + link + "\">" + domain + "</a>"
                + getTimeReadingString(word_count) + "\n\n")
            shortenedList.append(shortener.short(link))
        except Exception:
            print("Error: ")
    return "".join(parts), shortenedList
class TestGetMaxItem(unittest.TestCase):
    """Checks that ``get_max_item`` yields an integer id."""

    def setUp(self):
        self.hn = HackerNews()

    def test_get_max_item(self):
        """The newest item id is an ``int``."""
        newest_id = self.hn.get_max_item()
        self.assertIsInstance(newest_id, int)
def refresh(self, widget=None, no_timer=False, chrome_data_directory=None,
            firefox_data_directory=None):
    """
    Refresh the menu with the first 20 home-page stories.

    Each story is flagged with whether its URL was found in the Chrome
    and/or Firefox history, for whichever data directories are given.
    Network errors are reported and leave the menu unchanged. Unless
    ``no_timer`` is set, the method schedules itself to run again.

    :param widget: passed through to the rescheduled call; not used here.
    :param no_timer: when true, do not schedule another refresh.
    :param chrome_data_directory: Chrome profile directory, or ``None``.
    :param firefox_data_directory: Firefox profile directory, or ``None``.
    """
    try:
        # One flag per story; False means "not found in any history".
        searchResults = [False] * 20
        data = list(reversed(HackerNews.getHomePage()[0:20]))
        urls = [item['url'] for item in data]

        if chrome_data_directory:
            searchResults = self.mergeBoolArray(
                searchResults, Chrome.search(urls, chrome_data_directory))
        if firefox_data_directory:
            searchResults = self.mergeBoolArray(
                searchResults, Firefox.search(urls, firefox_data_directory))

        # Remove the current stories; only story entries carry a `url`.
        for child in self.menu.get_children():
            if hasattr(child, 'url'):
                self.menu.remove(child)

        # Add the refreshed stories back.
        for index, item in enumerate(data):
            item['history'] = searchResults[index]
            # Relative links are made absolute.
            if item['url'].startswith('item?id='):
                item['url'] = "https://news.ycombinator.com/" + item['url']
            self.addItem(item)
    except requests.exceptions.RequestException:
        # Function-call form is valid on both Python 2 and Python 3.
        print("[+] There was an error in fetching news items")
    finally:
        # Re-arm after 10 * 30 * 1000 ms = 5 minutes. The Firefox directory
        # is forwarded too, so later refreshes keep searching its history.
        if not no_timer:
            gtk.timeout_add(10 * 30 * 1000, self.refresh, widget, no_timer,
                            chrome_data_directory, firefox_data_directory)
def updateHackerNews():
    """
    Replace the ``discussion`` table with the top 30 Hacker News stories.

    The table is truncated, then one row per story is inserted holding the
    story id, its title with single quotes removed, the link to its comment
    page and its score.

    :returns: the string ``"success"``.
    """
    database_execute('truncate discussion')

    client = HackerNews()
    for story_id in client.top_stories(limit=30):
        entry = client.get_item(story_id)
        link = "https://news.ycombinator.com/item?id=" + str(story_id)
        clean_title = entry.title.replace("'", "")
        row = (story_id, clean_title, link, entry.score)
        # NOTE(review): the statement is built by string interpolation from
        # remote data; stripping quotes is the only escaping applied. Switch
        # to bound parameters if database_execute supports them.
        statement = "insert into discussion values('%s','%s','%s','%s')" % row
        database_execute(statement)
    return "success"
class TestaskStories(unittest.TestCase):
    """Checks for ``HackerNews.ask_stories``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_ask_stories(self):
        """The call yields a non-None list."""
        listing = self.hn.ask_stories()
        self.assertIsInstance(listing, list)
        self.assertIsNotNone(listing)
class TestGetAsync(unittest.TestCase):
    """Checks for the private ``_run_async`` fetch helper."""

    def setUp(self):
        self.hn = HackerNews()
        # `item` is the endpoint used for the success case; `items` is the
        # one used for the failure case.
        self.url = 'https://hacker-news.firebaseio.com/v0/item/8863.json'
        self.err_url = 'https://hacker-news.firebaseio.com/v0/items/8863.json'

    def tearDown(self):
        self.hn.session.close()

    def test_get_async(self):
        """The item URL yields the decoded item."""
        results = self.hn._run_async([self.url])
        first = results[0]
        self.assertEqual(first['id'], 8863)
        self.assertEqual(first['by'], 'dhouston')

    def test_get_async_error(self):
        """The failing URL yields ``None`` in its slot."""
        results = self.hn._run_async([self.err_url])
        self.assertEqual(results, [None])
class TestNewStories(unittest.TestCase):
    """Checks for ``HackerNews.new_stories``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_new_stories(self):
        """The call yields a non-None list."""
        listing = self.hn.new_stories()
        self.assertIsInstance(listing, list)
        self.assertIsNotNone(listing)
class TestjobStories(unittest.TestCase):
    """Checks for ``HackerNews.job_stories``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_job_stories(self):
        """The call yields a non-None list."""
        listing = self.hn.job_stories()
        self.assertIsInstance(listing, list)
        self.assertIsNotNone(listing)
class TestGetItem(unittest.TestCase):
    """Checks for ``HackerNews.get_item``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_get_item(self):
        """Item 8863 comes back as an ``Item`` authored by ``dhouston``."""
        fetched = self.hn.get_item(8863)
        self.assertIsInstance(fetched, Item)
        self.assertEqual(fetched.item_id, 8863)
        self.assertEqual(fetched.by, "dhouston")
class TestShowStories(unittest.TestCase):
    """Checks for ``HackerNews.show_stories`` in expanded and raw form."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_show_stories(self):
        """With a limit, the call yields a list of ``Item`` objects."""
        listing = self.hn.show_stories(limit=10)
        self.assertIsInstance(listing, list)
        self.assertIsInstance(listing[0], Item)
        self.assertIsNotNone(listing)

    def test_show_stories_raw(self):
        """With ``raw=True``, the call yields a list of strings."""
        listing = self.hn.show_stories(raw=True)
        self.assertIsInstance(listing, list)
        self.assertIsInstance(listing[0], str)
        self.assertIsNotNone(listing)
def update_hackernews(user, update):
    """
    Append feed posts for top Hacker News stories the user has not seen.

    For each of the top 15 stories a post object is filled in. Stories whose
    ASCII-encoded title is already in ``user.hackerNewsFeed`` are skipped;
    the others are serialised onto ``update`` and recorded in the feed.

    :param user: object with a ``hackerNewsFeed`` list; mutated in place.
    :param update: list receiving the serialised posts; mutated in place.
    :returns: the same ``update`` list.
    """
    client = HackerNews()
    for story_id in client.top_stories(limit=15):
        post = postObject()
        story = client.get_item(story_id)
        body = story.text

        # Non-ASCII characters are dropped from the label.
        post.mainlabel = story.title.encode('ascii', 'ignore')
        post.time = str(story.submission_time)
        post.sublabel = str(story.score) + " points by " + story.by
        # Stories without text get a placeholder message.
        post.message = "Read more" if body is None else body
        post.type = 'hackernews'
        post.link = "https://news.ycombinator.com/"

        if post.mainlabel in user.hackerNewsFeed:
            continue
        update.append(post.to_json())
        user.hackerNewsFeed.append(post.mainlabel)
    return update
class TestshowStories(unittest.TestCase):
    """Checks for ``HackerNews.show_stories``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_show_stories(self):
        """The call yields a non-None list."""
        listing = self.hn.show_stories()
        self.assertIsInstance(listing, list)
        self.assertIsNotNone(listing)
def __init__(self, logger):
    """
    Set up a Scraper.

    :param logger: logger used to report the Scraper's progress.
    """
    self.logger = logger
    self.phrases = []
    self.hackernews = HackerNews()
    # Multiprocessing queue stored as the scraper's output channel.
    self.output = mp.Queue()
class TestTopStories(unittest.TestCase):
    """Checks for ``HackerNews.top_stories``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        """The call yields a non-None list."""
        listing = self.hn.top_stories()
        self.assertIsInstance(listing, list)
        self.assertIsNotNone(listing)
class TestGetItemsByIDs(unittest.TestCase):
    """Checks for ``HackerNews.get_items_by_ids``."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_get_items_by_ids(self):
        """Three ids yield three ``Item`` objects."""
        fetched = self.hn.get_items_by_ids([1, 2, 3])
        self.assertIsInstance(fetched, list)
        self.assertEqual(len(fetched), 3)
        self.assertIsInstance(fetched[0], Item)

    def test_get_items_by_ids_filtered(self):
        """Filtering ids 1..49 by type ``story`` leaves 37 items."""
        wanted_ids = list(range(1, 50))
        fetched = self.hn.get_items_by_ids(wanted_ids, item_type='story')
        self.assertIsInstance(fetched, list)
        self.assertEqual(len(fetched), 37)
        self.assertIsInstance(fetched[0], Item)
def sync_with_hacker_news():
    """
    Mirror the top 90 Hacker News stories into the ``NewsItem`` table.

    A story that is already stored gets its upvote and comment counts
    refreshed. A new story is inserted and linked to every existing user
    through a ``UserNewsItem`` row. All changes are committed at the end.
    """
    hn = HackerNews()
    for story_id in hn.top_stories(limit=90):
        story = hn.get_item(story_id)
        persisted_news_item = NewsItem.query.get(story_id)

        if persisted_news_item:
            # %-formatting gives the same output as the old Python 2 print
            # statement and is valid on Python 3.
            print("Updating story: %s" % story_id)
            persisted_news_item.upvotes = story.score
            persisted_news_item.comments = comment_count(story)
            continue

        print("Adding story: %s" % story_id)
        news_item = NewsItem(
            id=story_id,
            url=story.url,
            posted_on=story.submission_time,
            upvotes=story.score,
            comments=comment_count(story))
        db.session.add(news_item)
        # Every user gets a link row for the new story.
        for user in User.query.all():
            db.session.add(UserNewsItem(user=user, news_item=news_item))
    db.session.commit()
class TestUpdates(unittest.TestCase):
    """Checks for ``HackerNews.updates``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        """The result is a dict whose ``profiles`` and ``items`` are lists."""
        changes = self.hn.updates()
        self.assertIsNotNone(changes)
        self.assertIsInstance(changes, dict)
        for key in ('profiles', 'items'):
            self.assertIsInstance(changes[key], list)
class TestGetUser(unittest.TestCase):
    """Checks for ``HackerNews.get_user``."""

    def setUp(self):
        self.hn = HackerNews()

    def test_get_user(self):
        """User ``pg`` comes back with the expected id and creation time."""
        fetched = self.hn.get_user('pg')
        expected_created = datetime.datetime.fromtimestamp(1160418092)
        self.assertIsInstance(fetched, User)
        self.assertEqual(fetched.user_id, 'pg')
        self.assertEqual(fetched.created, expected_created)
def process(self, msg):
    """ `hn:` top\n `hn: last """
    # The docstring above is kept verbatim: it reads like command usage
    # text and may be displayed at runtime.
    #
    # Replies once per story with "title - score - url". The `hn` parameter
    # selects the listing ("last" -> new stories, anything else -> top
    # stories) and `limit` is passed to the listing call. Always returns
    # True.
    params = msg.extract_parameters(self.parameters)
    from hackernews import HackerNews
    client = HackerNews()

    if params['hn'] == "last":
        story_ids = client.new_stories(int(params['limit']))
    else:
        story_ids = client.top_stories(int(params['limit']))

    for story_id in story_ids:
        fields = client.get_item(story_id).__dict__
        msg.reply("{title} - {score} - {url}".format(**fields))
    return True
class TestJobStories(unittest.TestCase):
    """Checks for ``HackerNews.job_stories`` in expanded and raw form."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_job_stories(self):
        """With a limit, the call yields a list of ``Item`` objects."""
        listing = self.hn.job_stories(limit=10)
        self.assertIsInstance(listing, list)
        self.assertIsInstance(listing[0], Item)
        self.assertIsNotNone(listing)

    def test_job_stories_raw(self):
        """With ``raw=True``, the call yields a list of strings."""
        listing = self.hn.job_stories(raw=True)
        self.assertIsInstance(listing, list)
        self.assertIsInstance(listing[0], str)
        self.assertIsNotNone(listing)
def refresh(self, widget=None, data=None):
    """
    Rebuild the menu from the first 20 home-page stories.

    :param widget: not used.
    :param data: not used; the stories are always fetched afresh.
    """
    stories = reversed(HackerNews.getHomePage()[0:20])

    # Drop the current stories; only story entries carry a `url`.
    for child in self.menu.get_children():
        if not hasattr(child, 'url'):
            continue
        self.menu.remove(child)

    # Add the refreshed stories back.
    for story in stories:
        self.addItem(story)

    # Schedule the next refresh in 5 minutes.
    gtk.timeout_add(5 * 60 * 1000, self.refresh)
class TestGetLast(unittest.TestCase):
    """Checks for ``HackerNews.get_last``."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_get_item(self):
        """Asking for 5 yields a list of 5 ``Item`` objects."""
        latest = self.hn.get_last(5)
        self.assertIsInstance(latest, list)
        self.assertEqual(len(latest), 5)
        self.assertIsInstance(latest[0], Item)
class TestGetUsersByIDs(unittest.TestCase):
    """Checks for ``HackerNews.get_users_by_ids``."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    def test_get_users_by_ids(self):
        """Three user ids yield three ``User`` objects."""
        wanted = ['pg', 'tptacek', 'jacquesm']
        fetched = self.hn.get_users_by_ids(wanted)
        self.assertIsInstance(fetched, list)
        self.assertEqual(len(fetched), 3)
        self.assertIsInstance(fetched[0], User)
class TestAll(unittest.TestCase):
    """Check for ``HackerNews.get_all``; skipped because of its runtime."""

    def setUp(self):
        self.hn = HackerNews()

    def tearDown(self):
        self.hn.session.close()

    @unittest.skip("Skip for timeout issue due to long runtime")
    def test_get_all(self):
        """The call yields a list of ``Item`` objects."""
        everything = self.hn.get_all()
        self.assertIsInstance(everything, list)
        self.assertIsInstance(everything[0], Item)
class TestHackerNewsParser(unittest.TestCase):
    """Checks for the news-list parser and the ``parse_comhead`` helper."""

    def setUp(self):
        self.hn = HackerNews()

    def test_parsed_score(self):
        """Every score should be a digit"""
        for news in self.hn.parse_news_list():
            score = news['score']
            self.assertTrue(score is None or score.isdigit())

    def test_parse_comhead(self):
        """Hosts are lower-cased, `www.` and a trailing slash are dropped."""
        cases = (
            # leading "www." is removed
            ('www.googlE.com', 'google.com'),
            # other sub-domains are kept
            ('plus.googlE.com', 'plus.google.com'),
            # a github user path is kept
            ('www.github.com/polyrabbit', 'github.com/polyrabbit'),
            ('github.com/', 'github.com'),
        )
        for raw, expected in cases:
            self.assertEqual(self.hn.parse_comhead(raw), expected)
def refresh(self, widget=None, no_timer=False):
    """
    Rebuild the menu from the first 20 home-page stories.

    :param widget: not used.
    :param no_timer: when true, do not schedule another refresh.
    """
    stories = reversed(HackerNews.getHomePage()[0:20])

    # Drop the current stories; only story entries carry a `url`.
    for child in self.menu.get_children():
        if not hasattr(child, 'url'):
            continue
        self.menu.remove(child)

    # Add the refreshed stories back.
    for story in stories:
        self.addItem(story)

    if no_timer:
        return
    # Schedule the next refresh: 10 * 60 * 1000 ms = 10 minutes.
    gtk.timeout_add(10 * 60 * 1000, self.refresh)
class TestGetSync(unittest.TestCase):
    """Checks for the private ``_get_sync`` fetch helper."""

    def setUp(self):
        self.hn = HackerNews()
        # `item` is the endpoint used for the success case; `items` is the
        # one used for the failure case.
        self.url = 'https://hacker-news.firebaseio.com/v0/item/8863.json'
        self.err_url = 'https://hacker-news.firebaseio.com/v0/items/8863.json'

    def tearDown(self):
        self.hn.session.close()

    def test_get_sync(self):
        """The item URL yields the decoded item."""
        payload = self.hn._get_sync(self.url)
        self.assertEqual(payload['id'], 8863)
        self.assertEqual(payload['by'], 'dhouston')

    def test_get_sync_error(self):
        """The failing URL raises ``HTTPError``."""
        with self.assertRaises(HTTPError):
            self.hn._get_sync(self.err_url)
class HackerNewsTestCase(unittest.TestCase):
    """Checks against the ``HackerNews`` client wrapper."""

    def setUp(self):
        self.hn = HackerNews()

    def test_top_story_count(self):
        """``top_stories`` yields 100 entries."""
        self.assertEqual(len(self.hn.top_stories()), 100)

    def test_max_item_positive_integer(self):
        """The newest item id is not negative."""
        self.assertGreaterEqual(self.hn.max_item(), 0)

    def test_updates_result_is_dict(self):
        """``updates`` yields a dict."""
        self.assertIsInstance(self.hn.updates(), dict)

    def test_item_result_is_dict(self):
        """``item`` yields an ``Item``."""
        fetched = self.hn.item(1)
        self.assertIsInstance(fetched, Item)

    def test_user_result_is_dict(self):
        """``user`` yields a ``User``."""
        fetched = self.hn.user('pg')
        self.assertIsInstance(fetched, User)

    def test_user_created_is_datetime(self):
        """A user's ``created`` attribute is a ``datetime``."""
        fetched = self.hn.user('pg')
        self.assertIsInstance(fetched.created, datetime)

    def test_item_time_is_datetime(self):
        """When an item has a ``time``, it is a ``datetime``."""
        fetched = self.hn.item('1')
        if not fetched.get('time'):
            return
        self.assertIsInstance(fetched.time, datetime)

    def test_raises_connection_timeout(self):
        """A host that never answers raises ``ConnectTimeout``."""
        client = HackerNews(timeout=1)
        client.url = "http://192.0.2.0"  # RFC 5735 TEST-NET-1
        with self.assertRaises(ConnectTimeout):
            client.top_stories()

    def test_object_to_dict(self):
        """An item's ``__dict__`` is a plain dict."""
        fetched = self.hn.item('1')
        self.assertIsInstance(fetched.__dict__, dict)
def setUp(self):
    """Create a fresh ``HackerNews`` client for each test."""
    client = HackerNews()
    self.hn = client
def setUp(self):
    """Create a fresh client and the two endpoint URLs used by the tests."""
    api_root = 'https://hacker-news.firebaseio.com/v0/'
    self.hn = HackerNews()
    # `item` is the endpoint used by the success tests.
    self.url = api_root + 'item/8863.json'
    # `items` is the endpoint used by the error tests.
    self.err_url = api_root + 'items/8863.json'