def get_news(from_num, num_headlines):
    # from_num is accepted but unused in this snippet
    hn = HackerNews()
    print("Starting HN")
    news_lst = []
    for story_id in hn.top_stories(limit=num_headlines):
        news_lst.append(hn.get_item(story_id).title + "\n")
    return news_lst
def refresh_posts():
    hn = HackerNews()
    for story in hn.top_stories(limit=10):  # Only viewing top 10 posts on HN
        story_id = hn.get_item(story)
        # Tweets title, story URL, and comments
        if len(story_id.title) > 76:  # Adjusting for max tweet length
            story_title = story_id.title.rsplit(' ', 1)[0] + '\n'
        else:
            story_title = story_id.title + '\n'
        story_comments = ('Cmts: https://news.ycombinator.com/item?id=%s'
                          % str(story_id.item_id))
        # Check to see if the post has an external link
        if story_id.url is None:
            try:
                # If the tweet is a duplicate, ignore the post and don't tweet
                api.update_status(story_title + story_comments)
            except tweepy.error.TweepError:
                continue
        else:
            story_url = 'Link: ' + story_id.url + '\n'
            try:
                # If the tweet is a duplicate, ignore the post and don't tweet
                api.update_status(story_title + story_url + story_comments)
            except tweepy.error.TweepError:
                continue
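# refresh_posts() above assumes a module-level tweepy `api` client that is
# already authenticated. A minimal sketch of that setup; the four credential
# strings are placeholders you must supply:
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)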
def main():
    """Get the top 30 stories on HN, summarize them, and store them
    in the database."""
    hn = HackerNews()
    stories = hn.top_stories(limit=30)
    front_page_list = []
    for story in stories:
        # Only keep stories with an external link (i.e. not jobs or Ask HN)
        if story.url:
            # Record the story id for the front-page position list
            front_page_list.append(story.item_id)
            if Story.query.filter_by(id=story.item_id).first() is None:
                # Generate a summary and store the new story in the db
                summary = gen_summary(story.url)
                s = Story(id=story.item_id, title=story.title,
                          summary=summary, url=story.url)
                db.session.add(s)
                db.session.commit()
    # Clear the positions table, then record each story's front-page position
    Position.query.delete()
    for i, story_id in enumerate(front_page_list):
        db.session.add(Position(id=story_id, position=i))
    db.session.commit()
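# main() above calls a gen_summary() helper that isn't shown. A crude,
# hypothetical stand-in that returns the first paragraph of the linked page;
# the project's real summarizer may work very differently:
import requests
from bs4 import BeautifulSoup

def gen_summary(url):
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return ""
    first_p = BeautifulSoup(html, "html.parser").find("p")
    return first_p.get_text(strip=True) if first_p else ""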
class Hacker(object):
    def __init__(self, vim):
        self.vim = vim
        self.hn = HackerNews()
        self.urls = None

    @neovim.command("Test")
    def test(self):
        self.vim.command("vsplit")

    @neovim.command('HackerNews')
    def fill_buffer(self):
        stories = []
        urls = {}
        for story in self.hn.top_stories()[0:30]:
            item = self.hn.get_item(story)
            stories.append(item.title)
            urls[item.title] = item.url
        self.vim.command("split HackerNews")
        self.vim.command("buffer HackerNews")
        self.vim.command("set buftype=nofile")
        self.vim.command("set bufhidden=hide")
        self.vim.command("setlocal noswapfile")
        self.vim.current.buffer[:] = stories
        self.urls = urls

    @neovim.command('HackerOpen')
    def autocmd_handler(self):
        url = self.urls[self.vim.current.line]
        webbrowser.open_new_tab(url)
import json

from hackernews import HackerNews

def handler(event, context):
    hn = HackerNews()
    results = []
    for story_id in hn.top_stories(limit=10):
        results.append(hn.get_item(story_id).title)
    return json.dumps(results)
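# A quick local smoke test for the Lambda-style handler above; the handler
# ignores its event and context arguments, so empty placeholders suffice:
if __name__ == "__main__":
    print(handler({}, None))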
def load_name_to_mongo():
    """Fetch the top 50 HN headlines and split the proper nouns in
    their titles into syllables."""
    dic = pyphen.Pyphen(lang='en')
    hn = HackerNews()
    top_stories = hn.top_stories(limit=50)
    words = [
        title for story in top_stories
        for title in filter(str.isalpha, story.title.split(' '))
    ]
    syllables = [
        syllable for word in words
        if not word.islower() and not word.isupper()
        for syllable in dic.inserted(word).split("-")
    ]
    # Keep only multi-character syllables; the original tested
    # len(syllables) (the whole list), which filtered nothing
    mixed_bag = [syllable for syllable in syllables if len(syllable) > 1]
    return mixed_bag
class TestTopStories(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        top_stories = self.hn.top_stories()
        self.assertIsInstance(top_stories, list)
        self.assertIsNotNone(top_stories)
class TestTopStories(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        top_stories = self.hn.top_stories(limit=10)
        self.assertIsInstance(top_stories, list)
        self.assertIsInstance(top_stories[0], Item)
        self.assertIsNotNone(top_stories)

    def test_top_stories_raw(self):
        top_stories = self.hn.top_stories(raw=True)
        self.assertIsInstance(top_stories, list)
        self.assertIsInstance(top_stories[0], str)
        self.assertIsNotNone(top_stories)

    def tearDown(self):
        self.hn.session.close()
def get_hackernews_article():
    hn_wrapper = HackerNews()
    story_id = random.choice(hn_wrapper.top_stories())
    story = hn_wrapper.get_item(story_id)
    result = story.title
    if story.url is not None:
        result += "\n" + story.url
    return result
def getHN_stories(self, article_limit):
    hn = HackerNews()
    # Over-fetch by 50%, presumably to leave slack for downstream filtering
    articles_to_retrieve = int(article_limit * 1.5)
    top_story_ids = hn.top_stories(limit=articles_to_retrieve)
    stories = []
    for story_id in top_story_ids:
        stories.append(hn.get_item(story_id))
    return stories
def getNews():
    hn = HackerNews()
    story_ids = []
    items_list = []
    # Collect the story ids
    for story_id in hn.top_stories(limit=100):
        story_ids.append(story_id)
    # Fetch each id and extract the useful information from it
    for story_id in story_ids:
        items_list.append(hn.get_item(story_id))
    return items_list
def updateHackerNews():
    database_execute('truncate discussion')
    hn = HackerNews()
    stories = hn.top_stories(limit=30)
    for story_id in stories:
        item = hn.get_item(story_id)
        url = "https://news.ycombinator.com/item?id=" + str(story_id)
        title = item.title.replace("'", "")  # strip quotes that would break the SQL literal
        score = item.score
        # Note: string-formatted SQL is injection-prone; prefer a
        # parameterized query if database_execute supports one
        sql = ("insert into discussion values('%s','%s','%s','%s')"
               % (story_id, title, url, score))
        database_execute(sql)
    return "success"
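# updateHackerNews() above relies on a database_execute() helper that isn't
# shown. A minimal hypothetical version backed by MySQL via pymysql; the
# connection parameters are placeholders:
import pymysql

def database_execute(sql):
    conn = pymysql.connect(host="localhost", user="root",
                           password="", database="hn")
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()
    finally:
        conn.close()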
def update_hackernews(user, update):
    hn = HackerNews()
    for story_id in hn.top_stories(limit=15):
        post = postObject()
        hn_story = hn.get_item(story_id)
        message = hn_story.text
        post.mainlabel = hn_story.title.encode('ascii', 'ignore')
        post.time = str(hn_story.submission_time)
        post.sublabel = str(hn_story.score) + " points by " + hn_story.by
        post.message = message if message is not None else "Read more"
        post.type = 'hackernews'
        post.link = "https://news.ycombinator.com/"
        if post.mainlabel not in user.hackerNewsFeed:
            update.append(post.to_json())
            user.hackerNewsFeed.append(post.mainlabel)
    return update
def sync_with_hacker_news():
    hn = HackerNews()
    for story_id in hn.top_stories(limit=90):
        story = hn.get_item(story_id)
        persisted_news_item = NewsItem.query.get(story_id)
        if persisted_news_item:
            print("Updating story:", story_id)
            persisted_news_item.upvotes = story.score
            persisted_news_item.comments = comment_count(story)
        else:
            print("Adding story:", story_id)
            news_item = NewsItem(
                id=story_id,
                url=story.url,
                posted_on=story.submission_time,
                upvotes=story.score,
                comments=comment_count(story))
            db.session.add(news_item)
            for user in User.query.all():
                db.session.add(UserNewsItem(user=user, news_item=news_item))
    db.session.commit()
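# sync_with_hacker_news() above calls a comment_count() helper that isn't
# shown. A plausible minimal version using the haxor Item's descendants
# field (HN's total comment count for a story); the real helper may differ:
def comment_count(story):
    return story.descendants or 0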
def process(self, msg):
    """`hn: top` replies with the top stories; `hn: last` with the newest."""
    params = msg.extract_parameters(self.parameters)
    from hackernews import HackerNews
    hn = HackerNews()
    limit = int(params['limit'])
    story_ids = (hn.new_stories(limit=limit) if params['hn'] == "last"
                 else hn.top_stories(limit=limit))
    for s in story_ids:
        msg.reply("{title} - {score} - {url}".format(**hn.get_item(s).__dict__))
    return True
class HackerNewsTestCase(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_story_count(self):
        top_stories = self.hn.top_stories()
        self.assertEqual(len(top_stories), 100)

    def test_max_item_positive_integer(self):
        max_item = self.hn.max_item()
        self.assertGreaterEqual(max_item, 0)

    def test_updates_result_is_dict(self):
        updates = self.hn.updates()
        self.assertIsInstance(updates, dict)

    def test_item_result_is_dict(self):
        item = self.hn.item(1)
        self.assertIsInstance(item, Item)

    def test_user_result_is_dict(self):
        item = self.hn.user('pg')
        self.assertIsInstance(item, User)

    def test_user_created_is_datetime(self):
        item = self.hn.user('pg')
        self.assertIsInstance(item.created, datetime)

    def test_item_time_is_datetime(self):
        item = self.hn.item('1')
        # Item is an object, not a dict, so guard with getattr rather than
        # the original item.get('time')
        if getattr(item, 'time', None):
            self.assertIsInstance(item.time, datetime)

    def test_raises_connection_timeout(self):
        hn = HackerNews(timeout=1)
        hn.url = "http://192.0.2.0"  # RFC 5735 TEST-NET-1
        self.assertRaises(ConnectTimeout, hn.top_stories)

    def test_object_to_dict(self):
        item = self.hn.item('1')
        self.assertIsInstance(item.__dict__, dict)
def scrape_hacker_news():
    hn = HackerNews()
    item_id_list = hn.top_stories()
    stories_list = []
    for item_id in item_id_list:
        print(item_id)
        # is_pres_count = StoryModel.objects.filter(story_id=item_id).count()
        # if (is_pres_count > 0):
        #     continue
        try:
            hn_story = hn.get_item(item_id)
            page = urllib2.urlopen(hn_story.url)
            bs = BeautifulSoup(page.read())
            content = bs.get_text()
            content = ' '.join(word for word in content.split('\n') if word != '')
            story = StoryModel(
                story_id=hn_story.item_id,
                title=hn_story.title,
                link=hn_story.url,
                points=hn_story.score,
                # content=content,
                # submitter=hn_story
                published_time=hn_story.submission_time)
            # story.save()
            stories_list.append(story)
        except Exception as e:
            print("error while retrieving: %s" % e)
            continue
    print("the size of the story list is %s" % len(stories_list))
    return stories_list
class HN:
    def __init__(self, speaker):
        self.speaker = speaker
        self.hn = HackerNews()

    def get_top_stories(self):
        ids = self.hn.top_stories(limit=10)
        for id in ids:
            item = self.hn.get_item(id)
            print(item.title)
            self.speaker.say(item.title)
            # time.sleep(5)

    def check_command(self, data):
        if "news" in data:
            if not internet_on():
                self.speaker.say("no internet connection, try later")
                return False
            if "check" in data:
                self.get_top_stories()
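# A minimal sketch of driving the HN class above; `speaker` only needs a
# say() method, so this ConsoleSpeaker is a hypothetical stand-in:
class ConsoleSpeaker:
    def say(self, text):
        print(text)

hn_reader = HN(ConsoleSpeaker())
hn_reader.get_top_stories()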
def test_save_item(self):
    hn = HackerNews()
    item_id_list = hn.top_stories()
    for item_id in item_id_list[:5]:
        try:
            is_pres_count = StoryModel.objects.filter(
                story_id=item_id).count()
            if is_pres_count == 0:
                continue
        except Exception as e:
            print("Error occurred: %s" % e)
            continue
        hn_story = hn.get_item(item_id)
        story = StoryModel(
            title=hn_story.title,
            link=hn_story.url,
            points=hn_story.score,
            # submitter=hn_story
            published_time=hn_story.submission_time)
        story.save()
db = client.get_database()
print("CONNECTED TO " + dbURL)
links = []
tagMap = {}
tagSet = set()

# Make the tag set and tag map
for tag in db.tags.find():
    tagSet.add(tag["name"].lower())
    # The tag map gets us back to the correct casing
    tagMap[tag["name"].lower()] = tag["name"]

# Get new links
for story_id in hn.top_stories(limit=1000):
    item = hn.get_item(story_id)
    url = item.url
    print(item)
    # Skip links that are already in the database
    if db.unrelatedlinks.find_one({'url': item.url}) is not None:
        continue
    try:
        response = requests.get(url)
    except requests.RequestException:
        continue
    # Get the description
    soup = BeautifulSoup(response.text, "html.parser")
class HackNews:
    def __init__(self):
        self.hn = HackerNews()
        self.jsonObj = []

    def displayHackNews(self, jobsOrHeadlines):
        if jobsOrHeadlines == "headlines":
            return self.topStories()
        elif jobsOrHeadlines == "jobs":
            return self.jobAds()
        else:
            # The original called resp.message() on an out-of-scope response
            # object; returning the text keeps the method consistent
            return ("Oops, wrong category! Text us: "
                    "'HACKNEWS: jobs' or 'HACKNEWS: headlines'")

    def topStories(self):
        cleanHeadline = ""
        textReturn = ""
        for story_id in self.hn.top_stories(limit=10):
            # The item's string form embeds the title after ' - ', so split
            # it out and drop the trailing character
            uncleanHeadline = str(self.hn.get_item(story_id)).split(' - ', 1)
            cleanHeadline = uncleanHeadline[1][:-1]
            textReturn += cleanHeadline + '\n\n'
            self.jsonObj.append({"title": cleanHeadline})
        if cleanHeadline:
            self.jsonObj.append({"success": "true"})
        else:
            self.jsonObj.append({"success": "false"})
        return textReturn

    def jobAds(self):
        textReturn = ""
        maxLoops = 10
        for numLoops, story_id in enumerate(self.hn.top_stories(), start=1):
            if numLoops >= maxLoops:
                break
            story = self.hn.get_item(story_id)
            if story.item_type == 'job':
                uncleanHeadline = str(story).split(' - ', 1)
                cleanHeadline = uncleanHeadline[1][:-1]
                textReturn += cleanHeadline + '\n'
                if cleanHeadline:
                    self.jsonObj.append({"title": cleanHeadline})
        if textReturn == "":
            textReturn += "No jobs have been posted in Top Stories, try again tomorrow!"
            self.jsonObj.append({"success": "false"})
        else:
            self.jsonObj.append({"success": "true"})
        return textReturn

    def convertToJson(self):
        return self.jsonObj
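# Driving the HackNews class above: headline mode returns the SMS text and
# fills jsonObj with title/success entries as a side effect:
hacknews = HackNews()
print(hacknews.displayHackNews("headlines"))
print(hacknews.convertToJson())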
from hackernews import HackerNews
from .models import Story

hn = HackerNews()
for story_id in hn.top_stories(limit=10):
    story = hn.get_item(story_id)
    print(story.title)
    print(story.url)
    print(story.score)
    print(story.by)
    print(story.submission_time)
    print(story.item_id)
    Story.objects.create(title=story.title, url=story.url,
                         score=story.score, submitter=story.by,
                         timestamp=story.submission_time, hn_id=story.item_id)
places.append("Donut Touch") places.append("New York Bagels") places.append("Karl Strauss") number = random.randint(0,len(places) - 1) sc.rtm_send_message(chan, "You should go to %s to for food." % places[number]) ####JIRA STUFF elif "!helpdesk" in message: request = message[10:] new_issue = j.create_issue(project="IT", summary=request, description="Created by Slack", issuetype={'name':'Service Request'}, reporter={"name": email}) #edit project ID to match. sc.rtm_send_message(chan, "Your helpdesk ticket for '%s' has been created." % request) ####Hacker News Stuff elif "!hn" in message: n=0 sc.rtm_send_message(chan,"Top 2 HackerNews Stories:") for story_id in hn.top_stories(limit=2): derp = hn.get_item(story_id) derp = str(derp) print "derp is:" print derp herp = derp print "herp is:" print herp derpy = derp.split(":")[1] print "derpy is:" print derpy derpy = derpy.split("-")[0] print "derpy is" print derpy derpy = derpy.strip() print "derpy is"
from hackernews import HackerNews

hn = HackerNews()
top_story_ids = hn.top_stories()
print(top_story_ids)
# Get feed entries
if len(config['feed_sources']) > 0:
    feed_links = config['feed_sources']
    pattern = re.compile("=(.*)$")
    for one_feed in feed_links:
        one_feed_parsed = feedparser.parse(one_feed)
        for one_entry in one_feed_parsed.entries:
            urls.append(pattern.search(one_entry.id).group(1))
            titles.append(one_entry.title)
            selftexts.append(one_entry.description)

# Go through the top Hacker News items
if config['add_hn_entries']:
    hn = HackerNews()
    num_posts = config['hn_num_posts']
    for story_id in hn.top_stories(limit=num_posts):
        one_item = hn.get_item(story_id)
        if one_item.item_type in ['poll', 'story'] and one_item.descendants >= 10:
            urls.append('https://news.ycombinator.com/item?id=' +
                        str(one_item.item_id))
            titles.append(one_item.title)
            selftexts.append('Article from HackerNews')

new_data = pd.DataFrame({
    'url': urls,
    'title': titles,
    'selftext': selftexts,
    'time': datetime.now()
})
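# The feed-aggregation snippet above assumes a config dict and accumulator
# lists defined earlier in the script; a minimal hypothetical setup:
config = {
    'feed_sources': [],     # list of RSS/Atom feed URLs
    'add_hn_entries': True,
    'hn_num_posts': 30,
}
urls, titles, selftexts = [], [], []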
class Scraper(object):
    """
    Scrapes various services, namely Reddit and HackerNews, to gather
    data to feed into the SentenceGenerator model.
    """

    def __init__(self, logger):
        """
        Initializes an instance of Scraper. Requires a logger, used to
        report the Scraper's progress, to be passed in.
        """
        self.phrases = []
        self.hackernews = HackerNews()
        self.output = mp.Queue()
        self.logger = logger

    def gather_reddit_data(self):
        """
        Gathers comments and submission titles from Reddit. Returns an
        updated list of phrases after the Reddit data has been gathered.
        """
        # split the list of subreddits to allow for parallel processing
        subreddit_sublists = Scraper._split_into_sublists(DEFAULT_SUBREDDITS,
                                                          NUM_SUBREDDITS_PROCESSOR)
        # set up processes, run, and collect results
        reddit_processes = [mp.Process(target=self._gather_reddit_data,
                                       args=(subreddits,))
                            for subreddits in subreddit_sublists]
        self._execute_and_collect_processes(reddit_processes)

    def gather_hn_data(self):
        """
        Gathers comments and submission titles from HN. Returns an updated
        list of phrases after the HN data has been gathered.
        """
        # get the top stories from HN and split the list
        top_stories = self.hackernews.top_stories()[:3]
        stories_sublists = Scraper._split_into_sublists(top_stories,
                                                        NUM_HN_THREAD_PROCESSOR)
        hn_processes = [mp.Process(target=self._gather_hn_data, args=(stories,))
                        for stories in stories_sublists]
        self._execute_and_collect_processes(hn_processes)

    def _execute_and_collect_processes(self, processes):
        """
        Executes the processes and collects the phrases the scraper
        has gathered.
        """
        for p_num, process in enumerate(processes):
            self.logger.debug("Starting process %d" % p_num)
            process.start()
        for p_num, process in enumerate(processes):
            self.logger.debug("Joining process %d" % p_num)
            process.join()
        self.logger.debug("Combining results")
        while self.output.qsize():
            phrase = self.output.get()
            try:
                phrase = phrase.decode("utf-8").encode("ascii", "ignore")
                self.phrases.append(phrase)
            except UnicodeEncodeError:
                self.logger.warning("Phrase %s could not be decoded" % phrase)

    @classmethod
    def _split_into_sublists(cls, lst, size):
        """
        Splits the list, lst, into sublists of size 'size'. Returns a new
        list consisting of len(lst) / size sublists of size 'size'.
        """
        sublists = []
        for i in xrange(0, len(lst), size):
            sublists.append(lst[i:i + size])
        return sublists

    def _gather_reddit_data(self, subreddits):
        """
        Gathers data from the Reddit API. The param, subreddits, holds all
        the subreddits this process will gather data from; the shared output
        queue collects the joint result of the worker processes.
        """
        reddit = praw.Reddit(user_agent="Scrum Generator")
        for subreddit in subreddits:
            # force lazy eval by converting to a list
            top_submissions = list(reddit.get_subreddit(subreddit).get_top(limit=2))
            titles = [entry.title.encode("utf8", "ignore")
                      for entry in top_submissions]
            comments = sum([[c.body for c in submission.comments
                             if not isinstance(c, praw.objects.MoreComments)]
                            for submission in top_submissions], [])
            for comment in comments:
                self.output.put(Scraper._clean_data(comment))
            for title in titles:
                self.output.put(Scraper._clean_data(title))

    def _gather_hn_data(self, entries):
        """
        Gathers data from the Hacker News API. The param, entries,
        represents all of the posts this process will gather data from.
""" for entry in entries: response = urllib2.urlopen(HN_BASE_API_ENDPOINT + str(entry)).read() soup = BeautifulSoup(response, "html.parser") all_comments = soup.findAll("span", {"class" : "comment"}) for comment in all_comments: cleaned_html = re.sub('<[^<]+?>|reply|\n', "", comment.text) cleaned_data = Scraper._clean_data(cleaned_html) self.output.put(cleaned_data) @classmethod def _clean_data(cls, phrase): """ Cleans each phrase from both Reddit and HackerNews to be processed by SentenceGenerator. Returns a cleaned string free of parens, curly and square brackets, and quotes along with spaces after punctuation. """ # replace illegal chracaters cleaned_phrase = re.sub("[(%~`<>#:@/^*&$\t?=|){}\\[\\]\"\n]", "", phrase) # make sure each period is proceeded by a space for proper punctuation cleaned_phrase = re.sub(r'[?!.]([a-zA-Z])', r'. \1', cleaned_phrase) return cleaned_phrase def insert_into_db(self): """ Inserts the data into the Postgres DB. """ self.logger.debug("Inserting data in to the database") if len(self.phrases) == 0: self.logger.info("No phrases to insert!") else: self.logger.debug("Attempting to insert %d phrases into the database" \ % len(self.phrases)) conn = psycopg2.connect(database=os.environ["DATABASE"], user=os.environ["USER"]) cur = conn.cursor() successful_insertion = 0 for phrase in self.phrases: self.logger.debug("Attempting to insert %s..." % phrase) phrase_hash = int(hashlib.sha1(phrase).hexdigest(), 16) % 10 ** 8 phrase = phrase.replace("'", "''") # escape quotes sql_string = "INSERT INTO phrases (phrase, phrase_hash) VALUES ('%s', '%d')" \ % (phrase, phrase_hash) try: cur.execute(sql_string) self.logger.debug("Successfully inserted %s" % phrase) successful_insertion += 1 conn.commit() except psycopg2.IntegrityError: # duplicate comments not allowed self.logger.warn("The phrase '%s' is already in the database" % phrase) conn.rollback() self.logger.debug("Successfully inserted %d / %d phrases into the db" \ % (successful_insertion, len(self.phrases)))
from hackernews import HackerNews

hn = HackerNews()
stories = hn.top_stories()
for story in stories:
    print(story)
# Using the haxor Hacker News Python API wrapper.
# Don't resubmit a query more often than every 30 seconds.
from datetime import datetime

import matplotlib
from matplotlib import pyplot as plt
from hackernews import HackerNews

hn = HackerNews()
story_tup_list = []
comment_tup = ()
get_comments = False
now = datetime.now()

top_story_ids = hn.top_stories(limit=30)
for story_id in top_story_ids:
    story = hn.get_item(story_id)
    story_tup = (story.title, story.score, story.submission_time)
    story_tup_list.append(story_tup)
    if get_comments:
        for comment_id in story.kids:
            comment = hn.get_item(comment_id)
            comment_tup = (comment.submission_time, comment.text)

# Sort by score, highest first
story_tup_list.sort(key=lambda tup: tup[1], reverse=True)
for story in story_tup_list[:5]:
    print(story[0], story[1])
    string_list = story[0].split(" ")