def get_news(from_num, num_headlines):
    # from_num is accepted but unused in this snippet
    hn = HackerNews()
    print("Starting HN")
    news_lst = []
    for story_id in hn.top_stories(limit=num_headlines):
        news_lst.append(hn.get_item(story_id).title + "\n")
    return news_lst
def refresh_posts():
    hn = HackerNews()
    for story in hn.top_stories(limit=10):  # Only viewing top 10 posts on HN
        story_id = hn.get_item(story)
        # Tweets title, story URL, and comments
        if len(story_id.title) > 76:  # Adjusting for max tweet length
            story_title = story_id.title.rsplit(' ', 1)[0] + '\n'
        else:
            story_title = story_id.title + '\n'
        story_comments = ('Cmts: https://news.ycombinator.com/item?id=%s'
                          % str(story_id.item_id))
        # Check to see if the post has an external link
        if story_id.url is None:
            try:
                # If the tweet is a duplicate, ignore the post and don't tweet
                api.update_status(story_title + story_comments)
            except tweepy.error.TweepError:
                continue
        else:
            story_url = 'Link: ' + story_id.url + '\n'
            try:
                # If the tweet is a duplicate, ignore the post and don't tweet
                api.update_status(story_title + story_url + story_comments)
            except tweepy.error.TweepError:
                continue
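# refresh_posts() above assumes a module-level tweepy `api` client that is
# already authenticated. A minimal sketch of that setup; the four credential
# strings are placeholders you must supply:
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)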
def main():
    """Get the top 30 stories on HN, summarize them, and store them
    in the database."""
    hn = HackerNews()
    stories = hn.top_stories(limit=30)
    front_page_list = []
    for story in stories:
        # Only keep stories with an external link (i.e. not jobs or Ask HN)
        if story.url:
            # Record the story id for the front-page position list
            front_page_list.append(story.item_id)
            if Story.query.filter_by(id=story.item_id).first() is None:
                # Generate a summary and store the new story in the db
                summary = gen_summary(story.url)
                s = Story(id=story.item_id, title=story.title,
                          summary=summary, url=story.url)
                db.session.add(s)
                db.session.commit()
    # Clear the positions table, then record each story's front-page position
    Position.query.delete()
    for i, story_id in enumerate(front_page_list):
        db.session.add(Position(id=story_id, position=i))
    db.session.commit()
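# main() above calls a gen_summary() helper that isn't shown. A crude,
# hypothetical stand-in that returns the first paragraph of the linked page;
# the project's real summarizer may work very differently:
import requests
from bs4 import BeautifulSoup

def gen_summary(url):
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return ""
    first_p = BeautifulSoup(html, "html.parser").find("p")
    return first_p.get_text(strip=True) if first_p else ""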
class Hacker(object):
    def __init__(self, vim):
        self.vim = vim
        self.hn = HackerNews()
        self.urls = None

    @neovim.command("Test")
    def test(self):
        self.vim.command("vsplit")

    @neovim.command('HackerNews')
    def fill_buffer(self):
        stories = []
        urls = {}
        for story in self.hn.top_stories()[0:30]:
            item = self.hn.get_item(story)
            stories.append(item.title)
            urls[item.title] = item.url
        self.vim.command("split HackerNews")
        self.vim.command("buffer HackerNews")
        self.vim.command("set buftype=nofile")
        self.vim.command("set bufhidden=hide")
        self.vim.command("setlocal noswapfile")
        self.vim.current.buffer[:] = stories
        self.urls = urls

    @neovim.command('HackerOpen')
    def autocmd_handler(self):
        url = self.urls[self.vim.current.line]
        webbrowser.open_new_tab(url)
import json

from hackernews import HackerNews

def handler(event, context):
    hn = HackerNews()
    results = []
    for story_id in hn.top_stories(limit=10):
        results.append(hn.get_item(story_id).title)
    return json.dumps(results)
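# A quick local smoke test for the Lambda-style handler above; the handler
# ignores its event and context arguments, so empty placeholders suffice:
if __name__ == "__main__":
    print(handler({}, None))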
def load_name_to_mongo():
    """Fetch the top 50 HN headlines and split the proper nouns in
    their titles into syllables."""
    dic = pyphen.Pyphen(lang='en')
    hn = HackerNews()
    top_stories = hn.top_stories(limit=50)
    words = [
        title for story in top_stories
        for title in filter(str.isalpha, story.title.split(' '))
    ]
    syllables = [
        syllable for word in words
        if not word.islower() and not word.isupper()
        for syllable in dic.inserted(word).split("-")
    ]
    # Keep only multi-character syllables; the original tested
    # len(syllables) (the whole list), which filtered nothing
    mixed_bag = [syllable for syllable in syllables if len(syllable) > 1]
    return mixed_bag
class TestTopStories(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        top_stories = self.hn.top_stories()
        self.assertIsInstance(top_stories, list)
        self.assertIsNotNone(top_stories)
class TestTopStories(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_stories(self):
        top_stories = self.hn.top_stories(limit=10)
        self.assertIsInstance(top_stories, list)
        self.assertIsInstance(top_stories[0], Item)
        self.assertIsNotNone(top_stories)

    def test_top_stories_raw(self):
        top_stories = self.hn.top_stories(raw=True)
        self.assertIsInstance(top_stories, list)
        self.assertIsInstance(top_stories[0], str)
        self.assertIsNotNone(top_stories)

    def tearDown(self):
        self.hn.session.close()
def get_hackernews_article():
    hn_wrapper = HackerNews()
    story_id = random.choice(hn_wrapper.top_stories())
    story = hn_wrapper.get_item(story_id)
    result = story.title
    if story.url is not None:
        result += "\n" + story.url
    return result
def getHN_stories(self, article_limit):
    hn = HackerNews()
    # Over-fetch by 50%, presumably to leave slack for downstream filtering
    articles_to_retrieve = int(article_limit * 1.5)
    top_story_ids = hn.top_stories(limit=articles_to_retrieve)
    stories = []
    for story_id in top_story_ids:
        stories.append(hn.get_item(story_id))
    return stories
def getNews():
    hn = HackerNews()
    story_ids = []
    items_list = []
    # Collect the story ids
    for story_id in hn.top_stories(limit=100):
        story_ids.append(story_id)
    # Fetch each id and extract the useful information from it
    for story_id in story_ids:
        items_list.append(hn.get_item(story_id))
    return items_list
def updateHackerNews():
    database_execute('truncate discussion')
    hn = HackerNews()
    stories = hn.top_stories(limit=30)
    for story_id in stories:
        item = hn.get_item(story_id)
        url = "https://news.ycombinator.com/item?id=" + str(story_id)
        title = item.title.replace("'", "")  # strip quotes that would break the SQL literal
        score = item.score
        # Note: string-formatted SQL is injection-prone; prefer a
        # parameterized query if database_execute supports one
        sql = ("insert into discussion values('%s','%s','%s','%s')"
               % (story_id, title, url, score))
        database_execute(sql)
    return "success"
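# updateHackerNews() above relies on a database_execute() helper that isn't
# shown. A minimal hypothetical version backed by MySQL via pymysql; the
# connection parameters are placeholders:
import pymysql

def database_execute(sql):
    conn = pymysql.connect(host="localhost", user="root",
                           password="", database="hn")
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()
    finally:
        conn.close()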
def update_hackernews(user, update):
    hn = HackerNews()
    for story_id in hn.top_stories(limit=15):
        post = postObject()
        hn_story = hn.get_item(story_id)
        message = hn_story.text
        post.mainlabel = hn_story.title.encode('ascii', 'ignore')
        post.time = str(hn_story.submission_time)
        post.sublabel = str(hn_story.score) + " points by " + hn_story.by
        post.message = message if message is not None else "Read more"
        post.type = 'hackernews'
        post.link = "https://news.ycombinator.com/"
        if post.mainlabel not in user.hackerNewsFeed:
            update.append(post.to_json())
            user.hackerNewsFeed.append(post.mainlabel)
    return update
def sync_with_hacker_news():
    hn = HackerNews()
    for story_id in hn.top_stories(limit=90):
        story = hn.get_item(story_id)
        persisted_news_item = NewsItem.query.get(story_id)
        if persisted_news_item:
            print("Updating story:", story_id)
            persisted_news_item.upvotes = story.score
            persisted_news_item.comments = comment_count(story)
        else:
            print("Adding story:", story_id)
            news_item = NewsItem(
                id=story_id,
                url=story.url,
                posted_on=story.submission_time,
                upvotes=story.score,
                comments=comment_count(story))
            db.session.add(news_item)
            for user in User.query.all():
                db.session.add(UserNewsItem(user=user, news_item=news_item))
    db.session.commit()
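# sync_with_hacker_news() above calls a comment_count() helper that isn't
# shown. A plausible minimal version using the haxor Item's descendants
# field (HN's total comment count for a story); the real helper may differ:
def comment_count(story):
    return story.descendants or 0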
def process(self, msg):
    """`hn: top` replies with the top stories; `hn: last` with the newest."""
    params = msg.extract_parameters(self.parameters)
    from hackernews import HackerNews
    hn = HackerNews()
    limit = int(params['limit'])
    story_ids = (hn.new_stories(limit=limit) if params['hn'] == "last"
                 else hn.top_stories(limit=limit))
    for s in story_ids:
        msg.reply("{title} - {score} - {url}".format(**hn.get_item(s).__dict__))
    return True
class HackerNewsTestCase(unittest.TestCase):
    def setUp(self):
        self.hn = HackerNews()

    def test_top_story_count(self):
        top_stories = self.hn.top_stories()
        self.assertEqual(len(top_stories), 100)

    def test_max_item_positive_integer(self):
        max_item = self.hn.max_item()
        self.assertGreaterEqual(max_item, 0)

    def test_updates_result_is_dict(self):
        updates = self.hn.updates()
        self.assertIsInstance(updates, dict)

    def test_item_result_is_dict(self):
        item = self.hn.item(1)
        self.assertIsInstance(item, Item)

    def test_user_result_is_dict(self):
        item = self.hn.user('pg')
        self.assertIsInstance(item, User)

    def test_user_created_is_datetime(self):
        item = self.hn.user('pg')
        self.assertIsInstance(item.created, datetime)

    def test_item_time_is_datetime(self):
        item = self.hn.item('1')
        # Item is an object, not a dict, so guard with getattr rather than
        # the original item.get('time')
        if getattr(item, 'time', None):
            self.assertIsInstance(item.time, datetime)

    def test_raises_connection_timeout(self):
        hn = HackerNews(timeout=1)
        hn.url = "http://192.0.2.0"  # RFC 5735 TEST-NET-1
        self.assertRaises(ConnectTimeout, hn.top_stories)

    def test_object_to_dict(self):
        item = self.hn.item('1')
        self.assertIsInstance(item.__dict__, dict)
def scrape_hacker_news():
    hn = HackerNews()
    item_id_list = hn.top_stories()
    stories_list = []
    for item_id in item_id_list:
        print(item_id)
        # is_pres_count = StoryModel.objects.filter(story_id=item_id).count()
        # if (is_pres_count > 0):
        #     continue
        try:
            hn_story = hn.get_item(item_id)
            page = urllib2.urlopen(hn_story.url)
            bs = BeautifulSoup(page.read())
            content = bs.get_text()
            content = ' '.join(word for word in content.split('\n') if word != '')
            story = StoryModel(
                story_id=hn_story.item_id,
                title=hn_story.title,
                link=hn_story.url,
                points=hn_story.score,
                # content=content,
                # submitter=hn_story
                published_time=hn_story.submission_time)
            # story.save()
            stories_list.append(story)
        except Exception as e:
            print("error while retrieving: %s" % e)
            continue
    print("the size of the story list is %s" % len(stories_list))
    return stories_list
class HN:
    def __init__(self, speaker):
        self.speaker = speaker
        self.hn = HackerNews()

    def get_top_stories(self):
        ids = self.hn.top_stories(limit=10)
        for id in ids:
            item = self.hn.get_item(id)
            print(item.title)
            self.speaker.say(item.title)
            # time.sleep(5)

    def check_command(self, data):
        if "news" in data:
            if not internet_on():
                self.speaker.say("no internet connection, try later")
                return False
            if "check" in data:
                self.get_top_stories()
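# A minimal sketch of driving the HN class above; `speaker` only needs a
# say() method, so this ConsoleSpeaker is a hypothetical stand-in:
class ConsoleSpeaker:
    def say(self, text):
        print(text)

hn_reader = HN(ConsoleSpeaker())
hn_reader.get_top_stories()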
def test_save_item(self):
    hn = HackerNews()
    item_id_list = hn.top_stories()
    for item_id in item_id_list[:5]:
        try:
            is_pres_count = StoryModel.objects.filter(
                story_id=item_id).count()
            if is_pres_count == 0:
                continue
        except Exception as e:
            print("Error occurred: %s" % e)
            continue
        hn_story = hn.get_item(item_id)
        story = StoryModel(
            title=hn_story.title,
            link=hn_story.url,
            points=hn_story.score,
            # submitter=hn_story
            published_time=hn_story.submission_time)
        story.save()
db = client.get_database()
print("CONNECTED TO " + dbURL)
links = []
tagMap = {}
tagSet = set()

# Make the tag set and tag map
for tag in db.tags.find():
    tagSet.add(tag["name"].lower())
    # The tag map gets us back to the correct casing
    tagMap[tag["name"].lower()] = tag["name"]

# Get new links
for story_id in hn.top_stories(limit=1000):
    item = hn.get_item(story_id)
    url = item.url
    print(item)
    # Skip links that are already in the database
    if db.unrelatedlinks.find_one({'url': item.url}) is not None:
        continue
    try:
        response = requests.get(url)
    except requests.RequestException:
        continue
    # Get the description
    soup = BeautifulSoup(response.text, "html.parser")
class HackNews:
    def __init__(self):
        self.hn = HackerNews()
        self.jsonObj = []

    def displayHackNews(self, jobsOrHeadlines):
        if jobsOrHeadlines == "headlines":
            return self.topStories()
        elif jobsOrHeadlines == "jobs":
            return self.jobAds()
        else:
            # The original called resp.message() on an out-of-scope response
            # object; returning the text keeps the method consistent
            return ("Oops, wrong category! Text us: "
                    "'HACKNEWS: jobs' or 'HACKNEWS: headlines'")

    def topStories(self):
        cleanHeadline = ""
        textReturn = ""
        for story_id in self.hn.top_stories(limit=10):
            # The item's string form embeds the title after ' - ', so split
            # it out and drop the trailing character
            uncleanHeadline = str(self.hn.get_item(story_id)).split(' - ', 1)
            cleanHeadline = uncleanHeadline[1][:-1]
            textReturn += cleanHeadline + '\n\n'
            self.jsonObj.append({"title": cleanHeadline})
        if cleanHeadline:
            self.jsonObj.append({"success": "true"})
        else:
            self.jsonObj.append({"success": "false"})
        return textReturn

    def jobAds(self):
        textReturn = ""
        maxLoops = 10
        for numLoops, story_id in enumerate(self.hn.top_stories(), start=1):
            if numLoops >= maxLoops:
                break
            story = self.hn.get_item(story_id)
            if story.item_type == 'job':
                uncleanHeadline = str(story).split(' - ', 1)
                cleanHeadline = uncleanHeadline[1][:-1]
                textReturn += cleanHeadline + '\n'
                if cleanHeadline:
                    self.jsonObj.append({"title": cleanHeadline})
        if textReturn == "":
            textReturn += "No jobs have been posted in Top Stories, try again tomorrow!"
            self.jsonObj.append({"success": "false"})
        else:
            self.jsonObj.append({"success": "true"})
        return textReturn

    def convertToJson(self):
        return self.jsonObj
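# Driving the HackNews class above: headline mode returns the SMS text and
# fills jsonObj with title/success entries as a side effect:
hacknews = HackNews()
print(hacknews.displayHackNews("headlines"))
print(hacknews.convertToJson())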
from hackernews import HackerNews
from .models import Story

hn = HackerNews()
for story_id in hn.top_stories(limit=10):
    story = hn.get_item(story_id)
    print(story.title)
    print(story.url)
    print(story.score)
    print(story.by)
    print(story.submission_time)
    print(story.item_id)
    Story.objects.create(title=story.title, url=story.url,
                         score=story.score, submitter=story.by,
                         timestamp=story.submission_time, hn_id=story.item_id)
places.append("Donut Touch") places.append("New York Bagels") places.append("Karl Strauss") number = random.randint(0,len(places) - 1) sc.rtm_send_message(chan, "You should go to %s to for food." % places[number]) ####JIRA STUFF elif "!helpdesk" in message: request = message[10:] new_issue = j.create_issue(project="IT", summary=request, description="Created by Slack", issuetype={'name':'Service Request'}, reporter={"name": email}) #edit project ID to match. sc.rtm_send_message(chan, "Your helpdesk ticket for '%s' has been created." % request) ####Hacker News Stuff elif "!hn" in message: n=0 sc.rtm_send_message(chan,"Top 2 HackerNews Stories:") for story_id in hn.top_stories(limit=2): derp = hn.get_item(story_id) derp = str(derp) print "derp is:" print derp herp = derp print "herp is:" print herp derpy = derp.split(":")[1] print "derpy is:" print derpy derpy = derpy.split("-")[0] print "derpy is" print derpy derpy = derpy.strip() print "derpy is"
from hackernews import HackerNews

hn = HackerNews()
top_story_ids = hn.top_stories()
print(top_story_ids)
# Get feed entries
if len(config['feed_sources']) > 0:
    feed_links = config['feed_sources']
    pattern = re.compile("=(.*)$")
    for one_feed in feed_links:
        one_feed_parsed = feedparser.parse(one_feed)
        for one_entry in one_feed_parsed.entries:
            urls.append(pattern.search(one_entry.id).group(1))
            titles.append(one_entry.title)
            selftexts.append(one_entry.description)

# Go through the top Hacker News items
if config['add_hn_entries']:
    hn = HackerNews()
    num_posts = config['hn_num_posts']
    for story_id in hn.top_stories(limit=num_posts):
        one_item = hn.get_item(story_id)
        if one_item.item_type in ['poll', 'story'] and one_item.descendants >= 10:
            urls.append('https://news.ycombinator.com/item?id=' +
                        str(one_item.item_id))
            titles.append(one_item.title)
            selftexts.append('Article from HackerNews')

new_data = pd.DataFrame({
    'url': urls,
    'title': titles,
    'selftext': selftexts,
    'time': datetime.now()
})
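# The feed-aggregation snippet above assumes a config dict and accumulator
# lists defined earlier in the script; a minimal hypothetical setup:
config = {
    'feed_sources': [],     # list of RSS/Atom feed URLs
    'add_hn_entries': True,
    'hn_num_posts': 30,
}
urls, titles, selftexts = [], [], []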
class Scraper(object):
    """
    Scrapes various services, namely Reddit and HackerNews, to gather
    data to feed into the SentenceGenerator model.
    """

    def __init__(self, logger):
        """
        Initializes an instance of Scraper. Requires a logger, used to
        report the Scraper's progress, to be passed in.
        """
        self.phrases = []
        self.hackernews = HackerNews()
        self.output = mp.Queue()
        self.logger = logger

    def gather_reddit_data(self):
        """
        Gathers comments and submission titles from Reddit. Returns an
        updated list of phrases after the Reddit data has been gathered.
        """
        # split the list of subreddits to allow for parallel processing
        subreddit_sublists = Scraper._split_into_sublists(DEFAULT_SUBREDDITS,
                                                          NUM_SUBREDDITS_PROCESSOR)
        # set up processes, run, and collect results
        reddit_processes = [mp.Process(target=self._gather_reddit_data,
                                       args=(subreddits,))
                            for subreddits in subreddit_sublists]
        self._execute_and_collect_processes(reddit_processes)

    def gather_hn_data(self):
        """
        Gathers comments and submission titles from HN. Returns an updated
        list of phrases after the HN data has been gathered.
        """
        # get the top stories from HN and split the list
        top_stories = self.hackernews.top_stories()[:3]
        stories_sublists = Scraper._split_into_sublists(top_stories,
                                                        NUM_HN_THREAD_PROCESSOR)
        hn_processes = [mp.Process(target=self._gather_hn_data, args=(stories,))
                        for stories in stories_sublists]
        self._execute_and_collect_processes(hn_processes)

    def _execute_and_collect_processes(self, processes):
        """
        Executes the processes and collects the phrases the scraper
        has gathered.
        """
        for p_num, process in enumerate(processes):
            self.logger.debug("Starting process %d" % p_num)
            process.start()
        for p_num, process in enumerate(processes):
            self.logger.debug("Joining process %d" % p_num)
            process.join()
        self.logger.debug("Combining results")
        while self.output.qsize():
            phrase = self.output.get()
            try:
                phrase = phrase.decode("utf-8").encode("ascii", "ignore")
                self.phrases.append(phrase)
            except UnicodeEncodeError:
                self.logger.warning("Phrase %s could not be decoded" % phrase)

    @classmethod
    def _split_into_sublists(cls, lst, size):
        """
        Splits the list, lst, into sublists of size 'size'. Returns a new
        list consisting of len(lst) / size sublists of size 'size'.
        """
        sublists = []
        for i in xrange(0, len(lst), size):
            sublists.append(lst[i:i + size])
        return sublists

    def _gather_reddit_data(self, subreddits):
        """
        Gathers data from the Reddit API. The param, subreddits, holds all
        the subreddits this process will gather data from; the shared output
        queue collects the joint result of the worker processes.
        """
        reddit = praw.Reddit(user_agent="Scrum Generator")
        for subreddit in subreddits:
            # force lazy eval by converting to a list
            top_submissions = list(reddit.get_subreddit(subreddit).get_top(limit=2))
            titles = [entry.title.encode("utf8", "ignore")
                      for entry in top_submissions]
            comments = sum([[c.body for c in submission.comments
                             if not isinstance(c, praw.objects.MoreComments)]
                            for submission in top_submissions], [])
            for comment in comments:
                self.output.put(Scraper._clean_data(comment))
            for title in titles:
                self.output.put(Scraper._clean_data(title))

    def _gather_hn_data(self, entries):
        """
        Gathers data from the Hacker News API. The param, entries,
        represents all of the posts this process will gather data from.
""" for entry in entries: response = urllib2.urlopen(HN_BASE_API_ENDPOINT + str(entry)).read() soup = BeautifulSoup(response, "html.parser") all_comments = soup.findAll("span", {"class" : "comment"}) for comment in all_comments: cleaned_html = re.sub('<[^<]+?>|reply|\n', "", comment.text) cleaned_data = Scraper._clean_data(cleaned_html) self.output.put(cleaned_data) @classmethod def _clean_data(cls, phrase): """ Cleans each phrase from both Reddit and HackerNews to be processed by SentenceGenerator. Returns a cleaned string free of parens, curly and square brackets, and quotes along with spaces after punctuation. """ # replace illegal chracaters cleaned_phrase = re.sub("[(%~`<>#:@/^*&$\t?=|){}\\[\\]\"\n]", "", phrase) # make sure each period is proceeded by a space for proper punctuation cleaned_phrase = re.sub(r'[?!.]([a-zA-Z])', r'. \1', cleaned_phrase) return cleaned_phrase def insert_into_db(self): """ Inserts the data into the Postgres DB. """ self.logger.debug("Inserting data in to the database") if len(self.phrases) == 0: self.logger.info("No phrases to insert!") else: self.logger.debug("Attempting to insert %d phrases into the database" \ % len(self.phrases)) conn = psycopg2.connect(database=os.environ["DATABASE"], user=os.environ["USER"]) cur = conn.cursor() successful_insertion = 0 for phrase in self.phrases: self.logger.debug("Attempting to insert %s..." % phrase) phrase_hash = int(hashlib.sha1(phrase).hexdigest(), 16) % 10 ** 8 phrase = phrase.replace("'", "''") # escape quotes sql_string = "INSERT INTO phrases (phrase, phrase_hash) VALUES ('%s', '%d')" \ % (phrase, phrase_hash) try: cur.execute(sql_string) self.logger.debug("Successfully inserted %s" % phrase) successful_insertion += 1 conn.commit() except psycopg2.IntegrityError: # duplicate comments not allowed self.logger.warn("The phrase '%s' is already in the database" % phrase) conn.rollback() self.logger.debug("Successfully inserted %d / %d phrases into the db" \ % (successful_insertion, len(self.phrases)))
from hackernews import HackerNews

hn = HackerNews()
stories = hn.top_stories()
for story in stories:
    print(story)
# Using the haxor Hacker News Python API wrapper.
# Don't resubmit a query more often than every 30 seconds.
from datetime import datetime

import matplotlib
from matplotlib import pyplot as plt
from hackernews import HackerNews

hn = HackerNews()
story_tup_list = []
comment_tup = ()
get_comments = False
now = datetime.now()

top_story_ids = hn.top_stories(limit=30)
for story_id in top_story_ids:
    story = hn.get_item(story_id)
    story_tup = (story.title, story.score, story.submission_time)
    story_tup_list.append(story_tup)
    if get_comments:
        for comment_id in story.kids:
            comment = hn.get_item(comment_id)
            comment_tup = (comment.submission_time, comment.text)

# Sort by score, highest first
story_tup_list.sort(key=lambda tup: tup[1], reverse=True)
for story in story_tup_list[:5]:
    print(story[0], story[1])
    string_list = story[0].split(" ")