def test_slugify_on_save(self):
    item = NewsItem(title="My News Item", publish_date=now(),
                    create_user_id=1, feed_id=1)
    item.save()
    self.assertEqual(item.slug, 'my-news-item')
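# The test above assumes NewsItem derives its slug from the title when saved.
# A minimal sketch of what that save() override might look like (hypothetical,
# not the project's actual model), assuming a Django model and
# django.utils.text.slugify; the other fields used in the test are omitted.
from django.db import models
from django.utils.text import slugify


class NewsItem(models.Model):
    title = models.CharField(max_length=255)
    slug = models.SlugField(max_length=255, blank=True)

    def save(self, *args, **kwargs):
        if not self.slug:
            # "My News Item" -> "my-news-item"
            self.slug = slugify(self.title)
        super(NewsItem, self).save(*args, **kwargs)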
def add(request):
    if request.method == "POST":
        feed_id = request.POST.get("feed")
        title = request.POST.get("title")
        link = request.POST.get("link")
        summary = request.POST.get("summary")
        date = request.POST.get("date")

        if not feed_id or not title or not summary:
            return HttpResponseBadRequest(
                "Bad Request: must include feed, title, and summary\n")

        if date:
            try:
                date = dateutil.parser.parse(date)
            except (ValueError, OverflowError):
                return HttpResponseBadRequest(
                    "Bad Request: invalid date format\n")
        else:
            date = datetime.datetime.now()

        if not link:
            link = ""

        try:
            feed = NewsFeed.objects.get(id=feed_id)
        except NewsFeed.DoesNotExist:
            return HttpResponseNotFound("Not Found: no such feed id\n")

        item = NewsItem(feed=feed, title=title, link=link, date=date,
                        summary=summary)
        item.save()

        # Get the id of the item that immediately precedes the new one
        prev_id = None
        try:
            prev_items = NewsItem.objects.filter(
                feed=feed, id__lt=item.id).order_by("-id")[:1]
            if len(prev_items) > 0:
                prev_id = prev_items[0].id
        except Exception:
            pass

        out = dict()
        if prev_id:
            out["prev_id"] = str(prev_id)
        out["items"] = [item.to_json()]

        hr = dict()
        hr["body"] = json.dumps(out) + "\n"

        fanout_publish("feed-" + str(feed.id), str(item.id), str(prev_id), hr)

        return HttpResponse("Posted\n")
    else:
        return HttpResponseNotAllowed(["POST"])
def create_news_items(is_sticky=False, amount=1):
    for i in range(amount):
        item = NewsItem(title='silly news item name',
                        publish_date=now(),
                        published=True,
                        create_user_id=1,
                        feed_id=randint(1, 2),
                        sticky=is_sticky)
        item.save()
def get_items():
    page = int(request.query.page or 1)
    limit = int(request.query.limit or 100)

    items = NewsItem.select().paginate(page, limit)
    result = []
    for item in items:
        result.append(model_to_dict(item))

    pagination = get_pagination_object(NewsItem.select(), page, limit)

    response.set_header('content-type', 'application/json')
    response_obj = {"data": result, "pagination": pagination}
    return json.dumps(response_obj, cls=MyEncoder)
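# get_pagination_object() is called above but not shown; a minimal sketch of
# what it might return, assuming peewee's query.count() and a simple
# page/limit scheme (the key names below are assumptions, not the actual API).
import math


def get_pagination_object(query, page, limit):
    total = query.count()
    return {
        "page": page,
        "limit": limit,
        "total": total,
        "pages": int(math.ceil(total / float(limit))),
    }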
def get_item(news_id):
    response.set_header("content-type", "application/json")
    try:
        item = NewsItem.select().where(NewsItem.id == news_id).get()
        obj = model_to_dict(item)
        response_obj = {"data": obj}
        return json.dumps(response_obj, cls=MyEncoder)
    except DoesNotExist:
        abort(404)
def post(self):
    # Takes a NewsItem key and adds the logged-in user to its votes,
    # then writes the bumped item's key name back in the response.
    user = users.get_current_user()
    if user:
        key = cgi.escape(self.request.get('key'))
        if key:
            item = NewsItem.get_by_key_name(key)
            if user not in item.bumps:
                item.bumps.append(user)
                item.put()
            self.response.out.write(item.key().name())
    else:
        self.error(401)
def post(self):
    key = self.request.get('key')
    feed = NewsFeed.get_by_key_name(key)
    # FIXME: check if feed was retrieved
    result = urlfetch.fetch(feed.url)
    if result.status_code == 200:
        rssfeed = feedparser.parse(result.content)
        for i in rssfeed.entries:
            item = NewsItem(key_name=i.guid)
            item.url = i.link
            item.title = i.title
            item.text = i.summary
            item.date = datetime.datetime(*i.date_parsed[:6])
            item.orderdate = datetime.datetime(*i.date_parsed[:6])
            item.source = feed
            item.put()
        feed.last_fetch = datetime.datetime.now()
        feed.put()
        # Re-enqueue the fetch task with the feed's key name
        taskqueue.add(queue_name='fetch-news-queue',
                      url='/admin/feeds/fetch/',
                      params={'key': feed.key().name()})
        self.response.out.write('feed pulled')
    else:
        self.error(500)
def get(self):
    url = settings.YAHOO_PIPE % 'rss'
    result = urlfetch.fetch(url)
    if result.status_code == 200:
        feed = feedparser.parse(result.content)
        for i in feed.entries:
            item = NewsItem(key_name=i.guid)
            item.url = i.link
            item.title = i.title
            item.text = i.summary
            item.date = datetime.datetime(*i.date_parsed[:6])
            item.orderdate = datetime.datetime(*i.date_parsed[:6])
            item.put()
        items = db.GqlQuery(
            "SELECT * FROM NewsItem ORDER BY orderdate DESC LIMIT 100")
        context = {'news': items}
        # context = add_user_to_context(context)
        self.response.out.write(
            template.render(tmpl('templates/news2.html'), context))
    else:
        self.response.out.write('err')
CUR_DIR = os.path.dirname(os.path.realpath(__file__))

from models import NewsItem

# Set up the template engine to look in the templates directory
template_loader = jinja2.FileSystemLoader('templates')
template_env = jinja2.Environment(loader=template_loader)

# Load the template file
template_file = "report.tpl.txt"
template = template_env.get_template(template_file)

# Load all the news items from the past two days
seven_days_ago_ts = arrow.utcnow().ceil('hour').replace(days=-2).timestamp
news_items = NewsItem.select().where(NewsItem.published_ts > seven_days_ago_ts)

# Create a dictionary with all the stories grouped by source
sources = {}
for news_item in news_items:
    if news_item.source not in sources:
        sources[news_item.source] = {
            'items': [],
            'name': news_item.source,
        }
    # Add the news item
    sources[news_item.source]['items'].append(news_item)

# Sort the news items for each key
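# The snippet above stops at the sort comment; a minimal sketch of that step,
# assuming each source's stories should be ordered newest-first by the
# published_ts field used elsewhere in this code.
for source in sources:
    sources[source]['items'].sort(
        key=lambda news_item: news_item.published_ts, reverse=True)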
h1_el = soup.find('h1', text='Weather Articles')
tr_els = h1_el.findAllNext('tr')

for tr_el in tr_els:
    # Make sure the URL is absolute
    link = tr_el.a['href'].strip()
    if not link.startswith('http'):
        link = 'http://www.wunderground.com' + link

    # Create a hash from the URL to make a unique identifier
    url_hash = hashlib.md5(link).hexdigest()

    # See if the item already exists
    try:
        item = NewsItem.get(NewsItem.url_hash == url_hash)
        print('Item Exists. Skipping.')
        continue
    except peewee.DoesNotExist:
        print('Creating new item.')
        item = NewsItem()

    summary = tr_el.p.text.strip()
    headline = tr_el.h3.text.strip()

    # Try to get the Open Graph data
    try:
        link_request = requests.get(link)
        links_soup = bs4.BeautifulSoup(link_request.text, 'html.parser')
        meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
        meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
# Set up the template engine to look in the templates directory
template_loader = jinja2.FileSystemLoader('templates')
template_env = jinja2.Environment(loader=template_loader)

# Add filters to the environment to make them visible in the template
template_env.filters['format_date'] = reformat_date
template_env.filters['get_source_url'] = get_source_url

# Load the template file
template_file = "index.tpl.html"
template = template_env.get_template(template_file)

# Load all the news items from the last three days
three_days_ago = arrow.utcnow().to('US/Eastern').replace(
    hours=-72).format('YYYY-MM-DD')
news_items = NewsItem.select().where(
    NewsItem.published_date > three_days_ago,
    NewsItem.hidden == 0)
# order_by() returns a new query, so reassign rather than discard the result
news_items = news_items.order_by(NewsItem.published_ts)

# Render the template
context = {
    'news_items': news_items,
    'updated_eastern': arrow.utcnow().to('US/Eastern'),
}
output = template.render(context)

# Save the output
filepath = os.path.join(CUR_DIR, 'output/sources.html')
with codecs.open(filepath, 'w', 'utf-8') as f:
    f.write(output)
import peewee
import sys

from models import NewsItem

if len(sys.argv) < 2:
    sys.exit('Usage: $ python hide_story.py <story id>')
else:
    id = sys.argv[1]

try:
    id = int(id)
except ValueError:
    sys.exit("Invalid Story ID")

# Try to fetch the item
try:
    item = NewsItem.get(NewsItem.id == id)
except peewee.DoesNotExist:
    sys.exit("Error! News Item with ID %d not found" % id)

print('You are attempting to hide story id %d' % id)
print('Headline: %s' % item.title)

confirm = raw_input("Are you sure? Y/n: ")
if confirm == 'Y':
    item.hidden = True
    item.save()
    'travel',
)

# Skip the story if its collection id is in the skippable list
if pcollid in skippable_collection_ids:
    print('Skipping %s story' % pcollid)
    continue

# If it's also published on Weather Underground, skip it
if 'wunderground' in tags:
    print('Skipping Weather Underground Story')
    continue

# See if the story already exists
try:
    item = NewsItem.get(NewsItem.url_hash == url_hash)
    print('Item Exists. Skipping.')
    continue
except peewee.DoesNotExist:
    print('Creating new item.')
    item = NewsItem()

item.url_hash = url_hash
item.title = unidecode.unidecode(entry['title'].strip())
item.summary = unidecode.unidecode(entry['description'].strip())
item.source = "Weather Channel"
item.link = link
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
request = requests.get(source_url)
soup = bs4.BeautifulSoup(request.text, 'html.parser')
article_els = soup.findAll('article', class_='story_list span3 col')

for article_el in article_els:
    div_el = article_el.find('div', class_='content')
    link = 'http://www.sandiegouniontribune.com' + div_el.a['href']

    # Create a hash from the URL to make a unique identifier
    url_hash = hashlib.md5(link).hexdigest()

    # See if the item already exists
    try:
        item = NewsItem.get(NewsItem.url_hash == url_hash)
        print('Item Exists. Skipping.')
        continue
    except peewee.DoesNotExist:
        print('Creating new item.')
        item = NewsItem()

    date = div_el.find('p', class_='date').text.replace('Updated', '').strip()
    dt = dateutil.parser.parse(date)
    dt = dt.replace(tzinfo=pytz.timezone('US/Pacific'))
    published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
    headline = div_el.a.text.strip()
    published_ts = arrow.get(dt).to('UTC').timestamp
    summary = ''
#!/usr/bin/env python
# To be run every hour

from scrap import UnilagScrap as Scrap
from models import NewsItem
from datetime import datetime

scrapper = Scrap()
news_items = scrapper.get_news_items()

for item in news_items:
    # Before inserting, check that the item does not already exist
    to_db = NewsItem.select().where(NewsItem.slug == item['slug'])
    if len(to_db) == 0:
        # The item has not been in the db before
        to_db = NewsItem(news_hash=item['news_hash'],
                         slug=item['slug'],
                         news_title=item['title'],
                         news_link=item['link'],
                         date_updated=item['date_updated'],
                         intro_text=item['intro_text'],
                         scrapped_at=datetime.now())
        to_db.save()
    else:
        to_db = to_db.get()
        # Check if the hash has changed
        if to_db.news_hash != item['news_hash']:
            to_db.news_hash = item['news_hash']
source_url = 'http://blogs.seattletimes.com/today/category/weather-beat/feed/'
feed = feedparser.parse(source_url)
entries = feed.entries

for entry in entries:
    link = entry.link
    url_hash = hashlib.md5(link).hexdigest()
    date = entry.published_parsed
    published_date = arrow.get(date).to('US/Pacific').date().strftime('%Y-%m-%d')
    published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print('Item exists. Skipping.')
        continue
    except peewee.DoesNotExist:
        print('Creating item.')
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    item.url_hash = url_hash
    item.link = link
    item.source = 'Seattle Times'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    'PM Update',
)

# Skip the story if the headline starts with one of the skippable prefixes
# (e.g. "D.C. area forecast")
prefix_match = False
for prefix in skippable_headline_prefixes:
    if entry.title.startswith(prefix):
        prefix_match = True
if prefix_match:
    print('Skipping story')
    continue

# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print('Item exists. Skipping.')
    continue
except peewee.DoesNotExist:
    print('Creating item.')
    item = NewsItem()

soup = bs4.BeautifulSoup(entry.description, 'html.parser')
item.summary = unidecode.unidecode(soup.text.strip())
item.title = unidecode.unidecode(entry.title)
item.url_hash = url_hash
item.link = link
item.authors = ''
item.source = 'Capital WX Gang'
item.published_date = published_date
headline = li_el.find('h4').text
date = li_el.find('h5').text
description = li_el.find('p').text

# Parse the date
dt = dateutil.parser.parse(date)
dt = dt.replace(tzinfo=pytz.timezone('US/Eastern'))
utc_dt = arrow.get(dt).to('UTC')
published_date = arrow.get(dt).date().strftime('%Y-%m-%d')

# Create a unique identifier from the hash of the URL
url_hash = hashlib.md5(link).hexdigest()

# See if the story already exists
try:
    item = NewsItem.get(NewsItem.url_hash == url_hash)
    print('Item Exists. Skipping.')
    continue
except peewee.DoesNotExist:
    print('Creating new item.')
    item = NewsItem()

# Try to get the Open Graph data
try:
    link_request = requests.get(link)
    links_soup = bs4.BeautifulSoup(link_request.text, 'html.parser')
    meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
    meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
    meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
except Exception as e:
    meta_og_title_el = None