link = entry.link
url_hash = hashlib.md5(link).hexdigest()
date = entry.published_parsed

published_date = arrow.get(date).to('US/Pacific').date().strftime('%Y-%m-%d')
published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp

# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'

item = NewsItem()

headline = entry.title
summary = entry.summary

item.url_hash = url_hash
item.link = link
item.source = 'Seattle Times'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
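For context, the entry objects used in these blocks come from iterating a parsed RSS feed. A minimal sketch of that surrounding loop, assuming feedparser and a hypothetical feed URL (the real feed URLs aren't shown here):

import feedparser

# Hypothetical URL; the actual Seattle Times weather feed URL isn't shown here.
feed = feedparser.parse('https://example.com/weather/feed')

for entry in feed.entries:
    # Each entry exposes the attributes used above:
    # entry.link, entry.title, entry.summary, entry.published_parsed
    print entry.title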
for prefix in skippable_headline_prefixes:
    if entry.title.startswith(prefix):
        prefix_match = True

if prefix_match:
    print 'Skipping story'
    continue

# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'

item = NewsItem()

soup = bs4.BeautifulSoup(entry.description, 'html.parser')

item.summary = unidecode.unidecode(soup.text.strip())
item.title = unidecode.unidecode(entry.title)
item.url_hash = url_hash
item.link = link
item.authors = ''
item.source = 'Capital WX Gang'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# See if any of the skippable ids are in the story ids
if pcollid in skippable_collection_ids:
    print 'Skipping %s story' % pcollid
    continue

# If it's also published on Weather Underground, skip it
if 'wunderground' in tags:
    print 'Skipping Weather Underground story'
    continue

# See if the story already exists
try:
    item = NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating new item.'
    item = NewsItem()

item.url_hash = url_hash
item.title = unidecode.unidecode(entry['title'].strip())
item.summary = unidecode.unidecode(entry['description'].strip())
item.source = 'Weather Channel'
item.link = link
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# Pull the Open Graph metadata from the article page; fall back to None if parsing fails
try:
    meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
    meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
    meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
except Exception, e:
    meta_og_title_el = None
    meta_og_desc_el = None
    meta_og_url_el = None

if meta_og_title_el is not None:
    headline = meta_og_title_el['content'].strip()

if meta_og_desc_el is not None:
    description = meta_og_desc_el['content'].strip()

if meta_og_url_el is not None:
    link = meta_og_url_el['content']

item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = description
item.source = 'Accuweather'
item.published_date = published_date
item.published_ts = utc_dt.timestamp
item.inserted_ts = arrow.utcnow().timestamp
item.save()

# Sleep between requests
time.sleep(1)
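The links_soup object used above comes from fetching each linked Accuweather article and parsing the HTML. A minimal sketch of that step, assuming requests is used for the fetch (the actual fetch code isn't shown here):

import bs4
import requests

# Fetch the article page and parse it so the og: meta tags can be read
resp = requests.get(link, timeout=10)
links_soup = bs4.BeautifulSoup(resp.text, 'html.parser')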
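All of these snippets write to the same peewee NewsItem model. Its definition isn't shown here; a rough sketch, inferred from the fields used above (field types and the SQLite backend are assumptions):

import peewee

db = peewee.SqliteDatabase('news.db')  # assumed backend; the actual database isn't shown

class NewsItem(peewee.Model):
    url_hash = peewee.CharField(unique=True)  # MD5 of the story link, used for dedup
    link = peewee.CharField()
    source = peewee.CharField()
    title = peewee.CharField()
    summary = peewee.TextField()
    authors = peewee.CharField(default='')
    published_date = peewee.CharField()   # 'YYYY-MM-DD' string
    published_ts = peewee.IntegerField()  # UTC epoch seconds
    inserted_ts = peewee.IntegerField()

    class Meta:
        database = db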