from creds import db from newspaper import Article from contextlib import closing with closing(db.cursor()) as cur: cur.execute( """ select url, id,headline from database.table where url != '' and url is not null and scraped_content is null order by id """ ) rows = cur.fetchall() for article in rows: url = article[0] lang = "en" int_id = article[1] headline = article[2] print str(int_id) try: print "getting article..." article_get = Article(url=url, language=lang) print "downloading..." article_get.download() print "parsing..." article_get.parse() print "setting scraped content..." scraped_content = article_get.text print "importing data..." cur.execute( """
end_date=date, metrics= 'ga:pageviews,ga:sessionDuration,ga:pageviewsPerSession,ga:bounces,ga:sessions', dimensions= 'ga:channelGrouping', # sort='-ga:visits', #filters='ga:medium==organic', #start_index='1', max_results='25').execute() json_str = json.dumps(response) json_dict = json.loads(json_str) data = json_dict['rows'] for record in data: channel = record[0] page_views = record[1] avg_session_dur = record[2] views_per_session = record[3] bounces = record[4] sessions = record[5] with closing(db.cursor()) as cur: cur.execute( """ insert ignore into database.table (date, channel,type, page_views, bounces, avg_session_dur, views_per_session,sessions) values (%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update page_views = %s, bounces = %s, avg_session_dur = %s, views_per_session = %s, sessions = %s """, (date, channel, type, page_views, bounces, avg_session_dur, views_per_session, sessions, page_views, bounces, avg_session_dur, views_per_session, sessions)) db.commit()
def insert_time(self):
    """Insert this post into ``gnip.twitter`` as one row and commit.

    Thirty-six attributes of ``self`` are bound, in order, to the column
    list of a parameterized ``insert ignore`` statement executed on a
    cursor obtained from the module-level ``db`` connection.

    This is a best-effort write: a ``MySQLdb.Error`` is logged, the call
    sleeps for 60 seconds, and then returns normally.  The error is not
    re-raised, so the caller simply proceeds with its next record.

    Returns:
        None.
    """
    try:
        # closing() guarantees cursor.close() even when execute() raises.
        with closing(db.cursor()) as cur:
            cur.execute(
                """
                insert ignore into gnip.twitter
                (post_id, author_name, author_username, author_link,
                 author_created, author_profile_image, author_personal_url,
                 author_followers, author_following, author_lists_count,
                 author_statuses_count, author_time_zone, author_verified,
                 author_languages, author_favorites_count, post_type,
                 post_date, post_source, post_link, post_content,
                 post_favorites_count, post_hashtags, post_trends, post_urls,
                 post_mentions, post_symbols, media_urls, post_retweet_count,
                 post_language, post_tags, post_clients, author_location,
                 author_bio, post_channel, post_lat, post_long
                )
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,
                        %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,
                        %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """,
                (
                    self.post_id,
                    self.author_name,
                    self.author_username,
                    self.author_link,
                    self.author_created,
                    self.author_profile_image,
                    self.author_personal_url,
                    self.author_followers,
                    self.author_following,
                    self.author_lists_count,
                    self.author_statuses_count,
                    self.author_time_zone,
                    self.author_verified,
                    self.author_languages,
                    self.author_favorites_count,
                    self.post_type,
                    self.post_date,
                    self.post_source,
                    self.post_link,
                    self.post_content,
                    self.post_favorites_count,
                    self.post_hashtags,
                    self.post_trends,
                    self.post_urls,
                    self.post_mentions,
                    self.post_symbols,
                    self.media_urls,
                    self.post_retweet_count,
                    self.post_language,
                    # The remaining names differ between column and attribute:
                    self.topic_tags,    # -> post_tags
                    self.client_tags,   # -> post_clients
                    self.author_location,
                    self.author_bio,
                    self.post_action,   # -> post_channel
                    self.lat,           # -> post_lat
                    self.long,          # -> post_long
                ),
            )
        db.commit()
    except MySQLdb.Error as e:
        # Lazy %-style arguments: the previous call concatenated a str with
        # the exception object, which raised TypeError inside this handler
        # and hid the real database error.
        logging.error('%s SQL Error: %s', datetime.datetime.now(), e)
        # Back off before returning.  The former trailing ``continue`` was a
        # SyntaxError (there is no enclosing loop inside this method);
        # returning here gives the caller the same "move on" behaviour.
        time.sleep(60)
def insert_time(self):
    """Write this post to the ``gnip.twitter`` table and commit.

    Builds one parameterized ``insert ignore`` statement whose 36
    placeholders are filled from attributes of ``self`` and runs it on a
    cursor taken from the module-level ``db`` connection.

    Failures are handled on a best-effort basis: when ``MySQLdb.Error`` is
    raised the error is logged, the method waits 60 seconds, and control
    returns to the caller without re-raising.

    Returns:
        None.
    """
    sql = """
        insert ignore into gnip.twitter
        (post_id, author_name, author_username, author_link,
         author_created, author_profile_image, author_personal_url,
         author_followers, author_following, author_lists_count,
         author_statuses_count, author_time_zone, author_verified,
         author_languages, author_favorites_count, post_type, post_date,
         post_source, post_link, post_content, post_favorites_count,
         post_hashtags, post_trends, post_urls, post_mentions,
         post_symbols, media_urls, post_retweet_count, post_language,
         post_tags, post_clients, author_location, author_bio,
         post_channel, post_lat, post_long
        )
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,
                %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,
                %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
    # Order must match the column list above exactly.
    params = (
        self.post_id,
        self.author_name,
        self.author_username,
        self.author_link,
        self.author_created,
        self.author_profile_image,
        self.author_personal_url,
        self.author_followers,
        self.author_following,
        self.author_lists_count,
        self.author_statuses_count,
        self.author_time_zone,
        self.author_verified,
        self.author_languages,
        self.author_favorites_count,
        self.post_type,
        self.post_date,
        self.post_source,
        self.post_link,
        self.post_content,
        self.post_favorites_count,
        self.post_hashtags,
        self.post_trends,
        self.post_urls,
        self.post_mentions,
        self.post_symbols,
        self.media_urls,
        self.post_retweet_count,
        self.post_language,
        self.topic_tags,       # stored in column post_tags
        self.client_tags,      # stored in column post_clients
        self.author_location,
        self.author_bio,
        self.post_action,      # stored in column post_channel
        self.lat,              # stored in column post_lat
        self.long,             # stored in column post_long
    )

    try:
        # closing() releases the cursor whether or not execute() succeeds.
        with closing(db.cursor()) as cur:
            cur.execute(sql, params)
        db.commit()
    except MySQLdb.Error as e:
        # The message is passed as a format string with lazy arguments.
        # The earlier form did ``'SQL Error: ' + e``, which raises TypeError
        # because an exception instance is not a str.
        logging.error('%s SQL Error: %s', datetime.datetime.now(), e)
        # Pause before handing control back.  A ``continue`` used to follow
        # this line, but with no enclosing loop it was a SyntaxError that
        # stopped the module from compiling at all.
        time.sleep(60)