def _extract_result(result, keywords, domain=u"google_search"):
    """Build a Post from a single Google custom-search result dict.

    The GUID seed combines the result URL with the search keywords, plus
    Google's cacheId when one is present.
    """
    post = Post()
    post.title = result['title']
    post.url = result['link']
    post.domain = domain
    post.content = result['snippet']
    post.post_type = result['kind']
    if 'cacheId' in result:
        guid_seed = u'{}_{}_{}'.format(post.url, keywords, result['cacheId'])
    else:
        guid_seed = u'{}_{}'.format(post.url, keywords)
    post.guid = compute_author_guid_by_author_name(guid_seed)
    post.post_id = post.guid
    return post
def save_to_db(self):
    """Persist buffered posts, claim connections and authors, then reset the buffers.

    Reddit author records are fetched here with a bounded retry; a server
    overload (HTTP 503) triggers a recursive flush before retrying.
    Deleted redditors get a placeholder Author row named "deleted".
    """
    self._db.addPosts(self._posts)
    self._db.add_claim_connections(self._claim_tweet_connections)
    self._db.add_claim_connections(self._post_comment_connections)
    authors = []
    reddit_authors = []
    for i, redditor in enumerate(set(self._redditors)):
        for attempt in xrange(self._number_of_attempts):
            try:
                self._retrive_reddit_author(authors, i, reddit_authors, redditor)
                print("\rretrive reddit author {0}/{1}".format(i, len(self._redditors)), end='')
                # Stop retrying on success; without this break every author
                # was fetched self._number_of_attempts times.
                break
            except prawcore.exceptions.ServerError as e:
                # NOTE(review): the recursive call below clears the buffers
                # this method is still iterating over -- confirm intended.
                print('Server overload code 503, save to DB and sleep 30 sec and try again')
                self.save_to_db()
                time.sleep(5)  # 30)
            except Exception as e:
                print('\r retrive reddit author {0}/{1} exception: {2}'.format(i, len(self._redditors), e.message), end='')
    print()
    for i, redditor in enumerate(set(self._deleted_redditors)):
        # Placeholder row for comments whose account no longer exists.
        author = Author()
        author.name = "deleted"
        author.author_guid = compute_author_guid_by_author_name(redditor)
        author.domain = u'reddit'
        author.author_type = u'deleted'
        authors.append(author)
    self._db.add_authors_fast(authors)
    self._db.add_reddit_authors(reddit_authors)
    self._posts = []
    self._claim_tweet_connections = []
    self._redditors = []
    self._deleted_redditors = []
    self._post_comment_connections = []
def _convert_row_to_post(self, row):
    """Convert a claim row into a Post authored by this crawler's account."""
    post = Post()
    claim_id = unicode(row['claim_id'])
    post.content = unicode(row['title'], errors='replace')
    post.description = unicode(row['description'], errors='replace')
    post.url = unicode(row['url'])
    publication_date = row['publication_date']
    # NOTE(review): `date` here is a helper from this module's scope, not
    # datetime.date -- confirm it parses/normalizes the raw value.
    post.date = date(publication_date)
    guid = compute_post_guid(self._social_network_url, claim_id, publication_date)
    post.guid = guid
    post.post_id = guid
    post.post_osn_guid = guid
    post.domain = self._domain
    post.author = self._author_name
    post.author_guid = compute_author_guid_by_author_name(self._author_name)
    post.tags = unicode(row['keywords'])
    post.post_type = unicode(row['post_type'])
    return post
def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
    """Create a reddit Post plus its RedditPost companion and store both.

    upvote_ratio == -1 marks a comment (no vote breakdown available);
    any other value marks a submission, and ups/downs are derived from
    the score and the ratio.
    """
    post = Post()
    post.post_osn_id = post_osn_id
    post.author = str(author)
    post.author_guid = compute_author_guid_by_author_name(post.author)
    post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
    post.url = 'https://www.reddit.com{}'.format(post.author)  # just for test
    post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at))
    post.domain = 'reddit_comment'
    post.post_type = 'reddit_comment'
    post.post_id = post.guid

    reddit_post = RedditPost()
    reddit_post.post_id = post.post_id
    reddit_post.guid = post.guid
    reddit_post.score = score

    if upvote_ratio == -1:
        # Comment: reddit exposes no up/down breakdown.
        reddit_post.ups = -1
        reddit_post.downs = -1
        reddit_post.upvote_ratio = -1
    else:
        post.domain = 'reddit_post'
        post.post_type = 'reddit_post'
        reddit_post.upvote_ratio = upvote_ratio
        # Invert score = ups - downs with ratio = ups / (ups + downs);
        # ratio == 0.5 would divide by zero, so split the score evenly.
        if reddit_post.upvote_ratio != 0.5:
            ups = round((reddit_post.upvote_ratio * reddit_post.score) /
                        (2 * reddit_post.upvote_ratio - 1))
        else:
            ups = round(reddit_post.score / 2)
        reddit_post.ups = int(ups)
        reddit_post.downs = reddit_post.ups - reddit_post.score

    self._db.addPosts([post, reddit_post])
    return post, reddit_post
def _add_author(self, name=None, link_karma=None, comment_karma=None,
                is_employee=0, is_mod=0, is_gold=0, author_osn_id=None):
    """Store a generic Author row and its reddit-specific companion row."""
    author = Author()
    author.name = name
    author.author_screen_name = name
    author.author_guid = compute_author_guid_by_author_name(name)
    author.domain = 'reddit'
    author.author_osn_id = author_osn_id
    author.author_full_name = name
    author.url = 'https://www.reddit.com/user/' + name

    reddit_author = RedditAuthor()
    reddit_author.name = name
    reddit_author.author_guid = author.author_guid
    reddit_author.comments_count = None
    reddit_author.comment_karma = comment_karma
    reddit_author.link_karma = link_karma
    reddit_author.is_gold = is_gold
    reddit_author.is_moderator = is_mod
    reddit_author.is_employee = is_employee

    self._db.add_authors([author])
    self._db.add_reddit_authors([reddit_author])
def _convert_tweet_to_post(self, tweet, post_type):
    """Map a scraped tweet object onto a Post with the given post_type."""
    post = Post()
    post.post_osn_id = str(tweet.id)
    creation_date = tweet.date
    created_at_str = str(date_to_str(creation_date))
    post.created_at = created_at_str
    post.date = creation_date
    post.favorite_count = tweet.favorites
    post.retweet_count = tweet.retweets
    post.content = str(tweet.text)
    screen_name = str(tweet.username)
    post.author = screen_name
    post.author_guid = compute_author_guid_by_author_name(screen_name)
    permalink = tweet.permalink
    post.url = str(permalink)
    guid = compute_post_guid(permalink, screen_name, created_at_str)
    post.guid = guid
    post.post_id = guid
    post.domain = self._domain
    post.post_type = post_type
    return post
def _convert_row_to_post(self, row):
    """Convert a comment row [site, social_id, username_hash, comment_time,
    comment_tokens] into a buffered Post plus its claim connection."""
    print("\rInsert post to DataFrame {0}/{1}".format(
        self._current_row, len(self.posts_csv_df)), end="")
    self._current_row += 1

    comment_date = datetime.datetime.fromtimestamp(row['comment_time'])
    claim_id = compute_author_guid_by_author_name(str(row['social_id']))

    post = Post()
    post.post_id = str(compute_post_guid(row['site'] + str(claim_id),
                                         row['username_hash'],
                                         date_to_str(comment_date)))
    post.content = str(row['comment_tokens'])
    post.author = str(row['username_hash'])
    post.author_guid = str(row['username_hash'])
    post.domain = str(row['site'])
    post.date = comment_date
    self._posts.append(post)

    connection = Claim_Tweet_Connection()
    connection.claim_id = str(claim_id)
    connection.post_id = str(post.post_id)
    self._claim_tweet_connections.append(connection)

    # Flush periodically so memory stays bounded on large inputs.
    if self._current_row % self._max_posts_without_save == 0:
        self._save_posts_and_connections()
def _convert_reddit_author_to_author(self, redditor):
    """Convert a praw Redditor into a generic Author row.

    Attributes that may be missing on a partial/suspended redditor fall
    back to empty defaults instead of raising.
    """
    author = Author()
    author.name = getattr(redditor, 'name', '')
    author.author_screen_name = author.name
    author.author_guid = compute_author_guid_by_author_name(author.name)
    author.domain = u'reddit'
    author.created_at = datetime.fromtimestamp(getattr(redditor, 'created_utc', 0))
    author.author_osn_id = getattr(redditor, 'id', '')
    author.author_full_name = getattr(redditor, 'fullname', '')
    # Use the already-defaulted name: the original accessed redditor.name
    # directly, which would raise AttributeError in exactly the case the
    # getattr default above was guarding against.
    author.url = u'https://www.reddit.com/user/' + author.name
    return author
def _json_user_to_db_author_converter(self, user, domain='Instagram_author'):
    """Convert an Instagram user JSON dict into an Author row."""
    author = Author()
    author.name = user['username']
    author.author_screen_name = author.name
    author.author_guid = compute_author_guid_by_author_name(author.name)
    author.domain = domain
    author.author_type = domain
    author.author_osn_id = user['id']
    author.author_full_name = user['full_name']
    # get() instead of setdefault(): the original inserted a 'biography'
    # key into the caller's dict as a side effect of reading it.
    author.description = user.get('biography', None)
    author.url = 'https://www.instagram.com/' + author.author_screen_name
    author.profile_image_url = user['profile_pic_url']
    return author
def _json_comment_to_db_comment_converter(self, post, domain="Instagram_comment"):
    """Convert an Instagram comment JSON dict into a Post row."""
    comment = Post()
    comment.post_osn_id = str(post['id'])
    comment.created_at = datetime.fromtimestamp(post['created_at'])
    comment.author = post['owner']['id']
    comment.author_guid = compute_author_guid_by_author_name(comment.author)
    comment.url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
    comment.content = post['text']
    comment.guid = compute_post_guid(comment.url, comment.post_osn_id,
                                     date_to_str(comment.created_at))
    comment.domain = domain
    comment.post_type = domain
    comment.post_id = comment.guid
    return comment
def _json_post_to_db_post_converter(self, post, domain="Instagram_post"):
    """Convert an Instagram media JSON dict into a Post row.

    The content is the comma-joined text of every caption edge.
    """
    media = Post()
    media.post_osn_id = str(post['id'])
    media.created_at = datetime.fromtimestamp(post['taken_at_timestamp'])
    media.author = post['owner']['id']
    media.author_guid = compute_author_guid_by_author_name(media.author)
    media.url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
    captions = [edge['node']['text'] for edge in post['edge_media_to_caption']['edges']]
    media.content = ', '.join(captions)
    media.guid = compute_post_guid(media.url, media.post_osn_id,
                                   date_to_str(media.created_at))
    media.domain = domain
    media.post_type = domain
    media.post_id = media.guid
    return media
def _convert_row_to_claim(self, claim_row):
    """Build a Claim from a row and buffer it, together with two derived
    author features (claim type and claim id)."""
    claim = Claim()
    claim.claim_id = compute_author_guid_by_author_name(str(claim_row['social_id']))
    claim.domain = str(claim_row['site'])
    claim.verdict = str(claim_row['ruling_val'])
    claim.verdict_date = datetime.datetime.fromtimestamp(claim_row['ruling_time'])
    self._claims.append(claim)

    # Emit one author feature per (suffix, value) pair.
    for suffix, value in (("_claim_type", claim.verdict),
                          ("_claim_id", claim.claim_id)):
        feature = BaseFeatureGenerator.create_author_feature(
            self.__class__.__name__ + suffix, claim.claim_id, value,
            self._window_start, self._window_end)
        self._author_features.append(feature)
def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
    """Convert a praw comment (belonging to submission) into a Post.

    Side effects: appends the comment's redditor to self._redditors, or
    appends the comment date string to self._deleted_redditors when the
    account was deleted.
    """
    post = Post()
    post.post_osn_id = unicode(comment.id)
    post.created_at = datetime.fromtimestamp(comment.created)
    post.date = datetime.fromtimestamp(comment.created)
    if hasattr(comment, 'author') and comment.author:
        post.author = unicode(comment.author.name)
        self._redditors.append(comment.author)
    else:
        # Deleted account: record the comment date string as a placeholder.
        self._deleted_redditors.append(str(post.date))
        post.author = unicode('')
    post.author_guid = compute_author_guid_by_author_name(post.author)
    # Truncated URL (permalink path segments 3..6 only) -- this value is the
    # GUID seed below before the URL is overwritten with the full permalink.
    post.url = unicode('https://www.reddit.com' + '/'.join(getattr(comment, 'permalink', '').split('/')[3:7]))
    post.title = unicode(submission.title)
    # NOTE(review): encode('utf-8') then unicode(...) can raise for
    # non-ASCII bodies on Python 2 -- confirm inputs are ASCII-safe.
    post.content = unicode(getattr(comment, 'body', '').encode('utf-8').strip())
    post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at))
    post.domain = domain
    post.post_type = domain
    post.post_id = post.guid
    # NOTE(review): overwrites the URL the GUID was derived from -- confirm
    # the stored URL is intentionally different from the GUID seed.
    post.url = u'https://www.reddit.com{}'.format(comment.permalink)
    return post
def _convert_tweet_dict_to_post(self, tweet_dict):
    """Convert a Twitter API tweet dict into a Post, resolving the author
    through self._author_osn_id_author_dict."""
    post = Post()
    osn_id = tweet_dict['id_str']
    post.post_osn_id = osn_id

    author = self._author_osn_id_author_dict[tweet_dict['author_osn_id']]
    screen_name = author.author_screen_name
    post.author = screen_name
    post.author_guid = compute_author_guid_by_author_name(screen_name)

    raw_created_at = tweet_dict['created_at']
    post.created_at = raw_created_at
    creation_date_str = extract_tweet_publiction_date(raw_created_at)
    post.date = str_to_date(creation_date_str)

    post.favorite_count = tweet_dict['favorite_count']
    post.retweet_count = tweet_dict['retweet_count']
    post.reply_count = tweet_dict['reply_count']
    post.content = str(tweet_dict['full_text'])
    post.domain = self._domain
    post.language = str(tweet_dict['lang'])

    url = "https://twitter.com/{0}/status/{1}".format(screen_name, osn_id)
    post.url = url
    guid = compute_post_guid(url, screen_name, creation_date_str)
    post.guid = guid
    post.post_id = guid
    return post
def execute(self, window_start):
    """Read author screen names from the CSV at self._path_to_file and
    insert an Author row for each. `window_start` is unused here."""
    screen_names = pd.read_csv(self._path_to_file)['author_screen_name'].tolist()
    total = len(screen_names)
    authors = []
    for index, screen_name in enumerate(screen_names):
        print("\rCreate author: [{0}/{1}]".format(index, total), end="")
        author = Author()
        author.author_screen_name = screen_name
        author.name = screen_name
        author.author_guid = compute_author_guid_by_author_name(screen_name)
        author.domain = self._domain
        authors.append(author)
    # NOTE(review): authors are persisted via addPosts, while sibling code
    # uses add_authors -- confirm addPosts accepts arbitrary record types.
    self._db.addPosts(authors)
def _parse_articles_lst_to_articles(self, all_articles_lst_of_dics):
    """Parse NewsAPI article dicts into parallel lists of Post, Claim,
    News_Article and News_Article_Item records, all keyed by one shared GUID.

    Returns (posts_lst, claims_lst, articles_lst, article_items_lst).
    """
    print("###### 'Entering _parse_articles_lst_to_articles'")
    parsed_articles_lst = []
    claims_lst = []
    posts_lst = []
    articles_lst = []
    article_items_lst = []
    # Parsing articles list of dictionaries data, received using the API.
    for news_articles_dic in all_articles_lst_of_dics:
        print("###### 'PARSING: {}'".format(str(news_articles_dic)))
        parsed_articles_lst += self._parse_news_article(news_articles_dic)
    # For each news article dictionary commit:
    for parsed_news_article in parsed_articles_lst:
        print("###### 'Iterating parsed_articles_lst single item: {}'".
              format(str(parsed_news_article)))
        # Building: claim & News_Article & News_Article_Item objects.
        claim = Claim()
        post = Post()
        article = News_Article()
        article_item = News_Article_Item()
        # Initializing Claim object with data: a single GUID derived from
        # (url, author, publishedAt) ties all four records together.
        identifier = compute_post_guid(parsed_news_article['url'],
                                       parsed_news_article['author'],
                                       parsed_news_article['publishedAt'])
        claim.claim_id = post.post_id = post.guid = post.post_osn_guid = article.post_id = article_item.post_id = unicode(
            identifier)
        author_guid = compute_author_guid_by_author_name(
            parsed_news_article['author'])
        post.author_guid = article.author_guid = article_item.author_guid = unicode(
            author_guid)
        post.author = article.author = unicode(
            parsed_news_article['author'])
        post.title = claim.title = article.title = unicode(
            parsed_news_article['title'])
        post.content = article_item.content = unicode(
            parsed_news_article['content'])
        post.description = claim.description = article.description = unicode(
            parsed_news_article['description'])
        post.date = post.created_at = claim.verdict_date = article.published_date = datetime.datetime.strptime(
            parsed_news_article['publishedAt'], '%Y-%m-%d %H:%M:%S')
        article_item.source_newsapi_internal_id = unicode(
            parsed_news_article['source_id'])
        article_item.source_newsapi_internal_name = unicode(
            parsed_news_article['source_name'])
        post.url = claim.url = article.url = unicode(
            parsed_news_article['url'])
        article_item.img_url = unicode(parsed_news_article['urlToImage'])
        post.post_type = claim.verdict = unicode(
            "TRUE"
        )  # todo: Add constant. We assume all news articles are true.
        post.domain = claim.domain = unicode(
            "NewsSite")  # todo: Add constant.
        # Update objects lists:
        posts_lst.append(post)
        claims_lst.append(claim)
        articles_lst.append(article)
        article_items_lst.append(article_item)
    print("###### 'EXITING _parse_articles_lst_to_articles'")
    return posts_lst, claims_lst, articles_lst, article_items_lst