def _add_post(self, title, content, author_guid):
    """Create a u'test'-domain post owned by *author_guid*, persist it and track it."""
    new_post = Post()
    new_post.title = title
    new_post.content = content
    # Both author fields carry the same guid in these fixtures.
    new_post.author = author_guid
    new_post.author_guid = author_guid
    new_post.domain = u'test'
    # Ids are sequential: the next id is simply the current fixture count.
    next_id = len(self._posts)
    new_post.post_id = next_id
    new_post.guid = next_id
    new_post.date = date('2020-01-01 23:59:59')
    self._db.addPost(new_post)
    self._db.session.commit()
    self._posts.append(new_post)
def add_review_to_restorunt(self, review, api_id, json_id):
    """Map a restaurant *review* dict onto a Post (returned, not persisted)."""
    restaurant_post = Post()
    restaurant_post.author_guid = api_id
    restaurant_post.author = json_id
    restaurant_post.domain = 'Restaurant'
    # Copy the raw review fields onto the generic post schema.
    restaurant_post.content = review['text']
    restaurant_post.created_at = review['date']
    restaurant_post.favorite_count = review['useful']
    restaurant_post.post_id = review['review_id']
    return restaurant_post
def _convert_row_to_post(self, row):
    """Convert one CSV row into a Post plus a Claim_Tweet_Connection.

    Expected row fields: [site, social_id, username_hash, comment_time,
    comment_tokens].  Appends to self._posts / self._claim_tweet_connections
    and flushes both to the DB every self._max_posts_without_save rows.
    """
    # Progress indicator; '\r' overwrites the same console line each call.
    print("\rInsert post to DataFrame {0}/{1}".format(
        self._current_row, len(self.posts_csv_df)), end="")
    self._current_row += 1
    date = datetime.datetime.fromtimestamp(row['comment_time'])
    post = Post()
    # The claim id is derived from the social id, not from the commenting user.
    claim_id = compute_author_guid_by_author_name(str(row['social_id']))
    post.post_id = str(
        compute_post_guid(row['site'] + str(claim_id), row['username_hash'],
                          date_to_str(date)))
    post.content = str(row['comment_tokens'])
    # The hashed username doubles as author name and author guid.
    post.author = str(row['username_hash'])
    post.author_guid = str(row['username_hash'])
    post.domain = str(row['site'])
    post.date = date
    self._posts.append(post)
    # Link the new post back to its claim.
    claim_tweet_connection = Claim_Tweet_Connection()
    claim_tweet_connection.claim_id = str(claim_id)
    claim_tweet_connection.post_id = str(post.post_id)
    self._claim_tweet_connections.append(claim_tweet_connection)
    # Periodic flush keeps memory bounded on large CSV files.
    if self._current_row % self._max_posts_without_save == 0:
        self._save_posts_and_connections()
def _add_post(self, title, content, _domain=u'Microblog'):
    """Add a post authored by self._author; *title* doubles as post_id and guid."""
    created = Post()
    created.title = title
    created.content = content
    created.author = self._author.author_full_name
    created.author_guid = self._author.author_guid
    created.domain = _domain
    # The title is reused as both identifiers in these fixtures.
    created.post_id = title
    created.guid = title
    self._db.addPost(created)
    self._posts.append(created)
def _add_post(self, title, content):
    """Persist a u'test'-domain post for self._author and remember it locally."""
    entry = Post()
    entry.author = self._author.author_full_name
    entry.author_guid = self._author.author_guid
    entry.title = title
    entry.content = content
    entry.domain = u'test'
    # Sequential ids taken from the number of posts created so far.
    entry.post_id = len(self._posts)
    entry.guid = entry.post_id
    self._db.addPost(entry)
    self._posts.append(entry)
def _extract_result(result, keywords, domain=u"google_search"):
    """Map a single Google custom-search *result* dict onto a Post."""
    search_post = Post()
    search_post.title = result['title']
    search_post.url = result['link']
    search_post.content = result['snippet']
    search_post.post_type = result['kind']
    search_post.domain = domain
    # The guid is derived from url + keywords; the cacheId is folded in when
    # Google supplies one so cached and uncached hits stay distinct.
    if 'cacheId' in result:
        seed = u'{}_{}_{}'.format(search_post.url, keywords, result['cacheId'])
    else:
        seed = u'{}_{}'.format(search_post.url, keywords)
    search_post.guid = compute_author_guid_by_author_name(seed)
    search_post.post_id = search_post.guid
    return search_post
def add_graph_to_db(cls, graph):
    """Persist a graph as one 'labels' Post, its nodes as Authors and its
    edges as AuthorConnections.

    The graph's 'name' attribute is reused as the post id, the connection
    type and the authors' domain.
    """
    post = Post(post_id=str(graph.graph['name']), domain='flickr', post_osn_id=str(graph.graph['name']))
    post.post_type = 'labels'
    # One AuthorConnection per edge, typed by the graph name.
    author_connections = []
    for edge in graph.edges():
        author_connections.append(
            AuthorConnection(source_author_guid=edge[0],
                             destination_author_guid=edge[1],
                             connection_type=graph.graph['name']))
    # One Author per node; the node label doubles as name and guid.
    authors = []
    for node in graph.nodes():
        authors.append(
            Author(name=str(node), domain=str(graph.graph['name']),
                   author_guid=str(node)))
    cls._db.addPosts([post])
    # NOTE(review): addPosts is also used to insert connections and authors --
    # presumably it is a generic bulk-insert; confirm it is not Post-specific.
    cls._db.addPosts(author_connections)
    cls._db.addPosts(authors)
def fill_tweet_retweet_connection(self):
    '''
    Fetches the original tweets being retweeted by our posts.
    Updates the following tables:
    * Post_Citations table with tweet-retweet connection
    * Posts table with missing tweets
    * Authors with the authors of the missing tweets
    '''
    # guid -> url mapping of retweets whose source tweet is still unknown.
    retweets_with_no_tweet_citation = self._db.get_retweets_with_no_tweet_citation()
    logging.info("Updating tweet-retweet connection of {0} retweets".format(len(retweets_with_no_tweet_citation)))
    self._posts = []
    self._authors = []
    self._post_citatsions = []
    i = 1
    for post_guid, post_url in retweets_with_no_tweet_citation.iteritems():
        msg = "\r Analyzing retweet: {0} - {1} [{2}".format(post_guid, post_url, i) + "/" + str(
            len(retweets_with_no_tweet_citation)) + '] '
        print(msg, end="")
        i += 1
        # Fetch the original tweet behind this retweet; None when unavailable.
        tweet_data = self.extract_retweet_data(retweet_guid=post_guid, retweet_url=post_url)
        if tweet_data is not None:
            # Insert the source tweet only if it is not already stored.
            if not self._db.isPostExist(tweet_data.tweet_url):
                post = Post(guid=tweet_data.tweet_guid,
                            post_id=tweet_data.tweet_guid,
                            url=tweet_data.tweet_url,
                            date=str_to_date(tweet_data.tweet_date),
                            title=tweet_data.tweet_content,
                            content=tweet_data.tweet_content,
                            post_osn_id=tweet_data.tweet_twitter_id,
                            retweet_count=tweet_data.tweet_retweet_count,
                            favorite_count=tweet_data.tweet_favorite_count,
                            author=tweet_data.tweet_author_name,
                            author_guid=tweet_data.tweet_author_guid,
                            domain=self._domain,
                            original_tweet_importer_insertion_date=unicode(get_current_time_as_string()))
                self._posts.append(post)
            # Likewise for the source tweet's author.
            if not self._db.is_author_exists(tweet_data.tweet_author_guid, self._domain):
                author = Author(name=tweet_data.tweet_author_name,
                                domain=self._domain,
                                author_guid=tweet_data.tweet_author_guid,
                                original_tweet_importer_insertion_date=unicode(get_current_time_as_string()))
                self._authors.append(author)
            # And finally the retweet -> original-tweet citation edge.
            if not self._db.is_post_citation_exist(tweet_data.retweet_guid, tweet_data.tweet_guid):
                post_citation = Post_citation(post_id_from=tweet_data.retweet_guid,
                                              post_id_to=tweet_data.tweet_guid,
                                              url_from=tweet_data.retweet_url,
                                              url_to=tweet_data.tweet_url)
                self._post_citatsions.append(post_citation)
    # Flush everything collected in this pass in a single batch.
    self.update_tables_with_tweet_retweet_data(self._posts, self._authors, self._post_citatsions)
def _create_post_citation_by_row(self, reblogged_from_metadata):
    """Build a Post for the reblogged-from (parent) post described by the metadata."""
    parent_id = reblogged_from_metadata["parent_post_id"]
    parent_blog = reblogged_from_metadata["parent_post_blog_id"]
    parent = Post()
    parent.post_osn_id = parent_id
    parent.post_id = parent_id
    parent.author = parent_blog
    parent.author_guid = compute_author_guid_by_author_name(parent_blog)
    parent.domain = self._domain
    # Resolve and attach the post url from the (possibly shortened) source url.
    short_url = self._convert_to_unicode_value(
        reblogged_from_metadata["parent_post_short_url"])
    self._set_post_url(short_url, parent_blog, parent)
    # Cache the parent post once; later rows referencing it keep the original.
    self._post_dict.setdefault(parent_id, parent)
    return parent
def _convert_tweet_dict_to_post(self, tweet_dict):
    """Translate a raw Twitter API tweet dict into a Post row."""
    tweet = Post()
    osn_id = tweet_dict['id_str']
    tweet.post_osn_id = osn_id
    # Resolve the author through the cached osn-id -> Author mapping.
    resolved_author = self._author_osn_id_author_dict[tweet_dict['author_osn_id']]
    screen_name = resolved_author.author_screen_name
    tweet.author = screen_name
    tweet.author_guid = compute_author_guid_by_author_name(screen_name)
    raw_created_at = tweet_dict['created_at']
    tweet.created_at = raw_created_at
    publication_date_str = extract_tweet_publiction_date(raw_created_at)
    tweet.date = str_to_date(publication_date_str)
    tweet.favorite_count = tweet_dict['favorite_count']
    tweet.retweet_count = tweet_dict['retweet_count']
    tweet.reply_count = tweet_dict['reply_count']
    tweet.content = str(tweet_dict['full_text'])
    tweet.language = str(tweet_dict['lang'])
    tweet.domain = self._domain
    tweet.url = "https://twitter.com/{0}/status/{1}".format(screen_name, osn_id)
    # guid/post_id are deterministic over (url, author, publication date).
    deterministic_guid = compute_post_guid(tweet.url, screen_name,
                                           publication_date_str)
    tweet.guid = deterministic_guid
    tweet.post_id = deterministic_guid
    return tweet
def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
    """Convert a Reddit *comment* (under *submission*) into a Post.

    Live authors are collected in self._redditors; deleted authors are
    recorded in self._deleted_redditors and stored with an empty name.
    """
    post = Post()
    post.post_osn_id = unicode(comment.id)
    post.created_at = datetime.fromtimestamp(comment.created)
    post.date = datetime.fromtimestamp(comment.created)
    if hasattr(comment, 'author') and comment.author:
        post.author = unicode(comment.author.name)
        self._redditors.append(comment.author)
    else:
        # Author was deleted; remember when the orphaned comment was made.
        self._deleted_redditors.append(str(post.date))
        post.author = unicode('')
    post.author_guid = compute_author_guid_by_author_name(post.author)
    # Truncated permalink (submission path only) used for the guid below.
    post.url = unicode('https://www.reddit.com' + '/'.join(getattr(comment, 'permalink', '').split('/')[3:7]))
    post.title = unicode(submission.title)
    post.content = unicode(getattr(comment, 'body', '').encode('utf-8').strip())
    # NOTE(review): guid is derived from the truncated url above, which is then
    # overwritten with the full permalink at the end -- confirm this ordering
    # is intentional before restructuring.
    post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at))
    post.domain = domain
    post.post_type = domain
    post.post_id = post.guid
    post.url = u'https://www.reddit.com{}'.format(comment.permalink)
    return post
def setUp(self):
    """Seed a test DB with two authors -- 'acquired_user' (flagged as an
    acquired bad actor) and plain 'TestUser1' -- each with ten posts sharing
    near-identical 'InternetTV' content, so their timelines overlap.
    """
    self.config = getConfig()
    self._db = DB()
    self._db.setUp()
    self.timeline_overlap = TimelineOverlapVisualizationGenerator()
    # Author #1: marked as an acquired bad actor.
    author1 = Author()
    author1.name = 'acquired_user'
    author1.domain = 'Microblog'
    author1.author_guid = 'acquired_user'
    author1.author_screen_name = 'acquired_user'
    author1.author_full_name = 'acquired_user'
    author1.author_osn_id = 1
    author1.created_at = datetime.datetime.now()
    author1.missing_data_complementor_insertion_date = datetime.datetime.now(
    )
    author1.xml_importer_insertion_date = datetime.datetime.now()
    author1.author_type = 'bad_actor'
    author1.author_sub_type = 'acquired'
    self._db.add_author(author1)
    # Ten posts 'bad_post1'..'bad_post10' for the bad actor.
    for i in range(1, 11):
        post1 = Post()
        post1.post_id = 'bad_post' + str(i)
        post1.author = 'acquired_user'
        post1.guid = 'bad_post' + str(i)
        post1.date = datetime.datetime.now()
        post1.domain = 'Microblog'
        post1.author_guid = 'acquired_user'
        post1.content = 'InternetTV love it' + str(i)
        post1.xml_importer_insertion_date = datetime.datetime.now()
        self._db.addPost(post1)
    # Author #2: an ordinary user whose post content matches author #1's.
    author = Author()
    author.name = 'TestUser1'
    author.domain = 'Microblog'
    author.author_guid = 'TestUser1'
    author.author_screen_name = 'TestUser1'
    author.author_full_name = 'TestUser1'
    author.author_osn_id = 2
    author.created_at = datetime.datetime.now()
    author.missing_data_complementor_insertion_date = datetime.datetime.now(
    )
    author.xml_importer_insertion_date = datetime.datetime.now()
    self._db.add_author(author)
    # Ten posts 'TestPost1'..'TestPost10' for the ordinary user.
    for i in range(1, 11):
        post = Post()
        post.post_id = 'TestPost' + str(i)
        post.author = 'TestUser1'
        post.guid = 'TestPost' + str(i)
        post.date = datetime.datetime.now()
        post.domain = 'Microblog'
        post.author_guid = 'TestUser1'
        post.content = 'InternetTV love it' + str(i)
        post.xml_importer_insertion_date = datetime.datetime.now()
        self._db.addPost(post)
    self._db.commit()
def _convert_row_to_post(self, row):
    """Turn one claims CSV *row* into a Post attributed to the site-level author."""
    claim_post = Post()
    # Title/description may contain undecodable bytes; replace rather than fail.
    claim_post.content = unicode(row['title'], errors='replace')
    claim_post.description = unicode(row['description'], errors='replace')
    claim_post.url = unicode(row['url'])
    publication_date = row['publication_date']
    claim_post.date = date(publication_date)
    # The guid is deterministic over (network url, claim id, publication date).
    deterministic_guid = compute_post_guid(self._social_network_url,
                                           unicode(row['claim_id']),
                                           publication_date)
    claim_post.guid = deterministic_guid
    claim_post.post_id = deterministic_guid
    claim_post.post_osn_guid = deterministic_guid
    claim_post.domain = self._domain
    claim_post.author = self._author_name
    claim_post.author_guid = compute_author_guid_by_author_name(self._author_name)
    claim_post.tags = unicode(row['keywords'])
    claim_post.post_type = unicode(row['post_type'])
    return claim_post
def _add_post(self, post_id, content, url, _domain=u'Microblog'):
    """Store a fixture post keyed by *post_id* in the DB and in self._posts."""
    fixture = Post()
    fixture.author = u'test_user'
    fixture.author_guid = u'test_user'
    fixture.content = content
    # post_id serves as title, id and guid alike in these tests.
    fixture.title = post_id
    fixture.post_id = post_id
    fixture.guid = post_id
    fixture.domain = _domain
    fixture.url = url
    fixture.source_url = url
    self._db.addPost(fixture)
    self._posts[post_id] = fixture
def create_dummy_post(self):
    """Return a fully populated throw-away Post for tests."""
    dummy = Post()
    dummy.post_id = unicode(self.post_id)
    dummy.author_guid = unicode(self.author_guid)
    dummy.author = u"author"
    dummy.guid = unicode(generate_random_guid())
    dummy.title = u"title"
    dummy.url = u"http://google.com"
    dummy.content = u"text"
    dummy.domain = u"Google"
    # date is a datetime; created_at keeps the raw string form.
    dummy.date = str_to_date("2016-08-24 10:00:15")
    dummy.created_at = u"2016-08-24 10:00:15"
    dummy.is_detailed = True
    dummy.is_LB = False
    dummy.is_valid = True
    dummy.post_osn_id = 123455678
    dummy.retweet_count = 11
    dummy.favorite_count = 10
    return dummy
def _json_post_to_db_post_converter(self, post, domain=u"Instagram_post"):
    """Convert a raw Instagram media JSON node into a Post row."""
    media_post = Post()
    media_post.post_osn_id = unicode(post[u'id'])
    media_post.created_at = datetime.fromtimestamp(post[u'taken_at_timestamp'])
    media_post.author = post[u'owner'][u'id']
    media_post.author_guid = compute_author_guid_by_author_name(media_post.author)
    media_post.url = unicode('https://www.instagram.com/p/{}/'.format(
        post[u'shortcode']))
    # Captions may be split across several edge nodes; join them back together.
    caption_parts = [x[u'node'][u'text']
                     for x in post[u'edge_media_to_caption'][u'edges']]
    media_post.content = u', '.join(caption_parts)
    media_post.guid = compute_post_guid(media_post.url, media_post.post_osn_id,
                                        date_to_str(media_post.created_at))
    media_post.post_id = media_post.guid
    media_post.domain = domain
    media_post.post_type = domain
    return media_post
def convert_claim_to_post(claim):
    """Represent a Claim record as a Post in the 'Claim' domain."""
    from DB.schema_definition import Post
    claim_post = Post()
    claim_post.post_id = claim.claim_id
    claim_post.content = claim.title
    claim_post.description = claim.description
    claim_post.url = claim.url
    claim_post.date = claim.verdict_date
    claim_post.domain = 'Claim'
    # Claims have no real author; a fixed placeholder keeps the guid stable.
    claim_post.author = 'no author'
    claim_post.author_guid = 'no author'
    claim_post.guid = compute_post_guid(claim.url, claim_post.author,
                                        date_to_str(claim_post.date))
    claim_post.post_osn_guid = claim_post.guid
    claim_post.tags = claim.keywords
    claim_post.post_type = claim.verdict
    return claim_post
def setUp(self):
    """Seed the DB with one fully-populated bot author and a single post,
    then run AccountPropertiesFeatureGenerator over that author/post pair
    so each test only has to inspect the generated features.
    """
    self._db = DB()
    self._db.setUp()
    self.author_guid = u"author_guid"
    # Author carrying every account property the generator reads.
    author = Author()
    author.author_guid = self.author_guid
    author.author_full_name = u'author'
    author.name = u'author_name'
    author.author_screen_name = u'author_screen_name'
    author.domain = u'Microblog'
    author.statuses_count = 10
    author.friends_count = 5
    author.followers_count = 6
    author.favourites_count = 8
    author.author_sub_type = u"bot"
    author.author_type = u"bad"
    author.created_at = u"2017-06-17 05:00:00"
    author.default_profile = True
    author.default_profile_image = True
    author.verified = True
    self._db.add_author(author)
    # A single post attributed to that author.
    post = Post()
    post.author = self.author_guid
    post.author_guid = self.author_guid
    post.content = u"content"
    post.title = u"title"
    post.domain = u"domain"
    post.post_id = u"post_id"
    post.guid = post.post_id
    post.date = convert_str_to_unicode_datetime("2017-06-14 05:00:00")
    post.created_at = post.date
    self._db.addPost(post)
    self._db.session.commit()
    self.feature_prefix = u"AccountPropertiesFeatureGenerator_"
    # The generator expects authors as a list and posts keyed by author guid.
    self.account_properties_feature_generator = AccountPropertiesFeatureGenerator(
        self._db, **{
            'authors': [author],
            'posts': {
                self.author_guid: [post]
            }
        })
    self.account_properties_feature_generator.execute()
def extract_post(self, data, post_type):
    """Build a Post from a scraped news-article *data* dict."""
    article_post = Post()
    # Missing publish dates fall back to "now", in epoch millis like the
    # source '$date' field.
    raw_publish = data['publish_date']
    if raw_publish is None:
        publish_millis = calendar.timegm(time.gmtime()) * 1000
    else:
        publish_millis = raw_publish['$date']
    date_str = datetime.datetime.fromtimestamp(
        publish_millis / 1000).strftime('%Y-%m-%d %H:%M:%S')
    article_post.post_id = compute_post_guid(data['url'], data['source'], date_str)
    article_post.guid = article_post.post_id
    article_post.author_guid = compute_author_guid_by_author_name(data['source'])
    article_post.author = str(data['source'])
    article_post.source_url = str(data['source'])
    article_post.date = convert_str_to_unicode_datetime(date_str)
    article_post.title = str(data['title'])
    article_post.url = str(data['url'])
    article_post.content = str(data['text'])
    article_post.tags = ','.join(data['keywords'])
    article_post.domain = self._domain
    article_post.post_type = post_type
    # Description is optional in the scraped metadata.
    meta = data['meta_data']
    article_post.description = str(meta['description']) if 'description' in meta else ""
    return article_post
def photo_xml_to_post(self, child):
    """Convert a Flickr photo XML element into a Post.

    :param child: ElementTree element describing one photo.
    :return: the populated Post (not persisted).
    """
    p = Post()
    p.title = str(child.find('title').text)
    p.url = str(child.find('urls').find('url').text)
    # Tags are optional: find('tags') is None when absent (AttributeError on
    # .findall), and a tag's .text can be None (TypeError in join).  The
    # original bare `except:` also swallowed KeyboardInterrupt/SystemExit and
    # hid typos; catch only what this lookup can legitimately raise.
    try:
        p.tags = ','.join(tag.text for tag in child.find('tags').findall('tag'))
    except (AttributeError, TypeError):
        pass
    p.created_at = str(child.find('dates').get('posted'))
    p.date = datetime.datetime.fromtimestamp(int(p.created_at))
    p.author = str(child.find('owner').get('nsid'))
    p.domain = 'flickr'
    p.author_guid = compute_author_guid_by_author_name(p.author)
    # NOTE(review): the photo's comment count is stored in retweet_count --
    # presumably for lack of a dedicated column; confirm downstream readers.
    p.retweet_count = int(child.find('comments').text)
    p.post_id = compute_post_guid(p.url, p.author, date_to_str(p.date))
    p.post_osn_id = str(child.get('id'))
    # Optional ground-truth labels become a comma-separated post_type.
    if child.find('labels') is not None:
        p.post_type = ','.join(
            tag.text for tag in child.find('labels').findall('label'))
    return p
def _parse_articles_lst_to_articles(self, all_articles_lst_of_dics):
    """Parse NewsAPI article dicts into parallel lists of Posts, Claims,
    News_Articles and News_Article_Items, returned in that order.

    All four records built for a given article share one computed identifier.
    """
    print("###### 'Entering _parse_articles_lst_to_articles'")
    parsed_articles_lst = []
    claims_lst = []
    posts_lst = []
    articles_lst = []
    article_items_lst = []
    # Parsing articles list of dictionaries data, received using the API.
    for news_articles_dic in all_articles_lst_of_dics:
        print("###### 'PARSING: {}'".format(str(news_articles_dic)))
        parsed_articles_lst += self._parse_news_article(news_articles_dic)
    # For each news article dictionary commit:
    for parsed_news_article in parsed_articles_lst:
        print("###### 'Iterating parsed_articles_lst single item: {}'".
              format(str(parsed_news_article)))
        # Building: claim & News_Article & News_Article_Item objects.
        claim = Claim()
        post = Post()
        article = News_Article()
        article_item = News_Article_Item()
        # One deterministic identifier shared by all four records.
        identifier = compute_post_guid(parsed_news_article['url'],
                                       parsed_news_article['author'],
                                       parsed_news_article['publishedAt'])
        claim.claim_id = post.post_id = post.guid = post.post_osn_guid = article.post_id = article_item.post_id = unicode(
            identifier)
        author_guid = compute_author_guid_by_author_name(
            parsed_news_article['author'])
        post.author_guid = article.author_guid = article_item.author_guid = unicode(
            author_guid)
        post.author = article.author = unicode(
            parsed_news_article['author'])
        post.title = claim.title = article.title = unicode(
            parsed_news_article['title'])
        post.content = article_item.content = unicode(
            parsed_news_article['content'])
        post.description = claim.description = article.description = unicode(
            parsed_news_article['description'])
        post.date = post.created_at = claim.verdict_date = article.published_date = datetime.datetime.strptime(
            parsed_news_article['publishedAt'], '%Y-%m-%d %H:%M:%S')
        article_item.source_newsapi_internal_id = unicode(
            parsed_news_article['source_id'])
        article_item.source_newsapi_internal_name = unicode(
            parsed_news_article['source_name'])
        post.url = claim.url = article.url = unicode(
            parsed_news_article['url'])
        article_item.img_url = unicode(parsed_news_article['urlToImage'])
        post.post_type = claim.verdict = unicode(
            "TRUE"
        )  # todo: Add constant. We assume all news articles are true.
        post.domain = claim.domain = unicode(
            "NewsSite")  # todo: Add constant.
        # Update objects lists:
        posts_lst.append(post)
        claims_lst.append(claim)
        articles_lst.append(article)
        article_items_lst.append(article_item)
    print("###### 'EXITING _parse_articles_lst_to_articles'")
    return posts_lst, claims_lst, articles_lst, article_items_lst
def _create_post_by_row(self, record_dict):
    """Build a Post from one Tumblr export *record_dict*; cache it by osn id.

    Keys read: post_id, tumblog_id, post_short_url, created_time_epoch,
    post_title, post_content (JSON blob or the literal 'NULL'), post_type,
    post_format, post_reblog_key, post_tags, post_state.
    """
    post = Post()
    post_id = self._convert_to_unicode_value(record_dict["post_id"])
    post.post_osn_id = post_id
    post.post_id = post_id
    author_name = self._convert_to_unicode_value(record_dict["tumblog_id"])
    post.author = author_name
    post_short_url = self._convert_to_unicode_value(
        record_dict["post_short_url"])
    self._set_post_url(post_short_url, author_name, post)
    post_creation_date = self._convert_to_unicode_value(
        record_dict["created_time_epoch"])
    post.created_at = post_creation_date
    # Missing epoch -> fall back to the importer's configured start date;
    # in that case post.date is left unset.
    if post_creation_date is not None:
        post_formatted_creation_date, str_post_formatted_creation_date = convert_epoch_timestamp_to_datetime(
            post_creation_date)
        post.date = post_formatted_creation_date
    else:
        str_post_formatted_creation_date = self._set_start_date()
    post.guid = compute_post_guid(post.url, author_name,
                                  str_post_formatted_creation_date)
    post.post_osn_guid = post.guid
    post.title = self._convert_to_unicode_value(record_dict["post_title"])
    post_content = record_dict["post_content"]
    # 'NULL' marks an absent body; otherwise it is a JSON blob whose optional
    # 'title' and 'text' fields are concatenated into the content.
    if post_content != 'NULL':
        content = json.loads(post_content.decode("utf-8"))
        final_content = ""
        if 'title' in content.keys():
            title = content['title']
            final_content += title
        if 'text' in content.keys():
            text = content['text']
            final_content += text
        post.content = self._convert_to_unicode_value(final_content)
    post.domain = self._domain
    post.author_guid = compute_author_guid_by_author_name(author_name)
    post.post_type = self._convert_to_unicode_value(
        record_dict["post_type"])
    post.post_format = self._convert_to_unicode_value(
        record_dict["post_format"])
    post.reblog_key = self._convert_to_unicode_value(
        record_dict["post_reblog_key"])
    post.tags = self._convert_to_unicode_value(record_dict["post_tags"])
    post.state = self._convert_to_unicode_value(record_dict["post_state"])
    # Cache the first occurrence only; duplicate rows keep the original.
    if post.post_osn_id not in self._post_dict:
        self._post_dict[post.post_osn_id] = post
    return post
def _add_post(self, author_guid, title, content, domain=u'Microblog'):
    """Add a fixture post for *author_guid*; the title doubles as post_id/guid."""
    fixture_post = Post()
    fixture_post.author = author_guid
    fixture_post.author_guid = author_guid
    fixture_post.title = title
    fixture_post.content = content
    fixture_post.domain = domain
    fixture_post.post_id = title
    fixture_post.guid = title
    fixture_post.is_detailed = True
    fixture_post.is_LB = False
    self._db.addPost(fixture_post)
    self._posts.append(fixture_post)
def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
    """Create and persist a reddit Post + RedditPost pair for tests.

    *upvote_ratio* == -1 means "comment" (no vote breakdown stored); any
    other value marks the pair as a reddit_post and reconstructs ups/downs
    from score and ratio.
    """
    post = Post()
    post.post_osn_id = post_osn_id
    post.author = str(author)
    post.author_guid = compute_author_guid_by_author_name(post.author)
    post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
    post.url = 'https://www.reddit.com{}'.format(
        post.author)  # just for test
    post.guid = compute_post_guid(post.url, post.post_osn_id,
                                  date_to_str(post.created_at))
    post.domain = 'reddit_comment'
    post.post_type = 'reddit_comment'
    post.post_id = post.guid
    reddit_post = RedditPost()
    reddit_post.post_id = post.post_id
    reddit_post.guid = post.guid
    reddit_post.score = score
    if upvote_ratio != -1:
        post.domain = 'reddit_post'
        post.post_type = 'reddit_post'
        reddit_post.upvote_ratio = upvote_ratio
        # From ratio = ups / (ups + downs) and score = ups - downs it follows
        # that ups = ratio * score / (2 * ratio - 1).  ratio == 0.5 would
        # divide by zero, so that case falls back to score / 2.
        reddit_post.ups = int(
            round((reddit_post.upvote_ratio * reddit_post.score) /
                  (2 * reddit_post.upvote_ratio - 1))
            if reddit_post.upvote_ratio != 0.5 else round(reddit_post.score / 2))
        reddit_post.downs = reddit_post.ups - reddit_post.score
    else:
        # Comments carry no vote breakdown; -1 acts as a sentinel.
        reddit_post.ups = -1
        reddit_post.downs = -1
        reddit_post.upvote_ratio = -1
    self._db.addPosts([post, reddit_post])
    return post, reddit_post
def create_post(self, _author_guid1, post_content):
    """Insert a single 'TestPost' microblog post owned by *_author_guid1*."""
    test_post = Post()
    test_post.post_id = u'TestPost'
    test_post.guid = u'TestPost'
    test_post.author = u'TechmarketNG'
    test_post.author_guid = _author_guid1
    test_post.url = u'Url_From'
    # Post date: one day after the fixed base timestamp.
    base_date = datetime.datetime.strptime(u'2016-05-05 00:00:00',
                                           '%Y-%m-%d %H:%M:%S')
    test_post.date = base_date + datetime.timedelta(1)
    test_post.domain = u'Microblog'
    test_post.content = post_content
    test_post.xml_importer_insertion_date = datetime.datetime.now()
    self._db.addPost(test_post)
    self._db.commit()
def _convert_tweet_to_post(self, tweet, post_type):
    """Convert a scraped tweet object into a Post tagged with *post_type*."""
    converted = Post()
    converted.post_osn_id = unicode(tweet.id)
    creation_date = tweet.date
    creation_date_str = unicode(date_to_str(creation_date))
    converted.created_at = creation_date_str
    converted.date = creation_date
    converted.favorite_count = tweet.favorites
    converted.retweet_count = tweet.retweets
    converted.content = unicode(tweet.text)
    screen_name = unicode(tweet.username)
    converted.author = screen_name
    converted.url = unicode(tweet.permalink)
    # guid/post_id are deterministic over (permalink, author, creation date).
    deterministic_guid = compute_post_guid(tweet.permalink, screen_name,
                                           creation_date_str)
    converted.guid = deterministic_guid
    converted.post_id = deterministic_guid
    converted.domain = self._domain
    converted.post_type = post_type
    return converted
def _json_comment_to_db_comment_converter(self, post, domain=u"Instagram_comment"):
    """Convert a raw Instagram comment JSON node into a Post row."""
    comment_post = Post()
    comment_post.post_osn_id = unicode(post[u'id'])
    comment_post.created_at = datetime.fromtimestamp(post[u'created_at'])
    comment_post.author = post[u'owner'][u'id']
    comment_post.author_guid = compute_author_guid_by_author_name(comment_post.author)
    comment_post.url = unicode('https://www.instagram.com/p/{}/'.format(
        post[u'shortcode']))
    comment_post.content = post[u'text']
    comment_post.guid = compute_post_guid(comment_post.url,
                                          comment_post.post_osn_id,
                                          date_to_str(comment_post.created_at))
    comment_post.post_id = comment_post.guid
    comment_post.domain = domain
    comment_post.post_type = domain
    return comment_post
def _create_post(self, original_liar_dataset_id, speaker, targeted_label, statement):
    """Build a Post for one LIAR-dataset statement (fixed nominal date)."""
    liar_post = Post()
    liar_post.post_id = str(original_liar_dataset_id)
    # All LIAR statements share a fixed nominal timestamp.
    deterministic_guid = compute_post_guid(self._social_network_url,
                                           original_liar_dataset_id,
                                           '2007-01-01 00:00:00')
    liar_post.guid = deterministic_guid
    liar_post.post_osn_guid = deterministic_guid
    liar_post.domain = self._domain
    liar_post.author = speaker
    liar_post.author_guid = compute_author_guid_by_author_name(speaker)
    liar_post.date = date('2007-01-01 00:00:00')
    liar_post.post_type = targeted_label
    liar_post.content = statement
    return liar_post
def _add_post(self, title, content, date_str, domain=u'Microblog'):
    """Add a dated fixture post for self._author and bump their status count."""
    dated_post = Post()
    dated_post.author = self._author.author_guid
    dated_post.author_guid = self._author.author_guid
    dated_post.title = title
    dated_post.content = content
    dated_post.domain = domain
    dated_post.post_id = title
    dated_post.guid = title
    when = convert_str_to_unicode_datetime(date_str)
    dated_post.date = when
    dated_post.created_at = when
    self._db.addPost(dated_post)
    self._posts.append(dated_post)
    # Keep the author's status counter in sync with the posts we fabricate.
    self._author.statuses_count += 1
def _generate_comment(self, instagram_comment, post):
    """Build a comment Post hanging off *post* from raw Instagram comment data."""
    reply = Post()
    reply.date = datetime.datetime.fromtimestamp(instagram_comment['created_at'])
    reply.post_osn_id = instagram_comment['id']
    reply.content = str(instagram_comment['text'])
    owner = instagram_comment['owner']
    reply.author = str(owner['username'])
    reply.author_guid = str(owner['id'])
    # Comment url: parent post url with the comment's osn id appended.
    reply.url = '{}{}/'.format(post.url, reply.post_osn_id)
    reply.domain = 'Instagram'
    reply.post_type = 'comment'
    reply.post_id = str(reply.post_osn_id)
    return reply