def collect_tweets_by_manual_keywords(self):
        """Collect tweets for every configured keyword term and store them.

        The terms and topics derived from the keyword configuration are saved
        first. Then, for each configured term, tweets from the last
        ``self._month_interval`` months (a month is approximated as 365 / 12
        days) are retrieved, converted into posts and claim-post connections,
        and both collections are saved through ``self._db.addPosts``.
        Progress is written to stdout on a single, continuously rewritten line.
        """
        (_old_terms, new_terms, config_terms,
         term_ids) = self._convert_keywords_to_terms_by_given_dict()
        topics = self._create_topics_by_given_dict(term_ids)
        for records in (new_terms, topics):
            self._db.addPosts(records)

        # The search window ends now and reaches back the configured number
        # of months.
        day_format = "%Y-%m-%d"
        window_end = datetime.datetime.today()
        window_start = window_end - timedelta(self._month_interval * 365 / 12)
        start_date = date_to_str(window_start, day_format)
        end_date = date_to_str(window_end, day_format)

        for position, term in enumerate(config_terms):
            progress = "\rTerm: {0} {1}/{2}".format(term, position,
                                                    len(config_terms))
            print(progress, end='')

            tweets = self._collect_tweets_by_term(term, start_date, end_date)
            summary = "\rTerm: {0} Num tweets retrieved: {1}".format(
                term, len(tweets))
            print(summary, end='')

            converted = self._convert_tweets_to_posts(tweets, term_ids[term],
                                                      self._domain)
            posts, claim_post_connections = converted
            self._db.addPosts(posts)
            self._db.addPosts(claim_post_connections)
Пример #2
0
    def _retrieve_old_tweets(self, claim, content):
        """Fetch tweets matching ``content`` around the claim's verdict date.

        The search window spans ``self._month_interval`` months (a month is
        approximated as 365 / 12 days) before and after
        ``claim.verdict_date``. Retrieval is best-effort: if it fails, the
        error type is printed and an empty list is returned.

        :param claim: object exposing ``verdict_date``; it is also forwarded
            to ``_retrieve_tweets_between_dates``.
        :param content: search text forwarded to the retrieval call; it is
            also echoed in the error message.
        :return: the result of ``_retrieve_tweets_between_dates``, or ``[]``
            when that call raised.
        """
        verdict_date = claim.verdict_date
        month_interval = timedelta(self._month_interval * 365 / 12)
        start_date = date_to_str(verdict_date - month_interval, "%Y-%m-%d")
        end_date = date_to_str(verdict_date + month_interval, "%Y-%m-%d")
        try:
            return self._retrieve_tweets_between_dates(claim, content,
                                                       start_date, end_date)
        except Exception as error:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still stop a long-running crawl.
            # The exception class is printed, as before.
            print("tweet content: {0}, error:{1}".format(content, type(error)))
        return []
Пример #3
0
    def append_comments(self, submission, origin_post_id, publish_date):
        """Register the submission's comments as written before or after ``publish_date``.

        Only comments created within ``self._month_interval`` months (a month
        is approximated as 365 / 12 days) on either side of ``publish_date``
        are considered. A comment is kept when its score is non-negative and
        its body has more than three space-separated tokens. Kept comments
        are passed to ``convert_comment_to_post`` and their guid is added to
        ``self._post_id_tweets_id_before_dict[origin_post_id]`` or
        ``self._post_id_tweets_id_after_dict[origin_post_id]``.

        :param submission: object exposing ``comments`` and ``url``.
        :param origin_post_id: key under which the comment guids are stored.
        :param publish_date: reference datetime splitting "before" from "after".
        """
        half_window = timedelta(self._month_interval * 365 / 12)
        # All comparisons are done on epoch timestamps, matching
        # ``comment.created``.
        window_start = time.mktime((publish_date - half_window).timetuple())
        window_end = time.mktime((publish_date + half_window).timetuple())
        publish_ts = time.mktime(publish_date.timetuple())

        for comment in submission.comments:
            if isinstance(comment, MoreComments):
                continue
            if comment.created < window_start or comment.created > window_end:
                continue

            created_at = datetime.fromtimestamp(comment.created)
            comment_url = unicode(submission.url + comment.id)
            comment_guid = compute_post_guid(comment_url, comment.id,
                                             date_to_str(created_at))

            # A comment created exactly at the window start matches neither
            # side and is skipped.
            if window_start < comment.created <= publish_ts:
                guid_sets = self._post_id_tweets_id_before_dict
            elif publish_ts < comment.created <= window_end:
                guid_sets = self._post_id_tweets_id_after_dict
            else:
                continue

            has_enough_words = len(comment.body.split(' ')) > 3
            if comment.score >= 0 and has_enough_words:
                # The converted post itself is not used here.
                self.convert_comment_to_post(comment, submission)
                guid_sets[origin_post_id].add(comment_guid)
    def _convert_tweet_to_post(self, tweet, post_type):
        """Build a ``Post`` from a retrieved tweet.

        :param tweet: object exposing ``id``, ``date``, ``favorites``,
            ``retweets``, ``text``, ``username`` and ``permalink``.
        :param post_type: value stored in the post's ``post_type`` field.
        :return: the populated ``Post``; its ``guid`` and ``post_id`` are both
            the guid computed from the permalink, author name and creation
            date string.
        """
        tweet_date = tweet.date
        created_at = unicode(date_to_str(tweet_date))
        author_name = unicode(tweet.username)
        permalink = tweet.permalink
        guid = compute_post_guid(permalink, author_name, created_at)

        post = Post()
        post.post_osn_id = unicode(tweet.id)
        post.created_at = created_at
        post.date = tweet_date
        post.favorite_count = tweet.favorites
        post.retweet_count = tweet.retweets
        post.content = unicode(tweet.text)
        # NOTE: author_guid is not set by this converter.
        post.author = author_name
        post.url = unicode(permalink)
        post.guid = guid
        post.post_id = guid
        post.domain = self._domain
        post.post_type = post_type
        return post
    def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
        """Create a ``Post`` with its ``RedditPost`` companion and save both.

        When ``upvote_ratio`` is left at ``-1`` the records are typed
        ``'reddit_comment'`` and ups/downs/upvote_ratio are stored as ``-1``.
        Otherwise the records are typed ``'reddit_post'`` and ups/downs are
        estimated from ``score`` and ``upvote_ratio``.

        :param author: author identifier; stored as ``str(author)``.
        :param date: creation date string in ``"%d/%m/%Y %H:%M"`` format.
        :param post_osn_id: identifier stored as ``post_osn_id``.
        :param score: score stored on the ``RedditPost``.
        :param upvote_ratio: upvote ratio, or ``-1`` when not applicable.
        :return: tuple ``(post, reddit_post)``.
        """
        has_ratio = upvote_ratio != -1
        kind = 'reddit_post' if has_ratio else 'reddit_comment'

        post = Post()
        post.post_osn_id = post_osn_id
        post.author = str(author)
        post.author_guid = compute_author_guid_by_author_name(post.author)
        post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
        # just for test
        post.url = 'https://www.reddit.com{}'.format(post.author)
        created_at_text = date_to_str(post.created_at)
        post.guid = compute_post_guid(post.url, post.post_osn_id,
                                      created_at_text)
        post.post_id = post.guid
        post.domain = kind
        post.post_type = kind

        reddit_post = RedditPost()
        reddit_post.post_id = post.post_id
        reddit_post.guid = post.guid
        reddit_post.score = score
        if not has_ratio:
            reddit_post.ups = -1
            reddit_post.downs = -1
            reddit_post.upvote_ratio = -1
        else:
            reddit_post.upvote_ratio = upvote_ratio
            ratio = reddit_post.upvote_ratio
            # With score = ups - downs and ratio = ups / (ups + downs),
            # ups = ratio * score / (2 * ratio - 1); a ratio of exactly 0.5
            # would divide by zero, so half of the score is used instead.
            if ratio != 0.5:
                estimate = round((ratio * reddit_post.score) /
                                 (2 * ratio - 1))
            else:
                estimate = round(reddit_post.score / 2)
            reddit_post.ups = int(estimate)
            reddit_post.downs = reddit_post.ups - reddit_post.score

        self._db.addPosts([post, reddit_post])
        return post, reddit_post
Пример #6
0
    def _convert_row_to_post(self, row):
        """Turn one CSV row into a ``Post`` plus its claim connection.

        Columns read from ``row``: ``site``, ``social_id``, ``username_hash``,
        ``comment_time`` (epoch timestamp) and ``comment_tokens``. The post
        and the connection are appended to ``self._posts`` and
        ``self._claim_tweet_connections``. Each time the row counter reaches a
        multiple of ``self._max_posts_without_save`` the buffers are handed to
        ``_save_posts_and_connections``.
        """
        progress = "\rInsert post to DataFrame {0}/{1}".format(
            self._current_row, len(self.posts_csv_df))
        print(progress, end="")
        self._current_row += 1

        comment_date = datetime.datetime.fromtimestamp(row['comment_time'])
        # The claim id is the guid computed from the row's social_id.
        claim_id = compute_author_guid_by_author_name(str(row['social_id']))
        guid_source = row['site'] + str(claim_id)
        post_guid = compute_post_guid(guid_source, row['username_hash'],
                                      date_to_str(comment_date))

        post = Post()
        post.post_id = str(post_guid)
        post.content = str(row['comment_tokens'])
        post.author = str(row['username_hash'])
        post.author_guid = str(row['username_hash'])
        post.domain = str(row['site'])
        post.date = comment_date
        self._posts.append(post)

        connection = Claim_Tweet_Connection()
        connection.claim_id = str(claim_id)
        connection.post_id = str(post.post_id)
        self._claim_tweet_connections.append(connection)

        reached_save_point = self._current_row % self._max_posts_without_save == 0
        if reached_save_point:
            self._save_posts_and_connections()
Пример #7
0
 def test_retrieve_tweets_by_content_between_dates_before(self):
     """Tweets retrieved for a claim fall inside the requested date window.

     Adds a claim, asks the crawler for tweets between 2016-08-03 and the
     claim's verdict date with both date limits enabled, then checks that
     every returned tweet is dated within ``[since_date, claim_date)`` and
     that no more than 100 tweets came back.
     """
     content = u"The Rock Running for President"
     self._add_claim(u"post0", content, u"2017-02-03 00:00:00")
     self._db.commit()

     claim = self._claims[u"post0"]
     claim_date = claim.verdict_date
     since_date = str_to_date(u"2016-08-03 00:00:00")

     self.tweets_crawler._limit_start_date = True
     self.tweets_crawler._limit_end_date = True
     tweets = self.tweets_crawler._retrieve_tweets_between_dates(
         claim, content,
         date_to_str(since_date, "%Y-%m-%d"),
         date_to_str(claim_date, "%Y-%m-%d"))

     self.assertTrue(
         all(since_date <= tweet.date < claim_date for tweet in tweets))
     # NOTE(review): this only bounds the result from above, so an empty
     # result also passes; confirm that an upper bound is what was intended.
     self.assertLessEqual(len(tweets), 100)
 def _json_comment_to_db_comment_converter(self, post, domain="Instagram_comment"):
     """Convert a comment dictionary into a ``Post`` record.

     Keys read from ``post``: ``id``, ``created_at`` (epoch timestamp),
     ``owner.id``, ``shortcode`` and ``text``.

     :param post: dictionary describing the comment.
     :param domain: value stored as both ``domain`` and ``post_type``.
     :return: the populated ``Post``; ``post_id`` equals ``guid``.
     """
     osn_id = str(post['id'])
     created_at = datetime.fromtimestamp(post['created_at'])
     owner_id = post['owner']['id']
     url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
     text = post['text']

     record = Post()
     record.post_osn_id = osn_id
     record.created_at = created_at
     record.author = owner_id
     record.author_guid = compute_author_guid_by_author_name(record.author)
     record.url = url
     record.content = text
     record.guid = compute_post_guid(record.url, record.post_osn_id,
                                     date_to_str(record.created_at))
     record.post_id = record.guid
     record.domain = domain
     record.post_type = domain
     return record
 def _json_post_to_db_post_converter(self, post, domain="Instagram_post"):
     """Convert a post dictionary into a ``Post`` record.

     Keys read from ``post``: ``id``, ``taken_at_timestamp`` (epoch
     timestamp), ``owner.id``, ``shortcode`` and
     ``edge_media_to_caption.edges`` (each edge contributes its
     ``node.text``; the texts are joined with ``', '``).

     :param post: dictionary describing the post.
     :param domain: value stored as both ``domain`` and ``post_type``.
     :return: the populated ``Post``; ``post_id`` equals ``guid``.
     """
     osn_id = str(post['id'])
     taken_at = datetime.fromtimestamp(post['taken_at_timestamp'])
     owner_id = post['owner']['id']
     url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
     caption_edges = post['edge_media_to_caption']['edges']
     caption = ', '.join(edge['node']['text'] for edge in caption_edges)

     record = Post()
     record.post_osn_id = osn_id
     record.created_at = taken_at
     record.author = owner_id
     record.author_guid = compute_author_guid_by_author_name(record.author)
     record.url = url
     record.content = caption
     record.guid = compute_post_guid(record.url, record.post_osn_id,
                                     date_to_str(record.created_at))
     record.post_id = record.guid
     record.domain = domain
     record.post_type = domain
     return record
Пример #10
0
 def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
     """Convert a Reddit comment into a ``Post`` record.

     Also records the comment's author in ``self._redditors``; when the
     comment has no author, the comment's date string is appended to
     ``self._deleted_redditors`` and the post's author is left empty.

     :param comment: object exposing ``id``, ``created`` (epoch timestamp)
         and ``permalink``, and optionally ``author`` and ``body``.
     :param submission: object exposing ``title``.
     :param domain: value stored as both ``domain`` and ``post_type``.
     :return: the populated ``Post``; ``post_id`` equals ``guid``.
     """
     post = Post()
     post.post_osn_id = unicode(comment.id)
     created = datetime.fromtimestamp(comment.created)
     post.created_at = created
     post.date = created

     author = getattr(comment, 'author', None)
     if author:
         post.author = unicode(author.name)
         self._redditors.append(author)
     else:
         self._deleted_redditors.append(str(post.date))
         post.author = unicode('')
     post.author_guid = compute_author_guid_by_author_name(post.author)

     # The guid is derived from a shortened URL built from path segments
     # 3..6 of the permalink; the full permalink URL is stored afterwards.
     # NOTE(review): no '/' is inserted between the host and the joined
     # segments. Left as is because changing it would change every guid.
     permalink_parts = getattr(comment, 'permalink', '').split('/')[3:7]
     post.url = unicode('https://www.reddit.com' + '/'.join(permalink_parts))
     post.title = unicode(submission.title)

     # Keep the body as text. The previous encode('utf-8') followed by
     # unicode() re-decoded the bytes with the default ASCII codec on
     # Python 2 and raised UnicodeDecodeError for any non-ASCII comment.
     body = getattr(comment, 'body', '') or u''
     post.content = unicode(body).strip()

     post.guid = compute_post_guid(post.url, post.post_osn_id,
                                   date_to_str(post.created_at))
     post.domain = domain
     post.post_type = domain
     post.post_id = post.guid
     post.url = u'https://www.reddit.com{}'.format(comment.permalink)
     return post