Example #1
 def __init__(self, db):
     # AbstractController.__init__(self, db)
     self._db = db
     self._twitter_rest_api = Twitter_Rest_Api(db)
     self._config_parser = getConfig()
     self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))
     self._users_to_add = []
     self._post_to_add = []
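Example #1 matches the constructor of the project's Generic_Twitter_Crawler, shown in full in Example #15. A minimal wiring sketch, assuming the DB helper behaves as in the test setUp of Example #9:

# Hypothetical wiring sketch; DB comes from this repository, as in Example #9.
from DB.schema_definition import DB

db = DB()
db.setUp()  # create the schema / open the session
crawler = Generic_Twitter_Crawler(db)  # reads "domain" from the DEFAULT config section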
Example #2
 def __init__(self, db):
     Method_Executor.__init__(self, db)
     self._topics_path = self._config_parser.eval(self.__class__.__name__,
                                                  "topics_path")
     self._judgment_path = self._config_parser.eval(self.__class__.__name__,
                                                    "judgment_path")
     self._num_of_relevant_tweets = self._config_parser.eval(
         self.__class__.__name__, "num_of_relevant_tweets")
     self._num_of_description_words = self._config_parser.eval(
         self.__class__.__name__, "num_of_description_words")
     self._twitter_api = Twitter_Rest_Api(db)
Example #3
    def __init__(self, db):
        AbstractController.__init__(self, db)

        self._actions = self._config_parser.eval(self.__class__.__name__, "actions")

        self._targeted_twitter_author_ids = self._config_parser.eval(self.__class__.__name__, "targeted_twitter_author_ids")

        self._targeted_twitter_post_ids = self._config_parser.eval(self.__class__.__name__, "targeted_twitter_post_ids")

        self._targeted_twitter_author_names = self._config_parser.eval(self.__class__.__name__, "targeted_twitter_author_names")

        self._social_network_crawler = Twitter_Rest_Api(db)
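Most of these constructors pull per-class settings with _config_parser.eval(self.__class__.__name__, key). A minimal sketch of the contract that eval appears to provide, assuming it wraps ast.literal_eval around an ordinary get (the project's real getConfig() may differ):

# Sketch of the assumed eval() contract, not the project's actual config code.
import ast
from configparser import ConfigParser

class _EvalConfigParser(ConfigParser):
    def eval(self, section, key):
        # Turn the raw INI string (e.g. "[1, 2, 3]") into a Python value.
        return ast.literal_eval(self.get(section, key))

parser = _EvalConfigParser()
parser.read_string("[BadActorsCollector]\ntargeted_twitter_author_ids = [1, 2, 3]\n")
print(parser.eval("BadActorsCollector", "targeted_twitter_author_ids"))  # [1, 2, 3]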
Example #4
 def __init__(self, db):
     Method_Executor.__init__(self, db)
     self._twitter_api = TwitterApiRequester()
     self._social_network_crawler = Twitter_Rest_Api(db)
     self._influence_strategy = self._config_parser.eval(self.__class__.__name__, "post_strategy")
     self._source_group = self._config_parser.eval(self.__class__.__name__, "source_group")
     self._target_group = self._config_parser.eval(self.__class__.__name__, "target_group")
     self._user_id = self._config_parser.eval(self.__class__.__name__, "user_id")
     self._number_of_posts = self._config_parser.eval(self.__class__.__name__, "number_of_posts")
     self._retweet_precent = self._config_parser.eval(self.__class__.__name__, "retweet_precent")
     self._related_hashtags = self._config_parser.eval(self.__class__.__name__, "related_hashtags")
     self._posts_num = self._config_parser.eval(self.__class__.__name__, "posts_num")
Example #5
 def __init__(self, db):
     Method_Executor.__init__(self, db)
     self._twitter_api = TwitterApiRequester()
     self._social_network_crawler = Twitter_Rest_Api(db)
     self._target_id = self._config_parser.eval(self.__class__.__name__, "target_id")
     self._source_id = self._config_parser.eval(self.__class__.__name__, "source_id")
     self.source_username = self._config_parser.eval(self.__class__.__name__, "source_username")
Example #6
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._actions = self._config_parser.eval(self.__class__.__name__,
                                                 "actions")

        self._minimal_num_of_posts = self._config_parser.eval(
            self.__class__.__name__, "minimal_num_of_posts")
        self._limit_friend_follower_number = self._config_parser.eval(
            self.__class__.__name__, "limit_friend_follower_number")
        self._maximal_tweets_count_in_timeline = self._config_parser.eval(
            self.__class__.__name__, "maximal_tweets_count_in_timeline")

        self._found_twitter_users = []
        self._social_network_crawler = Twitter_Rest_Api(db)
        self._suspended_authors = []
        self._max_users_without_saving = self._config_parser.eval(
            self.__class__.__name__, "max_users_without_saving")
        self._posts = []
        self._authors = []
        self._post_citatsions = []
Example #7
 def __init__(self, db):
     Method_Executor.__init__(self, db)
     # taken from http://techslides.com/hacking-the-google-trends-api
     self._url = "https://trends.google.com/trends/hottrends/atom/feed?pn=p1"
     self._retrieve_news_by_keywords = self._config_parser.eval(
         self.__class__.__name__, "retrieve_news_by_keywords")
     self._num_of_top_terms = self._config_parser.eval(
         self.__class__.__name__, "num_of_top_terms")
     self._generic_twitter_crawler = Generic_Twitter_Crawler(self._db)
     self._topic_term_manager = Topic_Term_Manager(db)
     self._twitter_rest_api = Twitter_Rest_Api(db)
Example #8
    def __init__(self):
        config_parser = getConfig()
        logging.config.fileConfig(getConfig().get("DEFAULT",
                                                  "logger_conf_file"))
        logger = logging.getLogger(getConfig().get("DEFAULT", "logger_name"))

        logger.info("Start Execution ... ")

        self._missing_retweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__,
            "missing_retweets_not_retrived_from_vico_file_name")
        self._missing_tweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__,
            "missing_tweets_not_retrived_from_vico_file_name")
        self._retweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "retweets_retrieved_from_vico_file_name")
        self._tweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "tweets_retrieved_from_vico_file_name")
        self._path = config_parser.get(self.__class__.__name__, "path")
        self._backup_path = config_parser.get(self.__class__.__name__,
                                              "backup_path")
        self._csv_header = config_parser.eval(self.__class__.__name__,
                                              "csv_header")
        self._csv_header_bad_actors_vico_retrieved_posts = config_parser.eval(
            self.__class__.__name__,
            "csv_header_bad_actors_vico_retrieved_posts")

        targeted_twitter_post_ids = config_parser.get(
            "BadActorsCollector", "targeted_twitter_post_ids")
        self._targeted_twitter_post_ids = create_ids_from_config_file(
            targeted_twitter_post_ids)

        self._original_statuses = config_parser.eval(self.__class__.__name__,
                                                     "original_statuses")

        self._csv_importer = PostCSVExporter()

        self._social_network_crawler = Twitter_Rest_Api()

        self._db = DB()
        self._db.setUp()
Example #9
    def setUp(self):
        TestBase.setUp(self)
        self.config = getConfig()
        from DB.schema_definition import DB
        self.db = DB()
        self.db.setUp()
        self.social_network_crawler = Twitter_Rest_Api(self.db)
        self.xml_importer = XMLImporter(self.db)
        self.create_author_table = CreateAuthorTables(self.db)
        self._targeted_twitter_author_ids = self.config.eval('BadActorsCollector', "targeted_twitter_author_ids")

        self._targeted_twitter_post_ids = self.config.eval('BadActorsCollector', "targeted_twitter_post_ids")
        self._bad_actor_collector = BadActorsCollector(self.db)
Example #10
    def setUp(self):
        self.config = getConfig()
        self.db = DB()
        self.db.setUp()
        self.social_network_crawler = Twitter_Rest_Api(self.db)
        self.xml_importer = XMLImporter(self.db)
        self.create_author_table = CreateAuthorTables(self.db)
        self._targeted_twitter_author_ids = self.config.eval(
            'BadActorsCollector', "targeted_twitter_author_ids")
        self._domain = u'Microblog'

        self._targeted_twitter_post_ids = self.config.eval(
            'BadActorsCollector', "targeted_twitter_post_ids")
        self._bad_actor_collector = BadActorsCollector(self.db)

        # The Author and Post for test_mark_missing_bad_retweeters_retrieved_from_vico
        self._author_guid1 = compute_author_guid_by_author_name(
            u'TechmarketNG')
        author = Author()
        author.name = u'TechmarketNG'
        author.domain = self._domain
        author.protected = 0
        author.author_guid = self._author_guid1
        author.author_screen_name = u'TechmarketNG'
        author.author_full_name = u'Techmarket'
        author.statuses_count = 10
        author.author_osn_id = 149159975
        author.followers_count = 12
        author.created_at = datetime.datetime.strptime('2016-04-02 00:00:00',
                                                       '%Y-%m-%d %H:%M:%S')
        author.missing_data_complementor_insertion_date = datetime.datetime.now()
        author.xml_importer_insertion_date = datetime.datetime.now()
        self.db.add_author(author)

        post = Post()
        post.post_id = u'TestPost'
        post.author = u'TechmarketNG'
        post.guid = u'TestPost'
        post.url = u'TestPost'
        tempDate = u'2016-05-05 00:00:00'
        day = datetime.timedelta(1)
        post.date = datetime.datetime.strptime(tempDate,
                                               '%Y-%m-%d %H:%M:%S') + day * 1
        post.domain = self._domain
        post.author_guid = self._author_guid1
        post.content = u"InternetTV love it RT @benny_metanya #wow"
        post.xml_importer_insertion_date = datetime.datetime.now()
        self.db.addPost(post)

        self.db.commit()
Example #11
class AsonamHoneypotImporter(AbstractController):
    def __init__(self, db):
        super(AsonamHoneypotImporter, self).__init__(db)
        self.twitter_rest_api = Twitter_Rest_Api(self._db)
        self._data_path = self._config_parser.eval(self.__class__.__name__,
                                                   "data_path")

    def execute(self, window_start=None):
        all_sub_files = listdir(self._data_path)
        for data_file in all_sub_files:
            self.add_tweet_data_to_db(data_file)

    def add_tweet_data_to_db(self, data_file):
        type_to_users_tweets_dict = self.load_file(self._data_path + data_file)
        for author_type in type_to_users_tweets_dict:
            tweets_ids = []
            for author in type_to_users_tweets_dict[author_type]:
                tweets_ids.extend(
                    type_to_users_tweets_dict[author_type][author])
            tweets = self.get_tweets(tweets_ids, str(author_type))
            self.twitter_rest_api._save_posts_and_authors(
                tweets, str(author_type))

    def load_file(self, file_path):
        file = open(file_path, 'r', newline='')
        csv_file = csv.reader(file, delimiter=' ')
        type_to_users_tweets = defaultdict(dict)
        for row in csv_file:
            author_type = row[0]
            author_id = row[1]
            tweet_ids = row[2:]
            type_to_users_tweets[author_type][author_id] = tweet_ids
        file.close()
        return type_to_users_tweets

    def get_tweets(self, tweets_ids, author_type=""):
        return self.twitter_rest_api.get_tweets_by_ids(tweets_ids, author_type)
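load_file above evidently expects space-delimited rows of the form <author_type> <author_id> <tweet_id> ...; a quick sketch of its per-row parsing on one hypothetical row:

# Hypothetical row; mirrors load_file's row handling.
from collections import defaultdict

row = "bot 149159975 111 222 333".split(" ")
type_to_users_tweets = defaultdict(dict)
type_to_users_tweets[row[0]][row[1]] = row[2:]
print(dict(type_to_users_tweets))  # {'bot': {'149159975': ['111', '222', '333']}}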
Example #12
class US_2016_Presidential_Election_Importer(Method_Executor):
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._data_folder = self._config_parser.eval(self.__class__.__name__, "data_folder")

        self._social_network_crawler = Twitter_Rest_Api(db)

    def retrieve_tweets_from_scratch(self):
        file_names = os.listdir(self._data_folder)

        for file_name in file_names:
            lines = [line.rstrip('\n') for line in open(self._data_folder + file_name)]

            num_of_tweet_ids = len(lines)
            msg = "\r Number of tweets ids left to retrieve is: {0}".format(num_of_tweet_ids)
            print(msg, end="")

            self._social_network_crawler.get_tweets_by_tweet_ids_and_add_to_db(lines)


    def continue_retrieving_tweets_in_case_of_crush(self):
        file_names = os.listdir(self._data_folder)
        for file_name in file_names:
            total_tweet_ids = [line.rstrip('\n') for line in open(self._data_folder + file_name)]
            total_tweet_ids = set(total_tweet_ids)

            already_found_tweets_ids_tuples = self._db.get_post_osn_ids()
            already_found_tweets_ids = [tweets_ids_tuple[0] for tweets_ids_tuple in already_found_tweets_ids_tuples]
            already_found_tweets_ids = set(already_found_tweets_ids)

            left_to_retrieve_tweet_ids = total_tweet_ids - already_found_tweets_ids
            left_to_retrieve_tweet_ids = list(left_to_retrieve_tweet_ids)
            num_of_tweet_ids_to_retrieve = len(left_to_retrieve_tweet_ids)
            msg = "\r Number of tweets ids left to retrieve is: {0}".format(num_of_tweet_ids_to_retrieve)
            print(msg, end="")

            self._social_network_crawler.get_tweets_by_tweet_ids_and_add_to_db(left_to_retrieve_tweet_ids)
Example #13
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._data_folder = self._config_parser.eval(self.__class__.__name__, "data_folder")

        self._social_network_crawler = Twitter_Rest_Api(db)
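continue_retrieving_tweets_in_case_of_crush (Example #12) resumes a crashed run by set-subtracting the post ids already stored in the DB from the ids listed in the input files. The core of that resume step, with hypothetical ids:

# Resume logic in isolation, using hypothetical tweet ids.
total_tweet_ids = {"111", "222", "333"}
already_found_tweets_ids = {"222"}
left_to_retrieve_tweet_ids = list(total_tweet_ids - already_found_tweets_ids)
print(sorted(left_to_retrieve_tweet_ids))  # ['111', '333']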
Example #14
class MissingVicoPostsRetriever():
    def __init__(self):
        config_parser = getConfig()
        logging.config.fileConfig(getConfig().get("DEFAULT",
                                                  "logger_conf_file"))
        logger = logging.getLogger(getConfig().get("DEFAULT", "logger_name"))

        logger.info("Start Execution ... ")

        self._missing_retweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__,
            "missing_retweets_not_retrived_from_vico_file_name")
        self._missing_tweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__,
            "missing_tweets_not_retrived_from_vico_file_name")
        self._retweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "retweets_retrieved_from_vico_file_name")
        self._tweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "tweets_retrieved_from_vico_file_name")
        self._path = config_parser.get(self.__class__.__name__, "path")
        self._backup_path = config_parser.get(self.__class__.__name__,
                                              "backup_path")
        self._csv_header = config_parser.eval(self.__class__.__name__,
                                              "csv_header")
        self._csv_header_bad_actors_vico_retrieved_posts = config_parser.eval(
            self.__class__.__name__,
            "csv_header_bad_actors_vico_retrieved_posts")

        targeted_twitter_post_ids = config_parser.get(
            "BadActorsCollector", "targeted_twitter_post_ids")
        self._targeted_twitter_post_ids = create_ids_from_config_file(
            targeted_twitter_post_ids)

        self._original_statuses = config_parser.eval(self.__class__.__name__,
                                                     "original_statuses")

        self._csv_importer = PostCSVExporter()

        self._social_network_crawler = Twitter_Rest_Api()

        self._db = DB()
        self._db.setUp()

    def execute(self):
        '''
        id = 714718743973208064
        id = 3190956770
        timeline = self._social_network_crawler.get_timeline_by_user_id(id)
        x = 3
        '''
        #timelines = self.collect_bad_actors_not_retrieved_from_vico_timelines()
        #self.export_retweets_vico_not_retrieved(timelines)
        self.export_tweets_vico_not_retrieved()
        #self.export_tweets_retrieved_from_vico()
        #self.export_retweets_vico_retrieved()

    def export_retweets_vico_not_retrieved(self, bad_actors_timelines):
        #
        # A retweet is defined as a post that has no text of its own. It always starts with RT @creator.
        # If you reply to a tweet, it is not considered a retweet.
        # A retweet from the timeline always has a retweeted_status object which includes the original status.
        # The retweet's text always starts with "RT @creator:" followed by the original text.
        #
        missing_retweets = []

        for timeline in bad_actors_timelines:
            missing_post = self.find_missing_retweet(timeline)
            if missing_post is not None:
                missing_retweets.append(missing_post)

        if len(missing_retweets) > 0:
            self.move_existing_file_to_backup(
                self._path, self._backup_path,
                self._missing_retweets_not_retrived_from_vico_file_name)
            missing_posts_content = self.create_missing_posts_content_for_csv(
                missing_retweets)
            full_path_file_name = self._path + self._missing_retweets_not_retrived_from_vico_file_name
            self._csv_importer.write_content_to_csv(missing_posts_content,
                                                    full_path_file_name,
                                                    self._csv_header)

    def find_missing_retweet(self, timeline):
        for post in timeline:
            retweeted_status = post.retweeted_status
            if retweeted_status is not None:
                original_post_id = retweeted_status.id

                for post_id in self._targeted_twitter_post_ids:
                    if original_post_id == post_id:
                        return post
        return None

    def create_missing_posts_content_for_csv(self, missing_posts):
        missing_posts_content = []

        for missing_post in missing_posts:
            post_twitter_id = str(missing_post.id)
            missing_author_screen_name = missing_post.user.screen_name
            content = missing_post.text
            created_at = missing_post.created_at
            url = "http://twitter.com/" + missing_author_screen_name + "/status/" + str(
                post_twitter_id)
            user_mentions = missing_post.user_mentions
            if len(user_mentions) > 0:
                user_mention = user_mentions[0]

                original_author_twitter_id = str(user_mention.id)
                original_author_screen = user_mention.screen_name

            else:
                urls = missing_post.urls
                original_url = urls[0]
                original_url = original_url.expanded_url
                relevant_part = original_url.split("https://twitter.com/", 1)
                screen_name_status_id = relevant_part[1].split("/status/", 1)
                original_author_twitter_id = str(screen_name_status_id[1])
                original_author_screen = screen_name_status_id[0]

            missing_post_content = [
                post_twitter_id, missing_author_screen_name, content,
                created_at, url, original_author_twitter_id,
                original_author_screen
            ]
            missing_posts_content.append(missing_post_content)

        return missing_posts_content

    def move_existing_file_to_backup(self, original_path, backup_path,
                                     file_name):
        logging.info("move_existing_file_to_backup ")
        full_path_output_file = original_path + file_name
        if os.path.isfile(full_path_output_file):
            full_path_backup_output_file = backup_path + file_name
            if os.path.isfile(full_path_backup_output_file):
                os.remove(full_path_backup_output_file)
            os.rename(full_path_output_file, full_path_backup_output_file)

    def export_tweets_vico_not_retrieved(self):
        bad_actors_timelines = self.collect_bad_actors_timelines()

        missing_tweets = []

        for timeline in bad_actors_timelines:
            missing_post = self.find_missing_tweet(timeline)
            if missing_post is not None:
                missing_tweets.append(missing_post)

        if len(missing_tweets) > 0:
            self.move_existing_file_to_backup(
                self._path, self._backup_path,
                self._missing_tweets_not_retrived_from_vico_file_name)
            missing_posts_content = self.create_missing_posts_content_for_csv(
                missing_tweets)
            full_path_file_name = self._path + self._missing_tweets_not_retrived_from_vico_file_name
            self._csv_importer.write_content_to_csv(missing_posts_content,
                                                    full_path_file_name,
                                                    self._csv_header)

    def find_missing_tweet(self, timeline):
        for post in timeline:
            retweeted_status = post.retweeted_status
            if retweeted_status is None:
                urls = post.urls
                if len(urls) > 0:
                    url = urls[0]
                    original_status_url = url.expanded_url
                    if original_status_url in self._original_statuses:
                        return post
        return None

    def collect_bad_actors_not_retrieved_from_vico_timelines(self):
        timelines = []
        bad_actors_not_found_by_vico_authors_ids = self._db.get_bad_actor_retweeters_not_retrieved_from_vico()
        for id in bad_actors_not_found_by_vico_authors_ids:
            timeline = self._social_network_crawler.get_timeline_by_user_id(id)
            if timeline is not None:
                timelines.append(timeline)
        return timelines

    def collect_bad_actors_timelines(self):
        timelines = []
        bad_actors_not_found_by_vico_authors_ids = self._db.get_bad_actor_ids()
        for id in bad_actors_not_found_by_vico_authors_ids:
            timeline = self._social_network_crawler.get_timeline_by_user_id(id)
            if timeline is not None:
                timelines.append(timeline)
        return timelines

    def export_retweets_vico_retrieved(self):
        retweets = self._db.get_bad_actors_retweets_retrieved_by_vico()
        self.move_existing_file_to_backup(
            self._path, self._backup_path,
            self._retweets_retrieved_from_vico_file_name)
        vico_retweets_content = self.create_bad_actors_posts_content_for_csv(
            retweets)
        full_path_file_name = self._path + self._retweets_retrieved_from_vico_file_name
        self._csv_importer.write_content_to_csv(
            vico_retweets_content, full_path_file_name,
            self._csv_header_bad_actors_vico_retrieved_posts)

    def create_bad_actors_posts_content_for_csv(self, retweets):
        retweets_content = []

        for retweet in retweets:
            post_id = str(retweet.post_id)
            author = retweet.author
            guid = retweet.guid
            title = retweet.title
            url = retweet.url
            date = retweet.date
            content = retweet.content
            domain = retweet.domain
            author_guid = retweet.author_guid

            retweet_content = [
                post_id, author, guid, title, url, date, content, domain,
                author_guid
            ]
            retweets_content.append(retweet_content)

        return retweets_content

    def export_tweets_retrieved_from_vico(self):
        tweets = self._db.get_bad_actor_tweets_from_vico()
        self.move_existing_file_to_backup(
            self._path, self._backup_path,
            self._tweets_retrieved_from_vico_file_name)

        vico_tweets_content = self.create_bad_actors_posts_content_for_csv(
            tweets)
        full_path_file_name = self._path + self._tweets_retrieved_from_vico_file_name
        self._csv_importer.write_content_to_csv(
            vico_tweets_content, full_path_file_name,
            self._csv_header_bad_actors_vico_retrieved_posts)

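The fallback branch of create_missing_posts_content_for_csv recovers the original author from an expanded status URL. A worked sketch with a hypothetical URL; note that the second element is the status id, so the original_author_twitter_id name above is arguably misleading:

# Hypothetical expanded URL; mirrors the fallback branch's string splitting.
original_url = "https://twitter.com/TechmarketNG/status/714718743973208064"
relevant_part = original_url.split("https://twitter.com/", 1)
screen_name_status_id = relevant_part[1].split("/status/", 1)
print(screen_name_status_id)  # ['TechmarketNG', '714718743973208064']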
Example #15
class Generic_Twitter_Crawler(object):
    def __init__(self, db):
        # AbstractController.__init__(self, db)
        self._db = db
        self._twitter_rest_api = Twitter_Rest_Api(db)
        self._config_parser = getConfig()
        self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))
        self._users_to_add = []
        self._post_to_add = []

    def retrieve_and_save_data_from_twitter_by_terms(self, keywords, terms,
                                                     topics):
        posts, total_twitter_users, connections = self.get_posts_and_authors_by_terms(
            keywords)
        self._db.addPosts(posts)
        self._add_users_to_db(total_twitter_users)

        self._db.addPosts(terms)
        self._db.addPosts(topics)
        self._db.addPosts(connections)

    def commit_db(self):
        self._db.addPosts(self._post_to_add)
        self._add_users_to_db(self._users_to_add)
        self._db.commit()
        self._users_to_add = []
        self._post_to_add = []

    def retrieve_and_save_data_from_twitter_by_post_id(self, post_id, label):
        post, user = self.get_post_and_author_by_post_id(post_id)
        try:
            converted_post = self._db.create_post_from_tweet_data(
                post, self._domain)
            converted_post.post_type = label
            self._users_to_add.append(user)
            self._post_to_add.append(converted_post)

        except TwitterError as e:
            exception_response = e[0][0]
            logging.info("e.massage =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))

            if code == 88:
                sec = self._twitter_rest_api.get_sleep_time_for_twitter_status_id()
                logging.info("Seconds to wait from caught crash is: " + str(sec))
                if sec != 0:
                    commons.count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                return self.retrieve_and_save_data_from_twitter_by_post_id(
                    post_id, label)

    def get_posts_and_authors_by_terms(self, keywords):
        term_tweets_dict = self.get_posts_by_terms(keywords)
        total_twitter_users = []
        total_posts = []
        connections = []
        for term, tweets in term_tweets_dict.iteritems():
            posts = []
            for tweet in tweets:
                post = self._db.create_post_from_tweet_data(
                    tweet, self._domain)
                term_post_connection, term_author_connection = self._create_connections(
                    term, post)
                connections.append(term_post_connection)
                connections.append(term_author_connection)
                posts.append(post)
            total_posts += posts
            #posts += [self._db.create_post_from_tweet_data(tweet, self._domain) for tweet in term_tweets_dict[term]]
            total_twitter_users += [
                post.user for post in term_tweets_dict[term]
            ]
        return total_posts, total_twitter_users, connections

    def get_post_and_author_by_post_id(self, post_id):
        post = self._twitter_rest_api.get_post_by_post_id(post_id)
        user = post.user
        return post, user

    def _add_users_to_db(self, total_twitter_users):
        author_type = None
        insertion_type = None
        self._twitter_rest_api.save_authors_and_connections(
            total_twitter_users, author_type, insertion_type)

    def get_posts_by_terms(self, terms):
        return self._twitter_rest_api.get_posts_by_terms(terms)

    def _create_connections(self, term, post):
        term_post_connection = AuthorConnection()

        term_post_connection.source_author_guid = term
        term_post_connection.destination_author_guid = post.post_id
        term_post_connection.connection_type = u"term-post"

        term_author_connection = AuthorConnection()
        term_author_connection.source_author_guid = term
        term_author_connection.destination_author_guid = post.author_guid
        term_author_connection.connection_type = u"term-author"

        return term_post_connection, term_author_connection
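_create_connections encodes term-to-post and term-to-author edges as AuthorConnection rows keyed by guids. A sketch of the edge shape with stand-in values (the real AuthorConnection is the project's ORM model and has more columns):

# Stand-in class to show the edge shape produced by _create_connections.
class AuthorConnection(object):
    pass

edge = AuthorConnection()
edge.source_author_guid = u"election"      # hypothetical term
edge.destination_author_guid = u"123456"   # hypothetical post_id (or author_guid)
edge.connection_type = u"term-post"        # or u"term-author"
print((edge.source_author_guid, edge.destination_author_guid, edge.connection_type))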
Example #16
class PostManager(Method_Executor):
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._twitter_api = TwitterApiRequester()
        self._social_network_crawler = Twitter_Rest_Api(db)
        self._influence_strategy = self._config_parser.eval(self.__class__.__name__, "post_strategy")
        self._source_group = self._config_parser.eval(self.__class__.__name__, "source_group")
        self._target_group = self._config_parser.eval(self.__class__.__name__, "target_group")
        self._user_id = self._config_parser.eval(self.__class__.__name__, "user_id")
        self._number_of_posts = self._config_parser.eval(self.__class__.__name__, "number_of_posts")
        self._retweet_precent = self._config_parser.eval(self.__class__.__name__, "retweet_precent")
        self._related_hashtags = self._config_parser.eval(self.__class__.__name__, "related_hashtags")
        self._posts_num = self._config_parser.eval(self.__class__.__name__, "posts_num")

    def _publish_post(self, post, message, media):
        self._twitter_api = TwitterApiRequester()
        statuses = self._twitter_api.api.PostUpdate(message, media)
        activity = self._db.create_activity(self._user_id, post.post_osn_id, statuses.id, 'twitter_post', 'twitter',
                                            message, datetime.datetime.utcnow(), "twitter")
        return activity

    def _retweet_post(self, post):
        self._twitter_api = TwitterApiRequester()
        statuses = self._twitter_api.api.PostRetweet(post.post_osn_id, trim_user=False)
        activity = self._db.create_activity(self._user_id, post.post_osn_id, statuses.id, 'twitter_retweet', 'twitter',
                                            post.content, datetime.datetime.utcnow(), "twitter")
        return activity

    def _get_posts(self):

        team_guid = self._db.get_author_guid_by_screen_name(self._source_group)
        team_posts = []
        if (len(team_guid) == 1):
            team_posts = self._db.get_posts_by_author_guid(team_guid[0])
        else:
            for i in team_guid:
                team_posts.append(self._db.get_posts_by_author_guid(i))
            team_posts = [sublist for item in team_posts for sublist in item]

        return self._split_into_retweet_and_tweet(team_posts)

    def _split_into_retweet_and_tweet(self, team_posts):
        team_posts_without_retweet = []
        team_posts_with_retweet = []
        for post in team_posts:
            prefix = str(post.content[0:2])
            if prefix != "RT":
                team_posts_without_retweet.append(post)
            else:
                team_posts_with_retweet.append(post)
        return team_posts_without_retweet, team_posts_with_retweet

    def influence_strategy_sort(self, team_posts_without_retweet, team_posts_with_retweet):
        if self._influence_strategy == "last":
            team_posts_without_retweet.sort(key=lambda x: x.date, reverse=True)
            team_posts_with_retweet.sort(key=lambda x: x.date, reverse=True)

        if self._influence_strategy == "popular":
            team_posts_without_retweet.sort(key=lambda x: x.favorite_count, reverse=True)
            team_posts_with_retweet.sort(key=lambda x: x.favorite_count, reverse=True)

        return team_posts_without_retweet, team_posts_with_retweet

    def _create_tweet_content(self, post):

        tweet_length = 270
        message = post.content
        media = post.media_path
        message = message + '\n' + "@" + self._target_group + " #" + self._target_group + " " + str(
            datetime.datetime.utcnow())
        for i in self._related_hashtags:
            if (len(message + " " + i) > tweet_length):
                break
            else:
                message = message + " " + i
        if (len(message) > tweet_length):
            message = message[0:tweet_length]

        return message, media

    def _post_func(self, post, team_posts_without_retweet):
        try:
            message, media = self._create_tweet_content(post)
            activity = self._publish_post(post, message, media)
            flag = 'True'
            print("date: " + str(datetime.datetime.utcnow()) + " post number: " + str(
                self._posts_num) + " succeed to send a tweet")
            self._db.addPosts([activity])
            return str(flag), team_posts_without_retweet
        except Exception as e:
            print("Failed  {}".format(e))
            flag = False
            if (len(team_posts_without_retweet) >= 1):
                del team_posts_without_retweet[0]
            return str(flag), team_posts_without_retweet

    def _retweet_func(self, post, team_posts_with_retweet):
        try:
            activity = self._retweet_post(post)
            flag = 'True'
            print("date: " + str(datetime.datetime.utcnow()) + " post number: " + str(
                self._posts_num) + " succeed to send a retweet")
            self._db.addPosts([activity])
            return str(flag), team_posts_with_retweet
        except Exception as e:
            print("Failed  {}".format(e))
            flag = False
            if (len(team_posts_with_retweet) >= 1):
                del team_posts_with_retweet[0]
            return str(flag), team_posts_with_retweet

    def _execute_post_process(self, team_posts_without_retweet, team_posts_with_retweet):

        if (self._number_of_posts <= self._posts_num):
            return
        else:
            team_posts_without_retweet, team_posts_with_retweet = self.influence_strategy_sort(
                team_posts_without_retweet, team_posts_with_retweet)

            flag = 'False'
            self._posts_num = self._posts_num + 1
            coin = random.uniform(0, 1)

            while flag == 'False':
                if (coin >= self._retweet_precent):
                    post, team_posts_without_retweet, team_posts_with_retweet = self._selecting_post(
                        team_posts_without_retweet, team_posts_with_retweet, "post")
                else:
                    post, team_posts_without_retweet, team_posts_with_retweet = self._selecting_post(
                        team_posts_without_retweet, team_posts_with_retweet, "retweet")

                if (coin >= self._retweet_precent):
                    flag, team_posts_without_retweet = self._post_func(post, team_posts_without_retweet)

                else:
                    flag, team_posts_with_retweet = self._retweet_func(post, team_posts_with_retweet)


    def _selecting_post(self, team_posts_without_retweet, team_posts_with_retweet, type):

        post_exist = True
        while post_exist == True:
            if type == "post":
                ans = team_posts_without_retweet[0]
                message = ans.content
                while "@" + self._target_group in message:
                    if (len(team_posts_without_retweet) >= 1):
                        del team_posts_without_retweet[0]
                        ans = team_posts_without_retweet[0]
                        message = ans.content
                    else:
                        print("End of tweets")
            else:
                ans = team_posts_with_retweet[0]

            post_exist = self._db.check_if_post_sent(ans, self._user_id)
            if (post_exist == True):
                if type == "post":
                    if (len(team_posts_without_retweet) >= 1):
                        del team_posts_without_retweet[0]
                    else:
                        print("End of tweets")
                else:
                    if (len(team_posts_with_retweet) >= 1):
                        del team_posts_with_retweet[0]
                    else:
                        print("End of tweets")

        return ans, team_posts_without_retweet, team_posts_with_retweet

    def time_schedule(self):

        minutes_in_a_day = 24 * 60
        minute_window = float(minutes_in_a_day) / self._number_of_posts

        while True:
            self._posts_num = 0
            self._convert_timeline_tweets_to_posts_for_author_screen_names(self._source_group)
            without_retweet, with_retweet = self._get_posts()
            self._execute_post_process(without_retweet, with_retweet)

            schedule.every(minute_window).minutes.do(self._execute_post_process, without_retweet, with_retweet)

            while True:
                if (self._number_of_posts <= self._posts_num):
                    break
                schedule.run_pending()
                time.sleep(1)

    def calculate_posts_stat(self):

        author_guid = "0927dc1a-8bcb-3488-99ed-7a962aee56e2"
        date = "2020-03-04 03:28:20"

        ids = self._db.source_destination()
        author_posts = self._db.posts_statics_from_date(author_guid, date)
        author_posts_guid = self._db.posts_statics_guids(author_guid, date)
        author_posts_guid = [ids[i] for i in author_posts_guid]
        influencers_posts = self._db.posts_statics_from_date_for_specific_posts(author_posts_guid)

        df1 = pd.DataFrame(author_posts,
                           columns=['author_guid', 'post_count', 'retweet_sum', 'favorite_sum', 'retweet_avg',
                                    'favorite_avg'])
        df2 = pd.DataFrame(influencers_posts,
                           columns=['author_guid', 'post_count', 'retweet_sum', 'favorite_sum', 'retweet_avg',
                                    'favorite_avg'])
        frames = [df1, df2]
        result = pd.concat(frames)
        result.to_csv(author_guid + ".csv")

    def _convert_timeline_tweets_to_posts_for_author_screen_names(self, author_screen_names):
        posts = []
        for i, account_screen_name in enumerate(author_screen_names):
            try:

                timeline_tweets = self._social_network_crawler.get_timeline(account_screen_name, 3200)
                if timeline_tweets is not None:
                    print("\rSearching timeline tweets for author_guid: {0} {1}/{2} retrieved:{3}".format(
                        account_screen_name, i,
                        len(author_screen_names), len(timeline_tweets)),
                          end='')
                    for timeline_tweet in timeline_tweets:
                        post = self._db.create_post_from_tweet_data_api(timeline_tweet, self._domain)
                        posts.append(post)
            except requests.exceptions.ConnectionError as errc:
                # placeholder no-op: a transient connection error skips this author
                x = 3


            except TwitterError as e:
                if e.message == "Not authorized.":
                    logging.info("Not authorized for user id: {0}".format(account_screen_name))
                    continue

        self._db.addPosts(posts)
        self.fill_data_for_sources()

    def fill_author_guid_to_posts(self):
        posts = self._db.get_all_posts()
        num_of_posts = len(posts)
        for i, post in enumerate(posts):
            msg = "\rPosts to fill: [{0}/{1}]".format(i, num_of_posts)
            print(msg, end="")
            post.author_guid = compute_author_guid_by_author_name(post.author)
        self._db.addPosts(posts)
        self._db.insert_or_update_authors_from_posts(self._domain, {}, {})

    def fill_data_for_sources(self):
        print("---complete_missing_information_for_authors_by_screen_names ---")

        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names_by_posts()
        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        i = 1
        for author_screen_names in self._split_into_equal_chunks(twitter_author_screen_names, 10000):
            twitter_users = self._social_network_crawler.handle_get_users_request(
                author_screen_names, are_user_ids, author_type, inseration_type)

            print('retrieve authors {}/{}'.format(i * 10000,
                                                  len(twitter_author_screen_names)))
            i += 1
            self._social_network_crawler.save_authors_and_connections(twitter_users, author_type, inseration_type)

        self.fill_author_guid_to_posts()

        print("---complete_missing_information_for_authors_by_screen_names was completed!!!!---")
        #logging.info("---complete_missing_information_for_authors_by_screen_names was completed!!!!---")

    def _split_into_equal_chunks(self, elements, num_of_chunks):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(elements), num_of_chunks):
            yield elements[i:i + num_of_chunks]
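Despite the parameter name, num_of_chunks in _split_into_equal_chunks is used as a chunk size, matching its docstring. A standalone check:

# Standalone copy of the generator above, for a quick sanity check.
def split_into_equal_chunks(elements, chunk_size):
    for i in range(0, len(elements), chunk_size):
        yield elements[i:i + chunk_size]

print(list(split_into_equal_chunks(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]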
Example #17
class Generic_Twitter_Crawler(object):
    def __init__(self, db):
        # AbstractController.__init__(self, db)
        self._db = db
        self._twitter_rest_api = Twitter_Rest_Api(db)
        self._config_parser = getConfig()
        self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))

    def retrieve_and_save_data_from_twitter_by_terms(self, terms):
        posts, total_twitter_users = self.get_posts_and_authors_by_terms(terms)
        self._db.addPosts(posts)
        self._add_users_to_db(total_twitter_users)

    def retrive_and_save_data_from_twitter_by_post_id(self, post_id, label):
        post, user = self.get_post_and_author_by_post_id(post_id)
        try:
            converted_post = self._db.create_post_from_tweet_data(
                post, self._domain)
            converted_post.post_type = label
            self._db.addPost(converted_post)
            self._add_users_to_db([user])
            self._db.commit()

        except TwitterError as e:
            exception_response = e[0][0]
            logging.info("e.massage =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))

            if code == 88:
                sec = self._twitter_rest_api.get_sleep_time_for_twitter_status_id()
                logging.info("Seconds to wait from caught crash is: " + str(sec))
                if sec != 0:
                    commons.count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                return self.retrive_and_save_data_from_twitter_by_post_id(
                    post_id, label)

    def get_posts_and_authors_by_terms(self, terms):
        term_posts_dictionary = self.get_posts_by_terms(terms)
        total_twitter_users = []
        posts = []
        for term in term_posts_dictionary:
            posts += [
                self._db.create_post_from_tweet_data(tweet, self._domain)
                for tweet in term_posts_dictionary[term]
            ]
            total_twitter_users += [
                post.user for post in term_posts_dictionary[term]
            ]
        return posts, total_twitter_users

    def get_post_and_author_by_post_id(self, post_id):
        post = self._twitter_rest_api.get_post_by_post_id(post_id)
        user = post.user
        return post, user

    def _add_users_to_db(self, total_twitter_users):
        author_type = None
        insertion_type = None
        self._twitter_rest_api.save_authors_and_connections(
            total_twitter_users, author_type, insertion_type)

    def get_posts_by_terms(self, terms):
        return self._twitter_rest_api.get_posts_by_terms(terms)
Example #18
class MissingDataComplementor(Method_Executor):
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._actions = self._config_parser.eval(self.__class__.__name__,
                                                 "actions")

        self._minimal_num_of_posts = self._config_parser.eval(
            self.__class__.__name__, "minimal_num_of_posts")
        self._limit_friend_follower_number = self._config_parser.eval(
            self.__class__.__name__, "limit_friend_follower_number")
        self._maximal_tweets_count_in_timeline = self._config_parser.eval(
            self.__class__.__name__, "maximal_tweets_count_in_timeline")

        self._found_twitter_users = []
        self._social_network_crawler = Twitter_Rest_Api(db)
        self._suspended_authors = []
        self._max_users_without_saving = self._config_parser.eval(
            self.__class__.__name__, "max_users_without_saving")
        self._posts = []
        self._authors = []
        self._post_citatsions = []

    def setUp(self):
        pass

    def fill_author_guid_to_posts(self):
        posts = self._db.get_posts()
        num_of_posts = len(posts)
        for i, post in enumerate(posts):
            msg = "\rPosts to fill: [{0}/{1}]".format(i, num_of_posts)
            print(msg, end="")
            post.author_guid = compute_author_guid_by_author_name(post.author)
        self._db.addPosts(posts)
        self._db.insert_or_update_authors_from_posts(self._domain, {}, {})

    def fill_data_for_followers(self):
        self._fill_data_for_author_connection_type(
            Author_Connection_Type.FOLLOWER)
        logging.info("---Finished crawl_followers_by_author_ids")

    def fill_data_for_friends(self):
        self._fill_data_for_author_connection_type(
            Author_Connection_Type.FRIEND)
        logging.info("---Finished crawl_friends_by_author_ids")

    def _fill_data_for_author_connection_type(self, connection_type):
        # TEST
        self._db.get_authors_by_domain("Microblog")
        # TEST

        cursor = self._db.get_followers_or_friends_candidats(
            connection_type, self._domain, self._limit_friend_follower_number)
        followers_or_friends_candidats = self._db.result_iter(cursor)
        followers_or_friends_candidats = [
            author_id[0] for author_id in followers_or_friends_candidats
        ]
        print("---crawl_followers_by_author_ids---")
        author_type = None
        are_user_ids = True
        insertion_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        crawl_users_by_author_ids_func_name = "crawl_users_by_author_ids"
        getattr(self._social_network_crawler,
                crawl_users_by_author_ids_func_name)(
                    followers_or_friends_candidats, connection_type,
                    author_type, are_user_ids, insertion_type)
        self._db.convert_temp_author_connections_to_author_connections(
            self._domain)

    def crawl_followers_by_author_ids(self, author_ids):
        print("---crawl_followers_by_author_ids---")
        author_type = None
        are_user_ids = True
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        self._social_network_crawler.crawl_followers_by_twitter_author_ids(
            author_ids, author_type, are_user_ids, inseration_type)

    def crawl_friends_by_author_ids(self, author_ids):
        print("---crawl_friends_by_author_ids---")
        author_type = None
        are_user_ids = True
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        self._social_network_crawler.crawl_friends_by_twitter_author_ids(
            author_ids, author_type, are_user_ids, inseration_type)

    def create_author_screen_names(self):
        screen_names = self._db.get_screen_names_for_twitter_authors_by_posts()
        return screen_names

    def fill_data_for_sources(self):
        print(
            "---complete_missing_information_for_authors_by_screen_names ---")
        logging.info(
            "---complete_missing_information_for_authors_by_screen_names ---")
        # twitter_author_screen_names = self.create_author_screen_names()
        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names()
        # twitter_author_screen_names = (twitter_author.name for twitter_author in twitter_authors)
        # twitter_author_screen_names = list(twitter_author_screen_names)

        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        total_twitter_users = self._social_network_crawler.handle_get_users_request(
            twitter_author_screen_names, are_user_ids, author_type,
            inseration_type)

        self._social_network_crawler.save_authors_and_connections(
            total_twitter_users, author_type, inseration_type)

        print(
            "---complete_missing_information_for_authors_by_screen_names was completed!!!!---"
        )
        logging.info(
            "---complete_missing_information_for_authors_by_screen_names was completed!!!!---"
        )
        return total_twitter_users

    def complete_missing_information_for_authors_by_ids(self):
        print("---complete_missing_information_for_authors_by_ids ---")
        logging.info("---complete_missing_information_for_authors_by_ids ---")
        # twitter_author_screen_names = self.create_author_screen_names()
        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names()
        # twitter_author_screen_names = (twitter_author.name for twitter_author in twitter_authors)
        # twitter_author_screen_names = list(twitter_author_screen_names)

        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        total_twitter_users = self._social_network_crawler.handle_get_users_request(
            twitter_author_screen_names, are_user_ids, author_type,
            inseration_type)
        # return self._found_twitter_users
        print(
            "---complete_missing_information_for_authors was completed!!!!---")
        logging.info(
            "---complete_missing_information_for_authors was completed!!!!---")
        return total_twitter_users

    def mark_suspended_or_not_existed_authors(self):
        suspended_authors = self._db.get_authors_for_mark_as_suspended_or_not_existed()
        for suspended_author in suspended_authors:
            suspended_author.is_suspended_or_not_exists = self._window_start
            self._db.set_inseration_date(
                suspended_author, DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR)
        self._social_network_crawler.save_authors(suspended_authors)

    def mark_suspended_from_twitter(self):
        self._suspended_authors = []
        suspected_authors = self._db.get_not_suspended_authors(self._domain)
        suspected_authors_names = [author.name for author in suspected_authors]
        chunks = split_into_equal_chunks(
            suspected_authors_names, self._social_network_crawler.
            _maximal_user_ids_allowed_in_single_get_user_request)
        total_chunks = list(chunks)
        chunks = split_into_equal_chunks(
            suspected_authors_names, self._social_network_crawler.
            _maximal_user_ids_allowed_in_single_get_user_request)
        i = 1
        for chunk_of_names in chunks:
            msg = "\rChunck of author to Twitter: [{0}/{1}]".format(
                i, len(total_chunks))
            print(msg, end="")
            i += 1
            set_of_send_author_names = set(chunk_of_names)
            set_of_received_author_names = set(
                self._social_network_crawler.
                get_active_users_names_by_screen_names(chunk_of_names))
            author_names_of_suspendend_or_not_exists = set_of_send_author_names - set_of_received_author_names
            self._update_suspended_authors_by_screen_names(
                author_names_of_suspendend_or_not_exists)
        self._db.add_authors(self._suspended_authors)

    def _update_suspended_authors_by_screen_names(
            self, author_names_of_suspendend_or_not_exists):
        for author_name in author_names_of_suspendend_or_not_exists:
            user_guid = compute_author_guid_by_author_name(
                author_name).replace("-", "")
            suspended_author = self._db.get_author_by_author_guid(user_guid)

            suspended_author.is_suspended_or_not_exists = self._window_start
            suspended_author.author_type = Author_Type.BAD_ACTOR
            self._db.set_inseration_date(
                suspended_author, DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR)
            self._suspended_authors.append(suspended_author)

            num_of_suspended_authors = len(self._suspended_authors)
            if num_of_suspended_authors == self._max_users_without_saving:
                self._db.add_authors(self._suspended_authors)
                self._suspended_authors = []

    def fill_tweet_retweet_connection(self):
        '''
        Fetches the original tweets being retweeted by our posts.
        Updates the following tables:
         * Post_Citations table with tweet-retweet connection
         * Posts table with missing tweets
         * Authors with the authors of the missing tweets
        '''
        retweets_with_no_tweet_citation = self._db.get_retweets_with_no_tweet_citation()
        logging.info(
            "Updating tweet-retweet connection of {0} retweets".format(
                len(retweets_with_no_tweet_citation)))
        self._posts = []
        self._authors = []
        self._post_citatsions = []
        i = 1
        for post_guid, post_url in retweets_with_no_tweet_citation.iteritems():
            # logging.info("Analyzing retweet: {0} - {1}".format(post_guid, post_url))
            msg = "\r Analyzing retweet: {0} - {1} [{2}".format(
                post_guid, post_url, i) + "/" + str(
                    len(retweets_with_no_tweet_citation)) + '] '
            print(msg, end="")
            i += 1
            tweet_data = self.extract_retweet_data(retweet_guid=post_guid,
                                                   retweet_url=post_url)
            if tweet_data is not None:

                if not self._db.isPostExist(tweet_data.tweet_url):
                    post = Post(guid=tweet_data.tweet_guid,
                                post_id=tweet_data.tweet_guid,
                                url=tweet_data.tweet_url,
                                date=str_to_date(tweet_data.tweet_date),
                                title=tweet_data.tweet_content,
                                content=tweet_data.tweet_content,
                                post_osn_id=tweet_data.tweet_twitter_id,
                                retweet_count=tweet_data.tweet_retweet_count,
                                favorite_count=tweet_data.tweet_favorite_count,
                                author=tweet_data.tweet_author_name,
                                author_guid=tweet_data.tweet_author_guid,
                                domain=self._domain,
                                original_tweet_importer_insertion_date=unicode(
                                    get_current_time_as_string()))
                    self._posts.append(post)

                if not self._db.is_author_exists(tweet_data.tweet_author_guid,
                                                 self._domain):
                    author = Author(
                        name=tweet_data.tweet_author_name,
                        domain=self._domain,
                        author_guid=tweet_data.tweet_author_guid,
                        original_tweet_importer_insertion_date=unicode(
                            get_current_time_as_string()))
                    self._authors.append(author)

                if not self._db.is_post_citation_exist(tweet_data.retweet_guid,
                                                       tweet_data.tweet_guid):
                    post_citation = Post_citation(
                        post_id_from=tweet_data.retweet_guid,
                        post_id_to=tweet_data.tweet_guid,
                        url_from=tweet_data.retweet_url,
                        url_to=tweet_data.tweet_url)
                    self._post_citations.append(post_citation)

        self.update_tables_with_tweet_retweet_data(self._posts, self._authors,
                                                   self._post_citations)

    def extract_retweet_data(self, retweet_guid, retweet_url):
        '''
        :param retweet_guid: the guid of the retweet
        :param retweet_url: the url of the retweet
        :return: a RetweetData holding the data of the retweet
        '''
        try:
            retweet_id = self.extract_tweet_id(retweet_url)
            if retweet_id is None:
                return None

            retweet_status = self._social_network_crawler.get_status_by_twitter_status_id(
                retweet_id)
            tweet_status_dict = retweet_status.AsDict()
            if 'retweeted_status' in tweet_status_dict:
                tweet_status_dict = tweet_status_dict['retweeted_status']
                tweet_post_twitter_id = unicode(str(tweet_status_dict['id']))
                tweet_author_name = unicode(
                    tweet_status_dict['user']['screen_name'])
                tweet_url = unicode(
                    generate_tweet_url(tweet_post_twitter_id,
                                       tweet_author_name))
                tweet_creation_time = unicode(tweet_status_dict['created_at'])
                tweet_str_publication_date = unicode(
                    extract_tweet_publiction_date(tweet_creation_time))
                tweet_guid = unicode(
                    compute_post_guid(
                        post_url=tweet_url,
                        author_name=tweet_author_name,
                        str_publication_date=tweet_str_publication_date))
                tweet_author_guid = unicode(
                    compute_author_guid_by_author_name(tweet_author_name))
                tweet_author_guid = unicode(tweet_author_guid.replace("-", ""))
                tweet_content = unicode(tweet_status_dict['text'])
                tweet_retweet_count = unicode(
                    tweet_status_dict['retweet_count'])
                tweet_favorite_count = unicode(
                    tweet_status_dict['favorite_count'])

                retweet_data = RetweetData(
                    retweet_guid=retweet_guid,
                    retweet_url=retweet_url,
                    tweet_guid=tweet_guid,
                    tweet_url=tweet_url,
                    tweet_author_name=tweet_author_name,
                    tweet_author_guid=tweet_author_guid,
                    tweet_date=tweet_str_publication_date,
                    tweet_content=tweet_content,
                    tweet_twitter_id=tweet_post_twitter_id,
                    tweet_retweet_count=tweet_retweet_count,
                    tweet_favorite_count=tweet_favorite_count)
                return retweet_data
            else:
                return None

        except TwitterError as e:
            exception_response = e.message[0]
            logging.info("e.message = " + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code = " + str(code))

            self.update_tables_with_tweet_retweet_data(self._posts,
                                                       self._authors,
                                                       self._post_citations)
            self._posts = []
            self._authors = []
            self._post_citations = []

            if code == 88:
                sec = self._social_network_crawler.get_sleep_time_for_twitter_status_id()
                logging.info("Seconds to wait after hitting the rate limit: " +
                             str(sec))
                if sec != 0:
                    count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                return self.extract_retweet_data(retweet_guid=retweet_guid,
                                                 retweet_url=retweet_url)

        except Exception as e:
            logging.error(
                "Cannot fetch data for retweet: {0}. Error message: {1}".
                format(retweet_url, e.message))
            return None

    def extract_tweet_id(self, post_url):
        post_url = str(post_url)
        pattern = re.compile("http(.*)://twitter.com/(.*)/statuses/(.*)")
        extracted_info = pattern.findall(post_url)
        if extracted_info == []:
            pattern = re.compile("http(.*)://twitter.com/(.*)/status/(.*)")
            extracted_info = pattern.findall(post_url)
            if extracted_info == []:
                return None
        return extracted_info[0][2]

    def update_tables_with_tweet_retweet_data(self, posts, authors,
                                              post_citations):
        self._db.addPosts(posts)
        self._db.add_authors(authors)
        self._db.addReferences(post_citations)

    def fill_authors_time_line(self):
        '''
        Fetches timeline posts for the authors listed under
        authors_twitter_ids_for_timeline_filling in the config file and
        updates the DB.
        '''
        self._db.create_authors_index()
        self._db.create_posts_index()
        author_screen_names_number_of_posts = self._db.get_author_screen_names_and_number_of_posts(
            self._minimal_num_of_posts)
        author_screen_names_number_of_posts_dict = self._create_author_screen_name_number_of_posts_dictionary(
            author_screen_names_number_of_posts)
        index = 1
        for author_name in author_screen_names_number_of_posts_dict:
            print("Get timeline for {0} : {1}/{2}".format(
                author_name, str(index),
                str(len(author_screen_names_number_of_posts_dict))))
            index += 1
            posts = []
            logging.info("Fetching timeline for author: " + str(author_name))
            posts_counter = 0
            try:
                posts_needed_from_osn = self._minimal_num_of_posts - author_screen_names_number_of_posts_dict[
                    author_name]
                timeline = self._social_network_crawler.get_timeline_by_author_name(
                    author_name, posts_needed_from_osn)
                # logging.info("Retrived timeline lenght: " + str(len(timeline)))
                if timeline is not None:
                    for tweet in timeline:
                        tweet_post_twitter_id = str(tweet.id)
                        tweet_url = generate_tweet_url(tweet_post_twitter_id,
                                                       author_name)
                        if self._db.contains_post(tweet_url):
                            continue
                        posts_counter += 1
                        post = self._db.create_post_from_tweet_data(
                            tweet, self._domain)
                        posts.append(post)
            except Exception as e:
                logging.error(
                    "Cannot fetch data for author: {0}. Error message: {1}".
                    format(author_name, e.message))
            logging.info("Number of posts inserted for author {0}: {1}".format(
                author_name, posts_counter))
            self._db.addPosts(posts)

    def assign_manually_labeled_authors(self):
        self._db.assign_manually_labeled_authors()

    def delete_acquired_authors(self):
        self._db.delete_acquired_authors()
        self._db.delete_posts_with_missing_authors()

    def delete_manually_labeled_authors(self):
        self._db.delete_manually_labeled_authors()
        self._db.delete_posts_with_missing_authors()

    def assign_acquired_and_crowd_turfer_profiles(self):
        self._db.assign_crowdturfer_profiles()
        self._db.assign_acquired_profiles()

    def _create_author_screen_name_number_of_posts_dictionary(
            self, author_screen_names_number_of_posts):
        author_screen_names_number_of_posts_dict = {}
        for record in author_screen_names_number_of_posts:
            author_screen_name = record[0]
            num_of_posts = record[1]
            author_screen_names_number_of_posts_dict[
                author_screen_name] = num_of_posts
        logging.info("Number of users to retrieve timelines: " +
                     str(len(author_screen_names_number_of_posts_dict)))
        return author_screen_names_number_of_posts_dict
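A standalone sketch of the status-id extraction implemented by extract_tweet_id above; this is an illustrative rewrite, not the original helper, and assumes only the two permalink layouts the class handles.

import re


def extract_status_id(post_url):
    # Twitter permalinks have used both /statuses/ and /status/ path segments.
    for path_segment in ("statuses", "status"):
        match = re.search(r"https?://twitter\.com/[^/]+/%s/(\d+)" % path_segment,
                          str(post_url))
        if match:
            return match.group(1)
    return None


assert extract_status_id("https://twitter.com/someuser/status/123456") == "123456"
assert extract_status_id("https://example.com/not-a-tweet") is None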
Example #19
class BadActorsCollector(Method_Executor):
    def __init__(self, db):
        Method_Executor.__init__(self, db)

        self._actions = self._config_parser.eval(self.__class__.__name__,
                                                 "actions")

        self._targeted_twitter_author_ids = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_author_ids")

        self._targeted_twitter_post_ids = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_post_ids")

        self._targeted_twitter_author_names = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_author_names")

        self._social_network_crawler = Twitter_Rest_Api(db)

    def setUp(self):
        pass

    def crawl_bad_actors_followers(self):
        print("---crawl_bad_actors_followers_and_retweeters ---")
        bad_actor_type = Author_Type.BAD_ACTOR
        bad_actors_collector_insertion_type = DB_Insertion_Type.BAD_ACTORS_COLLECTOR
        connection_type = Author_Connection_Type.FOLLOWER
        are_user_ids = True
        self._social_network_crawler.crawl_users_by_author_ids(
            self._targeted_twitter_author_ids, connection_type, bad_actor_type,
            are_user_ids, bad_actors_collector_insertion_type)

        self._db.convert_temp_author_connections_to_author_connections(
            self._domain)

    def crawl_bad_actors_retweeters(self):
        bad_actor_type = Author_Type.BAD_ACTOR
        bad_actors_collector_insertion_type = DB_Insertion_Type.BAD_ACTORS_COLLECTOR
        are_user_ids = True
        self._social_network_crawler.crawl_retweeters_by_post_id(
            self._targeted_twitter_post_ids, are_user_ids, bad_actor_type,
            bad_actors_collector_insertion_type)

    def mark_missing_bad_retweeters(self):
        print("mark_missing_bad_retweeters_retrieved_from_vico")
        missing_bad_actors = []
        i = 0

        cursor = self._db.get_cooperated_authors(
            self._targeted_twitter_author_names, self._domain)

        targeted_twitter_author_guid_generator = self._db.result_iter(cursor)

        for missing_author_guid in targeted_twitter_author_guid_generator:
            i += 1
            missing_author_guid = unicode(missing_author_guid[0])
            result = self._db.get_author_by_author_guid_and_domain(
                missing_author_guid, self._domain)
            if len(result) > 0:
                missing_author = result[0]

                missing_author.author_type = Author_Type.BAD_ACTOR
                missing_author.mark_missing_bad_actor_retweeters_insertion_date = self._window_start

                missing_bad_actors.append(missing_author)

            else:
                logging.info("GUID = " + missing_author_guid)
        logging.info("number of missing bad actors found are:" +
                     str(len(missing_bad_actors)))
        self._db.add_authors(missing_bad_actors)
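A minimal driving sketch for the collector above; the wiring below is an assumption based on the DB/config pattern in the surrounding examples, not code from the original repository.

from DB.schema_definition import DB

db = DB()
db.setUp()

collector = BadActorsCollector(db)
collector.crawl_bad_actors_followers()
collector.crawl_bad_actors_retweeters()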
Example #20
import logging
from logging import config

from DB.schema_definition import DB
from Twitter_API.twitter_api_requester import TwitterApiRequester
from configuration.config_class import getConfig
from twitter_rest_api.twitter_rest_api import Twitter_Rest_Api

if __name__ == '__main__':
    config_parser = getConfig()
    logging.config.fileConfig(config_parser.get("Logger", "logger_conf_file"))
    logging.info("Start program...")
    print("Start program...")

    db = DB()
    db.setUp()

    social_network_crawler = Twitter_Rest_Api(db)

    # Placeholders: in the original script these values are defined elsewhere
    # (e.g. read from the configuration) before the crawl starts.
    targeted_twitter_author_id = None
    bad_actor_type = None

    targeted_twitter_author_ids = [targeted_twitter_author_id]
    are_user_ids = True
    social_network_crawler.crawl_followers_by_twitter_author_ids(
        targeted_twitter_author_ids, bad_actor_type, are_user_ids)

    logging.info("Creating TwitterApiRequester")
Example #21
class Trec2012MicroblogTrackImporter(Method_Executor):
    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._topics_path = self._config_parser.eval(self.__class__.__name__,
                                                     "topics_path")
        self._judgment_path = self._config_parser.eval(self.__class__.__name__,
                                                       "judgment_path")
        self._num_of_relevant_tweets = self._config_parser.eval(
            self.__class__.__name__, "num_of_relevant_tweets")
        self._num_of_description_words = self._config_parser.eval(
            self.__class__.__name__, "num_of_description_words")
        self._twitter_api = Twitter_Rest_Api(db)

    def load_data(self):
        topics = self._read_trec_topics(self._topics_path)
        topic_judgments = self._read_judgments(self._judgment_path)
        claims = self._extract_claims_from_judgments(topics, topic_judgments)
        self._db.addPosts(claims)
        self._create_tweet_corpus_from_judgments(self._judgment_path)

    def set_description_from_relevant(self):
        claims = self._db.get_claims()
        topic_judgments = self._read_judgments(self._judgment_path)
        posts = self._db.get_posts()
        post_dict = {p.post_id: p for p in posts}
        for claim in claims:
            topic_id = int(claim.claim_id)
            # tweets = self._twitter_api.get_tweets_by_ids(topic_judgments[topic_id][:self._num_of_relevant_tweets])
            # posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, self._domain)
            posts = list(
                map(post_dict.get,
                    topic_judgments[topic_id][:self._num_of_relevant_tweets]))
            claim_content = OrderedSet(claim.keywords.lower().split())
            for post in posts:
                list(map(claim_content.add, clean_tweet(post.content).split()))
                # if len(claim_content) > 25:
                #     break
            claim.description = clean_claim_description(
                ' '.join(claim_content), True)
        self._db.addPosts(claims)

    def set_description_from_tf_idf_best(self):
        tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
        posts = self._db.get_posts()
        corpus = [clean_claim_description(p.content, True) for p in posts]
        tf_idf_vectorizer.fit_transform(corpus)
        word_tf_idf_dict = defaultdict(
            float,
            list(
                zip(tf_idf_vectorizer.get_feature_names(),
                    tf_idf_vectorizer.idf_)))
        post_dict = {p.post_id: p for p in posts}
        topic_judgments = self._read_judgments(self._judgment_path)

        claims = self._db.get_claims()
        for i, claim in enumerate(claims):
            init_query_words = set(claim.keywords.lower().split())
            claim_content = set()
            relevant_posts_ids = topic_judgments[int(claim.claim_id)]
            words = set()
            for post_id in relevant_posts_ids:
                if post_id in post_dict:
                    words.update(
                        clean_claim_description(
                            post_dict.get(post_id).content, True).split())
            best_words = sorted(words,
                                key=lambda k: word_tf_idf_dict[k],
                                reverse=True)[:self._num_of_description_words +
                                              len(init_query_words)]
            claim_content.update(best_words)
            claim_content = claim_content - init_query_words
            claim_description = clean_claim_description(
                ' '.join(claim_content), True)
            claim.description = ' '.join(claim_description.split())

        self._db.addPosts(claims)

    def _read_trec_topics(self, topics_path):
        trec_topic_fields = ['num', 'query', 'querytime', 'querytweettime']
        TrecTopic = namedtuple('TrecTopic', trec_topic_fields)
        topic_file = open(topics_path)
        trec_topics = []
        for topic_xml in topic_file.read().split('\n\n')[:-1]:
            trec_topic_dict = xmltodict.parse(topic_xml)
            trec_topic = TrecTopic._make(
                [trec_topic_dict['top'][field] for field in trec_topic_fields])
            trec_topics.append(trec_topic)
        return trec_topics

    # def _read_judgments(self, judgment_path):
    #     judgments = pd.read_csv(judgment_path, sep=' ', names=['topic_id', 'Q', 'tweet_id', 'rel'])
    #     tweets = self._twitter_api.get_tweets_by_ids(judgments['tweet_id'], pre_save=False)
    #     topic_tweet_id_dict = dict(judgments[['topic_id', 'tweet_id']].to_records(index=False))
    #     tweet_id_rel_dict = dict(judgments[['tweet_id', 'rel']].to_records(index=False))
    #     posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, u'Trec2012')

    def _read_judgments(self, judgment_path):
        topic_high_relevant_judgment_dict = defaultdict(list)
        topic_relevant_judgment_dict = defaultdict(list)

        for topic, Q, docid, rel in csv.reader(open(judgment_path, "rb"),
                                               delimiter=' '):
            if int(rel) > 1:
                topic_high_relevant_judgment_dict[int(topic)].append(docid)
            elif int(rel) == 1:
                topic_relevant_judgment_dict[int(topic)].append(docid)
        for topic_id, tweet_ids in topic_relevant_judgment_dict.items():
            # if topic_id not in topic_high_relevant_judgment_dict:
            topic_high_relevant_judgment_dict[topic_id].extend(tweet_ids)
        return topic_high_relevant_judgment_dict

    def _extract_claims_from_judgments(self, topics, topic_judgments):
        claims = []
        for trec_topic in topics:
            # tweet_id = tweet_ids[0]
            topic_id = int(parse.parse('Number: MB{}', trec_topic.num)[0])
            # tweets = self._twitter_api.get_tweets_by_ids(topic_judgments[topic_id][:10])
            # posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, self._domain)
            claim_content = set(trec_topic.query.split())
            # for post in []:
            #     claim_content.update(clean_tweet(post.content).split())
            #     if len(claim_content) > 25:
            #         break
            claim = self._convert_trec_topic_to_claim(' '.join(claim_content),
                                                      topic_id, trec_topic)
            claims.append(claim)
        return claims

    def _convert_trec_topic_to_claim(self, claim_content, topic_id, trec_topic):
        claim = Claim()
        claim.claim_id = topic_id
        claim.verdict_date = parser.parse(trec_topic.querytime).date()
        claim.domain = 'Trec2012'
        claim.title = trec_topic.query
        claim.keywords = trec_topic.query
        claim.description = claim_content
        return claim

    def _create_tweet_corpus_from_judgments(self, judgment_path):
        judgment_df = pd.read_csv(
            judgment_path,
            delimiter=' ',
            names=['topic', 'Q', 'docid', 'rel'],
        )
        tweet_ids = judgment_df['docid'].tolist()
        tweets = self._twitter_api.get_tweets_by_ids(tweet_ids, pre_save=False)
        posts, authors = self._db.convert_tweets_to_posts_and_authors(
            tweets, 'Trec2012')
        claim_tweet_connections = []
        for post in posts:
            post.post_id = str(post.post_osn_id)
        post_osn_ids = set(p.post_osn_id for p in posts)
        for topic_id, post_osn_id in judgment_df[['topic', 'docid'
                                                  ]].to_records(index=False):
            if post_osn_id in post_osn_ids:
                claim_tweet_connection = Claim_Tweet_Connection()
                claim_tweet_connection.claim_id = str(topic_id)
                claim_tweet_connection.post_id = str(post_osn_id)
                claim_tweet_connections.append(claim_tweet_connection)
        self._db.addPosts(claim_tweet_connections)
        self._db.addPosts(posts)
        judgment_df[judgment_df['docid'].isin(post_osn_ids)].to_csv(
            judgment_path + '_filtered', sep=' ', header=False, index=False)
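A self-contained sketch of the space-separated qrels layout that _read_judgments consumes; the sample lines are invented for illustration and assume the standard "topic Q docid rel" column order used above.

import csv
from collections import defaultdict

sample_qrels_lines = [
    "1 0 1001 1",  # relevant tweet 1001 for topic 1
    "1 0 1002 2",  # highly relevant tweet 1002 for topic 1
    "2 0 2001 0",  # non-relevant tweet, filtered out below
]

topic_judgments = defaultdict(list)
for topic, q, docid, rel in csv.reader(sample_qrels_lines, delimiter=' '):
    if int(rel) >= 1:
        topic_judgments[int(topic)].append(docid)

print(dict(topic_judgments))  # {1: ['1001', '1002']}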
Example #22
class AsonamHoneypotImporter(Method_Executor):
    def __init__(self, db):
        super(AsonamHoneypotImporter, self).__init__(db)
        self.twitter_rest_api = Twitter_Rest_Api(self._db)
        self._data_path = self._config_parser.eval(self.__class__.__name__,
                                                   "data_path")