def __init__(self, db):
    """Hold the DB handle, a Twitter REST client, config, and pending-item buffers."""
    # AbstractController.__init__(self, db)
    self._db = db
    self._twitter_rest_api = Twitter_Rest_Api(db)
    config = getConfig()
    self._config_parser = config
    self._domain = unicode(config.get("DEFAULT", "domain"))
    # Buffers of entities accumulated for a later bulk commit.
    self._users_to_add = []
    self._post_to_add = []
def __init__(self, db):
    """Read topic/judgment paths and retrieval limits from this class's config section."""
    Method_Executor.__init__(self, db)
    section = self.__class__.__name__
    read = self._config_parser.eval
    self._topics_path = read(section, "topics_path")
    self._judgment_path = read(section, "judgment_path")
    self._num_of_relevant_tweets = read(section, "num_of_relevant_tweets")
    self._num_of_description_words = read(section, "num_of_description_words")
    self._twitter_api = Twitter_Rest_Api(db)
def __init__(self, db):
    """Load the action list and targeted Twitter ids/names from config; set up the crawler."""
    AbstractController.__init__(self, db)
    section = self.__class__.__name__
    read = self._config_parser.eval
    self._actions = read(section, "actions")
    self._targeted_twitter_author_ids = read(section, "targeted_twitter_author_ids")
    self._targeted_twitter_post_ids = read(section, "targeted_twitter_post_ids")
    self._targeted_twitter_author_names = read(section, "targeted_twitter_author_names")
    self._social_network_crawler = Twitter_Rest_Api(db)
def __init__(self, db):
    """Create Twitter clients and load the posting-strategy options from config."""
    Method_Executor.__init__(self, db)
    self._twitter_api = TwitterApiRequester()
    self._social_network_crawler = Twitter_Rest_Api(db)
    section = self.__class__.__name__
    read = self._config_parser.eval
    self._influence_strategy = read(section, "post_strategy")
    self._source_group = read(section, "source_group")
    self._target_group = read(section, "target_group")
    self._user_id = read(section, "user_id")
    self._number_of_posts = read(section, "number_of_posts")
    self._retweet_precent = read(section, "retweet_precent")
    self._related_hashtags = read(section, "related_hashtags")
    self._posts_num = read(section, "posts_num")
def __init__(self, db):
    """Create Twitter clients and load source/target identifiers from config."""
    Method_Executor.__init__(self, db)
    self._twitter_api = TwitterApiRequester()
    self._social_network_crawler = Twitter_Rest_Api(db)
    section = self.__class__.__name__
    self._target_id = self._config_parser.eval(section, "target_id")
    self._source_id = self._config_parser.eval(section, "source_id")
    self.source_username = self._config_parser.eval(section, "source_username")
def __init__(self, db):
    """Load crawler limits and actions from config; initialize result accumulators."""
    Method_Executor.__init__(self, db)
    section = self.__class__.__name__
    read = self._config_parser.eval
    self._actions = read(section, "actions")
    self._minimal_num_of_posts = read(section, "minimal_num_of_posts")
    self._limit_friend_follower_number = read(section, "limit_friend_follower_number")
    self._maximal_tweets_count_in_timeline = read(section, "maximal_tweets_count_in_timeline")
    self._max_users_without_saving = read(section, "max_users_without_saving")
    self._social_network_crawler = Twitter_Rest_Api(db)
    # Accumulators filled during crawling.
    self._found_twitter_users = []
    self._suspended_authors = []
    self._posts = []
    self._authors = []
    self._post_citatsions = []
def __init__(self, db):
    """Set up the Google-Trends feed URL, config options, and crawler helpers."""
    Method_Executor.__init__(self, db)
    # taken from http://techslides.com/hacking-the-google-trends-api
    self._url = "https://trends.google.com/trends/hottrends/atom/feed?pn=p1"
    section = self.__class__.__name__
    self._retrieve_news_by_keywords = self._config_parser.eval(section, "retrieve_news_by_keywords")
    self._num_of_top_terms = self._config_parser.eval(section, "num_of_top_terms")
    self._generic_twitter_crawler = Generic_Twitter_Crawler(self._db)
    self._topic_term_manager = Topic_Term_Manager(db)
    self._twitter_rest_api = Twitter_Rest_Api(db)
def __init__(self):
    """Load CSV file names, paths, headers, and targeted post ids from config,
    then set up logging, the CSV exporter, a Twitter client, and the DB."""
    cfg = getConfig()
    logging.config.fileConfig(getConfig().get("DEFAULT", "logger_conf_file"))
    logger = logging.getLogger(getConfig().get("DEFAULT", "logger_name"))
    logger.info("Start Execution ... ")
    section = self.__class__.__name__
    # Output CSV file names: missing vs. retrieved, tweets vs. retweets.
    self._missing_retweets_not_retrived_from_vico_file_name = cfg.get(
        section, "missing_retweets_not_retrived_from_vico_file_name")
    self._missing_tweets_not_retrived_from_vico_file_name = cfg.get(
        section, "missing_tweets_not_retrived_from_vico_file_name")
    self._retweets_retrieved_from_vico_file_name = cfg.get(
        section, "retweets_retrieved_from_vico_file_name")
    self._tweets_retrieved_from_vico_file_name = cfg.get(
        section, "tweets_retrieved_from_vico_file_name")
    self._path = cfg.get(section, "path")
    self._backup_path = cfg.get(section, "backup_path")
    self._csv_header = cfg.eval(section, "csv_header")
    self._csv_header_bad_actors_vico_retrieved_posts = cfg.eval(
        section, "csv_header_bad_actors_vico_retrieved_posts")
    raw_post_ids = cfg.get("BadActorsCollector", "targeted_twitter_post_ids")
    self._targeted_twitter_post_ids = create_ids_from_config_file(raw_post_ids)
    self._original_statuses = cfg.eval(section, "original_statuses")
    self._csv_importer = PostCSVExporter()
    self._social_network_crawler = Twitter_Rest_Api()
    self._db = DB()
    self._db.setUp()
def setUp(self):
    """Prepare a fresh DB plus the collector/importer objects under test."""
    TestBase.setUp(self)
    self.config = getConfig()
    # Imported lazily so the schema module is only loaded for these tests.
    from DB.schema_definition import DB
    self.db = DB()
    self.db.setUp()
    # Objects under test, all sharing the same DB instance.
    self.social_network_crawler = Twitter_Rest_Api(self.db)
    self.xml_importer = XMLImporter(self.db)
    self.create_author_table = CreateAuthorTables(self.db)
    self._bad_actor_collector = BadActorsCollector(self.db)
    # Target ids come from the BadActorsCollector config section.
    self._targeted_twitter_author_ids = self.config.eval('BadActorsCollector', "targeted_twitter_author_ids")
    self._targeted_twitter_post_ids = self.config.eval('BadActorsCollector', "targeted_twitter_post_ids")
def setUp(self):
    # Build a fresh DB and register one Author and one Post fixture used by
    # test_mark_missing_bad_retweeters_retrieved_from_vico.
    self.config = getConfig()
    self.db = DB()
    self.db.setUp()
    self.social_network_crawler = Twitter_Rest_Api(self.db)
    self.xml_importer = XMLImporter(self.db)
    self.create_author_table = CreateAuthorTables(self.db)
    # Target ids come from the BadActorsCollector config section.
    self._targeted_twitter_author_ids = self.config.eval(
        'BadActorsCollector', "targeted_twitter_author_ids")
    self._domain = u'Microblog'
    self._targeted_twitter_post_ids = self.config.eval(
        'BadActorsCollector', "targeted_twitter_post_ids")
    self._bad_actor_collector = BadActorsCollector(self.db)
    #The Author and Post for test_mark_missing_bad_retweeters_retrieved_from_vico
    self._author_guid1 = compute_author_guid_by_author_name(
        u'TechmarketNG')
    author = Author()
    author.name = u'TechmarketNG'
    author.domain = self._domain
    author.protected = 0
    author.author_guid = self._author_guid1
    author.author_screen_name = u'TechmarketNG'
    author.author_full_name = u'Techmarket'
    author.statuses_count = 10
    author.author_osn_id = 149159975
    author.followers_count = 12
    author.created_at = datetime.datetime.strptime('2016-04-02 00:00:00',
                                                   '%Y-%m-%d %H:%M:%S')
    # Stamp both insertion dates so the fixture counts as fully imported.
    author.missing_data_complementor_insertion_date = datetime.datetime.now(
    )
    author.xml_importer_insertion_date = datetime.datetime.now()
    self.db.add_author(author)
    post = Post()
    post.post_id = u'TestPost'
    post.author = u'TechmarketNG'
    post.guid = u'TestPost'
    post.url = u'TestPost'
    tempDate = u'2016-05-05 00:00:00'
    day = datetime.timedelta(1)
    # Post is dated exactly one day after tempDate.
    post.date = datetime.datetime.strptime(tempDate, '%Y-%m-%d %H:%M:%S') + day * 1
    post.domain = self._domain
    post.author_guid = self._author_guid1
    post.content = u"InternetTV love it RT @benny_metanya #wow"
    post.xml_importer_insertion_date = datetime.datetime.now()
    self.db.addPost(post)
    self.db.commit()
class AsonamHoneypotImporter(AbstractController):
    """Imports honeypot-dataset tweet ids from disk and stores the fetched tweets in the DB.

    Each data file holds space-delimited rows: '<author_type> <author_id> <tweet_id>...'.
    """

    def __init__(self, db):
        super(AsonamHoneypotImporter, self).__init__(db)
        self.twitter_rest_api = Twitter_Rest_Api(self._db)
        self._data_path = self._config_parser.eval(self.__class__.__name__, "data_path")

    def execute(self, window_start=None):
        """Process every file found under the configured data path."""
        all_sub_files = listdir(self._data_path)
        for data_file in all_sub_files:
            self.add_tweet_data_to_db(data_file)

    def add_tweet_data_to_db(self, data_file):
        """Fetch and persist all tweets referenced by one data file, grouped by author type."""
        type_to_users_tweets_dict = self.load_file(self._data_path + data_file)
        for author_type in type_to_users_tweets_dict:
            tweets_ids = []
            for author in type_to_users_tweets_dict[author_type]:
                tweets_ids.extend(type_to_users_tweets_dict[author_type][author])
            tweets = self.get_tweets(tweets_ids, str(author_type))
            self.twitter_rest_api._save_posts_and_authors(tweets, str(author_type))

    def load_file(self, file_path):
        """Parse one space-delimited data file.

        Returns a dict mapping author_type -> {author_id: [tweet_id, ...]}.
        """
        type_to_users_tweets = defaultdict(dict)
        # 'with' guarantees the handle is closed even if parsing raises
        # (the original leaked the handle on error and shadowed builtin 'file').
        with open(file_path, 'r', newline='') as csv_input:
            for row in csv.reader(csv_input, delimiter=' '):
                author_type = row[0]
                author_id = row[1]
                tweet_ids = row[2:]
                type_to_users_tweets[author_type][author_id] = tweet_ids
        return type_to_users_tweets

    def get_tweets(self, tweets_ids, author_type=""):
        """Delegate tweet retrieval to the Twitter REST client."""
        return self.twitter_rest_api.get_tweets_by_ids(tweets_ids, author_type)
class US_2016_Presidential_Election_Importer(Method_Executor):
    """Retrieves the tweets behind the US-2016-election tweet-id dataset files.

    Each file in the configured data folder contains one tweet id per line.
    """

    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._data_folder = self._config_parser.eval(self.__class__.__name__, "data_folder")
        self._social_network_crawler = Twitter_Rest_Api(db)

    def retrieve_tweets_from_scratch(self):
        """Retrieve every tweet id listed in every file under the data folder."""
        file_names = os.listdir(self._data_folder)
        for file_name in file_names:
            # 'with' closes the handle; the original opened the file inside a
            # comprehension and never closed it.
            with open(self._data_folder + file_name) as id_file:
                lines = [line.rstrip('\n') for line in id_file]
            num_of_tweet_ids = len(lines)
            msg = "\r Number of tweets ids left to retrieve is: {0}".format(num_of_tweet_ids)
            print(msg, end="")
            self._social_network_crawler.get_tweets_by_tweet_ids_and_add_to_db(lines)

    def continue_retrieving_tweets_in_case_of_crush(self):
        """Resume retrieval after a crash: skip ids whose posts are already in the DB."""
        file_names = os.listdir(self._data_folder)
        for file_name in file_names:
            with open(self._data_folder + file_name) as id_file:
                total_tweet_ids = set(line.rstrip('\n') for line in id_file)
            already_found_tweets_ids_tuples = self._db.get_post_osn_ids()
            # Rows come back as 1-tuples; unwrap into a set for fast difference.
            already_found_tweets_ids = {t[0] for t in already_found_tweets_ids_tuples}
            left_to_retrieve_tweet_ids = list(total_tweet_ids - already_found_tweets_ids)
            num_of_tweet_ids_to_retrieve = len(left_to_retrieve_tweet_ids)
            msg = "\r Number of tweets ids left to retrieve is: {0}".format(num_of_tweet_ids_to_retrieve)
            print(msg, end="")
            self._social_network_crawler.get_tweets_by_tweet_ids_and_add_to_db(left_to_retrieve_tweet_ids)
def __init__(self, db):
    """Resolve the data folder from config and create the Twitter crawler."""
    Method_Executor.__init__(self, db)
    section = self.__class__.__name__
    self._data_folder = self._config_parser.eval(section, "data_folder")
    self._social_network_crawler = Twitter_Rest_Api(db)
class MissingVicoPostsRetriever():
    # Exports CSV reports about bad-actor tweets/retweets that were (or were
    # not) retrieved by the ViCo provider, by comparing Twitter timelines of
    # bad actors against what is stored in the DB.

    def __init__(self):
        config_parser = getConfig()
        logging.config.fileConfig(getConfig().get("DEFAULT", "logger_conf_file"))
        logger = logging.getLogger(getConfig().get("DEFAULT", "logger_name"))
        logger.info("Start Execution ... ")
        # Output CSV file names: missing vs. retrieved, tweets vs. retweets.
        self._missing_retweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "missing_retweets_not_retrived_from_vico_file_name")
        self._missing_tweets_not_retrived_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "missing_tweets_not_retrived_from_vico_file_name")
        self._retweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "retweets_retrieved_from_vico_file_name")
        self._tweets_retrieved_from_vico_file_name = config_parser.get(
            self.__class__.__name__, "tweets_retrieved_from_vico_file_name")
        self._path = config_parser.get(self.__class__.__name__, "path")
        self._backup_path = config_parser.get(self.__class__.__name__, "backup_path")
        self._csv_header = config_parser.eval(self.__class__.__name__, "csv_header")
        self._csv_header_bad_actors_vico_retrieved_posts = config_parser.eval(
            self.__class__.__name__, "csv_header_bad_actors_vico_retrieved_posts")
        # Targeted post ids are shared with the BadActorsCollector section.
        targeted_twitter_post_ids = config_parser.get(
            "BadActorsCollector", "targeted_twitter_post_ids")
        self._targeted_twitter_post_ids = create_ids_from_config_file(
            targeted_twitter_post_ids)
        self._original_statuses = config_parser.eval(self.__class__.__name__, "original_statuses")
        self._csv_importer = PostCSVExporter()
        self._social_network_crawler = Twitter_Rest_Api()
        self._db = DB()
        self._db.setUp()

    def execute(self):
        # Entry point. Only the "tweets ViCo did not retrieve" export is
        # currently enabled; the other flows are kept disabled below.
        '''
        id = 714718743973208064
        id = 3190956770
        timeline = self._social_network_crawler.get_timeline_by_user_id(id)
        x = 3
        '''
        #timelines = self.collect_bad_actors_not_retrieved_from_vico_timelines()
        #self.export_retweets_vico_not_retrieved(timelines)
        self.export_tweets_vico_not_retrieved()
        #self.export_tweets_retrieved_from_vico()
        #self.export_retweets_vico_retrieved()

    def export_retweets_vico_not_retrieved(self, bad_actors_timelines):
        #
        # A retweet is defined as a post that has no text of its own. It always starts with RT @creator.
        # If you reply to the tweet it is not defined as retweet.
        # A retweet from the timeline always has a retweeted_status object which includes the original status.
        # The retweet's text always starts with RT: @creator and the text of the user.
        #
        # Collect targeted retweets found in the given timelines and write them
        # to the "missing retweets" CSV (backing up any previous output first).
        missing_retweets = []
        for timeline in bad_actors_timelines:
            missing_post = self.find_missing_retweet(timeline)
            if missing_post is not None:
                missing_retweets.append(missing_post)
        if len(missing_retweets) > 0:
            self.move_existing_file_to_backup(
                self._path, self._backup_path,
                self._missing_retweets_not_retrived_from_vico_file_name)
            missing_posts_content = self.create_missing_posts_content_for_csv(
                missing_retweets)
            full_path_file_name = self._path + self._missing_retweets_not_retrived_from_vico_file_name
            self._csv_importer.write_content_to_csv(missing_posts_content,
                                                    full_path_file_name,
                                                    self._csv_header)

    def find_missing_retweet(self, timeline):
        # Return the first timeline post that retweets one of the targeted
        # post ids, or None when no such retweet exists.
        for post in timeline:
            retweeted_status = post.retweeted_status
            if retweeted_status is not None:
                original_post_id = retweeted_status.id
                for post_id in self._targeted_twitter_post_ids:
                    if original_post_id == post_id:
                        return post
        return None

    # NOTE(review): this method is defined twice in this class; the second
    # definition below shadows this one, so this version is dead code.
    # This version assumes every missing post has at least one user mention.
    def create_missing_posts_content_for_csv(self, missing_posts):
        missing_posts_content = []
        for missing_post in missing_posts:
            post_twitter_id = str(missing_post.id)
            missing_author_screen_name = missing_post.user.screen_name
            content = missing_post.text
            created_at = missing_post.created_at
            url = "http://twitter.com/" + missing_author_screen_name + "/status/" + str(
                post_twitter_id)
            user_mentions = missing_post.user_mentions
            user_mention = user_mentions[0]
            original_author_twitter_id = str(user_mention.id)
            original_author_screen = user_mention.screen_name
            missing_post_content = [
                post_twitter_id, missing_author_screen_name, content,
                created_at, url, original_author_twitter_id,
                original_author_screen
            ]
            missing_posts_content.append(missing_post_content)
        return missing_posts_content

    def create_missing_posts_content_for_csv(self, missing_posts):
        # Flatten tweets into CSV rows matching _csv_header. The original
        # author is taken from the first user mention when present; otherwise
        # it is parsed out of the first expanded status URL.
        missing_posts_content = []
        for missing_post in missing_posts:
            post_twitter_id = str(missing_post.id)
            missing_author_screen_name = missing_post.user.screen_name
            content = missing_post.text
            created_at = missing_post.created_at
            url = "http://twitter.com/" + missing_author_screen_name + "/status/" + str(
                post_twitter_id)
            user_mentions = missing_post.user_mentions
            if len(user_mentions) > 0:
                user_mention = user_mentions[0]
                original_author_twitter_id = str(user_mention.id)
                original_author_screen = user_mention.screen_name
            else:
                # Fall back to parsing "https://twitter.com/<screen>/status/<id>".
                urls = missing_post.urls
                original_url = urls[0]
                original_url = original_url.expanded_url
                relevant_part = original_url.split("https://twitter.com/", 1)
                screen_name_status_id = relevant_part[1].split("/status/", 1)
                original_author_twitter_id = str(screen_name_status_id[1])
                original_author_screen = screen_name_status_id[0]
            missing_post_content = [
                post_twitter_id, missing_author_screen_name, content,
                created_at, url, original_author_twitter_id,
                original_author_screen
            ]
            missing_posts_content.append(missing_post_content)
        return missing_posts_content

    def move_existing_file_to_backup(self, original_path, backup_path,
                                     file_name):
        # Move a previous output file into the backup folder, replacing any
        # older backup with the same name.
        logging.info("move_existing_file_to_backup ")
        full_path_output_file = original_path + file_name
        if os.path.isfile(full_path_output_file):
            full_path_backup_output_file = backup_path + file_name
            if os.path.isfile(full_path_backup_output_file):
                os.remove(full_path_backup_output_file)
            os.rename(full_path_output_file, full_path_backup_output_file)

    def export_tweets_vico_not_retrieved(self):
        # Scan all bad-actor timelines for targeted original tweets and export
        # any hits that ViCo did not retrieve.
        bad_actors_timelines = self.collect_bad_actors_timelines()
        missing_tweets = []
        for timeline in bad_actors_timelines:
            missing_post = self.find_missing_tweet(timeline)
            if missing_post is not None:
                missing_tweets.append(missing_post)
        if len(missing_tweets) > 0:
            self.move_existing_file_to_backup(
                self._path, self._backup_path,
                self._missing_tweets_not_retrived_from_vico_file_name)
            missing_posts_content = self.create_missing_posts_content_for_csv(
                missing_tweets)
            full_path_file_name = self._path + self._missing_tweets_not_retrived_from_vico_file_name
            self._csv_importer.write_content_to_csv(missing_posts_content,
                                                    full_path_file_name,
                                                    self._csv_header)

    def find_missing_tweet(self, timeline):
        # A targeted tweet is a non-retweet whose first expanded URL appears
        # in the configured original_statuses list.
        for post in timeline:
            retweeted_status = post.retweeted_status
            if retweeted_status is None:
                urls = post.urls
                if len(urls) > 0:
                    url = urls[0]
                    original_status_url = url.expanded_url
                    if original_status_url in self._original_statuses:
                        return post
        return None

    def collect_bad_actors_not_retrieved_from_vico_timelines(self):
        # Fetch timelines for bad-actor retweeters that ViCo did not retrieve.
        timelines = []
        bad_actors_not_found_by_vico_authors_ids = self._db.get_bad_actor_retweeters_not_retrieved_from_vico(
        )
        for id in bad_actors_not_found_by_vico_authors_ids:
            timeline = self._social_network_crawler.get_timeline_by_user_id(id)
            if timeline is not None:
                timelines.append(timeline)
        return timelines

    def collect_bad_actors_timelines(self):
        # Fetch timelines for every bad-actor author id in the DB.
        timelines = []
        bad_actors_not_found_by_vico_authors_ids = self._db.get_bad_actor_ids()
        for id in bad_actors_not_found_by_vico_authors_ids:
            timeline = self._social_network_crawler.get_timeline_by_user_id(id)
            if timeline is not None:
                timelines.append(timeline)
        return timelines

    def export_retweets_vico_retrieved(self):
        # Export bad-actor retweets that ViCo DID retrieve, from the DB.
        retweets = self._db.get_bad_actors_retweets_retrieved_by_vico()
        self.move_existing_file_to_backup(
            self._path, self._backup_path,
            self._retweets_retrieved_from_vico_file_name)
        vico_retweets_content = self.create_bad_actors_posts_content_for_csv(
            retweets)
        full_path_file_name = self._path + self._retweets_retrieved_from_vico_file_name
        self._csv_importer.write_content_to_csv(
            vico_retweets_content, full_path_file_name,
            self._csv_header_bad_actors_vico_retrieved_posts)

    def create_bad_actors_posts_content_for_csv(self, retweets):
        # Flatten DB post rows into CSV rows matching
        # _csv_header_bad_actors_vico_retrieved_posts.
        retweets_content = []
        for retweet in retweets:
            post_id = str(retweet.post_id)
            author = retweet.author
            guid = retweet.guid
            title = retweet.title
            url = retweet.url
            date = retweet.date
            content = retweet.content
            domain = retweet.domain
            author_guid = retweet.author_guid
            retweet_content = [
                post_id, author, guid, title, url, date, content, domain,
                author_guid
            ]
            retweets_content.append(retweet_content)
        return retweets_content

    def export_tweets_retrieved_from_vico(self):
        # Export bad-actor tweets that ViCo DID retrieve, from the DB.
        tweets = self._db.get_bad_actor_tweets_from_vico()
        self.move_existing_file_to_backup(
            self._path, self._backup_path,
            self._tweets_retrieved_from_vico_file_name)
        vico_tweets_content = self.create_bad_actors_posts_content_for_csv(
            tweets)
        full_path_file_name = self._path + self._tweets_retrieved_from_vico_file_name
        self._csv_importer.write_content_to_csv(
            vico_tweets_content, full_path_file_name,
            self._csv_header_bad_actors_vico_retrieved_posts)
        pass
class Generic_Twitter_Crawler(object):
    # Wrapper around Twitter_Rest_Api that converts raw tweets into DB
    # post/author rows, builds term<->post/author connections, and buffers
    # entities for a later bulk commit via commit_db().

    def __init__(self, db):
        # AbstractController.__init__(self, db)
        self._db = db
        self._twitter_rest_api = Twitter_Rest_Api(db)
        self._config_parser = getConfig()
        # NOTE: unicode()/iteritems() below mean this module targets Python 2.
        self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))
        # Buffers flushed by commit_db().
        self._users_to_add = []
        self._post_to_add = []

    def retrieve_and_save_data_from_twitter_by_terms(self, keywords, terms,
                                                     topics):
        # Search tweets for the given keywords, then persist posts, users,
        # and the supplied term/topic/connection entities.
        posts, total_twitter_users, connections = self.get_posts_and_authors_by_terms(
            keywords)
        self._db.addPosts(posts)
        self._add_users_to_db(total_twitter_users)
        # NOTE(review): terms/topics/connections are also written via
        # addPosts — presumably addPosts accepts arbitrary entity lists;
        # verify against the DB layer.
        self._db.addPosts(terms)
        self._db.addPosts(topics)
        self._db.addPosts(connections)

    def commit_db(self):
        # Flush the buffered posts/users accumulated by
        # retrieve_and_save_data_from_twitter_by_post_id, then clear buffers.
        self._db.addPosts(self._post_to_add)
        self._add_users_to_db(self._users_to_add)
        self._db.commit()
        self._users_to_add = []
        self._post_to_add = []

    def retrieve_and_save_data_from_twitter_by_post_id(self, post_id, label):
        # Fetch one tweet, convert it to a Post tagged with `label`, and
        # buffer it (with its author) for the next commit_db().
        post, user = self.get_post_and_author_by_post_id(post_id)
        try:
            converted_post = self._db.create_post_from_tweet_data(
                post, self._domain)
            converted_post.post_type = label
            self._users_to_add.append(user)
            self._post_to_add.append(converted_post)
        except TwitterError as e:
            # Python-2 style: the TwitterError payload is indexed directly.
            exception_response = e[0][0]
            logging.info("e.massage =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))
            # Code 88 = rate limit exceeded: wait out the window, then retry
            # the same post id recursively.
            if code == 88:
                sec = self._twitter_rest_api.get_sleep_time_for_twitter_status_id(
                )
                logging.info("Seconds to wait from catched crush is: " +
                             str(sec))
                if sec != 0:
                    commons.count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                return self.retrieve_and_save_data_from_twitter_by_post_id(
                    post_id, label)

    def get_posts_and_authors_by_terms(self, keywords):
        # For each search term: convert its tweets to posts and create
        # term->post and term->author connections.
        term_tweets_dict = self.get_posts_by_terms(keywords)
        total_twitter_users = []
        total_posts = []
        connections = []
        for term, tweets in term_tweets_dict.iteritems():  # Python 2 dict API
            posts = []
            for tweet in tweets:
                post = self._db.create_post_from_tweet_data(
                    tweet, self._domain)
                term_post_connection, term_author_connection = self._create_connections(
                    term, post)
                connections.append(term_post_connection)
                connections.append(term_author_connection)
                posts.append(post)
            total_posts += posts
            #posts += [self._db.create_post_from_tweet_data(tweet, self._domain) for tweet in term_tweets_dict[term]]
            total_twitter_users += [
                post.user for post in term_tweets_dict[term]
            ]
        return total_posts, total_twitter_users, connections

    def get_post_and_author_by_post_id(self, post_id):
        # Return the raw tweet object and its embedded user.
        post = self._twitter_rest_api.get_post_by_post_id(post_id)
        user = post.user
        return post, user

    def _add_users_to_db(self, total_twitter_users):
        # Persist users with no specific author/insertion type.
        author_type = None
        insertion_type = None
        self._twitter_rest_api.save_authors_and_connections(
            total_twitter_users, author_type, insertion_type)

    def get_posts_by_terms(self, terms):
        # Returns a dict mapping each term to its list of raw tweets.
        return self._twitter_rest_api.get_posts_by_terms(terms)

    def _create_connections(self, term, post):
        # Build term->post and term->author AuthorConnection rows for a post.
        term_post_connection = AuthorConnection()
        term_post_connection.source_author_guid = term
        term_post_connection.destination_author_guid = post.post_id
        term_post_connection.connection_type = u"term-post"
        term_author_connection = AuthorConnection()
        term_author_connection.source_author_guid = term
        term_author_connection.destination_author_guid = post.author_guid
        term_author_connection.connection_type = u"term-author"
        return term_post_connection, term_author_connection
class PostManager(Method_Executor):
    # Publishes tweets/retweets on behalf of a configured user: harvests a
    # source group's timeline posts, picks posts by strategy, and posts or
    # retweets them on a schedule until number_of_posts is reached.

    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._twitter_api = TwitterApiRequester()
        self._social_network_crawler = Twitter_Rest_Api(db)
        self._influence_strategy = self._config_parser.eval(self.__class__.__name__, "post_strategy")
        self._source_group = self._config_parser.eval(self.__class__.__name__, "source_group")
        self._target_group = self._config_parser.eval(self.__class__.__name__, "target_group")
        self._user_id = self._config_parser.eval(self.__class__.__name__, "user_id")
        self._number_of_posts = self._config_parser.eval(self.__class__.__name__, "number_of_posts")
        self._retweet_precent = self._config_parser.eval(self.__class__.__name__, "retweet_precent")
        self._related_hashtags = self._config_parser.eval(self.__class__.__name__, "related_hashtags")
        self._posts_num = self._config_parser.eval(self.__class__.__name__, "posts_num")

    def _publish_post(self, post, message, media):
        # Post `message` (with media) via the API and record it as an activity.
        # A fresh requester is created per call — presumably to refresh
        # credentials/limits; TODO confirm.
        self._twitter_api = TwitterApiRequester()
        statuses = self._twitter_api.api.PostUpdate(message, media)
        activity = self._db.create_activity(self._user_id, post.post_osn_id, statuses.id, 'twitter_post',
                                            'twitter', message, datetime.datetime.utcnow(), "twitter")
        return activity

    def _retweet_post(self, post):
        # Retweet the given post via the API and record it as an activity.
        self._twitter_api = TwitterApiRequester()
        statuses = self._twitter_api.api.PostRetweet(post.post_osn_id, trim_user=False)
        activity = self._db.create_activity(self._user_id, post.post_osn_id, statuses.id, 'twitter_retweet',
                                            'twitter', post.content, datetime.datetime.utcnow(), "twitter")
        return activity

    def _get_posts(self):
        # Collect all posts of the source group's author(s) and split them
        # into tweets vs. retweets.
        team_guid = self._db.get_author_guid_by_screen_name(self._source_group)
        team_posts = []
        if (len(team_guid) == 1):
            team_posts = self._db.get_posts_by_author_guid(team_guid[0])
        else:
            for i in team_guid:
                team_posts.append(self._db.get_posts_by_author_guid(i))
            # Flatten the list of per-author post lists.
            team_posts = [sublist for item in team_posts for sublist in item]
        return self._split_into_retweet_and_tweet(team_posts)

    def _split_into_retweet_and_tweet(self,team_posts):
        # A retweet is recognized purely by its content starting with "RT".
        team_posts_without_retweet = []
        team_posts_with_retweet = []
        for post in team_posts:
            prefix = str(post.content[0:2])
            if prefix != "RT":
                team_posts_without_retweet.append(post)
            else:
                team_posts_with_retweet.append(post)
        return team_posts_without_retweet, team_posts_with_retweet

    def influence_strategy_sort(self, team_posts_without_retweet, team_posts_with_retweet):
        # Order both pools by the configured strategy:
        # "last" = newest first, "popular" = most-favorited first.
        if self._influence_strategy == "last":
            team_posts_without_retweet.sort(key=lambda x: x.date, reverse=True)
            team_posts_with_retweet.sort(key=lambda x: x.date, reverse=True)
        if self._influence_strategy == "popular":
            team_posts_without_retweet.sort(key=lambda x: x.favorite_count, reverse=True)
            team_posts_with_retweet.sort(key=lambda x: x.favorite_count, reverse=True)
        return team_posts_without_retweet, team_posts_with_retweet

    def _create_tweet_content(self, post):
        # Build the outgoing tweet text: original content + target mention and
        # hashtag + timestamp + as many related hashtags as fit in 270 chars.
        tweet_length = 270
        message = post.content
        media = post.media_path
        message = message + '\n' + "@" + self._target_group + " #" + self._target_group + " " + str(
            datetime.datetime.utcnow())
        for i in self._related_hashtags:
            if (len(message + " " + i) > tweet_length):
                break
            else:
                message = message + " " + i
        if (len(message) > tweet_length):
            message = message[0:tweet_length]
        return message, media

    def _post_func(self, post, team_posts_without_retweet):
        # Try to publish one tweet; on failure drop the head of the pool so
        # the caller's retry loop moves on. Returns ('True'/'False', pool).
        try:
            message, media = self._create_tweet_content(post)
            activity = self._publish_post(post, message, media)
            flag = 'True'
            print("date: " + str(datetime.datetime.utcnow()) + " post number: " + str(
                self._posts_num) + " succeed to send a tweet")
            self._db.addPosts([activity])
            return str(flag), team_posts_without_retweet
        except Exception as e:
            print("Failed {}".format(e))
            flag = False
            if (len(team_posts_without_retweet) >= 1):
                del team_posts_without_retweet[0]
            return str(flag), team_posts_without_retweet

    def _retweet_func(self, post, team_posts_with_retweet):
        # Try to retweet one post; on failure drop the head of the pool so
        # the caller's retry loop moves on. Returns ('True'/'False', pool).
        try:
            activity = self._retweet_post(post)
            flag = 'True'
            print("date: " + str(datetime.datetime.utcnow()) + " post number: " + str(
                self._posts_num) + " succeed to send a retweet")
            self._db.addPosts([activity])
            return str(flag), team_posts_with_retweet
        except Exception as e:
            print("Failed {}".format(e))
            flag = False
            if (len(team_posts_with_retweet) >= 1):
                del team_posts_with_retweet[0]
            return str(flag), team_posts_with_retweet

    def _execute_post_process(self, team_posts_without_retweet, team_posts_with_retweet):
        # Publish exactly one post/retweet per invocation (until the quota is
        # reached): flip a coin against retweet_precent, select a candidate,
        # and retry until a send succeeds.
        if (self._number_of_posts <= self._posts_num):
            return
        else:
            team_posts_without_retweet, team_posts_with_retweet = self.influence_strategy_sort(
                team_posts_without_retweet, team_posts_with_retweet)
            # NOTE: flag is the string 'True'/'False', not a bool.
            flag = 'False'
            self._posts_num = self._posts_num + 1
            coin = random.uniform(0, 1)
            while flag == 'False':
                if (coin >= self._retweet_precent):
                    post, team_posts_without_retweet, team_posts_with_retweet = self._selecting_post(
                        team_posts_without_retweet, team_posts_with_retweet, "post")
                else:
                    post, team_posts_without_retweet, team_posts_with_retweet = self._selecting_post(
                        team_posts_without_retweet, team_posts_with_retweet, "retweet")
                if (coin >= self._retweet_precent):
                    flag, team_posts_without_retweet = self._post_func(post, team_posts_without_retweet)
                else:
                    flag,team_posts_with_retweet = self._retweet_func(post, team_posts_with_retweet)

    def _selecting_post(self, team_posts_without_retweet, team_posts_with_retweet, type):
        # Pick the next unsent candidate from the head of the relevant pool.
        # For "post" type, skip posts that already mention the target group.
        post_exist = True
        while post_exist == True:
            if type == "post":
                ans = team_posts_without_retweet[0]
                message = ans.content
                while "@" + self._target_group in message:
                    if (len(team_posts_without_retweet) >= 1):
                        del team_posts_without_retweet[0]
                        ans = team_posts_without_retweet[0]
                        message = ans.content
                    else:
                        print("End of tweets")
            else:
                ans = team_posts_with_retweet[0]
            # Loop until we find a candidate that was not already sent.
            post_exist = self._db.check_if_post_sent(ans, self._user_id)
            if (post_exist == True):
                if type == "post":
                    if (len(team_posts_without_retweet) >= 1):
                        del team_posts_without_retweet[0]
                    else:
                        print("End of tweets")
                else:
                    if (len(team_posts_with_retweet) >= 1):
                        del team_posts_with_retweet[0]
                    else:
                        print("End of tweets")
        return ans, team_posts_without_retweet, team_posts_with_retweet

    def time_schedule(self):
        # Main loop: refresh the source group's timelines, then spread
        # number_of_posts sends evenly across the day using `schedule`.
        hours_in_a_day = 24 * 60
        minute_window = float(hours_in_a_day) / self._number_of_posts
        while True:
            self._posts_num = 0
            self._convert_timeline_tweets_to_posts_for_author_screen_names(self._source_group)
            without_retweet, with_retweet = self._get_posts()
            self._execute_post_process(without_retweet, with_retweet)
            schedule.every(minute_window).minutes.do(self._execute_post_process, without_retweet, with_retweet)
            while True:
                if (self._number_of_posts <= self._posts_num):
                    break
                schedule.run_pending()
                time.sleep(1)

    def calculate_posts_stat(self):
        # Export post statistics for one hard-coded author plus the
        # influencers behind that author's posts to '<author_guid>.csv'.
        author_guid = "0927dc1a-8bcb-3488-99ed-7a962aee56e2"
        date = "2020-03-04 03:28:20"
        ids = self._db.source_destination()
        author_posts = self._db.posts_statics_from_date(author_guid, date)
        author_posts_guid = self._db.posts_statics_guids(author_guid, date)
        author_posts_guid = [ids[i] for i in author_posts_guid]
        influencers_posts = self._db.posts_statics_from_date_for_specific_posts(author_posts_guid)
        df1 = pd.DataFrame(author_posts, columns=['author_guid', 'post_count', 'retweet_sum', 'favorite_sum',
                                                  'retweet_avg', 'favorite_avg'])
        df2 = pd.DataFrame(influencers_posts, columns=['author_guid', 'post_count', 'retweet_sum', 'favorite_sum',
                                                       'retweet_avg', 'favorite_avg'])
        frames = [df1, df2]
        result = pd.concat(frames)
        result.to_csv(author_guid + ".csv")

    def _convert_timeline_tweets_to_posts_for_author_screen_names(self, author_screen_names):
        # Pull up to 3200 timeline tweets per screen name, convert them to
        # posts, store them, and complete missing author data.
        posts = []
        for i, account_screen_name in enumerate(author_screen_names):
            try:
                timeline_tweets = self._social_network_crawler.get_timeline(account_screen_name, 3200)
                if timeline_tweets is not None:
                    print("\rSearching timeline tweets for author_guid: {0} {1}/{2} retrieved:{3}".format(
                        account_screen_name, i, len(author_screen_names), len(timeline_tweets)), end='')
                    for timeline_tweet in timeline_tweets:
                        post = self._db.create_post_from_tweet_data_api(timeline_tweet, self._domain)
                        posts.append(post)
            except requests.exceptions.ConnectionError as errc:
                # Connection errors are deliberately ignored (best effort).
                x = 3
            except TwitterError as e:
                if e.message == "Not authorized.":
                    logging.info("Not authorized for user id: {0}".format(account_screen_name))
                    continue
        self._db.addPosts(posts)
        self.fill_data_for_sources()

    def fill_author_guid_to_posts(self):
        # Derive author_guid for every post from its author name, then
        # upsert the corresponding author rows.
        posts = self._db.get_all_posts()
        num_of_posts = len(posts)
        for i, post in enumerate(posts):
            msg = "\rPosts to fill: [{0}/{1}]".format(i, num_of_posts)
            print(msg, end="")
            post.author_guid = compute_author_guid_by_author_name(post.author)
        self._db.addPosts(posts)
        self._db.insert_or_update_authors_from_posts(self._domain, {}, {})

    def fill_data_for_sources(self):
        # Complete missing author info for post authors, fetching user data
        # from Twitter in chunks of 10000 screen names.
        print("---complete_missing_information_for_authors_by_screen_names ---")
        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names_by_posts()
        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        i = 1
        for author_screen_names in self._split_into_equal_chunks(twitter_author_screen_names, 10000):
            twitter_users = self._social_network_crawler.handle_get_users_request(
                author_screen_names, are_user_ids, author_type, inseration_type)
            print('retrieve authors {}/{}'.format(i * 10000, len(twitter_author_screen_names)))
            i += 1
            self._social_network_crawler.save_authors_and_connections(twitter_users, author_type, inseration_type)
        self.fill_author_guid_to_posts()
        print("---complete_missing_information_for_authors_by_screen_names was completed!!!!---")
        #logging.info("---complete_missing_information_for_authors_by_screen_names was completed!!!!---")

    def _split_into_equal_chunks(self,elements, num_of_chunks):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(elements), num_of_chunks):
            yield elements[i:i + num_of_chunks]
class Generic_Twitter_Crawler(object):
    """Thin wrapper around Twitter_Rest_Api: fetches tweets and users by search
    terms or by tweet id and persists them through the DB layer."""

    def __init__(self, db):
        # AbstractController.__init__(self, db)
        self._db = db
        self._twitter_rest_api = Twitter_Rest_Api(db)
        self._config_parser = getConfig()
        # NOTE(review): `unicode` is a Python 2 builtin; this module will not run
        # unmodified on Python 3.
        self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))

    def retrieve_and_save_data_from_twitter_by_terms(self, terms):
        """Search tweets for the given terms; persist resulting posts and authors."""
        posts, total_twitter_users = self.get_posts_and_authors_by_terms(terms)
        self._db.addPosts(posts)
        self._add_users_to_db(total_twitter_users)

    def retrive_and_save_data_from_twitter_by_post_id(self, post_id, label):
        """Fetch one tweet by id and store it as a post with post_type = label.

        On a Twitter rate-limit error (code 88) this sleeps until the window
        resets and then retries itself recursively.
        NOTE(review): "retrive" (sic) is the established public name -- kept.
        """
        post, user = self.get_post_and_author_by_post_id(post_id)
        try:
            converted_post = self._db.create_post_from_tweet_data(
                post, self._domain)
            converted_post.post_type = label
            self._db.addPost(converted_post)
            self._add_users_to_db([user])
            self._db.commit()
        except TwitterError as e:
            # python-twitter (old API) packs the error payload as e[0][0] -- a
            # Python 2-era idiom.
            exception_response = e[0][0]
            logging.info("e.massage =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))
            if code == 88:
                # Rate limit exceeded: wait out the window, then retry this post.
                sec = self._twitter_rest_api.get_sleep_time_for_twitter_status_id(
                )
                logging.info("Seconds to wait from catched crush is: " +
                             str(sec))
                if sec != 0:
                    commons.count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                return self.retrive_and_save_data_from_twitter_by_post_id(
                    post_id, label)

    def get_posts_and_authors_by_terms(self, terms):
        """Return (posts, twitter_users) for all tweets matching the terms."""
        term_posts_dictionary = self.get_posts_by_terms(terms)
        total_twitter_users = []
        posts = []
        for term in term_posts_dictionary:
            posts += [
                self._db.create_post_from_tweet_data(tweet, self._domain)
                for tweet in term_posts_dictionary[term]
            ]
            total_twitter_users += [
                post.user for post in term_posts_dictionary[term]
            ]
        return posts, total_twitter_users

    def get_post_and_author_by_post_id(self, post_id):
        """Return (tweet, author) for a single tweet id."""
        post = self._twitter_rest_api.get_post_by_post_id(post_id)
        user = post.user
        return post, user

    def _add_users_to_db(self, total_twitter_users):
        # Author/insertion types are intentionally left unset for this generic path.
        author_type = None
        insertion_type = None
        self._twitter_rest_api.save_authors_and_connections(
            total_twitter_users, author_type, insertion_type)

    def get_posts_by_terms(self, terms):
        """Delegate the term search to the underlying Twitter REST API wrapper."""
        return self._twitter_rest_api.get_posts_by_terms(terms)
class MissingDataComplementor(Method_Executor):
    """Fills in data missing from the DB after crawling: author profiles,
    follower/friend connections, author timelines, suspended-account flags,
    and tweet<->retweet citations."""

    def __init__(self, db):
        Method_Executor.__init__(self, db)
        self._actions = self._config_parser.eval(self.__class__.__name__, "actions")
        # Authors with fewer posts than this get their timeline topped up.
        self._minimal_num_of_posts = self._config_parser.eval(
            self.__class__.__name__, "minimal_num_of_posts")
        # Upper bound on follower/friend count for connection candidates.
        self._limit_friend_follower_number = self._config_parser.eval(
            self.__class__.__name__, "limit_friend_follower_number")
        self._maximal_tweets_count_in_timeline = self._config_parser.eval(
            self.__class__.__name__, "maximal_tweets_count_in_timeline")
        self._found_twitter_users = []
        self._social_network_crawler = Twitter_Rest_Api(db)
        self._suspended_authors = []
        # Flush accumulated suspended authors to the DB once this many pile up.
        self._max_users_without_saving = self._config_parser.eval(
            self.__class__.__name__, "max_users_without_saving")
        # Work buffers shared between fill_tweet_retweet_connection and its
        # error path in extract_retweet_data.
        self._posts = []
        self._authors = []
        self._post_citatsions = []

    def setUp(self):
        pass

    def fill_author_guid_to_posts(self):
        """Recompute author_guid for every post from its author name and upsert authors."""
        posts = self._db.get_posts()
        num_of_posts = len(posts)
        for i, post in enumerate(posts):
            msg = "\rPosts to fill: [{0}/{1}]".format(i, num_of_posts)
            print(msg, end="")
            post.author_guid = compute_author_guid_by_author_name(post.author)
        self._db.addPosts(posts)
        self._db.insert_or_update_authors_from_posts(self._domain, {}, {})

    def fill_data_for_followers(self):
        """Crawl follower connections for candidate authors."""
        self._fill_data_for_author_connection_type(
            Author_Connection_Type.FOLLOWER)
        logging.info("---Finished crawl_followers_by_author_ids")

    def fill_data_for_friends(self):
        """Crawl friend connections for candidate authors."""
        self._fill_data_for_author_connection_type(
            Author_Connection_Type.FRIEND)
        logging.info("---Finished crawl_friends_by_author_ids")

    def _fill_data_for_author_connection_type(self, connection_type):
        """Crawl connections of the given type for DB candidates and persist them."""
        # TEST
        self._db.get_authors_by_domain("Microblog")
        # TEST
        cursor = self._db.get_followers_or_friends_candidats(
            connection_type, self._domain, self._limit_friend_follower_number)
        followers_or_friends_candidats = self._db.result_iter(cursor)
        # Each DB row is a 1-tuple; unwrap to plain author ids.
        followers_or_friends_candidats = [
            author_id[0] for author_id in followers_or_friends_candidats
        ]
        print("---crawl_followers_by_author_ids---")
        author_type = None
        are_user_ids = True
        insertion_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # Dispatch by attribute name -- equivalent to calling
        # crawl_users_by_author_ids directly on the crawler.
        crawl_users_by_author_ids_func_name = "crawl_users_by_author_ids"
        getattr(self._social_network_crawler,
                crawl_users_by_author_ids_func_name)(
                    followers_or_friends_candidats, connection_type,
                    author_type, are_user_ids, insertion_type)
        self._db.convert_temp_author_connections_to_author_connections(
            self._domain)

    def crawl_followers_by_author_ids(self, author_ids):
        """Crawl followers for explicit author ids (no candidate selection)."""
        print("---crawl_followers_by_author_ids---")
        author_type = None
        are_user_ids = True
        # NOTE(review): "inseration" (sic) is the spelling used across this file.
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        self._social_network_crawler.crawl_followers_by_twitter_author_ids(
            author_ids, author_type, are_user_ids, inseration_type)

    def crawl_friends_by_author_ids(self, author_ids):
        """Crawl friends for explicit author ids (no candidate selection)."""
        print("---crawl_friends_by_author_ids---")
        author_type = None
        are_user_ids = True
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        self._social_network_crawler.crawl_friends_by_twitter_author_ids(
            author_ids, author_type, are_user_ids, inseration_type)

    def create_author_screen_names(self):
        """Return screen names of twitter authors referenced by posts."""
        screen_names = self._db.get_screen_names_for_twitter_authors_by_posts()
        return screen_names

    def fill_data_for_sources(self):
        """Fetch full profiles for authors with missing data and persist them."""
        print(
            "---complete_missing_information_for_authors_by_screen_names ---")
        logging.info(
            "---complete_missing_information_for_authors_by_screen_names ---")
        # twitter_author_screen_names = self.create_author_screen_names()
        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names(
        )
        # twitter_author_screen_names = (twitter_author.name for twitter_author in twitter_authors)
        # twitter_author_screen_names = list(twitter_author_screen_names)
        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        total_twitter_users = self._social_network_crawler.handle_get_users_request(
            twitter_author_screen_names, are_user_ids, author_type,
            inseration_type)
        self._social_network_crawler.save_authors_and_connections(
            total_twitter_users, author_type, inseration_type)
        print(
            "---complete_missing_information_for_authors_by_screen_names was completed!!!!---"
        )
        logging.info(
            "---complete_missing_information_for_authors_by_screen_names was completed!!!!---"
        )
        return total_twitter_users

    def complete_missing_information_for_authors_by_ids(self):
        """Same profile lookup as fill_data_for_sources but without saving.

        NOTE(review): despite the name, the lookup uses screen names with
        are_user_ids=False -- verify against callers.
        """
        print("---complete_missing_information_for_authors_by_ids ---")
        logging.info("---complete_missing_information_for_authors_by_ids ---")
        # twitter_author_screen_names = self.create_author_screen_names()
        twitter_author_screen_names = self._db.get_missing_data_twitter_screen_names(
        )
        # twitter_author_screen_names = (twitter_author.name for twitter_author in twitter_authors)
        # twitter_author_screen_names = list(twitter_author_screen_names)
        author_type = None
        are_user_ids = False
        inseration_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
        # retrieve_full_data_for_missing_users
        total_twitter_users = self._social_network_crawler.handle_get_users_request(
            twitter_author_screen_names, are_user_ids, author_type,
            inseration_type)
        # return self._found_twitter_users
        print(
            "---complete_missing_information_for_authors was completed!!!!---")
        logging.info(
            "---complete_missing_information_for_authors was completed!!!!---")
        return total_twitter_users

    def mark_suspended_or_not_existed_authors(self):
        """Stamp authors flagged by the DB as suspended/nonexistent with the window start."""
        suspended_authors = self._db.get_authors_for_mark_as_suspended_or_not_existed(
        )
        for suspended_author in suspended_authors:
            suspended_author.is_suspended_or_not_exists = self._window_start
            self._db.set_inseration_date(
                suspended_author, DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR)
        self._social_network_crawler.save_authors(suspended_authors)

    def mark_suspended_from_twitter(self):
        """Ask Twitter which known authors are still active; mark the rest suspended."""
        self._suspended_authors = []
        suspected_authors = self._db.get_not_suspended_authors(self._domain)
        suspected_authors_names = [author.name for author in suspected_authors]
        # First generator is consumed by list() just to count the chunks for the
        # progress message; a second generator does the real iteration.
        chunks = split_into_equal_chunks(
            suspected_authors_names,
            self._social_network_crawler.
            _maximal_user_ids_allowed_in_single_get_user_request)
        total_chunks = list(chunks)
        chunks = split_into_equal_chunks(
            suspected_authors_names,
            self._social_network_crawler.
            _maximal_user_ids_allowed_in_single_get_user_request)
        i = 1
        for chunk_of_names in chunks:
            msg = "\rChunck of author to Twitter: [{0}/{1}]".format(
                i, len(total_chunks))
            print(msg, end="")
            i += 1
            set_of_send_author_names = set(chunk_of_names)
            set_of_received_author_names = set(
                self._social_network_crawler.
                get_active_users_names_by_screen_names(chunk_of_names))
            # Names we asked about but Twitter did not return => suspended or gone.
            author_names_of_suspendend_or_not_exists = set_of_send_author_names - set_of_received_author_names
            self._update_suspended_authors_by_screen_names(
                author_names_of_suspendend_or_not_exists)
        self._db.add_authors(self._suspended_authors)

    def _update_suspended_authors_by_screen_names(
            self, author_names_of_suspendend_or_not_exists):
        """Mark each named author as a suspended bad actor; flush to DB periodically."""
        for author_name in author_names_of_suspendend_or_not_exists:
            user_guid = compute_author_guid_by_author_name(
                author_name).replace("-", "")
            suspended_author = self._db.get_author_by_author_guid(user_guid)
            suspended_author.is_suspended_or_not_exists = self._window_start
            suspended_author.author_type = Author_Type.BAD_ACTOR
            self._db.set_inseration_date(
                suspended_author, DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR)
            self._suspended_authors.append(suspended_author)
            num_of_suspended_authors = len(self._suspended_authors)
            # Flush every _max_users_without_saving authors to bound memory.
            if num_of_suspended_authors == self._max_users_without_saving:
                self._db.add_authors(self._suspended_authors)
                self._suspended_authors = []

    def fill_tweet_retweet_connection(self):
        '''
        Fetches the original tweets being retweeted by our posts.
        Updates the followig tables:
        * Post_Citations table with tweet-retweet connection
        * Posts table with missing tweets
        * Authors with the authors of the missing tweets
        '''
        retweets_with_no_tweet_citation = self._db.get_retweets_with_no_tweet_citation(
        )
        logging.info(
            "Updating tweet-retweet connection of {0} retweets".format(
                len(retweets_with_no_tweet_citation)))
        self._posts = []
        self._authors = []
        self._post_citatsions = []
        i = 1
        # NOTE(review): dict.iteritems() is Python 2 only.
        for post_guid, post_url in retweets_with_no_tweet_citation.iteritems():
            # logging.info("Analyzing retweet: {0} - {1}".format(post_guid, post_url))
            msg = "\r Analyzing retweet: {0} - {1} [{2}".format(
                post_guid, post_url, i) + "/" + str(
                    len(retweets_with_no_tweet_citation)) + '] '
            print(msg, end="")
            i += 1
            tweet_data = self.extract_retweet_data(retweet_guid=post_guid,
                                                   retweet_url=post_url)
            if tweet_data is not None:
                # Add the original tweet as a post if we don't store it yet.
                if not self._db.isPostExist(tweet_data.tweet_url):
                    post = Post(guid=tweet_data.tweet_guid,
                                post_id=tweet_data.tweet_guid,
                                url=tweet_data.tweet_url,
                                date=str_to_date(tweet_data.tweet_date),
                                title=tweet_data.tweet_content,
                                content=tweet_data.tweet_content,
                                post_osn_id=tweet_data.tweet_twitter_id,
                                retweet_count=tweet_data.tweet_retweet_count,
                                favorite_count=tweet_data.tweet_favorite_count,
                                author=tweet_data.tweet_author_name,
                                author_guid=tweet_data.tweet_author_guid,
                                domain=self._domain,
                                original_tweet_importer_insertion_date=unicode(
                                    get_current_time_as_string()))
                    self._posts.append(post)
                # Add the original tweet's author if missing.
                if not self._db.is_author_exists(tweet_data.tweet_author_guid,
                                                 self._domain):
                    author = Author(
                        name=tweet_data.tweet_author_name,
                        domain=self._domain,
                        author_guid=tweet_data.tweet_author_guid,
                        original_tweet_importer_insertion_date=unicode(
                            get_current_time_as_string()))
                    self._authors.append(author)
                # Record the retweet -> original-tweet citation.
                if not self._db.is_post_citation_exist(tweet_data.retweet_guid,
                                                       tweet_data.tweet_guid):
                    post_citation = Post_citation(
                        post_id_from=tweet_data.retweet_guid,
                        post_id_to=tweet_data.tweet_guid,
                        url_from=tweet_data.retweet_url,
                        url_to=tweet_data.tweet_url)
                    self._post_citatsions.append(post_citation)
        self.update_tables_with_tweet_retweet_data(self._posts, self._authors,
                                                   self._post_citatsions)

    def extract_retweet_data(self, retweet_guid, retweet_url):
        '''
        :param retweet_guid: the guid of the retweet
        :param retweet_url: the url of the retweet
        :return: a RetweetData holding the data of the retweet
        '''
        try:
            retweet_id = self.extract_tweet_id(retweet_url)
            if retweet_id is None:
                return None
            retweet_status = self._social_network_crawler.get_status_by_twitter_status_id(
                retweet_id)
            tweet_status_dict = retweet_status.AsDict()
            if 'retweeted_status' in tweet_status_dict:
                # Work on the original (retweeted) tweet, not the retweet wrapper.
                tweet_status_dict = tweet_status_dict['retweeted_status']
                tweet_post_twitter_id = unicode(str(tweet_status_dict['id']))
                tweet_author_name = unicode(
                    tweet_status_dict['user']['screen_name'])
                tweet_url = unicode(
                    generate_tweet_url(tweet_post_twitter_id,
                                       tweet_author_name))
                tweet_creation_time = unicode(tweet_status_dict['created_at'])
                tweet_str_publication_date = unicode(
                    extract_tweet_publiction_date(tweet_creation_time))
                tweet_guid = unicode(
                    compute_post_guid(
                        post_url=tweet_url,
                        author_name=tweet_author_name,
                        str_publication_date=tweet_str_publication_date))
                tweet_author_guid = unicode(
                    compute_author_guid_by_author_name(tweet_author_name))
                tweet_author_guid = unicode(tweet_author_guid.replace("-", ""))
                tweet_content = unicode(tweet_status_dict['text'])
                tweet_retweet_count = unicode(
                    tweet_status_dict['retweet_count'])
                tweet_favorite_count = unicode(
                    tweet_status_dict['favorite_count'])
                retweet_data = RetweetData(
                    retweet_guid=retweet_guid,
                    retweet_url=retweet_url,
                    tweet_guid=tweet_guid,
                    tweet_url=tweet_url,
                    tweet_author_name=tweet_author_name,
                    tweet_author_guid=tweet_author_guid,
                    tweet_date=tweet_str_publication_date,
                    tweet_content=tweet_content,
                    tweet_twitter_id=tweet_post_twitter_id,
                    tweet_retweet_count=tweet_retweet_count,
                    tweet_favorite_count=tweet_favorite_count)
                return retweet_data
            else:
                # Not a retweet: nothing to extract.
                return None
        except TwitterError as e:
            # python-twitter (old API) packs the error payload as e[0][0].
            exception_response = e[0][0]
            logging.info("e.massage =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))
            # Flush accumulated rows before waiting out a rate limit.
            self.update_tables_with_tweet_retweet_data(self._posts,
                                                       self._authors,
                                                       self._post_citatsions)
            self._posts = []
            self._authors = []
            self._post_citatsions = []
            if code == 88:
                sec = self._social_network_crawler.get_sleep_time_for_twitter_status_id(
                )
                logging.info("Seconds to wait from catched crush is: " +
                             str(sec))
                if sec != 0:
                    count_down_time(sec)
                    self._num_of_twitter_status_id_requests = 0
                # NOTE(review): this returns a raw status object rather than
                # retrying extract_retweet_data, while callers expect a
                # RetweetData -- verify intent.
                return self._social_network_crawler.get_status(retweet_id)
        except Exception as e:
            logging.error(
                "Cannot fetch data for retweet: {0}. Error message: {1}".
                format(retweet_url, e.message))
            return None

    def extract_tweet_id(self, post_url):
        """Extract the numeric tweet id from a twitter status URL, or None."""
        post_url = str(post_url)
        pattern = re.compile("http(.*)://twitter.com/(.*)/statuses/(.*)")
        extracted_info = pattern.findall(post_url)
        if extracted_info == []:
            # Fall back to the singular /status/ URL form.
            pattern = re.compile("http(.*)://twitter.com/(.*)/status/(.*)")
            extracted_info = pattern.findall(post_url)
        # NOTE(review): if neither pattern matches, extracted_info[0] raises
        # IndexError instead of returning None -- confirm URLs are pre-filtered.
        if len(extracted_info[0]) < 2:
            return None
        else:
            return extracted_info[0][2]

    def update_tables_with_tweet_retweet_data(self, posts, authors,
                                              post_citatsions):
        """Persist posts, authors and post citations in one shot."""
        self._db.addPosts(posts)
        self._db.add_authors(authors)
        self._db.addReferences(post_citatsions)

    def fill_authors_time_line(self):
        '''
        Fetches the posts for the authors that are given under
        authors_twitter_ids_for_timeline_filling in the config file + update the db
        '''
        self._db.create_authors_index()
        self._db.create_posts_index()
        author_screen_names_number_of_posts = self._db.get_author_screen_names_and_number_of_posts(
            self._minimal_num_of_posts)
        author_screen_names_number_of_posts_dict = self._create_author_screen_name_number_of_posts_dictionary(
            author_screen_names_number_of_posts)
        index = 1
        for author_name in author_screen_names_number_of_posts_dict:
            print("Get timeline for {0} : {1}/{2}".format(
                author_name, str(index),
                str(len(author_screen_names_number_of_posts_dict))))
            index += 1
            posts = []
            logging.info("Fetching timeline for author: " + str(author_name))
            posts_counter = 0
            try:
                # Only fetch enough tweets to reach the configured minimum.
                posts_needed_from_osn = self._minimal_num_of_posts - author_screen_names_number_of_posts_dict[
                    author_name]
                timeline = self._social_network_crawler.get_timeline_by_author_name(
                    author_name, posts_needed_from_osn)
                # logging.info("Retrived timeline lenght: " + str(len(timeline)))
                if timeline is not None:
                    for post in timeline:
                        tweet_post_twitter_id = str(post.id)
                        tweet_url = generate_tweet_url(tweet_post_twitter_id,
                                                       author_name)
                        tweet_creation_time = post.created_at
                        tweet_str_publication_date = extract_tweet_publiction_date(
                            tweet_creation_time)
                        tweet_guid = compute_post_guid(
                            post_url=tweet_url,
                            author_name=author_name,
                            str_publication_date=tweet_str_publication_date)
                        # Skip tweets already stored.
                        if self._db.contains_post(tweet_url):
                            continue
                        posts_counter = posts_counter + 1
                        tweet_author_guid = compute_author_guid_by_author_name(
                            author_name)
                        post = self._db.create_post_from_tweet_data(
                            post, self._domain)
                        posts.append(post)
            except Exception as e:
                logging.error(
                    "Cannot fetch data for author: {0}. Error message: {1}".
                    format(author_name, e.message))
            logging.info("Number of posts inserted for author {0}: {1}".format(
                author_name, posts_counter))
            self._db.addPosts(posts)

    def assign_manually_labeled_authors(self):
        self._db.assign_manually_labeled_authors()

    def delete_acquired_authors(self):
        self._db.delete_acquired_authors()
        self._db.delete_posts_with_missing_authors()

    def delete_manually_labeled_authors(self):
        self._db.delete_manually_labeled_authors()
        self._db.delete_posts_with_missing_authors()

    def assign_acquired_and_crowd_turfer_profiles(self):
        self._db.assign_crowdturfer_profiles()
        self._db.assign_acquired_profiles()

    def _create_author_screen_name_number_of_posts_dictionary(
            self, author_screen_names_number_of_posts):
        """Build {screen_name: num_of_posts} from (name, count) records."""
        author_screen_names_number_of_posts_dict = {}
        for record in author_screen_names_number_of_posts:
            author_screen_name = record[0]
            num_of_posts = record[1]
            author_screen_names_number_of_posts_dict[
                author_screen_name] = num_of_posts
        logging.info("Number of users to retrieve timelines: " +
                     str(len(author_screen_names_number_of_posts_dict)))
        return author_screen_names_number_of_posts_dict
class BadActorsCollector(Method_Executor):
    """Collects suspected bad-actor accounts: followers of targeted authors and
    retweeters of targeted posts, and marks cooperating authors as bad actors."""

    def __init__(self, db):
        # BUG FIX: the original called AbstractExecutor.__init__ even though this
        # class derives from Method_Executor; every sibling Method_Executor
        # subclass in this file initializes via Method_Executor.__init__.
        Method_Executor.__init__(self, db)
        self._actions = self._config_parser.eval(self.__class__.__name__,
                                                 "actions")
        # Twitter author ids whose followers should be crawled.
        self._targeted_twitter_author_ids = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_author_ids")
        # Tweet ids whose retweeters should be crawled.
        self._targeted_twitter_post_ids = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_post_ids")
        # Screen names used to find cooperating (retweeting) authors in the DB.
        self._targeted_twitter_author_names = self._config_parser.eval(
            self.__class__.__name__, "targeted_twitter_author_names")
        self._social_network_crawler = Twitter_Rest_Api(db)

    def setUp(self):
        pass

    def crawl_bad_actors_followers(self):
        """Crawl followers of the targeted author ids and persist the connections."""
        print("---crawl_bad_actors_followers_and_retweeters ---")
        bad_actor_type = Author_Type.BAD_ACTOR
        bad_actors_collector_inseration_type = DB_Insertion_Type.BAD_ACTORS_COLLECTOR
        connection_type = Author_Connection_Type.FOLLOWER
        are_user_ids = True
        self._social_network_crawler.crawl_users_by_author_ids(
            self._targeted_twitter_author_ids, connection_type, bad_actor_type,
            are_user_ids, bad_actors_collector_inseration_type)
        self._db.convert_temp_author_connections_to_author_connections(
            self._domain)

    def crawl_bad_actors_retweeters(self):
        """Crawl users who retweeted the targeted post ids."""
        bad_actor_type = Author_Type.BAD_ACTOR
        bad_actors_collector_inseration_type = DB_Insertion_Type.BAD_ACTORS_COLLECTOR
        are_user_ids = True
        self._social_network_crawler.crawl_retweeters_by_post_id(
            self._targeted_twitter_post_ids, are_user_ids, bad_actor_type,
            bad_actors_collector_inseration_type)

    def mark_missing_bad_retweeters(self):
        """Mark authors who cooperated with the targeted authors as bad actors."""
        print("mark_missing_bad_retweeters_retrieved_from_vico")
        missing_bad_actors = []
        i = 0
        cursor = self._db.get_cooperated_authors(
            self._targeted_twitter_author_names, self._domain)
        targeted_twitter_author_guid_generator = self._db.result_iter(cursor)
        for missing_author_guid in targeted_twitter_author_guid_generator:
            i += 1
            # Each row is a 1-tuple holding the guid.
            missing_author_guid = unicode(missing_author_guid[0])
            result = self._db.get_author_by_author_guid_and_domain(
                missing_author_guid, self._domain)
            if len(result) > 0:
                missing_author = result[0]
                missing_author.author_type = Author_Type.BAD_ACTOR
                missing_author.mark_missing_bad_actor_retweeters_insertion_date = self._window_start
                missing_bad_actors.append(missing_author)
            else:
                # Author row not found in this domain; log the guid for follow-up.
                logging.info("GUID = " + missing_author_guid)
        logging.info("number of missing bad actors found are:" +
                     str(len(missing_bad_actors)))
        self._db.add_authors(missing_bad_actors)
import logging from logging import config from DB.schema_definition import DB from Twitter_API.twitter_api_requester import TwitterApiRequester from configuration.config_class import getConfig from twitter_rest_api.twitter_rest_api import Twitter_Rest_Api if __name__ == '__main__': #config_parser = Configuration.get_config_parser() config_parser = getConfig() logging.config.fileConfig(config_parser.get("Logger", "logger_conf_file")) logging.info("Start program...") print("Start program...") social_network_crawler = Twitter_Rest_Api() #twitter_rest_api.crawl_followers() targeted_twitter_author_ids = [] targeted_twitter_author_ids.append(targeted_twitter_author_id) are_user_ids = True social_network_crawler.crawl_followers_by_twitter_author_ids( targeted_twitter_author_ids, bad_actor_type, are_user_ids) # twitter_rest_api.crawl_ ''' db = DB() db.setUp() logging.info("Creating TwitterApiRequester")
class Trec2012MicroblogTrackImporter(Method_Executor):
    """Imports TREC 2012 Microblog Track topics and relevance judgments (qrels)
    as claims, tweets and claim-tweet connections, and derives claim
    descriptions from the relevant tweets."""

    def __init__(self, db):
        Method_Executor.__init__(self, db)
        # Path to the TREC topics file (blank-line-separated XML fragments).
        self._topics_path = self._config_parser.eval(self.__class__.__name__,
                                                     "topics_path")
        # Path to the space-separated qrels judgment file.
        self._judgment_path = self._config_parser.eval(self.__class__.__name__,
                                                       "judgment_path")
        # How many top relevant tweets feed each claim description.
        self._num_of_relevant_tweets = self._config_parser.eval(
            self.__class__.__name__, "num_of_relevant_tweets")
        self._num_of_description_words = self._config_parser.eval(
            self.__class__.__name__, "num_of_description_words")
        self._twitter_api = Twitter_Rest_Api(db)

    def load_data(self):
        """Import topics + judgments into the DB as claims and a tweet corpus."""
        topics = self._read_trec_topics(self._topics_path)
        topic_judgments = self._read_judgments(self._judgment_path)
        claims = self._extract_claims_from_judgments(topics, topic_judgments)
        self._db.addPosts(claims)
        self._create_tweet_corpus_from_judgments(self._judgment_path)

    def set_description_from_relevant(self):
        """Set each claim's description to its keywords plus the words of its
        top relevant tweets."""
        claims = self._db.get_claims()
        topic_judgments = self._read_judgments(self._judgment_path)
        posts = self._db.get_posts()
        post_dict = {p.post_id: p for p in posts}
        for claim in claims:
            topic_id = int(claim.claim_id)
            # tweets = self._twitter_api.get_tweets_by_ids(topic_judgments[topic_id][:self._num_of_relevant_tweets])
            # posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, self._domain)
            # NOTE(review): ids missing from post_dict map to None here and would
            # crash on post.content below -- confirm all judged ids are stored.
            posts = list(
                map(post_dict.get,
                    topic_judgments[topic_id][:self._num_of_relevant_tweets]))
            # OrderedSet keeps first-seen word order while de-duplicating.
            claim_content = OrderedSet(claim.keywords.lower().split())
            for post in posts:
                list(map(claim_content.add, clean_tweet(post.content).split()))
                # if len(claim_content) > 25:
                #     break
            claim.description = clean_claim_description(
                ' '.join(claim_content), True)
        self._db.addPosts(claims)

    def set_description_from_tf_idf_best(self):
        """Set claim descriptions from the highest-idf words of relevant tweets."""
        tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
        posts = self._db.get_posts()
        corpus = [clean_claim_description(p.content, True) for p in posts]
        tf_idf_vectorizer.fit_transform(corpus)
        # Maps feature word -> idf weight (corpus-level, not per-document tf-idf);
        # unseen words default to 0.0.
        word_tf_idf_dict = defaultdict(
            float,
            list(
                zip(tf_idf_vectorizer.get_feature_names(),
                    tf_idf_vectorizer.idf_)))
        post_dict = {p.post_id: p for p in posts}
        topic_judgments = self._read_judgments(self._judgment_path)
        claims = self._db.get_claims()
        for i, claim in enumerate(claims):
            init_query_words = set(claim.keywords.lower().split())
            claim_content = set()
            relevant_posts_ids = topic_judgments[int(claim.claim_id)]
            words = set()
            for post_id in relevant_posts_ids:
                if post_id in post_dict:
                    words.update(
                        clean_claim_description(
                            post_dict.get(post_id).content, True).split())
            # Keep extra slots so removing the query words below still leaves
            # up to _num_of_description_words description words.
            best_words = sorted(words,
                                key=lambda k: word_tf_idf_dict[k],
                                reverse=True)[:self._num_of_description_words +
                                              len(init_query_words)]
            claim_content.update(best_words)
            pass
            claim_content = claim_content - init_query_words
            claim_description = clean_claim_description(
                ' '.join(claim_content), True)
            claim.description = ' '.join(claim_description.split())
        self._db.addPosts(claims)

    def _read_trec_topics(self, topics_path):
        """Parse the TREC topics file into TrecTopic namedtuples."""
        trec_topic_fields = ['num', 'query', 'querytime', 'querytweettime']
        TrecTopic = namedtuple('TrecTopic', trec_topic_fields)
        # NOTE(review): file handle is never closed.
        topic_file = open(topics_path)
        trec_topics = []
        # Topics are separated by blank lines; the final split element is empty.
        for topic_xml in topic_file.read().split('\n\n')[:-1]:
            trec_topic_dict = xmltodict.parse(topic_xml)
            trec_topic = TrecTopic._make(
                [trec_topic_dict['top'][field] for field in trec_topic_fields])
            trec_topics.append(trec_topic)
        return trec_topics

    # def _read_judgments(self, judgment_path):
    #     judgments = pd.read_csv(judgment_path, sep=' ', names=['topic_id', 'Q', 'tweet_id', 'rel'])
    #     tweets = self._twitter_api.get_tweets_by_ids(judgments['tweet_id'], pre_save=False)
    #     topic_tweet_id_dict = dict(judgments[['topic_id', 'tweet_id']].to_records(index=False))
    #     tweet_id_rel_dict = dict(judgments[['tweet_id', 'rel']].to_records(index=False))
    #     posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, u'Trec2012')

    def _read_judgments(self, judgment_path):
        """Read qrels; return {topic_id: [tweet_ids]}, highly-relevant ids first."""
        topic_high_relevant_judgment_dict = defaultdict(list)
        topic_relevant_judgment_dict = defaultdict(list)
        # NOTE(review): opening the csv in "rb" mode is Python 2 style; on
        # Python 3 csv.reader requires text mode. Handle is never closed.
        for topic, Q, docid, rel in csv.reader(open(judgment_path, "rb"),
                                               delimiter=' '):
            if int(rel) > 1:
                topic_high_relevant_judgment_dict[int(topic)].append(docid)
            elif int(rel) == 1:
                topic_relevant_judgment_dict[int(topic)].append(docid)
        # Append plain-relevant ids after the highly-relevant ones per topic.
        for topic_id, tweet_ids in topic_relevant_judgment_dict.items():
            # if topic_id not in topic_high_relevant_judgment_dict:
            topic_high_relevant_judgment_dict[topic_id].extend(tweet_ids)
        return topic_high_relevant_judgment_dict

    def _extract_claims_from_judgments(self, topics, topic_judgments):
        """Convert each TREC topic into a Claim object."""
        claims = []
        for trec_topic in topics:
            # tweet_id = tweet_ids[0]
            # Topic numbers look like "Number: MB001" -> 1.
            topic_id = int(parse.parse('Number: MB{}', trec_topic.num)[0])
            # tweets = self._twitter_api.get_tweets_by_ids(topic_judgments[topic_id][:10])
            # posts, authors = self._db.convert_tweets_to_posts_and_authors(tweets, self._domain)
            claim_content = set(trec_topic.query.split())
            # for post in []:
            #     claim_content.update(clean_tweet(post.content).split())
            #     if len(claim_content) > 25:
            #         break
            claim = self._convet_trec_topic_to_claim(' '.join(claim_content),
                                                     topic_id, trec_topic)
            claims.append(claim)
        return claims

    def _convet_trec_topic_to_claim(self, claim_content, topic_id, trec_topic):
        """Build a Claim from one TREC topic. (NOTE(review): "convet" (sic) is
        the established name -- kept.)"""
        claim = Claim()
        claim.claim_id = topic_id
        claim.verdict_date = parser.parse(trec_topic.querytime).date()
        claim.domain = 'Trec2012'
        claim.title = trec_topic.query
        claim.keywords = trec_topic.query
        claim.description = claim_content
        return claim

    def _create_tweet_corpus_from_judgments(self, judgment_path):
        """Fetch all judged tweets, store them plus claim-tweet connections, and
        write a filtered copy of the judgment file for retrieved tweets."""
        judgment_df = pd.read_csv(
            judgment_path,
            delimiter=' ',
            names=['topic', 'Q', 'docid', 'rel'],
        )
        tweet_ids = judgment_df['docid'].tolist()
        tweets = self._twitter_api.get_tweets_by_ids(tweet_ids, pre_save=False)
        posts, authors = self._db.convert_tweets_to_posts_and_authors(
            tweets, 'Trec2012')
        claim_tweet_connections = []
        for post in posts:
            post.post_id = str(post.post_osn_id)
        # NOTE(review): named "_dict" but is actually a set of post_osn_ids.
        post_osn_id_posts_dict = set(p.post_osn_id for p in posts)
        for topic_id, post_osn_id in judgment_df[['topic', 'docid'
                                                  ]].to_records(index=False):
            # Only connect judgments whose tweet we actually retrieved.
            if post_osn_id in post_osn_id_posts_dict:
                claim_tweet_connection = Claim_Tweet_Connection()
                claim_tweet_connection.claim_id = str(topic_id)
                claim_tweet_connection.post_id = str(post_osn_id)
                claim_tweet_connections.append(claim_tweet_connection)
        self._db.addPosts(claim_tweet_connections)
        self._db.addPosts(posts)
        judgment_df[judgment_df['docid'].isin(post_osn_id_posts_dict)].to_csv(
            judgment_path + '_filtered', sep=' ', header=False, index=False)
def __init__(self, db): super(AsonamHoneypotImporter, self).__init__(db) self.twitter_rest_api = Twitter_Rest_Api(self._db) self._data_path = self._config_parser.eval(self.__class__.__name__, "data_path")