# Imports reconstructed from the code below. `egovbench_utils` and
# `egovbench_scorer` are assumed module names for the `eu` helpers and the
# EIScorer base class, which are not shown in this listing; egovbench_parser
# and egovbench_mongo are named in the docstrings.
import json
import logging
import os
import time
from configparser import SafeConfigParser
from logging.handlers import RotatingFileHandler

import tweepy

import egovbench_utils as eu
from egovbench_mongo import TwitterMongoConnector
from egovbench_parser import TwitterParser
from egovbench_scorer import EIScorer


class NoAccountException(Exception):
    # Assumed minimal definition; the original exception class is not shown
    # in this listing.
    pass


class Pusher():
    '''
    This class moves data out of the crawler and routes it to the handler
    function of each document type.
    '''

    def __init__(self):
        '''
        Initialization:
        - TwitterParser(): from egovbench_parser.py; parses the crawled
          json/dict into the MongoDB document structure.
        - TwitterMongoConnector(): from egovbench_mongo.py; writes the
          parsed result into MongoDB.
        '''
        self.tp = TwitterParser()
        self.tmc = TwitterMongoConnector()

    def pushPostDocument(self, complete_dict):
        '''
        Calls getPostDocument() of TwitterParser() to convert the crawled
        data into the post document structure, then inserts it into the
        post collection in MongoDB through updatePost() of
        TwitterMongoConnector().
        '''
        post_document = self.tp.getPostDocument(complete_dict)
        self.tmc.updatePost(post_document)

    def pushAccountDocument(self, complete_dict):
        '''
        Calls getAccountDocument() of TwitterParser() to convert the crawled
        data into the account document structure, then inserts it into the
        account collection in MongoDB through updateAccount() of
        TwitterMongoConnector().
        '''
        account_document = self.tp.getAccountDocument(complete_dict)
        self.tmc.updateAccount(account_document)
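# --- Usage sketch (illustrative, not part of the original listing) --------
# A minimal example of pushing one crawled tweet through Pusher. The dict
# shape mirrors the `complete_dict` assembled in TwitterCrawler.crawlTweets
# below; the account and tweet values are made up for illustration.
def _pusher_usage_example():
    pusher = Pusher()
    sample = {
        'account': {
            'account_id': 'pemda_example',      # illustrative account id
            'account_id_number': '1234567890',
            'account_followerCount': 1000,
        },
        'post': {
            'tweet_id': '987654321',
            'tweet_message': 'contoh tweet',
            'tweet_createdDate': '2019-01-01 00:00:00',
            'tweet_retweetCount': 2,
            'tweet_favoriteCount': 5,
            'tweet_type': 'text',
            'tweet_replyCount': 0,
        },
    }
    pusher.pushPostDocument(sample)     # writes the post document
    pusher.pushAccountDocument(sample)  # upserts the account document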
class TwitterCrawler():

    def createdirectory(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    def __init__(self,
                 credFile='/home/addi/egovbench/apps/pythons/egovbench_credentials.ini',
                 confFile='/home/addi/egovbench/apps/pythons/egovbench_config.ini'):
        confparser = SafeConfigParser()
        confparser.read(credFile)
        access_token = confparser.get('TwitterCredentials', 'access_token')
        access_token_secret = confparser.get('TwitterCredentials', 'access_token_secret')
        consumer_key = confparser.get('TwitterCredentials', 'consumer_key')
        consumer_secret = confparser.get('TwitterCredentials', 'consumer_secret')

        authHandler = tweepy.OAuthHandler(consumer_key, consumer_secret)
        authHandler.set_access_token(access_token, access_token_secret)
        self.twitterAPI = tweepy.API(authHandler,
                                     wait_on_rate_limit=True,
                                     wait_on_rate_limit_notify=True)

        confparser2 = SafeConfigParser()
        confparser2.read(confFile)
        self.crawllimit = int(confparser2.get('CrawlerConfig', 'crawllimit'))

        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
        if not logger.handlers:
            logpath = '/home/addi/egovbench/logs/twitter/egovbench_twittercrawler.log'
            try:
                formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
                fh = RotatingFileHandler(logpath, maxBytes=20971520, backupCount=5)
                fh.setLevel(logging.DEBUG)
                fh.setFormatter(formatter)
                logger.addHandler(fh)
                ch = logging.StreamHandler()
                ch.setLevel(logging.INFO)
                ch.setFormatter(formatter)
                logger.addHandler(ch)
            except FileNotFoundError:
                self.createdirectory(logpath)

        self.p = Pusher()
        self.tmc = TwitterMongoConnector()

    def prompt(self, texts):
        logging.info('[EGOVBENCH_TWITTERCRAWLER]> ' + texts)

    def launch(self):
        self.prompt('Launching . . .')
        accounts = self.tmc.collectAccounts()
        for account in accounts:
            pemda_id = account['_id']
            pemda_name = account['name']
            pemda_account = account['twitter_resmi']
            if pemda_account != '':
                try:
                    self.crawlTweets(pemda_id, pemda_name, pemda_account)
                except NoAccountException as e:
                    logging.critical(e)

    def crawlTweets(self, pemdaID, pemdaName, accountID):
        self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Crawl Started !'.format(pemdaID, pemdaName, accountID))

        # Check whether this account id already exists in the database.
        account_exist = self.tmc.checkAccount(accountID.lower())

        # If it does, the crawl is capped at self.crawllimit; otherwise the
        # whole account is crawled.
        if account_exist:
            crawllimit = self.crawllimit
        else:
            crawllimit = None

        complete_list = []
        tweets_crawled = 0

        complete_dict = {}
        complete_dict['account'] = {}
        complete_dict['account']['account_id'] = accountID.lower()

        try:
            for tweets in tweepy.Cursor(
                    self.twitterAPI.user_timeline,
                    screen_name=accountID,
                    count=100,
                    include_rts=True,
                    tweet_mode='extended').items():

                json_str = json.dumps(tweets._json)
                j_results = json.loads(json_str)

                if 'RT @' not in j_results['full_text']:

                    # account_id_number and followerCount can only be read
                    # once a crawled tweet has come in.
                    complete_dict['account']['account_id_number'] = j_results['user']['id_str']
                    complete_dict['account']['account_followerCount'] = j_results['user']['followers_count']

                    complete_dict['post'] = {}
                    complete_dict['post']['tweet_id'] = j_results['id_str']
                    complete_dict['post']['tweet_message'] = eu.cleanStrings(j_results['full_text'])
                    complete_dict['post']['tweet_createdDate'] = eu.formatTwitterTime(j_results['created_at'])
                    complete_dict['post']['tweet_retweetCount'] = j_results['retweet_count']
                    complete_dict['post']['tweet_favoriteCount'] = j_results['favorite_count']

                    # Plain-text tweets carry no media type in the crawl
                    # result, so tweet_type has to be initialized manually.
                    complete_dict['post']['tweet_type'] = 'text'
                    if 'entities' in j_results:
                        if 'media' in j_results['entities']:
                            complete_dict['post']['tweet_type'] = j_results['entities']['media'][0]['type']
                    if 'extended_entities' in j_results:
                        if 'media' in j_results['extended_entities']:
                            complete_dict['post']['tweet_type'] = j_results['extended_entities']['media'][0]['type']

                    complete_dict['post']['tweet_replyCount'] = 0

                    complete_list.append(complete_dict.copy())

                    # Counter
                    tweets_crawled += 1
                    self.prompt('(account_id: {}, tweet_id: {}) Tweets Crawled ! total: {}'.format(accountID, complete_dict['post']['tweet_id'], tweets_crawled))

                    # Stop crawling once the crawl limit is reached.
                    if tweets_crawled == crawllimit:
                        break

            if complete_list:
                # Fetch the reply counts through the Search API in
                # collectReplies().
                self.collectReplies(complete_list)

                # Push each json/dict to create its post document.
                for one_complete_dict in complete_list:
                    self.p.pushPostDocument(one_complete_dict)

                # Push the json/dict to create the account document.
                self.p.pushAccountDocument(complete_dict)

        except tweepy.TweepError as e:
            logging.error(e)
            if e.reason == 'Twitter error response: status code = 404':
                raise NoAccountException

        self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Done Crawling !'.format(pemdaID, pemdaName, accountID))

    def collectReplies(self, completeList):
        '''
        Replies are collected by crawling the account through the Search
        API with the keyword (q) "@ + accountID". Each search result is
        then matched against the crawled tweets by comparing its
        ['in_reply_to_status_id_str'] attribute with the tweet id; on a
        match, that tweet's tweet_replyCount is incremented by one.
        '''
        self.prompt("(account_id: {}) Collecting Replies . . .".format(completeList[0]['account']['account_id']))
        try:
            for tweets in tweepy.Cursor(
                    self.twitterAPI.search,
                    q='@' + completeList[0]['account']['account_id'],
                    include_rts=True,
                    tweet_mode='extended').items():

                json_str = json.dumps(tweets._json)
                j_results = json.loads(json_str)

                # Match the in_reply_to attribute against the crawled tweets.
                for complete_dict in completeList:
                    if complete_dict['post']['tweet_id'] == j_results['in_reply_to_status_id_str']:
                        complete_dict['post']['tweet_replyCount'] += 1
                        self.prompt("(tweet_id: {}) {} Reply collected!".format(complete_dict['post']['tweet_id'], complete_dict['post']['tweet_replyCount']))

        except tweepy.TweepError as e:
            logging.error(e)
            if e.reason == 'Twitter error response: status code = 404':
                raise NoAccountException
class TwitterScorer(EIScorer):

    def __init__(self, filterDict):
        super(TwitterScorer, self).__init__(
            filterDict,
            TwitterMongoConnector(),
            'tweet_favoriteCount',
            'tweet_replyCount',
            'tweet_retweetCount',
            'account_id',
            'tweet_type'
        )
        self.filter_dict = filterDict
        self.tmc = TwitterMongoConnector()

    def getAccountStatisticDocument(self):
        self.prompt('{} Creating statistic document . . .'.format(json.dumps(self.filter_dict)))
        update_document = {}
        update_document['account_id'] = self.filter_dict['account_id'].lower()
        update_document['account_followerCount'] = self.getFollowerCount()
        update_document['result.statistics'] = {}
        update_document['result.statistics']['tweetCount'] = self.getPostCount()
        update_document['result.statistics']['favoriteCount'] = self.getFieldSum('tweet_favoriteCount')
        update_document['result.statistics']['replyCount'] = self.getFieldSum('tweet_replyCount')
        update_document['result.statistics']['retweetCount'] = self.getFieldSum('tweet_retweetCount')
        self.prompt('{} Statistic document created!'.format(json.dumps(self.filter_dict)))
        return update_document

    def getAccountScoreDocument(self):
        update_document = {}
        self.prompt('{} Creating score document . . .'.format(json.dumps(self.filter_dict)))
        update_document['account_id'] = self.filter_dict['account_id'].lower()
        update_document['result.scores'] = {}
        update_document['result.scores']['popularity_favoriteScore'] = {}
        update_document['result.scores']['popularity_favoriteScore']['popularity_favoriteScore_1'] = self.getP1()
        update_document['result.scores']['popularity_favoriteScore']['popularity_favoriteScore_3'] = self.getP3()
        update_document['result.scores']['commitment_replyScore'] = {}
        update_document['result.scores']['commitment_replyScore']['commitment_replyScore_1'] = self.getC1()
        update_document['result.scores']['commitment_replyScore']['commitment_replyScore_3'] = self.getC3()
        update_document['result.scores']['virality_retweetScore'] = {}
        update_document['result.scores']['virality_retweetScore']['virality_retweetScore_1'] = self.getV1()
        update_document['result.scores']['virality_retweetScore']['virality_retweetScore_3'] = self.getV3()
        update_document['result.scores']['engagement_index_score'] = self.getEngagementIndexScore()
        engagement_index_score_normalized = self.getAccountNormalizedEngagementIndexScore()
        update_document['result.scores']['engagement_index_score_normalized'] = (
            engagement_index_score_normalized * 100 if engagement_index_score_normalized else None)
        self.prompt('{} Score document created!'.format(json.dumps(self.filter_dict)))
        return update_document

    def getAccountPostTypeScoreDocument(self):
        update_document = {}
        post_types = self.tmc.getPostTypeDistinct('tweet_type')
        for post_type in post_types:
            self.filter_dict.pop('tweet_type', None)
            posttypeattribute = {'tweet_type': post_type}
            posttypeattribute.update(self.filter_dict)
            # Re-run EIScorer's __init__ so the scorer is re-filtered on
            # this post type.
            super(TwitterScorer, self).__init__(
                posttypeattribute,
                TwitterMongoConnector(),
                'tweet_favoriteCount',
                'tweet_replyCount',
                'tweet_retweetCount',
                'account_id',
                'tweet_type'
            )
            self.prompt('{} Creating score document . . .'.format(json.dumps(self.filter_dict)))
            update_document['account_id'] = posttypeattribute['account_id'].lower()
            update_document['post_type_result.%s.scores' % post_type] = {}
            update_document['post_type_result.%s.scores' % post_type]['engagement_index_score'] = self.getEngagementIndexScore()
            self.prompt('{} Score document created!'.format(json.dumps(self.filter_dict)))
        return update_document

    def getPostTypeStatisticDocument(self):
        update_document = {}
        post_types = self.tmc.getPostTypeDistinct('tweet_type')
        for post_type in post_types:
            posttypeattribute = {'tweet_type': post_type}
            super(TwitterScorer, self).__init__(
                posttypeattribute,
                TwitterMongoConnector(),
                'tweet_favoriteCount',
                'tweet_replyCount',
                'tweet_retweetCount',
                'account_id',
                'tweet_type'
            )
            self.prompt('{} Creating statistic document . . .'.format(json.dumps(self.filter_dict)))
            update_document['_id'] = posttypeattribute['tweet_type']
            update_document['result.statistics'] = {}
            update_document['result.statistics']['tweetCount'] = self.getPostCount()
            self.prompt('{} Statistic document created!'.format(json.dumps(self.filter_dict)))
            self.mongo_connector_class.updatePostTypeResult(update_document)

    def getPostTypeScoreDocument(self):
        update_document = {}
        post_types = self.tmc.getPostTypeDistinct('tweet_type')
        for post_type in post_types:
            posttypeattribute = {'tweet_type': post_type}
            super(TwitterScorer, self).__init__(
                posttypeattribute,
                TwitterMongoConnector(),
                'tweet_favoriteCount',
                'tweet_replyCount',
                'tweet_retweetCount',
                'account_id',
                'tweet_type'
            )
            self.prompt('{} Creating score document . . .'.format(json.dumps(self.filter_dict)))
            update_document['_id'] = posttypeattribute['tweet_type']
            update_document['result.scores'] = {}
            update_document['result.scores']['engagement_index_score'] = self.getEngagementIndexScore()
            self.prompt('{} Score document created!'.format(json.dumps(self.filter_dict)))
            self.mongo_connector_class.updatePostTypeResult(update_document)
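# --- Usage sketch (illustrative, not part of the original listing) --------
# Scoring one account by hand, outside the trigger loop. 'pemda_example' is
# an illustrative account_id; the write calls mirror
# TwitterTrigger.pushAccountResult below.
def _scorer_usage_example():
    ts = TwitterScorer({'account_id': 'pemda_example'})
    tmc = TwitterMongoConnector()
    tmc.updateAccountResult(ts.getAccountStatisticDocument())  # counts and sums
    tmc.updateAccountResult(ts.getAccountScoreDocument())      # P/C/V + engagement index
    tmc.updateAccountResult(ts.getAccountPostTypeScoreDocument())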
class TwitterTrigger():

    def createdirectory(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    def __init__(self):
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
        if not logger.handlers:
            logpath = '/home/addi/egovbench/logs/twitter/egovbench_twittertrigger.log'
            try:
                formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
                fh = RotatingFileHandler(logpath, maxBytes=20971520, backupCount=5)
                fh.setLevel(logging.DEBUG)
                fh.setFormatter(formatter)
                logger.addHandler(fh)
                ch = logging.StreamHandler()
                ch.setLevel(logging.INFO)
                ch.setFormatter(formatter)
                logger.addHandler(ch)
            except FileNotFoundError:
                self.createdirectory(logpath)
        self.tmc = TwitterMongoConnector()

    def prompt(self, texts):
        logging.info('[EGOVBENCH_TWITTERTRIGGER]> ' + texts)

    def launch(self):
        self.prompt('Launching trigger . . .')
        self.tmc.resetTemp()
        counter = 0
        while True:
            cursor = self.tmc.activateTailableCursor()
            while cursor.alive:
                try:
                    message = cursor.next()
                    self.prompt('(account_id: {}) Message received!'.format(message['id']))
                    self.pushAccountResult(message['id'])
                    self.prompt('===================================================================')
                    counter += 1
                    # Refresh the global post-type results every 100 messages.
                    if counter % 100 == 0:
                        self.pushPostTypeResult()
                except StopIteration:
                    time.sleep(1)

    def pushPostTypeResult(self):
        ts = TwitterScorer(None)
        ts.getPostTypeStatisticDocument()
        ts.getPostTypeScoreDocument()

    def pushAccountResult(self, value):
        filter_dict = {'account_id': value}
        ts = TwitterScorer(filter_dict)
        accountStatisticDocument = ts.getAccountStatisticDocument()
        self.tmc.updateAccountResult(accountStatisticDocument)
        accountScoreDocument = ts.getAccountScoreDocument()
        self.tmc.updateAccountResult(accountScoreDocument)
        accountPostTypeScoreDocument = ts.getAccountPostTypeScoreDocument()
        self.tmc.updateAccountResult(accountPostTypeScoreDocument)
        self.tmc.updatePemdaScores(value)
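# --- Entry-point sketch (illustrative, not part of the original listing) --
# How the trigger would be started when this module is run directly; the
# original launcher is not shown.
if __name__ == '__main__':
    TwitterTrigger().launch()  # blocks, tailing the capped collection for new account ids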