def parse_tweet_item(self, items):
    """Yield one Tweet item per (id, payload) pair in *items*."""
    for tweet_id, payload in items.items():
        # assert tweet_id == payload['id_str'], (tweet_id, payload)
        tweet = Tweet()
        tweet['id_'] = tweet_id
        tweet['raw_data'] = payload
        yield tweet
def parse_databreaches(self, items):
    """Build one Tweet_item per raw tweet with the fields our DjangoItem needs.

    Args:
        items: mapping of tweet id -> raw tweet payload (dict), taken from
            the crawler's tweet item.

    Yields:
        Tweet_item populated with user id, company name, date, word count
        and key id.
    """
    for k, v in items.items():
        # assert k == v['id_str'], (k, v)
        tweet = Tweet()
        tweet['id_'] = k
        tweet['raw_data'] = v
        # raw_data is a dict holding the fields extracted below.
        daten = tweet['raw_data']
        # NOTE(review): `schluessel_id` and `unternehmensname` are not defined
        # in this scope -- presumably module-level globals; verify.
        id2 = schluessel_id  # key id
        # Renamed from `id` to avoid shadowing the builtin.
        user_id = str(daten['user_id'])
        # Word count via regex word matching.
        worte_anzahl = len(re.findall(r'\w+', daten['full_text']))
        # 'created_at' looks like 'Mon Jan 01 00:00:00 +0000 2020';
        # keep only month, day and year (split once instead of three times).
        teile = daten['created_at'].split()
        datum = teile[1] + " " + teile[2] + " " + teile[5]
        date_time_obj = datetime.datetime.strptime(datum, '%b %d %Y')
        item = Tweet_item()
        item['userId'] = user_id
        item['unternehmensname'] = unternehmensname
        item['datum'] = date_time_obj.date()  # store as date, not datetime
        item['woerter'] = worte_anzahl
        item['schluesselId'] = id2
        yield item
def parse_tweet_item(self, tweet_items, users):
    """Turn raw tweet payloads into Tweet items, joining in their authors.

    Each parsed payload is also dumped to ``output/<tweet_id>.json``.
    """
    for tweet_item in tweet_items.values():
        user = users[tweet_item["user_id"]]
        tweet = Tweet()
        tweet["collected_at"] = datetime.now()
        tweet["created_at"] = datetime.strptime(
            tweet_item["created_at"],
            "%a %b %d %H:%M:%S %z %Y",
        )
        tweet["url"] = f"https://twitter.com/{user['user_name']}/status/{tweet_item['id']}"
        tweet["tweet_id"] = tweet_item["id"]
        tweet["content"] = tweet_item["full_text"]
        tweet["language"] = language_codes[tweet_item["lang"]]
        # Client name sits between the tags of the "source" markup;
        # a failed match yields None (same as the original AttributeError path).
        client_match = re.search(">(.*?)<", tweet_item["source"])
        tweet["tweet_client"] = client_match.group(1) if client_match else None
        # Straight 1:1 copies from the payload.
        for field in (
            "retweet_count",
            "favorite_count",
            "reply_count",
            "quote_count",
            "in_reply_to_status_id",
            "in_reply_to_user_id",
        ):
            tweet[field] = tweet_item[field]
        tweet["user"] = user
        with open(f"output/{tweet['tweet_id']}.json", "w+") as fp:
            json.dump(tweet_item, fp)
        yield tweet
def parse_tweet_item(self, items):
    """Load Tweet (and optionally User) items from tweet DOM nodes via XPath."""
    logger.debug(f'items ontvangen {items}')
    # Tweet field name -> XPath expression (note: 'lang' is a plain value).
    tweet_paths = {
        'ID': './/@data-tweet-id',
        'user_id': './/@data-user-id',
        'text': './/div[@class="js-tweet-text-container"]/p',
        'url': './/@data-permalink-path',
        'timestamp': './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time',
        'retweets': './/span[contains(@class, "ProfileTweet-action--retweet")]//@data-tweet-stat-count',
        'favorites': './/span[contains(@class, "ProfileTweet-action--favorite")]//@data-tweet-stat-count',
        'replies': './/span[contains(@class, "ProfileTweet-action--reply")]//@data-tweet-stat-count',
        'conversation_id': '//@data-conversation-id',
        'lang': self.lang
    }
    # User field name -> XPath expression.
    user_paths = {
        'ID': './/@data-user-id',
        'screenname': './/@data-screen-name',
        'name': './/@data-name'
    }
    for node in items:
        tweet_loader = ItemLoader(Tweet(), node)
        for field, xpath in tweet_paths.items():
            tweet_loader.add_xpath(field, xpath)
        yield tweet_loader.load_item()
        if self.crawl_user:
            user_loader = ItemLoader(User(), node)
            for field, xpath in user_paths.items():
                user_loader.add_xpath(field, xpath)
            yield user_loader.load_item()
    logger.info('pagina compleet geparsed op tweets')
def parse_tweet_item(self, items):
    """Parse tweet DOM nodes into Tweet items; optionally also yield User items.

    Skips nodes without a tweet id or without text. A failure while parsing
    one node is logged and the node is skipped.

    Args:
        items: iterable of selector nodes, one per tweet.

    Yields:
        Tweet for every parsed tweet, followed by a User item when
        ``self.crawl_user`` is set.
    """
    for item in items:
        try:
            tweet = Tweet()
            tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            tweet['ID'] = ID[0]

            ### get text content
            tweet['text'] = ' '.join(
                item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
            ).replace(' # ', '#').replace(' @ ', '@')
            if tweet['text'] == '':
                # If there is no text, we ignore the tweet
                continue

            ### get meta data
            tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

            def _action_count(action):
                # Counter lives on the stat-count attribute; missing means 0.
                raw = item.css(
                    'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
                ).xpath('@data-tweet-stat-count').extract()
                return int(raw[0]) if raw else 0

            tweet['nbr_retweet'] = _action_count('retweet')
            tweet['nbr_favorite'] = _action_count('favorite')
            tweet['nbr_reply'] = _action_count('reply')

            # Unix epoch attribute -> 'YYYY-MM-DD HH:MM:SS' (local time).
            tweet['datetime'] = datetime.fromtimestamp(int(
                item.xpath('.//div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[0]
            )).strftime('%Y-%m-%d %H:%M:%S')

            ### get photo
            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                tweet['has_image'] = True
                tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
            elif has_cards:
                logger.debug('Not handle "data-card-type":\n%s' % item.xpath('.').extract()[0])

            ### get animated_gif / other card2 media
            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                card2 = has_cards[0]
                if card2 == 'animated_gif':
                    tweet['has_video'] = True
                    tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                elif card2 in ('player', 'summary_large_image', 'amplify', 'summary'):
                    # All four card2 kinds expose the same data-card-url.
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif card2 == '__entity_video':
                    pass  # TODO
                    # tweet['has_media'] = True
                    # tweet['medias'] = item.xpath('.//*/div/@data-src').extract()
                else:  # there are many other types of card2 !!!!
                    logger.debug('Not handle "data-card2-type":\n%s' % item.xpath('.').extract()[0])

            is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
            tweet['is_reply'] = is_reply != []

            is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
            tweet['is_retweet'] = is_retweet != []

            tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
            yield tweet

            if self.crawl_user:
                ### get user info
                user = User()
                user['ID'] = tweet['user_id']
                user['name'] = item.xpath('.//@data-name').extract()[0]
                user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                user['avatar'] = \
                    item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                yield user
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
            logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
def parse_tweet_item(items):
    """Yield Tweet items parsed from tweet DOM nodes (English-only feed).

    Raises:
        NameError: when a tweet has an unsupported language, no id, or
            empty text -- callers rely on this to abort/skip.
    """
    for node in items:
        tweet = Tweet()
        tweet['usernameTweet'] = node.xpath(
            './/span[@class="username u-dir u-textTruncate"]/b/text()'
        ).extract()[0]

        tweet['lang'] = node.xpath('.//@lang').get()
        if tweet['lang'] not in {'en', 'und'}:
            raise NameError('not support lang')

        tweet_ids = node.xpath('.//@data-tweet-id').extract()
        if not tweet_ids:
            raise NameError('no ID')
        tweet['ID'] = tweet_ids[0]

        # Text content: paragraph text plus emoji alt text, undoing the
        # ' # ' / ' @ ' splits Twitter's markup introduces.
        raw_text = ' '.join(node.xpath(
            './/div[@class="js-tweet-text-container"]/p//text()|.//div[@class="js-tweet-text-container"]/p//img/@alt'
        ).extract())
        tweet['text'] = raw_text.replace(' # ', '#').replace(' @ ', '@')
        tweet['emoji'] = ' '.join(node.xpath(
            './/div[@class="js-tweet-text-container"]/p//img/@alt'
        ).extract())
        if tweet['text'] == '':
            # If there is no text, we ignore the tweet
            raise NameError('empty text')

        # Meta data. (Retweet/favorite/reply counters are deliberately
        # not collected in this variant.)
        tweet['url'] = node.xpath('.//@data-permalink-path').extract()[0]

        # Unix epoch attribute -> 'YYYY-MM-DD HH:MM:SS' (local time).
        tweet['datetime'] = datetime.fromtimestamp(int(node.xpath(
            './/small[@class="time"]/a/span/@data-time'
        ).extract()[0])).strftime('%Y-%m-%d %H:%M:%S')

        # Photo card.
        card_types = node.xpath('.//@data-card-type').extract()
        if card_types and card_types[0] == 'photo':
            tweet['has_image'] = True
            tweet['images'] = node.xpath('.//*/div/@data-image-url').extract()
        elif card_types:
            logger.debug('Not handle "data-card-type":\n%s' % node.xpath('.').extract()[0])

        # Secondary ("card2") media types.
        card2_types = node.xpath('.//@data-card2-type').extract()
        if card2_types:
            card2 = card2_types[0]
            if card2 == 'animated_gif':
                tweet['has_video'] = True
                tweet['videos'] = node.xpath('.//*/source/@video-src').extract()
            elif card2 in ('player', 'summary_large_image', 'amplify', 'summary'):
                tweet['has_media'] = True
                tweet['medias'] = node.xpath('.//*/div/@data-card-url').extract()
            elif card2 == '__entity_video':
                pass  # TODO
            else:  # there are many other types of card2 !!!!
                logger.debug('Not handle "data-card2-type":\n%s' % node.xpath('.').extract()[0])

        tweet['is_reply'] = node.xpath(
            './/div[@class="ReplyingToContextBelowAuthor"]'
        ).extract() != []
        if tweet['is_reply']:
            # href + uid of the account(s) being replied to.
            tweet['reply_to'] = node.xpath(
                './/div[@class="ReplyingToContextBelowAuthor"]//@href|.//div[@class="ReplyingToContextBelowAuthor"]//@data-user-id'
            ).extract()

        tweet['is_retweet'] = node.xpath(
            './/span[@class="js-retweet-text"]'
        ).extract() != []

        tweet['user_id'] = node.xpath('.//@data-user-id').extract()[0]
        yield tweet
def parse_tweet_item(self, items):
    """Parse tweet DOM nodes into Tweet items.

    Skips tweets without an id, tweets not containing the first search-query
    term, and tweets whose detected language differs from ``self.lang``.

    Args:
        items: iterable of selector nodes, one per tweet.

    Yields:
        Tweet: one populated item per accepted tweet node.
    """
    for item in items:
        try:
            tweet = Tweet()
            tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            tweet['ID'] = ID[0]

            ### get text content
            # Paragraph text plus emoji alt text; undo the ' # ' / ' @ '
            # splits Twitter's markup introduces.
            tweet['text'] = ' '.join(
                item.xpath('.//div[@class="js-tweet-text-container"]/p//text()|.//div[@class="js-tweet-text-container"]/p//img/@alt').extract()).replace(' # ', '#').replace(
                ' @ ', '@')

            # NOTE: the query search term is always the first index
            if self.query.split(',')[0] not in tweet['text'] and self.query.split(',')[0]:
                # If query is not empty and not in text, we ignore the tweet
                continue

            if detect_langs(tweet['text'])[0].lang != self.lang:
                # If language is not correctly detected, we ignore it
                continue

            ### get meta data
            tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

            # Engagement counters default to 0 when the attribute is absent.
            nbr_retweet = item.css('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            if nbr_retweet:
                tweet['nbr_retweet'] = int(nbr_retweet[0])
            else:
                tweet['nbr_retweet'] = 0

            nbr_favorite = item.css('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            if nbr_favorite:
                tweet['nbr_favorite'] = int(nbr_favorite[0])
            else:
                tweet['nbr_favorite'] = 0

            nbr_reply = item.css('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            if nbr_reply:
                tweet['nbr_reply'] = int(nbr_reply[0])
            else:
                tweet['nbr_reply'] = 0

            # Unix epoch attribute -> 'YYYY-MM-DD HH:MM:SS' (local time).
            tweet['datetime'] = datetime.fromtimestamp(int(
                item.xpath('.//div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[
                    0])).strftime('%Y-%m-%d %H:%M:%S')

            ### get photo
            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                tweet['has_image'] = True
                tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
            elif has_cards:
                logger.debug('Not handle "data-card-type":\n%s' % item.xpath('.').extract()[0])

            ### get animated_gif
            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                if has_cards[0] == 'animated_gif':
                    tweet['has_video'] = True
                    tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                elif has_cards[0] == 'player':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == 'summary_large_image':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == 'amplify':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == 'summary':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == '__entity_video':
                    pass  # TODO
                    # tweet['has_media'] = True
                    # tweet['medias'] = item.xpath('.//*/div/@data-src').extract()
                else:  # there are many other types of card2 !!!!
                    logger.debug('Not handle "data-card2-type":\n%s' % item.xpath('.').extract()[0])

            is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
            tweet['is_reply'] = is_reply != []

            is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
            tweet['is_retweet'] = is_retweet != []

            tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
            yield tweet
        # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt;
        # consider `except Exception` with logger.exception instead.
        except:
            logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
def parse_tweet_item(self, items):
    """Parse tweet DOM nodes into Tweet items; optionally also yield User items.

    This variant reads engagement counters from the action buttons' visible
    text rather than data attributes. Tweets without an id or with empty
    text are skipped.

    Args:
        items: iterable of selector nodes, one per tweet.

    Yields:
        Tweet for every parsed tweet, followed by a User item when
        ``self.crawl_user`` is set.
    """
    for item in items:
        try:
            tweet = Tweet()
            tweet['usernameTweet'] = item.xpath(
                './/span[@class="username u-dir"]/b/text()').extract()[0]

            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            tweet['ID'] = ID[0]

            # Paragraph text with the ' # ' / ' @ ' splits undone.
            tweet['text'] = ' '.join(
                item.xpath(
                    './/div[@class="js-tweet-text-container"]/p//text()').
                extract()).replace(' # ', '#').replace(' @ ', '@')
            logger.debug(tweet['text'])
            if tweet['text'] == '':
                # Ignore tweets without text.
                continue

            tweet['url'] = item.xpath(
                './/@data-permalink-path').extract()[0]

            # Engagement counters default to 0 when the element is absent.
            nbr_retweet = item.xpath(
                './/button[@data-modal="ProfileTweet-retweet"]/span/span/text()'
            ).extract()
            if nbr_retweet:
                tweet['nbr_retweet'] = int(nbr_retweet[0])
            else:
                tweet['nbr_retweet'] = 0

            nbr_favorite = item.xpath(
                './/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]/span/span/text()'
            ).extract()
            if nbr_favorite:
                tweet['nbr_favorite'] = int(nbr_favorite[0])
            else:
                tweet['nbr_favorite'] = 0

            nbr_reply = item.xpath(
                './/button[@class="ProfileTweet-actionButton js-actionButton js-actionReply"]/span/span/text()'
            ).extract()
            if nbr_reply:
                tweet['nbr_reply'] = int(nbr_reply[0])
            else:
                tweet['nbr_reply'] = 0

            # Unix epoch attribute -> 'YYYY-MM-DD HH:MM:SS' (local time).
            tweet['datetime'] = datetime.fromtimestamp(
                int(
                    item.xpath(
                        './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
                    ).extract()[0])).strftime('%Y-%m-%d %H:%M:%S')

            # Photo card.
            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                tweet['has_image'] = True
                tweet['images'] = item.xpath(
                    './/*/div/@data-image-url').extract()
            elif has_cards:
                logger.debug('Not handle "data-card-type":\n%s' %
                             item.xpath('.').extract()[0])

            # Secondary ("card2") media types.
            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                if has_cards[0] == 'animated_gif':
                    tweet['has_video'] = True
                    tweet['videos'] = item.xpath(
                        './/*/source/@video-src').extract()
                elif has_cards[0] == 'player':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath(
                        './/*/div/@data-card-url').extract()
                elif has_cards[0] == 'summary_large_image':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath(
                        './/*/div/@data-card-url').extract()
                elif has_cards[0] == 'amplify':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath(
                        './/*/div/@data-card-url').extract()
                elif has_cards[0] == 'summary':
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath(
                        './/*/div/@data-card-url').extract()
                elif has_cards[0] == '__entity_video':
                    pass
                else:
                    logger.debug('error: "data-card2-type":\n%s' %
                                 item.xpath('.').extract()[0])

            is_reply = item.xpath(
                './/div[@class="ReplyingToContextBelowAuthor"]').extract()
            tweet['is_reply'] = is_reply != []

            is_retweet = item.xpath(
                './/span[@class="js-retweet-text"]').extract()
            tweet['is_retweet'] = is_retweet != []

            tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
            yield tweet

            if self.crawl_user:
                # fetch user info
                user = User()
                user['ID'] = tweet['user_id']
                user['name'] = item.xpath('.//@data-name').extract()[0]
                user['screen_name'] = item.xpath(
                    './/@data-screen-name').extract()[0]
                user['avatar'] = \
                    item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                yield user
        # NOTE(review): bare except hides all failures, incl. SystemExit;
        # consider `except Exception` with logger.exception instead.
        except:
            logger.error("error :\n%s" % item.xpath('.').extract()[0])
def parse_tweet_item(self, items, index):
    """Accumulate tweet fields for ticker *index* into one list-valued Tweet.

    Each column of the resulting item is a parallel list; one entry is
    appended per successfully parsed tweet node. A failure while parsing a
    node is logged and that node is skipped.
    """
    tweet = Tweet()
    for column in ('post_content', 'user_name', 'URL', 'num_retweets',
                   'num_likes', 'num_comments', 'timestamp', 'symbols',
                   'user_account'):
        tweet[column] = []

    for item in items:
        try:
            # Unix epoch attribute -> 'YYYY-MM-DD HH:MM:SS' (local time);
            # computed first so a missing timestamp skips the whole node.
            stamp = datetime.fromtimestamp(int(item.xpath(
                './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
            ).extract()[0])).strftime('%Y-%m-%d %H:%M:%S')

            # Paragraph text with the ' # ' / ' @ ' splits undone.
            text = ''.join(item.xpath(
                './/div[@class="js-tweet-text-container"]/p//text()'
            ).extract()).replace(' # ', '#').replace(' @ ', '@')
            if text == '':
                continue

            tweet['post_content'].append(text)
            tweet['user_name'].append(item.xpath(
                './/span[@class="username u-dir u-textTruncate"]/b/text()'
            ).extract()[0])
            tweet['URL'].append(item.xpath('.//@data-permalink-path').extract()[0])

            def action_count(action):
                # Counter lives on the stat-count attribute; missing means 0.
                raw = item.css(
                    'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
                ).xpath('@data-tweet-stat-count').extract()
                return int(raw[0]) if raw else 0

            tweet['num_retweets'].append(action_count('retweet'))
            tweet['num_likes'].append(action_count('favorite'))
            tweet['num_comments'].append(action_count('reply'))
            tweet['timestamp'].append(stamp)
            tweet['symbols'].append(Tickers[index])
            tweet['user_account'].append(item.xpath('.//@data-user-id').extract()[0])
        except:  # noqa: E722 -- bare except kept to preserve original behaviour
            logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
            # raise
    yield tweet