Exemplo n.º 1
0
 def parse_tweet_item(self, items):
     """Yield one ``Tweet`` per entry of the ``items`` mapping.

     Each key is stored under ``'id_'`` and the matching value under
     ``'raw_data'``; nothing else is extracted here.
     """
     for tweet_id, raw_payload in items.items():
         # NOTE(review): the key presumably equals raw_payload['id_str'];
         # the original sanity check for that was left disabled.
         wrapped = Tweet()
         wrapped['id_'] = tweet_id
         wrapped['raw_data'] = raw_payload
         yield wrapped
Exemplo n.º 2
0
    def parse_databreaches(self, items):
        """Yield one ``Tweet_item`` per raw tweet found in ``items``.

        ``items`` is a mapping whose values are dicts holding at least
        ``'user_id'``, ``'full_text'`` and ``'created_at'``. For each value
        the author id (as a string), the word count of the text and the
        calendar date of creation are stored on a fresh ``Tweet_item``.

        ``unternehmensname`` and ``schluessel_id`` are not defined in this
        method; they are read from the enclosing scope.
        """
        for raw_tweet in items.values():
            user_id = str(raw_tweet['user_id'])

            # Count runs of word characters in the tweet text.
            word_count = len(re.findall(r'\w+', raw_tweet['full_text']))

            # Split once and keep fields 1, 2 and 5 of ``created_at``.
            # NOTE(review): assumes the layout "Wed Oct 10 20:19:24 +0000 2018"
            # (month, day, year at those positions) -- confirm against the feed.
            date_parts = raw_tweet['created_at'].split()
            date_text = " ".join((date_parts[1], date_parts[2], date_parts[5]))
            created = datetime.datetime.strptime(date_text, '%b %d %Y')

            item = Tweet_item()
            item['userId'] = user_id
            item['unternehmensname'] = unternehmensname
            # Only the date part is stored, not the full datetime.
            item['datum'] = created.date()
            item['woerter'] = word_count
            item['schluesselId'] = schluessel_id
            yield item
Exemplo n.º 3
0
    def parse_tweet_item(
        self,
        tweet_items,
        users,
    ):
        """Turn raw tweet dicts into ``Tweet`` items and dump each one to disk.

        ``tweet_items`` is a mapping of raw tweet dicts; ``users`` maps a
        tweet's ``"user_id"`` to the user object attached to the item. For
        every tweet the raw dict is also written as JSON to
        ``output/<tweet_id>.json`` before the item is yielded.
        """
        # Fields copied verbatim from the raw tweet onto the item.
        copied_fields = (
            "retweet_count",
            "favorite_count",
            "reply_count",
            "quote_count",
            "in_reply_to_status_id",
            "in_reply_to_user_id",
        )

        for tweet_item in tweet_items.values():
            tweet = Tweet()
            tweet["collected_at"] = datetime.now()

            user = users[tweet_item["user_id"]]

            created_format = "%a %b %d %H:%M:%S %z %Y"
            tweet["created_at"] = datetime.strptime(
                tweet_item["created_at"], created_format)

            tweet["url"] = f"https://twitter.com/{user['user_name']}/status/{tweet_item['id']}"
            tweet["tweet_id"] = tweet_item["id"]
            tweet["content"] = tweet_item["full_text"]
            tweet["language"] = language_codes[tweet_item["lang"]]

            # The client name is the text between the first '>' and the next '<'
            # of the source field; no match means no client is recorded.
            client_match = re.search(">(.*?)<", tweet_item["source"])
            if client_match is None:
                tweet["tweet_client"] = None
            else:
                tweet["tweet_client"] = client_match.group(1)

            for field in copied_fields:
                tweet[field] = tweet_item[field]

            tweet["user"] = user

            # Keep a copy of the untouched raw tweet next to the parsed item.
            with open(f"output/{tweet['tweet_id']}.json", "w+") as dump_file:
                json.dump(tweet_item, dump_file)

            yield tweet
Exemplo n.º 4
0
    def parse_tweet_item(self, items):
        """Load a ``Tweet`` (and optionally a ``User``) from each selector.

        Every element of ``items`` is wrapped in an ``ItemLoader`` and the
        fields are filled from XPath expressions evaluated relative to that
        element. When ``self.crawl_user`` is truthy a ``User`` item is
        yielded right after the tweet.
        """
        logger.debug(f'items ontvangen {items}')
        tweet_xpaths = {
            'ID': './/@data-tweet-id',
            'user_id': './/@data-user-id',
            'text': './/div[@class="js-tweet-text-container"]/p',
            'url': './/@data-permalink-path',
            'timestamp':
            './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time',
            'retweets':
            './/span[contains(@class, "ProfileTweet-action--retweet")]//@data-tweet-stat-count',
            'favorites':
            './/span[contains(@class, "ProfileTweet-action--favorite")]//@data-tweet-stat-count',
            'replies':
            './/span[contains(@class, "ProfileTweet-action--reply")]//@data-tweet-stat-count',
            # Relative path: an absolute '//' would match every conversation
            # id in the document instead of just this tweet's.
            'conversation_id': './/@data-conversation-id',
        }

        user_xpaths = {
            'ID': './/@data-user-id',
            'screenname': './/@data-screen-name',
            'name': './/@data-name'
        }

        for item in items:
            tweet = ItemLoader(Tweet(), item)
            for field, xpath in tweet_xpaths.items():
                tweet.add_xpath(field, xpath)
            # ``self.lang`` is a plain value, not an XPath expression, so it
            # must be added as a value rather than evaluated as a query.
            tweet.add_value('lang', self.lang)
            yield tweet.load_item()

            if self.crawl_user:
                user = ItemLoader(User(), item)
                for field, xpath in user_xpaths.items():
                    user.add_xpath(field, xpath)
                yield user.load_item()
        logger.info('pagina compleet geparsed op tweets')
Exemplo n.º 5
0
    def parse_tweet_item(self, items):
        """Yield a ``Tweet`` (and optionally a ``User``) per tweet selector.

        Each element of ``items`` is queried for the author name, tweet id,
        text, permalink, retweet/favorite/reply counters, timestamp, card
        media and reply/retweet flags. Elements without a ``data-tweet-id``
        or with empty text are skipped. When ``self.crawl_user`` is truthy a
        ``User`` is yielded after the tweet. Extraction errors are logged and
        the offending element is dropped.
        """
        # (item field, action name) pairs for the per-tweet counters.
        counters = (
            ('nbr_retweet', 'retweet'),
            ('nbr_favorite', 'favorite'),
            ('nbr_reply', 'reply'),
        )
        # card2 types whose media URLs are read from ``data-card-url``.
        card_url_types = ('player', 'summary_large_image', 'amplify', 'summary')

        for item in items:
            try:
                tweet = Tweet()

                tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

                tweet_ids = item.xpath('.//@data-tweet-id').extract()
                if not tweet_ids:
                    continue
                tweet['ID'] = tweet_ids[0]

                ### get text content
                text_nodes = item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
                tweet['text'] = ' '.join(text_nodes).replace(' # ', '#').replace(' @ ', '@')
                if tweet['text'] == '':
                    # If there is not text, we ignore the tweet
                    continue

                ### get meta data
                tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

                for field, action in counters:
                    count = item.css(
                        'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
                    ).xpath('@data-tweet-stat-count').extract()
                    # A missing counter is recorded as zero.
                    tweet[field] = int(count[0]) if count else 0

                # NOTE(review): fromtimestamp() without tz yields local time -- confirm intended.
                epoch = item.xpath(
                    './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[0]
                tweet['datetime'] = datetime.fromtimestamp(int(epoch)).strftime('%Y-%m-%d %H:%M:%S')

                ### get photo
                has_cards = item.xpath('.//@data-card-type').extract()
                if has_cards and has_cards[0] == 'photo':
                    tweet['has_image'] = True
                    tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
                elif has_cards:
                    logger.debug('Not handle "data-card-type":\n%s' % item.xpath('.').extract()[0])

                ### get animated_gif
                has_cards = item.xpath('.//@data-card2-type').extract()
                if has_cards:
                    card_type = has_cards[0]
                    if card_type == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                    elif card_type in card_url_types:
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                    elif card_type == '__entity_video':
                        pass  # TODO: video entities are not extracted yet
                    else:  # there are many other types of card2 !!!!
                        logger.debug('Not handle "data-card2-type":\n%s' % item.xpath('.').extract()[0])

                is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = is_reply != []

                is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = is_retweet != []

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet

                if self.crawl_user:
                    ### get user info
                    user = User()
                    user['ID'] = tweet['user_id']
                    user['name'] = item.xpath('.//@data-name').extract()[0]
                    user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                    user['avatar'] = \
                        item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                    yield user
            except Exception:
                # Not a bare ``except``: GeneratorExit (raised at ``yield`` when
                # the consumer closes the generator) must propagate, otherwise
                # the loop would yield again and trigger a RuntimeError.
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
Exemplo n.º 6
0
        def parse_tweet_item(items):
            """Yield a ``Tweet`` for every selector in ``items``.

            Raises ``NameError`` when a tweet's language is neither ``'en'``
            nor ``'und'``, when it has no ``data-tweet-id``, or when its text
            is empty. Retweet/favorite/reply counters are not collected here.
            """
            # card2 types whose media URLs are read from ``data-card-url``.
            card_url_types = ('player', 'summary_large_image', 'amplify',
                              'summary')

            for item in items:
                tweet = Tweet()

                author_names = item.xpath(
                    './/span[@class="username u-dir u-textTruncate"]/b/text()'
                ).extract()
                tweet['usernameTweet'] = author_names[0]

                tweet['lang'] = item.xpath('.//@lang').get()
                if tweet['lang'] not in {'en', 'und'}:
                    raise NameError('not support lang')

                tweet_ids = item.xpath('.//@data-tweet-id').extract()
                if not tweet_ids:
                    raise NameError('no ID')
                tweet['ID'] = tweet_ids[0]

                # Text nodes plus the alt text of inline images (emoji).
                text_parts = item.xpath(
                    './/div[@class="js-tweet-text-container"]/p//text()|.//div[@class="js-tweet-text-container"]/p//img/@alt'
                ).extract()
                tweet['text'] = ' '.join(text_parts).replace(
                    ' # ', '#').replace(' @ ', '@')

                emoji_alts = item.xpath(
                    './/div[@class="js-tweet-text-container"]/p//img/@alt'
                ).extract()
                tweet['emoji'] = ' '.join(emoji_alts)

                if tweet['text'] == '':
                    # A tweet without any text is rejected.
                    raise NameError('empty text')

                # Meta data.
                permalinks = item.xpath('.//@data-permalink-path').extract()
                tweet['url'] = permalinks[0]

                epoch = item.xpath(
                    './/small[@class="time"]/a/span/@data-time').extract()[0]
                posted_at = datetime.fromtimestamp(int(epoch))
                tweet['datetime'] = posted_at.strftime('%Y-%m-%d %H:%M:%S')

                # Photo cards.
                card_types = item.xpath('.//@data-card-type').extract()
                if card_types:
                    if card_types[0] == 'photo':
                        tweet['has_image'] = True
                        tweet['images'] = item.xpath(
                            './/*/div/@data-image-url').extract()
                    else:
                        logger.debug('Not handle "data-card-type":\n%s' %
                                     item.xpath('.').extract()[0])

                # Animated gifs and other card2 media.
                card2_types = item.xpath('.//@data-card2-type').extract()
                if card2_types:
                    card2 = card2_types[0]
                    if card2 == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath(
                            './/*/source/@video-src').extract()
                    elif card2 in card_url_types:
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath(
                            './/*/div/@data-card-url').extract()
                    elif card2 != '__entity_video':
                        # '__entity_video' is deliberately ignored (TODO);
                        # every other unknown card2 type is only logged.
                        logger.debug('Not handle "data-card2-type":\n%s' %
                                     item.xpath('.').extract()[0])

                reply_context = item.xpath(
                    './/div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = bool(reply_context)
                if tweet['is_reply']:
                    # Collects both the hrefs and the user ids of the reply targets.
                    tweet['reply_to'] = item.xpath(
                        './/div[@class="ReplyingToContextBelowAuthor"]//@href|.//div[@class="ReplyingToContextBelowAuthor"]//@data-user-id'
                    ).extract()

                retweet_marker = item.xpath(
                    './/span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = bool(retweet_marker)

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet
Exemplo n.º 7
0
    def parse_tweet_item(self, items):
        """Yield a ``Tweet`` per selector that matches the query and language.

        Each element of ``items`` is queried for author name, id, text
        (including image alt text), permalink, counters, timestamp, card
        media and reply/retweet flags. An element is skipped when it has no
        ``data-tweet-id``, when the first comma-separated term of
        ``self.query`` is non-empty and absent from the text, or when the
        top language reported by ``detect_langs`` differs from ``self.lang``.
        Extraction errors are logged and the element is dropped.
        """
        # (item field, action name) pairs for the per-tweet counters.
        counters = (
            ('nbr_retweet', 'retweet'),
            ('nbr_favorite', 'favorite'),
            ('nbr_reply', 'reply'),
        )
        # card2 types whose media URLs are read from ``data-card-url``.
        card_url_types = ('player', 'summary_large_image', 'amplify', 'summary')

        for item in items:
            try:
                tweet = Tweet()

                tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

                tweet_ids = item.xpath('.//@data-tweet-id').extract()
                if not tweet_ids:
                    continue
                tweet['ID'] = tweet_ids[0]

                ### get text content
                text_parts = item.xpath(
                    './/div[@class="js-tweet-text-container"]/p//text()|.//div[@class="js-tweet-text-container"]/p//img/@alt'
                ).extract()
                tweet['text'] = ' '.join(text_parts).replace(' # ', '#').replace(' @ ', '@')

                # NOTE: the search term is always the first entry of the query.
                search_term = self.query.split(',')[0]
                if search_term and search_term not in tweet['text']:
                    # If query is not empty and not in text, we ignore the tweet
                    continue

                if detect_langs(tweet['text'])[0].lang != self.lang:
                    # If language is not correctly detected, we ignore it
                    continue

                ### get meta data
                tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

                for field, action in counters:
                    count = item.css(
                        'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
                    ).xpath('@data-tweet-stat-count').extract()
                    # A missing counter is recorded as zero.
                    tweet[field] = int(count[0]) if count else 0

                # NOTE(review): fromtimestamp() without tz yields local time -- confirm intended.
                epoch = item.xpath(
                    './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[0]
                tweet['datetime'] = datetime.fromtimestamp(int(epoch)).strftime('%Y-%m-%d %H:%M:%S')

                ### get photo
                has_cards = item.xpath('.//@data-card-type').extract()
                if has_cards and has_cards[0] == 'photo':
                    tweet['has_image'] = True
                    tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
                elif has_cards:
                    logger.debug('Not handle "data-card-type":\n%s' % item.xpath('.').extract()[0])

                ### get animated_gif
                has_cards = item.xpath('.//@data-card2-type').extract()
                if has_cards:
                    card_type = has_cards[0]
                    if card_type == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                    elif card_type in card_url_types:
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                    elif card_type == '__entity_video':
                        pass  # TODO: video entities are not extracted yet
                    else:  # there are many other types of card2 !!!!
                        logger.debug('Not handle "data-card2-type":\n%s' % item.xpath('.').extract()[0])

                is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = is_reply != []

                is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = is_retweet != []

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet
            except Exception:
                # Not a bare ``except``: GeneratorExit (raised at ``yield`` when
                # the consumer closes the generator) must propagate, otherwise
                # the loop would yield again and trigger a RuntimeError.
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
Exemplo n.º 8
0
    def parse_tweet_item(self, items):
        """Yield a ``Tweet`` (and optionally a ``User``) per tweet selector.

        Each element of ``items`` is queried for author name, id, text,
        permalink, counters (read from the action buttons' text), timestamp,
        card media and reply/retweet flags. Elements without a
        ``data-tweet-id`` or with empty text are skipped. When
        ``self.crawl_user`` is truthy a ``User`` is yielded after the tweet.
        Extraction errors are logged and the element is dropped.
        """
        # (item field, XPath of the counter text) pairs.
        counter_xpaths = (
            ('nbr_retweet',
             './/button[@data-modal="ProfileTweet-retweet"]/span/span/text()'),
            ('nbr_favorite',
             './/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]/span/span/text()'),
            ('nbr_reply',
             './/button[@class="ProfileTweet-actionButton js-actionButton js-actionReply"]/span/span/text()'),
        )
        # card2 types whose media URLs are read from ``data-card-url``.
        card_url_types = ('player', 'summary_large_image', 'amplify',
                          'summary')

        for item in items:
            try:
                tweet = Tweet()
                tweet['usernameTweet'] = item.xpath(
                    './/span[@class="username u-dir"]/b/text()').extract()[0]
                tweet_ids = item.xpath('.//@data-tweet-id').extract()
                if not tweet_ids:
                    continue
                tweet['ID'] = tweet_ids[0]

                text_nodes = item.xpath(
                    './/div[@class="js-tweet-text-container"]/p//text()'
                ).extract()
                tweet['text'] = ' '.join(text_nodes).replace(
                    ' # ', '#').replace(' @ ', '@')
                logger.debug(tweet['text'])
                if tweet['text'] == '':
                    continue
                tweet['url'] = item.xpath(
                    './/@data-permalink-path').extract()[0]

                for field, xpath in counter_xpaths:
                    count = item.xpath(xpath).extract()
                    # A missing counter is recorded as zero.
                    # NOTE(review): int() fails on abbreviated text such as
                    # "1.2K"; such tweets end up in the error log -- confirm.
                    tweet[field] = int(count[0]) if count else 0

                # NOTE(review): fromtimestamp() without tz yields local time -- confirm intended.
                epoch = item.xpath(
                    './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
                ).extract()[0]
                tweet['datetime'] = datetime.fromtimestamp(
                    int(epoch)).strftime('%Y-%m-%d %H:%M:%S')

                has_cards = item.xpath('.//@data-card-type').extract()
                if has_cards and has_cards[0] == 'photo':
                    tweet['has_image'] = True
                    tweet['images'] = item.xpath(
                        './/*/div/@data-image-url').extract()
                elif has_cards:
                    logger.debug('Not handle "data-card-type":\n%s' %
                                 item.xpath('.').extract()[0])

                has_cards = item.xpath('.//@data-card2-type').extract()
                if has_cards:
                    card_type = has_cards[0]
                    if card_type == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath(
                            './/*/source/@video-src').extract()
                    elif card_type in card_url_types:
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath(
                            './/*/div/@data-card-url').extract()
                    elif card_type == '__entity_video':
                        pass  # video entities are not extracted
                    else:
                        logger.debug('error: "data-card2-type":\n%s' %
                                     item.xpath('.').extract()[0])

                is_reply = item.xpath(
                    './/div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = is_reply != []

                is_retweet = item.xpath(
                    './/span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = is_retweet != []

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet

                if self.crawl_user:
                    # Get the user info.
                    user = User()

                    user['ID'] = tweet['user_id']
                    user['name'] = item.xpath('.//@data-name').extract()[0]
                    user['screen_name'] = item.xpath(
                        './/@data-screen-name').extract()[0]
                    user['avatar'] = \
                        item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                    yield user
            except Exception:
                # Not a bare ``except``: GeneratorExit (raised at ``yield`` when
                # the consumer closes the generator) must propagate, otherwise
                # the loop would yield again and trigger a RuntimeError.
                logger.error("error :\n%s" % item.xpath('.').extract()[0])
    def parse_tweet_item(self, items, index):
        """Collect all tweets of ``items`` into one column-oriented ``Tweet``.

        A single ``Tweet`` is yielded whose fields are parallel lists with one
        entry per successfully parsed element: text, author name, permalink,
        retweet/like/comment counts, timestamp, ``Tickers[index]`` and the
        author's user id. Elements with empty text are skipped. An element
        whose extraction fails is logged and contributes to none of the lists.
        """
        tweet = Tweet()

        tweet['post_content'] = []
        tweet['user_name'] = []
        tweet['URL'] = []
        tweet['num_retweets'] = []
        tweet['num_likes'] = []
        tweet['num_comments'] = []
        tweet['timestamp'] = []
        tweet['symbols'] = []
        tweet['user_account'] = []

        # (item field, action name) pairs for the per-tweet counters.
        counters = (
            ('num_retweets', 'retweet'),
            ('num_likes', 'favorite'),
            ('num_comments', 'reply'),
        )

        for item in items:
            try:
                # Extract everything into a local row first so a failure
                # half-way through cannot leave the parallel lists with
                # different lengths.
                row = {}

                # NOTE(review): fromtimestamp() without tz yields local time -- confirm intended.
                epoch = item.xpath(
                    './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[0]
                row['timestamp'] = datetime.fromtimestamp(int(epoch)).strftime('%Y-%m-%d %H:%M:%S')

                ### get text content
                text = ''.join(
                    item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
                ).replace(' # ', '#').replace(' @ ', '@')
                if text == '':
                    continue
                row['post_content'] = text

                row['user_name'] = item.xpath(
                    './/span[@class="username u-dir u-textTruncate"]/b/text()').extract()[0]

                ### get meta data
                row['URL'] = item.xpath('.//@data-permalink-path').extract()[0]

                for field, action in counters:
                    count = item.css(
                        'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
                    ).xpath('@data-tweet-stat-count').extract()
                    # A missing counter is recorded as zero.
                    row[field] = int(count[0]) if count else 0

                row['symbols'] = Tickers[index]
                row['user_account'] = item.xpath('.//@data-user-id').extract()[0]
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
            else:
                # Only a fully extracted row is committed to the lists.
                for field, value in row.items():
                    tweet[field].append(value)
        yield tweet