Example No. 1
    def parse_node(self, response, node):
        """Parse response into UrlItem."""
        item = UrlItem()
        if self.provider == 'self':
            link = node.xpath('link/text()').extract_first()
        elif self.provider == 'feedburner':
            link = node.xpath(
                '*[local-name()="origLink"]/text()').extract_first()
        else:
            logger.error('Unrecognized feed provider %r', self.provider)
            return
        date_published = node.xpath('pubDate/text()').extract_first()
        if link is not None:
            item['raw'] = link.strip()
            item['date_published'] = utc_from_str(date_published)
            yield item
        else:
            logger.error('Unexpected item: (%s, %s) from %r', link,
                         date_published, response.url)
            return
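
All of these examples lean on a `utc_from_str` helper that is never shown. A minimal sketch of what it might look like, assuming RSS-style date strings and the third-party python-dateutil package (an assumption, not the project's actual code):

    # Sketch of the assumed utc_from_str helper; the real one may differ.
    from datetime import timezone
    from dateutil import parser as dt_parser  # pip install python-dateutil

    def utc_from_str(s):
        """Parse a datetime string (e.g. an RSS pubDate) into a UTC datetime."""
        if s is None:
            return None
        dt = dt_parser.parse(s)
        if dt.tzinfo is None:
            # Assume naive timestamps are already UTC.
            return dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc)
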
Example No. 2
    def parse(self, jd):
        """The main parse function.

        Parameters
        ----------
        jd : json
            Tweet json data.

        Procedures
        ----------
        1) validate `jd`
        2) extract URLs and hashtags from `jd`
        3) insert into the database
        """
        logger.debug('Parsing one tweet, begin')
        #
        # validation
        #
        try:
            tw_raw_id = jd['id']
            created_at = utc_from_str(jd['created_at'])
            user_raw_id = jd['user']['id']
        except KeyError as e:
            logger.error('Invalid tweet: %s', e)
            return None
        #
        # extract url, hashtag and associated tweet status id
        #
        urls_set = set()
        hashtags_set = set()
        entities_list = []
        if 'entities' in jd:
            entities_list.append(jd['entities'])
        if 'quoted_status' in jd:
            q_jd = jd['quoted_status']
            if 'entities' in q_jd:
                entities_list.append(q_jd['entities'])
        if 'retweeted_status' in jd:
            re_jd = jd['retweeted_status']
            if 'entities' in re_jd:
                entities_list.append(re_jd['entities'])
            if 'quoted_status' in re_jd and\
                    'entities' in re_jd['quoted_status']:
                entities_list.append(re_jd['quoted_status']['entities'])
        for entities in entities_list:
            if entities:
                self._parse_entities(entities, urls_set, hashtags_set)
        # This tweet should contain urls
        if len(urls_set) == 0 and self.save_none_url_tweet is False:
            logger.debug('No url found in %s, ignore!', tw_raw_id)
            return None
        #
        # Insert into database
        #
        # creating user
        logger.debug('creating user')
        muser = get_or_create_m(self.session,
                                TwitterUser,
                                data=dict(raw_id=user_raw_id),
                                fb_uk='raw_id')
        # creating tweet
        logger.debug('creating tweet')
        mtweet = Tweet(raw_id=tw_raw_id,
                       json_data=jd,
                       created_at=created_at,
                       user_id=muser.id)
        self.session.add(mtweet)
        try:
            self.session.commit()
            logger.debug('Inserted tweet %r', tw_raw_id)
        except IntegrityError as e:
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            self.session.rollback()
            return
        # creating urls
        logger.debug('creating urls')
        for url in urls_set:
            murl = get_or_create_murl(self.session,
                                      data=dict(raw=url),
                                      platform_id=self.platform_id)
            self.session.add(AssTweetUrl(tweet_id=mtweet.id, url_id=murl.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                self.session.rollback()
        # creating hashtags
        logger.debug('creating hashtags')
        for hashtag in hashtags_set:
            mhashtag = get_or_create_m(self.session,
                                       Hashtag,
                                       data=dict(text=hashtag),
                                       fb_uk='text')
            self.session.add(
                AssTweetHashtag(tweet_id=mtweet.id, hashtag_id=mhashtag.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
                self.session.rollback()
        # parsing the associated tweet statuses
        q1 = """
        INSERT INTO ass_tweet (id, retweeted_status_id, quoted_status_id,
            in_reply_to_status_id)
        SELECT id,
            CAST(json_data#>>'{retweeted_status, id}' AS BIGINT),
            CAST(json_data#>>'{quoted_status, id}' AS BIGINT),
            CAST(json_data->>'in_reply_to_status_id' AS BIGINT)
        FROM tweet
        WHERE id=:tweet_id
        """
        q1 = text(q1).bindparams(tweet_id=mtweet.id)
        try:
            self.session.execute(q1)
            self.session.commit()
        except DataError as e:
            # Handle the \u0000 escape, which PostgreSQL json does not support
            logger.warning(e)
            self.session.rollback()
            q2 = r"""
            UPDATE tweet SET json_data=regexp_replace(
                        json_data::text, '\\u0000', '\\\\u0000', 'g')::json
            WHERE id=:tweet_id
            """
            q2 = text(q2).bindparams(tweet_id=mtweet.id)
            self.session.execute(q2)
            self.session.commit()
            logger.warning('json_data is updated (\\u0000 to \\\\u0000)')
            self.session.execute(q1)
            self.session.commit()
        logger.debug('Parsing one tweet, done.')
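
The `get_or_create_m` helper used above is also not shown. A hedged sketch of the SQLAlchemy get-or-create pattern it appears to implement, where `fb_uk` seems to name the fallback unique key (assumptions, not the project's code):

    from sqlalchemy.exc import IntegrityError

    def get_or_create_m(session, model, data, fb_uk):
        """Return the `model` row matching data[fb_uk], creating it if absent."""
        instance = session.query(model).filter_by(
            **{fb_uk: data[fb_uk]}).first()
        if instance is not None:
            return instance
        instance = model(**data)
        session.add(instance)
        try:
            session.commit()
        except IntegrityError:
            # A concurrent transaction inserted the same row first; re-query it.
            session.rollback()
            instance = session.query(model).filter_by(
                **{fb_uk: data[fb_uk]}).one()
        return instance
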
Example No. 3
    def search(self,
               query,
               n1=100,
               n2=100000,
               sort_by='relevant',
               use_lucene_syntax=False,
               min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            The number of results to finally return.
        n2 : int
            The number of search results to retrieve when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sort order used in the Lucene search.
        min_score_of_recent_sorting : float
            The minimum score when sorting by 'recent'.
        min_date_published : datetime
            The minimum date_published used to filter Lucene search results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits represents the total number
            of hits and df is a pandas.DataFrame object. df.columns = ['id',
            'canonical_url', 'title', 'date_published', 'domain', 'site_type',
            'score']
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, str):
                dt1 = utc_from_str(min_date_published)
            else:
                raise TypeError('min_date_published must be datetime or str')
            q_dates = self.query_between_dates(dt1, dt2)
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.warning(q)
            if 'date_published:' in query:
                end = query.find('AND date_published')
                q_without_date_published = query[:end]
                logger.warning(q_without_date_published)
                q = self.mul_parser.parse(self.mul_parser,
                                          q_without_date_published)
                date_published_splits = query.split('date_published:[')
                date_range = date_published_splits[-1]
                date_range = date_range[:-1]
                logger.warning(date_range)
                if 'TO' in date_range:
                    date_range_splits = date_range.split('TO')
                    dt1_string = date_range_splits[0]
                    # handle a wildcard ('*') in the date bound
                    if '*' in dt1_string:
                        date1_end = dt1_string.find('*') - 1
                        dt1_string = dt1_string[:date1_end]
                        logger.warning(dt1_string)
                    dt1 = utc_from_str(dt1_string)
                    dt2_string = date_range_splits[1]
                    if '*' in dt2_string:
                        date2_end = dt2_string.find('*') - 1
                        dt2_string = dt2_string[:date2_end]
                        logger.warning(dt2_string)
                    dt2 = utc_from_str(dt2_string)
                    query_dates = self.query_between_dates(dt1, dt2)
                    q = combine_queries(q, query_dates)
            if min_date_published is not None:
                q = combine_queries(q, q_dates)
            logger.warning('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with Lucene syntax; check your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')

        cnames = [
            'id', 'canonical_url', 'title', 'date_published', 'domain',
            'site_type', 'score'
        ]
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]

                # Index in each record of canonical URL and title
                canonical_url, title = 1, 2
                # Store 2-tuples of (site, article title) as keys in dict then
                # turn back to list
                unique_docs = dict()
                for record in records:
                    key = (record[canonical_url], record[title])
                    if key not in unique_docs:
                        unique_docs[key] = record
                # Include only unique records
                records = list(unique_docs.values())
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, n2, self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
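
Hypothetical usage of this method (the `searcher` instance and its construction are assumptions; only the signature comes from the example):

    total_hits, df = searcher.search(
        'climate AND date_published:[2016-10-01 TO 2016-10-31]',
        n1=50,
        sort_by='relevant',
        use_lucene_syntax=True)
    print('%d hits' % total_hits)
    if total_hits > 0:
        print(df[['canonical_url', 'date_published', 'score']].head())
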
Example No. 4
    def _parse_l2(self, jd):
        """Second Level parsing, the main function is to build parsed objects
        for the most complicate table:

            twitter_network_edge

        Parameters
        ---------
        jd: JSON
            A tweet JSON object.
        """
        #
        # Make sure the class members used below are reset
        #
        self.full_user = list()
        self.edges = set()

        # start parsing
        tweet_raw_id = jd['id']
        user_raw_id = jd['user']['id']
        user_screen_name = jd['user']['screen_name']
        self.created_at = utc_from_str(jd['created_at'])
        # add this user as full_user
        self.full_user.append(
            (user_raw_id, user_screen_name, jd['user']['followers_count'],
             jd['user'], self.created_at))
        quoted_status_id = None
        retweeted_status_id = None
        if 'quoted_status' in jd:
            quoted_user_id = jd['quoted_status']['user']['id']
            quoted_screen_name = jd['quoted_status']['user']['screen_name']
            quoted_status_id = jd['quoted_status']['id']
            self.full_user.append(
                (quoted_user_id, quoted_screen_name,
                 jd['quoted_status']['user']['followers_count'],
                 jd['quoted_status']['user'], self.created_at))
        if 'retweeted_status' in jd:
            retweeted_user_id = jd['retweeted_status']['user']['id']
            retweeted_screen_name = jd['retweeted_status']['user'][
                'screen_name']
            retweeted_status_id = jd['retweeted_status']['id']
            self.full_user.append(
                (retweeted_user_id, retweeted_screen_name,
                 jd['retweeted_status']['user']['followers_count'],
                 jd['retweeted_status']['user'], self.created_at))
        in_reply_to_status_id = jd['in_reply_to_status_id']
        in_reply_to_user_id = jd['in_reply_to_user_id']
        in_reply_to_screen_name = jd['in_reply_to_screen_name']
        if in_reply_to_user_id is not None and\
                in_reply_to_screen_name is not None:
            self.in_reply_to_user = (in_reply_to_user_id,
                                     in_reply_to_screen_name)
        self.ass_tweet = (tweet_raw_id, retweeted_status_id, quoted_status_id,
                          in_reply_to_status_id)
        #
        # Building edges
        #

        # 2-1) retweet, focusing on retweeted_status
        #               edge direction: from retweeted_user to current user
        if retweeted_status_id is not None:
            logger.debug('2-1-a) building edges for retweet=%s', tweet_raw_id)
            for u in self.urls['retweet']:
                self.edges.add((tweet_raw_id, retweeted_user_id, user_raw_id,
                                u, False, False, 'retweet'))
        # 2-2) reply, focusing on current status
        #             edges direction: from current user to mentions
        if in_reply_to_status_id is not None:
            logger.debug('2-1-b) building edges for reply=%s', tweet_raw_id)
            # in_reply_to_user, edge
            for url in self.urls['this']:
                self.edges.add((tweet_raw_id, user_raw_id, in_reply_to_user_id,
                                url, False, False, 'reply'))
            # mentions, edges
            for mention_id, mention_screen_name in self.mentions['this']:
                if mention_id != in_reply_to_user_id:
                    for url in self.urls['this']:
                        self.edges.add((tweet_raw_id, user_raw_id, mention_id,
                                        url, False, True, 'reply'))
        # 2-3) quote
        if quoted_status_id is not None:
            # 2-3-1) retweeted quote, focusing on quoted_status
            #                         treated as retweet edge
            if retweeted_status_id is not None:
                logger.debug(
                    '2-1-c) building edges for the quote of a retweet=%s',
                    tweet_raw_id)
                for url in self.urls['quote']:
                    self.edges.add((tweet_raw_id, retweeted_user_id,
                                    user_raw_id, url, True, False, 'retweet'))
            # 2-3-2) replied quote, focusing on quoted_status
            #                       treated as reply edge
            elif in_reply_to_status_id is not None:
                logger.debug(
                    '2-1-c) building edges for the quote of a reply=%s',
                    tweet_raw_id)
                # in_reply_to_user, edges for quoted url
                for url in self.urls['quote']:
                    self.edges.add(
                        (tweet_raw_id, user_raw_id, in_reply_to_user_id, url,
                         True, False, 'reply'))
                # mentions, edges for quoted url
                for mention_id, mention_screen_name in self.mentions['this']:
                    if mention_id != in_reply_to_user_id:
                        for url in self.urls['quote']:
                            self.edges.add(
                                (tweet_raw_id, user_raw_id, mention_id, url,
                                 True, True, 'reply'))
            # 2-3-3) pure quote
            else:
                logger.debug('2-1-c) Building edges for a pure quote=%s',
                             tweet_raw_id)
                # a. information edges: from quoted_user to this_user
                #                       for urls inputted by quoted user
                for url in self.urls['quote']:
                    self.edges.add((tweet_raw_id, quoted_user_id, user_raw_id,
                                    url, True, False, 'quote'))
                # b. information edges: from this_user to mentioned_users
                #                       of this_user
                #                       for both urls inputted by this user
                #                       and quoted_user
                for mention_id, mention_screen_name in self.mentions['this']:
                    for url in self.urls['quote']:
                        self.edges.add((tweet_raw_id, user_raw_id, mention_id,
                                        url, True, True, 'quote'))
                    for url in self.urls['this']:
                        self.edges.add((tweet_raw_id, user_raw_id, mention_id,
                                        url, False, True, 'quote'))
        # 2-4) original tweet
        if retweeted_status_id is None and in_reply_to_status_id is None\
                and quoted_status_id is None:
            logger.debug('2-1-d) building edges for original tweet=%s',
                         tweet_raw_id)
            for mention_id, mention_screen_name in self.mentions['this']:
                for url in self.urls['this']:
                    self.edges.add((tweet_raw_id, user_raw_id, mention_id, url,
                                    False, True, 'origin'))
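
Reading the code, the 7-field edge tuples all share one layout; a NamedTuple makes the positions explicit (the field names below are inferred, not from the source):

    from typing import NamedTuple

    class Edge(NamedTuple):
        tweet_raw_id: int
        from_raw_id: int
        to_raw_id: int
        url: str
        is_quoted_url: bool
        is_mention: bool
        tweet_type: str  # 'retweet', 'reply', 'quote', or 'origin'

Keeping the edges in a set, as `self.edges` does, deduplicates identical edges produced within a single tweet.
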
Example No. 5
    def search(self, query, n1=100, n2=100000,
               sort_by='relevant',
               use_lucene_syntax=False,
               min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            The number of results to finally return.
        n2 : int
            The number of search results to retrieve when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sort order used in the Lucene search.
        min_score_of_recent_sorting : float
            The minimum score when sorting by 'recent'.
        min_date_published : datetime
            The minimum date_published used to filter Lucene search results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits represents the total number
            of hits and df is a pandas.DataFrame object. df.columns = ['id',
            'canonical_url', 'title', 'date_published', 'domain', 'site_type',
            'score']
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, str):
                dt1 = utc_from_str(min_date_published)
            sf = self.prepare_chained_filter(dt1, dt2)
        else:
            sf = self.dup_filter
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.debug('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with Lucene syntax; check your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')

        cnames = ['id', 'canonical_url', 'title', 'date_published',
                  'domain', 'site_type', 'score']
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, sf, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, sf, n2,
                                                   self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
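
Hypothetical usage of the 'recent' sort path (again, `searcher` is an assumption): fetch up to 100 matches above the default score threshold, published since a given date.

    counter, df = searcher.search(
        'election',
        n1=100,
        sort_by='recent',
        min_date_published='2016-11-01')
    if counter > 0:
        print(df.sort_values('date_published', ascending=False).head())
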
Example No. 6
    def parse_new_one(self, jd, session, g_urls_map, g_uusers_set,
                      g_edges_set):
        # validate jd
        jd = replace_null_byte(jd)
        try:
            tw_raw_id = jd['id']
            created_at = utc_from_str(jd['created_at'])
            user_raw_id = jd['user']['id']
        except KeyError as e:
            logger.error('Invalid tweet: %s', e)
            return None
        # parsing, level 1
        l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
        if len(l_urls['union']) == 0 and self.save_none_url_tweet is False:
            logger.warning('Ignore tweet %r with no urls!', tw_raw_id)
            return None
        # saving, level 1
        logger.debug('Saving this user ...')
        muser = get_or_create_m(session,
                                TwitterUser,
                                data=dict(raw_id=user_raw_id),
                                fb_uk='raw_id')
        logger.debug('Saving this tweet ...')
        muser_id = muser.id
        mtweet = Tweet(raw_id=tw_raw_id,
                       json_data=jd,
                       created_at=created_at,
                       user_id=muser_id)
        session.add(mtweet)
        try:
            session.commit()
            logger.debug('Inserted tweet %r', tw_raw_id)
        except IntegrityError as e:
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            session.rollback()
            return None
        mtweet_id = mtweet.id
        logger.debug('Saving AssTweet ...')
        retweeted_status_id = None
        quoted_status_id = None
        if 'quoted_status' in jd:
            quoted_status_id = jd['quoted_status']['id']
        if 'retweeted_status' in jd:
            retweeted_status_id = jd['retweeted_status']['id']
        in_reply_to_status_id = jd['in_reply_to_status_id']
        session.add(
            AssTweet(id=mtweet_id,
                     retweeted_status_id=retweeted_status_id,
                     quoted_status_id=quoted_status_id,
                     in_reply_to_status_id=in_reply_to_status_id))
        try:
            session.commit()
        except IntegrityError as e:
            logger.warning(e)
            session.rollback()
        logger.debug('Saving urls ...')
        for u in l_urls['union']:
            if len(u) > MAX_URL_LEN:
                # URL too long to store; map it to a sentinel id
                murl_id = -1
            else:
                murl_id = get_or_create_murl(session,
                                             data=dict(raw=u),
                                             platform_id=self.platform_id).id
                # saving AssTweetUrl
                session.add(AssTweetUrl(tweet_id=mtweet_id, url_id=murl_id))
                try:
                    session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_url IntegrityError, see: %s', e)
                    session.rollback()
            g_urls_map[u] = murl_id
        # creating hashtags
        logger.debug('creating hashtags ...')
        for hashtag in l_hashtags['union']:
            mhashtag = get_or_create_m(session,
                                       Hashtag,
                                       data=dict(text=hashtag),
                                       fb_uk='text')
            session.add(
                AssTweetHashtag(tweet_id=mtweet_id, hashtag_id=mhashtag.id))
            try:
                session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
                session.rollback()
        # parsing, level 2
        self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                       g_edges_set)
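
The `replace_null_byte` helper is assumed here (Example No. 7 calls a two-argument variant, so the signatures differ across versions). A minimal sketch: PostgreSQL's json type rejects \u0000, so strip it from every string in the tweet object before insertion.

    def replace_null_byte(jd):
        """Recursively remove null bytes from all strings in a JSON object."""
        if isinstance(jd, str):
            return jd.replace('\x00', '')
        if isinstance(jd, list):
            return [replace_null_byte(v) for v in jd]
        if isinstance(jd, dict):
            return {k: replace_null_byte(v) for k, v in jd.items()}
        return jd
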
Example No. 7
    def parse(self, jd):
        """The main parse function.

        Parameters
        ----------
        jd : json
            Tweet json data.

        Procedures
        ----------
        1) parse roughly to validate `jd`
        2) parse carefully and insert into the database
        3) save other associations
        """
        logger.debug('Parsing one tweet, begin ...')
        #
        # 1) rough parsing to validate the tweet
        #
        # 1-1) parse necessary fields; if this fails, it is not a valid tweet
        logger.debug('Replacing null bytes if present ...')
        jd = replace_null_byte(jd, self.fp)
        logger.debug('1) Rough parsing ...')
        try:
            tw_raw_id = jd['id']
            created_at = utc_from_str(jd['created_at'])
            user_raw_id = jd['user']['id']
        except KeyError as e:
            logger.error('Invalid tweet: %s', e)
            return None
        # 1-2) rough parsing
        entities_list = []
        quoted_status_id = None
        retweeted_status_id = None
        if 'entities' in jd:
            entities_list.append(jd['entities'])
        if 'quoted_status' in jd:
            quoted_jd = jd['quoted_status']
            quoted_user_jd = jd['quoted_status']['user']
            quoted_status_id = quoted_jd['id']
            if 'entities' in quoted_jd:
                entities_list.append(quoted_jd['entities'])
        if 'retweeted_status' in jd:
            retweeted_jd = jd['retweeted_status']
            retweeted_user_jd = jd['retweeted_status']['user']
            retweeted_status_id = retweeted_jd['id']
            if 'entities' in retweeted_jd:
                entities_list.append(retweeted_jd['entities'])
        in_reply_to_status_id = jd['in_reply_to_status_id']
        in_reply_to_user_id = jd['in_reply_to_user_id']
        in_reply_to_screen_name = jd['in_reply_to_screen_name']

        urls_set = set()
        hashtags_set = set()
        mentions_set = set()
        for entities in entities_list:
            if entities:
                self._parse_entities(entities, urls_set, hashtags_set,
                                     mentions_set)
        # This tweet should contain urls
        if len(urls_set) == 0 and self.save_none_url_tweet is False:
            logger.warning('No url found in tweet %s, ignore!', tw_raw_id)
            return None
        #
        # 2) careful parsing and saving into the database
        #
        logger.debug('2) Careful parsing and saving ...')
        logger.debug('2-0) Saving twitter_user raw_id=%s ...', user_raw_id)
        muser = get_or_create_m(self.session,
                                TwitterUser,
                                data=dict(raw_id=user_raw_id),
                                fb_uk='raw_id')
        logger.debug('Saving this user into twitter_user_union as well ...')
        create_or_update_muser(
            self.session,
            data=dict(raw_id=user_raw_id,
                      screen_name=jd['user']['screen_name'],
                      followers_count=jd['user']['followers_count'],
                      profile=jd['user'],
                      updated_at=created_at))
        # creating tweet
        logger.debug('2-0) Saving tweet raw_id=%s ...', tw_raw_id)
        if self.saved_tweet is True:
            mtweet = self.session.query(Tweet).filter_by(
                raw_id=tw_raw_id).one()
        else:
            mtweet = Tweet(raw_id=tw_raw_id,
                           json_data=jd,
                           created_at=created_at,
                           user_id=muser.id)
            self.session.add(mtweet)
            try:
                self.session.commit()
                logger.debug('Inserted tweet %r', tw_raw_id)
            except IntegrityError as e:
                logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
                self.session.rollback()
                return None
        tweet_id = mtweet.id
        # Saving all urls and mapping the saved id
        url_map = dict()
        logger.debug('2-0) Saving all urls and associating with tweet...')
        for url in urls_set:
            murl = get_or_create_murl(self.session,
                                      data=dict(raw=url),
                                      platform_id=self.platform_id)
            url_map[url] = murl.id
            # saving ass_tweet_url
            if self.saved_tweet is False:
                self.session.add(
                    AssTweetUrl(tweet_id=tweet_id, url_id=url_map[url]))
                try:
                    self.session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_url IntegrityError, see: %s', e)
                    self.session.rollback()
        # 2-1) retweet, focusing on retweeted_status
        #               edge direction: from retweeted_user to current user
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-a) Saving the retweeted user into twitter_user_union ...')
            retweeted_user_id = retweeted_user_jd['id']
            retweeted_screen_name = retweeted_user_jd['screen_name']
            create_or_update_muser(
                self.session,
                data=dict(raw_id=retweeted_user_id,
                          screen_name=retweeted_screen_name,
                          followers_count=retweeted_user_jd['followers_count'],
                          profile=retweeted_user_jd,
                          updated_at=created_at))
            # the retweeted user was saved above and should be removed from mentions
            try:
                mentions_set.remove((retweeted_user_id, retweeted_screen_name))
            except KeyError as e:
                logger.warning('Tweet %r: retweeted user not in mentions',
                               tw_raw_id)
            logger.debug('2-1-a) Saving edges for retweet ...')
            self._save_edges(url_map,
                             retweeted_jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=retweeted_user_id,
                             to_raw_id=user_raw_id,
                             is_quoted_url=False,
                             is_mention=False,
                             tweet_type='retweet')
        # 2-2) reply, focusing on current status
        #             edges direction: from current user to mentions
        if in_reply_to_status_id is not None:
            # mentioned users would be saved later
            logger.debug('2-1-b) Saving edges for reply ...')
            # in_reply_to_user
            self._save_edges(url_map,
                             jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=user_raw_id,
                             to_raw_id=in_reply_to_user_id,
                             is_quoted_url=False,
                             is_mention=False,
                             tweet_type='reply')
            # mentions
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id and to_raw_id != in_reply_to_user_id:
                    self._save_edges(url_map,
                                     jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='reply')
        # 2-3) quote
        if quoted_status_id is not None:
            logger.debug(
                '2-1-c) Saving the quoted user into twitter_user_union ...')
            quoted_user_id = quoted_user_jd['id']
            quoted_screen_name = quoted_user_jd['screen_name']
            create_or_update_muser(
                self.session,
                data=dict(raw_id=quoted_user_id,
                          screen_name=quoted_screen_name,
                          followers_count=quoted_user_jd['followers_count'],
                          profile=quoted_user_jd,
                          updated_at=created_at))
            # 2-3-1) retweeted quote, focusing on quoted_status
            #                         treated as retweet edge
            if retweeted_status_id is not None:
                logger.debug(
                    '2-1-c) Saving edges for quoting part of retweet ...')
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=retweeted_user_jd['id'],
                                 to_raw_id=user_raw_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='retweet')
            # 2-3-2) replied quote, focusing on quoted_status
            #                       treated as reply edge
            elif in_reply_to_status_id is not None:
                logger.debug(
                    '2-1-c) Saving edges for quoting part of reply ...')
                # in_reply_to_user
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=in_reply_to_user_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='reply')
                # mentions
                for m in jd['entities']['user_mentions']:
                    to_raw_id = m.get('id')
                    if to_raw_id and to_raw_id != in_reply_to_user_id:
                        self._save_edges(url_map,
                                         quoted_jd['entities'],
                                         tweet_id,
                                         tw_raw_id,
                                         from_raw_id=user_raw_id,
                                         to_raw_id=to_raw_id,
                                         is_quoted_url=True,
                                         is_mention=True,
                                         tweet_type='reply')
            # 2-3-3) pure quote
            else:
                logger.debug(
                    '2-1-c) Saving edge for pure quote part of quote ...')
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=quoted_user_jd['id'],
                                 to_raw_id=user_raw_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='quote')
                logger.debug(
                    '2-1-c) Saving edges for original part of quote ...')
                for m in jd['entities']['user_mentions']:
                    to_raw_id = m.get('id')
                    if to_raw_id:
                        self._save_edges(url_map,
                                         jd['entities'],
                                         tweet_id,
                                         tw_raw_id,
                                         from_raw_id=user_raw_id,
                                         to_raw_id=to_raw_id,
                                         is_quoted_url=False,
                                         is_mention=True,
                                         tweet_type='quote')
        # 2-4) original tweet
        if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None and 'entities' in jd and\
            'user_mentions' in jd['entities']:
            logger.debug('2-1-d) Saving edges for original tweet ...')
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id:
                    self._save_edges(url_map,
                                     jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='origin')
        # saving all mentions ...
        logger.debug('3) Saving all mentions ...')
        # add the in_reply_to_user (skip when this is not a reply)
        if in_reply_to_user_id is not None:
            mentions_set.add((in_reply_to_user_id, in_reply_to_screen_name))
        for m_raw_id, m_screen_name in mentions_set:
            create_or_update_muser(self.session,
                                   data=dict(raw_id=m_raw_id,
                                             screen_name=m_screen_name,
                                             updated_at=created_at))
        # saving hashtags
        logger.debug('3) creating hashtags')
        if self.saved_tweet is False:
            for hashtag in hashtags_set:
                mhashtag = get_or_create_m(self.session,
                                           Hashtag,
                                           data=dict(text=hashtag),
                                           fb_uk='text')
                self.session.add(
                    AssTweetHashtag(tweet_id=tweet_id, hashtag_id=mhashtag.id))
                try:
                    self.session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_hashtag IntegrityError, see: %s',
                                 e)
                    self.session.rollback()
        # saving associate tweet
        logger.debug('3) Saving ass_tweet ...')
        if self.saved_tweet is False:
            create_m(self.session,
                     AssTweet,
                     data=dict(id=tweet_id,
                               retweeted_status_id=retweeted_status_id,
                               quoted_status_id=quoted_status_id,
                               in_reply_to_status_id=in_reply_to_status_id))
        logger.debug('Parsing one tweet, done.')
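
The `create_or_update_muser` upsert is also assumed. A hedged sketch, with `TwitterUserUnion` as an assumed model name; partial data (e.g. a bare mention with no profile) updates only the fields given:

    from sqlalchemy.exc import IntegrityError

    def create_or_update_muser(session, data):
        """Insert or update a twitter_user_union row keyed on raw_id."""
        muser = session.query(TwitterUserUnion).filter_by(
            raw_id=data['raw_id']).first()
        if muser is None:
            muser = TwitterUserUnion(**data)
            session.add(muser)
        else:
            # Overwrite with the newer snapshot, field by field.
            for k, v in data.items():
                setattr(muser, k, v)
        try:
            session.commit()
        except IntegrityError:
            session.rollback()
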