예제 #1
0
파일: sns.py 프로젝트: zjcom/hoaxy-backend
    def reparse_db(cls, session, args):
        """Re-parse tweets that are already stored in the database.

        The statement given in ``args['--plain-sql']`` is executed and the
        first column of every returned row is collected as a tweet raw id.
        The ids are then processed in chunks of ``args['--window-size']``:
        for each chunk the rows belonging to these tweets are deleted from
        every table named in ``args['--delete-tables']``, the stored
        ``json_data`` of the tweets is read back from the ``tweet`` table and
        handed to ``Parser.bulk_parse_and_save``.

        Parameters
        ----------
        session : object
            Database session; must provide ``execute`` and ``commit``.
        args : dict
            Command line arguments. The keys read are ``--window-size``,
            ``--plain-sql``, ``--delete-tables`` and ``--ignore-tables``.

        Raises
        ------
        ValueError
            If ``--delete-tables`` names a table for which no DELETE
            statement is defined. Raised before the database is touched.
        SQLAlchemyError
            Re-raised (after being logged) when a DELETE statement fails.
        """
        bucket_size = args['--window-size']
        plain_sql = args['--plain-sql']
        delete_tables = args['--delete-tables']
        ignore_tables = args['--ignore-tables']
        # DELETE statements keyed by table name; ``:ids`` is bound to the
        # raw ids of the chunk currently being reparsed.
        table_deletes_sql = {
            'ass_tweet': """\
            DELETE FROM ass_tweet AS atw
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND atw.id=tw.id
            """,
            'ass_tweet_url': """\
            DELETE FROM ass_tweet_url AS atu
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND atu.id=tw.id
            """,
            'ass_tweet_hashtag': """\
            DELETE FROM ass_tweet_hashtag AS ath
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND ath.id=tw.id
            """,
            'twitter_network_edge': """\
            DELETE FROM twitter_network_edge AS tne
            USING UNNEST(:ids) AS t(raw_id)
            WHERE tne.tweet_raw_id=t.raw_id
            """,
        }

        # Validate the requested tables first so that a bad argument fails
        # before any parser is built or any statement is executed.
        for tn in delete_tables:
            if tn not in table_deletes_sql:
                # Exceptions do not interpolate extra arguments the way
                # logging calls do, so the message is formatted explicitly.
                raise ValueError('Unsupported deletion of table %s' % tn)

        parser = Parser()
        platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
        rs = session.execute(text(plain_sql))
        affected_ids = [row[0] for row in rs]
        logger.info('Total number of tweets to reparse: %s', len(affected_ids))
        w_query = """\
        SELECT tw.json_data AS jd
        FROM UNNEST(:ids) AS t(raw_id)
        JOIN tweet AS tw ON tw.raw_id=t.raw_id
        """
        counter = 0
        for chunk in chunked_iterable(affected_ids, bucket_size):
            # Remove the previously derived rows of this chunk, committing
            # after each table.
            for tn in delete_tables:
                del_tn = table_deletes_sql[tn]
                try:
                    session.execute(text(del_tn).bindparams(ids=chunk))
                    session.commit()
                    logger.info('Table %s deleted successfully!', tn)
                except SQLAlchemyError as err:
                    logger.exception(err)
                    raise
            rs = session.execute(text(w_query).bindparams(ids=chunk))
            # Lazily yield the single selected column (the tweet JSON).
            jds = (row[0] for row in rs)
            parser.bulk_parse_and_save(jds,
                                       session,
                                       platform_id,
                                       multiprocesses=True,
                                       ignore_tables=ignore_tables)
            counter += len(chunk)
            logger.info('Current Number of reparsed tweets: %s', counter)
        logger.info('Total number of reparsed tweets: %s!', counter)
        logger.info('Reparse done, exit!')
예제 #2
0
파일: sns.py 프로젝트: zjcom/hoaxy-backend
 def load_tweets(cls, session, args):
     """Load tweets from a file of line-delimited JSON into the database.

     Each non-blank line of ``args['<filepath>']`` is decoded as JSON. A
     decoded object is accepted as a tweet when it contains the keys
     ``in_reply_to_status_id``, ``user`` and ``text``. Accepted tweets are
     buffered and passed to ``Parser.bulk_parse_and_save`` every
     ``args['--window-size']`` tweets; whatever remains in the buffer is
     saved once reading stops.

     Parameters
     ----------
     session : object
         Database session, forwarded to ``get_platform_id`` and
         ``Parser.bulk_parse_and_save``.
     args : dict
         Command line arguments. The keys read are ``<filepath>``,
         ``--number-of-tweets`` (``None`` means no limit),
         ``--strict-on-error`` and ``--window-size``.

     Notes
     -----
     When ``--strict-on-error`` is set, a line that cannot be decoded or
     that is not a tweet terminates the process via ``sys.exit(1)``. Any
     other exception raised while reading is logged, and the tweets
     buffered so far are still saved.
     """
     parser = Parser()
     ntweets = args['--number-of-tweets']
     strict_on_error = args['--strict-on-error']
     bucket_size = args['--window-size']
     platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
     true_counter = 0  # number of lines accepted as tweets
     counter = 0  # number of lines read (1-based line number)
     jds = []
     f = xopen(args['<filepath>'])
     try:
         for counter, line in enumerate(f, 1):
             # Lines read from a file keep their newline, so strip before
             # testing for emptiness; blank lines are reported and skipped.
             if not line.strip():
                 logger.error('Empty line at line %s', counter)
                 continue
             try:
                 jd = json.loads(line)
                 is_tweet = ('in_reply_to_status_id' in jd and
                             'user' in jd and 'text' in jd)
             except Exception as e:
                 msg = 'JSON loads error at line %s: %r, raw data: %r'
                 logger.error(msg, counter, e, line)
                 if strict_on_error:
                     sys.exit(1)
                 continue
             if not is_tweet:
                 logger.error('Not a tweet at line %s, raw data %r',
                              counter, jd)
                 if strict_on_error:
                     sys.exit(1)
                 continue
             # Reuse the already decoded object instead of parsing again.
             jds.append(jd)
             true_counter += 1
             if ntweets is not None and ntweets == true_counter:
                 logger.warning(
                     'Reaching the number of tweets %s at line %s', ntweets,
                     counter)
                 break
             # Only reached right after a tweet was buffered, so the batch
             # handed to the parser is never empty.
             if true_counter % bucket_size == 0:
                 logger.warning('Reading %s lines, %s tweets parsed',
                                counter, true_counter)
                 parser.bulk_parse_and_save(jds,
                                            session,
                                            platform_id,
                                            multiprocesses=True)
                 jds = []
     except Exception as err:
         logger.exception(err)
         logger.info('Saving successfully read tweets ...')
     finally:
         # Release the input file on every exit path, including sys.exit.
         # NOTE(review): assumes the object returned by xopen has close().
         f.close()
     if jds:
         parser.bulk_parse_and_save(jds,
                                    session,
                                    platform_id,
                                    multiprocesses=True)
         logger.warning('Reading %s lines, %s tweets parsed', counter,
                        true_counter)