def reparse_db(cls, session, args):
    """Reparse tweets that are already stored in the database.

    The rows to reparse are selected by the user-supplied
    ``--plain-sql`` query (its first column must be ``tweet.raw_id``).
    For each chunk of ``--window-size`` ids, derived rows in the tables
    listed in ``--delete-tables`` are deleted first, then the raw JSON
    of each tweet is re-run through ``Parser.bulk_parse_and_save``.

    Parameters
    ----------
    session : SQLAlchemy session used for all queries and commits.
    args : dict
        Parsed command line options (docopt style), read keys:
        ``--window-size``, ``--plain-sql``, ``--delete-tables``,
        ``--ignore-tables``.

    Raises
    ------
    ValueError
        If ``--delete-tables`` names a table without a registered
        DELETE statement.
    """
    def iter_rows_0(rs):
        """Yield column 0 of each row in result set `rs`."""
        for row in rs:
            yield row[0]

    parser = Parser()
    bucket_size = args['--window-size']
    plain_sql = args['--plain-sql']
    delete_tables = args['--delete-tables']
    ignore_tables = args['--ignore-tables']
    counter = 0
    # DELETE statements for derived tables, keyed by table name.  Each
    # statement removes the rows associated with the raw tweet ids
    # passed in via the :ids bind parameter (expanded with UNNEST).
    table_deletes_sql = dict(
        ass_tweet="""\
DELETE FROM ass_tweet AS atw
USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
WHERE tw.raw_id=t.raw_id AND atw.id=tw.id
""",
        ass_tweet_url="""\
DELETE FROM ass_tweet_url AS atu
USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
WHERE tw.raw_id=t.raw_id AND atu.id=tw.id
""",
        ass_tweet_hashtag="""\
DELETE FROM ass_tweet_hashtag AS ath
USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
WHERE tw.raw_id=t.raw_id AND ath.id=tw.id
""",
        twitter_network_edge="""\
DELETE FROM twitter_network_edge AS tne
USING UNNEST(:ids) AS t(raw_id)
WHERE tne.tweet_raw_id=t.raw_id
""")
    # Validate the requested tables up front so we fail before touching
    # the database at all.
    for tn in delete_tables:
        if tn not in table_deletes_sql:
            # BUG FIX: format the message.  The old code passed
            # ('...%s', tn) as two separate ValueError arguments, so
            # the table name was never interpolated.
            raise ValueError('Unsupported deletion of table %s' % tn)
    platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    rs = session.execute(text(plain_sql))
    affected_ids = [row[0] for row in rs]
    logger.info('Total number of tweets to reparse: %s', len(affected_ids))
    w_query = """\
SELECT tw.json_data AS jd
FROM UNNEST(:ids) AS t(raw_id)
JOIN tweet AS tw ON tw.raw_id=t.raw_id
"""
    for chunk in chunked_iterable(affected_ids, bucket_size):
        # Remove derived rows for this chunk before reparsing, so the
        # reparse does not duplicate them.
        for tn in delete_tables:
            del_tn = table_deletes_sql[tn]
            try:
                session.execute(text(del_tn).bindparams(ids=chunk))
                session.commit()
                logger.info('Table %s deleted successfully!', tn)
            except SQLAlchemyError as err:
                logger.exception(err)
                raise
        rs = session.execute(text(w_query).bindparams(ids=chunk))
        # Stream the json_data column lazily into the parser.
        jds = iter_rows_0(rs)
        parser.bulk_parse_and_save(jds, session, platform_id,
                                   multiprocesses=True,
                                   ignore_tables=ignore_tables)
        counter += len(chunk)
        logger.info('Current Number of reparsed tweets: %s', counter)
    logger.info('Total number of reparsed tweets: %s!', counter)
    logger.info('Reparse done, exit!')
def load_tweets(cls, session, args):
    """Load tweets from a (possibly compressed) JSON-lines file into
    the database.

    Each line of ``<filepath>`` is expected to be one tweet encoded as
    JSON.  Lines that are empty, not valid JSON, or not tweet-shaped
    (missing ``in_reply_to_status_id``, ``user`` or ``text``) are
    logged and skipped — or abort the process when
    ``--strict-on-error`` is set.  Valid tweets are parsed and saved
    in buckets of ``--window-size``.

    Parameters
    ----------
    session : SQLAlchemy session used to persist parsed tweets.
    args : dict
        Parsed command line options (docopt style), read keys:
        ``--number-of-tweets``, ``--strict-on-error``,
        ``--window-size``, ``<filepath>``.
    """
    parser = Parser()
    ntweets = args['--number-of-tweets']
    strict_on_error = args['--strict-on-error']
    true_counter = 0   # number of valid tweets collected
    counter = 0        # number of raw lines read
    jds = []           # current bucket of decoded tweet dicts
    f = xopen(args['<filepath>'])
    platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
    bucket_size = args['--window-size']
    try:
        for line in f:
            counter += 1
            if line:
                try:
                    jd = json.loads(line)
                    if 'in_reply_to_status_id' in jd and 'user' in jd and\
                            'text' in jd:
                        # BUG FIX: reuse the already-decoded object
                        # instead of calling json.loads(line) a second
                        # time on the same line.
                        jds.append(jd)
                        true_counter += 1
                    else:
                        logger.error('Not a tweet at line %s, raw data %r',
                                     counter, jd)
                        if strict_on_error:
                            sys.exit(1)
                        continue
                except Exception as e:
                    msg = 'JSON loads error at line %s: %r, raw data: %r'
                    logger.error(msg, counter, e, line)
                    if strict_on_error:
                        sys.exit(1)
                    continue
            else:
                logger.error('Empty line at line %s', counter)
            if ntweets is not None and ntweets == true_counter:
                logger.warning(
                    'Reaching the number of tweets %s at line %s',
                    ntweets, counter)
                # break the loop; the remainder is saved below
                break
            # Flush a full bucket.  The `jds` guard avoids needless
            # saves of an empty list when invalid/empty lines arrive
            # while true_counter is still 0 (0 % bucket_size == 0).
            if jds and true_counter % bucket_size == 0:
                logger.warning('Reading %s lines, %s tweets parsed',
                               counter, true_counter)
                parser.bulk_parse_and_save(jds, session, platform_id,
                                           multiprocesses=True)
                jds = []
    except Exception as err:
        logger.exception(err)
        logger.info('Saving successfully read tweets ...')
    finally:
        # BUG FIX: the file handle was never closed before.
        f.close()
    if jds:
        parser.bulk_parse_and_save(jds, session, platform_id,
                                   multiprocesses=True)
        jds = []
    logger.warning('Reading %s lines, %s tweets parsed', counter,
                   true_counter)