def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str) -> None:
    """Convert one batch of raw messages and dump it to a messages JSON file.

    For every raw message this rewrites HipChat-style mentions, runs the
    content through html2text, resolves the recipient, lets the attachment
    handler process any attachment, and builds a message row.  The rows and
    the user-message rows derived from them are written with
    ``create_converted_data_files`` to ``/messages-NNNNNN.json`` under
    ``output_dir``, where the number comes from ``NEXT_ID('dump_file_id')``.

    A message is skipped (not written) when its converted content is longer
    than 10000 characters or when ``get_recipient_id`` raises ``KeyError``.

    Args:
        realm_id: Realm id forwarded to the attachment handler.
        raw_messages: Raw message dicts; the keys read here are
            ``mention_user_ids``, ``content``, ``date_sent``, ``sender_id``,
            ``attachment`` and ``files_dir``.
        subscriber_map: Forwarded to ``make_user_messages``.
        user_id_mapper: Used (via ``has``/``get``) to translate the raw
            mentioned user ids; ids it does not know are dropped.
        user_handler: Source of the user dicts used to rewrite mentions.
        attachment_handler: Processes each message's attachment data and
            returns extra content to append (if any).
        get_recipient_id: Maps a raw message to its recipient id; a
            ``KeyError`` from it causes the message to be skipped.
        is_pm_data: When true the topic is empty, otherwise it is
            ``'imported from hipchat'``; also forwarded to
            ``make_user_messages``.
        output_dir: Directory passed to ``create_converted_data_files``.
    """

    def fix_mentions(content: str,
                     mention_user_ids: Set[int]) -> str:
        """Rewrite ``@short_name`` mentions as ``@**full_name**``."""
        replacements = []
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            hipchat_mention = '@{short_name}'.format(**user)
            if hipchat_mention == '@':
                # An empty short name would match every bare at-sign in the
                # message (e-mail addresses included), so leave it alone.
                continue
            zulip_mention = '@**{full_name}**'.format(**user)
            replacements.append((hipchat_mention, zulip_mention))

        # Replacement is plain substring matching, so apply the longest
        # handle first: otherwise '@bob' could clobber part of '@bobby'.
        # The secondary key makes the order independent of set iteration.
        replacements.sort(key=lambda pair: (-len(pair[0]), pair[0]))
        for hipchat_mention, zulip_mention in replacements:
            content = content.replace(hipchat_mention, zulip_mention)

        content = content.replace('@here', '@**all**')
        return content

    mention_map: Dict[int, Set[int]] = dict()

    zerver_message = []

    # Imported lazily, as in the original code, so that merely loading this
    # module does not require html2text.
    import html2text
    h = html2text.HTML2Text()

    for raw_message in raw_messages:
        # One side effect here: an id is drawn even if the message ends up
        # being skipped below.
        message_id = NEXT_ID('message')

        mention_user_ids = {
            user_id_mapper.get(raw_user_id)
            for raw_user_id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(raw_user_id)
        }
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']

        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug(
                "Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        topic_name = '' if is_pm_data else 'imported from hipchat'
        user_id = raw_message['sender_id']

        # Another side effect: the attachment handler records the
        # attachment and hands back content to append to the message.
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )

        has_attachment = bool(extra_content)
        if has_attachment:
            content += '\n' + extra_content

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=has_attachment,
        )
        zerver_message.append(message)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id')
    message_file = "/messages-%06d.json" % (dump_file_id, )
    create_converted_data_files(message_json, output_dir, message_file)
def do_convert_data(input_tar_file: str,
                    output_dir: str,
                    masking_content: bool,
                    api_token: Optional[str] = None,
                    slim_mode: bool = False) -> None:
    """Convert an exported tarball into realm/message data files and tar them.

    The steps run in a fixed order: unpack ``input_tar_file``, convert
    users, convert rooms into streams, build recipients and subscriptions,
    write emoticon data, write message data for each message kind, write
    ``/realm.json``, write avatar and attachment info, and finally create
    ``output_dir + '.tar.gz'`` with the ``tar`` command.

    Args:
        input_tar_file: Path handed to ``untar_input_file``.
        output_dir: Directory that receives the converted files and that is
            archived at the end.
        masking_content: Forwarded to ``write_message_data``.
        api_token: Forwarded to ``convert_room_data``.  When it is ``None``,
            stream subscriptions are the public ones (from
            ``build_public_stream_subscriptions``, or none in slim mode)
            plus those built for invite-only streams; otherwise they are
            built for all streams from the subscriber handler.
        slim_mode: Forwarded to ``convert_user_data`` and
            ``write_message_data``; also suppresses public stream
            subscriptions when ``api_token`` is ``None``.

    Raises:
        subprocess.CalledProcessError: If the ``tar`` command fails.
    """
    export_dir = untar_input_file(input_tar_file)

    attachment_handler = AttachmentHandler()
    user_handler = UserHandler()
    subscriber_handler = SubscriberHandler()
    user_id_mapper = IdMapper()
    stream_id_mapper = IdMapper()

    realm_id = 0
    realm = make_realm(realm_id=realm_id)

    # users.json -> UserProfile
    raw_user_data = read_user_data(data_dir=export_dir)
    convert_user_data(
        raw_data=raw_user_data,
        realm_id=realm_id,
        slim_mode=slim_mode,
        user_handler=user_handler,
        user_id_mapper=user_id_mapper,
    )
    # zerver_userprofile is deliberately not stored on the realm yet,
    # because more users may be added later.
    regular_users = user_handler.get_normal_users()

    # streams.json -> Stream
    raw_room_data = read_room_data(data_dir=export_dir)
    streams = convert_room_data(
        raw_data=raw_room_data,
        realm_id=realm_id,
        api_token=api_token,
        subscriber_handler=subscriber_handler,
        stream_id_mapper=stream_id_mapper,
        user_id_mapper=user_id_mapper,
    )
    realm['zerver_stream'] = streams

    recipients = build_recipients(
        zerver_userprofile=regular_users,
        zerver_stream=streams,
    )
    realm['zerver_recipient'] = recipients

    if api_token is not None:
        stream_subs = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=recipients,
            zerver_stream=streams,
        )
    else:
        public_subs: List[ZerverFieldsT] = (
            []
            if slim_mode
            else build_public_stream_subscriptions(
                zerver_userprofile=regular_users,
                zerver_recipient=recipients,
                zerver_stream=streams,
            )
        )
        invite_only_streams = [
            stream for stream in streams if stream['invite_only']
        ]
        private_subs = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=recipients,
            zerver_stream=invite_only_streams,
        )
        stream_subs = public_subs + private_subs

    personal_subs = build_personal_subscriptions(
        zerver_recipient=recipients,
    )
    subscriptions = personal_subs + stream_subs
    realm['zerver_subscription'] = subscriptions

    realm['zerver_realmemoji'] = write_emoticon_data(
        realm_id=realm_id,
        data_dir=export_dir,
        output_dir=output_dir,
    )

    subscriber_map = make_subscriber_map(
        zerver_subscription=subscriptions,
    )

    logging.info('Start importing message data')
    # Everything except the message key is identical for each message kind.
    shared_message_args = dict(
        realm_id=realm_id,
        slim_mode=slim_mode,
        zerver_recipient=recipients,
        subscriber_map=subscriber_map,
        data_dir=export_dir,
        output_dir=output_dir,
        masking_content=masking_content,
        stream_id_mapper=stream_id_mapper,
        user_id_mapper=user_id_mapper,
        user_handler=user_handler,
        attachment_handler=attachment_handler,
    )
    for message_key in ('UserMessage',
                        'NotificationMessage',
                        'PrivateUserMessage'):
        write_message_data(message_key=message_key, **shared_message_args)

    # Order is important here...don't write users until
    # we process everything else, since we may introduce
    # mirror users when processing messages.
    realm['zerver_userprofile'] = user_handler.get_all_users()
    realm['sort_by_date'] = True

    create_converted_data_files(realm, output_dir, '/realm.json')

    logging.info('Start importing avatar data')
    write_avatar_data(
        raw_user_data=raw_user_data,
        output_dir=output_dir,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
    )

    attachment_handler.write_info(
        output_dir=output_dir,
        realm_id=realm_id,
    )

    logging.info('Start making tarball')
    tar_command = ["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P']
    subprocess.check_call(tar_command)
    logging.info('Done making tarball')
def do_convert_data(input_tar_file: str, output_dir: str) -> None:
    """Convert an exported tarball into realm/message data files and tar them.

    The steps run in a fixed order: unpack ``input_tar_file``, convert
    users, convert rooms into streams, build recipients and subscriptions,
    write emoticon data, write message data for each message kind, write
    ``/realm.json``, write avatar and attachment info, and finally create
    ``output_dir + '.tar.gz'`` with the ``tar`` command.

    NOTE(review): a definition with this same name but additional
    parameters appears earlier in this file.  If both live at module level,
    this later one is the binding callers get -- confirm which is intended.

    Args:
        input_tar_file: Path handed to ``untar_input_file``.
        output_dir: Directory that receives the converted files and that is
            archived at the end.

    Raises:
        subprocess.CalledProcessError: If the ``tar`` command fails.
    """
    export_dir = untar_input_file(input_tar_file)

    attachment_handler = AttachmentHandler()
    user_handler = UserHandler()

    realm_id = 0
    realm = make_realm(realm_id=realm_id)

    # users.json -> UserProfile
    raw_user_data = read_user_data(data_dir=export_dir)
    convert_user_data(
        raw_data=raw_user_data,
        realm_id=realm_id,
        user_handler=user_handler,
    )
    # zerver_userprofile is deliberately not stored on the realm yet,
    # because more users may be added later.
    regular_users = user_handler.get_normal_users()

    # streams.json -> Stream
    raw_room_data = read_room_data(data_dir=export_dir)
    streams = convert_room_data(
        raw_data=raw_room_data,
        realm_id=realm_id,
    )
    realm['zerver_stream'] = streams

    recipients = build_recipients(
        zerver_userprofile=regular_users,
        zerver_stream=streams,
    )
    realm['zerver_recipient'] = recipients

    subscriptions = build_subscriptions(
        zerver_userprofile=regular_users,
        zerver_recipient=recipients,
        zerver_stream=streams,
    )
    realm['zerver_subscription'] = subscriptions

    realm['zerver_realmemoji'] = write_emoticon_data(
        realm_id=realm_id,
        data_dir=export_dir,
        output_dir=output_dir,
    )

    logging.info('Start importing message data')
    # Everything except the message key is identical for each message kind.
    shared_message_args = dict(
        realm_id=realm_id,
        zerver_recipient=recipients,
        zerver_subscription=subscriptions,
        data_dir=export_dir,
        output_dir=output_dir,
        user_handler=user_handler,
        attachment_handler=attachment_handler,
    )
    for message_key in ('UserMessage', 'PrivateUserMessage'):
        write_message_data(message_key=message_key, **shared_message_args)

    # Order is important here...don't write users until
    # we process everything else, since we may introduce
    # mirror users when processing messages.
    realm['zerver_userprofile'] = user_handler.get_all_users()

    create_converted_data_files(realm, output_dir, '/realm.json')

    logging.info('Start importing avatar data')
    write_avatar_data(
        raw_user_data=raw_user_data,
        output_dir=output_dir,
        realm_id=realm_id,
    )

    attachment_handler.write_info(
        output_dir=output_dir,
        realm_id=realm_id,
    )

    logging.info('Start making tarball')
    tar_command = ["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P']
    subprocess.check_call(tar_command)
    logging.info('Done making tarball')