def bulk_import_user_message_data(data: TableData, dump_file_id: int) -> None:
    """Insert the ``zerver_usermessage`` rows from ``data`` in batches.

    Args:
        data: Mapping indexed by table name; only the
            ``'zerver_usermessage'`` entry is read.  Each row is expected
            to provide ``user_profile_id``, ``message_id`` and ``flags``.
        dump_file_id: Identifier of the dump file being imported; it is
            only used in the final log message.
    """
    model = UserMessage
    table = 'zerver_usermessage'
    lst = data[table]

    def process_batch(items: List[Dict[str, Any]]) -> None:
        # Convert each row into a UserMessageLite and insert the whole
        # batch with a single bulk_insert_ums call.
        ums = [
            UserMessageLite(
                user_profile_id=item['user_profile_id'],
                message_id=item['message_id'],
                flags=item['flags'],
            )
            for item in items
        ]
        bulk_insert_ums(ums)

    chunk_size = 10000

    process_list_in_batches(
        lst=lst,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

    # Hand the values to logging as arguments so the message is only
    # formatted when the record is actually emitted.
    logging.info("Successfully imported %s from %s[%s].", model, table, dump_file_id)
def bulk_import_user_message_data(data: TableData, dump_file_id: int) -> None:
    """Insert the ``zerver_usermessage`` rows from ``data`` in batches.

    Args:
        data: Mapping indexed by table name; only the
            ``'zerver_usermessage'`` entry is read.  Each row is expected
            to provide ``user_profile_id``, ``message_id`` and ``flags``.
        dump_file_id: Identifier of the dump file being imported; it is
            only used in the final log message.
    """
    model = UserMessage
    table = 'zerver_usermessage'
    lst = data[table]

    # IMPORTANT NOTE: We do not use any primary id
    # data from either the import itself or ID_MAP.
    # We let the DB itself generate ids.  Note that
    # no tables use user_message.id as a foreign key,
    # so we can safely avoid all re-mapping complexity.

    def process_batch(items: List[Dict[str, Any]]) -> None:
        # Convert each row into a UserMessageLite (no id is passed, see
        # the note above) and insert the batch with bulk_insert_ums.
        ums = [
            UserMessageLite(
                user_profile_id=item['user_profile_id'],
                message_id=item['message_id'],
                flags=item['flags'],
            )
            for item in items
        ]
        bulk_insert_ums(ums)

    chunk_size = 10000

    process_list_in_batches(
        lst=lst,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

    # Hand the values to logging as arguments so the message is only
    # formatted when the record is actually emitted.
    logging.info("Successfully imported %s from %s[%s].", model, table, dump_file_id)
def process_message_file(realm_id: int,
                         fn: str,
                         fn_id: int,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         zerver_subscription: List[ZerverFieldsT],
                         data_dir: str,
                         output_dir: str,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Load one JSON message file and process its messages in batches.

    Args:
        realm_id: Forwarded to ``process_raw_message_batch``.
        fn: Path of the JSON file to read.
        fn_id: Stored as ``fn_id`` on every raw message.
        files_dir: Stored as ``files_dir`` on every raw message.
        get_recipient_id: Forwarded to ``process_raw_message_batch``.
        message_key: Key under which each top-level entry of the file
            holds its message; entries without this key are ignored.
        zerver_subscription: Forwarded to ``process_raw_message_batch``.
        data_dir: Not used by this function.
        output_dir: Forwarded to ``process_raw_message_batch``.
        user_handler: Forwarded to ``process_raw_message_batch``.
        attachment_handler: Forwarded to ``process_raw_message_batch``.
    """

    def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
        # Use a context manager so the file handle is closed as soon as
        # the JSON has been parsed, even if parsing raises.
        with open(fn) as f:
            data = json.load(f)

        flat_data = [
            d[message_key]
            for d in data
            if message_key in d
        ]

        return [
            dict(
                fn_id=fn_id,
                sender_id=d['sender']['id'],
                # 'receiver' may be absent; then receiver_id is None.
                receiver_id=d.get('receiver', {}).get('id'),
                content=d['message'],
                mention_user_ids=d['mentions'],
                pub_date=str_date_to_float(d['timestamp']),
                attachment=d['attachment'],
                files_dir=files_dir,
            )
            for d in flat_data
        ]

    raw_messages = get_raw_messages(fn)

    def process_batch(lst: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            zerver_subscription=zerver_subscription,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            output_dir=output_dir,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
def process_message_file(realm_id: int,
                         slim_mode: bool,
                         fn: str,
                         fn_id: str,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         subscriber_map: Dict[int, Set[int]],
                         data_dir: str,
                         output_dir: str,
                         is_pm_data: bool,
                         masking_content: bool,
                         user_id_mapper: IdMapper,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Read one JSON message file, build raw messages and process them in batches.

    Entries of the file that lack ``message_key`` are ignored.  A message
    is dropped when ``get_hipchat_sender_id`` returns None, or, for PM
    data, when the sender does not match ``fn_id``.  When
    ``masking_content`` is set, ASCII letters in the content are replaced
    by ``x``/``X``.  ``data_dir`` is not used by this function.
    """

    def build_raw_message(entry: Dict[str, Any]) -> Optional[ZerverFieldsT]:
        sender_id = get_hipchat_sender_id(
            realm_id=realm_id,
            slim_mode=slim_mode,
            message_dict=entry,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
        )
        if sender_id is None:
            return None

        # We need to compare with str() on both sides here.
        # In Stride, user IDs are strings, but in HipChat,
        # they are integers, and fn_id is always a string.
        #
        # PMs are in multiple places in the Hipchat export,
        # and we only use the copy from the sender.
        if is_pm_data and str(sender_id) != str(fn_id):
            return None

        text = entry['message']
        if masking_content:
            text = re.sub('[a-z]', 'x', text)
            text = re.sub('[A-Z]', 'X', text)

        return {
            'fn_id': fn_id,
            'sender_id': sender_id,
            'receiver_id': entry.get('receiver', {}).get('id'),
            'content': text,
            'mention_user_ids': entry.get('mentions', []),
            'date_sent': str_date_to_float(entry['timestamp']),
            'attachment': entry.get('attachment'),
            'files_dir': files_dir,
        }

    def load_raw_messages(path: str) -> List[ZerverFieldsT]:
        with open(path) as json_file:
            entries = ujson.load(json_file)

        keyed = [entry[message_key] for entry in entries if message_key in entry]

        # Build lazily and keep only the messages that were not dropped.
        candidates = (build_raw_message(entry) for entry in keyed)
        return [candidate for candidate in candidates if candidate is not None]

    def handle_batch(batch: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
        )

    process_list_in_batches(
        lst=load_raw_messages(fn),
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_posts(
    num_teams: int,
    team_name: str,
    realm_id: int,
    post_data: List[Dict[str, Any]],
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    subscriber_map: Dict[int, Set[int]],
    output_dir: str,
    is_pm_data: bool,
    masking_content: bool,
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    mattermost_data_dir: str,
) -> None:
    """Turn the posts of ``team_name`` into raw messages and process them in batches.

    Each selected post is converted to a raw message dict, immediately
    followed by its replies.  Replies are given the parent's ``channel``
    (or ``channel_members``) in place before conversion.  ``num_teams``
    is not used by this function.
    """
    # Mattermost doesn't specify a team for private messages
    # in its export format.  This line of code requires that
    # we only be importing data from a single team (checked
    # elsewhere) -- we just assume it's the target team.
    team_posts = [
        post for post in post_data if post.get("team", team_name) == team_name
    ]

    def to_raw_message(post: Dict[str, Any]) -> Dict[str, Any]:
        username = post["user"]
        sender_id = user_id_mapper.get(username)

        text = post["message"]
        if masking_content:
            text = re.sub("[a-z]", "x", text)
            text = re.sub("[A-Z]", "X", text)

        # A missing or falsy "reactions" value becomes an empty list.
        reactions = post.get("reactions") or []

        raw: Dict[str, Any] = dict(
            sender_id=sender_id,
            content=text,
            date_sent=int(post["create_at"] / 1000),
            reactions=reactions,
        )

        if "channel" in post:
            raw["channel_name"] = post["channel"]
        elif "channel_members" in post:
            # This case is for handling posts from PMs and huddles, not channels.
            # PMs and huddles are known as direct_channels in Slack and hence
            # the name channel_members.
            members = post["channel_members"]
            member_count = len(members)
            if member_count > 2:
                raw["huddle_name"] = generate_huddle_name(members)
            elif member_count == 2:
                raw["pm_members"] = members
        else:
            raise AssertionError("Post without channel or channel_members key.")

        if post.get("attachments"):
            raw["attachments"] = post["attachments"]

        return raw

    raw_messages = []
    for post in team_posts:
        raw_messages.append(to_raw_message(post))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post["replies"]
        if replies is None:
            continue

        for reply in replies:
            if "channel" in post:
                reply["channel"] = post["channel"]
            else:  # nocoverage
                reply["channel_members"] = post["channel_members"]
            raw_messages.append(to_raw_message(reply))

    def handle_batch(batch: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id_from_receiver_name=get_recipient_id_from_receiver_name,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            mattermost_data_dir=mattermost_data_dir,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_posts(team_name: str,
                  realm_id: int,
                  post_data: List[Dict[str, Any]],
                  get_recipient_id: Callable[[ZerverFieldsT], int],
                  subscriber_map: Dict[int, Set[int]],
                  output_dir: str,
                  is_pm_data: bool,
                  masking_content: bool,
                  user_id_mapper: IdMapper,
                  user_handler: UserHandler,
                  username_to_user: Dict[str, Dict[str, Any]],
                  zerver_realmemoji: List[Dict[str, Any]],
                  total_reactions: List[Dict[str, Any]]) -> None:
    """Turn the posts of ``team_name`` into raw messages and process them in batches.

    Every post must carry a ``team`` key; only posts whose team equals
    ``team_name`` are used.  Replies are given the parent's ``channel``
    in place and appended right after the parent.  ``username_to_user``
    is not used by this function.
    """
    team_posts = [post for post in post_data if post["team"] == team_name]

    def to_raw_message(post: Dict[str, Any]) -> Dict[str, Any]:
        sender_id = user_id_mapper.get(post["user"])

        text = post['message']
        if masking_content:
            text = re.sub('[a-z]', 'x', text)
            text = re.sub('[A-Z]', 'X', text)

        # A missing or falsy "reactions" value becomes an empty list.
        reactions = post.get("reactions") or []

        return {
            'sender_id': sender_id,
            'receiver_id': post["channel"],
            'content': text,
            'pub_date': int(post['create_at'] / 1000),
            'reactions': reactions,
        }

    raw_messages = []
    for post in team_posts:
        raw_messages.append(to_raw_message(post))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post["replies"]
        if replies is None:
            continue

        for reply in replies:
            reply["channel"] = post["channel"]
            raw_messages.append(to_raw_message(reply))

    def handle_batch(batch: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_posts(num_teams: int,
                  team_name: str,
                  realm_id: int,
                  post_data: List[Dict[str, Any]],
                  get_recipient_id_from_receiver_name: Callable[[str, int], int],
                  subscriber_map: Dict[int, Set[int]],
                  output_dir: str,
                  is_pm_data: bool,
                  masking_content: bool,
                  user_id_mapper: IdMapper,
                  user_handler: UserHandler,
                  username_to_user: Dict[str, Dict[str, Any]],
                  zerver_realmemoji: List[Dict[str, Any]],
                  total_reactions: List[Dict[str, Any]]) -> None:
    """Turn the posts of ``team_name`` into raw messages and process them in batches.

    Posts without a ``team`` key are treated as belonging to
    ``team_name``.  Replies are given the parent's ``channel`` in place
    and appended right after the parent.  ``num_teams`` and
    ``username_to_user`` are not used by this function.
    """
    # Mattermost doesn't specify a team for private messages
    # in its export format.  This line of code requires that
    # we only be importing data from a single team (checked
    # elsewhere) -- we just assume it's the target team.
    team_posts = [
        post for post in post_data if post.get("team", team_name) == team_name
    ]

    def to_raw_message(post: Dict[str, Any]) -> Dict[str, Any]:
        sender_id = user_id_mapper.get(post["user"])

        text = post['message']
        if masking_content:
            text = re.sub('[a-z]', 'x', text)
            text = re.sub('[A-Z]', 'X', text)

        # A missing or falsy "reactions" value becomes an empty list.
        reactions = post.get("reactions") or []

        return {
            'sender_id': sender_id,
            'receiver_id': post["channel"],
            'content': text,
            'date_sent': int(post['create_at'] / 1000),
            'reactions': reactions,
        }

    raw_messages = []
    for post in team_posts:
        raw_messages.append(to_raw_message(post))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post["replies"]
        if replies is None:
            continue

        for reply in replies:
            reply["channel"] = post["channel"]
            raw_messages.append(to_raw_message(reply))

    def handle_batch(batch: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id_from_receiver_name=get_recipient_id_from_receiver_name,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    room_id_to_room_map: Dict[str, Dict[str, Any]],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them in batches.

    Messages whose ``t`` field is not None are skipped.  Every other
    message is converted by ``message_to_dict`` (sender, content,
    timestamp, reactions, recipient, topic, user mentions, channel
    mentions and uploaded file), and the resulting list is passed to
    ``process_raw_message_batch`` in chunks of 1000.

    A channel mention whose id is found in neither ``room_id_to_room_map``
    nor ``dsc_id_to_dsc_map`` is logged and skipped.
    """
    # Function-scope import so this block does not depend on the module
    # having imported logging already.
    import logging

    def list_reactions(reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # List of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        reactions_list: List[Dict[str, Any]] = []
        for react_code in reactions:
            # The name is taken from between the colons of the reaction code.
            name = react_code.split(":")[1]
            usernames = reactions[react_code]["usernames"]

            for username in usernames:
                rc_user_id = username_to_user_id_map[username]
                user_id = user_id_mapper.get(rc_user_id)
                reactions_list.append({"name": name, "user_id": user_id})

        return reactions_list

    def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        if message.get("reactions"):
            reactions = list_reactions(message["reactions"])
        else:
            reactions = []

        message_dict = dict(
            sender_id=sender_id,
            content=content,
            date_sent=int(message["ts"].timestamp()),
            reactions=reactions,
            has_link=bool(message.get("urls")),
        )

        # Add recipient_id and topic to message_dict
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_channel_id = message["rid"]
            if rc_channel_id in huddle_id_to_huddle_map:
                huddle_id = huddle_id_mapper.get(rc_channel_id)
                message_dict["recipient_id"] = huddle_id_to_recipient_id[huddle_id]
            else:
                # The recipient is whichever of the two members is not the sender.
                rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
                if rc_sender_id == rc_member_ids[0]:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[1])
                    message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
                else:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[0])
                    message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
            # PMs and huddles don't have topics, but topic_name field is required in `build_message`.
            message_dict["topic_name"] = ""
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion
            dsc_channel = dsc_id_to_dsc_map[message["rid"]]
            parent_channel_id = dsc_channel["prid"]
            stream_id = stream_id_mapper.get(parent_channel_id)
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]

            # In case you change this, please also change the topic name used
            # in discussion mention to topic mention conversion below, while
            # adding the Rocket.Chat channel mention data to message_dict.
            message_dict["topic_name"] = f'{dsc_channel["fname"]} (Imported from Rocket.Chat)'
        else:
            stream_id = stream_id_mapper.get(message["rid"])
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
            message_dict["topic_name"] = "Imported from Rocket.Chat"

        # Add user mentions to message_dict
        mention_user_ids = set()
        for mention in message.get("mentions", []):
            mention_id = mention["_id"]
            if mention_id in ["all", "here"]:
                continue
            user_id = user_id_mapper.get(mention_id)
            mention_user_ids.add(user_id)
        message_dict["mention_user_ids"] = mention_user_ids

        # Add channel mentions to message_dict
        rc_channel_mention_data: List[Dict[str, str]] = []
        for mention in message.get("channels", []):
            mention_rc_channel_id = mention["_id"]
            mention_rc_channel_name = mention["name"]
            rc_mention = f"#{mention_rc_channel_name}"

            if mention_rc_channel_id in room_id_to_room_map:
                # Channel is converted to a stream.
                converted_stream_name = mention_rc_channel_name

                rc_channel = room_id_to_room_map[mention_rc_channel_id]
                if rc_channel.get("teamMain") is True:
                    # Channel is a team's main channel
                    converted_stream_name = "[TEAM] " + converted_stream_name

                zulip_mention = f"#**{converted_stream_name}**"
            elif mention_rc_channel_id in dsc_id_to_dsc_map:
                # Channel is a discussion and is converted to a topic.
                dsc_channel = dsc_id_to_dsc_map[mention_rc_channel_id]
                parent_channel_id = dsc_channel["prid"]
                parent_rc_channel = room_id_to_room_map[parent_channel_id]

                converted_topic_name = f'{dsc_channel["fname"]} (Imported from Rocket.Chat)'
                parent_stream_name = parent_rc_channel["name"]

                if parent_rc_channel.get("teamMain") is True:
                    # Parent channel is a team's main channel
                    parent_stream_name = "[TEAM] " + parent_stream_name

                zulip_mention = f"#**{parent_stream_name}>{converted_topic_name}**"
            else:
                # The mentioned channel is in neither map.  Without this
                # branch `zulip_mention` would be unbound on the first
                # mention or silently reuse the previous mention's value.
                logging.info("Failed to map mention '%s' to zulip syntax.", mention)
                continue

            mention_data = {"rc_mention": rc_mention, "zulip_mention": zulip_mention}
            rc_channel_mention_data.append(mention_data)
        message_dict["rc_channel_mention_data"] = rc_channel_mention_data

        # Add uploaded file (attachment) to message_dict
        if message.get("file"):
            message_dict["file"] = message["file"]

        return message_dict

    raw_messages: List[Dict[str, Any]] = []
    for message in messages:
        if message.get("t") is not None:
            # Messages with a type are system notifications like user_joined
            # that we don't include.
            continue
        raw_messages.append(message_to_dict(message))

    def process_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            upload_id_to_upload_data_map=upload_id_to_upload_data_map,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    thread_id_mapper: IdMapper,
    room_id_to_room_map: Dict[str, Dict[str, Any]],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them in batches.

    Messages whose ``t`` field is not None are skipped.  Every other
    message is converted by ``message_to_dict`` (sender, content,
    timestamp, reactions, recipient, topic, user and wildcard mentions,
    channel mentions and uploaded file), and the resulting list is passed
    to ``process_raw_message_batch`` in chunks of 1000.

    Channel mentions that cannot be converted are logged and skipped:
    discussions whose parent is a direct channel or huddle, and mentions
    whose id is found in neither ``room_id_to_room_map`` nor
    ``dsc_id_to_dsc_map``.
    """

    def list_reactions(reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # List of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        reactions_list: List[Dict[str, Any]] = []
        for react_code in reactions:
            # The name is taken from between the colons of the reaction code.
            name = react_code.split(":")[1]
            usernames = reactions[react_code]["usernames"]

            for username in usernames:
                rc_user_id = username_to_user_id_map[username]
                user_id = user_id_mapper.get(rc_user_id)
                reactions_list.append({"name": name, "user_id": user_id})

        return reactions_list

    def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        if message.get("reactions"):
            reactions = list_reactions(message["reactions"])
        else:
            reactions = []

        message_dict = dict(
            sender_id=sender_id,
            content=content,
            date_sent=int(message["ts"].timestamp()),
            reactions=reactions,
            has_link=bool(message.get("urls")),
        )

        # Add recipient_id to message_dict
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_channel_id = message["rid"]
            if rc_channel_id in huddle_id_to_huddle_map:
                huddle_id = huddle_id_mapper.get(rc_channel_id)
                message_dict["recipient_id"] = huddle_id_to_recipient_id[huddle_id]
            else:
                # The recipient is whichever of the two members is not the sender.
                rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
                if rc_sender_id == rc_member_ids[0]:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[1])
                    message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
                else:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[0])
                    message_dict["recipient_id"] = user_id_to_recipient_id[zulip_member_id]
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion
            dsc_channel = dsc_id_to_dsc_map[message["rid"]]
            parent_channel_id = dsc_channel["prid"]
            stream_id = stream_id_mapper.get(parent_channel_id)
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
        else:
            stream_id = stream_id_mapper.get(message["rid"])
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]

        # Add topic name to message_dict
        message_dict["topic_name"] = get_topic_name(
            message, dsc_id_to_dsc_map, thread_id_mapper, is_pm_data
        )

        # Add user mentions to message_dict
        mention_user_ids = set()
        wildcard_mention = False
        for mention in message.get("mentions", []):
            mention_id = mention["_id"]
            if mention_id in ["all", "here"]:
                wildcard_mention = True
                continue

            user_id = user_id_mapper.get(mention_id)
            mention_user_ids.add(user_id)

        message_dict["mention_user_ids"] = mention_user_ids
        message_dict["wildcard_mention"] = wildcard_mention

        # Add channel mentions to message_dict
        rc_channel_mention_data: List[Dict[str, str]] = []
        for mention in message.get("channels", []):
            mention_rc_channel_id = mention["_id"]
            mention_rc_channel_name = mention["name"]
            rc_mention = f"#{mention_rc_channel_name}"

            if mention_rc_channel_id in room_id_to_room_map:
                # Channel is converted to a stream.
                rc_channel = room_id_to_room_map[mention_rc_channel_id]
                converted_stream_name = get_stream_name(rc_channel)

                zulip_mention = f"#**{converted_stream_name}**"
            elif mention_rc_channel_id in dsc_id_to_dsc_map:
                # Channel is a discussion and is converted to a topic.
                dsc_channel = dsc_id_to_dsc_map[mention_rc_channel_id]
                parent_channel_id = dsc_channel["prid"]
                if (
                    parent_channel_id in direct_id_to_direct_map
                    or parent_channel_id in huddle_id_to_huddle_map
                ):
                    # Discussion belongs to a direct channel and thus, should not be
                    # linked.

                    # This logging statement serves the side benefit of avoiding the
                    # CPython optimization for `continue` so that the coverage reports
                    # aren't misleading.
                    logging.info(
                        "skipping direct messages discussion mention: %s", dsc_channel["fname"]
                    )
                    continue

                converted_topic_name = get_topic_name(
                    message={"rid": mention_rc_channel_id},
                    dsc_id_to_dsc_map=dsc_id_to_dsc_map,
                    thread_id_mapper=thread_id_mapper,
                )

                parent_rc_channel = room_id_to_room_map[parent_channel_id]
                parent_stream_name = get_stream_name(parent_rc_channel)

                zulip_mention = f"#**{parent_stream_name}>{converted_topic_name}**"
            else:
                # The mentioned channel is in neither map.  Without this
                # branch `zulip_mention` would be unbound on the first
                # mention or silently reuse the previous mention's value.
                logging.info("Failed to map mention '%s' to zulip syntax.", mention)
                continue

            mention_data = {"rc_mention": rc_mention, "zulip_mention": zulip_mention}
            rc_channel_mention_data.append(mention_data)
        message_dict["rc_channel_mention_data"] = rc_channel_mention_data

        # Add uploaded file (attachment) to message_dict
        if message.get("file"):
            message_dict["file"] = message["file"]

        return message_dict

    raw_messages: List[Dict[str, Any]] = []
    for message in messages:
        if message.get("t") is not None:
            # Messages with a type are system notifications like user_joined
            # that we don't include.
            continue
        raw_messages.append(message_to_dict(message))

    def process_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            upload_id_to_upload_data_map=upload_id_to_upload_data_map,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    total_reactions: List[ZerverFieldsT],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them in batches.

    Messages whose ``t`` field is not None are skipped.  Each remaining
    message gets a sender, content, timestamp, reactions, recipient,
    topic name and set of mentioned user ids, and the resulting list is
    passed to ``process_raw_message_batch`` in chunks of 1000.
    """

    def collect_reactions(reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Builds a list of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        collected: List[Dict[str, Any]] = []
        for react_code, details in reactions.items():
            # The name is taken from between the colons of the reaction code.
            emoji_name = react_code.split(":")[1]
            for username in details["usernames"]:
                zulip_user_id = user_id_mapper.get(username_to_user_id_map[username])
                collected.append({"name": emoji_name, "user_id": zulip_user_id})
        return collected

    def to_raw_message(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        rc_reactions = message.get("reactions")
        reactions = collect_reactions(message["reactions"]) if rc_reactions else []

        raw: Dict[str, Any] = {
            "sender_id": sender_id,
            "content": content,
            "date_sent": int(message["ts"].timestamp()),
            "reactions": reactions,
        }

        # Add recipient_id and topic to the raw message
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_room_id = message["rid"]
            if rc_room_id in huddle_id_to_huddle_map:
                zulip_huddle_id = huddle_id_mapper.get(rc_room_id)
                raw["recipient_id"] = huddle_id_to_recipient_id[zulip_huddle_id]
            else:
                # The recipient is whichever of the two members is not the sender.
                member_ids = direct_id_to_direct_map[rc_room_id]["uids"]
                other_rc_id = member_ids[1] if rc_sender_id == member_ids[0] else member_ids[0]
                other_zulip_id = user_id_mapper.get(other_rc_id)
                raw["recipient_id"] = user_id_to_recipient_id[other_zulip_id]
            # PMs and huddles don't have topics, but topic_name field is required in `build_message`.
            raw["topic_name"] = ""
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion; it goes to the parent channel's
            # stream under a topic named after the discussion.
            discussion = dsc_id_to_dsc_map[message["rid"]]
            parent_stream_id = stream_id_mapper.get(discussion["prid"])
            raw["recipient_id"] = stream_id_to_recipient_id[parent_stream_id]
            raw["topic_name"] = f'{discussion["fname"]} (Imported from Rocket.Chat)'
        else:
            stream_id = stream_id_mapper.get(message["rid"])
            raw["recipient_id"] = stream_id_to_recipient_id[stream_id]
            raw["topic_name"] = "Imported from Rocket.Chat"

        # Add mentions to the raw message; "all" and "here" are not user ids.
        mentioned_ids = set()
        for mention in message.get("mentions", []):
            rc_mention_id = mention["_id"]
            if rc_mention_id not in ["all", "here"]:
                mentioned_ids.add(user_id_mapper.get(rc_mention_id))
        raw["mention_user_ids"] = mentioned_ids

        return raw

    # Messages with a type are system notifications like user_joined
    # that we don't include.
    raw_messages: List[Dict[str, Any]] = [
        to_raw_message(message) for message in messages if message.get("t") is None
    ]

    def handle_batch(batch: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            total_reactions=total_reactions,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_message_file(realm_id: int,
                         fn: str,
                         fn_id: int,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         zerver_subscription: List[ZerverFieldsT],
                         data_dir: str,
                         output_dir: str,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Read one JSON message file, build raw messages and process them in batches.

    Entries of the file that lack ``message_key`` are ignored.  When a
    message's ``sender`` is a plain string, a mirror user is obtained
    from ``user_handler`` and its id is used as the sender id.
    ``data_dir`` is not used by this function.
    """

    def resolve_sender_id(entry: Dict[str, Any]) -> Any:
        sender = entry['sender']
        if not isinstance(sender, str):
            return entry['sender']['id']

        # Some Hipchat instances just give us a person's
        # name in the sender field for NotificationMessage.
        # We turn them into a mirror user.
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=entry['sender'],
        )
        return mirror_user['id']

    def build_raw_message(entry: Dict[str, Any]) -> ZerverFieldsT:
        sender_id = resolve_sender_id(entry)
        return {
            'fn_id': fn_id,
            'sender_id': sender_id,
            'receiver_id': entry.get('receiver', {}).get('id'),
            'content': entry['message'],
            'mention_user_ids': entry.get('mentions', []),
            'pub_date': str_date_to_float(entry['timestamp']),
            'attachment': entry.get('attachment'),
            'files_dir': files_dir,
        }

    def load_raw_messages(path: str) -> List[ZerverFieldsT]:
        with open(path) as json_file:
            entries = ujson.load(json_file)

        keyed = [entry[message_key] for entry in entries if message_key in entry]
        return [build_raw_message(entry) for entry in keyed]

    def handle_batch(batch: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            zerver_subscription=zerver_subscription,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            output_dir=output_dir,
        )

    process_list_in_batches(
        lst=load_raw_messages(fn),
        chunk_size=1000,
        process_batch=handle_batch,
    )