def export_chat_csv(chat, filename, db, messages):
    """
    Exports the chat messages to a CSV data file.

    Rows are written semicolon-separated with a single carriage return as
    line terminator, every value encoded as Latin-1; characters that cannot
    be represented in Latin-1 are replaced.

    @param   chat      chat data dict, as returned from SkypeDatabase
    @param   filename  full path and filename of resulting file
    @param   db        SkypeDatabase instance
    @param   messages  list of message data dicts
    """
    parser = skypedata.MessageParser(db, chat=chat, stats=False)
    with open(filename, "wb") as f:
        # Formatting overrides are given as per-writer parameters instead of
        # being assigned onto csv.excel: that class is shared by the whole
        # process, assigning to it would silently change every other reader
        # and writer that uses the default Excel dialect.
        # csv.excel.delimiter default "," is not actually used by Excel.
        # Default linefeed "\r\n" would cause another "\r" to be written.
        writer = csv.writer(f, csv.excel, delimiter=";", lineterminator="\r")
        writer.writerow(["Time", "Author", "Message"])
        for m in messages:
            text = parser.parse(m, output={"format": "text"})
            try:
                text = text.decode("utf-8")
            except UnicodeError:
                pass # Text could not be decoded as UTF-8, using it as is
            values = [m["datetime"].strftime("%Y-%m-%d %H:%M:%S"),
                      m["from_dispname"], text]
            writer.writerow([v.encode("latin1", "replace") for v in values])
def export_chats_xlsx(chats, filename, db, messages=None, skip=True, progress=None):
    """
    Writes the given chats into one XLSX workbook, each chat on a worksheet
    of its own.

    @param   chats     list of chat data dicts, as returned from SkypeDatabase
    @param   filename  full path and filename of resulting file
    @param   db        SkypeDatabase instance
    @param   messages  list of messages to export if a single chat
    @param   skip      whether to skip chats with no messages
    @param   progress  function called before exporting each chat, with the
                       number of messages exported so far
    @return            number of chats exported
    """
    exported_chats = 0
    exported_messages = 0
    # Cell styles by column index; the author column style is set per row
    row_style = {0: "timestamp", 2: "wrap", 3: "hidden"}
    workbook = xlsx_writer(filename, autowrap=[2])
    for chat in chats:
        if skip and not (messages or chat["message_count"]):
            main.log("Skipping exporting %s: no messages.",
                     chat["title_long_lc"])
            continue # continue for chat in chats
        main.status("Exporting %s.", chat["title_long_lc"])
        if progress:
            progress(exported_messages)

        parser = skypedata.MessageParser(db, chat=chat, stats=False)
        workbook.add_sheet(chat["title"])
        workbook.set_header(True)
        workbook.writerow(["Time", "Author", "Message", "Skype Name"],
                          {3: "boldhidden"})
        workbook.set_header(False)

        for message in (messages or db.get_messages(chat)):
            body = parser.parse(message, output={"format": "text"})
            try:
                body = body.decode("utf-8")
            except UnicodeError:
                pass
            row = [message["datetime"], message["from_dispname"], body,
                   message["author"]]
            # Messages of the database account itself are styled differently
            if db.id == message["author"]:
                row_style[1] = "local"
            else:
                row_style[1] = "remote"
            workbook.writerow(row, row_style)
        exported_messages += chat["message_count"]
        exported_chats += 1
    workbook.close()
    return exported_chats
def run(self):
    """
    Runs the search worker loop: takes search requests off the queue for as
    long as self._is_running is set, and posts the matching chats, contacts
    and messages back in chunks as HTML table rows, via self._postback().

    A search request is a dict with keys "db" (database to search), "text"
    (text to look for, matched case-insensitively), "tables" (may contain
    "conversations", "contacts" and "messages") and "window" (its
    Size.width determines the width given to the message parser).
    A false value taken from the queue is skipped.

    Posted results are dicts {"html": result rows, "map": {link: data},
    "search": the request}; the last one for a request also has "done" set.
    Setting self._stop_work cuts the search short, setting
    self._drop_results suppresses posting results.
    """
    self._is_running = True
    # Wraps matched text in bold tags, used with pattern_replace.sub()
    embolden = lambda match: "<b>%s</b>" % match.group(0)
    while self._is_running:
        search = self._queue.get()
        self._stop_work = False
        self._drop_results = False
        if not search:
            continue # continue while self._is_running

        db = search["db"]
        parser = skypedata.MessageParser(db)
        # {"html": html with results, "map": link data map}
        # map data: {"contact:666": {"contact": {contact data}}, }
        result = {"html": "", "map": {}, "search": search}
        result_count = 0
        counts = {"chats": 0, "contacts": 0, "messages": 0}
        escaped_text = re.escape(search["text"])
        # For finding matching items. Used with search() instead of a
        # ".*"-prefixed match(): "." does not match a linefeed, so text
        # after the first line of a multi-line value would never be found.
        pattern = re.compile(escaped_text, re.IGNORECASE)
        # For replacing matching text with bolded text
        pattern_replace = re.compile("(%s)+" % escaped_text, re.IGNORECASE)

        chats = db.get_conversations()
        chats.sort(key=lambda x: x["title"])
        chat_map = {} # {chat id: {chat data}}
        for chat in chats:
            chat_map[chat["id"]] = chat
            if "conversations" in search["tables"]:
                title_matches = bool(pattern.search(chat["title"]))
                matching_authors = []
                for participant in chat["participants"]:
                    c = participant["contact"]
                    if not c or c in matching_authors:
                        continue # continue for participant in chat[..]
                    names = [c["fullname"], c["displayname"], c["identity"]]
                    if any(n and pattern.search(n) for n in names):
                        matching_authors.append(c)

                if title_matches or matching_authors:
                    result_count += 1
                    counts["chats"] += 1
                    entry = "<tr><td align='right' valign='top'>" \
                            "<font color='%s'>%s</font></td>" \
                            "<td colspan='2'><a href='chat:%s'>" \
                            "<font color='%s'>%s</font></a><br />" % (
                                conf.HistoryGreyColour, result_count,
                                chat["id"], conf.HistoryLinkColour,
                                chat["title_long"]
                            )
                    if title_matches:
                        entry += u"Title matches.<br />"
                    if matching_authors:
                        entry += u"Participant matches: %s.<br />" % \
                            ", ".join([u"%s (%s)" % (
                                pattern_replace.sub(embolden,
                                    c["fullname"] or c["displayname"] or ""
                                ),
                                pattern_replace.sub(embolden, c["identity"])
                            ) for c in matching_authors])
                    entry += "</td></tr>"
                    result["html"] += entry
                    result["map"]["chat:%s" % chat["id"]] = {"chat": chat}
                    if not counts["chats"] % conf.SearchResultsChunk \
                    and not self._drop_results:
                        self._postback(result)
                        result = {"html": "", "map": {}, "search": search}
            if self._stop_work:
                break # break for chat in chats
        if result["html"] and not self._drop_results:
            self._postback(result)
            result = {"html": "", "map": {}, "search": search}

        if not self._stop_work and "contacts" in search["tables"]:
            contacts = db.get_contacts()
            # Possibly more: country (ISO code, need map), birthday (base
            # has YYYYMMDD in integer field).
            match_fields = [
                "displayname", "skypename", "province", "city",
                "pstnnumber", "phone_home", "phone_office", "phone_mobile",
                "homepage", "emails", "about", "mood_text"
            ]
            field_titles = {
                "displayname": "Display name",
                "skypename": "Skype Name",
                "province": "State/Province",
                "city": "City",
                "pstnnumber": "Phone",
                "phone_home": "Home phone",
                "phone_office": "Office phone",
                "phone_mobile": "Mobile phone",
                "homepage": "Website",
                "emails": "Emails",
                "about": "About me",
                "mood_text": "Mood",
                "country": "Country/Region",
            }
            for contact in contacts:
                match = False
                fields_filled = {} # {field name: value, bolded if matching}
                for field in match_fields:
                    value = contact[field]
                    if not value:
                        continue # continue for field in match_fields
                    if pattern.search(value):
                        match = True
                        value = pattern_replace.sub(embolden, value)
                    fields_filled[field] = value
                if match:
                    entry = u""
                    if (not counts["contacts"]) and result_count:
                        # Separator between chat and contact results
                        entry += "<tr><td colspan='3'><hr /></td></tr>"
                    result_count += 1
                    counts["contacts"] += 1
                    entry += u"<tr><td align='right' valign='top'>" \
                             "<font color='%s'>%s</font></td>" \
                             "<td colspan='2'>" \
                             "<font color='%s'>Contact %s</font>" \
                             "<br /><table>" % (
                                 conf.HistoryGreyColour, result_count,
                                 conf.ResultContactFieldColour,
                                 pattern_replace.sub(embolden,
                                                     contact["name"])
                             )
                    for field in match_fields:
                        if field in fields_filled:
                            entry += u"<tr><td nowrap valign='top'>" \
                                     "<font color='%s'>%s</font></td>" \
                                     "<td> </td><td>%s</td></tr>" \
                                     % (conf.ResultContactFieldColour,
                                        field_titles[field],
                                        fields_filled[field])
                    entry += "</table><br /></td></tr>"
                    result["html"] += entry
                    result["map"]["contact:%s" % contact["id"]] = \
                        {"contact": contact}
                    if not counts["contacts"] % conf.SearchResultsChunk \
                    and not self._drop_results:
                        self._postback(result)
                        result = {"html": "", "map": {}, "search": search}
                if self._stop_work:
                    break # break for contact in contacts
        if result["html"] and not self._drop_results:
            self._postback(result)
            result = {"html": "", "map": {}, "search": search}

        if not self._stop_work and "messages" in search["tables"]:
            messages = db.get_messages(ascending=False,
                                       body_like=search["text"])
            for m in messages:
                chat = chat_map.get(m["convo_id"])
                if chat is None:
                    # Message of a conversation not among the retrieved
                    # ones: there is no chat to title or link it with, and
                    # indexing None would end the worker with a TypeError.
                    continue # continue for m in messages
                chat_title = chat["title_long"]
                if (not counts["messages"]) and result_count:
                    # Separator between earlier results and message results
                    result["html"] += "<tr><td colspan='3'><hr /></td></tr>"
                result_count += 1
                counts["messages"] += 1
                time_value = datetime.datetime.fromtimestamp(
                    m["timestamp"]).strftime("%d.%m.%Y %H:%M")
                displayname = m["from_dispname"]
                body = parser.parse(
                    m, pattern_replace,
                    html={"w": search["window"].Size.width * 3 / 5})
                if isinstance(body, bytes):
                    body = body.decode("utf-8")
                # Chat title is left out for messages received in a
                # one-to-one chat
                is_single_incoming = (
                    skypedata.CHATS_TYPE_SINGLE == chat["type"]
                    and m["author"] != db.id)
                entry = \
                    u"<tr><td align='right' valign='top'>" \
                    "<font color='%s'>%s</font></td>" \
                    "<td valign='top'><a href='message:%s'>" \
                    "<font color='%s'>%s%s</font></a></td>" \
                    "<td align='right' nowrap> " \
                    "<font color='%s'>%s</font></td></tr><tr>" \
                    "<td></td>" \
                    "<td width='100%%' valign='top' colspan='2'>" \
                    "%s<br/></td></tr>" \
                    % (conf.HistoryGreyColour, result_count, m["id"],
                       conf.HistoryLinkColour, displayname,
                       "" if is_single_incoming else " in %s" % chat_title,
                       conf.HistoryTimestampColour, time_value, body)
                result["html"] += entry
                result["map"]["message:%s" % m["id"]] = \
                    {"chat": chat, "message": m}
                if self._stop_work:
                    break # break for m in messages
                if not counts["messages"] % conf.SearchResultsChunk \
                and not self._drop_results:
                    self._postback(result)
                    result = {"html": "", "map": {}, "search": search}
                if counts["messages"] >= conf.SearchMessagesMax:
                    break # break for m in messages

        final_text = "No matches found."
        if result_count:
            # Fixed order, for the summary text to always read the same
            found = ["%d %s" % (counts[table], table)
                     for table in ("chats", "contacts", "messages")
                     if counts[table]]
            final_text = "Found %s." % ", ".join(found)
        if self._stop_work:
            final_text += " Search stopped by user."
        if counts["messages"] >= conf.SearchMessagesMax:
            final_text += " Search stopped at message limit %s." \
                          % conf.SearchMessagesMax
        result["html"] += "</table><br />%s</font>" % final_text
        result["done"] = True
        if not self._drop_results:
            self._postback(result)
def get_chat_diff(self, chat, db1, db2):
    """
    Compares the chat in the two databases and returns the differences as
    {
      "messages": [[messages different in db1], [..db2]],
      "participants": [[participants different in db1], [..db2]]
    }.

    Messages with a remote_id are different if the other database has no
    message with the same remote_id and the same author-type-body text;
    messages without a remote_id are different if the other database has no
    message with the same text less than 3 minutes apart. Message lists are
    returned ordered by message datetime.

    @param   chat  dict with the chat data dicts of either database under
                   keys "c1" and "c2", a false value if chat not in database
    @param   db1   database to retrieve the messages of chat["c1"] from
    @param   db2   database to retrieve the messages of chat["c2"] from
    """
    c = chat
    messages1 = list(db1.get_messages(c["c1"])) if c["c1"] else []
    messages2 = list(db2.get_messages(c["c2"])) if c["c2"] else []
    c1m_diff = [] # Messages different in chat 1
    c2m_diff = [] # Messages different in chat 2
    participants1 = c["c1"]["participants"] if c["c1"] else []
    participants2 = c["c2"]["participants"] if c["c2"] else []
    c1p_map = dict((p["identity"], p) for p in participants1)
    c2p_map = dict((p["identity"], p) for p in participants2)

    m1map = {} # {remote_id: [message, ], }
    m2map = {} # {remote_id: [message, ], }
    m1_no_remote_ids = [] # [message, ] with a NULL remote_id
    m2_no_remote_ids = [] # [message, ] with a NULL remote_id
    m1bodymap = {} # {author+type+body: [message, ], }
    m2bodymap = {} # {author+type+body: [message, ], }
    difftexts = {} # {id(message): text, }

    # Skip comparing messages if one side is completely empty
    parser1, parser2 = None, None
    if not messages1:
        c2m_diff, messages1, messages2 = messages2, [], []
    elif not messages2:
        c1m_diff, messages1, messages2 = messages1, [], []
    else:
        parser1 = skypedata.MessageParser(db1)
        parser2 = skypedata.MessageParser(db2)

    # Assemble maps by remote_id and create diff texts. remote_id is not
    # unique and can easily have duplicates.
    for messages, idmap, noidmap, bodymap, parser in [
        (messages1, m1map, m1_no_remote_ids, m1bodymap, parser1),
        (messages2, m2map, m2_no_remote_ids, m2bodymap, parser2)
    ]:
        for m in messages:
            if m["remote_id"]:
                idmap.setdefault(m["remote_id"], []).append(m)
            else:
                noidmap.append(m)
            # In these messages, parsed body can differ even though
            # message is the same: contact names are taken from current
            # database values. Using raw values instead.
            if m["type"] in self.MESSAGE_TYPES_IGNORE_BODY:
                t = m["author"] if skypedata.MESSAGES_TYPE_LEAVE \
                    == m["type"] else m["identities"]
            else:
                t = parser.parse(m, text={"wrap": False})
            t = t if isinstance(t, str) else t.encode("utf-8")
            author = (m["author"] or "").encode("utf-8")
            difftext = difftexts[id(m)] = "%s-%s-%s" % (author, m["type"], t)
            bodymap.setdefault(difftext, []).append(m)

    # Compare assembled remote_id maps between databases and see if there
    # are no messages with matching body in the other database.
    for map1, map2, output in [(m1map, m2map, c1m_diff),
                               (m2map, m1map, c2m_diff)]:
        for remote_id, m in [(r, j) for r, i in map1.items() for j in i]:
            text = difftexts[id(m)]
            # any() instead of testing the result of filter() for truth:
            # a lazy filter object is true even when nothing passes it.
            if not any(text == difftexts[id(x)]
                       for x in map2.get(remote_id, [])):
                # No message with same remote_id has same body
                output.append(m)

    # For messages with no remote_id-s, compare by author-type-body key
    # and see if there are no matching messages sufficiently close in time.
    leeway = 3 * 60 # Seconds allowed between duplicated message timestamps
    for noidmap, bodymap, output in [
        (m1_no_remote_ids, m2bodymap, c1m_diff),
        (m2_no_remote_ids, m1bodymap, c2m_diff)
    ]:
        for m in noidmap:
            potential_matches = bodymap.get(difftexts[id(m)], [])
            if not any(abs(i["timestamp"] - m["timestamp"]) < leeway
                       for i in potential_matches):
                output.append(m)

    c1p_diff = [p for p in participants1 if p["identity"] not in c2p_map]
    c2p_diff = [p for p in participants2 if p["identity"] not in c1p_map]

    # Sorting by key function: comparison functions are slower, and are
    # not supported by list.sort() at all in later Python versions.
    c1m_diff.sort(key=lambda m: m["datetime"])
    c2m_diff.sort(key=lambda m: m["datetime"])

    result = { "messages": [c1m_diff, c2m_diff],
               "participants": [c1p_diff, c2p_diff] }
    return result
def export_chat_template(chat, filename, db, messages):
    """
    Writes the chat messages to a HTML or text file, as rendered by the
    chat export templates.

    @param   chat      chat data dict, as returned from SkypeDatabase
    @param   filename  full path and filename of resulting file, file
                       extension .html|.txt determines file format
    @param   db        SkypeDatabase instance
    @param   messages  list of message data dicts
    """
    buffer_file, buffer_path = None, None # Temporary file for messages
    try:
        is_html = filename.lower().endswith(".html")
        parser = skypedata.MessageParser(db, chat=chat, stats=True)
        namespace = {}
        namespace["db"] = db
        namespace["chat"] = chat
        namespace["messages"] = messages
        namespace["parser"] = parser

        # The HTML and TXT headers contain statistics that are complete
        # only when all messages have been parsed, so the messages are
        # rendered into a temporary file first and the main file is
        # written after that. Holding all the rendered messages in memory
        # is not an option: very large chats (500,000+ messages) can take
        # gigabytes.
        buffer_path = util.unique_path("%s.messages" % filename)
        buffer_file = open(buffer_path, "w+")
        if is_html:
            messages_template = templates.CHAT_MESSAGES_HTML
        else:
            messages_template = templates.CHAT_MESSAGES_TXT
        step.Template(messages_template, strip=False).stream(buffer_file,
                                                             namespace)

        stats = parser.get_collected_stats()
        namespace["stats"] = stats
        date1, date2 = "", ""
        if stats.get("startdate"):
            date1 = stats["startdate"].strftime("%d.%m.%Y")
        if stats.get("enddate"):
            date2 = stats["enddate"].strftime("%d.%m.%Y")
        emoticons_used = []
        for name in parser.emoticons_unique:
            if hasattr(emoticons, name):
                emoticons_used.append(name)
        namespace.update({
            "date1": date1,
            "date2": date2,
            "emoticons_used": emoticons_used,
            "message_count": stats.get("messages", 0),
        })

        if is_html:
            # Collect chat and participant images.
            namespace.update({"participants": [], "chat_picture_size": None,
                              "chat_picture_raw": None, })
            if chat["meta_picture"]:
                picture = skypedata.fix_image_raw(chat["meta_picture"])
                namespace["chat_picture_raw"] = picture
                namespace["chat_picture_size"] = util.img_size(picture)

            contacts_by_name = {}
            for item in db.get_contacts():
                contacts_by_name[item["skypename"]] = item
            participants_by_id = {}
            for item in chat["participants"]:
                participants_by_id[item["identity"]] = item
            # There can be authors not among participants, and vice versa
            for author in stats["authors"].union(participants_by_id):
                participant = participants_by_id.get(author, {})
                contact = (participant.get("contact")
                           or contacts_by_name.get(author, {})
                           or {"identity": author, "name": author})
                bitmap = contact.get("avatar_bitmap")
                small = contact.get("avatar_raw_small") or ""
                large = contact.get("avatar_raw_large") or ""
                if not (small or bitmap):
                    small = skypedata.get_avatar_raw(contact,
                                                     conf.AvatarImageSize)
                if bitmap:
                    # Bitmap takes precedence if it yields any image data
                    small = util.img_wx_to_raw(bitmap) or small
                if small and not large:
                    large = skypedata.get_avatar_raw(
                        contact, conf.AvatarImageLargeSize)
                contact["avatar_raw_small"] = small
                contact["avatar_raw_large"] = large
                contact["rank"] = participants_by_id.get(author, {}).get("rank")
                namespace["participants"].append(contact)

        # Rewind the temporary file, for the main template to read the
        # rendered messages from it in chunks, until read() returns "".
        buffer_file.flush()
        buffer_file.seek(0)
        namespace["message_buffer"] = iter(lambda: buffer_file.read(65536),
                                           "")
        if is_html:
            main_template = templates.CHAT_HTML
        else:
            main_template = templates.CHAT_TXT
        with open(filename, "w") as f:
            step.Template(main_template, strip=False).stream(f, namespace)
    finally:
        if buffer_file:
            util.try_until(buffer_file.close)
        if buffer_path:
            util.try_until(lambda: os.unlink(buffer_path))
def run(self):
    """
    Runs the search worker loop: takes search requests off the queue for as
    long as self._is_running is set, and posts results back in chunks via
    self.postback().

    A search request is a dict; this method reads its keys "db" (database
    to search), "text" (search query), "table" (one of "conversations",
    "contacts", "messages", "all tables"), and the optional "output" ("text"
    selects the text templates, anything else HTML) and "width" (if over 0,
    HTML message bodies are word-wrapped to it). A false value taken from
    the queue is skipped.

    Posted results are dicts {"output": rendered rows, "map": {link: data},
    "search": the request, "count": results found so far}; the last one for
    a request has "done" set. If anything raises, a result with "done",
    "error" (traceback) and "error_short" is posted instead.

    Result rows are rendered with template.expand(locals()): the names of
    the local variables here are visible to the templates and need to stay
    as they are.
    """
    self._is_running = True
    # For identifying "chat:xxx" and "from:xxx" keywords
    query_parser = searchparser.SearchQueryParser()
    result = None
    while self._is_running:
        try:
            search = self._queue.get()
            if not search:
                continue # continue while self._is_running

            is_text_output = ("text" == search.get("output"))
            wrap_html = None # MessageParser wrap function, for HTML output
            if is_text_output:
                TEMPLATES = {
                    "chat":    templates.SEARCH_ROW_CHAT_TXT,
                    "contact": templates.SEARCH_ROW_CONTACT_TXT,
                    "message": templates.SEARCH_ROW_MESSAGE_TXT,
                    "table":   templates.SEARCH_ROW_TABLE_HEADER_TXT,
                    "row":     templates.SEARCH_ROW_TABLE_TXT, }
                # Matched text is surrounded with asterisks in text output
                wrap_b = lambda x: "**%s**" % x.group(0)
                output = {"format": "text"}
            else:
                TEMPLATES = {
                    "chat":    templates.SEARCH_ROW_CHAT_HTML,
                    "contact": templates.SEARCH_ROW_CONTACT_HTML,
                    "message": templates.SEARCH_ROW_MESSAGE_HTML,
                    "table":   templates.SEARCH_ROW_TABLE_HEADER_HTML,
                    "row":     templates.SEARCH_ROW_TABLE_HTML, }
                # Matched text is bolded in HTML output
                wrap_b = lambda x: "<b>%s</b>" % x.group(0)
                output = {"format": "html"}
                width = search.get("width", -1)
                if width > 0:
                    # Device context with the history font, for measuring
                    # text width when wrapping
                    dc = wx.MemoryDC()
                    dc.SetFont(wx.Font(8, wx.SWISS, wx.NORMAL, wx.NORMAL,
                                       face=conf.HistoryFontName))
                    wrap_html = lambda x: wx.lib.wordwrap.wordwrap(
                        x, width, dc)
                    output["wrap"] = True
            main.log("Searching for \"%(text)s\" in %(table)s (%(db)s)."
                     % search)
            self._stop_work = False
            self._drop_results = False

            parser = skypedata.MessageParser(search["db"],
                                             wrapfunc=wrap_html)
            # {"output": text with results, "map": link data map}
            # map data: {"contact:666": {"contact": {contact data}}, }
            # result_count is the total over the whole search, count is
            # restarted for each searched category or table
            result_type, result_count, count = None, 0, 0
            result = {"output": "", "map": {},
                      "search": search, "count": 0}
            sql, params, match_words = query_parser.Parse(search["text"])

            # Turn wildcard characters * into regex-compatible .*
            match_words_re = [".*".join(map(re.escape, w.split("*")))
                              for w in match_words]
            patt = "(%s)" % "|".join(match_words_re)
            # For replacing matching words with <b>words</b>
            pattern_replace = re.compile(patt, re.IGNORECASE)

            # Find chats with a matching title or matching participants
            chats = search["db"].get_conversations()
            chats.sort(key=lambda x: x["title"])
            chat_map = {} # {chat id: {chat data}}
            template_chat = step.Template(TEMPLATES["chat"])
            for chat in chats:
                # All chats are mapped regardless of the table searched:
                # message results look their chat up from chat_map
                chat_map[chat["id"]] = chat
                if "conversations" == search["table"] and match_words:
                    title_matches = False
                    matching_authors = []
                    if self.match_all(chat["title"], match_words):
                        title_matches = True
                    for participant in chat["participants"]:
                        contact = participant["contact"]
                        if contact:
                            for n in filter(None, [
                                contact["fullname"], contact["displayname"],
                                contact["identity"]
                            ]):
                                if self.match_all(n, match_words) \
                                and contact not in matching_authors:
                                    matching_authors.append(contact)

                    if title_matches or matching_authors:
                        count += 1
                        result_count += 1
                        result["output"] += template_chat.expand(locals())
                        key = "chat:%s" % chat["id"]
                        result["map"][key] = {"chat": chat["id"]}
                        # Post results back after every full chunk
                        if not count % conf.SearchResultsChunk \
                        and not self._drop_results:
                            result["count"] = result_count
                            self.postback(result)
                            result = {"output": "", "map": {},
                                      "search": search, "count": 0}
                if self._stop_work:
                    break # break for chat in chats
            # Post back the remainder of an incomplete chunk
            if result["output"] and not self._drop_results:
                result["count"] = result_count
                self.postback(result)
                result = {"output": "", "map": {}, "search": search,
                          "count": 0}

            # Find contacts with a matching name
            if not self._stop_work and "contacts" == search["table"] \
            and match_words:
                count = 0
                contacts = search["db"].get_contacts()
                # Possibly more: country (ISO code, need map), birthday
                # (base has YYYYMMDD in integer field).
                match_fields = [
                    "displayname", "skypename", "province", "city",
                    "pstnnumber", "phone_home", "phone_office",
                    "phone_mobile", "homepage", "emails", "about",
                    "mood_text",
                ]
                template_contact = step.Template(TEMPLATES["contact"])
                for contact in contacts:
                    match = False
                    # {field name: value, with matched words highlighted}
                    fields_filled = {}
                    for field in match_fields:
                        if contact[field]:
                            val = contact[field]
                            if self.match_all(val, match_words):
                                match = True
                                val = pattern_replace.sub(wrap_b, val)
                            fields_filled[field] = val
                    if match:
                        count += 1
                        result_count += 1
                        result["output"] += template_contact.expand(
                            locals())
                        if not (self._drop_results
                        or count % conf.SearchResultsChunk):
                            result["count"] = result_count
                            self.postback(result)
                            result = {"output": "", "map": {},
                                      "search": search, "count": 0}
                    if self._stop_work:
                        break # break for contact in contacts
            if result["output"] and not self._drop_results:
                result["count"] = result_count
                self.postback(result)
                result = {"output": "", "map": {}, "search": search,
                          "count": 0}

            # Find messages with a matching body
            if not self._stop_work and "messages" == search["table"]:
                template_message = step.Template(TEMPLATES["message"])
                count, result_type = 0, "messages"
                chat_messages = {} # {chat id: [message, ]}
                chat_order = []    # [chat id, ]
                messages = search["db"].get_messages(
                    additional_sql=sql, additional_params=params,
                    ascending=False, use_cache=False)
                for m in messages:
                    # NOTE(review): chat is None if the message conversation
                    # is not in chat_map, chat["id"] below would then raise
                    # and end the search with an error - confirm whether
                    # such messages can exist.
                    chat = chat_map.get(m["convo_id"])
                    body = parser.parse(
                        m, pattern_replace if match_words else None, output)
                    count += 1
                    result_count += 1
                    result["output"] += template_message.expand(locals())
                    key = "message:%s" % m["id"]
                    result["map"][key] = {"chat": chat["id"],
                                          "message": m["id"]}
                    # Text output is posted back after every message,
                    # HTML output after every full chunk
                    if is_text_output or (not self._drop_results
                    and not count % conf.SearchResultsChunk):
                        result["count"] = result_count
                        self.postback(result)
                        result = {"output": "", "map": {},
                                  "search": search, "count": 0}
                    # The message limit applies to HTML output only
                    if self._stop_work or (not is_text_output
                    and count >= conf.SearchMessagesMax):
                        break # break for m in messages

            infotext = search["table"]
            if not self._stop_work and "all tables" == search["table"]:
                infotext, result_type = "", "table row"
                # Search over all fields of all tables.
                template_table = step.Template(TEMPLATES["table"])
                template_row = step.Template(TEMPLATES["row"])
                for table in search["db"].get_tables():
                    table["columns"] = search["db"].get_table_columns(
                        table["name"])
                    sql, params, words = \
                        query_parser.Parse(search["text"], table)
                    if not sql:
                        continue # continue for table in search["db"]..
                    infotext += (", " if infotext else "") + table["name"]
                    rows = search["db"].execute(sql, params)
                    row = rows.fetchone()
                    if not row:
                        continue # continue for table in search["db"]..
                    # Each table with results starts with its own header
                    result["output"] = template_table.expand(locals())
                    count = 0
                    while row:
                        count += 1
                        result_count += 1
                        result["output"] += template_row.expand(locals())
                        key = "table:%s:%s" % (table["name"], count)
                        result["map"][key] = {"table": table["name"],
                                              "row": row}
                        if not count % conf.SearchResultsChunk \
                        and not self._drop_results:
                            result["count"] = result_count
                            self.postback(result)
                            result = {"output": "", "map": {},
                                      "search": search, "count": 0}
                        # The row limit applies to HTML output only, and is
                        # checked against the total over all tables
                        if self._stop_work or (not is_text_output
                        and result_count >= conf.SearchTableRowsMax):
                            break # break while row
                        row = rows.fetchone()
                    if not self._drop_results:
                        if not is_text_output:
                            result["output"] += "</table>"
                        result["count"] = result_count
                        self.postback(result)
                        result = {"output": "", "map": {},
                                  "search": search, "count": 0}
                    infotext += " (%s)" % util.plural("result", count)
                    if self._stop_work or (
                    not is_text_output
                    and result_count >= conf.SearchTableRowsMax):
                        break # break for table in search["db"]..
                # A comma in infotext means more than one table was listed
                single_table = ("," not in infotext)
                infotext = "table%s: %s" % \
                           ("" if single_table else "s", infotext)
                if not single_table:
                    infotext += "; %s in total" % \
                                util.plural("result", result_count)
            final_text = "No matches found."
            if self._drop_results:
                result["output"] = ""
            if result_count:
                final_text = "Finished searching %s." % infotext

            if self._stop_work:
                final_text += " Stopped by user."
            elif "messages" == result_type and not is_text_output \
            and count >= conf.SearchMessagesMax:
                final_text += " Stopped at %s limit %s." % \
                              (result_type, conf.SearchMessagesMax)
            elif "table row" == result_type and not is_text_output \
            and count >= conf.SearchTableRowsMax:
                final_text += " Stopped at %s limit %s." % \
                              (result_type, conf.SearchTableRowsMax)

            result["output"] += "</table><br /><br />%s</font>" % final_text
            # The HTML footer is not part of text output
            if is_text_output:
                result["output"] = ""
            result["done"] = True
            result["count"] = result_count
            self.postback(result)
            main.log("Search found %(count)s results." % result)
        except Exception as e:
            # Report the failure to the result receiver, and carry on
            # with the next search
            if not result:
                result = {}
            result["done"], result["error"] = True, traceback.format_exc()
            result["error_short"] = "%s: %s" % (type(e).__name__, e.message)
            self.postback(result)
def get_chat_diff_left(self, chat, db1, db2):
    """
    Compares the chat in the two databases and returns the differences from
    the left as
    {"messages": [message IDs different in db1],
     "participants": [participants different in db1] }.

    Messages with a remote_id are different if db2 has no message with the
    same remote_id and the same author-type-body text; messages without a
    remote_id are different if db2 has no message with the same text that
    self.match_time() accepts with a leeway of 180. Message IDs are returned
    ordered by message datetime.

    @param   chat  dict with the chat data dicts of either database under
                   keys "c1" and "c2", and values under "messages1" and
                   "messages2" that are false if that side has no messages
    @param   db1   database to retrieve the messages of chat["c1"] from
    @param   db2   database to retrieve the messages of chat["c2"] from
    """
    c = chat
    participants1 = c["c1"]["participants"] if c["c1"] else []
    participants2 = c["c2"]["participants"] if c["c2"] else []
    c2p_map = dict((p["identity"], p) for p in participants2)
    c1p_diff = [p for p in participants1 if p["identity"] not in c2p_map]
    c1m_diff = [] # [(id, datetime), ] messages different in chat 1

    # Defined for all cases, parsers are needed and created only if there
    # are messages to compare on both sides.
    messages1, messages2 = [], []
    parser1, parser2 = None, None
    if not c["messages1"]:
        pass # Left side empty, skip all messages
    elif not c["messages2"]:
        # Right side empty, take whole left
        messages_all = db1.get_messages(c["c1"], use_cache=False)
        c1m_diff = [(m["id"], m["datetime"]) for m in messages_all]
    else:
        messages1 = db1.get_messages(c["c1"], use_cache=False)
        messages2 = db2.get_messages(c["c2"], use_cache=False)
        parser1 = skypedata.MessageParser(db1)
        parser2 = skypedata.MessageParser(db2)

    m1map = {} # {remote_id: [(id, datetime), ], }
    m2map = {} # {remote_id: [(id, datetime), ], }
    m1_no_remote_ids = [] # [(id, datetime), ] with a NULL remote_id
    m2_no_remote_ids = [] # [(id, datetime), ] with a NULL remote_id
    m1bodymap = {} # {author+type+body: [(id, datetime), ], }
    m2bodymap = {} # {author+type+body: [(id, datetime), ], }
    # Diff texts are kept separately for either database: the same
    # (id, datetime) can be in both, as in copies of one database, and in
    # a shared dictionary the texts of one side would overwrite the other,
    # making messages with a differing body look identical.
    difftexts1 = {} # {(id, datetime): text, }
    difftexts2 = {} # {(id, datetime): text, }

    # Assemble maps by remote_id and create diff texts. remote_id is
    # not unique and can easily have duplicates.
    things = [
        (messages1, m1map, m1_no_remote_ids, m1bodymap, difftexts1, parser1),
        (messages2, m2map, m2_no_remote_ids, m2bodymap, difftexts2, parser2)
    ]
    for messages, idmap, noidmap, bodymap, difftexts, parser in things:
        for i, m in enumerate(messages):
            # Avoid keeping whole messages in memory, can run out.
            m_cache = (m["id"], m.get("datetime"))
            if m["remote_id"]:
                idmap.setdefault(m["remote_id"], []).append(m_cache)
            else:
                noidmap.append(m_cache)
            # In these messages, parsed body can differ even though
            # message is the same: contact names are taken from current
            # database values. Using raw values instead.
            if m["type"] in self.MESSAGE_TYPES_IGNORE_BODY:
                t = m["identities"]
                if skypedata.MESSAGES_TYPE_LEAVE == m["type"]:
                    t = m["author"]
            else:
                t = parser.parse(m, output={"format": "text"})
            t = t if isinstance(t, str) else t.encode("utf-8")
            author = (m["author"] or "").encode("utf-8")
            difftext = "%s-%s-%s" % (author, m["type"], t)
            difftexts[m_cache] = difftext
            bodymap.setdefault(difftext, []).append(m_cache)
            if i and not i % self.REFRESH_COUNT:
                self.yield_ui()

    # Compare assembled remote_id maps between databases and see if
    # there are no messages with matching body in the other database.
    remote_id_messages = [(r, j) for r, i in m1map.items() for j in i]
    for i, (remote_id, m) in enumerate(remote_id_messages):
        text = difftexts1[m]
        if not any(text == difftexts2[x] for x in m2map.get(remote_id, [])):
            c1m_diff.append(m) # Nothing with same remote_id+body
        if i and not i % self.REFRESH_COUNT:
            self.yield_ui()

    # For messages with no remote_id-s, compare by author-type-body key
    # and see if there are no matching messages close in time.
    for i, m in enumerate(m1_no_remote_ids):
        potential_matches = m2bodymap.get(difftexts1[m], [])
        if not any(self.match_time(m[1], m2[1], 180)
                   for m2 in potential_matches):
            c1m_diff.append(m)
        if i and not i % self.REFRESH_COUNT:
            self.yield_ui()

    message_ids1 = [m[0] for m in sorted(c1m_diff, key=lambda x: x[1])]
    result = {"messages": message_ids1, "participants": c1p_diff}
    return result
def get_chat_diff(self, chat, db1, db2):
    """
    Compares the chat in the two databases and returns the differences as
    {"messages": [[IDs of messages different in db1], [..db2]],
     "participants": [[participants different in db1], [..db2]]}.

    Messages with a remote_id are different if the other database has no
    message with the same remote_id and the same author-type-body text;
    messages without a remote_id are different if the other database has no
    message with the same text that self.match_time() accepts with a leeway
    of 180. Message IDs are returned ordered by message datetime.

    @param   chat  dict with the chat data dicts of either database under
                   keys "c1" and "c2", a false value if chat not in database
    @param   db1   database to retrieve the messages of chat["c1"] from
    @param   db2   database to retrieve the messages of chat["c2"] from
    """
    c = chat
    messages1 = db1.get_messages(c["c1"], use_cache=False) \
                if c["c1"] else []
    messages2 = db2.get_messages(c["c2"], use_cache=False) \
                if c["c2"] else []
    c1m_diff = [] # Messages different in chat 1
    c2m_diff = [] # Messages different in chat 2
    participants1 = c["c1"]["participants"] if c["c1"] else []
    participants2 = c["c2"]["participants"] if c["c2"] else []
    c1p_map = dict((p["identity"], p) for p in participants1)
    c2p_map = dict((p["identity"], p) for p in participants2)

    m1map = {} # {remote_id: [(id, datetime), ], }
    m2map = {} # {remote_id: [(id, datetime), ], }
    m1_no_remote_ids = [] # [(id, datetime), ] with a NULL remote_id
    m2_no_remote_ids = [] # [(id, datetime), ] with a NULL remote_id
    m1bodymap = {} # {author+type+body: [(id, datetime), ], }
    m2bodymap = {} # {author+type+body: [(id, datetime), ], }
    # Diff texts are kept separately for either database: the same
    # (id, datetime) can be in both, as in copies of one database, and in
    # a shared dictionary the texts of one side would overwrite the other,
    # making messages with a differing body look identical.
    difftexts1 = {} # {(id, datetime): text, }
    difftexts2 = {} # {(id, datetime): text, }

    # Skip comparing messages if one side is completely empty
    # NOTE(review): these checks assume get_messages() returns a container
    # that is false when empty; a generator would always be true - confirm.
    parser1, parser2 = None, None
    if not messages1:
        c2m_diff = [(m["id"], m.get("datetime")) for m in messages2]
        messages1, messages2 = [], []
    elif not messages2:
        c1m_diff = [(m["id"], m.get("datetime")) for m in messages1]
        messages1, messages2 = [], []
    else:
        parser1 = skypedata.MessageParser(db1)
        parser2 = skypedata.MessageParser(db2)

    # Assemble maps by remote_id and create diff texts. remote_id is
    # not unique and can easily have duplicates.
    things = [
        (messages1, m1map, m1_no_remote_ids, m1bodymap, difftexts1, parser1),
        (messages2, m2map, m2_no_remote_ids, m2bodymap, difftexts2, parser2)
    ]
    for messages, idmap, noidmap, bodymap, difftexts, parser in things:
        for i, m in enumerate(messages):
            # Avoid keeping whole messages in memory, can easily run out.
            m_cache = (m["id"], m.get("datetime"))
            if m["remote_id"]:
                idmap.setdefault(m["remote_id"], []).append(m_cache)
            else:
                noidmap.append(m_cache)
            # In these messages, parsed body can differ even though
            # message is the same: contact names are taken from current
            # database values. Using raw values instead.
            if m["type"] in self.MESSAGE_TYPES_IGNORE_BODY:
                t = m["identities"]
                if skypedata.MESSAGES_TYPE_LEAVE == m["type"]:
                    t = m["author"]
            else:
                t = parser.parse(m, output={"format": "text"})
            t = t if isinstance(t, str) else t.encode("utf-8")
            author = (m["author"] or "").encode("utf-8")
            difftext = "%s-%s-%s" % (author, m["type"], t)
            difftexts[m_cache] = difftext
            bodymap.setdefault(difftext, []).append(m_cache)
            if i and not i % self.REFRESH_COUNT:
                self.yield_ui()

    # Compare assembled remote_id maps between databases and see if there
    # are no messages with matching body in the other database.
    remote_id_maps = [(m1map, difftexts1, m2map, difftexts2, c1m_diff),
                      (m2map, difftexts2, m1map, difftexts1, c2m_diff)]
    for map1, texts1, map2, texts2, output in remote_id_maps:
        remote_id_messages = [(r, j) for r, i in map1.items() for j in i]
        for i, (remote_id, m) in enumerate(remote_id_messages):
            text = texts1[m]
            if not any(text == texts2[x] for x in map2.get(remote_id, [])):
                output.append(m) # Nothing with same remote_id and body
            if i and not i % self.REFRESH_COUNT:
                self.yield_ui()

    # For messages with no remote_id-s, compare by author-type-body key
    # and see if there are no matching messages sufficiently close in time.
    no_remote_ids = [(m1_no_remote_ids, difftexts1, c1m_diff, m2bodymap),
                     (m2_no_remote_ids, difftexts2, c2m_diff, m1bodymap)]
    for m_no_remote_ids, texts, output, mbodymap in no_remote_ids:
        for i, m in enumerate(m_no_remote_ids):
            potential_matches = mbodymap.get(texts[m], [])
            if not any(self.match_time(m[1], m2[1], 180)
                       for m2 in potential_matches):
                output.append(m)
            if i and not i % self.REFRESH_COUNT:
                self.yield_ui()

    c1p_diff = [p for p in participants1 if p["identity"] not in c2p_map]
    c2p_diff = [p for p in participants2 if p["identity"] not in c1p_map]

    # Sorting by key function: comparison functions are slower, and are
    # not supported by list.sort() at all in later Python versions.
    c1m_diff.sort(key=lambda x: x[1])
    c2m_diff.sort(key=lambda x: x[1])
    message_ids1 = [m[0] for m in c1m_diff]
    message_ids2 = [m[0] for m in c2m_diff]

    result = { "messages": [message_ids1, message_ids2],
               "participants": [c1p_diff, c2p_diff] }
    return result
def export_chat_template(chat, filename, db, messages):
    """
    Writes the chat out as an HTML or TXT file rendered from step templates.

    The messages template is streamed into a temporary file first; the main
    template is then streamed into the target file, with the temporary file
    content made available to it in 65536-byte chunks. The temporary file
    is handed to util.try_until for closing and deletion on the way out,
    also when the export raises.

    @param   chat      chat data dict, as returned from SkypeDatabase
    @param   filename  full path and filename of resulting file; a name
                       ending in .html selects the HTML templates, any
                       other name the TXT templates
    @param   db        SkypeDatabase instance
    @param   messages  list of message data dicts
    """
    messages_file, messages_path = None, None  # Temporary message storage
    try:
        is_html = filename.lower().endswith(".html")
        # Parser is asked to collect statistics for HTML output only.
        parser = skypedata.MessageParser(db, chat=chat, stats=is_html)
        namespace = {
            "db": db,
            "chat": chat,
            "messages": messages,
            "parser": parser,
        }

        if is_html:
            # HTML export embeds the chat picture and participant avatars.
            namespace["participants"] = []
            namespace["chat_picture_size"] = None
            namespace["chat_picture_raw"] = None

            if chat["meta_picture"]:
                picture_raw = skypedata.fix_image_raw(chat["meta_picture"])
                image_parser = ImageFile.Parser()
                image_parser.feed(picture_raw)
                picture = image_parser.close()
                namespace["chat_picture_size"] = picture.size
                namespace["chat_picture_raw"] = picture_raw

            for participant in chat["participants"]:
                # Templates get a copy, the original contact dict only
                # receives the raw avatar values fetched below.
                contact = participant["contact"].copy()
                namespace["participants"].append(contact)
                contact.update(avatar_raw_small="", avatar_raw_large="")

                bitmap = contact.get("avatar_bitmap")
                # NOTE(review): both keys were just reset to "" on the copy,
                # so these two lookups always start out empty and values
                # stored on participant["contact"] earlier are not reused
                # here - confirm whether that is intended.
                small = contact.get("avatar_raw_small")
                large = contact.get("avatar_raw_large")

                if not (small or bitmap):
                    small = skypedata.get_avatar_raw(contact,
                                                     conf.AvatarImageSize)
                    if small:
                        participant["contact"]["avatar_raw_small"] = small
                if bitmap:
                    # Bitmap takes precedence; fall back if conversion
                    # yields nothing.
                    small = util.wx_bitmap_to_raw(bitmap) or small
                if not small:
                    continue  # No avatar: copy keeps the empty strings

                if not large:
                    large = skypedata.get_avatar_raw(
                        contact, conf.AvatarImageLargeSize)
                    participant["contact"]["avatar_raw_large"] = large
                contact["avatar_raw_small"] = small
                contact["avatar_raw_large"] = large

        # Headers of both HTML and TXT output show statistics that are only
        # known once every message has been parsed, so messages go to a
        # temporary file first and the main file is written afterwards.
        # Holding all messages in memory is not an option - very large chats
        # (500,000+ messages) can take gigabytes.
        messages_path = util.unique_path("%s.messages" % filename)
        messages_file = open(messages_path, "w+")
        if is_html:
            messages_template = templates.CHAT_MESSAGES_HTML
        else:
            messages_template = templates.CHAT_MESSAGES_TXT
        step.Template(messages_template, strip=False).stream(messages_file,
                                                             namespace)

        stats = parser.get_collected_stats()
        namespace["stats"] = stats
        date1, date2 = "", ""
        if stats.get("startdate"):
            date1 = stats["startdate"].strftime("%d.%m.%Y")
        if stats.get("enddate"):
            date2 = stats["enddate"].strftime("%d.%m.%Y")
        namespace["date1"] = date1
        namespace["date2"] = date2
        # Keep only the emoticon names present as attributes of emoticons.
        namespace["emoticons_used"] = [
            name for name in parser.emoticons_unique
            if hasattr(emoticons, name)
        ]
        namespace["message_count"] = stats.get("messages", 0)

        # Rewind the temporary file and expose it as an iterator of chunks,
        # ending at the first empty read.
        messages_file.flush()
        messages_file.seek(0)

        def read_chunk():
            return messages_file.read(65536)

        namespace["message_buffer"] = iter(read_chunk, "")

        if is_html:
            main_template = templates.CHAT_HTML
        else:
            main_template = templates.CHAT_TXT
        with open(filename, "w") as target:
            step.Template(main_template, strip=False).stream(target,
                                                             namespace)
    finally:
        if messages_file:
            util.try_until(messages_file.close)
        if messages_path:
            util.try_until(lambda: os.unlink(messages_path))
def export_chat(chat, messages, filename, db): """ Exports the chat messages to file. @param chat chat data dict, as returned from SkypeDatabase @param messages list of message data dicts @param filename full path and filename of resulting file, file extension .html|.txt determines file format @param db SkypeDatabase instance """ result = False f = None try: is_html = filename.lower().endswith(".html") is_csv = filename.lower().endswith(".csv") is_txt = filename.lower().endswith(".txt") if is_txt: f = codecs.open(filename, "w", "utf-8") else: f = open(filename, "w") # @todo add stats? parser = skypedata.MessageParser(db) chat_title = chat["title_long_lc"] main_data = { "title": chat_title, "date1": messages[0]["datetime"].strftime("%d.%m.%Y") \ if len(messages) else "", "date2": messages[-1]["datetime"].strftime("%d.%m.%Y") \ if len(messages) else "", "messages_total": util.plural("message", chat["message_count"]), "chat_created": chat["created_datetime"].strftime("%d.%m.%Y") \ if chat["created_datetime"] else "", "app": conf.Title, "now": datetime.datetime.now() \ .strftime("%d.%m.%Y %H:%M"), "db": db.filename, "count": str(len(messages)), "chat_info": "Showing %s" \ % util.plural("message", len(messages)), } if is_html: # Write HTML header and table header header_data = dict([(k, escape(v)) for k, v in main_data.items()]) if header_data["date1"] and header_data["date2"]: header_data["chat_info"] += \ " from <b>%(date1)s</b> to <b>%(date2)s</b>" % header_data header_data["chat_info"] += ".<br />" if header_data["chat_created"]: header_data["chat_info"] += \ "Chat created on <b>%(chat_created)s</b>" % header_data if header_data["messages_total"]: header_data["chat_info"] += \ ("," if header_data["chat_created"] else "Chat has") +\ " <b>%(messages_total)s</b> in total" % header_data if header_data["chat_created"] or header_data["messages_total"]: header_data["chat_info"] += ".<br />" header_data.update({ "title": "History of Skype " + header_data["title"], 
"css_avatars": "", "css_chat_picture": "", "header_left": "", "header_right": "", "header_link": "" }) if chat["meta_picture"]: raw = chat["meta_picture"].encode("latin1") if raw.startswith("\0"): # For some reason, Skype image blobs # can start with a null byte. raw = raw[1:] if raw.startswith("\0"): raw = "\xFF" + raw[1:] header_data["header_left"] = htmltag("span", { "class": "chat_picture", "title": chat["title"] }) img = wx.ImageFromStream(cStringIO.StringIO(raw)) header_data["css_chat_picture"] = cssrule("span.chat_picture",{ "background": "url(data:image/jpg;base64,%s) " \ "center center no-repeat" % base64.b64encode(raw), "margin": "0px 10px 0px 10px", "display": "block", "width": "%spx" % img.Width, "height": "%spx" % img.Height, }) if chat["participants"]: for p in chat["participants"]: avatar_class = "avatar__default" if "avatar_image" in p["contact"] \ and p["contact"]["avatar_image"]: raw = p["contact"]["avatar_image"].encode("latin1") if raw.startswith("\0"): # For some reason, Skype avatar image blobs # can start with a null byte. 
raw = raw[1:] if raw.startswith("\0"): #raw = raw[1:] raw = "\xFF" + raw[1:] # Replace dots and commas, as they are # not valid CSS identifier characters avatar_class = "avatar_" \ + p["identity"].replace(".", "___") \ .replace(",", "---") header_data["css_avatars"] += cssrule( "span.%s" % avatar_class, { "background": "url(data:image/jpg;base64,%s) center " \ "center no-repeat" % base64.b64encode(raw) }) if skypedata.CHATS_TYPE_SINGLE == chat["type"]: title = p["contact"]["name"] name = escape(p["contact"]["name"]) if p["contact"]["name"] != p["identity"]: title += " (%s)" % p["identity"] name += "<br />(%s)" % escape(p["identity"]) side = "right" if (p["identity"] == db.id) else "left" header_data["header_%s" % side] = "<div>%s" \ "<br />%s</div>" % ( htmltag("span", {"title": title, "class": "avatar header %s" % avatar_class} ), name ) if skypedata.CHATS_TYPE_SINGLE != chat["type"]: header_data["header_link"] = PARTICIPANTS_LINK for k, v in header_data.items(): header_data[k] = str(v) f.write(CHAT_HTML_HEADER % header_data) if skypedata.CHATS_TYPE_SINGLE != chat["type"]: for p in sorted(chat["participants"], key=lambda a: a["contact"]["name"].lower()): img_attr = {"class": "avatar avatar__default"} img_attr["title"] = p["contact"]["name"] if p["contact"]["name"] != p["identity"]: img_attr["title"] += " (%s)" % p["identity"] if "avatar_image" in p["contact"] \ and p["contact"]["avatar_image"]: # Replace dots and commas, as they are not valid # CSS identifier characters img_attr["class"] = "avatar avatar_%s" \ % p["identity"].replace(".", "___") \ .replace(",", "---") name = escape(p["contact"]["name"]) if p["contact"]["name"] != p["identity"]: name += "<br />(%s)" % escape(p["identity"]) f.write(" <span>%(img)s%(name)s</span>\n" % { "name": name, "img": htmltag("span", img_attr), }) f.write(" </div>\r\n </td>\r\n</tr>\r\n</table>\r\n" \ "</td></tr><tr><td><table class='content_table'>\r\n") elif is_txt: main_data["hr"] = "-" * 79 f.write("History of Skype 
%(title)s.\r\n" \ "Showing %(count)s messages" % main_data) if main_data["date1"] and main_data["date2"]: f.write(" from %(date1)s to %(date2)s" % main_data) f.write(".\r\n") if main_data["chat_created"]: f.write("Chat created on %(chat_created)s" % main_data) else: f.write("Chat has") if main_data["messages_total"]: f.write(("," if main_data["chat_created"] else "") + " %(messages_total)s in total" % main_data) f.write(".\r\n") f.write( "Source: %(db)s.\r\n" \ "Exported with %(app)s on %(now)s." \ "\r\n%(hr)s\r\n" % main_data ) elif is_csv: # Initialize CSV writer and write header row dialect = csv.excel # Default is "," which is actually not Excel dialect.delimiter = ";" # Default is "\r\n", which causes another "\r" to be written dialect.lineterminator = "\r" csv_writer = csv.writer(f, dialect) csv_writer.writerow(["Time", "Author", "Message"]) colourmap = collections.defaultdict(lambda: "remote") colourmap[db.id] = "local" previous_day = datetime.date.fromtimestamp(0) for m in messages: if m["datetime"].date() != previous_day: # Day has changed: insert a date header previous_day = m["datetime"].date() weekday = previous_day.strftime("%A").capitalize() date = previous_day.strftime("%d. 
%B %Y") if locale.getpreferredencoding(): weekday = weekday.decode(locale.getpreferredencoding()) date = date.decode(locale.getpreferredencoding()) if is_html: f.write( "<tr>\r\n\t<td class='t1'></td>\r\n\t" \ "<td class='day t2'></td>\r\n\t" \ "<td class='day t3'></td>\r\n\t" \ "<td class='day' colspan='2'><span class='weekday'>" \ "%(weekday)s</span>, %(date)s</td>\r\n</tr>\r\n" % { "weekday": escape(weekday), "date": escape(date) }) elif is_txt: f.write("\r\n%(weekday)s, %(date)s\r\n%(hr)s\r\n\r\n" % { "weekday": weekday, "date": date, "hr": "-" * 40 }) if is_html: body = parser.parse(m, html={"w": -1, "export": True}) f.write("<tr>\r\n\t" \ "<td class='author %(authorclass)s' colspan='2'>" \ "%(name)s</td>\r\n\t" \ "<td class='t3'></td>\r\n\t<td>%(text)s</td>\r\n\t" \ "<td class='timestamp'>%(time)s</td>\r\n</tr>\r\n" % { "authorclass": colourmap[m["author"]], "time": m["datetime"].strftime("%H:%S"), "name": escape(m["from_dispname"]), "text": body }) else: parsed_text = parser.parse(m, text=True) try: parsed_text = parsed_text.decode("utf-8") except Exception, e: pass if is_txt: f.write( "%(datetime)s %(name)s:\r\n%(text)s\r\n\r\n" % { "datetime": m["datetime"].strftime("%H:%S"), "name": m["from_dispname"], "text": parsed_text }) elif is_csv: try: parsed_text = parser.parse(m, text=True) parsed_text = parsed_text.decode("utf-8") except Exception, e: pass values = [ m["datetime"].strftime("%Y-%m-%d %H:%M:%S"), m["from_dispname"].encode("utf-8"), parsed_text.encode("utf-8") ] csv_writer.writerow(values)