def parse_string(string, pattern, path=False):
    """Extract named fields from `string` using a brace-style `pattern`.

    `pattern` is literal text interspersed with ``{...}`` placeholders:

    * ``{key}`` captures the shortest run of characters that still lets the
      rest of the pattern match.  When `path` is true, that run may not
      contain the path separator ``sep``.
    * ``{key literal}`` (key, one space, then arbitrary text) optionally
      matches the exact text ``literal``; its value is ``None`` when the
      text is absent.

    The same key may appear several times; every occurrence must then
    capture the same value.

    The pattern is searched for, not anchored, so it may match anywhere in
    `string`.

    Returns a dict mapping each key to its captured value, or ``None`` if
    the pattern does not match.

    Raises ParseError if a repeated key captured conflicting values.
    """
    parts = re.split(r'\{(.*?)\}', pattern)

    # Regex group name -> placeholder key.  Group names are generated so
    # that keys never have to be recovered by stripping digits (which would
    # mangle keys that themselves contain digits).
    group_keys = {}

    # re.split with one capture group alternates literal text (even
    # indices) and placeholder contents (odd indices).
    for index, part in enumerate(parts):
        if index % 2 == 0:
            parts[index] = re.escape(part)
            continue

        key, space, literal = part.partition(' ')
        name = 'g%d' % index
        group_keys[name] = key
        if space:
            # Passed as a %-argument, so a '%' in the literal is harmless.
            parts[index] = '(?P<%s>%s)?' % (name, re.escape(literal))
        else:
            char = '[^' + re.escape(sep) + ']' if path else '.'
            parts[index] = '(?P<%s>%s*?)' % (name, char)

    match = re.search(''.join(parts), string)
    if not match:
        return None

    results = {}
    for name, value in match.groupdict().items():
        key = group_keys[name]
        if key in results and results[key] != value:
            raise ParseError("Problem parsing string '%s'" % string)
        results[key] = value

    return results
def _parse_time(self, timestr, fmt):
    """Parse `timestr` into a datetime carrying a tzinfo.

    `timestr` is matched against ``self.TIMESTR_RE``, which is expected to
    produce exactly two groups: the date/time text and the offset text.
    Colons are removed from the offset text before it is handed to
    ``getoffset``.
    NOTE(review): ``getoffset`` is defined outside this block; it is
    presumed to turn the offset text into a tzinfo -- confirm.

    Parameters:
        timestr: the raw timestamp string.
        fmt: ``strptime`` format used for the date/time group.

    Returns the parsed datetime with ``tzinfo`` replaced.

    Raises ParseError if `timestr` does not match ``self.TIMESTR_RE`` or
    the match does not yield two usable groups.  A ``ValueError`` from
    ``strptime`` is not converted and propagates as-is.
    """
    match = re.match(self.TIMESTR_RE, timestr)
    if match is None:
        raise ParseError("problem parsing time string %s" % timestr)

    try:
        ts1, ts2 = match.groups()
        ts2 = ts2.replace(':', '')
    except (AttributeError, ValueError):
        # wrong number of groups, or the offset group did not participate
        raise ParseError("problem parsing time string %s" % timestr)

    dt = datetime.datetime.strptime(ts1, fmt)
    return dt.replace(tzinfo=getoffset(None, ts2))
def _parse_line(self, line, conversation, source, transformed_source):
    """Parse one XML log line and return ``(cons, attrs)``.

    ``cons`` is the entry class to instantiate (Status, Event or Message)
    and ``attrs`` the keyword arguments collected for it.

    A comment node of the form ``<alternate>|<status text>`` preceding the
    element sets ``attrs['alternate']`` and supplies the html used for
    user-type statuses.

    Raises TypeError for an element with an unknown tag name and
    ParseError if the line contained no element at all.
    """
    attrs = {}
    cons = None
    status_html = []

    soup = BeautifulSoup(line, ['lxml', 'xml'])
    for elem in soup.children:
        if isinstance(elem, Comment):
            alternate, status_text = elem.split('|', 1)
            attrs['alternate'] = bool(alternate)
            status_html = [NavigableString(status_text)]
            continue

        # copy the raw attributes, defaulting to the empty string
        for key in ('alias', 'sender', 'auto', 'time'):
            attrs[key] = elem.get(key, '')

        is_user = attrs['sender'] == source
        if is_user:
            attrs['sender'] = transformed_source
        attrs['isuser'] = True if is_user else False

        attrs['auto'] = bool(attrs['auto'])
        if attrs['time']:
            attrs['time'] = self._parse_time(attrs['time'],
                                             self.STRPTIME_FMT_CONVERSATION)
        attrs['html'] = list(elem.children)

        tag = elem.name
        if tag == 'status':
            cons = Status
            attrs['type'] = self.STATUS_TYPEMAP.get(elem.get('type'), None)
            if attrs['type'] in Status.USER_TYPES:
                # the element body is the user's message; the status text
                # itself comes from the preceding comment
                attrs['msg_html'] = attrs['html']
                attrs['html'] = status_html
        elif tag == 'event':
            cons = Event
            attrs['type'] = self.EVENT_TYPEMAP.get(elem.get('type'), None)
        elif tag == 'message':
            cons = Message
        else:
            raise TypeError("unknown type '%s' for entry" % tag)

        if not (attrs['sender'] or attrs['alias']):
            print_d("%s is a system entry" % elem)
            attrs['system'] = True

    if not cons:
        raise ParseError("could not parse line: '%s'" % line)

    return cons, attrs
def parse_conversation(self, conversation):
    """Populate `conversation` from its XML log lines and return it.

    The first two items of ``conversation.lines`` are consumed (popped):
    the XML header, then the chat element line.  The chat element supplies
    the service, account and resource; an optional comment on that line of
    the form ``.../<parser name>...`` overrides
    ``conversation.original_parser_name`` (which otherwise defaults to
    ``self.type``).

    Every remaining line except ``</chat>`` is parsed with
    ``self._parse_line`` and appended to ``conversation.entries``.  An
    entry whose time is earlier than the latest time seen so far is marked
    ``delayed``.

    Raises ParseError if the chat element is missing, or if its
    account/service disagree with ``conversation.source`` /
    ``conversation.service``.
    """
    lines = conversation.lines
    lines.pop(0)  # XML header, not needed

    conversation.original_parser_name = self.type
    chat_found = False
    service = source = None
    for e in BeautifulSoup(lines.pop(0), ['lxml', 'xml']).children:
        if isinstance(e, Comment):
            conversation.original_parser_name = e.split('/')[1]
        else:
            chat_found = True
            service = self.SERVICE_MAP[e.get('service')]
            source = e.get('account')
            conversation.resource = e.get('resource')

    if not chat_found:
        raise ParseError("no chat element found in '%s'" %
                         conversation.path)

    transformed_source = self.TRANSFORMS['source'](source, conversation)
    if transformed_source != conversation.source or \
            service != conversation.service:
        raise ParseError("mismatch between path and chatinfo for '%s'" %
                         conversation.path)

    latest_time = conversation.time
    for line in lines:
        if line == "</chat>":
            continue

        cons, attrs = self._parse_line(line, conversation, source,
                                       transformed_source)
        if attrs['time'] < latest_time:
            attrs['delayed'] = True
        else:
            latest_time = attrs['time']

        try:
            conversation.entries.append(cons(**attrs))
        except Exception:
            # report the line that failed, then re-raise unchanged
            print_e("Problem with element %s" % line)
            raise

    return conversation
def write(self, path, conversations):
    """Write a single conversation to `path` as an HTML log.

    The output consists of a header comment naming the original parser,
    the title (via ``self._write_title``), one line per entry (via
    ``self._write_entry``) and a closing ``</body></html>`` line.  Delayed
    entries use ``TIME_FMT_CONVERSATION_WITH_DATE``; all others use
    ``TIME_FMT_CONVERSATION``.  After the file is closed,
    ``self.copy_images`` is called for the conversation.

    Raises ParseError unless `conversations` holds exactly one item.
    """
    if len(conversations) != 1:
        raise ParseError(("'%s' only supports one conversation "
                          "per file:\n '%s' has %i") %
                         (self.type, path, len(conversations)))

    conversation = conversations[0]

    # 'with' guarantees the handle is closed even if a write fails
    with codecs.open(path, 'wb', 'utf-8') as file_object:
        util.write_comment(
            file_object,
            const.HEADER_COMMENT % conversation.original_parser_name)
        self._write_title(file_object, conversation)

        for entry in conversation.entries:
            timefmt = self.TIME_FMT_CONVERSATION_WITH_DATE if entry.delayed \
                else self.TIME_FMT_CONVERSATION
            self._write_entry(file_object, entry, conversation, timefmt)
            file_object.write('\n')

        # newline at end
        file_object.write('</body></html>\n')

    self.copy_images(path, conversation)
def _parse_line(self, line, conversation, base_time):
    """Parse one HTML log line and return ``(cons, attrs)``.

    ``self._get_line_data`` splits the raw line into its content and an
    attached comment.  With neither present the line is skipped and
    ``(None, None)`` is returned; with only a comment the entry is rebuilt
    through ``Entry.from_dump``.  Otherwise the content is matched against
    the message, status and error line patterns, in that order.

    Timestamps without a date take their date from `base_time`; those with
    a full date are flagged ``delayed``.  Both take `base_time`'s tzinfo.

    Raises ParseError if no pattern matches the line.
    """
    attrs = {'alias': None, 'time': None, 'sender': None,
             'type': None, 'html': None}
    line, comment = self._get_line_data(line)

    if not line:
        if not comment:
            print_d("Skipping line %s" % line)
            return None, None
        # unrepresentable entry dump
        cons, attrs = Entry.from_dump(comment)
        return cons, attrs

    candidates = (self.MESSAGE_LINE_RE, self.STATUS_LINE_RE,
                  self.ERROR_LINE_RE)
    for regex in candidates:
        m = regex.match(line)
        if m:
            break
    else:
        raise ParseError("could not parse line '%s'" % line)

    if regex == self.MESSAGE_LINE_RE:
        # Message
        color = m.group('color')
        attrs['alternate'] = color == self.ALTERNATE_COLOR
        timestr = m.group('time')
        attrs['alias'] = m.group('name')
        attrs['auto'] = m.group('auto')
        htmlstr = m.group('html')
        if color == self.SOURCE_COLOR:
            attrs['sender'] = conversation.source
            attrs['isuser'] = True
        elif conversation.isgroup:
            # groupchats don't use aliases
            attrs['sender'] = comment if comment else attrs['alias']
        elif color == self.DESTINATION_COLOR:
            attrs['sender'] = conversation.destination
            attrs['isuser'] = False
        cons = Message
    elif regex == self.STATUS_LINE_RE:
        # Status
        timestr = m.group('time')
        htmlstr = m.group('html')
        cons = Status
    elif regex == self.ERROR_LINE_RE:
        # Error
        timestr = m.group('time')
        htmlstr = m.group('html')
        attrs['color'] = self.ERROR_COLOR
        cons = Status
        attrs['type'] = Status.ERROR

    parsed = parse(timestr, default=datetime.datetime.min, ignoretz=True)

    # delayed has full date in timestamp; otherwise the date is still the
    # default passed to parse() above
    has_full_date = parsed.date() != datetime.date.min
    attrs['delayed'] = has_full_date
    if has_full_date:
        attrs['time'] = parsed.replace(tzinfo=base_time.tzinfo)
    else:
        attrs['time'] = parsed.replace(year=base_time.year,
                                       month=base_time.month,
                                       day=base_time.day,
                                       tzinfo=base_time.tzinfo)

    wrapped = BeautifulSoup('<foo>%s</foo>' % htmlstr)
    attrs['html'] = list(wrapped.foo.children)

    # parse status
    if cons == Status:
        self._parse_status(comment, attrs, conversation)
        if not attrs['type']:
            print_d("No type found for status '%s': using SYSTEM" % line)
            attrs['type'] = Status.SYSTEM

    return (cons, attrs)
def parse_conversation(self, conversation):
    """Read the HTML log at ``conversation.path`` and populate `conversation`.

    The first line is the title; the values ``self._parse_title`` extracts
    from it must agree with the attributes already set on `conversation`
    (``resource`` is taken from the title because it cannot be known
    before reading the file).  A trailing line ending in ``</html>`` is
    dropped.

    Physical lines are joined until an entry ends with ``<br/>`` (or is a
    comment line starting with ``<!--``), then parsed with
    ``self._parse_line``.  A non-delayed entry whose time is earlier than
    the previous entry's is moved forward by one day.

    While parsing, an alias -> sender map is built; an alias seen with two
    different senders is dropped from the map and ignored from then on.
    The map and the parsed entries are handed to
    ``self._get_entries_and_images``.

    Returns `conversation`.

    Raises ParseError if the file has no title line, if the title
    disagrees with `conversation`, or if the last entry is unterminated.
    """
    with codecs.open(conversation.path, encoding='utf-8') as f:
        data = f.read().strip()

    lines = data.split('\n')
    if lines and not lines[-1]:
        del lines[-1]
    if lines and lines[-1].endswith('</html>'):
        del lines[-1]
    if not lines:
        raise ParseError("no title line found in '%s'" % conversation.path)

    title_line, comment = self._get_line_data(lines.pop(0))
    info, conversation.original_parser_name = \
        self._parse_title(title_line, comment, conversation)
    for k, v in info.items():
        # no way to determine resource without reading file first
        if k == 'resource':
            conversation.resource = v
        cv = getattr(conversation, k)
        if v != cv:
            raise ParseError("mismatch between filename and header "
                             "%s: '%s' != '%s'" % (k, v, cv))

    prev_time = conversation.time
    senders_by_alias = {}
    ignore_aliases = set()
    attrs_list = []

    i = 0
    while i < len(lines):
        # an entry may span several physical lines
        line = lines[i]
        while not (line.endswith('<br/>') or line.startswith('<!--')):
            i += 1
            if i >= len(lines):
                raise ParseError("unterminated entry at end of file: '%s'"
                                 % line)
            line += '\n' + lines[i]
        i += 1

        try:
            cons, attrs = self._parse_line(line, conversation, prev_time)
        except ArgumentError:
            print_e('Error on line %s' % line)
            raise
        if not attrs:
            continue

        # timestamps without a date wrapped past midnight
        if attrs['time'] < prev_time and not attrs['delayed']:
            attrs['time'] += datetime.timedelta(days=1)
        prev_time = attrs['time']

        s = attrs['sender']
        a = attrs['alias']
        if s and a and a not in ignore_aliases:
            s2 = senders_by_alias.get(a, s)
            if s != s2:
                print_d('Multiple senders found for %s (%s)' %
                        (a, '%s, %s' % (s, s2)))
                ignore_aliases.add(a)
                del senders_by_alias[a]
            else:
                # only record unambiguous aliases; re-adding after the
                # delete above would undo it
                senders_by_alias[a] = s

        attrs_list.append((cons, attrs))

    conversation.entries, conversation.images = \
        self._get_entries_and_images(conversation, senders_by_alias,
                                     attrs_list)

    return conversation
def write(self, path, conversations):
    """Write a single conversation to `path` as an XML chat log.

    Output layout: the XML header, a header comment naming the original
    parser, the opening ``chat`` element, one element per entry
    (``message``, ``status`` or ``event``) separated by newlines, and the
    closing ``</chat>`` tag.  An entry may be preceded by a comment of the
    form ``<alternate flag>|<status text>`` carrying data the XML element
    itself does not hold.  After the file is closed, ``self.copy_images``
    is called for the conversation.

    Raises ParseError unless `conversations` holds exactly one item.
    """
    if len(conversations) != 1:
        raise ParseError(
            ("'%s' only supports one conversation per file:"
             "\n %s has %i") % (self.type, path, len(conversations)))

    conversation = conversations[0]
    untransformed_source = self.UNTRANSFORMS['source'](conversation.source,
                                                       conversation)
    last_index = len(conversation.entries) - 1

    # 'with' guarantees the handle is closed even if a write fails
    with codecs.open(path, 'wb', 'utf-8') as file_object:
        file_object.write(self.XML_HEADER + '\n')

        attrs = dict(xmlns=self.XMLNS, account=untransformed_source,
                     service=self.PAM_ECIVRES[conversation.service],
                     resource=conversation.resource)
        # this attribute will only be useful if we're not the original
        # parser
        if conversation.isgroup and \
                conversation.original_parser_name != self.type:
            attrs['groupchat'] = "true"

        util.write_comment(
            file_object,
            const.HEADER_COMMENT % conversation.original_parser_name)
        self._write_xml(file_object, 'chat', attrs, conversation,
                        close=False)
        file_object.write('\n')

        for i, entry in enumerate(conversation.entries):
            attrs = {
                'alias': entry.alias,
                'sender': (untransformed_source
                           if entry.sender == conversation.source
                           else entry.sender),
            }
            if isinstance(entry, Message):
                name = 'message'
                if entry.auto:
                    attrs['auto'] = "true"
            elif isinstance(entry, Status):
                name = 'status'
                attrs['type'] = self.PAMEPYT_SUTATS[entry.type]
            elif isinstance(entry, Event):
                name = 'event'
                attrs['type'] = self.PAMEPYT_TNEVE[entry.type]
                # no alias for event
                attrs['alias'] = ''

            if entry.system:
                # no alias or sender for these
                del attrs['alias']
                del attrs['sender']
            elif not attrs['alias']:
                del attrs['alias']

            # Format the last two characters of the time format
            # separately and insert a ':' after the first three
            # characters of their output.
            # NOTE(review): presumably those two characters are '%z', so
            # '+HHMM' becomes '+HH:MM' -- confirm.
            f1 = self.TIME_FMT_CONVERSATION[:-2]
            f2 = self.TIME_FMT_CONVERSATION[-2:]
            v1 = entry.time.strftime(f1)
            v2 = entry.time.strftime(f2)
            attrs['time'] = v1 + v2[:3] + ':' + v2[3:]

            # comments should look like 1|status text
            comment = ['1', ''] if entry.alternate else ['', '']
            if isinstance(entry, Status) and \
                    entry.type in Status.USER_TYPES:
                htmlattr = 'msg_html'
                if entry.has_other_html:
                    comment[1] = ''.join([x.string for x in entry.html])
            else:
                htmlattr = 'html'

            # only emit the comment when it carries any information
            if any(comment):
                util.write_comment(file_object, '|'.join(comment))

            self._write_xml(file_object, name, attrs, conversation,
                            contents=getattr(entry, htmlattr))
            if i != last_index:
                file_object.write('\n')

        file_object.write('</chat>')

    self.copy_images(path, conversation)