startmin = txtlog[13:15]
startsec = txtlog[15:17]
timezone = txtlog[17:20] + ":" + txtlog[20:22]

# variables to keep track of the time of the first and last messages
firsttstamp = ''  # this one only updates once
lasttstamp = ''   # this one keeps updating

# create an XML soup for the converted log with the header info
soup = BeautifulStoneSoup('<?xml version="1.0" encoding=""?>')

# populate the soup
chatTag = Tag(soup, "chat")
chatTag['xmlns'] = XMLNS
chatTag['account'] = account
chatTag['service'] = SERVICE
chatTag['adiumversion'] = ADIUMVERSION
chatTag['buildid'] = BUILDID
soup.append(chatTag)

# Parse a Pidgin log file and compose the Adium log file.
# Pidgin log format notes:
# 1) Messages start from the 2nd line.
# 2) The first line of a message starts with a timestamp enclosed in ():
#    * (5/6/2015 12:13:14 AM) -- for a msg date different from the one recorded in the filename
#    * (2:05:59 PM) -- for a msg date the same as the one recorded in the filename
# 3) If the current line is a continuation of a message started on a previous line,
#    it will not have the above time header.
# 4) After the timestamp, there is optional sender info before the next ":".
#    There is no sender info if the message is a status update.
#    * (2:05:59 PM) [email protected]
# Adium log format notes:
# 1) Sample log format:
#    <chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="*****@*****.**" service="Jabber" adiumversion="1.5.7" buildid="c72b164f75a7">
#    <event type="WindowOpened" sender="*****@*****.**" time="2014-12-04T16:14:01+08:00">
#    </event>
#    <message sender="*****@*****.**" time="2014-12-04T16:14:01+08:00">
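# A minimal sketch of matching the two Pidgin timestamp variants described in
# the notes above. The regex and the parse_line() helper are illustrative
# assumptions for this excerpt, not part of the original converter.
import re

PIDGIN_STAMP = re.compile(
    r'^\((?:(?P<date>\d{1,2}/\d{1,2}/\d{4}) )?'  # optional date, present when it differs from the filename
    r'(?P<time>\d{1,2}:\d{2}:\d{2} [AP]M)\) '    # time of day
    r'(?:(?P<sender>[^:]+): )?(?P<text>.*)$')    # optional sender; absent for status updates

def parse_line(line):
    m = PIDGIN_STAMP.match(line)
    if m is None:
        return None  # continuation line: no time header, belongs to the previous message
    return m.groupdict()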
def output(self, filename, os=sys.stdout):
    if self._articles is None:
        print 'Please parse file(s) first'
        return
    elif not self._articles:
        print 'No articles detected in parsed file.'
        return

    osoup = BeautifulStoneSoup()
    channel = Tag(osoup, 'channel', [('id', '1')])
    osoup.append(channel)

    # journal meta (extracted from the first article)
    article = self._articles[0]
    jtitle = Tag(osoup, 'title')
    jtitle.append('Science Magazine')
    jhomepage = Tag(osoup, 'homepage')
    jhomepage.append('http://www.sciencemag.org/content/current')
    pubdate = article.find('pub-date')
    jpubdate = Tag(osoup, 'pubDate')
    jpubdate.append('%(year)s-%(month)s-%(day)s' % {'year': pubdate.year.contents[0],
                                                    'month': pubdate.month.contents[0],
                                                    'day': pubdate.day.contents[0]})
    jvolume = article.find('volume')
    jissue = article.find('issue')
    jcoverimg = Tag(osoup, 'image', [('type', 'cover'), ('url', 'http://coverurl')])
    channel.append(jtitle)
    channel.append(jhomepage)
    channel.append(jpubdate)
    channel.append(jvolume)
    channel.append(jissue)
    channel.append(jcoverimg)

    # article meta
    sectiondict = {}
    for article in self._articles:
        # item, fpage
        fpagetag = article.find('fpage')
        fpage = fpagetag.contents[0]
        try:
            subpage = fpagetag['seq']
        except KeyError:
            subpage = ''
        item = Tag(osoup, 'item', [('fpage', fpage), ('subpage', subpage)])

        # title
        atitle = article.find('article-title')
        atitle.name = 'title'

        # pubdate
        pubdate = article.find('pub-date')
        apubdate = Tag(osoup, 'pubDate')
        apubdate.append('%(year)s-%(month)s-%(day)s' % {'year': pubdate.year.contents[0],
                                                        'month': pubdate.month.contents[0],
                                                        'day': pubdate.day.contents[0]})

        # overline
        overline = article.find('subj-group', {'subj-group-type': 'heading'}).subject.contents[0]
        aoverline = Tag(osoup, 'overline')
        aoverline.append(overline)

        # authors
        contribs = article.findAll('contrib')
        acontribs = Tag(osoup, 'authors')
        for c in contribs:
            ctype = c['contrib-type']
            fname = c.find('given-names').contents[0]
            lname = c.find('surname').contents[0]
            acontrib = Tag(osoup, ctype)
            afname = Tag(osoup, 'fname')
            afname.append(fname)
            alname = Tag(osoup, 'lname')
            alname.append(lname)
            acontrib.append(alname)
            acontrib.append(afname)
            acontribs.append(acontrib)

        # summary
        teaser = article.find('abstract', {'abstract-type': 'teaser'})
        if not teaser:
            teaser = ''
        asummary = Tag(osoup, 'summary')
        asummary.append(teaser)

        # text
        abody = article.find('body')
        try:
            abody.name = 'text'
        except AttributeError:
            abody = Tag(osoup, 'text')

        # images
        aimages = Tag(osoup, 'images')
        aimage = Tag(osoup, 'image')
        aimage.append('test image')
        aimages.append(aimage)

        # links
        alinks = Tag(osoup, 'links')
        alinkabs = Tag(osoup, 'link', [('type', 'abstract')])
        alinkabs.append('abstract url')
        alinkfull = Tag(osoup, 'link', [('type', 'full')])
        alinkfull.append('full url')
        alinks.append(alinkabs)
        alinks.append(alinkfull)

        # categories
        acats = Tag(osoup, 'categories')
        cats = article.findAll('subj-group', {'subj-group-type': 'field'})
        for cat in cats:
            acat = Tag(osoup, 'category')
            acat.append(cat.subject.contents[0])
            acats.append(acat)

        # section (created once per section name, then reused)
        sectionname = article.find('subj-group', {'subj-group-type': 'article-type'}).subject.contents[0]
        if sectionname in sectiondict:
            section = sectiondict[sectionname]
        else:
            section = Tag(osoup, 'section', [('id', '#'), ('title', sectionname)])
            sectiondict[sectionname] = section
            channel.append(section)

        # build item
        item.append(atitle)
        item.append(apubdate)
        item.append(aoverline)
        item.append(acontribs)
        item.append(asummary)
        item.append(abody)
        item.append(aimages)
        item.append(alinks)
        item.append(acats)
        section.append(item)
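# Hypothetical driver for the method above: the class name "ArticleParser" and
# its parse() method are assumptions; only output() appears in this excerpt,
# and it expects self._articles to be populated before it is called.
if __name__ == '__main__':
    parser = ArticleParser()
    parser.parse('issue.xml')     # fills parser._articles with NLM <article> soups
    parser.output('channel.xml')  # builds the <channel> tree shown above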
def format_wp(outfile):
    # extract the list of threads
    threads = json.load(open(dumpfile, 'r'))

    # set up xml output
    f = open(outfile, 'w')
    soup = BeautifulStoneSoup()
    soup.append(ProcessingInstruction('xml version="1.0" encoding="UTF-8"'))
    rss = Tag(soup, 'rss', [('version', '2.0'),
                            ('xmlns:excerpt', 'http://wordpress.org/export/1.0/excerpt/'),
                            ('xmlns:content', 'http://purl.org/rss/1.0/modules/content/'),
                            ('xmlns:wfw', 'http://wellformedweb.org/CommentAPI/'),
                            ('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                            ('xmlns:wp', 'http://wordpress.org/export/1.0/')])
    soup.append(rss)
    channel = Tag(soup, 'channel')
    clink = Tag(soup, 'link')
    clink.append('http://news.sciencemag.org/scienceinsider')
    rss.append(channel)
    channel.append(clink)

    print 'Reformatting comments in', len(threads), 'threads from json into xml...'

    # process threads in ascending order of their uniq key
    threads = sorted(threads.iteritems(), key=operator.itemgetter(0))
    ncom = 0
    for uniq, thread in threads:
        item = Tag(soup, 'item')
        channel.append(item)
        title = Tag(soup, 'title')
        title.append('Comments for ' + thread[0]['url'])
        link = Tag(soup, 'link')
        link.append(thread[0]['url'])
        guid = Tag(soup, 'guid', [('isPermaLink', 'false')])
        guid.append(thread[0]['url'])
        id = Tag(soup, 'wp:post_id')
        id.append(thread[0]['uniq'])
        item.append(title)
        item.append(link)
        item.append(guid)
        item.append(id)
        for comment in thread:
            ctag = Tag(soup, 'wp:comment')
            id = Tag(soup, 'wp:comment_id')
            id.append(comment['id'])
            author = Tag(soup, 'wp:comment_author')
            author.append(CData(comment['author']))
            email = Tag(soup, 'wp:comment_author_email')
            if comment['email']:
                email.append(comment['email'])
            ip = Tag(soup, 'wp:comment_author_ip')
            ip.append(comment['ip'])
            date = Tag(soup, 'wp:comment_date')
            date.append(comment['created_on'])
            dategmt = Tag(soup, 'wp:comment_date_gmt')
            dategmt.append(comment['created_on'])
            text = Tag(soup, 'wp:comment_content')
            text.append(CData(comment['text']))
            status = Tag(soup, 'wp:comment_approved')
            status.append('1')
            type = Tag(soup, 'wp:comment_type')
            parent = Tag(soup, 'wp:comment_parent')
            parent.append('0')
            user = Tag(soup, 'wp:comment_user_id')
            user.append('0')
            item.append(ctag)
            ctag.append(id)
            ctag.append(author)
            ctag.append(email)
            ctag.append(ip)
            ctag.append(date)
            ctag.append(dategmt)
            ctag.append(text)
            ctag.append(status)
            ctag.append(type)
            ctag.append(parent)
            ctag.append(user)
            ncom += 1

    print 'Wrote', ncom, 'comments.'
    print_soup(f, soup)
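# print_soup() is not defined in this excerpt. A minimal sketch of what it
# presumably does under BeautifulSoup 3 (serialize the soup to the already-open
# file handle); the implementation here is an assumption.
def print_soup(f, soup):
    f.write(soup.prettify())  # prettify() renders the whole tree with indentation
    f.close()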