def get_thread_mails(self):
    """Extract the correspondent's name and the messages of the current thread.

    Returns a dict with two keys:

    * ``'member'``: a dict whose ``'pseudo'`` entry is the cleaned text of the
      name node in the message heading, or ``'Unknown'`` when that node is
      missing.
    * ``'messages'``: a list of dicts with ``'date'``, ``'message'`` and
      ``'id_from'``, built by walking the thread's ``<li>`` items whose id
      contains ``message_`` in reverse document order.
    """
    member = {}
    messages = []
    result = {
        'member': member,
        'messages': messages,
    }

    try:
        name_node = self.document.getroot().cssselect(
            'div#message_heading div.username span.name')[0]
        member['pseudo'] = self.parser.tocleanstring(name_node)
    except IndexError:
        member['pseudo'] = 'Unknown'

    items = self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')
    for item in reversed(items):
        try:
            body_html = self.parser.tostring(
                item.xpath('.//div[@class="message_body"]')[0])
        except IndexError:
            # No message body in this item: it is a 'Match' message, skip it.
            continue

        text = html2text(body_html).strip()

        # The timestamp is embedded in an inline script as "<digits>, ".
        stamp_script = item.xpath('.//span[@class="timestamp"]//script')[0]
        found = re.search(r'(\d+), ', stamp_script.text)
        assert found
        sent_at = local2utc(datetime.fromtimestamp(int(found.group(1))))

        # Sender id: last path segment of the first link, without query string.
        sender = item.find('a').attrib['href'].split('/')[-1].split('?')[0]

        messages.append({
            'date': sent_at,
            'message': unicode(text),
            'id_from': unicode(sender),
        })

    return result
def parse_date(self, date_s):
    """Parse a date written as ``le DD/MM/YYYY à HH:MM.``.

    The text is stripped and encoded to UTF-8 before parsing; the format
    string is encoded the same way so both sides match.  An empty string
    yields the current time.  The result is passed through ``local2utc``
    (presumably a local-time to UTC conversion; it is defined outside this
    block).
    """
    raw = date_s.strip().encode('utf-8')
    if raw:
        fmt = u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8')
        return local2utc(datetime.strptime(raw, fmt))
    return local2utc(datetime.now())
def parse(self):
    """Fill in this comment's attributes from its HTML element ``self.div``.

    Always sets ``url``, ``title``, ``author``, ``username``, ``date``,
    ``body`` and ``score``.  ``signature`` is set only when the content holds
    a ``p.signature`` node (which is then removed before ``body`` is
    serialised), and ``relevance_url`` / ``relevance_token`` only when the
    footer contains at least one ``form.button_to``.
    """
    parser = self.browser.parser

    self.url = '%s#%s' % (self.preurl, self.div.attrib['id'])
    self.title = unicode(parser.select(self.div.find('h2'), 'a.title', 1).text)

    # First direct <p> child of the comment: holds author, time and score.
    meta = self.div.find('p')

    try:
        a = parser.select(meta, 'a[rel=author]', 1)
    except BrokenPageError:
        # The author link could not be selected: fall back to the
        # anonymous placeholder.
        self.author = 'Anonyme'
        self.username = None
    else:
        self.author = unicode(a.text)
        self.username = unicode(a.attrib['href'].split('/')[2])

    # Keep only the part before the first '+' (presumably the UTC offset)
    # so the remainder matches the strptime format below.
    stamp = parser.select(meta, 'time', 1).attrib['datetime'].split('+')[0]
    self.date = local2utc(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'))

    content = self.div.find('div')
    try:
        signature = parser.select(content, 'p.signature', 1)
    except BrokenPageError:
        # No signature.
        pass
    else:
        content.remove(signature)
        self.signature = parser.tostring(signature)
    self.body = parser.tostring(content)

    self.score = int(parser.select(meta, 'span.score', 1).text)

    forms = parser.select(self.div.find('footer'), 'form.button_to')
    if len(forms) > 0:
        action = forms[0].attrib['action']
        # str.rstrip() removes a *set of characters*, not a suffix, so
        # chaining rstrip('for').rstrip('against') could eat unrelated
        # trailing letters.  Drop exactly one trailing vote segment and keep
        # the slash before it.
        for verb in ('for', 'against'):
            if action.endswith('/' + verb):
                action = action[:-len(verb)]
                break
        self.relevance_url = action
        self.relevance_token = parser.select(
            forms[0], 'input[name=authenticity_token]', 1).attrib['value']
def __init__(self, browser, url, tree):
    """Build the content object for ``url`` from its parsed page ``tree``.

    ``url`` and ``id`` are always set.  When ``tree`` is ``None`` nothing
    else is parsed.  Otherwise ``title``, ``author``, ``username``, ``body``
    and ``score`` are read from the tree; ``date`` is set unless selecting
    the ``time`` node raises ``BrokenPageError``; ``relevance_url`` and
    ``relevance_token`` are set from footer forms whose action ends in
    ``/for``.
    """
    Content.__init__(self, browser)
    self.url = url
    self.id = url2id(self.url)

    if tree is None:
        return

    parser = self.browser.parser
    header = tree.find('header')
    self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])

    try:
        a = parser.select(header, 'a[rel=author]', 1)
    except BrokenPageError:
        # The author link could not be selected: fall back to the
        # anonymous placeholder.
        self.author = 'Anonyme'
        self.username = None
    else:
        self.author = unicode(a.text)
        self.username = unicode(a.attrib['href'].split('/')[2])

    self.body = parser.tostring(parser.select(tree, 'div.content', 1))

    try:
        # Keep only the part before the first '+' (presumably the UTC
        # offset) so the remainder matches the strptime format.
        stamp = parser.select(header, 'time', 1).attrib['datetime'].split('+')[0]
        self.date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        self.date = local2utc(self.date)
    except BrokenPageError:
        pass

    for form in parser.select(tree.find('footer'), 'form.button_to'):
        action = form.attrib['action']
        if action.endswith('/for'):
            # Drop the trailing "for" but keep the slash.  The previous
            # rstrip('for').rstrip('against') strips character sets rather
            # than suffixes; slicing gives the same result here without
            # relying on that accident.
            self.relevance_url = action[:-len('for')]
            self.relevance_token = parser.select(
                form, 'input[name=authenticity_token]', 1).attrib['value']

    self.score = int(parser.select(tree, 'div.figures figure.score', 1).text)
def parse_date(s):
    """Parse a date whose month is a French abbreviation.

    The French month abbreviations listed below are rewritten to their
    English three-letter form, then the text is handed to ``_parse_dt`` and
    the result is passed through ``local2utc`` (both defined outside this
    block).
    """
    # Order matters for August: u'Aoû' is a prefix of u'Août', so the longer
    # spelling must be replaced first.  Otherwise "Août" becomes "Augt" and
    # the u'Août' rule can never match.
    replacements = (
        (u'Fév', 'Feb'),
        (u'Avr', 'Apr'),
        (u'Mai', 'May'),
        (u'Juin', 'Jun'),
        (u'Juil', 'Jul'),
        (u'Ao\xfbt', 'Aug'),
        (u'Aoû', 'Aug'),
        (u'Déc', 'Dec'),
    )
    for french, english in replacements:
        s = s.replace(french, english)
    return local2utc(_parse_dt(s))
def parse_dt(s):
    """Turn a human-readable timestamp into a datetime passed through ``local2utc``.

    Recognised inputs:

    * ``None``: the current time is used.
    * text containing ``minutes ago``: the current time minus the number
      given as the first word.
    * ``"Yesterday – HH:MM"`` / ``"Today – HH:MM"`` (en dash): that day at the
      time parsed from the right-hand part by ``_parse_dt``.
    * anything else (e.g. ``"Dec 28, 2011"``): handed unchanged to
      ``_parse_dt``.

    Raises ``ValueError`` when the text contains an en dash but the part
    before it is neither ``Yesterday`` nor ``Today``.
    """
    now = datetime.datetime.now()
    if s is None:
        return local2utc(now)

    if 'minutes ago' in s:
        d = now - datetime.timedelta(minutes=int(s.split()[0]))
    elif u'–' in s:
        # Date in form : "Yesterday – 20:45"
        day, hour = [part.strip() for part in s.split(u'–')]
        if day == 'Yesterday':
            base = now - datetime.timedelta(days=1)
        elif day == 'Today':
            base = now
        else:
            # Previously this fell through with no date bound and crashed
            # with UnboundLocalError; fail with an explicit message instead.
            raise ValueError('unsupported relative day %r in %r' % (day, s))
        clock = _parse_dt(hour)
        d = datetime.datetime(base.year, base.month, base.day,
                              clock.hour, clock.minute)
    else:
        # Date in form : "Dec 28, 2011"
        d = _parse_dt(s)
    return local2utc(d)
def parse_dt(s):
    """Turn a human-readable timestamp into a datetime passed through ``local2utc``.

    Recognised inputs:

    * ``None``: the current time is used.
    * text containing ``minutes ago``: the current time minus the number
      given as the first word.
    * ``"Yesterday – HH:MM"`` / ``"Today – HH:MM"`` (en dash): that day at the
      time parsed from the right-hand part by ``_parse_dt``.
    * anything else (e.g. ``"Dec 28, 2011"``): handed unchanged to
      ``_parse_dt``.

    Raises ``ValueError`` when the text contains an en dash but the part
    before it is neither ``Yesterday`` nor ``Today``.
    """
    now = datetime.datetime.now()
    if s is None:
        return local2utc(now)

    if "minutes ago" in s:
        d = now - datetime.timedelta(minutes=int(s.split()[0]))
    elif u"–" in s:
        # Date in form : "Yesterday – 20:45"
        day, hour = [part.strip() for part in s.split(u"–")]
        if day == "Yesterday":
            base = now - datetime.timedelta(days=1)
        elif day == "Today":
            base = now
        else:
            # Previously this fell through with no date bound and crashed
            # with UnboundLocalError; fail with an explicit message instead.
            raise ValueError("unsupported relative day %r in %r" % (day, s))
        clock = _parse_dt(hour)
        d = datetime.datetime(base.year, base.month, base.day,
                              clock.hour, clock.minute)
    else:
        # Date in form : "Dec 28, 2011"
        d = _parse_dt(s)
    return local2utc(d)
def parse(self):
    """Fill in this comment's attributes from its HTML element ``self.div``.

    Always sets ``url``, ``title``, ``author``, ``username``, ``date``,
    ``body`` and ``score``.  ``signature`` is set only when the content holds
    a ``p.signature`` node (which is then removed before ``body`` is
    serialised), and ``relevance_url`` / ``relevance_token`` only when the
    footer contains at least one ``form.button_to``.
    """
    parser = self.browser.parser

    self.url = '%s#%s' % (self.preurl, self.div.attrib['id'])
    self.title = unicode(
        parser.select(self.div.find('h2'), 'a.title', 1).text)

    # First direct <p> child of the comment: holds author, time and score.
    meta = self.div.find('p')

    try:
        a = parser.select(meta, 'a[rel=author]', 1)
    except BrokenPageError:
        # The author link could not be selected: fall back to the
        # anonymous placeholder.
        self.author = 'Anonyme'
        self.username = None
    else:
        self.author = unicode(a.text)
        self.username = unicode(a.attrib['href'].split('/')[2])

    # Keep only the part before the first '+' (presumably the UTC offset)
    # so the remainder matches the strptime format below.
    stamp = parser.select(meta, 'time', 1).attrib['datetime'].split('+')[0]
    self.date = local2utc(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'))

    content = self.div.find('div')
    try:
        signature = parser.select(content, 'p.signature', 1)
    except BrokenPageError:
        # No signature.
        pass
    else:
        content.remove(signature)
        self.signature = parser.tostring(signature)
    self.body = parser.tostring(content)

    self.score = int(parser.select(meta, 'span.score', 1).text)

    forms = parser.select(self.div.find('footer'), 'form.button_to')
    if len(forms) > 0:
        action = forms[0].attrib['action']
        # str.rstrip() removes a *set of characters*, not a suffix, so
        # chaining rstrip('for').rstrip('against') could eat unrelated
        # trailing letters.  Drop exactly one trailing vote segment and keep
        # the slash before it.
        for verb in ('for', 'against'):
            if action.endswith('/' + verb):
                action = action[:-len(verb)]
                break
        self.relevance_url = action
        self.relevance_token = parser.select(
            forms[0], 'input[name=authenticity_token]', 1).attrib['value']
def __init__(self, browser, url, tree):
    """Build the content object for ``url`` from its parsed page ``tree``.

    ``url`` and ``id`` are always set.  When ``tree`` is ``None`` nothing
    else is parsed.  Otherwise ``title``, ``author``, ``username``, ``body``
    and ``score`` are read from the tree; ``date`` is set unless selecting
    the ``time`` node raises ``BrokenPageError``; ``relevance_url`` and
    ``relevance_token`` are set from footer forms whose action ends in
    ``/for``.
    """
    Content.__init__(self, browser)
    self.url = url
    self.id = url2id(self.url)

    if tree is None:
        return

    parser = self.browser.parser
    header = tree.find('header')
    # The title is assembled from every link found under the <h1>.
    self.title = u' — '.join(
        [a.text for a in header.find('h1').xpath('.//a')])

    try:
        a = parser.select(header, 'a[rel=author]', 1)
    except BrokenPageError:
        # The author link could not be selected: fall back to the
        # anonymous placeholder.
        self.author = 'Anonyme'
        self.username = None
    else:
        self.author = unicode(a.text)
        self.username = unicode(a.attrib['href'].split('/')[2])

    self.body = parser.tostring(parser.select(tree, 'div.content', 1))

    try:
        # Keep only the part before the first '+' (presumably the UTC
        # offset) so the remainder matches the strptime format.
        stamp = parser.select(header, 'time', 1).attrib['datetime'].split('+')[0]
        self.date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        self.date = local2utc(self.date)
    except BrokenPageError:
        pass

    for form in parser.select(tree.find('footer'), 'form.button_to'):
        action = form.attrib['action']
        if action.endswith('/for'):
            # Drop the trailing "for" but keep the slash.  The previous
            # rstrip('for').rstrip('against') strips character sets rather
            # than suffixes; slicing gives the same result here without
            # relying on that accident.
            self.relevance_url = action[:-len('for')]
            self.relevance_token = parser.select(
                form, 'input[name=authenticity_token]', 1).attrib['value']

    self.score = int(
        parser.select(tree, 'div.figures figure.score', 1).text)
def parse_dt(s):
    """Parse ``s`` with ``_parse_dt`` and pass the result through ``local2utc``.

    Both helpers are defined outside this block; ``local2utc`` presumably
    converts a local-time datetime to UTC.
    """
    return local2utc(_parse_dt(s))