def get_date(self):
    '''Return the date of the last revision as a 'YYYYMMDD' string.

    Rewrites the permalink into a Special:Cite URL for the same page,
    fetches that page, and parses its "Date of last revision" list item.

    Raises:
        ValueError: if the permalink does not split into exactly two
            parts around '?title='.
        AttributeError: if the revision-date pattern is not found in the
            fetched page (re.search returns None).
        Lookup errors from fe.MONTH2DIGIT propagate when the month
        prefix is unknown.
    '''
    # Original author's note: meta is often foobar because of proxy bugs.
    pre, po = self.get_permalink().split('?title=')
    citelink = pre + '?title=Special:Cite&page=' + po
    # Only the third element of get_HTML's four-item result is used here.
    _, _, cite_HTML_u, _ = get_HTML(citelink)
    # Raw string: '\d' and '\w' are regex escapes, not string escapes.
    day, month, year = re.search(
        r'''<li> Date of last revision: (\d{1,2}) (\w+) (\d\d\d\d)''',
        cite_HTML_u).groups()
    # Month names are looked up by their lower-cased three-letter prefix.
    month = fe.MONTH2DIGIT[month[0:3].lower()]
    return '%d%02d%02d' % (int(year), int(month), int(day))
def __init__(self, url, comment):
    '''Fetch url and store the results, plus comment, on the instance.

    Sets self.url and self.comment from the arguments. The four values
    returned by get_HTML(url, cache_control='no-cache') are stored as
    self.html_b, self.HTML_p, self.html_u and self.resp; all four are
    None if the fetch raises IOError. self.text is get_text(url) when
    self.html_b is truthy, otherwise None.
    '''
    # The former trailing comma after print(...) was a Python 2 leftover;
    # under Python 3 it only built a discarded tuple.
    print("Scraping default Web page;")
    self.url = url
    self.comment = comment
    try:
        self.html_b, self.HTML_p, self.html_u, self.resp = get_HTML(
            url, cache_control='no-cache')
    except IOError:
        # Fetch failed: leave every page attribute empty instead of
        # aborting construction.
        self.html_b, self.HTML_p, self.html_u, self.resp = (
            None, None, None, None)
    self.text = None
    if self.html_b:
        # Text is extracted only when the fetch produced content.
        self.text = get_text(url)
def get_date(self):
    '''Return the revision date of the permalink as a 'YYYYMMDD' string.

    Finds the date within markup such as
    <span id="mw-revision-date">19:09, 1 April 2008</span>
    in the page fetched from self.get_permalink(). The time of day is
    matched but not used.

    Raises:
        AttributeError: if the mw-revision-date span is not found in the
            fetched page (re.search returns None).
        Lookup errors from fe.MONTH2DIGIT propagate when the month
        prefix is unknown.
    '''
    # Only the third element of get_HTML's four-item result is used here.
    _, _, versioned_HTML_u, _ = get_HTML(self.get_permalink())
    # Raw string: '\d' and '\w' are regex escapes, not string escapes.
    _time_of_day, day, month, year = re.search(
        r'''<span id="mw-revision-date">(.*?), (\d{1,2}) (\w+) (\d\d\d\d)</span>''',
        versioned_HTML_u).groups()
    # Month names are looked up by their lower-cased three-letter prefix.
    month = fe.MONTH2DIGIT[month[0:3].lower()]
    return '%d%02d%02d' % (int(year), int(month), int(day))