Exemplo n.º 1
0
 def get_date(self):
     '''Return the date of the page's last revision as a 'YYYYMMDD' string.

     The date is scraped from the "Special:Cite" page built from this
     object's permalink.

     Original author's note: Meta is often foobar because of proxy bugs.

     Raises:
         ValueError: if the permalink does not split into exactly two
             parts around '?title='.
         AttributeError: if the "Date of last revision" line is not found
             in the fetched page (re.search returns None).
     '''
     pre, po = self.get_permalink().split('?title=')
     citelink = pre + '?title=Special:Cite&page=' + po
     # get_HTML yields four values; only the third one (the text searched
     # below) is used here.
     _, _, cite_HTML_u, _ = get_HTML(citelink)
     # Raw string: \d and \w must reach the regex engine untouched. In a
     # plain string literal they are invalid escape sequences.
     day, month, year = re.search(
         r'<li> Date of last revision: (\d{1,2}) (\w+) (\d\d\d\d)',
         cite_HTML_u).groups()
     # The month name is looked up by its lower-cased first three letters.
     month = fe.MONTH2DIGIT[month[0:3].lower()]
     return '%d%02d%02d' % (int(year), int(month), int(day))
Exemplo n.º 2
0
 def __init__(self, url, comment):
     '''Fetch `url` (bypassing caches) and store the results on the instance.

     Sets self.url and self.comment from the arguments, and the four
     values returned by get_HTML as self.html_b, self.HTML_p, self.html_u
     and self.resp. If the fetch raises IOError, all four are set to None.
     self.text is get_text(url) when self.html_b is truthy, otherwise None.
     '''
     # NOTE(review): the trailing comma looks like the Python 2 print
     # statement idiom for suppressing the newline -- confirm which Python
     # version this file targets before touching it.
     print("Scraping default Web page;"),
     self.url, self.comment = url, comment

     try:
         fetched = get_HTML(url, cache_control='no-cache')
     except IOError:
         # Best effort: a failed fetch leaves every result slot empty.
         fetched = (None, None, None, None)
     self.html_b, self.HTML_p, self.html_u, self.resp = fetched

     # Text is only extracted when the fetch produced content.
     self.text = get_text(url) if self.html_b else None
Exemplo n.º 3
0
 def get_date(self):
     '''Return the revision date of the permalinked page as 'YYYYMMDD'.

     The date is found within markup such as
     <span id="mw-revision-date">19:09, 1 April 2008</span>
     in the page fetched from this object's permalink. The time of day
     is matched but not used.

     Raises:
         AttributeError: if no such span is found in the fetched page
             (re.search returns None).
     '''
     # get_HTML yields four values; only the third one (the text searched
     # below) is used here.
     _, _, versioned_HTML_u, _ = get_HTML(self.get_permalink())
     # Raw string: \d and \w must reach the regex engine untouched. In a
     # plain string literal they are invalid escape sequences.
     _, day, month, year = re.search(
         r'<span id="mw-revision-date">(.*?), (\d{1,2}) (\w+) (\d\d\d\d)</span>',
         versioned_HTML_u).groups()
     # The month name is looked up by its lower-cased first three letters.
     month = fe.MONTH2DIGIT[month[0:3].lower()]
     return '%d%02d%02d' % (int(year), int(month), int(day))