def extract_pages_and_next_link(content):
    """Parse a MediaWiki listing page (the ``#mw-pages`` div).

    Returns a tuple ``(next_link, titles)``:
      - ``next_link``: href of the last <a> on the page, validated to be a
        ``/w/index.php?title=...`` "next page" navigation link;
      - ``titles``: newline-joined link texts of all ``/wiki/...`` article
        links found in the div.

    Raises Exception when a link's visible text disagrees with its
    ``title`` attribute (unexpected markup) or when the final link is not
    a "next page" link.
    """
    html = BeautifulSoup(content)
    pages = html.find('div', {'id': 'mw-pages'})
    links = pages.findAll('a')
    titles = []
    last_link = ''
    for link in links:
        text = link.getText()
        title = link.get('title')
        href = link.get('href')
        m = re.match(r'^/wiki/(.*)$', href)
        if m:
            # Sanity check: visible text must equal the title attribute,
            # otherwise the page markup is not what we expect.
            if text != title:
                raise Exception('Mistake in link: %s' % unicode(link))
            titles.append(text)
        # Track the last href seen over ALL links (not only /wiki/ ones):
        # on a well-formed listing page the final link is the "next page"
        # navigation link, which points at /w/index.php?title=...
        last_link = href
    m = re.match(r'^/w/index\.php\?title=(.*)$', last_link)
    if not m:
        raise Exception('Cannot find "next" link')
    return last_link, '\n'.join(titles)
class Parser:
    """Parses one saved dictionary-entry HTML file.

    Reads the file named by ``name`` from ``path_in`` (optionally inside a
    sub-path ``file_in_dir``) and extracts the headword (<h1>) and the
    first definition term (<dt>).  Output is destined for
    ``path_out/name``.

    NOTE(review): ``parse_file`` initialises ``self.dd`` but never fills
    it in this view — the method may continue past this chunk; confirm
    against the full file.
    """

    def __init__(self, path_in, path_out, name, file_in_dir=None, slug=None):
        self.path_in = path_in
        self.path_out = path_out
        self.name = name
        self.file_in_dir = file_in_dir
        # Flag dictionaries that need duplicate/variant handling, keyed by
        # slug.  (Exact downstream use of self.dub is outside this view.)
        self.dub = slug in ['dmitriev', 'efremova', 'dic_fwords', 'dic_synonims']
        self.html = ''
        if file_in_dir:
            self.filename = join(self.path_in, self.name, self.file_in_dir)
        else:
            self.filename = join(self.path_in, self.name)
        self.out_filename = join(self.path_out, self.name)

    def read_file(self):
        """Read the source HTML and parse it into ``self.html``."""
        # NOTE(review): file handle is never closed explicitly; relies on
        # CPython refcounting to close it.
        content = open(self.filename).read()
        self.html = BeautifulSoup(content)

    def parse_file(self):
        """Extract <h1> heading and first <dt> term from ``self.html``.

        Returns False (after printing a diagnostic line) when either
        element is missing.
        """
        self.h1 = self.dt = self.dd = ''
        try:
            self.h1 = self.html.find('h1').getText()
        except AttributeError, e:
            # find() returned None -> .getText() raised AttributeError.
            print '#', self.name, "#", "'h1' not found:", e
            return False
        # try:
        #     dl = self.html.find('dl')
        # except AttributeError, e:
        #     print '#', self.name, "#", "'dl' not found:", e
        #     return False
        # self.dt = dl.find('dt').getText()
        # self.dd = unicode(dl.find('dd'))
        try:
            self.dt = self.html.find('dt').getText()
        except AttributeError, e:
            print '##', self.name, "-", "DT not found:", e
            return False
# Top-level scraping script: walk saved "AllPages" listing files and visit
# each word link found in their mw-allpages tables.
version = "Opera/9.80 (Windows NT 6.2; WOW64) Presto/2.12.388 Version/12.16"
# Install a custom opener (presumably spoofing the UA string above —
# AppURLopener is defined elsewhere; confirm).
urllib._urlopener = AppURLopener()
files = os.listdir(path)
# start/threads shard the file list across parallel invocations: this
# process handles files[start::threads].
start = 0
threads = 1
# for filename in files[len(files) / 2 - 1:0:-1]:
# for filename in files[len(files) / 2:]:
# for filename in files[len(files)-1:0:-1]:
for filename in files[start::threads]:
    content = open(join(path, filename)).read()
    html = BeautifulSoup(content)
    table = html.find('table', {'class': 'mw-allpages-table-chunk'})
    if not table:
        print '# Failed', filename
        continue
    links = table.findAll('a')
    for link in links:
        word = link.getText()
        cls = link.get('class')
        url = link.get('href')
        if edit:
            # Fetch the edit form instead of the rendered page.
            url += '?action=edit'
        content2 = ''
        ok = False
        # Timestamp prefixed with this process's shard index for logging.
        dt = datetime.now().strftime("[%H:%M:%S]")
        dt = "(%s) %s" % (start, dt)
        # NOTE(review): loop body appears to continue past this chunk
        # (content2/ok/dt are unused here) — see the full file.
def read_file(self):
    """Read the source HTML file and parse it into ``self.html``.

    Fix: the original opened the file without closing it, leaking the
    handle until garbage collection; a context manager closes it promptly.
    """
    with open(self.filename) as f:
        content = f.read()
    self.html = BeautifulSoup(content)
def extract_wiki_text(content):
    """Return the wikitext inside the edit-form <textarea> of *content*."""
    textarea = BeautifulSoup(content).find('textarea')
    return textarea.getText()