def parse(address, learnmode=False): pages = Set([address]) addr_site = re.findall('^.+/',address)[0] addr_topic = re.findall('[^/]*$',address)[0] site = get_site(address) with open("wedt/forumstrings/"+site, 'r') as f: strings = json.loads(f.read()) topic = Topic() scores = [] classes = [] p = 0 while p < len(pages): page = get_page(sorted(pages)[p]) soup = BeautifulSoup(page) # find links to other pages if strings.get('pagination',''): # for forums without pagination for pagination in soup(True, attrs={"class": strings['pagination']}): for link in pagination("a", href=re.compile(re.escape(addr_topic))): t = re.search('[^/]*('+strings['page']+')[0-9]+', link['href']) if t: pages.add(addr_site+t.group(0)) # extract posts container if possible postlist = (soup.find("div", id=strings['postlist']) if strings.get('postlist','') else soup) author = [] title = [] text = [] source = [] link = [] for postbody in postlist(True, attrs={"class": strings['postbody']}): txt = '\n'.join(postbody.stripped_strings) if txt: text.append(txt) source.append(''.join(map(unicode,postbody.contents))) if strings.get('username',''): for username in postlist(True, attrs={"class": strings['username']}): author.append(unicode(username.string)) if strings.get('optitle',""): optitle = postlist.find(True, id=strings['optitle']) title.append(unicode(optitle.string)) if strings.get('posttitle',""): for posttitle in postlist(True, attrs={"class": strings['posttitle']}): title.append(unicode(posttitle.string)) if strings.get('postlink',""): for postlink in postlist('a', attrs={"class": strings['postlink']}): link.append(addr_site+re.search('[^/]*$', postlink['href']).group(0)) if learnmode: for postscore in postlist(True, attrs={"class": strings['postscore']}): scores.append(unicode(postscore.span.string)) classes.append("acc" if strings['postaccepted'] in postscore.get_text() else "nope") if len(text)-len(link)==1: #OP isn't permalinked link = [address] + link for (u,t,b,l,s) in izip_longest(author, title, text, link, source, fillvalue=""): topic.append(Post(u,t,b,l,s)) # go to the next page p=p+1 return (topic, scores, classes) if learnmode else topic