def parse_subpods(self, pod):
    """Build one display string per ``subpod`` element found beneath *pod*.

    Args:
        pod: Element-like object; must support ``findall('.//subpod')``.

    Returns:
        A list of strings, one per subpod that produced output, or ``None``
        when there are no subpods or none of them produced output.

    Side effects:
        When a subpod has no plaintext and ``self.fetch_pics`` is truthy, the
        subpod's ``img`` children are appended to ``self.last_pics``.
    """
    subpod_nodes = pod.findall('.//subpod')
    if not subpod_nodes:
        return None

    rendered = []
    for node in subpod_nodes:
        title = node.get('title', '')
        # NOTE(review): `content` is defined elsewhere; presumably it returns
        # the text of the <plaintext> element (falsy when absent) -- confirm.
        text = content(node.find('plaintext'))
        if text:
            # The title line is only emitted when the subpod actually has one.
            header = (self.fore.subpod(title) + '\n') if title else ''
            rendered.append(soupparser.unescape((header + text).strip()))
        elif self.fetch_pics:
            # No plaintext: remember the images and emit a placeholder that
            # carries the running picture count.
            self.last_pics += node.findall('img')
            rendered.append(
                '(Type :p ' + str(len(self.last_pics)) + ' to see picture)'
            )

    return rendered if rendered else None
def get_commit(db, msg):
    """Return the id of the first entry in *db* whose message starts with *msg*.

    Each entry is indexed as ``entry[0]`` (the value returned) and
    ``entry[1]`` (the text matched against). An entry matches when its text
    starts with *msg* as given, or with ``unescape(msg)``.

    Args:
        db: Iterable of indexable entries.
        msg: Message prefix to look for.

    Returns:
        ``entry[0]`` of the first matching entry, or ``None`` if none match.
    """
    for entry in db:
        subject = entry[1]
        # Short-circuit keeps the raw prefix check first; the unescaped form
        # is only computed when the raw form did not match.
        if subject.startswith(msg) or subject.startswith(unescape(msg)):
            return entry[0]
    return None
def _get_home_page(self):
    """Fetch the jwxt ``/jsxsd`` page and return its text.

    Two GET requests are issued through ``self.s``. The response to the first
    (the ``xsxk_index`` URL) is discarded; the body of the second is decoded
    as UTF-8, passed through ``unescape`` and returned.
    """
    # NOTE(review): this response is thrown away -- presumably the request is
    # only needed for its effect on the session; confirm before removing it.
    self.s.get(
        'http://jwxt.sustc.edu.cn/jsxsd/xsxk/xsxk_index?jx0502zbid=054B5FA7E55F44E0BB3D24DB3BC561'
    )
    response = self.s.get('http://jwxt.sustc.edu.cn/jsxsd')
    return unescape(response.content.decode('utf-8'))
def get_culture_paragraphs_page(culture, s):
    """Select one culture on the eHRAF site and load its paragraph hits.

    Args:
        culture: Mapping-like object whose ``'href'`` entry holds the
            (escaped, URL-quoted) relative path for the culture.
        s: Session-like object; ``s.get(url)`` must return a response with
            ``status_code`` and ``content`` attributes.

    Returns:
        A ``(dom, culture_code)`` tuple: the result of ``fromstring`` on the
        pre-processed results page, and the ``owc`` query value taken from
        the culture URL.

    Raises:
        AssertionError: if either GET does not answer with status 200.
        AttributeError: if the culture URL has no ``owc=...&`` parameter
            (``re.search`` returns ``None``).
    """
    # Each culture has a url, which we fetch to tell the site which culture
    # we want next.
    relative_path = urllib2.unquote(unescape(culture['href']))
    culture_url = "http://ehrafworldcultures.yale.edu/ehrafe/" + relative_path
    # Single-argument parenthesized print: same output as the print statement.
    print("GET {}".format(culture_url))
    selection_response = s.get(culture_url)
    assert selection_response.status_code == 200
    pause()

    # The pattern needs a trailing '&', so `owc` must not be the last parameter.
    culture_code = re.search("[&\?]owc=([A-Z0-9]*)&", culture_url).group(1)

    # Now actually load the results for the culture. The site already knows
    # which one we want.
    hits_url = 'http://ehrafworldcultures.yale.edu/ehrafe/pageHitsAjax.do?&howMany=99999999'
    print("GET {}".format(hits_url))
    hits_response = s.get(hits_url)
    assert hits_response.status_code == 200
    pause()

    hits_markup = hack_single_culture_result(hits_response.content)
    print("PARSE {}".format(hits_url))
    hits_dom = fromstring(hits_markup)
    print("PARSED")
    return hits_dom, culture_code
def _get_home_page(self):
    """Fetch the Sakai portal page and return its text.

    The page is requested through ``self.s``; the response body is decoded
    as UTF-8, passed through ``unescape`` and returned.
    """
    portal_url = 'http://sakai.sustc.edu.cn/portal'
    response = self.s.get(portal_url)
    return unescape(response.content.decode('utf-8'))
def _get_home_page(self):
    """Fetch ``self.site`` and return its text.

    The page is requested through ``self.s``; the response body is decoded
    as UTF-8, passed through ``unescape`` and returned.
    """
    page = self.s.get(self.site)
    decoded = page.content.decode('utf-8')
    return unescape(decoded)