Пример #1
0
    def parse_subpods(self, pod):
        """Build the printable output for every ``<subpod>`` found under *pod*.

        For each subpod, ``content()`` is applied to its ``plaintext`` child.
        When that yields a truthy value, the text (preceded by the styled
        subpod title and a newline when the title is non-empty) is stripped,
        passed through ``soupparser.unescape`` and collected.  Otherwise, when
        ``self.fetch_pics`` is truthy, the subpod's direct ``img`` children are
        appended to ``self.last_pics`` and a hint line carrying the new length
        of ``self.last_pics`` is collected instead.

        Returns:
            The list of collected strings, or ``None`` when nothing was
            collected (including when *pod* has no subpods at all).
        """
        rendered = []
        for node in pod.findall('.//subpod'):
            heading = node.get('title', '')
            text = content(node.find('plaintext'))

            if text:
                # The title line is only emitted alongside actual text.
                prefix = self.fore.subpod(heading) + '\n' if heading else ''
                rendered.append(soupparser.unescape((prefix + text).strip()))
                continue

            if not self.fetch_pics:
                continue

            # No text for this subpod: remember its images and tell the user
            # which picture number to ask for.
            self.last_pics += node.findall('img')
            rendered.append('(Type :p ' + str(len(self.last_pics)) +
                            ' to see picture)')

        return rendered or None
Пример #2
0
def get_commit(db, msg):
    """Return the first field of the first row whose second field starts with *msg*.

    A row matches when ``row[1]`` starts with either *msg* itself or
    ``unescape(msg)``.  Rows are examined in iteration order and the first
    match wins.

    Args:
        db: Iterable of indexable rows; ``row[0]`` is the value returned and
            ``row[1]`` is the string that is prefix-matched.
            NOTE(review): presumably (commit id, commit message) pairs --
            confirm against the caller.
        msg: Prefix to look for, tried both as given and after ``unescape``.

    Returns:
        ``row[0]`` of the first matching row, or ``None`` if no row matches.
    """
    # The unescaped form does not depend on the row, so compute it once
    # instead of once per row; str.startswith accepts a tuple of prefixes.
    prefixes = (msg, unescape(msg))
    for row in db:
        if row[1].startswith(prefixes):
            return row[0]
    return None
Пример #3
0
 def _get_home_page(self):
     """Return the jwxt landing page as text with ``unescape`` applied.

     A GET for the ``xsxk_index`` URL is issued first and its response is
     discarded; the page that is returned comes from the second GET.  The
     response body is decoded as UTF-8 before ``unescape`` is applied.
     """
     # NOTE(review): presumably this first request primes server-side
     # session state for the course-selection round -- confirm.
     self.s.get(
         'http://jwxt.sustc.edu.cn/jsxsd/xsxk/xsxk_index?jx0502zbid=054B5FA7E55F44E0BB3D24DB3BC561'
     )
     landing = self.s.get('http://jwxt.sustc.edu.cn/jsxsd')
     return unescape(landing.content.decode('utf-8'))
Пример #4
0
def get_culture_paragraphs_page(culture, s):
    """Fetch and parse the paragraph-hit listing for a single culture.

    Two GET requests are made through *s*: first the culture's own result
    URL, then the ``pageHitsAjax.do`` endpoint.  ``pause()`` is called after
    each request, and progress lines are printed to stdout.

    Args:
        culture: Object indexable by ``'href'``; that value is passed through
            ``unescape`` and ``urllib2.unquote`` and appended to the site's
            base URL.
        s: Session-like object whose ``get(url)`` returns a response exposing
            ``status_code`` and ``content``.

    Returns:
        A ``(dom, culture_code)`` tuple: the result of ``fromstring`` applied
        to the output of ``hack_single_culture_result`` on the second
        response body, and the value captured from the ``owc=`` query
        parameter of the culture URL.

    Raises:
        AssertionError: If either response's status code is not 200.
        AttributeError: If the culture URL contains no ``owc=...&`` parameter
            (``re.search`` returns ``None`` in that case).
    """
    # Each culture has a url, which we fetch to tell the site we want that
    # culture next.
    culture_path = urllib2.unquote(unescape(culture['href']))
    single_culture_result_url = "http://ehrafworldcultures.yale.edu/ehrafe/" + culture_path
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("GET {}".format(single_culture_result_url))
    prod_server_result = s.get(single_culture_result_url)
    # NOTE: assert statements are skipped when Python runs with -O.
    assert prod_server_result.status_code == 200
    pause()
    # Raw string keeps the pattern value identical while avoiding the
    # invalid "\?" string escape that newer interpreters warn about.
    culture_code = re.search(r"[&\?]owc=([A-Z0-9]*)&",
                             single_culture_result_url).groups()[0]

    # Now actually load the results for the culture. The site already knows
    # the one we want.
    load_results_url = 'http://ehrafworldcultures.yale.edu/ehrafe/pageHitsAjax.do?&howMany=99999999'
    print("GET {}".format(load_results_url))
    single_culture_result = s.get(load_results_url)
    assert single_culture_result.status_code == 200
    pause()

    single_culture_result_doc = hack_single_culture_result(single_culture_result.content)

    print("PARSE {}".format(load_results_url))
    single_culture_result_dom = fromstring(single_culture_result_doc)
    print("PARSED")
    return single_culture_result_dom, culture_code
Пример #5
0
 def _get_home_page(self):
     """Return the Sakai portal page as text with ``unescape`` applied.

     The page is fetched with ``self.s.get`` and its body is decoded as
     UTF-8 before ``unescape`` is applied.
     """
     portal = self.s.get('http://sakai.sustc.edu.cn/portal')
     return unescape(portal.content.decode('utf-8'))
Пример #6
0
 def _get_home_page(self):
     """Return the page at ``self.site`` as text with ``unescape`` applied.

     The page is fetched with ``self.s.get`` and its body is decoded as
     UTF-8 before ``unescape`` is applied.
     """
     home = self.s.get(self.site)
     return unescape(home.content.decode('utf-8'))