def getTotalPage(url):
    """Return the total page count announced by the page at ``url``.

    The page is fetched with ``getHtml``, decoded as UTF-8 and scanned for a
    ``totalPage = '<digits>'`` assignment (one to five digits). The first
    occurrence is converted to ``int`` and returned.

    Example ``url``:
    "http://www.datashanghai.gov.cn/query!queryProduct.action?currentPage=1"

    Raises:
        IndexError: if the page contains no such assignment.
    """
    page = getHtml(url).decode('utf-8')
    total_page_pattern = re.compile(r"totalPage = '(\d{1,5})'")
    matches = total_page_pattern.findall(page)
    return int(matches[0])
def getPageLinked(url):
    """Extract the entries listed on the page at ``url``.

    Only the part of the document between the first ``class="list"`` and the
    first ``id="pageSpan"`` is searched. ``str.find`` returns -1 when a marker
    is absent, and that value is used unchanged as the slice bound.

    Returns:
        A list of 4-tuples, one per regex match, holding the captured groups:
        the text after ``query!`` in the href, the title attribute, the last
        repetition of the twelve-fold filler group, and the text inside
        ``<strong>``.
    """
    page = getHtml(url).decode('utf-8')
    listing = page[page.find('class="list"'):page.find('id="pageSpan"')]
    entry_pattern = re.compile(
        r'<a href="query!(.*)" title="(.*)" target=".*">(\s*.*){12}<strong class=".*">(.*)</strong></dt>'
    )
    return entry_pattern.findall(listing)
def getPageLinked(url):
    """Collect the ``query!...`` link targets listed on the page at ``url``.

    Only the part of the document between the first ``class="list"`` and the
    first ``id="pageSpan"`` is searched. ``str.find`` returns -1 when a marker
    is absent, and that value is used unchanged as the slice bound.

    Returns:
        A list of strings: for every match, the text captured between
        ``<a href="query!`` and ``" title=``.
    """
    page = getHtml(url).decode('utf-8')
    first = page.find('class="list"')
    last = page.find('id="pageSpan"')
    link_pattern = re.compile(r'<a href="query!(.*)" title=')
    return link_pattern.findall(page[first:last])
def getSHPageLinked(url):
    """Extract the entries listed on the page at ``url``.

    The document is fetched with ``getHtml`` and decoded as UTF-8; matching is
    limited to the text between the first ``class="list"`` and the first
    ``id="pageSpan"``. A missing marker yields -1 from ``str.find`` and is
    used as-is for slicing.

    Returns:
        A list of 4-tuples of captured groups per match: href remainder after
        ``query!``, title attribute, last repetition of the twelve-fold filler
        group, and the ``<strong>`` text.
    """
    document = getHtml(url).decode('utf-8')
    lower = document.find('class="list"')
    upper = document.find('id="pageSpan"')
    pattern = r'<a href="query!(.*)" title="(.*)" target=".*">(\s*.*){12}<strong class=".*">(.*)</strong></dt>'
    return re.findall(pattern, document[lower:upper])
def getFromSubject(url):
    """Return the raw HTML fragment of the subject list on the page at ``url``.

    The page is fetched with ``getHtml`` and decoded as UTF-8. Starting from
    the first occurrence of ``ess_ctr473_contentpane``, the fragment runs from
    the next ``<ul`` up to (not including) the next ``</div>``.

    ``str.find`` returns -1 for a missing marker; that value is passed on
    unchanged as the search offset / slice bound.

    Returns:
        The sliced HTML text (a ``str``), not parsed any further.
    """
    page = getHtml(url).decode('utf-8')
    anchor = page.find('ess_ctr473_contentpane')
    start = page.find('<ul', anchor)
    end = page.find('</div>', anchor)
    # The previous version also ran an empty regex over this fragment and
    # threw the matches away; that dead work has been removed.
    return page[start:end]
def getFromGov(url):
    """Extract the organisation links from the page at ``url``.

    Starting from the first occurrence of
    ``ess_ctr506_OrganizationsListTree_divDataOrg``, the text between the next
    ``<ul>`` and the next ``</ul>`` is searched for anchor tags.

    Returns:
        A list of 4-tuples, one per match, with the four captured groups of
        the pattern (href remainder, link text, and the two nested digit
        groups).
    """
    page = getHtml(url).decode('utf-8')
    anchor = page.find('ess_ctr506_OrganizationsListTree_divDataOrg')
    tree_html = page[page.find('<ul>', anchor):page.find('</ul>', anchor)]
    # NOTE(review): the two dots after href=" are unescaped (match any two
    # characters) and the doubled parentheses are nested groups, not literal
    # "(" / ")" — confirm this is what the markup requires.
    org_pattern = re.compile(r'<a href="../(.*)">(.*)((\d*))</a>')
    return org_pattern.findall(tree_html)
def getSHInfo(pageUrl, dataType, title):
    """Build one record from the detail page at ``url + pageUrl``.

    ``url`` is not a parameter of this function; it is looked up in the
    enclosing (module) scope and must be defined there.

    The page is fetched with ``getHtml`` and decoded as UTF-8. Every
    ``<td>...</td>`` cell found before the first ``</table>`` is collected
    (``str.find`` returns -1 when ``</table>`` is missing, and that value is
    used unchanged as the slice bound).

    Args:
        pageUrl: path appended to the outer ``url``.
        dataType: value stored at index 1 of the record.
        title: value stored at index 0 of the record.

    Returns:
        ``[title, dataType, cell_0, cell_1, ...]`` with every ASCII space
        removed from each string element.
    """
    page = getHtml(url + pageUrl).decode('utf-8')
    table_end = page.find('</table>')
    cells = re.findall(r'<td>(.*)\r*\s*</td>', page[:table_end])
    record = [title, dataType] + cells
    # Previously the spaces were removed by str()-ing the whole list,
    # replacing ' ' and eval()-ing the result. eval on text derived from a
    # scraped page is unnecessary and fragile, so each string is cleaned
    # directly instead; non-string values are passed through untouched.
    return [
        item.replace(' ', '') if isinstance(item, str) else item
        for item in record
    ]
def getPageLinked(url, jg):
    """List the ``hylName`` links found on the page at ``url``.

    Args:
        url: address passed to ``getHtml``.
        jg: value appended unchanged as the last element of every row.

    Returns:
        A list of ``[link_text, trailing_text, jg]`` lists, one per regex
        match, where ``trailing_text`` is the third captured group with all
        ASCII spaces removed.
    """
    page = getHtml(url).decode('utf-8')
    pattern = r'<a id=".*" class="hylName" href=".*">(.*)</a>(.*\s*){2}(.*)\r'
    return [
        [groups[0], groups[2].replace(' ', ''), jg]
        for groups in re.findall(pattern, page)
    ]
def getInfo(pageUrl, dataType, title):
    """Build one record from the detail page at ``url + pageUrl``.

    ``url`` is not a parameter of this function; it is looked up in the
    enclosing (module) scope and must be defined there.

    The page is fetched with ``getHtml`` and decoded as UTF-8. Every
    ``<td>...</td>`` cell found before the first ``</table>`` is collected
    (``str.find`` returns -1 when ``</table>`` is missing, and that value is
    used unchanged as the slice bound).

    Args:
        pageUrl: path appended to the outer ``url``.
        dataType: value stored at index 1 of the record.
        title: value stored at index 0 of the record.

    Returns:
        ``[title, dataType, cell_0, cell_1, ...]`` with every ASCII space
        removed from each string element.
    """
    page = getHtml(url + pageUrl).decode('utf-8')
    table_end = page.find('</table>')
    cells = re.findall(r'<td>(.*)\r*\s*</td>', page[:table_end])
    record = [title, dataType] + cells
    # Previously the spaces were removed by str()-ing the whole list,
    # replacing ' ' and eval()-ing the result. eval on text derived from a
    # scraped page is unnecessary and fragile, so each string is cleaned
    # directly instead; non-string values are passed through untouched.
    return [
        item.replace(' ', '') if isinstance(item, str) else item
        for item in record
    ]
def getResourceId(url):
    """Extract organisation and resource link ids from the page at ``url``.

    Two sections of the page are located by fixed marker strings; for each,
    the text from the next ``<div`` up to the next ``</div>`` after the marker
    is searched for ``list-group-item`` anchors.

    Returns:
        A pair ``(orgResult, resResult)``. Each is a list of 2-tuples
        ``(id_attribute, link_text)`` where ``link_text`` is limited by the
        pattern to characters in the range U+4E00..U+9FA5.
    """
    page = getHtml(url).decode('utf-8')

    def section_after(marker):
        # Slice from the first "<div" to the first "</div>" found at or after
        # the marker; -1 from str.find is passed through unchanged.
        anchor = page.find(marker)
        return page[page.find("<div", anchor):page.find("</div>", anchor)]

    resource_html = section_after('25bde262-31b4-4901-8d53-527631005f6a')
    organization_html = section_after('60d79024-a7f3-4c73-8b78-b7153fa1f1aa')

    item_pattern = re.compile(
        r'<a href=".*" id="(.*)" class="list-group-item text-center" title=".*">([\u4e00-\u9fa5]*)</a>'
    )
    return item_pattern.findall(organization_html), item_pattern.findall(resource_html)
def getWHInfo(pageUrl):
    """Return the word-character contents of every table cell on a page.

    The page at ``url + pageUrl`` is fetched with ``getHtml`` and decoded as
    UTF-8. ``url`` is not a parameter; it is resolved from the enclosing
    (module) scope.

    Returns:
        A list of strings, one per ``<td>`` cell matched, holding the run of
        word characters captured inside the cell (possibly empty).
    """
    page = getHtml(url + pageUrl).decode('utf-8')
    cell_pattern = re.compile(r'<td>\s*(\w*)\r*\s*</td>')
    return cell_pattern.findall(page)
def getWHAllPageLinked(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The response from ``getHtml`` is decoded as UTF-8 and handed to
    ``json.loads``; whatever object that produces is returned unchanged.
    """
    payload = getHtml(url).decode('utf-8')
    return json.loads(payload)
def getInfo(pageUrl):
    """Return the text of every ``<span ... class="indent">`` on a page.

    The page at ``pageUrl`` is fetched with ``getHtml`` and decoded as UTF-8.

    Returns:
        A list of strings: the captured content of each matching span.
    """
    page = getHtml(pageUrl).decode('utf-8')
    span_pattern = re.compile(r'<span id=".*" class="indent">(.*)</span>')
    return span_pattern.findall(page)
def getAllPageLinked(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The response from ``getHtml`` is decoded as UTF-8 and handed to
    ``json.loads``; whatever object that produces is returned unchanged.
    """
    return json.loads(getHtml(url).decode('utf-8'))
def getInfo(pageUrl):
    """Return the word-character contents of every table cell on a page.

    The page at ``url + pageUrl`` is fetched with ``getHtml`` and decoded as
    UTF-8. ``url`` is not a parameter; it is resolved from the enclosing
    (module) scope.

    Returns:
        A list of strings, one per ``<td>`` cell matched, holding the run of
        word characters captured inside the cell (possibly empty).
    """
    document = getHtml(url + pageUrl).decode('utf-8')
    return re.compile(r'<td>\s*(\w*)\r*\s*</td>').findall(document)
def getInfo(pageUrl):
    """Return the text of every ``<span ... class="indent">`` on a page.

    The page at ``pageUrl`` is fetched with ``getHtml`` and decoded as UTF-8.

    Returns:
        A list of strings: the captured content of each matching span.
    """
    document = getHtml(pageUrl).decode('utf-8')
    indent_span = r'<span id=".*" class="indent">(.*)</span>'
    return re.compile(indent_span).findall(document)