def getOrgNumber(urlPage, prefix, filename):
    """Collect organisation page numbers from the paginated listing at *urlPage*.

    Writes one line per number (``prefix + number + "/"``) to *filename*
    and returns the sorted list of numbers.
    """
    result = set()
    print("getting org number:")
    html = getInfo.getInfo(urlPage)
    # The "末页" (last page) link carries the highest page index.
    G = re.search(
        r'org_type=0&field_type=0&area_type=0&province_type=0&market_type=0&keywords=&org_search=&page=(\d+)">末页</a>',
        html)
    if G:
        lastPage = int(G.group(1))
    else:
        lastPage = 2  # no last-page link found: assume a single listing page
    print("last: %d" % lastPage)
    # NOTE(review): range(1, lastPage) never fetches page `lastPage` itself —
    # preserved as-is; confirm whether the final page should be included.
    for i in range(1, lastPage):
        url = urlPage + str(i)
        print("for: %d" % i)
        html = getInfo.getInfo(url)
        result = result.union(findOrg.findOrg(html))
    # was: tmp = list(result)[:]; tmp.sort(); result = tmp — sorted() does it in one step
    result = sorted(result)
    print("writing org number to %s..." % filename)
    # `with` guarantees the file is closed even if a write fails
    with open(filename, "w+", encoding="utf-8") as fp:
        for i in result:
            fp.write(prefix + str(i) + "/" + '\n')
    print("succ writing to %s!" % filename)
    return result
def getOrgNumber(urlPage, prefix, filename):
    """Collect organisation page numbers from the paginated listing at *urlPage*.

    Returns ``(numbers, pages_fetched)`` where *numbers* is sorted.
    *prefix* and *filename* are kept for interface compatibility (the
    file-writing step is currently disabled).
    """
    result = set()
    print("getting org number:")
    html = getInfo.getInfo(urlPage)
    # The "末页" (last page) link carries the highest page index.
    G = re.search(r'page=(\d+)">末页</a>', html)
    if G:
        lastPage = int(G.group(1))
    else:
        lastPage = 2  # no last-page link found: assume a single listing page
    print("lastPage: %d" % lastPage)
    for i in range(1, lastPage):
        url = urlPage + str(i)
        print("for: %d" % i)
        html = getInfo.getInfo(url)
        print("html get suc!")
        result = result.union(findOrg.findOrg(html))
    # was: tmp = list(result)[:]; tmp.sort(); result = tmp — sorted() does it in one step
    result = sorted(result)
    return result, lastPage - 1
def readFile(nameOfFile, helper, command):
    """Read one IP per line from *nameOfFile* and run getInfo on each.

    *helper* is closed once every IP has been processed.
    """
    ip_list = []  # all IPs found in the target file (was shadowing builtin `list`)
    with open(nameOfFile, 'r') as f:  # `with` closes the file even on error
        for line in f:
            ip_list.append(getip(line)[0])
    for ip in ip_list:
        getInfo(ip, helper, command)
    helper.close()
def submit():
    """Handle the submit button: fetch coin stats/info and update the UI.

    On any lookup failure the error label is shown instead.
    """
    coin = coinEntry.get()
    coinParts = coin.split()
    if len(coinParts) > 1:
        # Title-case every word of a multi-word coin name
        Coin = ' '.join(part.capitalize() for part in coinParts)
    else:
        Coin = coin.capitalize()
    try:
        statsDict, abrv = gi.getStats(coin)
        infoDict = gi.getInfo(coin)
        infoTitle.config(text=f'{Coin} ({abrv})')
        title.config(text=f'{Coin} ({abrv})')
        fillStats(statsDict)
        fillInfo(infoDict)
        # makes it so if the error message was displayed, it will display
        # again without having to run the code over again
        errorLabel.config(fg=bg1)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        errorLabel.config(fg=down)
        errorLabel.config(
            text=
            f'\'{coin}\' is not a coin. Please check spelling and try again')
        errorLabel.place(relx=0.18, rely=0.96)
    # assumes the entry should be cleared on success and failure alike — TODO confirm
    coinEntry.delete(0, 50)
def findHire(url, ngo, numHire):
    """Scrape job postings from *url* and append them to ``ngo.hire``.

    *numHire* is accepted for interface compatibility; it is not read here.
    Returns *ngo*.
    """
    html = getInfo.getInfo(url)
    loc = 0
    # `loc` never becomes -1; the loop exits via the `break` when no more rows match
    while loc != -1:
        perhire = perHire()
        m = re.search(
            r'<td><a href="(.*)" title="(.+)" target= "_blank" class="it1">(.+)</a></td>',
            html[loc:])
        if m is None:  # was `m == None` — identity test is the Python idiom
            break
        perhire.post = m.group(3)
        loc = loc + m.end()
        # The next three <td> cells are unit, worksite and update time, in order
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.unit = m.group(1)
        loc = loc + m.end()
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.worksite = m.group(1)
        loc = loc + m.end()
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.updateTime = m.group(1)
        loc = loc + m.end()
        ngo.hire.append(perhire)
    return ngo
def findLoc(url):
    """Return the location string embedded in the page's `var content` JS."""
    page = getInfo(url)
    found = re.search(r"var content = '(.*)'", page)
    return found.group(1).strip()
def searchMachine(keyword):
    """Query the hall machine-search API for *keyword* and build a result DataFrame.

    Returns None when the API reports no matching machines.
    """
    print(keyword)
    url = 'https://nifmbapi.maruhan.co.jp/api/v1.4/hall/machine/search'
    query = {'hall_code': 1061, 'key_word': keyword, 'ps_type': 'S'}
    # build the request URL with an encoded query string
    url_with_query = "{}?{}".format(url, urllib.parse.urlencode(query))
    response = urllib.request.urlopen(url_with_query)
    content = json.loads(response.read().decode('utf8'))
    if not (content["child_halls"]):
        # no matching machines
        return
    # Name the first model / first machine once instead of repeating deep subscripts
    model = content["child_halls"][0]["models"][0]
    machine = model["groups"][0]["machines"][0]
    model_name = model["model_name"]
    machine_number = machine["machine_number"]
    slump = getSlump.getSlump(machine["machine_id"])
    allStarts, bb, rb = getInfo.getInfo(machine["machine_id"])
    return createDF(model_name, machine_number, slump, allStarts, bb, rb)
def findHire(url, ngo, numHire):
    """Scrape job postings from *url* and append them to ``ngo.hire``.

    *numHire* is accepted for interface compatibility; it is not read here.
    Returns *ngo*.
    """
    html = getInfo.getInfo(url)
    loc = 0
    # `loc` never becomes -1; the loop exits via the `break` when no more rows match
    while loc != -1:
        perhire = perHire()
        m = re.search(
            r'<td><a href="(.*)" title="(.+)" target= "_blank" class="it1">(.+)</a></td>',
            html[loc:])
        if m is None:  # was `m == None` — identity test is the Python idiom
            break
        perhire.post = m.group(3)
        loc = loc + m.end()
        # The next three <td> cells are unit, worksite and update time, in order
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.unit = m.group(1)
        loc = loc + m.end()
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.worksite = m.group(1)
        loc = loc + m.end()
        m = re.search(r'<td>(.*)</td>', html[loc:])
        perhire.updateTime = m.group(1)
        loc = loc + m.end()
        ngo.hire.append(perhire)
    return ngo
def thread_download(num):
    """Download info for movie_names[num], num+8, num+16, … until an empty result.

    Accumulates results in the module-level ``movie_info`` dict under the
    starting index, and returns that list. An empty string from getInfo, or
    any exception, ends the run.
    """
    temp_list = list()
    temp = num  # remember the starting slot; `num` advances by the worker stride
    while True:
        try:
            check = getInfo.getInfo(movie_names[num])
            temp_list.append(check)
            if check == "":
                movie_info[temp] = temp_list
                return movie_info[temp]
            num = num + 8  # stride of 8: each worker thread handles every 8th title
        except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            movie_info[temp] = temp_list
            return movie_info[temp]
def getOrgInfo(url):
    """Scrape one organisation page into an NGO object.

    Reads the name, office address, contact info, founding year and staff
    count from the page's HTML. Returns the populated NGO.
    """
    ngo = NGO()
    url = url.strip()
    html = getInfo.getInfo(url)
    soup = BeautifulSoup(html)
    # Organisation name sits in the <h1> inside the #org-header element
    ngo.name = soup.find(id="org-header").find("h1").get_text()
    print(ngo.name)
    # "办公地址" = office address; the value is the following <p>
    location = soup.find("h3", text="办公地址")
    ngo.location = location.findNextSibling("p").get_text()
    print("location: %s" % ngo.location)
    # "联系方式" = contact info; collect every following <p>, stripped of whitespace
    connInfoList = soup.find("h3", text="联系方式").find_next_siblings("p")
    tem = list()
    for x in connInfoList:
        s = x.get_text().replace(" ", "").replace("\n", "")
        # "访问机构网站" = "visit org website": substitute the actual link href
        if s == "访问机构网站":
            s = "网站: " + x.a['href']
        tem.append(s)
    ngo.connectionInfo = tem
    # "成立时间" = founding time; pull the 4-digit year out of the parent <p>
    tem = soup.find("span", text="成立时间: ")
    if tem:
        tem = tem.find_parent("p").get_text()
        Ga = re.search("(\d*)年", tem)
        if Ga:
            if Ga.group(1):
                ngo.esTime = int(Ga.group(1))  # founding year
                print("year: %d" % ngo.esTime)
    # "全职人数" = full-time staff count, matched against the raw HTML
    Ga = re.search("<span>全职人数: </span>(\d*)", html)
    if Ga:
        if Ga.group(1):
            ngo.scale = int(Ga.group(1))  # staff count
            print("scale: %d" % ngo.scale)
    return ngo
def getOrgInfo(url):
    """Scrape an organisation profile page into an NGO object.

    Parses the free-text description to extract sponsors, partners, address,
    person in charge, staff count and founding time. Returns the NGO.
    NOTE(review): indentation reconstructed from collapsed source — the
    nesting of some statements is inferred; verify against the original file.
    """
    ngo = NGO()
    ngo.url = url
    url = url.strip()
    html = getInfo.getInfo(url)
    soup = BeautifulSoup(html)
    if soup.find("h3"):
        title = soup.find("h3")
        if title.get_text() != "":
            # first line of the <h3> text is the organisation name
            ngo.name = title.get_text().splitlines()[0]
            print(ngo.name)
        # The description lives in the <p> siblings following the title
        content = title.find_next_siblings("p")
        s = ""
        encontent = soup.find("p", class_="sch_con2 l")
        if content != []:
            per = content[0]
            for w in per.find_all("p"):
                s += replace_with_newlines(w) + "\n"
            if per.find_all("div"):
                totalDiv = per.find_all("div")
                for w in totalDiv:
                    s += w.get_text() + "\n"
        if encontent != None and encontent.find("b"):
            # the bold element of the "sch_con2 l" paragraph holds the English name
            enb = encontent.find("b")
            ngo.enName = enb.get_text()
        ngo.description = s
    # Normalise whitespace and split the description into lines for section scanning
    lines = ngo.description.replace("\t", "").replace("\xa0", " ").splitlines()
    # If the "name" starts with a Latin letter, the Chinese/English names were swapped
    if ngo.name != "" and re.search('[a-zA-Z]', ngo.name[0]):
        ngo.name, ngo.enName = ngo.enName, ngo.name
    # Sponsors: lines between a heading ending in "资助者" (sponsors) and the
    # next blank line that precedes a blank or a "合作" (cooperation) heading.
    ngo.sponsors = []
    for i in range(0, len(lines)):
        Ga = re.search(".*资助者$", lines[i])
        if Ga:
            for j in range(i + 1, len(lines)):
                if lines[j] == "" and j < len(lines) - 1 and (
                        lines[j + 1] == "" or re.search("(.*)合作", lines[j + 1])):
                    for k in range(i + 1, j):
                        if lines[k] != "":
                            # NOTE(review): "(.*)" matches the whole line, so the
                            # replace below empties it before splitting — looks
                            # suspicious; confirm the intended pattern.
                            Ga = re.search("(.*)", lines[k])
                            if Ga:
                                lines[k] = lines[k].replace(
                                    Ga.group(0), "")
                            # Split on whichever delimiter the line uses
                            if "、" in lines[k]:
                                words = lines[k].split("、")
                            elif ";" in lines[k]:
                                words = lines[k].split(";")
                            elif "," in lines[k]:
                                words = lines[k].split(",")
                            elif "," in lines[k] or "," in lines[k]:
                                words = lines[k].split(",")
                            else:
                                words = lines[k].split(" ")
                            for q in words:
                                if q:
                                    ngo.sponsors.append(q)
                    break  # sponsor section consumed
            break  # sponsor heading handled; stop scanning
    # Partners: same scheme, between "合作伙伴" (partners) and "独特性" (uniqueness)
    ngo.partners = []
    for i in range(0, len(lines)):
        Ga = re.search(".*合作伙伴$", lines[i])
        if Ga:
            for j in range(i + 1, len(lines)):
                if lines[j] == "" and j < len(lines) - 1 and (
                        lines[j + 1] == "" or re.search("(.*)独特性", lines[j + 1])):
                    for k in range(i + 1, j):
                        if lines[k] != "":
                            Ga = re.search("(.*)", lines[k])
                            if Ga:
                                lines[k] = lines[k].replace(
                                    Ga.group(0), "\n")
                            if "、" in lines[k]:
                                words = lines[k].split("、")
                            elif ";" in lines[k]:
                                words = lines[k].split(";")
                            elif "," in lines[k]:
                                words = lines[k].split(",")
                            elif "," in lines[k] or "," in lines[k]:
                                words = lines[k].split(",")
                            else:
                                words = lines[k].split(" ")
                            for q in words:
                                if q:
                                    ngo.partners.append(q)
                    break  # partner section consumed
            break  # partner heading handled; stop scanning
    # "地址" = address: prefer the text on the same line, else the next line
    Ga = re.search("(.*)地址[:|:](.*)\n(.*)", ngo.description)
    if Ga:
        if Ga.group(2):
            ngo.location = Ga.group(2)
        else:
            ngo.location = Ga.group(3)
        if " " in ngo.location:
            ngo.location = ngo.location.split(" ")[0]
    # "负责人" = person in charge
    Ga = re.search("负责人[:|:](.*)", ngo.description)
    if Ga:
        ngo.personCharge = Ga.group(1)
    # "员工人数" = staff count
    Ga = re.search("员工人数[:|:](.*)", ngo.description)
    if Ga:
        ngo.scale = Ga.group(1)
    # "成立时间" = founding time
    Ga = re.search("成立时间[:|:](.*)", ngo.description)
    if Ga:
        ngo.esTime = Ga.group(1)
    print(ngo.name, ngo.location, ngo.personCharge, ngo.scale, ngo.esTime,
          ngo.partners, ngo.sponsors)
    return ngo
import getInfo
import os
import time
import re

# Marker strings delimiting the name block on a chinadevelopmentbrief.org.cn
# organisation page: the container div, the <h1> (Chinese name) and the
# <font> (English name).
s1 = r'<div class="ml_name mt15">'
s2_1 = r'<h1>'
s2_2 = r'</h1>'
s3_1 = r'<font>'
s3_2 = r'</font>'
name = enName = esTime = location = area = scale = description = info = recruit = image = None
# NOTE(review): `fp` is opened but not used in this visible chunk — presumably
# read further down in the original file; confirm before removing.
fp = open("Pages.txt", "r", encoding="utf-8")
html = getInfo.getInfo('http://www.chinadevelopmentbrief.org.cn/org33/')
loc = html.find(s1)
if loc != -1:
    # Walk forward through the marker pairs; each find() starts after the
    # previous marker so the slices line up.
    x = html.find(s2_1, loc + len(s1))
    if x == -1: pass  # NOTE(review): dead guard — a miss is not actually handled
    y = html.find(s2_2, x + len(s2_1))
    if y == -1: pass  # NOTE(review): dead guard
    name = html[x + len(s2_1):y]
    print(name)
    m = html.find(s3_1, y + len(s2_2))
    if m == -1: pass  # NOTE(review): dead guard
    n = html.find(s3_2, m + len(s3_1))
    if (n == -1): pass  # NOTE(review): dead guard
    enName = html[m + len(s3_1):n]
    print(enName)
    # "成立时间" = founding time; capture the year digits
    Gr = re.search(r'<li><font>成立时间:</font>(\d+)年</li>', html[n:])
# NOTE(review): Python 2 source (print statements, `file` builtin).
# The leading loop appears to be the tail of a `get_uid(filename, uid_list)`
# definition whose header lies outside this view — indentation reconstructed.
fread = file(filename)
for line in fread:
    uid_list.append(line.strip())


def writefile(filename, content):
    # Append *content* to *filename*.
    fw = file(filename, 'a')
    fw.write(content)
    fw.close()


if __name__ == '__main__':
    # Credentials are intentionally blank here; fill in before running.
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    if (WBLogin.login(username, pwd) == 'servertime_error'):
        print 'login failed. check out your network.'
        sys.exit()
    uid_list = []
    get_uid('C:/Result1.txt', uid_list)
    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)
    for uid in uid_list:
        try:
            InfoPage = getInfo.getInfo()
            InfoPage.get_info(uid)
        except Exception as e:
            # on any failure, record the uid to id.txt for later retry
            writefile('C:/id.txt', str(uid) + '\n')
from sendToLocations import sendToLocations from getNgrokUrl import getNgrokUrl from getInfo import getInfo from changeWebhookUrl import changeWebhookUrl import time previousWarningUrl = input() start = time.time() ngrokUrl = '' # Check for new warnings, ngrok url for each time looping through while True: try: message, locations = getInfo(previousWarningUrl) except IndexError: time.sleep(10) continue if message != '': responses = sendToLocations(locations, message) for response in responses: if response['status'] == 'success': print(response) else: continue else: pass url = getNgrokUrl() print(url) if url != ngrokUrl:
from getInfo import getInfo
from wInExcel import openxl
from geturl import getUrl
from getpage import getPage

# Crawl every blog URL and write the collected rows to an Excel workbook.
pages = getPage()  # was `list = getPage()` — don't shadow the builtin
urls = getUrl(pages)
rows = []  # one list per URL is returned by getInfo; flatten them all
for url in urls:
    print(url)
    rows.extend(getInfo(url))
openxl(rows)
def findImage(url):
    """Return the src of the organisation logo image on the page."""
    page = getInfo.getInfo(url)
    hit = re.search(
        r'<div class="ml_logo"><img width="230px" height="115px" src="(.*)" /></div>',
        page)
    return hit.group(1)
def getOrgInfo(url):
    """Scrape an organisation profile page into an NGO object.

    Parses the free-text description to extract sponsors, partners, address,
    person in charge, staff count and founding time. Returns the NGO.
    NOTE(review): indentation reconstructed from collapsed source — the
    nesting of some statements is inferred; verify against the original file.
    """
    ngo = NGO()
    ngo.url = url
    url = url.strip()
    html = getInfo.getInfo(url)
    soup = BeautifulSoup(html)
    if soup.find("h3"):
        title = soup.find("h3")
        if title.get_text() != "":
            # first line of the <h3> text is the organisation name
            ngo.name = title.get_text().splitlines()[0]
            print(ngo.name)
        # The description lives in the <p> siblings following the title
        content = title.find_next_siblings("p")
        s = ""
        encontent = soup.find("p", class_="sch_con2 l")
        if content != []:
            per = content[0]
            for w in per.find_all("p"):
                s += replace_with_newlines(w) + "\n"
            if per.find_all("div"):
                totalDiv = per.find_all("div")
                for w in totalDiv:
                    s += w.get_text() + "\n"
        if encontent != None and encontent.find("b"):
            # the bold element of the "sch_con2 l" paragraph holds the English name
            enb = encontent.find("b")
            ngo.enName = enb.get_text()
        ngo.description = s
    # Normalise whitespace and split the description into lines for section scanning
    lines = ngo.description.replace("\t", "").replace("\xa0", " ").splitlines()
    # If the "name" starts with a Latin letter, the Chinese/English names were swapped
    if ngo.name != "" and re.search('[a-zA-Z]', ngo.name[0]):
        ngo.name, ngo.enName = ngo.enName, ngo.name
    # Sponsors: lines between a heading ending in "资助者" (sponsors) and the
    # next blank line that precedes a blank or a "合作" (cooperation) heading.
    ngo.sponsors = []
    for i in range(0, len(lines)):
        Ga = re.search(".*资助者$", lines[i])
        if Ga:
            for j in range(i + 1, len(lines)):
                if lines[j] == "" and j < len(lines) - 1 and (
                        lines[j + 1] == "" or re.search("(.*)合作", lines[j + 1])):
                    for k in range(i + 1, j):
                        if lines[k] != "":
                            # NOTE(review): "(.*)" matches the whole line, so the
                            # replace below empties it before splitting — looks
                            # suspicious; confirm the intended pattern.
                            Ga = re.search("(.*)", lines[k])
                            if Ga:
                                lines[k] = lines[k].replace(Ga.group(0), "")
                            # Split on whichever delimiter the line uses
                            if "、" in lines[k]:
                                words = lines[k].split("、")
                            elif ";" in lines[k]:
                                words = lines[k].split(";")
                            elif "," in lines[k]:
                                words = lines[k].split(",")
                            elif "," in lines[k] or "," in lines[k]:
                                words = lines[k].split(",")
                            else:
                                words = lines[k].split(" ")
                            for q in words:
                                if q:
                                    ngo.sponsors.append(q)
                    break  # sponsor section consumed
            break  # sponsor heading handled; stop scanning
    # Partners: same scheme, between "合作伙伴" (partners) and "独特性" (uniqueness)
    ngo.partners = []
    for i in range(0, len(lines)):
        Ga = re.search(".*合作伙伴$", lines[i])
        if Ga:
            for j in range(i + 1, len(lines)):
                if lines[j] == "" and j < len(lines) - 1 and (
                        lines[j + 1] == "" or re.search("(.*)独特性", lines[j + 1])):
                    for k in range(i + 1, j):
                        if lines[k] != "":
                            Ga = re.search("(.*)", lines[k])
                            if Ga:
                                lines[k] = lines[k].replace(Ga.group(0), "\n")
                            if "、" in lines[k]:
                                words = lines[k].split("、")
                            elif ";" in lines[k]:
                                words = lines[k].split(";")
                            elif "," in lines[k]:
                                words = lines[k].split(",")
                            elif "," in lines[k] or "," in lines[k]:
                                words = lines[k].split(",")
                            else:
                                words = lines[k].split(" ")
                            for q in words:
                                if q:
                                    ngo.partners.append(q)
                    break  # partner section consumed
            break  # partner heading handled; stop scanning
    # "地址" = address: prefer the text on the same line, else the next line
    Ga = re.search("(.*)地址[:|:](.*)\n(.*)", ngo.description)
    if Ga:
        if Ga.group(2):
            ngo.location = Ga.group(2)
        else:
            ngo.location = Ga.group(3)
        if " " in ngo.location:
            ngo.location = ngo.location.split(" ")[0]
    # "负责人" = person in charge
    Ga = re.search("负责人[:|:](.*)", ngo.description)
    if Ga:
        ngo.personCharge = Ga.group(1)
    # "员工人数" = staff count
    Ga = re.search("员工人数[:|:](.*)", ngo.description)
    if Ga:
        ngo.scale = Ga.group(1)
    # "成立时间" = founding time
    Ga = re.search("成立时间[:|:](.*)", ngo.description)
    if Ga:
        ngo.esTime = Ga.group(1)
    print(ngo.name, ngo.location, ngo.personCharge, ngo.scale, ngo.esTime,
          ngo.partners, ngo.sponsors)
    return ngo
from getOrgInfo import getOrgInfo
import time
import pickle

# Crawl every listing page of ngo20map.com, scrape each organisation page,
# and pickle the resulting NGO list.
# NOTE(review): this chunk also uses `re`, `BeautifulSoup` and `getInfo`,
# whose imports are presumably elsewhere in the original file.
preSite = "http://www.ngo20map.com/Index/list_index?&p="
lastPage = 84
pat = re.compile('<a href="(.*)" target="_blank">(.*)</a>')
pre = "http://www.ngo20map.com/"
st = 1
cnt = 0
ngo = list()
for i in range(st, lastPage):
    print("page: %d\n" % i)
    url = (preSite + str(i)).strip()
    html = getInfo(url)
    soup = BeautifulSoup(html)
    # every org link on the listing page
    fSite = re.findall(pat, html)
    for j in range(0, len(fSite)):
        cnt = cnt + 1
        orgurl = pre + fSite[j][0]
        print(orgurl)
        ngo.append(getOrgInfo(orgurl))
        print("\n")
    print("\n")  # page separator (placement reconstructed — TODO confirm)
print(cnt)
# `with` guarantees the pickle file is flushed and closed (was a bare open())
with open("orginfo.pkl", "wb") as fp:
    pickle.dump(ngo, fp)
def findActive(url):
    """Return the inner HTML of the page's "flxx" div."""
    page = getInfo.getInfo(url)
    hit = re.search(r'<div class="flxx">(.*)</div>', page)
    return hit.group(1)
def findLoc(url):
    """Return the location string embedded in the page's `var content` JS."""
    source = getInfo(url)
    matched = re.search(r"var content = '(.*)'", source)
    return matched.group(1).strip()
def getOrgInfo(start, end, orgSiteList):
    """Scrape organisation pages orgSiteList[start:end] into NGO objects.

    Walks each page with string-marker offsets (the order of the find/search
    calls matters: each starts where the previous match ended). Returns the
    list of scraped NGOs.
    NOTE(review): indentation reconstructed from collapsed source; the
    placement of ans.append inside the `if loc != -1` branch is inferred
    from the `continue`-on-failure style — confirm against the original.
    """
    # HTML markers around the name block: container div, <h1> (Chinese name),
    # <font> (English name)
    s1 = r'<div class="ml_name mt15">'
    s2_1 = r'<h1>'
    s2_2 = r'</h1>'
    s3_1 = r'<font>'
    s3_2 = r'</font>'
    name = enName = esTime = location = field = scale = description = info = recruit = image = ""
    # sub-pages of each org: activity, hiring, logo image, map/location
    suff = ["org_active/", "org_hire/", "org_image/", "org_map/"]
    ans = list()
    for st in range(start, end):
        print("seq: %d" % st)
        url = orgSiteList[st].strip()
        ngo = orgClass.NGO()
        print(url[url.find(r'/org'):len(url) - 1])
        # the trailing path segment "/orgNNN/" carries the org number
        orgNumber = int(url[url.find(r'/org') + len('/org'):len(url) - 1])
        ngo.orgNumber = orgNumber
        html = getInfo.getInfo(url)
        loc = html.find(s1)
        if loc != -1:
            x = html.find(s2_1, loc + len(s1))
            if x == -1:
                continue  # malformed page: skip this org
            y = html.find(s2_2, x + len(s2_1))
            if y == -1:
                continue
            ngo.name = html[x + len(s2_1):y]
            m = html.find(s3_1, y + len(s2_2))
            if m == -1:
                continue
            n = html.find(s3_2, m + len(s3_1))
            if (n == -1):
                continue
            ngo.enName = html[m + len(s3_1):n]
            # "成立时间" = founding time (year digits)
            # NOTE(review): Gr/Ga are used without a None check below — a page
            # missing any of these <li> rows would raise AttributeError.
            Gr = re.search(r'<li><font>成立时间:</font>(\d*)年</li>', html[n:])
            if Gr.group(1):
                ngo.esTime = int(Gr.group(1))
                print("year: %d" % ngo.esTime)
            # "工作领域" = fields of work, space-separated
            Ga = re.search(r'<li><font>工作领域:</font>(.*)</li>', html[Gr.end():])
            if Ga.group(1):
                field = Ga.group(1)
                ngo.field = field.split(" ")
                print(ngo.field)
            # "机构规模" = organisation scale
            Gr = re.search(r'<li><font>机构规模:\s{0,10}</font>(.*)</li>',
                           html[Ga.end():])
            if Gr.group(1):
                ngo.scale = Gr.group(1)
                print(ngo.scale)
            # description: text between the "jgjs" div opening and the next close tag
            Ga = re.search(r'<div class="jgjs mt20">(.*)>', html[Gr.end():])
            x = Gr.end() + Ga.end()
            sec = html.find(r"</", x)
            ngo.description = html[x:sec]  # description text
            # logo and location come from the org's image and map sub-pages
            ngo.image = findImage.findImage(url + suff[2])
            ngo.location = findLoc.findLoc(url + suff[3])
            print("location: %s" % ngo.location)
            ans.append(ngo)
    return ans
def getOrgInfo(url):
    """Scrape an organisation record page into an NGO object.

    Returns -1 when the record's category is not "非政府组织" (NGO);
    otherwise returns the populated NGO.
    NOTE(review): indentation reconstructed from collapsed source; nesting
    of the trailing partners cleanup is inferred — verify.
    """
    ngo = NGO()
    url = url.strip()
    html = getInfo.getInfo(url)
    soup = BeautifulSoup(html)
    # "类别" = category: only keep non-governmental organisations
    kinddiv = soup.find("div", text="类别 : ")
    if kinddiv:
        kind = kinddiv.parent.find("div", class_="OrgInfodataItemContent").get_text()
        print(kind)
        if kind != '非政府组织':
            return -1
    # "名称" = name; strip any parenthesised suffix (ASCII or fullwidth parens)
    namediv = soup.find("div", text="名称 : ")
    if namediv:
        name = namediv.parent.find("div", class_="OrgInfodataItemContent").get_text()
        print(name)
        ga = re.search(r'(([(|(].*[)|)]))', name)
        if ga:
            name = name.replace(ga.group(1), "")
        ngo.name = name
        print(name)
    # "成立日期" = founding date: take the first run of digits as the year
    timediv = soup.find("div", text="成立日期 : ")
    if timediv:
        timestr = timediv.parent.find("div", class_="OrgInfodataItemContent").get_text()
        if re.search("(\d+)", timestr):
            time = int(re.search("(\d+)", timestr).group(1))
            ngo.esTime = time
            print(time)
    # "工作领域" = fields of work: map each line through the `fields` lookup
    fi = soup.find("div", text=re.compile("工作领域.*"))
    if fi:
        fie = fi.find_next("div", class_="OrgInfodataItemContent")
        if fie:
            ngo.field = [fields[i] for i in replace_with_newlines(fie).splitlines()]
            print(ngo.field)
    # "成立背景" = founding background → description
    des = soup.find("div", text=re.compile("成立背景.*"))
    if des:
        descri = des.find_next("div", class_="OrgInfoSectionContent")
        if descri:
            ngo.description = descri.get_text()
    # "在中国的CSR项目" = CSR projects in China → appended to description
    des = soup.find("div", text=re.compile(("在中国的CSR项目.*")))
    if des:
        descri = des.find_next("div", class_="OrgInfoSectionContent")
        if descri:
            ngo.description += replace_with_newlines(descri)
    # "主要合作伙伴" = main partners: split each line on its delimiter, dropping
    # bullet glyphs (\uf0fc) and dash-only fragments
    partnersdiv = soup.find("div", text=re.compile(".*主要合作伙伴.*"))
    if partnersdiv:
        partnerstr = replace_with_newlines(
            partnersdiv.find_next("div", class_="OrgInfoSectionContent")).splitlines()
        tem = list()
        for w in partnerstr:
            # NOTE(review): the first branch is `if`, not `elif` — a line with a
            # "," but no "、" also falls into the final else and may be split
            # twice; looks unintended but preserved as-is.
            if "," in w:
                for q in w.split(","):
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
            if "、" in w:
                for q in w.split("、"):
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
            else:
                for q in w.split():
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
        ngo.partners = tem
        if ngo.partners != []:
            # drop a trailing "等" ("etc.") from the last partner name
            s = ngo.partners[len(ngo.partners) - 1]
            if s[len(s) - 1] == "等":
                ngo.partners[len(ngo.partners) - 1] = s.replace(s[len(s) - 1], "")
    return ngo
# NOTE(review): Python 2 source (print statements, `file` builtin).
# The leading loop appears to be the tail of a `get_uid(filename, uid_list)`
# definition whose header lies outside this view — indentation reconstructed.
fread = file(filename)
for line in fread:
    uid_list.append(line.strip())


def writefile(filename, content):
    # Append *content* to *filename*.
    fw = file(filename, 'a')
    fw.write(content)
    fw.close()


if __name__ == '__main__':
    # Credentials are intentionally blank here; fill in before running.
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    if (WBLogin.login(username, pwd) == 'servertime_error'):
        print 'login failed. check out your network.'
        sys.exit()
    uid_list = []
    get_uid('C:/Result1.txt', uid_list)
    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)
    for uid in uid_list:
        try:
            InfoPage = getInfo.getInfo()
            InfoPage.get_info(uid)
        except Exception as e:
            # on any failure, record the uid to id.txt for later retry
            writefile('C:/id.txt', str(uid) + '\n')
from getOrgInfo import getOrgInfo
import time
import pickle

# Crawl every listing page of ngo20map.com, scrape each organisation page,
# and pickle the resulting NGO list.
# NOTE(review): this chunk also uses `re`, `BeautifulSoup` and `getInfo`,
# whose imports are presumably elsewhere in the original file.
preSite = "http://www.ngo20map.com/Index/list_index?&p="
lastPage = 84
pat = re.compile('<a href="(.*)" target="_blank">(.*)</a>')
pre = "http://www.ngo20map.com/"
st = 1
cnt = 0
ngo = list()
for i in range(st, lastPage):
    print("page: %d\n" % i)
    url = (preSite + str(i)).strip()
    html = getInfo(url)
    soup = BeautifulSoup(html)
    # every org link on the listing page
    fSite = re.findall(pat, html)
    for j in range(0, len(fSite)):
        cnt = cnt + 1
        orgurl = pre + fSite[j][0]
        print(orgurl)
        ngo.append(getOrgInfo(orgurl))
        print("\n")
    print("\n")  # page separator (placement reconstructed — TODO confirm)
print(cnt)
# `with` guarantees the pickle file is flushed and closed (was a bare open())
with open("orginfo.pkl", "wb") as fp:
    pickle.dump(ngo, fp)
def getOrgInfo(url):
    """Scrape an organisation record page into an NGO object.

    Returns -1 when the record's category is not "非政府组织" (NGO);
    otherwise returns the populated NGO.
    NOTE(review): indentation reconstructed from collapsed source; nesting
    of the trailing partners cleanup is inferred — verify.
    """
    ngo = NGO()
    url = url.strip()
    html = getInfo.getInfo(url)
    soup = BeautifulSoup(html)
    # "类别" = category: only keep non-governmental organisations
    kinddiv = soup.find("div", text="类别 : ")
    if kinddiv:
        kind = kinddiv.parent.find("div",
                                   class_="OrgInfodataItemContent").get_text()
        print(kind)
        if kind != '非政府组织':
            return -1
    # "名称" = name; strip any parenthesised suffix (ASCII or fullwidth parens)
    namediv = soup.find("div", text="名称 : ")
    if namediv:
        name = namediv.parent.find("div",
                                   class_="OrgInfodataItemContent").get_text()
        print(name)
        ga = re.search(r'(([(|(].*[)|)]))', name)
        if ga:
            name = name.replace(ga.group(1), "")
        ngo.name = name
        print(name)
    # "成立日期" = founding date: take the first run of digits as the year
    timediv = soup.find("div", text="成立日期 : ")
    if timediv:
        timestr = timediv.parent.find(
            "div", class_="OrgInfodataItemContent").get_text()
        if re.search("(\d+)", timestr):
            time = int(re.search("(\d+)", timestr).group(1))
            ngo.esTime = time
            print(time)
    # "工作领域" = fields of work: map each line through the `fields` lookup
    fi = soup.find("div", text=re.compile("工作领域.*"))
    if fi:
        fie = fi.find_next("div", class_="OrgInfodataItemContent")
        if fie:
            ngo.field = [
                fields[i] for i in replace_with_newlines(fie).splitlines()
            ]
            print(ngo.field)
    # "成立背景" = founding background → description
    des = soup.find("div", text=re.compile("成立背景.*"))
    if des:
        descri = des.find_next("div", class_="OrgInfoSectionContent")
        if descri:
            ngo.description = descri.get_text()
    # "在中国的CSR项目" = CSR projects in China → appended to description
    des = soup.find("div", text=re.compile(("在中国的CSR项目.*")))
    if des:
        descri = des.find_next("div", class_="OrgInfoSectionContent")
        if descri:
            ngo.description += replace_with_newlines(descri)
    # "主要合作伙伴" = main partners: split each line on its delimiter, dropping
    # bullet glyphs (\uf0fc) and dash-only fragments
    partnersdiv = soup.find("div", text=re.compile(".*主要合作伙伴.*"))
    if partnersdiv:
        partnerstr = replace_with_newlines(
            partnersdiv.find_next(
                "div", class_="OrgInfoSectionContent")).splitlines()
        tem = list()
        for w in partnerstr:
            # NOTE(review): the first branch is `if`, not `elif` — a line with a
            # "," but no "、" also falls into the final else and may be split
            # twice; looks unintended but preserved as-is.
            if "," in w:
                for q in w.split(","):
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
            if "、" in w:
                for q in w.split("、"):
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
            else:
                for q in w.split():
                    if q.find("\uf0fc") == -1 and q.find("-") == -1:
                        tem.append(q)
        ngo.partners = tem
        if ngo.partners != []:
            # drop a trailing "等" ("etc.") from the last partner name
            s = ngo.partners[len(ngo.partners) - 1]
            if s[len(s) - 1] == "等":
                ngo.partners[len(ngo.partners) - 1] = s.replace(
                    s[len(s) - 1], "")
    return ngo
import getInfo
import os
import time
import re

# Marker strings delimiting the name block on a chinadevelopmentbrief.org.cn
# organisation page: the container div, the <h1> (Chinese name) and the
# <font> (English name).
s1 = r'<div class="ml_name mt15">'
s2_1 = r'<h1>'
s2_2 = r'</h1>'
s3_1 = r'<font>'
s3_2 = r'</font>'
name = enName = esTime = location = area = scale = description = info = recruit = image = None
# NOTE(review): `fp` is opened but not used in this visible chunk — presumably
# read further down in the original file; confirm before removing.
fp = open("Pages.txt", "r", encoding="utf-8")
html = getInfo.getInfo('http://www.chinadevelopmentbrief.org.cn/org33/')
loc = html.find(s1)
if loc != -1:
    # Walk forward through the marker pairs; each find() starts after the
    # previous marker so the slices line up.
    x = html.find(s2_1, loc + len(s1))
    if x == -1:
        pass  # NOTE(review): dead guard — a miss is not actually handled
    y = html.find(s2_2, x + len(s2_1))
    if y == -1:
        pass  # NOTE(review): dead guard
    name = html[x + len(s2_1):y]
    print(name)
    m = html.find(s3_1, y + len(s2_2))
    if m == -1:
        pass  # NOTE(review): dead guard
    n = html.find(s3_2, m + len(s3_1))
    if (n == -1):
        pass  # NOTE(review): dead guard
    enName = html[m + len(s3_1):n]
    print(enName)
    # "成立时间" = founding time; capture the year digits
    Gr = re.search(r'<li><font>成立时间:</font>(\d+)年</li>', html[n:])
#!/usr/bin/python #! coding:utf-8 from getInfo import getInfo import os import time if __name__ == '__main__': successCount = 0 failedCount = 0 while True: usefulInfo = getInfo(); if usefulInfo == -1: failedCount=failedCount+1 continue else: successCount=successCount+1 os.system('clear') print time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))+"\t成功:"+str(successCount)+"\t失败:"+str(failedCount) print '----------------------------------------------------------------------' for element in usefulInfo[0:10]: if element['isTransfer']==True: print str(element["interest"]) + "\t" + str(element["borrowerLevel"])+ "\t" + str(element["displayLoanType"])+ "\t" + str(element["share"]) + "\t" + "转" else: print str(element["interest"]) + "\t" + str(element["borrowerLevel"])+ "\t" + str(element["displayLoanType"])+ "\t" + str(element["finishedRatio"]) time.sleep(5)
import getUrl
import getInfo
from urllib import request
import re
import time
import os

# Append today's CSDN blog view/rank numbers to a tab-separated log file.
targetFile = './viewAndRank.txt'
url_filename = './blogUrl.txt'

# Create the log file with a header row on first run.
if os.path.exists(targetFile):
    print(targetFile, ' is existed')
else:
    print('creat ', targetFile, " to store CSDN blog's view and rank num")
    with open(targetFile, 'w') as t:
        # `with` closes the file; the explicit t.close() was redundant
        t.write('date\t\t\tViewNum\t\tRankNum\n')

now = time.strftime('%Y-%m-%d %H:%M:%S')
view_num, rank_num = getInfo.getInfo(url_filename)
with open(targetFile, 'a') as t:
    info = now + '\t' + view_num + '\t\t' + rank_num + '\n'
    t.write(info)
def findActive(url):
    """Return the inner HTML of the page's "flxx" div."""
    source = getInfo.getInfo(url)
    matched = re.search(r'<div class="flxx">(.*)</div>', source)
    return matched.group(1)
from getInfo import getInfo import xlsxwriter import requests import traceback try: if __name__ == '__main__': info = getInfo() saveName = input("请输入结果数据文件名(不要填写尾缀名):") savePath = "\\".join( info.pathList[0].split("\\")[0:-1]) + "\\" + saveName + ".xlsx" newExl = xlsxwriter.Workbook(savePath) sheet1 = newExl.add_worksheet() #添加列标题 sheet1.write_row("A1", ["月份"] + info.exlNameList) #添加行标题 sheet1.write_column( "A2", ["留存人数", "工作量50-100h人数", "工作量100-150h人数", "工作量大于150h人数"]) #添加sheet1数据 sheet1.write_row("B2", info.differenceList) sheet1.write_row("B3", info.count50List) sheet1.write_row("B4", info.count100List) sheet1.write_row("B5", info.count150List) #添加名单 for index, name in enumerate(info.exlNameList): sheetNew = newExl.add_worksheet(name) #添加列标题 sheetNew.write_row( "A1", ["工作量50-100h人数", "工作量100-150h人数", "工作量大于150h人数"]) #添加数据 sheetNew.write_column("A2", info.time50List[index])
def findImage(url):
    """Return the src of the organisation logo image on the page."""
    source = getInfo.getInfo(url)
    matched = re.search(
        r'<div class="ml_logo"><img width="230px" height="115px" src="(.*)" /></div>',
        source)
    return matched.group(1)
def findLoc(url):
    """Return the location string embedded in the page's `var content` JS."""
    page = getInfo(url)
    hit = re.search(r"var content = '(.*)'", page)
    return hit.group(1).strip()
def show_type(name):
    """Serialize the multi-company data for *name* as a JSON string."""
    helper = getInfo()
    data = helper.getDataFromCompany_mul(name)
    return json.dumps(data)