import math
import os
import random
import re

from BeautifulSoup import BeautifulSoup

import browser
import config
import util


def download_latest():
    if not util.is_trade_day():
        return
    latest_day = util.get_today()
    re_download = True
    pagesize = 88
    stocks = load_all_stocks()
    count = len(stocks)
    params = [s[0] for s in stocks]
    # float division so the ceil actually rounds up under Python 2
    pagecount = int(math.ceil(count / float(pagesize)))
    dir_today = '%s%s/' % (config.daily_data_dir, latest_day)
    print "download files"
    for i in range(pagecount):
        print i
        url = const_base_url + ','.join(params[i * pagesize:(i + 1) * pagesize])
        lfile = '%s%s.csv' % (dir_today, i)
        if re_download:
            if not os.path.exists(dir_today):
                os.mkdir(dir_today)
            if os.path.exists(lfile):
                os.remove(lfile)
        try:
            browser.downad_and_save(url, lfile)
        except Exception, e:
            print str(e)

def download_history(stock_no):
    url = 'http://table.finance.yahoo.com/table.csv?s=%s' % (stock_no)
    lfile = '%s%s.csv' % (config.history_data_dir, stock_no)
    try:
        # skip stocks whose history has already been downloaded
        if not os.path.exists(lfile):
            print stock_no
            browser.downad_and_save(url, lfile)
    except Exception, e:
        print str(e)

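# Hypothetical driver (not in the original source): downloads history for every
# stock returned by load_all_stocks(), assuming the first column of each row is
# a ticker already in the format the Yahoo table.csv endpoint expects.
def download_all_history():
    for row in load_all_stocks():
        download_history(row[0])
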
def download_price(theater_id, movie_id):
    # example: http://bj.nuomi.com/pcindex/main/timetable?cinemaid=1c2e250a3e9691059ee32187&mid=9762&needMovieInfo=0&tploption=5&_=1448004690864#j-movie-list1
    url = "http://bj.nuomi.com/pcindex/main/timetable?cinemaid=%s&mid=%s&needMovieInfo=0" % (theater_id, movie_id)
    lfile = '%snuomi/price_%s_%s.html' % (download_dir, theater_id, movie_id)
    respHtml = browser.downad_and_save(url, lfile)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    li = []
    dom_divs = soup.findAll('div', attrs={'class': 'list'})
    for day, div in enumerate(dom_divs):
        trs = div.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            if not tds:
                continue
            p = tds[3].find('span')
            pp = p.contents[0].split(';')[-1]
            order_url = completeInnerUrl("http://bj.nuomi.com/", tds[4].find('a')['href'])
            li.append(','.join([str(day), theater_id, movie_id, tds[0].contents[0].strip(),
                                tds[1].contents[0], pp.strip(), order_url]))
    csv_file = '%snuomi/price_%s_%s.csv' % (download_dir, theater_id, movie_id)
    save_file(csv_file, li)

def download_list():
    proxies = util.load_proxies()
    for i in range(1, 802):
        url = 'http://www.cnhm.net/plant/index/page/%d' % (i)
        lfile = '%scnhm/list_%d.html' % (config.local_dir, i)
        proxy = random.choice(proxies).strip()
        print i, url, proxy
        # pass the chosen proxy along (it was selected but unused before)
        respHtml = browser.downad_and_save(url, lfile, proxy)
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
        table = soup.find('div', attrs={'class': 'main'})
        if not table:
            # remove the bad download so the page can be retried later
            os.system('rm -f %s' % (lfile))
            print 'err'
            continue
        xitems = []
        tr_list = table.findAll('li')
        for tr in tr_list:
            plant_name = tr.find('div', attrs={'class': 'PlantName'})
            namea = plant_name.find('a')
            uid = util.completeInnerUrl(url, namea['href'])
            cn_name = namea['title']
            latin_name = tr.find('div', attrs={'class': 'LatinName'})['title']
            section = tr.find('div', attrs={'class': 'section'}).contents[0]
            genera = tr.find('div', attrs={'class': 'genera'}).contents[0]
            xitems.append(','.join([uid, cn_name, latin_name, section, genera]))
        with open('%scnhm/xlist_%s.csv' % (config.local_dir, i), 'w') as f:
            f.write('\r\n'.join(xitems) + '\r\n')

def down_details():
    proxies = load_proxies()
    with open('%sdouban/topics1.csv' % (local_dir), 'r') as f:
        lines = f.readlines()
    for i, l in enumerate(lines, 1):
        url = l.strip()
        topic_id = url.split('/')[-2]
        lfile = '%sdouban/topic_%s.html' % (local_dir, topic_id)
        proxy = random.choice(proxies).strip()
        print i, url, proxy
        respHtml = ''
        try:
            respHtml = browser.downad_and_save(url, lfile, proxy)
        except:
            print 'err.........'
            continue
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
        # original post
        resp1 = soup.find('div', attrs={'id': 'link-report'})
        if resp1:
            main_content = resp1.text
            dom_imgs = resp1.findAll('img')
            imgs = [img['src'] for img in dom_imgs]
        # comments
        resp2 = soup.find('ul', attrs={'id': 'comments'})
        if not resp2:
            continue
        dom_li = resp2.findAll('li')
        users = []
        for li in dom_li:
            x = li.find('h4')
            user = x.find('a')['href'] if x.find('a') else ''
            content = li.find('p').text if li.find('p') else ''
            users.append(user)
        with open('%sdouban/user_%s.csv' % (local_dir, topic_id), 'w') as f:
            f.write('\r\n'.join(users) + '\r\n')

def down_items():
    for i in range(1, 36):
        lfile = '%smengsang_list_%s.html' % (local_dir, i)
        urls = parse_list(lfile)
        xitems = []
        for url in urls:
            print i, url
            fname = url.replace('http://www.mengsang.com/', '').replace('/', '_')
            item_lfile = '%smengsang_items/%s' % (local_dir, fname)
            # some URLs contain spaces, so escape them before requesting
            xurl = url.strip().replace(' ', '%20')
            respHtml = browser.downad_and_save(xurl, item_lfile)
            # parse the detail page
            url_segments = url.replace('http://www.mengsang.com/', '').split('/')
            htmlCharset = "GBK"  # gb2312 and gbk are not quite the same encoding
            soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
            resp2 = soup.findAll('span', attrs={'class': 'mainBoxTitleCon'})
            sorts = resp2[0].findAll('a')
            shu = sorts[-1].contents[0]  # genus
            ke = sorts[-2].contents[0]   # family
            resp3 = soup.findAll('div', attrs={'class': 'pt5'})
            name = resp3[-2].contents[1]
            name_en = resp3[-1].contents[1] if len(resp3[-1].contents) > 1 else " "
            name_en = name_en.replace(',', ' ')
            src = soup.find('div', attrs={'class': 'imgCenter'}).find('img')
            img_src = completeInnerUrl(url, src['src'])
            tmp = url_segments[-4] if len(url_segments) > 3 else ''
            xitem = ','.join([name, name_en, tmp, ke, shu, url, img_src])
            xitems.append(xitem)
        with open('%stest_%s.csv' % (local_dir, i), 'w') as f:
            f.write('\r\n'.join(xitems) + '\r\n')

def download(i, word, proxy):
    lfile = '%sbaidu_img/plant_%s.html' % (config.local_dir, i)
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=%s' % (word)
    respHtml = browser.downad_and_save(url, lfile, proxy)
    img_urls = re.findall(r'"objURL":"([^\"]*)"', respHtml)
    str_img_urls = '\r\n'.join(img_urls) + '\r\n'
    with open('%sbaidu_img/img_%s.txt' % (config.local_dir, i), 'w') as f:
        f.write(str_img_urls)
    attrs = re.findall(r'"@attr": { "key":([^\}]*)\}', respHtml)
    str_attrs = '\r\n'.join([attr.replace('"value":', '') for attr in attrs]) + '\r\n'
    with open('%sbaidu_img/attr_%s.txt' % (config.local_dir, i), 'w') as f:
        f.write(str_attrs)
    # extract the digits from i by regex; it must be a character issue, but the exact cause is unknown
    x = re.findall(r'\d+', i)
    import_db(int(x[0]), str_img_urls, attrs)

def download_movie_playing():
    url = "http://bj.nuomi.com/movie/"
    lfile = '%smovies_playing.html' % (root_dir)
    respHtml = browser.downad_and_save(url, lfile)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    li = []
    # movies live under <div class="section-item clearfix no-top-border">
    dom_movies = soup.find('div', attrs={'class': 'section-item clearfix no-top-border'})
    dom_a = dom_movies.findAll('a')
    for m in dom_a:
        uid = m['href'].split('/')[-1]
        uri = completeInnerUrl("http://bj.nuomi.com/", m['href'])
        name = m.contents[0]
        li.append("Movie,%s,%s,%s" % (uid, uri, name))
    csv_file = '%snuomi/movie_result.csv' % (download_dir)
    save_file(csv_file, li)

def download_mtheater_list(page_index=1, proxy=None):
    url = "http://t.dianping.com/movie/beijing/tab_cinema?pageno=%d" % (page_index)
    lfile = '%smtheater_list_%d.html' % (root_dir, page_index)
    respHtml = browser.downad_and_save(url, lfile, proxy)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    li = []
    # theaters live under <div class="index-cinema-list"> as <li class="item Fix"> entries
    dom_theaterlist = soup.find('div', attrs={'class': 'index-cinema-list'})
    dom_mtheaters = dom_theaterlist.findAll('li', attrs={'class': 'item Fix'})
    for mt in dom_mtheaters:
        dom_a = mt.findAll('a')[1]
        uid = dom_a['href'].split('/')[-1]
        uri = completeInnerUrl("http://t.dianping.com/", dom_a['href'])
        name = dom_a.contents[0]
        li.append("MovieTheater,北京,%s,%s,%s" % (uid, uri, name))
    csv_file = '%smtheater_result_%d.csv' % (root_dir, page_index)
    save_file(csv_file, li)

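# Hypothetical driver (not in the original source): walks the paginated dianping
# theater list with a random proxy per request. The page count of 10 and the use
# of util.load_proxies() here are assumptions.
def download_all_mtheaters(pagecount=10):
    proxies = util.load_proxies()
    for page in range(1, pagecount + 1):
        download_mtheater_list(page, random.choice(proxies).strip())
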
def run(group):
    # discussion pages step by 25: http://www.douban.com/group/aizhiwu/discussion?start=0, 25, 50, ...
    for i in range(0, 30):
        url = 'http://www.douban.com/group/%s/discussion?start=%d' % (group, i * 25)
        lfile = '%sdouban/%s_list_%d.html' % (local_dir, group, i)
        print group, i, url
        respHtml = browser.downad_and_save(url, lfile)
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
        table = soup.find('table', attrs={'class': 'olt'})
        if not table:
            break
        tr_list = table.findAll('tr', attrs={'class': ''})
        xitems = []
        for tr in tr_list:
            td_list = tr.findAll('td')
            title = td_list[0].find('a')
            title_href = title['href']
            title_cn = title.contents[0].replace(',', '') if title.contents else ''
            user = td_list[1].find('a')
            user_href = user['href']
            user_name = user.contents[0].replace(',', '') if user.contents else ''
            reply_count = td_list[2].contents[0] if td_list[2].contents else 0
            last_reply = td_list[3].contents[0]
            xitems.append(','.join([group, title_cn, title_href, user_name,
                                    user_href, str(reply_count), last_reply]))
        with open('%sdouban/list_%s_%s.csv' % (local_dir, group, i), 'w') as f:
            f.write('\r\n'.join(xitems) + '\r\n')

def run():
    for i in range(1, 36):
        print i
        url = 'http://www.mengsang.com/category/list_48_%s.html' % (i)
        lfile = '%smengsang_list_%s.html' % (local_dir, i)
        browser.downad_and_save(url, lfile)

def download_detail(url, proxy=None):
    # the plant id is the numeric part of the detail URL
    x = re.findall(r'\d+', url)
    plant_id = int(x[0])
    lfile = '%scnhm/detail_%d.html' % (config.local_dir, plant_id)
    respHtml = browser.downad_and_save(url, lfile, proxy)
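
# Hypothetical driver (not in the original source): feeds download_detail() the
# detail URLs recorded in the first column of the xlist_*.csv files written by
# download_list(); the 801-page range mirrors that function's loop.
def download_all_details():
    proxies = util.load_proxies()
    for i in range(1, 802):
        csv_path = '%scnhm/xlist_%s.csv' % (config.local_dir, i)
        if not os.path.exists(csv_path):
            continue
        with open(csv_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                url = line.split(',')[0]
                download_detail(url, random.choice(proxies).strip())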