Example No. 1
def download_latest():
    if not util.is_trade_day(): return

    latest_day = util.get_today()
    re_download = True

    pagesize = 88
    stocks = load_all_stocks()
    count = len(stocks)
    params = [s[0] for s in stocks]
    pagecount = int(math.ceil(count / float(pagesize)))  # float division so the ceiling is not a no-op

    dir_today = '%s%s/' % (config.daily_data_dir, latest_day)

    print "downloading daily files"
    for i in range(pagecount):
        print i
        url = const_base_url + ','.join(
            params[i * pagesize:(i + 1) * pagesize])
        lfile = '%s%s.csv' % (dir_today, i)
        if re_download:
            if not os.path.exists(dir_today):
                os.mkdir(dir_today)
            if os.path.exists(lfile):
                os.remove(lfile)
            try:
                browser.downad_and_save(url, lfile)
            except Exception, e:
                print str(e)
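Every example on this page goes through browser.downad_and_save(url, lfile, proxy=None), which the page itself never shows. The sketch below is only a guess at that helper, reconstructed from its call sites (fetch the URL, optionally through an HTTP proxy, save the raw response to lfile, and return the HTML); the real browser module may well differ.

import urllib2

def downad_and_save(url, lfile, proxy=None):
    # hypothetical stand-in for browser.downad_and_save, not taken from the source
    if proxy:
        # proxies appear as "host:port" strings in the proxy-list examples below
        opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}))
    else:
        opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    html = opener.open(url, timeout=30).read()
    # keep a local copy so the page can be re-parsed without a second request
    with open(lfile, 'wb') as f:
        f.write(html)
    return html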
Example No. 2
def download_history(stock_no):
    url = 'http://table.finance.yahoo.com/table.csv?s=%s' % (stock_no)
    lfile = '%s%s.csv' % (config.history_data_dir, stock_no)
    # print url ,lfile
    try:
        # if os.path.exists(lfile):
        #     os.remove(lfile)
        if not os.path.exists(lfile):
            print stock_no
            browser.downad_and_save(url, lfile)
    except Exception, e:
        print str(e)
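A possible batch driver for download_history, assuming the first column returned by load_all_stocks() (see Example No. 1) is already in the symbol format the Yahoo endpoint expects; this glue code is not part of the source.

def download_all_history():
    stocks = load_all_stocks()
    for s in stocks:
        download_history(s[0])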
Example No. 3
def download_price(theater_id,movie_id):
    # http://bj.nuomi.com/pcindex/main/timetable?cinemaid=1c2e250a3e9691059ee32187&mid=9762&needMovieInfo=0&tploption=5&_=1448004690864#j-movie-list1
    url = "http://bj.nuomi.com/pcindex/main/timetable?cinemaid=%s&mid=%s&needMovieInfo=0"%(theater_id,movie_id)
    lfile = '%snuomi/price_%s_%s.html' % (download_dir,theater_id,movie_id) 
    respHtml = browser.downad_and_save(url,lfile)  
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)

    li = []
    dom_divs = soup.findAll('div',attrs = {'class': 'list'})
    for day,div in enumerate(dom_divs):        
        trs = div.findAll('tr')        
        rows = []
        for tr in trs: 
            tds = tr.findAll('td') 
            
            if not tds: continue

            p = tds[3].find('span')
            pp = p.contents[0].split(';')[-1]
            order_url = completeInnerUrl("http://bj.nuomi.com/", tds[4].find('a')['href']) 

            li.append(','.join([str(day),theater_id,movie_id,tds[0].contents[0].strip(),tds[1].contents[0],pp.strip(),order_url]))

    # write the accumulated rows once, after all showing days have been parsed
    csv_file = '%snuomi/price_%s_%s.csv' % (download_dir,theater_id,movie_id)
    save_file(csv_file,li)
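Several examples rely on two small helpers, completeInnerUrl and save_file, that this page never defines. The versions below are hypothetical reconstructions based purely on how they are called (resolve a relative href against the page's base URL; dump a list of CSV rows to disk); the real project may implement them differently.

from urlparse import urljoin

def completeInnerUrl(base_url, href):
    # resolve a relative href such as "/movie/123" against the page's base URL
    return urljoin(base_url, href)

def save_file(path, lines):
    # one record per line, using the same CRLF convention as the other examples
    with open(path, 'w') as f:
        f.write('\r\n'.join(lines) + '\r\n')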
Example No. 4
def download_list():
    proxies = util.load_proxies()

    for i in range(1,802): #802               
        url = 'http://www.cnhm.net/plant/index/page/%d' % (i)
        lfile = '%scnhm/list_%d.html' % (config.local_dir,i) 

        proxy = random.choice(proxies).strip()
        print i,url,proxy

        respHtml = browser.downad_and_save(url,lfile)         
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)

        table = soup.find('div',attrs = {'class': 'main'})
        if not table:
            os.system('rm -f %s'%(lfile))
            print 'err'
            continue 

        xitems = []
        tr_list = table.findAll('li')
        for tr in tr_list:
            plant_name = tr.find('div',attrs = {'class': 'PlantName'})
            namea = plant_name.find('a')
            uid = util.completeInnerUrl(url,namea['href'])
            cn_name = namea['title']
            latin_name = tr.find('div',attrs = {'class': 'LatinName'})['title']
            section = tr.find('div',attrs = {'class': 'section'}).contents[0]
            genera = tr.find('div',attrs = {'class': 'genera'}).contents[0]
            xitems.append(','.join([uid,cn_name,latin_name,section,genera]))

        with open('%scnhm/xlist_%s.csv'%(config.local_dir,i),'w') as f:
            f.write('\r\n'.join(xitems)+'\r\n')
Example No. 5
def down_details():
    proxies = load_proxies()

    with open('%sdouban/topics1.csv'%(local_dir),'r') as f:
        lines = f.readlines()
        i = 0 
        for l in lines:    
            i = i + 1        
            url = l.strip()
            topic_id = url.split('/')[-2]
            lfile = '%sdouban/topic_%s.html' % (local_dir,topic_id)  
            
            proxy = random.choice(proxies).strip()
            print i,url,proxy
            
            respHtml = ''
            try:
                respHtml = browser.downad_and_save(url,lfile,proxy)  
            except:
                print 'err.........'
                continue

            soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
            resp1 = soup.find('div', attrs = {'id': 'link-report'}) 
        
            # the original thread post
            if resp1:
                main_content = resp1.text
                dom_imgs = resp1.findAll('img')
                imgs = [img['src'] for img in dom_imgs]

            # the comments
            resp2 = soup.find('ul', attrs = {'id': 'comments'})
            if not resp2:
                continue

            dom_li = resp2.findAll('li')

            users = []
            for li in dom_li:
                x = li.find('h4')
                user = x.find('a')['href'] if x.find('a')  else ''
                # print user
                content = li.find('p').text if li.find('p')  else ''
                #div user-face
                # print content
                users.append(user) 
                # print li

            #div id = link-report
            #ul id = comments

            with open('%sdouban/user_%s.csv'%(local_dir,topic_id),'w') as f:
                f.write('\r\n'.join(users)+'\r\n')
Example No. 6
def down_items():        
    for i in range(1,36):    
        lfile = '%smengsang_list_%s.html' % (local_dir,i) 
        urls = parse_list(lfile)

        xitems = []      
        for url in urls:
            print i,url 
            fname = url.replace('http://www.mengsang.com/','').replace('/','_') 
            item_lfile = '%smengsang_items/%s' % (local_dir,fname)
            # some URLs contain spaces?
            xurl = url.strip().replace(' ','%20')
            respHtml = browser.downad_and_save(xurl,item_lfile)   

            #parse
            url_segments = url.replace('http://www.mengsang.com/','').split('/')
            # print url_segments[-4]
            
            htmlCharset = "GBK"  # gb2312 and GBK are not exactly the same
            soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
            # name = soup.find('h2').contents[0].strip() 

            resp2 = soup.findAll('span', attrs = {'class': 'mainBoxTitleCon'})
            sorts = resp2[0].findAll('a')
            shu = sorts[-1].contents[0]
            ke = sorts[-2].contents[0]

            resp3 = soup.findAll('div', attrs = {'class': 'pt5'})
            name = resp3[-2].contents[1]
            name_en = resp3[-1].contents[1] if len(resp3[-1].contents)>1 else " "
            name_en = name_en.replace(',',' ')

            src = soup.find('div', attrs = {'class': 'imgCenter'}).find('img')
            img_src =  completeInnerUrl(url , src['src'])  

            tmp = url_segments[-4] if len(url_segments)>3 else ''

            xitem =  ','.join([name,name_en,tmp,ke,shu,url,img_src]) 
            
            xitems.append(xitem)        
    
        with open('%stest_%s.csv'%(local_dir,i),'w') as f:
            f.write('\r\n'.join(xitems)+'\r\n')
Example No. 7
def download(i,word,proxy):
    lfile = '%sbaidu_img/plant_%s.html' % (config.local_dir,i) 
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=%s' % (word)
    respHtml = browser.downad_and_save(url,lfile,proxy)
    
    img_urls = re.findall(r'"objURL":"([^\"]*)"',respHtml)
    str_img_urls = '\r\n'.join(img_urls)+'\r\n'
    with open('%sbaidu_img/img_%s.txt'%(config.local_dir,i),'w') as f:
        f.write(str_img_urls)

    attrs = re.findall(r'"@attr": { "key":([^\}]*)\}',respHtml)  
    str_attrs = '\r\n'.join([attr.replace('"value":','') for attr in attrs])+'\r\n'   
    with open('%sbaidu_img/attr_%s.txt'%(config.local_dir,i),'w') as f:
        f.write(str_attrs)

    x = re.findall(r'\d+',i)    # must be a character-encoding issue, but the exact cause is unknown
    import_db(int(x[0]),str_img_urls,attrs)
Example No. 8
def download_movie_playing():
    url = "http://bj.nuomi.com/movie/" 
    lfile = '%smovies_playing.html' % (root_dir) 

    respHtml = browser.downad_and_save(url,lfile)  
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)

    li = []
    # <div class="section-item clearfix no-top-border">
    dom_movies = soup.find('div',attrs = {'class': 'section-item clearfix no-top-border'})
    dom_a = dom_movies.findAll('a')
    for m in dom_a:
        # dom_a = m.find('a')
        uid = m['href'].split('/')[-1]
        uri = completeInnerUrl("http://bj.nuomi.com/",m['href'])
        name = m.contents[0]
        li.append("Movie,%s,%s,%s"%(uid,uri,name))   
    
    csv_file = '%snuomi/movie_result.csv' % (download_dir) 
    save_file(csv_file,li)
Example No. 9
def download_mtheater_list(page_index=1,proxy=None): 
    url = "http://t.dianping.com/movie/beijing/tab_cinema?pageno=%d" % (page_index)
    lfile = '%smtheater_list_%d.html' % (root_dir,page_index) 

    respHtml = browser.downad_and_save(url,lfile,proxy)  
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)

    li = []
    # <div class="index-cinema-list">
    # <li class="item Fix">
    dom_theaterlist = soup.find('div',attrs = {'class': 'index-cinema-list'}) 
    dom_mtheaters = dom_theaterlist.findAll('li',attrs = {'class': 'item Fix'})
    for mt in dom_mtheaters:
        dom_a = mt.findAll('a')[1]
        uid = dom_a['href'].split('/')[-1]
        uri = completeInnerUrl("http://t.dianping.com/",dom_a['href'])
        name = dom_a.contents[0] 
        li.append("MovieTheater,北京,%s,%s,%s"%(uid,uri,name))   
    
    csv_file = '%smtheater_result_%d.csv' % (root_dir,page_index) 
    save_file(csv_file,li)
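A possible driver for download_mtheater_list, following the same proxy-rotation pattern used elsewhere on this page (util.load_proxies plus random.choice); the name download_all_mtheaters is made up here and does not come from the source.

def download_all_mtheaters(page_count):
    proxies = util.load_proxies()
    for page_index in range(1, page_count + 1):
        proxy = random.choice(proxies).strip()
        print page_index, proxy
        download_mtheater_list(page_index, proxy)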
Example No. 10
def run(group):
    # http://www.douban.com/group/aizhiwu/discussion?start=0  25 50
    for i in range(0,30):
        url = 'http://www.douban.com/group/%s/discussion?start=%d' % (group,i*25)
        lfile = '%sdouban/%s_list_%d.html' % (local_dir,group,i)  

        print group,i,url

        respHtml = browser.downad_and_save(url,lfile)         
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
        table = soup.find('table',attrs = {'class': 'olt'})
        if not table:
            break

        tr_list = table.findAll('tr',attrs = {'class': ''})
       

        xitems = []
        for tr in tr_list:
            td_list = tr.findAll('td')

            title = td_list[0].find('a')
            title_href = title['href']
            title_cn = title.contents[0].replace(',','') if title.contents else ''
            
            user = td_list[1].find('a')
            user_href = user['href']
            user_name = user.contents[0].replace(',','') if user.contents else ''

            reply_count = td_list[2].contents[0] if td_list[2].contents else 0
            last_reply = td_list[3].contents[0]

            xitems.append(','.join([group,title_cn,title_href,user_name,user_href,str(reply_count),last_reply ]  ))

        with open('%sdouban/list_%s_%s.csv'%(local_dir,group,i),'w') as f:
            f.write('\r\n'.join(xitems)+'\r\n')
Example No. 11
def run():
    for i in range(1,36):
        print i
        url = 'http://www.mengsang.com/category/list_48_%s.html' % (i)
        lfile = '%smengsang_list_%s.html' % (local_dir,i)        
        browser.downad_and_save(url,lfile) 
Example No. 12
def download_detail(url,proxy=None):
    x = re.findall(r'\d+',url)  
    plant_id = int(x[0])    
    lfile = '%scnhm/detail_%d.html' % (config.local_dir,plant_id) 
    respHtml = browser.downad_and_save(url,lfile,proxy)