Exemplo n.º 1
0
 def get_subs(self, fan_id, sub_id_dict):
     """Collect the subscriptions of ``fan_id`` into ``sub_id_dict``.

     The first page is fetched to learn the total number of subscriptions,
     then pages 2..N are fetched one by one. Every page's ids are handed to
     ``self.store_id_to_dict`` together with ``sub_id_dict``.

     NOTE(review): the original author flagged this method as
     "error running, gets nothing, needs fix". The cause is not visible
     from this block; confirm the URL builders and the response parser.

     Args:
         fan_id: id of the fan whose subscriptions are listed.
         sub_id_dict: container passed through to ``store_id_to_dict``.

     Returns:
         0 when the first page cannot be fetched, otherwise the summed
         length of the per-page id lists.
     """
     fan_sub_url = self.get_fan_sub_first_url(fan_id)
     resp = func.GetHttpContent("GET", fan_sub_url)
     if resp is None:
         func._print("failed get html .The url = %s \n" % (fan_sub_url))
         return 0

     # get_uid_from_tudou_response is presumably filling the list it is given
     # and returning the total count reported by the server -- TODO confirm.
     sub_id_list = []
     ttl_subs_num = self.get_uid_from_tudou_response(resp,
                                                     sub_id_list,
                                                     pagenum=1)
     self.store_id_to_dict(sub_id_list, sub_id_dict)

     # Ceiling division. ``//`` keeps page_ttl an int on Python 3 as well,
     # where ``/`` would produce a float and make range() below fail.
     # (Original author's note: ttl_subs_num max is 999.)
     page_ttl = ttl_subs_num // max_num_per_page
     if ttl_subs_num % max_num_per_page > 0:
         page_ttl += 1

     sub_id_counter = len(sub_id_list)
     for pagenum in range(2, page_ttl + 1):  # remaining pages, if any
         fan_sub_url = self.get_sub_page(fan_id, pagenum)
         resp = func.GetHttpContent("GET", fan_sub_url)
         if resp is None:
             # Best effort: a page that fails to download is skipped.
             func._print("failed get html .The url = %s \n" % (fan_sub_url))
             continue
         sub_id_list = []
         self.get_uid_from_tudou_response(resp, sub_id_list, pagenum)
         self.store_id_to_dict(sub_id_list, sub_id_dict)
         sub_id_counter += len(sub_id_list)

     # The original format string had three placeholders for two values and
     # raised TypeError here; the stray trailing placeholder was dropped.
     print("scrapy %s subs found from the fan id %s " %
           (sub_id_counter, fan_id))
     return sub_id_counter
Exemplo n.º 2
0
    def get_sub_fan(self, sub_id):
        """Return the ids of all fans of ``sub_id``, fetched page by page.

        The first page gives the total fan count, from which the number of
        pages is derived; the remaining pages are then fetched sequentially.

        Args:
            sub_id: id of the account whose fans are listed.

        Returns:
            A list with the ids gathered from every page that could be
            downloaded, or ``None`` when the first page cannot be fetched.
        """
        sub_fan_url = self.getsubfanurl(sub_id)
        resp = func.GetHttpContent(sub_fan_url)
        if resp is None:
            func._print("failed get html .The url = %s \n" % (sub_fan_url))
            return None

        all_fan_id_list = []

        # get_uid_from_tudou_response presumably appends the page's uids to
        # the list and returns the total count -- TODO confirm.
        fan_id_list = []
        ttl_fans_num = self.get_uid_from_tudou_response(resp,
                                                        fan_id_list,
                                                        pagenum=1)
        all_fan_id_list.extend(fan_id_list)

        # Ceiling division. ``//`` keeps the result an int on Python 3 too,
        # where ``/`` would yield a float and break range() below.
        page_ttl = ttl_fans_num // max_num_per_page
        if ttl_fans_num % max_num_per_page > 0:
            page_ttl += 1

        for pagenum in range(2, page_ttl + 1):  # remaining pages, if any
            sub_fan_url = self.getsubfanurl_page(sub_id, pagenum)
            resp = func.GetHttpContent(sub_fan_url)
            if resp is None:
                # Best effort: skip pages that fail to download.
                func._print("failed get html .The url = %s \n" % (sub_fan_url))
                continue
            fan_id_list = []
            self.get_uid_from_tudou_response(resp, fan_id_list, pagenum)
            all_fan_id_list.extend(fan_id_list)

        print("the number of fans found from sub id %s: %s " %
              (sub_id, len(all_fan_id_list)))
        return all_fan_id_list
Exemplo n.º 3
0
    def get_fans(self, sub_id):
        """Return the ids of all fans of ``sub_id``.

        The first page is fetched directly because the page count is only
        known after reading it; the URLs of the remaining pages are then
        handed to ``mt.runMT`` together with ``get_tudou_json`` and
        ``store_to_list``.

        Args:
            sub_id: id of the account whose fans are listed.

        Returns:
            The list of fan ids, or ``None`` when the first page cannot be
            fetched.
        """
        # The page count has to be read first, so this request cannot be
        # multi-threaded.
        sub_fan_url = self.getsubfanurl(sub_id)
        resp = func.GetHttpContent(sub_fan_url)
        if resp is None:
            func._print("failed get html .The url = %s \n" % (sub_fan_url))
            return None

        all_fan_id_list = []

        # get_uid_from_tudou_response presumably appends the page's uids to
        # the list and returns the total count -- TODO confirm.
        fan_id_list = []
        ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
        all_fan_id_list.extend(fan_id_list)

        # Ceiling division. ``//`` keeps the result an int on Python 3 too,
        # where ``/`` would yield a float and break range() below.
        page_ttl = ttl_fans_num // max_num_per_page
        if ttl_fans_num % max_num_per_page > 0:
            page_ttl += 1

        if page_ttl > 1:
            # URLs of pages 2..page_ttl.
            sub_fan_url_list = [
                self.getsubfanurl_page(sub_id, pagenum)
                for pagenum in range(2, page_ttl + 1)
            ]
            # presumably runMT runs get_tudou_json over the URLs and passes
            # each result to store_to_list with all_fan_id_list -- verify
            # against mt.runMT's signature.
            mt.runMT("get_fans", get_tudou_json, sub_fan_url_list, False, None,
                     store_to_list, all_fan_id_list)
        return all_fan_id_list
Exemplo n.º 4
0
def extract(the_url, _xpath_list):
    """Download ``the_url`` and print a numbered table of paired XPath matches.

    The nodes matched by ``_xpath_list[0]`` and ``_xpath_list[1]`` are paired
    positionally and the ``.text`` of each pair is printed on one line.

    Args:
        the_url: address of the page to download.
        _xpath_list: sequence whose first two entries are XPath expressions;
            the first selects the name nodes, the second the price nodes.

    Returns:
        None. Nothing is printed when the page cannot be downloaded or the
        response is empty.
    """
    try:
        html = func.GetHttpContent(the_url)
    except Exception:
        # Best effort: a failed download means there is nothing to print.
        html = None

    # Covers both None and an empty body; the original only tested for None,
    # so a swallowed exception still reached the parser with an empty string.
    if not html:
        return

    tree = etree.HTML(html)
    names = tree.xpath(_xpath_list[0])
    prices = tree.xpath(_xpath_list[1])

    print(the_url)
    # Iterate the pairs directly instead of through a dict so that rows are
    # numbered in document order (dict order is arbitrary on Python 2).
    for item_no, (goods_name, price) in enumerate(zip(names, prices), 1):
        print("%-3s %-30s \t %s" % (item_no, goods_name.text, price.text))
Exemplo n.º 5
0
def get_tudou_json(_url):
    """Fetch ``_url`` and return the ``uid`` of every entry in its JSON body.

    The response is parsed as JSON and the ``uid`` field of each element of
    ``data.dataList`` is collected.

    Args:
        _url: address of the JSON endpoint.

    Returns:
        A list of uids, or ``None`` when the request returned nothing.

    Raises:
        ValueError: if the response is not valid JSON.
        KeyError: if ``data``, ``dataList`` or an entry's ``uid`` is missing.
    """
    resp = func.GetHttpContent(_url)
    if resp is None:
        func._print("failed get html .The url = %s \n" % (_url))
        return None

    jsn = json.loads(resp)
    # The unused read of jsn['data']['total'] was dropped, so a response
    # without that key no longer raises KeyError.
    return [each_data['uid'] for each_data in jsn['data']['dataList']]
Exemplo n.º 6
0
def get_html_func(_url):
    """Return the content fetched from ``_url``; log and return None on failure."""
    content = func.GetHttpContent(_url)
    if content is not None:
        return content
    # Nothing came back: report the failing URL, then hand back the None.
    func._print("failed get html .The url = %s \n" % (_url))
    return content
Exemplo n.º 7
0
    #print json_txt
    jsn = json.loads(resp) 
    ttl_num = jsn['data']['total']
    for each_data in  jsn['data']['dataList']:
        id = each_data['uid'] 
        id_list.append(id)                   
    return id_list
      

# Sample endpoints used for manual probing of the recommendation services.
td_first_video_url = 'http://www.tudou.com/programs/view/KCByQcmiCHc/'

td = 'http://tdrec.youku.com/tjpt/tdrec?encode=utf-8&count=20&juid=019fd7i8t9vb9&pcode=20000300&userid=68259393&itemid=202693301&_=1432960070475'
td_url = 'http://tdrec.youku.com/tjpt/tdrec?encode=utf-8&count=20&itemid=202693301&pcode=20000300'

yk = 'http://ykrec.youku.com/video/packed/list.json?guid=1425299560757n5F&vid=227687152&sid=0&cate=90&apptype=1&pg=1&module=1&pl=20&needTags=1&atrEnable=true&callback=RelationAsync.videoCallback&uid=59780527&t=0.028587819542735815'

# Fetch the Youku endpoint, dump the raw response, then print the text of the
# first node matched by each XPath below.
resp = func.GetHttpContent(yk)
print(resp)
if resp is not None:
    id_name_xpath = '//*[@id="tjptList"]/li[1]/div[2]/h6/a'
    play_times_xpath = '//*[@id="tjptList"]/li[1]/div[2]/p[2]'
    yk_xpath = '//*[@id="relationvideo_async"]/div/div[1]/div[3]/div[1]/a'
    tree = etree.HTML(resp)

    for _xpath in (id_name_xpath, play_times_xpath):
        r_list = tree.xpath(_xpath)
        # A matched element without text has .text == None; skip it instead
        # of failing on .strip().
        if r_list and r_list[0].text is not None:
            print(r_list[0].text.strip())