Exemplo n.º 1
0
def getsoup(request):
    """Handle the scraping form and render ``beautiful_soup.html``.

    On a valid POST the submitted ``url`` is handed to
    ``getResource.grab_360buy_bag_m`` together with a per-day directory
    ``<MEDIA_ROOT>/jd360/<YYYYMMDD>`` (created on demand), and the template
    is rendered with the form and that directory as ``ans``.  On an invalid
    POST the bound form is re-rendered; for any other request method an
    unbound form pre-filled with a sample listing URL is rendered.
    """
    if request.method == 'POST':
        form = SoupForm(request.POST)
        if form.is_valid():
            url = form.cleaned_data['url']
            print(url)
            # The source site is hard-coded, so only the 360buy branch is
            # reachable at the moment.
            website = '360buy'
            # Bound up front so the final render never hits an unbound name,
            # whichever branch ran.
            img_root = None
            if website == 'taobao':
                # NOTE(review): this branch used ``localfile`` and
                # ``path_img`` without defining them.  The paths below are
                # reconstructed from the previously commented-out code --
                # TODO confirm before enabling taobao support.
                rd = getRandom.getUUID()
                base_dir = os.path.dirname(__file__)
                path_img = os.path.join(
                    base_dir, '..', 'imgdb', 'taobao_' + rd + '.jpg')
                localfile = os.path.join(
                    base_dir, '..', 'imgdb', 'url_' + rd + '.txt')
                getResource.grabHref(url, localfile)
                json_data = json.loads(taobao_lib.get_json(url))
                # Each item also carries 'currentPrice', 'fullTitle' and
                # 'storeLink'; only the image is used here.
                # NOTE(review): every image is written to the same
                # ``path_img`` -- confirm whether that is intended.
                for item in json_data['itemList']:
                    saveImg.saveImg(item['image'], path_img)
            elif website == '360buy':
                print(settings.MEDIA_ROOT)
                # os.path.join copes with MEDIA_ROOT with or without a
                # trailing separator.
                img_root = os.path.join(
                    settings.MEDIA_ROOT, 'jd360', time.strftime('%Y%m%d'))
                # makedirs also creates the missing 'jd360' parent; the
                # except clause tolerates a concurrent request creating the
                # directory first.
                try:
                    os.makedirs(img_root)
                except OSError:
                    if not os.path.isdir(img_root):
                        raise
                # NOTE(review): if grab_360buy_bag_m derives its output
                # directory via os.path.dirname() of this argument, images
                # land in the parent of img_root -- confirm.
                getResource.grab_360buy_bag_m(url, img_root)
                print('img_root-----------')
                print(img_root)
            return render_to_response(
                'beautiful_soup.html', {'form': form, 'ans': img_root})
    else:
        form = SoupForm(
            initial={'url': 'http://list.jd.com/1672-2576-5262.html'})
    return render_to_response('beautiful_soup.html', {'form': form})
Exemplo n.º 2
0
def grab_360buy_saveToModel(url, id_cate, id_s, localfile):
    """Scrape a listing page, download item images and store the items.

    Fetches ``url`` with a browser-like User-Agent, parses it as gb18030
    and, for every ``<li>`` under the first ``<div id="plist">`` that
    contains ``<div>`` children:

    * downloads the item image to ``<dir of localfile>/<index>.jpg``,
    * saves a ``models.Commidity`` and a ``models.Picture`` row,
    * appends a ``---``-separated summary line to ``localfile``.

    Args:
        url: Address of the listing page.
        id_cate: Primary key of the ``models.Category`` given to each item.
        id_s: Primary key of the ``models.Seller`` given to each item.
        localfile: Path of the summary file; it is opened in ``'w'`` mode,
            and its directory receives the downloaded images.

    Returns:
        Always ``True``.
    """
    request = urllib2.Request(
        url=url,
        headers={'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3' })
    response = urllib2.urlopen(request)
    # Close the connection even if reading fails.
    try:
        HTML_response = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(HTML_response, from_encoding="gb18030")

    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True

    tag_item_li = tag_div[0].find_all('li')
    # Every scraped item gets the same category and seller.
    m_cate = models.Category.objects.get(id=id_cate)
    m_s = models.Seller.objects.get(id=id_s)
    print(m_cate)
    print(type(m_s))

    # Loop-invariant: images go next to the summary file.
    path_dir = os.path.dirname(localfile)

    # ``with`` guarantees the file is closed if an item raises mid-loop.
    with open(localfile, 'w') as myfile:
        # The index counts every <li>, including skipped ones, so image
        # names stay aligned with the position on the page.
        for i, li in enumerate(tag_item_li, 1):
            div = li.find_all('div')
            if not div:
                print('it is empty of div.... f**k')
                continue

            print(str(i) + '........')
            # NOTE(review): positions 0/4/5 assume a fixed page layout; an
            # item with fewer than six <div>s raises IndexError -- confirm.
            p_img = div[0]
            p_name = div[4]
            p_price = div[5]

            # save img
            url_item = p_img.a['href']
            url_img = p_img.img['data-lazyload']
            path_img = os.path.join(path_dir, str(i) + '.jpg')
            saveImg.saveImg(url_img, path_img)

            # save to model (price is stored as a placeholder 0.0)
            name_text = str(p_name.a.contents)
            m_com = models.Commidity(url=url_item, price=0.0, name=name_text)
            m_com.categories = m_cate
            m_com.seller = m_s
            m_com.save()
            print(m_com)
            m_p = models.Picture(dir=path_dir, commidity=m_com.id)
            m_p.save()
            print(m_p)

            # summary line: image path --- name --- price image URL ---
            myfile.write(str(path_img) + '---')
            myfile.write(name_text + '---')
            myfile.write(str(p_price.img['data-lazyload']) + '---')
            myfile.write('\r\n')
    return True
Exemplo n.º 3
0
def grab_360buy(url, localfile):
    """Download the item images of a listing page.

    Builds a soup for ``url`` via ``makeSoup`` and, for every ``<li>`` under
    the first ``<div id="plist">``, takes the first ``<div>`` child and, if
    it contains an ``<img>``, saves that image's ``src`` to
    ``<dir of localfile>/<index>.jpg``.

    Args:
        url: Address of the listing page.
        localfile: Path whose directory receives the images; the file
            itself is not written.

    Returns:
        Always ``True``.
    """
    soup = makeSoup(url)
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True

    # Loop-invariant target directory.
    path_dir = os.path.dirname(localfile)

    # The index counts every <li>, including skipped ones, so file names
    # follow the item's position on the page.
    for i, li in enumerate(tag_div[0].find_all('li'), 1):
        div = li.find_all('div')
        if not div:
            print('it is empty of div.... f**k')
            continue

        # Only the first <div> (the image cell) is needed; the name and
        # price cells and the image's alt text were looked up but never
        # used, and those lookups could raise for no benefit.
        p_img = div[0]
        # debug
        print('-------------------------------')
        print(p_img)
        if p_img.img:
            url_img = p_img.img['src']
            path_img = os.path.join(path_dir, str(i) + '.jpg')
            saveImg.saveImg(url_img, path_img)
    return True
Exemplo n.º 4
0
def grab_360buy_bag_m(url, localfile):
    """Download listing images and record each one through ``saveDB``.

    Builds a soup for ``url`` via ``makeSoup``, collects every
    ``<a target="_blank">`` under the first ``<div id="plist">`` and, for
    each link that wraps an ``<img>``, saves the image to
    ``<dir of localfile>/<n>.jpg`` (``n`` counts saved images from 1) and
    calls ``saveDB.saveToImagedata`` with the image's alt text, the link
    target and the saved path.

    Args:
        url: Address of the listing page.
        localfile: Path whose directory (``os.path.dirname``) receives the
            images.

    Returns:
        Always ``True``.
    """
    soup = makeSoup(url)
    if not soup:
        print("It's empty!!!!! fuque.......")
        return True

    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True

    tag_div_a = tag_div[0].find_all('a', target='_blank')
    path_dir = os.path.dirname(localfile)
    print(len(tag_div_a))

    i = 0
    for a in tag_div_a:
        img = a.find('img')
        if not img:
            continue
        print('>>>>>>>>>>>>>>>>>>>>' + str(i))
        url_item = a['href']
        url_img = img.get('src')
        url_img2 = img.get('src2')
        desc = img['alt']

        # NOTE(review): links whose image carries BOTH src and src2 are
        # skipped -- kept as-is, confirm this is intended.
        if url_img and url_img2:
            continue
        # Fall back to src2 when src is missing.
        url_img = url_img or url_img2
        if not url_img:
            # Neither attribute present: nothing to download, so do not
            # pass None on to saveImg.
            continue

        i += 1
        path_img = os.path.join(path_dir, str(i) + '.jpg')
        saveImg.saveImg(url_img, path_img)
        # save to db
        # NOTE(review): an earlier comment listed the signature as
        # (cid, comid, price, desc, url, localfile, gender) -- seven names,
        # but six values are passed here; verify against saveDB.
        saveDB.saveToImagedata(1, 0, desc, url_item, path_img, '1')
    return True
Exemplo n.º 5
0
def getsoup(request):
    """Handle the scraping form and render ``beautiful_soup.html``.

    On a valid POST the submitted ``url`` is handed to
    ``getResource.grab_360buy_bag_m`` together with a per-day directory
    ``<MEDIA_ROOT>/jd360/<YYYYMMDD>`` (created on demand), and the template
    is rendered with the form and that directory as ``ans``.  On an invalid
    POST the bound form is re-rendered; for any other request method an
    unbound form pre-filled with a sample listing URL is rendered.
    """
    if request.method == 'POST':
        form = SoupForm(request.POST)
        if form.is_valid():
            url = form.cleaned_data['url']
            print(url)
            # The source site is hard-coded, so only the 360buy branch is
            # reachable at the moment.
            website = '360buy'
            # Bound up front so the final render never hits an unbound name,
            # whichever branch ran.
            img_root = None
            if website == 'taobao':
                # NOTE(review): this branch used ``localfile`` and
                # ``path_img`` without defining them.  The paths below are
                # reconstructed from the previously commented-out code --
                # TODO confirm before enabling taobao support.
                rd = getRandom.getUUID()
                base_dir = os.path.dirname(__file__)
                path_img = os.path.join(
                    base_dir, '..', 'imgdb', 'taobao_' + rd + '.jpg')
                localfile = os.path.join(
                    base_dir, '..', 'imgdb', 'url_' + rd + '.txt')
                getResource.grabHref(url, localfile)
                json_data = json.loads(taobao_lib.get_json(url))
                # Each item also carries 'currentPrice', 'fullTitle' and
                # 'storeLink'; only the image is used here.
                # NOTE(review): every image is written to the same
                # ``path_img`` -- confirm whether that is intended.
                for item in json_data['itemList']:
                    saveImg.saveImg(item['image'], path_img)
            elif website == '360buy':
                print(settings.MEDIA_ROOT)
                # os.path.join copes with MEDIA_ROOT with or without a
                # trailing separator.
                img_root = os.path.join(
                    settings.MEDIA_ROOT, 'jd360', time.strftime('%Y%m%d'))
                # makedirs also creates the missing 'jd360' parent; the
                # except clause tolerates a concurrent request creating the
                # directory first.
                try:
                    os.makedirs(img_root)
                except OSError:
                    if not os.path.isdir(img_root):
                        raise
                # NOTE(review): if grab_360buy_bag_m derives its output
                # directory via os.path.dirname() of this argument, images
                # land in the parent of img_root -- confirm.
                getResource.grab_360buy_bag_m(url, img_root)
                print('img_root-----------')
                print(img_root)
            return render_to_response(
                'beautiful_soup.html', {'form': form, 'ans': img_root})
    else:
        form = SoupForm(
            initial={'url': 'http://list.jd.com/1672-2576-5262.html'})
    return render_to_response('beautiful_soup.html', {'form': form})