# -*- coding: utf-8 -*-
import os
import re

import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from ConfigParser import ConfigParser
class taobaommscrapy(object):

    def __init__(self):
        # Credentials and request headers are read from conf.ini;
        # self.headers is sent with every request in this session.
        conf = ConfigParser()
        conf.read('conf.ini')
        self.url = 'http://mm.taobao.com/json/request_top_list.htm?page='
        self.login_data = dict(conf.items('account'))
        self.headers = dict(conf.items('header'))
        self.ses = requests.Session()
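    # A minimal conf.ini sketch; the key names below are illustrative
    # assumptions (only the [account] and [header] section names are
    # required by __init__):
    #
    #   [account]
    #   username = your_taobao_name
    #   password = your_password
    #
    #   [header]
    #   User-Agent = Mozilla/5.0 (X11; Linux x86_64)
    #   Referer = http://mm.taobao.com/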
    def getPage(self, ipage):
        # Fetch one page of the ranking list; res.text is already
        # unicode, so no extra decode('utf-8') call is needed.
        try:
            res = self.ses.get(url=self.url + str(ipage), headers=self.headers)
            return res.text
        except requests.exceptions.RequestException:
            print 'open ' + self.url + str(ipage) + ' error'
    def getContent(self, html):
        # Collect each model's display name and profile link
        # from the <a class="lady-name"> anchors on the page.
        doc = pq(html)
        ladyname = []
        ladywebpage = []
        for data in doc('a.lady-name'):
            ladyname.append(pq(data).text())
            ladywebpage.append(pq(data).attr('href'))
        return ladyname, ladywebpage
    # The personal-domain link is rendered by JavaScript, so load the
    # profile page with PhantomJS via selenium and read it from the DOM.
    def getPersonallink(self, url):
        local_url = 'http:' + url
        driver = webdriver.PhantomJS(executable_path='/usr/bin/phantomjs')
        try:
            driver.get(local_url)
            doc = pq(driver.page_source)
            return doc('div.mm-p-domain-info li').find('span').eq(0).text()
        finally:
            driver.quit()
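    # PhantomJS is unmaintained; headless Chrome is a possible substitute
    # (a sketch, assuming chromedriver is installed and on PATH):
    #
    #   options = webdriver.ChromeOptions()
    #   options.add_argument('--headless')
    #   driver = webdriver.Chrome(chrome_options=options)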
    def getPersonalPage(self, url):
        local_url = 'http:' + url
        try:
            res = self.ses.get(url=local_url, headers=self.headers)
            return res.text
        except requests.exceptions.RequestException:
            print 'open PersonalPage error: ' + local_url
    def getAllimgurl(self, html):
        # Keep only protocol-relative srcs (e.g. //img.alicdn.com/...),
        # which get an 'http:' prefix later; srcs that already contain
        # 'http' are skipped.
        doc = pq(html)
        imgurl = []
        for data in doc('img'):
            string = pq(data).attr('src')
            if not string:
                continue
            if not re.search(r'http', string):
                imgurl.append(string)
        return imgurl
    def saveImg(self, imgurl, fileName):
        # Download one image and write it to disk in binary mode.
        try:
            res = self.ses.get(url=imgurl)
            with open(fileName, 'wb') as f:
                f.write(res.content)
        except requests.exceptions.RequestException:
            print 'open image url error: ' + imgurl
    def mkdir(self, path):
        # Create a directory for one model, unless it already exists.
        path = path.strip()
        if not os.path.exists(path):
            os.makedirs(path)
            return True
        else:
            print path + ' dir exists'
            return False
    def scrapyOnePage(self, ipage):
        # For every model on one ranking page: make a directory, resolve
        # the JS-rendered personal page, then download each image into it.
        html = self.getPage(ipage)
        ladyname, ladyurl = self.getContent(html)
        for i in range(len(ladyname)):
            self.mkdir(ladyname[i])
            personlink = self.getPersonallink(ladyurl[i])
            html = self.getPersonalPage(personlink)
            allImagUrl = self.getAllimgurl(html)
            k = 0
            for item in allImagUrl:
                imgurl = 'http:' + item.strip()
                k += 1
                fileName = ladyname[i] + '/' + str(k) + '.jpg'
                if os.path.exists(fileName):
                    print 'The ' + str(k) + ' picture for ' + ladyname[i] + ' already exists'
                else:
                    print 'The ' + str(k) + ' picture for ' + ladyname[i] + ' is saving'
                    self.saveImg(imgurl, fileName)
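# Several pages can be scraped in sequence (a usage sketch; the page
# range below is an arbitrary example):
#
#   MM = taobaommscrapy()
#   for page in range(1, 4):
#       MM.scrapyOnePage(page)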
if __name__ == '__main__':
    MM = taobaommscrapy()
    MM.scrapyOnePage(2)