def getSkuId(self, url): headers = {'User-Agent': self.userAgent} r = requests.get(url, headers=headers) if 200 != r.status_code: print 'Unable to get long URL for "', skuid, '" with an error (', r.status_code, '):\n', r.text return None url = getMatchString(r.content, r"hrl='(.*?)'") #print 'Long url:', url headers = {'User-Agent': self.userAgent} r = requests.get(url, headers=headers) if 200 != r.status_code: print 'Unable to get information page for "', skuid, '" with an error (', r.status_code, '):\n', r.text return None data = getMatchString(r.content, r'window._itemOnly = (.*?);') #print data obj = json.loads(data.decode('utf-8', 'ignore')) obj = obj.pop('item') skuId = obj.pop('areaSkuId') return skuId
def getSlogan(skuid):
    """Return the promotional slogan text found on a SKU's mobile page.

    The page URL is built from skuid and handed to Network.saveGetUrl
    together with the local data path; a negative result from that call
    yields None. Otherwise the file at that path is read and the text inside
    the first <div class="prod-act"> element (up to the next '<') is
    extracted with getMatchString.

    The result is returned as a unicode object; a byte string is converted
    with errors='ignore'. None is returned when nothing matches.
    """
    htmlPath = OutputPath.getDataPath(skuid, 'html')
    pageUrl = 'http://item.m.jd.com/product/{}.html'.format(skuid)

    if Network.saveGetUrl(htmlPath, pageUrl) < 0:
        return None

    with open(htmlPath) as htmlFile:
        html = htmlFile.read()

    text = getMatchString(html, r'<div class="prod-act">(.*?)<')
    if text is None or isinstance(text, unicode):
        return text

    # NOTE(review): no encoding is passed, so the interpreter's default
    # encoding is used and undecodable bytes are silently dropped - confirm
    # this is intended for non-ASCII slogans.
    return unicode(text, errors='ignore')
def getShareUrl(self, skuid): url = self.shareUrl.format(skuid) headers = {'User-Agent': self.userAgent} for retries in range(3): if retries is not 0: self.reset() time.sleep(1) if self.loginType is 1: if not self.plogin(retries): continue cookies = self.pCookies else: if not self.login(): continue cookies = self.cookies try: r = requests.get(url, cookies=cookies, headers=headers) except Exception as e: print 'Unable to get sharing URL for "', skuid, '" with an error:\n', e continue if 200 != r.status_code: print 'Unable to get sharing URL for "', skuid, '" with an error (', r.status_code, '):\n', r.text continue content = r.content.replace('\n', '') data = getMatchString(content, r'itemshare\((.*?)\)') obj = json.loads(data.decode('utf-8', 'ignore')) retCode = int(obj.pop('retCode')) if retCode is 0: return obj.pop('skuurl') elif 1000 == retCode: # Unlogin print 'Unlogin to get sharing URL for "', skuid, '" with an error (', retCode, '):\n', r.text continue print 'Unable to get sharing URL for "', skuid, '" with an error (', retCode, '):\n', r.text return None self.reset() return None
def getSlogan(self):
    """Return the slogan text for this object's SKU as unicode, or None.

    The page URL comes from Infor.SKU_MAIN_URL_TEMPLATE formatted with
    self.skuid. It is passed to Network.saveGetUrl with the local data path;
    a negative result yields None. Otherwise the file at that path is read
    and Infor.SLOGAN_PATTERN is applied through getMatchString.

    A byte-string match is converted to unicode with errors='ignore'; None
    is returned when nothing matches.
    """
    htmlPath = OutputPath.getDataPath(self.skuid, 'html')
    pageUrl = Infor.SKU_MAIN_URL_TEMPLATE.format(self.skuid)

    status = Network.saveGetUrl(htmlPath, pageUrl)
    if status < 0:
        return None

    with open(htmlPath) as htmlFile:
        html = htmlFile.read()

    text = getMatchString(html, Infor.SLOGAN_PATTERN)
    if text is None or isinstance(text, unicode):
        return text

    # NOTE(review): conversion relies on the interpreter's default encoding
    # and drops undecodable bytes - confirm this is intended.
    return unicode(text, errors='ignore')
def getShareUrl(self, skuid): if self.loginMethod is 1: self.plogin() cookies = self.pCookies else: self.login() cookies = self.cookies url = self.shareUrl.format(skuid) headers = {'User-Agent': self.userAgent} try: r = requests.get(url, cookies=cookies, headers=headers) except Exception as e: print 'Unable to get sharing URL for "', skuid, '" with an error:\n', e return None if 200 != r.status_code: print 'Unable to get sharing URL for "', skuid, '" with an error (', r.status_code, '):\n', r.text return None content = r.content.replace('\n', '') data = getMatchString(content, r'itemshare\((.*?)\)') obj = json.loads(data.decode('utf-8', 'ignore')) retCode = int(obj.pop('retCode')) if retCode is not 0: print 'Unable to get sharing URL for "', skuid, '" with an error (', r.status_code, '):\n', r.text # XXX: Reset but let this message failed because of less complicated logistic. It will re-login # when call the function again. self.reset() return None return obj.pop('skuurl')
def getImage(self):
    """Return the image URL extracted from this SKU's page, or None.

    The page URL comes from Infor.SKU_MAIN_URL_TEMPLATE formatted with
    self.skuid. It is passed to Network.saveGetUrl with the local data path;
    a negative result yields None. Otherwise the file at that path is read
    and searched in two steps: Infor.IMAGE_PATTERN locates a fragment, then
    Infor.MARK_PATTERN is applied to that fragment via getMatchString.

    None is returned when the first pattern does not match; otherwise the
    getMatchString result is returned as-is.
    """
    htmlPath = OutputPath.getDataPath(self.skuid, 'html')
    pageUrl = Infor.SKU_MAIN_URL_TEMPLATE.format(self.skuid)

    status = Network.saveGetUrl(htmlPath, pageUrl)
    if status < 0:
        return None

    with open(htmlPath) as htmlFile:
        html = htmlFile.read()

    fragment = re.search(Infor.IMAGE_PATTERN, html)
    if fragment is not None:
        # Only the matched fragment is searched for the mark pattern.
        return getMatchString(fragment.group(0), Infor.MARK_PATTERN)
    return None
def parseImpl(self, content):
    """Extract a '#...#' delimited tag from content.

    If content contains at least two '#' characters, the span from the first
    to the last '#' (inclusive) is returned unchanged. Otherwise each pattern
    in self.rules is tried in order through getMatchString; the first match
    that is non-empty after stripping whitespace is returned wrapped as
    '#<value>#'.

    Returns None when neither approach yields a tag.
    """
    start = content.find('#')
    end = content.rfind('#')

    # Compare positions by value. The previous identity check ('end is
    # start') failed for a lone '#' at an offset outside CPython's small-int
    # cache and returned just '#'. When no '#' is present both are -1.
    if 0 <= start < end:
        return content[start:end + 1]

    for rule in self.rules:
        value = getMatchString(content, rule)
        if value is None:
            continue
        value = value.strip()
        if value:
            return '#{}#'.format(value)

    return None