def parse_comment(self, goodid, callback=None, meta=None, keyword=None, goodsname=None, client=None):
    """Page through the comments of one product id, invoking *callback* per page.

    Args:
        goodid: product id whose comments are fetched.
        callback: per-page fetcher; defaults to ``self.comment_detail``.
        meta, keyword, goodsname, client: passed through to *callback* unchanged.

    The loop runs while ``self.comment_switch[goodid]`` is truthy (missing key
    defaults to True, so a fresh goodid starts enabled); the callback is expected
    to flip that switch off when pages run out.  In update mode
    (``self.update_status``) at most ``self.maxpage`` pages are fetched.
    """
    page_index = 0
    if callback is None:
        callback = self.comment_detail
    # comment_switch.get(goodid) defaults to True for unseen ids
    while self.comment_switch.get(goodid, True):
        # lazy %-args: formatting only happens if the record is emitted
        logging.info('Fetching %s comments, Page %s,begin', goodid, page_index + 1)
        callback(goodid=goodid, page=page_index, callback=self.comment_detail,
                 meta=meta, keyword=keyword, goodsname=goodsname, client=client)
        logging.info('Fetching %s comments, Page %s,done', goodid, page_index + 1)
        page_index += 1
        # In update mode, look at no more than self.maxpage pages.
        if self.update_status and page_index >= self.maxpage:
            break
def __init__(self):
    """Open a MySQL connection using the module-level ``mysqlconfig`` dict.

    On failure the exception is only logged and swallowed, so ``self.conn``
    is left unset and later use raises AttributeError.
    NOTE(review): consider re-raising instead of continuing without a connection.
    """
    try:
        self.conn = pymysql.connect(**mysqlconfig)
    except Exception as e:
        logging.error(
            'Fatal Error :Mysql connect get an Fatal Error : %s' % e)
    else:
        logging.info('Mysql connect success')
def __init__(self):
    """Connect to MongoDB and bind the ``eshop.reviews_JD6`` collection.

    Host/port come from the module-level ``mongohost`` / ``mongoport``.
    A failed connection is logged and swallowed, leaving ``self.collection``
    unset.  NOTE(review): MongoClient connects lazily, so this try/except
    may not actually catch connectivity errors — they surface on first use.
    """
    try:
        self.mconn = pymongo.MongoClient(mongohost, int(mongoport))
        self.db = self.mconn['eshop']
        self.collection = self.db['reviews_JD6']
    except Exception as e:
        logging.error(
            'Fatal Error :Mongodb connect get an Fatal Error : %s' % e)
    else:
        logging.info('Mongodb connect success!')
def insert(self, doc):
    """Persist *doc* into the bound Mongo collection.

    A dict goes through ``insert_one``; a list/set/tuple of documents goes
    through ``insert_many``.  Any other type is silently ignored.  Insert
    failures are logged at info level and never propagated.
    """
    if isinstance(doc, dict):
        try:
            self.collection.insert_one(doc)  # single document
        except Exception as e:
            logging.info('insert Failed,{}'.format(e))
    elif isinstance(doc, (list, set, tuple)):
        try:
            self.collection.insert_many(doc)  # bulk insert
        except Exception as e:
            logging.info('insert Failed,{}'.format(e))
def run(self):
    """Drive the search crawl for this keyword, one half-page per iteration.

    Returns immediately when ``self.lock`` equals False; otherwise keeps
    calling ``self.__search`` with increasing page numbers until something
    (inside the search callback chain) clears ``self.switch``.
    """
    if self.lock == False:
        print('{}爬取结束'.format(self.keyword2))
        return
    page = 0
    while self.switch:
        page += 1
        prefix = self.goodsname + ' ' + self.keyword + ': '
        logging.info(prefix + 'Page {} start..'.format(page))
        self.__search(page=page, callback=self.parse_comment)
        logging.info(prefix + 'Page {} crawled..'.format(page))
        time.sleep(1)  # throttle between search pages
        print(self.keyword + '第%s页' % page)
    print('%s 爬取完成' % self.keyword)
def __search(self, page=1, callback=None, meta=None):
    '''
    Fetch one half-page of JD search results for self.keyword and dispatch
    each not-yet-seen matching product to *callback*.

    page: which half-page to fetch (JD paginates in 30-item half pages;
    the "s" offset below is derived from it).  Hard cap at 200 half pages.
    '''
    # Pagination guard: never go past half-page 200.
    if page >= 200:
        return
    refer = self.refer
    url2 = 'https://search.jd.com/s_new.php'
    headers = {}
    headers['Referer'] = refer  # JD's ajax endpoint requires a Referer
    headers.update(self.headers)
    # Query parameters for the s_new.php half-page ajax endpoint.
    data2 = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'qrst': '1',
        'rt': '1',
        'stop': '1',
        'vt': '2',
        'wq': self.keyword,
        'page': page,
        's': (page - 1) * 30 + 1,  # 1-based item offset of this half page
        'scrolling': 'y',
        'log_id': time.time(),
        'tpl': '1_M',
    }
    try:
        # One fresh proxy per request; failures abort this page only.
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url2, params=data2, headers=headers, proxies=proxies)
    except Exception as e:
        logging.error('Fatal error:' + url2 + 'downloaded fail')
        return
    logging.info('status code : {}'.format(resp.status_code))
    result = resp.text
    html = etree.HTML(result)
    items = html.xpath(r'//li[@class = "gl-item"]')
    length = len(items)
    # An empty result page means the listing is exhausted: stop the crawl loop.
    if length == 0:
        self.switch = False
    for item in items:
        temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
        if len(temp_url) > 0:
            # Extract the product id from the item URL via self.pattern.
            _ = re.findall(self.pattern, temp_url[0])
            if len(_) > 0:
                url = self.urlmoduel.format(_[0])
                goodid = _[0]
            else:
                continue
            pass
        else:
            continue
        # Re-serialize the item node so the title can be cleaned of <font> tags.
        # NOTE(review): chardet-based re-decode may be fragile for completeness.
        res = etree.tostring(item)
        cod = chardet.detect(res).get("encoding")
        res = res.decode(cod)
        # Reject titles where the keyword is immediately followed by a letter
        # (e.g. matching "X24" when searching for "X2").
        reT = self.keyword2 + '[a-zA-Z]'
        res = re.sub(r'<font.+?>', '', res)
        res = re.sub(r'</font>', '', res)
        tres = etree.HTML(res)
        tres = tres.xpath(r'//a/em/text()')  # product title text
        if len(tres):
            res = tres[0]
        else:
            print('空')
            continue
        print(res)
        if re.search(reT, res, re.S):
            logging.info('Invalid Match ')
            continue
        # Title must contain the exact keyword and the category word.
        if self.keyword2 not in res:
            continue
        if '显示器' not in res:
            continue
        else:
            logging.info('{}'.format(goodid))
            print(res)
        # Dedupe: skip product ids that were already crawled.
        if goodid in self.setfilter:
            continue
        else:
            self.setfilter.add(goodid)
            print(goodid)
            callback(goodid=goodid, callback=self.comment_detail)
            '''break # 必须删除,调试的时候使用
def jd():
    """Flush the global ``temp`` review list into MySQL via REPLACE INTO.

    Builds one VALUES tuple per review dict, then inserts in chunks of
    20000 rows through a fresh MysqlPipeline per chunk.
    NOTE(review): SQL is built by string interpolation with hand-rolled
    quote escaping — injection-prone; parameterized queries would be safer.
    NOTE(review): the re.sub calls raise TypeError when "content" is None;
    assumes every review has a string content — confirm upstream.
    """
    sqllist = []
    sql0 = 'replace into eshop (MD5_id,content,creationTime,score,referenceName,referenceTime,source) VALUES '
    for i in range(len(temp)):
        _ = {}
        _["MD5_id"] = temp[i].get("md5_id", None)
        _["content"] = temp[i].get("content", None)
        _["creationTime"] = temp[i].get("creationTime", None)
        _["score"] = temp[i].get("score", None)
        _["referenceName"] = temp[i].get("referenceName", None)
        _["referenceTime"] = temp[i].get("referenceTime", None)
        _["source"] = temp[i].get("source", None)
        # Escape single/double quotes so the literal survives SQL quoting.
        _["content"] = re.sub(r"\'", "\\\'", _["content"])
        _["content"] = re.sub(r'\"', '\\\"', _["content"])
        if _["MD5_id"] is None:
            logging.info("MD5_id is none:")
            continue
        else:
            # One "(v1,v2,...)" tuple; score is intentionally unquoted (numeric).
            sqltemp = "({},{},{},{},{},{},{})".format(
                "'{}' ".format(_["MD5_id"]),
                "'{}' ".format(_["content"]),
                "'{}' ".format(_["creationTime"]),
                "{}".format(_["score"]),
                "'{}' ".format(_["referenceName"]),
                "'{}' ".format(_["referenceTime"]),
                "'{}' ".format(_['source']))
            sqllist.append(sqltemp)
        # Progress marker every 10000 rows (also fires at i == 0).
        if i % 10000 == 0:
            print('#')
            time.sleep(1)
    # Insert in slices to keep each statement below ~20000 rows.
    length = len(sqllist)
    first = 0
    last = 0
    while first < length:
        last = first + 20000
        if last > length:
            # Final (partial) slice: insert the remainder and stop.
            last = -1
            sqllisttemp = sqllist[first:]
            sql = sql0 + '\n,'.join(sqllisttemp)
            mysqlconn = MysqlPipeline()
            try:
                s = mysqlconn.run(sql)
                print(s)
                print('结束')
            except Exception as e:
                print(e)
            break
        else:
            # Full 20000-row slice; advance the window on success or failure.
            sqllisttemp = sqllist[first:last]
            sql = sql0 + '\n,'.join(sqllisttemp)
            mysqlconn = MysqlPipeline()
            try:
                s = mysqlconn.run(sql)
                print(s)
            except Exception as e:
                print(e)
            print(first, '~', last)
            first = last