def gen_start_urls():
    """
    Yield the item-page URL of every id listed in ``ITEM_ID_PATH`` that has
    no ``Item`` row yet.

    The file is read line by line, one integer item id per line. Blank
    lines are skipped instead of aborting the whole run with a
    ``ValueError``.

    :return: generator of ``https://item.taobao.com/item.htm?id=<id>`` URLs
    :raises ValueError: if a non-blank line is not an integer
    """
    with open(ITEM_ID_PATH) as file:
        for line in file:
            line = line.strip()
            if not line:
                # int('') would raise; a blank line carries no id
                continue
            item_id = int(line)
            # Only crawl items that are not stored in the database yet
            query = session.query(Item).filter_by(id=item_id)
            if not session.query(query.exists()).scalar():
                yield 'https://item.taobao.com/item.htm?id=' + str(item_id)
def create_train_test(train_pos_path=TRAIN_POS_PATH,
                      train_neg_path=TRAIN_NEG_PATH,
                      test_pos_path=TEST_POS_PATH,
                      test_neg_path=TEST_NEG_PATH):
    """
    Build train and test text samples from every non-default review with
    non-empty content, keeping the number of positive and negative samples
    equal.

    Reviews are split by ``Rate(rate).is_good``. Both classes are
    down-sampled to the size of the smaller one; 80% of each goes to the
    train file and the rest to the test file.

    Sampling is done WITHOUT replacement, so a review is never duplicated
    and can never appear in both the train and the test set. Each review is
    written followed by a newline so consecutive reviews do not run
    together in the output file.

    :param train_pos_path: output file for positive training samples
    :param train_neg_path: output file for negative training samples
    :param test_pos_path: output file for positive test samples
    :param test_neg_path: output file for negative test samples
    """
    # Imported locally so this block does not depend on the file's imports
    from random import sample

    pos, neg = [], []
    query = session.query(Review.content, Review.rate).filter(
        Review.content != '')
    for content, rate in Review.filter_default(query):
        if Rate(rate).is_good:
            pos.append(content)
        else:
            neg.append(content)

    size = min(len(pos), len(neg))
    size_train = int(size * 0.8)
    # sample() both balances the classes and shuffles them
    pos = sample(pos, size)
    neg = sample(neg, size)

    for data, path in ((pos[:size_train], train_pos_path),
                       (neg[:size_train], train_neg_path),
                       (pos[size_train:], test_pos_path),
                       (neg[size_train:], test_neg_path)):
        with codecs.open(path, 'w', 'utf-8') as file:
            # writelines() adds no separators, so terminate each review
            file.writelines(content + '\n' for content in data)
def main():
    """
    Run ``eval_classify`` over every non-default review and print running
    accuracy statistics after each batch of 100 reviews.

    The printed counters are the module-level ``total_contents`` and
    ``total_correct`` (presumably updated by ``eval_classify`` - confirm
    against its definition).
    """
    reviews = Review.filter_default(session.query(Review))
    for seen, review in enumerate(reviews, start=1):
        eval_classify(review.content, review.rate)
        if seen % 100 != 0:
            continue
        # Progress report once per 100 processed reviews
        print('total_contents =', total_contents)
        print('total_correct =', total_correct)
        print('correct_rate =', total_correct / total_contents)
def start(self): print('认为有用则按Y,认为没用则按N:\n') # 防止matplotlib阻塞 plt.ion() for item in (session.query(Item).filter( Item.reviews.any(Review.is_useful.is_(None)))): # 画评价数量-时间图 dates, good_bars, bad_bars = draw_plot.draw_rate_time_plot( item.reviews) for review in item.reviews: if review.is_useful is not None: continue # if review.is_default(): # review.is_useful = False # session.commit() # continue # 显示评论 print('用户信用等级:', review.user_rank) try: print('评价:', Rate(review.rate).name) except ValueError: print('评价: 未知({})'.format(review.rate)) print('内容:', review.content) if review.appends: print('追评:', review.appends) print('有图片' if review.has_photo else '无图片') cur_date_bar = original_color = None if review.date is not None: print('时间:', review.date.isoformat()) index = (review.date.date() - dates[0]).days cur_date_bar = (good_bars[index] if review.is_good else bad_bars[index]) original_color = cur_date_bar.get_facecolor() cur_date_bar.set_color('r') else: print('时间: 未知') plt.show() # 输入是否有用 self._pressed_key = '' while self._pressed_key not in ('y', 'n'): self._canvas.start_event_loop() if self.stop: return print(self._pressed_key) print('') review.is_useful = self._pressed_key == 'y' session.commit() if cur_date_bar is not None: cur_date_bar.set_color(original_color) plt.cla()
def start_requests(self):
    """
    Yield one sold-quantity request for every item whose ``sold_count``
    has not been stored yet.

    Each request targets the ``sib.htm`` detail endpoint with the item id
    and the seller id of the item's shop, and carries the item page as its
    ``referer`` header. ``dont_filter=True`` is passed to every request.
    """
    url_template = ('https://detailskip.taobao.com/service/getData/1/p1'
                    '/item/detail/sib.htm?itemId={}&sellerId={}&modules'
                    '=soldQuantity&callback=onSibRequestSuccess')
    pending_items = session.query(Item).filter(Item.sold_count.is_(None))
    for item in pending_items:
        target = url_template.format(item.id, item.shop.seller_id)
        referer = 'https://item.taobao.com/item.htm?id={}'.format(item.id)
        yield Request(target, dont_filter=True,
                      headers={'referer': referer})
def parse(self, response):
    """
    Store the sold / confirmed counts from a sold-quantity response.

    The item id is taken from the ``itemId`` query parameter of the
    response URL. The body is a JSONP document; the text between the first
    ``{`` and the last ``}`` is decoded as JSON and
    ``data.soldQuantity.soldTotalCount`` / ``confirmGoodsCount`` are
    written to the matching ``Item`` row and committed.

    The URL is validated before the body is parsed, so a response without
    an item id is ignored without touching the payload. An item id with no
    database row is logged and skipped instead of failing with an
    ``AttributeError`` on ``None``.

    :param response: response object exposing ``text`` and ``url``
    :raises ValueError: if the payload is not valid JSON
    :raises KeyError: if the expected keys are missing from the payload
    """
    match = re.search(r'itemId=(\d+)', response.url)
    if not match:
        return
    # The pattern only matches digits, so the conversion cannot fail
    item_id = int(match[1])

    # Strip the JSONP wrapper: keep the outermost {...} only
    text = response.text
    data = json.loads(text[text.find('{'):text.rfind('}') + 1])
    sold_quantity = data['data']['soldQuantity']

    item = session.query(Item).filter_by(id=item_id).first()
    if item is None:
        # first() returns None when no row matches
        self.logger.warning('No Item row for id %s; skipping', item_id)
        return
    item.sold_count = sold_quantity['soldTotalCount']
    item.confirm_count = sold_quantity['confirmGoodsCount']
    session.commit()
def parse_shop(self):
    """
    Read seller, shop and item data from the current page and add them to
    the database session.

    The page data comes from the ``g_config.idata`` JavaScript object. A
    ``Seller`` and a ``Shop`` are added only when no row with the same id
    exists; an ``Item`` is always added. The sold and confirmed counts are
    parsed from the title of the sell-counter link and default to 0 when
    the title does not match. ``self.item_id`` is set to the parsed item
    id. Nothing is committed here.

    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and can stop the crawler.

    :return: ``True`` on success, ``False`` if any step raised (the error
        is logged with its traceback)
    """
    try:
        data = self.driver.execute_script('return g_config.idata')

        # Seller
        seller_id = int(data['seller']['id'])
        query = session.query(Seller).filter_by(id=seller_id)
        if not session.query(query.exists()).scalar():
            session.add(Seller(id=seller_id, age=data['seller']['shopAge']))

        # Shop
        shop_id = int(data['shop']['id'])
        query = session.query(Shop).filter_by(id=shop_id)
        if not session.query(query.exists()).scalar():
            session.add(
                Shop(id=shop_id, url=data['shop']['url'],
                     seller_id=seller_id))

        # Item
        self.item_id = int(data['item']['id'])
        sell_counter = self.driver.find_element_by_css_selector(
            'div.tb-sell-counter a')
        match = re.search(r'售出(\d+)件.*?成功(\d+)件',
                          sell_counter.get_attribute('title'))
        if match is not None:
            sold_count, confirm_count = int(match[1]), int(match[2])
        else:
            # Title without the expected counts: store zeros
            sold_count = confirm_count = 0
        session.add(
            Item(id=self.item_id, title=data['item']['title'],
                 shop_id=shop_id, sold_count=sold_count,
                 confirm_count=confirm_count))
    except Exception:
        self.logger.exception('解析商店时出错:')
        return False
    return True
def create_train_test(train_pos_path=TRAIN_POS_PATH,
                      train_neg_path=TRAIN_NEG_PATH,
                      test_pos_path=TEST_POS_PATH,
                      test_neg_path=TEST_NEG_PATH):
    """
    Build train and test feature samples, keeping the number of positive
    and negative samples equal.

    Every labelled review with a known date, from items that have at least
    one labelled review, becomes one feature row:
    ``[user_rank, text length, has_photo, has_appends, diff]``, where
    ``diff`` is the matching entry of ``get_diffs(item.reviews)``. Items
    for which ``get_diffs`` returns nothing are skipped. Rows are split
    into positive (``is_useful`` true) and negative.

    Both classes are down-sampled to the size of the smaller one WITHOUT
    replacement, so a row is never duplicated and can never appear in both
    the train and the test set. 80% of each class is pickled to the train
    file and the rest to the test file.

    :param train_pos_path: output pickle for positive training samples
    :param train_neg_path: output pickle for negative training samples
    :param test_pos_path: output pickle for positive test samples
    :param test_neg_path: output pickle for negative test samples
    """
    # Imported locally so this block does not depend on the file's imports
    from random import sample

    pos, neg = [], []
    for item in (session.query(Item).filter(
            Item.reviews.any(Review.is_useful.isnot(None)))):
        diffs = get_diffs(item.reviews)
        if not diffs:
            continue
        for review, diff in zip(item.reviews, diffs):
            if (review.is_useful is None  # not labelled
                    or review.date is None  # unknown date
                    ):
                continue
            features = [
                review.user_rank,  # user credit rank
                # review length; a missing text counts as length 0
                len(review.content or '') + len(review.appends or ''),
                review.has_photo,  # has a photo
                bool(review.appends),  # has follow-up review
                diff,  # review-count difference
            ]
            if review.is_useful:
                pos.append(features)
            else:
                neg.append(features)

    size = min(len(pos), len(neg))
    size_train = int(size * 0.8)
    # sample() both balances the classes and shuffles them
    pos = sample(pos, size)
    neg = sample(neg, size)

    for data, path in ((pos[:size_train], train_pos_path),
                       (neg[:size_train], train_neg_path),
                       (pos[size_train:], test_pos_path),
                       (neg[size_train:], test_neg_path)):
        with open(path, 'wb') as file:
            pickle.dump(data, file)
def draw_plot_per_item(draw_func, plots_dir=PLOTS_DIR):
    """
    Draw one plot per item and save it as a PNG file.

    The file is named ``"<id> <title>.png"`` after passing through
    ``replace_illegal_chars``. Items whose file already exists are not
    drawn again. The axes are cleared after every saved plot.

    :param draw_func: plotting function, called with the item's reviews
    :param plots_dir: directory the images are saved to
    """
    for product in session.query(Item):
        print(product.id, product.title)
        image_name = replace_illegal_chars(
            '{} {}.png'.format(product.id, product.title))
        target = '/'.join((plots_dir, image_name))
        if not exists(target):
            draw_func(product.reviews)
            plt.savefig(target)
            plt.cla()