示例#1
0
def getSubTopics(topic_id, max_retries=None):
    """Yield the sub-topics of a topic, paging through the listing endpoint.

    Each page is requested with a POST to ``subTopic_url`` through a proxy
    returned by ``rand_proxy()``. The ``msg`` entries of the JSON reply are
    joined into one string and scanned with ``subTopic_p``. The offset
    advances by 20 per page and iteration stops at the first page that
    produces no match.

    :param topic_id: id of the parent topic, interpolated into the request.
    :param max_retries: number of consecutive failed attempts tolerated for
        one page before giving up. ``None`` (the default) retries without
        limit, which is the historical behaviour.
    :return: generator over the items returned by ``subTopic_p.findall``.
    """
    offset = 0
    failures = 0

    while True:
        form_data = {'method': 'next', 'params': '{"topic_id": %s, "offset": %s, "hash_id": ""}' % (topic_id, offset)}
        # Only the network/parsing step is guarded; yielding happens outside
        # the try block so that errors raised while the consumer is handling
        # an item are never mistaken for a failed request.
        try:
            response = requests.post(url=subTopic_url, data=form_data, headers=requestHeader, proxies=rand_proxy())
            jr = json.loads(response.content.decode('utf-8'))
            # convert string array to string
            body = ''.join(jr['msg'])
        except Exception as e:
            # A 400 means that the request was malformed.
            # In other words, the data stream sent by the client to the server didn't follow the rules
            logger.error(e)
            logger.info('args -> topic_id: {0}, offset: {1}'.format(topic_id, offset))
            failures += 1
            if max_retries is not None and failures >= max_retries:
                logger.error('giving up -> topic_id: {0}, offset: {1}'.format(topic_id, offset))
                return
            # Retry the same offset (a new proxy is picked on every attempt).
            continue

        failures = 0
        items = subTopic_p.findall(body)
        if not items:
            break

        for item in items:
            yield item

        offset += 20
示例#2
0
文件: SBPR.py 项目: zzg2008/NeuRec
 def __init__(self, sess, dataset):
     """Configure the model from ``conf/SBPR.properties`` and *dataset*.

     Reads the ``hyperparameters`` section of the properties file, copies
     the dataset attributes the model uses, builds the per-user set of
     training items and loads the social data.

     :param sess: session object, stored on ``self.sess``.
     :param dataset: dataset object providing ``num_users``, ``num_items``,
         ``userids``, ``dataset_name`` and ``trainMatrix``.
     """
     parser = configparser.ConfigParser()
     parser.read("conf/SBPR.properties")
     self.conf = dict(parser.items("hyperparameters"))

     # (attribute name, config key, evaluate?) in the order they are read.
     # NOTE(review): evaluated entries go through eval(), so the properties
     # file must come from a trusted source.
     settings = (
         ("socialpath", "socialpath", False),
         ("learning_rate", "learning_rate", True),
         ("embedding_size", "embedding_size", True),
         ("learner", "learner", False),
         ("loss_function", "loss_function", False),
         ("topK", "topk", True),
         ("num_epochs", "num_epochs", True),
         ("reg_mf", "reg_mf", True),
         ("batch_size", "batch_size", True),
         ("verbose", "verbose", True),
     )
     for attr_name, conf_key, evaluate in settings:
         raw_value = self.conf[conf_key]
         setattr(self, attr_name, eval(raw_value) if evaluate else raw_value)

     # Dataset-derived attributes.
     self.dataset = dataset
     self.num_users = dataset.num_users
     self.num_items = dataset.num_items
     self.userids = self.dataset.userids
     self.dataset_name = dataset.dataset_name
     self.userouterids = self.userids.keys()

     # Map each row index of the CSR training matrix to the set of its
     # stored column indices.
     train_csr = self.dataset.trainMatrix.tocsr()
     self.train_dict = {}
     for user, row in enumerate(train_csr):
         self.train_dict[user] = set(row.indices)

     self.socialMatrix = self._get_social_data()
     self.userSocialItemsSetList = self._get_SocialItemsSet_sun()
     logger.info("init finished")
     self.sess = sess
示例#3
0
 def get_coin_price(self, symbol):
     """Refresh ``self.priceInfo[symbol]`` with asks/bids from the websocket.

     Ensures the websocket is connected, starts a thread running
     ``self.socket_recv`` and polls ``self.socketData`` every 0.1 s. After
     150 polls without data a ``ping`` is sent; if ``self.ping`` is set and
     the data is not ``'pong'`` the socket is closed and the method calls
     itself again. Otherwise the data is parsed as JSON and, when it has a
     ``data`` entry, the first element's ``asks`` and ``bids`` are stored
     as lists of lists of floats.

     :param symbol: key into ``self.priceInfo`` whose entry is updated.
     :return: None.
     """
     self.ws_connect()
     self.socketData = None
     # NOTE(review): ``self.socket_recv`` is already bound, so ``args=(self,)``
     # passes the instance a second time — confirm against its signature.
     # Presumably the thread stores the received message in self.socketData.
     threading.Thread(target=self.socket_recv, args=(self,)).start()
     i = 0
     # Wait for data: at most 150 polls of 0.1 s each.
     while not self.socketData:
         time.sleep(0.1)
         i += 1
         if i == 150:
             # Nothing arrived: send a ping and remember that we did.
             self.ping = True
             try:
                 self.ws.send(b"ping")
                 logger.info("ping.........")
             except Exception as e:
                 logger.info("ping exception,{}".format(e))
             # Presumably leaves time for the reply to arrive before the check below.
             time.sleep(1)
             break
     # NOTE(review): self.ping is only reset on this reconnect path; if a
     # previous call received 'pong', the flag appears to stay set — verify.
     if self.ping and self.socketData != 'pong':
         logger.warning("ping failed,reconnect!")
         self.ping = False
         self.ws.close()
         # Retry from scratch; the recursion depth is not bounded here.
         self.get_coin_price(symbol)
         return
     res = None
     try:
         res = json.loads(self.socketData)
     except Exception as e:
         # Unparseable payloads are logged and leave the price info untouched.
         logger.error("{} : {}".format(self.socketData, e))
     if res and res.get("data") is not None:
         data = res.get("data")[0]
         price_info = self.priceInfo[symbol]
         # Convert every element of every asks/bids entry to float.
         price_info["asks"] = list(map(lambda x: list(map(lambda d: float(d), x)), data["asks"]))
         price_info["bids"] = list(map(lambda x: list(map(lambda d: float(d), x)), data["bids"]))
示例#4
0
def test_model(model, dataset, num_thread=10):
    """Evaluate *model* on the test split of *dataset* and log the mean metrics.

    When ``dataset.splitter`` is ``"loo"`` the model is scored with
    ``evaluate_by_loo`` (HR, NDCG, AUC); otherwise with
    ``evaluate_by_foldout`` (Precision, Recall, MAP, NDCG, MRR).

    :param model: model to evaluate; ``model.topK`` is logged for fold-out.
    :param dataset: object providing ``splitter``, ``testMatrix`` and
        ``testNegatives``.
    :param num_thread: passed through to the evaluation function.
    :return: None; results are only logged.
    """
    started = time()
    # Derive the bare class name from the class repr.
    class_repr = str(model.__class__)
    model_name = class_repr.split(sep=".")[-1].replace("\'>", "")

    if dataset.splitter != "loo":
        fold_scores = evaluate_by_foldout(model, dataset.testMatrix,
                                          dataset.testNegatives, num_thread)
        (pres, recs, maps, ndcgs, mrrs) = fold_scores
        precision, recall, mean_ap, ndcg, mrr = [
            np.array(scores).mean()
            for scores in (pres, recs, maps, ndcgs, mrrs)
        ]
        elapsed = time() - started
        logger.info(
            "[model=%s][%.1fs]: [Test Precision = %.6f, Recall= %.6f, MAP= %.6f, NDCG= %.6f, MRR= %.6f][topk=%.4s]"
            % (model_name, elapsed, precision, recall, mean_ap, ndcg, mrr,
               model.topK))
        return

    loo_scores = evaluate_by_loo(model, dataset.testMatrix,
                                 dataset.testNegatives, num_thread)
    (hits, ndcgs, aucs) = loo_scores
    hr, ndcg, auc = [
        np.array(scores).mean() for scores in (hits, ndcgs, aucs)
    ]
    elapsed = time() - started
    logger.info(
        "[model=%s]: [Test HR = %.6f, NDCG = %.6f,AUC = %.6f] [Time=%.1fs]"
        % (model_name, hr, ndcg, auc, elapsed))
示例#5
0
 def login(self):
     """Log in with the stored cookie when there is one, else by e-mail."""
     if not self.cookie:
         logger.info("使用email登录")
         self.common_login()
         return

     logger.info("检测到cookie文件,直接使用cookie登录")
     self.cookie_login()
示例#6
0
def check_proxy(threads=10):
    """Run ``task`` in *threads* worker threads and wait for all of them.

    :param threads: number of worker threads to start.
    :return: None, once every worker has finished.
    """
    logger.info('start checking proxy')
    workers = [Thread(target=task, args=()) for _ in range(threads)]

    # Start every worker before joining any of them: joining inside the
    # start loop makes each thread finish before the next one begins, so
    # the workers would run one after another instead of concurrently.
    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()
示例#7
0
def main():
    """Store the sub-topics of every topic in the ``zhihu`` database.

    Connection settings and account credentials come from ``Config``; the
    account's cookie is merged into the shared ``requestHeader`` before
    crawling.
    """
    cfg = Config()
    mongo_host = cfg.get('mongo', 'host')
    mongo_port = int(cfg.get('mongo', 'port'))
    client = MongoClient(mongo_host, mongo_port)

    manager = AccountManager(cfg.get('zhihu', 'email'), cfg.get('zhihu', 'password'))
    requestHeader.update(manager.load_cookie())

    db = client.zhihu
    for topic_doc in db.topic.find():
        topic_id = topic_doc['tid']
        logger.info('get sub topics of {0}'.format(topic_id))
        for sub in getSubTopics(topic_id):
            # Each item is indexed as (sub-topic id, sub-topic name).
            record = {'sub_tid': sub[0], 'sub_name': sub[1]}
            db.sub_topic.insert_one(record)
示例#8
0
 def ws_connect(self):
     """Connect ``self.ws`` and subscribe to the depth feed, retrying on error.

     Does nothing when ``self.ws`` already reports ``connected``. Otherwise
     a connection is opened, a ``spot/depth5`` subscription for
     ``self.SYMBOL_T`` is sent and the first reply is logged. Any failure
     is logged and retried after 2 seconds.

     The retry is a loop rather than a recursive call so that a long outage
     cannot exhaust the interpreter's recursion limit.
     """
     while self.ws is None or not self.ws.connected:
         try:
             self.ws = create_connection("wss://real.okex.com:10442/ws/v3", timeout=5)
             logger.info('websocket connected!')
             pair = self.SYMBOL_T.upper().replace("_", "-")
             sub_param = {"op": "subscribe", "args": ["spot/depth5:{}".format(pair)]}
             sub_str = json.dumps(sub_param)
             self.ws.send(sub_str)
             result = self.inflate(self.ws.recv())
             logger.info("{} subscribe:{}".format(pair, result))
             return
         except Exception as e:
             logger.error('\nconnect ws error[{}],retry...'.format(e))
             time.sleep(2)
             # The loop condition is re-evaluated, exactly like the check
             # the former recursive call performed on entry.
示例#9
0
 def get_account_info(self):
     """Log the spot account state for USDT and ``self.BALANCE_T``.

     Each symbol is queried through ``spotAPI.get_coin_account_info``. If a
     reply does not carry the requested currency, or a query raises, the
     whole listing is started again.

     The retry is a loop: the former recursive call inside the ``for`` loop
     did not return afterwards, so the remaining symbols were logged a
     second time, and repeated failures could exhaust the recursion limit.
     """
     while True:
         logger.info('-----------------------------------spot account info--------------------------------------------')
         try:
             accounts = ['USDT', self.BALANCE_T.upper()]
             for symbol in accounts:
                 t_account = spotAPI.get_coin_account_info(symbol)
                 if t_account.get('currency') != symbol:
                     logger.warning("getAccountInfo Fail,Try again!")
                     break
                 # NOTE(review): "balance" is filled from the "available"
                 # field, same as the next placeholder — confirm whether a
                 # separate balance field was intended.
                 logger.info("%s:balance %s available %s frozen %s" % (symbol, t_account["available"],
                                                                       t_account["available"],
                                                                       t_account["frozen"]))
             else:
                 # Every symbol was reported successfully.
                 return
         except Exception as err:
             logger.error(err)
示例#10
0
def main():
    """Run ``questions_per_topic`` for a slice of the stored sub-topics.

    Skips the first 150 ``sub_topic`` documents, then handles up to 20000
    of them (fetched in batches of 10), passing the shared ``requestHeader``
    and a Redis-backed queue along with each sub-topic id.
    """
    cfg = Config()
    client = MongoClient(cfg.get('mongo', 'host'), int(cfg.get('mongo', 'port')))
    job_queue = Queue(connection=Redis())

    # Only the sub-topic id is needed from each document.
    projection = {'sub_tid': 1, '_id': 0}
    sub_topics = client.zhihu.sub_topic.find({}, projection)
    sub_topics = sub_topics.skip(150).limit(20000).batch_size(10)

    for doc in sub_topics:
        questions_per_topic(doc['sub_tid'], requestHeader, job_queue)

    logger.info('done')
示例#11
0
 def makePool(self,ConnNum):
     """Create the ``PooledDB`` connection pool, retrying until it succeeds.

     The pool is stored on ``self.pool`` and is built over ``pymssql`` with
     the host, user, password and database held on the instance.

     :param ConnNum: passed to ``PooledDB`` as its second positional
         argument; also printed as the connection count.
     """
     print("====================开始创建数据库连接池...==============================")
     startTime = time.time()
     retry = 0
     while True:
         try:
             self.pool = PooledDB(
                 pymssql,
                 ConnNum,
                 host=self.host,user=self.user,password=self.password,database=self.db,charset="utf8")
             break
         except Exception as e:
             # Include the cause in the log; it was previously discarded,
             # which made connection failures impossible to diagnose.
             logger.error("连接数据库失败 " + str(e))
             retry += 1
             logger.info("尝试第%s次重新创建数据库连接池..."%retry)
             # Pause briefly so a persistent failure does not spin the CPU
             # and flood the server and the log.
             time.sleep(1)
     print("<<<<< 创建时间:"+str(int(time.time()-startTime))+"s 连接数:"+str(ConnNum)+" >>>>>")
     print("====================创建数据库连接池完成!==============================")
示例#12
0
def getTopics():
    """Yield the topic categories listed on the zhihu topics page.

    The page is fetched with a session carrying ``requestHeader`` and the
    account's stored cookie. Every match of the category pattern is yielded
    as a ``(data-id, anchor)`` tuple.

    The session is closed as soon as the page has been read, so it is
    released even when the caller stops iterating early.
    """
    url = 'https://www.zhihu.com/topics'
    c = Config()
    account = AccountManager(c.get('zhihu', 'email'), c.get('zhihu', 'password'))
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    pattern = re.compile(r'<li class="zm-topic-cat-item" data-id="(\d+)"><a href="#(.*)?">')

    session = requests.session()
    try:
        session.headers = requestHeader
        session.cookies.update(account.load_cookie())
        response = session.get(url)
        results = pattern.findall(response.text)
    finally:
        session.close()

    logger.info(results)
    for t in results:
        yield t
示例#13
0
def max_page(topic_id, header):
    """Look up the last page number of a topic's "top answers" listing.

    The listing page is requested with a random user agent and a proxy from
    ``rand_proxy()`` until one of the exits below is reached. Note that
    *header* is updated in place with the chosen ``User-Agent``.

    :param topic_id: topic id interpolated into the listing URL.
    :param header: request headers dict; mutated on every attempt.
    :return: the text of the second-to-last pager link on success; ``None``
        when the response status is not 200, or after the abnormal-traffic
        notice has been seen while ``err`` is already 5; ``1`` when the
        pager cannot be indexed and the notice is absent.
    """
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers'.format(
        topic_id)
    # Counts pages that carried the abnormal-traffic notice.
    err = 0
    while 1:
        user_agent = random.choice(agents)
        proxy = rand_proxy()
        # NOTE(review): `ip` is never used below. Presumably proxy['http']
        # looks like 'http://host:port' and this extracts the host — confirm.
        ip = proxy['http'].split(':')[1][2:]
        header.update({'User-Agent': user_agent})
        try:
            response = requests.get(question_url,
                                    headers=header,
                                    proxies=proxy)
        except Exception as e:
            # Request failed: log and try again with a new proxy/user agent.
            # There is no limit on the number of such retries.
            logger.error(e)
            logger.error(topic_id)
            continue

        logger.info('visit: %s' % question_url)
        if response.status_code != 200:
            logger.error('{0} ERROR'.format(question_url))
            logger.error(header)
            return
        html = response.content.decode('utf-8')
        html_tree = etree.HTML(html)
        page_numbers = html_tree.xpath(
            '//div[@class="zm-invite-pager"]/span/a/text()')
        try:
            # span.text: "previous page" 1 2 3 ... 13801 "next page"
            return page_numbers[-2]
        except Exception as e:
            # Fewer than two pager links were found on the page.
            if html.find('系统检测到您的帐号或IP存在异常流量') > -1:
                logger.error(
                    '统检测到您的帐号或IP存在异常流量, proxy: {0}, user-agent: {1}'.format(
                        proxy, user_agent))
                # Give up (returning None) once the notice has been seen
                # with err already at 5; otherwise count it and retry.
                if err == 5:
                    break
                err += 1
                continue

            logger.error(e)
            logger.error('topic_id: {0}'.format(topic_id))
            return 1
示例#14
0
def crawl_and_check(process_num=20):
    """Crawl proxies, then clean them with concurrent ``task`` workers.

    :param process_num: number of worker threads used for the cleaning step.
    :return: None; progress and the final ``valid_proxy`` list length are
        logged.
    """
    logger.info('start crawling proxies')
    getter = ProxyGetter()
    getter.crawl_proxies()
    logger.info('end crawling')

    logger.info('start cleaning proxies')
    workers = []
    for _ in range(process_num):
        worker = Thread(target=task, args=())
        worker.start()
        workers.append(worker)

    # Wait until every worker has finished.
    for worker in workers:
        worker.join()

    logger.info('cleaning proxies complete')
    pool_size = redis.Redis().llen('valid_proxy')
    logger.info('valid_proxy pool size: {0}'.format(pool_size))
示例#15
0
 def crawl_proxies(self):
     """Run every ``freeProxy<N>`` source and push its proxies to the raw queue.

     Attributes of ``ProxyGetter`` whose name starts with ``freeProxy``
     followed by digits are called without arguments and iterated. Each
     non-empty proxy is removed from ``RAW_PROXY_QUEUE`` and pushed again,
     so the list holds it only once. A failing source is logged and
     skipped.
     """
     for m in dir(ProxyGetter):
         # Raw string: '\d' in a normal string literal is an invalid escape.
         crawl_method_r = re.match(r'freeProxy\d+', m)
         if not crawl_method_r:
             continue
         crawl_method = crawl_method_r.group()
         logger.info('running %s' % crawl_method)
         # catch generator exceptions
         try:
             for proxy in getattr(ProxyGetter, crawl_method)():
                 if proxy:
                     # Using LREM and replacing it if it was found.
                     # LREM list 0 "hello", 0 means remove all elements equal to value
                     self.db.lrem(RAW_PROXY_QUEUE, num=0, value=proxy)
                     self.db.lpush(RAW_PROXY_QUEUE, proxy)
         except Exception as e:
             logger.error(e)
示例#16
0
def questions_per_page(topic_id, page, header):
    """Store the questions found on one page of a topic's question list.

    The page is fetched through a proxy with a random user agent (written
    into *header* in place), scanned with ``question_p`` and each match is
    inserted into the ``questions`` collection. Duplicates are logged and
    skipped.

    :param topic_id: topic whose question list is fetched.
    :param page: page number of the listing.
    :param header: request headers dict; its ``User-Agent`` is overwritten.
    """
    question_url = 'https://www.zhihu.com/topic/{0}/questions?page={1}'.format(
        topic_id, page)
    header.update({'User-Agent': random.choice(agents)})

    response = requests.get(question_url, headers=header, proxies=rand_proxy())
    html = response.content.decode('utf-8')

    for match in re.findall(question_p, html):
        try:
            collection = mongo_conn().questions
            # Match groups are used as (href, question id, name).
            collection.insert_one({
                'qid': match[1],
                'stid': topic_id,
                'href': match[0],
                'name': match[2]
            })
        except DuplicateKeyError as e:
            logger.error(e)
            logger.info("topic_id: {0}, href: {1} exists".format(
                topic_id, match[0]))
示例#17
0
 def make_order(cls, my_order_info):
     """Place a spot order and return its id.

     :param my_order_info: order description providing ``orderType``,
         ``symbol``, ``price`` and ``amount``.
     :return: the ``order_id`` from the API reply when the reply's
         ``result`` entry is truthy, otherwise ``-1``.
     """
     logger.info('-----------------------------------------spot order----------------------------------------------')
     order = my_order_info
     try:
         response = spotAPI.take_order(order.orderType, order.symbol, 2, order.price,
                                       order.amount)
     except Exception as e:
         logger.error("***trade:%s" % e)
         # Treat a failed call like an empty reply.
         response = {}

     if response is None or not response.get('result'):
         logger.error(
             "order failed!{} {} {} {} {}".format(order.symbol, order.orderType, order.price,
                                                  order.amount,
                                                  round(order.price * order.amount, 3)))
         return -1

     logger.info(
         "Order {} {} {} {} {} {}".format(response['order_id'], order.symbol, order.orderType,
                                          order.price, order.amount,
                                          from_time_stamp()))
     return response['order_id']
示例#18
0
 def SELECT(self,query,param=()):
     """
     Run a SELECT statement and return all of its rows.

     :param query: str      SQL statement
     :param param: tuple    parameters bound to the statement
     :return: result: the ``fetchall()`` result, or None when execution failed
     """
     conn = self.conn
     cur = conn.cursor()
     result = None
     try:
         startTime = time.time()
         cur.execute(query,param)
         result = cur.fetchall()
         logger.info(query + "," + str(param) + " Execute time:" + str(time.time()-startTime))
     except Exception:
         logger.error("Error: unable to fecth data with sql query: " + query + "," + str(param))
         logger.error(traceback.format_exc())
     finally:
         # Close the cursor on every exit path, including errors that
         # escape the handler above.
         cur.close()
     return result
示例#19
0
文件: MyUtil.py 项目: xianyin/balance
def send_email(content, _subtype='plain', _subject="bitcoinrobot"):
    """Send *content* to the module-level ``receivers`` over SMTP/SSL.

    :param content: message body.
    :param _subtype: MIME text subtype, e.g. ``'plain'`` or ``'html'``.
    :param _subject: subject line.
    :return: True when the message was handed to the server, False when
        connecting, logging in or sending failed.
    """
    # Third-party SMTP service
    mail_host = "smtp.gmail.com"  # server
    mail_user = "******"  # user name
    mail_pass = "******"  # password

    message = MIMEText(content, _subtype, 'utf-8')
    message['From'] = Header(mail_user)
    message['To'] = Header(",".join(receivers))
    message['Subject'] = Header(_subject)
    try:
        server = smtplib.SMTP_SSL(mail_host, 465)
        try:
            server.ehlo()
            server.login(mail_user, mail_pass)
            server.sendmail(mail_user, receivers, message.as_string())
        finally:
            # Release the connection even when login or sending fails.
            server.close()
        logger.info("邮件发送成功")
        return True
    except (smtplib.SMTPException, OSError) as err:
        # OSError covers socket-level failures (DNS, refused connection,
        # timeout) so that they are reported as False as well.
        logger.error("Error: 邮件发送失败,{}".format(err))
        return False
示例#20
0
def top_answers(topic_id, page, header):
    """Store the answers listed on one "top answers" page of a topic.

    The page is fetched through a proxy with a random user agent (written
    into *header* in place). Pages whose first vote count is below 1000 are
    ignored. Every answer match is inserted into the ``answers``
    collection.

    :param topic_id: topic whose listing is fetched.
    :param page: page number of the listing.
    :param header: request headers dict; its ``User-Agent`` is overwritten.
    :return: None.
    """
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers?page={1}'.format(
        topic_id, page)
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    header.update({'User-Agent': user_agent})
    try:
        html = requests.get(question_url, headers=header,
                            proxies=proxy).content.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % question_url)
        logger.error(e)
        # Retry with a fresh proxy, then stop: `html` was never assigned in
        # this frame, so falling through would raise UnboundLocalError.
        top_answers(topic_id, page, header)
        return

    # Look up the vote count of the first question on the page; if it is
    # below 1000 the whole page is ignored.
    first_vote = max_vote_p.search(html)
    if first_vote:
        max_vote = first_vote.group(1)
        if int(max_vote) < 1000:
            logger.info('ignore %s, max_vote:%s' % (question_url, max_vote))
            return

    answers = re.findall(top_answer_p, html)
    if not answers:
        logger.error('{0} answers not found, proxy: {1}'.format(
            question_url, proxy))
        return

    logger.info('{0} found answer {1}'.format(question_url, len(answers)))
    for a in answers:
        href, qid, aid = a[0], a[1], a[2]
        try:
            mongo_conn().answers.insert_one({
                'topic': topic_id,
                'question': qid,
                'answer': aid,
                'href': href
            })
        except DuplicateKeyError:
            # NOTE(review): the first duplicate ends processing of the whole
            # page — presumably intentional; confirm.
            return
示例#21
0
 def UPDATE(self,query,param=()):
     """
     Execute an UPDATE, DELETE or INSERT statement and commit it.

     :param query: str      SQL statement
     :param param: tuple    parameters bound to the statement
     :return: bool          True when committed, False when the statement failed
     """
     conn = self.conn
     cur = conn.cursor()
     result = True
     try:
         startTime = time.time()
         cur.execute(query,param)
         conn.commit()
         logger.info(query + "," + str(param) + " Execute time:" + str(time.time() - startTime))
     except Exception:
         logger.error("Error: unable to fecth data with sql query: " + query + "," + str(param))
         logger.error(traceback.format_exc())
         result = False
         # The rollback can fail too (e.g. when the connection itself is
         # gone); keep the bool contract instead of letting that escape.
         try:
             conn.rollback()
         except Exception:
             logger.error(traceback.format_exc())
     finally:
         # Close the cursor on every exit path.
         cur.close()
     return result
示例#22
0
    def cookie_login(self):
        """Log in by loading the stored cookie into the session.

        :return: True when the request to ``authTestURL`` answers with
            status 200, False otherwise.
        """
        # Request the main page first to pick up the basic cookies.
        self.session.get(mainPageURL)

        # Add the user's stored authentication cookie to the session's jar.
        stored_cookie = self.load_cookie()
        requests.utils.add_dict_to_cookiejar(self.session.cookies, stored_cookie)

        # Check whether the login worked.
        probe = self.session.get(authTestURL)
        if probe.status_code != 200:
            logger.info('知乎账户登陆失败')
            logger.info(probe.text)
            return False

        logger.info('知乎账户登陆成功')
        return True
示例#23
0
 def check_order_status(self, my_order_info, wait_count=0):
     """Query an order until a valid reply arrives and return its status.

     A reply is valid when its ``order_id`` equals ``my_order_info.orderId``.
     When ``filled_size`` is positive, the deal amount and the average
     price (``filled_notional / filled_size``) are written back to
     *my_order_info*. The status is logged and returned.

     Invalid replies and API errors are retried in a loop; the former
     recursive retry could exhaust the recursion limit during an outage.

     :param my_order_info: order object providing ``orderId`` and ``symbol``.
     :param wait_count: number of waits so far; only used to choose the
         "timeout" wording when it equals ``self.TRADE_WAIT_COUNT``.
     :return: the ``status`` value of the order reply.
     """
     order_id = my_order_info.orderId
     while True:
         order_result = {}
         try:
             order_result = spotAPI.get_order_info(my_order_info.orderId, my_order_info.symbol)
         except Exception as e:
             logger.error("***orderinfo:%s" % e)

         if order_result is None or order_result.get('order_id') != my_order_info.orderId:
             logger.warning("order {} checkOrderStatus failed,try again.".format(order_id))
             continue

         order = order_result
         order_id = order["order_id"]
         status = order["status"]
         filled_size = float(order["filled_size"])
         if filled_size > 0:
             my_order_info.set_deal_amount(filled_size)
             my_order_info.set_avg_price(float(order["filled_notional"]) / filled_size)

         timed_out = wait_count == self.TRADE_WAIT_COUNT
         if status == self.CANCELLED_STATUS:
             logger.info("order {} canceled".format(order_id))
         elif status == 'open':
             if timed_out:
                 logger.info("timeout no deal")
             else:
                 logger.info("no deal")
         elif status == 'part_filled':
             if timed_out:
                 logger.info("timeout part deal {}".format(my_order_info.dealAmount))
             else:
                 logger.info("part deal {}".format(my_order_info.dealAmount))
         elif status == self.FILLED_STATUS:
             logger.info("order {} filled".format(order_id))
         elif status == 'canceling':
             logger.info("order {} canceling".format(order_id))
         elif status == 'ordering':
             logger.info("order {} ordering".format(order_id))
         return status
示例#24
0
    def common_login(self):
        """Log in with e-mail, password and a manually entered captcha.

        Reads the ``_xsrf`` token from the sign-in page, saves the captcha
        image to ``self.captchaFile`` and asks for its text on stdin, then
        posts the login form. On success the session cookies are kept on
        ``self.cookie`` and written to ``self.cookieFile`` as JSON. The
        session is closed on every exit path.

        :return: True when the request to ``authTestURL`` succeeds after
            logging in, False on any failure.
        """
        try:
            response = self.session.get(signURL).content.decode('utf-8')
            # Extract the _xsrf token.
            xsrf_p = '<input type="hidden" name="_xsrf" value="([0-9a-z]*)"/>'
            result = re.search(xsrf_p, response)
            if not result:
                logger.info('xsrf not found')
                return False
            _xsrf = result.group(1)

            captcha = self.session.get(captchaURL %
                                       (time.time() * 1000)).content
            with open(self.captchaFile, 'wb') as output:
                output.write(captcha)

            captcha = input('input captcha:')

            # login
            form_data = {
                '_xsrf': _xsrf,
                'email': self.email,
                'password': self.password,
                'remember_me': True,
                'captcha': captcha
            }
            self.requestHeader.update({
                'X-Requested-With': 'XMLHttpRequest',
                'X-Xsrftoken': _xsrf
            })
            self.session.headers = self.requestHeader
            response = self.session.post(url=loginURL, data=form_data)
            if response.status_code != 200:
                # Previously this failure produced no log output at all.
                logger.info('知乎账户登陆失败')
                logger.info(response.text)
                return False
            logger.info(response.text)

            # Check whether the login really succeeded.
            response = self.session.get(authTestURL)
            if response.status_code != 200:
                logger.info('知乎账户登陆失败')
                logger.info(response.text)
                return False

            # Keep the authenticated cookies.
            self.cookie = self.session.cookies.get_dict()
            logger.info('知乎账户登陆成功')
            os.remove(self.captchaFile)

            with open(self.cookieFile, 'w') as output:
                cookies = self.session.cookies.get_dict()
                json.dump(cookies, output)
                logger.info("已在同目录下生成cookie文件")
            return True

        except Exception as e:
            logger.info('知乎账户登陆失败')
            logger.error(e)
            return False
        finally:
            self.session.close()
            logger.info('session closed')
示例#25
0
    def load_data_by_user_time(self):
        """Load ``<path>.rating`` and split it per user by time (leave-one-out).

        Every line is split on ``self.separator`` into user, item, rating
        and time. Raw ids are mapped to consecutive inner ids in order of
        first appearance. Ratings at or above ``self.threshold`` are stored
        as ``1``; the others keep the rating text read from the file. For
        each user with at least three records, the latest record becomes
        the test entry and the rest the training entries.

        :return: ``(train_matrix, train_dict, test_matrix, userseq, userids,
            itemids, time_matrix)``, where ``userseq`` is the time-sorted
            history of every user before the split.
        """
        logger.info("Loading interaction records from %s " % (self.path))
        history = {}
        num_ratings = 0
        num_items = 0
        num_users = 0
        # user/item {raw id -> inner id} maps
        userids = {}
        itemids = {}
        # inverse views (inner id -> raw id); built but not returned
        idusers = {}
        iditems = {}
        with open(self.path + ".rating", 'r') as f:
            for line in f:
                raw_user, raw_item, rating, stamp = line.strip().split(
                    self.separator)

                # Both cases differ only in the value that is stored.
                value = 1 if float(rating) >= self.threshold else rating
                num_ratings += 1

                if raw_item not in itemids:
                    iditems[num_items] = raw_item
                    itemids[raw_item] = num_items
                    num_items += 1

                if raw_user not in userids:
                    idusers[num_users] = raw_user
                    userids[raw_user] = num_users
                    num_users += 1
                    history[userids[raw_user]] = []

                history[userids[raw_user]].append(
                    (itemids[raw_item], value, int(float(stamp))))

        # Order every user's records by time.
        for u in np.arange(num_users):
            history[u].sort(key=lambda record: record[2])
        logger.info(
            "\"num_users\": %d,\"num_items\":%d, \"num_ratings\":%d\n" %
            (num_users, num_items, num_ratings))

        # Snapshot the full histories before the test records are removed.
        userseq = deepcopy(history)

        shape = (num_users, num_items)
        train_matrix = sp.dok_matrix(shape, dtype=np.float32)
        test_matrix = sp.dok_matrix(shape, dtype=np.float32)
        time_matrix = sp.dok_matrix(shape, dtype=np.float32)
        train_dict = {}
        for u in np.arange(num_users):
            records = history[u]
            # Users with fewer than three records are left out entirely.
            if len(records) < 3:
                continue

            # The latest record is held out for testing.
            held_item, held_value, held_time = records.pop()
            test_matrix[u, held_item] = held_value
            time_matrix[u, held_item] = held_time

            train_items = []
            for item, value, stamp in records:
                train_items.append(item)
                train_matrix[u, item] = value
                time_matrix[u, item] = stamp
            train_dict[u] = train_items
        return train_matrix, train_dict, test_matrix, userseq, userids, itemids, time_matrix
示例#26
0
def per_question(q_href):
    """Scrape one Zhihu question page and store its topics and top answers.

    Sleeps a random 1-8 seconds, downloads ``https://www.zhihu.com<q_href>``
    through a random proxy with a random User-Agent, then:

    * inserts every ``TopicLink`` anchor into the ``sub_topic`` collection;
    * walks the answers in page order and inserts each one with at least
      800 votes into ``top_answers``, stopping at the first answer that has
      no vote text.

    Documents rejected with ``DuplicateKeyError`` are skipped.  A failed
    download, or a "browser version too low" page, is retried by calling this
    function again and returning that call's outcome.

    Args:
        q_href: Path of the question, appended verbatim to the Zhihu host.

    Raises:
        Exception: If the page is a security-verification page, or if it has
            no title and is not the "browser version too low" page.
    """
    time.sleep(random.randint(1, 8))
    q_url = 'https://www.zhihu.com%s' % q_href
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    # NOTE: this is the shared ``requestHeader`` object, not a copy, so the
    # chosen User-Agent stays visible to every other user of it.
    header = requestHeader
    header.update({'User-Agent': user_agent})
    try:
        response = requests.get(q_url, headers=header, proxies=proxy).content
        html = response.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % q_url)
        logger.error(e)
        # ``html`` is unbound on this path; hand over to the retry instead of
        # falling through into the parsing code below.
        return per_question(q_href)

    tree = etree.HTML(html)

    question_a = tree.xpath('//title[@data-react-helmet="true"]/text()')
    if not question_a:
        logger.error('%s title not found' % q_url)
        if '你正在使用的浏览器版本过低' in html:
            # The site rejected this User-Agent; retry with a fresh one and
            # stop here, because ``title`` was never set for this attempt.
            logger.info(user_agent)
            return per_question(q_href)
        raise Exception('%s title not found' % q_url)

    title = question_a[0].replace(' - 知乎', '')
    if '安全验证' == title:
        # The title of the security-verification page: the proxy is blocked.
        logger.error('proxy error, {0}'.format(proxy))
        raise Exception('proxy error, {0}'.format(proxy))
    logger.info(title)

    # One database handle serves both collections.
    db = mongo_conn()
    sub_topic = db.sub_topic
    top_answers = db.top_answers

    for topic in tree.xpath('//a[@class="TopicLink"]'):
        # href looks like https://www.zhihu.com/topic/19552832
        tid = topic.attrib['href'].split('/')[-1]
        name = topic.xpath('.//text()')[0]
        try:
            sub_topic.insert_one({'sub_tid': tid, 'sub_name': name})
        except DuplicateKeyError:
            continue

    for answer_item in tree.xpath('//div[@class="ContentItem AnswerItem"]'):
        # The vote button text starts with the count, e.g. "1792 ..."
        # (meaning "1792 people upvoted this answer").
        vote_text = answer_item.xpath('.//span[@class="Voters"]/button/text()')
        if not vote_text:
            logger.info('%s no votes' % q_url)
            break

        vote_match = re.match(r'\d+', vote_text[0])
        if vote_match is None:
            # No leading number to compare against the threshold.
            continue
        vote_num = vote_match.group()
        if int(vote_num) >= 800:
            href = answer_item.xpath(
                './/meta[@itemprop="url"]')[1].attrib['content']
            answer = answer_item.xpath(
                './/span[@class="RichText CopyrightRichText-richText"]')[0]
            s = etree.tostring(answer).decode('utf-8')
            body = html2text.html2text(s.replace('<br>', ''))

            try:
                top_answers.insert_one({
                    'title': title,
                    'answer': body,
                    'href': href,
                    'vote': vote_num
                })
            except DuplicateKeyError:
                continue
示例#27
0
    def load_data_by_user_time(self):
        """Load interactions from ``self.path`` and split each user's history.

        Every non-blank line is split on ``self.separator`` and read according
        to ``self.data_format``:

        * ``"UIRT"`` - user, item, rating, time
        * ``"UIT"``  - user, item, time (rating stored as ``1``)
        * ``"UIR"``  - user, item, rating (time stored as ``1``)
        * ``"UI"``   - user, item (rating and time stored as ``1``)

        For the formats that carry a rating, lines rated below
        ``self.threshold`` are dropped.  Raw user/item ids get dense inner ids
        in order of first appearance.  When the format carries a time, each
        user's records are sorted by it.  The last
        ``floor(self.splitterRatio[1] * n)`` of a user's ``n`` records go to
        the test matrix (only when the user has at least two records and that
        count is at least one); the remaining records go to the train matrix.

        Returns:
            tuple: ``(train_matrix, train_dict, test_matrix, userseq, userids,
            itemids, time_matrix)``.  The three matrices are ``sp.dok_matrix``
            objects of shape ``(num_users, num_items)``; ``time_matrix`` holds
            the time of both train and test entries.  ``train_dict`` maps an
            inner user id to its list of training item ids, ``userseq`` maps
            an inner user id to all of its ``(item, rating, time)`` records
            taken before the split, and ``userids`` / ``itemids`` map raw ids
            to inner ids.

        Raises:
            ValueError: If ``self.data_format`` is not one of the formats
                listed above (raised when the first data line is parsed).
        """
        logger.info("Loading interaction records from %s " % (self.path))
        data_format = self.data_format
        has_time = data_format in ("UIRT", "UIT")
        pos_per_user = {}
        num_ratings = 0
        # user/item {raw id: inner id} maps
        userids = {}
        itemids = {}
        with open(self.path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                fields = line.split(self.separator)
                if data_format == "UIRT":
                    useridx, itemidx, rating, timestamp = fields
                    if float(rating) < self.threshold:
                        continue
                elif data_format == "UIT":
                    useridx, itemidx, timestamp = fields
                    rating = 1
                elif data_format == "UIR":
                    useridx, itemidx, rating = fields
                    if float(rating) < self.threshold:
                        continue
                elif data_format == "UI":
                    useridx, itemidx = fields
                    rating = 1
                else:
                    # Fail loudly: carrying on would only hit unbound locals.
                    raise ValueError(
                        "please choose a correct data format "
                        "(UIRT, UIT, UIR or UI), got %r" % (data_format,))

                num_ratings += 1
                if itemidx not in itemids:
                    itemids[itemidx] = len(itemids)
                if useridx not in userids:
                    userids[useridx] = len(userids)
                    pos_per_user[userids[useridx]] = []
                stamp = int(float(timestamp)) if has_time else 1
                pos_per_user[userids[useridx]].append(
                    (itemids[itemidx], rating, stamp))

        num_users = len(userids)
        num_items = len(itemids)
        if has_time:
            for u in range(num_users):
                pos_per_user[u] = sorted(pos_per_user[u], key=lambda d: d[2])
        logger.info("\"num_users\": %d,\"num_items\":%d, \"num_ratings\":%d" %
                    (num_users, num_items, num_ratings))
        # Snapshot the complete histories before test records are popped off.
        userseq = deepcopy(pos_per_user)
        train_dict = {}
        train_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        test_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        time_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        for u in range(num_users):
            records = pos_per_user[u]
            num_test_ratings = math.floor(
                float(self.splitterRatio[1]) * len(records))
            if len(records) >= 2 and num_test_ratings >= 1:
                # Move the trailing records of this user to the test set.
                for _ in range(num_test_ratings):
                    item, rating, stamp = records.pop()
                    test_matrix[u, item] = rating
                    time_matrix[u, item] = stamp
            items = []
            for item, rating, stamp in records:
                items.append(item)
                train_matrix[u, item] = rating
                time_matrix[u, item] = stamp
            train_dict[u] = items
        return train_matrix, train_dict, test_matrix, userseq, userids, itemids, time_matrix
示例#28
0
    def load_pre_splitter_data(self):
        """Load a dataset that is already split into train and test files.

        Reads ``self.path + ".train.rating"`` and then
        ``self.path + ".test.rating"``.  Every non-blank line holds
        ``user, item, rating, time`` separated by ``self.separator``.  Raw
        user/item ids get dense inner ids in order of first appearance (train
        file first).  A rating at or above ``self.threshold`` is stored as
        ``1``; a lower rating keeps its value.

        Returns:
            tuple: ``(train_matrix, train_dict, test_matrix, pos_per_user,
            userids, itemids, time_matrix)``.  The three matrices are
            ``sp.dok_matrix`` objects of shape ``(num_users, num_items)``;
            ``time_matrix`` holds the time of both train and test entries.
            ``train_dict`` maps each inner user id seen in the train file to
            its training item ids ordered by time.  ``pos_per_user`` maps each
            inner user id to its ``[item, rating, time]`` records from both
            files ordered by time; a below-threshold rating is kept there as
            the raw text read from the file.  ``userids`` / ``itemids`` map
            raw ids to inner ids.
        """
        pos_per_user = {}
        userids, itemids = {}, {}
        threshold = self.threshold
        separator = self.separator

        def read_split(suffix):
            """Parse one split file; return its (user, item, value, time) entries.

            Registers unseen ids in ``userids`` / ``itemids`` and appends one
            record per line to ``pos_per_user`` as a side effect.
            """
            entries = []
            with open(self.path + suffix, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    useridx, itemidx, rating, timestamp = line.split(separator)
                    above = float(rating) >= threshold
                    if itemidx not in itemids:
                        itemids[itemidx] = len(itemids)
                    if useridx not in userids:
                        userids[useridx] = len(userids)
                        pos_per_user[userids[useridx]] = []
                    user, item = userids[useridx], itemids[itemidx]
                    pos_per_user[user].append(
                        [item, 1 if above else rating, int(timestamp)])
                    entries.append(
                        (user, item, 1 if above else float(rating),
                         float(timestamp)))
            return entries

        train_entries = read_split(".train.rating")

        # train_dict must only reflect the train file, so build it before the
        # test records are appended to pos_per_user.
        train_dict = {}
        for u in range(len(userids)):
            pos_per_user[u] = sorted(pos_per_user[u], key=lambda d: d[2])
            train_dict[u] = [record[0] for record in pos_per_user[u]]

        test_entries = read_split(".test.rating")
        num_users, num_items = len(userids), len(itemids)
        for u in range(num_users):
            pos_per_user[u] = sorted(pos_per_user[u], key=lambda d: d[2])

        train_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        time_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        for user, item, value, stamp in train_entries:
            train_matrix[user, item] = value
            time_matrix[user, item] = stamp
        logger.info("already load the trainMatrix...")

        test_matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
        for user, item, value, stamp in test_entries:
            test_matrix[user, item] = value
            time_matrix[user, item] = stamp
        logger.info("already load the testMatrix...")

        return train_matrix, train_dict, test_matrix, pos_per_user, userids, itemids, time_matrix