def getShop(self, keyword):
    """Fetch the first ~30 search hits for *keyword* and classify each one.

    Every hit is opened on its product page; a hit counts as the goal
    category ("手机") only when the breadcrumb passes the error-key check
    and the page passes the goal-key check, otherwise it is an accessory.

    Returns a tuple ``(flagList, checkResult)``: a 1/0 flag per product
    (1 = phone) and a dict mapping product number to its category label
    for the report.
    """
    goalKey = "手机"
    configutil = ConfigUtil()
    search_url = configutil.getSearch_SuningPrd() + "/" + urllib.parse.quote(keyword)
    html = SpiderUtil.getHtml(search_url + "/")
    # Product ids appear under two different markups on the result page.
    shop_ids = (re.findall(r'<li docType="1".*id="(.*?)"', html)
                + re.findall(r'<div.*product-box  basic.*id="(.*?)"', html))
    flagList = []
    checkResult = {}  # collected for the report
    for shop_id in shop_ids:
        parts = shop_id.split("-")
        detail_url = "https://product.suning.com/" + parts[0] + "/" + parts[1] + ".html"
        soup = SpiderUtil.getSoupContent(detail_url)
        logger.info("测试商品: %s" % shop_id)
        breadcrumb_title = ""
        is_goal = 0
        if soup != "":
            title_spans = SpiderUtil.findContentbyTagClass(soup, "span", "breadcrumb-title")
            if title_spans != []:
                breadcrumb_title = title_spans[0]["title"]
            if self.isExistErrorKey(breadcrumb_title) and self.isExistGoalKey(soup):
                is_goal = 1  # this product is a phone
        flagList.append(is_goal)
        checkResult[parts[1]] = goalKey if is_goal == 1 else "配件"
    return flagList, checkResult
Пример #2
0
 def setUpClass(cls):
     """Load request headers, mock data and the target host once per suite."""
     # Resolve the configuration directory relative to the working directory.
     conf_dir = os.path.join(os.getcwd(), Constant.PATH_FOR_CONF)
     # Headers and mock payloads both live as JSON files in that directory.
     cls.headers = OperJson.read_json(os.path.join(conf_dir, Constant.HEADER_FILE_NAME))
     cls._MockDate = OperJson.read_json(os.path.join(conf_dir, Constant.MOCK_FILE_NAME))
     cls.host = ConfigUtil().getRun("host")
 def setUpClass(self):
     """Prepare the suite: record the start time, pick the run environment
     and load the keyword file (only when writing an Excel report)."""
     logger.info("---start test suite(%s)---" % __name__)
     if self.writeExcelFlag:
         self.startTime = datetime.datetime.now()
         config = ConfigUtil()
         self.environment = config.getSelectedEnvironment()
         # BUG FIX: `assertIsNot(env, "")` compared identity, so it was
         # effectively always true; assertNotEqual actually rejects an empty
         # selection.  The check must also run *before* indexing, otherwise
         # an empty value raises IndexError instead of the intended message.
         self.assertNotEqual(self.environment, "", "Please choose one environment to run!")
         self.search = self.environment[0]
         self.list = self.environment[1]
         self.file_path = os.path.join(os.getcwd(), Constant.PATH_FOR_FILES)
         file_abspath = os.path.join(self.file_path, "telephone.txt")
         self.ku = KeywordUtil(file_abspath)
 def get_HitLevel(self, keyword):
     """Fetch catalogue ids and weight values for *keyword* from the wwsy service.

     Queries the configured "wwsy_prd" endpoint and returns a dict mapping
     ``entity_id`` -> ``entity_s`` for at most the first three entities of the
     top sort result; an empty dict when nothing was hit.
     """
     config = ConfigUtil()
     wwsy = config.getRun("wwsy_prd") + urllib.parse.quote(keyword)
     request = RequestUtil()
     response = request.get(wwsy, {})
     hit_level = {}
     # FIX: `!= None` replaced with the idiomatic `is not None`.
     if response["sort_res"] is not None:
         entitys = response["sort_res"][0]["entitys"]
         if entitys is not None:
             for entity in entitys[:3]:  # only the first three entities
                 hit_level[entity["entity_id"]] = entity["entity_s"]
     return hit_level
Пример #5
0
 def _print(level, source, text):
     '''
     Write a log line to the console (subject to the configured level) and
     queue the structured record for asynchronous persistence into MongoDB.

     :param level: one of 'DEBUG', 'INFO', 'WARN', 'ERROR'
     :param source: logical source/module of the message
     :param text: message body
     :return: None
     '''
     timestamp = time.time()
     log_timestamp = int(round(timestamp * 1000))
     # Millisecond-precision timestring (strip microseconds down to ms).
     log_timestring = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
     pid = os.getpid()
     # Console output: emit only when both the message level and the
     # configured threshold are recognised and the message is at or above it.
     config_level = ConfigUtil.get('log', 'level')
     enum_level = ['DEBUG', 'INFO', 'WARN', 'ERROR']
     level_index = enum_level.index(level) if level in enum_level else -1
     config_level_index = enum_level.index(config_level) if config_level in enum_level else -1
     if level_index >= config_level_index and level_index >= 0 and config_level_index >= 0:
         print('[%s] [%s] [%d] [%s]: %s' % (log_timestring, level, pid, source, text))
     # Hand the record to the MongoDB writer queue (consumed by a thread).
     logdict = {
         'timestamp': log_timestamp,
         'timestring': log_timestring,
         'level': level,
         'pid': pid,
         'source': source,
         'text': text
     }
     mongoLog.put_queue(logdict)
     # FIX: removed the dead trailing `pass` statement.
Пример #6
0
 def run(self):
     '''
     Poll the task list forever, handing each ready task to a Downloader
     run in a process pool, sleeping a configured random interval between
     polls.
     :return: None (loops forever)
     '''
     # Process pool for the downloaders.
     pool = Pool()
     while True:
         # Fetch one task in "ready" state (it is flipped to "doing").
         task = self.taskUtil.get_ready()
         # BUG FIX: the condition previously ended with `or True` (a debug
         # leftover), which scheduled a Downloader even when no task was
         # ready, passing None/empty tasks to run_downloader.
         if task is not None and len(task) > 0:
             Log.i('-----------------------------')
             # Launch the Downloader through the pool.
             pool.apply_async(self.run_downloader, args=(task, ))
         # Sleep a random number of seconds read from the config file.
         items = ConfigUtil.getItems('scheduler')
         interval_min = items['interval_min']
         interval_max = items['interval_max']
         seconds = random.randint(int(interval_min), int(interval_max))
         Log.i('Start sleep ' + str(seconds) + ' seconds')
         time.sleep(seconds)
     # NOTE(review): unreachable while the loop above never breaks; kept for
     # when a stop condition is added.  FIX: `log.i` was an undefined name
     # (NameError had this ever run) — corrected to `Log.i`.
     pool.close()
     pool.join()
     Log.i('All subprocesses done.')
Пример #7
0
 def compare_brand(self, kewords_random):
     """Compare the brand shown on the search page with the brand wwsy extracts.

     For every keyword the page brands and the wwsy brand are recorded in a
     SearchResult appended to ``self.searchresults``.  Returns False when any
     keyword's wwsy brand was hit but does not appear among the page brands,
     True otherwise (a wwsy miss, "未命中", counts as a pass).
     """
     flag = True
     for keyword in kewords_random:
         logger.info("测试keyword为:%s" % (keyword))
         brands = self.get_brand(keyword)
         logger.info("页面取得品牌:%s" % brands)
         Config = ConfigUtil()
         wwsy = Config.getRun("wwsy_prd") + urllib.parse.quote(keyword)
         logger.info(wwsy)
         request = RequestUtil()
         reponse = request.get(wwsy, {})
         brand_wwsy = "未命中"  # default: wwsy hit no brand
         # FIX: replaced `!= None` with `is not None` and dropped the unused
         # local `tmp1 = member["entitys"]`.
         if reponse["sort_res"] is not None:
             for member in reponse["sort_res"]:
                 if member["entitys"] is not None:
                     for entitys in member["entitys"]:
                         if entitys["entity_n"] == "brand_Name":
                             brand_wwsy = entitys[
                                 "entity_v"]  # what if two brands are hit at once?
                             # NOTE: break leaves only the inner loop; a later
                             # member may still overwrite brand_wwsy (original
                             # behavior preserved).
                             break
             logger.info("wwsy取的品牌:%s" % brand_wwsy)
         # Pass when the wwsy brand matches the page, or when wwsy hit nothing.
         if brand_wwsy in brands or brand_wwsy == "未命中":
             wordMatch = "pass"
         else:
             wordMatch = "fail"
             flag = False
         result = SearchResult()
         result.setBrand(brands)
         result.setBrand_wwsy(brand_wwsy)
         result.setKeyword(keyword)
         result.setMatchStatus(wordMatch)
         self.searchresults.append(result)
     return flag
Пример #8
0
    def run(self):
        '''
        Producer process: loop forever, read the configured task/source
        lists and publish every URL record to Kafka as JSON; exit after one
        pass in crontab mode, otherwise sleep a configured random interval
        between passes.
        :return: None (loops forever unless self.crontab == 1)
        '''
        Log.i ('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Produce URLs: pick the reader according to the config flags.
            if USE_SOURCEURL_TYPE is True:
                if USE_ASYNCTASK_TYPE is True:
                    urlInformationList = ConfigUtil.readSourceListByParams(self.begin, self.end)
                else:
                    urlInformationList = ConfigUtil.readSourceList()
            else:
                urlInformationList = ConfigUtil.readTaskList()

            if urlInformationList is None:
                continue

            # Serialise every record and publish it to Kafka.
            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()
                diststrjson = json.dumps(data)
                Log.i(diststrjson)
                KafkaOperator = kafkaUrlinformation()
                KafkaOperator.producerUrl(diststrjson)

            # Runs once per day via a crontab trigger: exit instead of sleeping.
            if self.crontab==1:
                os._exit(0)
            else:
                # Sleep a random number of seconds read from the config file.
                items=ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds=random.randint(int(interval_min),int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
    def run(self):
        '''
        Consumer process: receive URL dicts from the pipe, de-duplicate them
        against MongoDB, download the page and save the data asynchronously,
        sleeping a configured random interval between items.
        :return: None (loops forever)
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Block until the producer end of the pipe sends a record.
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Hydrate the URL entity from the raw dict.
            self.URL_inf.dict2class(DictData)
            # Lazily create the MongoDB connection.
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(
                    DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                    DbdataCCGPDFZB["db_name"],
                    DbdataCCGPDFZB["default_collection"])
            # Lazily create the Kafka client.
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # De-duplicate on md5(url, title): skip records already stored.
            uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the document when found
            # #TODO insertion into the database has issues
            if value is not None:
                continue
            # Download the landing-page content.
            self.URL_inf = self.downLoadHtml()
            if self.URL_inf is None:
                continue
            # Persist the data asynchronously.
            self.savedata(self.URL_inf)

            # Sleep a random number of seconds read from the config file.
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
Пример #10
0
 def run(self):
     '''
     Free-proxy fetcher process: loop forever, pull proxy entries from the
     proxy APIs, push them onto the shared queue and sleep a configured
     random interval between rounds.
     :return: None (loops forever)
     '''
     Log.i('proxyIpPool.run() in {0}'.format(time.ctime()))
     while True:
         # One entry from the local api and one from the "remote" api — the
         # remote source was deliberately switched to the local one.
         local_entry = getIpProxyPool()
         remote_entry = getIpProxyPool()
         if local_entry is not None:
             self.queueDictData.put(local_entry)
         if remote_entry is not None:
             self.queueDictData.put(remote_entry)
         # Sleep a random number of seconds read from the config file.
         scheduler_conf = ConfigUtil.getItems('proxyIpScheduler')
         pause = random.randint(int(scheduler_conf['interval_min']),
                                int(scheduler_conf['interval_max']))
         Log.i('proxyIpPool sleep ' + str(pause) + ' seconds')
         time.sleep(pause)
Пример #11
0
    def simpleRun(self):
        '''
        Producer process: loop forever reading the real-time source list,
        crawl each source's landing and pagination pages, extract child
        links, de-duplicate them against MongoDB and publish the new ones
        to Kafka; exit after one pass in crontab mode, otherwise sleep a
        configured random interval between passes.
        :return: None (loops forever unless self.crontab == 1)
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Resource check: create the (local) Kafka client.
            # KafkaOperator = kafkaUrlinformation()
            KafkaOperator = localKafkaUrlinformation()
            # if self.mogodbControl is None:
            #     self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"],
            #                                       Dbdata["default_collection"])
            # Parse the data sources (superseded branches kept below for reference).
            # if USE_SOURCEURL_TYPE is True:
            #     if USE_ASYNCTASK_TYPE is True:
            #         urlInformationList = ConfigUtil.readSourceListRealTime()
            #     else:
            #         urlInformationList = ConfigUtil.readSourceList()
            # else:
            #     urlInformationList = ConfigUtil.readTaskList()

            urlInformationList = ConfigUtil.readSourceListRealTime()

            # Crawl and extract child URLs.
            if urlInformationList is None:
                continue

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()

                # Download the landing-page content.
                dowloadData = self.downLoadHtml(data)
                if dowloadData is None:
                    continue
                # Extract the pagination URLs from the landing page.
                pageData = self.getPageNumFromHome(dowloadData)
                if pageData is None:
                    continue
                for pageIndex in pageData:
                    # Download each pagination page.
                    dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                    if dowloadPageData is None:
                        continue
                    # Extract the child links from the page.
                    # self.URL_inf.dict2class(pageIndex)
                    ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                    if ccgpChildrenLink is None:
                        continue
                    # Publish the child links through Kafka.
                    for link in ccgpChildrenLink:
                        # Lazily create the MongoDB connection.
                        if self.mogodbControl is None:
                            self.mogodbControl = Mongodb_Operator(
                                DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                                DbdataCCGPDFZB["db_name"],
                                DbdataCCGPDFZB["default_collection"])
                        # De-duplicate so we do not resend to Kafka.
                        if link.title is None:  # skip links without a title
                            continue
                        uuid = self.get_md5(link.Urlname, link.title)
                        item = {"uuid": uuid}
                        value = self.mogodbControl.findone(
                            item, self.__Sendcollection)  # returns the document when found
                        # #TODO insertion into the database has issues
                        if value is not None:  # already stored: skip
                            continue
                        # Per Yu Hao (于浩): do not send parent links downstream.
                        if link.DeepNum >= 0:
                            producerData = json.dumps(link.class2dict())
                            Log.i("produce<<" + producerData)
                            KafkaOperator.producerUrl(producerData)

            # Runs once per day via a crontab trigger: exit instead of sleeping.
            if self.crontab == 1:
                os._exit(0)
            else:
                # Sleep a random number of seconds read from the config file.
                items = ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds = random.randint(int(interval_min), int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
Пример #12
0
 def __init__(self):
     """Initialise the segmentation service URL and the JSON request headers."""
     config = ConfigUtil()
     self.segURL = config.getRun(Constant.CONFIG_PARAM_RUN_SEGMENTATION)
     self.headers = {"content-type": "application/json;charset=UTF-8"}
Пример #13
0
import requests
import json
import os
import logging
import unittest
import datetime
from utils.Constant import Constant
from utils.KeywordUtil import KeywordUtil
from utils.RequestUtil import RequestUtil
from utils.ConfigUtil import ConfigUtil
from utils.ExcelUtil import ExcelUtil
from format.suning.SearchResult import SearchResult

# Module-level singletons shared by the test cases in this module.
logger = logging.getLogger(Constant.LOGGER_NAME)
req = RequestUtil()
config = ConfigUtil()

"""评价数为0的商品排序靠前"""
class NumberOfEvaluation0(unittest.TestCase):
    all_results = []

    @classmethod
    def setUpClass(self):
        # Record the suite start time for the report.
        self.startTime = datetime.datetime.now()
        logger.info("---start test suite(%s)---" % __name__)
        # Load the keyword corpus the cases iterate over.
        file_path = os.path.join(os.getcwd(), Constant.PATH_FOR_FILES)
        file_abspath = os.path.join(file_path, "topQuery_suning_1million.txt")
        # topQuery_suning_1million.txt  topQuery.txt   test.txt
        self.all_keywords = KeywordUtil(file_abspath).getAllKeywords()

    @classmethod
Пример #14
0
#! /usr/bin/python3
# -*- coding: utf8 -*-
import logging
import os
import time
import unittest
# from utils import HTMLSearchReport
from utils import HTMLTestRunner as HTMLSearchReport
from utils.CaseUtil import CaseUtil
from utils.ConfigUtil import ConfigUtil
from utils.Constant import Constant
from utils.LogSingleton import LogSingleton

if __name__ == '__main__':
    config = ConfigUtil()
    # 选择去运行case目录下的哪些自动化脚本
    casePattern = config.getRun(Constant.CONFIG_PARAM_RUN_PATTERN)

    logger = LogSingleton()
    logger = logging.getLogger(Constant.LOGGER_NAME)

    # 报告存放路径
    report_path = os.path.join(os.getcwd(), Constant.PATH_FOR_REPORTS)

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    report_abspath = os.path.join(report_path, "report_" + now + ".html")
    fp = open(report_abspath, 'wb')
    logger.info("Created a report file(%s)" % report_abspath)
    runner = HTMLSearchReport.HTMLTestRunner(
        stream=fp,
        verbosity=2,
    def verify_PrecisionRate(self,keywords):
        """For each keyword, score how well the top search hits match the
        query segmentation and record a pass/fail/warn SearchResult.

        The average hit rate over (at most) the first 5 recalled products is
        compared against thresholds: <=0.6 fails, (0.6, 1] passes, and -1
        (no products / blocked result) warns.  Returns ``(count, errorKey)``:
        the number of failing keywords and the list of those keywords.
        """
        count = 0
        errorKey = []
        for keyword in keywords:
            logger.info("测试分词:%s"%keyword)
            keyword = keyword.replace("-", '%252d')  # urllib.parse.quote does not encode '-'
            RateSum = []
            result = SearchResult()
            config = ConfigUtil()
            url = config.getSearch_SuningPrd()
            soup = SpiderUtil.getSoupContent(url+"/"+urllib.parse.quote(keyword)+"/")
            vonder_shops = self.get_partnumber(url,keyword)
            logger.info(url+"/"+urllib.parse.quote(keyword)+"/")
            # ********** process the query segmentation **********
            if soup != '':
                div = soup.find("div", class_="no-result-tips")  # with a recommendation there are two classes [no-result-tips no-result-proposal]
                if div != None:
                    if len(div["class"]) == 1:  # the query may be rewritten: match against the rewritten value
                        tmp = div.strong.text
                        keyword = re.findall(r'我们为您提供"(.*?)".*的搜索结果',div.strong.text)[0]
                qurey_segList = self.deal_QuerySeg(keyword)  # synonyms found for each segment of the query
                allShopNum = int(SpiderUtil.getTotalCount(soup))  # handle recalls with fewer than 5 products, or none at all
                if allShopNum != 0:
                    if allShopNum > 4:
                        getShopNum = 5  # take the first 5 recalled products
                    else:
                        getShopNum = allShopNum
                    titles = SpiderUtil.getTitles(soup, getShopNum)
                    auxdescriptions = SpiderUtil.getAuxdescription(soup, getShopNum)
                    storenames = SpiderUtil.getStoreName(soup, getShopNum)
                    for k in range(len(titles)):
                        redundancy_seg = ["官方", "旗舰", "店", "苏宁", "自营"]
                        storename_seg = self.get_Seg("query",storenames[k])
                        storenames[k] = ''.join([i for i in storename_seg if i not in redundancy_seg])  # strip boilerplate words from the store name
                        if storenames[k] in keyword:  # the keyword may contain the store name
                            keyword_new = keyword.replace(storenames[k],"")  # remove the store name
                            qurey_segList = self.deal_QuerySeg(keyword_new)
                        if qurey_segList ==[] or keyword in storenames[k]:  # the search term may be exactly the store name
                            HitRate_title = 1.0
                            HitRate_auxdescription = 1.0
                        else:
                            HitRate_title = self.count_HitRat(qurey_segList, titles[k])
                            HitRate_auxdescription = self.count_HitRat(qurey_segList, auxdescriptions[k])
                        hitrate = max(HitRate_title, HitRate_auxdescription)
                        if hitrate == 0:  # if neither title nor selling point hit, check the sub-code info
                            BjOtherTxt = self.check_HitBjOtherTxt(vonder_shops[k])
                            hitrate = self.count_HitRat(qurey_segList, ''.join(BjOtherTxt))
                        RateSum.append(hitrate)
                    AverageHitRate = sum(RateSum) / len(RateSum)
                    AverageHitRate = float("%.2f"%AverageHitRate)
                else:
                    titles = ["非常抱歉!没有找到与' *** ' 相关的商品。"]
                    RateSum = []
                    AverageHitRate = -1

            else:
                titles = ["根据相关法律法规和政策,无法显示相关的商品"]
                RateSum = []
                AverageHitRate = -1

            # ************ report ************
            if 0 <= AverageHitRate <= 0.6:
                wordMatch = "fail"
                count += 1
                errorKey.append(keyword)
                if keyword.isdigit():  # the query may be a product code
                    partnumber = list(map(lambda x:x.split("-")[1],vonder_shops))
                    if partnumber.count(keyword) == len(partnumber) and partnumber != []:
                        wordMatch = "pass"
                        AverageHitRate = 1
                        RateSum = [1.0]*len(partnumber)
                        count = count-1
                        errorKey.pop()
            elif 0.6 < AverageHitRate <= 1:
                wordMatch = "pass"
            else:
                wordMatch = "warn"
            newTiles = list(map(lambda x:"<br>"+x,titles)) # force line breaks in the report
            result.setKeyword(keyword)
            result.setWords(qurey_segList)
            result.setTitle(newTiles)
            result.setHitRate(RateSum)
            result.setAverageHitRate(AverageHitRate)
            result.setMatchStatus(wordMatch)
            self.searchresults3.append(result)
        return count,errorKey