예제 #1
0
    def test(self):
        """Compute and log summary statistics over a batch of company records.

        Loads the records found under the tianyancha company-detail prefix for
        the time range ``2018022700-2018022800`` through ``self.fromOSS``,
        derives three numeric features per record and logs the
        ``DataFrame.describe()`` summary of those features.

        :return: None
        """
        df = self.fromOSS("downData/www_tianyancha_com/detail/company_1","2018022700-2018022800")
        def stat(record):
            """
            Derive the per-record features.

            :param record: one parsed JSON result (a dict)
            :return: dict with the keys annualReport_Num, has_tianyan_risk
                     and company_name_len
            """
            # The key used to be "business_info " (trailing space), which can
            # never match a "business_info" entry and made both statistics
            # below constant 0.
            # NOTE(review): assumes the real key has no trailing space - confirm
            # against a stored record.
            business_info = record.get("business_info", {})
            # count of annual reports
            annualReport_Num = len(business_info.get("annualReport",[]))
            # presence flag: 1 when a truthy tianyan_risk entry exists, else 0
            has_tianyan_risk = 1 if business_info.get("tianyan_risk",None) else 0
            # length of the company name
            company_name_len = len(record.get("company_title", {}).get("company_name", ""))
            result = {
                "annualReport_Num":annualReport_Num,
                "has_tianyan_risk":has_tianyan_risk,
                "company_name_len":company_name_len
            }
            return result#(annualReport_Num,has_tianyan_risk,company_name_len)
        #TODO: series->list->df, could probably be simplified to series->df
        data = df.apply(lambda x:stat(x.values[0][0]),axis=1).tolist()
        df2 = pd.DataFrame.from_dict(data)
        info = df2.describe()
        logDebug("""
        #####################\n
        this is the stat sample,you can try it and save the resulut to db for further visualization\n
        #####################\n
        %s\n
        #####################\n
        """%info)

        # dict form of the summary, e.g. for persisting to a database later
        result = info.to_dict()
예제 #2
0
    def getPageNum(self, url, listConf):
        """Fetch ``url`` and determine how many pages the list spans.

        :param url: URL of the list page to fetch.
        :param listConf: list configuration dict. ``TAG_LIST_TOTAL_PAGE_NUM``
            holds the extraction rule for the total page count; the optional
            ``TAG_LIST_NO_RESULT`` holds a marker string that signals an empty
            result page.
        :return: ``(pageCount, error)`` tuple:
            ``(1, None)`` when no page-count rule is configured (no paging),
            ``(0, ERR_BLOCK)`` when the http client reports being blocked,
            ``(0, None)`` when the no-result marker occurs in the page,
            otherwise ``(n, None)`` where ``n`` is the extracted page count
            (1 when nothing could be extracted).
        """
        config = listConf
        # No rule for the total page number means the list is not paginated.
        if not config.get(TAG_LIST_TOTAL_PAGE_NUM, None):
            return (1, None)
        content = self.http.get(url)
        if self.http.isBlocked():
            return (0, ERR_BLOCK)

        # Substring test instead of ``find(...) > 0`` so that a marker located
        # at the very start of the page (index 0) is detected as well.  An
        # empty marker is ignored, as it was before.
        noResultMark = config.get(TAG_LIST_NO_RESULT)
        if noResultMark and noResultMark in content:
            logDebug("%s url=%s\n%s" % (getTimestamp(), url, config[TAG_LIST_NO_RESULT]))
            return (0, None)

        result = {}
        # Extract only the total-page-number field from the fetched content.
        Extractor(self.http, self.parser).getResultByContent(content,
                                                             {TAG_LIST_TOTAL_PAGE_NUM: config[TAG_LIST_TOTAL_PAGE_NUM]},
                                                             result)
        # Strip before testing for emptiness so a missing, None or
        # whitespace-only value falls back to a single page instead of raising.
        rawPageNum = (result.get(TAG_LIST_TOTAL_PAGE_NUM) or "").strip()
        totalPageNum = int(rawPageNum) if rawPageNum else 1

        return (totalPageNum, None)
예제 #3
0
파일: aliyun.py 프로젝트: Curiou/redundancy
    def listDir(self, dir1, timeRange=None):
        """
        List the object keys stored under an OSS prefix, retrying up to 3 times.

        :param dir1: prefix to list, e.g. downData/www_tianyancha_com/detail/company_1
        :param timeRange: optional "begin-end" filter, fmt:2018020100-2018020200.
            A key is kept when the yyyy/mm/dd/hh timestamp found in its path
            satisfies begin <= ts < end; keys without such a timestamp are
            skipped while a filter is active.
        :return: list of matching keys, or None when every attempt failed
        """

        def getNameTS(name):
            """Return the 'yyyymmddhh' stamp embedded in ``name`` or None."""
            m = re.search(r"(\d{4})/(\d{2})/(\d{2})/(\d{2})", name)
            if not m:
                return None
            return "".join(m.groups())

        for _attempt in range(3):
            # Start each attempt with a fresh list: keys collected by an
            # attempt that failed half-way must not show up twice in the result.
            objects = []
            try:
                # Keep the normalised prefix in its own variable so a retry
                # does not preprocess an already preprocessed value.
                prefix = preProcessDir(dir1)
                beginTime, endTime = timeRange.split("-") if timeRange else (
                    None, None)
                import oss2
                for idx, object_info in enumerate(
                        oss2.ObjectIterator(self.oss, prefix=prefix)):
                    if beginTime and endTime:
                        ts = getNameTS(object_info.key)
                        if ts is None or ts < beginTime or ts >= endTime:
                            continue
                    objects.append(object_info.key)
                    logDebug("%s:%s" % (idx, object_info.key))
                return objects
            except Exception:
                logException()
            time.sleep(1)
        return None
예제 #4
0
    def downLists(self, listConf, listItemConf, resultHandlerClass, urlMgr):
        """
        Download every list page produced by ``urlMgr``.

        :param listConf: list configuration
        :param listItemConf: list item configuration
        :param resultHandlerClass: result handler class; a new instance is
                                   created for every page
        :param urlMgr: url manager whose ``pageUrls(listConf)`` yields the urls
        :return: the last value returned by ``checkDownStatus`` (0 when it was
                 never called)
        """
        logInfo("%s_begin downLists" % (getTimestamp()))
        status = 0
        finished = 0
        for pageUrl in urlMgr.pageUrls(listConf):
            try:
                logDebug(pageUrl)
                handler = resultHandlerClass()
                self.downOneList(pageUrl, listConf, listItemConf, handler)
                # Only pages that were processed without an exception count.
                finished += 1
                status = self.checkDownStatus(finished)
                if IS_ERROR(status):
                    break
            except Exception:
                # A failing page is logged and the remaining pages still run.
                logException()
        return status
예제 #5
0
 def testReadAliyun(self):
     """
     Log the ``name`` field of every record read from the qichacha prefix.

     :return:
     """
     from superbase.utility.aliyun import AliYun
     reader = AliYun()
     for record in reader.readJsonFromAliyun('downData/www_qichacha_com'):
         logDebug(record['name'])
예제 #6
0
def asyncRun(cmd, shell=True):
    """Start ``cmd`` in the background without waiting for it to finish.

    The child's stdout and stderr are discarded.  They used to be connected to
    a pipe that nobody ever read, so a child producing enough output could
    stall on a full pipe buffer (or fail once the pipe was closed).

    :param cmd: command to run (a shell command line when ``shell`` is true)
    :param shell: passed through to ``subprocess.Popen``
    :return: None; any exception is logged via ``logException``
    """
    import os
    try:
        logDebug("asyncRun:%s" % cmd)
        devnull = open(os.devnull, "w")
        try:
            subprocess.Popen(cmd,
                             stdout=devnull,
                             stderr=subprocess.STDOUT,
                             shell=shell)
        finally:
            # The child holds its own copy of the descriptor.
            devnull.close()
    except Exception:
        logException()
예제 #7
0
파일: ioUtil.py 프로젝트: Curiou/redundancy
def printDict(dict, log=None):
    """
    Emit ``dict`` as indented JSON text.

    :param dict: the data to serialise
    :param log: optional callable receiving the text; when omitted (or falsy)
                the text goes to ``logDebug``
    :return:
    """
    text = json.dumps(dict, indent=4, ensure_ascii=False)
    if not text:
        return
    if log:
        log(text)
        return
    from superbase.utility.logUtil import logDebug
    logDebug(text)
예제 #8
0
 def exe(conn, sql, values):
     """
     Execute one SQL statement on ``conn`` and report the outcome via ``q``.

     Puts ``("ok", cursor)`` on the queue on success and
     ``("error", traceback_text)`` when anything raises.

     :param conn: connection object providing ``cursor()``
     :param sql: statement text
     :param values: statement parameters; when falsy the statement is
                    executed without parameters
     """
     try:
         cursor = conn.cursor()
         # Pass the parameters only when there are any.
         callArgs = (sql, values) if values else (sql,)
         cursor.execute(*callArgs)
         if gConfig.get("debug.sql", 0):
             logDebug(cursor._last_executed)
         q.put(("ok", cursor))
     except Exception:
         q.put(("error", traceback.format_exc()))
예제 #9
0
 def getMissedBatch(self):
     """
     Find tycdetail batches that have a status-100 fetcher job but are not
     listed as open (closed=0), then update their jobs and batch rows.

     For every such batch: jobs whose status is not 100 get status 2, and
     the batch row gets closed=0.
     """
     db = gTop.get(CFG_DB_MONITOR)
     jobRows = db.query(
         "select batch from job where name='tycdetail_fetcher' and status=100"
     )
     jobBatches = [row[0] for row in jobRows]
     openRows = db.query(
         "select batch from batch where batch like 'tycdetail_%' and closed=0"
     )
     openBatches = [row[0] for row in openRows]

     missed = list(set(jobBatches) - set(openBatches))
     if not missed:
         return

     quoted = "','".join(missed)
     sql = "select id, name,status,batch,beginTime from job where batch in ('%s')" % (
         quoted)
     logDebug(sql)
     for jobId, _name, status, _batch, _beginTime in db.query(sql):
         if status != 100:
             db.update("job", {"status": 2}, "where id=%s" % jobId)
     for batchName in missed:
         db.update("batch", {"closed": 0}, "where batch='%s'" % batchName)
예제 #10
0
파일: fanli.py 프로젝트: Curiou/redundancy
    def updateResume2(self):
        """
        Log in to lagou.com through the selenium driver, then repeatedly open
        the resume page and re-save the work-experience form.

        Usage: python spider/sample/fanli.py update.time=600 updateResume2
        :return:
        """
        self.http.get("https://www.lagou.com/")
        driver = self.http.driver

        def byXpath(xpath):
            # Shorthand for the driver lookup used on every step below.
            return driver.find_element_by_xpath(xpath)

        cityChoice = findElement(driver, By.XPATH,
                                 '//*[@id="changeCityBox"]/ul/li[2]/a')
        if cityChoice:  # the city selection pop-up is showing
            cityChoice.click()

        byXpath('//*[@id="lg_tbar"]/div/ul/li[1]/a').click()
        byXpath('/html/body/section/div[1]/div[1]/ul/li[1]').click()
        # NOTE(review): account name and password are hard-coded here; consider
        # moving them to configuration.
        accountInput = byXpath(
            '/html/body/section/div[1]/div[2]/form/div[1]/input')
        accountInput.send_keys("*****@*****.**")
        passwordInput = byXpath(
            '/html/body/section/div[1]/div[2]/form/div[2]/input')
        passwordInput.send_keys("pwdh8f_lagou")
        byXpath('/html/body/section/div[1]/div[2]/form/div[5]/input').click()

        time.sleep(10)
        num = 0
        while num < 300000:
            # resume page
            byXpath('//*[@id="lg_tbar"]/div/ul/li[2]/a').click()
            time.sleep(5)
            # open the work-experience editor
            byXpath(
                '//*[@id="workExperience"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div/em'
            ).click()
            time.sleep(5)
            # submit the form
            byXpath('//*[@id="currentUpJobForm"]/div/div[6]/input').click()
            # wait "update.time" seconds (default 300) before the next round
            time.sleep(gConfig.get("update.time", 300))
            logDebug("update %s" % num)
            num += 1
예제 #11
0
    def downOneList2(self, url, content, listConf, listItemConf,
                     resultHandler):
        """
        Parse one list page and hand every extracted row to ``resultHandler``.

        :param url: only used for logging
        :param content: page content
        :param listConf: list configuration; ``TAG_LIST_ITEMS`` holds the
                         selector for the rows
        :param listItemConf: list item configuration
        :param resultHandler: handler whose ``handle(result)`` receives each row
        :return: error:-1 (no rows matched), ok:0
        """
        # Parse the content from an in-memory file object with the configured
        # parser and wrap the document root for selector queries.
        tree = etree.parse(StringIO(content), self.parser)
        root = pq(tree.getroot())
        rows = root(listConf[TAG_LIST_ITEMS])

        if not rows or len(rows) <= 0:
            # Nothing matched the row selector: report the url and give up.
            logError("%s !no lists" % url)
            return -1

        for row in rows:
            try:
                result = {}
                # Fill ``result`` from this row according to listItemConf.
                self.extractor.getResult(pq(row), listItemConf, result)
                logDebug(getPrintDict(result))
                resultHandler.handle(result)
            except Exception:
                # A broken row is logged; the remaining rows are still handled.
                logException()
        return 0
예제 #12
0
파일: fanli.py 프로젝트: Curiou/redundancy
 def updateResume(self):
     """
     Log in to liepin.com through the selenium driver and keep clicking the
     refresh link; when the link cannot be found, log in again from scratch.

     Usage: python spider/sample/fanli.py update.time=600 updateResume
     :return:
     """
     loginTab = '//*[@id="home"]/div[2]/div[1]/div/div/section[2]/form/div[3]/p/a'
     accountInput = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[1]/input'
     passwordInput = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[2]/input'
     submitButton = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/input[3]'
     refreshLink = '// *[ @ id = "home"] / div[3] / div[2] / div[1] / div[4] / ul / li[1] / a'

     while True:
         self.http.get("https://www.liepin.com/sh/")
         driver = self.http.driver
         driver.find_element_by_xpath(loginTab).click()
         time.sleep(5)
         # NOTE(review): account name and password are hard-coded here;
         # consider moving them to configuration.
         driver.find_element_by_xpath(accountInput).send_keys("*****@*****.**")
         driver.find_element_by_xpath(passwordInput).send_keys("pwdh8f_liepin")
         driver.find_element_by_xpath(submitButton).click()
         time.sleep(10)

         num = 0
         while num < 300000:
             link = findElement(driver, By.XPATH, refreshLink)
             if not link:
                 # Leaving the inner loop restarts the login sequence.
                 logError("try again!")
                 break
             link.click()
             # wait "update.time" seconds (default 300) before the next round
             time.sleep(gConfig.get("update.time", 300))
             self.http.get("https://c.liepin.com/")
             logDebug("update %s" % num)
             num += 1
예제 #13
0
파일: ioUtil.py 프로젝트: Curiou/redundancy
                if (os.path.isdir(d) == True):
                    deleteNullDir(d)
        if not os.listdir(dirr):
            info = 'del empty dir: %s' % dirr
            try:
                os.rmdir(dirr)
                logInfo("done:%s" % info)
            except Exception, e:
                logException("fail-%s" % info)

    ds = list(os.walk(root))  # 获得所有文件夹的信息列表
    stopTime = time.time() - before
    for d in ds:  # 遍历该列表
        try:
            from superbase.utility.logUtil import logDebug
            logDebug("processing-%s" % d[0])
            os.chdir(d[0])  # 进入本级路径,防止找不到文件而报错
            if d[2] != []:  # 如果该路径下有文件
                for x in d[2]:  # 遍历这些文件
                    try:
                        lastmodifytime = os.stat(x).st_mtime  # 获取文件创建时间
                        if lastmodifytime < stopTime:
                            try:
                                info = "del-%s/%s" % (d[0], x)
                                os.remove(x)  # 删掉
                                logInfo("done:%s" % info)
                            except Exception, e:
                                logException("fail:%s" % info)
                    except Exception, e:
                        logException()
        except Exception, e:
예제 #14
0
def runProcess(cmd,
               outInfo=None,
               maxOutInfoNum=1000,
               debug=False,
               redirect=False,
               exitInfo=None):
    """
    Run ``cmd`` through the shell and consume its combined stdout/stderr
    line by line.

    :param cmd: shell command line
    :param outInfo: optional list that receives the console output lines
    :param maxOutInfoNum: when > 0, once more than this many lines have been
                          appended since the last trim, ``outInfo`` is cut
                          down to its last line
    :param debug: debug mode only logs the command line and does not run it
    :param redirect: when true and ``cmd`` contains ">", the text after the
                     last ">" is taken as an output file (made absolute), is
                     removed from the command, and every output line is
                     written to that file
    :param exitInfo: stop reading as soon as a line contains this text
    :return: ``outInfo`` (None in debug mode); exceptions are logged, not raised
    """
    # cmd += "\n" #what the hell use it?
    from superbase.utility.logUtil import logInfo
    # Bound up-front: it used to stay undefined when redirect was requested
    # but the command contained no ">".
    redirectFile = None
    try:
        if redirect:
            idx = cmd.rfind(">")
            if idx > 0:
                outfile = os.path.abspath(cmd[idx + 1:].strip())
                logInfo("redirect-file=%s" % outfile)
                from superbase.utility.ioUtil import mkdir
                mkdir(os.path.dirname(outfile))
                redirectFile = open(outfile, "w")
                cmd = cmd[:idx]
        logDebug("\n%s the cmd is %s\n" % (timeUtil.getCurTime(), cmd))
        if debug:
            return
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=True)
        lineNum = 0
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if exitInfo and line.find(exitInfo) >= 0:
                break
            # Write to the redirect file whether or not the caller collects
            # the output; previously nothing was written when outInfo was None.
            if redirectFile:
                redirectFile.write(line)
            if outInfo is not None:
                outInfo.append(line)
                lineNum += 1
                if maxOutInfoNum > 0 and lineNum > maxOutInfoNum:
                    # Keep only the most recent line to bound memory use.
                    del outInfo[:-1]
                    lineNum = 0
                    if redirectFile:
                        redirectFile.flush()

        logDebug("process-done:%s" % cmd)
    except Exception:
        from superbase.utility.logUtil import logException
        logException()
    finally:
        # Close the file on every exit path (debug return, error, normal end).
        if redirectFile:
            redirectFile.close()

    return outInfo
예제 #15
0
 def showProcess(self, exe="python"):
     """Log whatever the module-level ``showProcess`` returns for ``exe``."""
     processInfo = showProcess(exe)
     logDebug(processInfo)