예제 #1
0
파일: aliyun.py 프로젝트: Curiou/redundancy
    def listDir(self, dir1, timeRange=None):
        """
        List the object keys stored under an OSS prefix, optionally
        restricted to a time window.

        The listing is attempted up to three times, sleeping one second
        after every failed attempt.

        :param dir1: remote directory, e.g.
            downData/www_tianyancha_com/detail/company_1
        :param timeRange: optional "begin-end" window, fmt
            2018020100-2018020200. ``begin`` is inclusive and ``end`` is
            exclusive; both are compared as strings with the
            yyyy/mm/dd/hh segment found in each key.
        :return: list of matching keys; an empty list when every attempt
            failed
        """
        MAX_TRY = 3

        def getNameTS(name):
            # Collapse the "yyyy/mm/dd/hh" path segment into "yyyymmddhh";
            # None when the key carries no such segment.
            m = re.search(r"(\d{4})/(\d{2})/(\d{2})/(\d{2})", name)
            return "".join(m.groups()) if m else None

        tryTime = 0
        while tryTime < MAX_TRY:
            # Start every attempt from an empty list so that a listing which
            # failed half way through does not leave duplicate keys behind.
            objects = []
            try:
                # Keep dir1 untouched so that a retry pre-processes the
                # caller's value again rather than an already processed one.
                prefix = preProcessDir(dir1)
                beginTime, endTime = timeRange.split("-") if timeRange else (
                    None, None)
                import oss2
                # Walk every object of the bucket found under the prefix.
                for idx, object_info in enumerate(
                        oss2.ObjectIterator(self.oss, prefix=prefix)):
                    key = object_info.key
                    if beginTime and endTime:
                        ts = getNameTS(key)
                        # A key without a timestamp cannot fall inside the
                        # window, so it is skipped instead of aborting the
                        # whole listing.
                        if ts is None or ts < beginTime or ts >= endTime:
                            continue
                    objects.append(key)
                    logDebug("%s:%s" % (idx, key))
                return objects
            except Exception:
                logException()
            tryTime += 1
            time.sleep(1)
        # Callers iterate the result, so hand back an empty list rather than
        # an implicit None once all attempts are used up.
        return []
예제 #2
0
    def extract(self):
        """
        Export the Liepin records whose name contains the Chinese word for
        Shanghai into a single "##"-separated text file.

        Every gzip file matched below the hard-coded download root is read
        line by line, each line is parsed as JSON and matching records are
        appended to E:/shanghai51/shanghailiepin.txt. A line that fails to
        parse or to be written is logged and skipped. The per-file count and
        the running total are logged after each file.

        :return: None
        """
        srcRoot = "C:/tempAliyun/downData/www_liepin_com"
        outRoot = "E:/shanghai51"
        header = ("name", "address", "website", "type", "name2", "info")

        with codecs.open("%s/shanghailiepin.txt" % outRoot,
                         "w",
                         encoding="utf-8") as out:
            out.write("%s##%s##%s##%s##%s##%s\n" % header)
            gzFiles = glob.glob("%s/*/*/*/*/*/*/*/*.gz" % srcRoot)
            total = 0
            for idx, gzName in enumerate(gzFiles):
                num = 0
                with gzip.open(gzName, 'rb') as src:
                    for line in src:
                        try:
                            rec = json.loads(line)
                            if u"上海" not in rec.get("name", ""):
                                continue
                            row = (rec.get("name", ""),
                                   rec.get("address", ""),
                                   rec.get("website", ""),
                                   rec.get("type", ""),
                                   rec.get("businessLicense", ""),
                                   str2line(rec.get("info", "")))
                            out.write(u"%s##%s##%s##%s##%s##%s\n" % row)
                            num += 1
                        except Exception:
                            logException()
                total += num
                logInfo("%s:num=%s,total=%s" % (idx, num, total))
예제 #3
0
파일: aliyun.py 프로젝트: Curiou/redundancy
    def readJsonFromAliyun(self, aliDir, timeRange=None, dest=None):
        """
        Generator yielding one parsed JSON value per line of every gzip file
        listed under ``aliDir``.

        Usage:
        lines = AliYun().readJsonFromAliyun('downData/www_tianyancha_com/detail/company_1', '2018020100-2018020300')
        for line in lines:
            logDebug(line['name'])

        A file that is not yet present below ``dest`` is fetched with
        ``downFile`` first. An error while downloading or reading one file is
        logged and the generator moves on to the next file.

        :param aliDir: downData/www_tianyancha_com/detail/company_1
        :param timeRange: fmt:2018020100-2018020200
        :param dest: local download root; when omitted (recommended) the
            value of ``self._getDefaultDownRoot()`` is used
        :return: generator of decoded JSON values
        """

        if not dest:
            dest = self._getDefaultDownRoot()

        # Guard against listDir() handing back None instead of a list when
        # the remote listing fails.
        files = self.listDir(aliDir, timeRange) or []
        for fileName in files:
            try:
                fileName2 = os.path.join(dest, fileName)
                if not os.path.exists(fileName2):
                    self.downFile(fileName, dest)
                with gzip.open(fileName2, 'rb') as f:
                    for line in f:
                        yield json.loads(line)

            except Exception:
                logException()
예제 #4
0
    def report(title, content, t_address, jobName=None, batch=None, needLog=1):
        """
        Send a report mail, optionally attaching the job and/or node log.

        :param title: mail subject
        :param content: mail body
        :param t_address: recipient address
        :param jobName: passed to getLogFileName to locate the log files
        :param batch: passed to getLogFileName to locate the log files
        :param needLog: bit mask - 0 no log, 1 job log file, 2 node log,
            3 both
        :return: None
        """
        attaches = []

        def _attach(path):
            # Attach one log file if it exists; anything above 500k is
            # gzipped first and the gzipped file is attached instead.
            if not os.path.exists(path):
                return
            size = os.path.getsize(path)
            if size > 1024 * 500:
                path = gzipOneFile(path)
            attaches.append((os.path.split(path)[1], path))
            logInfo("add one attach%s,size=%s" % (path, size))

        if needLog:
            try:
                jobLog, nodeLog = getLogFileName(batch, jobName)
                logInfo("jobLog=%s\nnodeLog=%s" % (jobLog, nodeLog))
                for bit, path in ((1, jobLog), (2, nodeLog)):
                    if needLog & bit:
                        _attach(path)
            except Exception:
                # A problem while collecting logs must not prevent the mail.
                logException()
        # Send the mail with whatever attachments were collected.
        Mail.sendEmail(title, content, t_address, attaches=attaches)
예제 #5
0
    def saveSyncPoint(self, result, sync2Remote=False):
        """
        Persist the sync point carried inside ``result`` and strip it out.

        When ``self.index`` is set and ``result`` holds a CFG_DOWN_SYNCINFO
        entry, that entry is removed from ``result``, written to
        ``self.localFile`` and, periodically, pushed with
        ``self.syncToRemote``. Any error is logged and does not propagate.

        :param result: download result, modified in place
        :param sync2Remote: force a remote sync for this save. Otherwise the
            remote sync happens on the first save and then once every
            CFG_DOWN_SYNCINTERVAL saves (default 5).
        :return: ``result`` without its CFG_DOWN_SYNCINFO entry; ``result``
            is returned unchanged when ``self.index`` is not set
        """
        if self.index:
            try:
                index = self.index
                # Only act when the result actually carries sync info.
                if CFG_DOWN_SYNCINFO in result:
                    # Take the sync info out of the result.
                    syncInfo = result.pop(CFG_DOWN_SYNCINFO)
                    data = {
                        "id": md5(index),
                        "idx": index,
                        "syncInfo": syncInfo,
                        "upTime": getTimestamp()  # timestamp of this save
                    }
                    json2File(self.localFile, data)
                    self.saveNum += 1
                    # (n - 1) % k == 0 matches the former n % k == 1 for
                    # every k >= 2 and also works for an interval of 1,
                    # where n % 1 == 1 could never be true.
                    if sync2Remote or (
                            (self.saveNum - 1) %
                            gConfig.get(CFG_DOWN_SYNCINTERVAL, 5) == 0):
                        self.syncToRemote(data)

            except Exception:
                logException()

        # Returned on every path: callers go on to store this value, so it
        # must not silently become None when no index is configured.
        return result
예제 #6
0
    def resetDb(self):
        """
        (Re)open the MySQL connection described by ``self.curCfg``.

        Up to ten attempts are made. Each attempt first closes the current
        connection, then connects with charset utf8, using a DictCursor when
        ``self.dictCursor`` is truthy. A failed attempt is logged together
        with the number of attempts that were left.

        :return: None
        """
        for remaining in range(10, 0, -1):
            try:
                self.close()
                database, host, user, passwd, port = self.curCfg
                # Not used below; kept because the import itself may matter
                # - TODO confirm and drop.
                from superbase.globalData import gConfig

                connArgs = dict(host=host,
                                user=user,
                                passwd=passwd,
                                db=database,
                                port=port,
                                charset="utf8")
                if self.dictCursor:
                    from MySQLdb.cursors import DictCursor
                    connArgs["cursorclass"] = DictCursor
                self.conn = MySQLdb.connect(**connArgs)
                if self.conn:
                    return

            except Exception:
                logException("tryTime-%s" % remaining)
예제 #7
0
def createDb(dbNameKey, dbParams=None):
    """
    Create a database object and register it in gTop under ``dbNameKey``.

    Nothing happens when CFG_DB_DISABLE is set in gConfig (deprecated
    switch) or when no connection parameters are available. Errors raised
    while creating the database object are logged and not propagated.

    :param dbNameKey: config key of the database; currently db.monitor
        (data monitoring) and db.business (business data)
    :param dbParams: connection parameters. When omitted they are taken
        from AccountManager().getAccount(dbNameKey); DEV and TEST have
        defaults, ONLINE gets them assigned through jobManager.
    :return: None
    """
    from superbase.globalData import gConfig, gTop
    # CFG_DB_DISABLE switches database use off (deprecated).
    if gConfig.get(CFG_DB_DISABLE, 0):
        return
    # Configured database name for this key.
    dbName = gConfig.get(dbNameKey)

    try:
        if dbParams:
            db_params = dbParams
        else:
            db_params = AccountManager().getAccount(dbNameKey)
        if not db_params:
            return

        useDictCursor = gConfig.get(CFG_DB_DICTCURSOR, 0)
        db = createDb2(dbName, db_params, dictCursor=useDictCursor)
        # Publish the database object through the global registry.
        gTop.set(dbNameKey, db)
    except Exception:
        logException()
예제 #8
0
    def downLists(self, listConf, listItemConf, resultHandlerClass, urlMgr):
        """
        Download every list page supplied by ``urlMgr``.

        Each page url is handed to ``self.downOneList`` together with a fresh
        ``resultHandlerClass()`` instance. After every page that went through
        without an exception, ``self.checkDownStatus`` is asked for the
        current status and the loop stops as soon as IS_ERROR reports an
        error. An exception on one page is logged and the next page is tried.

        :param listConf: list configuration
        :param listItemConf: list item configuration
        :param resultHandlerClass: class whose instances handle the results
        :param urlMgr: url manager providing the page urls
        :return: the last status returned by ``checkDownStatus``, or 0 when
            no page completed
        """
        logInfo("%s_begin downLists" % (getTimestamp()))
        err = 0
        done = 0
        for pageUrl in urlMgr.pageUrls(listConf):
            try:
                logDebug(pageUrl)
                self.downOneList(pageUrl, listConf, listItemConf,
                                 resultHandlerClass())
                done += 1
                # Status check is based on the number of completed pages.
                err = self.checkDownStatus(done)
                if IS_ERROR(err):
                    break
            except Exception:
                logException()
        return err
예제 #9
0
        def __decorator(*params):
            """
            Call ``self.jobBegin()`` and then the wrapped ``func``.

            An exception raised by either call is stored in
            ``self.jobError`` and logged; the wrapper then returns None.

            :param params: positional arguments forwarded to ``func``
            :return: the result of ``func``, or None after an error
            """
            try:
                self.jobBegin()
                return func(*params)
            except Exception as e:
                self.jobError = e
                logException()
예제 #10
0
def asyncRun(cmd, shell=True):
    """
    Start ``cmd`` as a child process without waiting for it to finish.

    :param cmd: command to run
    :param shell: forwarded to subprocess.Popen
    :return: None; a failure to start the process is logged
    """
    try:
        logDebug("asyncRun:%s" % cmd)
        # NOTE(review): stdout/stderr go to a pipe, but the Popen handle is
        # dropped, so nothing ever reads that pipe - confirm this is wanted.
        subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         shell=shell)
    except Exception:
        logException()
예제 #11
0
 def __decorator(*params):
     """
     Call the wrapped ``func`` up to ``maxTry`` times.

     Every failed attempt is logged and followed by a ``tsleep`` second
     pause.

     :param params: positional arguments forwarded to ``func``
     :return: the first successful result, or None when every attempt
         failed
     """
     tryTime = 0
     while tryTime < maxTry:
         try:
             return func(*params)
         except Exception:
             # Building the log message must not raise on its own: guard
             # against an empty argument list and wrap the value in a tuple
             # so that a tuple argument does not break the formatting.
             firstArg = params[0] if params else None
             logException("url=%s" % (firstArg,))
         tryTime += 1
         time.sleep(tsleep)
예제 #12
0
def reloadModule(name):
    """
    Reload an already imported module.

    :param name: module name, looked up in sys.modules
    :return: None; any error (including an unknown name) is logged
    """
    try:
        try:
            # Python 3.4+ ships reload() in importlib.
            from importlib import reload as _reload
        except ImportError:
            # Python 2: importlib has no reload(), use the builtin.
            _reload = reload
        _reload(sys.modules[name])
    except Exception:
        logException()
예제 #13
0
파일: aliyun.py 프로젝트: Curiou/redundancy
    def downDir(self, dir1, dest=None, timeRange=None):
        """
        Download every object listed under a remote directory.

        :param dir1: remote directory, passed to ``listDir``
        :param dest: local download root; defaults to
            ``self._getDefaultDownRoot()``
        :param timeRange: optional time window, passed to ``listDir``
        :return: None; a failure on one object is logged and the remaining
            objects are still downloaded
        """
        if not dest:
            dest = self._getDefaultDownRoot()

        # Guard against listDir() handing back None instead of a list when
        # the remote listing fails.
        objects = self.listDir(dir1, timeRange) or []
        for obj in objects:
            try:
                self.downFile(obj, dest)
            except Exception:
                logException()
예제 #14
0
def callFunction(func, argv):
    """
    Call ``func`` with the positional arguments in ``argv``.

    :param func: callable to invoke
    :param argv: sequence of positional arguments
    :return: the result of ``func``; None when the call raised (the error
        is logged)
    """
    try:
        # Argument unpacking replaces apply(), which Python 3 removed.
        return func(*argv)
    except Exception:
        logException()
예제 #15
0
 def close(self):
     """
     Close the database connection, if there is one, and forget it.

     An error while closing is logged; ``self.conn`` is set to None in
     either case.

     :return: None
     """
     try:
         conn = self.conn
         if conn:
             conn.close()
     except Exception:
         logException()
     self.conn = None
예제 #16
0
 def closeCursor(self, cur):
     """
     Close a cursor.

     :param cur: cursor to close; a falsy value is ignored
     :return: None; an error while closing is logged
     """
     try:
         if cur:
             cur.close()
     except Exception:
         logException()
예제 #17
0
def applyFunc(obj, strFunc, arrArgs):
    """
    Invoke a method by name through ``callFunc``.

    :param obj: object to use
    :param strFunc: method name
    :param arrArgs: arguments
    :return: whatever ``callFunc`` returns; None when it raised (the error
        is logged)
    """
    try:
        return callFunc(obj, strFunc, arrArgs)
    except Exception:
        # "except Exception" instead of a bare except, so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        logException()
예제 #18
0
def safeReg1(reg, str, tag):
    """
    Return the first capture group of ``reg`` searched in ``str``.

    :param reg: compiled pattern
    :param str: text to search
    :param tag: label included in the log message, for debugging
    :return: group 1 of the first match; None when there is no match or
        the search fails (the failure is logged)
    """
    try:
        found = reg.search(str)
        # No match leaves found as None; the resulting error is logged below.
        return found.group(1)
    except Exception:
        logException("regError:%s--%s" % (tag, str))
예제 #19
0
    def handle(self, result):
        """
        Pre-process a result, record its sync point and save it to a file.

        Nothing is saved when ``preProcess`` returns a falsy value. Any error
        is logged and not propagated.

        :param result: raw result to handle
        :return: None
        """
        try:
            result = self.preProcess(result)
            if not result:
                return
            # saveSyncPoint hands back the data that is to be stored.
            toStore = self.sync.saveSyncPoint(result)
            target = os.path.join(self.getSavePath(), self.getFileName())
            self.saveFile(target, toStore)
        except Exception:
            logException()
예제 #20
0
파일: aliyun.py 프로젝트: Curiou/redundancy
 def upDir(self, dir1, gzfirst=True):
     """
     Upload every gzip file found for a local directory.

     :param dir1: local directory
     :param gzfirst: when truthy, ``gzAllFiles(dir1)`` is run before the
         gzip files are collected
     :return: number of files returned by ``getGZFromDir``, whether or not
         each upload succeeded
     """
     if gzfirst:
         gzAllFiles(dir1)
     files = getGZFromDir(dir1)
     for path in files:
         try:
             # Every file is uploaded: the former size filter (files under
             # 30 bytes were to be skipped) had been disabled with
             # "if True:", leaving its else branch unreachable.
             self.upFile(path)
         except Exception:
             logException()
     return len(files)
예제 #21
0
 def getResult(parent, template, result):
     """
     Parse ``parent`` with every element of ``template`` and store the
     values in ``result``.

     :param parent: pyquery
     :param template: key:value, value is one of
         CssElement,ListElement,EmmbedElement
     :param result: dict receiving one entry per template key
     :return: None; an error is logged and stops the loop, entries stored
         before it are kept
     """
     try:
         for name, node in template.items():
             result[name] = node.parse(parent)
     except Exception:
         logException()
예제 #22
0
파일: aliyun.py 프로젝트: Curiou/redundancy
 def upFile(self, file1):
     """
     Upload one local file to OSS, retrying on failure.

     The remote key is the local path without ``self.prefix``, passed
     through ``self.preProcessPath``. Up to ten attempts are made with a
     one second pause after each failed one. An attempt counts as
     successful when ``put_object`` answers with status 200.

     :param file1: local file path
     :return: None, also when every attempt failed
     """
     dest = self.preProcessPath(file1[len(self.prefix):])
     MAX_TRY_UP = 10
     tryTime = 0
     while tryTime < MAX_TRY_UP:
         try:
             with open(file1, 'rb') as fileobj:
                 res = self.oss.put_object(dest, fileobj)
             if res and res.status == 200:
                 logInfo("ret=%s,file=%s" % (res.status, dest))
                 return
         except Exception:
             logException()
         tryTime += 1
         time.sleep(1)
예제 #23
0
 def handle(self, parent, css, attr):
     """
     Obtain a value through the callback ``self.func``.

     :param parent: parent node
     :param css: css path locating the data
     :param attr: attribute to read
     :return: the callback's result, or "" when the callback raised (the
         error is logged)
     """
     try:
         # The callback also receives the extra parameters kept on self.
         return self.func(parent, css, attr, self.otherParams)
     except Exception:
         logException()
     return ""
예제 #24
0
def safeEval(data, ret=None):
    """
    Evaluate a Python literal with ast.literal_eval.

    :param data: text holding a literal, e.g {'a':1}
    :param ret: value handed back when ``data`` is falsy or cannot be
        evaluated
    :return: the evaluated literal, otherwise ``ret``
    """
    try:
        if not data:
            return ret
        import ast
        # literal_eval only accepts literal structures, not arbitrary code.
        return ast.literal_eval(data)
    except Exception:
        from superbase.utility.logUtil import logException
        logException(data)

    return ret
예제 #25
0
def getPageSync(driver, url, key, val, timeout=0):
    """
    Load a page and, when a locator is given, wait for one element.

    :param driver: web driver used to load the page
    :param url: page to load
    :param key: locator strategy; when falsy no element is waited for
    :param val: locator value
    :param timeout: seconds to wait for the element; 0 means 10
    :return: the located element; None when no locator was given or
        anything failed (the failure is logged)
    """
    found = None
    try:
        driver.get(url)
        if key:
            waiter = WebDriverWait(driver, timeout or 10)
            found = waiter.until(lambda d: d.find_element(key, val))
    except Exception:
        logException()
    return found
예제 #26
0
def getElementSync(driver, keyVal, timeout=0):
    """
    Wait until an element can be located and return it.

    :param driver: web driver to search in
    :param keyVal: (key,val),eg.(By.ID,"id1")
    :param timeout: seconds to wait; 0 means 10
    :return: the located element, or None when the wait failed (the
        failure is logged)
    """
    element = None
    try:
        key, val = keyVal
        TIME_OUT_SEC = timeout if timeout else 10
        # Poll every 0.1 s until the element is found. The fourth
        # positional argument of WebDriverWait is ignored_exceptions; the
        # literal True that used to be passed there is not an exception
        # class, so it is dropped.
        wait = WebDriverWait(driver, TIME_OUT_SEC, poll_frequency=0.1)
        element = wait.until(lambda x: x.find_element(key, val))
    except Exception:
        logException()
    return element
예제 #27
0
 def handle(self, parent, css, attr):
     """
     Follow a link found in ``parent`` and return the data obtained from it.

     :param parent: parent node
     :param css: css path locating the url of the next-level page
     :param attr: attribute to read
     :return: the callback's result for the next-level page, or "" when
         anything raised (the error is logged)
     """
     try:
         # Extract the next-level url with the CSS selector ...
         nextUrl = Extractor.getValue(parent, css, attr)
         # ... and let the callback fetch that page's data.
         return self.func(nextUrl, self.conf, self.otherParams)
     except Exception:
         logException()
     return ""
예제 #28
0
 def handle(self, parent, css, attr):
     """
     Extract a value with a CSS selector and narrow it down with a regex.

     :param parent: parent node
     :param css: css path locating the data
     :param attr: attribute to read
     :return: group 1 of ``self.pat`` with surrounding whitespace
         stripped. When the pattern does not match, the error is logged
         and the extracted value is returned as it is; "" when the
         extraction itself failed.
     """
     value = ""
     try:
         value = Extractor.getValue(parent, css, attr)
         # No match makes search() return None; the resulting error is
         # handled below and leaves value as extracted.
         value = self.pat.search(value).group(1).strip()
     except Exception:
         logException("regex-error:value=%s css=%s attr=%s pat=%s" %
                      (value, css, attr, self.debugInfo))
     return value
예제 #29
0
파일: ioUtil.py 프로젝트: Curiou/redundancy
 def deleteNullDir(dirr):
     """
     Remove the empty directories below ``dirr``, and ``dirr`` itself when
     it ends up empty.

     Sub-directories are processed first, so a directory that only
     contained empty directories is removed as well.

     :param dirr: directory to clean; a path that is not an existing
         directory is ignored
     :return: None; a failed removal is logged
     """
     # Without this guard os.listdir() below would raise for a missing
     # path or a regular file.
     if not os.path.isdir(dirr):
         return
     for p in os.listdir(dirr):
         d = os.path.join(dirr, p)
         if os.path.isdir(d):
             deleteNullDir(d)
     # List again: the recursion above may just have emptied dirr.
     if not os.listdir(dirr):
         info = 'del empty dir: %s' % dirr
         try:
             os.rmdir(dirr)
             logInfo("done:%s" % info)
         except Exception:
             logException("fail-%s" % info)
예제 #30
0
 def safeExecute(self, sql, values=None):
     """
     Execute one SQL statement on a new cursor.

     :param sql: statement to execute
     :param values: parameters for the statement; left out of the call
         when falsy
     :return: the cursor the statement ran on, or None when anything raised
         (the error is logged)
     """
     try:
         cursor = self.conn.cursor()
         args = (sql, values) if values else (sql,)
         cursor.execute(*args)
         return cursor
     except Exception:
         logException()