def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('','')):
    '''Build the result record (UrlItem fields) for one fetched URL.

    requestInfo: mapping describing the finished request; the keys read
        here are nettime, error, http_code, body, redirects,
        request_headers and response_headers.
    execute: accepted for interface compatibility; not used in this function.
    urlInfo: mapping for the URL being processed; depth and url are read.
    fileInfo: (file_name, file_path) pair for the stored copy of the body.

    Returns a dict with status 3 when the request carries an error or a
    non-200 http_code, a dict with status 2 otherwise, or False if
    building the record raises.
    '''
    try:
        netTime = requestInfo['nettime']
        failed = bool(requestInfo['error']) or requestInfo['http_code'] != 200

        if failed:
            # Failure record: only the diagnostic fields are filled in.
            return {
                'nettime': netTime,
                'status': 3,
                'http_code': requestInfo['http_code'],
                'error': repr(requestInfo['error']),
                'end_at': now_format(),
                'depth': urlInfo['depth'],
            }

        # Success record: digests, serialized headers and stored-file info.
        return {
            'nettime': netTime,
            'status': 2,
            'end_at': now_format(),
            'depth': urlInfo['depth'],
            'md5_url': md5(urlInfo['url']),
            'md5_body': md5(requestInfo['body']),
            'redirects': json.dumps(requestInfo['redirects'], ensure_ascii=False),
            'http_code': requestInfo['http_code'],
            'request_headers': json.dumps(requestInfo['request_headers'], ensure_ascii=False),
            'response_headers': json.dumps(requestInfo['response_headers'], ensure_ascii=False),
            'file_name': fileInfo[0],
            'file_path': fileInfo[1],
        }
    except Exception as e:
        logger.exception(e)
        return False
def mg_spiderjsurl_save(record, urlid=None):
    '''Save a spiderjs URL record to the SQL table and to its mgdb mirror.

    record: mutable mapping of field values. Its 'id' key (if any) is
        removed, and 'method', 'md5_url' and 'md5_body' may be added, so
        the caller's mapping is modified in place.
    urlid: id of the row to update. When falsy, record['id'] is used
        instead; when that is missing too, a new row is inserted.

    Returns the row id, or False when there is neither an id nor a 'url'
    to create a row from.
    '''
    # An explicit urlid wins over record['id']; either way the key is
    # dropped so it is never written back as a column value.
    if 'id' in record:
        recordId = record.pop('id')
        if not urlid:
            urlid = recordId

    if not urlid and 'url' not in record:
        return False

    # New rows default to GET; digests are refreshed whenever their
    # source value is supplied.
    if not urlid and 'method' not in record:
        record['method'] = 'GET'
    if 'url' in record:
        record['md5_url'] = md5(record['url'])
    if 'body' in record:
        record['md5_body'] = md5(record['body'])

    dbFields = [
        'id', 'url', 'md5_url', 'referer', 'method', 'status',
        'start_at', 'end_at', 'create_at',
    ]
    # These names must match the record keys exactly, otherwise the value
    # is silently replaced by '' in the mirrored document.
    mongoFields = [
        'id', 'url', 'md5_url', 'referer', 'method', 'http_code',
        'request_headers', 'response_headers', 'redirects', 'body',
        'md5_body', 'parse_result', 'error', 'status', 'start_at',
        'end_at', 'create_at',
    ]

    insertRow = {}
    for field in dbFields:
        if field in record:
            insertRow[field] = record[field]

    if insertRow:
        if urlid:
            db.updatebyid('spiderjs_url', insertRow, urlid)
        else:
            urlid = db.insert('spiderjs_url', insertRow)

    jsRow = db.fetchone("select * from spiderjs_url where id=:id limit 1", {'id': urlid})
    mongoRow = mgdb.c_getbyid('spiderjsurl', urlid)
    if not mongoRow:
        mongoRow = {}

    # Later sources override earlier ones: stored document, then the
    # incoming record, then the SQL row that was just written.
    for field in mongoFields:
        value = ''
        for source in (mongoRow, record, jsRow):
            if field in source.keys():
                value = source[field]
        mongoRow[field] = value

    mgdb.spiderjsurl_save(mongoRow)
    return urlid
def crawljs(taskInfo):
    '''Crawl one spiderjs task: fetch the URL, parse it, store the result.

    taskInfo: mapping for the task; status, url and id are read.

    Returns True when the task was already processed (status other than
    0 or 1) or when the result was saved, and False when the save
    reports failure or any step raises (the exception is logged).
    '''
    try:
        # Already crawled: do not crawl again.
        if taskInfo['status'] not in (0, 1):
            return True

        # Fetch the page source.
        requestInfo = spiderRequest(taskInfo['url'])

        # Keep only the parsed entries that yield a usable URL row.
        parseResults = []
        for record in _parseForJs(taskInfo['url']):
            urlRow = _parseForUrl(record)
            if urlRow:
                parseResults.append(urlRow)

        updateRow = {}
        updateRow['id'] = taskInfo['id']
        updateRow['http_code'] = requestInfo['http_code']
        updateRow['response_headers'] = json.dumps(
            requestInfo['response_headers'], ensure_ascii=False)
        updateRow['body'] = requestInfo['body']
        updateRow['md5_body'] = md5(requestInfo['body'])
        updateRow['parse_result'] = json.dumps(parseResults, ensure_ascii=False)
        updateRow['status'] = 2
        updateRow['end_at'] = getTime('%Y-%m-%d %H:%M:%S')

        # Save the result; the save function returns False when it had
        # nothing it could store.
        saved = mg_spiderjsurl_save(updateRow)
        return saved is not False
    except Exception as e:
        logger.exception(e)
        return False
def _urls_uniq(urlItems): '''对格式化后的URL去重''' urlsUniq = {} for i in urlItems: key = md5(i['url'] + i['method'] + str(i['invisible'])) if key not in urlsUniq.keys(): urlsUniq[key] = i return list(urlsUniq.values())
def download(requestInfo, urlInfo, execute, fileType = 'html'):
    '''Store a fetched body as a file and register it in the static index.

    requestInfo: mapping for the finished request; body and url are read.
    urlInfo: mapping for the URL being processed; id is read.
    execute: mapping for the current run; domain, id and task_id are read.
    fileType: 'html' stores the body as a page; any other value stores it
        as a static resource recorded with file type 'img'.

    Returns (file_name, file_key). When a file with the same body digest
    is already registered for the domain, its stored pair is returned
    without uploading again. Any exception is logged and ('', '') is
    returned.
    '''
    try:
        md5Body = md5(requestInfo['body'])

        # Identical content already stored for this domain: reuse it.
        stored = mgdb.static_get(execute['domain'], md5Body)
        if stored:
            return (stored['file_name'], stored['file_key'])

        localfile = '%s/%s/%s.tmp' % (PATH_TMP_UPLOAD, execute['domain'], md5Body)
        localDir = dirname(localfile)
        if not exists(localDir):
            mkdirs(localDir)

        if fileType == 'html':
            filename = "%s_%s.html" % (execute['id'], urlInfo['id'])
            filekey = 'html/%s/%s/%s_%s.html.%s' % (
                execute['domain'], execute['task_id'], execute['id'],
                urlInfo['id'], md5Body)
            storedType = 'html'
        else:
            url = requestInfo['url']
            filename = basename(url)
            # Drop the scheme so the key starts at the host. A fixed
            # seven-character slice is only right for "http://"; it leaves
            # a stray "/" for https and eats real characters otherwise.
            urlPath = url
            lowered = url.lower()
            for prefix in ('http://', 'https://', '//'):
                if lowered.startswith(prefix):
                    urlPath = url[len(prefix):]
                    break
            filekey = "static/%s/%s.%s" % (execute['domain'], urlPath, md5Body)
            storedType = 'img'

        fwriteBin(localfile, requestInfo['body'])
        ydfs_upload(filekey, localfile)
        mgdb.c_insert('static', _formatStatic(
            execute['domain'], requestInfo['url'], filename, filekey,
            storedType, md5Body))
        return (filename, filekey)
    except Exception as e:
        logger.exception(e)
        return ('', '')
def _formatOutlink(execute, referer, url, md5Body='', invisible=0):
    '''Build the record describing one outbound link.

    execute: mapping for the current run; task_id, id and domain are read.
    referer: URL of the page the link was found on.
    url: the outbound link itself.
    md5Body: body digest stored with the link (empty by default).
    invisible: value stored under the 'invisible' key (0 by default).

    Returns a dict with the link, md5 digests of referer and url, an empty
    'filterwords' value and the current date/time stamps.
    '''
    outlink = {}

    # Identity of the run that found the link.
    outlink['task_id'] = execute['task_id']
    outlink['execute_id'] = execute['id']
    outlink['domain'] = execute['domain']

    # The link and where it came from.
    outlink['referer'] = referer
    outlink['md5_referer'] = md5(referer)
    outlink['url'] = url
    outlink['md5_url'] = md5(url)
    outlink['md5_body'] = md5Body
    outlink['invisible'] = invisible
    outlink['filterwords'] = ''

    # Timestamps.
    outlink['date'] = getDate()
    outlink['create_at'] = now_format()
    outlink['update_at'] = now_format()
    return outlink
def getToken_key(key):
    '''Issue a new token for the app whose unique_key equals key.

    The token and its expiry are written to the app row and also returned.

    Returns {'token': ..., 'expired': ...}, or False when no app matches.
    '''
    import binascii
    import os

    appObj = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not appObj:
        return False

    # Expiry is getTime() + 7200, formatted as a timestamp string
    # (presumably seconds, i.e. two hours ahead; confirm against getTime).
    tokenExpired = getTime("%Y-%m-%d %H:%M:%S", (getTime() + 7200))

    # The token must not be derivable from the expiry time: use 16 bytes
    # from the OS random source, hex-encoded to 32 characters.
    token = binascii.hexlify(os.urandom(16)).decode('ascii')

    db.updatebyid('app', {'token': token, 'token_expired': tokenExpired}, appObj['id'])
    return {'token': token, 'expired': tokenExpired}
def _checkUrlExists(executeid, url, method, invisible):
    '''Check whether a URL has already been recorded for a run.

    The URL is identified by the md5 of method + '-' + url + str(invisible)
    and tracked in the redis hash 'exists_<executeid>'.

    Returns True when the entry is already present. Otherwise the entry
    is added, the hash expiry is set to 86400, and False is returned.
    '''
    hashName = 'exists_%s' % executeid
    signature = ''.join([method, '-', url, str(invisible)])
    digest = md5(signature)

    alreadySeen = redis.hexists(hashName, digest)
    if not alreadySeen:
        # First sighting: remember it and refresh the hash expiry.
        redis.hset(hashName, digest, signature)
        redis.expire(hashName, 86400)
    return True if alreadySeen else False
def _formatStatic(domain, url, filename, filekey, filetype, md5Body):
    '''Build the record describing one stored static file.

    Returns a dict holding the arguments under the keys domain, url,
    file_name, file_key, file_type and md5_body, plus the md5 of url and
    the creation/update timestamps.
    '''
    urlDigest = md5(url)
    createdAt = now_format()
    updatedAt = now_format()

    staticRow = dict(domain=domain, url=url)
    staticRow['file_name'] = filename
    staticRow['file_key'] = filekey
    staticRow['file_type'] = filetype
    staticRow['md5_url'] = urlDigest
    staticRow['md5_body'] = md5Body
    staticRow['create_at'] = createdAt
    staticRow['update_at'] = updatedAt
    return staticRow
def execCasper(content=None):
    '''Write content to a script file and run it with casperjs.

    The file is named '<PATH_TMP_NODEJS>/<date>_<md5 of content>' and is
    left on disk after the run.

    Returns the decoded combined stdout/stderr of the process, or False
    when any step raises (the exception is logged).
    '''
    try:
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'), md5(content))
        write(filename, content)

        # An argument list avoids shell parsing of the script path.
        child = Popen(['casperjs', filename], close_fds=True, bufsize=-1,
                      stdout=PIPE, stderr=STDOUT)
        # communicate() drains the pipe, closes it and waits for the child,
        # so no zombie process or open descriptor is left behind.
        rawOutput, _ = child.communicate()
        return rawOutput.decode()
    except Exception as e:
        logger.exception(e)
        return False