def __init__(self, red, key, user):
    """Prepare a comment-crawl task from a serialized user payload.

    red:  shared redis connection
    key:  task key for this crawl
    user: JSON string carrying product_id / url / email / guid
    """
    self.red = red
    self.key = key

    payload = json.loads(user)
    self.product_id = payload.get('product_id')
    self.url = payload.get('url')
    self.email = payload.get('email')
    self.guid = payload.get('guid')

    self.spider_name = 'tb_comment'
    self.sql = SqlHelper()

    # spider args are the parsed payload plus the shared connections
    self.spargs = payload
    self.spargs['red'] = self.red
    self.spargs['sql'] = self.sql

    # one log file per product, under a lazily created log/ directory
    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % self.product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG)
def __init__(self, *a, **kw):
    """Initialise the spider: db helper, state, and its log directory."""
    super(RecipeDetail, self).__init__(*a, **kw)
    self.sql = SqlHelper()
    self.dir_name = 'log/%s' % self.name
    self.init()
    utils.make_dir(self.dir_name)
def GET(self):
    """Insert a proxy described by the query string into table <name>.

    Returns True when the row is present after the insert, False on any
    failure.
    """
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        proxy = Proxy()
        proxy.set_value(
                ip=inputs.get('ip'),
                port=inputs.get('port'),
                country=inputs.get('country', None),
                anonymity=inputs.get('anonymity', None),
                https=inputs.get('https', 'no'),
                speed=inputs.get('speed', -1),
                source=inputs.get('source', name),
        )
        utils.sql_insert_proxy(sql, name, proxy)

        # NOTE(review): values come straight from the request; the
        # string-built SQL below is injectable -- parameterize if
        # SqlHelper supports it.
        # BUGFIX: ip must be quoted -- '1.2.3.4' is not a numeric literal.
        command = "SELECT ip FROM {0} WHERE ip=\'{1}\' AND port={2}".format(
            name, inputs.get('ip'), inputs.get('port'))
        res = sql.query_one(command)
        # BUGFIX: success means the inserted row EXISTS; the original
        # returned `res is None`, i.e. False after a successful insert.
        return res is not None
    except Exception:
        pass
    return False
def __init__(self, *a, **kw):
    """Prepare the game-url spider: db helper, state, and log directory."""
    super(GameUrls, self).__init__(*a, **kw)
    self.sql = SqlHelper()
    self.dir_game = 'log/%s' % self.name
    self.init()
    utils.make_dir(self.dir_game)
def __init__(self, *a, **kw):
    """Shared spider state: url list, headers, timeout, db helper, log dir."""
    super(BaseSpider, self).__init__(*a, **kw)
    self.urls = []
    self.headers = {}
    self.timeout = 10
    self.dir_log = 'log/proxy/%s' % self.name
    self.sql = SqlHelper()
def __init__(self, *a, **kw):
    """Game-info spider setup: db helper, log directory, error counter."""
    super(GameInfo, self).__init__(*a, **kw)
    self.sql = SqlHelper()
    self.dir_game = 'log/%s' % self.name
    self.init()
    utils.make_dir(self.dir_game)
    self.error_count = 0
def __init__(self, name=None, **kwargs):
    """Base validator: db helper, log dir and default request settings."""
    super(Validator, self).__init__(name, **kwargs)
    self.dir_log = 'log/validator/%s' % self.name
    self.sql = SqlHelper()
    self.timeout = 10
    self.urls = []
    self.headers = None
    self.success_mark = ''
def randitem(spargs):
    """Pick a random product from the recommendation feed, then either crawl
    it (when not yet in the db) or replay the stored analysis rows to redis.

    spargs: dict of spider arguments; only 'guid' is read here.
    """
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    # NOTE(review): cookie names look like JD tracking cookies (__jd*) while
    # the request targets taobao.com -- confirm they are really needed here.
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }
    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)

    pattern = re.compile(r'"sku":(\d+),', re.S)  # raw string for \d
    ids = re.findall(pattern, r.text)
    # BUGFIX: random.choice raises IndexError on an empty list (feed change
    # or blocked request); bail out cleanly instead of crashing.
    if not ids:
        utils.log('randitem: no sku ids found in feed response')
        return
    id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(id)
    utils.push_redis(guid, 0, '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table = config.tb_item_table, product_id = id)
    result = sql.query_one(command)

    # 如果数据库中没有,则重新抓取 (not in db yet -> launch a fresh crawl)
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url = str(url), name = 'tb', dir = settings.BASE_DIR,
                   guid = guid, product_id = id)
        subprocess.Popen(cmd, shell=True)
    else:
        # 如果数据库中存在则,直接读取数据库中数据 (already analysed -> replay rows)
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
def __init__(self):
    """Prepare url templates, extraction regexes and result accumulators."""
    super(Crawler, self).__init__()

    # page templates: album list per user, photo list per album
    self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
    self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'

    # extraction patterns for images, photo ids, model links and album ids
    self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
    self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
    self.model_pattern = re.compile(
        '''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
    self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)

    # collected results
    self.links = []
    self.ids = []
    self.names = []

    self.sql = SqlHelper()
def GET(self):
    """Delete every row matching ip from table <name>; True when it is gone."""
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')

        # NOTE(review): request values are interpolated into SQL -- this is
        # injectable; parameterize if SqlHelper supports it.
        sql.execute("DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip))

        # confirm the row no longer exists
        remaining = sql.query_one(
            "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip))
        return remaining is None
    except:
        pass
    return False
def __init__(self, name=None, **kwargs):
    """Per-product spider: derives table/page names from product_id and
    opens the shared mysql and redis connections."""
    super(JDSpider, self).__init__(name, **kwargs)

    self.product_id = kwargs.get('product_id', -1)
    self.log('product_id:%s' % self.product_id)

    # per-product storage names
    self.item_table = 'item_%s' % self.product_id
    self.product_page = '%s_page' % self.product_id

    # page dumps are only written when explicitly enabled
    self.log_dir = 'log/%s' % self.product_id
    self.is_record_page = False
    if self.is_record_page:
        utils.make_dir(self.log_dir)

    self.sql = SqlHelper()
    # NOTE(review): config attribute is 'redis_part' -- possibly a typo for
    # 'redis_port', but it must match the config module as-is.
    self.red = redis.StrictRedis(host=config.redis_host,
                                 port=config.redis_part,
                                 db=config.redis_db,
                                 password=config.redis_pass)
def handle(self, *args, **options):
    """Run the comment spider for one user, then the realtime analysis.

    options['spargs'] carries key=value pairs; 'guid' and 'user_id' are
    required -- when either is missing a failure message is pushed to redis
    and the command exits.
    """
    reload(sys)
    sys.setdefaultencoding('utf-8')
    os.chdir(sys.path[0])

    spargs = utils.arglist_to_dict(options['spargs'])

    # one log file per user under a lazily created log/ directory
    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log/%s.log' % spargs.get('user_id'),
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.ERROR)

    guid = spargs.get('guid', '0')
    user_id = spargs.get('user_id', '0')
    # BUGFIX: the original logged the literal string 'user_id' instead of
    # the actual value.
    logging.warn('user_id:%s' % user_id)

    if guid == '0' or user_id == '0':
        utils.log('分析数据传入参数不对,接收到的参数为: spargs:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id,
                         info='分析数据传入参数不对,接收到的参数为:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id, info='finish')
        return

    utils.log('开始分析:%s' % spargs)
    sql = SqlHelper()
    red = redis.StrictRedis(host=config.redis_host, port=config.redis_part,
                            db=config.redis_db, password=config.redis_pass)
    spargs['sql'] = sql
    spargs['red'] = red

    # 运行爬虫 (run the spider)
    logging.warn(spargs)
    runspider(spargs)

    # 开启分析 (start the analysis)
    logging.warn(spargs)
    analysis = RealTimeAnalysis(**spargs)
    analysis.run()
def GET(self):
    """Return ip/port/speed of every proxy in table <name> as a JSON string."""
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        # NOTE(review): table name comes from the request -- injectable.
        command = "SELECT * FROM {0}".format(name)
        result = sql.query(command)
        data = [{'ip': item[1], 'port': item[2], 'speed': item[6]}
                for item in result]
        return json.dumps(data, indent=4)
    except Exception:
        pass
    # BUGFIX: was `return []` -- a bare list is not the JSON string the
    # success path produces, so clients received an empty non-JSON body.
    return json.dumps([])
def GET(self):
    """Query proxies from table <name> with optional anonymity/https filters,
    ordered by <order> <sort> and limited to <count>; returns a JSON string.
    """
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        order = inputs.get('order', 'speed')
        sort = inputs.get('sort', 'asc')
        count = inputs.get('count', 100)

        # NOTE(review): every value below is interpolated straight from the
        # request into SQL -- injectable; parameterize if SqlHelper allows.
        # The four duplicated branch-built queries are consolidated here.
        filters = []
        if anonymity is not None:
            filters.append("anonymity=\'{0}\'".format(anonymity))
        if https is not None:
            filters.append("https=\'{0}\'".format(https))
        command = "SELECT * FROM {0}".format(name)
        if filters:
            command += " WHERE " + " AND ".join(filters)
        command += " ORDER BY {0} {1} LIMIT {2}".format(order, sort, count)

        result = sql.query(command)
        data = [{
            'id': item[0],
            'ip': item[1],
            'port': item[2],
            'anonymity': item[4],
            'https': item[5],
            'speed': item[6],
            'save_time': str(item[8]),
        } for item in result]
        return json.dumps(data, indent=4)
    except Exception as e:
        utils.log('select exception msg:%s' % e)
    # BUGFIX: the original fell off the end (implicit None) on failure;
    # return an empty JSON array like the other handlers.
    return json.dumps([])
def GET(self):
    """Return ip/port/speed for proxies in table <name>, optionally filtered
    by anonymity and/or https, ordered by <sort>, limited to <count>;
    result is a JSON string."""
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        sort = inputs.get('sort', 'speed')
        count = inputs.get('count', 100)

        # NOTE(review): request values are interpolated into SQL --
        # injectable. The four duplicated branches are consolidated here.
        filters = []
        if anonymity is not None:
            filters.append("anonymity=\'{0}\'".format(anonymity))
        if https is not None:
            filters.append("https=\'{0}\'".format(https))
        command = "SELECT * FROM {0}".format(name)
        if filters:
            command += " WHERE " + " AND ".join(filters)
        command += " ORDER BY {0} LIMIT {1}".format(sort, count)

        result = sql.query(command)
        data = [{'ip': item[1], 'port': item[2], 'speed': item[6]}
                for item in result]
        return json.dumps(data, indent=4)
    except Exception:
        pass
    # BUGFIX: was `return []` -- not the JSON string the success path returns
    return json.dumps([])
def __init__(self, *a, **kwargs):
    """Asset-store spider: download directories, db helper and API headers."""
    super(AssetStoreSpider, self).__init__(*a, **kwargs)

    # directories that hold downloaded plugin data
    self.dir_plugins = 'Plugins/'
    self.dir_all = self.dir_plugins + 'all'
    utils.make_dir(self.dir_plugins)
    utils.make_dir(self.dir_all)

    # accumulated list of every plugin seen
    self.plugin_list = []

    self.sql = SqlHelper()
    self.table_name = config.assetstore_table_name
    self.priority_adjust = 2

    # unity version (left empty; echoed back in the X-Kharma-Version header)
    self.unity_version = ''

    # request headers for the asset-store API
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'www.assetstore.unity3d.com',
        'Referer': 'https://www.assetstore.unity3d.com/en/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
        'X-Kharma-Version': self.unity_version,
        'X-Requested-With': 'UnityAssetStore',
        'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
    }

    self.init()
if __name__ == '__main__':
    # make sure working directories exist before anything logs or caches
    for d in ('log', 'temp'):
        if not os.path.exists(d):
            os.makedirs(d)

    reload(sys)
    sys.setdefaultencoding('utf-8')

    logging.basicConfig(filename='log/job.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    sql = SqlHelper()
    red = redis.StrictRedis(host='localhost', port=6379, db=10)
    init()

    # one thread for the wx client plus one per crawl/query job
    wx = MyWXBot()
    workers = [
        threading.Thread(target=wx.run_wx),
        threading.Thread(target=wx.user_query_job),
        threading.Thread(target=wx.crawl_boss_job),
        threading.Thread(target=wx.crawl_lagou_job),
        threading.Thread(target=wx.crawl_liepin_job),
    ]
    for worker in workers:
        worker.start()
def __init__(self):
    """Hold a db helper plus the configured weather/user table names."""
    self.weather_table_name = config.weather_table
    self.user_table_name = config.user_table
    self.sql = SqlHelper()
def runspider(request):
    """Kick off (or replay) a Taobao user-rate crawl for the url in the POST.

    NOTE(review): `data` is built but never returned in the visible body --
    confirm the caller wraps this, or a response statement is missing.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }
    try:
        # 正式环境用 post 请求 (production uses POST)
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # the user id is the token right after 'user-rate-' in the url
        urls = re.split(re.compile('user-rate-'), url)
        user_id = re.search(re.compile(r'\w+', re.S), urls[1]).group()

        sql = SqlHelper()
        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000, 9999999999999)) + '_' + str(
                random.randint(100, 999))
            # BUGFIX: a trailing comma made this value a 1-tuple, not a string
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                # BUGFIX: `dir` was formatted but unused -- run from BASE_DIR
                # like the re-analysis command below.
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR,
                           guid = data.get('guid'), user_id = user_id)
                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    # replay previously stored analysis rows
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # BUGFIX: column was misspelled 'produce_id'; the SELECT on
                    # this table above filters by 'user_id'.
                    command = "DELETE FROM {0} WHERE user_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)

                    # 重新分析数据 (re-run the analysis from scratch)
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR,
                               guid = data.get('guid'), user_id = user_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
def runspider(request):
    """Kick off (or replay) a JD item crawl for the url in the POST.

    NOTE(review): `data` is built but never returned in the visible body --
    confirm the caller wraps this, or a response statement is missing.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }
    try:
        # 正式环境用 post 请求 (production uses POST)
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # the product id is the first run of digits in the url
        product_id = re.search(re.compile(r'\d+', re.S), url).group()

        sql = SqlHelper()
        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            # BUGFIX: a trailing comma made this value a 1-tuple, not a string
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR,
                           guid = data.get('guid'), product_id = product_id)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    # replay previously stored analysis rows
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # BUGFIX: column was misspelled 'produce_id'; the SELECT on
                    # this table above filters by 'product_id'.
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)

                    # 重新分析数据 (re-run the analysis from scratch)
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR,
                               guid = data.get('guid'), product_id = product_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
def main(n):
    """Build per-company risk features from events at least *n* days old,
    score them with the saved xgboost model, and write the scores into
    resultTable.

    n: age threshold in days used by the middleTable queries.
    """
    _tc = TimeCnt()
    _tc.cnt_time()
    _tc.cnt_time()

    mysql_bond_risk_ = SqlHelper(Config_bond_risk)

    # labels / company names for events at least n days old
    label_120_ = mysql_bond_risk_.execute(
        "select label from middleTable where (to_days(now()) - to_days(date)>=%d);" % n)
    compname_120_ = mysql_bond_risk_.execute(
        "select compname from middleTable where (to_days(now()) - to_days(date)>=%d);" % n)

    # dedupe (the original used side-effecting list comprehensions)
    label_lst_ = list(set(label_120_))
    _tc.cnt_time()
    compname_lst_ = list(set(compname_120_))

    _dic = get_label_120(mysql_bond_risk_, compname_lst_, label_lst_, n)
    _tc.cnt_time()

    # NOTE(review): pd.Panel was removed in pandas 1.0 -- this code requires
    # an old pandas version.
    _panel = pd.Panel(_dic)
    _panel = _panel.fillna(0.0)
    _tc.cnt_time()

    df_4_model = pd.DataFrame(index=compname_lst_,
                              columns=model_labels + model_labels_2)
    df_4_model = df_4_model.fillna(0.0)
    df_4_model = df_4_model.astype(np.float64)

    for i in list(df_4_model.index):
        for c in model_labels:
            df_4_model.loc[i, c] = cell_fill(_panel, i, c)

        df_4_model.loc[i, "企业名称"] = i
        df_4_model.loc[i, "发布日期"] = datetime.datetime.now()
        df_4_model.loc[i, "credit_recent"] = 0
        df_4_model.loc[i, "credit_ago"] = 0
        df_4_model.loc[i, "credit_trend"] = 0

        # totals per time window
        df_4_model.loc[i, "60"] = _panel[i].loc[60, :].sum()
        df_4_model.loc[i, "120"] = _panel[i].loc[120, :].sum()
        df_4_model.loc[i, "180"] = _panel[i].loc[180, :].sum()

        # grouped keyword counts for each risk family / window (replaces 18
        # copy-pasted assignments; column names are identical)
        for family in ("债券风险", "个人风险", "财务风险",
                       "经营风险", "行业风险", "企业风险"):
            for window in ("60", "120", "180"):
                col = family + window
                df_4_model.loc[i, col] = group_cnt_key_word(col, i, _panel)

        # window-to-window deltas
        df_4_model.loc[i, "sub120_60"] = df_4_model.loc[i, "120"] - df_4_model.loc[i, "60"]
        df_4_model.loc[i, "sub180_120"] = df_4_model.loc[i, "180"] - df_4_model.loc[i, "120"]

    # align freshly built features with the reference feature file
    _x = df_4_model.drop(["企业名称", "发布日期", "Label"], 1)
    _z = pd.read_csv("/home/siyuan/bond_risk/_z.csv").drop(
        ["Unnamed: 0", "发布日期", "Label"], 1)
    _z = _z.drop("企业名称", axis=1)
    _x.columns = list(_z.columns)

    # keep only companies with any events in the 120-day window
    _x = _x[(_x["120"] > 0)]
    train_separator = len(_x.index)

    _pred_data = pd.concat([_x, _z], axis=0)
    _pred_data = set_dummy(_pred_data, False)

    bst = xgb.Booster()
    bst.load_model("/home/siyuan/data/xgb.model")
    result_ = predict(bst, _pred_data, _pred_data.iloc[1])

    # only the first train_separator rows belong to the freshly built data
    dict_res = dict(zip(list(_pred_data.index)[:train_separator],
                        result_[:train_separator]))

    for comp in dict_res.keys():
        sql_ = "INSERT INTO resultTable VALUES('', '%s', CURTIME(), '%s');" % (
            comp, str(format(dict_res[comp], '.9e')))
        mysql_bond_risk_.execute(sql_)

    # BUGFIX: a leftover pdb.set_trace() here froze the run before commit
    mysql_bond_risk_.connect.commit()