def arpriori_trade():
    """Worker loop: pull full table names from `tasks_queue` and, for each,
    mine per-shop 2-item association rules (Apriori) and persist them into
    ibbd2.ststc_shop_apriori.

    Runs forever; intended as the target of a daemon worker thread.
    """
    while True:
        full_table_name = tasks_queue.get()
        log.info(full_table_name)
        # One row per (shop_id, nick): comma-joined distinct item ids.
        # LOCATE(',', list) <> 0 keeps only groups with at least two items.
        query = \
            """SELECT T.*
FROM(
SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list
FROM %s
GROUP BY shop_id, nick)T
WHERE LOCATE(',', T.list)<>0""" \
            % full_table_name
        db_conn = ibbdlib.get_db_conn(**db_server)
        metadata = dict()
        try:
            log.info('%s query data...', full_table_name)
            # Group transactions (item-id lists) by shop.
            for row in db_conn.iter(query):
                metadata.setdefault(row.shop_id, []).append(row.list.split(','))
            log.info('%s run apriori...', full_table_name)
            for (shop_id, transactions) in metadata.items():
                result = Apriori.apriori(transactions, 0.1)  # min support 0.1
                query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' \
                    % ','.join(['%s'] * 4)
                rows = []
                for (pair, data) in result.items():
                    items = pair.split(',')  # hoisted: was split 3x per pair
                    # Only 2-item rules are persisted.
                    if len(items) == 2:
                        rows.append([shop_id, items[0], items[1],
                                     data['sup'], data['num']])
                db_conn.executemany(query, rows)
            log.info('%s finished', full_table_name)
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # can still stop the worker.
            log.error(traceback.format_exc())
        finally:
            db_conn.close()
            tasks_queue.task_done()
def main(): sids = list(sorted(_get_shop_ids())) step = len(sids) / split_tb_num interval = [] for i in range(split_tb_num): interval.append(sids[(i + 1) * step]) db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') sql_dml_list = [] for i in range(len(interval)): ddl_table = '%s.%s_%d' % (db, his_table, i + 1) ddl = "DROP TABLE IF EXISTS %s" % ddl_table db_conn.execute(ddl) ddl = 'CREATE TABLE %s LIKE topdata.top_item_trade' % ddl_table db_conn.execute(ddl) sql = \ """INSERT IGNORE INTO %s SELECT *, MD5(CONCAT_WS(',', item_id, nick, rank, trade_time)) FROM %s.%s WHERE shop_id %s""" if i == 0: sql = sql % (ddl_table, his_db, his_table, "< '%s'" % interval[i]) elif i == len(interval) - 1: sql = sql % (ddl_table, his_db, his_table, ">= '%s'" % interval[i]) else: sql = sql % (ddl_table, his_db, his_table, ">='%s' AND shop_id <'%s'" % (interval[i], interval[i + 1])) sql_dml_list.append(sql) print '%s;'%sql db_conn.close()
def arpriori_trade(full_table_name): query = \ """SELECT T.* FROM( SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list FROM topdata.top_item_trade_his_20121219_1 GROUP BY shop_id, nick)T WHERE LOCATE(',', T.list)<>0 """ db_conn = ibbdlib.get_db_conn(**db_server) metadata = dict() try: for row in db_conn.iter(cmd_str): if not metadata.get(row.shop_id): metadata[row.shop_id] = [row.list.split(',')] else: metadata[row.shop_id].append(row.list.split(',')) for (shop_id, t) in metadata.items(): print shop_id t = Apriori.apriori(t, 0.1) print json.dumps(t, indent=4) query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' % ','.join(['%s'] * 4) db_conn.executemany(query, [[shop_id, pair.split(',')[0], pair.split(',')[1], data['sup'], data['num']] for (pair, data) in t.items() if len(pair.split(',')) == 2]) except Exception, e: print e
def split_trade_data():
    """Compute shop_id boundaries for splitting the trade history table.

    Returns a list of shop ids: the smallest id, then split_tb_num - 1
    evenly spaced boundary ids, then the largest id.

    NOTE(review): everything after the `return interval` below is
    unreachable dead code — apparently an older migration routine that was
    short-circuited by inserting the return. It never runs; left in place
    for reference.
    """
    sids = list(sorted(_get_shop_ids()))
    step = len(sids) / (split_tb_num - 1)
    interval = [sids[0]]
    for i in range(split_tb_num - 1):
        interval.append(sids[(i + 1) * step - 1])
    interval.append(sids[-1])
    return interval
    # --- unreachable from here on ---
    db_conn = ibbdlib.get_db_conn(host='223.4.246.146', user='******', passwd='huangj')
    for i in range(len(interval) - 1):
        ddl_table = '%s.%s_%d' % (db, his_table, i + 1)
        ddl = 'DROP TABLE IF EXISTS %s' % ddl_table
        db_conn.execute(ddl)
        ddl = 'CREATE TABLE %s LIKE topdata.top_item_trade' % ddl_table
        print 'create table', ddl_table
        db_conn.execute(ddl)
        sql = \
            """INSERT IGNORE INTO %s
SELECT *, MD5(CONCAT_WS(',', item_id, nick, rank, trade_time))
FROM %s.%s
WHERE shop_id %s"""
        sql = sql % (ddl_table, his_db, his_table,
                     ">='%s' AND shop_id <'%s'" % (interval[i], interval[i + 1]))
        # Last bucket: make the upper bound inclusive (turns both `<` into
        # `<=`; only the WHERE clause contains `<`).
        if i == len(interval) - 2:
            sql = sql.replace('<', '<=')
        # Trailing comma keeps both prints on one output line (Python 2).
        print 'migrate metadata to', ddl_table,
        print db_conn.execute_rowcount(sql)
    db_conn.close()
def split_topspider_table(full_table): db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') cmd_str = "SELECT DISTINCT(shop_id) AS shop_id FROM %s" % (full_table) sids = [] for row in db_conn.iter(cmd_str): sids.append(row.shop_id) length = len(sids) / 100 i = 1 sid_dict = dict() for sid in sids: if sid_dict.get(i): if len(sid_dict[i]) < length: sid_dict[i].append(sid) else: i += 1 sid_dict[i] = [sid] else: sid_dict[i] = [sid] for k in sid_dict.keys(): print k, len(sid_dict[k]) db_conn.executemany( "INSERT IGNORE INTO topdata.top_trade_dist(table_name, shop_id) values(%s, %s)", [['top_item_trade_%d' % k, sid] for sid in sid_dict[k]]) db_conn.close()
def main(): sids = list(sorted(_get_shop_ids())) step = len(sids) / split_tb_num interval = [] for i in range(split_tb_num): interval.append(sids[(i + 1) * step]) db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') sql_dml_list = [] for i in range(len(interval)): ddl_table = '%s.%s_%d' % (db, his_table, i + 1) ddl = "DROP TABLE IF EXISTS %s" % ddl_table db_conn.execute(ddl) ddl = 'CREATE TABLE %s LIKE topdata.top_item_trade' % ddl_table db_conn.execute(ddl) sql = \ """INSERT IGNORE INTO %s SELECT *, MD5(CONCAT_WS(',', item_id, nick, rank, trade_time)) FROM %s.%s WHERE shop_id %s""" if i == 0: sql = sql % (ddl_table, his_db, his_table, "< '%s'" % interval[i]) elif i == len(interval) - 1: sql = sql % (ddl_table, his_db, his_table, ">= '%s'" % interval[i]) else: sql = sql % (ddl_table, his_db, his_table, ">='%s' AND shop_id <'%s'" % (interval[i], interval[i + 1])) sql_dml_list.append(sql) print '%s;' % sql db_conn.close()
def _get_shop_ids():
    """Return every distinct shop_id found in <his_db>.<his_table>."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmd_str = 'SELECT DISTINCT(shop_id) AS shop_id FROM %s.%s' % (
        his_db, his_table)
    rows = list(db_conn.iter(cmd_str))
    db_conn.close()
    shop_ids = []
    for row in rows:
        shop_ids.append(row.shop_id)
    return shop_ids
def _get_shop_ids():
    """Return every distinct shop_id in <his_db>.<his_table> (.172 server)."""
    db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider')
    cmd_str = 'SELECT DISTINCT(shop_id) AS shop_id FROM %s.%s' % (his_db, his_table)
    shop_ids = [row.shop_id for row in db_conn.iter(cmd_str)]
    db_conn.close()
    return shop_ids
def updatePrice():
    """Queue every monitored item id into the Redis set
    Schedule_itempricemonitor for the price-monitor spiders."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmd_str = 'SELECT DISTINCT(item_id) AS item_id FROM ibbd2.user_item_monitor'
    # Dropped the redundant list() around the iterator.
    iids = [row.item_id for row in db_conn.iter(cmd_str)]
    db_conn.close()
    re_conn = ibbdlib.get_redis_conn(**redis_server)
    if iids:
        # Guard: sadd(key) with zero members raises in redis-py; skip when
        # there is nothing to queue.
        re_conn.sadd('Schedule_itempricemonitor', *iids)
    del re_conn
def _get_shop_ids():
    """Return every distinct shop_id in <his_db>.<his_table> (.172 server)."""
    db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider')
    query = 'SELECT DISTINCT(shop_id) AS shop_id FROM %s.%s' % (his_db, his_table)
    result = [row.shop_id for row in db_conn.iter(query)]
    db_conn.close()
    return result
def main(): dbConn = ibbdlib.get_db_conn() cats = [] for t in re.findall(r'\n(.+?)\n\{(.+?)\}' ,s.decode('utf-8'), re.S): print t[0].encode('GBK') print ','.join(t[1].replace(' ', '').split()).encode('GBK') cats.extend([[t[0].encode('utf-8'), t2.encode('utf-8')] for t2 in t[1].replace(' ', '').split()]) dbConn.executemany("INSERT IGNORE INTO ibbd2.lib_industry_2 VALUES(%s,%s)", cats) dbConn.close()
def master():
    """ETL driver: refresh keyword/industry tables on the .172 server from
    .152, then run the keyword schedule against the local spider DB."""
    db_conn = ibbdlib.get_db_conn(**db_server['223.4.155.172'])
    db_conn.execute("TRUNCATE TABLE ibbd2.lib_cat_industry_keyword_2")
    db_conn.close()
    # Copy the source tables from .152 to .172.
    ibbdlib.migratetable.migrate_table(db_server['223.4.155.152'],
                                       db_server['223.4.155.172'], 'ibbd2.user2')
    ibbdlib.migratetable.migrate_table(db_server['223.4.155.152'],
                                       db_server['223.4.155.172'], 'ibbd2.user_keywords')
    ibbdlib.migratetable.migrate_table(db_server['223.4.155.152'],
                                       db_server['223.4.155.172'],
                                       'ibbd2.lib_cat_industry_keyword_2')
    # Empty shop_id strings are normalized to 0 during migration.
    ibbdlib.migratetable.migrate_table_by_query(
        db_server['223.4.155.152'], db_server['223.4.155.172'],
        'ibbd2.user_shop_monitor_items',
        """SELECT user, shop_type, (CASE WHEN shop_id='' THEN 0 ELSE shop_id END) shop_id, user_num_id, item_id, status, population_tsmp FROM ibbd2.user_shop_monitor_items""")
    # keep connection
    dbConnPool = None
    reConn = None
    # Retry until both the local connection pool and redis are reachable.
    while True:
        try:
            dbConnPool = ConnectionPool(max_connections=1000, host='localhost',
                                        user='******', passwd='ibbd_etl_secure',
                                        db='topspider', charset='utf8')
            # reConn = getRedisConn2()
            reConn = ibbdlib.get_redis_conn(**redis_server)
            break
        except Exception as e:
            print e
    runKeywordsSchedule(reConn, dbConnPool)
    _ensure_schedule_complete(reConn)
    dbConnPool.disconnect()
    # Fresh pool for the (currently disabled) schedule migration below.
    dbConnPool = ConnectionPool(max_connections=1000, host='localhost',
                                user='******', passwd='ibbd_etl_secure',
                                db='topspider', charset='utf8')
    # NOTE(review): the loop body is commented out, so this always breaks on
    # the first iteration.
    while True:
        try:
            # migrateScheduleToRedis(reConn, dbConnPool)
            break
        except:
            traceback.print_exc()
    dbConnPool.disconnect()
    del reConn
def rollbackNullSiteIdItemSchedule(): print 'init...' server = get_re_conn() iids = [] db_conn = ibbdlib.get_db_conn() for row in db_conn.iter("SELECT item_id FROM ibbd2.top_cat_items WHERE site_id=''"): iids.append(row.item_id) db_conn.close() print 'Schedule_itemFullInfo', len(iids) server.sadd('Schedule_itemFullInfo', *iids) del server
def main(): dbConn = ibbdlib.get_db_conn() cats = [] for t in re.findall(r'\n(.+?)\n\{(.+?)\}', s.decode('utf-8'), re.S): print t[0].encode('GBK') print ','.join(t[1].replace(' ', '').split()).encode('GBK') cats.extend([[t[0].encode('utf-8'), t2.encode('utf-8')] for t2 in t[1].replace(' ', '').split()]) dbConn.executemany("INSERT IGNORE INTO ibbd2.lib_industry_2 VALUES(%s,%s)", cats) dbConn.close()
def get_tables(db):
    """Return {schema.table: column-name -> printable value} metadata for
    every table in schema *db* (unicode encoded to utf-8, NULL/empty -> '')."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmdStr = \
        """SELECT CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS TABLE_NAME, TABLE_TYPE, TABLE_ROWS, DATA_LENGTH, INDEX_LENGTH, TABLE_COMMENT
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='%s'
ORDER BY TABLE_NAME""" \
        % db
    info_dict = dict()
    for row in db_conn.query(cmdStr):
        cleaned = dict()
        for (key, value) in row.items():
            if not value:
                cleaned[key] = ''
            elif isinstance(value, unicode):
                cleaned[key] = value.encode('utf-8')
            else:
                cleaned[key] = value
        info_dict[row.TABLE_NAME] = cleaned
    db_conn.close()
    return info_dict
def get_tables(db):
    """Collect table-level metadata for schema *db*.

    Returns a dict keyed by 'schema.table'; each value maps the metadata
    column name to a printable value (utf-8 str for unicode, '' for
    NULL/empty)."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmdStr = \
        """SELECT CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS TABLE_NAME, TABLE_TYPE, TABLE_ROWS, DATA_LENGTH, INDEX_LENGTH, TABLE_COMMENT
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='%s'
ORDER BY TABLE_NAME""" \
        % db

    def _printable(value):
        # NULL / empty -> '', unicode -> utf-8 bytes, everything else as-is.
        if not value:
            return ''
        if isinstance(value, unicode):
            return value.encode('utf-8')
        return value

    info_dict = dict()
    for row in db_conn.query(cmdStr):
        info_dict[row.TABLE_NAME] = dict(
            (key, _printable(value)) for (key, value) in row.items())
    db_conn.close()
    return info_dict
def _prepar_trade_date():
    """Prepare today's trade tables: park the itemTrade2 schedule, create the
    per-split dated history tables, truncate the search results, then
    restore the schedule."""
    # rename schedule
    redis_conn = ibbdlib.get_redis_conn(**redis_server)
    if redis_conn.scard('Schedule_itemTrade2') > 0:
        redis_conn.rename('Schedule_itemTrade2', 'Schedule_itemTrade2_bak')
    # insert lock
    db_conn = ibbdlib.get_db_conn(**db_server)
    # NOTE(review): the PROCESSLIST polling that waited for in-flight
    # inserts has been disabled.
    # while True:
    #     try:
    #         is_inserting = False
    #         for row in db_conn.iter("SHOW PROCESSLIST"):
    #             if row.Info:
    #                 if row.Info.encode('utf-8', 'ignore').find('top_item_trade') > -1:
    #                     is_inserting = True
    #         if is_inserting:
    #             print 'processing insert lock, wait for 10 seconds'
    #             time.sleep(10)
    #         else:
    #             print 'no insert lock'
    #             break
    #     except:
    #         traceback.print_exc()
    try:
        # NOTE(review): the rename/recreate of the live table is disabled;
        # only the dated split tables are created now.
        # db_conn.execute(
        #     "ALTER TABLE topspider.top_item_trade RENAME topspider.top_item_trade_his_%s" % fetch_date_short)
        # db_conn.execute(
        #     "CREATE TABLE topspider.top_item_trade LIKE topspider.top_item_trade_his_%s" % fetch_date_short)
        for i in range(split_tb_num):
            # e.g. top_item_trade_his_20121219_1 ... _<split_tb_num>
            table_name = 'top_item_trade_his_%s_%d' % (
                datetime.now().strftime('%Y%m%d'), i + 1)
            db_conn.execute(
                "CREATE TABLE topspider.%s LIKE topspider.top_item_trade" % table_name)
        db_conn.execute("TRUNCATE TABLE topspider.top_itemsearchresult")
    except:
        traceback.print_exc()
    db_conn.close()
    # rename schedule name
    if redis_conn.scard('Schedule_itemTrade2_bak') > 0:
        redis_conn.rename('Schedule_itemTrade2_bak', 'Schedule_itemTrade2')
    pass
    del redis_conn
def get_columns(db):
    """Return {schema.table: [column-metadata dicts]} for every column in
    schema *db* (unicode encoded to utf-8, NULL/empty -> '')."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmdStr = \
        """SELECT CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS TABLE_NAME,ORDINAL_POSITION, COLUMN_NAME, COLUMN_TYPE, COLUMN_KEY, COLUMN_COMMENT
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA='%s'""" \
        % db
    info_dict = dict()
    for row in db_conn.query(cmdStr):
        cleaned = dict()
        for (key, value) in row.items():
            if not value:
                cleaned[key] = ''
            elif isinstance(value, unicode):
                cleaned[key] = value.encode('utf-8')
            else:
                cleaned[key] = value
        # Group column records under their owning table.
        info_dict.setdefault(row.TABLE_NAME, []).append(cleaned)
    db_conn.close()
    return info_dict
def get_columns(db):
    """Collect column-level metadata for schema *db*, grouped by table.

    Each value in the returned dict is a list of per-column dicts with
    printable values (utf-8 str for unicode, '' for NULL/empty)."""
    db_conn = ibbdlib.get_db_conn(**db_server)
    cmdStr = \
        """SELECT CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS TABLE_NAME,ORDINAL_POSITION, COLUMN_NAME, COLUMN_TYPE, COLUMN_KEY, COLUMN_COMMENT
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA='%s'""" \
        % db

    def _printable(value):
        # NULL / empty -> '', unicode -> utf-8 bytes, everything else as-is.
        if not value:
            return ''
        if isinstance(value, unicode):
            return value.encode('utf-8')
        return value

    info_dict = dict()
    for row in db_conn.query(cmdStr):
        record = dict((key, _printable(value)) for (key, value) in row.items())
        info_dict.setdefault(row.TABLE_NAME, []).append(record)
    db_conn.close()
    return info_dict
def _prepar_trade_date():
    """Prepare today's trade tables: park the itemTrade2 schedule, create the
    per-split dated history tables, truncate the search results, then
    restore the schedule."""
    # rename schedule
    redis_conn = ibbdlib.get_redis_conn(**redis_server)
    if redis_conn.scard("Schedule_itemTrade2") > 0:
        redis_conn.rename("Schedule_itemTrade2", "Schedule_itemTrade2_bak")
    # insert lock
    db_conn = ibbdlib.get_db_conn(**db_server)
    # NOTE(review): the PROCESSLIST polling that waited for in-flight
    # inserts has been disabled.
    # while True:
    #     try:
    #         is_inserting = False
    #         for row in db_conn.iter("SHOW PROCESSLIST"):
    #             if row.Info:
    #                 if row.Info.encode('utf-8', 'ignore').find('top_item_trade') > -1:
    #                     is_inserting = True
    #         if is_inserting:
    #             print 'processing insert lock, wait for 10 seconds'
    #             time.sleep(10)
    #         else:
    #             print 'no insert lock'
    #             break
    #     except:
    #         traceback.print_exc()
    try:
        # NOTE(review): the rename/recreate of the live table is disabled;
        # only the dated split tables are created now.
        # db_conn.execute(
        #     "ALTER TABLE topspider.top_item_trade RENAME topspider.top_item_trade_his_%s" % fetch_date_short)
        # db_conn.execute(
        #     "CREATE TABLE topspider.top_item_trade LIKE topspider.top_item_trade_his_%s" % fetch_date_short)
        for i in range(split_tb_num):
            # e.g. top_item_trade_his_20121219_1 ... _<split_tb_num>
            table_name = "top_item_trade_his_%s_%d" % (datetime.now().strftime("%Y%m%d"), i + 1)
            db_conn.execute("CREATE TABLE topspider.%s LIKE topspider.top_item_trade" % table_name)
        db_conn.execute("TRUNCATE TABLE topspider.top_itemsearchresult")
    except:
        traceback.print_exc()
    db_conn.close()
    # rename schedule name
    if redis_conn.scard("Schedule_itemTrade2_bak") > 0:
        redis_conn.rename("Schedule_itemTrade2_bak", "Schedule_itemTrade2")
    pass
    del redis_conn
def split_topspider_table(full_table): db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') cmd_str = "SELECT DISTINCT(shop_id) AS shop_id FROM %s" % (full_table) sids = [] for row in db_conn.iter(cmd_str): sids.append(row.shop_id) length = len(sids) / 100 i = 1 sid_dict = dict() for sid in sids: if sid_dict.get(i): if len(sid_dict[i]) < length: sid_dict[i].append(sid) else: i +=1 sid_dict[i] = [sid] else: sid_dict[i] = [sid] for k in sid_dict.keys(): print k, len(sid_dict[k]) db_conn.executemany("INSERT IGNORE INTO topdata.top_trade_dist(table_name, shop_id) values(%s, %s)", [['top_item_trade_%d'%k, sid] for sid in sid_dict[k]]) db_conn.close()
def _prepar_trade_date():
    """Rotate the daily trade table: park the itemTrade2 schedule, wait for
    in-flight inserts to drain, rename the live table to a dated history
    table and recreate it, then restore the schedule."""
    # rename schedule
    redis_conn = ibbdlib.get_redis_conn()
    if redis_conn.scard('Schedule_itemTrade2') > 0:
        redis_conn.rename('Schedule_itemTrade2', 'Schedule_itemTrade2_bak')
    # insert lock
    db_conn = ibbdlib.get_db_conn(**db_server)
    # Poll PROCESSLIST until no running statement touches top_item_trade,
    # so the ALTER below cannot race an insert.
    while 1:
        try:
            is_inserting = False
            for row in db_conn.iter("SHOW PROCESSLIST"):
                if row.Info:
                    if row.Info.encode('utf-8', 'ignore').find('top_item_trade') > -1:
                        is_inserting = True
            if is_inserting:
                print 'processing insert lock, wait for 10 seconds'
                time.sleep(10)
            else:
                print 'no insert lock'
                break
        except:
            traceback.print_exc()
    try:
        # Archive the live table under a dated name and recreate it empty.
        db_conn.execute(
            "ALTER TABLE topspider.top_item_trade RENAME topspider.top_item_trade_his_%s" % fetch_date_short)
        db_conn.execute(
            "CREATE TABLE topspider.top_item_trade LIKE topspider.top_item_trade_his_%s" % fetch_date_short)
    except:
        traceback.print_exc()
    db_conn.close()
    # rename schedule name
    if redis_conn.scard('Schedule_itemTrade2_bak') > 0:
        redis_conn.rename('Schedule_itemTrade2_bak', 'Schedule_itemTrade2')
    del redis_conn
import ibbdlib import ibbdlib.tornadolib from dateutil.parser import parse as dateparser keyword = '香港虚拟主机' keyword_en = 'hongkongvirtualhost' from_date_str = '2012-12-09' to_date_str = '2012-12-20' from_date = dateparser(from_date_str) to_date = dateparser(to_date_str) date_range_str = [(from_date + timedelta(i)).strftime('%Y%m%d') for i in range((to_date - from_date).days + 1)] db_server = {'host': '223.4.155.152', 'user': '******', 'passwd': 'spider'} db_conn = ibbdlib.get_db_conn(**db_server) try: trade_source_table_list = [] for date_str in date_range_str: select_query = \ "SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA='topdata' AND TABLE_NAME LIKE %s ORDER BY TABLE_NAME" for table_name in db_conn.iter(select_query, 'top_item_trade_his_%s%%' % date_str): print table_name.TABLE_NAME trade_source_table_list.append(table_name.TABLE_NAME) if db_server['host'].find('152') != -1: trade_source_table_list.extend(['top_item_trade_his_%d' % (i+1) for i in range(26)]) except Exception, e: traceback.print_exc() db_conn.close()
#!/usr/bin/python2.7 # -*- coding: utf-8 -*- import MySQLdb import ibbdlib # db_conn = MySQLdb.connect(host='223.4.155.172', user='******', passwd='spider') db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') # cursor = db_conn.cursor() cursor = db_conn._db.cursor() try: print cursor.callproc('ibbd2.sp_insert_date', tuple(['2012-12-12'])) except Exception, e: print e finally: cursor.close() db_conn.close()
'host': '223.4.155.152', 'user': '******', 'passwd': 'spider', 'port': 3306, }, '223.4.155.172': { 'host': '223.4.155.172', 'user': '******', 'passwd': 'spider', 'port': 3306, } } if __name__ == '__main__': print 'start...' db_conn = ibbdlib.get_db_conn(**db_server['223.4.155.172']) db_dml_connpool = ibbdlib.connectionpool.ConnectionPool( connection_class=MySQLdb.connections.Connection, max_connections=100, **db_server['223.4.155.172']) iids = [ row.item_id for row in db_conn.iter( "SELECT DISTINCT item_id FROM topspider.top_itemsearchresult WHERE keyword='{}'" .format(keyword)) ] iids_errors = [] for i, iid in enumerate(iids): print '%d of %d %s' % (i + 1, len(iids), iid) try: # (data, t1, t2) = getTrades3('http://item.taobao.com/item.htm?id=%s' % iid,
def save_log_file(log):
    """Bulk-load a tab-separated log file into ibbdlog.ibbd_log.

    Each qualifying line (at least 7 tab-separated fields) is stored with an
    md5 of field0+field6 appended as a dedup key for INSERT IGNORE.

    :param log: path of the log file to load.
    """
    db_conn = ibbdlib.get_db_conn()
    query = "INSERT IGNORE INTO ibbdlog.ibbd_log values(%s)" % (','.join(['%s'] * 8))
    rows = []
    # BUG FIX: the original opened the file twice (the `with` handle was
    # never used; the list comprehension re-opened `log`); it also split
    # every line on '\t' three times.
    with open(log) as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) < 7:
                continue
            # md5 over the raw first and seventh fields acts as the row id.
            digest = hashlib.md5(fields[0] + fields[6]).hexdigest()
            rows.append(line.replace('\n', '').split('\t') + [digest])
    db_conn.executemany(query, rows)
    db_conn.close()
def _get_db_server():
    # NOTE(review): despite the name, this returns a DB *connection* to the
    # 223.4.155.172 server, not a server descriptor.
    return ibbdlib.get_db_conn(**db_server['223.4.155.172'])
def master():
    """ETL driver: refresh keyword/industry tables on the .172 server from
    .152, then run the keyword schedule against the local spider DB."""
    db_conn = ibbdlib.get_db_conn(**db_server['223.4.155.172'])
    db_conn.execute("TRUNCATE TABLE ibbd2.lib_cat_industry_keyword_2")
    db_conn.close()
    # Copy the source tables from .152 to .172.
    ibbdlib.migratetable.migrate_table(
        db_server['223.4.155.152'], db_server['223.4.155.172'], 'ibbd2.user2')
    ibbdlib.migratetable.migrate_table(
        db_server['223.4.155.152'], db_server['223.4.155.172'], 'ibbd2.user_keywords')
    ibbdlib.migratetable.migrate_table(
        db_server['223.4.155.152'], db_server['223.4.155.172'], 'ibbd2.lib_cat_industry_keyword_2')
    # Empty shop_id strings are normalized to 0 during migration.
    ibbdlib.migratetable.migrate_table_by_query(
        db_server['223.4.155.152'], db_server['223.4.155.172'],
        'ibbd2.user_shop_monitor_items',
        """SELECT user, shop_type, (CASE WHEN shop_id='' THEN 0 ELSE shop_id END) shop_id, user_num_id, item_id, status, population_tsmp FROM ibbd2.user_shop_monitor_items"""
    )
    # keep connection
    dbConnPool = None
    reConn = None
    # Retry until both the local connection pool and redis are reachable.
    while True:
        try:
            dbConnPool = ConnectionPool(max_connections=1000, host='localhost',
                                        user='******', passwd='ibbd_etl_secure',
                                        db='topspider', charset='utf8')
            # reConn = getRedisConn2()
            reConn = ibbdlib.get_redis_conn(**redis_server)
            break
        except Exception as e:
            print e
    runKeywordsSchedule(reConn, dbConnPool)
    _ensure_schedule_complete(reConn)
    dbConnPool.disconnect()
    # Fresh pool for the (currently disabled) schedule migration below.
    dbConnPool = ConnectionPool(max_connections=1000, host='localhost',
                                user='******', passwd='ibbd_etl_secure',
                                db='topspider', charset='utf8')
    # NOTE(review): the loop body is commented out, so this always breaks on
    # the first iteration.
    while True:
        try:
            # migrateScheduleToRedis(reConn, dbConnPool)
            break
        except:
            traceback.print_exc()
    dbConnPool.disconnect()
    del reConn
db_server = {'223.4.155.152': { 'host': '223.4.155.152', 'user': '******', 'passwd': 'spider', 'port': 3306, }, '223.4.155.172': { 'host': '223.4.155.172', 'user': '******', 'passwd': 'spider', 'port': 3306, }} if __name__ == '__main__': print 'start...' db_conn = ibbdlib.get_db_conn(**db_server['223.4.155.172']) db_dml_connpool = ibbdlib.connectionpool.ConnectionPool(connection_class=MySQLdb.connections.Connection, max_connections=100, **db_server['223.4.155.172']) iids = [row.item_id for row in db_conn.iter("SELECT DISTINCT item_id FROM topspider.top_itemsearchresult WHERE keyword='{}'".format(keyword))] iids_errors = [] for i, iid in enumerate(iids): print '%d of %d %s' % (i+1, len(iids), iid) try: # (data, t1, t2) = getTrades3('http://item.taobao.com/item.htm?id=%s' % iid, # fromTsmp='1900/1/1', # toTsmp='2013/1/18', # reqinterval=0.2) # saveTopTrade(data, connpool=db_dml_connpool, table='topspider.top_item_trade_hongkongvps') # print len(data) topitem = TopItem('http://item.taobao.com/item.htm?id=%s' % iid).getFullInfo()
#!/usr/bin/python2.7 # -*- coding: utf-8 -*- import ibbdlib import ibbdlib.redislib from datetime import datetime, timedelta cmd_str = "SELECT item_id, MAX(date_of) AS date_of FROM ibbd2.top_item2 GROUP BY item_id" commands = [] db_conn = ibbdlib.get_db_conn() for row in db_conn.iter(cmd_str): commands.append(['HSET', 'TopItemTradeUpdateTime', str(row.item_id), (row.date_of+timedelta(1)).strftime('%Y/%m/%d')]) print commands[0] db_conn.close() re_conn = ibbdlib.get_redis_conn() ibbdlib.redislib.redis_pipe(re_conn, commands) del re_conn
def _get_db_conn():
    """Open a new DB connection using the module-level `db_server` settings."""
    return ibbdlib.get_db_conn(**db_server)
#!/usr/bin/python2.7 # -*- coding: utf-8 -*- import re import requests import ibbdlib site = 'http://www.taobao.com/' req = requests.get(site) site_encoding = req.encoding site_content = req.text.encode('utf-8') meta_category_items = re.findall(r'<div\s+class\="category-item\s+.+?">.+?</div>', site_content, re.S) category_items = [] for category_item in meta_category_items: cat = re.findall(r'<img.+?alt\=\"(.+?)\"', category_item, re.S)[0] industrys = re.findall(r'<h5>\s*<a.+?>(.+?)<', category_item, re.S) category_items.extend([[cat, t] for t in industrys]) db_conn = ibbdlib.get_db_conn(**{'host': '223.4.155.152', 'user': '******', 'passwd': 'spider'}) try: db_conn.executemany("INSERT IGNORE INTO ibbd2.lib_industry_3 values(%s, %s)", category_items) except Exception, e: print e finally: db_conn.close()
#!/usr/bin/python2.7 # -*- coding: utf-8 -*- import ibbdlib import ibbdlib.redislib from datetime import datetime, timedelta cmd_str = "SELECT item_id, MAX(date_of) AS date_of FROM ibbd2.top_item2 GROUP BY item_id" commands = [] db_conn = ibbdlib.get_db_conn() for row in db_conn.iter(cmd_str): commands.append([ 'HSET', 'TopItemTradeUpdateTime', str(row.item_id), (row.date_of + timedelta(1)).strftime('%Y/%m/%d') ]) print commands[0] db_conn.close() re_conn = ibbdlib.get_redis_conn() ibbdlib.redislib.redis_pipe(re_conn, commands) del re_conn
import re import requests import ibbdlib site = 'http://www.taobao.com/' req = requests.get(site) site_encoding = req.encoding site_content = req.text.encode('utf-8') meta_category_items = re.findall( r'<div\s+class\="category-item\s+.+?">.+?</div>', site_content, re.S) category_items = [] for category_item in meta_category_items: cat = re.findall(r'<img.+?alt\=\"(.+?)\"', category_item, re.S)[0] industrys = re.findall(r'<h5>\s*<a.+?>(.+?)<', category_item, re.S) category_items.extend([[cat, t] for t in industrys]) db_conn = ibbdlib.get_db_conn(**{ 'host': '223.4.155.152', 'user': '******', 'passwd': 'spider' }) try: db_conn.executemany( "INSERT IGNORE INTO ibbd2.lib_industry_3 values(%s, %s)", category_items) except Exception, e: print e finally: db_conn.close()
import ibbdlib import ibbdlib.tornadolib from dateutil.parser import parse as dateparser keyword = '香港虚拟主机' keyword_en = 'hongkongvirtualhost' from_date_str = '2012-12-09' to_date_str = '2012-12-20' from_date = dateparser(from_date_str) to_date = dateparser(to_date_str) date_range_str = [(from_date + timedelta(i)).strftime('%Y%m%d') for i in range((to_date - from_date).days + 1)] db_server = {'host': '223.4.155.152', 'user': '******', 'passwd': 'spider'} db_conn = ibbdlib.get_db_conn(**db_server) try: trade_source_table_list = [] for date_str in date_range_str: select_query = \ "SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA='topdata' AND TABLE_NAME LIKE %s ORDER BY TABLE_NAME" for table_name in db_conn.iter(select_query, 'top_item_trade_his_%s%%' % date_str): print table_name.TABLE_NAME trade_source_table_list.append(table_name.TABLE_NAME) if db_server['host'].find('152') != -1: trade_source_table_list.extend( ['top_item_trade_his_%d' % (i + 1) for i in range(26)]) except Exception, e: traceback.print_exc()
#!/usr/bin/python2.7 # -*- coding: utf-8 -*- import traceback import json from collections import Counter import ibbdlib db_conn = ibbdlib.get_db_conn(host="223.4.155.152", user="******", passwd="spider") # remote_db_conn = ibbdlib.get_db_conn(host='223.4.155.172', user='******', passwd='spider') def magic(): query = """SELECT T1.user, T1.keyword, T2.cat, T2.industry FROM ibbd2.user_keywords T1 LEFT JOIN ibbd2.lib_cat_industry_keyword_2 T2 ON T1.keyword = T2.keyword WHERE T1.status='1' AND T1.industry='' AND T2.keyword IS NOT NULL""" user_industry = dict() for row in db_conn.iter(query): user = row.user.encode("utf-8") industry = "{0}-{1}".format(row.cat.encode("utf-8"), row.industry.encode("utf-8")) if user_industry.get(user): user_industry[user].append(industry) else: user_industry[user] = [industry] for user in user_industry.keys(): c = Counter()