def productInfo():
    """Return a mapping of productHref -> commentCount from the JD base-info table."""
    db = DBService(dbName='jddata', tableName='jdproductbaseinfo2database')
    rows = db.getData(var='productHref,commentCount', limit=200000)
    # Later duplicate hrefs overwrite earlier ones, matching the original loop.
    return {row[0]: row[1] for row in rows}
def gen_url():
    """Build de-duplicated contact-info URLs from distinct aliexpress store hrefs."""

    def _to_contact_url(href):
        # Hrefs that already point at an .html page are skipped.
        if '.html' in href:
            return None
        base, tail = href.rsplit('/', 1)
        return base + '/contactinfo/' + tail + '.html'

    def _normalize(href):
        # Fold known regional subdomains onto the main www host;
        # anything that matches none of them is dropped.
        for sub in ('//www', '//pt', '//ru', '//es'):
            if sub in href:
                return _to_contact_url(href.replace(sub, '//www'))
        return None

    db_g = DBService(dbName=db_name, tableName='aliexpress_temp', **connect_dict)
    rows = db_g.getData(var='store_href', distinct=True)
    normalized = (_normalize(row[0]) for row in rows)
    return list({u for u in normalized if u})
def craweldhref():
    """Return already-crawled hrefs with any single trailing slash stripped.

    Prints the number of hrefs as a progress hint (original behavior).
    Fix: the original lambda indexed ``x[-1]``, which raises IndexError on an
    empty href; ``endswith('/')`` is safe on empty strings.  The list
    comprehension also keeps the return type a list on both Python 2 and 3
    (the original relied on Python 2's list-returning ``map``).
    """
    db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
    rows = db.getData(var='href')
    hrefs = [row[0] for row in rows]
    hrefs = [h[:-1] if h.endswith('/') else h for h in hrefs]
    print(len(hrefs))
    return hrefs
def gen_url():
    """Collect non-empty distinct credit_detail_href values from the alibaba table."""
    DB = DBService(dbName="alibaba", tableName="alibaba_cow_powder_3")
    rows = DB.getData(var="credit_detail_href", distinct=True)
    # A falsy row yields the placeholder " ", which survives the truthiness
    # filter — exactly as the original map/loop pair behaved.
    candidates = (row[0] if row else " " for row in rows)
    return [c for c in candidates if c]
def companyInfo():
    """Return company info as a dict keyed by the row's second column.

    Rows whose third column is the placeholder '-' are discarded; for
    duplicate keys the last row wins, as in the original loop.
    """
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    rows = db.getData(limit=200000)
    return {row[1]: row[1:] for row in rows if row[2] != '-'}
def commentHrefList():
    """Return deduplicated (name, href, judgepage_href) tuples still needing work.

    Rows whose judgepage_href is already absolute (contains 'http') or is a
    bare number are filtered out.  Prints the resulting count.
    """
    db = DBService('elec_platform', 'tmall_baseinfo_everyweek')
    rows = db.getData(var='name,href,judgepage_href')
    unique = {
        tuple(row) for row in rows
        if 'http' not in row[2] and not row[2].isnumeric()
    }
    result = list(unique)
    print(len(result))
    return result
def proxy_collection():
    """Merge proxies scraped from the website with those stored in local tables."""
    website_proxies = pc.get_proxies_from_website()
    local_proxies = []
    # Each local table holds one proxy per row in its proxy_port column.
    for table in ('proxy_other_source', 'proxy_you_dai_li'):
        dbs = DBService(dbName='base', tableName=table, **connect_dict)
        local_proxies.extend(row[0] for row in dbs.getData(var='proxy_port'))
    return list(set(website_proxies + local_proxies))
def begin():
    """Dump deduplicated thirdPartShopInfo rows (minus id and tail columns) to CSV."""
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    rows = db.getData()
    header = db.getTableTitle()[1:-2]
    # Drop the leading id and the two trailing columns, dedupe on the rest.
    unique_rows = {tuple(row[1:-2]) for row in rows}
    payload = [list(row) for row in unique_rows]
    csv = CSV()
    csv.writeCsv(savePath='D:/spider', fileTitle=header, data=payload, fileName='jdData')
def sumCommentCount():
    """Aggregate comment counts per shop and write the totals to a CSV file.

    Fixes: the original bound its accumulator to the name ``dict`` (shadowing
    the builtin) and tested membership with ``item[0] in dict.keys()``, an
    O(n) scan per row on Python 2; ``dict.get`` is O(1) and clearer.
    Commented-out dead code was removed.
    """
    db = DBService(dbName='jddata', tableName='thirdPartShopInfoAddCommnetCount')
    rows = db.getData(var='shopName,commnetCount')
    totals = {}
    for shop_name, count in rows:
        totals[shop_name] = totals.get(shop_name, 0) + int(count)
    data = [[shop, total] for shop, total in totals.items()]
    csv = CSV()
    # Column name 'commnetCount' (sic) kept to match the table schema.
    csv.writeCsv(savePath='D:/spider', fileTitle=['shopName', 'commnetCount'],
                 data=data, fileName='jdDataSum')
def startUrlList(self):
    """Return product hrefs still to crawl (method override).

    Keeps hrefs whose sku looks valid (length >= 10), drops those already
    present in thirdPartShopInfo, and finally keeps only absolute http URLs.
    If nothing has been crawled yet, every candidate is returned unfiltered.
    """
    base_db = DBService(dbName='jddata', tableName='jdproductbaseinfo2database')
    base_rows = base_db.getData(var='productHref,sku', distinct=True)
    candidates = [href for href, sku in base_rows if len(sku) >= 10]

    crawled_rows = DBService(dbName='jddata', tableName='thirdPartShopInfo').getData(var='productHref')
    if not crawled_rows:
        return candidates

    crawled = {row[0] for row in crawled_rows}
    return [href for href in candidates
            if href not in crawled and href.startswith('http')]
def savePicture():
    """Screenshot every distinct (name, href) shop page, reusing one driver.

    Fix: the original looked the shop name up with ``data.index(url)``, which
    is an O(n) scan per URL and returns the *first* match, so a duplicated
    href silently got the wrong name.  Iterating the (name, url) pairs
    directly is both correct and linear.
    """
    from screenShot import saveScreenShot
    from ms_spider_fw.DBSerivce import DBService
    import time
    import random

    db = DBService(dbName='tmalldata', tableName='tmall_baseinfo_realtime')
    rows = db.getData(var='name,href', distinct=True)
    print(len(rows))
    driver = None
    for name, url in rows:
        print(name)
        driver = saveScreenShot(url, driver=driver, title=name)
        # Jittered pause between shots to look less bot-like.
        time.sleep(abs(random.gauss(3, 2)))
# connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'} connect_dict = {'host': 'localhost', 'user': '******', 'passwd': '', 'charset': 'utf8'} # db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict) # proxy_list = map(lambda x: x[0], db_server.getData(var='proxy_port', distinct=True)) # for p in proxy_list: # qu_proxy_test.put(p) patt_ip = re.compile(r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d])') proxy_list = [] for table_name in table_name_s.split(','): print table_name db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict) if db_server.isTableExist(): proxy_list += map(lambda x: x[0], db_server.getData(var='proxy_port')) proxy_list_t=list(set(proxy_list)) for p in proxy_list_t: qu_proxy_test.put(p) def original_ip_address(): t = requests.get('http://httpbin.org/ip').text return json.loads(t).get('origin') original = original_ip_address() def test():
#!/usr/bin/env python # -*- encoding: utf-8 -*- from ms_spider_fw.DBSerivce import DBService import json import re import requests import sys from datetime import datetime reload(sys) sys.setdefaultencoding('utf8') db_server = DBService(dbName='test', tableName='weibo_cellphone') # , **connect_dict) data = db_server.getData(var='detail_json', limit=20) data = filter(lambda x: 1 if x[0][0] == '{' else 0, filter(lambda x: 1 if x[0] else 0, data)) re_sub_p = re.compile('<.+?>') re_sub_t = re.compile('\+\d+?\s') def time_format(ori): if not ori: return '' o = re.sub(re_sub_t, '', ori) s = datetime.strptime(o, '%a %b %d %H:%M:%S %Y') return s.strftime('%Y-%m-%d %H:%M:%S') # extract_info from json string def extract_info(x):
def startUrlList(self):
    """Build mall.jd.com view_search start URLs from every non-empty appID."""
    db = DBService(dbName="jddata", tableName="thirdPartShopAppID")
    rows = db.getData(var="appID")
    url_template = "http://mall.jd.com/view_search-%s-0-5-1-24-1.html"
    return [url_template % row[0] for row in rows if row[0]]
import threading
import time
from Queue import Queue as qu
from ms_proxy import proxy_test
from ms_spider_fw.DBSerivce import DBService

# config text
db_name = 'b2c_base'
# give some tables name to extract proxy list to test , different table name be combined use ','
table_name_s = 'proxy_you_dai_li,proxy_xi_ci_dai_li'
connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}

# Collect the distinct proxy_port values from every configured table.
proxy_list = []
for table_name in table_name_s.split(','):
    db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
    proxy_list += map(lambda x: x[0], db_server.getData(var='proxy_port', distinct=True))

# Alternative file-based proxy source, kept commented out.
# with open("d:/proxy_2.txt", 'r')as f:
#     t = f.read()
#     proxy_list = t.split('\n')

# script
# Feed the de-duplicated proxies into the to-test queue; qu_proxy_ok
# presumably collects the ones that pass — TODO confirm against test().
qu_proxy_test = qu(0)
qu_proxy_ok = qu(0)
for t in set(proxy_list):
    qu_proxy_test.put(t)


# NOTE(review): chunk is truncated here — the rest of test()'s body is not visible.
def test():
    while qu_proxy_test.qsize():
# coding:utf8
__author__ = '613108'

from ms_spider_fw.DBSerivce import DBService

dbs = DBService(dbName='elec_platform', tableName='tmall_baseinfo_everyweek')
rows = dbs.getData()
# Keep only rows whose second-to-last column (an integer count) is at least 35.
kept = [row for row in rows if int(row[-2]) >= 35]
print(len(kept))
#!/usr/bin/env python # -*- encoding: utf-8 -*- from ms_spider_fw.DBSerivce import DBService import json import re import requests import sys reload(sys) sys.setdefaultencoding('utf8') db_server = DBService(dbName='platform_data', tableName='jd_comment_woman_cloth') data = db_server.getData(var='comment_json', distinct=True, limit=10) data = filter(lambda x: 1 if x[0][0] == '{' else 0, filter(lambda x: 1 if x[0] else 0, data)) re_sub_p = re.compile('<.+?>') # extract_info from json string def extract_info(x): try: d_t = json.loads(x[0]) d = d_t['comments'] return [ { "id": it.get("id"), "content": it.get("content").replace('\n', ''), "creationtime": it.get("creationTime"), "referencename": it.get("referenceName"), "referencetime": it.get("referenceTime"),
def startUrlList(self):
    """Return every grade-page href stored in jd_shop_gradeHref."""
    db = DBService(dbName="jddata", tableName="jd_shop_gradeHref")
    return [row[0] for row in db.getData()]
__author__ = '613108'

from ms_spider_fw.DBSerivce import DBService

dbs = DBService(dbName='jddata', tableName='thirdPartShopInfo')
# Print the distinct-value count of several columns, in a fixed order,
# to gauge how complete the crawled data is.
for column in ('companyName', 'shopHref', 'shopName', 'gradeHref'):
    print(len(dbs.getData(var=column, distinct=True)))
def startUrlList(self):
    """Return the distinct shop hrefs from thirdPartShopInfo as start URLs."""
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    return [row[0] for row in db.getData(var='shopHref', distinct=True)]