def gen_url():
    # Build contact-info page URLs from the distinct store hrefs in aliexpress_temp.
    def url_join(t):
        if '.html' in t:
            return None
        temp = t.rsplit('/', 1)
        return temp[0] + '/contactinfo/' + temp[1] + '.html'

    def change_par(x):
        # Normalise localized sub-domains (pt/ru/es) to the www domain before joining.
        if '//www' in x:
            return url_join(x)
        elif '//pt' in x:
            return url_join(x.replace('//pt', '//www'))
        elif '//ru' in x:
            return url_join(x.replace('//ru', '//www'))
        elif '//es' in x:
            return url_join(x.replace('//es', '//www'))
        else:
            return None

    db_g = DBService(dbName=db_name, tableName='aliexpress_temp', **connect_dict)
    href_list_t = db_g.getData(var='store_href', distinct=True)
    href_s = map(change_par, map(lambda x: x[0], href_list_t))
    return list(set(filter(None, href_s)))
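# A minimal, self-contained sketch of the transform above, assuming a typical
# store href such as "http://pt.aliexpress.com/store/123456"; the sample URL and
# expected output are illustrative assumptions, not rows from aliexpress_temp.
def _demo_contact_url():
    sample = 'http://pt.aliexpress.com/store/123456'
    normalized = sample.replace('//pt', '//www')            # http://www.aliexpress.com/store/123456
    head, tail = normalized.rsplit('/', 1)
    contact_url = head + '/contactinfo/' + tail + '.html'   # .../store/contactinfo/123456.html
    print(contact_url)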
def push2DB():
    from ms_spider_fw.DBSerivce import DBService
    data = getKeyword()
    db = DBService('taobaodata', 'keyword')
    tableTitle = ['categoryFi', 'categorySe', 'categoryTi']
    db.createTable(tableTitle=tableTitle)
    db.data2DB(data=data)
def spiderMain(): """ # main主程序 :return: """ dler = Dler() dler.downLoad(100) DB = DBService(dbName='jddata', tableName='thirdPartShopInfo') DB.createTable( tableTitle=['productHref', 'companyName', 'shopName', 'shopHref', 'scoreSum', 'scoreProduct', 'scoreProductAvg', 'scoreService', 'scoreServiceAvg', 'scoreExpress', 'scoreExpressAvg', 'gradeHref']) while True: que = DBN.queueForDownLoad if not que.empty(): url, src = que.get() pPer = PPer(src) temp = pPer.pageParser() # proxy_test=temp[0] # if proxy_test=='-': # continue # else: # print(proxy_test) print(temp[0]) DB.data2DB(data=[url] + temp) else: time.sleep(1)
def productInfo():
    # Return {productHref: commentCount} for up to 200000 JD products.
    db = DBService(dbName='jddata', tableName='jdproductbaseinfo2database')
    data = db.getData(var='productHref,commentCount', limit=200000)
    proDict = {}
    for item in data:
        proDict[item[0]] = item[1]
    return proDict
def craweldhref():
    db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
    href = db.getData(var='href')
    href = [item[0] for item in href]
    # Strip a trailing slash so hrefs compare consistently.
    F = lambda x: x[:-1] if x[-1] == '/' else x
    href = map(F, href)
    print(len(href))
    return href
def spiderMain():
    # Main spider routine for JD shop grade scores.
    from ms_spider_fw.CSVService import CSV
    dler = Dler()
    dler.downLoad(100)
    DB = DBService(dbName="jddata", tableName="shop_grade_score")
    DB.createTable(
        tableTitle=[
            "gradeHref",
            "totalScore", "totalScore_avg",
            "productScore", "productScore_avg",
            "productQualityScore", "productQualityScore_avg",
            "productDescribeScore", "productDescribeScore_avg",
            "returnExchangeRate", "returnExchangeRate_avg",
            "serviceScore", "serviceScore_avg",
            "sellerCSI", "sellerCSI_avg",
            "distributionCSI", "distributionCSI_avg",
            "onlineServiceCSI", "onlineServiceCSI_avg",
            "returnExchangeCSI", "returnExchangeCSI_avg",
            "temporalityScore", "temporalityScore_avg",
            "expScore", "expScore_avg",
            "sendPromptnessScore", "sendPromptnessScore_avg",
            "returnExchangeTime", "returnExchangeTime_avg",
            "onLineSeriveTime", "onLineSeriveTime_avg",
            "spider_time",
        ]
    )
    que = DBN.queueForDownLoad
    while True:
        url, src = que.get()
        try:
            pPer = PPer(src)
            result = pPer.pageParser()
            total = [url] + result
            DB.data2DB(data=total)
            print(result)
        except:
            continue
def companyInfo():
    # Return company info as a dict keyed by company name.
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    data = db.getData(limit=200000)
    data = [item for item in data if not item[2] == '-']
    comDict = {}
    for item in data:
        comDict[item[1]] = item[1:]
    return comDict
def gen_url():
    # Collect distinct credit-detail URLs, dropping empty rows.
    DB = DBService(dbName="alibaba", tableName="alibaba_cow_powder_3")
    url_detail_page = DB.getData(var="credit_detail_href", distinct=True)
    urls = map(lambda x: x[0] if x else "", url_detail_page)  # empty string so the filter below drops it
    url = []
    for t in urls:
        if t:
            url.append(t)
    return url
def run(thread_count=1000):
    run_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    res = []
    print '#' * 100
    print qu_proxy_ok.qsize()
    while qu_proxy_ok.qsize():
        res.append(qu_proxy_ok.get())
    db_server_c.data2DB(data=res)
def proxy_collection():
    # Get proxies from the website...
    proxies_list_website = pc.get_proxies_from_website()
    # ...and, at the same time, other proxies from the local database.
    table_names_proxies = 'proxy_other_source,proxy_you_dai_li'
    proxies_list_local = list()
    for proxies_t_n in table_names_proxies.split(','):
        dbs = DBService(dbName='base', tableName=proxies_t_n, **connect_dict)
        proxies_list_local += map(lambda x: x[0], dbs.getData(var='proxy_port'))
    return list(set(proxies_list_website + proxies_list_local))
def commentHrefList():
    db = DBService('elec_platform', 'tmall_baseinfo_everyweek')
    judgePageHref = db.getData(var='name,href,judgepage_href')
    # Keep only rows whose judgepage_href is neither a full URL nor a plain number.
    judgePageHref = [tuple(item) for item in judgePageHref if 'http' not in item[2]]
    judgePageHref = [item for item in judgePageHref if not item[2].isnumeric()]
    judgePageHref = list(set(judgePageHref))
    print(len(judgePageHref))
    return judgePageHref
def run(thread_count=20000):
    muti_thread_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    res = []
    while qu_proxy_ok.qsize():
        res.append([
            qu_proxy_ok.get(),
            time.strftime('%Y-%m-%d %X', time.localtime())
        ])
    db_server_c.data2DB(data=res)
def begin():
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    data = db.getData()
    title = db.getTableTitle()[1:-2]
    # De-duplicate rows (id and the trailing two columns excluded) before export.
    S = set()
    for item in data:
        S.add(tuple(item[1:-2]))
    data = [list(item) for item in S]
    csv = CSV()
    csv.writeCsv(savePath='D:/spider', fileTitle=title, data=data, fileName='jdData')
def sumCommentCount():
    db = DBService(dbName='jddata', tableName='thirdPartShopInfoAddCommnetCount')
    # db = DBService(dbName='jddata', tableName='thirdPartShopInfoAddtest')
    data = db.getData(var='shopName,commnetCount')
    # Sum the comment counts per shop name.
    count_dict = {}
    for shop_name, comment_count in data:
        count_dict[shop_name] = count_dict.get(shop_name, 0) + int(comment_count)
    data = [[shop_name, count] for shop_name, count in count_dict.items()]
    csv = CSV()
    csv.writeCsv(savePath='D:/spider', fileTitle=['shopName', 'commnetCount'],
                 data=data, fileName='jdDataSum')
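# For reference, a self-contained sketch of the same per-shop aggregation using
# collections.defaultdict; the sample rows below are made-up values, not rows
# from thirdPartShopInfoAddCommnetCount.
from collections import defaultdict

def _demo_sum_comment_count():
    rows = [('shopA', '10'), ('shopB', '3'), ('shopA', '7')]  # assumed sample data
    counts = defaultdict(int)
    for shop_name, comment_count in rows:
        counts[shop_name] += int(comment_count)
    return sorted(counts.items())  # [('shopA', 17), ('shopB', 3)]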
def startUrlList(self): """ # 方法重载 :return: """ dbs = DBService(dbName='jddata', tableName='jdproductbaseinfo2database') data = dbs.getData(var='productHref,sku', distinct=True) dataThirdPartBase = [item[0] for item in data if len(item[1]) >= 10] dataHadCrawled = DBService(dbName='jddata', tableName='thirdPartShopInfo').getData(var='productHref') if not dataHadCrawled: return dataThirdPartBase dataHadCrawled = set([item[0] for item in dataHadCrawled]) dataThirdPart = [item for item in dataThirdPartBase if item not in dataHadCrawled] dataThirdPart = [item for item in dataThirdPart if item[:4] == 'http'] # print len(dataThirdPart) return dataThirdPart
def savePicture():
    from screenShot import saveScreenShot
    from ms_spider_fw.DBSerivce import DBService
    import time
    import random
    db = DBService(dbName='tmalldata', tableName='tmall_baseinfo_realtime')
    data = db.getData(var='name,href', distinct=True)
    print(len(data))
    dri = None
    # Iterate the (name, href) pairs directly instead of looking the name up by
    # index, which breaks when the same href appears twice.
    for name, url in data:
        print(name)
        dri = saveScreenShot(url, driver=dri, title=name)
        time.sleep(abs(random.gauss(3, 2)))
def spiderMain(): """ # main主程序 :return: """ dler = Dler() dler.downLoad(10) DB = DBService(#host='localhost', # user='******', # passwd='', # charset='utf8', # dbName='spider', dbName='alibaba', tableName='alibaba_cow_powder_3') DB.createTable(tableTitle= ['company_name', 'keyword', 'sale', 'href', 'member_id', 'offer_id', 'cxt_year', 'credit_detail_href', 'goods_from', 'product_title_sample', 'product_detail_sample', 'location', 'url_base']) while True: que = DBN.queueForDownLoad if not que.empty(): url, src = que.get() pPer = PPer(src) temp = pPer.pageParser() if temp: temp = map(lambda x: x + [url], temp) DB.data2DB(data=temp) print(u'++成功:%s'%url) else: print(u'--失败:%s'%url) else: time.sleep(1)
def spiderMain(): # 主程序 dler = Dler() dler.downLoad(100) DB = DBService(dbName="jddata", tableName="thirdPartShopSearchPage") DB.createTable(tableTitle=["tttt"]) while True: que = DBN.queueForDownLoad if not que.empty(): url, src = que.get() pPer = PPer(src) temp = pPer.pageParser() print("=" * 30) print(url) for item in temp: print(item) # DB.data2DB(data=[url] + temp) else: time.sleep(1)
def spiderMain():
    # Main spider routine: map shop hrefs to their appID.
    dler = Dler()
    dler.downLoad(100)
    DB = DBService(dbName='jddata', tableName='thirdPartShopAppID')
    DB.createTable(tableTitle=['shopHref', 'appID'])
    while True:
        que = DBN.queueForDownLoad
        if not que.empty():
            url, src = que.get()
            pPer = PPer(src)
            temp = pPer.pageParser()
            print('=' * 30)
            print(url)
            print(temp)
            if temp:
                DB.data2DB(data=[url] + temp)
        else:
            time.sleep(1)
def get_parser(url, driver):
    import random
    time.sleep(abs(random.gauss(5, 5)))
    driver.get(url)
    print(driver.title)
    # Contact person: name, sex and job title.
    contacts_name = "-"
    contacts_sex = "-"
    contacts_job = "-"
    try:
        contacts_name = driver.find_element_by_css_selector(".contact-info .membername").text
        contacts_sex = driver.find_element_by_css_selector(".contact-info>dl>dd").text.split(" ")[1]
        contacts_job = driver.find_element_by_css_selector(".contact-info>dl>dd").text.split("(")[1]
        contacts_job = contacts_job.split(")")[0]
    except:
        pass
    # Phone / fax / address block: match each <dt> label and read the paired <dd>.
    phone_frames = driver.find_elements_by_css_selector(".contcat-desc dl")
    cell_phone = "-"
    tel_phone = "-"
    fax_phone = "-"
    shop_addr = "-"
    for i in range(len(phone_frames)):
        text = driver.find_element_by_css_selector(
            ".contcat-desc dl:nth-child(" + str(i + 1) + ") dt").text.strip()
        if text == u"移动电话:":    # mobile phone
            cell_phone = driver.find_element_by_css_selector(
                ".contcat-desc dl:nth-child(" + str(i + 1) + ") dd").text
        elif text == u"电 话:":     # telephone
            tel_phone = driver.find_element_by_css_selector(
                ".contcat-desc dl:nth-child(" + str(i + 1) + ") dd").text
        elif text == u"传 真:":     # fax
            fax_phone = driver.find_element_by_css_selector(
                ".contcat-desc dl:nth-child(" + str(i + 1) + ") dd").text
        elif text == u"地 址:":     # address
            shop_addr = driver.find_element_by_css_selector(
                ".contcat-desc dl:nth-child(" + str(i + 1) + ") dd").text
    spider_time = time.strftime("%Y-%m-%d %X", time.localtime())
    result = [contacts_name, contacts_sex, contacts_job, cell_phone, tel_phone,
              fax_phone, shop_addr, spider_time, url]
    DB = DBService(dbName="alibaba", tableName="alibaba_cow_powder_phone")
    DB.data2DB(data=result)
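# A self-contained sketch of the same label matching written as a dict dispatch
# instead of an if/elif chain; the (dt, dd) text pairs below are assumed sample
# values, not live Selenium elements.
def _demo_label_dispatch():
    label_to_field = {u"移动电话:": "cell_phone", u"电 话:": "tel_phone",
                      u"传 真:": "fax_phone", u"地 址:": "shop_addr"}
    sample_pairs = [(u"移动电话:", u"138xxxxxxxx"), (u"地 址:", u"somewhere")]  # assumed data
    contact = {"cell_phone": "-", "tel_phone": "-", "fax_phone": "-", "shop_addr": "-"}
    for dt_text, dd_text in sample_pairs:
        field = label_to_field.get(dt_text.strip())
        if field:
            contact[field] = dd_text
    return contact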
def getCategoryAndStartUrl():
    import json
    global queue_for_url_targetBase
    queue_for_url_targetBase = Queue(0)
    # The category endpoint returns JSONP: strip the callback wrapper before parsing.
    src = myUrlOpen.requestByProxy('http://dc.3.cn/category/get?callback=getCategoryCallback')
    srcTemp = src.split('(', 1)[1][:-1]
    srcTemp = srcTemp.decode('gbk', 'ignore')
    srcJson = json.loads(srcTemp)['data']
    category = []
    for Fi in srcJson:
        targetFi = Fi['s']
        for Se in targetFi:
            targetSeTitle = Se['n']
            targetSe = Se['s']
            for Ti in targetSe:
                targetTiTitle = Ti['n']
                targetTi = Ti['s']
                for Fo in targetTi:
                    targetFoTitle = Fo['n']
                    categoryTemp = [targetSeTitle.split('|')[1], targetSeTitle.split('|')[0],
                                    targetTiTitle.split('|')[1], targetTiTitle.split('|')[0],
                                    targetFoTitle.split('|')[1], targetFoTitle.split('|')[0]]
                    category.append(categoryTemp)
                    queue_for_url_targetBase.put((targetFoTitle.split('|')[1], targetFoTitle.split('|')[0]))
    db = DBService(dbName='jddata', tableName='jdkeyword')
    db.createTable(tableTitle=['category_fi_name', 'category_fi', 'category_se_name', 'category_se',
                               'category_ti_name', 'category_ti'])
    db.data2DB(data=category)
    # for item in category:
    #     print(item)
    #     try:
    #         db.data2DB(data=item)
    #     except:
    #         continue
    # print('=' * 50)
    return category
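# A minimal sketch of the JSONP handling above, run on an inline sample string;
# the payload only mimics the 'n' / 's' nesting and is not real JD category data.
def _demo_strip_jsonp():
    import json
    sample = 'getCategoryCallback({"data": [{"n": "root|0", "s": [{"n": "phones|9987", "s": []}]}]})'
    body = sample.split('(', 1)[1][:-1]   # drop "getCategoryCallback(" and the trailing ")"
    data = json.loads(body)['data']
    first = data[0]['s'][0]['n']          # -> "phones|9987"
    return first.split('|')[0], first.split('|')[1]   # -> ("phones", "9987")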
def dataGen():
    comDict = companyInfo()
    proDict = productInfo()
    # Join the company info with the comment count on the shared key.
    merged = {}
    for key, value in comDict.items():
        if key in proDict:
            merged[key] = value + [proDict[key]]
    data = list(merged.values())
    db1 = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    title = db1.getTableTitle() + ['commnetCount']
    print(title)
    db2 = DBService(dbName='jddata', tableName='thirdPartShopInfoAddtest')
    db2.createTable(tableTitle=title)
    db2.data2DB(data=data)
# coding:utf8
__author__ = '613108'
from ms_spider_fw.DBSerivce import DBService

# Keep only the shops whose second-to-last column value is at least 35.
dbs = DBService(dbName='elec_platform', tableName='tmall_baseinfo_everyweek')
data = dbs.getData()
data = [item for item in data if int(item[-2]) >= 35]
print(len(data))
def putDataIntoDB(path):
    data = getData(path=path)
    dbs = DBService(dbName='elec_platform', tableName='tmall_baseinfo_weekly_2016')
    dbs.data2DB(data=data)
    print(len(judgePageHref))
    return judgePageHref


def craweldhref():
    db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
    href = db.getData(var='href')
    href = [item[0] for item in href]
    F = lambda x: x[:-1] if x[-1] == '/' else x
    href = map(F, href)
    print(len(href))
    return href


def href():
    # Keep only the shops whose href has not been crawled yet, then rebuild the
    # trailing slash and the full user-rate URL.
    temp1 = commentHrefList()
    temp2 = set(craweldhref())
    temp3 = [list(item) for item in temp1 if item[1] not in temp2]
    temp3 = [[item[0], item[1] + '/', 'http://rate.taobao.com/user-rate-' + item[2] + '.htm']
             for item in temp3]
    return temp3


temp = href()
db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
db.data2DB(data=temp, tableTitle=['name', 'href', 'judgepage_href'])
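# A minimal, self-contained sketch of the filtering done by href() above; the
# sample tuples are invented, not rows from the tmall tables.
def _demo_filter_uncrawled():
    all_shops = [('shopA', 'http://a.tmall.com', '123'), ('shopB', 'http://b.tmall.com', '456')]
    already_crawled = {'http://a.tmall.com'}   # hrefs already in the result table (slash stripped)
    remaining = [item for item in all_shops if item[1] not in already_crawled]
    return [[name, base + '/', 'http://rate.taobao.com/user-rate-' + uid + '.htm']
            for name, base, uid in remaining]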
def startUrlList(self):
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    data = db.getData(var='shopHref', distinct=True)
    data = [item[0] for item in data]
    return data
__author__ = '613108'
from ms_spider_fw.DBSerivce import DBService

dbs = DBService(dbName='jddata', tableName='thirdPartShopInfo')
companyCount = dbs.getData(var='companyName', distinct=True)
shopCount1 = dbs.getData(var='shopHref', distinct=True)
shopCount2 = dbs.getData(var='shopName', distinct=True)
gradeHref = dbs.getData(var='gradeHref', distinct=True)
print len(companyCount)
print len(shopCount1)
print len(shopCount2)
print len(gradeHref)
def startUrlList(self):
    # Build shop search-page URLs from the crawled appIDs.
    db = DBService(dbName="jddata", tableName="thirdPartShopAppID")
    data = db.getData(var="appID")
    data = ["http://mall.jd.com/view_search-" + item[0] + "-0-5-1-24-1.html"
            for item in data if item[0]]
    return data
import pymysql
import json
from ms_spider_fw.DBSerivce import DBService
import threading
import Queue

json_file_queue = Queue.Queue(0)
connect_jd = pymysql.connect(
    host='10.118.187.12',
    user='******',
    passwd='admin',
    database='platform_data'
)
connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}
dbs = DBService(dbName='platform_data', tableName='jd_data_temp_0326', **connect_dict)
dbs.createTable(
    tableTitle=map(lambda x: x.strip(),
                   'shop_name, addr, com_name, shop_href, cate_0, score_summary, '
                   'express_score, product_score, service_score,product_href, vender_id, '
                   'sku_id, size_count'.split(','))
)


def get_min_max_id():
    sql_min = 'SELECT MIN(id) FROM jd_product_detail'
    sql_max = 'SELECT MAX(id) FROM jd_product_detail'
    cur = connect_jd.cursor()
    cur.execute(sql_min)
    min_id = cur.fetchall()
reload(sys)
sys.setdefaultencoding('utf8')

# config_text
db_name = 'platform_data'
table_name = 'suning'
table_title = 'product_url,catalogue,sub_catalogue,product_title,promotion_desc,origin_price,price,' \
              'product_stars,comment_count,sending_service,other_service,product_params,shop_name,' \
              'shop_href,product_rating,product_rating_avg,serice_rating,service_rating_avg,express_rating,' \
              'express_rating_avg,com_name_tel,crawl_time'
url_start = 'http://www.suning.com/emall/pgv_10052_10051_1_.html'  # start url for crawl, string
connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}

# script
db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
if not db_server.isTableExist():
    db_server.createTable(tableTitle=table_title.split(','))


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(url_start, callback=self.step_first)

    @config(age=2 * 24 * 60 * 60)
    def step_first(self, response):