def crawling():
    PROJECT_NAME = 'intranet'
    HOMEPAGE = 'http://intranet.iitg.ernet.in/'
    DOMAIN_NAME = getDomainName(HOMEPAGE)
    QUEUE_FILE = PROJECT_NAME + '/queue.txt'
    CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
    NOTLINKS_FILE = PROJECT_NAME + '/notLinks.txt'
    CANNOTOPEN_FILE = PROJECT_NAME + '/cannotOpen.txt'
    NUMBER_OF_THREADS = 8
    queue = Queue()
    spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)

    # Create worker threads (die when main exits)
    def createWorkers():
        for _ in range(NUMBER_OF_THREADS):
            t = threading.Thread(target=work)
            t.daemon = True
            t.start()

    # Do the next job in queue
    def work():
        while True:
            url = queue.get()
            spider.crawlPage(threading.current_thread().name, url)
            queue.task_done()

    # Each queued link is a new job
    def createJobs():
        for link in fileToSet(QUEUE_FILE):
            queue.put(link)
        queue.join()
        crawl()

    # Check if there are links in the queue; if so, crawl them
    def crawl():
        queuedLinks = fileToSet(QUEUE_FILE)
        if len(queuedLinks) > 0:
            print(str(len(queuedLinks)) + ' links in queue')
            createJobs()

    createWorkers()
    crawl()
def test_marks_episode_as_do_not_download(self):
    mockDAL = Mock(spec=DAL.DAL)
    mockDAL.get_eps_for_show = MagicMock(return_value={1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})
    spiderbro = spider.spider(mockDAL)
    spiderbro.config.force_learn = True
    spiderbro.find_torrents_for_show("Constantine")
    mockDAL.mark_episode_do_not_download.assert_called_once_with("Constantine", 1, 13, unittest.mock.ANY, unittest.mock.ANY)
def __init__(self, proxies={'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}):
    """
    Creates an instance of the ZAP api client.

    :Parameters:
       - `proxies`: dictionary of ZAP proxies to use.

    Note that all of the other classes in this directory are generated;
    new ones will need to be manually added to this file.
    """
    self.__proxies = proxies

    self.acsrf = acsrf(self)
    self.ajaxSpider = ajaxSpider(self)
    self.ascan = ascan(self)
    self.authentication = authentication(self)
    self.autoupdate = autoupdate(self)
    self.brk = brk(self)
    self.context = context(self)
    self.core = core(self)
    self.forcedUser = forcedUser(self)
    self.httpsessions = httpSessions(self)
    self.importLogFiles = importLogFiles(self)
    self.params = params(self)
    self.pnh = pnh(self)
    self.pscan = pscan(self)
    self.script = script(self)
    self.search = search(self)
    self.selenium = selenium(self)
    self.sessionManagement = sessionManagement(self)
    self.spider = spider(self)
    self.users = users(self)
def __init__(self, num):
    self.num = num
    self.sp = spider.spider()
    if num == 4 or num == 5:
        self.sp.code = 'gb2312'
    self.html = ""
    # pattern for "java ... 工程师" (Java engineer) job titles
    self.engineName = r'.*?java.*?工程师.*?'
def gather(q, urls):
    """
    Parses each url in list urls and puts the item in queue q.
    """
    for url in urls:
        miner = spider.spider()
        q.put(miner.parse(url), True)
def get_match_ids():
    Requst = spider()
    res_match_ids = []
    next_id = '5584623680'
    last_id = next_id
    while True:
        url, params = gen_match_id_by_opendota(next_id)
        try:
            JsonMatches = Requst.GET(url, params)
            # print(JsonMatches)
            ids = get_all_match_ids(JsonMatches)
            add = 0
            for i in ids:
                if i not in res_match_ids:
                    res_match_ids.append(i)
                    add += 1
            print('get ids size = {}, add {} to res match ids, now size={}, now id ={}'.format(
                len(ids), add, len(res_match_ids), next_id))
            next_id = get_next_id(ids)
            if next_id is None:
                next_id = int(last_id) - 100
            last_id = next_id
            if len(res_match_ids) > 50000:
                saveData('data/match_ids.data', res_match_ids)
        except:
            print('error')
def main():
    print('现在是:' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # crawl
    print '开始爬取'
    web = spider()
    web.start()
    print '爬取完成'
    # convert to PDF
    print '开始转换'
    for title in web.titleSet:
        dirtopdf(title)
        dir_merge_pdf(title)
    print '转换完成'
    # send to Kindle
    print '开始发送'
    files = os.listdir("pdf")
    for file in files:
        sendEMAIL(file)
    print '发送完成'
    # clean up
    print '开始清理'
    for title in web.titleSet:
        shutil.rmtree(title)
    for file in os.listdir("pdf"):
        os.remove("pdf/" + file)
    print '清理完成'
    print 'ALL DONE!'
def test():
    # url = "http://www.mm131.com/chemo/"
    # sp = spider()
    # sp.getWebList(url)
    sp = spider()
    sp.getcode('http://www.mm131.com/xiaohua/12.html')
    sp.getimgdict("http://www.mm131.com/qingchun/1.html")
def __init__(self, num):
    threading.Thread.__init__(self)
    self.setDaemon(True)
    self.sp = spider()
    self.nulcount = 1
    # self.headurl = source.urllist[num][1]
    self.headurl = ""
    self.stop = False
    self.num = num
def main():
    sched = BlockingScheduler()
    # pass the callable itself: calling spider.spider() here would run the crawl once
    # and hand its return value to the scheduler instead of a job function
    sched.add_job(spider.spider, 'interval', seconds=21600)
    print 'Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')
    try:
        sched.start()
    except (KeyboardInterrupt, SystemExit):
        pass
def run(self):
    global timer_interval
    htmlMaker = HtmlMaker(self.REDIS_IP, self.REDIS_PORT, self.OUTPUT_PATH)
    sp = spider(self.LOG_ADDRESS, self.LOG_FORMAT, self.LOG_LEVEL, self.REDIS_IP, self.REDIS_PORT,
                self.REDIS_FREQUENCE, self.SPIDER_KEYS)
    while self.run_status:
        sp.capture()
        htmlMaker.makeHtml()
        self.sleep(timer_interval)
def spider_task():
    _REDIS.setnx(_PAGE_INDEX, 1)
    page_index = _REDIS.get(_PAGE_INDEX)
    task = spider.spider()
    while task.start(page_index):
        page_index = _REDIS.incr(_PAGE_INDEX)
        logging.info('Crawl the index page %s to finish' % page_index)
    while task.start():
        logging.info('Crawl the first page to finish, sleep one day')
        time.sleep(60 * 60 * 24)
def rumMain(now):
    '''Main routine.'''
    t_bg = time.perf_counter()
    logger.info('[WeiBo System] start to run...')
    message_file_path = os.path.join(path_message, f'messeage_{now}.txt')
    spider(urls=[
        f'https://weibo.com/{ii}?is_all=1&stat_date={months}#feedtop'
        for ii in targets
    ], message_file_path=message_file_path)
    try:
        send_email(f'Weibo{now}', message_file_path)
        logger.info('[Email] send e-mail successfully...')
    except:
        logger.warning(f'[Email] failed to send {message_file_path}!!!')
    t_ed = time.perf_counter()
    logger.info(f'[WeiBo System] end to run, escape {t_ed - t_bg:.2f} secs...')
def test_login(self, stock):
    code = stock[0]
    market = stock[1]
    global path
    path = './data/stock/%s/xueqiu/raw/' % (code)
    url = 'http://xueqiu.com/stock/industry/stockList.json?type=1&code=%s%s&size=8&_=1433829008414' % (market, code)
    sp = spider()
    print url
    stock_html = sp.getsource_xueqiu(url, 'utf-8')
    print stock_html
def preprocess(self, preproc_params=None):
    '''Pre-train the data with the provided preproc_params.'''
    # skip_flag defaults to False when no stored/new params can be compared
    if preproc_params is None:
        preproc_params = self.preproc_params
        skip_flag = False
    elif self.preproc_params is None:
        self.preproc_params = preproc_params
        skip_flag = False
    else:
        skip_flag = True
    p = preproc_params  # alias

    # discrete is no-op only for in-core; otherwise, skip_flag if same params
    discrete = p['discrete']
    i_reduced = self.i_reduced
    skip_flag &= i_reduced == self.preproc_params['i_reduced']
    skip_flag &= discrete == self.preproc_params['discrete']
    # use boolean 'not' here: bitwise ~ on a bool is always truthy
    if not self.outofcore:
        self.Xd = self.data[0]
        self.Xc = self.data[1]
        self.y = self.data[2]
        self.w = self.data[3]
    elif not skip_flag:
        with open("higgs/working/reduced_train.dat", 'rb') as f:
            for i in range(4 * i_reduced):
                temp = pickle.load(f)
            self.Xd = pickle.load(f)
            self.Xc = pickle.load(f)
            self.y = pickle.load(f)
            self.w = pickle.load(f)
            temp = None  # prevent pickling memory leak
    if discrete:
        self.X = self.Xd
    else:
        self.X = self.Xc
    if not skip_flag:
        self.reload_cv()

    # skip_flag spider if same params
    skip_flag &= p["spider"] == self.preproc_params["spider"]
    if not skip_flag:
        spider_params = p["spider"]
        spider_params['metric'] = 'wminkowski'
        spider_params['w'] = np.max(self.X, axis=0)
        self.X, self.y, self.w = spider(self.X, self.y, self.w, **spider_params)

    # skip_flag feature_selection if same params
    skip_flag &= p["feature_selection"] == self.preproc_params["feature_selection"]
    if not skip_flag:
        fs_alg = p["feature_selection"]["algorithm"]
        fs_params = p["feature_selection"]["params"]
        fs = FeatureSelector(algorithm=fs_alg)
        fs.fit(self.Xd, self.y, **fs_params)
        self.X = fs.transform(self.X)

    self.preproc_params = p
def test_topit(page_num=20, thread_num=10, limit=None, img_store_path="./pics/"):
    global sqlite
    global url_pool
    global g_workers
    global img_download_counter
    img_download_counter = 0
    URL_PREFIX = "http://www.topit.me/"

    def construct_root_url(num=2):
        url_prefix = URL_PREFIX
        url = lambda n: url_prefix + "?p=" + str(n)
        start_pages = map(url, range(num))
        start_pages = set(start_pages)
        return start_pages

    DB_PATH = "./topit.db"
    sqlite = sqlite3.connect(DB_PATH)
    cur = sqlite.cursor()
    cur.execute("SELECT url FROM urls")
    urls = cur.fetchall()
    # map() is lazy in Python 3, so iterate explicitly to actually populate the pool
    for u in urls:
        url_pool.add(u)
    del urls

    urls = construct_root_url(num=page_num)
    urls = list(urls)
    tasks = generate_init_tasks(urls=urls, img_store_path=img_store_path)

    g_workers = []
    store_queue = queue.Queue()
    img_download_queue = queue.Queue()
    parse_queue = queue.Queue()
    download_queue = queue.Queue()
    spider_instance = spider(urls=urls,
                             store_queue=store_queue,
                             img_download_queue=img_download_queue,
                             download_queue=download_queue,
                             parse_queue=parse_queue,
                             limits=limit)
    spider_instance._setup(tasks, download_queue)
    for i in range(thread_num):
        w = gevent.spawn(spider_instance.run)
        w.working = None
        w.page = None
        g_workers.append(w)
    gevent.joinall(g_workers)
def application():
    from skywalking.trace.context import get_context
    get_context().put_correlation("correlation", "correlation")

    # @runnable(op="/test")
    # def post():
    #     requests.post("http://127.0.0.1:9092/users")
    #
    # from threading import Thread
    # t = Thread(target=post)
    # t.start()
    #
    # res = requests.post("http://127.0.0.1:9092/users")
    #
    # t.join()

    mysqldb = MysqlTaskConfig().get_instance()
    spider(69, 70, mysqldb, "xiaohongshu")

    from kafka import KafkaProducer
    producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'], api_version=(1, 0, 1))
    producer.send('skywalking', b'xiaohongshu')
    return jsonify({"status": "okokokok"})
def run(email, password):
    spider.set_repo('repo_file')
    bot = spider.spider('my_friends', email, password)
    bot.log.setLevel(20)
    rid, login_info = bot.login()
    if rid is None:
        print('spider login error. detail:{}'.format(login_info))
        return
    else:
        print('spider login success. rid={}'.format(rid))
    spider.spider.getNet2(bot, rid)
    return rid
def MuluPaQu():
    # global declarations must precede any use of the names inside the function
    global count
    global Optimes
    print "==================================="
    print MuluPaQuInf.url
    try:
        print MuluPaQuInf.url
        count = 0
        spider.spider_url.append(MuluPaQuInf.url)
        spider.dir_url.append(MuluPaQuInf.url.split('?')[0])
        path = os.path.dirname(os.path.realpath(__file__)) + "\\MLPQ_scan.txt"
        if os.path.exists(path):
            os.remove(path)
        print spider.spider_url
        flag = 0
        while len(spider.spider_url) > 0:
            try:
                spider(spider.spider_url.pop())
                flag = 1
            except:
                pass
        if flag == 1:
            result4 = list()
            data4 = file('./file/result/MLPQ_scan.txt')
            for line in data4:
                result4.append({"id": line[5:-1]})
            final = list()
            final.append({"content": result4, "ways": "MuluPaQu"})
            str1 = 'file\history\op' + str(Optimes) + '.txt'
            with open(str1, 'a+') as f:
                f.write(str(final).replace("\'", "\""))
            Optimes = Optimes + 1
            f = open('./file/optimes.txt', 'wb')
            f.write(str(Optimes))
            return jsonify(result4)
    except:
        print "--------------------Error----------------------"
def get_data(locals):
    location = {
        "杭州": '080200', "上海": '020000', "北京": '010000', "广州": '030200',
        "深圳": '040000', '武汉': '180200', '宁波': '080300', "苏州": '070300',
        '南京': '070200', '长沙': '190200', '成都': '090200', '重庆': '060000',
        '昆明': '250200', '西安': '200200', '哈尔滨': '220200', '大连': '230300',
        '长春': '240200'
    }
    for local in locals:
        local_code = location[local]
        if not os.path.exists('data'):
            os.mkdir('data')
        file = 'data\\{}.csv'.format(local)
        with open(file, 'w') as f:
            f.close()
        for page in range(1, 2001):
            url = 'https://search.51job.com/list/{}' \
                  ',000000,0000,00,9,99,+,2,{}.html' \
                  '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&' \
                  'jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(local_code, page)
            with open('data\\t.text', 'w') as f:
                f.writelines(url)
                f.close()
            spider(url, file)
            print("保存成功!", end=" ")
            print("location:{},page={}".format(local, page))
def main():
    # sys.argv always contains the script name, so require at least one extra argument
    if len(sys.argv) < 2:
        print "Specify URL"
        sys.exit(1)
    try:
        the_spider = spider(0, 10, True)
        the_spider.work(sys.argv[1])
    except:
        raise
def orderSort(currentLink, workName, tag4link, class4link, tag4nextPage, class4nextPage, volume):
    link = currentLink
    while True:
        crawler = spider(link, workName)
        furtherLink = crawler.getLinks(tag4link, class4link)
        if furtherLink != None:
            for i in furtherLink:
                fileGenerator(workName, i, volume)
        link = crawler.nextPage(tag4nextPage, class4nextPage)
        if link == None:
            break
        time.sleep(1)
def __init__(self):
    Frame.__init__(self)
    self.spider = spider.spider()
    self.text = Text()
    self.alertButton1 = Button(self, text='单进程爬取网页', command=lambda: thread_it(self.pachong))
    self.alertButton2 = Button(self, text='多进程爬取网站', command=lambda: thread_it(self.morepachong))
    self.nameInput = Entry(self, width=50)
    self.listbox = Listbox(self)
    self.quitButton = Button(self, text='Quit', command=self.quit)
def main():
    init()
    spiders = []
    threadid = 2
    exitFlag = 0
    proxiesPool.update()  # fetch proxy IPs first
    proxyThread = spider.spider(  # start the proxy-pool thread
        updateProxiesPool, 1, exitFlag=exitFlag, pool=proxiesPool)
    proxyThread.start()
    spiders.append(proxyThread)
    for i in range(0, 30):  # start the user-fetching threads
        sp = spider.spider(crawlUppers, threadid,
                           carry=upperInputTask,
                           output=upperOutputTask,
                           cfg=cfg,
                           pool=proxiesPool)
        sp.start()
        spiders.append(sp)
        threadid = threadid + 1
    # main thread: insert results into the database
    while not upperInputTask.empty() or not upperOutputTask.empty():
        if not upperOutputTask.empty():
            upper = upperOutputTask.get()
            db.execute(
                sqlInsertUpperInfo,
                (upper.mid, upper.follower, upper.uname, upper.face,
                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(upper.regtime)),
                 upper.sex, upper.level, upper.vipType, upper.vipStatus))
    exitFlag = 1
    for i in spiders:
        i.join()
def contenter(BigworkName, tag4pat, class4pat, tag4doc, class4doc, volume):
    folder = BigworkName + '_linker'
    workName = BigworkName + '_content'
    position = [0, 0]
    logs = None
    try:
        file = open(workName + '.log', 'r')
        logs = file.readlines()
        file.close()
        logs = logs[-1]
        if '-------Programme Finished!-------' in logs:
            logs = 'end'
        else:
            logs = logs.split(' ')
            # current finished position: file # line #
            logs = [int(logs[4]) - 1, int(logs[6]) - 1]
    except:
        pass
    if logs == None:
        logger(workName, "-------Start Programme!-------")
    elif logs == 'end':
        print("The program is finished!")
        return None
    else:
        logger(workName, "-------Restart Programme!-------")
        position = logs
    while True:
        link = filePicker(BigworkName, folder, position)
        if link == None:
            break
        else:
            link = link.replace('\n', '')
            crawler = spider(link, workName)
            logger(workName, 'Current finished position: file ' + str(position[0] + 1) + ' line ' + str(position[1] + 1))
            text_patiant = crawler.getText(tag4pat, class4pat)[0]
            if text_patiant != None:
                text_doctor = crawler.getText(tag4doc, class4doc)[0]
                if text_doctor != None:
                    textGenerator(workName, text_patiant, 'pat', position, volume, 1)
                    textGenerator(workName, text_doctor, 'doc', position, volume)
            if position[1] == volume - 1:
                position[0] += 1
                position[1] = 0
            else:
                position[1] += 1
            time.sleep(1)
def write_to_csv(city, area, random_delay):
    '''
    :param city: city name
    :param area: district name
    :return: writes the scraped data to the file ershoufang-city-area.csv
    '''
    city_ch = cities[city]
    area_ch = get_city_area(city)[area]
    print('Now writing {0}|{1}'.format(city_ch, area_ch), 'to csv')
    with open('ershoufang-{0}-{1}.csv'.format(city_ch, area_ch), 'w') as csvfile:
        for info in spider(city, area, random_delay):
            print("Now writing:", '|'.join(info[0:5]))
            csvfile.write('|'.join(info))
            csvfile.write("\n")
def run(meth, orig_id=None):
    repo_mode, repo_name, user, passwd = init_config()
    spider.set_repo(repo_mode)
    tt = spider.spider(repo_name, user, passwd)
    tt.log.setLevel(20)
    my_rid, login_info = tt.login()
    if my_rid is None:
        print('spider login error. detail:{}'.format(login_info))
        if not input('continue for test?(1/0)'):
            return None
        else:
            my_rid = '11111111'
    else:
        print('spider login success. rid={}'.format(my_rid))
    if orig_id is None:
        orig_id = my_rid
    meth(tt, orig_id)
def main():
    global myRequests, headers
    # ssl._create_default_https_context = ssl._create_unverified_context
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)
    link = weixin()
    link.start()
    spi = spider()
    while 1:
        price = spi.start()
        print(price)
        # for price in prices:
        #     print(price[1])
        link.sendMsg('mountain blue', 'Daen', str(price))
        time.sleep(10)
def get_info():
    url = request.form.get('url')
    if re.search(u"http", url) is None:
        url = "http://" + url
    error = None
    # spider(url)
    iteminfo = spider(url)
    if iteminfo == {}:
        error = "invalid item"
        return render_template("show_info.html", error=error)
    if iteminfo != {}:
        title = iteminfo['t'].decode('utf-8')
        price = iteminfo['p'].decode('utf-8')
        links = iteminfo['l']
        # return iteminfo[1]
        return render_template("show_info.html", title=title, price=price, links=links)
def __init__(self, proxies=None, apikey=None):
    """
    Creates an instance of the ZAP api client.

    :Parameters:
       - `proxies`: dictionary of ZAP proxies to use.

    Note that all of the other classes in this directory are generated;
    new ones will need to be manually added to this file.
    """
    self.__proxies = proxies or {
        'http': 'http://127.0.0.1:8080',
        'https': 'http://127.0.0.1:8080'
    }
    self.__apikey = apikey

    self.acsrf = acsrf(self)
    self.ajaxSpider = ajaxSpider(self)
    self.ascan = ascan(self)
    self.authentication = authentication(self)
    self.authorization = authorization(self)
    self.autoupdate = autoupdate(self)
    self.brk = brk(self)
    self.context = context(self)
    self.core = core(self)
    self.forcedUser = forcedUser(self)
    self.httpsessions = httpSessions(self)
    self.importLogFiles = importLogFiles(self)
    self.params = params(self)
    self.pnh = pnh(self)
    self.pscan = pscan(self)
    self.reveal = reveal(self)
    self.script = script(self)
    self.search = search(self)
    self.selenium = selenium(self)
    self.sessionManagement = sessionManagement(self)
    self.spider = spider(self)
    self.stats = stats(self)
    self.users = users(self)

    # not very nice, but prevents warnings when accessing the ZAP API via https
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
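# For context, a minimal usage sketch of the spider component this client exposes.
# Assumptions (not taken from the code above): the client is importable as `zapv2`,
# ZAP listens on the default local proxy, and spider.scan / spider.status /
# spider.results are the polling calls; treat this as illustrative, not authoritative.
import time
from zapv2 import ZAPv2

zap = ZAPv2(apikey='changeme',
            proxies={'http': 'http://127.0.0.1:8080',
                     'https': 'http://127.0.0.1:8080'})

target = 'http://testsite.example'            # hypothetical target URL
scan_id = zap.spider.scan(target)             # start the traditional spider
while int(zap.spider.status(scan_id)) < 100:  # poll until the crawl reports 100%
    time.sleep(2)
print('spider found {} URLs'.format(len(zap.spider.results(scan_id))))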
def spider_btn(self):
    '''Handler for the start-spider button.'''
    try:
        # clear the log and the table
        self.textBrowser_spider.clear()
        self.tableWidget.setRowCount(0)
        # read the start/end ids and the save path
        start_id = self.lineEdit_start_id.text()
        end_id = self.lineEdit_end_id.text()
        filename = self.lineEdit_save.text()
        # instantiate the spider object and start it
        self.spiderObj = spider(start_id, end_id, filename)
        self.spiderObj.sig_one_end.connect(self.spider_end)
        self.spiderObj.sig_item_end.connect(self.spider_item)
        self.spiderObj.sig_end.connect(self.spider_end)
        self.spiderObj.start()
    except Exception as e:
        print(e)
def main():
    Config = config('conf/config.ini')
    MatchesDetailBaseUrl = Config.getMatchesUrl()
    Requst = spider()

    # get the match ids that have already been parsed
    MatchesIds = readMatchIdFromFile('data/match_ids.data')
    AlreadyDoneIdFile = 'tmp/AlreadyDoneId.data'
    AlreadyDoneId = []
    try:
        AlreadyDoneId = readAlreadyDoneIds(AlreadyDoneIdFile)
    except:
        traceback.print_exc()
    ids = list(set(MatchesIds) - set(AlreadyDoneId))

    # fetch match details by match id
    count = 0
    for item in ids:
        retry = 0
        while retry < 3:
            try:
                MatchID = item
                StoreFileName = 'data/matchesdata/{}.data'.format(MatchID)
                if os.path.exists(StoreFileName):
                    print('file {} exists.'.format(StoreFileName))
                    AlreadyDoneId.append(MatchID)
                    break
                print('start match_id={}'.format(MatchID))
                MatchesDetailUrl = changeMatchId(MatchesDetailBaseUrl, str(MatchID))
                print(MatchesDetailUrl)
                JsonMatchesDetail = getMatchesDetail(Requst, MatchesDetailUrl)
                saveData(StoreFileName, JsonMatchesDetail)
                AlreadyDoneId.append(MatchID)
                if count % 100 == 0:
                    saveData(AlreadyDoneIdFile, AlreadyDoneId)
                break
            except:
                traceback.print_exc()
                print('exception in {}, retry {} times'.format(MatchID, retry))
                retry += 1
        count += 1
def get_info():
    url = request.form.get('url')
    if re.search(u"http", url) is None:
        url = "http://" + url
    error = None
    # spider(url)
    iteminfo = spider(url)
    if iteminfo == {}:
        error = "invalid item"
        return render_template("show_info.html", error=error)
    if iteminfo != {}:
        title = iteminfo['t'].decode('utf-8')
        price = iteminfo['p'].decode('utf-8')
        links = iteminfo['l']
        # return iteminfo[1]
        return render_template("show_info.html", title=title, price=price, links=links)
def __init__(self, proxies={'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}):
    """
    Creates an instance of the ZAP api client.

    :Parameters:
       - `proxies`: dictionary of ZAP proxies to use.

    Note that all of the other classes in this directory are generated;
    new ones will need to be manually added to this file.
    """
    self.__proxies = proxies

    self.acsrf = acsrf(self)
    self.ascan = ascan(self)
    self.auth = auth(self)
    self.autoupdate = autoupdate(self)
    self.context = context(self)
    self.core = core(self)
    self.params = params(self)
    self.pscan = pscan(self)
    self.search = search(self)
    self.spider = spider(self)
def init(self):
    # This should all be done by the manifest parser
    globals.marek = marek.marek(120, 100, 2)  # initialize the marek object
    # use integer division so range() gets an int step
    increment = 580 // int(globals.spider_number)
    for i in range(20, 600, increment):
        sprite = spider.spider(i, 420)
        images = [engine.load_image('spider.png')]
        ani = em.Animation(images)
        event = em.SpriteEvent("onIdle", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.SpriteEvent("onWalkLeft", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.SpriteEvent("onWalkRight", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.SpriteEvent("onJump", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.SpriteEvent("onFall", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.SpriteEvent("onShoot", None, ani, 0, 0)
        sprite.em.add(event)
        event = em.Event("onCollide", None)
        sprite.em.add(event)
        globals.spiders.append(sprite)
    globals.camera = engine.Camera(globals.map, globals.marek, globals.window.width, globals.window.height)
def weiboIdToUrl(url):
    html = spider(url)
    return re.search(r'私信</a> \<a href="/(.*?)/info"\>资料</a>', html, re.S).group(1)
# COOKIES = weiboCookies
# url = weiboURL
# weiboIdSpider(spider(url + 'follow'))

zhihuCookies = {
    "Cookie": "_ga=GA1.2.200227533.1448713358; " +
              "q_c1=3c8a6952ff6b451186e548a78e07e5f3|1448717055000|1448717055000; " +
              "_za=c4856f7e-2b10-4c6f-ac1c-7120243828b1; _xsrf=53de1f0fc43b2118b48cfe714e889872; __utmt=1; " +
              'cap_id="NmJkYTc0OWUzZTQ4NGQyY2E3MjQ2ZmI0NWU0Mzk1MzM=|1448868103|a22b3ff3843b0e08bf078ff3cabd69c590ffe399"; ' +
              "__utma=51854390.200227533.1448713358.1448868015.1448868015.1; " +
              "__utmb=51854390.16.9.1448868181265; __utmc=51854390;" +
              "__utmz=51854390.1448868015.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; " +
              "__utmv=51854390.000--|2=registration_date=20130912=1^3=entry_date=20151128=1; "
              'z_c0="QUFBQXZRc2VBQUFYQUFBQVlRSlZUWGVHZzFhbTcyREhCeDYwdWRFRUx2RlZvemFOWm14ck9BPT0=|1448868215|cce7afe611ff3806d63d8d8b943b36af4c40302d"; ' +
              'unlock_ticket="QUFBQXZRc2VBQUFYQUFBQVlRSlZUWDhBWEZiRXBVVTZxdFA0bTluOEVHMWFIY3pNU2dvWWVBPT0=|1448868215|f0ede700d045c55277c0f34f20a00f332f6be485"'
}
zhihuURL = 'http://www.zhihu.com/people/xie-ke-41/followees'

# url = zhihuURL
# COOKIES = zhihuCookies
# zhihuIdSpider(spider(url))

COOKIES = weiboCookies
url = 'http://weibo.cn/2807748433/info'
weiboInfoSpider(spider(url))
def test_gets_only_aired_eps(self):
    spiderbro = spider.spider()
    mockdate = date(2015, 1, 20)  # only episodes up to s01e09 have aired at this point
    tvdb_episodes, status = spiderbro.get_tvdb_episodes('Constantine', mockdate)
    self.assertDictEqual(tvdb_episodes, {1: [1, 2, 3, 4, 5, 6, 7, 8, 9]})
def test_ignores_specials(self):
    spiderbro = spider.spider()
    tvdb_episodes = spiderbro.get_tvdb_episodes('Doctor Who (2005)')
    self.assertNotIn(0, tvdb_episodes)
# coding = utf-8
import re
import sys
import spider

name = 1


def p(dom, url):
    global name
    if re.findall('400-188-6666', str(dom)):
        f = open('F:\\webdeveloper\\z7z8\\88\\' + str(name) + '.html', 'wb')
        f1 = open('F:\\webdeveloper\\z7z8\\88\\list.txt', 'a')
        f1.write(url)
        f.write(dom)
        name = name + 1


print(sys.argv)
sp = spider.spider(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), p)
sp.start()
print(sp.pagelist())
def test_dont_mark_download_whole_season_if_season_not_finished_airing(self):
    mockDAL = Mock(spec=DAL.DAL)
    mockDAL.get_eps_for_show = MagicMock(return_value={})
    spiderbro = spider.spider(mockDAL)
    tvdb_episodes = spiderbro.get_missing_episodes('Orphan Black')
    self.assertNotEqual(tvdb_episodes[3], [-1])
def test_mark_show_as_ended(self):
    mockDAL = Mock(spec=DAL.DAL)
    spiderbro = spider.spider(mockDAL)
    tvdb_episodes, status = spiderbro.get_tvdb_episodes('Life On Mars')
    mockDAL.mark_show_ended.assert_called_with('Life On Mars')
def test_updates_show_table(self):
    mockDAL = Mock(spec=DAL.DAL)
    mockDAL.get_eps_for_show = MagicMock(return_value={1: [1, 2]})
    spiderbro = spider.spider(mockDAL)
    missing_episodes = spiderbro.get_missing_episodes("Life On Mars")
    mockDAL.update_show_table.assert_called_with('Life On Mars')
def scrap(start_day=None, end_day=None, start=1, end=20):
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    if end != None:
        expressions = expression.objects.filter(id__range=(start, end)).order_by('id')
    else:
        expressions = expression.objects.filter(id__range=(start, 3000)).order_by('id')
    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()
    cnt = 0
    file_path = ''
    for item in expressions:
        cnt += 1
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)  # "expression #<id>: <name>"
        # verify we are still logged in
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()
        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path != None:
            file_path = os.path.normpath(file_path)
            # logger.log(file_path)
            rows = excel_table_byindex(file_path, include_name=False)
            # delete the downloaded file
            os.remove(file_path)
            for row in rows:
                # skip the header row
                if row == rows[0]:
                    continue
                apply_num = row[0]
                # check for an existing patent with the same application number
                p = patent.objects.filter(apply_number=apply_num)
                if len(p) > 0:
                    logger.log("{0} update!".format(apply_num))
                    p = p[0]
                    records = excute_record.objects.filter(expression=item, time_stamp=row[6])
                    if len(records) > 0:
                        record = records[0]
                    else:
                        record = excute_record(expression=item, time_stamp=row[6])
                        record.save()
                    p.record = record
                    p.apply_number = row[0]
                    p.name = row[1]
                    p.main_classify_code = row[2]
                    p.classify_code = row[3]
                    p.apply_man = row[4]
                    p.invente_man = row[5]
                    p.publicity_date = row[6]
                    p.publicity_code = row[7]
                    p.patent_agent = row[8]
                    p.agent = row[9]
                    p.aplly_date = row[10]
                    p.address = row[11]
                    p.priority = row[12]
                    p.province_code = row[13]
                    p.abstract = row[14]
                    p.main_right = row[15]
                    p.international_apply = row[16]
                    p.international_publicity = row[17]
                    p.enter_country_date = row[18]
                    p.right_demand = row[20]
                    p.valid_state = row[21]
                    p.state_code = row[22]
                    p.type = row[23]
                    p.save()
                    continue
                logger.log(apply_num)
                # insert a new record
                records = excute_record.objects.filter(expression=item, time_stamp=row[6])  # row[6] == publication date
                if len(records) > 0:
                    # logger.log("record already exist !")
                    record = records[0]
                else:
                    record = excute_record(expression=item, time_stamp=row[6])  # row[6] == publication date
                    record.save()
                p = patent(
                    record=record,                      # the execution record this patent belongs to
                    apply_number=(row[0]),              # application number
                    name=(row[1]),                      # title
                    main_classify_code=row[2],          # main classification code
                    classify_code=row[3],               # classification code
                    apply_man=row[4],                   # applicant (patentee)
                    invente_man=row[5],                 # inventor (designer)
                    publicity_date=(row[6]),            # publication (announcement) date
                    publicity_code=row[7],              # publication (announcement) number
                    patent_agent=row[8],                # patent agency
                    agent=row[9],                       # agent
                    aplly_date=row[10],                 # application date
                    address=row[11],                    # address
                    priority=row[12],                   # priority
                    province_code=row[13],              # country/province code
                    abstract=row[14],                   # abstract
                    main_right=row[15],                 # main claim
                    international_apply=row[16],        # international application
                    international_publicity=row[17],    # international publication
                    enter_country_date=row[18],         # national-phase entry date
                    right_demand=row[20],               # claims
                    valid_state=row[21],                # legal status
                    state_code=row[22],                 # patent status code
                    type=row[23]                        # patent type
                )
                try:
                    p.save()
                except Exception, e:
                    logger.log(str(e), flush=True)
                    logger.log('failed to save patent!', flush=True)
hos_level = {
    33: '三级甲等',
    30: '三级医院',
    20: '二级医院',
    10: '一级医院',
}
num = 0
for x in hos_class:        # iterate over hospital types
    for y in hos_level:    # iterate over hospital grades
        for z in hos_area: # iterate over hospital districts
            # fetch the listing page
            hos_class_key = x
            hos_level_key = y
            hos_area_key = z
            url = "http://www.zj12580.cn/hos/all?page=1&pageSize=30&levelId=" + `hos_level_key` + "&typeId=" + `hos_class_key` + "&areaId=" + `hos_area_key`
            html = spider(url)
            print hos_level[hos_level_key], hos_class[hos_class_key], hos_area[hos_area_key]
            # parse the page
            soup = BeautifulSoup(html)
            content = soup.find('div', {"class": "left_hos_bottom"})
            hos_list = content.findAll("tr")
            # skip this combination when the page has no useful content
            if len(hos_list) == 0:
                continue
            for n in range(len(hos_list)):
                hos_info = hos_list[n].find('p', {'class': 'title'}).find('a')
                hos_name = hos_info.text.encode('utf8')  # hospital name
                hos_url = hos_info['href']               # hospital url
                match_obj = re.match(r'.*hos/info/(\d{1,4})\?deptCode.*', hos_url)
                hos_offi_id = match_obj.group(1)         # hospital id on the official site
                # sql = "insert into hospital(hos_name,class,level,region,hos_url)\
import re

import MySQLdb
import spider
from BeautifulSoup import BeautifulSoup

db = MySQLdb.connect("localhost", "root", "123456", "guahao")
cursor = db.cursor()
sql_charset = 'set names utf8'
cursor.execute(sql_charset)
sql = 'select hos_id,hos_url,hos_name from hospital '
cursor.execute(sql)
results = cursor.fetchall()
for row in results:
    hos_id = int(row[0])
    hos_url = row[1]
    hos_name = row[2]
    print hos_name
    if hos_url == 'http://www.bjguahao.gov.cn/comm/yyks-91.html':
        print '朝阳医院出现问题'  # known problem page for Chaoyang Hospital
    else:
        html = spider.spider(hos_url)
        soup = BeautifulSoup(html)
        link_a = soup.find('a', {'rel': '#gm-ditu'})
        img_src = link_a.find('img')['src'].encode('utf8')
        match_obj = re.match(r'.*center=(\d{1,4}\.\d{4,7}),(\d{1,4}\.\d{5,7}).*', img_src)
        longitude = match_obj.group(1)
        latitude = match_obj.group(2)
        sql_2 = 'update hospital set longitude =%s,latitude=%s where hos_id = %d' % (longitude, latitude, hos_id)
        cursor.execute(sql_2)
        db.commit()
        print longitude, latitude
db.close()
def test_saves_successful_search_in_database(self):
    mockDAL = Mock(spec=DAL.DAL)
    mockDAL.get_eps_for_show = MagicMock(return_value={1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})
    spiderbro = spider.spider(mockDAL)
    spiderbro.find_torrents_for_show("Constantine")
    mockDAL.mark_episode_for_download.assert_called_once_with("Constantine", 1, 13, unittest.mock.ANY, unittest.mock.ANY)
def spider(self):
    resultado = "Inicio"
    urlIni = self.par1.get()
    deep = self.par2.get()
    resultado = spider.spider(urlIni, deep)
    self.x.set(resultado)
def test_can_get_missing_episodes(self):
    mockDAL = Mock(spec=DAL.DAL)
    mockDAL.get_eps_for_show = MagicMock(return_value={1: [1, 2]})
    spiderbro = spider.spider(mockDAL)
    missing_episodes = spiderbro.get_missing_episodes("Life On Mars")
    self.assertDictEqual(missing_episodes, {1: [3, 4, 5, 6, 7, 8], 2: [-1]})
# -*- coding:utf-8 -*-
import spider

if __name__ == '__main__':
    word = input("Input key word: ")
    url = 'https://www.baidu.com/sf/vsearch?wd=' + word + '&pd=video'
    spider = spider.spider()
    urls = spider.parseHtml(spider.getHtml(url))
    spider.dowmloadVideos(urls)
import threading
from queue import Queue
from spider import spider
from domain import *
from general import *

PROJECT_NAME = 'wikipedia'
HOMEPAGE = 'https://www.wikipedia.org'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 2
queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
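# The snippet above stops after work(); below is a hedged sketch of the driver code that
# usually completes this pattern (it mirrors the crawling() example earlier in this file).
# file_to_set is assumed to come from the general module; the real helper name may differ.

# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue; if so, crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()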
def test_can_get_tvdbshow_episode_list(self):
    spiderbro = spider.spider()
    tvdb_episodes, status = spiderbro.get_tvdb_episodes('Life On Mars')
    self.assertDictEqual(tvdb_episodes, {1: [1, 2, 3, 4, 5, 6, 7, 8], 2: [1, 2, 3, 4, 5, 6, 7, 8]})
# python 3 compatibility
import configuration
import DAL
import logging as log
import spider

configuration.setup_logging()
log.info("SpiderBro, two point oh!")
dal = DAL.DAL()
sbro = spider.spider(dal)

# TODO: implement cmdline args for 1 show, all shows, all shows airing this week
for show in dal.get_full_show_list():
    ga_list = sbro.find_torrents_for_show(show)
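# A sketch of how the TODO above might be wired up with argparse; the --show flag and the
# fallback to the full show list are illustrative assumptions, not the project's actual CLI.
import argparse

parser = argparse.ArgumentParser(description="SpiderBro torrent finder")
parser.add_argument("--show", help="search a single show instead of the full list")
args = parser.parse_args()

shows = [args.show] if args.show else dal.get_full_show_list()
for show in shows:
    ga_list = sbro.find_torrents_for_show(show)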