def dispose(ws, res):
    try:
        if res['command'] == 'GET_SCORE':  # fetch grades
            score = spider.Spider().get_score(res)
            db.DB().update_score({"user": res['user'], "score": score})
            ws.send(json.dumps({"command": "GET_SCORE", "result": True}))
    except Exception:
        try:
            # log in
            s = spider.Spider()
            if not res['check_code']:
                check_code_bs4 = s.get_check_code()
                # send the captcha image back to the server as base64
                ws.send(json.dumps({'username': res['username'],
                                    'check_code_bs4': check_code_bs4}))
            # log in with the user credentials and the captcha
            s.login(res['username'], res['passwd'], res['check_code'])
        except Exception:
            raise
        return res
    try:
        if res['login_test']:
            s = spider.Spider()
            # log in with the user credentials and the captcha
            s.login(res['username'], res['passwd'], res['check_code'])
            if s.check_logged_in():
                try:
                    db.DB().add_user(res['username'], res['passwd'])
                except Exception:
                    pass
                ws.send(json.dumps({"login_test": True}))
    except Exception:
        return res
def main():
    if len(sys.argv) == 2 and sys.argv[1] == "help":
        print("""
Usage: %s (spider|tohtml|to-one-file) bookname

options:
    spider: spider the given bookname, producing program-usable and human-\
readable (though not pretty) files.
    tohtml: use the spidered content to create an indexed, linked HTML page \
that can be served with the command "python -m http.server" (the HTML isn't \
perfect, I'm still working on it)
    to-one-file: (not made yet) (use the spidered content and merge it into \
one large .txt file for easy copying and reading)

    bookname: the name that appears in the last part of the URL when you open \
it in the browser (right after "book") (e.g. langtuteng)
""" % os.path.basename(sys.argv[0]))
    elif len(sys.argv) == 3:
        do, bookname = sys.argv[1:]
        if do == 'spider':
            import spider
            spider.Spider(bookname).spider()
        elif do == "tohtml":
            import tohtml
            tohtml.HtmlWriter(bookname)
    else:
        sys.stderr.write("""Usage: python {0} (spider|tohtml|to-one-file) bookname
or     python {0} help""".format(os.path.basename(sys.argv[0])))
        raise SystemExit
def main():
    """Entry point."""
    path = read_arg()
    config = spider_conf.SpiderConf()
    config.read_config_args(path)
    my_spider = spider.Spider(config)
    my_spider.run()
def __init__(self, master=None):
    Frame.__init__(self, master)
    self.root = master  # keep a reference to the root window
    self.word = StringVar()
    self.mean = StringVar()
    self.createPage()
    self.mysqlClient = MysqlClient()
    self.spider = spider.Spider()
def crawl_n(start=0, end=None):
    # end=None crawls through the last row of the CSV; a default of -1 would
    # silently drop the final company when slicing below.
    crawl_list = pd.read_csv("./website_list.csv")
    for url in crawl_list["Company website address"][start:end]:
        s = spider.Spider(mongo_client)
        urls, page_contents = s.traverse_domain_bfs(url, max_depth=1)
        clean_text_col = get_clean_text_blocks(page_contents)
        insert_ids = put_mongo_records(mongo_client, "web_crawl",
                                       "web_text_content", clean_text_col)
def work():
    while True:
        # get() blocks until the queue has an item to return
        iterator = queue.get()
        record = spider.Spider(threading.current_thread().name, PROJECT_URL_1,
                               PROJECT_INITIAL_NUM, iterator, PROJECT_URL_2)
        with open(PROJECT_NAME + '.csv', 'a', encoding="utf-8") as table:
            csv_writer = csv.writer(table, delimiter='\t')
            csv_writer.writerow(record.get_list())
        iterator_set.remove(iterator)
        queue.task_done()
def crawl():
    case_numbers = []
    start_number = CASE_START_NUMBER
    for j in range(NUMBER_OF_RANGES):
        for i in range(start_number, start_number + RANGE):
            case_number = CASE_PREFIX + str(i)
            case_numbers.append(case_number)
        spider1 = spider.Spider(PROJECT_NAME, URL, case_numbers)
        spider1.craw_page()
        spider1.save_results()
        case_numbers = []
        start_number += RANGE
def do_POST(self):
    data = self.rfile.read(int(self.headers.getheader('content-length')))
    self.send_response(200)
    self.send_header('Content-type', 'application/json')
    self.end_headers()
    # the body is expected to be "<url> <maxPages> <maxLinks>"
    url = data.split(' ')[0]
    maxPages = int(data.split(' ')[1])
    maxLinks = int(data.split(' ')[2])
    s = spider.Spider(seedUrls=[url], maxPages=maxPages,
                      linksPerPage=maxLinks, scrapingFuncs=[createAdjList])
    s.adjList = {}
    s.crawl()
    json.dump(s.adjList, self.wfile)
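A hedged client-side sketch for the handler above (host and port are assumptions, not from the source): the request body is plain text of the form "<url> <maxPages> <maxLinks>", and the response is the adjacency list as JSON.

import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000",            # assumed host/port
    data=b"https://example.com 10 5",   # url, maxPages, maxLinks
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    adj_list = json.load(resp)
print(adj_list)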
def main(config):
    app_spider = spider.Spider()
    user_rsp = input("Does the web application require authentication: [y/n]")
    cred = dict()
    if user_rsp == "y" or user_rsp == "Y":  # compare by value, not identity
        cred["USER_NAME"] = input("Username: ")
        cred["PASS"] = input("Password: ")
    print(time.asctime(), "Web App Spider started crawling %s" % (config["APP_ADDRESS"]))
    web_app_config, vul_pages = app_spider.crawl(config["APP_ADDRESS"], cred)
    print(web_app_config)
    print("Pages vulnerable to HPP are: ", vul_pages)
    app_spider.teardown()
    print(time.asctime(), "WAF finished crawling")
def main():
    """Main function that drives the crawl."""
    s = spider.Spider(config.start_url,
                      num=config.num,
                      max_layers=config.depth,
                      succ_path=config.log_succ_path,
                      fail_path=config.log_fail_path,
                      save_path=config.root_save_path,
                      time_out=config.time_out,
                      filter_mode=config.filter_mode,
                      root_domain=config.root_domain,
                      use_daili=config.use_daili)  # use_daili ("daili" = proxy) toggles proxy use
    s.start()
    s.run()
    print(len(s.has_url_set))
def getdata(Name):
    url = "https://baike.baidu.com/item/" + Name.encode(encoding='UTF-8')
    sp = spider.Spider()
    # download = dowload_url.Dowwload()
    # print download.crawdata(url)
    data = sp.get_node(url)
    print data["title"]
    data_dict = {
        "nodes": [{"name": data["title"], "symbolSize": 60, "category": 0}],
        "links": []
    }
    #data_dict["nodes"].append({"name": data["name"], "symbolSize": 60, "category": 0})
    nodes = data["nodes"]
    for node in nodes:
        data_dict["nodes"].append({"name": node["name"], "symbolSize": 40,
                                   "category": 1})
        data_dict["links"].append({"source": data["title"],
                                   "target": node["name"]})
        node_url = node["url"]
        node_data = sp.get_node(node_url)
        if node_data is not None:
            print "  " + node_data["title"]
            source = node["name"]
            for third_node in node_data["nodes"]:
                print "    " + third_node["name"]
                data_dict["nodes"].append({"name": third_node["name"],
                                           "symbolSize": 20, "category": 2})
                data_dict["links"].append({"source": source,
                                           "target": third_node["name"]})
    return data_dict
def test_spider(self):
    my_spider = spider.Spider(self.conf)
    my_spider.run()
    file_list = os.listdir(self.outputfile)  # list everything in the output directory
    test_output = set(file_list)
    expected_output = set()
    expected_output.add(converse("http://pycm.baidu.com:8081"))
    expected_output.add(converse("http://pycm.baidu.com:8081/page1.html"))
    expected_output.add(converse("http://pycm.baidu.com:8081/page2.html"))
    expected_output.add(converse("http://pycm.baidu.com:8081/page3.html"))
    expected_output.add(converse("http://pycm.baidu.com:8081/mirror/index.html"))
    expected_output.add(converse("http://pycm.baidu.com:8081/page4.html"))
    self.assertSetEqual(test_output, expected_output)
def spider_run(config_file):
    # read config
    spider_config = config_load.read_config(config_file)
    urls = file_util.read_file(spider_config.url_list_file)

    # set urls, build task
    url_center = url_manage.UrlManage()
    url_center.url_list_put(urls)

    # set up one spider per thread
    spider_func_list = []
    for i in xrange(spider_config.thread_count):
        sp = spider.Spider(spider_config, url_center)
        spider_func_list.append(sp.craw)
    task = thread_task.ThreadTask(spider_func_list)

    # start crawling
    task.task_start()
def run_spider():
    if request.method == "POST":
        params = get_values(request)
        print("got a request, params are as follows:")
        for key in params.keys():
            print("{} : {}".format(key, params[key]))
        s = spider.Spider(start_url=params["start_url"],
                          num=params["num"],
                          max_layers=params["depth"],
                          use_daili=params["use_daili"],
                          time_out=params["time_out"],
                          filter_mode=params["filter_mode"],
                          root_domain=params["root_domain"])
        s.start()
        s.run()
        # content returned to the caller
        content = {"download_num": len(s.has_url_set), "layer": s.layer}
        res = make_response(jsonify(content))
        res.headers['Access-Control-Allow-Origin'] = "*"
        return res
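A hedged caller sketch for the endpoint above (the route path and port are assumptions, not in the source): POST the crawl parameters as form fields and read the JSON summary back.

import requests

payload = {
    "start_url": "http://example.com",
    "num": 100,
    "depth": 2,
    "use_daili": 0,      # whether to crawl through proxies
    "time_out": 10,
    "filter_mode": 0,
    "root_domain": "example.com",
}
# "/run_spider" is a hypothetical route name for this handler
resp = requests.post("http://localhost:5000/run_spider", data=payload)
print(resp.json())       # e.g. {"download_num": ..., "layer": ...}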
def main():
    """1. Initialize logging.
    2. Parse the command line to get the config file path.
    3. Create a Spider object and initialize it.
    4. Start crawling.
    """
    log.init_log('./log/spider')
    config_path = parse_commandline()
    if config_path is None:
        print_usage()
    else:
        # create a spider and start it
        _spider = spider.Spider()
        if _spider.initialize(config_path):
            _spider.start()
            _spider.print_info()
            logging.info("All threads finished")
        else:
            logging.error("Initializing spider failed")
            return False
def work():
    while True:
        time.sleep(3)  # throttle the crawl speed
        # get() blocks until the queue has an item to return
        iterator = queue.get()
        record = spider.Spider(threading.current_thread().name, PROJECT_URL_1,
                               PROJECT_INITIAL_NUM, iterator, PROJECT_URL_2)
        if record.get_list()[1] != 'FAIL':
            with open(PROJECT_NAME + '.csv', 'a', encoding="utf-8") as table:
                for attribute in record.get_list():
                    table.write(str(attribute))
                    table.write('\t')
                table.write('\n')
            iterator_set.remove(iterator)
            print('Queue ' + str(len(iterator_set)) + ' | Crawled ' +
                  str(PROJECT_MAX_PAGES - len(iterator_set)))
            queue.task_done()
        else:
            print('Queue ' + str(len(iterator_set)) + ' | Crawled ' +
                  str(PROJECT_MAX_PAGES - len(iterator_set)))
            queue.task_done()
            queue.put(iterator)  # requeue the failed page for another attempt
def __init__(self, host, port):
    asyncore.dispatcher.__init__(self)
    self.interval = SEND_INTERVAL
    self.recvbuffsize = 1024000
    self.log_file = 'spider.log'
    handler = logging.handlers.RotatingFileHandler(self.log_file,
                                                   maxBytes=1024 * 1024,
                                                   backupCount=5)
    fmt = '[%(asctime)s][%(levelname)s]%(filename)s:%(lineno)s - %(message)s'
    formatter = logging.Formatter(fmt)
    handler.setFormatter(formatter)
    self.logger = logging.getLogger('spider')
    self.logger.addHandler(handler)
    self.logger.setLevel(logging.DEBUG)
    self.spider = spider.Spider(self.logger)
    self.messages = [{
        'layer': 0,
        'url': 'http://www.baike.com/wiki/%E6%A2%85%E8%A5%BF'
    }]
    self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
    self.connect((host, port))
# Run the crawler in multithreaded mode
import threading
from queue import Queue
import spider
from domain import *
from crawler1 import *

PROJECT_NAME = 'the crawler'  # constant
HOMEPAGE = 'https://twitter.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
spider.Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# create worker threads (they will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()  # needed so queue.join() can return
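The snippet above creates workers but never seeds the queue. A minimal sketch of the missing wiring, assuming Spider creates and maintains QUEUE_FILE and that a file_to_set helper (not shown in the original) reads it into a set:

# hypothetical wiring, not in the original snippet
def file_to_set(file_name):
    # assumed helper: one URL per line in QUEUE_FILE
    with open(file_name, 'rt') as f:
        return set(line.strip() for line in f)


def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


def crawl():
    # keep going while the spider leaves links in the queue file
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()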
def official_server_fetch(self):
    # the False argument suppresses the timeout exception, so the block is
    # silently abandoned after 7 seconds
    with Timeout(7, False):
        _, result = spider.Spider().deploy(self.phrase)
        if result:
            self.racer_weapon(result, gun='official')
if args.scope is not None:
    i_sc = args.scope  # takes a filename or a list
else:
    # if there's no in-scope argument, fall back to the list of domain names
    i_sc = []
    for url in urls:
        # split off http[s]:// and anything after a /
        i_sc.append(url.split("://")[1].split("/")[0])
if args.outscope is not None:
    sc = scope.Scope(i_sc, args.outscope)
else:
    sc = scope.Scope(i_sc)
wl = Wordlist(args.regex)
sp = spider.Spider(*urls, scope=sc, max_depth=args.max_depth,
                   threads=args.threads, wordlist=wl, verbose=args.verbose)
sp.run()
if args.write:
    with open(args.write, "w") as f:
        print("writing to file...")
        for word in sp.next:
            f.write(word + "\n")
else:
    for word in sp.next:
        print(word)
def __init__(self):
    self.browser = spider.Spider(desktop=False, chrome=True)
    vp = viewport.ViewPort(browser=self.browser)
    self.viewport_dictionary = vp.element_view_dictionary_errors
import re
import BeautifulSoup
import spider

s = spider.Spider()
domain = 'http://edcorner.stanford.edu'
url = '/authorMaterialInfo.html?topicId=%s&d-5886639-p=%s'
topic = 1
page = 1


def links_on_page(html):
    soup = BeautifulSoup.BeautifulSoup(html)
    tbody = soup.find('table', {'id': 'materialElement'}).tbody
    links = []
    for tr in tbody.findAll('tr'):
        tds = [td for td in list(tr.findAll('td'))]
        if tds[0].renderContents().lower() == 'video':
            links.append(tds[1].find('a')['href'])
    return links


def get_pages(topic=1):
    html = s.get(domain + url % (topic, page))
    soup = BeautifulSoup.BeautifulSoup(html)
    td = soup.find('td', {'class': 'rightInfo'})
    page_numbers = [1]
    page_numbers.extend([int(a.renderContents()) for a in td.findAll('a')])
    num_pages = max(page_numbers)
    return num_pages
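A possible driver, sketched here as an assumption (it is not in the original): combine get_pages() and links_on_page() above to collect every video link for a topic.

# hypothetical helper: walk each results page and gather the video links
def all_video_links(topic=1):
    links = []
    for p in range(1, get_pages(topic) + 1):
        html = s.get(domain + url % (topic, p))
        links.extend(links_on_page(html))
    return links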
import spider
import dbWrite

if __name__ == "__main__":
    db = dbWrite.dbWriter("localhost", "root", "mypassword", "xinhuadata")
    db.connect()
    crawler = spider.Spider(['http://www.xinhuanet.com'])  # avoid shadowing the spider module
    crawler.setCallback(db.write)
    crawler.startCrawl()
sys.setdefaultencoding('utf8')

if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    #cf.read("./conf/nice.conf")
    cf.read(sys.argv[1])
    # initialize logging
    log.init_log(cf.get("nice", "BASE") + cf.get("nice", "LOG_FILE"),
                 log.logging.INFO)
    log.logging.info("read conf ok [%s]" %
                     time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time())))
    crawled_avatar_conf = sys.argv[2]
    crawled_pic_conf = sys.argv[3]
    Spider = spider.Spider(cf)
    # load the seed users and the avatars/pictures already crawled
    Spider.prepare(crawled_avatar_conf, crawled_pic_conf)
    # crawl
    time_now = int(time.time())
    log.logging.info("spider nice job start [%s]",
                     time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time_now)))
    Spider.work(time_now)
    # save the per-user crawl state
    Spider.finish()
    log.logging.info("spider nice job done [%s]" %
                     time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time())))
import spider
import sys

reload(sys)
sys.setdefaultencoding('utf8')

requesturl = "http://space.bilibili.com/ajax/member/GetInfo"
headers = dict()
headers['User-Agent'] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/45.0.2454.85 Safari/537.36 QQBrowser/3.9.3943.400")
headers['X-Requested-With'] = "XMLHttpRequest"
headers['Referer'] = "http://space.bilibili.com/2/"
type = "POST"

birthdataspyder = spider.Spider("birthdata")
mid = 33307722
while mid < 33307723:
    values = dict()
    values['mid'] = mid
    mid = mid + 1
    birthdataspyder.Jobs.AddOneJob(type, requesturl, headers, values)
    birthdataspyder.RequestOne()
    birthdataspyder.WriteOne()
# coding=utf-8
import spider

sp = spider.Spider()
sp.run('http://computer.hdu.edu.cn/', 'computer.hdu.edu.cn')
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#import cookielib
import urllib.request
import re
import sys
from urllib.request import urlretrieve
import spider
#import requests

#reload(sys)
#sys.setdefaultencoding("utf-8")

url = "http://app.mi.com/download/62289?id=com.xiaomi.o2o&ref=appstore.mobile_download&nonce=-553189008766792780%3A25943398&appClientId=2882303761517485445&appSignature=gmmJ_uC754lfFN_2Smb-GvUMPjg724AWAu7IMLBmdlU"
#response1 = urllib.request.urlopen(url)
#result = response1.read().decode('UTF-8')
#urlretrieve(url, './apps/xiaomi')

category = {}

if __name__ == '__main__':
    url = "http://app.mi.com"
    crawler = spider.Spider(url)
    result = crawler.getSubpage()
    print(result)
    #if spider.downloadApp(url, 'xiaomi', 'shopping'):
    #    print("finished!")
    #else:
    #    print("Failed!")
def crawling():
    """Main crawl routine.

    Returns result (dict) and CanNotCrawl (a uP pool of failed urls).
    """
    # ---- decide how to build the url pool ----
    if URL_BUILD_MODE == 1:
        # read the pool straight from CanNotCrawl.json
        with open("./CanNotCrawl.json", "r", encoding="ascii") as f:
            jobUrls = json.load(f)
    elif URL_BUILD_MODE == 2:
        # use a single test url
        jobUrls = [TEST_URL]
    elif URL_BUILD_MODE == 0:
        # generate the pool via the urls module
        page_num = urls.getPageNum()
        pageUrls = urls.webUrlsPool(page_num)
        number0 = 0
        jobUrls = []
        for pageUrl in pageUrls:
            number0 += 1
            jobUrlsHelp = urls.getJobUrls(pageUrl)
            jobUrls = jobUrls + jobUrlsHelp
            if number0 % 5 == 0:
                randomDelay()
                print("url intake progress:", number0)
    # ---- build the pool of urls to crawl ----
    toCrawl = uP()
    for x in jobUrls[:]:
        toCrawl.pressIn(x)
    print("number of urls to crawl:", toCrawl.howMany())
    # ---- build the pool for urls that failed ----
    CanNotCrawl = uP()
    # ---- build the dict that stores crawl results ----
    if URL_BUILD_MODE == 1:
        # load previously crawled results from Results.json
        with open("./Results.json", "r", encoding="gb18030") as f:
            result = json.load(f)
    elif URL_BUILD_MODE == 0 or URL_BUILD_MODE == 2:
        # start with an empty result dict
        result = {}
    # ------- start crawling --------
    # instantiate the spider
    spider1 = spider.Spider()
    # set up the counters and error counters
    number1 = 0
    errorCounter = 0    # total errors
    errorCounter1 = 0   # keyword-1-only errors
    errorCounter2 = 0   # keyword-2-only errors
    errorCounter3 = 0   # both keywords failed
    errorOfparsing = 0  # parse errors (AttributeError)
    # main crawl loop
    while True:
        if number1 % 100 == 0:
            print("crawled", number1, "- total errors:", errorCounter,
                  "- keyword-1-only errors:", errorCounter1,
                  "- keyword-2-only errors:", errorCounter2,
                  "- both-keyword errors:", errorCounter3,
                  "- parse errors:", errorOfparsing)
        randomDelay()
        number1 += 1
        # fetch the next url to crawl
        url1 = spider1.eatURL(toCrawl)
        if url1 == -1:
            print("no more urls to crawl")
            return result, CanNotCrawl
        # fetch the html
        html0 = spider1.fetchHtml()
        if html0 == 0:
            errorCounter += 1
            errorOfparsing += 1
            continue
        # extract the target content
        flag, data, titleAndSalary = spider1.parsingContent(KEY_WORDS1, KEY_WORDS2)
        # validate the target content
        errorCounterAdder = 0
        CanNotCrawlAdder = 0
        goodToPush = 1
        if flag == 0:
            # don't push the url, don't write it to the dataset
            errorCounterAdder = 1
            goodToPush = 0
            errorOfparsing += 1
        if flag == -1:
            errorCounterAdder = 1
            CanNotCrawlAdder = 1
            errorCounter1 += 1
        if flag == -2:
            errorCounterAdder = 1
            CanNotCrawlAdder = 1
            errorCounter2 += 1
        if flag == -3:
            errorCounterAdder = 1
            goodToPush = 0
            CanNotCrawlAdder = 1
            errorCounter3 += 1
        # update the bookkeeping
        errorCounter += errorCounterAdder
        if CanNotCrawlAdder == 1:
            CanNotCrawl.pressIn(url1)
        if goodToPush == 1:
            titleAndSalary = str(titleAndSalary)
            data.append(url1)
            result[titleAndSalary] = data
def crawl_one(url):
    s = spider.Spider(mongo_client)
    urls, page_contents = s.traverse_domain_bfs(url, max_depth=1)
    clean_text_col = get_clean_text_blocks(page_contents)
    insert_ids = put_mongo_records(mongo_client, "web_crawl",
                                   "web_text_content", clean_text_col)
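put_mongo_records is called above but not defined in these snippets. A minimal pymongo sketch of what it might look like, assuming the cleaned text blocks are plain documents (dicts):

# hypothetical implementation of the helper used above
def put_mongo_records(client, db_name, collection_name, records):
    # insert the cleaned text blocks and hand back the generated ids
    collection = client[db_name][collection_name]
    return collection.insert_many(records).inserted_ids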
import spider

addons = [spider.Spider()]
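If this follows the mitmproxy addon convention (an assumption; the snippet only shows the addons list), the script would be loaded from a shell:

# mitmproxy -s this_script.py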