def start_crawler(self):
    res_gzh_list, sogou_request_flag = self.__get_first_gzh_from_result_list()
    if sogou_request_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
        return sogou_request_flag
    url_profile, first_gzh_name = HtmlParser.parse_gzh_list_html(res_gzh_list)
    if first_gzh_name != self.gzh_name:
        # print("The specified official account was not found! Please confirm that it exists.")
        return CrawlerConst.program_output_code.OTHER_ERROR
    res_gzh_profile, wechat_return_flag = self.__get_gzh_articles_dict(url_profile)
    if wechat_return_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
        return wechat_return_flag
    articles = HtmlParser.parse_history_article_list_html(res_gzh_profile)
    for article in articles:
        url_content = article['content_url']
        print(url_content)
        res_gzh_article, wechat_request_flag = self.__get_gzh_article(url_content)
        if wechat_request_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
            return wechat_request_flag
        save_res = HtmlParser.parse_history_article_html(res_gzh_article, self.gzh_name)
        if save_res == CrawlerConst.program_output_code.SAVE_FAILURE:
            return save_res
    return CrawlerConst.program_output_code.SUCCESS

def __init__(self):
    # initialize the program
    self.download = Downloader()
    self.parser = HtmlParser()
    self.save = SaveData()
    self.workbook = Workbook()
    self.ch = Choice()
    print('Initialization complete...')

def updateBlockTable():
    for blockId in query.getIdBlocks():
        print(blockId)
        htmlParser = HtmlParser(str(blockId))
        blockInfo = htmlParser.getBlock()
        query.insertBlock(
            Block(id=blockId,
                  hash='',
                  timestamp=blockInfo['timestamp'],
                  minedIn=blockInfo['minedIn']))

class SpiderMain(object):
    def __init__(self):
        # initialize the program
        self.download = Downloader()
        self.parser = HtmlParser()
        self.mysql = Mysqldb()

    def run(self, url, database):
        response = self.download.download(url)
        self.parser.parser(response, database)

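# A minimal driver sketch for the SpiderMain above; the URL and database
# name are hypothetical placeholders, not values from the source.
if __name__ == '__main__':
    spider = SpiderMain()
    spider.run('https://example.com/list.html', 'articles')
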
def getTxsData(self):
    hash = self._getNotConfirmedTx()
    print(hash)
    htmlParser = HtmlParser(hash)
    if hasattr(htmlParser, 'tableText'):
        return {
            'hash': hash,
            'blockId': htmlParser.getBlockNumber(),
            'gasPrice': htmlParser.getGasPrice(),
            'gasLimit': htmlParser.getGasLimit()
        }
    # fall back to placeholder data when the parser exposes no tableText
    return self._getFakeDataTx(hash)

class SpiderMain(object):
    def __init__(self):
        # initialize the program
        self.download = Downloader()
        self.parser = HtmlParser()
        self.save = SaveData()
        self.workbook = Workbook()
        self.ch = Choice()
        print('Initialization complete...')

    def run(self):
        while True:
            try:
                p = int(input('How many pages do you want to crawl?' + '\n'))
                break
            except ValueError:
                print('Invalid input! Please enter a number')
        page = p + 1
        print("================================")
        print(' A. Original Releases    B. Quality Software ')
        print(' C. Unpacking/Cracking   D. Mobile Security ')
        print(' E. Virus Analysis       F. Programming Languages ')
        print(' G. Software Debugging   H. Animation Releases ')
        print(' I. Reversing Resources  J. Security Tools ')
        print(' K. Jobs & Recruitment ')
        print("================================")
        while True:
            choice = input("Choose a section to crawl, enter Q to quit (letters must be uppercase): ")
            half_url, name = self.ch.make_the_arrg(choice)
            if name != 'Error':
                break
        print(half_url + '\n' + name)
        self.save.createfile(name)
        for i in range(1, page):
            url = half_url + str(i) + '.html'
            response = self.download.download(url)
            self.parser.parser(response, name)
            sleep = random.randint(2, 10)
            print('Page ' + str(i) + ' crawled, sleeping for ' + str(sleep) + ' seconds')
            time.sleep(sleep)  # pause between pages
            if i != page - 1:
                print('-----------------------------')
                print('          Next page          ')
                print('-----------------------------')
        print('Data written, removing duplicates...')
        self.save.delete_same_data()
        try:
            # filename kept as in the original; it means
            # "import the csv data into this sheet.xlsx"
            self.workbook.save('将csv的数据导入此表.xlsx')
        except Exception:
            print('Failed to create the xlsx file, please create it manually')
        print('Program finished')

import os
import pickle

from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer

# HtmlParser is a project-local helper; its import path depends on the repo layout


def fit(input_dir_non_reg='train_pages/non_reg/',
        input_dir_reg='train_pages/reg',
        output_dir='ml/cb'):
    train_htmls = []
    train_classes = []
    # class 0: pages from the non_reg set
    for file in os.scandir(input_dir_non_reg):
        filename = os.fsdecode(file)
        with open(filename) as file1:
            train_htmls.append(file1.read())
        train_classes.append(0)
    # class 1: pages from the reg set
    for file in os.scandir(input_dir_reg):
        filename = os.fsdecode(file)
        with open(filename) as file1:
            train_htmls.append(file1.read())
        train_classes.append(1)
    # strip markup so the vectorizer sees plain text
    train_texts = []
    for html in train_htmls:
        train_texts.append(HtmlParser.htmlToText(html))
    vectorizer = CountVectorizer()
    train_set = vectorizer.fit_transform(train_texts)
    clfCB = CatBoostClassifier(iterations=100, learning_rate=3, depth=7)
    clfCB.fit(X=train_set.toarray(), y=train_classes)
    # persist the vectorizer and the trained classifier
    saved_vect = open(output_dir + '/saved_vect_cb', 'wb')
    pickle.dump(vectorizer, saved_vect)
    saved_vect.close()
    saved_model = open(output_dir + '/saved_clf_cb', 'wb')
    pickle.dump(clfCB, saved_model)
    saved_model.close()

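# A usage sketch for fit() above (an assumption, not from the source): it
# expects the default directories 'train_pages/non_reg/', 'train_pages/reg'
# and 'ml/cb' to exist, the first two populated with HTML training pages.
if __name__ == '__main__':
    fit()  # writes 'saved_vect_cb' and 'saved_clf_cb' into ml/cb
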
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 100):
            # try:
            # fetch a new url
            new_url = self.manager.get_new_url()
            # download the page
            html = self.downloader.download(new_url)
            # extract links and data from the page
            new_urls, data = self.parser.parser(new_url, html)
            # feed the new links to the URL manager
            self.manager.add_new_urls(new_urls)
            # store the data to file
            self.output.store_data(data)
            print("Crawled %s links so far" % self.manager.old_url_size())
            # except Exception as e:
            #     print("crawl failed", e)
        self.output.out_put_html()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        Main program logic.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("Crawled {} links".format(self.manager.old_url_size()))
            except Exception as e:
                print("Crawl failed", e)

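# A hypothetical entry point for this SpiderMan variant; the root URL is a
# placeholder. crawl() stops after 20 links per its while condition.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://example.com/')
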
def __init__(self):
    # number of running threads
    self.pcount = 1
    # queue for crawl results
    self.dqueue = queue.Queue()
    # queue for error messages
    self.equeue = queue.Queue()
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
    # self.proxies = getProxy()
    self.proxies = getFromPool2()
    self.inactivepro = []
    self.count = 0
    self.sumSuccess = 0
    self.sumFail = 0
    self.updating = False

def getPendingTxsHashes(self):
    pendingTxs = []
    htmlParser = HtmlParser()
    for pendingTx in htmlParser.getPendingTxs():
        hash = self._getHashFromHtml(pendingTx)
        htmlParser = HtmlParser(hash)
        print(hash)
        if hasattr(htmlParser, 'tableText'):
            pendingTxs.append([hash, htmlParser.getTimestamp()])
    return pendingTxs

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        Main program.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_urls_size() < 5:
            new_url = self.manager.get_new_url()
            html = self.downloader.downloader(new_url)
            next_url, data = self.parser.parser(new_url, html)
            self.manager.add_new_url(next_url)
            self.output.outputTxt(data)

import os
import pickle

# HtmlParser is a project-local helper; its import path depends on the repo layout


def predict(input_dir, clf_dir="ml/cb"):
    # load the pickled classifier and vectorizer produced by fit()
    with open(clf_dir + "/saved_clf_cb", 'rb') as saved_model:
        clfCB = pickle.load(saved_model)
    with open(clf_dir + "/saved_vect_cb", 'rb') as saved_vect:
        vectorizer = pickle.load(saved_vect)
    htmls = []
    filenames = []
    for file in os.scandir(input_dir):
        filename = os.fsdecode(file)
        filenames.append(filename)
        with open(file) as file1:
            htmls.append(file1.read())
    # strip markup so the vectorizer sees plain text
    texts = []
    for html in htmls:
        texts.append(HtmlParser.htmlToText(html))
    test_set = vectorizer.transform(texts)
    predict = clfCB.predict(test_set.toarray())
    for file, p in zip(filenames, predict):
        print(file)
        if p:
            print('Yes')
        else:
            print("No")
    return

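# A usage sketch pairing predict() with the artifacts written by fit();
# 'test_pages/' is a hypothetical input directory, 'ml/cb' is the default
# classifier directory from the signature above.
if __name__ == '__main__':
    predict('test_pages/', clf_dir='ml/cb')
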
def __init__(self):
    # initialize the program
    self.download = Downloader()
    self.parser = HtmlParser()
    self.mysql = Mysqldb()

class SpiderMan(object):
    def __init__(self):
        # number of running threads
        self.pcount = 1
        # queue for crawl results
        self.dqueue = queue.Queue()
        # queue for error messages
        self.equeue = queue.Queue()
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.inactivepro = []
        self.count = 0
        self.sumSuccess = 0
        self.sumFail = 0
        self.updating = False
        # self.proxies = ['http://127.0.0.1:2740']

    def doCrawl(self, new_url):
        try:
            self.pcount += 1
            count = 1
            # pick a proxy IP at random
            pro = random.choice(self.proxies)
            # pro = 'http://127.0.0.1:2740'
            while True:
                # download the page with the HTML downloader
                html = self.downloader.download(new_url, pro)
                # extract the page data with the HTML parser
                data = self.parser.parser(new_url, html)
                # storing data here caused multi-threaded write conflicts, so it was dropped
                # self.output.store_data(data)
                # handle the robot check
                if data == "robot":
                    if count < 6:
                        count = count + 1
                        # eviction scheme:
                        #   in self.proxies only            -> performing well
                        #   in self.proxies and inactivepro -> blocked once, under watch
                        #   in self.inactivepro only        -> blocked twice, temporarily out
                        #   in neither                      -> failed revival, permanently out
                        if count == 5 and len(self.proxies) > 100:
                            if pro not in self.inactivepro:
                                # put it on the watch list
                                self.inactivepro.append(pro)
                                pro = random.choice(self.proxies)
                            else:
                                # temporarily retire it
                                print(str(pro) + " out\n")
                                if pro in self.proxies:
                                    self.proxies.remove(pro)
                        continue
                    else:
                        raise Exception("robot check")
                else:
                    break
            # queue the result for output
            self.dqueue.put(data)
        except Exception as e:
            self.sumFail = self.sumFail + 1
            print("Fail: link %d fail %d times : %s\n" %
                  (self.count, self.sumFail, new_url), e.args)
            # start the revival scheme
            if len(self.proxies) < 200 or len(self.inactivepro) > 500:
                pro = random.choice(self.inactivepro)
                if pro is not None and pro not in self.proxies and self.testIP(pro):
                    self.proxies.append(pro)
                    print(str(pro) + " in!!!\n")
                # remove pro no matter the outcome; the membership check
                # guards against races between concurrent threads
                if pro in self.inactivepro:
                    self.inactivepro.remove(pro)
            self.equeue.put([new_url, e.args])
        else:
            self.sumSuccess = self.sumSuccess + 1
            print("Success: link %d success %d times : %s" %
                  (self.count, self.sumSuccess, new_url))
        finally:
            self.pcount -= 1

    def setProxy(self):
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.updating = False

    # output results and error messages
    def outPutData(self):
        while not self.dqueue.empty():
            data = self.dqueue.get()
            self.output.store_data(data)
        while not self.equeue.empty():
            err = self.equeue.get()
            self.output.store_err(err)

    def testIP(self, pro):
        url = 'https://www.douban.com'
        # requests expects the proxies dict to be keyed by scheme
        res = requests.get(url, proxies={'http': pro, 'https': pro}, timeout=20)
        if res.status_code == 200:
            return True
        else:
            return False

    def crawl(self):
        threads = []
        preFail = 0
        # skip URLs that were already processed in a previous run
        for i in range(22350):
            self.manager.has_new_url()
        while self.manager.has_new_url():
            try:
                self.count = self.count + 1
                # kick off a proxy refresh
                if self.sumFail - preFail > 46 and not self.updating:
                    self.updating = True
                    print("\n\nstart refreshing proxies\n\n")
                    t = threading.Thread(target=SpiderMan.setProxy, args=[self])
                    t.start()
                    threads.append(t)
                    # p = Pool()
                    # result = p.apply_async(getFromPool2, args=())
                    # p.close()
                    # self.proxies = result.get()
                # every 50 links, flush the buffers and report the success rate
                if self.count % 50 == 0 and self.count != 0:
                    preFail = self.sumFail
                    rate = float(self.sumSuccess) / float(self.count - 1)
                    print("Success Rate: %f" % rate)
                    self.output.store_err([str(self.count), str(rate)])
                    self.output.flush()
                # fetch a new url from the URL manager
                new_url = self.manager.get_new_url()
                # main crawl step (multi-threaded)
                if self.pcount < 0:
                    pcount = 0
                else:
                    pcount = self.pcount
                # random delay, paced by the number of running threads
                time.sleep(random.random() + pcount / 10)
                t = threading.Thread(target=SpiderMan.doCrawl, args=[self, new_url])
                t.start()
                threads.append(t)
                # output results and error messages
                self.outPutData()
            except Exception as e:
                print("weird fail")
        [t.join() for t in threads]
        self.outPutData()

def main():
    # create argument parser
    parser = argparse.ArgumentParser(
        description='Convert ConfigCrusher program measurement results.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # add arguments to parser and parse
    prepareParser(parser)
    args = parser.parse_args()

    # set up logger
    global LOGGER
    LOGGER = logging.getLogger('crusherToJSONLogger')
    LOGGER.setLevel(logging.DEBUG)

    # check if debug should be enabled
    logLevel = logging.INFO
    if args.verbose:
        logLevel = logging.DEBUG

    # channel to stream log events to console
    ch = logging.StreamHandler()
    ch.setLevel(logLevel)
    formatter = logging.Formatter('[%(levelname)s] (%(asctime)s): %(message)s')
    ch.setFormatter(formatter)
    LOGGER.addHandler(ch)

    # log to file if enabled
    logPath = args.logfile
    if len(logPath) > 0:
        if not logPath.endswith(".log"):
            logPath += ".log"
        fileHandler = logging.FileHandler(logPath)
        fileHandler.setFormatter(formatter)
        LOGGER.addHandler(fileHandler)

    LOGGER.info('Logger ready.')

    # validate output folder
    outFolder = args.outpath
    if not (outFolder.endswith("/") or outFolder.endswith("\\")):
        outFolder += "/"
    if not os.path.exists(outFolder):
        LOGGER.warning('The output folder does not exist! Creating it...')
        try:
            os.makedirs(outFolder)
        except Exception as ex:
            LOGGER.exception('Failed to create output folder!')
            return
        outFolder = os.path.normcase(outFolder)
        LOGGER.info('Output folder created: {}'.format(outFolder))
    else:
        # check that path leads to a folder
        if not os.path.isdir(outFolder):
            LOGGER.error('The output folder path does not lead to a folder!')
            return

    # validate color schema file
    schemaPath = args.colorschema
    if not os.path.isfile(schemaPath):
        LOGGER.error('The given schema path is not a valid file: {}'.format(schemaPath))
        return

    # check if recursive export is desired
    recursive = True if args.recursive else False

    # check if user wants to overwrite existing files
    overwrite = True if args.overwrite else False

    # export the highlighted HTML code as well if desired
    exportHTML = True if args.exporthtml else False
    if exportHTML:
        LOGGER.info('Additional HTML export enabled.')

    # try to read JSON color schema
    jsonSchema = None
    with open(schemaPath, "r") as file:
        try:
            jsonSchema = json.loads(file.read())
        except Exception as ex:
            LOGGER.error(ex)
    if jsonSchema is None:
        return

    # check if path exists
    filePath = args.path
    if not os.path.exists(filePath):
        LOGGER.error('Failed to convert! Given path does not exist: {}'.format(filePath))
        return None

    # check if path leads to file or folder
    if os.path.isfile(filePath):
        # parses html code to unity rt format
        parser = HtmlParser(colorSchema=jsonSchema)
        # convert a file and export the result
        LOGGER.info('Converting the file...')
        resultPath = convertFile(htmlParser=parser,
                                 filePath=filePath,
                                 outputFolder=outFolder,
                                 exportHTML=exportHTML,
                                 overwrite=overwrite)
    elif os.path.isdir(filePath):
        # convert all files of the folder
        LOGGER.info('Converting the files{}...'.format(
            ' recursively' if recursive else ''))
        resultPath = convertFiles(folderPath=filePath,
                                  outputFolder=outFolder,
                                  jsonSchema=jsonSchema,
                                  exportHTML=exportHTML,
                                  overwrite=overwrite,
                                  recursive=recursive)

    # print result path
    if resultPath is not None:
        LOGGER.info('Result path: ' + os.path.abspath(resultPath))

def convertFiles(folderPath, outputFolder, jsonSchema, exportHTML=False,
                 overwrite=False, recursive=False):
    '''
    Converts all files' source code to a syntax-highlighted rich text format.
    This method does not check if the given path is valid!
    Returns None on errors, the path to the exported files otherwise.
    '''
    firstOutPath = None
    pathLength = len(folderPath)
    if folderPath.endswith('/') or folderPath.endswith('\\'):
        folderPath = folderPath[:-1]
    srcDirName = os.path.normcase(os.path.basename(folderPath))
    outputFolder = os.path.normcase(os.path.normpath(outputFolder))
    for curDir, subDirs, files in os.walk(folderPath, topdown=True):
        curDir_relative = os.path.normpath(
            os.path.join(srcDirName, curDir[pathLength:]))
        LOGGER.info('Entering directory: {}'.format(curDir_relative))

        # create export path
        # LOGGER.debug('Joining paths "{}" and "{}"'.format(outputFolder, curDir_relative))
        curOutFolder = os.path.normcase(
            os.path.join(outputFolder, curDir_relative))
        LOGGER.debug('Current output folder: {}'.format(curOutFolder))
        if os.path.exists(curOutFolder):
            if os.path.isfile(curOutFolder):
                LOGGER.error(
                    'Failed to export to: {} (is a file instead of a folder)'.format(
                        os.path.abspath(curOutFolder)))
                return None
        else:
            # create the output folder
            LOGGER.info('Creating folder: {}'.format(curOutFolder))
            try:
                os.mkdir(curOutFolder)
            except Exception as ex:
                LOGGER.exception(
                    'Failed to create an output folder: {}'.format(curOutFolder))
                return None
        if firstOutPath is None:
            firstOutPath = curOutFolder

        # convert and export all the files of this folder
        for file in files:
            # parses html code to unity rt format
            parser = HtmlParser(colorSchema=jsonSchema)
            LOGGER.info('Converting file: {}'.format(file))
            path = convertFile(htmlParser=parser,
                               filePath=os.path.join(curDir, file),
                               outputFolder=curOutFolder,
                               exportHTML=exportHTML,
                               overwrite=overwrite)
            if path is not None:
                LOGGER.info('File exported: {}'.format(path))

        # do not take sub-folders into account if recursion is disabled
        if not recursive:
            break
    return firstOutPath

def main():
    motd = open('./motd', 'r')
    banner = motd.read()
    motd.close()
    style = colored.fg(random.choice(['red', 'blue', 'green']))
    print(colored.stylize(banner, style))
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hfdao:e:u:1:2:i:m:c:", [
            "help", "fit", "download", "analyze", "output-dir=", "email=",
            "urls=", "input-dir1=", "input-dir2=", "input-dir=", "method=",
            "clf_dir="
        ])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    output_dir = None
    email = None
    nr_dir1 = None
    nr_dir2 = None
    input_dir = None
    input_dir1 = None
    method = None
    clf_dir = None
    if not opts:
        print('Options required, try -h or --help for more information')
        return
    o, a = opts[0]
    if o in ("-h", "--help"):
        print("Useful information")
    elif o in ("-f", "--fit"):
        for o, a in opts[1:]:
            if o in ("-1", "--input-dir1"):
                input_dir = a
            elif o in ("-2", "--input-dir2"):
                input_dir1 = a
            elif o in ("-o", "--output-dir"):
                output_dir = a
            elif o in ("-m", "--method"):
                method = a
            else:
                assert False, "unhandled option for fit"
        if method == "cb":
            CatBoostHtmlClassifier.fit(input_dir, input_dir1, output_dir)
        elif method == "nb":
            KNeighboursHtmlClassifier.fit(input_dir, input_dir1, output_dir)
        else:
            assert False, "method not specified"
    elif o in ("-d", "--download"):
        for o, a in opts[1:]:
            if o in ("-o", "--output-dir"):
                output_dir = a
            elif o in ("-e", "--email"):
                email = a
            elif o in ("-u", "--urls"):
                urls = a
            else:
                assert False, "unhandled option for download"
        WebSelenium.saveHtmls(email, urls, output_dir)
    elif o in ("-a", "--analyze"):
        for o, a in opts[1:]:
            if o in ("-1", "--input-dir1"):
                nr_dir1 = a
            elif o in ("-2", "--input-dir2"):
                nr_dir2 = a
            elif o in ("-i", "--input-dir"):
                input_dir = a
            elif o in ("-c", "--clf_dir"):
                clf_dir = a
            elif o in ("-m", "--method"):
                method = a
            else:
                assert False, "unhandled option for analyze"
        if method == "1":
            HtmlParser.htmlCmp(nr_dir1, nr_dir2, input_dir)
        elif method == "cb":
            CatBoostHtmlClassifier.predict(input_dir, clf_dir)
        elif method == "nb":
            KNeighboursHtmlClassifier.predict(input_dir, clf_dir)
        else:
            assert False, "method not specified"
    else:
        assert False, "unhandled option"

def __init__(self):
    self.manager = UrlManager()
    self.parser = HtmlParser()
    self.downloader = HtmlDownloader()
    self.output = DataOutput()

import pandas as pd
from requestUtil import *
from htmlParser import HtmlParser

id = 1

# Column names
COLUMN_NAMES = ["HTML_ID", "TAG_NAME", "ATTRIBUTE_ID", "ATTRIBUTE_NAME",
                "ATTRIBUTE_CLASS", "ATTRIBUTE_PLACEHOLDER", "IN_FORM",
                "TAG_DEPTH", "TAG_STRING", "LABEL"]

# Initializing dataframe
df = pd.DataFrame(columns=COLUMN_NAMES)

# Read urls from csv file
loginurls = pd.read_csv("loginurls.csv")

# Creating parser object
htmlParser = HtmlParser()

# Iterating over all login urls
for loginurl in loginurls["LOGIN_URL"]:
    try:
        print("Requesting : " + loginurl)
        src = getHtmlString(loginurl)
        df = htmlParser.parseHtml(src, id, df, loginurl)
        print("finished parsing html num " + str(id))
    except Exception as e:
        print("Could not load: " + loginurl)
    id = id + 1