def addEntries2DB(uniqueId, kanji, xref, reb, gloss):
    handler = dbhandler.DBHandler()
    for k in kanji:
        for x in xref:
            for r in reb:
                for g in gloss:
                    handler.addDictionaryRow(uniqueId, k, x, r, g)
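The function writes one dictionary row per combination of the four lists, i.e. a Cartesian product. A minimal usage sketch; the entry id and JMdict-style values below are illustrative, not taken from the project:

# Hypothetical call: for this entry it inserts 1 * 1 * 2 * 2 = 4 rows,
# one per (kanji, xref, reading, gloss) combination.
addEntries2DB(
    uniqueId=1234567,
    kanji=['辞書'],
    xref=['辞典'],
    reb=['じしょ', 'ジショ'],
    gloss=['dictionary', 'lexicon'],
)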
def page_collector(self):
    # Collect ladder pages for Challenger / Master players.
    try:
        self.browser.get(self.url)
    except TimeoutException:
        self.browser.execute_script('window.stop()')
    print('get %s successfully' % self.url)
    count = 1
    while count < 20:
        print(count, end='===>')
        time.sleep(2)
        js = "document.body.scrollTop=%d000" % (count * 500)
        self.browser.execute_script(js)
        count += 1
    all_player = self.browser.find_element_by_xpath(
        '//tbody[@class="Body"]').find_elements_by_xpath(
            '//tr[contains(@class,"Row")]')
    print('number of player found:', len(all_player))
    for player in all_player[1:-1]:
        if player.find_elements_by_tag_name('td')[3].text in [
                'Challenger', 'Master'
        ]:
            tmp_link = player.find_element_by_tag_name('a').get_attribute(
                'href')
            tmp_full_link = parse.urljoin(self.url, tmp_link)
            print('append:', tmp_full_link)
            self.page_urls.add(tmp_full_link)
    # Collect pro player pages.
    try:
        self.browser.get(self.pro_url)
    except TimeoutException:
        self.browser.execute_script('window.stop()')
    print('get %s successfully' % self.pro_url)
    all_pro_link = self.browser.find_element_by_xpath(
        '//ul[@class="RegisterSummonerList"]').find_elements_by_tag_name('a')
    for item in all_pro_link:
        tmp_link = item.get_attribute('href')
        tmp_full_link = parse.urljoin(self.url, tmp_link)
        print('append:', tmp_full_link)
        self.page_urls.add(tmp_full_link)
    # Collect pages for game ids already stored in the database.
    db_handler = dbhandler.DBHandler()
    gameids = db_handler.get_idmappingmanual_gameid()
    print('length of url appended:', len(self.page_urls))
    for gameid in gameids:
        # The query string was masked in the original source;
        # 'userName=' + gameid is assumed from the surrounding loop.
        tmp_full_link = parse.urljoin(self.base_url, 'userName=' + gameid)
        print('append:', tmp_full_link)
        self.page_urls.add(tmp_full_link)
    # Download every collected page, retrying until none fail.
    self.failed_downloaded_page_urls = self.page_urls
    for url in self.failed_downloaded_page_urls:
        self.page_url_download_times[url] = 1
    while len(self.failed_downloaded_page_urls) != 0:
        pool = Pool(24)
        pool.map(self.page_generator, self.failed_downloaded_page_urls)
        pool.close()
        pool.join()
    print('number of pages downloaded:', len(self.pages))
    self.pages_json = {'data': self.pages}
    with open(self.pages_json_file, 'w') as fwrite:
        json.dump(self.pages_json, fwrite)
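The retry loop above only terminates if page_generator removes each URL from failed_downloaded_page_urls once it has been downloaded. A minimal sketch of that assumed worker, using requests and assuming Pool is a thread pool (multiprocessing.dummy) so the shared set and page list are updated in place; the page dict layout is an assumption:

import requests

def page_generator(self, url):
    # Assumed contract: fetch one URL, keep the page, and drop the URL
    # from the failed set only on success so the outer while-loop converges.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        self.pages.append({'url': url, 'html': response.text})
        self.failed_downloaded_page_urls.discard(url)
    except Exception:
        # Leave the URL in the failed set so it is retried, and count the attempt.
        self.page_url_download_times[url] = (
            self.page_url_download_times.get(url, 0) + 1)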
def __init__(self):
    QtWidgets.QMainWindow.__init__(self)
    self.ui = ui.Ui_MainWindow()
    self.ui.setupUi(self)
    # TODO(jamie): Locate this radius search stuff in its own setup function
    self.ui.checkBox_radius.clicked.connect(self.handle_check_radius_click)
    self.ui.radioButton_5m.setChecked(True)
    self.ui.spinBox_radius.setMaximum(100)
    self.ui.spinBox_radius.setMinimum(10)
    self.is_radius_search = False
    self.ui.checkBox_radius.setChecked(True)
    self.dbhandler = dbhandler.DBHandler("database/fake.db")
    self.setup()
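handle_check_radius_click is connected above but not shown. A plausible sketch of that slot, assuming it only mirrors the checkbox state into is_radius_search and gates the radius spin box; the enable/disable behaviour is an assumption, not the project's confirmed logic:

def handle_check_radius_click(self):
    # Assumed behaviour: the checkbox decides whether results are filtered
    # by radius, and the spin box is only editable while it is checked.
    self.is_radius_search = self.ui.checkBox_radius.isChecked()
    self.ui.spinBox_radius.setEnabled(self.is_radius_search)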
def main():
    if len(sys.argv) != 3:
        print("Usage: reconpp.py <database> <scanID>")
        sys.exit(1)
    if not acquire_lockfile():
        print("Failed to acquire lock file. Is reconpp already running?")
        sys.exit(1)
    wsauth.write_token()
    websockets = []
    running = True
    database = sys.argv[1]
    scanID = sys.argv[2]
    dbHandler = dbhandler.DBHandler(args=(websockets, database, scanID))
    dbHandler.setDaemon(True)
    SocketServer.ThreadingTCPServer.allow_reuse_address = 1
    server = SocketServer.ThreadingTCPServer(
        ("", 1337), WebSocketsHandler.WebSocketsHandler)
    server.running = running
    server.websockets = websockets
    try:
        dbHandler.start()
        server.serve_forever()
        dbHandler.join()
    except KeyboardInterrupt:
        server.running = False
        server.server_close()
    finally:
        release_lockfile()
        wsauth.remove_token()
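acquire_lockfile and release_lockfile are referenced but not shown. One common shape for them, sketched here as an atomically created PID file; the /tmp path and PID contents are assumptions:

import os

LOCKFILE = "/tmp/reconpp.lock"  # assumed location

def acquire_lockfile():
    # Atomically create the lock file; fail if another instance already holds it.
    try:
        fd = os.open(LOCKFILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except OSError:
        return False
    os.write(fd, str(os.getpid()).encode())
    os.close(fd)
    return True

def release_lockfile():
    # Best-effort removal of the lock file.
    try:
        os.remove(LOCKFILE)
    except OSError:
        pass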
import sys
import time

import basic_data_crawler
import dbhandler
import gameid_info_crawler
from models import Team, Player, IDMapping, GameIDInfo

if __name__ == '__main__':
    print('get argument:', sys.argv)
    time_start = time.time()
    db_handler = dbhandler.DBHandler()
    print('crawler started... good luck')
    if 'basic' in sys.argv:
        print('start to collect team and player pages')
        crawler = basic_data_crawler.TeamDataCrawler()
        crawler.page_collector()
        print('second %s: collecting pages finished' %
              (str(time.time() - time_start)))
        print('start to parse team data page')
        team_data = crawler.crawl_basic_info()
        print('second %s: crawling team data finished' %
              (str(time.time() - time_start)))
        print('team data:\n', team_data)
        print('start to save team data to db')
        db_handler.save_data(team_data, Team)
        print('crawling team img')
        crawler.crawl_team_img()
        print('start to parse player data page')
        player_data = crawler.crawl_player_info()
        print('second %s: crawling player data finished' %
              (str(time.time() - time_start)))
def crawl_gameid_info(self):
    browser = webdriver.PhantomJS(
        executable_path="/opt/phantomjs/bin/phantomjs")
    # browser = webdriver.Chrome()
    browser.set_page_load_timeout(30)
    print('getting:', self.gameid_info_url)
    try:
        browser.get(self.gameid_info_url)
    except TimeoutException:
        browser.execute_script("window.stop()")
    # Scroll down repeatedly so the ladder table is fully rendered.
    count = 1
    while count <= 20:
        js = "document.body.scrollTop=%d000" % (count * 500)
        browser.execute_script(js)
        time.sleep(1)
        count += 1
    # print('length of html:', len(browser.page_source))
    page_html = browser.page_source
    print('length of html:', len(page_html))
    soup = htmlparser.HtmlParser(page_html).get_soup()
    all_gameid = soup.find(
        "table", class_="LadderRankingTable").find_all("tr")
    for gameid in all_gameid[1:-1]:
        tmp_dict = {}
        tmp_dict['game_id'] = gameid.find(
            "td", class_="SummonerName").find("a").get_text()
        tmp_dict['link'] = 'http:' + gameid.find(
            "td", class_="SummonerName").find("a").get("href")
        tmp_dict['rank'] = gameid.find("td", class_="Rank").get_text()
        tmp_dict['tier'] = gameid.find("td", class_="TierRank").get_text()
        tmp_dict['lp'] = gameid.find(
            "td", class_="LP").get_text().split()[0].replace(',', '')
        tmp_dict['total_win'] = gameid.find("td", class_="RatioGraph").find(
            "div", {"class": "Text Left"}).get_text()
        tmp_dict['total_lose'] = gameid.find("td", class_="RatioGraph").find(
            "div", {"class": "Text Right"}).get_text()
        tmp_dict['total_win_ratio'] = gameid.find(
            "td", class_="RatioGraph").find(
                "span", class_="WinRatio").get_text().replace('%', '')
        self.gameid_info.append(tmp_dict)
        self.gameids_ladder.add(tmp_dict['game_id'])
        # print(tmp_dict)
    print('number of gameids crawled:', len(self.gameids_ladder))
    # self.gameids_add = self.pro_gameids - self.gameids_ladder
    db_handler = dbhandler.DBHandler()
    gameids_db = set(db_handler.get_idmappingmanual_gameid())
    self.gameids_add = (gameids_db - self.gameids_ladder) | (
        self.pro_gameids - self.gameids_ladder)
    print('number of gameids still need to be crawled:',
          len(self.gameids_add))
    print(self.gameids_add)
    browser.close()
    browser.quit()
    # Crawl the remaining game ids via search until none are left.
    while len(self.gameids_add) != 0:
        pool = Pool(8)
        pool.map(self.crawl_gameid_info_by_search, self.gameids_add)
        pool.close()
        pool.join()
    print('invalid gameids found:', len(self.invalid_gameids))
    print(self.invalid_gameids)
    print('number of gameids crawled:', len(self.gameid_info))
    print(self.gameid_info)
    return self.gameid_info
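The closing while-loop assumes crawl_gameid_info_by_search shrinks gameids_add as it succeeds and records names it cannot find. A sketch of that assumed contract only; fetch_search_page and parse_summoner_page are placeholder helpers, not the project's real names:

def crawl_gameid_info_by_search(self, game_id):
    # Assumed contract: fetch the per-summoner page, parse the same fields
    # as the ladder table, and update the bookkeeping sets accordingly.
    try:
        page_html = self.fetch_search_page(game_id)   # placeholder helper
        info = self.parse_summoner_page(page_html)    # placeholder helper
    except Exception:
        return  # leave game_id in gameids_add so the outer loop retries it
    if info is None:
        # The name could not be found on the ladder site.
        self.invalid_gameids.add(game_id)
        self.gameids_add.discard(game_id)
        return
    info['game_id'] = game_id
    self.gameid_info.append(info)
    self.gameids_add.discard(game_id)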
import spider
import dbhandler

if __name__ == '__main__':
    crawler_obj = spider.Spider()
    data_today = crawler_obj.craw()
    print('crawler finished, got %d items of job info' % len(data_today))
    dbhandler_obj = dbhandler.DBHandler()
    dbhandler_obj.savedata(data_today)
    job_info_new = dbhandler_obj.getdata()
    print('db task finished, got %d items of new job info today' %
          len(job_info_new))
    print(job_info_new)
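savedata and getdata are the only DBHandler methods this script relies on. A minimal sqlite3-backed sketch of that assumed interface; the database path, table schema, and the date-based "new today" filter are all assumptions for illustration:

import sqlite3
import time

class DBHandler(object):
    def __init__(self, db_path='jobs.db'):
        self.conn = sqlite3.connect(db_path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS job_info ('
            'title TEXT, company TEXT, url TEXT UNIQUE, crawl_date TEXT)')

    def savedata(self, items):
        # Insert today's crawl results, skipping URLs already stored.
        today = time.strftime('%Y-%m-%d')
        for item in items:
            self.conn.execute(
                'INSERT OR IGNORE INTO job_info VALUES (?, ?, ?, ?)',
                (item.get('title'), item.get('company'),
                 item.get('url'), today))
        self.conn.commit()

    def getdata(self):
        # Return only the rows first seen today.
        today = time.strftime('%Y-%m-%d')
        cur = self.conn.execute(
            'SELECT title, company, url FROM job_info WHERE crawl_date = ?',
            (today,))
        return cur.fetchall()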