Example #1
def addEntries2DB(uniqueId, kanji, xref, reb, gloss):
    handler = dbhandler.DBHandler()

    # Insert one row per (kanji, cross-reference, reading, gloss) combination,
    # i.e. the full Cartesian product of the four lists for this entry.
    for k in kanji:
        for x in xref:
            for r in reb:
                for g in gloss:
                    handler.addDictionaryRow(uniqueId, k, x, r, g)
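
A hedged usage sketch: the parameter names read like JMdict entry fields (kanji writings, cross-references, kana readings, glosses), so a call might look like the following. The entry values are illustrative, not a real JMdict record; note that one row is written per combination drawn from the four lists.

addEntries2DB(
    1206730,        # uniqueId: hypothetical entry sequence number
    ['学校'],        # kanji: kanji writings
    ['学園・1'],     # xref: cross-references to related entries
    ['がっこう'],     # reb: kana readings
    ['school'],     # gloss: English glosses
)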
Example #2
 def page_collector(self):
     try:
         self.browser.get(self.url)
     except TimeoutException:
         self.browser.execute_script('window.stop()')
     print('get %s successfully' % self.url)
     count = 1
     while count < 20:
         print(count, end='===>')
         time.sleep(2)
         js = "document.body.scrollTop=%d000" % (count * 500)
         self.browser.execute_script(js)
         count += 1
     all_player = self.browser.find_element_by_xpath(
         '//tbody[@class="Body"]').find_elements_by_xpath(
             './/tr[contains(@class,"Row")]')  # './/' keeps the search relative; '//' would scan the whole document
     print('number of player found:', len(all_player))
     for player in all_player[1:-1]:
         if player.find_elements_by_tag_name('td')[3].text in [
                 'Challenger', 'Master'
         ]:
             tmp_link = player.find_element_by_tag_name('a').get_attribute(
                 'href')
             tmp_full_link = parse.urljoin(self.url, tmp_link)
             print('append:', tmp_full_link)
             self.page_urls.add(tmp_full_link)
     try:
         self.browser.get(self.pro_url)
     except TimeoutException:
         self.browser.execute_script('window.stop()')
     print('get %s successfully' % self.pro_url)
     all_pro_link = self.browser.find_element_by_xpath(
         '//ul[@class="RegisterSummonerList"]').find_elements_by_tag_name(
             'a')
     for item in all_pro_link:
         tmp_link = item.get_attribute('href')
         tmp_full_link = parse.urljoin(self.url, tmp_link)
         print('append:', tmp_full_link)
         self.page_urls.add(tmp_full_link)
     db_handler = dbhandler.DBHandler()
     gameids = db_handler.get_idmappingmanual_gameid()
     print('length of url appended:', len(self.page_urls))
     for gameid in gameids:
         # The query-string literal was masked ('******') in the source; rebuilt from the loop variable.
         tmp_full_link = parse.urljoin(self.base_url, 'userName=' + gameid)
         print('append:', tmp_full_link)
         self.page_urls.add(tmp_full_link)
     # Copy the set: a plain assignment would alias page_urls, so removing a
     # finished URL from the failed set would silently shrink page_urls too.
     self.failed_downloaded_page_urls = set(self.page_urls)
     for url in self.failed_downloaded_page_urls:
         self.page_url_download_times[url] = 1
     while len(self.failed_downloaded_page_urls) != 0:
         pool = Pool(24)
         pool.map(self.page_generator, self.failed_downloaded_page_urls)
         pool.close()
         pool.join()
     print('number of pages downloaded:', len(self.pages))
     self.pages_json = {'data': self.pages}
     with open(self.pages_json_file, 'w') as fwrite:
         json.dump(self.pages_json, fwrite)
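
A note on the retry loop above: it only terminates if page_generator (not shown) removes URLs from failed_downloaded_page_urls, which requires the workers to share state with this object. That holds for a thread pool (multiprocessing.dummy.Pool) but not for a process Pool, whose workers would mutate copies. Below is a minimal sketch of such a worker, assuming a thread pool and plain HTTP via requests rather than a shared browser; MAX_RETRIES is an assumption, the original shows no explicit cap.

import requests  # simplification; the original presumably drives Selenium here

MAX_RETRIES = 3  # assumed retry cap

def page_generator(self, url):
    # Hypothetical worker: fetch one page, then retire its URL from the failed set.
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        self.pages.append(resp.text)
        self.failed_downloaded_page_urls.discard(url)
    except requests.RequestException:
        self.page_url_download_times[url] += 1
        if self.page_url_download_times[url] > MAX_RETRIES:
            # Give up on this URL so the outer while loop can terminate.
            self.failed_downloaded_page_urls.discard(url)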
Example #3
File: main.py Project: sm3w/ndb
    def __init__(self):
        QtWidgets.QMainWindow.__init__(self)
        self.ui = ui.Ui_MainWindow()
        self.ui.setupUi(self)

        # TODO(jamie): Locate this radius search stuff in its own setup function
        self.ui.checkBox_radius.clicked.connect(self.handle_check_radius_click)
        self.ui.radioButton_5m.setChecked(True)
        self.ui.spinBox_radius.setMaximum(100)
        self.ui.spinBox_radius.setMinimum(10)
        self.is_radius_search = False
        self.ui.checkBox_radius.setChecked(True)

        self.dbhandler = dbhandler.DBHandler("database/fake.db")
        self.setup()
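
handle_check_radius_click is wired up above but not shown. Under this setup, a plausible slot mirrors the checkbox into is_radius_search and gates the radius widgets; this is a hypothetical sketch, not the project's code:

def handle_check_radius_click(self):
    # Hypothetical: keep the flag and dependent widgets in sync with the checkbox.
    self.is_radius_search = self.ui.checkBox_radius.isChecked()
    self.ui.spinBox_radius.setEnabled(self.is_radius_search)
    self.ui.radioButton_5m.setEnabled(self.is_radius_search)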
Example #4
def main():
    if len(sys.argv) != 3:
        print("Usage: reconpp.py <database> <scanID>")
        sys.exit(1)

    if not acquire_lockfile():
        print("Failed to acquire lock file. Is reconpp already running?")
        sys.exit(1)

    wsauth.write_token()

    websockets = []
    running = True

    database = sys.argv[1]
    scanID = sys.argv[2]

    dbHandler = dbhandler.DBHandler(args=(websockets, database, scanID))
    dbHandler.setDaemon(True)

    SocketServer.ThreadingTCPServer.allow_reuse_address = 1
    server = SocketServer.ThreadingTCPServer(
        ("", 1337), WebSocketsHandler.WebSocketsHandler)
    server.running = running
    server.websockets = websockets

    try:
        dbHandler.start()
        server.serve_forever()
        dbHandler.join()
    except KeyboardInterrupt:
        server.running = False
        server.server_close()
    finally:
        release_lockfile()
        wsauth.remove_token()
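
acquire_lockfile and release_lockfile are referenced but not defined here. A minimal sketch of the single-instance pattern they imply, using an atomic O_EXCL create; the lock path is an assumption:

import errno
import os

LOCKFILE = '/tmp/reconpp.lock'  # assumed path; not shown in the original

def acquire_lockfile():
    # O_CREAT | O_EXCL makes the existence check and the create a single atomic step.
    try:
        fd = os.open(LOCKFILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except OSError as e:
        if e.errno == errno.EEXIST:
            return False  # another reconpp instance holds the lock
        raise
    os.write(fd, str(os.getpid()).encode())
    os.close(fd)
    return True

def release_lockfile():
    try:
        os.remove(LOCKFILE)
    except OSError:
        pass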
Example #5
import sys
import time

import basic_data_crawler
import dbhandler
import gameid_info_crawler
from models import Team, Player, IDMapping, GameIDInfo

if __name__ == '__main__':
    print('get argument:', sys.argv)
    time_start = time.time()
    db_handler = dbhandler.DBHandler()
    print('crawler started... good luck')
    if 'basic' in sys.argv:
        print('start to collect team and player pages')
        crawler = basic_data_crawler.TeamDataCrawler()
        crawler.page_collector()
        print('second %s: collecting pages finished' %
              (str(time.time() - time_start)))
        print('start to parse team data page')
        team_data = crawler.crawl_basic_info()
        print('second %s: crawling team data finished' %
              (str(time.time() - time_start)))
        print('team data:\n', team_data)
        print('start to save team data to db')
        db_handler.save_data(team_data, Team)
        print('crawling team img')
        crawler.crawl_team_img()
        print('start to parse player data page')
        player_data = crawler.crawl_player_info()
        print('second %s: crawling player data finished' %
              (str(time.time() - time_start)))
Example #6
 def crawl_gameid_info(self):
     browser = webdriver.PhantomJS(
         executable_path="/opt/phantomjs/bin/phantomjs")
     # browser = webdriver.Chrome()
     browser.set_page_load_timeout(30)
     print('getting:', self.gameid_info_url)
     try:
         browser.get(self.gameid_info_url)
     except TimeoutException:
         browser.execute_script("window.stop()")
     count = 1
     while count <= 20:
         js = "document.body.scrollTop=%d000" % (count * 500)
         browser.execute_script(js)
         time.sleep(1)
         count += 1
         # print('length of html:', len(browser.page_source))
     page_html = browser.page_source
     print('length of html:', len(page_html))
     soup = htmlparser.HtmlParser(page_html).get_soup()
     all_gameid = soup.find("table",
                            class_="LadderRankingTable").find_all("tr")
     for gameid in all_gameid[1:-1]:
         tmp_dict = {}
         tmp_dict['game_id'] = gameid.find(
             "td", class_="SummonerName").find("a").get_text()
         tmp_dict['link'] = 'http:' + gameid.find(
             "td", class_="SummonerName").find("a").get("href")
         tmp_dict['rank'] = gameid.find("td", class_="Rank").get_text()
         tmp_dict['tier'] = gameid.find("td", class_="TierRank").get_text()
         tmp_dict['lp'] = gameid.find(
             "td", class_="LP").get_text().split()[0].replace(',', '')
         ratio_graph = gameid.find("td", class_="RatioGraph")
         tmp_dict['total_win'] = ratio_graph.find(
             "div", {"class": "Text Left"}).get_text()
         tmp_dict['total_lose'] = ratio_graph.find(
             "div", {"class": "Text Right"}).get_text()
         tmp_dict['total_win_ratio'] = ratio_graph.find(
             "span", class_="WinRatio").get_text().replace('%', '')
         self.gameid_info.append(tmp_dict)
         self.gameids_ladder.add(tmp_dict['game_id'])
         # print(tmp_dict)
     print('number of gameids crawled:', len(self.gameids_ladder))
     # self.gameids_add = self.pro_gameids - self.gameids_ladder
     db_handler = dbhandler.DBHandler()
     gameids_db = set(db_handler.get_idmappingmanual_gameid())
     self.gameids_add = (gameids_db - self.gameids_ladder) | (
         self.pro_gameids - self.gameids_ladder)
     print('number of gameids still need to be crawled:',
           len(self.gameids_add))
     print(self.gameids_add)
     browser.close()
     browser.quit()
     while len(self.gameids_add) != 0:
         pool = Pool(8)
         pool.map(self.crawl_gameid_info_by_search, self.gameids_add)
         pool.close()
         pool.join()
     print('invalid gameids found:', len(self.invalid_gameids))
     print(self.invalid_gameids)
     print('number of gameids crawled:', len(self.gameid_info))
     print(self.gameid_info)
     return self.gameid_info
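
The fixed 20-step scroll above assumes the ladder table finishes loading within roughly 20 seconds. A more robust variant (a sketch, not the project's code) scrolls until the document height stops growing, reusing the time import already present:

def scroll_to_bottom(browser, pause=1.0, max_rounds=40):
    # Scroll until the page height stabilizes or the round cap is reached.
    last_height = browser.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)
        new_height = browser.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height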
Example #7
import spider
import dbhandler

if __name__ == '__main__':
    crawler_obj = spider.Spider()
    data_today = crawler_obj.craw()
    print('crawler finished, got %d items of job info' % len(data_today))
    dbhandler_obj = dbhandler.DBHandler()
    dbhandler_obj.savedata(data_today)
    job_info_new = dbhandler_obj.getdata()
    print('db task finished, got %d items of new job info today' % len(job_info_new))
    print(job_info_new)
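
This driver assumes a DBHandler exposing savedata (store today's rows, skipping ones already seen) and getdata (return what was newly inserted). A minimal sqlite3-backed sketch of that contract; the table layout, column names, row shape, and database path are all assumptions:

import sqlite3

class DBHandler(object):
    # Hypothetical sketch of the interface the driver above relies on.

    def __init__(self, path='jobs.db'):  # assumed path
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS job (url TEXT PRIMARY KEY, title TEXT, is_new INTEGER)')

    def savedata(self, rows):
        # Mark everything old, then insert today's crawl; only fresh URLs keep is_new = 1.
        self.conn.execute('UPDATE job SET is_new = 0')
        for url, title in rows:  # each row assumed to be a (url, title) pair
            self.conn.execute(
                'INSERT OR IGNORE INTO job (url, title, is_new) VALUES (?, ?, 1)',
                (url, title))
        self.conn.commit()

    def getdata(self):
        # Rows added by the most recent savedata call.
        return self.conn.execute('SELECT url, title FROM job WHERE is_new = 1').fetchall()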