Exemplo n.º 1
0
def _split_magnet_links(href):
    """Return every magnet URI found in *href*, in order of appearance.

    Some pages glue several magnet URIs together inside a single href, so
    the string is cut at each occurrence of the ``magnet:?xt=urn`` marker.
    An href that does not contain the marker yields an empty list.
    """
    marker = 'magnet:?xt=urn'
    links = []
    start = href.find(marker)
    while start != -1:
        end = href.find(marker, start + 1)
        links.append(href[start:] if end == -1 else href[start:end])
        start = end
    return links


def _print_entity_safely(entity):
    """Print the title/link of *entity*, ignoring any failure to print."""
    try:
        print('TITLE : %s\nLINK : %s' % (entity['title'], entity['link']))
    except Exception:
        # Best-effort console output (presumably guards against console
        # encoding errors); a failed print must not drop the entity.
        pass


def _sanitize_filename(name):
    """Replace each of the characters \\ / : * ? " < > | with a space."""
    for ch in '\\/:*?"<>|':
        name = name.replace(ch, ' ')
    return name


def GetList(driver, site, cate):
    """Scrape a board's listing pages and collect magnet links.

    The function works in two phases: it first walks the listing pages and
    records the title and detail URL of every row whose link contains
    *cate*, then opens each detail page to extract magnet links and,
    optionally, download attached subtitle files.

    Args:
        driver: Selenium WebDriver-like object used to load pages.
        site: dict of per-site settings. Keys read here:
            ``MAX_PAGE`` (default 1), ``SITE_TYPE``, ``TORRENT_SITE_URL``,
            ``BO_TABLE_URL``, ``QUERY``, ``XPATH_LIST_TAG``, ``STEP``
            (default 1), ``START_INDEX`` (default 1), ``HOW``,
            ``TORRENT_SITE_TITLE``, ``MAGNET_REGAX``, ``MAGNET_MAKE_URL``,
            ``DOWNLOAD_FILE``, ``DOWNLOAD_REGEX``, ``DOWNLOAD_PATH``,
            ``SLEEP``.
        cate: board identifier substituted into the listing URL; a row is
            kept only when its href contains this string.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the listing rows are
        not found in time; errors on individual rows and detail pages are
        printed with a traceback and skipped.
    """
    # NOTE(review): find_element(s)_by_xpath / switch_to_frame are legacy
    # Selenium calls that newer Selenium releases deprecate; they are left
    # unchanged because the Selenium version in use is not visible here.

    # Phase 1: build the index of detail pages.
    index_list = []
    max_page = site.get('MAX_PAGE', 1)
    for page in range(1, max_page + 1):
        print('PAGE : %s' % page)
        if 'SITE_TYPE' not in site:
            u = '%s/bbs/board.php?bo_table=%s&page=%s' % (
                site['TORRENT_SITE_URL'], cate, page)
        else:
            # NOTE(review): this URL does not contain the page number, so
            # every iteration loads the same address unless QUERY or the
            # template handles paging -- confirm against the site configs.
            u = site['BO_TABLE_URL'] % cate
        if 'QUERY' in site:
            u += site['QUERY']
        print('URL : %s' % u)
        driver.get(u)

        # XPATH_LIST_TAG is a template with a '[%s]' row index; the part
        # before the index selects all rows at once.
        row_template = site['XPATH_LIST_TAG']
        list_tag = row_template[:row_template.find('[%s]')]
        rows = WebDriverWait(driver, 3).until(
            lambda d: d.find_elements_by_xpath(list_tag))
        step = site.get('STEP', 1)
        start = site.get('START_INDEX', 1)
        for i in range(start, len(rows) + 1, step):
            try:
                row_xpath = row_template % i
                a = WebDriverWait(driver, 3).until(
                    lambda d: d.find_element_by_xpath(row_xpath))
                href = a.get_attribute('href')
                if href.find(cate) == -1:
                    continue
                index_list.append({
                    'title': a.text.strip(),
                    'detail_url': href,
                })
            except Exception:
                print('NOT BBS : %s' % i)
                traceback.print_exc()

    # Phase 2: extract links from each detail page.
    results = []
    how = site.get('HOW')
    for item in index_list:
        print('URL : %s' % item['detail_url'])
        driver.get(item['detail_url'])
        if how != 'USING_MAGNET_REGAX':
            try:
                # TODO
                if site['TORRENT_SITE_TITLE'] == 'tfreeca':
                    driver.switch_to_frame("external-frame")
                if how == 'INCLUDE_MAGNET_IN_INPUT':
                    magnet_xpath = "//input[starts-with(@value,'magnet')]"
                else:
                    magnet_xpath = "//a[starts-with(@href,'magnet')]"
                link_elements = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(magnet_xpath))
                for magnet in link_elements:
                    if (how == 'INCLUDE_MAGNET_IN_LIST_AND_INCLUDE_LIST_ON_VIEW'
                            and not magnet.text.startswith('magnet')):
                        break
                    if how == 'INCLUDE_MAGNET_IN_INPUT':
                        # The magnet URI is stored in the input's value and
                        # is taken as-is (no splitting, no duplicate check).
                        entity = {
                            'title': item['title'],
                            'link': magnet.get_attribute('value'),
                        }
                        print(entity['link'])
                        results.append(entity)
                        _print_entity_safely(entity)
                        continue
                    # On torrentao several magnets are glued into one href.
                    for link in _split_magnet_links(
                            magnet.get_attribute('href')):
                        # Duplicate check against everything collected.
                        if any(found['link'] == link for found in results):
                            continue
                        entity = {'title': item['title'], 'link': link}
                        results.append(entity)
                        _print_entity_safely(entity)
            except Exception:
                traceback.print_exc()
        else:
            try:
                regax = re.compile(site['MAGNET_REGAX'], re.IGNORECASE)
                for m in regax.findall(driver.page_source):
                    entity = {
                        'title': item['title'],
                        'link': site['MAGNET_MAKE_URL'] % m,
                    }
                    results.append(entity)
                    _print_entity_safely(entity)
            except Exception:
                traceback.print_exc()

        # Download attachments.
        if site.get('DOWNLOAD_FILE') == 'ON':
            try:
                use_regex = 'DOWNLOAD_REGEX' in site
                if use_regex:
                    download_xpath = "//a[contains(@href,'bbs/download.php')]"
                else:
                    prefix = '%s/bbs/download.php' % site['TORRENT_SITE_URL']
                    download_xpath = "//a[starts-with(@href,'%s')]" % prefix
                link_elements = WebDriverWait(driver, 5).until(
                    lambda d: d.find_elements_by_xpath(download_xpath))
                download_regex = None
                if use_regex:
                    download_regex = re.compile(site['DOWNLOAD_REGEX'],
                                                re.IGNORECASE)

                for a_tag in link_elements:
                    if use_regex:
                        match = download_regex.search(
                            a_tag.get_attribute('href'))
                        if not match:
                            continue
                        label = match.group('filename')
                        url = match.group('url')
                        idx = url.find('bbs/download.php')
                        url = site['TORRENT_SITE_URL'] + '/' + url[idx:]
                    else:
                        label = a_tag.text.replace('\n', ' ').replace('\r', '')
                        url = a_tag.get_attribute('href')

                    # Only the first matching extension counts; a
                    # '.torrent' match leaves filename empty, so torrent
                    # attachments are never downloaded here.
                    filename = ''
                    for ext in ('.torrent', '.smi', '.srt', '.ass'):
                        idx = label.find(ext)
                        if idx != -1:
                            if ext != '.torrent':
                                filename = _sanitize_filename(
                                    label[:idx + len(ext)])
                            break
                    if filename:
                        print('DOWNLOAD : %s' % filename)
                        download(driver, url, filename,
                                 site.get('DOWNLOAD_PATH'))
            except Exception:
                traceback.print_exc()
        if 'SLEEP' in site:
            time.sleep(site['SLEEP'])
    return results
Exemplo n.º 2
0
def _split_magnet_links(href):
    """Return every magnet URI found in *href*, in order of appearance.

    Some pages glue several magnet URIs together inside a single href, so
    the string is cut at each occurrence of the ``magnet:?xt=urn`` marker.
    An href that does not contain the marker yields an empty list.
    """
    marker = 'magnet:?xt=urn'
    links = []
    start = href.find(marker)
    while start != -1:
        end = href.find(marker, start + 1)
        links.append(href[start:] if end == -1 else href[start:end])
        start = end
    return links


def GetList(driver, site, cate):
    """Scrape a board's listing pages and collect magnet links.

    Listing pages 1..``site['MAX_PAGE']`` are loaded first and the title
    and detail URL of every row whose link contains *cate* are recorded.
    Each detail page is then opened to extract magnet links and,
    optionally, download attached subtitle files.

    Args:
        driver: Selenium WebDriver-like object used to load pages.
        site: dict of per-site settings. Keys read here: ``MAX_PAGE``,
            ``TORRENT_SITE_URL``, ``XPATH_LIST_TAG``, ``STEP`` (default 1),
            ``HOW``, ``MAGNET_REGAX``, ``MAGNET_MAKE_URL``,
            ``DOWNLOAD_FILE``.
        cate: board identifier substituted into the listing URL; a row is
            kept only when its href contains this string.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts.

    Raises:
        KeyError: if ``MAX_PAGE`` is missing from *site*.
        Whatever ``WebDriverWait.until`` raises when the listing rows are
        not found in time; errors on individual rows and detail pages are
        printed with a traceback and skipped.
    """
    # Phase 1: build the index of detail pages.
    index_list = []
    for page in range(1, site['MAX_PAGE'] + 1):
        print('PAGE : %s' % page)
        u = '%s/bbs/board.php?bo_table=%s&page=%s' % (
            site['TORRENT_SITE_URL'], cate, page)
        print('URL : %s' % u)
        driver.get(u)

        # XPATH_LIST_TAG is a template with a '[%s]' row index; the part
        # before the index selects all rows at once.
        row_template = site['XPATH_LIST_TAG']
        list_tag = row_template[:row_template.find('[%s]')]
        rows = WebDriverWait(driver, 3).until(
            lambda d: d.find_elements_by_xpath(list_tag))
        step = site.get('STEP', 1)
        for i in range(1, len(rows) + 1, step):
            try:
                row_xpath = row_template % i
                a = WebDriverWait(driver, 3).until(
                    lambda d: d.find_element_by_xpath(row_xpath))
                href = a.get_attribute('href')
                if href.find(cate) == -1:
                    continue
                index_list.append({'title': a.text, 'detail_url': href})
            except Exception:
                print('NOT BBS : %s' % i)
                traceback.print_exc()

    # Phase 2: extract links from each detail page.
    results = []
    for item in index_list:
        print('URL : %s' % item['detail_url'])
        driver.get(item['detail_url'])

        if 'HOW' not in site:
            try:
                link_elements = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(
                        "//a[starts-with(@href,'magnet')]"))

                for magnet in link_elements:
                    href = magnet.get_attribute('href')
                    print('HREF : %s' % href)
                    # On torrentao several magnets are glued into one href.
                    for link in _split_magnet_links(href):
                        # Duplicate check against everything collected.
                        if any(found['link'] == link for found in results):
                            continue
                        entity = {'title': item['title'], 'link': link}
                        results.append(entity)
                        print('TITLE : %s\nLINK : %s' %
                              (entity['title'], entity['link']))
            except Exception:
                traceback.print_exc()

        elif site['HOW'] == 'USING_MAGNET_REGAX':
            try:
                regax = re.compile(site['MAGNET_REGAX'], re.IGNORECASE)
                for m in regax.findall(driver.page_source):
                    entity = {
                        'title': item['title'],
                        'link': site['MAGNET_MAKE_URL'] % m,
                    }
                    results.append(entity)
                    print('TITLE : %s\nLINK : %s' %
                          (entity['title'], entity['link']))
            except Exception:
                traceback.print_exc()

        # Download attachments.
        if site.get('DOWNLOAD_FILE') == 'ON':
            try:
                prefix = '%s/bbs/download.php' % site['TORRENT_SITE_URL']
                download_xpath = "//a[starts-with(@href,'%s')]" % prefix
                link_elements = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(download_xpath))
                for a_tag in link_elements:
                    label = a_tag.text.replace('\n', ' ').replace('\r', '')
                    # Only the first matching extension counts; a
                    # '.torrent' match leaves filename empty, so torrent
                    # attachments are never downloaded here.
                    filename = ''
                    for ext in ('.torrent', '.smi', '.srt', '.ass'):
                        idx = label.find(ext)
                        if idx != -1:
                            if ext != '.torrent':
                                filename = label[:idx + len(ext)]
                            break
                    if filename:
                        print('DOWNLOAD : %s' % filename)
                        download(driver, a_tag.get_attribute('href'), filename)
            except Exception:
                traceback.print_exc()
    return results
Exemplo n.º 3
0
# Absolute XPaths of the page elements this script interacts with.
MERCATO_XPATH = "/html/body/div[2]/header/div/div[2]/ul/li[3]/a"
QUOTAZIONI_XPATH = "/html/body/div[2]/header/div/div[2]/ul/li[3]/div/ul/li[1]/a"
TABLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[2]/div[2]/table"

# Every lookup below waits at most 5 seconds.
wait = WebDriverWait(driver, 5)

# Move the mouse over the header entry (presumably to reveal the sub-menu
# whose first item is clicked next).
mercato = wait.until(
    EC.presence_of_element_located((By.XPATH, MERCATO_XPATH)))
ActionChains(driver).move_to_element(mercato).perform()

# click() returns None, so `quotazioni` ends up as None, as before.
quotazioni_link = wait.until(
    EC.element_to_be_clickable((By.XPATH, QUOTAZIONI_XPATH)))
quotazioni = quotazioni_link.click()

table_element = wait.until(
    EC.presence_of_element_located((By.XPATH, TABLE_XPATH)))
rows = table_element.find_elements(By.TAG_NAME, "tr")

# Collect the text of every cell except columns 3 and 5, skipping the first
# row (presumably the header); to be saved in an appropriate structure.
table = []
for row in rows[1:]:
    cells = row.find_elements(By.TAG_NAME, "td")
    cellslist = [
        cell.text for position, cell in enumerate(cells)
        if position not in (3, 5)
    ]
    table.append(cellslist)

table = pd.DataFrame(table)

# Write the table to a timestamped CSV under sys.path[0].
ct = datetime.datetime.now().strftime("%d-%m-%y_%H-%M-%S")
table.to_csv(sys.path[0] + '/' + ct + ".csv")

########################################################################################
#ANALISI DATI
#######################################################################################
Exemplo n.º 4
0
        livros = WebDriverWait(browser, 60).until(
            EC.presence_of_all_elements_located((
                By.XPATH,
                "/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/a/section/div/div/h2"
            )))

        precos = WebDriverWait(browser, 60).until(
            EC.presence_of_all_elements_located((
                By.XPATH,
                "/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/a/section/div/div/div"
            )))

        a = precos[2]

        precos.append(a)

        l = 2

        for livro in livros:

            if precos[l].text == "":

                arquivo.write(
                    str(
                        str(nome_categoria.upper()) + "|" +
                        str(nome_subcategoria.upper()) + "|" +
                        str(livro.text) + "| Produto fora de estoque\n"))

                print(
                    str(nome_categoria.upper()) + "|" +