Example #1
    def _parse_one_list(self, article_list):
        soup = load_page(article_list)
        threads = soup.find(
            'tbody', attrs={'class': 'list_tbody'}).find_all('tr')  # one <tr> per thread (title, number, etc.)

        valid_threads = []
        for t in threads:
            if t.find('td') is None:
                continue
            header = t.find('td', class_='t_notice').string
            if bool(re.match(r'\d+', header)):  # skip notices and other non-numbered rows
                valid_threads.append(t)

        if len(self.keyword_filter) == 0:  # no title filter
            ret = [
                'http://gall.dcinside.com' + v.find('a').get('href')
                for v in valid_threads
            ]

        else:  # apply title filter
            ret = []
            for v in valid_threads:
                thread_title = ''.join(
                    v.find('td', class_='t_subject').strings)
                for k in self.keyword_filter:
                    if k in thread_title:
                        ret.append('http://gall.dcinside.com' +
                                   v.find('a').get('href'))
                        break

        return ret
Example #2
def collect(_rid):
    # Get html
    base_url = "https://race.netkeiba.com/?pid=race_old&id=c{rid}"
    if re.match(r"^\d{12}$", _rid):
        url = base_url.replace("{rid}", _rid)
        page = load_page(url, ".race_table_old")
    else:
        return {"status": "ERROR", "message": "Invalid URL parameter: " + _rid}

    # Parse race info
    if page is not None:
        race = parse_nk_race(page)
    else:
        return {"status": "ERROR", "message": "There is no page: " + url}

    if "_id" in race:
        db = vault()
        db.races.update({"_id": race["_id"]}, race, upsert=True)
    else:
        return {
            "status": "ERROR",
            "message": "There is no _id in page: " + str(race)
        }

    return {
        "status": "SUCCESS",
        "message": "Start race collection process for " + _rid
    }
Example #3
def format_code_dcinside(code, recommend=False):
    ret = 'http://gall.dcinside.com/board/lists/?id=' + code
    soup = load_page(ret)
    mat = re.match(r"window\.location\.replace\('(?P<target>\S+)'\);", soup.text)
    if bool(mat):
        ret = 'http://gall.dcinside.com' + mat.group('target')

    ret += '&page=1'
    if recommend:
        ret += '&exception_mode=recommend'

    return ret
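
A possible usage of the helper above (the gallery id 'programming' is only a placeholder, and the call needs live access to gall.dcinside.com because load_page is invoked internally):

# Hypothetical call to format_code_dcinside defined above.
# 'programming' is a placeholder gallery id, not taken from the original code.
url = format_code_dcinside('programming', recommend=True)
print(url)
# Expected shape when no window.location.replace redirect is detected:
# http://gall.dcinside.com/board/lists/?id=programming&page=1&exception_mode=recommend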
Example #4
    def get_last_list(self):
        soup = load_page(self.url)
        self.title = soup.title.string.strip()
        if self.recommend:
            self.title += ' (개념글만)'

        page_range_tag = soup.find('div', id='dgn_btn_paging')
        end_page_url = page_range_tag.find_all('a')[-1].get('href')
        last_page_mat = re.search(r'page=(?P<pagenum>\d+)', end_page_url)
        if bool(last_page_mat):  # a last-page link exists
            self.last_list = int(last_page_mat.group('pagenum'))
        else:  # no page number found; fall back to a single page
            self.last_list = 1
Example #5
    def get_last_list(self):
        title_soup = load_page(self.url)
        mat = re.search(
            r'\S+.src="(?P<prefix>\S+)"\+Date.now\(\)\+"(?P<suffix>\S+)"',
            title_soup.text)
        if bool(mat):
            new_referer = mat.group('prefix') + str(round(
                time.time())) + mat.group('suffix')
            title_soup = load_page(self.url,
                                   extra_headers={'Referer': new_referer})
        self.title = title_soup.title.string.strip()

        category_soup = load_page(self.url + 'category')
        url_list = []
        a_list = category_soup.find_all('a')
        for a in a_list:
            link = a.get('href')
            if link is not None:
                mat = re.search(r'^/(?P<page_id>\d+)', link)
                if bool(mat):
                    url_list.append(int(mat.group('page_id')))

        self.last_list = max(url_list)
Example #6
def find_gall(keyword):
    search_url = 'http://m.dcinside.com/search/index.php?search_gall={}&search_type=gall_name'.format(
        keyword)
    soup = load_page(search_url,
                     mobile=True,
                     extra_headers={'Host': 'm.dcinside.com'})

    ret = []
    a_list = soup.find('div', class_="searh-result-box").find_all('a')
    for a in a_list:
        if 'http://m.dcinside.com/list.php' in a.get('href'):
            title = ''.join(a.strings).strip()
            code = re.split('=', a.get('href'))[1]
            ret.append([title, code])
    return ret
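
A sketch of how the search helper above might be driven (the keyword is a placeholder; the call requires network access to m.dcinside.com):

# Hypothetical usage of find_gall defined above; 'baseball' is a placeholder keyword.
for title, code in find_gall('baseball'):
    print(title, code)  # each entry pairs a gallery title with its gallery id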
Example #7
def bulk_collect(_year, _month):
    url = "https://keiba.yahoo.co.jp/schedule/list/" + _year + "/?month=" + _month
    page = load_page(url, ".layoutCol2M")

    # Parse race info
    if page is not None:
        race_id = parse_spn_rid(page)
    else:
        return {"status": "ERROR", "message": "There is no page: " + url}

    if len(race_id) != 0:
        for rid in race_id:
            collect(rid)
            time.sleep(5)
    else:
        return {"status": "ERROR", "message": "There is no page: " + url}

    return {"status": "SUCCESS", "message": "Start bulk collection process"}
Example #8
def job_manager(_urls):
    # Put log
    task_id = start_log(len(_urls))

    for url in _urls:
        # Collect target page html
        page = load_page(url, ".layoutCol2M")
        if page is None:
            warning_log(task_id, "There is no page: " + url)
        else:
            # Parse & Insert page elements
            db = vault()
            for hold in parse_sportsnavi(page):
                if "_id" in hold:
                    db.holds.update({"_id": hold["_id"]}, hold, upsert=True)
                else:
                    warning_log(task_id, "There is no _id in page: " + str(hold))
            update_log(task_id)

        time.sleep(5)
    end_log(task_id)
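
Examples #2 and #8 both persist parsed documents through a vault() helper that is not shown in this listing. A minimal sketch, assuming it simply returns a pymongo database handle (the connection string and database name below are placeholders, not from the original code):

from pymongo import MongoClient

def vault():
    # Placeholder connection details; the original vault() implementation is not shown here.
    client = MongoClient('mongodb://localhost:27017/')
    return client.keiba  # assumed database name exposing the .races and .holds collections

Note that Collection.update as called in examples #2 and #8 is the legacy pymongo API (removed in pymongo 4.0); with a current driver the equivalent call is replace_one({'_id': doc['_id']}, doc, upsert=True).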
Example #9
    def get_last_list(self):
        soup = load_page(self.url)
        self.title = soup.find('h2', {'class': 'tit_series'}).string
        a_list = soup.find_all('a', {'class': 'spot_post_area'})
        self.last_list = len(a_list)
Example #10
    def _get_soup(self):
        self.soup = load_page(self.url)
        self._get_raw_title()
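
All of the examples above rely on a load_page helper that is not included in the listing. Judging from the call sites, it takes a URL plus optional arguments (a CSS selector to verify in examples #2, #7 and #8, extra_headers in #5 and #6, mobile in #6) and returns a BeautifulSoup object, or None on failure. A minimal sketch under those assumptions, not the original implementation:

import requests
from bs4 import BeautifulSoup

def load_page(url, selector=None, mobile=False, extra_headers=None):
    # Assumed signature reconstructed from the call sites above; not the original code.
    headers = {'User-Agent': 'Mozilla/5.0' + (' Mobile' if mobile else '')}
    if extra_headers:
        headers.update(extra_headers)
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')
    # When a selector such as '.race_table_old' is given, treat a missing element as failure.
    if selector is not None and soup.select_one(selector) is None:
        return None
    return soup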