Example #1
def my_main(year_need):
    """
    需要爬取某一年的日历
    :param year_need: int类型,暂支持区间(1900,2100)
    :return:
    """
    sum_day = 1
    sum_week = 1
    year_dict = {}
    # Open the online calendar page
    spider.get_page("https://wannianrili.51240.com/")

    # Jump to December of the year before the target year and fetch its data
    spider.select_year_option(str(year_need - 1))
    spider.select_month_option("12")
    source = spider.get_source()
    month_dict, sum_day, sum_week = get_month_dict(source, sum_day, sum_week,
                                                   str(year_need - 1), "12")
    year_dict.update(month_dict)
    last_month_day = sum_day
    last_month_week = sum_week

    spider.select_year_option(str(year_need))
    for month in range(1, 13):
        # Zero-pad the month
        if month < 10:
            month = '0' + str(month)
        else:
            month = str(month)
        print('month:', month)

        spider.select_month_option(month)
        source = spider.get_source()
        month_dict, sum_day, sum_week = get_month_dict(source,
                                                       sum_day, sum_week,
                                                       str(year_need), month,
                                                       last_month_day,
                                                       last_month_week)
        year_dict.update(month_dict)
        # break  # for testing
    print(year_dict)

    # Jump to January of the year after the target year and fetch its data
    spider.select_year_option(str(year_need + 1))
    spider.select_month_option("01")
    source = spider.get_source()
    month_dict, sum_day, sum_week = get_month_dict(source, sum_day, sum_week,
                                                   str(year_need + 1), "01")
    year_dict.update(month_dict)

    # Adjust holidays and compute statistics
    year_dict = alter_holiday(year_dict)
    year_dict = count_next_workday(year_dict)
    year_dict = count_week_holiday(year_dict)
    print(year_dict)

    col_name, col_data = save.format_data(year_dict, year_need)
    save.write_calender(str(year_need) + '年日历.csv', col_name, col_data)
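A minimal invocation sketch (not part of the original example), assuming the spider and save helper modules used above are importable from the same project:

if __name__ == '__main__':
    # Crawl the 2020 calendar and write it to CSV; my_main expects an int year in (1900, 2100)
    my_main(2020)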
Example #2
def crawl_url5(url=url5):
    html = get_page(url)
    # ip:port pattern; dots are escaped so "." matches a literal dot
    ip = re.compile(
        r"((1?\d?\d\.|2[0-4]\d\.|25[0-5]\.){3}(1?\d?\d|2[0-4]\d|25[0-5]):\d+)")
    for i in re.finditer(ip, html):
        proxie = {}
        proxie["http"] = i.group(1)
        yield proxie
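A minimal consumption sketch (not part of the original example), assuming get_page and url5 are defined as in the snippet above; it simply checks whether each yielded proxy works with requests:

import requests

def check_proxies():
    for proxy in crawl_url5():
        try:
            # requests accepts a proxies dict like {"http": "1.2.3.4:8080"}
            resp = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5)
            if resp.ok:
                print("usable proxy:", proxy)
        except requests.RequestException:
            continue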
Example #3
def crawl_url6(url=url6):
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    soup = soup.find("table", class_="table")
    tbody = soup.find("tbody")
    for tr in tbody.find_all("tr"):
        proxie = {}
        td = tr.find_all("td")
        proxie["http"] = ":".join((td[0].text.strip(), td[1].text.strip()))
        print(proxie)
Example #4
def crawl_ip3366(self):
    for page in range(1, 20):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(
            page)
        html = get_page(start_url)
        # \s* matches the whitespace (including newlines) between the table cells
        ip_adress = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            yield ':'.join([adress, port])
Example #5
def crawl_url7(url=url7, page=5):
    for i in range(1, page + 1):
        params = {"page": i}
        html = get_page(url, params=params)
        time.sleep(5)
        soup = BeautifulSoup(html, 'lxml')
        tbody = soup.find("tbody")
        for tr in tbody.find_all("tr"):
            td = tr.find_all("td")
            proxie = {}
            proxie[(td[4].text.strip()).lower()] = ":".join(
                (td[1].text.strip(), td[2].text.strip()))
            yield proxie
Example #6
def crawl_url5(self, url=url5):
    """Crawl proxies from 66ip"""
    print(f"Crawling proxies from {url}")
    html = get_page(url)
    if html:
        # ip:port pattern; dots are escaped so "." matches a literal dot
        ip = re.compile(
            r"((1?\d?\d\.|2[0-4]\d\.|25[0-5]\.){3}(1?\d?\d|2[0-4]\d|25[0-5]):\d+)"
        )
        for i in re.finditer(ip, html):
            proxie = {}
            proxie["http"] = i.group(1)
            yield json.dumps(proxie)
    else:
        # Retry on an empty page; yield from is needed inside a generator,
        # otherwise the recursive call's results would be discarded.
        yield from self.crawl_url5(url)
Example #7
def crawl_url2(url=url2, page=5):
    for i in range(1, page + 1):
        time.sleep(5)
        url = f"{url}_{page}.html"
        print(url)
        html = get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("tbody")
        for tr in soup.find_all("tr"):
            td = tr.find_all("td")
            proxie = {}
            proxie['http'] = ":".join(
                (td[0].string.strip(), td[1].string.strip()))
            proxie = json.dumps(proxie)
            yield proxie
Example #8
def crawl_url6(self, url=url6):
    """Crawl proxies from http://ip.seofangfa.com"""
    print(f"Crawling proxies from {url}")
    try:
        html = get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("table", class_="table")
        tbody = soup.find("tbody")
        for tr in tbody.find_all("tr"):
            proxie = {}
            td = tr.find_all("td")
            proxie["http"] = ":".join(
                (td[0].text.strip(), td[1].text.strip()))
            yield json.dumps(proxie)
    except Exception:
        # Retry on failure; yield from keeps the retried results in the stream.
        yield from self.crawl_url6(url)
Example #9
def crawl_url7(self, url=url7, page=5):
    """Crawl proxies from http://ip.jiangxianli.com/"""
    print(f"Crawling proxies from {url}")
    for i in range(1, page + 1):
        try:
            params = {"page": i}
            html = get_page(url, params=params)
            time.sleep(5)
            soup = BeautifulSoup(html, 'lxml')
            tbody = soup.find("tbody")
            for tr in tbody.find_all("tr"):
                td = tr.find_all("td")
                proxie = {}
                proxie[(td[4].text.strip()).lower()] = ":".join(
                    (td[1].text.strip(), td[2].text.strip()))
                yield json.dumps(proxie)
        except Exception:
            # Skip pages that fail to fetch or parse
            continue
Example #10
def crawl_url2(self, url=url2, page=6):
    """Crawl proxies from 89ip"""
    realurl = url
    for i in range(1, page + 1):
        try:
            url = f"{realurl}_{i}.html"
            print(f"Crawling proxies from {url}")
            html = get_page(url)
            time.sleep(5)
            soup = BeautifulSoup(html, 'lxml')
            soup = soup.find("tbody")
            for tr in soup.find_all("tr"):
                td = tr.find_all("td")
                proxie = {}
                proxie["http"] = ":".join(
                    (td[0].string.strip(), td[1].string.strip()))
                proxie = json.dumps(proxie)
                yield proxie
        except Exception:
            # Skip pages that fail to fetch or parse
            continue
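Several of the examples above yield JSON strings rather than dicts. A small sketch (assumption: the crawl_url* generators are methods of some crawler object, as the self parameter suggests) for collecting them back into a deduplicated list:

import json

def collect(crawler):
    seen = set()
    proxies = []
    for raw in crawler.crawl_url2():
        proxy = json.loads(raw)            # back to a dict like {"http": "ip:port"}
        addr = next(iter(proxy.values()))  # the single ip:port value
        if addr not in seen:
            seen.add(addr)
            proxies.append(proxy)
    return proxies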