Example #1
def getRoomList(cursor, conn, url, city_name):
    qu = []
    qu_name = []
    newurl = ''
    if (url.find('zufang') == -1):
        newurl = url + 'zufang/'
    else:
        newurl = url
    print(newurl)
    respone = requests.get(newurl, headers=local_headers)
    respone.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    areas = soup.find('div',
                      id='filter-options').find_all('dl')[0].find_all('a')
    areas = areas[1:]
    for i in areas:
        qu_name.append(i.string)
        if (i['href'].find('https:') == -1):
            qu.append(url + i['href'])
        else:
            qu.append(i['href'])
    # print(qu)
    print(qu_name)

    for i in qu:
        try:
            saveInfo(cursor, conn, i, 1, qu, city_name, qu_name)
            # print(qu.index(i))

        except Exception as e:
            print(e)
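
All of these snippets lean on module-level names defined elsewhere in the project — local_headers / ua_headers request headers, the base, tag and test_url URL strings, and a getEncoding helper whose get_encode2() returns the page charset. Below is a minimal sketch of plausible definitions, purely an assumption so the examples can be read (or run) in isolation; the real project may define them differently.

import requests

# Hypothetical stand-ins for the shared globals the snippets assume.
local_headers = {'User-Agent': 'Mozilla/5.0'}
ua_headers = {'User-Agent': 'Mozilla/5.0'}


class getEncoding:
    # Assumed helper: detects a page's charset so respone.encoding can be set.
    def __init__(self, url):
        self.url = url

    def get_encode2(self):
        # requests' apparent_encoding (charset detection) is one plausible implementation.
        return requests.get(self.url, headers=local_headers).apparent_encoding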
Example #2
 def getCityList(self):
     print('Fetching city list')
     respone = requests.get(self.url, headers=local_headers)
     respone.encoding = getEncoding(self.url).get_encode2()
     soup = BeautifulSoup(respone.text, 'html.parser')
     divs = soup.find('div', class_='all').find_all('ul', class_='clear')
     print('Fetched successfully')
     print('Saving to database')
     conn = pymysql.connect(host='localhost',
                            user='******',
                            password='',
                            db='lianjia',
                            charset='utf8')
     cursor = conn.cursor()
     for i in divs:
         lis = i.find_all('li')
         for j in lis:
             print(j.a['href'])
             print(j.a.string)
             sql = "insert into city_list(id, city_name, city_link)values (null,'%s','%s')" % (
                 j.a.string, j.a['href'])
             # print(sql)
             cursor.execute(sql)
             conn.commit()
     cursor.close()
     conn.close()
     print('Saved to database')
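
A side note on the insert above: building SQL with % string formatting breaks as soon as a value contains a quote and is open to injection. pymysql's cursor.execute accepts parameters directly, so an equivalent insert (a sketch against the same city_list table) would be:

sql = "insert into city_list(id, city_name, city_link) values (null, %s, %s)"
cursor.execute(sql, (j.a.string, j.a['href']))
conn.commit()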
Example #3
def getJobInfo(page_url, pages_num, isFirst):
    if pages_num.count(page_url) != 0:
        # the page list already contains this URL; stop to avoid re-crawling
        return False

    if isFirst == 1:
        pages_num = []
        url = page_url
    else:
        url = base + page_url + '&ka=page-next'
        pages_num.append(page_url)

    respone = requests.get(url, headers=local_headers)
    respone.encoding = getEncoding(url).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    # dls = soup.find(class_='condition-district show-condition-district').find_all('a')
    # for d in dls[1:]:
    #     print(d.string)
    #     print(base + d['href'] + '?' + d['ka'])
    #     # print(d)

    lis = soup.find('div', class_='job-list').find_all('li')
    for i_ in lis:
        # i_ = lis[0]
        # job title
        job_name = i_.find(class_='job-title').string
        # company name
        job_company = i_.find(class_='company-text').find('a').string
        # job requirements
        job_require = i_.find('div', class_='info-primary').p.text
        # company info
        job_company_info = i_.find('div', class_='company-text').p.text
        # recruiter
        job_people = i_.find('div', class_='info-publis').h3.text
        # job detail URL
        job_link = i_.find('div', class_='info-primary').h3.a['href']
        # salary range
        job_money = i_.find('div',
                            class_='info-primary').h3.find(class_='red').string
        # salary lower bound
        job_min_money = job_money.split('-')[0]
        # salary upper bound
        job_max_money = job_money.split('-')[1]

        # print('job name-' + job_name)
        # print('company-' + job_company)
        # print('requirements-' + job_require)
        # print('company info-' + job_company_info)
        # print('recruiter-' + job_people)
        # print('job link-' + job_link)
        # print('salary-' + job_money)
        # print('salary min-' + job_min_money)
        # print('salary max-' + job_max_money)
        print(job_name + page_url)
    # Follow the "next page" link; when it is 'javascript:;' we are on the
    # last page and the recursion stops.
    next_href = soup.find('div', class_='page').find_all('a')[-1]['href']
    if next_href != 'javascript:;':
        getJobInfo(next_href, pages_num, 2)
Example #4
def getpiclist(num, type, title, cursor, conn, link):
    link_ = link[:len(link) - 5]
    if num == 1:
        respone = requests.get(link, headers=ua_headers)
    else:
        respone = requests.get(link_ + "_" + str(num) + ".html",
                               headers=ua_headers)
    respone.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    # get the page count and store the first page's image
    bottom_pages = soup.find_all('div', class_='pages')[0].find_all('li')
    # print(pages[0].a.string[2])
    # the first <li> reads like "共N页:" (N pages in total); strip the label
    bg = bottom_pages[0].a.string
    page = int(bg.replace('共', '').replace('页:', ''))

    pic = soup.find('div', id='big-pic').find('img')['src']
    # print(pic)
    # run the insert statement
    sql = "insert into weiyi_images(id, type, title, url)values (null,'%s','%s','%s')" % (
        type, title, pic)
    print(sql)
    cursor.execute(sql)
    conn.commit()
    # if there are more pages, recurse into the next one; otherwise move on to the next tag
    if page == 1:
        return
    else:
        num += 1
        if num <= page:
            getpiclist(num, type, title, cursor, conn, link)
        else:
            return
Example #5
def test():
    respone = requests.get(test_url, headers=local_headers)
    respone.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    dls = soup.find(
        class_='condition-district show-condition-district').find_all('a')
    for d in dls[1:]:
        print(d.string)
        print(base + d['href'] + '?' + d['ka'])
        # print(d)

    lis = soup.find('div', class_='job-list').find_all('li')
    i_ = lis[0]
    # job title
    job_name = i_.find(class_='job-title').string
    # company name
    job_company = i_.find(class_='company-text').find('a').string
    # job requirements
    job_require = i_.find('div', class_='info-primary').p.text
    # company info
    job_company_info = i_.find('div', class_='company-text').p.text
    # recruiter
    job_people = i_.find('div', class_='info-publis').h3.text
    # job detail URL
    job_link = i_.find('div', class_='info-primary').h3.a['href']
    # salary range
    job_money = i_.find('div',
                        class_='info-primary').h3.find(class_='red').string
    # salary lower bound
    job_min_money = job_money.split('-')[0]
    # salary upper bound
    job_max_money = job_money.split('-')[1]
Example #6
    def startSpider(self):
        conn = pymysql.connect(host='localhost',
                               user='******',
                               password='',
                               db='biquge',
                               charset='utf8')
        cursor = conn.cursor()
        respone = requests.get(base, headers=local_headers)
        respone.encoding = getEncoding(base).get_encode2()
        soup = BeautifulSoup(respone.text, 'html.parser')
        for i in ids:
            # top-level category
            print(types[ids.index(i)])
            title_type = types[ids.index(i)]
            lis = soup.find(id=i).find_all('li')
            for j in lis:
                # book entry within the category
                print(j.a['title'])
                print(j.a['href'])
                sql = "insert into bqg_list(id, type, name, book_link)values (null,'%s','%s','%s')" % (
                    title_type, j.a['title'], j.a['href'])
                cursor.execute(sql)
                conn.commit()

        cursor.close()
        conn.close()
Example #7
def downloadImage(url):
    # response = requests.get(url + '', headers=ua_headers)
    # img = response.content
    # with open('./a.jpg', 'wb') as f:
    #     f.write(img)
    respone = requests.get(url)
    respone.encoding = getEncoding(url).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    print(soup)
Example #8
def bookInfo():
    print()
    respone = requests.get("https://www.qu.la/book/4140/2585313.html",
                           headers=local_headers)
    respone.encoding = getEncoding(
        "https://www.qu.la/book/4140/2585313.html").get_encode2()
    soup = BeautifulSoup(respone.text, "html.parser")
    text = soup.find(id="content").text.replace("&nbsp;",
                                                "").replace("<br />", "\n")
    print(text)
Example #9
def init(adr):
    test_url = 'http://www.panduoduo.net/s/name/' + "变形金刚"
    respone = requests.get(test_url, headers=local_headers)
    respone.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    soup = soup.select('.row')
    pattern = re.compile(r'/r/\d+')
    for i in soup:
        i = str(i)
        adress = pattern.search(i)
        adress = adress.group()
        adr.append(adress)
Example #10
def getSpiderUrl():
    respone = requests.get(test_url, headers=local_headers)
    respone.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    dls = soup.find(
        class_='condition-district show-condition-district').find_all('a')
    areas = []
    for d in dls[1:]:
        area = []
        area.append(d.string)
        area.append(base + d['href'] + '?' + d['ka'])
        area.append(base + d['href'])
        # print(d.string)
        # print(base+d['href']+'?'+d['ka'])
        areas.append(area)
    return areas
Example #11
def getinfo(tag_, type_, num):
    print("正在获取此分类---" + type_)
    conn = pymysql.connect(host='localhost',
                           user='******',
                           password='',
                           db='weiyi',
                           charset='utf8')
    cursor = conn.cursor()

    if num == 1:
        respone = requests.get("http://www.mmonly.cc" + tag_,
                               headers=ua_headers)
    else:
        respone = requests.get("http://www.mmonly.cc" + tag_ + str(num) +
                               ".html",
                               headers=ua_headers)
    respone.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    # get the page count and store the first page
    pages = soup.find_all('div', class_='pages')[0].find_all('li')
    if (len(pages) == 1):
        page = 1
    else:
        page = len(pages) - 2
    print("该" + tag_ + "分类有" + str(page))
    divs = soup.find_all('div', class_='item masonry_brick masonry-brick')
    for i in divs:
        a = i.find('a', class_='img_album_btn')
        if i.find('b'):
            title = i.find('b').string
        else:
            title = i.find('div', class_='title').find('a').string

        # print(title)
        # print(i.find_all('img')[0]['src'])
        try:
            getpiclist(1, type_, title, cursor, conn, a['href'])
        except Exception as e:
            print(e)

    # if there is more than one page, recurse into the next page; otherwise move on to the next tag
    if page == 1:
        return
    else:
        num += 1
        if num <= page:
            getinfo(tag_, type_, num)
Example #12
def saveInfo(cursor, conn, url, num, qu, city_name, qu_name):
    if num > 1:
        newurl = url + 'pg' + str(num) + '/'
    else:
        newurl = url
    print(newurl + ' - page ' + str(num))

    respone = requests.get(newurl, headers=local_headers)
    respone.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    try:
        page = int(
            soup.find('div', class_='page-box house-lst-page-box')
            ['page-data'].split(',')[0].split(':')[1])
        print(str(page) + ' pages in total')
    except Exception as e:
        page = 1

    lis = soup.find(id='house-lst').find_all('li')
    for i in lis:
        div_ = i.find('div', class_='info-panel')
        # monthly rent
        price = int(i.find('div', class_='price').span.string)
        info = i.find('div', class_='con').text
        size = int(
            i.find('span', class_='meters').text.replace('平米', '').strip())
        style = i.find('span', class_='zone').text.strip()
        adress = i.find('span', class_='region').text.strip()
        sql = "INSERT INTO city_room_list(id,city_name,city_area,adress,style,size,price,info) VALUES (null,'%s','%s','%s','%s',%d,%d,'%s')" % (
            city_name, qu_name[qu.index(url)], adress, style, size, price,
            info)
        print(sql)
        cursor.execute(sql)
        conn.commit()
        # print(adress)
    if page == num:
        return

    num += 1
    if num <= page:
        saveInfo(cursor, conn, url, num, qu, city_name, qu_name)
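
saveInfo pages through a district by calling itself once per result page, so a long listing costs one stack frame per page and can bump into Python's default recursion limit (about 1000 frames). A minimal iterative sketch, assuming a hypothetical parse_page helper that stores one page and returns the listing's total page count:

def saveAllPages(cursor, conn, url, qu, city_name, qu_name):
    num, total = 1, 1
    while num <= total:
        # parse_page is hypothetical: fetch page `num`, insert its rows,
        # and return the total number of pages reported by the listing.
        total = parse_page(cursor, conn, url, num, qu, city_name, qu_name)
        num += 1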
Example #13
def catchTag():
    respone = requests.get(base + tag, headers=ua_headers)
    respone.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    # grab the column headings (h2 tags)
    lists_tag = soup.find_all('h2')
    for i in lists_tag:
        tag_title.append(i.string)

    lists = soup.find_all('div', class_='TagList')
    for i in lists:
        if i != '':
            links = i.find_all('a')
            tag_list = []
            for j in links:
                temp_ = []
                temp_.append(j['href'])
                temp_.append(j['title'])
                tag_list.append(temp_)

            tag_dic[tag_title[lists.index(i)]] = tag_list
Example #14
def getbaidu(adr):
    for i in adr:
        respone = requests.get('http://www.panduoduo.net' + i,
                               headers=local_headers)
        respone.encoding = getEncoding('http://www.panduoduo.net' +
                                       i).get_encode2()
        # url = urllib.urlopen('http://www.panduoduo.net'+i)
        bs = BeautifulSoup(respone.text, 'html.parser')
        bs1 = bs.select('.dbutton2')
        href = re.compile(r'http%(%|\d|\w|//|/|\.)*')
        b = href.search(str(bs1))
        # str objects have no decode() in Python 3; keep the markup as text
        name = str(bs.select('.center'))
        text1 = re.compile(r'<h1\sclass="center">[\d|\w|\D|\W]*</h1>')
        text2 = text1.search(name)
        rag1 = re.compile(r'>[\d|\w|\D|\W]*<')
        if text2:
            text3 = rag1.search(text2.group())
            if text3:
                print(text3.group())
        if b:
            # Python 3: urllib.unquote moved to urllib.parse.unquote
            # (assumes `import urllib.parse` at module level)
            text = urllib.parse.unquote(b.group())
            print(text)
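
getbaidu pulls a percent-encoded pan.baidu.com link out of the download-button markup and then URL-decodes it. In Python 3 that decoding lives in urllib.parse; for example (illustrative URL, not taken from the site):

from urllib.parse import unquote

print(unquote('http%3A%2F%2Fpan.baidu.com%2Fs%2Fexample'))  # -> http://pan.baidu.com/s/example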
Example #15
def getRoomList2(url):
    qu = []
    qu_name = []
    newurl = ''
    if (url.find('zufang') == -1):
        newurl = url + 'zufang/'
    else:
        newurl = url
    print(newurl)
    respone = requests.get(newurl, headers=local_headers)
    respone.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(respone.text, 'html.parser')
    areas = soup.find('div',
                      id='filter-options').find_all('dl')[0].find_all('a')
    areas = areas[1:]
    for i in areas:
        qu_name.append(i.string)
        if (i['href'].find('https:') == -1):
            qu.append(url + i['href'])
        else:
            qu.append(i['href'])
    # print(qu)
    print(qu_name)
Example #16
    def titleSpider(self):

        conn = pymysql.connect(host='localhost',
                               user='******',
                               password='',
                               db='biquge',
                               charset='utf8')
        cursor = conn.cursor()

        # sql = "select * from bqg_list"
        # cursor.execute(sql)
        # books = cursor.fetchall()
        # print(books)
        # (id, category, title, link) rows; the commented-out query above
        # would normally supply these from the bqg_list table
        books = (
            (99, '女生频道排行', '俏汉宠农妻:这个娘子好辣', '/book/42186/'),
            (100, '女生频道排行', '重生七十年代:军嫂,有点田', '/book/60629/'),
            (101, '女生频道排行', '绝宠妖妃:邪王,太闷骚!', '/book/28906/'),
            (102, '女生频道排行', '鬼帝狂妻:纨绔大小姐', '/book/40082/'),
            (103, '女生频道排行', '帝国总裁,宠翻天!', '/book/42973/'),
            (104, '女生频道排行', '萌宝来袭:总裁爹地,太给力!', '/book/45213/'),
            (105, '女生频道排行', '空间重生:盛宠神医商女', '/book/34335/'),
            (106, '完本小说总排行', '太古神王', '/book/4140/'),
            (107, '完本小说总排行', '大主宰', '/book/176/'),
            (108, '完本小说总排行', '雪鹰领主', '/book/5094/'),
            (109, '完本小说总排行', '择天记', '/book/168/'),
            (110, '完本小说总排行', '全职高手', '/book/32/'),
            (111, '完本小说总排行', '完美世界', '/book/14/'),
            (112, '完本小说总排行', '遮天', '/book/394/'),
            (113, '完本小说总排行', '逍遥小书生', '/book/23934/'),
            (114, '完本小说总排行', '绝世武神', '/book/322/'),
            (115, '完本小说总排行', '异世灵武天下', '/book/199/'),
            (116, '完本小说总排行', '不朽凡人', '/book/18049/'),
            (117, '完本小说总排行', '掠天记', '/book/4295/'),
            (118, '完本小说总排行', '最强兵王', '/book/4511/'),
            (119, '完本小说总排行', '都市无上仙医', '/book/3802/'),
            (120, '完本小说总排行', '斗破苍穹', '/book/390/'),
        )

        for book in books:
            print("当前是:" + book[2])
            respone = requests.get(base_book + book[3], headers=local_headers)
            respone.encoding = getEncoding(base_book + book[3]).get_encode2()
            soup = BeautifulSoup(respone.text, "html.parser")
            dds = soup.find(id='list').find_all('dd')
            for i in dds:
                try:
                    zu = i.find('a')['href'].split('/')
                    if (len(zu) < 2):
                        print("排除" + i.a['href'])
                    else:
                        print("当前是第" + i.find('a').string)
                        title = i.find('a').string.replace('\\', "")
                        sql = "insert into bqg_chapter(chapterid ,id, name, chapter,chapter_link)values (null,%d,'%s','%s','%s')" % (
                            book[0], book[2], title, i.find('a')['href'])
                        cursor.execute(sql)
                        conn.commit()
                except Exception as e:
                    print(e)
        cursor.close()
        conn.close()