Example #1
    def start_requests(self):
        # Read the output column titles, then queue one request per saved listing URL.
        file = open("inputs/rrc_titles.txt", "r")
        for t in file.readlines():
            self.tlts.append(strHelper.format(t))
        file.close()

        file = open("inputs/rrclinks.txt", "r")
        for link in file.readlines():
            if "http" not in link:
                continue
            base_url = strHelper.format(link)
            request = scrapy.Request(base_url,
                                     callback=self.parseCars,
                                     dont_filter=True)
            yield request
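
A minimal sketch of the same idea using context managers, so the input files are closed even if the crawl stops early; strHelper, self.tlts and the input paths are taken from Example #1 above, everything else is illustrative:

    def start_requests(self):
        # Sketch only: same inputs as Example #1, with `with` blocks so the
        # files are closed automatically.
        with open("inputs/rrc_titles.txt", "r") as title_file:
            for t in title_file.readlines():
                self.tlts.append(strHelper.format(t))

        with open("inputs/rrclinks.txt", "r") as link_file:
            for link in link_file.readlines():
                if "http" not in link:
                    continue
                yield scrapy.Request(strHelper.format(link),
                                     callback=self.parseCars,
                                     dont_filter=True)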
Example #2
    def start_requests(self):
        # url = "http://product.auto.163.com/series/config1/3148.html"
        # yield scrapy.Request(url,self.parse)
        titleFile = open("inputs/titles.txt", "r")
        for line in titleFile.readlines():
            self.extractedTitle.append(sh.format(line))
        titleFile.close()

        base_url = "http://product.auto.163.com/series/config1/"
        file = open("F:/ids.txt", "r")
        # i=0
        for line in file.readlines():
            # if i<2:
            #     i+=2
            # else:
            #     break
            id = sh.format(line)
            url = base_url + id + ".html"
            request = scrapy.Request(url, self.parse)
            yield request
        file.close()
Example #3
    def start_requests(self):
        file = open("F:/163links.txt", "r")
        for line in file.readlines():
            # Each line holds "url,comment"; skip lines without the comma.
            parts = line.split(",")
            if len(parts) < 2:
                continue
            url = parts[0]
            comment = parts[1]
            request = scrapy.Request(url,
                                     callback=self.parseOnePage,
                                     dont_filter=True,
                                     meta={"comment": strHelper.format(comment)})
            yield request
        file.close()
Example #4
    def start_requests(self):

        # url = "https://www.autohome.com.cn/use/201711/909485.html#pvareaid=102624"
        # base_url = "http://auto.sina.com.cn/service/?page="

        # yield scrapy.Request(url,self.parseOnePage)
        # base_url = "http://auto.sina.com.cn/j_kandian.d.html?docid=fyremfz2599182"
        file = open("F:/sinalinks.txt", "r")
        for link in file.readlines():
            re = scrapy.Request(strHelper.format(link),
                                callback=self.parseOnePage,
                                dont_filter=True)
            yield re
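
strHelper.format (and sh.format) is used throughout these examples but never shown. Judging from how it is applied to lines read from text files and to strings pulled out of XPath results, it presumably just trims whitespace and newlines; a guessed minimal stand-in, not the actual helper:

class strHelper(object):
    @staticmethod
    def format(value):
        # Assumed behaviour: strip surrounding whitespace/newlines;
        # the real helper may do more (e.g. encoding fixes).
        if value is None:
            return ""
        return value.strip()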
Example #5
    def parseOnePage(self, response):
        pageLink = response.url
        id = response.url.split("/")
        id = id[len(id) - 1].split(".h")[0]
        title = strHelper.format(response.xpath("//h1/text()").extract()[0])
        source = response.xpath(
            "//a[@id='ne_article_source']/text()").extract()[0]
        time = response.xpath(
            "//div[@class='post_time_source']/text()").extract()[0]
        time = time.strip().split(" ")[0]
        comment = response.meta['comment']
        contents = response.xpath("//div[@class='post_text']/p")
        pictures = []
        fulltext = ""
        for content in contents:
            _class = content.xpath("@class").extract()

            # Centered paragraphs carry an image; keep a placeholder in the text.
            if len(_class) > 0 and "center" in _class[0]:
                pic = content.xpath("./img/@src").extract_first()
                pictures.append(pic)
                fulltext += "INSERT_PIC_HERE\n"
            else:
                text = content.xpath(".//text()").extract()
                for t in text:
                    fulltext += t
                fulltext += "\n"

        file = open("F:/163/" + id + ".txt", "w")
        file.write("link: " + pageLink + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write(fulltext + "\n")
        file.write("comment:" + comment + "\n\n")
        file.write("pic links:")
        for pic in pictures:
            # extract_first() can return None when a centered block has no <img>.
            if pic is None:
                continue
            print(pic)
            file.write(pic + "\n")
        file.close()
        print('****************************')


# return allCars
Example #6
    def start_requests(self):

        # url = "https://www.autohome.com.cn/use/201711/909485.html#pvareaid=102624"
        # base_url = "http://auto.sina.com.cn/service/?page="

        # yield scrapy.Request(url,self.parseOnePage)
        # base_url = "http://auto.sina.com.cn/j_kandian.d.html?docid=fyremfz2599182"
        link = 'http://www.pcauto.com.cn/drivers/yangche/point/'
        # re = scrapy.Request(
        #     strHelper.format(link),
        #     callback=self.parseHome,
        #     dont_filter=True
        # )
        # yield re
        # Crawl listing index pages 9 and 10.
        for i in range(9, 11):
            re = scrapy.Request(strHelper.format(link) + "index_" + str(i) +
                                ".html",
                                callback=self.parseHome,
                                dont_filter=True)
            yield re
Example #7
    def parse(self, response):
        print(response.url)
        tt = self.tlts
        dict = {}
        bb = response.xpath(
            "//p[@class='detail-breadcrumb-tagP']/a/text()").extract()
        # Breadcrumb fields: brand, model, name
        dict[tt[0]] = strHelper.numberTrans(bb[2])
        dict[tt[1]] = strHelper.numberTrans(bb[3])
        dict[tt[2]] = strHelper.numberTrans(bb[4])
        price = response.xpath(
            "//p[@class='price detail-title-right-tagP']/text()"
        ).extract_first()
        price = price[1:]  # drop the leading character (currency symbol)
        dict[tt[3]] = price
        bbbbasicAttr = response.xpath(
            "//div[@class='row-fluid-wrapper']//li//strong/text()")
        license_city = response.xpath(
            "//div[@class='row-fluid-wrapper']//li//strong[@id]/@licensed-city"
        ).extract_first()
        # print(license_city)
        basicAttr = []
        for t in bbbbasicAttr:
            t = t.extract()
            basicAttr.append(strHelper.numberTrans(t))
        basicAttr.append(license_city)
        for i in range(0, len(basicAttr)):
            dict[tt[i + 4]] = basicAttr[i]

        extendedAttrs = response.xpath("//div[@id='js-parms-table']//table")
        for table in extendedAttrs:
            # Iterate only this table's rows, skipping the header row.
            trs = table.xpath(".//tr")
            trs = trs[1:]
            for tr in trs:
                tds = tr.xpath(".//td")
                for td in tds:
                    title = td.xpath(
                        "./div[@class='item-name']/text()").extract_first()
                    title = strHelper.format(title)
                    title = title.encode("utf-8")
                    value = td.xpath(
                        "./div[@class='item-value']/text()").extract_first()
                    value = strHelper.format(value)
                    value = value.encode("utf-8")
                    dict[title] = value
        keys = dict.keys()
        for title in tt:
            if title not in keys:
                dict[title] = "无".encode("utf-8")

        rexcel = xlrd.open_workbook("inputs/ershou.xls")
        row_count = rexcel.sheets()[0].nrows
        excel = copy(rexcel)
        sheet = excel.get_sheet(0)

        # for i in range(0,len(tt)):
        #     sheet.write(0, i, tt[i])
        for i in range(0, len(tt)):
            sheet.write(row_count, i, dict[tt[i]])
        # The page URL goes in the column after the last title.
        sheet.write(row_count, len(tt), response.url)
        excel.save("inputs/ershou.xls")

        #
        # titles=[]
        # titles.append("品牌".decode())
        # titles.append("型号".decode())
        # titles.append("名称".decode())
        # titles.append("报价".decode())
        #
        # basicAttrs = response.xpath("//div[@class='row-fluid-wrapper']//p[@class='small-title']/text()").extract()
        # for t in basicAttrs:
        #     titles.append(strHelper.format(t))
        #
        # extendedAttrs = response.xpath("//div[@id='js-parms-table']//table//div[@class='item-name']/text()").extract()
        # for t in extendedAttrs:
        #     titles.append(strHelper.format(t))
        # file = open("inputs/rrc_titles.txt","w")
        # for t in titles:
        #     file.write(t+"\n")
        # file.close()


# return allCars
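
The Excel block in Example #7 opens inputs/ershou.xls with xlrd, counts the existing rows, writes the new record on the next row through a writable copy, and saves the file in place. The copy() helper is presumably xlutils.copy. A standalone sketch of that append pattern, with the path and values as placeholders:

import xlrd
from xlutils.copy import copy  # assumed source of the copy() used above

def append_row(path, values):
    # Open the existing workbook, find the next free row,
    # write the values there and save the workbook in place.
    readable = xlrd.open_workbook(path)
    next_row = readable.sheets()[0].nrows
    writable = copy(readable)
    sheet = writable.get_sheet(0)
    for col, value in enumerate(values):
        sheet.write(next_row, col, value)
    writable.save(path)

# e.g. append_row("inputs/ershou.xls", [brand, model, name, price, url])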
Example #8
    def parse(self, response):
        error = response.xpath("//div[@id='nav_hd']")
        if (error):
            return
        attributes = ['year', 'engine', 'product_id', 'product_name', 'price']
        attributes2 = [
            'year', 'engine', 'product_id', 'product_name', 'price', 'brand',
            'web_id', 'link'
        ]
        basic = [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
        ]
        brand = response.xpath("//a[@class='menu_name']/text()").extract()
        product_brand = brand[1]
        product_name = brand[2]
        attrnames = response.xpath(
            "//div[@class='car_config_param_names']/div")
        allCars = []
        # names = ""
        # for n in attrnames:
        #     names+=" "+n
        # print(names)
        print(len(attrnames))

        #outputs product basic information\
        cars = response.xpath(
            "//div[@class='car_config_param_head']//div[@class='cell']")
        for car in cars:
            for y in car.xpath("./@data-config").extract():
                # data-config is a comma-separated list of quoted values
                # (year, engine, id, name, price); "string-escape" is Python 2 only.
                x = y.decode("string-escape")
                mm = x.split(",")
                pattern = re.compile("'(.*)'")
                item = {}
                for i in range(len(mm)):
                    item[attributes[i]] = pattern.findall(mm[i])[0]
                fullname = ""
                for name in car.xpath(
                        ".//a[@target='_blank']/text()").extract():
                    fullname = fullname + " " + name
                item['product_name'] = fullname
                item['brand'] = product_brand
                item['web_id'] = response.url.split("g1/")[1].split(".")[0]
                item['link'] = response.url
                allCars.append(item)
        carCount = len(allCars)

        dicts = []
        for number in range(0, carCount):
            dddd = {}
            dicts.append(dddd)
        index = 0
        start = False
        basicAttributes = response.xpath(
            "//div[@class='car_config_param_list']/div")
        # print(len(basicAttributes))
        countDown = 0
        curAttrs = {}
        count = 0
        titles = []
        for i in range(0, len(basicAttributes)):
            _attribute = basicAttributes[i]
            _title = attrnames[i]
            _class = _title.xpath("./@class").extract_first()
            if "head" in _class:
                continue
            count += 1
            _titleN = _title.xpath(".//span/@title").extract_first()
            titles.append(_titleN)
            spans = _attribute.xpath(".//span/text()").extract()
            for j in range(0, len(allCars)):
                text = spans[j]
                if "●" in text:
                    text = "标配"
                if "○" in text:
                    text = "选配"
                allCars[j][_titleN] = sh.format(text)
        fulltext = ""
        for i in range(0, len(allCars)):
            for att in attributes2:
                fulltext = fulltext + att + "=" + sh.format(
                    allCars[i][att]) + "$"
            for dd in self.extractedTitle:
                # Look the title up with the same (decoded) key used to store it.
                key = dd.decode('utf-8')
                if key in allCars[i]:
                    fulltext = fulltext + dd + "=" + allCars[i][key] + "$"
                else:
                    fulltext = fulltext + dd + "=--$"
        print(count)

        # file = open("inputs/titles.txt","w")
        # for t in titles:
        #     file.write(t+" = scrapy.Field()\n")
        # file.close()

        file = open("crawled/163.txt", "a")
        file.write(fulltext + "\n")
        file.close()
        # return allCars

        print('===============')
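
Example #8 pulls the per-car fields out of the data-config attribute by splitting on commas and applying the pattern '(.*)' to each piece. If the attribute really is just a list of single-quoted values, one findall over the whole string yields the same fields in order; the attribute names follow the example, the sample string is invented:

import re

attributes = ['year', 'engine', 'product_id', 'product_name', 'price']

# Hypothetical data-config value in the shape the example expects.
raw = "'2018','1.5T','12345','Some trim name','12.98'"

# One pass over the whole attribute instead of splitting on "," first.
values = re.findall(r"'([^']*)'", raw)
item = dict(zip(attributes, values))
print(item['year'], item['price'])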
Example #9
    def parseOnePage(self, response):
        pictures = []
        meta = response.xpath(
            "//meta[@name='comment']/@content").extract_first()
        id = meta.split("_id:")[1]
        channel = meta.split("channel:")[1]
        channel = channel[0:2]

        details = response.xpath("//div[@class='article clearfix']")
        isA = False
        if "article_" in response.url:
            isA = True
        contents = details.xpath(".//p|.//div")
        fulltext = ""
        for con in contents:
            _class = con.xpath("./@class")
            if len(_class) > 0:
                try:
                    # extract_first() may return None; the "http:" + None
                    # concatenation then raises TypeError and the block is skipped.
                    if isA:
                        link = "http:" + con.xpath(
                            "./img/@src").extract_first()
                    else:
                        link = con.xpath("./img/@src").extract_first()
                    pictures.append(link)
                    fulltext += "INSERT_PIC_HERE\n\n"
                except TypeError:
                    pass

            else:
                if not isA:
                    if "docid" in response.url:
                        text = con.xpath("./font/text()").extract()
                    else:
                        text = con.xpath("./text()").extract()
                else:
                    text = con.xpath("./font/text()").extract()
                if len(text) > 0:
                    text = strHelper.format(text[0])
                    fulltext += text + "\n"

        mark = response.xpath("//div[@class='keywords']/a/text()").extract()
        # extract() returns a (possibly empty) list, never None.
        marks = ",".join(mark)

        title = details.xpath("//h1/text()").extract_first()
        title = strHelper.format(title)

        tANDs = response.xpath("//div[@class='date-source']")
        time = tANDs.xpath("./span[@class='date']/text()").extract_first()
        time = strHelper.format(time)
        source = tANDs.xpath("./a/text()").extract_first()
        source = strHelper.format(source)
        print(time)
        print(source)

        pageLink = response.url
        commentlink = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=" \
                          "{channel}" \
                          "&newsid={id}&group=undefined&compress=0&ie=utf-8&oe" \
                          "=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&" \
                          "callback=jsonp_1528781515017&_=1528781515017".format(channel=channel,id=id)
        yield scrapy.Request(commentlink, self.parseComment)
        # if "article" in pageLink:
        #     id = pageLink.split("article_")[1].split(".html")[0]
        #     id = id.split("_")
        #     id = id[0]+"-"+id[1]
        #     basecommentlink = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=mp&newsid="
        #     end= "&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback=jsonp_1528781515017&_=1528781515017"
        #     commentlink = basecommentlink+id+end
        #     yield scrapy.Request(commentlink, self.parseComment)
        #
        # elif "detail" in pageLink:
        #     id = pageLink.split("detail-i")[1].split(".s")[0]
        #     basecommentlink = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=qc&newsid=comos-"
        #     end="&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&jsvar=loader_1528783247454_78178050"
        #     commentlink = basecommentlink+id+end
        #     yield scrapy.Request(commentlink, self.parseComment)
        # else:
        #     id = pageLink.split("docid=")[1]
        # download images
        picCount = 0
        for link in pictures:
            # Create folder for each document
            fileHelper.mkdir(id)
            print(link)
            if link is None:
                continue
            if "http" not in link:
                link = "http:" + link
            urllib.urlretrieve(
                link, "F:/sina/images/" + id + "/" + str(picCount) + ".jpg")
            picCount += 1
        print("Get pic :" + str(picCount + 1))
        #
        file = open("F:/sina/" + id + ".txt", "w")
        file.write("link: " + pageLink + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write("tag:" + marks + "\n\n")
        file.write(fulltext + "\n")
        file.close()
        print('****************************')


# return allCars
Example #10
    def parseOnePage(self, response):
        originurl = response.url
        id = self.getID(originurl)

        nextpage = response.xpath("//div[@class='athm-page__info']")
        if len(nextpage)>0:
            if "all" not in id:
                newID = id+"-all.html"
                yield scrapy.Request("https://www.autohome.com.cn/use/201803/"+newID,self.parseOnePage)
                return

        details = response.xpath("//div[@class='article-details']")

        marks = details.xpath(".//div[@class='marks']/a/text()").extract_first()
        if marks is None:
            marks = ""
        title = details.xpath("//h1/text()").extract_first()
        title = strHelper.format(title)
        author = details.xpath("//a[@class='name']/text()").extract_first()
        author = strHelper.format(author)
        time = details.xpath("//span[@class='time']/text()").extract_first()
        time = strHelper.format(time)
        source = details.xpath("//span[@class='source']/a/text()").extract_first()
        source = strHelper.format(source)
        pageLink = response.url
        commentjsonurl = "https://reply.autohome.com.cn/showreply/ReplyJson.ashx?id="


        commentjsonurl+=id

        yield scrapy.Request(commentjsonurl,self.parseComment)

        content = details.xpath("//div[@class='details']//p")

        # download images
        picCount =0
        for c in content:
            if c.xpath("./@align").extract_first()=="center":
                link = c.xpath("./a")
                href = link.xpath("./@href").extract()
                if len(href)<1:
                    continue
                # print(href)
                if "pay" in href[0]:
                    print("pay")
                    continue
                link = link.xpath("./img/@src").extract_first()
                if link is not None:
                    link = "https:"+link
                    # Create folder for each document
                    fileHelper.mkdir(id)
                    # print(link)
                    urllib.urlretrieve(link, "F:/images/" + id + "/" + str(picCount) + ".jpg")
                    picCount += 1
        print("Get pic :" + str(picCount))

        file = open("txt/"+id + ".txt", "w")
        file.write("link: "+pageLink+"\n\n")
        file.write("title: "+title+"\n\n")
        file.write("author: "+author+"\n\n")
        file.write("time: "+time+"\n\n")
        file.write("source: "+source+"\n\n")
        file.write("tag:" + marks + "\n\n")

        paras = []
        for text in content:
            if text.xpath("./@align").extract_first() =='center':
                paras.append("\n INSERT_PIC_HERE \n")
                continue
            con = text.xpath(".//text()").extract()
            para = ""
            for c in con:
                c = c.replace(u'\xa0',u'')
                para+=c
            paras.append(para)
        paras[len(paras)-1]=""
        for para in paras:
            file.write(para+"\n")
        file.close()
        print('****************************')

# return allCars
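
Examples #9 and #10 save images with Python 2's urllib.urlretrieve plus a fileHelper.mkdir helper. Under Python 3 the same loop might look like the sketch below; the base directory is a placeholder and the None/scheme checks mirror the originals:

import os
from urllib.request import urlretrieve

def download_pictures(pictures, doc_id, base_dir="F:/sina/images"):
    # One folder per document; each picture is saved as <n>.jpg.
    folder = os.path.join(base_dir, doc_id)
    os.makedirs(folder, exist_ok=True)
    count = 0
    for link in pictures:
        if link is None:
            continue
        if "http" not in link:
            link = "http:" + link
        urlretrieve(link, os.path.join(folder, str(count) + ".jpg"))
        count += 1
    return count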
Example #11
    def parseOnePage(self, response):
        page_link = response.url.replace("_all", "")
        base_link = "http://cmt.pcauto.com.cn/action/topic/get_data.jsp?url="
        yield scrapy.Request(base_link + page_link, self.parseComment)

        id = page_link.split("/")
        id = id[len(id) - 1].split(".html")[0]

        title = response.xpath(
            "//h1[@class='artTit']/span/text()").extract_first()
        title = strHelper.format(title)
        source = response.xpath(
            "//span[@class='ownner']/text()").extract_first()
        source = strHelper.format(source)
        author = strHelper.format(
            response.xpath(
                "//span[@class='editor']//a/text()").extract_first())
        time = strHelper.format(
            response.xpath("//span[@class='pubTime']/text()").extract_first())
        mark = response.xpath(
            "//p[@class='moreRead artTag']//a/text()").extract()
        marks = ""
        for m in mark:
            marks += m + ","
        marks = marks[0:len(marks) - 1]
        # print(title)
        # print(source)
        # print(author)

        contents = response.xpath("//div[@class='artText clearfix']")
        contents = contents.xpath(".//p|.//div[@class='cmsArtMainTit']")
        pictures = []
        fulltext = ""
        for block in contents:
            if len(block.xpath("./@class")) > 0:
                # extract_first() can return None for blocks without text.
                heading = block.xpath(".//text()").extract_first()
                if heading is not None:
                    fulltext += heading + "\n"
            elif len(block.xpath("./@style")) > 0:
                fulltext += "INSERT_PIC_HERE\n\n"
                pic = block.xpath(".//img/@src").extract_first()
                pictures.append(pic)
            else:
                text = block.xpath(".//text()").extract()
                for t in text:
                    fulltext += t
                fulltext += "\n"

        picCount = 0
        # for link in pictures:
        # Create folder for each document
        # fileHelper.mkdir(id)
        # print(link)
        # if link is None:
        #     continue
        # urllib.urlretrieve(link, "F:/sina/images/" + id + "/" + str(picCount) + ".webp")
        # picCount += 1
        # print("Get pic :" + str(picCount))
        # print(fulltext)
        file = open("F:/pcauto/" + id + ".txt", "w")
        file.write("link: " + page_link + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write("tag:" + marks + "\n\n")
        file.write(fulltext + "\n")
        file.close()