Example #1
    def parse_page2(self, response):  # data stored
        global writer
        sel = HtmlXPathSelector(response)
        # article = ''.join(sel.xpath('//div[@class="body yom-art-content clearfix"]').extract())
        article = ''.join(sel.xpath('//p/text()').extract())
        subheadline = ''.join(sel.xpath('//h2[@class="subheadline"]/text()').extract())
        str2 = ''.join(sel.xpath('//abbr/text()').extract())

        millis = int(round(time.time() * 1000))  # Get the current time in milliseconds
        ntime = 0.0
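        # The joined <abbr> text is a relative timestamp such as
        # "5 hours 3 minutes ago"; accumulate it as a number of minutes
        # to offset from the current time below.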
        if "hour" in str2[1]:
            str3 = str2[1].split(" ")
            ntime += float(str3[0]) * 60
            if "minute" in str2[1]:
                ntime += float(str3[2])
        elif "minute" in str2[1]:
            str3 = str2[1].split(" ")
            ntime += float(str3[0])
        # article_time = datetime.datetime.fromtimestamp((millis - ntime * 60 * 1000)/1000).strftime('%m-%d-%Y %H:%M:%S.%f')
        articletime = " "

        # Grabs the information from parse function
        title = response.meta['Title']
        linktime = response.meta['LinkTime']
        source = response.meta['Source']
        link = response.meta['Link']

        # Stores everything in a CSV file
        Consumer.writer.writerow([title.encode("utf-8"), subheadline.encode("utf-8"), source.encode("utf-8"), linktime.encode("utf-8"),
                                  articletime.encode("utf-8"), article.encode("utf-8"), link.encode("utf-8")])
Example #2
 def parse_url(self, response):
     sel = HtmlXPathSelector(response)
     for i in range(20):
         # Try the candidate XPaths for the i-th result link, falling back in
         # order; extract() already returns unicode, so no .decode() is needed
         news_url = ''
         for url_xpath in (
                 '//*[@id="result"]/div[%d]/div/h2/a/@href' % (i + 4),
                 '//*[@id="result"]/div[%d]/h2/a/@href' % (i + 4),
                 '//*[@id="result"]/div[%d]/h2/a' % (i + 4)):
             try:
                 news_url = sel.xpath(url_xpath).extract()[0]
                 break
             except IndexError:
                 continue
         print i, news_url
         # Crawl the article body and comments
         if news_url != '':
             url_info = {
                 'response_url': news_url,
                 'topic_id': self.topic_id
             }
             # yield UrlsInfoItem(url_info)
             self.cursor.execute("insert into crawl_url values(null,%s,%s)",
                                 (news_url, self.topic_id))
             yield scrapy.http.Request(news_url, callback=self.parse_news)
             # break
     self.conn.commit()
Example #3
 def parseDetail(self,response):
   item = response.meta['item']
   article = HtmlXPathSelector(response)
   item['title'] = article.xpath('//h1/text()').extract()[0]
   item['content'] = article.xpath('//div[@class="entry-content"]/text()').extract()[0]
   item['createtime'] = article.xpath('//time[@class="entry-date"]/@datetime').extract()[0]
   return item
Example #4
    def parse_images(self, response):
        """
        下载图片
        :param response:
        :return:
        """
        hxs = HtmlXPathSelector(response=response)
        items = hxs.xpath("//div[@id='content-list']/div[@class='item']")
        for item in items:
            # print(item)
            # href = item.xpath(".//div[@class='part1']//a[1]/@href").extract_first()
            # img = item.xpath("//div[@class='news-pic']/img/@original").extract_first()
            img = item.xpath(
                ".//div[@class='part2']/@share-pic").extract_first()
            # print(img)
            # file_name = img.rsplit('//')[1].rsplit('?')[0]
            img_name = img.rsplit('_')[-1]
            file_path = 'images/{0}'.format(img_name)
            # Use the large-file download mode
            item = ScrapyRedisSpidersItem(url=img,
                                          type='file',
                                          file_name=file_path)
            print(img)
            yield item

        # Collect the pagination links once per response, outside the item loop
        pages = hxs.xpath(
            "//div[@id='page-area']//a[@class='ct_pagepa']/@href").extract()
        print(pages)
        for page_url in pages:
            # Build the absolute URL for each page number
            page_url = "http://dig.chouti.com" + page_url
            print(page_url)
            yield Request(url=page_url, callback=self.parse_images)
Example #5
    def get_page_parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = WdspiderItem()
        _ans = hxs.xpath('//div[@class="answer-con"]/text()').extract()
        #print _ans
        if not _ans or len(_ans[0]) < 30:
            print u"Answer text is too short....."
            return None

        title = hxs.xpath('//h3[@id="questionTitle"]/text()').extract()
        '''
        _title = []
        for t in title[0]:
            _title.append(t.encode('utf-8'))
        '''
        item['question'] = title[0]
        item['question_detail'] = ''
        #item['topics'] = ,
        item['answers'] = [{'agree_count': random.randint(5, 25), 'publish_time': time.time(), "comments": {}}]
        item['answers_text'] = _ans[0]
        item['signcc'] = 123123
        item['callback'] = response.url
        #print item
        return item
Example #6
    def parse_pro(self, response):
        theItem = proItem()
        sel = HtmlXPathSelector(response)
        theItem['name'] = ''
        theItem['website'] = ''
        theItem['email'] = ''
        theItem['title'] = ''
        theItem['phone'] = ''
        theItem['office'] = ''
        theItem['picture'] = ''
        content = sel.xpath('//div[@class="node node-page view-mode-full clearfix"]/span/@content')
        name = content.extract()
        if(len(name) > 0):
            theItem['name'] = name[0]
        content = sel.xpath('//div[@class="block block-block first last odd"]/div[@class="content"]')
        picture = content.xpath('./img[@width="160"]/@src').extract()
        if(len(picture) > 0):
            url = picture[0]
            if(url[0]=='/'):
                url = 'https://www.cs.washington.edu' + url
            theItem['picture'] = url
        for p in content.xpath('.//p'):
            msglst = p.xpath('./text()').extract()
            if(len(msglst) == 0):
                continue
            msg = msglst[0]
            if msg.startswith('Off'):
                theItem['office'] = msg[8:]
            if msg.startswith('Ema'):
                theItem['email'] = msg[7:]
            if msg.startswith('Pho'):
                theItem['phone'] = msg[7:]

        yield theItem
Example #7
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "en"
        item["source"] = "mirror"
        category = hxs.xpath(
            "//div[@class='col-md-12']/div[@class='breadcrumb-body clr']/span//text()"
        ).extract()
        date_time = hxs.xpath("//span[@class='modify-date']/text()").extract()
        item["author"] = ""
        title = hxs.xpath(
            "//h1[@class='news-detail-title selectionShareable']/text()"
        ).extract()
        intro = hxs.xpath(
            "//div[@class='news-detail-spot news-detail-spot-margin']/h2/text()"
        ).extract()
        new_content = hxs.xpath("//div[@class='news-box']/p/text()").extract()
        #
        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', ' ', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        category = category[1:-1]
        category = [c for c in category if not c == ">"]
        item["category"] = '|'.join(category)
        item["date_time"] = " ".join(date_time)

        return (item)
Example #8
 def displayLocations(self, response):
     hxs = HtmlXPathSelector(response)
     region = hxs.xpath(
         '//select[@id="display-refine-region"]/option/@value').extract()[1:]
     regionName = hxs.xpath(
         '//select[@id="display-refine-region"]/option/text()').extract()[1:]
     url = 'https://www.hamlan.com.au/wp-admin/admin-ajax.php'
     for n, i in enumerate(region):
         formdata = {
             'action': 'getDisplayLocationResults',
             'selectedRegion': '{}'.format(i)
         }
         headers = {
             'Accept': '*/*',
             'User-Agent':
             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
             'X-Requested-With': 'XMLHttpRequest'
         }
         # yield Request(url, method='POST', callback=self.parseItem, dont_filter=True,headers=headers)
         requests = FormRequest(url=url,
                                formdata=formdata,
                                callback=self.getLinks,
                                dont_filter=True,
                                headers=headers,
                                meta={'data': regionName[n]})
         yield requests
Example #9
    def handle_blog(self, response):
        hxs = HtmlXPathSelector(response)
        item = BuzzCrawlerItem()

        item['url'] = response.url
        item['date'] = dateutil.parser.parse(hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0])
        item['title'] = hxs.xpath(".//h1[@id='headline']/text()").extract()[0].strip()
        item['blurb'] = ""

        unprocessed_content = hxs.xpath(".//span[@itemprop='articleBody']").extract()[0]

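        # Strip noscript/div/h6 blocks (likely embeds and ad markup) before
        # handing the article body to html2text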
        sane_html = remove_tags_with_content(unprocessed_content,("noscript","div","h6"))

        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True

        processed_content = h.handle(sane_html)

        if "noscript" in unprocessed_content:
            print sane_html.encode("iso-8859-15", "replace")
            print "*"*98

        item['content'] = markdown(processed_content)
        item['source'] = 'wired.com'
        yield item
Example #10
    def parse_letter(self,response):
        hxs=HtmlXPathSelector(response)

        next_page_url = hxs.xpath(u"//a[text()='下一页']/@href").extract()

        if len(next_page_url) != 0:
            flag = True
            request = Request(next_page_url[0], callback = self.parse_letter)
            request.meta['item'] = response.meta['item']
            yield request
        else:
            flag = False

        l = response.meta['item']
        letter=''
        letter1=hxs.xpath("//script").re('(?<=doctorjy).*?doctorjy')
        letter = letter + self.parse_letter_detail1(letter1)['letter']
        letter2=hxs.xpath("//table[@class='doctorjy']")
        letter = letter + self.parse_letter_detail2(letter2)['letter']

        l.add_value('comment',letter)

        if not flag:
            yield l.load_item()
Example #11
    def parse_review_page(self, response):
        items = response.meta.get('items', '')
        url = response.meta.get('url', '')
        hxs = HtmlXPathSelector(text=self._extract_html(response))
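        # The review markup comes from Bazaarvoice (hence the BVRR* class
        # names); _extract_html() presumably pulls that embedded HTML out of
        # the raw response before it is parsed here.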
        reviews = hxs.xpath('//div[@class="BVRRReviewDisplayStyle5"]')
        for review in reviews:
            l = ReviewLoader(item=Review(), response=response, date_format='%d/%m/%Y')
            rating = review.select(".//span[contains(@class,'BVRRRatingNumber')]/text()").extract()[0]
            date = review.select(".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()").extract()[0]
            title = review.select(".//span[contains(@class,'BVRRReviewTitle')]/text()").extract()
            review_text = ' '.join(review.select(".//span[contains(@class,'BVRRReviewText')]//text()").extract())

            if title:
                full_text = title[0].strip() + '\n' + review_text.strip()
            else:
                full_text = review_text.strip()

            l.add_value('rating', rating)
            l.add_value('url', url)
            l.add_value('date', datetime.strptime(date, '%d %B %Y').strftime('%d/%m/%Y'))
            l.add_value('full_text', full_text)
            for item in items:
                item['metadata']['reviews'].append(l.load_item())

        next = hxs.xpath('//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref').extract()
        if next:
            yield Request(next[0], callback=self.parse_review_page, meta={'items': items, 'url': url})
        else:
            for item in items:
                yield item
Example #12
    def parse_dir_contents(self, response):
        str1 = response.url.split("/")[3]
        filename = 'output11/' + str1 + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        hxs = HtmlXPathSelector(response)

        #extract the cost for new format
        HDcost1 = hxs.xpath('//*[@class="dv-button-inner"]/text()').extract()
        len1 = len(HDcost1)
        del HDcost1[0]
        for i in range(0, len1 - 1):
            var1 = HDcost1[i]
            var1 = var1.encode('utf-8')
            HDcost1[i] = var1

        #extract the title for new format
        title1 = hxs.xpath('//*[@id="aiv-content-title"]/text()').extract()
        len1 = len(title1)
        for i in range(0, len1):
            var1 = title1[i]
            var1 = var1.encode('utf-8')
            var1 = var1.strip()
            title1[i] = var1
        title1 = filter(None, title1)

        #extract the release year for new format
        relyear = hxs.xpath('//*[@class="release-year"]/text()').extract()
        relyear1 = relyear[0].encode('utf-8')
        relyear1 = relyear1.strip()

        #extract the time for new format
        times = hxs.xpath(
            '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[2]/text()'
        ).extract()
        time1 = times[0].strip()
        time1 = time1.encode('utf-8')

        #extract the director for new format
        dir1 = response.xpath(
            '//*[@id="dv-center-features"]/div[1]/div/table/tr[2]/td/a/text()'
        ).extract()
        dir1 = dir1[0].encode('utf-8')
        dir1 = dir1.strip()

        #extract the starring actors
        actors = hxs.select(
            '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[1]/text()'
        ).extract()
        actors = actors[0].encode('utf-8')
        actors = actors.strip()

        yield DmozItem(
            title=title1,
            time=time1,
            cost=HDcost1,
            year=relyear1,
            director=dir1,
            star=actors,
        )
Example #13
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "sabah"
        category = hxs.xpath(
            "//div[contains(@class,'haber-header')]/header/span[contains(@class,'category')]//text()"
        ).extract()
        date_time = hxs.xpath(
            "//div[contains(@class,'haber-header')]/div[contains(@class,'info')]/time/text()"
        ).extract()
        item["author"] = ""
        title = hxs.xpath(
            "//div[contains(@class,'haber-header')]/header/h1/text()"
        ).extract()
        intro = hxs.xpath(
            "//div[contains(@class,'haber-header')]/header/h2/text()"
        ).extract()
        new_content = hxs.xpath(
            "//div[contains(@class,'content')]/div/p/text()").extract()
        #
        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', ' ', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        item["category"] = '|'.join(category)
        item["date_time"] = " ".join(date_time)

        return (item)
Example #14
    def parse(self,response):
        sel = HtmlXPathSelector(response)
        item = ProductItem()
        #str = sel.xpath("//div[@class='name']/hgroup/h1/text()").extract()

        item['title'] = sel.xpath("//div[@class='name']/hgroup/h1/text()").extract()
        item['description'] = sel.xpath("//div[@class='full']//text()").extract()[0]
        item['details'] = sel.xpath("//div[@id='description']//ul/li//text()").extract()

        item['images'] = sel.xpath("//img[@id='vsImage']/@src").extract()
        item['imagesdata'] = sel.xpath("//ul[@class='pdp-info box split primary']//section[@class='swatches module']/div[@class='swap']//span[@data-alt-image]/@data-alt-image").extract()

        item['prices'] = map(unicode.strip,sel.xpath("//ul[@class='pdp-info box split primary']/li//div[@class='price']/p/text()").extract())[0]
        item['colors'] = sel.xpath("//ul[@class='pdp-info box split primary']//section[@class='swatches module']/div[@class='swap']//h4/text()").extract()
        item['sizes'] = sel.xpath("//ul[@class='pdp-info box split primary']//div[@class=' scroll']//a//span/text()").extract()[1:]


        item['id'] = sel.xpath("//section[@class='product']/@data-id").extract()[0]

        scripts = sel.xpath("//script//text()").extract()
        for i in scripts:
            res = re.findall('\{\"assetId\".*?\"R\"\}',i)
            if res != []:
                item['data'] = res

        yield item
Example #15
 def parse(self, response):
     item=ProvincecrawlItem()
     hxs = HtmlXPathSelector(response)
     item['country'] = hxs.xpath('//tr[@class="o" or @class="e"]/following::td[1]/text()').re('\w.*')
     # .re() returns a list, so build an absolute URL for each matched page
     item['url'] = ['http://www.statoids.com/' + u for u in hxs.xpath('//tr[@class="o" or @class="e"]/following::td[1]/following::a[1]/@href').re('u...html')]
     yield item
Example #16
    def parse_items(self, response):
        begin = time.time()
        logger.info("start to scrawl url:{0}".format(response.url))

        hxs = HtmlXPathSelector(response)
        scripts = hxs.xpath("//script/@src")
        is_existed = False

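        # Download each referenced script and check its body for the
        # GEEKCA_DOMAIN marker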
        for script in scripts:
            script_src = script.extract()

            if "http" not in script_src:
                parsed_uri = urlparse(response.url)
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                script_src = domain + script_src

            r = requests.get(script_src)

            if GEEKCA_DOMAIN in r.content:
                is_existed = True
                break

        if not is_existed:
            titles = hxs.xpath(
                "//script[contains(.,'geekca.cubead.com')]/text()")
            items = []

            if not titles:
                item = UrlItem()
                item["url"] = response.url
                items.append(item)

            logger.info("end to scrawl url:{0} and cost time:{1}".format(
                response.url, (time.time() - begin)))
            return (items)
Example #17
    def parse_story(self, response):
        hxs = HtmlXPathSelector(response)
        loader = ArticleLoader(MeduzaArticlesItem(), hxs)

        date = str('-'.join(response.url.split('/')[4:7]))
        title = hxs.xpath(
            '//div/h1[contains(@class, "RichTitle-root")]/text()').extract()[0]

        p = []
        for par in hxs.xpath('''
            //div[@class="GeneralMaterial-article"]/p//text()
            |//div[@class="GeneralMaterial-article"]/h3//text()
            ''').extract():
            p.append(par)
        text = ' '.join(p)

        loader.add_value('url', str(response.url))
        loader.add_xpath(
            'title', '//div/h1[contains(@class, "RichTitle-root")]/text()')
        loader.add_value('date_published', date)
        loader.add_value('text', text)
        # TODO crawl likes
        loader.add_value('fb_likes', '')
        loader.add_value('vk_likes', '')
        loader.add_value('ok_likes', '')

        return loader.load_item()
Example #18
 def parse(self, response):
     item = HtmlXPathSelector(response)
     # todo: strip html
     # todo: add other fields
     # todo: download articles
     titles = item.xpath('//*[@id="dlpage"]/dl/dd/div/div[contains(@class,"list-title")]/text()').extract()
     papers = item.xpath(
         '//*[@id="dlpage"]/dl/dt/span[contains(@class, "list-identifier")]/a[@title="Download PDF"]/@href').extract()
     print len(titles)
     print len(papers)
     title_list = []
     for t in titles:
         if t != '\n':
             t = t.replace("\n", "")
             print t
             title_list.append(t)
     print len(title_list), len(papers)
     i = 0
     list_map = {}
     for p in papers:
         base = "https://arxiv.org"
         print title_list[i], base + p
         subprocess.call('wget -U "Mozilla" {}.pdf'.format(base + p), shell=True)
         list_map[p.replace("/pdf/", "")] = title_list[i]
         i += 1
     pprint.pprint(list_map)
Example #19
 def other_question(self, response):
     try:
         hxs = HtmlXPathSelector(response)
         item = YahoourlsearcherItem()
         category = hxs.xpath(
             '(//a[contains(@class,"Clr-b")])[2]').extract()
         h = html2text.HTML2Text()
         h.ignore_links = True
         category_text = h.handle(category[0])
         # Check if the question thread is related to programming and design
         # if "程式編寫" and "設計" in str(category_text).strip():
         if (True):
             next_page = hxs.xpath(
                 '//a[contains(@class,"Clr-b") and text()=" Next "]/@href')\
                 .extract()
             composed_string = "https://hk.answers.yahoo.com" + next_page[0]
             item['url'] = str(response.url)
             item['date'] = str("not available")
             print("*** " + str(category_text).strip() + " - " +
                   item['url'] + " ***")
             yield item
             yield scrapy.Request(composed_string,
                                  callback=self.other_question)
     except NoSuchElementException:
         pass
Example #20
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        if hxs.select("//center").extract():
            return
        bili = BiliItem()

        bili['url'] = response.url
        bili['avNo'] = int(re.search(r'\d+', str(response.url)).group())
        bili['title'] = hxs.xpath("//h1/text()").extract()[0]
        bili['time'] = hxs.xpath("//time/i/text()").extract()[0]
        bili['category'] = hxs.xpath('//a[@class="on"]/text()').extract()[0]
        bili['up'] = hxs.xpath('//a[@class="name"]/text()').extract()[0]

        if bili['title']:
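            # Each regex matches a field name plus a few filler characters and
            # the digits after it; the slice offset equals the length of that
            # matched prefix, leaving only the numeric count.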
            bili['comment'] = int(
                re.findall(
                    re.compile(r"acount.{2}\d+"),
                    urllib2.urlopen(
                        "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&oid="
                        + str(bili['avNo'])).read())[0][8:])
            content = urllib2.urlopen(
                "http://interface.bilibili.com/count?key=5cb9d3f30568fd06bb388d13&aid="
                + re.search(r'\d+', str(response.url)).group()).read()
            bili['click'] = int(
                re.findall(re.compile(r"ji.{9}\d+"), content)[0][11:])
            bili['coin'] = int(
                re.findall(re.compile(r"es.{8}\d+"), content)[0][10:])
            bili['sc'] = int(
                re.findall(re.compile(r"stow_count.{9}\d+"), content)[0][19:])
            bili['dm'] = int(
                re.findall(re.compile(r"dm_count.{8}\d+"), content)[0][16:])

        yield bili
Example #21
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath("string(//span[@id='productTitle'])").extract()
        price = hxs.xpath("string(//span[@id='priceblock_dealprice'])").extract()
        stock = hxs.xpath("//div[@id='availability']/span")
        description = hxs.xpath("//div[@id='feature-bullets']/ul/li//span")
        images = hxs.xpath("//img[contains(@class, 'a-dynamic-image') and contains(@class, 'a-stretch-vertical')][@src][1]")

        #print "########################################################################################################################"
        #print images.extract()

        items = []
        item = EbayScraperItem()
#        title = titles.select("text()").extract()
#        price = price.select("text()").extract()
        stock = stock.select("text()").extract()
        #title = title[1:-1]     # Remove quots
        item["title"] = str(titles).strip()
        item["price"] = price
        item["stock"] = stock
        formated_desc = ""
        for desc in description:
            formated_desc += str(desc.select("text()").extract())

        item["description"] = formated_desc
        item["images"] = images.select("@src").extract()
        items.append(item)
        return items
Example #22
    def parse_item(self, response):
        global i, not_data
        i += 1  # count the number of scraped entries
        print(i)
        item = BaikeItem()
        sel = HtmlXPathSelector(response)
        baike_url = str(response.url)
        baike_name = sel.xpath(
            '//div[@id="sec-content0"]/h1/span[@class="lemmaTitleH1"]/text()'
        ).extract()
        baike_desc = sel.xpath(
            '//div[@class="card-summary-content"]/div[@class="para"]/text()'
        ).extract()
        baike_desc = baike_desc[0] if baike_desc else ''

        if not baike_name:
            not_data += 1  # count the entries that could not be scraped
            print(not_data)

        if not baike_desc:
            baike_desc = '未抓取到'

        item['title'] = [n.encode('utf-8') for n in baike_name]
        item['link'] = baike_url.encode('utf-8')
        item['desc'] = baike_desc

        yield item
Example #23
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # Departure cities, grouped alphabetically
        class4 = selector.xpath(
            '//*[@id="gnyallist-al"]/div/div[2]/div[1]/following-sibling::div')
        # Destination provinces
        provinces = selector.xpath(
            '/html/body/div[2]/div[1]/div/div[2]/div[4]/div[2]/dl')

        # departure_cities = []
        # for one in class4:
        #     in_class = one.xpath("dl")
        #     for element in in_class:
        #         info = element.xpath("dd/a/text()").extract()
        #         departure_cities += info
        #
        # with codecs.open("tmp.txt", 'w', encoding='utf-8') as f:
        #     f.write(str(departure_cities))
        #     f.close()

        arrive_cities = []
        for province in provinces:
            info = province.xpath('dd/a/text()').extract()
            arrive_cities += info

        with codecs.open("tmp.txt", 'w+', encoding='utf-8') as f:
            f.write(str(arrive_cities))
Example #24
    def extract_details(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        items = [] # don't pass in items
#        logger.info("XXXXXXX parent URL %s, LOOP: %d, ADDR:%s", response.meta['parent'], response.meta['loop'], item["addr"])
        item["facts"] = hxs.xpath('//ul[@class="zsg-list_square zsg-lg-1-3 zsg-md-1-2 zsg-sm-1-1"]/li/text()').extract()
        zest = hxs.xpath('//div[@class="zest-value"]/text()').extract()
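        # The page lists two zest-value nodes; the code assumes the first is
        # the sale estimate and the second the rent estimate.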
        item["zest_sale"] = zest[0]
        item["zest_rent"] = zest[1]
        school_info = hxs.xpath('//ul[@class="nearby-schools-list"]')
        school_name = school_info.xpath('//a[@class="za-track-event school-name notranslate"]/text()').extract()
        school_rating = school_info.xpath('//*[starts-with(@class, "gs-rating-number")]/text()').extract()
        school_grade = school_info.xpath('//div[@class="nearby-schools-grades"]/text()').extract()
        # Up to three nearby schools, each stored as "name/rating/grades"
        school_len = len(school_name)
        for idx in range(min(school_len, 3)):
            item["school_info%d" % (idx + 1)] = "%s/%s/%s" % (
                school_name[idx], school_rating[idx], school_grade[idx])
        if school_len == 0:
            logger.info("No school info")
        items.append(item)
        return items
Example #25
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "en"
        item["source"] = "wired"
        category = hxs.xpath(
            "//li/span[@itemprop='articleSection']//text()").extract()
        date_time = hxs.xpath(
            "//ul/meta[@itemprop='datePublished']/@content").extract()
        author = hxs.xpath(
            "//ul/li/span[@itemprop='author']//text()").extract()
        title = hxs.xpath("//header/h1[@data-js='postTitle']/text()").extract()
        intro = ""
        new_content = hxs.xpath(
            "//article[@data-js='content']/p//text()").extract()
        #
        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', ' ', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        category = list(set([c for c in category if re.search("\S", c)]))
        item["category"] = '|'.join(category)
        date_time = " ".join(date_time)
        item["author"] = " ".join(author).strip()
        item["date_time"] = date_time.split("+")[0]

        return (item)
Example #26
    def handle_blog(self, response):
        hxs = HtmlXPathSelector(response)
        item = BuzzCrawlerItem()

        item['url'] = response.url
        item['date'] = dateutil.parser.parse(
            hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0])
        item['title'] = hxs.xpath(
            ".//h1[@id='headline']/text()").extract()[0].strip()
        item['blurb'] = ""

        unprocessed_content = hxs.xpath(
            ".//span[@itemprop='articleBody']").extract()[0]

        sane_html = remove_tags_with_content(unprocessed_content,
                                             ("noscript", "div", "h6"))

        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True

        processed_content = h.handle(sane_html)

        if "noscript" in unprocessed_content:
            print sane_html.encode("iso-8859-15", "replace")
            print "*" * 98

        item['content'] = markdown(processed_content)
        item['source'] = 'wired.com'
        yield item
Example #27
	def parse(self, response):

		driver = webdriver.Firefox()
		driver.get("http://www.moneycontrol.com/india/stockpricequote/bankspublicsector/statebankindia/SBI")
		time.sleep(10)
		content = driver.page_source
		i = 0
		converter = html2text.HTML2Text()
		converter.ignore_links = True
		doc = HtmlXPathSelector(response)
		j = 0
		while j < 6:
			driver.refresh()
			for desc in doc.xpath("//div/span[@id='Bse_Prc_tick']").extract():
				i = i + 1
				print ("\n*******************************************************\n")
				print i
				print converter.handle(desc)

			for desc1 in doc.xpath("//div/span[@id='Nse_Prc_tick']").extract():
				i = i + 1
				print ("\n*******************************************************\n")
				print i
				print converter.handle(desc1)

			j = j + 1

		# returns the last BSE price tick that was printed
		return desc
Example #28
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     currurl = hxs.xpath('//link[@rel="alternate"]/@href').extract()
     print currurl
     URL = currurl[0].strip().split("/")[2]
     for url in hxs.xpath('//a/@href').extract():
         print "http://" + DOMAIN + URL + url
Example #29
    def village_parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = VillageItem()
        item['name'] = hxs.xpath('//h1[@id="commtitle"]/a/text()').extract()[0]
        infos = hxs.xpath('//ul[@class="chamber-infolist"]/li')
        for index, link in enumerate(infos):
            if index == 0:
                item['address'] = link.xpath('text()').extract()[0].split(
                    ":")[1]
            elif index == 3:
                item['build_date'] = link.xpath('text()').extract()[0].split(
                    ":")[1]
            elif index == 4:
                item['developer'] = link.xpath('text()').extract()[0].split(
                    ":")[1]
            elif index == 6:
                item['property_company'] = link.xpath(
                    'text()').extract()[0].split(":")[1]

        item['village_id'] = response.url.split("/")[5]
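        # The map thumbnail URL carries a "longitude,latitude" pair as the
        # value of its first query parameter; strip the URL down to that value.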
        location = hxs.xpath('//a[@id="propview_map"]/img/@src').extract()[0]
        location = location.partition('?')[2].split('&')[0].split('=')[1]
        item['longitude'] = location.split(',')[0]
        item['latitude'] = location.split(',')[1]

        return item
Example #30
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.xpath('//div[contains(@class,"row")]')

        for site in sites:
            item = IndeedItem()

            company = site.xpath(
                ".//span[@class='company']//a/text()").extract_first()
            if not company:
                company = site.xpath(
                    ".//span[@class='company']/text()").extract_first()

            item['company'] = company.strip()

            # title
            title = site.xpath(
                './/a[@data-tn-element="jobTitle"]/@title[1]').extract_first()

            item['title'] = title

            # indeed url
            link = site.xpath(
                ".//span[@class='company']//a/@href").extract_first()
            if link:
                item['link'] = 'https://www.indeed.com' + link

            yield item

        # Pagination: schedule the next result pages once per response
        next_to_crawl = hxs.xpath(
            '//span[@class="pn"]/parent::a/@href').extract()
        for i in next_to_crawl:
            url = response.urljoin(i)
            yield Request(url)
Example #31
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        Ac = Acitem()
        Ac['url'] = response.url
        Ac['title'] = hxs.xpath("//h1/text()").extract()[0]
        Ac['time'] = hxs.xpath(
            "/html/body/div/div[7]/div/div[1]/div[1]/div[1]/p/span[2]/text()"
        ).extract()[0]
        Ac['acNo'] = int(re.search(r'\d+', str(response.url)).group())
        Ac['up'] = hxs.xpath(
            "/html/body/div/div[7]/div/div[1]/div[1]/div[1]/div[2]/div[1]/a[1]/text()"
        ).extract()[0]
        Ac['category'] = hxs.xpath(
            "/html/body/div/div[7]/div/div[1]/div[1]/p/a[2]/text()"
        ).extract()[0]
        if Ac['up']:
            content = urllib2.urlopen(
                "http://www.acfun.tv/content_view.aspx?contentId=" +
                re.search(r'\d+', str(response.url)).group()).read()
            contentnumber = re.findall(re.compile(r"\d*"), content)
            Ac['click'] = contentnumber[1]
            Ac['dm'] = contentnumber[9]
            Ac['coin'] = contentnumber[13]
            Ac['sc'] = contentnumber[11]
            Ac['comment'] = contentnumber[3]

        yield Ac
Example #32
  def parse_item(self, response):
    print '************* URL:', response.url
    #sel = Selector(response)
    hxs = HtmlXPathSelector(response)
    
    # Reviews not working seems to dynamically retrieve using javascript
    item = VueCrawlerItem()
 

    item['product_item_num']  = str(response.url)
    tag = hxs.xpath('//meta[@name="keywords"]/@content').extract()
    # print '@@@@@@@@@@@values: ', tag
    productIDs = []
    productsuidlist = hxs.xpath('//ul[@id="product-list"]//li//a/@data-item').extract()
    for productid in productsuidlist:
       if productid.strip():
         productIDs.append(productid)

    res  = hxs.xpath('//div[@class="breadcrumbs"]//li//span[@itemprop="title"]').extract()
    
    crumbs = []
    for index in range(len(res)):
        v = res[index].strip()
        if v != '/':
            crumbs.append(v)

    item['tag'] = crumbs
    item['tag_product_ids'] = productIDs

    return item
Example #33
 def go_go(self, response):
     hxs = HtmlXPathSelector(response)
     price = []
     for i in hxs.xpath(
             "//td[@class='price']/p[@class='new_price' or @class='price_no_discount']/text()"
     ).extract():
         if i != u'\r\n  ':
             price.append(i.strip())
     writer = csv.writer(open('price.csv', 'a'), lineterminator='\n')
     for x, i in enumerate(hxs.xpath("//td[@class='item']")):
         order = str(self.order_id)
         name = i.xpath("h2/text()").extract()[0].strip()
         title = i.xpath(
             "p[@class='description']/text()").extract()[0].strip()
         article = i.xpath(
             "p[@class='article']/text()").extract()[0].strip()
         num = price[x]
         writer.writerow([
             i.encode('utf-8') for i in [order, name, title, article, num]
         ])
     description = self.parse_me(hxs,
                                 "//div[@itemprop='description']/text()")
     features = self.parse_me(hxs, "//div[@id='features']/dl/dd/text()")
     img = self.parse_me(hxs,
                         "//div[@class='atg_store_productImage']/img/@src")
     alls = [str(self.order_id)] + description + features + img
     writer = csv.writer(open('shop.csv', 'a'), lineterminator='\n')
     writer.writerow([i.encode('utf-8') for i in alls])
     self.order_id += 1
Example #34
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "konya"
        category = hxs.xpath(
            "/html/body/div[6]/div[2]/div[1]/div[4]/div/div[2]/div[2]/div/div[1]/div[1]/div/h2"
        ).extract()
        date_time = hxs.xpath("").extract()
        item["author"] = ""
        title = hxs.xpath(
            "/html/body/div[6]/div[2]/div[1]/div[4]/div/div[2]/div[2]/div/div[1]/div[2]/div[1]/div[2]"
        ).extract()
        intro = hxs.xpath("//*[@id='phoneDetails_0']").extract()
        new_content = ""
        #
        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', ' ', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        item["category"] = '|'.join(category)
        item["date_time"] = " ".join(date_time)

        return (item)
Example #35
	def parse(self, response):
	
		open("douban",'wb').write(response.body)	
		self.log("Fetch group home page: %s" % response.url)

		hxs = HtmlXPathSelector(response)
		item = DoubanItem()

		#get group name
		item['groupName'] = hxs.xpath('//h1/text()').re("^\s+(.*)\s+$")[0]

		#get group id 
		item['groupURL'] = response.url
		groupid = self.__get_id_from_group_url(response.url)

		#get group members number
		members_url = "http://www.douban.com/group/%s/members" % groupid
		members_text = hxs.xpath('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)")
		item['totalNumber'] = members_text[0]

		#get relative groups
		item['relativeGroups'] = []
		groups = hxs.select('//div[contains(@class, "group-list-item")]')
		for group in groups:
			url = group.xpath('div[contains(@class, "title")]/a/@href').extract()[0]
			item['relativeGroups'].append(url)
		#item['relativeGroups'] = ','.join(relative_groups)
		return item
Example #36
    def parse(self, response):
        #index = self.find(response.url)
        # list.index() raises ValueError rather than returning -1, so check
        # membership first
        if response.url not in self.start_urls:
            print 'index < 0'
            return
        index = self.start_urls.index(response.url)

        print index

        #filename = response.url.split("/")[-2]
        #open(filename, 'wb').write(response.body)

        hxs = HtmlXPathSelector(response)

        sites = hxs.xpath('//ul/li')
        titles = hxs.xpath('//title/text()')
        urls = hxs.xpath('//div[@class="small_photo_wrap"]/ul/li/a/img/@data-big-url')
        url  = urls.extract()
        self.VDBobj.Update(index, url)
        if (len(url) == 0):
            urls = hxs.xpath('//div[@class="product_feature"]//img/@src')
            url  = urls.extract()
            self.VDBobj.Update(index, url)
        if (len(url) == 0):
            urls = hxs.xpath('//div[@class="hot_recommend"]//img/@src')
            url  = urls.extract()
            self.VDBobj.Update(index, url)
Example #37
    def parse_money(self, response):  # data stored
        global writer
        sel = HtmlXPathSelector(response)
        article = "".join(sel.xpath('//section[@class="article-body"]/p/text()').extract())
        subheadline = "".join(sel.xpath('//h2[@class="article-excerpt"]/a/text()').extract())
        articletime = " "  #''.join(sel.xpath('//time[@datetime]/a/text').extract())

        # Grabs the information from parse function
        title = response.meta["Title"]
        linktime = response.meta["LinkTime"]
        source = response.meta["Source"]
        link = response.meta["Link"]

        # Stores everything in a CSV file
        Money.writer.writerow(
            [
                title.encode("utf-8"),
                subheadline.encode("utf-8"),
                source.encode("utf-8"),
                linktime.encode("utf-8"),
                articletime.encode("utf-8"),
                article.encode("utf-8"),
                link.encode("utf-8"),
            ]
        )
Example #38
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)

        data = imdbItem()
        data["seriesRating"] = hxs.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract()
        seasonLink = hxs.xpath(
            '//div[@id="titleTVSeries"]/div[1]//span[@class="see-more inline"]/a/@href'
        ).extract()

        #Directly go to ratings page
        '''
        if not seasonLink==[]:
            #print data["link"]
            url = data["link"][0]+'epdate'
            request = Request(url,callback=self.parse_episode_ratings)
            request.meta['item'] = data
            yield request
        '''

        #follow season links - can get more data as opposed to the above method
        if seasonLink:
            for season in seasonLink:
                link = 'http://www.imdb.com/' + season
                request = Request(link, callback=self.parse_season_links)
                request.meta['item'] = data
                yield request
Example #39
 def detail(self, response):
     log.msg(response.url)
     hxs = HtmlXPathSelector(response)
     product_name = hxs.xpath(
         '//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
     # //*[@id="vip_content_section"]/div[2]/h1
     if (len(product_name) != 0):
         product_name = hxs.xpath(
             '//*[@id="vip_content_section"]/div[2]/h1/text()').extract()[0]
     product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
     if (len(product_price) != 0):
         product_price = hxs.xpath(
             '//*[@id="price-val"]/text()').extract()[0]
     if product_price and product_name:
         l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
         l.add_xpath('product_name',
                     '//*[@id="vip_content_section"]/div[2]/h1/text()')
         # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
         l.add_xpath('category', '//*[@id="cat_crum"]/@value')
         l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
         item = l.load_item()
         item['product_url'] = response.url
         item['price'] = product_price
         item['vendor'] = 'PepperFry'
         item['city'] = 'Mumbai'
         item['state'] = 'Maharashtra'
         item['country'] = 'India'
         item['date'] = str(time.strftime("%d/%m/%Y"))
         return item
Example #40
    def parse(self, response):
        f = open('Demo.csv', 'a')
        hxs = HtmlXPathSelector(response)
        varDrugname = (
            hxs.xpath('//div[@class="contentBox"]/h1/text()').extract())
        varReviewID = (hxs.xpath(
            '//div[@class="contentBox"]/div/div/div[@class="user-comment"]/p[@class="user-name user-type user-type-2_non_member"]/text()'
        ).extract())
        varReview = (hxs.xpath(
            '//div[@class="contentBox"]/div/div/div[@class="user-comment"]//p[1]//span//text()'
        ).extract())
        # print [x.encode('ascii', 'ignore') for x in varReview]
        # varDrugname[0].replace("User Reviews for ","")

        mydb = MySQLdb.connect(host='localhost',
                               user='******',
                               passwd='welcome',
                               db='drugs_review_drugs.com')
        cursor1 = mydb.cursor()
        name = list()
        drugname = list()
        for j in range(len(varReview)):
            name.append("drugs.com")
            drugname.append(varDrugname[0].replace("User Reviews for ", ""))
        result = zip(name, varReviewID, drugname,
                     [x.encode('ascii', 'ignore') for x in varReview])
        myfile = open('finalDrugReview1.csv', 'a')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in result:
            wr.writerow(row)
Example #41
    def parse(self, response):
        log.msg(response.url)
        urls = []
        hxs = HtmlXPathSelector(response)

        # Look for category count total.
        tot_cat_count_list = hxs.xpath('//input[@id="total_category_content_count"]/@value').extract()

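        # 60 appears to be the per-page item limit: above it, extra numbered
        # "?p=N" pages exist and each one is scheduled for scraping below.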
        if (len(tot_cat_count_list) != 0):
            tot_cat_count = int(tot_cat_count_list[0])
            log.msg("Total Category Count is {}".format(tot_cat_count))
            # log.msg("Total category count for" + response.url + " is " + tot_cat_count)
            if (tot_cat_count > 60):
                page_link = response.url + "?p=1"
                log.msg("Crawl " + page_link)
                yield scrapy.Request(page_link, callback=self.pagescrape)
                self.crawledPageUrls.append(page_link)
                page_nums = hxs.xpath('//*[@class="paginate pjaxer"]/text()').extract()
                for num in page_nums:
                    if num:
                        page_link = response.url + "?p=" + num
                        if page_link not in self.crawledPageUrls:
                            print page_link
                            yield scrapy.Request(page_link, callback=self.pagescrape)
                            self.crawledPageUrls.append(page_link)
            else:
                page_link = response.url + "?p=1"
                log.msg("Crawl " + page_link)
                yield scrapy.Request(page_link, callback=self.pagescrape)
                self.crawledPageUrls.append(page_link)
Example #42
    def parse(self, response):
        log.msg(response.url)
        urls = []
        hxs = HtmlXPathSelector(response)

        # Look for category count total.
        tot_cat_count_list = hxs.xpath(
            '//input[@id="total_category_content_count"]/@value').extract()

        if (len(tot_cat_count_list) != 0):
            tot_cat_count = int(tot_cat_count_list[0])
            log.msg("Total Category Count is {}".format(tot_cat_count))
            # log.msg("Total category count for" + response.url + " is " + tot_cat_count)
            if (tot_cat_count > 60):
                page_link = response.url + "?p=1"
                log.msg("Crawl " + page_link)
                yield scrapy.Request(page_link, callback=self.pagescrape)
                self.crawledPageUrls.append(page_link)
                page_nums = hxs.xpath(
                    '//*[@class="paginate pjaxer"]/text()').extract()
                for num in page_nums:
                    if num:
                        page_link = response.url + "?p=" + num
                        if page_link not in self.crawledPageUrls:
                            print page_link
                            yield scrapy.Request(page_link,
                                                 callback=self.pagescrape)
                            self.crawledPageUrls.append(page_link)
            else:
                page_link = response.url + "?p=1"
                log.msg("Crawl " + page_link)
                yield scrapy.Request(page_link, callback=self.pagescrape)
                self.crawledPageUrls.append(page_link)
Example #43
    def parse_letter(self, response):
        hxs = HtmlXPathSelector(response)

        next_page_url = hxs.xpath(u"//a[text()='下一页']/@href").extract()

        if len(next_page_url) != 0:
            flag = True
            request = Request(next_page_url[0], callback=self.parse_letter)
            request.meta['item'] = response.meta['item']
            yield request
        else:
            flag = False

        l = response.meta['item']
        letter = ''
        letter1 = hxs.xpath("//script").re('(?<=doctorjy).*?doctorjy')
        letter = letter + self.parse_letter_detail1(letter1)['letter']
        letter2 = hxs.xpath("//table[@class='doctorjy']")
        letter = letter + self.parse_letter_detail2(letter2)['letter']

        l.add_value('comment', letter)

        if not flag:
            yield l.load_item()
Example #44
 def detail(self, response):
     log.msg(response.url)
     hxs = HtmlXPathSelector(response)
     product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
     # //*[@id="vip_content_section"]/div[2]/h1
     if (len(product_name) != 0):
         product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()[0]
     product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
     if (len(product_price) != 0):
         product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()[0]
     if product_price and product_name:
         l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
         l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
         # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
         l.add_xpath('category', '//*[@id="cat_crum"]/@value')
         l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
         item = l.load_item()
         item['product_url'] = response.url
         item['price'] = product_price
         item['vendor'] = 'PepperFry'
         item['city'] = 'Mumbai'
         item['state'] = 'Maharashtra'
         item['country'] = 'India'
         item['date'] = str(time.strftime("%d/%m/%Y"))
         return item
Example #45
    def parse_product(self, response):
        product = WikiartProductsItem()

        hxs = HtmlXPathSelector(response)

        product['s_url'] = response._url
        product['product_name'] = hxs.xpath("//div[@class='tt30 pb8']/h1/text()").extract()[0]

        product_info = hxs.xpath("//div[@class='ArtistInfo']")

        image_info = product_info.xpath("//a[@id='paintingImage']/@href").extract()[0]
        product['resource_url'] = image_info

        for data_info in product_info.xpath("//div[@class='DataProfileBox']/p"):
            key = data_info.xpath("b/text()").extract()[0]
            if key == 'Material:':
                value = data_info.xpath("text()").extract()[1]
                value = value.replace("\r\n", '')
                product['material'] = value
            elif key == 'Dimensions:':
                value = data_info.xpath("text()").extract()[1]
                value = value.replace("\r\n", '')
                product['dimensions'] = value

        product['create_by'] = hxs.xpath("//a[@itemprop='author']/text()").extract()[0]
        years = product_info.xpath("//span[@itemprop='dateCreated']/text()").extract()
        if len(years) > 0:
            product['create_at'] = years[0]
        else:
            product['create_at'] = 'Unknown'

        product['product_style'] = product_info.xpath("//span[@itemprop='style']/text()").extract()
        product['product_genre'] = product_info.xpath("//span[@itemprop='genre']/text()").extract()

        return product
Example #46
 def parse(self, response):
         hxs = HtmlXPathSelector(response)
         items=[]
         item=UserdetailsItem()
         idd=response.url
         #Gets the user id info#######################################################################################
         idd=idd.replace("http://www.openprocessing.org/user/","").replace("/","") #strips away info to get the userid  
         item["ids"]=idd
         #Gets the website info#######################################################################################
         webs=hxs.xpath('//div[@id="userDetails"]/a/strong/text()').extract()
         item["website"]=webs
         #Gets the location information##################################################################################
         loc=hxs.xpath('//div[@id="userDetails"]/strong/text()').extract()
         item["location"]=loc
         #Gets the date joined info####################################################################################
         #What we are looking for in the joining data
         #<div id="userDetails"><blah blah>what we want </div>
         joined=hxs.xpath('//div[@id="userDetails"]/text()').extract()
         item["joined"]=joined
         #Gets the name of the person whose page it is which is in the title tag#######################################
         gd1=hxs.xpath('//title/text()').extract()  #Gets the title information for the page
         item["name"]=gd1
         #Gets the membership status data that we are looking for######################################################
         # Example <a href="/membership/" class="hangingBox" style="position:absolute; left: 10px;width: 72px; color:#ff9900; text-align:center; ">Professor+</a>
         gd2=hxs.xpath('//a[@href="/membership/"]/text()').extract()  
         gd2=str(gd2).replace(",","").replace("go","") #Cleans things up
         item["membership"]=gd2
         items.append(item)
         return items # returns the list
Example #47
 def parse_item(self, response):
     hxs = HtmlXPathSelector(response)
     movie = DoubanItem()
     # Film title (note: .extract() is needed to store strings, not selectors)
     movie['title'] = hxs.xpath(
         '//h1/span[@property="v:itemreviewed"]/text()').extract()
     # Director
     movie['director'] = hxs.xpath(
         '//div[@id="info"]/span[1]/span[2]/a/text()').extract()
     # Starring actors
     movie['actor'] = hxs.xpath('//a[@rel="v:starring"]/text()').extract()
     # Genre
     movie['type'] = hxs.xpath(
         '//*[@id="info"]//span[@property="v:genre"]/text()').extract()
     # Country / region
     movie['area'] = hxs.xpath('//*[@id="info"]/text()').extract()
     # Release date
     movie['publishtime'] = hxs.xpath(
         '//span[@property="v:initialReleaseDate"]/text()').extract()
     # Running time
     movie['time'] = hxs.xpath(
         '//*[@id="info"]//span[@property="v:runtime"]/text()').extract()
     # Rating score
     movie['rate_num'] = hxs.xpath(
         '//strong[@property="v:average"]/text()').extract()
     # Number of ratings
     movie['rate'] = hxs.xpath('//div[@class="rating_sum"]/a/span/text()').extract()
     # Synopsis
     movie['introduce'] = hxs.xpath('//*[@id="link-report"]/span/text()').extract()
     yield movie
Example #48
	def parse_link(self, response):
		hxs = HtmlXPathSelector(response)
		# yield{
		# 	'article':hxs.xpath('//div[@class="story-body__inner"]/p/text()').extract(),
		# 	'title':hxs.xpath('//div[@class="story-body"]/h1/text()').extract(),
		# 	'description':hxs.xpath('//div[@class="story-body__inner"]/p/text()').extract_first(),
		# }

		item = ArticleItem()
		parsed_items = []
		item['pubDate'] = hxs.xpath('.//div[@class="date date--v2"]/text()').extract_first()
		item['title'] = hxs.xpath('.//div[@class="story-body"]/h1/text()').extract_first()

		# item['article'] = hxs.xpath('.//div[@class="story-body__inner"]/p/text()').extract()
		article_list = hxs.xpath('.//div[@class="story-body__inner"]/p/text()').extract()
		article = ' '.join(article_list).strip(' \n')
		item['article'] = article
		 # body_list = response.xpath("//" + XpathUtil.xpath_for_class("story-body__inner") + "//p/text()").extract()
   #      body = ' '.join(body_list).strip(' \n')
		item['description'] = hxs.xpath('.//div[@class="story-body__inner"]/p/text()').extract_first()
		item['url'] = response.url	
		item['image_url'] = hxs.xpath('//span[@class="image-and-copyright-container"]/img/@src').extract_first()
		
		parsed_items.append(item)
		return parsed_items
Example #50
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        """
        obj = JabongPageData()
        selector = HtmlXPathSelector(text=response.body)
        str1 = response.body
        pattern1 = '<h2 class="prod-disc" itemprop="description">'
        pattern2 = '</h2>'
        try:
            # str.index() raises ValueError when a pattern is missing, so
            # the lookups belong inside the try block
            index1 = str1.index(pattern1)
            index2 = str1.index(pattern2)
            tempRes = str1[index1 + len(pattern1):index2]
            tempResponse = "<div>" + tempRes + "</div>"
            parser = etree.HTMLParser()
            tree = etree.parse(StringIO(unicode(tempResponse, "utf-8")),
                               parser)
            data = list()
            x = tree.xpath(self.item_fields['desc2_1'])
            for element in x:
                element = element.strip()
                if element:
                    data.append(element)

            x = tree.xpath(self.item_fields['desc2_2'])
            for element in x:
                element = element.strip()
                if element:
                    data.append(element)
            # the third description field is stored as one comma-joined string
            x = tree.xpath(self.item_fields['desc2_3'])
            tdata = ",".join(e.strip() for e in x if e.strip())
            if tdata:
                data.append(tdata)
            obj['desc2'] = json.dumps(data)
        except Exception:
            print 'could not get h2 data'
            obj['desc2'] = ""

        x = selector.xpath(self.item_fields['brand'])
        obj['brand'] = x[0].extract()
        x = selector.xpath(self.item_fields['product-title'])
        obj['productTitle'] = x[0].extract()
        x = selector.xpath(self.item_fields['desc1'])
        data = dict()
        for element in x:
            key = element.xpath('label/text()').extract()[0]
            val = element.xpath('span/text()').extract()[0]
            data[key] = val
        obj['desc1'] = json.dumps(data)
        obj['requestURL'] = unicode(response.request.url, "utf-8")
        yield obj
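The manual index()/slice dance above can usually be replaced by selecting the <h2> node directly. A hedged standalone sketch, assuming the description sits in the quoted <h2> (the inner markup is made up):

from scrapy.selector import Selector

html = '<h2 class="prod-disc" itemprop="description">Soft cotton tee<br>Machine wash</h2>'
parts = Selector(text=html).xpath(
    '//h2[@class="prod-disc" and @itemprop="description"]//text()').extract()
print([p.strip() for p in parts if p.strip()])  # -> ['Soft cotton tee', 'Machine wash']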
Example #51
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     item = PhilomathItem()
     item['url'] = response.url
     item['title'] = hxs.xpath('//title/text()').extract()
     item['body'] = ' '.join(filter(bool, map(unicode.strip, hxs.xpath('//body//text()').extract())))
     item['date'] = datetime.datetime.now().strftime("%m-%d-%Y %H:%M:%S")
     yield item
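The strip/filter/join pipeline above can also be expressed with XPath's normalize-space(), which collapses all whitespace in a node's string value. A standalone sketch with stand-in HTML:

from scrapy.selector import Selector

html = '<body> <p>Hello\n   world</p> </body>'
print(Selector(text=html).xpath('normalize-space(//body)').extract_first())
# -> Hello world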
Example #52
    def parse(self, response):
        hxs = HtmlXPathSelector(text=response.body)
        xpath = '//h1/a/@href'
        print hxs.xpath(xpath).extract()
Example #53
 def parse(self, response):
     log.msg("Parsing content from url " + response.url)
     hxs = HtmlXPathSelector(response)
     subcat_links = hxs.xpath('//*[@id="url"]/@href')
     subcat_names = hxs.xpath('//*[@id="url"]/text()')
     for link, name in zip(subcat_links, subcat_names):
         print link.extract(), name.extract()
         print "Run Crawler for category " + name.extract()
         yield scrapy.Request(link.extract(), callback=self.detail_scrape)
Example #54
 def parser(self,response):
    hxs      =  HtmlXPathSelector(response)
    title    =  hxs.xpath("//div[@class='main-header']/h1[@itemprop='name']/text()").extract()
    art      =  hxs.xpath("//div[@class='product__control']/span[not(@class='product__control-name')]/text()").extract()
    article  =  art[0].encode('utf-8')
    code     =  art[1].encode('utf-8')
    brand    =  hxs.xpath("//div[@class='product__control']/a/text()").extract()
    summary  =  hxs.xpath("//div[contains(@class,'showhide') and contains(@class ,'item_desc')]/p/text()").extract()[0]
    print summary
Example #55
	def category_parse(self, response):
		'''
		parse category info
		'''
		if type(response) == s_response.html.HtmlResponse:
			hxs = HtmlXPathSelector(response)

			# get the sku list (the category page uses one of two layouts)
			sku_list = []
			if hxs.xpath("//ul[@class='list-h']/li"):
				sku_list = [p.xpath('./@sku').extract()[0] for p \
					in hxs.xpath("//ul[@class='list-h']/li")]
			if hxs.xpath("//div[@id='plist']/div"):
				sku_list = [p.xpath('./@sku').extract()[0] for p \
					in hxs.xpath("//div[@id='plist']/div")]
			for sku in sku_list:
				yield Request('/'.join(['http://item.jd.com', sku + '.html']),
					callback=JD_Product_Spider().parse)

			next_page_link = hxs.xpath("//div[@class='pagin fr']/a[@class='next']/@href")
			if next_page_link:
				if next_page_link.extract()[0].startswith("?"):
					yield Request('http://list.jd.com/list.html' + next_page_link.extract()[0],
						callback=self.category_parse)
				elif next_page_link.extract()[0].startswith("http"):
					yield Request(next_page_link.extract()[0], callback=self.category_parse)
		else:
			# fall back to BeautifulSoup when the response was not parsed as HTML
			dom = BeautifulSoup(response.body)

			p_list = []
			if dom.find('div', id='plist').find_all('li'):
				p_list = dom.find('div', id='plist').find_all('li')
			if dom.find('div', id='plist').find_all('div'):
				p_list = dom.find('div', id='plist').find_all('div')

			sku_list = [p.get('sku', '') for p in p_list]

			for sku in sku_list:
				yield Request('/'.join(['http://item.jd.com', sku + '.html']),
					callback=JD_Product_Spider().parse)

			next_page_a = dom.find('div',
				class_='pagin fr').find('a', class_='next')
			if next_page_a:
				next_page_link = next_page_a.get('href')
				if next_page_link.startswith("?"):
					# next_page_link is already a plain string here, so no .extract()
					yield Request('http://list.jd.com/list.html' + next_page_link,
						callback=self.category_parse)
				elif next_page_link.startswith("http"):
					yield Request(next_page_link, callback=self.category_parse)
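Both pagination branches above reimplement relative-URL resolution by hand; the standard library's urljoin covers the "?page=" and absolute-URL cases in one call (newer Scrapy responses also offer response.urljoin). A standalone sketch:

try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

base = 'http://list.jd.com/list.html'
print(urljoin(base, '?page=2'))   # -> http://list.jd.com/list.html?page=2
print(urljoin(base, 'http://list.jd.com/list.html?page=3'))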
def getReviewText(product_id, n=-1, filename=""):
    isWriting = False

    if filename != "":
        f = open(filename, "w")
        isWriting = True

    url = "http://shopping.naver.com/detail/section_user_review.nhn?nv_mid=" + str(product_id) + "&page="
    urlForMaxPage = "http://shopping.naver.com/detail/detail.nhn?nv_mid=" + str(product_id)

    # to get maxPage
    d = requests.get(urlForMaxPage).text
    hxs = HtmlXPathSelector(text=d)
    total = int("".join(re.findall(r"\d+", hxs.xpath(".//*[@class='count']/text()").extract()[0])))
    print "total review num = %d" % total

    maxPage = 0
    if n == -1 or n > total:
        n = total

    if total % 20 != 0:
        maxPage = total / 20 + 1
    else:
        maxPage = total / 20

    print "Crawling ... (to %d)" % n
    # fetch the reviews page by page
    reviews = []
    count = 0
    for i in range(1, maxPage + 1):
        url_page = url + str(i)

        d = rf.remove_Tag(requests.get(url_page).text)
        hxs = HtmlXPathSelector(text=d)
        for each in hxs.xpath(".//*[@class='atc']/text()"):
            text = each.extract().strip()
            if len(text) > 0:
                review = rf.remove_whitespace(text.encode("utf-8"))
                if isWriting == True:
                    f.write(review + "\n")

                reviews.append(review)
                count += 1
            if count >= n:
                break

        if count >= n:
            break

    if isWriting == True:
        f.close()

    print "Complete product review crawling. (crawlingCount : %d / total : %d)" % (count, total)

    return reviews
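The total % 20 branch above is just a ceiling division over the 20-reviews-per-page layout; an equivalent one-liner:

total, page_size = 47, 20
max_page = (total + page_size - 1) // page_size  # ceiling division
print(max_page)  # -> 3 (pages 1..3 cover 47 reviews)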
    def parse_dir_contents(self, response):
        str1 = response.url.split("/")[3]
        filename = 'output11/' + str1 + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        hxs = HtmlXPathSelector(response)

        # extract the cost (the first button is not a price, so drop it)
        HDcost1 = hxs.xpath('//*[@class="dv-button-inner"]/text()').extract()
        len1 = len(HDcost1)
        del HDcost1[0]
        for i in range(0, len1 - 1):
            HDcost1[i] = HDcost1[i].encode('utf-8')

        # extract the title
        title1 = hxs.xpath('//*[@id="aiv-content-title"]/text()').extract()
        for i in range(0, len(title1)):
            title1[i] = title1[i].encode('utf-8').strip()
        title1 = filter(None, title1)

        # extract the release year
        relyear = hxs.xpath('//*[@class="release-year"]/text()').extract()
        relyear1 = relyear[0].encode('utf-8').strip()

        # extract the runtime
        times = hxs.xpath('//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[2]/text()').extract()
        time1 = times[0].strip().encode('utf-8')

        # extract the director (the raw markup has no tbody, which is why a
        # tbody-based XPath on this table does not match)
        dir1 = response.xpath('//*[@id="dv-center-features"]/div[1]/div/table/tr[2]/td/a/text()').extract()
        dir1 = dir1[0].encode('utf-8').strip()

        # extract the starring actors
        actors = hxs.xpath('//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[1]/text()').extract()
        actors = actors[0].encode('utf-8').strip()

        yield DmozItem(title=title1, time=time1, cost=HDcost1, year=relyear1, director=dir1, star=actors)
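The per-index encode/strip loops above can collapse to a single comprehension; a standalone sketch with made-up sample strings:

titles = [u'  Movie A \n', u'', u'Movie B']
cleaned = [t.strip().encode('utf-8') for t in titles if t.strip()]
print(cleaned)  # drops empties and whitespace in one pass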
Example #58
 def parser(self, response):
    hxs = HtmlXPathSelector(response)
    vacancy  = hxs.xpath("//div[@class='b-vacancy-custom g-round']/h1[@class='title b-vacancy-title']/text()").extract()
    company  = hxs.xpath("//div[@class='companyname']/a/text()").extract()
    price    = hxs.xpath("//div[@class='l-paddings']/text()").extract()[3]
    city     = hxs.xpath("//div[@class='l-paddings']/text()").extract()[4]
    exp      = hxs.xpath("//div[@class='l-paddings']/text()").extract()[5]
    vacancy1 = vacancy[0].encode('utf-8')
    company1 = company[0].encode('utf-8')
    writer   = csv.writer(open('price.csv', 'a'), lineterminator='\n')
    writer.writerow([vacancy1, company1, price.encode('utf-8'), city.encode('utf-8'), exp.encode('utf-8')])
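Opening the CSV in a with block guarantees the file is closed even if a row fails to write; a minimal sketch reusing the snippet's price.csv name (the header row is illustrative):

import csv

with open('price.csv', 'a') as fh:
    writer = csv.writer(fh, lineterminator='\n')
    writer.writerow(['vacancy', 'company', 'price', 'city', 'exp'])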
Example #59
 def parse_item(self, response):
     self.log('Hi, this is an item page! %s' % response.url)
     hxs = HtmlXPathSelector(response)
     i = BoleItem()
     i['title'] = hxs.xpath('//div[@class="article"]/h2/text()').extract()
     i['info_class'] = hxs.xpath('//div[@class="article_info"]/div[@class="textl"]/a[2]/text()').extract()
     i['info_area'] = hxs.xpath('//div[@class="article_info"]/div[@class="textl"]/a[3]/text()').extract()
     i['pub_date'] = hxs.xpath('//div[@class="article_info"]/div[@class="textr"]/text()').extract()
     
     i['content'] = hxs.xpath('//div[@class="article"]/div[@class="context"]/p').extract()
     return i
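The content field above keeps the raw <p> markup; if only the text is wanted, descending with //text() strips the tags. A standalone sketch with made-up markup:

from scrapy.selector import Selector

html = '<div class="context"><p>First <b>bold</b> part.</p><p>Second.</p></div>'
texts = Selector(text=html).xpath('//div[@class="context"]/p//text()').extract()
print(' '.join(t.strip() for t in texts if t.strip()))  # -> First bold part. Second.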
Example #60
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     names = hxs.xpath('//td[@class="product_name"]/strong/text()')
     imageurls = hxs.xpath('//tr/td[@align="center"]/a/img/@src')
     for name, url in zip(names, imageurls):
         # build a fresh item per product so earlier yields are not mutated
         item = CraigslistSampleItem()
         item["productname"] = name.extract()
         item["imgurl"] = url.extract()
         yield item
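One caveat with the zip() pairing above: it silently stops at the shorter sequence, so any product without a matching image (or vice versa) is dropped:

names = ['a', 'b', 'c']
urls = ['u1', 'u2']
print(list(zip(names, urls)))  # -> [('a', 'u1'), ('b', 'u2')] -- 'c' is lost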