Exemplo n.º 1
0
    def parse(self, response):
        """Scrape every question summary on a listing page and store it.

        Each ``div`` directly under the ``#questions`` element becomes a
        ``StackoverflowItem``. The item is written through
        ``self.collection.save`` and then yielded to the caller.

        Args:
            response: Page response; only its ``xpath`` method is used.

        Yields:
            The populated ``StackoverflowItem`` for each question row.
        """
        question_list = response.xpath('//*[@id="questions"]')

        for question in question_list.xpath('./div'):
            item = StackoverflowItem()
            item['_id'] = question.attrib['id']
            item['questions'] = question.xpath('div[2]/h3/a/text()').extract()
            item['votes'] = question.xpath(
                'div[1]/div[1]/div[1]/div[1]/span/strong/text()').extract()
            item['answers'] = question.xpath(
                'div[1]/div[1]/div[2]/strong/text()').extract()
            item['views'] = question.xpath('div[1]/div[2]/@title').extract()
            item['links'] = question.xpath('div[2]/h3/a/@href').extract()
            item['questionsbody'] = question.xpath(
                'div[2]/div[1]/text()').extract()
            item['questionstag'] = question.xpath(
                'div[2]/div[2]/@class').extract()
            item['questionstime'] = question.xpath(
                'div[2]/div[3]/div[1]/div[1]/span/@title').extract()

            # Stop processing the page when there is no connection to write to
            # (or the item is falsy), exactly as before.
            if not self.connection or not item:
                break
            self.collection.save(item)
            # Yield the item itself: a bare ``yield`` would hand ``None`` to
            # the caller and the scraped data would never leave this method.
            yield item
Exemplo n.º 2
0
    def parse(self, response):
        """Yield one item per question summary, then print the pager links.

        Args:
            response: Page response; its ``xpath``, ``urljoin`` and ``url``
                members are used.

        Yields:
            A ``StackoverflowItem`` with ``title``, ``votes``, ``answers``,
            ``link`` and ``desc`` for every div under ``#questions``.
        """
        # The URL is passed as a logging argument, so the message needs a
        # placeholder; without one the logging module reports a format error.
        self.logger.info('hhhhhhhhhhhh %s', response.url)
        ques = response.xpath('//*[@id="questions"]/div')
        for q in ques:
            item = StackoverflowItem()
            item['title'] = q.xpath('div[2]/h3/a/text()').extract()
            item['votes'] = q.xpath(
                'div[1]/div[2]/div[1]/div[1]/span[1]/strong[1]/text()'
            ).extract()
            item['answers'] = q.xpath(
                'div[1]/div[2]/div[2]/strong[1]/text()').extract()
            item['link'] = response.urljoin(
                q.xpath('div[2]/h3/a/@href').extract()[0])
            item['desc'] = q.xpath('div[2]/div/text()').extract()
            yield item

        page = response.xpath('//*[@id="mainbar"]/div[4]/a')
        for p in page:
            pageTips = p.xpath('span/text()').extract()
            # A pager anchor without a <span> label has nothing to report;
            # skip it instead of failing on pageTips[0].
            if not pageTips:
                continue
            pageTipsStrip = pageTips[0].strip()
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(pageTipsStrip)

            if 'next' == pageTipsStrip:
                print('next page ...............')
                nextPageUrl = p.xpath('@href').extract()[0]
                nextPageUrlFull = response.urljoin(nextPageUrl)
                # NOTE(review): the next-page URL is only printed, never
                # requested - confirm whether pagination should be followed.
                print(nextPageUrlFull)
Exemplo n.º 3
0
def parse_question_list_page(url):
    """Download a question listing page and summarise its first ten entries.

    Args:
        url: Address of the listing page, fetched with ``requests.get``.

    Returns:
        A list of ``StackoverflowItem`` objects, one per ``.question-summary``
        element (at most ten). The list is empty when the HTTP status is not
        200.
    """
    site = requests.get(url)
    if site.status_code != 200:
        print('failed parsing url!')
        return []

    soup = BeautifulSoup(site.content, 'html.parser')
    collected = []
    for summary in soup.select('.question-summary')[:10]:
        entry = StackoverflowItem()

        excerpt = summary.select('.excerpt')[0].get_text()
        print(f'excerpt: {excerpt}')

        tags = [anchor.get_text() for anchor in summary.select('.tags a')]
        print(f'tags:{tags}')

        # The href is joined onto the site root to form the detail-page URL.
        href = summary.select('.question-hyperlink')[0].get('href')
        entry['links'] = urllib.parse.urljoin('https://stackoverflow.com', href)

        # The first four characters of the link text are dropped.
        title_text = summary.select('.question-hyperlink')[0].get_text()
        entry['question_title'] = title_text[4:]
        entry['question_excerpt'] = excerpt
        entry['tags'] = tags

        started = summary.find(class_='started fr')
        entry['asked_time'] = started.select('.relativetime')[0].get_text()

        votes_box = summary.select('.votes')[0]
        entry['votes'] = votes_box.find(class_='vote-count-post').get_text()

        collected.append(entry)
        print(f'q_item: {entry["question_title"]}')
    return collected
    def parse(self, response):
        """Yield one item per question summary, then follow the next page.

        A field whose text cannot be extracted is reported on stdout and left
        unset, so one incomplete summary does not abort the whole page.

        Args:
            response: Page response; its ``css``, ``xpath``, ``urljoin`` and
                ``url`` members are used.

        Yields:
            A ``StackoverflowItem`` for every ``div.question-summary`` element
            and, when a ``rel="next"`` link is present, a ``Request`` for that
            page with this method as callback.
        """
        print('PROCESSING...' + response.url)

        for question in response.css('div.question-summary'):
            item = StackoverflowItem()

            # extract_first() gives None when nothing matches, so the chained
            # .strip() raises AttributeError; only that failure is handled
            # instead of swallowing every exception with a bare except.
            try:
                item['TITLE'] = question.css(
                    'a.question-hyperlink::text').extract_first().strip()
            except AttributeError:
                print('ERROR TITLE PARSE...' + response.url)

            try:
                item['LIEN_QUESTION'] = response.urljoin(question.css(
                    'a.question-hyperlink::attr(href)').extract_first().strip())
            except AttributeError:
                print('ERROR LIEN QUESTION PARSE...' + response.url)

            # No .strip() here: a missing node simply stores None.
            item['NBR_VOTES'] = question.css(
                'span.vote-count-post > strong::text').extract_first()

            try:
                item['NBR_VIEWS'] = question.css(
                    'div.views::text').extract_first().strip()
            except AttributeError:
                print('ERROR NBR_VIEWS PARSE...' + response.url)

            item['NBR_REPONSES'] = question.css(
                'div.status > strong::text').extract_first()

            # Each selector is evaluated once; an empty string is replaced by
            # a single space, as before.
            try:
                author = question.css(
                    'div.user-details > a::text').extract_first().strip()
                item['AUTHOR'] = author or ' '
            except AttributeError:
                print('ERROR AUTHOR PARSE...' + response.url)

            try:
                author_image = question.css(
                    'div.gravatar-wrapper-32 > img::attr(src)'
                ).extract_first().strip()
                item['AUTHOR_IMAGE'] = author_image or ' '
            except AttributeError:
                print('ERROR AUTHOR IMAGE PARSE...' + response.url)

            try:
                item['PUBLICATION_DATE'] = question.css(
                    'span.relativetime::text').extract_first().strip()
            except AttributeError:
                print('ERROR PUBLICATION DATE PARSE...' + response.url)

            # The question description (div.excerpt::text) is deliberately not
            # collected; it would also need a DESCRIPTION field on the item.

            item['TAGS'] = question.css('div.tags > a::text').extract()

            yield item

        # On the last page there is no rel="next" link. Joining None would
        # give back the current URL and re-request the same page, so only
        # follow a link that actually exists.
        relative_next_url = response.xpath(
            '//a[@rel="next"]/@href').extract_first()
        if relative_next_url:
            absolute_next_url = response.urljoin(relative_next_url)
            yield Request(absolute_next_url, callback=self.parse)
Exemplo n.º 5
0
 def parse_question(self, response):
     """Build an item holding the question header and the answerers' names.

     Args:
         response: Page response; only its ``xpath`` method is used.

     Returns:
         A ``StackoverflowItem`` whose ``header`` and ``solvers`` fields are
         the lists of text nodes matched by the two XPath queries.
     """
     result = StackoverflowItem()

     header_nodes = response.xpath('//div[@id="question-header"]//a/text()')
     result['header'] = header_nodes.extract()

     solver_nodes = response.xpath(
         '//div[@id="answers"]//div[@class="user-details"]/a/text()')
     result['solvers'] = solver_nodes.extract()

     return result
Exemplo n.º 6
0
 def parse(self, response):
     """Yield an item with title, link and view text per question summary.

     Args:
         response: Page response; only its ``xpath`` method is used.

     Yields:
         A ``StackoverflowItem`` for each ``div`` whose class attribute is
         exactly ``question-summary``. Each field takes the first match of
         its XPath, so a summary without a match raises ``IndexError``.
     """
     # Field name -> XPath relative to one summary node, in assignment order.
     field_paths = (
         ('title', 'div[@class="summary"]/h3/a/text()'),
         ('link', 'div[@class="summary"]/h3/a/@href'),
         ('view', 'div[@class="statscontainer"]/div[@class="views "]/text()'),
     )
     for summary in response.xpath('//div[@class="question-summary"]'):
         record = StackoverflowItem()
         for field, path in field_paths:
             record[field] = summary.xpath(path).extract()[0]
         yield record
Exemplo n.º 7
0
 def parse_question(self, response):
     """Yield one item describing the question shown on a detail page.

     Args:
         response: Page response; its ``css`` method and ``url`` attribute
             are used.

     Yields:
         A single ``StackoverflowItem`` with ``title``, ``votes``, ``body``,
         ``tags`` and ``link``. The first three take the first CSS match and
         raise ``IndexError`` when there is none.
     """
     question = StackoverflowItem()

     titles = response.css('h1 a::text').extract()
     question['title'] = titles[0]

     vote_counts = response.css('.question .vote-count-post::text').extract()
     question['votes'] = vote_counts[0]

     bodies = response.css('.question .post-text').extract()
     question['body'] = bodies[0]

     question['tags'] = response.css('.question .post-tag::text').extract()
     question['link'] = response.url
     yield question
Exemplo n.º 8
0
 def parse_nextlevel(self, response):
     """Yield an item holding the code snippets of the accepted answer.

     Args:
         response: Page response; only its ``xpath`` method is used.

     Yields:
         A single ``StackoverflowItem`` whose ``code_accepted`` field is the
         list of ``<code>`` text nodes found under the accepted-answer div.
     """
     # The 'items' list carried in response.meta was read but never used, so
     # the lookup (and its KeyError when the key is absent) has been removed.
     item = StackoverflowItem()
     item['code_accepted'] = response.xpath(
         '//div[@class="answer accepted-answer"]/*//code/text()').extract()
     # Single-argument print(...) gives the same output on Python 2 and is
     # also valid syntax on Python 3.
     print(item['code_accepted'])
     print('########################################')
     yield item  # hand this page's item to the item pipeline
Exemplo n.º 9
0
 def parse_item(self, response):
     """Load title, question text and answer text through an ItemLoader.

     Args:
         response: Page response handed to the ``ItemLoader``.

     Yields:
         The item produced by ``load_item()``. The loader's default output
         processor is ``Join('')``; newlines are removed from each question
         and answer text node before joining.
     """
     def drop_newlines(text):
         # Remove every newline character from a single extracted text node.
         return text.replace('\n', "")

     loader = ItemLoader(item=StackoverflowItem(), response=response)
     loader.default_output_processor = Join('')

     loader.add_xpath('title', '//div[@id="question-header"]/h1/a/text()')

     question_xpath = '//div[contains(@class,"postcell")]/div//p/text()'
     loader.add_xpath('question', question_xpath, MapCompose(drop_newlines))

     answers_xpath = '//div[contains(@class,"answercell")]/div//p/text()'
     loader.add_xpath('answers', answers_xpath, MapCompose(drop_newlines))

     yield loader.load_item()
Exemplo n.º 10
0
 def parse_question(self, response):
     """Collect the data of a question detail page.

     Args:
         response: Page response; its ``url``, ``xpath`` and ``css`` members
             are used.

     Yields:
         A single ``StackoverflowItem`` with ``link``, ``title``, ``votes``,
         ``body`` and ``tags``.
     """
     self.logger.debug('Already into Pipeline!')

     title_nodes = response.xpath('//*[@id="question-header"]/h1/a/text()')
     vote_nodes = response.xpath(
         '//*[@id="question"]/div/div[1]/div/div/text()')
     # Only elements inside .post-text whose class contains "prettyprint".
     code_nodes = response.css('.post-text').xpath(
         './/*[contains(@class, "prettyprint")]')
     tag_nodes = response.css('.question .post-tag::text')

     detail = StackoverflowItem()
     detail['link'] = response.url
     detail['title'] = title_nodes.extract_first()
     detail['votes'] = vote_nodes.extract_first()
     detail['body'] = code_nodes.extract()
     detail['tags'] = tag_nodes.extract()
     yield detail
Exemplo n.º 11
0
    def parse(self, response):
        """Yield one item per entry of the feed contained in the response.

        Args:
            response: Feed response; its ``body`` is passed to
                ``self.parse_feed``.

        Yields:
            A fresh ``StackoverflowItem`` for every element of
            ``feed.entries``.
        """
        feed = self.parse_feed(response.body)

        # This method is a generator, so a returned value is never seen by the
        # caller (and ``return False`` is a syntax error on Python 2); a plain
        # return ends the iteration when there is nothing to emit.
        if not feed.entries:
            return

        for entry in feed.entries:
            # Build a new item for each entry. Re-using one instance would
            # make every yielded reference point at the same object, which is
            # overwritten by the next entry.
            item = StackoverflowItem()
            item['author'] = entry.author
            item['question'] = entry.title
            # NOTE(review): 'rank' is filled from entry.author, the same value
            # as 'author' - looks like a copy-paste slip; confirm the source.
            item['rank'] = entry.author
            item['link'] = entry.link
            item['publish'] = entry.published
            item['summary'] = entry.summary
            yield item
Exemplo n.º 12
0
def fetch_10_newest():
    """Fetch the newest matching questions from a fixed search URL.

    Returns:
        A list of ``StackoverflowItem`` objects (at most ten), each carrying
        ``question_title`` and ``asked_time``. The list is empty when the HTTP
        status is not 200.
    """
    base_url = 'https://stackoverflow.com/search?tab=newest&q=[android]+duplicate:no+created:7d..'
    site = requests.get(base_url)
    if site.status_code != 200:
        print('failed fetching 10 newest questions on Android')
        return []

    soup = BeautifulSoup(site.content, 'html.parser')
    newest = []
    for summary in soup.select('.question-summary')[:10]:
        entry = StackoverflowItem()

        title_link = summary.select('.question-hyperlink')[0]
        entry['question_title'] = title_link.get_text()

        started = summary.find(class_='started fr')
        entry['asked_time'] = started.select('.relativetime')[0].get_text()

        newest.append(entry)
        print(f'q_item: {entry["question_title"]}')
    return newest
Exemplo n.º 13
0
    def parse(self, response):
        """Request the detail page of every question linked on a search page.

        Args:
            response: Search page response; only its ``xpath`` method is used.

        Yields:
            A ``scrapy.Request`` per ``a.question-hyperlink`` href, with
            ``self.parse_nextlevel`` as callback and a shared ``items`` list
            stored in ``request.meta['items']``.
        """
        link_base = 'https://stackoverflow.com'
        link_addon_list = response.xpath(
            '//a[@class="question-hyperlink"]/@href').extract()

        items = []  # shared list handed to every follow-up request via meta

        for link_addon in link_addon_list:
            # Keep the URL as text. Encoding it to bytes makes scrapy.Request
            # raise TypeError on Python 3, where the URL must be a str.
            link_level0 = link_base + link_addon
            request = scrapy.Request(link_level0,
                                     callback=self.parse_nextlevel)
            request.meta['items'] = items
            yield request
Exemplo n.º 14
0
    def parse_profile(self, response):
        """Yield one item per top tag listed on a user profile page.

        The user-level values (``name``, ``rank_number``, ``change``,
        ``total_rep``, ``year_rep``) are read from ``response.meta`` and
        repeated on every yielded item, next to the tag's title, score and
        post count.

        Args:
            response: Profile page response; its ``xpath`` method and ``meta``
                mapping are used.

        Yields:
            A fresh ``StackoverflowItem`` for each (tag title, score, posts)
            triple. ``zip`` stops at the shortest of the three lists.
        """
        blog_link = response.xpath('//svg[@class="svg-icon iconLink"]\
                                    /parent::*/following-sibling::div\
                                    /a/@href').extract_first()
        # jump into top tags section
        top_tags = response.xpath('//div[contains(@class, "profile-top-tags")]\
                                   //div[contains(@class, "grid__fl1")]')
        # titles of the tags, e.g. "python"
        tag_titles = top_tags.xpath('./div/a/text()').extract()
        # scores for these tags
        scores = top_tags.xpath('./div[2]//span[contains(text(), "Score")]\
                                 /following-sibling::span/text()').extract()
        # number of posts for these tags
        posts = top_tags.xpath('./div[2]//span[contains(text(), "Posts")]\
                                /following-sibling::span/text()').extract()

        # values handed over by the request that led to this page
        name = response.meta.get('name')
        rank_number = response.meta.get('rank_number')
        change = response.meta.get('change')
        total_rep = response.meta.get('total_rep')
        year_rep = response.meta.get('year_rep')

        # one observation per tag
        for tag_title, score, num_posts in zip(tag_titles, scores, posts):
            # Create the item inside the loop. A single shared instance would
            # be overwritten by each later tag while earlier yields still
            # reference it.
            item = StackoverflowItem()
            item['name'] = name
            item['rank_number'] = rank_number
            item['blog_link'] = blog_link
            item['change'] = change
            item['total_rep'] = total_rep
            item['year_rep'] = year_rep
            item['tag_title'] = tag_title
            item['score'] = score
            item['number_posts'] = num_posts

            yield item
Exemplo n.º 15
0
    def parse_item(self, response):
        """Yield one item describing a question and its accepted answer.

        Args:
            response: Question page response; only its ``xpath`` method is
                used.

        Yields:
            A single ``StackoverflowItem`` with ``title``, ``favorite``,
            ``views``, ``question``, ``best_ans_text``, ``up_vote_count``,
            ``bounties`` and ``question_url``.
        """
        record = StackoverflowItem()

        record['title'] = response.xpath(
            '//h1[@itemprop="name"]/a/text()').get()
        record['favorite'] = response.xpath('//button/div/text()').get()

        # All matching text nodes are concatenated, then trimmed.
        view_parts = response.xpath(
            '//p[@class="label-key"]/b/text()').getall()
        record['views'] = ''.join(view_parts).strip()

        question_cells = response.xpath('//div[contains(@class,"postcell")]')
        record['question'] = question_cells.getall()

        accepted_cells = response.xpath(
            '//div[@itemprop="acceptedAnswer"]//div[contains(@class,"answercell")]'
        )
        record['best_ans_text'] = accepted_cells.getall()

        record['up_vote_count'] = response.xpath(
            '//div[@itemprop="acceptedAnswer"]//div[@itemprop="upvoteCount"]/text()'
        ).get()
        record['bounties'] = response.xpath(
            '//div[@itemprop="acceptedAnswer"]//span[contains(@class,"bounty-award")]/text()'
        ).get()

        # The href of the title link is appended to the site root.
        title_href = response.xpath('//h1[@itemprop="name"]/a/@href').get()
        record['question_url'] = 'https://stackoverflow.com' + title_href

        yield record
Exemplo n.º 16
0
def parse_question_detail_page(detail_url):
    detail_page = requests.get(detail_url)
    q_item = StackoverflowItem()
    if detail_page.status_code == 200:
        detail = BeautifulSoup(detail_page.content, 'html.parser')
        q_item['question_title'] = detail.select('.question-hyperlink')[0].get_text()[4:]