Example #1
    def parse_page(self, response):

        # Retrieve url
        url = response.meta['post_url']

        # Parse top post parameters
        post_date = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//time/@datetime").extract_first()
        title = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//p[@class='title']/a/text()").extract_first()
        op_text = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//div[@class='md']/p/text()").extract()
        op_text = " ".join(op_text)  # Combine all paragraphs
        op_author = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//p[@class='tagline ']/a/text()").extract_first()

        # Get all comments
        try:
            comments = response.xpath("//div[@class='commentarea']//div[@class='entry unvoted']")

            # Parse items
            for comment in comments:
                com_date = comment.xpath("p[@class='tagline']/time/@datetime").extract_first()
                # The author link is expected at index 1 of the tagline; a
                # missing author raises the IndexError handled below
                com_author = comment.xpath("p[@class='tagline']/a/text()").extract()[1]
                com_text = comment.xpath("form//div[@class='md']/p/text()").extract()
                com_text = " ".join(com_text)  # Combine all paragraphs

                item = RedditItem()
                item['post_url'] = url
                item['post_date'] = post_date
                item['post_author'] = op_author
                item['post_title'] = title
                item['post_text'] = op_text
                item['com_date'] = com_date
                item['com_author'] = com_author
                item['com_text'] = com_text

                yield item

        # If a comment lacks the expected structure (e.g. a deleted author),
        # fall back to empty comment fields
        except IndexError:
            item = RedditItem()
            item['post_url'] = url
            item['post_date'] = post_date
            item['post_author'] = op_author
            item['post_title'] = title
            item['post_text'] = op_text
            item['com_date'] = ""
            item['com_author'] = ""
            item['com_text'] = ""

            yield item
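
All of these examples assume a RedditItem defined in the project's items.py, which the source does not show. A minimal sketch matching the fields used in Example #1 (field names taken from the code above; the real definition may differ) could look like:

    import scrapy

    class RedditItem(scrapy.Item):
        # Fields assumed by the spider above
        post_url = scrapy.Field()
        post_date = scrapy.Field()
        post_author = scrapy.Field()
        post_title = scrapy.Field()
        post_text = scrapy.Field()
        com_date = scrapy.Field()
        com_author = scrapy.Field()
        com_text = scrapy.Field()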
Example #2
    def parse_item(self, response):
        # The div with class=thing is where the title and url live.
        # XPath obtained by inspecting the element in the browser:
        # //*[@id="thing_t3_6gkm6e"]/div[2]/p[1]/a
        # read as: inside the div with class=thing, second div tag > first p tag > a tag, and the text() inside it

        # URL currently being scraped
        url = response.url

        # all div tags with thing class
        divs = response.css('div.thing')

        for div in divs:
            # creating item instance
            item = RedditItem()

            # text() gets the text between the tags
            title = div.xpath('div[2]/p[1]/a/text()').extract()

            # href is an attribute, extracted with @
            img_link = div.xpath('div[2]/p[1]/a/@href').extract()

            # setting item field
            item['title'] = title
            item['img_link'] = img_link

            yield item
Example #3
    def parse_page(self, response):
        author = json.loads(response.body)
        # Build the next listing URL; 'after' takes the fullname of the
        # current post so the next request fetches the post that follows it
        nextlinker = ("https://www.reddit.com/r/RoastMe/.json?limit=1"
                      + "&after=" + author[0]['data']['children'][0]['data']['name']
                      + "&count=1")
        items = RedditItem()

        try:
            items['insult'] = author[1]['data']['children'][0]['data']['body']
            items['picture'] = author[0]['data']['children'][0]['data']['url']
            yield items
        except (KeyError, IndexError):
            # The post has no comment yet; record a null insult and move on
            items['insult'] = 'null'

        # Follow the next post exactly once, whether or not this one had a comment
        yield response.follow(nextlinker, self.parse)
Example #4
    def post_parse(self, response):
        page = bs(response.body, 'html.parser')
        comments = page.findAll('div', {'class': 'entry unvoted'})

        state = 0
        state_0_id = []
        for c_ix, c in enumerate(comments):
            # Create a fresh item per comment; mutating one shared instance
            # would leak fields between the yielded items
            item = RedditItem()
            item['url'] = response.url

            comment_id = str(uuid4())
            item['comment_id'] = comment_id
            state_0_id.append(comment_id)

            comment_flag = c.find('a', {'data-event-action': 'parent'})

            if comment_flag:
                item['reply_type'] = 'comment'
                state = 1
                item['conversation_resp'] = comment_flag['href']
                item['thread_starter'] = state_0_id[0]

            else:
                item['reply_type'] = 'reply'
                state = 0
                item['conversation_resp'] = None
                item['thread_starter'] = 'self'
                state_0_id = []

            author = c.find('a',
                            {'class': lambda x: x and x.startswith('author')})
            if author:
                item['author'] = author.text
            else:
                item['author'] = None

            item['time'] = c.find('time')['title']

            if c_ix == 0:
                # Post score; abbreviated values such as '12.5k' would need
                # extra handling here
                likes = page.find('div', {'class': 'score unvoted'}).text
                item['likes'] = int(likes.strip())
            else:
                likes = c.find('span', {'class': 'score unvoted'})
                if likes:
                    # Comment score text looks like '42 points'
                    item['likes'] = int(likes.text.split()[0])
                else:
                    item['likes'] = None

            title = c.find('p', {'class': 'title'})
            if title:
                title_text = title.a.text
            else:
                title_text = ''

            comment = c.find('div', {'class': 'md'})
            if comment:
                comment_text = comment.text.replace('\n', '')
            else:
                comment_text = ''
            # Self-posts carry a title, comments carry body text; keep both
            item['comment'] = (title_text + ' ' + comment_text).strip()

            yield item
Example #5
    def parse(self, response):
        # Get the subreddit from the URL
        sub = response.url.split('/')[4]

        # Parse through each of the posts
        for post in response.css('div.thing'):
            item = RedditItem()

            item['date'] = dt.today()
            item['date_str'] = item['date'].strftime('%Y-%m-%d')
            item['sub'] = sub
            item['title'] = post.css('a.title::text').extract_first()

            item['url'] = post.css('a.title::attr(href)').extract_first()
            # If self-post, prepend the reddit base url (href is relative)
            if item['url'][:3] == '/r/':
                item['url'] = 'https://old.reddit.com' + item['url']

            raw_score = post.css('div.unvoted::text').extract_first()
            # Scores of 1000 and above are abbreviated with a 'k' suffix
            # (e.g. '12.5k'); plain integers pass through untouched
            if raw_score.endswith('k'):
                item['score'] = int(1000 * float(raw_score.strip('k')))
            else:
                item['score'] = int(raw_score)
            item['comments_url'] = post.css(
                'a.comments::attr(href)').extract_first()

            yield item
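
Old-reddit score text comes in several shapes: plain integers, 'k'-abbreviated values such as '12.5k', and a '•' bullet while the score is still hidden (see Examples #8 and #20). A small normalizing helper, offered as a sketch:

    def parse_score(raw):
        """Normalize an old-reddit score string; None means hidden."""
        raw = raw.strip()
        if raw == u'\u2022':  # bullet shown while the score is hidden
            return None
        if raw.endswith('k'):  # e.g. '12.5k' -> 12500
            return int(1000 * float(raw[:-1]))
        return int(raw)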
Example #6
    def parse_item(self, response):
        sel_list = response.css('div.thing')

        for sel in sel_list:
            item = RedditItem()
            item['title'] = sel.xpath('div/p/a/text()').extract()
            item['url'] = sel.xpath('a/@href').extract()
            # image_urls duplicates the link so an images pipeline can
            # download it (see the settings sketch below)
            item['image_urls'] = sel.xpath('a/@href').extract()
            yield item
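
For the image_urls field to have any effect, Scrapy's images pipeline must be enabled in settings.py; a minimal sketch (the storage path is a placeholder):

    # settings.py
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = '/path/to/image/store'  # placeholder directory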
Example #7
    def parse(self, response):
        print("Start scrapping Review info....")
        hxs = Selector(response)
        l_venue = RedditItem()

        reddit_titles = hxs.xpath(
            "//div[@class = 'content']/div[@class = 'spacer']/div[@id='siteTable']/div[@data-type='link']/div[@class = 'entry unvoted']/div[@class='top-matter']/p[@class='title']/a/text()"
        )
        l_venue['title'] = reddit_titles.extract()
        yield l_venue
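
The fully qualified path above breaks as soon as any intermediate div changes. An equivalent but more tolerant query, in the style used by the other examples here, is sketched below:

    reddit_titles = response.xpath(
        "//div[@id='siteTable']//p[@class='title']/a/text()")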
Example #8
    def parse(self, response):

        links = response.xpath(
            '//p[@class="title"]/a[@class="title may-blank outbound"]/@href'
        ).extract()
        titles = response.xpath(
            '//p[@class="title"]/a[@class="title may-blank outbound"]/text()'
        ).extract()
        dates = response.xpath(
            '//p[@class="tagline"]/time[@class="live-timestamp"]/@title'
        ).extract()
        votes = response.xpath(
            '//div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()'
        ).extract()
        comments = response.xpath(
            '//div[@id="siteTable"]//a[@class="comments may-blank"]/@href'
        ).extract()
        #comments = response.xpath('//div[@id="siteTable"]//li[@class="first"]/a/text()').extract()
        # The 'outbound' class only appears on external links; fall back to
        # the plain title selector when nothing matched
        if not links:
            links = response.xpath(
                '//p[@class="title"]/a[@class="title may-blank "]/@href'
            ).extract()
            titles = response.xpath(
                '//p[@class="title"]/a[@class="title may-blank "]/text()'
            ).extract()


        for i, link in enumerate(links):
            item = RedditItem()
            item['subreddit'] = ""  # subreddit extraction disabled; see Example #20
            item['link'] = links[i]
            item['title'] = titles[i]
            item['date'] = dates[i]
            if votes[i] == u'\u2022':  # bullet shown while the score is hidden
                item['vote'] = 'hidden'
            else:
                item['vote'] = votes[i]
            item['top_comment'] = ""

            yield item
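
Rather than querying twice for the two class variants ('title may-blank outbound' and 'title may-blank '), a single contains() query can match both; a sketch:

    links = response.xpath(
        '//p[@class="title"]/a[contains(@class, "title")]/@href').extract()
    titles = response.xpath(
        '//p[@class="title"]/a[contains(@class, "title")]/text()').extract()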
Example #9
    def parse_item(self, response):
        # All posts live in div tags with the 'thing' class
        divs = response.css('div.thing')
        for div in divs:
            item = RedditItem()
            item['title'] = div.xpath('div[2]/div[1]/p[1]/a/text()').extract()
            item['img_link'] = div.xpath(
                'div[2]/div[1]/p[1]/a/@href').extract()
            yield item
Example #10
    def parse(self, response):
        self.logger.info("Visited %s", response.url)
        self.counter += 1

        if self.terminate:
            return

        # Skip the most recent Nexclude_pages pages, whose scores may still
        # change in the near future
        if self.counter > self.Nexclude_pages:
            # Extract the titles; only posts with external links matter, as
            # goals are usually shared via streamable, mixtape.moe, gfycat etc.
            titles = response.css('.outbound::text').extract()
            links = response.css('.outbound::attr(href)').extract()
            timestamps = response.css('.live-timestamp::attr(title)').extract()
            comments = response.css('.comments::text').extract()
            scores = response.css('.score.likes::text').extract()

            #Going through the submissions
            for i in range(len(titles)):

                # Check if the keywords for goal submissions are present in
                # the current submission's title
                if not self.check_goal(self.decompose(titles[i])):
                    continue

                #Check if the submission date exceeds the time difference upper limit
                #if self.exceed_time_diff(pd.to_datetime(timestamps[i])) == True:
                #    self.terminate = True
                #    raise CloseSpider('Submission History Exceeded Limit')

                #Item Creation
                item = RedditItem()
                item['title'] = self.decompose(titles[i])
                item['comments'] = comments[i]
                item['score'] = scores[i]
                item['link'] = links[i]
                item['time'] = timestamps[i]

                yield item

        if not self.terminate:
            next_page = response.css(
                '.next-button a::attr(href)').extract_first()

            if next_page is not None:
                yield scrapy.Request(response.urljoin(next_page),
                                     callback=self.parse)
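
The commented-out time-limit check above relies on Scrapy's CloseSpider exception; re-enabled, it would look roughly like this inside the submission loop (exceed_time_diff is the spider's own helper):

    from scrapy.exceptions import CloseSpider

    if self.exceed_time_diff(pd.to_datetime(timestamps[i])):
        self.terminate = True
        raise CloseSpider('Submission History Exceeded Limit')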
Example #11
	def parse_interests(self, response):
		#rank = response.meta['rank']
		meta = response.meta

		title = response.meta['title']
		source = response.meta['source']
		date = response.meta['date']
		time = response.meta['time']
		topic_vote = response.meta['topic_vote']
		link = response.meta['link']
		num_of_comments = response.meta['num_of_comments']
		submitter = response.meta['submitter']
		submitter_link = response.meta['submitter_link']
		subreddit = response.meta['subreddit']
		top_comment = response.meta['top_comment']
		top_comment_vote = response.meta['top_comment_vote']
		top_comment_child = response.meta['top_comment_child']
		percentage_of_upvotes = response.meta['percentage_of_upvotes']
		top_comment_username = response.meta['top_comment_username']
		# find subreddits that the user is interested in
		next_page = response.xpath('//span[@class="next-button"]/a/@href').extract_first()
		user_interests = response.meta.get('user_interests', [])
		user_interests.extend(response.xpath('//div[@onclick="click_thing(this)" and @data-type="comment"]/@data-subreddit').extract())
		meta['user_interests'] = user_interests

		# Keep paginating while there are more comment pages; build the item
		# only once the full interest list has been accumulated
		if next_page is not None:
			yield scrapy.Request(next_page, callback=self.parse_interests, meta=meta)
			return

		item = RedditItem()
		#item['rank'] = rank
		item['title'] = title
		item['source'] = source
		item['date'] = date
		item['time'] = time
		item['topic_vote'] = topic_vote
		item['link'] = link
		item['num_of_comments'] = num_of_comments
		item['submitter'] = submitter
		item['submitter_link'] = submitter_link
		item['subreddit'] = subreddit
		item['top_comment'] = top_comment
		item['top_comment_vote'] = top_comment_vote
		item['percentage_of_upvotes'] = percentage_of_upvotes
		item['top_comment_username'] = top_comment_username
		item['user_interests'] = set(user_interests)
		item['top_comment_child'] = top_comment_child
	
		yield item
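
Every meta field read at the top of parse_interests has to be seeded by whichever callback schedules it first. A hypothetical originating request (all values are placeholders set in that earlier, unshown callback):

    meta = {
        'title': title, 'source': source, 'date': date, 'time': time,
        'topic_vote': topic_vote, 'link': link,
        'num_of_comments': num_of_comments, 'submitter': submitter,
        'submitter_link': submitter_link, 'subreddit': subreddit,
        'top_comment': top_comment, 'top_comment_vote': top_comment_vote,
        'top_comment_child': top_comment_child,
        'percentage_of_upvotes': percentage_of_upvotes,
        'top_comment_username': top_comment_username,
    }
    yield scrapy.Request(submitter_link, callback=self.parse_interests, meta=meta)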
Example #12
    def parse(self, response):
        # Grab a window of comment paragraphs from the thread body; the
        # slice skips the first few paragraphs of page boilerplate
        comments = response.xpath(
            '//div[@class="md"]/p/text()')[5:400].extract()

        item = RedditItem()
        item['comments'] = comments
        yield item
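
The flat slice above mixes paragraphs from different comments into one list. A sketch that keeps one string per comment instead, assuming a Scrapy version recent enough to provide getall():

    comments = [
        ' '.join(md.xpath('./p/text()').getall())
        for md in response.xpath('//div[@class="md"]')
    ]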
Example #13
    def parse(self, response):
        # NB: the hashed class names below come from the redesigned reddit
        # UI and tend to change between deployments
        title = response.css('h2.pd8yw6-0::text').extract()
        href = response.css('a.SQnoC3ObvgnGjWt90zD9Z::attr(href)').extract()
        score = response.css('div._1rZYMD_4xY3gRcSS3p8ODO::text').extract()

        for item in zip(title, href, score):
            new_item = RedditItem()
            new_item['title'] = item[0]
            new_item['href'] = item[1]
            new_item['score'] = item[2]

            yield new_item
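
zip() silently truncates to the shortest of the three lists, so a single missing score drops trailing posts without warning. itertools.zip_longest keeps every row and fills the gaps with None, making the mismatch visible; a sketch:

    from itertools import zip_longest

    for t, h, s in zip_longest(title, href, score):
        new_item = RedditItem()
        new_item['title'] = t  # None when the lists are uneven
        new_item['href'] = h
        new_item['score'] = s
        yield new_item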
Example #14
    def parse(self, response):

        titles = response.xpath('//h2/a/text()').extract()
        authors = response.xpath('//span/a/text()').extract()
        dates = response.xpath('//div/time/text()').extract()
        for title, author, date in zip(titles, authors, dates):

            post = RedditItem()
            post['title'] = title
            post['author'] = author
            post['date'] = date

            yield post

        next_page = response.css('.load-more::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse, dont_filter=True)
Example #15
    def parse_item(self, response):
        # For debugging:
        # print(response.url)

        # In Chrome, right click on image -> Inspect
        # In Inspect area right click on element and Copy - XPath
        # //*[@id="thing_t3_6ye9u6"]/div[2]/div[1]/p[1]/a

        divs = response.css('div.thing')

        for div in divs:

            item = RedditItem()

            item['title'] = div.xpath('div[2]/div[1]/p[1]/a/text()').extract()
            item['img_link'] = div.xpath(
                'div[2]/div[1]/p[1]/a/@href').extract()

            yield item
Example #16
    def parse(self, response):
        # How to extract data; print() returns None, so the selector
        # results must be assigned directly
        titles = response.css("a::text").extract()
        # FIXME: href should come from the attribute rather than the link
        # text, and the score selector likely needs a dedicated class
        hrefs = response.css("a::attr(href)").extract()
        scores = response.css("a::text").extract()

        for item in zip(titles, hrefs, scores):
            new_item = RedditItem()

            new_item['title'] = item[0]
            new_item['href'] = item[1]
            new_item['score'] = item[2]

            yield new_item

        next_page = response.css("span.next-button a::attr(href)").extract_first()

        if next_page is not None:
            yield Request(url=next_page, callback=self.parse)
Example #17
    def parse(self, response):

        # Extract the sections about each posting
        postings = response.xpath('//div[@class="top-matter"]')

        # Loop through each entry and process
        for record in postings:

            # Create item object to capture data
            item = RedditItem()

            # Parse record using xpath to extract variables we want
            item['docTitle'] = record.xpath(
                'p[@class="title"]/a/text()').extract_first()
            docUrlBuilder = record.xpath(
                'p[@class="title"]/a/@href').extract_first()
            item['docUrl'] = response.urljoin(docUrlBuilder)
            # sha224 requires bytes, hence the encode
            item['docUrlHash'] = hashlib.sha224(
                item['docUrl'].encode('utf-8')).hexdigest()
            item['docAuthorUrl'] = record.xpath(
                'p[@class="tagline "]/a/@href').extract_first()

            # If author not specified use start_urls as default value
            if item['docAuthorUrl'] is None:
                item['docAuthorUrl'] = "NoAuthorGiven"

            item['docAuthorUrlHash'] = hashlib.sha224(
                item['docAuthorUrl'].encode('utf-8')).hexdigest()
            item['docTimestamp'] = record.xpath(
                'p[@class="tagline "]/time/@datetime').extract_first()[:10]

            # Return item
            yield item

        # Queue the next page once, after all records on this page are done;
        # doing this inside the loop would schedule it once per record
        relative_next_url = response.xpath(
            '//span[@class="next-button"]/a/@href').extract_first()

        if relative_next_url is not None:
            # Build the absolute URL and recurse into the next page
            absolute_next_url = response.urljoin(relative_next_url)
            yield Request(absolute_next_url, callback=self.parse)
Example #18
    def parse(self, response):
        self.driver.get('https://www.reddit.com/r/technology/')

        for i in range(50):
            # Rebuild the response from the live page source on every
            # iteration; extracting once up front would yield the same
            # first page fifty times
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')

            posts = response.xpath('//div[@class="entry unvoted"]').extract()
            upvotes = response.xpath(
                '//div[@class="score unvoted"]/text()').extract()

            for j, post in enumerate(posts):
                comment = Selector(text=post).xpath(
                    '//ul[@class="flat-list buttons"]/li[@class="first"]/a/text()'
                ).extract()
                label = Selector(text=post).xpath(
                    '//p[@class="title"]/span[@class="linkflairlabel"]/text()'
                ).extract()
                title = Selector(
                    text=post).xpath('//p[@class="title"]/a/text()').extract()
                date = Selector(text=post).xpath(
                    '//p[@class="tagline"]/time/@datetime').extract()
                link = Selector(text=post).xpath(
                    '//p[@class="title"]/span[@class="domain"]/a/text()'
                ).extract()
                upvote = upvotes[j]
                item = RedditItem()
                item['upvotes'] = upvote
                item['comments'] = comment
                item['label'] = label
                item['title'] = title
                item['date'] = date
                item['link'] = link
                yield item

            # Click through to the next page and let it load
            self.driver.find_element_by_xpath(
                '//a[@rel="nofollow next"]').click()
            time.sleep(2)
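
The fixed time.sleep(2) is fragile on slow connections. Selenium's explicit waits block only until the next page's listing actually appears; a sketch:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Wait up to 10 seconds for the post listing to load after the click
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.entry.unvoted')))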
Example #19
    def parse_item(self, response):

        # Debugger: uncomment to drop into an interactive shell per response
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)

        item = RedditItem()

        item['dates'] = response.xpath(
            '//div[@class="search-result-meta"]/span[@class="search-time"]/time/@title'
        ).extract()
        item['authors'] = response.xpath(
            '//div[@class="search-result-meta"]/span[@class="search-author"]//a/text()'
        ).extract()
        item['votes'] = response.xpath(
            '//div[@class="search-result-meta"]/span[@class="search-score"]/text()'
        ).extract()

        #self.last_date = item['dates'][-1]

        yield item
Example #20
	def parse(self, response):
		links = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/@href').extract()
		titles = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/text()').extract()
		dates = response.xpath('//p[@class="tagline"]/time[@class="live-timestamp"]/@title').extract()
		votes = response.xpath('//div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()').extract()
		comments = response.xpath('//div[@id="siteTable"]//a[@class="comments may-blank"]/@href').extract()


		for i, link in enumerate(comments):
			item = RedditItem()
			# Extract the subreddit name from the comments URL
			match = re.search('/r/[A-Za-z]*', link)
			item['subreddit'] = match.group(0)[3:] if match else ''
			item['link'] = links[i]
			item['title'] = titles[i]
			item['date'] = dates[i]
			if votes[i] == u'\u2022':  # bullet shown while the score is hidden
				item['vote'] = 'hidden'
			else:
				item['vote'] = int(votes[i])

			request = Request(link, callback=self.parse_comment_page)
			request.meta['item'] = item

			yield request
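
The parse_comment_page callback is not shown in the source. A hypothetical minimal version completing the meta hand-off (the top-comment selector is an assumption, borrowed from the commentarea xpaths in Example #1):

	def parse_comment_page(self, response):
		# Hypothetical sketch; retrieve the partially filled item
		item = response.meta['item']
		item['top_comment'] = response.xpath(
			'(//div[@class="commentarea"]//div[@class="md"]/p/text())[1]'
		).extract_first()
		yield item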