def parse_page(self, response):
    # Retrieve url
    url = response.meta['post_url']

    # Parse top post parameters
    post_date = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//time/@datetime").extract_first()
    title = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//p[@class='title']/a/text()").extract_first()
    op_text = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//div[@class='md']/p/text()").extract()
    op_text = " ".join(op_text)  # Combine all paragraphs
    op_author = response.xpath("//div[@id='siteTable']//div[@class='entry unvoted']//p[@class='tagline ']/a/text()").extract_first()

    # Get all comments
    try:
        comments = response.xpath("//div[@class='commentarea']//div[@class='entry unvoted']")
        # Parse items
        for comment in comments:
            com_date = comment.xpath("p[@class='tagline']/time/@datetime").extract_first()
            com_author = comment.xpath("p[@class='tagline']/a/text()").extract()[1]
            com_text = comment.xpath("form//div[@class='md']/p/text()").extract()
            com_text = " ".join(com_text)  # Combine all paragraphs

            item = RedditItem()
            item['post_url'] = url
            item['post_date'] = post_date
            item['post_author'] = op_author
            item['post_title'] = title
            item['post_text'] = op_text
            item['com_date'] = com_date
            item['com_author'] = com_author
            item['com_text'] = com_text
            yield item
    # If there are no comments, the author lookup raises IndexError;
    # emit the post with empty comment fields instead
    except IndexError:
        item = RedditItem()
        item['post_url'] = url
        item['post_date'] = post_date
        item['post_author'] = op_author
        item['post_title'] = title
        item['post_text'] = op_text
        item['com_date'] = ""
        item['com_author'] = ""
        item['com_text'] = ""
        yield item
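# Every snippet in this collection assumes a project-specific RedditItem. A
# minimal sketch matching the fields used by parse_page above; the original
# items.py is not shown, so treat this definition as hypothetical:
import scrapy

class RedditItem(scrapy.Item):
    post_url = scrapy.Field()
    post_date = scrapy.Field()
    post_author = scrapy.Field()
    post_title = scrapy.Field()
    post_text = scrapy.Field()
    com_date = scrapy.Field()
    com_author = scrapy.Field()
    com_text = scrapy.Field()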
def parse_item(self, response):
    # The div with class="thing" is where the title and url live.
    # Got the xpath by inspecting the element:
    #   //*[@id="thing_t3_6gkm6e"]/div[2]/p[1]/a
    # Read it as: inside the div with class="thing", second div tag > first p tag > a tag, and text() inside it.

    # url currently being scraped
    url = response.url

    # all div tags with the "thing" class
    divs = response.css('div.thing')
    for div in divs:
        # creating item instance
        item = RedditItem()
        # text() gets the text between the tags
        title = div.xpath('div[2]/p[1]/a/text()').extract()
        # href is an attribute, which is extracted with @
        img_link = div.xpath('div[2]/p[1]/a/@href').extract()
        # setting item fields
        item['title'] = title
        item['img_link'] = img_link
        yield item
def parse_page(self, response):
    # Requires: import json
    author = json.loads(response.body)
    # Build the next request from the "after" cursor (the post's fullname);
    # the original appended a stray ".json?limit=1" that broke the URL
    nextlinker = ("https://www.reddit.com/r/RoastMe/.json?limit=1"
                  + "&after=" + author[0]['data']['children'][0]['data']['name']
                  + "&count=1")
    items = RedditItem()
    try:
        items['insult'] = author[1]['data']['children'][0]['data']['body']
        # items['picture'] = base64.b64encode(requests.get(author[0]['data']['children'][0]['data']['url']).content).decode('ascii')
        items['picture'] = author[0]['data']['children'][0]['data']['url']
        yield items
        yield response.follow(nextlinker, self.parse)
    except (KeyError, IndexError):
        # No comment body on this post; record a null insult and keep paginating
        items['insult'] = 'null'
        yield response.follow(nextlinker, self.parse)
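# For reference, a standalone sketch of the cursored pagination parse_page
# relies on, using requests instead of Scrapy. The /.json listing endpoint and
# the data.children[0].data.name "fullname" cursor follow reddit's public JSON
# listing format; the User-Agent string is an assumption (reddit throttles the
# library default):
import requests

def walk_listing(pages=3):
    url = "https://www.reddit.com/r/RoastMe/.json?limit=1"
    headers = {"User-Agent": "listing-walk-sketch/0.1"}  # assumed UA
    for _ in range(pages):
        listing = requests.get(url, headers=headers).json()
        post = listing["data"]["children"][0]["data"]
        print(post["name"], post["url"])
        # The fullname (e.g. "t3_abc123") becomes the "after" cursor of the next request
        url = "https://www.reddit.com/r/RoastMe/.json?limit=1&after=" + post["name"]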
def post_parse(self, response):
    # Requires: from bs4 import BeautifulSoup as bs; from uuid import uuid4
    page = bs(response.body, 'html.parser')
    comments = page.findAll('div', {'class': 'entry unvoted'})
    state_0_id = []
    for c_ix, c in enumerate(comments):
        # Create a fresh item per comment so fields don't leak between yields
        item = RedditItem()
        item['url'] = response.url
        comment_id = str(uuid4())
        item['comment_id'] = comment_id
        state_0_id.append(comment_id)

        # A "parent" link means this entry is a reply inside a thread
        comment_flag = c.find('a', {'data-event-action': 'parent'})
        if comment_flag:
            item['reply_type'] = 'comment'
            item['conversation_resp'] = comment_flag['href']
            item['thread_starter'] = state_0_id[0]
        else:
            item['reply_type'] = 'reply'
            item['conversation_resp'] = None
            item['thread_starter'] = 'self'
            state_0_id = []

        author = c.find('a', {'class': lambda x: x and x.startswith('author')})
        item['author'] = author.text if author else None
        item['time'] = c.find('time')['title']

        # The first entry is the submission itself; its score sits in a div
        if c_ix == 0:
            likes = page.find('div', {'class': 'score unvoted'}).text
            item['likes'] = int(likes.replace(',', ''))  # full score, not just its first digit
        else:
            likes = c.find('span', {'class': 'score unvoted'})
            item['likes'] = int(likes.text.split()[0].replace(',', '')) if likes else None

        title = c.find('p', {'class': 'title'})
        title_text = title.a.text if title else ''
        comment = c.find('div', {'class': 'md'})
        comment_text = comment.text.replace('\n', '') if comment else ''
        # str.join interleaved the title between characters; concatenate instead
        item['comment'] = title_text + comment_text
        yield item
def parse(self, response):
    # Requires: from datetime import datetime as dt (or a compatible alias)
    # Get the subreddit from the URL
    sub = response.url.split('/')[4]

    # Parse through each of the posts
    for post in response.css('div.thing'):
        item = RedditItem()
        item['date'] = dt.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        item['sub'] = sub
        item['title'] = post.css('a.title::text').extract_first()
        item['url'] = post.css('a.title::attr(href)').extract_first()

        # If self-post, add reddit base url (as it's relative by default)
        if item['url'][:3] == '/r/':
            # item['url'] = 'https://www.reddit.com' + item['url']
            item['url'] = 'https://old.reddit.com' + item['url']

        # Scores like "1.2k" need scaling; plain integers must not be multiplied
        score_str = post.css('div.unvoted::text').extract_first()
        if score_str.endswith('k'):
            item['score'] = 1000 * float(score_str.strip('k'))
        else:
            item['score'] = float(score_str)

        item['comments_url'] = post.css('a.comments::attr(href)').extract_first()
        yield item
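# The inline scaling above still breaks on hidden scores; a more defensive
# helper, assuming old-reddit score text such as "534", "1.2k", or a bullet
# for hidden scores (hypothetical function, not part of the original spider):
def parse_score(text):
    if text is None or text.strip() in ('', u'\u2022'):
        return None  # score hidden or missing
    text = text.strip()
    if text.endswith('k'):
        return int(1000 * float(text[:-1]))  # "1.2k" -> 1200
    return int(text)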
def parse_item(self, response):
    sel_list = response.css('div.thing')
    for sel in sel_list:
        item = RedditItem()
        item['title'] = sel.xpath('div/p/a/text()').extract()
        item['url'] = sel.xpath('a/@href').extract()
        item['image_urls'] = sel.xpath('a/@href').extract()
        yield item
def parse(self, response):
    print("Start scraping review info....")
    hxs = Selector(response)
    l_venue = RedditItem()
    reddit_titles = hxs.xpath(
        "//div[@class = 'content']/div[@class = 'spacer']/div[@id='siteTable']"
        "/div[@data-type='link']/div[@class = 'entry unvoted']"
        "/div[@class='top-matter']/p[@class='title']/a/text()"
    )
    l_venue['title'] = reddit_titles.extract()
    yield l_venue
def parse(self, response):
    links = response.xpath('//p[@class="title"]/a[@class="title may-blank outbound"]/@href').extract()
    titles = response.xpath('//p[@class="title"]/a[@class="title may-blank outbound"]/text()').extract()
    dates = response.xpath('//p[@class="tagline"]/time[@class="live-timestamp"]/@title').extract()
    votes = response.xpath('//div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()').extract()
    comments = response.xpath('//div[@id="siteTable"]//a[@class="comments may-blank"]/@href').extract()
    # comments = response.xpath('//div[@id="siteTable"]//li[@class="first"]/a/text()').extract()

    # Fall back to the non-outbound title class when no outbound links are found
    # (the original counter loop could never reach this branch)
    if not links:
        links = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/@href').extract()
        titles = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/text()').extract()

    for i, link in enumerate(links):
        item = RedditItem()
        item['subreddit'] = ""  # str(re.findall('/r/[A-Za-z]*8?', link))[3:len(str(re.findall('/r/[A-Za-z]*8?', link))) - 2]
        item['link'] = links[i]
        item['title'] = titles[i]
        item['date'] = dates[i]
        if votes[i] == u'\u2022':  # a bullet means the score is still hidden
            item['vote'] = 'hidden'
        else:
            item['vote'] = votes[i]
        item['top_comment'] = ""
        yield item
def parse_item(self, response):
    divs = response.css('div.thing')
    for div in divs:
        item = RedditItem()
        item['title'] = div.xpath('div[2]/div[1]/p[1]/a/text()').extract()
        item['img_link'] = div.xpath('div[2]/div[1]/p[1]/a/@href').extract()
        yield item
def parse(self, response):
    self.logger.info("Visited %s", response.url)
    self.counter += 1
    if self.terminate:
        return

    # Exclude recent posts in the first Nexclude_pages, whose scores might
    # still change in the near future
    if self.counter > self.Nexclude_pages:
        # Extract the titles.
        # Only posts with external links, as the goals are usually shared
        # through external links to streamable, mixtape.moe, gfycat etc.
        titles = response.css('.outbound::text').extract()
        links = response.css('.outbound::attr(href)').extract()
        timestamps = response.css('.live-timestamp::attr(title)').extract()
        comments = response.css('.comments::text').extract()
        scores = response.css('.score.likes::text').extract()

        # Going through the submissions
        for i in range(len(titles)):
            # Check if the keywords for goal submissions are present in the current submission
            if not self.check_goal(self.decompose(titles[i])):
                continue

            # Check if the submission date exceeds the time-difference upper limit
            # if self.exceed_time_diff(pd.to_datetime(timestamps[i])):
            #     self.terminate = True
            #     raise CloseSpider('Submission History Exceeded Limit')

            # Item creation
            item = RedditItem()
            item['title'] = self.decompose(titles[i])
            item['comments'] = comments[i]
            item['score'] = scores[i]
            item['link'] = links[i]
            item['time'] = timestamps[i]
            print(timestamps[i], " \n")
            yield item

    # Pause for 10 seconds
    # time.sleep(10)

    if not self.terminate:
        next_page = response.css('.next-button a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
def parse_interests(self, response):
    meta = response.meta

    # Find subreddits that the user is interested in, accumulating them across
    # the user's overview pages via meta
    next_page = response.xpath('//span[@class="next-button"]/a/@href').extract_first()
    user_interests = meta.get('user_interests', [])
    user_interests.extend(response.xpath(
        '//div[@onclick="click_thing(this)" and @data-type="comment"]/@data-subreddit'
    ).extract())
    meta['user_interests'] = user_interests

    # Keep paginating while there is a next page; only emit the finished item
    # once every page has been collected (the original yielded a partial item
    # per page and crashed on the last page's missing next link)
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse_interests, meta=meta)
        return

    item = RedditItem()
    # item['rank'] = meta['rank']
    item['title'] = meta['title']
    item['source'] = meta['source']
    item['date'] = meta['date']
    item['time'] = meta['time']
    item['topic_vote'] = meta['topic_vote']
    item['link'] = meta['link']
    item['num_of_comments'] = meta['num_of_comments']
    item['submitter'] = meta['submitter']
    item['submitter_link'] = meta['submitter_link']
    item['subreddit'] = meta['subreddit']
    item['top_comment'] = meta['top_comment']
    item['top_comment_vote'] = meta['top_comment_vote']
    item['percentage_of_upvotes'] = meta['percentage_of_upvotes']
    item['top_comment_username'] = meta['top_comment_username']
    item['user_interests'] = set(user_interests)
    item['top_comment_child'] = meta['top_comment_child']
    yield item
def parse(self, response):
    # Earlier attempts, kept for reference:
    # comment = response.xpath("//div[@class='usertext-body may-blank-within md-container']/div/text()").extract()
    # comment = response.xpath("//*[@id='form-t1_d7sgs6ywxr']/div/div/text()").extract()
    # rows = response.xpath('//div[@class="md"]/p').extract()

    # Grab the comment paragraphs, skipping the first few blocks
    comments = response.xpath('//div[@class="md"]/p/text()')[5:400].extract()

    item = RedditItem()
    item['comments'] = comments
    yield item
def parse(self, response):
    # How to extract the data, e.g.:
    #   response.css('h2.pd8yw6-0::text').extract()
    title = response.css('h2.pd8yw6-0::text').extract()
    href = response.css('a.SQnoC3ObvgnGjWt90zD9Z::attr(href)').extract()
    score = response.css('div._1rZYMD_4xY3gRcSS3p8ODO::text').extract()

    for item in zip(title, href, score):
        new_item = RedditItem()
        new_item['title'] = item[0]
        new_item['href'] = item[1]
        new_item['score'] = item[2]
        yield new_item
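# The hashed class names above (pd8yw6-0, SQnoC3ObvgnGjWt90zD9Z, ...) are build
# artifacts of the redesigned reddit frontend and change between deployments.
# A sketch of the same extraction against old.reddit.com, whose class names are
# stable and match the selectors used elsewhere in this collection:
def parse_old(self, response):
    for post in response.css('div.thing'):
        new_item = RedditItem()
        new_item['title'] = post.css('a.title::text').extract_first()
        new_item['href'] = post.css('a.title::attr(href)').extract_first()
        new_item['score'] = post.css('div.score.unvoted::text').extract_first()
        yield new_item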
def parse(self, response):
    titles = response.xpath('//h2/a/text()').extract()
    authors = response.xpath('//span/a/text()').extract()
    dates = response.xpath('//div/time/text()').extract()

    for title, author, date in zip(titles, authors, dates):
        post = RedditItem()
        post['title'] = title
        post['author'] = author
        post['date'] = date
        yield post

    # extract_first() returns None on the last page instead of raising IndexError
    next_page = response.css('.load-more::attr(href)').extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
def parse_item(self, response):
    # For debugging:
    # print(response.url)
    # In Chrome, right-click on the image -> Inspect.
    # In the Inspect pane, right-click the element and choose Copy -> Copy XPath:
    #   //*[@id="thing_t3_6ye9u6"]/div[2]/div[1]/p[1]/a
    divs = response.css('div.thing')
    for div in divs:
        item = RedditItem()
        item['title'] = div.xpath('div[2]/div[1]/p[1]/a/text()').extract()
        item['img_link'] = div.xpath('div[2]/div[1]/p[1]/a/@href').extract()
        yield item
def parse(self, response):
    # How to extract the data: print() returns None, so its result must not be
    # assigned; extract the selectors directly. The href selector also needs
    # ::attr(href), not ::text.
    titles = response.css("a::text").extract()
    hrefs = response.css("a::attr(href)").extract()
    scores = response.css("a::text").extract()  # placeholder; scores need their own selector

    for item in zip(titles, hrefs, scores):
        new_item = RedditItem()
        new_item['title'] = item[0]
        new_item['href'] = item[1]
        new_item['score'] = item[2]
        yield new_item

    # extract_first() avoids an IndexError on the last page
    next_page = response.css("span.next-button").css('a::attr(href)').extract_first()
    if next_page:
        yield Request(url=next_page, callback=self.parse)
def parse(self, response):
    # Extract the sections about each posting
    postings = response.xpath('//div[@class="top-matter"]')

    # Loop through each entry and process
    for record in postings:
        # Create item object to capture data
        item = RedditItem()

        # Parse record using xpath to extract the variables we want
        item['docTitle'] = record.xpath('p[@class="title"]/a/text()').extract_first()
        docUrlBuilder = record.xpath('p[@class="title"]/a/@href').extract_first()
        item['docUrl'] = response.urljoin(docUrlBuilder)
        # hashlib needs bytes, so encode the URL before hashing
        item['docUrlHash'] = hashlib.sha224(item['docUrl'].encode('utf-8')).hexdigest()
        item['docAuthorUrl'] = record.xpath('p[@class="tagline "]/a/@href').extract_first()

        # If no author is specified, use a sentinel value
        if item['docAuthorUrl'] is None:
            item['docAuthorUrl'] = "NoAuthorGiven"
        item['docAuthorUrlHash'] = hashlib.sha224(item['docAuthorUrl'].encode('utf-8')).hexdigest()

        item['docTimestamp'] = record.xpath('p[@class="tagline "]/time/@datetime').extract_first()[:10]

        # Return item
        yield item

    # Get the URL for the next page and recursively call parse on it;
    # extract_first() returns None on the last page, so guard the request
    relative_next_url = response.xpath('//span[@class="next-button"]/a/@href').extract_first()
    if relative_next_url is not None:
        absolute_next_url = response.urljoin(relative_next_url)
        yield Request(absolute_next_url, callback=self.parse)
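# One plausible use of docUrlHash is duplicate filtering downstream; a minimal
# pipeline sketch (hypothetical class, not part of the original project):
from scrapy.exceptions import DropItem

class DocUrlDedupPipeline(object):
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Drop any item whose URL hash has already been seen this crawl
        if item['docUrlHash'] in self.seen:
            raise DropItem('Duplicate document: %s' % item['docUrl'])
        self.seen.add(item['docUrlHash'])
        return item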
def parse(self, response):
    # Requires: from scrapy.http import TextResponse
    self.driver.get('https://www.reddit.com/r/technology/')

    # Re-read the rendered page on every iteration: clicking "next" replaces
    # the page, so the selectors must run against fresh page_source
    for i in range(50):
        response = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        posts = response.xpath('//div[@class="entry unvoted"]').extract()
        upvotes = response.xpath('//div[@class="score unvoted"]/text()').extract()

        for j, post in enumerate(posts):
            comment = Selector(text=post).xpath('//ul[@class="flat-list buttons"]/li[@class="first"]/a/text()').extract()
            label = Selector(text=post).xpath('//p[@class="title"]/span[@class="linkflairlabel"]/text()').extract()
            title = Selector(text=post).xpath('//p[@class="title"]/a/text()').extract()
            date = Selector(text=post).xpath('//p[@class="tagline"]/time/@datetime').extract()
            link = Selector(text=post).xpath('//p[@class="title"]/span[@class="domain"]/a/text()').extract()
            upvote = upvotes[j]

            item = RedditItem()
            item['upvotes'] = upvote
            item['comments'] = comment
            item['label'] = label
            item['title'] = title
            item['date'] = date
            item['link'] = link
            yield item

        self.driver.find_element_by_xpath('//a[@rel="nofollow next"]').click()
        time.sleep(2)
def parse_item(self, response):
    # Debugger:
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    # if response == []:
    #     self.start_urls = the_

    item = RedditItem()
    item['dates'] = response.xpath(
        '//div[@class="search-result-meta"]/span[@class="search-time"]/time/@title'
    ).extract()
    item['authors'] = response.xpath(
        '//div[@class="search-result-meta"]/span[@class="search-author"]//a/text()'
    ).extract()
    item['votes'] = response.xpath(
        '//div[@class="search-result-meta"]/span[@class="search-score"]/text()'
    ).extract()
    # self.last_date = item['dates'][-1]
    yield item
def parse(self, response):
    # Requires: import re; from scrapy import Request
    links = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/@href').extract()
    titles = response.xpath('//p[@class="title"]/a[@class="title may-blank "]/text()').extract()
    dates = response.xpath('//p[@class="tagline"]/time[@class="live-timestamp"]/@title').extract()
    votes = response.xpath('//div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()').extract()
    comments = response.xpath('//div[@id="siteTable"]//a[@class="comments may-blank"]/@href').extract()

    for i, link in enumerate(comments):
        item = RedditItem()
        # Pull "/r/<name>" out of the comments link; the slice trims the
        # stringified match list down to "r/<name>"
        subreddit_match = re.findall('/r/[A-Za-z]*8?', link)
        item['subreddit'] = str(subreddit_match)[3:len(str(subreddit_match)) - 2]
        item['link'] = links[i]
        item['title'] = titles[i]
        item['date'] = dates[i]
        if votes[i] == u'\u2022':  # a bullet means the score is hidden
            item['vote'] = 'hidden'
        else:
            item['vote'] = int(votes[i])

        # Hand the partially filled item to the comment-page callback via meta
        request = Request(link, callback=self.parse_comment_page)
        request.meta['item'] = item
        yield request
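# The parse_comment_page callback is not included in this collection. A
# hypothetical stub showing how the partially filled item handed over via
# request.meta could be completed; the top-comment xpath is an assumption
# based on old-reddit comment markup:
def parse_comment_page(self, response):
    item = response.meta['item']
    top = response.xpath(
        '(//div[@class="commentarea"]//div[@class="md"]/p/text())[1]'
    ).extract_first()
    item['top_comment'] = top or ''
    yield item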