def parse_blog(self, response):
    """Parse a blog post page: yield a Posts item, a Stats item, then one
    Comments item per comment returned by the comment API.

    Dead links render an error page; those responses are skipped entirely.
    """
    # There are dead links on the pages that can't be processed.
    if "The page you are looking for cannot be found or is no longer available." in response.text:
        return

    # --- Post ---
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath('//*[@id="Headline"]/h1/text()').get()
    blog['author'] = response.xpath('//*[@id="Headline"]/p/span/a/text()').get()
    # Byline text looks like " by: <date>"; strip the prefix before parsing.
    # BUG FIX: .get() may return None for pages missing the byline node;
    # the original called .replace() on it unconditionally (AttributeError).
    date_text = response.xpath('//*[@id="Headline"]/p/span/text()').get()
    blog['published_date'] = parse(date_text.replace(' by: ', '')) if date_text else None
    blog['content'] = "".join(
        response.xpath('//*[@id="Col2"]/article/div[2]//text()').getall()
    ).strip().replace('\n', ' ')
    blog['content_html'] = response.xpath('//*[@id="Col2"]/article/div[2]').get()
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(
        response.xpath('//*[@id="Headline"]/p/span/i/a/text()').getall())
    yield blog

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = get_views(response.url)
    stat['likes'] = None
    comment_data = get_comments(response.url)
    stat['comments'] = comment_data['total']
    yield stat

    # --- Comments ---
    # Precompute replies-per-parent once; the original rescanned the whole
    # comment list for every comment (O(n^2)).
    reply_counts = {}
    for c in comment_data['comments']:
        key = str(c['parent'])
        reply_counts[key] = reply_counts.get(key, 0) + 1

    for comment in comment_data['comments']:
        parsed_comment = Comments()
        parsed_comment['domain'] = self.domain
        parsed_comment['url'] = response.url
        parsed_comment['comment_id'] = comment['id']
        # Prefer the registered username; fall back to the display name.
        author = comment['author']
        parsed_comment['username'] = author['username'] if 'username' in author else author['name']
        parsed_comment['user_id'] = author['id'] if 'id' in author else None
        parsed_comment['comment'] = comment['raw_message']
        parsed_comment['comment_original'] = comment['message']
        parsed_comment['links'] = get_links(comment['message'])
        parsed_comment['upvotes'] = comment['likes']
        parsed_comment['downvotes'] = comment['dislikes']
        parsed_comment['published_date'] = parse(comment['createdAt'])
        parsed_comment['reply_count'] = reply_counts.get(str(comment['id']), 0)
        parsed_comment['reply_to'] = comment['parent']
        yield parsed_comment
def parse_blog(self, response):
    """Parse a Blogger post page: yield a Posts item, a Stats item, then
    one Comments item per on-page comment."""
    # --- Post ---
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/h3/text()').get().strip()
    blog['author'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[1]/span/text()'
    ).get()
    blog['published_date'] = parse(
        response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[2]/a/abbr/@title'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            "//div[contains(@class, 'post-body entry-content')]//text()"
        ).getall()).replace('\n', '')
    blog['content_html'] = response.xpath(
        "//div[contains(@class, 'post-body entry-content')]").get()
    blog['links'] = get_links(blog['content_html'])
    # Drop separator/newline artefacts and the "Labels:" caption from tags.
    tags = response.xpath(
        '//*[contains(@id, "Blog1")]/div[1]/div/div/div/div[1]/div[3]/div[2]/span//text()'
    ).getall()
    blog['tags'] = tags_to_json(
        [a for a in tags if a != ',\n' and a != '\n' and 'Labels:' not in a])
    yield blog

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    # Header reads e.g. "3 comments:" or "No comments:".
    # BUG FIX: guard against a missing header; the original did
    # `"No" not in None` -> TypeError.
    comment_num = response.xpath('//div[@id="comments"]/h4/text()').get() or "No"
    has_comments = "No" not in comment_num
    if has_comments:
        stat['comments'] = int(re.search(r'\d+', comment_num).group())
    else:
        stat['comments'] = None
    yield stat

    # --- Comments ---
    if has_comments:
        # BUG FIX: the original used absolute '//li[contains(@class,
        # "comment")]' selectors inside this loop, which always match the
        # FIRST comment on the page, so every yielded item carried the same
        # data. All selectors below are relative to the current <li>.
        for c in response.xpath(
                '//*[@id="top-ra"]//li[contains(@class, "comment")]'):
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c.xpath('./@id').get()
            if c.xpath('./div[2]/div/cite/a/text()').get() is not None:
                # Registered Blogger user: the name links to a profile page.
                parsed_comment['username'] = c.xpath(
                    './div[2]/div/cite/a/text()').get()
                parsed_comment['user_id'] = c.xpath(
                    './div[2]/div/cite/a/@href').get().replace(
                        'https://www.blogger.com/profile/', "")
            else:
                # Anonymous commenter: plain-text name, no profile id.
                parsed_comment['username'] = c.xpath(
                    './div[2]/div/cite/text()').get()
                parsed_comment['user_id'] = None
            parsed_comment['comment'] = " ".join(
                c.xpath('./div[2]/p//text()').getall())
            parsed_comment['comment_original'] = c.xpath('./div[2]/p').get()
            parsed_comment['links'] = get_links(c.xpath('./div[2]/p').get())
            parsed_comment['upvotes'] = None
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = None
            parsed_comment['reply_count'] = None
            parsed_comment['reply_to'] = None
            yield parsed_comment
def parse_blog(self, response):
    """Parse a post page: yield a Posts item, one Comments item per
    Facebook-plugin comment, then a Stats item."""
    # --- Post ---
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = parse_title(response)
    # parse_author_date() supplies the date; its author value is
    # immediately superseded by the dedicated parse_author() helper.
    date, author = parse_author_date(response)
    author = parse_author(response)
    blog['author'] = str(author)[0:99] if author else None  # cap at 99 chars
    blog['published_date'] = parse_datetime(date) if date else None
    content, content_html = parse_content(response)
    if content_html:
        blog['content'] = content
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
    tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
    # Drop whitespace-only artefacts from the extracted tag texts.
    blog['tags'] = tags_to_json(
        [x for x in tags if '\n' not in x and '\t' not in x]) if tags else None
    yield blog

    # --- Comments (Facebook comments plugin; numeric site id) ---
    comment_data = facebook_comments(response.url, 318812448281278)
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # catches no comments
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # BUG FIX: the original indexed [0] into the author match and
            # raised IndexError whenever no author record was returned.
            parsed_comment['username'] = next(
                (x['name'] for x in authors if c['authorID'] == x['id']), None)
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = len([
                    x for x in comments
                    if 'targetID' in x and x['targetID'] == c['id']
                ])
            else:
                parsed_comment['reply_count'] = 0
            # reply_dic maps a comment id to the id it replies to.
            parsed_comment['reply_to'] = reply_dic.get(c['id'])
            yield parsed_comment

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = len(comments) if comments is not None else 0
    yield stat
def parse_blog(self, response):
    """Parse an article whose metadata lives in an embedded JSON <script>,
    then fetch its comments from the comment API: yield a Posts item,
    Comments items, and a Stats item."""
    # --- Post ---
    script = response.xpath(
        "//script[contains(., 'identity')]/text()").extract_first()
    try:
        data = json.loads(script)
    except Exception as e:
        # Best-effort: log and fall through with no metadata.
        print(str(e) + f"\n{str(response.url)}")
        data = {}
    if data:
        blog = Posts()
        # NOTE(review): other parsers in this file use self.domain here —
        # confirm get_domain(response.url) is intentional for this spider.
        blog['domain'] = get_domain(response.url)
        blog['url'] = response.url
        blog['title'] = response.css('.article-title::text').extract_first()
        # data is truthy inside this branch; the original's trailing
        # "if data else None" was redundant (and would have been a
        # TypeError on tuple-unpacking None had it ever been taken).
        author, date = parse_authors_date(data)
        blog['author'] = author.replace(" ", "").strip() if author else None
        blog['published_date'] = date if date else None
        blog['tags'] = tags_to_json(
            data['page']['tags']) if 'page' in data else None
        blog['content'] = get_content(response)
        content_html = " ".join(
            response.xpath('//*[@class="article-content"]').extract())
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
        yield blog
    # (removed leftover debug print('here') from the empty-data branch)

    # --- Comments ---
    article_id = data['page']['articleId'] if 'page' in data else None
    comments = get_torontosun_comments(article_id)
    if comments:  # catches no comments
        for c in comments:
            if 'content' in c and c['content']:  # skip empty comments
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c['content_uuid']
                parsed_comment['username'] = None  # no API found for this
                parsed_comment['user_id'] = c['actor_uuid']
                parsed_comment['comment'] = c['content'] if 'content' in c else None
                parsed_comment['comment_original'] = None
                parsed_comment['links'] = get_links(
                    c['content']) if 'content' in c else None
                parsed_comment['upvotes'] = c['total_likes']
                parsed_comment['downvotes'] = c['total_dislikes']
                # date_created is epoch milliseconds.
                parsed_comment['published_date'] = parse_datetime(
                    time.strftime('%m/%d/%Y %H:%M:%S',
                                  time.gmtime(c['date_created'] / 1000.)))
                parsed_comment['reply_count'] = max(c['total_replies'], 0)
                # Nested under neither its thread nor its direct parent
                # container => it is a reply within the thread.
                if (c['content_container_uuid'] != c['thread_uuid']
                        and c['content_container_uuid'] != c['parent_uuid']):
                    parsed_comment['reply_to'] = c['thread_uuid']
                else:
                    parsed_comment['reply_to'] = None
                yield parsed_comment

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = len(comments) if comments is not None else 0
    yield stat
def parse_blog(self, response):
    """Parse a WordPress post page: yield a Posts item, a Stats item built
    from the stats API, and Comments items fetched in a single batched
    WordPress API request."""
    # --- Post ---
    # The <article> element id looks like "post-<pid>"; the numeric part
    # keys the stats API. TODO confirm against a live page.
    pid = response.css('article').xpath(
        "@id").extract_first().strip().replace(' ', '').split('-')[1]
    data = get_stats_data(pid)
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.css("h1.entry-title::text").extract_first()
    blog['author'] = parse_author(data, response)
    blog['published_date'] = parse(
        response.css("span.entry-meta-date.updated a::text").extract_first())
    blog['content'] = "".join(
        response.xpath("//div[contains(@class, 'entry-content')]//text()")
        .extract()).strip().replace('\n', ' ').replace('\t', ' ')
    content_html = "".join(
        response.xpath("//div[contains(@class, 'entry-content')]").extract())
    blog['content_html'] = content_html
    blog['links'] = get_links(content_html)
    blog['tags'] = tags_to_json(
        list(data['tags'].keys())) if data['tags'] else None
    yield blog

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = data['like_count']
    stat['comments'] = data['comment_count']
    yield stat

    # --- Comments ---
    # Comment <li> ids look like "comment-<id>". BUG FIX: ids without a
    # '-' made split('-')[1] raise IndexError; skip them instead.
    comment_ids = [
        li_id.strip().replace(' ', '').split('-')[1]
        for li_id in response.css("li").xpath("@id").extract()
        if '-' in li_id
    ]
    req = build_batch(comment_ids, 70000375)
    res = make_api_request(req)
    if res:
        for res_url in res:
            res_values = res[res_url]
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = f"bc_comment_{res_values['ID']}"
            author_name = res_values['author']['name']
            usr_name = author_name.lower() if author_name else None
            parsed_comment['username'] = usr_name
            parsed_comment['user_id'] = get_user_id(
                self.comment_users, usr_name)
            parsed_comment['comment'] = res_values['raw_content']
            parsed_comment['comment_original'] = res_values['content']
            parsed_comment['links'] = get_links(res_values['content'])
            stats_links = res_values['meta']['links']
            parsed_comment['upvotes'] = parse_comments(stats_links['likes'])
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = res_values['date']
            # get_wordpress_replies -> (reply_count, reply_to)
            replies = get_wordpress_replies(res, res_values['ID'], 70000375)
            parsed_comment['reply_count'] = replies[0]
            parsed_comment['reply_to'] = replies[1]
            yield parsed_comment
def parse_article(self, response):
    """Parse a news article page: yield a Posts item, one Comments item
    per Facebook-plugin comment, then a Stats item."""
    # --- Post ---
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@class="news-article-header__title"]/text()').get()
    blog['author'] = response.xpath(
        '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
    blog['published_date'] = get_date(
        response.xpath(
            '//*[@class="news-article-header__timestamps-posted"]/text()'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    blog['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(parse_tags(response))
    yield blog

    # --- Comments (Facebook comments plugin; string site id) ---
    article_url = format_comment_url(response.url)
    comment_data = facebook_comments(article_url, '162111247988300')
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # catches no comments
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # BUG FIX: the original indexed [0] into the author match and
            # raised IndexError whenever no author record was returned.
            parsed_comment['username'] = next(
                (x['name'] for x in authors if c['authorID'] == x['id']), None)
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = len([
                    x for x in comments
                    if 'targetID' in x and x['targetID'] == c['id']
                ])
            else:
                parsed_comment['reply_count'] = 0
            # reply_dic maps a comment id to the id it replies to.
            parsed_comment['reply_to'] = reply_dic.get(c['id'])
            yield parsed_comment

    # --- Stats ---
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = len(comments) if comments is not None else 0
    yield stat