def parse(self, response):
    """Parse a search-results page.

    For every extension card found, extract its summary metadata and
    yield a SeleniumRequest for the detail page (handled by
    ``parse_extension``); then follow the "next" pagination link, if any,
    back into this method.
    """
    extensions = response.css('.SearchResult')
    for extension in extensions:
        # Summary metadata shown on the search-result card.
        name = extension.css('.SearchResult-link::text').get()
        text_user_numbers = extension.css('.SearchResult-users-text::text').get()
        # Digits possibly grouped with commas, e.g. "12,345 users".
        # Guard with `or ""` — .get() returns None when the node is absent,
        # and re.findall(None) raises TypeError.
        user_numbers = re.findall(r"[-+]?\d*\,?\d+|\d+", text_user_numbers or "")
        text_rating = extension.css('.visually-hidden::text').get()
        rating = re.findall(r"[-+]?\d*\.?\d+|\d+", text_rating or "")
        # Default to 0 when there is no valid rating on the card.
        if len(rating) == 0:
            rating = [0]
        creator = extension.css('h3.SearchResult-author.SearchResult--meta-section::text').get()
        details_link = extension.css('.SearchResult-link::attr(href)').get()
        # Only follow cards that actually link somewhere. The original code
        # ran re.search on details_link (and yielded url=None) before this
        # check, which crashed on link-less cards.
        if details_link is not None:
            # key_id of the extension: the slug in .../firefox/addon/<key>/...
            key_match = re.search('firefox/addon/(.+?)/', details_link)
            key = key_match.group(1) if key_match else None
            details_link = response.urljoin(details_link)
            yield scrapy_selenium.SeleniumRequest(
                url=details_link,
                callback=self.parse_extension,
                cb_kwargs={
                    'name': name,
                    # Guard against cards with no user count at all.
                    'user_numbers': user_numbers[0] if user_numbers else '0',
                    'rating': float(rating[0]),
                    'creator': creator,
                    'key': key,
                })
    # NEXT PAGE and repeat parse method.
    next_page = response.css('a.Button.Button--cancel.Paginate-item.Paginate-item--next::attr("href")').get()
    if next_page is not None:
        next_page = response.urljoin(next_page)
        yield scrapy_selenium.SeleniumRequest(url=next_page, callback=self.parse)
def parse_reviews(self, response, previous_data):
    """Collect review texts on this page into ``previous_data``.

    Follows pagination recursively; the accumulated item is emitted only
    on the final page (when no "next" link exists), so each extension is
    yielded exactly once.
    """
    # NOTE(review): 'li' is broad — selector quirk acknowledged upstream
    # ("s and without s"); kept as-is to preserve behavior.
    for review in response.css('li'):
        # string(.) flattens the card contents, which also strips any
        # embedded <br> tags while keeping the surrounding text.
        card = review.css('div.ShowMoreCard-contents')
        content = card.xpath('string(.)').get()
        if content is not None:
            previous_data["reviews_list"].append(content)

    next_page_reviews = response.css('a.Button.Button--cancel.Paginate-item.Paginate-item--next::attr("href")').get()
    if next_page_reviews is not None:
        # More review pages remain: recurse with the same accumulator.
        yield scrapy_selenium.SeleniumRequest(
            url=response.urljoin(next_page_reviews),
            callback=self.parse_reviews,
            cb_kwargs={'previous_data': previous_data})
    else:
        # Last page: export the item, with reviews as an (possibly empty) list.
        yield {
            'name': previous_data["name"],
            'rating': previous_data["rating"],
            'user_numbers': previous_data["user_numbers"],
            'last_updated': previous_data["last_updated"],
            'reviews': previous_data["reviews_list"],
        }
def parse_extension(self, response):
    """Extract the extension title and follow the creator-profile link.

    When the add-on has a creator link, ``parse_creator`` receives the
    partial item via cb_kwargs; otherwise the item is emitted here with
    an empty ``creator_details`` list.
    """
    # Accumulator passed along to the creator-profile parser.
    previous_data = {
        "name": response.css('h1.AddonTitle::text').get(),
        "creator_details": [],  # filled in by parse_creator
    }

    creator_link = response.css('span.AddonTitle-author a::attr("href")').get()
    if creator_link is None:
        # No creator profile to visit — emit the item immediately.
        yield {
            'name': previous_data["name"],
            'creator_details': [],
        }
        return

    yield scrapy_selenium.SeleniumRequest(
        url=response.urljoin(creator_link),
        callback=self.parse_creator,
        cb_kwargs={'previous_data': previous_data})
def start_requests(self):
    """Seed the crawl: one search URL per keyword in the input CSV.

    Reads keywords (first column) from ``full_medium_keywords.csv`` and
    yields a SeleniumRequest for the addons.mozilla.org search page of
    each keyword.
    """
    from urllib.parse import quote_plus  # local import: only needed here

    # Path to keywords.csv (relative to the project working directory).
    path_keywords_csv = 'malicious_ext_crawler/spiders/input_data/full_medium_keywords.csv'

    urls = []
    # utf-8-sig strips a BOM if the CSV was exported from Excel.
    with open(path_keywords_csv, mode='r', encoding='utf-8-sig') as csv_file:
        for row_keyword in csv.reader(csv_file):
            # URL-encode the keyword: raw interpolation broke the query
            # string for keywords containing spaces, '&', '#', etc.
            keyword = quote_plus(row_keyword[0])
            urls.append(
                'https://addons.mozilla.org/en-US/firefox/search/?q=%s&type=extension'
                % keyword)

    # Dispatch the requests through the Selenium downloader middleware.
    for url in urls:
        yield scrapy_selenium.SeleniumRequest(url=url, callback=self.parse)
def parse(self, response):
    """Parse one listing page, then request the next page number.

    Stops when the server answers with a 4xx/5xx status (interpreted as
    having walked past the last page). Relies on ``self.page``,
    ``self.get_url`` and ``self.scrape_list_page`` defined on the spider.
    """
    # Any error status means we paged past the end of the listing.
    if response.status >= 400:
        # logger.warn() is a deprecated alias — use warning().
        self.logger.warning(f"Reached last page: {self.page}")
        return

    # Emit every item scraped from this listing page.
    yield from self.scrape_list_page(response)

    # Advance to the next page and re-enter this callback.
    self.page += 1
    yield scrapy_selenium.SeleniumRequest(url=self.get_url(), callback=self.parse)
def parse_extension(self, response):
    """Extract detail-page metadata and follow the reviews link.

    Builds a partial item (name, user count, rating, creator link, last
    updated) and hands it to ``parse_reviews`` via cb_kwargs when the
    add-on has reviews; otherwise emits the item here with an empty
    ``reviews`` list.
    """
    last_updated = response.css('dd.Definition-dd.AddonMoreInfo-last-updated::text').get()
    reviews_list = []  # accumulator shared with parse_reviews via previous_data

    # Rating: pull the numeric part out of the title text. Guard with
    # `or ""` — .get() returns None when the node is absent, and
    # re.findall(None) raises TypeError.
    text_rating = response.css('div.AddonMeta-rating-title::text').get()
    rating = re.findall(r"[-+]?\d*\.?\d+|\d+", text_rating or "")

    # Store previously parsed data as a dictionary for the next callback.
    previous_data = {
        "name": response.css('h1.AddonTitle::text').get(),
        "user_numbers": response.css('dd.MetadataCard-content::text').get(),
        "rating": rating,
        "detail_creator": response.css('span.AddonTitle-author a::attr("href")').get(),
        "last_updated": last_updated,
        "reviews_list": reviews_list,
    }

    # PS: Not every extension has reviews.
    reviews_link = response.css('a.AddonMeta-reviews-title-link::attr("href")').get()
    if reviews_link is not None:
        reviews_link = response.urljoin(reviews_link)
        yield scrapy_selenium.SeleniumRequest(
            url=reviews_link,
            callback=self.parse_reviews,
            cb_kwargs={'previous_data': previous_data})
    else:
        # For extensions that don't have reviews (no reviews_link).
        yield {
            'name': previous_data["name"],
            'rating': previous_data["rating"],
            'user_numbers': previous_data["user_numbers"],
            'last_updated': previous_data["last_updated"],
            'reviews': [],  # empty list when there are no valid reviews
        }
def parse_extension(self, response, name, user_numbers, rating, creator, key):
    """Combine search-page metadata (passed via cb_kwargs) with the
    detail page's last-updated date, then follow the reviews link.

    If the add-on has no reviews link, the finished item is emitted here
    with an empty ``reviews`` list; otherwise ``parse_reviews`` finishes it.
    """
    # Carry everything forward in one dict so parse_reviews can emit it.
    previous_data = {
        "key": key,
        "name": name,
        "user_numbers": user_numbers,
        "rating": rating,
        "creator": creator,
        "last_updated": response.css(
            'dd.Definition-dd.AddonMoreInfo-last-updated::text').get(),
        "reviews_list": [],  # filled in by parse_reviews
    }

    # PS: Not every extension has reviews.
    reviews_link = response.css(
        'a.AddonMeta-reviews-title-link::attr("href")').get()
    if reviews_link is None:
        # For extensions that don't have reviews (no reviews_link).
        yield {
            'platform': "firefox",
            'key': previous_data["key"],
            'name': previous_data["name"],
            'rating': previous_data["rating"],
            'user_numbers': previous_data["user_numbers"],
            'creator': previous_data["creator"],
            'last_updated': previous_data["last_updated"],
            'reviews': [],  # empty list when there are no valid reviews
        }
        return

    yield scrapy_selenium.SeleniumRequest(
        url=response.urljoin(reviews_link),
        callback=self.parse_reviews,
        cb_kwargs={'previous_data': previous_data})
def start_requests(self):
    """Kick off the crawl with a single Selenium request.

    The starting URL is produced by ``self.get_url()``; responses are
    handled by ``self.parse``.
    """
    start_url = self.get_url()
    yield scrapy_selenium.SeleniumRequest(url=start_url, callback=self.parse)