def parse_wizard_image(self, element, block_xpath, sample):
    """Build an image-wizard Component from *element*, guided by *sample*.

    Every XPath stored on ``sample`` is passed through
    ``self.extract_xpath`` and made relative to the block by dropping its
    first ``len(self.extract_xpath(block_xpath))`` items; the remainder is
    re-assembled with ``self.combine_xpath(..., True)`` and evaluated
    against ``element``.

    :param element: node exposing ``.xpath()`` for the block being parsed.
    :param block_xpath: XPath of the block the sample belongs to.
    :param sample: object with ``media_links`` (index 0 is read
        unconditionally, so it must be non-empty), ``page_url`` and
        ``title``; each entry carries ``xpath`` and ``attr``.
    :returns: Component with ``type`` "WIZARD" and ``wizard_type``
        "WIZARD_IMAGE".
    """
    result = Component()
    result.type = "WIZARD"
    result.wizard_type = "WIZARD_IMAGE"
    result.alignment = "LEFT"

    skip = len(self.extract_xpath(block_xpath))

    # Reduce all sample image XPaths to the prefix they share, starting
    # from the first one.
    shared = self.extract_xpath(sample.media_links[0].xpath)
    for link in sample.media_links:
        shared = self.great_common_prefix(
            shared, self.extract_xpath(link.xpath))
    images_xpath = self.combine_xpath(shared[skip:], True)

    # get_attr receives one node at a time here, whereas page_url/title
    # below hand it the whole xpath() result.
    result.media_links = [
        self.get_attr(node, sample.media_links[0].attr)
        for node in element.xpath(images_xpath)
    ]

    def read(field):
        # Evaluate one sample field relative to the block element.
        relative = self.extract_xpath(field.xpath)[skip:]
        found = element.xpath(self.combine_xpath(relative, True))
        return self.get_attr(found, field.attr)

    result.page_url = read(sample.page_url)
    result.title = read(sample.title)
    return result
def parse_search_result(self, element, block_xpath, sample):
    """Build a search-result Component from *element*, guided by *sample*.

    Each of ``sample.page_url``, ``sample.title``, ``sample.snippet`` and
    ``sample.view_url`` supplies an ``xpath`` and an ``attr``.  The XPath
    is passed through ``self.extract_xpath``, stripped of its first
    ``len(self.extract_xpath(block_xpath))`` items, re-assembled with
    ``self.combine_xpath(..., True)`` and evaluated against ``element``;
    ``self.get_attr`` then reads ``attr`` from the result.

    :param element: node exposing ``.xpath()`` for the block being parsed.
    :param block_xpath: XPath of the block the sample belongs to.
    :param sample: object carrying the four fields described above.
    :returns: Component with ``type`` "SEARCH_RESULT".
    """
    item = Component()
    item.type = "SEARCH_RESULT"
    item.alignment = "LEFT"

    skip = len(self.extract_xpath(block_xpath))

    def read(field):
        # Evaluate one sample field relative to the block element.
        relative = self.extract_xpath(field.xpath)[skip:]
        found = element.xpath(self.combine_xpath(relative, True))
        return self.get_attr(found, field.attr)

    item.page_url = read(sample.page_url)
    item.title = read(sample.title)
    item.snippet = read(sample.snippet)
    item.view_url = read(sample.view_url)
    return item
def parse_image(self, element):
    """Build an "IMAGE" Component from *element*.

    All values come from ``self.get_from_page(element, xpath, mode)``:
    ``page_url`` from ``./a`` with mode "href", ``view_url`` from
    ``./cite`` with mode "title", and ``title`` from the element itself
    with mode "string".

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    component = Component()
    component.type = "IMAGE"
    component.alignment = "LEFT"

    link = self.get_from_page(element, "./a", "href")
    shown_url = self.get_from_page(element, "./cite", "title")
    caption = self.get_from_page(element, ".", "string")

    component.page_url = link
    component.view_url = shown_url
    component.title = caption
    return component
def parse_wizard_news(self, element):
    """Build a news-wizard Component from *element*.

    Both ``page_url`` (mode "href") and ``title`` (mode "string") are read
    from the element itself (XPath ``.``) via ``self.get_from_page``.

    :param element: node handed to ``self.get_from_page``.
    :returns: Component with ``type`` "WIZARD" and ``wizard_type``
        "WIZARD_NEWS".
    """
    news = Component()
    news.type = "WIZARD"
    news.wizard_type = "WIZARD_NEWS"
    news.alignment = "LEFT"
    for field, mode in (("page_url", "href"), ("title", "string")):
        setattr(news, field, self.get_from_page(element, ".", mode))
    return news
def parse_component(self, element):
    """Build a "WIKI" Component from a JSON-like *element*.

    ``page_url``, ``title`` and ``snippet`` are the first matches of the
    JSONPath expressions ``$.pageid``, ``$.title`` and ``$.snippet``.

    :param element: object queried with ``jsonpath.jsonpath``.
    :returns: Component with ``type`` "WIKI" and ``alignment`` "JSON".
    """
    def first(expression):
        # NOTE(review): indexing with [0] assumes the query always yields
        # a non-empty sequence -- confirm what jsonpath.jsonpath returns
        # when nothing matches.
        return jsonpath.jsonpath(element, expression)[0]

    wiki = Component()
    wiki.type = "WIKI"
    wiki.alignment = "JSON"
    wiki.page_url = first("$.pageid")
    wiki.title = first("$.title")
    wiki.snippet = first("$.snippet")
    return wiki
def parse_search_result(self, element):
    """Build a "SEARCH_RESULT" Component from *element*.

    Each field is read with ``self.get_from_page(element, xpath, mode)``
    using the (attribute, xpath, mode) triples listed in ``fields``.

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    fields = (
        ("page_url", "./h3/a", "href"),
        ("title", "./h3/a", "string"),
        ("snippet", "./div/span", "strings"),
        ("view_url", "./div/div/cite", "string"),
    )
    item = Component()
    item.type = "SEARCH_RESULT"
    item.alignment = "LEFT"
    for name, xpath, mode in fields:
        setattr(item, name, self.get_from_page(element, xpath, mode))
    return item
def parse_adv(self, element):
    """Build an "ADV" Component from *element*.

    ``page_url`` and ``title`` come from ``./h3/a`` (modes "href" and
    "string"), ``snippet`` from ``./div[2]`` and ``view_url`` from
    ``./div/cite`` (both mode "string"), all via ``self.get_from_page``.

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    heading_link = "./h3/a"

    advert = Component()
    advert.type = "ADV"
    advert.alignment = "LEFT"
    advert.page_url = self.get_from_page(element, heading_link, "href")
    advert.title = self.get_from_page(element, heading_link, "string")
    advert.snippet = self.get_from_page(element, "./div[2]", "string")
    advert.view_url = self.get_from_page(element, "./div/cite", "string")
    return advert
def parse_adv(self, element):
    """Build an "ADV" Component from *element*.

    ``page_url`` and ``title`` come from ``./h2/a`` (modes "href" and
    "string"); ``snippet`` and ``view_url`` are read with mode "string"
    from the class-qualified XPaths named below.  All lookups go through
    ``self.get_from_page``.

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    heading_link = "./h2/a"
    snippet_xpath = (
        "./div[@class='organic__content-wrapper clearfix']/div[1]"
    )
    view_url_xpath = (
        "./div[@class='organic__subtitle typo typo_type_greenurl']"
        "/div[1]/a"
    )

    advert = Component()
    advert.type = "ADV"
    advert.alignment = "LEFT"
    advert.page_url = self.get_from_page(element, heading_link, "href")
    advert.title = self.get_from_page(element, heading_link, "string")
    advert.snippet = self.get_from_page(element, snippet_xpath, "string")
    advert.view_url = self.get_from_page(element, view_url_xpath, "string")
    return advert
def parse_actor(self, element):
    """Build an "Actor" Component from *element*.

    ``page_url`` and ``title`` come from ``./div[2]/p/a`` (modes "href"
    and "string"), ``snippet`` from ``./div[2]/span[2]`` (mode "string")
    and ``image`` from ``./p/a/img`` (mode "src"), all via
    ``self.get_from_page``.

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    name_link = "./div[2]/p/a"

    actor = Component()
    actor.type = "Actor"
    actor.alignment = "LEFT"
    actor.page_url = self.get_from_page(element, name_link, "href")
    actor.title = self.get_from_page(element, name_link, "string")
    actor.snippet = self.get_from_page(
        element, "./div[2]/span[2]", "string")
    actor.image = self.get_from_page(element, "./p/a/img", "src")
    return actor
def parse_wizard_image(self, element):
    """Build an image-wizard Component from *element*.

    ``media_links`` holds one ``self.get_from_page(img, ".", "src")``
    value per node matched by ``./div/a/img``; ``page_url`` and ``title``
    come from ``./h3/a`` (modes "href" and "string").

    :param element: node exposing ``.xpath()``; also handed to
        ``self.get_from_page``.
    :returns: Component with ``type`` "WIZARD" and ``wizard_type``
        "WIZARD_IMAGE".
    """
    result = Component()
    result.type = "WIZARD"
    result.wizard_type = "WIZARD_IMAGE"
    result.alignment = "LEFT"

    result.media_links = [
        self.get_from_page(node, ".", "src")
        for node in element.xpath("./div/a/img")
    ]

    heading_link = "./h3/a"
    result.page_url = self.get_from_page(element, heading_link, "href")
    result.title = self.get_from_page(element, heading_link, "string")
    return result
def parse_evaluated_cinema(self, element):
    """Build an "EvaluatedCinema" Component from *element*.

    Each field is read with ``self.get_from_page(element, xpath, mode)``
    using the (attribute, xpath, mode) triples listed in ``fields``.

    :param element: node handed to ``self.get_from_page``.
    :returns: the populated Component, left-aligned.
    """
    fields = (
        ("page_url", "./div[2]/p/a", "href"),
        ("title", "./div[2]/p/a", "string"),
        ("snippet", "./div[2]/span[2]", "string"),
        ("actors", "./div[2]/span[3]", "string"),
        ("year", "./div[2]/p/span", "string"),
        ("value", "./div[1]/div", "string"),
        ("image", "./p/a/img", "src"),
    )
    movie = Component()
    movie.type = "EvaluatedCinema"
    movie.alignment = "LEFT"
    for name, xpath, mode in fields:
        setattr(movie, name, self.get_from_page(element, xpath, mode))
    return movie
def get_substitution_component(self, tree, component):
    """Create a Component mirroring *component*, resolved against *tree*.

    ``type`` and ``alignment`` are copied as-is; ``page_url`` and
    ``title`` are obtained from ``self.get_from_page(tree, ...)``.  The
    result is then passed to the matching ``get_substitution_*`` method
    for search results, image wizards and news wizards; other kinds are
    returned with only the common fields set.

    :param tree: object handed to ``self.get_from_page`` and to the
        specialised ``get_substitution_*`` methods.
    :param component: source Component; ``wizard_type`` is only read when
        its ``type`` is "WIZARD".
    :returns: the new Component (whatever the specialised method returns
        when one applies).
    """
    replacement = Component()
    replacement.type = component.type
    replacement.alignment = component.alignment
    replacement.page_url = self.get_from_page(tree, component.page_url)
    replacement.title = self.get_from_page(tree, component.title)

    # The checks are deliberately independent `if`s, not an elif chain.
    if component.type == "SEARCH_RESULT":
        replacement = self.get_substitution_search_result(
            tree, component, replacement)
    if (component.type == "WIZARD"
            and component.wizard_type == "WIZARD_IMAGE"):
        replacement = self.get_substitution_wizard_image(
            tree, component, replacement)
    if (component.type == "WIZARD"
            and component.wizard_type == "WIZARD_NEWS"):
        replacement = self.get_substitution_wizard_news(
            tree, component, replacement)
    return replacement
def parse_wizard_news(self, element, block_xpath, sample):
    """Build a news-wizard Component from *element*, guided by *sample*.

    ``sample.page_url`` and ``sample.title`` each supply an ``xpath`` and
    an ``attr``.  The XPath is passed through ``self.extract_xpath``,
    stripped of its first ``len(self.extract_xpath(block_xpath))`` items,
    re-assembled with ``self.combine_xpath(..., True)`` and evaluated
    against ``element``; ``self.get_attr`` then reads ``attr`` from the
    result.

    :param element: node exposing ``.xpath()`` for the block being parsed.
    :param block_xpath: XPath of the block the sample belongs to.
    :param sample: object carrying ``page_url`` and ``title``.
    :returns: Component with ``type`` "WIZARD" and ``wizard_type``
        "WIZARD_NEWS".
    """
    news = Component()
    news.type = "WIZARD"
    news.wizard_type = "WIZARD_NEWS"
    news.alignment = "LEFT"

    skip = len(self.extract_xpath(block_xpath))

    def read(field):
        # Evaluate one sample field relative to the block element.
        relative = self.extract_xpath(field.xpath)[skip:]
        found = element.xpath(self.combine_xpath(relative, True))
        return self.get_attr(found, field.attr)

    news.page_url = read(sample.page_url)
    news.title = read(sample.title)
    return news