def parse_search_result(self, element, block_xpath, sample):
    """Build a SEARCH_RESULT component from ``element`` using ``sample``.

    ``sample.page_url``, ``sample.title``, ``sample.snippet`` and
    ``sample.view_url`` each supply an ``xpath`` and an ``attr``.  For every
    field the xpath is run through ``self.extract_xpath``, its first
    ``len(self.extract_xpath(block_xpath))`` items are dropped, the rest is
    rebuilt with ``self.combine_xpath(..., True)`` and evaluated on
    ``element``; ``self.get_attr`` then reads ``attr`` from the matches.

    Returns the populated ``Component``.
    """
    result = Component()
    result.type = "SEARCH_RESULT"
    result.alignment = "LEFT"

    # Only the length of the extracted block xpath is needed below.
    skip = len(self.extract_xpath(block_xpath))

    def read(field):
        # NOTE(review): assumes every sample xpath starts with the block
        # xpath, so the slice leaves the part relative to ``element`` - confirm.
        relative = self.extract_xpath(field.xpath)[skip:]
        matches = element.xpath(self.combine_xpath(relative, True))
        return self.get_attr(matches, field.attr)

    result.page_url = read(sample.page_url)
    result.title = read(sample.title)
    result.snippet = read(sample.snippet)
    result.view_url = read(sample.view_url)
    return result
def parse_wizard_image(self, element, block_xpath, sample):
    """Build a WIZARD / WIZARD_IMAGE component from ``element`` using ``sample``.

    The image query is the common prefix (``self.great_common_prefix``) of the
    extracted xpaths of all ``sample.media_links``, with the first
    ``len(self.extract_xpath(block_xpath))`` items dropped and the rest
    rebuilt by ``self.combine_xpath(..., True)``.  Every node it matches on
    ``element`` contributes one ``self.get_attr`` value, read with the ``attr``
    of the first sample link.  ``page_url`` and ``title`` are read from their
    own sample xpaths in the same relative manner.

    Returns the populated ``Component``.
    """
    result = Component()
    result.type = "WIZARD"
    result.wizard_type = "WIZARD_IMAGE"
    result.alignment = "LEFT"

    skip = len(self.extract_xpath(block_xpath))

    def relative_query(steps):
        # Drop the block-level prefix and rebuild a query usable on ``element``.
        return self.combine_xpath(steps[skip:], True)

    links = sample.media_links
    shared = self.extract_xpath(links[0].xpath)
    for link in links:
        shared = self.great_common_prefix(shared, self.extract_xpath(link.xpath))

    result.media_links = [
        self.get_attr(img, links[0].attr)
        for img in element.xpath(relative_query(shared))
    ]

    def read(field):
        matches = element.xpath(relative_query(self.extract_xpath(field.xpath)))
        return self.get_attr(matches, field.attr)

    result.page_url = read(sample.page_url)
    result.title = read(sample.title)
    return result
def parse_component(self, element, sample):
    """Build a ``Component`` from ``element`` by mirroring the fields of ``sample``.

    Every attribute in ``sample.__dict__`` is handled by type:

    * ``str`` - copied to the component unchanged.
    * ``list`` - the items are folded with ``get_common_prefix`` into one
      path; every element ``element.get_elements`` returns for that path
      contributes one ``get_value`` result.  No matches yields ``None``.
    * anything else - resolved with ``element.get_value(field)``.

    Returns the component, or ``None`` as soon as any field resolves to
    ``None``.  An empty list field raises ``IndexError`` (``field[0]``).
    """
    logging.info("Start")
    component = Component()
    for key, field in sample.__dict__.items():
        if isinstance(field, str):
            value = field
        elif isinstance(field, list):
            common = field[0]
            for item in field:
                common = common.get_common_prefix(item)
            # Query once and reuse the result for both the emptiness check
            # and the iteration.
            matches = element.get_elements(common)
            if len(matches) > 0:
                # Loop-invariant, so computed once.
                # NOTE(review): the path is taken relative to field[0] itself;
                # relative to ``common`` may have been intended - confirm.
                relative = field[0].get_relative_path(field[0])
                value = [match.get_value(relative) for match in matches]
            else:
                value = None
        else:
            value = element.get_value(field)
        component.__dict__[key] = value
        if value is None:
            logging.info("End None")
            return None
    logging.info("End ok")
    return component
def parse_component(self, element, index):
    """Build a ``Component`` from ``element`` using the ``index``-th sample.

    ``self.samples[index]`` is the template and ``self.types[index]`` supplies
    ``get_attr``.  Every attribute in the sample's ``__dict__`` is handled by
    type:

    * ``str`` - copied to the component unchanged.
    * ``list`` - the extracted xpaths of all items are reduced with
      ``self.great_common_prefix``; the part after the block prefix is
      rebuilt with ``self.combine_xpath(..., True)`` and every node it matches
      contributes one ``get_attr`` value, read with the ``attr`` of the
      first item.  No matches yields ``None``.
    * anything else - its own xpath is made relative in the same way and read
      with ``get_attr`` using its ``attr``.

    Returns the component, or ``None`` as soon as any field resolves to
    ``None``.  An empty list field raises ``IndexError``.
    """
    sample = self.samples[index]
    extractor = self.types[index]
    # Only the length of the extracted block xpath is used (as a slice offset).
    skip = len(self.extract_xpath(self.block_xpath))

    component = Component()
    for key, field in sample.__dict__.items():
        if isinstance(field, str):
            value = field
        elif isinstance(field, list):
            shared = self.extract_xpath(field[0].xpath)
            for item in field:
                shared = self.great_common_prefix(
                    shared, self.extract_xpath(item.xpath))
            # Evaluate the query once and reuse the result for both the
            # emptiness check and the iteration.
            matches = element.xpath(self.combine_xpath(shared[skip:], True))
            if len(matches) > 0:
                value = [
                    extractor.get_attr(match, field[0].attr)
                    for match in matches
                ]
            else:
                value = None
        else:
            relative = self.extract_xpath(field.xpath)[skip:]
            value = extractor.get_attr(
                element.xpath(self.combine_xpath(relative, True)),
                field.attr)
        component.__dict__[key] = value
        if value is None:
            return None
    return component
def parse_image(self, element):
    """Build an IMAGE component from ``element``.

    Each field is obtained from ``self.get_from_page(element, query, kind)``
    with the fixed query / kind pairs listed below.

    Returns the populated ``Component``.
    """
    image = Component()
    image.type = "IMAGE"
    image.alignment = "LEFT"

    # (component attribute, query relative to ``element``, value kind)
    lookups = (
        ("page_url", "./a", "href"),
        ("view_url", "./cite", "title"),
        ("title", ".", "string"),
    )
    for field, query, kind in lookups:
        setattr(image, field, self.get_from_page(element, query, kind))
    return image
def parse_component(self, element):
    """Build a WIKI component from the JSON object ``element``.

    ``page_url``, ``title`` and ``snippet`` are the first results of the
    JSONPath expressions ``$.pageid``, ``$.title`` and ``$.snippet``.

    Returns the populated ``Component``.
    """
    component = Component()
    component.type = "WIKI"
    component.alignment = "JSON"

    expressions = (
        ("page_url", "$.pageid"),
        ("title", "$.title"),
        ("snippet", "$.snippet"),
    )
    for field, expression in expressions:
        found = jsonpath.jsonpath(element, expression)
        # NOTE(review): if ``jsonpath.jsonpath`` yields a non-subscriptable
        # value (e.g. False) when nothing matches, this [0] raises - confirm.
        setattr(component, field, found[0])
    return component
def parse_wizard_news(self, element):
    """Build a WIZARD / WIZARD_NEWS component from ``element``.

    Both ``page_url`` (``"href"``) and ``title`` (``"string"``) are read from
    ``element`` itself via ``self.get_from_page``.

    Returns the populated ``Component``.
    """
    news = Component()
    news.type = "WIZARD"
    news.wizard_type = "WIZARD_NEWS"
    news.alignment = "LEFT"

    fetch = self.get_from_page
    node_itself = "."
    news.page_url = fetch(element, node_itself, "href")
    news.title = fetch(element, node_itself, "string")
    return news
def parse_search_result(self, element):
    """Build a SEARCH_RESULT component from ``element``.

    Each field is obtained from ``self.get_from_page(element, query, kind)``
    with the fixed query / kind pairs listed below.

    Returns the populated ``Component``.
    """
    result = Component()
    result.type = "SEARCH_RESULT"
    result.alignment = "LEFT"

    # (component attribute, query relative to ``element``, value kind)
    lookups = (
        ("page_url", "./h3/a", "href"),
        ("title", "./h3/a", "string"),
        ("snippet", "./div/span", "strings"),
        ("view_url", "./div/div/cite", "string"),
    )
    for field, query, kind in lookups:
        setattr(result, field, self.get_from_page(element, query, kind))
    return result
def parse_adv(self, element):
    """Build an ADV component from ``element``.

    The link under ``./h3/a`` provides ``page_url`` (``"href"``) and
    ``title`` (``"string"``); ``snippet`` comes from ``./div[2]`` and
    ``view_url`` from ``./div/cite``, all through ``self.get_from_page``.

    Returns the populated ``Component``.
    """
    advert = Component()
    advert.type = "ADV"
    advert.alignment = "LEFT"

    fetch = self.get_from_page
    heading_link = "./h3/a"
    advert.page_url = fetch(element, heading_link, "href")
    advert.title = fetch(element, heading_link, "string")
    advert.snippet = fetch(element, "./div[2]", "string")
    advert.view_url = fetch(element, "./div/cite", "string")
    return advert
def parse_adv(self, element):
    """Build an ADV component from ``element``.

    Each field is obtained from ``self.get_from_page(element, query, kind)``
    with the fixed query / kind pairs listed below.

    Returns the populated ``Component``.
    """
    advert = Component()
    advert.type = "ADV"
    advert.alignment = "LEFT"

    # (component attribute, query relative to ``element``, value kind)
    lookups = (
        ("page_url", "./h2/a", "href"),
        ("title", "./h2/a", "string"),
        ("snippet",
         "./div[@class='organic__content-wrapper clearfix']/div[1]",
         "string"),
        ("view_url",
         "./div[@class='organic__subtitle typo typo_type_greenurl']/div[1]/a",
         "string"),
    )
    for field, query, kind in lookups:
        setattr(advert, field, self.get_from_page(element, query, kind))
    return advert
def parse_actor(self, element):
    """Build an ``Actor`` component from ``element``.

    The link under ``./div[2]/p/a`` provides ``page_url`` (``"href"``) and
    ``title`` (``"string"``); ``snippet`` comes from ``./div[2]/span[2]`` and
    ``image`` from the ``"src"`` of ``./p/a/img``, all through
    ``self.get_from_page``.

    Returns the populated ``Component``.
    """
    actor = Component()
    actor.type = "Actor"
    actor.alignment = "LEFT"

    fetch = self.get_from_page
    name_link = "./div[2]/p/a"
    actor.page_url = fetch(element, name_link, "href")
    actor.title = fetch(element, name_link, "string")
    actor.snippet = fetch(element, "./div[2]/span[2]", "string")
    actor.image = fetch(element, "./p/a/img", "src")
    return actor
def parse_wizard_image(self, element):
    """Build a WIZARD / WIZARD_IMAGE component from ``element``.

    ``media_links`` holds one ``self.get_from_page(img, ".", "src")`` value
    per node matched by ``./div/a/img``; ``page_url`` and ``title`` are read
    from ``./h3/a``.

    Returns the populated ``Component``.
    """
    gallery = Component()
    gallery.type = "WIZARD"
    gallery.wizard_type = "WIZARD_IMAGE"
    gallery.alignment = "LEFT"

    thumbnails = element.xpath("./div/a/img")
    gallery.media_links = [
        self.get_from_page(thumbnail, ".", "src") for thumbnail in thumbnails
    ]

    heading_link = "./h3/a"
    gallery.page_url = self.get_from_page(element, heading_link, "href")
    gallery.title = self.get_from_page(element, heading_link, "string")
    return gallery
def parse_evaluated_cinema(self, element):
    """Build an ``EvaluatedCinema`` component from ``element``.

    Each field is obtained from ``self.get_from_page(element, query, kind)``
    with the fixed query / kind pairs listed below.

    Returns the populated ``Component``.
    """
    film = Component()
    film.type = "EvaluatedCinema"
    film.alignment = "LEFT"

    # (component attribute, query relative to ``element``, value kind)
    lookups = (
        ("page_url", "./div[2]/p/a", "href"),
        ("title", "./div[2]/p/a", "string"),
        ("snippet", "./div[2]/span[2]", "string"),
        ("actors", "./div[2]/span[3]", "string"),
        ("year", "./div[2]/p/span", "string"),
        ("value", "./div[1]/div", "string"),
        ("image", "./p/a/img", "src"),
    )
    for field, query, kind in lookups:
        setattr(film, field, self.get_from_page(element, query, kind))
    return film
def get_substitution_component(self, tree, component):
    """Resolve one markup ``component`` against ``tree``.

    A new ``Component`` copies ``type`` and ``alignment`` and gets
    ``page_url`` / ``title`` from ``self.get_from_page(tree, ...)``.  It is
    then handed to the type-specific helper, whose return value replaces it:

    * ``"SEARCH_RESULT"`` - ``get_substitution_search_result``
    * ``"WIZARD"`` with ``wizard_type`` ``"WIZARD_IMAGE"`` -
      ``get_substitution_wizard_image``
    * ``"WIZARD"`` with ``wizard_type`` ``"WIZARD_NEWS"`` -
      ``get_substitution_wizard_news``

    Any other type (or wizard type) returns the base component as is.
    """
    base = Component()
    base.type = component.type
    base.alignment = component.alignment
    base.page_url = self.get_from_page(tree, component.page_url)
    base.title = self.get_from_page(tree, component.title)

    kind = component.type
    if kind == "SEARCH_RESULT":
        return self.get_substitution_search_result(tree, component, base)
    if kind != "WIZARD":
        return base

    # ``wizard_type`` is only read for WIZARD components.
    wizard_kind = component.wizard_type
    if wizard_kind == "WIZARD_IMAGE":
        return self.get_substitution_wizard_image(tree, component, base)
    if wizard_kind == "WIZARD_NEWS":
        return self.get_substitution_wizard_news(tree, component, base)
    return base
def get_substitution(self, tree, markup):
    """Resolve every component of ``markup`` against ``tree``.

    For each entry of ``markup.components`` a new ``Component`` is filled
    attribute by attribute: ``str`` values are copied unchanged, ``list``
    values become a list of ``tree.get_value(item)`` results, and any other
    value becomes ``tree.get_value(value)``.

    Returns a ``ParserResult`` to which each resolved component was added in
    order.
    """
    logging.info("Start")
    parser_result = ParserResult()
    for template in markup.components:
        resolved = Component()
        for key, field in template.__dict__.items():
            if isinstance(field, str):
                value = field
            elif isinstance(field, list):
                value = [tree.get_value(item) for item in field]
            else:
                value = tree.get_value(field)
            resolved.__dict__[key] = value
        parser_result.add(resolved)
    logging.info("End")
    return parser_result
def parse_wizard_news(self, element, block_xpath, sample):
    """Build a WIZARD / WIZARD_NEWS component from ``element`` using ``sample``.

    ``sample.page_url`` and ``sample.title`` each supply an ``xpath`` and an
    ``attr``.  The xpath is run through ``self.extract_xpath``, its first
    ``len(self.extract_xpath(block_xpath))`` items are dropped, the rest is
    rebuilt with ``self.combine_xpath(..., True)`` and evaluated on
    ``element``; ``self.get_attr`` then reads ``attr`` from the matches.

    Returns the populated ``Component``.
    """
    news = Component()
    news.type = "WIZARD"
    news.wizard_type = "WIZARD_NEWS"
    news.alignment = "LEFT"

    # Only the length of the extracted block xpath is needed below.
    skip = len(self.extract_xpath(block_xpath))

    def read(field):
        # NOTE(review): assumes every sample xpath starts with the block
        # xpath, so the slice leaves the part relative to ``element`` - confirm.
        relative = self.extract_xpath(field.xpath)[skip:]
        matches = element.xpath(self.combine_xpath(relative, True))
        return self.get_attr(matches, field.attr)

    news.page_url = read(sample.page_url)
    news.title = read(sample.title)
    return news