Exemplo n.º 1
0
 def parse_cinema(self, element):
     """Build a "Cinema" component from a single result element.

     Every field is read with ``self.get_from_page`` using a path
     expression relative to ``element``. Fields are filled in the order
     listed in ``fields`` and the populated component is returned.
     """
     # (attribute name, relative path, value selector passed to get_from_page)
     fields = (
         ("page_url", "./div[2]/p/a", "href"),
         ("title", "./div[2]/p/a", "string"),
         ("snippet", "./div[2]/span[2]", "string"),
         ("actors", "./div[2]/span[3]", "string"),
         ("year", "./div[2]/p/span", "string"),
         ("image", "./p/a/img", "src"),
     )

     result = Component()
     result.type = "Cinema"
     result.alignment = "LEFT"
     for name, path, selector in fields:
         setattr(result, name, self.get_from_page(element, path, selector))
     return result
Exemplo n.º 2
0
    def parse_wizard_image(self, element, block_xpath, sample):
        """Build a WIZARD / WIZARD_IMAGE component from ``element``.

        ``sample`` supplies, for every field, an object with ``xpath`` and
        ``attr``. Each sample xpath is split with ``self.extract_xpath``, the
        leading ``block_xpath`` steps are sliced off, and the remainder is
        re-assembled with ``self.combine_xpath`` and evaluated on ``element``.
        """
        result = Component()
        result.type = "WIZARD"
        result.wizard_type = "WIZARD_IMAGE"
        result.alignment = "LEFT"

        # Number of leading steps that belong to the enclosing block.
        prefix_len = len(self.extract_xpath(block_xpath))

        # Reduce all sample image paths with great_common_prefix, seeded with
        # the first one (the first link is therefore compared with itself).
        shared = self.extract_xpath(sample.media_links[0].xpath)
        for link in sample.media_links:
            shared = self.great_common_prefix(
                shared, self.extract_xpath(link.xpath))
        images_xpath = self.combine_xpath(shared[prefix_len:], True)

        result.media_links = [
            self.get_attr(node, sample.media_links[0].attr)
            for node in element.xpath(images_xpath)
        ]

        # Single-valued fields all follow the same lookup recipe.
        for name in ("page_url", "title"):
            spec = getattr(sample, name)
            relative = self.extract_xpath(spec.xpath)[prefix_len:]
            nodes = element.xpath(self.combine_xpath(relative, True))
            setattr(result, name, self.get_attr(nodes, spec.attr))
        return result
Exemplo n.º 3
0
    def parse_search_result(self, element, block_xpath, sample):
        """Build a SEARCH_RESULT component from ``element``.

        For each of ``page_url``, ``title``, ``snippet`` and ``view_url`` the
        matching entry on ``sample`` provides an ``xpath`` and an ``attr``.
        The xpath is split with ``self.extract_xpath``, the leading
        ``block_xpath`` steps are dropped, and the rest is evaluated on
        ``element``; ``self.get_attr`` then extracts the value.
        """
        result = Component()
        result.type = "SEARCH_RESULT"
        result.alignment = "LEFT"

        # Number of leading steps that belong to the enclosing block.
        prefix_len = len(self.extract_xpath(block_xpath))

        for name in ("page_url", "title", "snippet", "view_url"):
            spec = getattr(sample, name)
            relative = self.extract_xpath(spec.xpath)[prefix_len:]
            nodes = element.xpath(self.combine_xpath(relative, True))
            setattr(result, name, self.get_attr(nodes, spec.attr))
        return result
 def parse_adv(self, element):
     """Build an "ADV" component from a single result element.

     The link and title both come from the ``./h3/a`` node; the snippet and
     the displayed URL come from ``./div[2]`` and ``./div/cite``.
     """
     result = Component()
     result.type = "ADV"
     result.alignment = "LEFT"

     fetch = self.get_from_page
     heading_link = "./h3/a"
     result.page_url = fetch(element, heading_link, "href")
     result.title = fetch(element, heading_link, "string")
     result.snippet = fetch(element, "./div[2]", "string")
     result.view_url = fetch(element, "./div/cite", "string")
     return result
Exemplo n.º 5
0
 def parse_adv(self, element):
     """Build an "ADV" component from a single result element.

     Every field is read with ``self.get_from_page`` using a path
     expression relative to ``element``, in the order listed in ``fields``.
     """
     # (attribute name, relative path, value selector passed to get_from_page)
     fields = (
         ("page_url", "./h2/a", "href"),
         ("title", "./h2/a", "string"),
         ("snippet",
          "./div[@class='organic__content-wrapper clearfix']/div[1]",
          "string"),
         ("view_url",
          "./div[@class='organic__subtitle typo typo_type_greenurl']/div[1]/a",
          "string"),
     )

     result = Component()
     result.type = "ADV"
     result.alignment = "LEFT"
     for name, path, selector in fields:
         setattr(result, name, self.get_from_page(element, path, selector))
     return result
 def parse_search_result(self, element):
     """Build a "SEARCH_RESULT" component from a single result element.

     Every field is read with ``self.get_from_page`` using a path
     expression relative to ``element``, in the order listed in ``fields``.
     """
     # (attribute name, relative path, value selector passed to get_from_page)
     # NOTE(review): the snippet uses "strings" (plural) while every other
     # field uses "string" - confirm this is a distinct, intended mode.
     fields = (
         ("page_url", "./h3/a", "href"),
         ("title", "./h3/a", "string"),
         ("snippet", "./div/span", "strings"),
         ("view_url", "./div/div/cite", "string"),
     )

     result = Component()
     result.type = "SEARCH_RESULT"
     result.alignment = "LEFT"
     for name, path, selector in fields:
         setattr(result, name, self.get_from_page(element, path, selector))
     return result
Exemplo n.º 7
0
 def get_substitution(self, tree, markup):
     """Resolve every component of ``markup`` against ``tree``.

     For each entry in ``markup.components`` a fresh ``Component`` is filled
     attribute by attribute:

     * ``str`` values are copied unchanged;
     * ``list`` values become a list of ``tree.get_value(item)`` results;
     * anything else is replaced by ``tree.get_value(value)``.

     The resolved components are added to a new ``ParserResult``, which is
     returned. "Start"/"End" are logged at INFO level around the work.
     """
     logging.info("Start")
     result = ParserResult()
     for template in markup.components:
         resolved = Component()
         # Write straight into the instance dict, mirroring the template.
         target = vars(resolved)
         for name, field in vars(template).items():
             if isinstance(field, str):
                 target[name] = field
             elif isinstance(field, list):
                 target[name] = [tree.get_value(item) for item in field]
             else:
                 target[name] = tree.get_value(field)
         result.add(resolved)
     logging.info("End")
     return result
 def parse_wizard_news(self, element):
     """Build a WIZARD / WIZARD_NEWS component from ``element`` itself.

     Both the URL and the title are read from the element's own node
     (path ``"."``) through ``self.get_from_page``.
     """
     news = Component()
     news.type = "WIZARD"
     news.wizard_type = "WIZARD_NEWS"
     news.alignment = "LEFT"

     fetch = self.get_from_page
     own_node = "."
     news.page_url = fetch(element, own_node, "href")
     news.title = fetch(element, own_node, "string")
     return news
 def parse_image(self, element):
     """Build an "IMAGE" component from a single result element.

     The page URL comes from ``./a``, the displayed URL from the ``title``
     of ``./cite``, and the title from the element's own node.
     """
     result = Component()
     result.type = "IMAGE"
     result.alignment = "LEFT"

     fetch = self.get_from_page
     result.page_url = fetch(element, "./a", "href")
     result.view_url = fetch(element, "./cite", "title")
     result.title = fetch(element, ".", "string")
     return result
Exemplo n.º 10
0
 def parse_component(self, element):
     """Build a "WIKI" component from one JSON result object.

     Each field is the first match of a JSONPath query evaluated on
     ``element`` with ``jsonpath.jsonpath``.
     """
     # (attribute name, JSONPath query)
     queries = (
         ("page_url", "$.pageid"),
         ("title", "$.title"),
         ("snippet", "$.snippet"),
     )

     result = Component()
     result.type = "WIKI"
     result.alignment = "JSON"
     for name, query in queries:
         # NOTE(review): the [0] assumes the query always matches - confirm
         # what jsonpath.jsonpath returns when the key is absent.
         setattr(result, name, jsonpath.jsonpath(element, query)[0])
     return result
Exemplo n.º 11
0
 def parse_wizard_image(self, element):
     """Build a WIZARD / WIZARD_IMAGE component from ``element``.

     ``media_links`` holds the ``src`` of every ``./div/a/img`` node, in the
     order ``element.xpath`` returns them; the URL and title come from
     ``./h3/a``.
     """
     result = Component()
     result.type = "WIZARD"
     result.wizard_type = "WIZARD_IMAGE"
     result.alignment = "LEFT"

     fetch = self.get_from_page
     result.media_links = [
         fetch(node, ".", "src") for node in element.xpath("./div/a/img")
     ]
     result.page_url = fetch(element, "./h3/a", "href")
     result.title = fetch(element, "./h3/a", "string")
     return result
Exemplo n.º 12
0
    def parse_component(self, element, index):
        """Build a component from ``element`` using the sample at ``index``.

        ``self.samples[index]`` is the template and ``self.types[index]``
        provides ``get_attr``. Every attribute of the sample is handled by
        type:

        * ``str`` - copied unchanged;
        * ``list`` - the items' xpaths are reduced with
          ``self.great_common_prefix``, made relative to ``self.block_xpath``
          and evaluated on ``element``; the value is the list of extracted
          attributes, or ``None`` when nothing matched;
        * anything else - treated as one object with ``xpath`` and ``attr``,
          resolved relative to ``self.block_xpath``.

        Returns the populated ``Component``, or ``None`` as soon as any
        attribute resolves to ``None``.
        """
        sample = self.samples[index]
        extractor = self.types[index]
        # Number of leading steps that belong to the enclosing block.
        prefix_len = len(self.extract_xpath(self.block_xpath))

        component = Component()
        for key, field in sample.__dict__.items():
            if isinstance(field, str):
                value = field
            elif isinstance(field, list):
                # Seeded with the first item, so it is compared with itself.
                inner_xpath = self.extract_xpath(field[0].xpath)
                for item in field:
                    inner_xpath = self.great_common_prefix(
                        inner_xpath, self.extract_xpath(item.xpath))
                inner_xpath = self.combine_xpath(
                    inner_xpath[prefix_len:], True)

                # Evaluate the query once and reuse the result for both the
                # emptiness check and the extraction.
                matches = element.xpath(inner_xpath)
                value = None
                if len(matches) > 0:
                    value = [extractor.get_attr(node, field[0].attr)
                             for node in matches]
            else:
                key_xpath = self.extract_xpath(field.xpath)[prefix_len:]
                value = extractor.get_attr(
                    element.xpath(self.combine_xpath(key_xpath, True)),
                    field.attr)

            component.__dict__[key] = value
            if value is None:
                # A single unresolved field invalidates the whole component.
                return None

        return component
Exemplo n.º 13
0
    def parse_component(self, element, sample):
        """Build a component from ``element`` using ``sample`` as template.

        Every attribute of ``sample`` is handled by type:

        * ``str`` - copied unchanged;
        * ``list`` - the items are reduced with ``get_common_prefix``, the
          resulting path is looked up with ``element.get_elements`` and each
          hit contributes one value; ``None`` when nothing matched;
        * anything else - resolved with ``element.get_value``.

        Returns the populated ``Component``, or ``None`` as soon as any
        attribute resolves to ``None``. Progress is logged at INFO level.
        """
        logging.info("Start")

        component = Component()
        for key, field in sample.__dict__.items():
            if isinstance(field, str):
                value = field
            elif isinstance(field, list):
                # Seeded with the first item, so it is compared with itself.
                inner_treepath = field[0]
                for item in field:
                    inner_treepath = inner_treepath.get_common_prefix(item)

                # Look the elements up once and reuse the result.
                matches = element.get_elements(inner_treepath)
                value = None
                if len(matches) > 0:
                    # Loop-invariant, so computed once per field.
                    # NOTE(review): the path is taken relative to field[0]
                    # itself; possibly inner_treepath was intended - confirm.
                    relative_path = field[0].get_relative_path(field[0])
                    value = [node.get_value(relative_path)
                             for node in matches]
            else:
                value = element.get_value(field)

            component.__dict__[key] = value
            if value is None:
                # A single unresolved field invalidates the whole component.
                logging.info("End None")
                return None
        logging.info("End ok")
        return component
Exemplo n.º 14
0
    def parse_wizard_news(self, element, block_xpath, sample):
        """Build a WIZARD / WIZARD_NEWS component from ``element``.

        ``sample.page_url`` and ``sample.title`` each provide an ``xpath``
        and an ``attr``. The xpath is split with ``self.extract_xpath``, the
        leading ``block_xpath`` steps are dropped, and the rest is evaluated
        on ``element``; ``self.get_attr`` then extracts the value.
        """
        # Number of leading steps that belong to the enclosing block.
        prefix_len = len(self.extract_xpath(block_xpath))

        def resolve(spec):
            # Evaluate one sample entry relative to the current element.
            relative = self.extract_xpath(spec.xpath)[prefix_len:]
            nodes = element.xpath(self.combine_xpath(relative, True))
            return self.get_attr(nodes, spec.attr)

        news = Component()
        news.type = "WIZARD"
        news.wizard_type = "WIZARD_NEWS"
        news.alignment = "LEFT"
        news.page_url = resolve(sample.page_url)
        news.title = resolve(sample.title)
        return news
Exemplo n.º 15
0
 def get_substitution_component(self, tree, component):
     """Resolve one markup ``component`` against ``tree``.

     ``type`` and ``alignment`` are copied, ``page_url`` and ``title`` are
     resolved with ``self.get_from_page``. The partially filled result is
     then handed to the type-specific helper (search result, image wizard
     or news wizard), whose return value replaces it.
     """
     result = Component()
     result.type = component.type
     result.alignment = component.alignment
     result.page_url = self.get_from_page(tree, component.page_url)
     result.title = self.get_from_page(tree, component.title)

     if component.type == "SEARCH_RESULT":
         result = self.get_substitution_search_result(
             tree, component, result)

     # Only wizards carry a wizard_type worth inspecting.
     if component.type != "WIZARD":
         return result

     if component.wizard_type == "WIZARD_IMAGE":
         result = self.get_substitution_wizard_image(tree, component, result)
     if component.wizard_type == "WIZARD_NEWS":
         result = self.get_substitution_wizard_news(tree, component, result)
     return result
Exemplo n.º 16
0
 def __init__(self):
     """Run ``Component.__init__`` and then set this type's default fields.

     ``snippet``, ``type`` and ``image`` start as ``None``; ``alignment``
     starts as ``"LEFT"``.
     """
     # Base-class initialisation runs first so the defaults below win.
     Component.__init__(self)
     self.snippet = None
     self.type = None
     self.image = None
     self.alignment = "LEFT"
Exemplo n.º 17
0
 def __init__(self):
     """Run ``Component.__init__`` and then reset the common fields.

     ``type``, ``alignment``, ``page_url`` and ``title`` all start as
     ``None``.
     """
     # Base-class initialisation runs first so the defaults below win.
     Component.__init__(self)
     self.type = None
     self.alignment = None
     self.page_url = None
     self.title = None
Exemplo n.º 18
0
 def __init__(self):
     """Run ``Component.__init__`` and then set the image defaults.

     ``view_url`` starts as ``None``; ``type`` is fixed to ``"IMAGE"`` and
     ``alignment`` to ``"LEFT"``.
     """
     # Base-class initialisation runs first so the defaults below win.
     Component.__init__(self)
     self.view_url = None
     self.type = "IMAGE"
     self.alignment = "LEFT"