Exemplo n.º 1
0
 def __init__(self, url: str, data: str, content_type: str) -> None:
     """Set up a parser for one downloaded web page.

     Args:
         url: Address of the page; stored as ``self.url``.
         data: Page markup, handed to BeautifulSoup with the ``lxml`` parser.
         content_type: Not used in this method.
     """
     self.ignore_unparsed = False
     self.url = url
     # The soup is built before the base initializer and before
     # preparse()/get_images() run.
     self.soup = BeautifulSoup(data, "lxml")
     super().__init__()
     self.preparse()
     self.get_images()
     # Separate parser instance kept as ``text_parser``.
     self.text_parser = RecipeParser()
Exemplo n.º 2
0
 def __init__(self, url, data, content_type):
     """Set up a parser for one downloaded web page.

     ``url`` is stored as ``self.url`` and ``data`` is parsed into
     ``self.soup``.  ``content_type`` is not used in this method.
     """
     self.ignore_unparsed = False
     self.url = url
     # NOTE(review): module-style ``BeautifulSoup.BeautifulSoup`` with
     # ``convertEntities`` looks like the BeautifulSoup 3 API -- confirm
     # against the file's imports.
     self.soup = BeautifulSoup.BeautifulSoup(
         data,
         convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
     )
     InteractiveImporter.__init__(self)
     self.preparse()
     self.get_images()
     # Separate parser instance kept as ``text_parser``.
     self.text_parser = RecipeParser()
Exemplo n.º 3
0
 def __init__ (self, url, data, content_type):
     """Set up a parser for one downloaded web page.

     ``url`` is stored as ``self.url`` and ``data`` is parsed into
     ``self.soup``.  ``content_type`` is not used in this method.
     """
     self.ignore_unparsed = False
     self.url = url
     # NOTE(review): module-style ``BeautifulSoup.BeautifulSoup`` with
     # ``convertEntities`` looks like the BeautifulSoup 3 API -- confirm
     # against the file's imports.
     self.soup = BeautifulSoup.BeautifulSoup(data,
                                             convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
                                             )
     InteractiveImporter.__init__(self)
     self.preparse()
     self.get_images()
     # Separate parser instance kept as ``text_parser``.
     self.text_parser = RecipeParser()
Exemplo n.º 4
0
 def __init__(self,
              custom_parser=None,
              tags=DEFAULT_TAGS,
              tag_labels=DEFAULT_TAG_LABELS,
              modal=True,
              title=_('Import recipe')):
     """Create the importer window and its bookkeeping state.

     Args:
         custom_parser: Parser to use; a fresh ``RecipeParser`` is created
             when this is falsy.
         tags: Stored as ``self.tags``.
         tag_labels: Mapping of tag -> label shown to the user.
         modal: Stored as ``self.modal``; checked by ``do_run`` and
             ``commit_changes``.
         title: Window title.
     """
     self.title = title
     self.parser = custom_parser if custom_parser else RecipeParser()
     self.labels_by_tag = tag_labels
     # Reverse lookup (label -> tag).  The "new recipe" entry goes in first,
     # so a tag_labels entry carrying the same label overrides it.
     self.tags_by_label = {self.NEW_REC_TEXT: 'newrec'}
     self.tags_by_label.update(
         (label, tag) for tag, label in self.labels_by_tag.items())
     self.tags = tags
     # Build the widgets before the remaining state is initialised.
     self.setup_window()
     self.setup_action_area()
     self.markup_marks = {}
     self.markup_partners = {}
     self.anchors = []
     # ID counter for the markup marks we insert.
     self.midno = 0
     self.labelled = []
     self.label_counts = {}
     # If we're in an embedded gtk mainloop...
     self.modal = modal
     ConvenientImporter.__init__(self)
Exemplo n.º 5
0
class InteractiveImporter(ConvenientImporter, NotThreadSafe):
    """Importer driven by a Gtk window in which text is labelled by hand.

    Text is shown in a text view; ranges are labelled via buttons and the
    labelled ranges are turned into recipe data by ``commit_changes``.
    """

    # Label whose tag is 'newrec' (registered in ``__init__``).
    NEW_REC_TEXT = _('New Recipe')

    def __init__(self,
                 custom_parser=None,
                 tags=DEFAULT_TAGS,
                 tag_labels=DEFAULT_TAG_LABELS,
                 modal=True,
                 title=_('Import recipe')):
        """Create the importer window and its bookkeeping state.

        Args:
            custom_parser: Parser to use; a fresh ``RecipeParser`` is created
                when this is falsy.
            tags: Stored as ``self.tags``.
            tag_labels: Mapping of tag -> label shown to the user.
            modal: Stored as ``self.modal``; checked by ``do_run`` and
                ``commit_changes``.
            title: Window title.
        """
        self.title = title
        self.parser = custom_parser if custom_parser else RecipeParser()
        self.labels_by_tag = tag_labels
        # Reverse lookup (label -> tag).  The "new recipe" entry goes in
        # first, so a tag_labels entry carrying the same label overrides it.
        self.tags_by_label = {self.NEW_REC_TEXT: 'newrec'}
        self.tags_by_label.update(
            (label, tag) for tag, label in self.labels_by_tag.items())
        self.tags = tags
        # Build the widgets before the remaining state is initialised.
        self.setup_window()
        self.setup_action_area()
        self.markup_marks = {}
        self.markup_partners = {}
        self.anchors = []
        # ID counter for the markup marks we insert.
        self.midno = 0
        self.labelled = []
        self.label_counts = {}
        # If we're in an embedded gtk mainloop...
        self.modal = modal
        ConvenientImporter.__init__(self)

    def setup_window(self):
        """Build the top-level window: a scrolled text view plus action area.

        Sets ``self.w`` (window), ``self.hb`` (horizontal box), ``self.tv``
        (text view), ``self.tb`` (its buffer) and ``self.action_area``, then
        registers the text tags via ``setup_tags``.
        """
        # Imported inside the method rather than at module level.
        from gourmet.threadManager import get_thread_manager_gui
        import gourmet.GourmetRecipeManager
        # NOTE(review): ``tmg`` is not used below; the call is kept in case
        # it is needed for a side effect -- confirm.
        tmg = get_thread_manager_gui()
        self.w = Gtk.Window()
        self.w.set_title(self.title)
        # Make the importer window transient for the main application window.
        main_app = gourmet.GourmetRecipeManager.get_application()
        self.w.set_transient_for(main_app.window)
        self.w.set_destroy_with_parent(False)
        self.hb = Gtk.HBox()
        self.w.add(self.hb)
        self.tv = Gtk.TextView()
        self.tv.set_size_request(600, 500)
        self.tv.set_wrap_mode(Gtk.WrapMode.WORD)
        self.action_area = Gtk.VBox()
        # Text view scrolls vertically only.
        sw = Gtk.ScrolledWindow()
        sw.add(self.tv)
        sw.set_policy(Gtk.PolicyType.NEVER, Gtk.PolicyType.AUTOMATIC)
        self.hb.add(sw)
        sw.show()
        self.tv.show()
        # Action area is packed at the end of the horizontal box.
        self.hb.pack_end(self.action_area, expand=False, fill=False, padding=0)
        self.action_area.show()
        self.tb = self.tv.get_buffer()
        self.setup_tags()

    def setup_action_area(self):
        """Fill the action area with labelling buttons and the import button.

        The button grid follows ``UI_TAG_ORDER``: each entry is a section
        label plus rows of tags.  Every ordinary tag also gets a
        ``[label, tag]`` row in ``self.action_model``, which ``label_range``
        uses as the model of its combo boxes.
        """
        # Set up hard-coded functional buttons...
        self.new_recipe_button = Gtk.Button.new_with_mnemonic(_('_New Recipe'))
        self.new_recipe_button.connect('clicked', self.new_recipe_cb)
        self.remove_markup_button = Gtk.Button.new_with_mnemonic(
            _('Clear _Tags'))  # noqa
        self.remove_markup_button.connect('clicked', self.clear_tags)
        # Set up ActionModel (for drop-down menu version of these commands)
        self.action_model = Gtk.ListStore(str, str)
        action_table = Gtk.Table()
        self.action_area.pack_start(action_table,
                                    expand=False,
                                    fill=False,
                                    padding=0)
        # Get our UI layout from UI_TAG_ORDER
        r = 0  # row number
        for label, rows in UI_TAG_ORDER:
            # Every section except the first is preceded by a blank spacer row.
            if r != 0:
                blank = Gtk.Label(label='')
                action_table.attach(blank, 0, 2, r, r + 1)
                blank.show()
            r += 1
            # Bold section heading spanning both columns.
            glabel = Gtk.Label()
            glabel.set_markup('<b>' + label + '</b>')
            glabel.set_alignment(0.0, 0.5)
            action_table.attach(glabel, 0, 2, r, r + 1)
            glabel.show()
            r += 1
            for row in rows:
                for c, t in enumerate(row):  # column number, tag
                    # 'clear' and 'newrec' reuse the pre-built buttons; any
                    # other tag gets its own button wired to label_callback.
                    if t == 'clear':
                        tag_btn = self.remove_markup_button
                    elif t == 'newrec':
                        tag_btn = self.new_recipe_button
                    else:
                        tag_btn = Gtk.Button.new_with_mnemonic(
                            '_' + self.labels_by_tag[t])
                        self.action_model.append([self.labels_by_tag[t], t])
                        tag_btn.connect('clicked', self.label_callback,
                                        self.labels_by_tag[t])
                    action_table.attach(tag_btn,
                                        c,
                                        c + 1,
                                        r,
                                        r + 1,
                                        xpadding=12)
                r += 1

        action_table.set_margin_top(3)
        action_table.set_margin_bottom(3)
        action_table.set_margin_start(3)
        action_table.set_margin_end(3)

        # The import button commits whatever has been labelled so far.
        self.import_button = Gtk.Button(label=_('Import Recipe'))
        self.import_button.connect('clicked',
                                   lambda *args: self.commit_changes())

        self.action_area.pack_end(self.import_button,
                                  fill=False,
                                  expand=False,
                                  padding=0)
        self.action_area.show_all()

    def setup_tags(self):
        """Create the 'markup' and 'ignore' text tags and add them to the buffer."""
        self.markup_tag = Gtk.TextTag.new('markup')
        # For the meaning of the scale magic number see
        # https://developer.gnome.org/pango/stable/pango-Text-Attributes.html#PANGO-SCALE-XX-SMALL:CAPS  # noqa
        markup_properties = (
            ('editable', False),
            ('scale', 0.8333333333333),
            ('rise', 15),
            ('foreground', '#f00'),
        )
        for prop_name, prop_value in markup_properties:
            self.markup_tag.set_property(prop_name, prop_value)

        self.ignore_tag = Gtk.TextTag.new('ignore')
        for prop_name, prop_value in (('invisible', True), ('editable', False)):
            self.ignore_tag.set_property(prop_name, prop_value)

        for text_tag in (self.markup_tag, self.ignore_tag):
            self.tb.get_tag_table().add(text_tag)

    def label_callback(self, button, label):
        """'clicked' handler for tag buttons: label the current selection.

        ``button`` is the emitting widget and is not used; ``label`` is the
        user data passed at connect time.
        """
        self.label_selection(label)

    def label_selection(self, label: str):
        """Apply ``label`` to the selection, or to the cursor's line if none."""
        bounds = self.tb.get_selection_bounds()
        if bounds:
            first, last = bounds
        else:
            # No selection means no obviously sane default, so take the
            # whole line the insertion cursor is on.
            last = self.tb.get_iter_at_mark(self.tb.get_insert())
            last.backward_chars(last.get_line_offset())
            first = last.copy()
            last.forward_line()
        self.label_range(first, last, label)

    def insert_with_label(self, st, text, label):
        """Insert ``text`` at iter ``st`` and label the inserted range."""
        begin = st.get_offset()
        self.tb.insert(st, text)
        # Re-fetch iters by offset once the insertion is done.
        first = self.tb.get_iter_at_offset(begin)
        last = self.tb.get_iter_at_offset(begin + len(text))
        self.label_range(first, last, label)

    def unhide_area(self, midno):
        """Remove the 'ignore' tag from the range registered under ``midno``."""
        start_mark, end_mark = self.markup_marks[midno]
        first = self.tb.get_iter_at_mark(start_mark)
        last = self.tb.get_iter_at_mark(end_mark)
        self.tb.remove_tag(self.ignore_tag, first, last)

    def hide_range(self, st, end):
        """Apply the 'ignore' tag to the text between ``st`` and ``end``.

        Returns the markup id under which the range's marks are stored; pass
        it to ``unhide_area`` to reveal the text again.
        """
        range_id = self.midno
        self.midno = range_id + 1
        mark_at_start = Gtk.TextMark.new(f'start-markup-{range_id}', False)
        mark_at_end = Gtk.TextMark.new(f'end-markup-{range_id}', True)
        self.tb.apply_tag(self.ignore_tag, st, end)
        for mark, where in ((mark_at_start, st), (mark_at_end, end)):
            self.tb.add_mark(mark, where)
        self.markup_marks[range_id] = (mark_at_start, mark_at_end)
        return range_id

    def label_range(self, st, end, label):
        """Label the text between iters ``st`` and ``end`` with ``label``.

        A label whose tag is 'ignore' hides the range and inserts a button
        that reveals it again.  Any other label records a pair of marks in
        ``self.labelled`` and decorates the range with '[' / ']' markup text,
        a combo box for changing the label and a button for removing it.
        """
        if self.tags_by_label.get(label, '') == 'ignore':
            midno = self.hide_range(st, end)
            b = Gtk.Button(label='Ignored text: Reveal hidden text')
            anchor = self.insert_widget(end, b)

            def unhide_text(*args):
                # Reveal the text and drop the reveal button itself.
                self.unhide_area(midno)
                self.remove_widget(anchor)

            b.connect('clicked', unhide_text)
            b.show()
            return
        # ``count`` makes the mark names for this label unique.
        if label in self.label_counts:
            count = self.label_counts[label]
            self.label_counts[label] += 1
        else:
            self.label_counts[label] = 1
            count = 0
        # The mark name encodes the label; commit_changes reads it back.
        smark = Gtk.TextMark.new(f'{label}-{count}-start', True)
        emark = Gtk.TextMark.new(f'{label}-{count}-end', False)
        self.tb.add_mark(smark, st)
        self.tb.add_mark(emark, end)
        self.labelled.append((smark, emark))
        # Now we add the labels...
        start_txt = '['
        start_id = self.insert_markup_text(st, start_txt, self.markup_tag)
        # Now move the mark back up...
        new_pos = self.tb.get_iter_at_mark(smark)
        new_pos.forward_chars(len(start_txt))
        self.tb.move_mark(smark, new_pos)
        # Create a "Remove me" button
        b = Gtk.Button()
        img = Gtk.Image.new_from_icon_name(Gtk.STOCK_REMOVE, Gtk.IconSize.MENU)
        b.add(img)
        img.show()
        itr = self.tb.get_iter_at_mark(emark)
        anchor = self.insert_widget(itr, b)
        # Set up combo button...
        labelbutton = Gtk.ComboBoxText.new()
        labelbutton.set_model(self.action_model)
        # ``cb`` here is the module-level helper, not the parameter of
        # ``change_mark`` below.
        cb.cb_set_active_text(labelbutton, label)
        anchor2 = self.insert_widget(self.tb.get_iter_at_mark(smark),
                                     labelbutton)
        # Add final bracket for end of markup
        end_bracket_itr = self.tb.get_iter_at_mark(emark)
        end_id = self.insert_markup_text(end_bracket_itr, ']', self.markup_tag)
        self.markup_partners[start_id] = end_id
        self.markup_partners[end_id] = start_id
        # Now back up the end mark two characters (it got advanced by
        # adding the right bracket and the button)
        eitr = self.tb.get_iter_at_mark(emark)
        eitr.backward_chars(2)
        self.tb.move_mark(emark, eitr)

        # Define callback to remove our text when button is clicked
        def remove_markup(*args):
            self.labelled.remove((smark, emark))
            self.remove_markup_text(start_id)
            self.remove_markup_text(end_id)
            self.remove_widget(anchor)
            self.remove_widget(anchor2)

        def change_mark(cb):
            # copy marks for safekeeping...
            new_text = cb.get_active_text()
            sm = Gtk.TextMark.new(None, True)
            self.tb.add_mark(sm, self.tb.get_iter_at_mark(smark))
            em = Gtk.TextMark.new(None, False)
            self.tb.add_mark(em, self.tb.get_iter_at_mark(emark))
            # remove old marks...
            remove_markup()
            # And relabel!
            self.label_range(self.tb.get_iter_at_mark(sm),
                             self.tb.get_iter_at_mark(em), new_text)

        labelbutton.connect('changed', change_mark)
        b.connect('clicked', remove_markup)

    def new_recipe_cb(self, *args):
        """Insert a "new recipe" marker at the insertion cursor."""
        insert_mark = self.tb.get_insert()
        cursor = self.tb.get_iter_at_mark(insert_mark)
        # The same iter serves as both ends: the marker spans no text.
        self.label_range(cursor, cursor, self.NEW_REC_TEXT)

    def insert_markup_text(self, itr, text, *tags):
        """Insert ``text`` at ``itr`` and bracket it with a pair of marks.

        The marks are stored in ``self.markup_marks`` under the returned id
        so that ``remove_markup_text`` can delete the text later.
        """
        markup_id = self.midno
        self.midno = markup_id + 1
        mark_at_start = Gtk.TextMark.new(f'start-markup-{markup_id}', False)
        mark_at_end = Gtk.TextMark.new(f'end-markup-{markup_id}', True)
        # Remember where the text begins before the insertion.
        begin = itr.get_offset()
        if not tags:
            self.tb.insert(itr, text)
        else:
            self.tb.insert_with_tags(itr, text, *tags)
        self.tb.add_mark(mark_at_start, self.tb.get_iter_at_offset(begin))
        finish = begin + len(text)
        self.tb.add_mark(mark_at_end, self.tb.get_iter_at_offset(finish))
        self.markup_marks[markup_id] = (mark_at_start, mark_at_end)
        return markup_id

    def insert_widget(self, itr, widget):
        """Embed ``widget`` in the text at ``itr`` and return its anchor.

        The anchor is recorded in ``self.anchors`` and the character it
        occupies is tagged with the markup tag.
        """
        child_anchor = self.tb.create_child_anchor(itr)
        self.anchors.append(child_anchor)
        self.tv.add_child_at_anchor(widget, child_anchor)
        # The anchor occupies one character: tag from it to the next char.
        tag_from = self.tb.get_iter_at_child_anchor(child_anchor)
        tag_to = tag_from.copy()
        tag_to.forward_char()
        self.tb.apply_tag(self.markup_tag, tag_from, tag_to)
        widget.show()
        return child_anchor

    def remove_widget(self, anchor):
        """Delete the embedded widget at ``anchor`` from the buffer.

        The anchor is also dropped from ``self.anchors``.  ``insert_widget``
        records every anchor there, and ``clear_tags`` later asks the buffer
        for the position of each recorded anchor; leaving a removed anchor
        in the list would make it look up an anchor that no longer exists.

        Calling this for an anchor that is already gone from the buffer is
        a no-op.
        """
        if anchor in self.anchors:
            self.anchors.remove(anchor)
        if anchor.get_deleted():
            # Already removed from the buffer; there is no position to delete.
            return
        anchor_iter = self.tb.get_iter_at_child_anchor(anchor)
        # The anchor occupies exactly one character in the buffer.
        delete_to = anchor_iter.copy()
        delete_to.forward_char()
        self.tb.delete(anchor_iter, delete_to)

    def remove_markup_text(self, idno):
        """Delete the markup text registered under ``idno``."""
        start_mark, end_mark = self.markup_marks[idno]
        first = self.tb.get_iter_at_mark(start_mark)
        last = self.tb.get_iter_at_mark(end_mark)
        self.tb.delete(first, last)

    def clear_tags(self, *args):
        """Remove markup inside the selection, or in the whole buffer when
        nothing is selected.
        """
        bounds = self.tb.get_selection_bounds()
        if not bounds:
            bounds = self.tb.get_bounds()
        range_start, range_end = bounds
        low = range_start.get_offset()
        high = range_end.get_offset()

        def strictly_inside(offset):
            return high > offset > low

        def mark_inside(mark):
            return strictly_inside(self.tb.get_iter_at_mark(mark).get_offset())

        # Markup text (brackets): a hit on either mark removes the text and
        # its registered partner.
        for markup_id, (first_mark, last_mark) in list(
                self.markup_marks.items()):
            if mark_inside(first_mark) or mark_inside(last_mark):
                self.remove_markup_text(markup_id)
                if markup_id in self.markup_partners:
                    self.remove_markup_text(self.markup_partners[markup_id])
        # Labelled ranges: forget those touching the cleared range.
        for pair in self.labelled[:]:
            first_mark, last_mark = pair
            if mark_inside(first_mark) or mark_inside(last_mark):
                self.labelled.remove(pair)
        # Embedded widgets.
        for child_anchor in self.anchors[:]:
            position = self.tb.get_iter_at_child_anchor(child_anchor)
            if strictly_inside(position.get_offset()):
                self.anchors.remove(child_anchor)
                self.remove_widget(child_anchor)

    def commit_changes(self):
        """Turn the labelled ranges into recipe data and finish the import.

        Ranges are processed in buffer order.  Each range's label is read
        back from its start mark's name, mapped to a tag through
        ``self.tags_by_label`` and dispatched on that tag.  A 'newrec' range
        commits the recipe collected so far (if anything was collected) and
        starts a new one.  After all ranges are processed, an image chooser
        is offered per added recipe when ``self.images`` is non-empty, and a
        modal importer hides its window and quits the Gtk main loop.

        Does nothing when no range has been labelled.

        Raises:
            KeyError: if a label has no entry in ``self.tags_by_label``.
        """
        if not self.labelled:
            return
        self.labelled.sort(
            key=lambda x: self.tb.get_iter_at_mark(x[0]).get_offset())

        self.start_rec()
        started = False
        for smark, emark in self.labelled:
            siter = self.tb.get_iter_at_mark(smark)
            eiter = self.tb.get_iter_at_mark(emark)
            text = siter.get_text(eiter)
            # Mark names look like '<label>-<count>-start' (see label_range).
            # Split from the right so a label containing '-' stays intact.
            label = smark.get_name().rsplit('-', 2)[0]
            tag = self.tags_by_label[label]

            if tag == 'newrec':
                # Handled before the empty-text check: a marker made by
                # new_recipe_cb spans no text, so skipping empty ranges
                # first would silently drop every recipe boundary.
                if started:
                    # Commit the old recipe and start a new one.
                    self.commit_rec()
                    started = False
                    self.start_rec()
                continue

            if not text:
                continue

            if tag in gglobals.TEXT_ATTR_DIC:
                self.add_text(tag, text)
                started = True
            elif tag in gglobals.REC_ATTR_DIC:
                self.add_attribute(tag, text)
            elif tag == 'ingredient':
                self.add_ing_from_text(text)
                started = True
            elif tag == 'ingredients':
                self.add_ings_from_text(text)
                started = True
            elif tag == 'inggroup':
                self.add_ing_group(text)
                started = True
            elif tag == 'ignore':
                continue
            elif tag == 'servings':
                self.add_attribute('yields', text)
                self.add_attribute('yield_unit', 'servings')
            else:
                print('UNKNOWN TAG', tag, text, label)
        if started:
            self.commit_rec()

        # ``images`` is only set by some subclasses, hence the hasattr check.
        if hasattr(self, 'images') and self.images:
            for rec in self.added_recs:
                browser = ImageBrowser(self.w, self.images)
                response = browser.run()
                if response == Gtk.ResponseType.OK:
                    thumb = browser.image.copy()
                    thumb.thumbnail((40, 40))
                    self.rd.modify_rec(
                        rec, {
                            'image': image_to_bytes(browser.image),
                            'thumb': image_to_bytes(thumb)
                        })
                browser.destroy()

        if self.modal:
            self.w.hide()
            Gtk.main_quit()

    def set_text(self, txt):
        """Parse plain text with ``self.parser`` and load the result."""
        # Coerce to str, then collapse runs of blank lines into one.
        collapsed = re.sub(r'(\n\s*\n)+', '\n\n', str(txt))
        self.set_parsed(self.parser.parse(collapsed))

    def set_parsed(self, parsed):
        """Append parsed ``(chunk, tag)`` pairs to the end of the buffer.

        A chunk whose tag is ``None`` is inserted as plain text.  Any other
        chunk is inserted and labelled with the tag's label from
        ``self.labels_by_tag``, falling back to the tag itself when no label
        is registered.
        """
        for chunk, tag in parsed:
            end_iter = self.tb.get_end_iter()
            if tag is None:
                self.tb.insert(end_iter, chunk)
            else:
                self.insert_with_label(end_iter, chunk,
                                       self.labels_by_tag.get(tag, tag))

    def do_run(self):
        """Show the window; when modal, also run a Gtk main loop."""
        self.w.show_all()
        if not self.modal:
            # Closing the window just hides it.
            self.w.connect('delete-event', lambda *args: self.w.hide())
            return
        # Closing the window quits the nested main loop started below.
        self.w.connect('delete-event', Gtk.main_quit)
        Gtk.main()
Exemplo n.º 6
0
class WebParser(InteractiveImporter):
    """Interactive importer that extracts labelled text from a web page.

    The class attributes below steer ``format_tag_whitespace`` and
    ``crawl``.
    """

    # Tags that get a newline before and after their text.
    BREAK_AROUND = [
        'p', 'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'p',
        'blockquote', 'title', 'div', 'section', 'header', 'footer', 'nav'
    ]
    # Tags that produce a newline before them.
    IS_BREAK = ['br']
    # Tags that start on a new line, indented by one TAB for every
    # enclosing parent of the listed types.
    NESTED = {
        'tr': ['table'],
        'li': ['ol', 'ul'],
        'dd': ['dl'],
    }
    # Tags preceded by one TAB.
    TAB_BEFORE = ['td', 'dt']
    # Tags skipped entirely while crawling.
    IGNORE = ['script', 'meta', 'select', 'link', 'img', 'style']
    TAB = '  '
    # Labels whose consecutive text nodes are collected into one chunk.
    JOINABLE = [
        'instructions', 'notes', 'recipe', 'ignore', 'ingredients', 'include',
        None
    ]
    # Node classes skipped entirely while crawling.
    INVISIBLE_TYPES = [CData, Comment, Declaration, ProcessingInstruction]

    # When true, parse_webpage() runs postparse() on the crawled result.
    do_postparse = True
    # This could be a list of compiled regexps which would be used to
    # search image URL strings for potential ads, etc.
    imageexcluders = None

    def __init__(self, url, data, content_type):
        """Set up a parser for one downloaded web page.

        ``url`` is stored as ``self.url`` and ``data`` is parsed into
        ``self.soup``.  ``content_type`` is not used in this method.
        """
        self.ignore_unparsed = False
        self.url = url
        # NOTE(review): module-style ``BeautifulSoup.BeautifulSoup`` with
        # ``convertEntities`` looks like the BeautifulSoup 3 API, while
        # INVISIBLE_TYPES on this class uses bare names -- confirm against
        # the file's imports.
        self.soup = BeautifulSoup.BeautifulSoup(
            data,
            convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
        )
        InteractiveImporter.__init__(self)
        self.preparse()
        self.get_images()
        # Separate parser instance used by postparse().
        self.text_parser = RecipeParser()

    def commit_rec(self):
        """Commit the current recipe, defaulting its link to the page URL."""
        existing_link = self.rec.get('link', '')
        if not existing_link:
            self.rec['link'] = self.url
        # Calls the base Importer implementation explicitly.
        gourmet.importers.importer.Importer.commit_rec(self)

    def preparse(self):
        """Reset ``self.preparsed_elements`` to an empty list.

        ``identify_match`` and ``parse_webpage`` read the list as
        ``(element, label)`` pairs.
        """
        self.preparsed_elements = []

    def identify_match(self, tag):
        """Return the label of the first preparsed element equal to ``tag``.

        Returns ``None`` when no preparsed element matches.
        """
        matches = (label for element, label in self.preparsed_elements
                   if tag == element)
        return next(matches, None)

    def get_images(self):
        """Collect absolute URLs of the page's images into ``self.images``.

        ``<img>`` elements without a ``src`` attribute are skipped.  Each
        ``src`` is resolved against ``self.url``.  When ``imageexcluders``
        is set, a URL is dropped if any excluder's ``search`` matches it.
        """
        # urllib.basejoin does not exist on Python 3; urljoin is the
        # equivalent in urllib.parse.
        from urllib.parse import urljoin

        self.images = []
        for img in self.soup('img'):
            try:
                src = img['src']
            except KeyError:
                continue
            img_url = urljoin(self.url, src)
            if self.imageexcluders and any(
                    exc.search(img_url) for exc in self.imageexcluders):
                continue
            self.images.append(img_url)

    def parse(self, tag=None):
        """Crawl ``tag`` (the whole soup when falsy) into labelled chunks.

        Returns ``self.parsed``, a list of ``(text, label)`` pairs.
        """
        root = tag if tag else self.soup
        self.parsed, self.buffer, self.last_label = [], '', None
        self.crawl(root)
        # Flush whatever text is still pending after the crawl.
        if self.buffer:
            self.add_buffer_to_parsed()
        return self.parsed

    def crawl(self, tag, parent_label=None):
        """Walk ``tag`` recursively, adding its text to ``self.buffer``.

        A node's label comes from ``identify_match``; otherwise it inherits
        ``parent_label``, or becomes 'ignore' when ``ignore_unparsed`` is
        set.  Returns the node's label, or ``None`` for skipped nodes.
        """
        whitespace = self.format_tag_whitespace(tag)
        if whitespace == -1:
            # The formatting method's way of saying "skip this node"
            # (scripts and what-not).
            return
        leading, trailing = whitespace
        self.buffer += leading

        label = self.identify_match(tag)
        if not label:
            if parent_label:
                # inherit...
                label = parent_label
            elif self.ignore_unparsed:
                label = 'ignore'

        if hasattr(tag, 'contents') and tag.contents:
            for node in tag.contents:
                self.crawl(node, label)
        else:
            # Leaf: flush the buffer when the label changes or the previous
            # label may not be joined with following text.
            label_changed = label != self.last_label
            if label_changed or self.last_label not in self.JOINABLE:
                if self.buffer:
                    self.add_buffer_to_parsed()
                self.last_label = label
            if hasattr(tag, 'string'):
                self.buffer += self.reduce_whitespace(tag.string or '')

        if trailing:
            self.buffer += trailing
        return label

    def reduce_whitespace(self, s):
        """Return ``s`` with every run of whitespace replaced by one space.

        The previous hand-rolled cache never hit: inside the class the
        attribute name is mangled, while the ``hasattr`` check used the
        unmangled string, so the pattern was recompiled on every call.
        ``re`` keeps its own cache of compiled patterns.
        """
        return re.sub(r'\s+', ' ', s)

    def cut_extra_whitespace(self, s):
        """Return ``s`` keeping only its last two newlines.

        Strings with two or fewer newlines are returned unchanged.
        """
        newline_count = s.count('\n')
        if newline_count <= 2:
            return s
        # Drop the leading surplus newlines.
        return s.replace('\n', '', newline_count - 2)

    def add_buffer_to_parsed(self):
        """Move the buffered text into ``self.parsed`` under ``last_label``.

        Trailing whitespace stays in ``self.buffer`` (trimmed to at most two
        newlines) for the next chunk.  Leading whitespace is appended as its
        own unlabelled chunk.  Does nothing when the buffer holds only
        whitespace.
        """
        if not self.buffer.strip():
            return
        tws = 0  # number of trailing whitespace characters
        while tws + 1 < len(self.buffer) and self.buffer[-(tws + 1)].isspace():
            tws += 1
        if tws:
            to_add = self.buffer[:-tws]
            self.buffer = self.cut_extra_whitespace(self.buffer[-tws:])
        else:
            to_add = self.buffer
            self.buffer = ''
        lws = 0  # number of leading whitespace characters
        while lws + 1 < len(to_add) and to_add[lws].isspace():
            lws += 1
        if lws:
            # The leading white space is added separately with no label.
            self.parsed.append((self.cut_extra_whitespace(to_add[:lws]), None))
            to_add = to_add[lws:]
        # Do extra substitution of MS Characters -- shouldn't be necessary...
        for char, tup in BeautifulSoup.UnicodeDammit.MS_CHARS.items():
            # ``to_add`` is a str, so the key must be a str as well:
            # searching a str for bytes raises TypeError on Python 3.
            if isinstance(char, bytes):
                char = char.decode('iso-8859-1')
            if char not in to_add:
                continue
            if isinstance(tup, tuple):
                try:
                    substitute = chr(int(tup[1], 16))
                except ValueError:
                    print("ValueError caught in add_buffer_to_parsed")
                    continue
            elif isinstance(tup, str):
                # NOTE(review): presumably entries without a code point
                # carry the replacement text directly -- confirm against
                # the installed BeautifulSoup.
                substitute = tup
            else:
                continue
            to_add = to_add.replace(char, substitute)
        self.parsed.append((to_add, self.last_label))

    def format_tag_whitespace(self, tag):
        '''Return the ``(before, after)`` whitespace that ``tag`` requires.

        Returns -1 instead when the tag should not be considered for text.
        '''
        if isinstance(tag, tuple(self.INVISIBLE_TYPES)):
            return -1
        if not hasattr(tag, 'name'):
            return '', ''
        name = tag.name
        if name in self.IGNORE:
            return -1
        if name in self.IS_BREAK:
            return '\n', ''
        if name in self.NESTED:
            # Indent one TAB per enclosing parent of the listed types.
            depth = sum(
                len(tag.fetchParents(kind)) for kind in self.NESTED[name])
            return '\n' + self.TAB * depth, ''
        if name in self.TAB_BEFORE:
            return self.TAB, ''
        if name in self.BREAK_AROUND:
            return '\n', '\n'
        return '', ''

    def postparse(self, parsed):
        '''Do purely text-based parsing of content.

        Runs of blank lines in every chunk are collapsed to one.  Chunks
        that are unlabelled (``None``) or labelled 'recipe' are handed to
        ``self.text_parser`` and replaced by what it returns; all other
        chunks are kept with their label.  Returns the new list of
        ``(text, label)`` pairs.
        '''
        new_parse = []
        for text, attr in parsed:
            text = re.sub(r'(\n\s*\n)+', '\n\n', text)  # Take out extra newlines
            if attr is None or attr == 'recipe':
                new_parse.extend(self.text_parser.parse(text))
            else:
                new_parse.append((text, attr))
        return new_parse

    def parse_webpage(self):
        """Preparse and crawl the page; return its ``(text, label)`` chunks.

        When any preparsed element is labelled 'include', everything not
        matched is treated as ignored.  The result goes through
        ``postparse`` when ``do_postparse`` is true.
        """
        self.preparse()
        labels = [element[1] for element in self.preparsed_elements]
        if 'include' in labels:
            self.ignore_unparsed = True
        chunks = self.parse()
        return self.postparse(chunks) if self.do_postparse else chunks

    def do_run(self):
        """Parse the page, load the chunks into the buffer, show the window."""
        self.set_parsed(self.parse_webpage())
        return InteractiveImporter.do_run(self)
Exemplo n.º 7
0
class WebParser (InteractiveImporter):
    """Interactive importer that extracts labelled text from a web page.

    The class attributes below steer ``format_tag_whitespace`` and
    ``crawl``.
    """

    # Tags that get a newline before and after their text.
    BREAK_AROUND = ['p','title','h1','h2','h3','h4','h5','h6',
                    'table','p','blockquote','title','div','section','header','footer','nav']
    # Tags that produce a newline before them.
    IS_BREAK = ['br']
    # Tags that start on a new line, indented by one TAB for every
    # enclosing parent of the listed types.
    NESTED = {'tr':['table'],
              'li':['ol','ul'],
              'dd':['dl'],
              }
    # Tags preceded by one TAB.
    TAB_BEFORE = ['td','dt']
    # Tags skipped entirely while crawling.
    IGNORE = ['script','meta','select','link','img','style']
    TAB = '  '
    # Labels whose consecutive text nodes are collected into one chunk.
    JOINABLE = ['instructions','notes','recipe','ignore','ingredients','include',None]
    # Node classes skipped entirely while crawling.
    INVISIBLE_TYPES = [
        BeautifulSoup.CData,
        BeautifulSoup.Comment,
        BeautifulSoup.Declaration,
        BeautifulSoup.ProcessingInstruction]

    # When true, parse_webpage() runs postparse() on the crawled result.
    do_postparse = True
    # This could be a list of compiled regexps which would be used to
    # search image URL strings for potential ads, etc.
    imageexcluders = None
    def __init__ (self, url, data, content_type):
        """Set up a parser for one downloaded web page.

        ``url`` is stored as ``self.url`` and ``data`` is parsed into
        ``self.soup``.  ``content_type`` is not used in this method.
        """
        self.ignore_unparsed = False
        self.url = url
        # The soup is built before the base initializer and before
        # preparse()/get_images() run.
        self.soup = BeautifulSoup.BeautifulSoup(data,
                                                convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
                                                )
        InteractiveImporter.__init__(self)
        self.preparse()
        self.get_images()
        # Separate parser instance used by postparse().
        self.text_parser = RecipeParser()

    def commit_rec (self):
        """Commit the current recipe, defaulting its link to the page URL."""
        existing_link = self.rec.get('link', '')
        if not existing_link:
            self.rec['link'] = self.url
        # Calls the base Importer implementation explicitly.
        gourmet.importers.importer.Importer.commit_rec(self)

    def preparse (self):
        """Reset ``self.preparsed_elements`` to an empty list.

        ``identify_match`` and ``parse_webpage`` read the list as
        ``(element, label)`` pairs.
        """
        self.preparsed_elements = []

    def identify_match (self, tag):
        """Return the label of the first preparsed element equal to ``tag``.

        Returns ``None`` when no preparsed element matches.
        """
        matches = (label for element, label in self.preparsed_elements
                   if tag == element)
        return next(matches, None)

    def get_images (self):
        """Collect absolute URLs of the page's images into ``self.images``.

        ``<img>`` elements without a ``src`` attribute are skipped.  Each
        ``src`` is resolved against ``self.url``.  When ``imageexcluders``
        is set, a URL is dropped if any excluder's ``search`` matches it.
        """
        # urllib.basejoin exists only on Python 2, where it is an alias of
        # urlparse.urljoin; import urljoin portably instead.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        self.images = []
        for img in self.soup('img'):
            try:
                src = img['src']
            except KeyError:
                continue
            img_url = urljoin(self.url, src)
            if self.imageexcluders and any(
                    exc.search(img_url) for exc in self.imageexcluders):
                continue
            self.images.append(img_url)
        
    def parse (self, tag=None):
        """Crawl ``tag`` (the whole soup when falsy) into labelled chunks.

        Returns ``self.parsed``, a list of ``(text, label)`` pairs.
        """
        root = tag if tag else self.soup
        self.parsed, self.buffer, self.last_label = [], '', None
        self.crawl(root)
        # Flush whatever text is still pending after the crawl.
        if self.buffer:
            self.add_buffer_to_parsed()
        return self.parsed
    
    def crawl (self, tag, parent_label=None):
        """Walk ``tag`` recursively, adding its text to ``self.buffer``.

        A node's label comes from ``identify_match``; otherwise it inherits
        ``parent_label``, or becomes 'ignore' when ``ignore_unparsed`` is
        set.  Returns the node's label, or ``None`` for skipped nodes.
        """
        whitespace = self.format_tag_whitespace(tag)
        if whitespace == -1:
            # The formatting method's way of saying "skip this node"
            # (scripts and what-not).
            return
        leading, trailing = whitespace
        self.buffer += leading

        label = self.identify_match(tag)
        if not label:
            if parent_label:
                # inherit...
                label = parent_label
            elif self.ignore_unparsed:
                label = 'ignore'

        if hasattr(tag, 'contents') and tag.contents:
            for node in tag.contents:
                self.crawl(node, label)
        else:
            # Leaf: flush the buffer when the label changes or the previous
            # label may not be joined with following text.
            label_changed = label != self.last_label
            if label_changed or self.last_label not in self.JOINABLE:
                if self.buffer:
                    self.add_buffer_to_parsed()
                self.last_label = label
            if hasattr(tag, 'string'):
                self.buffer += self.reduce_whitespace(tag.string or '')

        if trailing:
            self.buffer += trailing
        return label

    def reduce_whitespace (self, s):
        """Return ``s`` with every run of whitespace replaced by one space.

        The previous hand-rolled cache never hit: inside the class the
        attribute name is mangled, while the ``hasattr`` check used the
        unmangled string, so the pattern was recompiled on every call.
        ``re`` keeps its own cache of compiled patterns.  The pattern is
        now a raw string, so ``\\s`` is no longer an invalid string escape.
        """
        return re.sub(r'\s+', ' ', s)

    def cut_extra_whitespace (self, s):
        """Return ``s`` keeping only its last two newlines.

        Strings with two or fewer newlines are returned unchanged.
        """
        newline_count = s.count('\n')
        if newline_count <= 2:
            return s
        # Drop the leading surplus newlines.
        return s.replace('\n', '', newline_count - 2)
    
    def add_buffer_to_parsed (self):
        """Move the buffered text into ``self.parsed`` under ``last_label``.

        Trailing whitespace stays in ``self.buffer`` (trimmed to at most two
        newlines) for the next chunk.  Leading whitespace is appended as its
        own unlabelled chunk.  Does nothing when the buffer holds only
        whitespace.
        """
        if not self.buffer.strip(): return
        tws = 0 #tws = # of trailing whitespace characters
        while tws+1 < len(self.buffer) and self.buffer[-(tws+1)].isspace():
            tws += 1
        if not tws:
            to_add = self.buffer
            self.buffer = ''
        else:
            # Split off the trailing whitespace and keep it buffered.
            to_add = self.buffer[:-tws]
            self.buffer = self.buffer[-tws:]
            self.buffer = self.cut_extra_whitespace(self.buffer)
        lws = 0 # number of leading whitespace characters
        while lws+1 < len(to_add) and to_add[lws].isspace():
            lws += 1
        if lws:
            # In this case, we're going to add the white space separately with no label...
            pre_add = to_add[:lws]
            pre_add = self.cut_extra_whitespace(pre_add)
            to_add = to_add[lws:]
            self.parsed.append((pre_add,None))
        # Do extra substitution of MS Characters -- shouldn't be necessary...
        # NOTE(review): this loop relies on Python 2 names (``unichr``,
        # ``long``) and byte-string keys; ``tup[1]`` assumes every entry is
        # an (entity, hex code point) pair -- confirm against the installed
        # BeautifulSoup.
        for char,tup in BeautifulSoup.UnicodeDammit.MS_CHARS.items():
            char = char.decode('iso-8859-1').encode('utf-8')
            if to_add.find(char) >= 0:
                try:
                    to_add = to_add.replace(char,unichr(long(tup[1],16)))
                except ValueError:
                    print("ValueError caught in add_buffer_to_parsed")
        self.parsed.append((to_add,self.last_label))

    def format_tag_whitespace (self, tag):
        '''Return the ``(before, after)`` whitespace that ``tag`` requires.

        Returns -1 instead when the tag should not be considered for text.
        '''
        if isinstance(tag, tuple(self.INVISIBLE_TYPES)):
            return -1
        if not hasattr(tag, 'name'):
            return '', ''
        name = tag.name
        if name in self.IGNORE:
            return -1
        if name in self.IS_BREAK:
            return '\n', ''
        if name in self.NESTED:
            # Indent one TAB per enclosing parent of the listed types.
            depth = sum(
                len(tag.fetchParents(kind)) for kind in self.NESTED[name])
            return '\n' + self.TAB * depth, ''
        if name in self.TAB_BEFORE:
            return self.TAB, ''
        if name in self.BREAK_AROUND:
            return '\n', '\n'
        return '', ''

    def postparse (self, parsed):
        '''Do purely text-based parsing of content.

        Runs of blank lines in every chunk are collapsed to one.  Chunks
        that are unlabelled (``None``) or labelled 'recipe' are handed to
        ``self.text_parser`` and replaced by what it returns; all other
        chunks are kept with their label.  Returns the new list of
        ``(text, label)`` pairs.
        '''
        new_parse = []
        for text, attr in parsed:
            # Raw string: the former plain literal contained the invalid
            # string escape "\s".
            text = re.sub(r'(\n\s*\n)+', '\n\n', text) # Take out extra newlines
            if attr is None or attr == 'recipe':
                new_parse.extend(self.text_parser.parse(text))
            else:
                new_parse.append((text, attr))
        return new_parse

    def parse_webpage (self):
        """Preparse and crawl the page; return its ``(text, label)`` chunks.

        When any preparsed element is labelled 'include', everything not
        matched is treated as ignored.  The result goes through
        ``postparse`` when ``do_postparse`` is true.
        """
        self.preparse()
        labels = [element[1] for element in self.preparsed_elements]
        if 'include' in labels:
            self.ignore_unparsed = True
        chunks = self.parse()
        return self.postparse(chunks) if self.do_postparse else chunks

    def do_run (self):
        """Parse the page, load the chunks into the buffer, show the window."""
        self.set_parsed(self.parse_webpage())
        return InteractiveImporter.do_run(self)