def __init__(self, verbose=False, debug=False):
    """Set up the crawler, sequence wrapper and helper objects used to
    extract deliverable records from project pages.

    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    """
    self.__dbg__ = debug
    # NOTE(review): attribute name looks like a typo of "verbose"; kept
    # because other code may read it under this name.
    self.__verbos = verbose
    self._crawler = Crawler()
    # Spoof a regular desktop browser so sites serve their normal HTML.
    self._crawler.set_headers((
        ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    ))
    # Wrapper that turns a fetched page into an element tree of entries.
    self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
    # Anchor captions that are link labels, not real document titles.
    self._unwanted_titles = ['Download here', 'PDF format']
    self._records = []  # resulting publication records
    ################################
    # manual processing
    self.agent = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # tags ignored when building tag sequences
    self._omitted_tags = ('br', 'img', 'html', 'body')
    # tag tolerance
    self.tagtol = 1
def __init__(self, url, verbose=False, debug=False, addkeyw=None):
    """Prepare the deliverable-page search starting from *url*.

    :param url: base URL of the project site to search
    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    :param addkeyw: optional extra keyword regex appended to the search list
    :raises ValueError: if *verbose* is not a boolean
    """
    # Keyword regexes used to rank anchors that may lead to a documents page.
    self._sigwords = [
        "d((eliverables?)|[0-9])",
        "documents?",
        "reports?",
        "public(ation)?s?",
        "results?",
        "presentations?",
        "library",
        #"projects?",
        "outcomes?",  # FIX: was "outocomes?" — the typo could never match "outcomes"
        "downloads?",
        "outputs?",
    ]
    if addkeyw is not None:  # FIX: identity comparison for None
        self._sigwords.append(addkeyw)
    # Associative array containing links with their flags:
    #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
    # index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1
    self._link_stack = {url: [0, 0, 0]}
    self.base_url = url  # save base (input) url
    # Open a parsing agent to get needed data from pages.
    self.agent = GetHTMLAndParse()
    self._current_url = url
    # A constant used to set rank in order of importance of the expression
    # being tested (self._sigwords).
    self.rank_const = len(self._sigwords)
    # A few constants naming the positions in the _link_stack value lists.
    self.IND_FR = 0  # index/noindex/frame/special
    self.VISIT = 1   # unvisited/visited
    self.RANK = 2    # value of rank
    # set verbose flag
    self.__verbose__ = verbose
    # set debug flag
    self.__dbg__ = debug
    # checking data types (FIX: isinstance instead of type(...) == bool)
    if not isinstance(self.__verbose__, bool):
        raise ValueError("Verbose flag has to be boolean.")
def __init__(self, verbose=False, debug=False):
    """Set up the crawler, sequence wrapper and helper objects used to
    extract deliverable records from project pages.

    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    """
    self.__dbg__ = debug
    # NOTE(review): attribute name looks like a typo of "verbose"; kept
    # because other code may read it under this name.
    self.__verbos = verbose
    self._crawler = Crawler()
    # Spoof a regular desktop browser so sites serve their normal HTML.
    self._crawler.set_headers(
        (
            (
                "User-Agent",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19",
            ),
            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        )
    )
    # Wrapper that turns a fetched page into an element tree of entries.
    self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
    # Anchor captions that are link labels, not real document titles.
    self._unwanted_titles = ["Download here", "PDF format"]
    self._records = []  # resulting publication records
    ################################
    # manual processing
    self.agent = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # tags ignored when building tag sequences
    self._omitted_tags = ("br", "img", "html", "body")
    # tag tolerance
    self.tagtol = 1
def __init__(self, verbose=False, debug=False):
    """Wire up the helpers needed to pull deliverable records out of
    HTML tables.

    :param verbose: forwarded to the debugger helper
    :param debug: forwarded to the debugger helper
    """
    # Debug/verbose output is delegated to a dedicated debugger object.
    self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
    self.__verbose = self.debugger.verbose
    self.__debug = self.debugger.debug
    # Tag-sequence matching configuration: ignored tags and tolerance.
    self._omitted_tags = ('br', 'img', 'html', 'body')
    self.tagtol = 1
    # HTML fetching/parsing, region detection and text normalization helpers.
    self.htmlHandler = GetHTMLAndParse()
    self.regionHandler = GetDeliverableRegion()
    self.formatter = TextFormatUtils()
    # Words that may appear in a deliverable table's header row.
    self.table_sem_words = [
        'deliverable', 'description', 'name', 'date',
        'dissemination', 'no.', 'wp', 'delivery',
        'particip', 'title', 'nature',
    ]
def __init__(self, url, verbose=False, debug=False, addkeyw=None):
    """Prepare the deliverable-page search starting from *url*.

    :param url: base URL of the project site to search
    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    :param addkeyw: optional extra keyword regex appended to the search list
    :raises ValueError: if *verbose* is not a boolean
    """
    # Keyword regexes used to rank anchors that may lead to a documents page.
    self._sigwords = [
        "d((eliverables?)|[0-9])",
        "documents?",
        "reports?",
        "public(ation)?s?",
        "results?",
        "presentations?",
        "library",
        # "projects?",
        "outocomes?",  # NOTE(review): likely a typo of "outcomes?" — never matches
        "downloads?",
        "outputs?",
    ]
    if addkeyw != None:
        self._sigwords.append(addkeyw)
    """
    Associative array containing links with their flags
    { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
    index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1
    """
    self._link_stack = {url: [0, 0, 0]}
    self.base_url = url  # save base (input) url
    # Open a parsing agent to get needed data from pages.
    self.agent = GetHTMLAndParse()
    self._current_url = url
    # a constant used to set rank in order of importance of the expression
    # being tested (self._sigwords)
    self.rank_const = len(self._sigwords)
    # a few constants for the dictionary - positions in the value lists
    self.IND_FR = 0  # index/noindex/frame/special
    self.VISIT = 1  # unvisited/visited
    self.RANK = 2  # value of rank
    # set verbose flag
    self.__verbose__ = verbose
    # set debug flag
    self.__dbg__ = debug
    # checking data types
    if not type(self.__verbose__) == bool:
        raise ValueError("Verbose flag has to be boolean.")
def __init__(self, verbose=False, debug=False):
    """Wire up the helpers needed to pull deliverable records out of
    HTML tables.

    :param verbose: forwarded to the debugger helper
    :param debug: forwarded to the debugger helper
    """
    # init agent for parsing html
    self.htmlHandler = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # list of acceptable words in title (header) of table
    self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                            'dissemination', 'no.', 'wp', 'delivery',
                            'particip', 'title', 'nature']
    # tags ignored when building tag sequences
    self._omitted_tags = ('br', 'img', 'html', 'body')
    # tag tolerance
    self.tagtol = 1
    # verbose and debug flags are delegated to the debugger helper
    self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
    self.__verbose = self.debugger.verbose
    self.__debug = self.debugger.debug
def __init__(self, options=opt, url=None):
    """Assemble the extraction pipeline: page search, crawling and
    record extraction.

    :param options: options object providing url/verbose/debug/regexp
        attributes. NOTE: the default ``opt`` is a module-level object
        captured once at definition time; kept for backward compatibility.
    :param url: explicit start URL; overrides ``options.url`` when given
    """
    # get options
    self.opt = options
    # FIX: identity comparison for None; conditional expression for clarity
    self.opt_url = url if url is not None else self.opt.url
    # initialize main html handler and parser
    self.htmlhandler = GetHTMLAndParse()
    # searching deliverable page
    self.pagesearch = GetDelivPage(self.opt_url,
                                   verbose=self.opt.verbose,
                                   debug=self.opt.debug,
                                   addkeyw=self.opt.regexp)
    # extracting informations from page
    self.recordhandler = GetDelivRecords(debug=self.opt.debug)
class GetDelivRecords:
    """Extracts deliverable records (title, description, document link) from
    project web pages and stores them as RRSPublication objects.

    Pages are first run through HTMLSequenceWrapper; if that yields too few
    records, a manual fallback (_manual_processing) analyses the data region
    directly, either as a table or as a repeating tag sequence.

    NOTE(review): this class mixes Python 2 idioms (`basestring`,
    `d.keys()[0]`, `len(filter(...))`, `StringIO.StringIO`, `has_key`-era
    style) with Python 3 `print(...)` calls — it will not run unmodified on
    Python 3. `s` is presumably the `string` module imported as `s` —
    confirm against the file header.
    """

    def __init__(self, verbose=False, debug=False):
        """Set up crawler, sequence wrapper and helper objects.

        :param verbose: print progress messages when True
        :param debug: print debug messages when True
        """
        self.__dbg__ = debug
        # NOTE(review): attribute name looks like a typo of "verbose".
        self.__verbos = verbose
        self._crawler = Crawler()
        # Spoof a regular desktop browser so sites serve their normal HTML.
        self._crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
        # Anchor captions that are link labels, not real document titles.
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []  # resulting RRSPublication objects
        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

    def __debug(self, msg):
        """Print *msg* when the debug flag is set; tolerate undecodable text."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        """Print *msg* when the verbose flag is set.

        NOTE(review): `self.__verbose` here is this bound method itself, not
        the flag stored in __init__ (that one is `self.__verbos`), so the
        comparison with True is always False and verbose output is dead code.
        """
        _err = "cannot decode debug info."
        if self.__verbose == True:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ######################## Processing sequencewrapper output #################

    def _make_deliv_record(self, entry):
        """Try to build a deliverable record from one sequence-wrapper entry.

        Harvests all texts and wanted-MIME links from the entry, then tries
        text-based recognition first, link-based recognition second.
        Returns True if a record was recognized.
        """
        text = []
        links = []
        # harvest links and text from entry
        for e in entry.iter():
            if e.text != None:
                text.append(e.text)
            if e.attrib.get("link") != None:
                if self.agent.is_wanted_mime(e.attrib.get(
                        "link")) and e.attrib.get("link") not in links:
                    links.append(e.attrib.get("link"))
        res = self._deliv_in_text(text, links)
        if type(res) == RRSPublication:
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True
        elif type(res) == list:
            # a list result signals "probably more records in one entry"
            res = self._more_entry_in_record(entry)
            # NOTE(review): _more_entry_in_record returns None, so this
            # comparison is never True; records are still collected into
            # self._entriesFoundInLinks as a side effect.
            if (res == True):
                self.__debug("")
                return True
            else:
                return False
        res = self._deliv_in_link(text, links, entry)
        if type(res) == RRSPublication:
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for a deliverable keyword in the entry's texts.

        :param text: list of text fragments from the entry
        :param links: a link string or list of candidate document links
        :returns: RRSPublication on success, False when no title found,
            or a ['-3', ...] error list when the entry probably holds
            several records.
        """
        _title = False
        _description = ""
        # Title pattern: the word DELIVERABLE(S) or a D<number>(.<number>) id.
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        # loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t
            # set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        _link = False
        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # if there was already a link found: identical up to
                        # the file extension means the same document in two
                        # formats, otherwise it is probably several records.
                        # NOTE(review): `s.rfind(_link, '.')` is the old
                        # string-module API — confirm `s` is the string module.
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']
        # create object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for a deliverable keyword in the entry's links.

        :param text: list of text fragments (used for title/description)
        :param links: list of candidate links
        :param entry: optional lxml entry, used to repair a bad title
        :returns: RRSPublication on success, False otherwise, or a
            ['-3', ...] error list when several keyword links are present.
        """
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            # set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # if chosen title is not valid try to find better in parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
        # create object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _check_title(self, title, tolerance=10):
        """Check whether *title* is just an unwanted caption (with some
        length tolerance). Return True when the title is acceptable."""
        for t in self._unwanted_titles:
            if (s.find(s.lower(title), s.lower(t))) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Look for the element with the highest 'visibility' rank in the
        entry's parent and use its text as the title; False if none found."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                if i.attrib.get('visibility') > visibility:
                    visibility = i.attrib.get('visibility')
                    title = i.text
            except AttributeError:
                pass
        if title != "":
            return title
        else:
            return False

    def _more_entry_in_record(self, entry):
        """Split one entry into several deliverables: every chunk with text
        and a wanted-MIME link becomes its own record (appended to
        self._entriesFoundInLinks). Returns None."""
        for ch in entry.iter('chunk'):
            if ch.text != None and ch.attrib.get("link") != None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)

    def process_pages(self, pages):
        """Process pages defined by *pages* (list of URLs): crawl them, run
        the sequence wrapper and collect deliverable records. Falls back to
        manual processing when fewer than 4 records are found either way."""
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # creates RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Fallback used when the sequence wrapper produced no usable
        records; re-analyses the pages' data regions directly."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        :param elem: lxml element
        :param string: when True, return one space-joined string;
            otherwise return the list of fragments
        """
        texts = []
        for child in elem.iter():
            # isinstance check skips comments/PIs whose .tag is not a string
            # NOTE(review): `basestring` is Python-2-only.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Get the first document link from a table row, or None."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(anchor_link):  # check if it is a file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Treat self.parentetree as a table: walk its rows and try to
        recognize a deliverable record in each row that has a document link."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink == None:
                continue
            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del (row_list)
        return

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Return True if *tag* is an anchor whose href leads to a
        wanted-MIME document."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter useless tags: False for omitted tags, True otherwise."""
        if tag.tag in self._omitted_tags:
            return False
        # if tag.text:
        #     if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #         return False
        return True

    def _getdiff(self, reg, tol):
        """Get the typical tag distance between consecutive document anchors.

        :param reg: element-tree region to scan
        :param tol: accepted tolerance of tags
        :returns: the most frequent distance, or -1 when the distances vary
            beyond the tolerance

        NOTE(review): `d.keys()[0]` and `len(difflist)` on a keys() view are
        Python-2-only; the bare `except` around the counter update also hides
        unrelated errors.
        """
        d = {}
        index = 0
        # fill the dictionary with differences and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differences: the variety must not exceed the tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Fallback: only document anchors, no optional information."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of lists because of XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main tag-sequence recognizer: derive the repeating tag pattern
        between document anchors and split the region into per-record tag
        lists. Returns a list of records (each a list of lxml tags).

        NOTE(review): `regionlist = filter(...)` is indexed and len()-ed
        below, which requires Python 2 (filter returns a list there).
        """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. " +
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between
        # anchors - this could be the tag-sequence describing all records.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                # slide the window until it contains an anchor
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter, [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - the region may start with garbage
        self.begin = False
        # counts unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm: walk the html and try to pass irregular
            # tags in the sequence
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it is probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate up to 10 separators (tags between record boxes)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of " +
                                   "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Return True if the tag sequence *rec* contains a document anchor."""
        for _atr in rec:
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get this element's own links and texts (no descendants).
        Returns [links, texts]."""
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                texts.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts/links out of each tag record and try to recognize a
        deliverable in it (text first, link second)."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = (self._get_tag_content(tag))
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for tag sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the data region of each page in *links*; used
        when the sequence wrapper found nothing.

        :param links: list of page URLs
        :param baseurl: network location of the first page
        """
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:  # error tuple returned
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links absolute in parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # get the charset. We dont have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag if the region is a whole tree
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is a table -> call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ############# PUBLIC METHODS TO GET RESULTS

    def get_deliverables_XML(self):
        """Return information about deliverables stored in objects as XML.

        NOTE(review): `StringIO.StringIO` is the Python 2 module; on
        Python 3 this would be `io.StringIO`.
        """
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the collected record objects, or an error when none."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
def __init__(self):
    """Create the helpers this object needs: an HTML parsing agent and a
    text normalizer."""
    # text normalizer (encoding, whitespace cleanup)
    self.formatter = TextFormatUtils()
    # agent used to fetch and parse HTML pages
    self.agent = GetHTMLAndParse()
class GetDeliverableRegion:
    """Locates the region of an HTML page where deliverable documents are
    listed, by finding the common parent of all document anchors and then
    narrowing it around a "deliverables" heading.

    NOTE(review): contains Python-2-only constructs (`basestring`, bare
    `print` statement) — confirm the target interpreter.
    """

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()
        # format text
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get the data region of the page at *url*.

        :param url: page to fetch and analyse
        :param base: base URL used to resolve/select document anchors
        :param tolerance: tag tolerance forwarded to the parent search
        :returns: an lxml ElementTree/element with the region, or an error
            tuple from derrno on failure
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # NOTE(review): `errmsg` is not defined anywhere in view — this
            # path raises NameError instead of returning an error tuple.
            return derrno.__err__(errmsg)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            # single anchor: that element alone is the region
            return lxml.etree.ElementTree(deliv_elements[0])
        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")
        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # the parent tag itself is the region
            return lxml.etree.ElementTree(parent_element)
        else:
            # NOTE(review): bare `print` — a Python 2 print statement that
            # emits a blank line; on Python 3 it is a no-op expression.
            print
            # lxml.etree.ElementTree(region).write(sys.stdout, pretty_print=True)
            return region

    def _get_common_parent(self, elem_list, tolerance):
        """Find the common parent of all elements in *elem_list* by building
        each element's ancestor vector and comparing them level by level.
        A tolerance of n tags shrinks the region when unrelated PDFs appear
        in other regions of the page.
        """
        # supporting method - kind of bigger lambda: minimal inner length
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # check the elements in the list; if (nearly) all are the same
        # element, it is a common parent level - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
                if len(tol_list) > tol + 1:
                    return False
            # if only two anchors found then we have only two tags
            # and it is pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequent tag in the list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = []  # ancestor vectors - list of lists
        # NOTE(review): `for self.elem in ...` stores the loop variable on
        # the instance; it works but a local variable would be cleaner.
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent()
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in
        # $vectors. Zip the vector list to get sequences of parent tags
        # sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors] for i in range(1, _minlength(vectors) + 1)]
        # Check all levels in `zipped`; the last level before the vectors
        # diverge contains the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequent element in the last common level
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        :param elem: lxml element
        :param string: when True, return one space-joined string
        """
        texts = []
        for child in elem.iter():
            # NOTE(review): `basestring` is Python-2-only.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Narrow *parent_tag* down to the deliverable region.

        :returns: 0 when the whole parent_tag is the region (or on failure),
            otherwise an lxml element parsed from the harvested HTML.
        """
        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # _reg_atr: [0] = tag name or attribute kind, [1] = attribute value,
        # [2] = the matched lxml element (the "Deliverables" heading)
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False  # True while looping over the region's content
        # get headers first
        headers = []
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            # find the <hN> heading that says "deliverables"
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tags in parent_tag, copying content between the
            # "deliverables" heading and the next heading of the same level
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                       tag.tag == _reg_atr[0]:
                        # "deliverables" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                         tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        # if we dont have headers, try to find another kind of title element
        # saying "Deliverables" and compare by its class/id/style attribute.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break
            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                       tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverables" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                         tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create an element from the harvested region HTML; fall back to the
        # forgiving HTML parser when it is not well-formed XML
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDelivPage:
    """Crawl a project web site and locate the page that most likely
    lists the project deliverables.

    Links are kept in ``self._link_stack`` as
    ``{url: [index_flag, visited_flag, rank]}`` and ranked by keyword
    matches in anchors, titles and document links; the highest-ranked
    URL wins.
    """

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        """Set up the crawl starting at *url*.

        :param url: base URL of the project site.
        :param verbose: print progress messages when True.
        :param debug: print debug messages when True.
        :param addkeyw: optional extra keyword regex for page search.
        :raises ValueError: when *verbose* is not a bool.
        """
        # keyword regexes used for document page search, most
        # significant first ("deliverable" variants)
        self._sigwords = ["d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",
                          "presentations?",
                          "library",
                          #"projects?",
                          "outocomes?",
                          "downloads?",
                          "outputs?"]
        if addkeyw is not None:
            self._sigwords.append(addkeyw)
        # Associative array containing links with their flags:
        #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        # index = 0, noindex = 1, frame = 2 (3 = broken/special),
        # unvisited = 0, visited = 1
        self._link_stack = {url: [0, 0, 0]}
        self.base_url = url  # save base (input) url
        # parsing agent to get needed data from pages
        self.agent = GetHTMLAndParse()
        self._current_url = url
        # constant used to scale rank by the importance of the matched
        # expression in self._sigwords
        self.rank_const = len(self._sigwords)
        # named indexes into the _link_stack value lists
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1   # unvisited/visited
        self.RANK = 2    # value of rank
        self.__verbose__ = verbose
        self.__dbg__ = debug
        # checking data types (original only validates the verbose flag)
        if not type(self.__verbose__) == bool:
            raise ValueError("Verbose flag has to be boolean.")

    def __verbose(self, msg):
        """Print *msg* when verbose mode is on; tolerate bad encodings."""
        _err = "cannot decode verbose message."
        if self.__verbose__ == True:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err)

    def __debug(self, msg):
        """Print *msg* when debug mode is on; tolerate bad encodings."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    ############################################################################

    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        """Initialize *link* in the stack (default: noindex/unvisited/0).

        Existing entries are left untouched.
        """
        # FIX: dict.has_key() is Python-2-only; use the `in` operator.
        if link not in self._link_stack:
            self._link_stack[link] = [index, visit, rank]
        return

    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        """Edit flags of an existing item in self._link_stack.

        ``rank=0`` resets the rank; any other rank is added to the
        current value.
        """
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank is not None:
            if rank == 0:
                # null rank when zero is the argument
                self._link_stack[link][self.RANK] = 0
            else:
                # add rank
                self._link_stack[link][self.RANK] += rank
        return

    def _level_job(self, index=None):
        """One level of the cascade: search anchors for one keyword.

        :param index: index into self._sigwords, or None to take all
            links on the page.
        :returns: number of new links added to the stack.
        """
        result = 0
        if index is not None:
            # search with one keyword regex
            link_list = self.agent.get_all_links(
                regul=re.compile(self._sigwords[index], re.I),
                base=self._current_url)
        else:
            link_list = self.agent.get_all_links(base=self._current_url)
            index = self.rank_const
        if link_list:
            #
            # RANK giving & filter
            #
            # NOTE(review): index can no longer be None here (set above),
            # so rank = 0 is dead; kept for fidelity with the original.
            if index is None:
                rank = 0
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
            for link in link_list:
                # skip javascript: and mailto: pseudo-links
                if not link or "javascript:" in link or "mailto:" in link:
                    continue
                if "#" in link:
                    # strip fragment identifier
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:
                    continue
                if self._link_stack.get(link):
                    # known link: rank the referrer only when the link is
                    # still unvisited
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue
                split_link = re.sub("https?://.+?/", "", link)
                # check whether it is a file or not
                if self.agent.is_wanted_mime(link):
                    #
                    # some PDF or DOC found
                    #
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to " + self._current_url)
                    # deliverable-looking file name => big rank boost
                    if re.search(r"de?l?(iverable)?[0-9]+([\._-][0-9])?",
                                 split_link, re.I):
                        self.__debug("Type D on " + self._current_url)
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    continue
                self.__debug("UNWATED")
                #
                # add the new page link
                #
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                self.__debug("ADD " + link[7:60])
                self.__debug("Rank " + str(rank) + " " + self._current_url)
        return result

    def _cascade_search(self):
        """Cascade search over keywords, stopping at the first hit level.

        May improve the speed of the script.
        """
        result = 0
        # first cascade - look for links containing "deliverables"
        result += self._level_job(0)
        if not result == 0:
            return
        # second cascade - look for links containing documents/reports
        result += self._level_job(1)
        result += self._level_job(2)
        if not result == 0:
            return
        # last cascade - all the rest
        for i in range(3, self.rank_const):
            result += self._level_job(i)
        # check intro page (all links) only on the index page
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job()
        return

    def _repair_links(self, base=None):
        """Try to repair links; for now only append '/' to the base."""
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base=base)
        # compare each link with the base url
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # if match, save it as a special case (index 3)
            self._link_item_init__(link, index=3)

    def _check_intro(self):
        """Check an intro page (no content, only an Enter label)."""
        links = self.agent.get_all_links(base=self._current_url)
        self.__debug("We've found intro links: " + str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            # save new link as a normal page
            self._link_item_init__(link, index=1)

    def _check_frames(self):
        """Look for frames on the page; returns frame count or None."""
        frames = self.agent.look_for_frame(base=self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        self.__debug("We've found frames (" + str(fcount) + ") on "
                     + self._current_url)
        # save each same-domain frame target as a frame page (index 2)
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
                self._link_item_init__(link, index=2)
        return fcount

    def _check_titles(self):
        """Check headers for keywords and rank the current page."""
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile(self._sigwords[i], re.I))
            if not hcount == 0:
                if i == 0:
                    # "deliverable" match => the highest rank (x4)
                    self.__debug("deliverable match"
                                 + str(self.rank_const * 4) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const * 4)
                else:
                    # other word match => plain rank
                    self.__debug("Rank " + str(self.rank_const - i) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const - i)

    def _check_anchor(self):
        """Check the anchor text/title of the current link.

        :returns: 1 when it matches "deliverables", 0 otherwise.
        """
        # tt is Text and Title
        tt = self.agent.get_anchor_from_link(self._current_url)
        if tt == 0:
            return tt
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched " + self._current_url)
            return 1
        # FIX: original fell off the end returning None; 0 is the
        # documented "no match" value (both are falsy to callers).
        return 0

    def _check_unvisited_links(self):
        """Return the list of still-unvisited links."""
        unvisitedLinks = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0:
                unvisitedLinks.append(link)
        return unvisitedLinks

    def _handle_unvis_links(self):
        """Apply all checks to unvisited links - next search level.

        This is the main private method; only it decides the end of the
        search. Returns None when nothing is left to visit, else 1.
        """
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None  # end of searching
        for link in unvisLinks:
            # visit and parse the page
            self._link_item_edit(link, visit=1)
            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err) + " " + str(link))
                # if link is broken (IND_FR == 3) do not try to repair
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # little hack with the error slot: no error but a new URL!
            if res == 2:
                self.base_url = err  # URL of the new base
            self.__debug("Getting url in ghap(): " + str(link))
            self.__verbose("Searching... URL: " + str(link))
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname
            # frame check
            self._check_frames()
            # titles check - rank giving here
            self._check_titles()
            # anchor check - rank giving here too
            if self._check_anchor():
                self._link_item_edit(link, rank=10)
            # search for next links on this page
            self._cascade_search()
        return 1

    def _get_highest_ranks_link(self):
        """Return the link with the highest rank (called at the end)."""
        hRank = 0
        hLink = ""
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink  # WINNER

    def _get_deliv_link_list(self, first_link):
        """Follow pager links from *first_link* and collect all pages
        with deliverables (pages usually look like: next pages 1 2 3 4).
        """
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current \
                    or "mailto:" in current:
                # FIX: narrowed bare except to IndexError (empty pop)
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            if self.agent.ghap(current)[0] == -1:
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current)  # append only one link
            try:
                current = nonvisited.pop()
            except IndexError:
                break
        return final_list  # all pages with deliverables

    def get_deliverable_page(self):
        """Return a list with the deliverable-page link, or an error.

        The only public method in the module.
        """
        # main searching loop: while there are unvisited links, search
        while self._handle_unvis_links():
            # security case - do not crawl forever
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: " + str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)
        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#' * 79)
        self.__debug("DELIVERABLE PAGE: " + final_link)
        # NOTE(review): the original continued with an unreachable call
        # to _get_deliv_link_list() marked "not in use"; removed.
        return [final_link]
class GetDelivRecords:
    """Extract deliverable records (title, description, document link)
    from deliverable pages, via HTMLSequenceWrapper output or, failing
    that, by manual region/table/tag-sequence processing.
    """

    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        # FIX: the flag was stored as `self.__verbos` but __verbose()
        # tested `self.__verbose` (the bound method, always truthy-typed
        # but compared == True, i.e. always False) - verbose output was
        # never printed. One consistent name now.
        self.__verbose__ = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0,
                                           mintextlen=20)
        # titles that carry no information about the deliverable itself
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []
        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get the region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance for sequence matching
        self.tagtol = 1

    def __debug(self, msg):
        """Print *msg* when debug mode is on; tolerate bad encodings."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        """Print *msg* when verbose mode is on; tolerate bad encodings."""
        # FIX: message said "debug info" and the flag test compared the
        # method itself (see __init__); both corrected.
        _err = "cannot decode verbose message."
        if self.__verbose__ == True:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ###################### Processing sequencewrapper output ###################

    def _make_deliv_record(self, entry):
        """Try to build a deliverable record from one wrapper entry.

        :returns: True when a record was created, False otherwise.
        """
        text = []
        links = []
        # harvest links and text from the entry
        for e in entry.iter():
            if e.text is not None:
                text.append(e.text)
            if e.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(e.attrib.get("link")) \
                        and e.attrib.get("link") not in links:
                    links.append(e.attrib.get("link"))
        res = self._deliv_in_text(text, links)
        if isinstance(res, RRSPublication):
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True
        elif isinstance(res, list):
            # error list => possibly several records in one entry
            res = self._more_entry_in_record(entry)
            if res == True:
                self.__debug("")
                return True
            else:
                return False
        res = self._deliv_in_link(text, links, entry)
        if isinstance(res, RRSPublication):
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for a deliverable keyword in the entry's texts.

        :returns: an RRSPublication on success, False when the entry is
            not a deliverable, or an error list when the entry probably
            holds several records.
        """
        _title = False
        _description = ""
        pattern = re.compile(
            r"(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        # first keyword hit is the title; longest string the description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        _link = False
        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # a second document link: same basename means the
                        # same document in another format, else bail out.
                        # FIX: string-module s.rfind() (py2-only) replaced
                        # with the str method.
                        if _link[:_link.rfind('.')] == l[:l.rfind('.')]:
                            break
                        else:
                            return ['-3',
                                    'Probably more records in one entry']
        # create the object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for a deliverable keyword in the entry's links.

        :returns: an RRSPublication on success, False when the entry is
            not a deliverable, or an error list when the entry probably
            holds several records.
        """
        _title = False
        _description = ""
        pattern = re.compile(
            r"(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # first reasonably long text is the title; longest the description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # if the chosen title is not valid, look in the parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _check_title(self, title, tolerance=10):
        """Return True unless *title* is only an unwanted string
        (within *tolerance* extra characters).
        """
        # FIX: string-module s.find()/s.lower() (py2-only) replaced with
        # str methods.
        for t in self._unwanted_titles:
            if title.lower().find(t.lower()) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Return the text of the most visible element in the entry's
        parent, or False when none is found."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                # FIX: 'visibility' is an attribute string; comparing it
                # to an int raises TypeError on py3 (uncaught by the
                # original except AttributeError). Coerce to float.
                vis = float(i.attrib.get('visibility'))
                if vis > visibility:
                    visibility = vis
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass
        if title != "":
            return title
        else:
            return False

    def _more_entry_in_record(self, entry):
        """Create deliverables from each document chunk of one entry.

        NOTE(review): appends to self._entriesFoundInLinks and returns
        None, so the caller's `res == True` check never fires - kept
        as-is to preserve behavior.
        """
        for ch in entry.iter('chunk'):
            if ch.text is not None and ch.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)

    def process_pages(self, pages):
        """Process the pages given by *pages* (list of URLs) and fill
        self._records with RRSPublication objects."""
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # create RRSPublication objects describing the deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Fallback used when the sequencewrapper output had no records."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks

    ######################### TABLE HANDLING METHODS ###########################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        When *string* is True the texts are joined with spaces.
        """
        texts = []
        for child in elem.iter():
            # FIX: basestring is py2-only; comment/PI nodes have non-str
            # tags and are skipped by the isinstance test.
            if child.text and isinstance(child.tag, str):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Return the first document href found in a table row, if any."""
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            # only files we want
            if self.agent.is_wanted_mime(anchor_link):
                return anchor_link
        return None

    def _handle_table(self):
        """Treat the region as a table and pull a record out of each
        row that carries a document link."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink is None:
                continue
            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del row_list
        return

    #################### TAG SEQUENCE RECOGNIZING METHODS ######################

    def _is_deliv_anch(self, tag):
        """True when *tag* is an anchor whose href leads to a document."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter out useless/messy tags; True when the tag is usable."""
        if tag.tag in self._omitted_tags:
            return False
        return True

    def _getdiff(self, reg, tol):
        """Return the typical tag-distance between document anchors in
        *reg*, or -1 when the variety exceeds tolerance *tol*."""
        d = {}
        index = 0
        # fill the dictionary with differences and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                # FIX: bare except narrowed to KeyError (missing key)
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # FIX: py3 dict views do not support indexing; materialize once
        difflist = list(d.keys())
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(difflist)
        if dlen == 1:
            return difflist[0]
        if dlen > ((2 * tol) + 1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:
            # some acceptable tolerance
            return -1
        # return the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found - no optional information."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl,
                                               self.parentetree)
        # list of lists because of XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Recognize repeated tag sequences (records) in the region.

        Main sequence method; returns a list of records, each a list of
        tags belonging to one deliverable.
        """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # interval between anchors, widened until a stable one is found
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose(
                    "Variety of intervals between anchors is too huge. "
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # take the first n tags (n = interval) as the candidate
        # tag-sequence describing all records in the region
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter = position inside the candidate sequence
        counter = 0
        # FIX: py3 filter() is lazy; len() below needs a list
        regionlist = [tag for tag in self.parentetree.iter()
                      if self._tagfilter(tag)]
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # records may start after some leading garbage
        self.begin = False
        # unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # sequence complete - store it and start the next record
            if counter > self.difference - 1:
                records.append(self._rec)
                self._rec = []
                counter = 0
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # tolerance algorithm: try to pass irregular tags by
            # comparing neighbouring positions in both sequences
            self.match = False
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == \
                        self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == \
                        self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            if not self.match:
                # probably a separator between record boxes/tables;
                # tolerate up to 10 of them
                self.separator += 1
                if self.separator > 10:
                    self.__verbose(
                        "Tag sequence doesnt match, probably out of "
                        "tolerance, getting data out of anchors only")
                    return self._get_anch_only()
        records.append(self._rec)
        # FIX: return a list (py3 filter() is lazy)
        return [rec for rec in records if self._validseq(rec)]

    def _validseq(self, rec):
        """True when the tag sequence *rec* contains a document anchor."""
        for _atr in rec:
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get the tag's own text/links (no descendants).

        :returns: [links, texts] harvested from the single tag.
        """
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # link leading to a document
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            if title:
                texts.append(self.formatter.format(title))
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of the recognized tag records and build
        publication entries from them."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and collect all possibly useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = self._get_tag_content(tag)
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for tag sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the document-link region of each page; used
        when the sequencewrapper output held no records."""
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # find the region, with tolerance 1
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error tuple returned
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links in the region absolute
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # we don't have an etree in htmlHandler, so reuse the charset
            # detected by the regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION "
                         + "*" * 40)
            self.__debug(lxml.etree.tostring(self.parentetree,
                                             pretty_print=True))
            # get the root tag if we were handed an ElementTree
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ################## PUBLIC METHODS TO GET RESULTS ###########################

    def get_deliverables_XML(self):
        """Return information about the found deliverables as XML."""
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        # FIX: StringIO.StringIO is py2-only; io.StringIO is the py3
        # equivalent (local import keeps the file-level imports intact)
        from io import StringIO
        output = StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the objects containing the found information."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
class GetDeliverableRegion:
    """Locate the DOM region of a project page that lists deliverables.

    Downloads a page, finds anchors pointing to deliverable documents and
    narrows the parsed HTML down to the smallest subtree ("region") that
    contains them, returned as an lxml ElementTree.
    """

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()
        # format text
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get data region.

        Returns an element tree with the region where deliverables are
        stored, or a derrno error value on failure.

        url: page to download and search
        base: base URL used when filtering deliverable anchors
        tolerance: tag tolerance forwarded to _get_common_parent()
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # NOTE(review): `errmsg` is undefined in this scope, so a failed
            # download raises NameError instead of returning a derrno error.
            # Probably meant a derrno constant -- confirm and fix.
            return derrno.__err__(errmsg)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        # a single anchor is its own region
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])
        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")
        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        # if parent tag is region
        if region == 0:
            # return element tree made from parent tag
            return lxml.etree.ElementTree(parent_element)
        else:
            print
            #lxml.etree.ElementTree(region).write(sys.stdout,pretty_print=True)
            return region  # else return region

    def _get_common_parent(self, elem_list, tolerance):
        """Stabile searching parent of all elements in elem_list using method
        of making element parent vectors and comparing them. Tolerance of n
        tags makes the region smaller if there are >>not deliverable<< pdfs
        in more regions on the page."""

        # supporting method - kind of bigger lambda. Get minimal length of
        # inside lists.
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # next supporting method: check the elements in list.
        # if elements are the same, its common parent tag - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
                if len(tol_list) > tol + 1:
                    return False
            # if only two anchors found then we have only two tags
            # and its pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequenced tag in list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = []  # here will be vectors stored - list of lists
        # NOTE(review): the loop variable and the last sequence are stored on
        # self (self.elem, self.last_seq), leaking transient state onto the
        # instance; plain locals would suffice.
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent()  # exception possible here
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in list
        # $vectors. Then zip the vector list and get sequences of parent tags (and the
        # other tags) sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors]
                  for i in range(1, _minlength(vectors) + 1)]
        # now check all lists in list zipped. If these are filled with the same
        # elements, its a common parent. The last list before difference contains
        # the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequented element in last vector
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string is True,
        returns texts as one string with spaces.
        elem: lxml element"""
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Get deliverable region - returns etree with region. If 0 returned
        parent_tag is region, if -1 returned some error occured searching,
        if html string returned its a region."""

        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # list[0] = type, list[1] = atribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False  # flag indicating that we are looping over region
        # get headers first
        headers = []
        #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True)
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            # find the header element whose text mentions "deliverables";
            # it marks the beginning of the region
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue;
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    # image links: use the image tail text as anchor text
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                       tag.tag == _reg_atr[0]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                         tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content: serialize direct children (or the direct
                # child containing this tag) while inside the region
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        # if we dont have headers, try to find other kind of header (title)
        # "Deliverables" and compare with other elements with the same class or id.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break
            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    # NOTE(review): unlike the header branch above, this
                    # branch mutates tag.text in place -- confirm intended.
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                       tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                         tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create ElementTree from region; fall back to the forgiving HTML
        # parser when the assembled fragment is not well-formed XML
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDeliverableRecords:
    """Extract deliverable records (document link plus describing texts)
    from a page region found by GetDeliverableRegion.

    NOTE(review): this class is redefined by a near-identical copy later in
    this module; that later definition shadows this one, so this first copy
    is dead code. Consider deleting one of the two.
    """

    def __init__(self, verbose=False, debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # list of acceptable words in title (header) of table
        self.table_sem_words = [
            'deliverable', 'description', 'name', 'date', 'dissemination',
            'no.', 'wp', 'delivery', 'particip', 'title', 'nature'
        ]
        # tags ignored by _tagfilter()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string isset,
        returns texts as one string with spaces.
        elem: lxml element
        """
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_table_order(self):
        """Get table order (table semantic).

        Reads the first <tr> of self.parentetree and returns its column
        titles when they look like a header (short texts matching
        table_sem_words); returns None when the first row already contains
        a deliverable link or does not look like a header.
        """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':
                # first <tr> match
                for col in desc:
                    # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    def _get_row_link(self, row):
        """Get link from row of table - go through columns and the only
        href leading to deliverable is returned."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link):
                # check if it is file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Handle region as a table.

        Work with region as it's a table. Try to get table semantic (table
        order) and get all records out of it. Returns a list of row lists
        when the header was not recognized, otherwise a dict keyed by the
        row's deliverable link.
        """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del (row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose(
                "Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it (IndexError swallowed when the row has
                        # more columns than the recognized header)
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Tag check. If it is anchor with href leading to deliverable,
        returns True"""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filters useless and messy tags. Return false if useless, true if
        normal tag"""
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    def _getdiff(self, reg, tol):
        """Gets difference between first two anchors.

        Counts, for each deliverable anchor, how many (filtered) tags lie
        between it and the previous anchor; returns the dominant interval,
        or -1 when the intervals vary more than `tol` allows.
        """
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            # only one distinct interval seen (Python 2: keys() is a list)
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:
            # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found. No optional information."""
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.
        Returns list of records."""
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        # NOTE(review): the tag_tol parameter is unused; the instance-wide
        # self.tagtol is used (and mutated) instead -- confirm intended.
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    # slide the window until it contains an anchor
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm. Goes through html and tries to pass
            # irregular tags in sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                        "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Helper method - check if sequence of tags rec contains deliv
        anchor"""
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get element texts only, dont look for descendants texts"""
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of tags and return list of lists (record)"""
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    ############################ OVERALL METHODS ################################

    def _manual_process_page(self, links, baseurl):
        """Get records from region according document links"""
        _err = None
        recordlist = []
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue
            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                # NOTE(review): when an earlier page produced a *list* here,
                # `recordlist[key] = ...` misbehaves on a list -- confirm
                # table and sequence pages never mix.
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")
                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records
class GetDeliverableRecords:
    """Get records and return dict or list of records with atributes.

    NOTE(review): this is a duplicate definition -- a near-identical class
    of the same name appears earlier in this module; this later definition
    is the one that takes effect. Consider deleting one of the two.
    """

    def __init__(self, verbose=False, debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # list of acceptable words in title (header) of table
        self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                                'dissemination', 'no.', 'wp', 'delivery',
                                'particip', 'title', 'nature']
        # tags ignored by _tagfilter()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string isset,
        returns texts as one string with spaces.
        # elem: lxml element
        """
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_table_order(self):
        """Get table order (table semantic).

        Reads the first <tr> of self.parentetree and returns its column
        titles when they look like a header (short texts matching
        table_sem_words); returns None when the first row already contains
        a deliverable link or does not look like a header.
        """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':
                # first <tr> match
                for col in desc:
                    # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    def _get_row_link(self, row):
        """Get link from row of table - go through columns and the only
        href leading to deliverable is returned."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link):
                # check if it is file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Handle region as a table.

        Work with region as it's a table. Try to get table semantic (table
        order) and get all records out of it. Returns a list of row lists
        when the header was not recognized, otherwise a dict keyed by the
        row's deliverable link.
        """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del(row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose("Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it (IndexError swallowed when the row has
                        # more columns than the recognized header)
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Tag check. If it is anchor with href leading to deliverable,
        returns True"""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filters useless and messy tags. Return false if useless, true if
        normal tag"""
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    def _getdiff(self, reg, tol):
        """Gets difference between first two anchors.

        Counts, for each deliverable anchor, how many (filtered) tags lie
        between it and the previous anchor; returns the dominant interval,
        or -1 when the intervals vary more than `tol` allows.
        """
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: "+str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            # only one distinct interval seen (Python 2: keys() is a list)
            return d.keys()[0]
        if dlen > ((2*tol)+1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2*tol:
            # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found. No optional information."""
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.
        returns list of records."""
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        # NOTE(review): the tag_tol parameter is unused; the instance-wide
        # self.tagtol is used (and mutated) instead -- confirm intended.
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    # slide the window until it contains an anchor
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference-1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i+1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm. Goes through html and tries to pass
            # irregular tags in sequence.
            for tol in range(self.tagtol+1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol+1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                        "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Helper method - check if sequence of tags rec contains deliv
        anchor"""
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get element texts only, dont look for descendants texts"""
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of tags and return list of lists (record)"""
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    ############################ OVERALL METHODS ################################

    def _manual_process_page(self, links, baseurl):
        """Get records from region according document links"""
        _err = None
        recordlist = []
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue
            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())
            self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table','tbody'):
                self.__verbose("Handling table")
                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                # NOTE(review): when an earlier page produced a *list* here,
                # `recordlist[key] = ...` misbehaves on a list -- confirm
                # table and sequence pages never mix.
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")
                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records