Example No. 1
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
                   ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'), \
                   ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
                 ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0,
                                           headercoef=3.0,
                                           mintextlen=20)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        ################################
        #manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
Example No. 2
 def __init__(self, headercoef=2.5):
     """
     Constructor.
     @param headercoef: lower border of elemet's visibility to be handled as header
     """
     self.generalizer = _RRSPropertyGeneralizer()
     self.ee = EntityExtractor()
     self.headercoef = headercoef
     self.bibtexparser = BibTeXParser()
     self.crawler = Crawler()
     self.mime_handler = MIMEhandler()
     self.crawler.set_handler(FileDownloader)
Example No. 3
 def __init__(self, xmlcompatibility='db09'):
     self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0,
                                           headercoef=3.0,
                                           mintextlen=30)
     self.citaextractor = CitationEntityExtractor(
         ALL, xmlcompatibility=xmlcompatibility)
     self.ee = EntityExtractor()
     self.mime = MIMEhandler()
     self.crawler = Crawler()
     self.bibtex = BibTeXParser()
     self.xmlcompatibility = xmlcompatibility
     self._xmlvalid = int(xmlcompatibility.lstrip('db'))
     self._publ_list = []
Example No. 4
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers(
            (
                (
                    "User-Agent",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19",
                ),
                ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
            )
        )
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

        self._unwanted_titles = ["Download here", "PDF format"]
        self._records = []

        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ("br", "img", "html", "body")
        # tag tolerance
        self.tagtol = 1
Example No. 7
class GetDelivRecords:
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
                   ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'), \
                   ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
                 ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0,
                                           headercoef=3.0,
                                           mintextlen=20)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        ################################
        #manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message:    " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        _err = "cannot decode verbose info."
        if self.__verbos:
            try:
                print("Verbose:    " + str(msg))
            except UnicodeError:
                print(_err)

########################Processing sequencewrapper output######################

    """function gets an entry from output of sequence wrapper
       it tries to create deliv record and retruns true if succed. """
    def _make_deliv_record(self, entry):

        text = []
        links = []

        # harvest links and text from the entry
        for e in entry.iter():
            if e.text != None:
                text.append(e.text)
            if e.attrib.get("link") != None:
                if self.agent.is_wanted_mime(e.attrib.get(
                        "link")) and e.attrib.get("link") not in links:
                    links.append(e.attrib.get("link"))

        res = self._deliv_in_text(text, links)
        if type(res) == RRSPublication:
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True

        elif type(res) == list:
            res = self._more_entry_in_record(entry)
            if (res == True):
                self.__debug("")
                return True
            else:
                return False

        res = self._deliv_in_link(text, links, entry)
        if type(res) == RRSPublication:
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True

        return False

    """look for keyword in text"""

    def _deliv_in_text(self, text, links):

        #print text
        #print links
        #print "*"*40
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)",
                             re.I)

        #loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t

            #set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        _link = False

        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # if a link was already found
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']

        #create object
        if _title:
            #print "TITLE:"+_title
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)

            if _link:
                #print "LINK:"+_link
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel

            return pub
        else:
            # this entry is probably not a deliverable
            return False
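    # Illustration (not in the original code): the title pattern above matches
    # strings such as "Deliverable", "DELIVERABLES" or numbered identifiers like
    # "D2.1" and "D42"; that is how the entry title is picked from the texts.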

    """look for a key word in link"""

    def _deliv_in_link(self, text, links, entry=False):

        ##print text
        ##print links
        #print "*"*40

        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)",
                             re.I)

        _link = False

        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']

        #loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            #set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        # if the chosen title is not valid, try to find a better one in the parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)

        #create object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ

            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)

            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel

            return pub
        else:
            # this entry is probably not a deliverable
            return False

    """Check if title contents only unwanted string with some tolerance
    return true if title is ok
    """

    def _check_title(self, title, tolerance=10):

        for t in self._unwanted_titles:
            if (s.find(s.lower(title), s.lower(t))) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True
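    # Illustration (not in the original code): with the default tolerance of 10,
    #   _check_title("Download here")                         -> False (only an unwanted phrase)
    #   _check_title("Download here: D2.1 Architecture plan") -> True  (enough extra text)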

    "looks for an element with highest visibility rank in parent elemet"

    def _repair_title(self, entry):
        parent = entry.getparent()
        visibility = 0.0
        title = ""
        for i in parent.iter():
            try:
                # the visibility attribute is stored as a string; compare numerically
                vis = float(i.attrib.get('visibility'))
                if vis > visibility:
                    visibility = vis
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass

        if title != "":
            return title
        else:
            return False

    "Function try to create array of deliverables from one entry in xml tree"

    def _more_entry_in_record(self, entry):
        for ch in entry.iter('chunk'):
            if ch.text != None and ch.attrib.get("link") != None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)

    "Process pages definied by urls"

    def process_pages(self, pages):
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)

        #creates RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            #print self._wraper.get_xml()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)

        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " +
                         "{0}".format(len(self._entriesFoundInText)) +
                         " deliv records")

            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " +
                         "{0}".format(len(self._entriesFoundInLinks)) +
                         " deliv records")

            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    "This method is called when ther was no records found in output of sequencewrapper"

    def _manual_processing(self):
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " +
                         "{0}".format(len(self._entriesFoundInText)) +
                         " deliv records")

            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " +
                         "{0}".format(len(self._entriesFoundInLinks)) +
                         " deliv records")

            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################
    """ Get texts from element and his descendants.
    If string isset, returns texts as one string with spaces.
    # elem: lxml element """

    def _get_descendats_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get link from row of table - go through columns and the only href
    leading to deliverable is returned. """

    def _get_row_link(self, row):
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(
                    anchor_link):  # check if it is file we want
                return anchor_link
        return None

    """ Handle region as a table.
    Work with region as it's a table. Try to get table semantic (table order)
    and get all records out of it. """

    def _handle_table(self):
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink == None:
                continue

            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)

            res = self._deliv_in_text(row_list, [_thislink])
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del (row_list)
        return

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    """ Tag check.
    If it is anchor with href leading to deliverable, returns True """
    def _is_deliv_anch(self, tag):
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    """ Filters useless and messy tags.
    Return false if useless, true if normal tag """

    def _tagfilter(self, tag):
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    """ Gets difference between first two anchors. """

    def _getdiff(self, reg, tol):
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # check the differences; accept only if their variety isn't higher than the $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1
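    # Illustration (not in the original code): for a filtered tag stream
    # div, p, a, p, a, p, a in which every <a> is a deliverable anchor, the
    # collected intervals are {2: 3}, so _getdiff() returns 2, the number of
    # tags that make up one record.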

    """ Only anchors found. No optional information. """

    def _get_anch_only(self):
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    """ Main method handling tag sequences and recognizing records.
    Returns list of records. """

    def _get_tag_sequences(self, tag_tol=1):
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating the beginning of records; the region may start with some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag

            # tolerance algorithm. Goes through html and tries to pass irregular tags in sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    """ Helper method - check if sequence of tags rec contains deliv anchor
    """

    def _validseq(self, rec):
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    """ Get element texts only, dont look for descendants texts """

    def _get_tag_content(self, tag):
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                texts.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]
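    # Illustration (not in the original code), assuming .pdf is a wanted MIME type:
    # for <a href="d2_1_report.pdf" title="D2.1 Report">download</a> this returns
    # the formatted href in the first list and the formatted title plus the anchor
    # text in the second; for a plain text tag it returns [[], [formatted tag.text]].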

    """ Harvest texts out of tags and return list of lists (record) """

    def _harvest_text(self, record_tag_list):
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = (self._get_tag_content(tag))
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            #self._records.append(self._rec)
            res = self._deliv_in_text(_texts, _links)
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    """ Text harvesting for sequences. """

    def _handle_sequence(self):
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    """ Get records from region according document links
        this method is used when there was no records found
        in output of sequencewrapper"""

    def _manual_process_page(self, links, baseurl):
        _err = None
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                self.__debug(_err)
                continue
            #make all links absolute in parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')

            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())

            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass

            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

#############PUBLIC METHODS TO GET RESULTS

    def get_deliverables_XML(self):
        """return infromations about deliverables stored in objects as xml"""
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the objects containing the extracted information."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records


# ------------------------------------------------------------------------------
# end of class GetDelivRecords
# ------------------------------------------------------------------------------

if __name__ == '__main__':
    from rrslib.web.crawler import Crawler
    sp = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

    c = Crawler()
    #urls = ['http://www.ualberta.ca/~bhan/publ.htm']
    #urls = ['http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1']
    urls = [
        'http://kaminari.scitec.kobe-u.ac.jp/pub_en.html',
        'http://www.cis.uab.edu/sprague/',
        'http://www2.lifl.fr/~carle/old/mabib.htm',
        'http://www.poli.usp.br/p/fabio.cozman/',
        'http://www.cs.washington.edu/homes/weld/pubs.html',
        'http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1'
    ]
    #urls = ['https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=2&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=3&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=4&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=5&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=6&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=7&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=8&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=9&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=10&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=11&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=12&lang=1']

    #urls = ['http://www.cs.princeton.edu/~schapire/publist.html']
    #urls = ['http://bionum.cs.purdue.edu/p.html']
    #urls = ['http://www.awissenet.eu/publications.aspx']
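The __main__ block above builds a URL list and a Crawler but never exercises
GetDelivRecords itself. A minimal usage sketch, based only on the public methods
shown in this example (process_pages, get_deliverables, get_deliverables_XML);
the deliverable URL below is a placeholder:

# hypothetical driver, not part of the original module
deliv_urls = ['http://www.example.org/project/deliverables.html']

extractor = GetDelivRecords(verbose=True)
extractor.process_pages(deliv_urls)        # crawl, wrap and scan the pages

records = extractor.get_deliverables()     # list of RRSPublication objects,
                                           # or an error code if nothing was found
if isinstance(records, list):
    print("Found %d deliverable records" % len(records))
    print(extractor.get_deliverables_XML())  # the same data serialized to XML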
Example No. 11
class PublicationListExtractor(object):
    """
    PublPageMetaExtractor handles harvests metadata from web pages containing
    references (publication list). For parsing sequences in HTML dom we use
    extractors.sequencewrapper.HTMLSequenceWrapper. For parsing citations
    (records in data regions, which were found by sequencewrapper) we use
    extractors.citationentityextractor.CitationEntityExtractor.

    To improve accuracy of this system, we check headers wheather they contain some
    keyword, which could help us to determine the correct type of publication.

    From headers we also harvest keywords.
    """

    entitydefstr = {
        '216': 'O',
        '217': 'U',
        '214': 'O',
        '197': 'A',
        '198': 'E',
        '210': 'O',
        '211': 'O',
        '195': 'A',
        '194': 'A',
        '196': 'A',
        '193': 'A',
        '192': 'A',
        '251': 'u',
        '252': 'u',
        '238': 'i',
        '239': 'i',
        '235': 'e',
        '234': 'e',
        '212': 'O',
        '236': 'e',
        '237': 'i',
        '230': 'e',
        '231': 'c',
        '232': 'e',
        '213': 'O',
        '224': 'a',
        '249': 'u',
        '253': 'y',
        '248': 'o',
        '243': 'o',
        '255': 'y',
        '250': 'u',
        '233': 'e',
        '201': 'E',
        '200': 'E',
        '203': 'E',
        '202': 'E',
        '205': 'I',
        '204': 'I',
        '207': 'I',
        '206': 'I',
        '242': 'o',
        '220': 'U',
        '245': 'o',
        '244': 'o',
        '246': 'o',
        '241': 'n',
        '218': 'U',
        '229': 'a',
        '228': 'a',
        '227': 'a',
        '226': 'a',
        '225': 'a',
        '219': 'U',
        '221': 'Y',
        # these are added
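        # NOTE: a few numeric codes below ('248', '232', '200') repeat keys that
        # are already defined above with Latin-1 values; here they presumably
        # carry Latin-2/Windows-1250 (Czech) values. In a dict literal the later
        # entry wins, so these Czech mappings take effect.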
        '248': 'r',
        '185': 's',
        '174': 'Z',
        '232': 'c',
        '200': 'C',
        '169': 'S',
        '190': 'z',
        '199': 'C',
        'amp': '&',
        'nbsp': ' ',
        'quot': '\"'
    }

    def __init__(self, xmlcompatibility='db09'):
        self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0,
                                              headercoef=3.0,
                                              mintextlen=30)
        self.citaextractor = CitationEntityExtractor(
            ALL, xmlcompatibility=xmlcompatibility)
        self.ee = EntityExtractor()
        self.mime = MIMEhandler()
        self.crawler = Crawler()
        self.bibtex = BibTeXParser()
        self.xmlcompatibility = xmlcompatibility
        self._xmlvalid = int(xmlcompatibility.lstrip('db'))
        self._publ_list = []

    def _set_new_topic(self, publ, kw):
        """
        This method adds new topic to publication.
        """
        if not re.search("[a-z]{4,}", kw):
            return publ
        if re.search("publi|paper", kw, re.I):
            return publ
        t = RRSTopic(title=kw)
        publ.set('topic', t)
        return publ

    def _set_publ_type(self, header, publ):
        def _floor(i):
            if i > 100: i = 100
            return i

        if header is None: return publ
        # try to set publication type from header
        for _type in RRSPublication.publication_types:
            if re.search(_type, header, re.I):
                if publ.get('type') == _type:
                    publ.set('credibility', _floor(publ.get('credibility')))
                else:
                    publ.set('type', _type)
                return publ
        if re.search("dissertation", header, re.I):
            publ.set('type', 'phdthesis')
            return publ
        if re.search('technical report', header, re.I):
            publ.set('type', 'techreport')
            return publ
        # make keyword from header
        return self._set_new_topic(publ, header)

    def translate_html_entities(self, text):
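        # Replace HTML character references (e.g. "&#233;" or "&amp;") with the
        # ASCII equivalents from entitydefstr; references missing from the map
        # are left untouched (the lookup error is swallowed below).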
        ents = re.findall(r'&(#?)(x?)(\w+);', text)
        for ent in set(ents):
            try:
                text = re.sub('&(#?)' + re.escape(ent[2]) + ";",
                              self.entitydefstr[ent[2]], text)
            except:
                pass
        return text

    def compare_chunks_to_extracted(self, chunks, publ):
        if not publ.get('title'): return publ
        title = self.translate_html_entities(publ.get('title'))
        authors = publ.get('person_author')
        author_names = [a.get('name')[0].get('full_name') for a in authors]
        for ch in chunks:
            l = ch.get_link()
            # get chunk text
            ch = self.translate_html_entities(ch.get_text())
            # add url if available
            if l is not None and not l.startswith("javascript") and l != "#":
                u = RRSUrl(type='publication', title=ch, link=l)
                publ.set('url', u)

            # repair title if needed
            if ch in title or ch == title:
                if float(len(ch)) / float(len(title)) > 0.4:
                    publ.set('title', ch)
            # repair names if needed
            for a in author_names:
                if a in ch:
                    authors_extracted = self.ee.find_authors(ch)
                    publ.person_author = authors_extracted[0]
                    # stop after the first chunk matching an author name
                    break
        return publ

    def _fill_citation(self, publ):
        c = RRSCitation()
        c.set('content', self.cita_text)
        if publ.get('event'):
            c.set('event', publ.get('event')[0].get('title'))
        return c

    def _handle_bibtex_pages(self):
        urls = {}
        for i, p in enumerate(self._publ_list):
            pub_u = p.get('url')
            for u in pub_u:
                urls[u.get('link')] = i

        # if the link is a web page rather than a PDF
        urls_to_download = []
        content_types = self.mime.start(urls.keys())
        for k in urls.keys():
            if content_types[k] in ('text/html', 'application/xhtml+xml',
                                    'application/x-httpd-php',
                                    'text/javascript'):
                urls_to_download.append(k)
        # download the pages and try them for BibTeX
        pages = self.crawler.start(urls_to_download)

        for u in urls_to_download:
            bibtex = self.bibtex.parse(pages[u])
            # if bibtex on page, set publication
            if bibtex is not None:
                self._publ_list[urls[u]] = bibtex

    def _empty(self):
        for x in range(len(self._publ_list)):
            self._publ_list.pop()
        self.cita_text = None

    def _handle_document(self, doc):
        self._empty()
        # for all regions which were found
        for reg in doc.get_regions():
            # get their header
            header = reg.get_name()
            # for all records in region
            for rec in reg._manual_process_page():
                # create empty citation object
                c = RRSCitation()
                # harvest the record text (which we hope is a citation)
                self.cita_text = self.translate_html_entities(rec.get_text())
                # set the content of record to citation object
                c.set('content', self.cita_text)
                # fill the object with the extracted data
                c = self.citaextractor.extract(c)

                # get extracted publication
                publ = c.get('publication_cited')
                # if the sequence wrapper extracted some text chunks, that helps
                # a lot, because we can compare the extracted data to the chunks
                # and fix it when they do not match
                publ = self.compare_chunks_to_extracted(rec.get_chunks(), publ)
                # insert citation into publication
                # !!! we are extracting publications, not citations, because we
                # don't want a tree like citation->publication but rather
                # publication->citation
                publ.set('citation', self._fill_citation(publ))
                # try to find publication type in header of data region
                publ = self._set_publ_type(header, publ)
                # add to publication list
                self._publ_list.append(publ)
        #self._handle_bibtex_pages()
        return self._publ_list

    #---------------------------------------------------------------------------
    # public methods
    #---------------------------------------------------------------------------
    def extract_data(self, tree, url):
        """
        Main method for extracting publication metadata from page.
        """
        # wrap html document
        document = self.seqwrapper.wrap_h(tree, url)
        # handle it and return the result
        return self._handle_document(document)
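

# A minimal usage sketch (not part of the original example), assuming the page
# is fetched and parsed with the standard lxml.html.parse() and passed to
# extract_data() in the (tree, url) form that wrap_h() expects; the URL is one
# of the test pages listed earlier and serves only as a placeholder.
if __name__ == '__main__':
    import lxml.html

    extractor = PublicationListExtractor(xmlcompatibility='db09')
    url = 'http://www.cs.washington.edu/homes/weld/pubs.html'
    tree = lxml.html.parse(url)
    for publication in extractor.extract_data(tree, url):
        print publication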
Exemplo n.º 12
0
class PublicationPageExtractor(object):
    """
    This class wraps all methods for recognizing a publication and its
    description on a web page.

    The result of page-processing is ORM object rrslib.db.model.RRSPublication
    with extracted data.
    """
    # what could an abstract start with
    _abstract_startswith = ('We present', 'This paper', 'This publica',
                            'In this p', 'The paper', 'This book')
    _abstract_blacklist = ('CiteU', 'Was ', 'I ha', 'I pre', 'I was', 'CiteS',
                           'Microso')
    # tags representing lists in HTML
    _list_tags = ('ol', 'ul', 'dl')
    # tags representing list items in HTML
    _list_item_tags = ('li', 'dd', 'dt')
    # omitted tags - they are useless for this purpose
    _omitted_tags = ('form', 'option', 'link', 'style', 'meta', 'head',
                     'script')
    # acceptable mime types for documents (publications)
    _accepted_mime = ('application/pdf', 'application/rtf',
                      'application/postscript', 'application/msword')

    def __init__(self, headercoef=2.5):
        """
        Constructor.
        @param headercoef: lower bound of an element's visibility for it to be
                           handled as a header
        """
        self.generalizer = _RRSPropertyGeneralizer()
        self.ee = EntityExtractor()
        self.headercoef = headercoef
        self.bibtexparser = BibTeXParser()
        self.crawler = Crawler()
        self.mime_handler = MIMEhandler()
        self.crawler.set_handler(FileDownloader)

    def _get_visibility2elem_map(self, etree):
        htmlroot = etree.getroot()
        visibility2elem_map = {}
        for elem in htmlroot.iterdescendants():
            # no need to get those
            if elem.tag in self._omitted_tags: continue
            assert hasattr(elem, 'style')
            v = elem.style.get_visibility()
            if v in visibility2elem_map:
                visibility2elem_map[v].append(elem)
            else:
                visibility2elem_map[v] = [elem]
        return visibility2elem_map

    def _classify_publ_title(self, title, init=70):
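        # Heuristic credibility score for a candidate title: start from `init`,
        # subtract up to 65 points in proportion to the fraction of chunks that
        # look meaningless or blacklisted, add a 20-point bonus when every chunk
        # looks fine, subtract 15 for a standalone bracket, and clamp to 0..100.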
        def _bracket_alone(text):
            if text is not None and re.search(" [\(\)\{\}\[\]] ", text):
                return True
            return False

        # default credibility
        cred = init
        # remove prepositions and words with len(w) < 3
        title = re.sub("(?<= )(?:[fF]or|[iI]n|[oO]f|[oO]n|[aA]t|[Tt]he|"\
                       "[Aa]nd|[aA]|[Ii]s|[Ww]e|[Tt]o)(?= )", "", title)
        title = re.sub("^(?:A|The|In|Is|To) ", "", title)
        title = re.sub("[ ]+", " ", title)
        # split into chunks
        title_sp = title.split(" ")
        _bad = 0
        # if there are many chunks (almost-words) with no meaning, reduce
        # credibility value
        _blacklisted = '(?<![a-z])(?:department|faculty|university|society|lab|press)(?![a-z]+)'
        for chunk in title_sp:
            if re.search(_blacklisted, chunk, re.I):
                _bad += 1
            elif re.search("[a-z]{3,}", chunk, re.I):
                pass
            else:
                _bad += 1
        # guess accuracy
        negative = float(_bad) / float(len(title_sp))
        cred = float(cred) - negative * 65
        # bonus if all chunks are OK
        if _bad == 0:
            cred += 20
        # if there is a standalone bracket in the title, reduce credibility
        if _bracket_alone(title):
            cred -= 15
        # floor
        if cred > 100: cred = 100
        if cred < 0: cred = 0
        return cred

    def _most_alike_term(self, term, target, threshold=0.3):
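        # Return the string from `target` most similar to `term` according to
        # difflib.SequenceMatcher.ratio(); if even the best ratio is below
        # `threshold`, return None.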
        assert len(target) > 0
        assert term is not None
        l = ListedDict()
        for t in target:
            s = SequenceMatcher(None, term, t)
            l[float(s.ratio())] = t
        m = max(l)
        if m < float(threshold):
            return None
        return l[m][0]

    def _add_property(self, property, values):
        # Add the property and its value into publication
        # maybe it is not a property, but an entity
        firstval = values[0]
        if firstval is None: return
        # First try to get attributes of the publication
        if property in ('abstract', 'isbn', 'issn', 'volume', 'number',
                        'acronym', 'issn', 'note'):
            _type = self._publ.__types__[property]
            if _type is basestring:
                _type = unicode
            self._publ[property] = (_type)(firstval)
        elif property == 'title':
            self._publ[property] = firstval
            # the value comes from a meta tag, so we are fairly sure about this information
            self._publ['credibility'] = 95
        elif property == 'publisher':
            self._publ['publisher'] = RRSOrganization(title=firstval)
        elif property == 'date':
            r = self.ee.find_published_date(firstval)
            if not r[0]: return
            rrsdate = r[0][0]
            for attr in ('year', 'month'):
                if rrsdate.get(attr) is not None:
                    self._publ[attr] = rrsdate.get(attr)
        elif property == 'type':
            # choose better heuristics
            publtype = self._most_alike_term(firstval, publication_types, 0.4)
            if publtype is not None:
                self._publ['type'] = RRSPublication_type(type=publtype)
        elif property == 'pages':
            if re.search("[0-9]+\-{1,2}[0-9]+", firstval):
                self._publ['pages'] = firstval
        elif property == 'start page':
            if not re.search("^[0-9]+$", firstval): return
            if 'end page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (firstval,
                                                 self._storage['end page'][0])
            else:
                self._storage[property] = [firstval]
        elif property == 'end page':
            if not re.search("^[0-9]+$", firstval): return
            if 'start page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (
                    self._storage['start page'][0], firstval)
            else:
                self._storage[property] = [firstval]
        # --------------------------------------------------
        # Now other entities connected with the publication
        # --------------------------------------------------
        elif property == 'topic':
            for topictitle in values:
                rel = RRSRelationshipPublicationTopic()
                rel.set_entity(RRSTopic(title=topictitle))
                self._publ['topic'] = rel
        elif property == 'url':
            for link in values:
                try:
                    rel = RRSRelationshipPublicationUrl()
                    u = RRSUrl(link=link)
                except (RRSDatabaseAttributeError, RRSDatabaseEntityError,
                        RRSDatabaseValueError):
                    return
                u['type'] = RRSUrl_type(type='publication')
                rel.set_entity(u)
                self._publ['url'] = rel
        elif property == 'keywords':
            for kw in values:
                rel = RRSRelationshipPublicationKeyword()
                rel.set_entity(RRSKeyword(title=kw))
                self._publ['keyword'] = rel
        elif property in ('author', 'editor'):
            for person in values:
                rel = RRSRelationshipPersonPublication()
                rel['editor'] = property == 'editor'
                r = self.ee.find_authors(person)
                if not r[0]: return
                rel.set_entity(r[0][0])
                rel['author_rank'] = len(self._publ['person']) + 1
                self._publ['person'] = rel
        elif property == 'reviews':
            # TODO if real reviews would be implemented in DB schema,
            # change this: add RRSReview objects into publication
            self._publ.set("review", values, strict=False)

    def _parse_meta(self, document):
        doc_meta = document.get_meta_map()
        # transform into generalized form
        for key in doc_meta:
            property = self.generalizer.generalize(key)
            if property is None: continue
            if property in self._storage:
                for val in doc_meta[key]:
                    if val not in self._storage[property]:
                        self._storage[property].append(val)
            else:
                self._storage[property] = doc_meta[key]
        # make authors and editors disjoint sets
        if 'author' in self._storage and 'editor' in self._storage:
            to_del = []
            for a in self._storage['author']:
                if a in self._storage['editor']:
                    to_del.append(a)
            for a in to_del:
                self._storage['author'].remove(a)
        # and now just set the values into real RRS objects
        for property in self._storage:
            self._add_property(property, self._storage[property])
        self._storage = {}

    def _find_local_sequence(self, header_elem, h_func):
        # lightweight version of sequencewrapper (rrslib.web.sequencewrapper)
        # targeted at repeated sequences of tags in one level of the DOM - there's no
        # analysis of data regions (we already found one), just looking for data
        # records in a very straightforward (and very stupid) way.
        # @param header_elem - lxml.html.HtmlElement representing header element
        # @param h_func - heuristic function returning true/false, has to accept
        #                 one parameter - element (instance of lxml.html.HtmlElement)
        # @return tuple (records, likelihood of data)
        tags = ListedDict()
        for elem in header_elem.iterchildren():
            if elem.tag in self._omitted_tags: continue
            tags[elem.tag] = elem
        (tag, elements) = tags.item_by_longest_value()
        if len(elements[0]) < 2:
            return (None, 0.0)
        res = []
        grouped_elements = []
        for e_group in elements:
            res.extend(filter(h_func, e_group))
            grouped_elements.extend(e_group)
        return (res, float(len(res)) / float(len(grouped_elements)))

    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iter over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
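                # score each sibling's text by length, multiplied by 5 for every
                # typical abstract opener it starts with; the best-scoring text
                # is used only if no long <p> paragraphs were collected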
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score *= 5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [
                    SimpleHTMLCleaner.clean(" ".join(paragraphs))
                ]
            else:
                self._storage[hdrtext] = [txts[max(txts.keys())]]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return  # TODO
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings():
                        pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iterate over siblings of the header and try to get keywords from their children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems,
                 likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [
                kw.text_content() for kw in likelihood_to_keyword_tags[max(
                    likelihood_to_keyword_tags.keys())][0]
            ]

        # references
        elif hdrtext == 'references':
            pass  # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass  # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100
                                   ) or r.tag == 'blockquote'
            probability = ListedDict()
            # iterate over siblings of the header and try to get reviews from their children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts

    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
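        # Collect elements whose visibility score is at least self.headercoef;
        # if fewer than two qualify, relax the coefficient by 0.5 and try again
        # until enough header candidates are found.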
        while 42:  #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else:
                break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean(elem.text_content())
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization recognized the header -> data are below the header
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3:  # probably more than three words
                    # if the domain name appears in the title, it is probably
                    # the general name of the website
                    if len(self.domain) > 6 and re.search(
                            re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove standalone brackets
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(
                            re.escape(txt), document.name, re.I):
                        possible_titles[int(
                            self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(
                            self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for property in self._storage:
            self._add_property(property, self._storage[property])

    def _get_longest_string(self, l):
        mx = None
        maxlen = 0
        for t in l:
            if len(t) > maxlen:
                maxlen = len(t)
                mx = t
        return mx

    def _find_abstract(self, etree):
        c = Cleaner(scripts=True,
                    javascript=True,
                    comments=True,
                    style=True,
                    meta=True,
                    page_structure=False,
                    processing_instructions=True,
                    embedded=True,
                    frames=False,
                    forms=True,
                    annoying_tags=True,
                    add_nofollow=False,
                    remove_unknown_tags=False)
        etree_copy = deepcopy(etree)
        etree_copy = c.clean_html(etree_copy)
        html = tostring(etree_copy.getroot())
        # XXX deleting all <p> tags here is probably useful...
        html = re.sub("</?p[^>]*>", " ", html)
        possible = []
        txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
        for txt in txts:
            txt = SimpleHTMLCleaner.clean(txt)
            if len(txt) > 200:
                do_not_append = False
                for bl in self._abstract_blacklist:
                    if txt.startswith(bl):
                        do_not_append = True
                        break
                if not do_not_append:
                    possible.append(txt)
                    continue
            for st in self._abstract_startswith:
                if txt.startswith(st):
                    possible.append(txt)
                    break
        return self._get_longest_string(possible)

    def _find_unbound_entities(self, page):
        root = page.get_etree().getroot()
        # get abstract
        if not 'abstract' in self._publ:
            abst = self._find_abstract(page.get_etree())
            if abst is not None:
                self._publ['abstract'] = abst

        # find url of publication (pdf, ps, doc...)
        if 'url' not in self._publ:
            to_be_checked = []
            for (element, attribute, link, pos) in root.iterlinks():
                if re.search("(pdf|doc|odt|ps)$", link, re.I):
                    to_be_checked.append(link)
                # TODO try to get links with no suffix (queries etc.)
                # ----------------------------------------------------
                # ADD THE CODE HERE
                # ----------------------------------------------------
            if to_be_checked:
                documents = []
                mimes = self.mime_handler.start(to_be_checked)
                for link in mimes:
                    if mimes[link] in self._accepted_mime:
                        documents.append(link)
                dl = len(documents)
                doc_link = None
                if dl == 1:  # exactly one link
                    doc_link = documents[0]
                elif dl != 0:  # more than one
                    # try to guess out of the name of the publication
                    if 'title' in self._publ:
                        doc_link = self._most_alike_term(
                            self._publ['title'], documents, 0.5)
                if doc_link is not None:
                    try:
                        rel = RRSRelationshipPublicationUrl()
                        u = RRSUrl(link=doc_link)
                    except (RRSDatabaseAttributeError, RRSDatabaseEntityError,
                            RRSDatabaseValueError):
                        return
                    u['type'] = RRSUrl_type(type='publication')
                    rel.set_entity(u)
                    self._publ['url'] = rel

        # Now extract unbound entities from the plaintext.
        # Every time there is a high probability that the relationship will be
        # misrecognized.

        # get keywords if there are no such
        if not 'keyword' in self._publ:
            # Try to get keywords from the text. They are probably in the format:
            # Keywords: algorithm, algorithmic process, random, some other keyword
            kwre = re.search(
                "keywords?:?[\t\n\r ]+(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}",
                self.pagetext, re.I)
            if kwre is not None:
                kwstr = kwre.group(0)
                kwstr = re.sub("[kK]eywords?:?[\t\n\r ]+", "", kwstr)
                keywords = [x.strip() for x in kwstr.split(",")]
                for kw in keywords:
                    rel = RRSRelationshipPublicationKeyword()
                    rel.set_entity(RRSKeyword(title=kw))
                    self._publ['keyword'] = rel

    def _parse_bibtex(self, page):
        # Parse all possible BibTeX on the page.
        # First parse the plaintext of the page and try to get a BibTeX string
        # out of it. Then try to find possible links to BibTeX files, download
        # them and parse them.
        bibtexpubl = self.bibtexparser.parse(self.pagetext)
        if bibtexpubl:
            pass
        # try to get .bib or .bibtex files from the page
        else:
            html_tag = page.get_etree().getroot()
            bibtex_links = set()
            for l in html_tag.iterlinks():
                if re.search("\.bib(tex)?$", l[2], re.I):
                    bibtex_links.add(l[2])
            if len(bibtex_links) == 1:
                r = self.crawler.start(bibtex_links)
                for link in bibtex_links:
                    bibtex_file = r[link]
                    if isinstance(bibtex_file, basestring):
                        bibtexpubl = self.bibtexparser.parse(bibtex_file)
                    else:
                        return
            else:
                # TODO handle more than one bibtex file???
                return
        # process found bibtex
        publ = bibtexpubl[0]
        for attr in publ:
            value = publ[attr]
            # not set, useless
            if value is None:
                continue
            # list of relationship attrs
            elif isinstance(value, list):
                for v in value:
                    self._publ[attr] = v
            # own attribute
            else:
                self._publ[attr] = value

    def extract_data(self, etree, url):
        """
        Extract all possible data about the publication from the web page.
        @param etree - parsed DOM tree of the web page (has to be instance of
                       lxml.etree._ElementTree)
        @param url - url of the web page
        @return RRSPublication object containing extracted data
        """
        assert isinstance(url, basestring)
        assert isinstance(etree, _ElementTree)
        #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
        #            meta=False, page_structure=False, processing_instructions=True,
        #            embedded=True, frames=False, forms=True, annoying_tags=False,
        #            add_nofollow=False, remove_unknown_tags=False)
        #etree = c.clean_html(etree)
        self.url = url
        # strip the scheme and a leading "www." to get the site's base name
        self.domain = re.sub("https?://(www\.)?", "", self.url).split(".")[0]
        self._storage = {}
        self._publ = RRSPublication()
        cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
        page = HTMLDocument(cleaned_etree, url)
        self.pagetext = page.get_etree().getroot().text_content()
        # parse CSS and metadata on the page
        page.parse_document()
        # get data from <meta> tags and convert to RRS format
        self._parse_meta(page)
        # get data on the basis of text visibility and recognized headers
        self._parse_visibility(page)
        # and now guess :)
        self._find_unbound_entities(page)
        # and parse BibTeX
        self._parse_bibtex(page)
        return self._publ
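

# A minimal usage sketch (not part of the original example), mirroring the
# assertions in extract_data() above: it needs an lxml.etree._ElementTree plus
# the page URL. The URL is only a placeholder taken from the list earlier in
# this document.
if __name__ == '__main__':
    import lxml.html

    extractor = PublicationPageExtractor(headercoef=2.5)
    url = 'http://www.cis.uab.edu/sprague/'
    tree = lxml.html.parse(url)
    publication = extractor.extract_data(tree, url)
    print publication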
class PublicationPageExtractor(object):
    """
    This class wraps all methods for recognizing a publication and its
    description on a web page.

    The result of page-processing is ORM object rrslib.db.model.RRSPublication
    with extracted data.
    """
    # what could an abstract start with
    _abstract_startswith = ('We present', 'This paper', 'This publica',
                            'In this p', 'The paper', 'This book')
    _abstract_blacklist = ('CiteU', 'Was ', 'I ha', 'I pre', 'I was', 'CiteS',
                           'Microso')
    # tags representing lists in HTML
    _list_tags = ('ol', 'ul', 'dl')
    # tags representing list items in HTML
    _list_item_tags = ('li', 'dd', 'dt')
    # ommitted tags - they are useless for this reason
    _omitted_tags = ('form', 'option', 'link', 'style', 'meta', 'head', 'script')
    # acceptable mime types for documents (publications)
    _accepted_mime = ('application/pdf', 'application/rtf', 'application/postscript', 'application/msword')

    def __init__(self, headercoef=2.5):
        """
        Constructor.
        @param headercoef: lower bound of an element's visibility for it to be
                           handled as a header
        """
        self.generalizer = _RRSPropertyGeneralizer()
        self.ee = EntityExtractor()
        self.headercoef = headercoef
        self.bibtexparser = BibTeXParser()
        self.crawler = Crawler()
        self.mime_handler = MIMEhandler()
        self.crawler.set_handler(FileDownloader)


    def _get_visibility2elem_map(self, etree):
        htmlroot = etree.getroot()
        visibility2elem_map = {}
        for elem in htmlroot.iterdescendants():
            # no need to get those
            if elem.tag in self._omitted_tags: continue
            assert hasattr(elem, 'style')
            v = elem.style.get_visibility()
            if v in visibility2elem_map:
                visibility2elem_map[v].append(elem)
            else:
                visibility2elem_map[v] = [elem]
        return visibility2elem_map


    def _classify_publ_title(self, title, init=70):
        def _bracket_alone(text):
            if text is not None and re.search(" [\(\)\{\}\[\]] ", text):
                return True
            return False
        # default credibility
        cred = init
        # remove prepositions and words with len(w) < 3
        title = re.sub("(?<= )(?:[fF]or|[iI]n|[oO]f|[oO]n|[aA]t|[Tt]he|"\
                       "[Aa]nd|[aA]|[Ii]s|[Ww]e|[Tt]o)(?= )", "", title)
        title = re.sub("^(?:A|The|In|Is|To) ", "", title)
        title = re.sub("[ ]+", " ", title)
        # split into chunks
        title_sp = title.split(" ")
        _bad = 0
        # if there are many chunks (almost-words) with no meaning, reduce
        # credibility value
        _blacklisted = '(?<![a-z])(?:department|faculty|university|society|lab|press)(?![a-z]+)'
        for chunk in title_sp:
            if re.search(_blacklisted, chunk, re.I):
                _bad += 1
            elif re.search("[a-z]{3,}", chunk, re.I):
                pass
            else:
                _bad += 1
        # guess accuracy
        negative = float(_bad) / float(len(title_sp))
        cred = float(cred) - negative * 65
        # bonus if all chunks are OK
        if _bad == 0:
            cred += 20
        # if there is a standalone bracket in the title, reduce credibility
        if _bracket_alone(title):
            cred -= 15
        # floor
        if cred > 100: cred = 100
        if cred < 0: cred = 0
        return cred


    def _most_alike_term(self, term, target, threshold=0.3):
        assert len(target) > 0
        assert term is not None
        l = ListedDict()
        for t in target:
            s = SequenceMatcher(None, term, t)
            l[float(s.ratio())] = t
        m = max(l)
        if m < float(threshold):
            return None
        return l[m][0]

    def _add_property(self, property, values):
        # Add the property and its value into publication
        # maybe it is not a property, but an entity
        firstval = values[0]
        if firstval is None: return
        # First try to get attributes of the publication
        if property in ('abstract', 'isbn', 'issn', 'volume',
                        'number', 'acronym', 'issn', 'note'):
            _type = self._publ.__types__[property]
            if _type is basestring:
                _type = unicode
            self._publ[property] = (_type)(firstval)
        elif property == 'title':
            self._publ[property] = firstval
            # the value comes from a meta tag, so we are fairly sure about this information
            self._publ['credibility'] = 95
        elif property == 'publisher':
            self._publ['publisher'] = RRSOrganization(title=firstval)
        elif property == 'date':
            r = self.ee.find_published_date(firstval)
            if not r[0]: return
            rrsdate = r[0][0]
            for attr in ('year', 'month'):
                if rrsdate.get(attr) is not None:
                    self._publ[attr] = rrsdate.get(attr)
        elif property == 'type':
            # choose better heuristics
            publtype = self._most_alike_term(firstval, publication_types, 0.4)
            if publtype is not None:
                self._publ['type'] = RRSPublication_type(type=publtype)
        elif property == 'pages':
            if re.search("[0-9]+\-{1,2}[0-9]+", firstval):
                self._publ['pages'] = firstval
        elif property == 'start page':
            if not re.search("^[0-9]+$", firstval): return
            if 'end page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (firstval, self._storage['end page'][0])
            else:
                self._storage[property] = [firstval]
        elif property == 'end page':
            if not re.search("^[0-9]+$", firstval): return
            if 'start page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (self._storage['start page'][0], firstval)
            else:
                self._storage[property] = [firstval]
        # --------------------------------------------------
        # Now other entities connected with the publication
        # --------------------------------------------------
        elif property == 'topic':
            for topictitle in values:
                rel = RRSRelationshipPublicationTopic()
                rel.set_entity(RRSTopic(title=topictitle))
                self._publ['topic'] = rel
        elif property == 'url':
            for link in values:
                try:
                    rel = RRSRelationshipPublicationUrl()
                    u = RRSUrl(link=link)
                except (RRSDatabaseAttributeError, RRSDatabaseEntityError, RRSDatabaseValueError):
                    return
                u['type'] = RRSUrl_type(type='publication')
                rel.set_entity(u)
                self._publ['url'] = rel
        elif property == 'keywords':
            for kw in values:
                rel = RRSRelationshipPublicationKeyword()
                rel.set_entity(RRSKeyword(title=kw))
                self._publ['keyword'] = rel
        elif property in ('author', 'editor'):
            for person in values:
                rel = RRSRelationshipPersonPublication()
                rel['editor'] = property == 'editor'
                r = self.ee.find_authors(person)
                if not r[0]: return
                rel.set_entity(r[0][0])
                rel['author_rank'] = len(self._publ['person']) + 1
                self._publ['person'] = rel
        elif property == 'reviews':
            # TODO if real reviews would be implemented in DB schema,
            # change this: add RRSReview objects into publication
            self._publ.set("review", values, strict=False)


    def _parse_meta(self, document):
        doc_meta = document.get_meta_map()
        # transform into generalized form
        for key in doc_meta:
            property = self.generalizer.generalize(key)
            if property is None: continue
            if property in self._storage:
                for val in doc_meta[key]:
                    if val not in self._storage[property]:
                        self._storage[property].append(val)
            else:
                self._storage[property] = doc_meta[key]
        # make authors and editors disjoint sets
        if 'author' in self._storage and 'editor' in self._storage:
            to_del = []
            for a in self._storage['author']:
                if a in self._storage['editor']:
                    to_del.append(a)
            for a in to_del:
                self._storage['author'].remove(a)
        # and now just set the values into real RRS objects
        for property in self._storage:
            self._add_property(property, self._storage[property])
        self._storage= {}


    def _find_local_sequence(self, header_elem, h_func):
        # lightweight version of sequencewrapper (rrslib.web.sequencewrapper)
        # targeted at repeated sequences of tags in one level of the DOM - there's no
        # analysis of data regions (we already found one), just looking for data
        # records in a very straightforward (and very stupid) way.
        # @param header_elem - lxml.html.HtmlElement representing header element
        # @param h_func - heuristic function returning true/false, has to accept
        #                 one parameter - element (instance of lxml.html.HtmlElement)
        # @return tuple (records, likelihood of data)
        tags = ListedDict()
        for elem in header_elem.iterchildren():
            if elem.tag in self._omitted_tags: continue
            tags[elem.tag] = elem
        (tag, elements) = tags.item_by_longest_value()
        if len(elements[0]) < 2:
            return (None, 0.0)
        res = []
        grouped_elements = []
        for e_group in elements:
            res.extend(filter(h_func, e_group))
            grouped_elements.extend(e_group)
        return (res, float(len(res))/float(len(grouped_elements)))


    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iter over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score*=5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [SimpleHTMLCleaner.clean(" ".join(paragraphs))]
            else:
                self._storage[hdrtext] = [ txts[max(txts.keys())] ]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return # TODO
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings(): pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iterate over siblings of the header and try to get keywords from their children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems, likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [kw.text_content() for kw in likelihood_to_keyword_tags[max(likelihood_to_keyword_tags.keys())][0]]

        # references
        elif hdrtext == 'references':
            pass # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100) or r.tag == 'blockquote'
            probability = ListedDict()
            # iterate over siblings of the header and try to get reviews from their children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts


    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
        while 42: #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else: break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean( elem.text_content() )
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization recognized the header -> data are below the header
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3: # probably more than three words
                    # is the domain name in the title? Then it is probably the
                    # general name of the website, not a publication title
                    if len(self.domain) > 6 and re.search(re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove standalone brackets
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(re.escape(txt), document.name, re.I):
                        possible_titles[int(self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for prop in self._storage:
            self._add_property(prop, self._storage[prop])


    def _get_longest_string(self, l):
        mx = None
        maxlen = 0
        for t in l:
            if len(t) > maxlen:
                maxlen = len(t)
                mx = t
        return mx
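    # A minimal equivalent sketch (not in the original): _get_longest_string could
    # also be written with the built-in max() and a key function, e.g.
    #
    #   def _get_longest_string(self, l):
    #       return max(l, key=len) if l else None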


    def _find_abstract(self, etree):
        c = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                    meta=True, page_structure=False, processing_instructions=True,
                    embedded=True, frames=False, forms=True, annoying_tags=True,
                    add_nofollow=False, remove_unknown_tags=False)
        etree_copy = deepcopy(etree)
        etree_copy = c.clean_html(etree_copy)
        html = tostring(etree_copy.getroot())
        # XXX deleting all <p> tags here is probably useful -- it joins text
        # split across paragraphs into longer runs between the remaining tags
        html = re.sub("</?p[^>]*>", " ", html)
        possible = []
        txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
        for txt in txts:
            txt = SimpleHTMLCleaner.clean(txt)
            if len(txt) > 200:
                do_not_append = False
                for bl in self._abstract_blacklist:
                    if txt.startswith(bl):
                        do_not_append = True
                        break
                if not do_not_append:
                    possible.append(txt)
                    continue
            for st in self._abstract_startswith:
                if txt.startswith(st):
                    possible.append(txt)
                    break
        return self._get_longest_string(possible)
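        # A short illustration (assumed example, not from the original) of the
        # lxml Cleaner step above: with scripts/style/comments enabled the cleaner
        # drops those subtrees, so only visible text remains between the tags.
        #
        #   from lxml.html import fromstring, tostring
        #   from lxml.html.clean import Cleaner
        #   c = Cleaner(scripts=True, style=True, comments=True, page_structure=False)
        #   tostring(c.clean_html(fromstring("<div><script>x()</script><p>Abstract ...</p></div>")))
        #   # -> the <script> element is gone, the <p> with the abstract text stays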


    def _find_unbound_entities(self, page):
        root = page.get_etree().getroot()
        # get abstract
        if 'abstract' not in self._publ:
            abst = self._find_abstract(page.get_etree())
            if abst is not None:
                self._publ['abstract'] = abst

        # find url of publication (pdf, ps, doc...)
        if 'url' not in self._publ:
            to_be_checked = []
            for (element, attribute, link, pos) in root.iterlinks():
                if re.search("(pdf|doc|odt|ps)$", link, re.I):
                    to_be_checked.append(link)
                # TODO try to get links with no suffix (queries etc.)
                # ----------------------------------------------------
                # ADD THE CODE HERE
                # ----------------------------------------------------
            if to_be_checked:
                documents = []
                mimes = self.mime_handler.start(to_be_checked)
                for link in mimes:
                    if mimes[link] in self._accepted_mime:
                        documents.append(link)
                dl = len(documents)
                doc_link = None
                if dl == 1: # exactly one link
                    doc_link = documents[0]
                elif dl != 0: # more than one
                    # try to guess from the title of the publication
                    if 'title' in self._publ:
                        doc_link = self._most_alike_term(self._publ['title'], documents, 0.5)
                if doc_link is not None:
                    try:
                        rel = RRSRelationshipPublicationUrl()
                        u = RRSUrl(link=doc_link)
                    except (RRSDatabaseAttributeError, RRSDatabaseEntityError, RRSDatabaseValueError):
                        return
                    u['type'] = RRSUrl_type(type='publication')
                    rel.set_entity(u)
                    self._publ['url'] = rel

        # Now extract unbound entities from the plaintext. For each of them
        # there is a high probability that the relationship will be mis-recognized.

        # get keywords if there are no such
        if 'keyword' not in self._publ:
            # Try to get keywords from the text. They are probably in the format:
            # Keywords: algorithm, algorithmic process, random, some other keyword
            kwre = re.search("keywords?:?[\t\n\r ]+(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}", self.pagetext, re.I)
            if kwre is not None:
                kwstr = kwre.group(0)
                kwstr = re.sub("[kK]eywords?:?[\t\n\r ]+", "", kwstr)
                keywords = [x.strip() for x in kwstr.split(",")]
                for kw in keywords:
                    rel = RRSRelationshipPublicationKeyword()
                    rel.set_entity(RRSKeyword(title=kw))
                    self._publ['keyword'] = rel
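            # Quick illustration (the sample input is an assumption, not from the
            # original) of what the pattern above is meant to catch in the page
            # plaintext:
            #
            #   "Keywords: ranking, web search, crawling"
            #
            # re.search() matches the whole phrase, the re.sub() strips the
            # leading "Keywords:" label and the split(",") yields the individual
            # keyword strings "ranking", "web search" and "crawling".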


    def _parse_bibtex(self, page):
        # Parse all possible BibTeX on the page.
        # First parse the plaintext of the page and try to get a BibTeX string
        # out of it. Then try to find links to BibTeX files, download them and
        # parse them.
        bibtexpubl = self.bibtexparser.parse(self.pagetext)
        if not bibtexpubl:
            # no BibTeX found in the plaintext -- try to get .bib or .bibtex
            # files linked from the page
            html_tag = page.get_etree().getroot()
            bibtex_links = set()
            for l in html_tag.iterlinks():
                if re.search("\.bib(tex)?$", l[2], re.I):
                    bibtex_links.add(l[2])
            if len(bibtex_links) == 1:
                r = self.crawler.start(bibtex_links)
                for link in bibtex_links:
                    bibtex_file = r[link]
                    if isinstance(bibtex_file, basestring):
                        bibtexpubl = self.bibtexparser.parse(bibtex_file)
                    else:
                        return
            else:
                # TODO handle more than one bibtex file
                return
        if not bibtexpubl:
            # the downloaded file did not contain any parseable BibTeX record
            return
        # process found bibtex
        publ = bibtexpubl[0]
        for attr in publ:
            value = publ[attr]
            # not set, useless
            if value is None:
                continue
            # list of relationship attrs
            elif isinstance(value, list):
                for v in value:
                    self._publ[attr] = v
            # own attribute
            else:
                self._publ[attr] = value


    def extract_data(self, etree, url):
        """
        Extract all possible data about the publication from the web page.
        @param etree - parsed DOM tree of the web page (has to be instance of
                       lxml.etree._ElementTree)
        @param url - url of the web page
        @return RRSPublication object containing extracted data
        """
        assert isinstance(url, basestring)
        assert isinstance(etree, _ElementTree)
        #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
        #            meta=False, page_structure=False, processing_instructions=True,
        #            embedded=True, frames=False, forms=True, annoying_tags=False,
        #            add_nofollow=False, remove_unknown_tags=False)
        #etree = c.clean_html(etree)
        self.url = url
        self.domain = re.sub("http://(www\.)?", "", self.url).split(".")[0]
        self._storage = {}
        self._publ = RRSPublication()
        cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
        page = HTMLDocument(cleaned_etree, url)
        self.pagetext = page.get_etree().getroot().text_content()
        # parse CSS and metadata on the page
        page.parse_document()
        # get data from <meta> tags and convert to RRS format
        self._parse_meta(page)
        # get data on the basis of the text visibility and recognized headers
        self._parse_visibility(page)
        # and now guess :)
        self._find_unbound_entities(page)
        # and parse BibTeX
        self._parse_bibtex(page)
        return self._publ
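    # A minimal usage sketch (the extractor class name and the URL are
    # assumptions, not taken from this file):
    #
    #   from lxml.html import parse
    #   extractor = PublicationExtractor()
    #   tree = parse("http://example.org/some-paper.html")
    #   publ = extractor.extract_data(tree, "http://example.org/some-paper.html")
    #   print publ['title']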