Example #1
class PublicationListExtractor(object):
    """
    PublicationListExtractor harvests metadata from web pages containing
    references (publication lists). For parsing sequences in the HTML DOM we
    use extractors.sequencewrapper.HTMLSequenceWrapper. For parsing citations
    (records in the data regions found by the sequence wrapper) we use
    extractors.citationentityextractor.CitationEntityExtractor.

    To improve the accuracy of this system, we check whether headers contain a
    keyword which could help us determine the correct type of publication.

    From headers we also harvest keywords.
    """

    entitydefstr = {
        '216': 'O',
        '217': 'U',
        '214': 'O',
        '197': 'A',
        '198': 'E',
        '210': 'O',
        '211': 'O',
        '195': 'A',
        '194': 'A',
        '196': 'A',
        '193': 'A',
        '192': 'A',
        '251': 'u',
        '252': 'u',
        '238': 'i',
        '239': 'i',
        '235': 'e',
        '234': 'e',
        '212': 'O',
        '236': 'e',
        '237': 'i',
        '230': 'e',
        '231': 'c',
        '232': 'e',
        '213': 'O',
        '224': 'a',
        '249': 'u',
        '253': 'y',
        '248': 'o',
        '243': 'o',
        '255': 'y',
        '250': 'u',
        '233': 'e',
        '201': 'E',
        '200': 'E',
        '203': 'E',
        '202': 'E',
        '205': 'I',
        '204': 'I',
        '207': 'I',
        '206': 'I',
        '242': 'o',
        '220': 'U',
        '245': 'o',
        '244': 'o',
        '246': 'o',
        '241': 'n',
        '218': 'U',
        '229': 'a',
        '228': 'a',
        '227': 'a',
        '226': 'a',
        '225': 'a',
        '219': 'U',
        '221': 'Y',
        # these were added later; note that '248', '232' and '200' repeat keys
        # from above, so the later values ('r', 'c', 'C') are the ones that
        # end up in the dict
        '248': 'r',
        '185': 's',
        '174': 'Z',
        '232': 'c',
        '200': 'C',
        '169': 'S',
        '190': 'z',
        '199': 'C',
        'amp': '&',
        'nbsp': ' ',
        'quot': '\"'
    }

    def __init__(self, xmlcompatibility='db09'):
        self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0,
                                              headercoef=3.0,
                                              mintextlen=30)
        self.citaextractor = CitationEntityExtractor(
            ALL, xmlcompatibility=xmlcompatibility)
        self.ee = EntityExtractor()
        self.mime = MIMEhandler()
        self.crawler = Crawler()
        self.bibtex = BibTeXParser()
        self.xmlcompatibility = xmlcompatibility
        self._xmlvalid = int(xmlcompatibility.lstrip('db'))
        self._publ_list = []

    def _set_new_topic(self, publ, kw):
        """
        This method adds a new topic to the publication.
        """
        if not re.search("[a-z]{4,}", kw):
            return publ
        if re.search("publi|paper", kw, re.I):
            return publ
        t = RRSTopic(title=kw)
        publ.set('topic', t)
        return publ

    def _set_publ_type(self, header, publ):
        def _floor(i):
            # cap the credibility value at 100
            if i > 100: i = 100
            return i

        if header is None: return publ
        # try to set publication type from header
        for _type in RRSPublication.publication_types:
            if re.search(_type, header, re.I):
                if publ.get('type') == _type:
                    publ.set('credibility', _floor(publ.get('credibility')))
                else:
                    publ.set('type', _type)
                return publ
        if re.search("dissertation", header, re.I):
            publ.set('type', 'phdthesis')
            return publ
        if re.search('technical report', header, re.I):
            publ.set('type', 'techreport')
            return publ
        # make keyword from header
        return self._set_new_topic(publ, header)

    def translate_html_entities(self, text):
        ents = re.findall(r'&(#?)(x?)(\w+);', text)
        for ent in set(ents):
            try:
                text = re.sub('&(#?)' + re.escape(ent[2]) + ";",
                              self.entitydefstr[ent[2]], text)
            except KeyError:
                # the entity is not in our translation table - leave it as is
                pass
        return text
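    # For illustration (given the mapping above): translate_html_entities
    # turns "caf&#233; &amp; bar" into "cafe & bar", while an entity missing
    # from entitydefstr (e.g. "&#345;") is left untouched.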

    def compare_chunks_to_extracted(self, chunks, publ):
        if not publ.get('title'): return publ
        title = self.translate_html_entities(publ.get('title'))
        authors = publ.get('person_author') or []
        author_names = [a.get('name')[0].get('full_name') for a in authors]
        for ch in chunks:
            l = ch.get_link()
            # get chunk text
            ch = self.translate_html_entities(ch.get_text())
            # add url if available
            if l is not None and not l.startswith("javascript") and l != "#":
                u = RRSUrl(type='publication', title=ch, link=l)
                publ.set('url', u)

            # repair title if needed
            if ch in title or ch == title:
                if float(len(ch)) / float(len(title)) > 0.4:
                    publ.set('title', ch)
            # repair names if needed
            for a in author_names:
                if a in ch:
                    authors_extracted = self.ee.find_authors(ch)
                    publ.person_author = authors_extracted[0]
                    break
        return publ
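    # For illustration: if the citation extractor over-extracted the title as
    # "Fast Wrapper Induction for Noisy Pages In Proceedings of ..." while one
    # of the HTML chunks (typically the anchor text of the link) is exactly
    # "Fast Wrapper Induction for Noisy Pages", the chunk lies inside the
    # extracted title and covers more than 40 % of it, so it replaces the title.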

    def _fill_citation(self, publ):
        c = RRSCitation()
        c.set('content', self.cita_text)
        if publ.get('event'):
            c.set('event', publ.get('event')[0].get('title'))
        return c

    def _handle_bibtex_pages(self):
        urls = {}
        for i, p in enumerate(self._publ_list):
            pub_u = p.get('url')
            for u in pub_u:
                urls[u.get('link')] = i

        # if the link is a web page, not a PDF
        urls_to_download = []
        content_types = self.mime.start(urls.keys())
        for k in urls.keys():
            if content_types[k] in ('text/html', 'application/xhtml+xml',
                                    'application/x-httpd-php',
                                    'text/javascript'):
                urls_to_download.append(k)
        # download the page and try to find BibTeX on it
        pages = self.crawler.start(urls_to_download)

        for u in urls_to_download:
            bibtex = self.bibtex.parse(pages[u])
            # if bibtex on page, set publication
            if bibtex is not None:
                self._publ_list[urls[u]] = bibtex

    def _empty(self):
        del self._publ_list[:]
        self.cita_text = None

    def _handle_document(self, doc):
        self._empty()
        # for all regions which were found
        for reg in doc.get_regions():
            # get their header
            header = reg.get_name()
            # for all records in region
            for rec in reg._manual_process_page():
                # create empty citation object
                c = RRSCitation()
                # harvest the record text (hopefully a citation)
                self.cita_text = self.translate_html_entities(rec.get_text())
                # set the content of record to citation object
                c.set('content', self.cita_text)
                # fill the object with the extracted data
                c = self.citaextractor.extract(c)

                # get extracted publication
                publ = c.get('publication_cited')
                # if the sequence wrapper extracted some text chunks, it helps
                # us a lot, because we can compare the extracted data to the
                # chunks and fix it if they do not match
                publ = self.compare_chunks_to_extracted(rec.get_chunks(), publ)
                # insert citation into publication
                # !!! we are extracting publications, not citations, because we
                # don't want a tree like citation->publication, but rather
                # publication->citation
                publ.set('citation', self._fill_citation(publ))
                # try to find publication type in header of data region
                publ = self._set_publ_type(header, publ)
                # add to publication list
                self._publ_list.append(publ)
        #self._handle_bibtex_pages()
        return self._publ_list

    #---------------------------------------------------------------------------
    # public methods
    #---------------------------------------------------------------------------
    def extract_data(self, tree, url):
        """
        Main method for extracting publication metadata from page.
        """
        # wrap html document
        document = self.seqwrapper.wrap_h(tree, url)
        # handle it and return the result
        return self._handle_document(document)
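
A minimal usage sketch for PublicationListExtractor. It assumes the rrslib
imports used by the class (re, HTMLSequenceWrapper, CitationEntityExtractor,
RRSCitation, ...) are available in the module and that
HTMLSequenceWrapper.wrap_h accepts an lxml-parsed HTML tree; the file name and
URL are placeholder values.

import lxml.html

if __name__ == '__main__':
    tree = lxml.html.parse('publications.html')
    extractor = PublicationListExtractor(xmlcompatibility='db09')
    # extract_data wraps the page and returns a list of RRSPublication objects
    for publ in extractor.extract_data(tree, 'http://example.org/publications.html'):
        print publ.get('title'), publ.get('type')
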
Example #3
class PublicationPageExtractor(object):
    """
    This class wraps all methods for recognizing a publication and its
    description on a web page.

    The result of page processing is the ORM object
    rrslib.db.model.RRSPublication with the extracted data.
    """
    # what could an abstract start with
    _abstract_startswith = ('We present', 'This paper', 'This publica',
                            'In this p', 'The paper', 'This book')
    _abstract_blacklist = ('CiteU', 'Was ', 'I ha', 'I pre', 'I was', 'CiteS',
                           'Microso')
    # tags representing lists in HTML
    _list_tags = ('ol', 'ul', 'dl')
    # tags representing list items in HTML
    _list_item_tags = ('li', 'dd', 'dt')
    # omitted tags - they are useless for our purpose
    _omitted_tags = ('form', 'option', 'link', 'style', 'meta', 'head',
                     'script')
    # acceptable mime types for documents (publications)
    _accepted_mime = ('application/pdf', 'application/rtf',
                      'application/postscript', 'application/msword')

    def __init__(self, headercoef=2.5):
        """
        Constructor.
        @param headercoef: lower bound of an element's visibility for it to be treated as a header
        """
        self.generalizer = _RRSPropertyGeneralizer()
        self.ee = EntityExtractor()
        self.headercoef = headercoef
        self.bibtexparser = BibTeXParser()
        self.crawler = Crawler()
        self.mime_handler = MIMEhandler()
        self.crawler.set_handler(FileDownloader)

    def _get_visibility2elem_map(self, etree):
        htmlroot = etree.getroot()
        visibility2elem_map = {}
        for elem in htmlroot.iterdescendants():
            # no need to get those
            if elem.tag in self._omitted_tags: continue
            assert hasattr(elem, 'style')
            v = elem.style.get_visibility()
            if v in visibility2elem_map:
                visibility2elem_map[v].append(elem)
            else:
                visibility2elem_map[v] = [elem]
        return visibility2elem_map
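    # For illustration (assuming style.get_visibility() returns a numeric
    # score): the resulting map groups elements by how visually prominent
    # their text is, e.g. {7.5: [<h1>], 3.0: [<h2>, <h2>], 1.0: [<p>, <p>]}.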

    def _classify_publ_title(self, title, init=70):
        def _bracket_alone(text):
            if text is not None and re.search(" [\(\)\{\}\[\]] ", text):
                return True
            return False

        # default credibility
        cred = init
        # remove prepositions and words with len(w) < 3
        title = re.sub("(?<= )(?:[fF]or|[iI]n|[oO]f|[oO]n|[aA]t|[Tt]he|"\
                       "[Aa]nd|[aA]|[Ii]s|[Ww]e|[Tt]o)(?= )", "", title)
        title = re.sub("^(?:A|The|In|Is|To) ", "", title)
        title = re.sub("[ ]+", " ", title)
        # split into chunks
        title_sp = title.split(" ")
        _bad = 0
        # if there are many chunks (almost-words) with no meaning, reduce
        # credibility value
        _blacklisted = '(?<![a-z])(?:department|faculty|university|society|lab|press)(?![a-z]+)'
        for chunk in title_sp:
            if re.search(_blacklisted, chunk, re.I):
                _bad += 1
            elif re.search("[a-z]{3,}", chunk, re.I):
                pass
            else:
                _bad += 1
        # guess accuracy
        negative = float(_bad) / float(len(title_sp))
        cred = float(cred) - negative * 65
        # bonus if all chunks are OK
        if _bad == 0:
            cred += 20
        # if there is a standalone bracket in the title, reduce credibility
        if _bracket_alone(title):
            cred -= 15
        # floor
        if cred > 100: cred = 100
        if cred < 0: cred = 0
        return cred
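    # A worked example of the scoring above: for the title
    # "The Art of Computer Programming" the leading "The" and the inner "of"
    # are stripped, leaving three content chunks ("Art", "Computer",
    # "Programming"); none of them is blacklisted, so _bad == 0 and, with the
    # default init=70, the credibility becomes 70 - 0*65 + 20 = 90.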

    def _most_alike_term(self, term, target, threshold=0.3):
        assert len(target) > 0
        assert term is not None
        l = ListedDict()
        for t in target:
            s = SequenceMatcher(None, term, t)
            l[float(s.ratio())] = t
        m = max(l)
        if m < float(threshold):
            return None
        return l[m][0]
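    # For illustration: _most_alike_term('tech report', ['article',
    # 'techreport', 'misc'], 0.4) returns 'techreport', since its
    # SequenceMatcher ratio against the term is the highest one and above the
    # threshold; if no candidate reaches the threshold, None is returned.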

    def _add_property(self, property, values):
        # Add the property and its value to the publication;
        # it may not be a plain attribute but a related entity
        firstval = values[0]
        if firstval is None: return
        # First try to get attributes of the publication
        if property in ('abstract', 'isbn', 'issn', 'volume', 'number',
                        'acronym', 'note'):
            _type = self._publ.__types__[property]
            if _type is basestring:
                _type = unicode
            self._publ[property] = (_type)(firstval)
        elif property == 'title':
            self._publ[property] = firstval
            # the origin is from meta-tag, so we are pretty sure about this information
            self._publ['credibility'] = 95
        elif property == 'publisher':
            self._publ['publisher'] = RRSOrganization(title=firstval)
        elif property == 'date':
            r = self.ee.find_published_date(firstval)
            if not r[0]: return
            rrsdate = r[0][0]
            for attr in ('year', 'month'):
                if rrsdate.get(attr) is not None:
                    self._publ[attr] = rrsdate.get(attr)
        elif property == 'type':
            # choose better heuristics
            publtype = self._most_alike_term(firstval, publication_types, 0.4)
            if publtype is not None:
                self._publ['type'] = RRSPublication_type(type=publtype)
        elif property == 'pages':
            if re.search("[0-9]+\-{1,2}[0-9]+", firstval):
                self._publ['pages'] = firstval
        elif property == 'start page':
            if not re.search("^[0-9]+$", firstval): return
            if 'end page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (firstval,
                                                 self._storage['end page'][0])
            else:
                self._storage[property] = [firstval]
        elif property == 'end page':
            if not re.search("^[0-9]+$", firstval): return
            if 'start page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (
                    self._storage['start page'][0], firstval)
            else:
                self._storage[property] = [firstval]
        # --------------------------------------------------
        # Now other entities connected with the publication
        # --------------------------------------------------
        elif property == 'topic':
            for topictitle in values:
                rel = RRSRelationshipPublicationTopic()
                rel.set_entity(RRSTopic(title=topictitle))
                self._publ['topic'] = rel
        elif property == 'url':
            for link in values:
                try:
                    rel = RRSRelationshipPublicationUrl()
                    u = RRSUrl(link=link)
                except (RRSDatabaseAttributeError, RRSDatabaseEntityError,
                        RRSDatabaseValueError):
                    return
                u['type'] = RRSUrl_type(type='publication')
                rel.set_entity(u)
                self._publ['url'] = rel
        elif property == 'keywords':
            for kw in values:
                rel = RRSRelationshipPublicationKeyword()
                rel.set_entity(RRSKeyword(title=kw))
                self._publ['keyword'] = rel
        elif property in ('author', 'editor'):
            for person in values:
                rel = RRSRelationshipPersonPublication()
                rel['editor'] = property == 'editor'
                r = self.ee.find_authors(person)
                if not r[0]: return
                rel.set_entity(r[0][0])
                rel['author_rank'] = len(self._publ['person']) + 1
                self._publ['person'] = rel
        elif property == 'reviews':
            # TODO if real reviews get implemented in the DB schema, change
            # this: add RRSReview objects into the publication
            self._publ.set("review", values, strict=False)

    def _parse_meta(self, document):
        doc_meta = document.get_meta_map()
        # transform into generalized form
        for key in doc_meta:
            property = self.generalizer.generalize(key)
            if property is None: continue
            if property in self._storage:
                for val in doc_meta[key]:
                    if val not in self._storage[property]:
                        self._storage[property].append(val)
            else:
                self._storage[property] = doc_meta[key]
        # make authors and editors disjoint sets
        if 'author' in self._storage and 'editor' in self._storage:
            to_del = []
            for a in self._storage['author']:
                if a in self._storage['editor']:
                    to_del.append(a)
            for a in to_del:
                self._storage['author'].remove(a)
        # and now just set the values into real RRS objects
        for property in self._storage:
            self._add_property(property, self._storage[property])
        self._storage = {}

    def _find_local_sequence(self, header_elem, h_func):
        # lightweight version of sequencewrapper (rrslib.web.sequencewrapper)
        # targeted at repeated sequences of tags in one level of the DOM - there
        # is no analysis of data regions (we already found one), just looking
        # for data records in a very straightforward (and very stupid) way.
        # @param header_elem - lxml.html.HtmlElement representing header element
        # @param h_func - heuristic function returning true/false, has to accept
        #                 one parameter - element (instance of lxml.html.HtmlElement)
        # @return tuple (records, likelihood of data)
        tags = ListedDict()
        for elem in header_elem.iterchildren():
            if elem.tag in self._omitted_tags: continue
            tags[elem.tag] = elem
        (tag, elements) = tags.item_by_longest_value()
        if len(elements[0]) < 2:
            return (None, 0.0)
        res = []
        grouped_elements = []
        for e_group in elements:
            res.extend(filter(h_func, e_group))
            grouped_elements.extend(e_group)
        return (res, float(len(res)) / float(len(grouped_elements)))

    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iter over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score *= 5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [
                    SimpleHTMLCleaner.clean(" ".join(paragraphs))
                ]
            else:
                self._storage[hdrtext] = [txts[max(txts.keys())]]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return  # TODO - the loop below is not reached yet
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings():
                        pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iterate over the header's siblings and try to get keywords from their children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems,
                 likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [
                kw.text_content() for kw in likelihood_to_keyword_tags[max(
                    likelihood_to_keyword_tags.keys())][0]
            ]

        # references
        elif hdrtext == 'references':
            pass  # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass  # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100
                                   ) or r.tag == 'blockquote'
            probability = ListedDict()
            # iterate over the header's siblings and try to get reviews from their children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts

    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
        while 42:  #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else:
                break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean(elem.text_content())
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization recognized the header -> the data are below it
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header itself contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3:  # probably more than three words
                    # is there a domain name in the title? Then it is probably
                    # the general name of the website
                    if len(self.domain) > 6 and re.search(
                            re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove standalone brackets
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(
                            re.escape(txt), document.name, re.I):
                        possible_titles[int(
                            self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(
                            self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for property in self._storage:
            self._add_property(property, self._storage[property])

    def _get_longest_string(self, l):
        mx = None
        maxlen = 0
        for t in l:
            if len(t) > maxlen:
                maxlen = len(t)
                mx = t
        return mx

    def _find_abstract(self, etree):
        c = Cleaner(scripts=True,
                    javascript=True,
                    comments=True,
                    style=True,
                    meta=True,
                    page_structure=False,
                    processing_instructions=True,
                    embedded=True,
                    frames=False,
                    forms=True,
                    annoying_tags=True,
                    add_nofollow=False,
                    remove_unknown_tags=False)
        etree_copy = deepcopy(etree)
        etree_copy = c.clean_html(etree_copy)
        html = tostring(etree_copy.getroot())
        # XXX it is probably useful to delete all <p> tags here...
        html = re.sub("</?p[^>]*>", " ", html)
        possible = []
        txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
        for txt in txts:
            txt = SimpleHTMLCleaner.clean(txt)
            if len(txt) > 200:
                do_not_append = False
                for bl in self._abstract_blacklist:
                    if txt.startswith(bl):
                        do_not_append = True
                        break
                if not do_not_append:
                    possible.append(txt)
                    continue
            for st in self._abstract_startswith:
                if txt.startswith(st):
                    possible.append(txt)
                    break
        return self._get_longest_string(possible)
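    # For illustration: any cleaned text node longer than 200 characters whose
    # start is not blacklisted, or any node starting with one of the phrases
    # in _abstract_startswith (e.g. "This paper"), becomes a candidate, and
    # the longest candidate is returned as the abstract.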

    def _find_unbound_entities(self, page):
        root = page.get_etree().getroot()
        # get abstract
        if not 'abstract' in self._publ:
            abst = self._find_abstract(page.get_etree())
            if abst is not None:
                self._publ['abstract'] = abst

        # find url of publication (pdf, ps, doc...)
        if 'url' not in self._publ:
            to_be_checked = []
            for (element, attribute, link, pos) in root.iterlinks():
                if re.search("(pdf|doc|odt|ps)$", link, re.I):
                    to_be_checked.append(link)
                # TODO try to get links with no suffix (queries etc.)
                # ----------------------------------------------------
                # ADD THE CODE HERE
                # ----------------------------------------------------
            if to_be_checked:
                documents = []
                mimes = self.mime_handler.start(to_be_checked)
                for link in mimes:
                    if mimes[link] in self._accepted_mime:
                        documents.append(link)
                dl = len(documents)
                doc_link = None
                if dl == 1:  # exactly one link
                    doc_link = documents[0]
                elif dl != 0:  # more than one
                    # try to guess out of the name of the publication
                    if 'title' in self._publ:
                        doc_link = self._most_alike_term(
                            self._publ['title'], documents, 0.5)
                if doc_link is not None:
                    try:
                        rel = RRSRelationshipPublicationUrl()
                        u = RRSUrl(link=doc_link)
                    except (RRSDatabaseAttributeError, RRSDatabaseEntityError,
                            RRSDatabaseValueError):
                        return
                    u['type'] = RRSUrl_type(type='publication')
                    rel.set_entity(u)
                    self._publ['url'] = rel

        # Now extract unbound entities from the plaintext.
        # There is always a high probability that the relationship will be
        # mis-recognized.

        # get keywords if there are none yet
        if not 'keyword' in self._publ:
            # Try to get keywords from the text. They are probably in the format:
            # Keywords: algorithm, algorithmic process, random, some other keyword
            kwre = re.search(
                "keywords?:?[\t\n\r ]+(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}",
                self.pagetext, re.I)
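            # For illustration: in a page text containing
            # "Keywords: parsing, wrapper induction, metadata" the expression
            # matches the whole phrase; the "Keywords:" prefix is then stripped
            # and the publication gets the keywords "parsing",
            # "wrapper induction" and "metadata".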
            if kwre is not None:
                kwstr = kwre.group(0)
                kwstr = re.sub("[kK]eywords?:?[\t\n\r ]+", "", kwstr)
                keywords = [x.strip() for x in kwstr.split(",")]
                for kw in keywords:
                    rel = RRSRelationshipPublicationKeyword()
                    rel.set_entity(RRSKeyword(title=kw))
                    self._publ['keyword'] = rel

    def _parse_bibtex(self, page):
        # Parse all possible BibTeX on the page.
        # First parse the plaintext of the page and try to get a BibTeX string
        # out of it. Then try to find possible links to BibTeX files, download
        # them and parse them.
        bibtexpubl = self.bibtexparser.parse(self.pagetext)
        # if no BibTeX was found in the plaintext, try to get .bib or .bibtex
        # files linked from the page
        if not bibtexpubl:
            html_tag = page.get_etree().getroot()
            bibtex_links = set()
            for l in html_tag.iterlinks():
                if re.search("\.bib(tex)?$", l[2], re.I):
                    bibtex_links.add(l[2])
            if len(bibtex_links) == 1:
                r = self.crawler.start(bibtex_links)
                for link in bibtex_links:
                    bibtex_file = r[link]
                    if isinstance(bibtex_file, basestring):
                        bibtexpubl = self.bibtexparser.parse(bibtex_file)
                    else:
                        return
            else:
                # TODO handle more than one BibTeX file???
                return
        # process found bibtex
        publ = bibtexpubl[0]
        for attr in publ:
            value = publ[attr]
            # not set, useless
            if value is None:
                continue
            # list of relationship attrs
            elif isinstance(value, list):
                for v in value:
                    self._publ[attr] = v
            # own attribute
            else:
                self._publ[attr] = value

    def extract_data(self, etree, url):
        """
        Extract all possible data about the publication from the web page.
        @param etree - parsed DOM tree of the web page (has to be instance of
                       lxml.etree._ElementTree)
        @param url - url of the web page
        @return RRSPublication object containing extracted data
        """
        assert isinstance(url, basestring)
        assert isinstance(etree, _ElementTree)
        #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
        #            meta=False, page_structure=False, processing_instructions=True,
        #            embedded=True, frames=False, forms=True, annoying_tags=False,
        #            add_nofollow=False, remove_unknown_tags=False)
        #etree = c.clean_html(etree)
        self.url = url
        # strip the scheme and a possible leading "www." to get the domain name
        self.domain = re.sub("https?://(www\.)?", "", self.url).split(".")[0]
        self._storage = {}
        self._publ = RRSPublication()
        cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
        page = HTMLDocument(cleaned_etree, url)
        self.pagetext = page.get_etree().getroot().text_content()
        # parse CSS and metadata on the page
        page.parse_document()
        # get data from <meta> tags and convert them to RRS format
        self._parse_meta(page)
        # get data on the basis of text visibility and recognized headers
        self._parse_visibility(page)
        # and now guess :)
        self._find_unbound_entities(page)
        # and parse BibTeX
        self._parse_bibtex(page)
        return self._publ
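
A minimal usage sketch for PublicationPageExtractor, assuming the rrslib
imports used by the class are available; extract_data expects an
lxml.etree._ElementTree (lxml.html.parse returns one) and the URL of the page.
The file name and URL are placeholder values.

import lxml.html

if __name__ == '__main__':
    tree = lxml.html.parse('paper-page.html')
    extractor = PublicationPageExtractor(headercoef=2.5)
    # the result is the ORM object rrslib.db.model.RRSPublication
    publ = extractor.extract_data(tree, 'http://example.org/paper-page.html')
    print publ
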
class PublicationPageExtractor(object):
    """
    This class wraps all methods for recognition publication and it's description
    on the web page.

    The result of page-processing is ORM object rrslib.db.model.RRSPublication
    with extracted data.
    """
    # what could an abstract start with
    _abstract_startswith = ('We present', 'This paper', 'This publica',
                            'In this p', 'The paper', 'This book')
    _abstract_blacklist = ('CiteU', 'Was ', 'I ha', 'I pre', 'I was', 'CiteS',
                           'Microso')
    # tags representing lists in HTML
    _list_tags = ('ol', 'ul', 'dl')
    # tags representing list items in HTML
    _list_item_tags = ('li', 'dd', 'dt')
    # ommitted tags - they are useless for this reason
    _omitted_tags = ('form', 'option', 'link', 'style', 'meta', 'head', 'script')
    # acceptable mime types for documents (publications)
    _accepted_mime = ('application/pdf', 'application/rtf', 'application/postscript', 'application/msword')

    def __init__(self, headercoef=2.5):
        """
        Constructor.
        @param headercoef: lower border of elemet's visibility to be handled as header
        """
        self.generalizer = _RRSPropertyGeneralizer()
        self.ee = EntityExtractor()
        self.headercoef = headercoef
        self.bibtexparser = BibTeXParser()
        self.crawler = Crawler()
        self.mime_handler = MIMEhandler()
        self.crawler.set_handler(FileDownloader)


    def _get_visibility2elem_map(self, etree):
        htmlroot = etree.getroot()
        visibility2elem_map = {}
        for elem in htmlroot.iterdescendants():
            # no need to get those
            if elem.tag in self._omitted_tags: continue
            assert hasattr(elem, 'style')
            v = elem.style.get_visibility()
            if v in visibility2elem_map:
                visibility2elem_map[v].append(elem)
            else:
                visibility2elem_map[v] = [elem]
        return visibility2elem_map


    def _classify_publ_title(self, title, init=70):
        def _bracket_alone(text):
            if text is not None and re.search(" [\(\)\{\}\[\]] ", text):
                return True
            return False
        # default credibility
        cred = init
        # remove prepositions and words with len(w) < 3
        title = re.sub("(?<= )(?:[fF]or|[iI]n|[oO]f|[oO]n|[aA]t|[Tt]he|"\
                       "[Aa]nd|[aA]|[Ii]s|[Ww]e|[Tt]o)(?= )", "", title)
        title = re.sub("^(?:A|The|In|Is|To) ", "", title)
        title = re.sub("[ ]+", " ", title)
        # split into chunks
        title_sp = title.split(" ")
        _bad = 0
        # if there are many chunks (almost-words) with no meaning, reduce
        # credibility value
        _blacklisted = '(?<![a-z])(?:department|faculty|universtiy|society|lab|press|)(?![a-z]+)'
        for chunk in title_sp:
            if re.search(_blacklisted, chunk, re.I):
                _bad += 1
            elif re.search("[a-z]{3,}", chunk, re.I):
                pass
            else:
                _bad += 1
        # guess accuracy
        negative = float(_bad) / float(len(title_sp))
        cred = float(cred) - negative * 65
        # bonus if all chunks are OK
        if _bad == 0:
            cred += 20
        # if there in title is bracket alone, reduce credibility
        if _bracket_alone(title):
            cred -= 15
        # floor
        if cred > 100: cred = 100
        if cred < 0: cred = 0
        return cred


    def _most_alike_term(self, term, target, threshold=0.3):
        assert len(target) > 0
        assert term is not None
        l = ListedDict()
        for t in target:
            s = SequenceMatcher(None, term, t)
            l[float(s.ratio())] = t
        m = max(l)
        if m < float(threshold):
            return None
        return l[m][0]

    def _add_property(self, property, values):
        # Add the property and its value into publication
        # maybe it is not a property, but an entity
        firstval = values[0]
        if firstval is None: return
        # First try to get attributes of the publication
        if property in ('abstract', 'isbn', 'issn', 'volume',
                        'number', 'acronym', 'issn', 'note'):
            _type = self._publ.__types__[property]
            if _type is basestring:
                _type = unicode
            self._publ[property] = (_type)(firstval)
        elif property == 'title':
            self._publ[property] = firstval
            # the origin is from meta-tag, so we are pretty sure about this information
            self._publ['credibility'] = 95
        elif property == 'publisher':
            self._publ['publisher'] = RRSOrganization(title=firstval)
        elif property == 'date':
            r = self.ee.find_published_date(firstval)
            if not r[0]: return
            rrsdate = r[0][0]
            for attr in ('year', 'month'):
                if rrsdate.get(attr) is not None:
                    self._publ[attr] = rrsdate.get(attr)
        elif property == 'type':
            # choose better heuristics
            publtype = self._most_alike_term(firstval, publication_types, 0.4)
            if publtype is not None:
                self._publ['type'] = RRSPublication_type(type=publtype)
        elif property == 'pages':
            if re.search("[0-9]+\-{1,2}[0-9]+", firstval):
                self._publ['pages'] = firstval
        elif property == 'start page':
            if not re.search("^[0-9]+$", firstval): return
            if 'end page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (firstval, self._storage['end page'][0])
            else:
                self._storage[property] = [firstval]
        elif property == 'end page':
            if not re.search("^[0-9]+$", firstval): return
            if 'start page' in self._storage and not 'pages' in self._publ:
                self._publ['pages'] = "%s-%s" % (self._storage['start page'][0], firstval)
            else:
                self._storage[property] = [firstval]
        # --------------------------------------------------
        # Now other entities connected with the publiacation
        # --------------------------------------------------
        elif property == 'topic':
            for topictitle in values:
                rel = RRSRelationshipPublicationTopic()
                rel.set_entity(RRSTopic(title=topictitle))
                self._publ['topic'] = rel
        elif property == 'url':
            for link in values:
                try:
                    rel = RRSRelationshipPublicationUrl()
                    u = RRSUrl(link=link)
                except (RRSDatabaseAttributeError, RRSDatabaseEntityError, RRSDatabaseValueError):
                    return
                u['type'] = RRSUrl_type(type='publication')
                rel.set_entity(u)
                self._publ['url'] = rel
        elif property == 'keywords':
            for kw in values:
                rel = RRSRelationshipPublicationKeyword()
                rel.set_entity(RRSKeyword(title=kw))
                self._publ['keyword'] = rel
        elif property in ('author', 'editor'):
            for person in values:
                rel = RRSRelationshipPersonPublication()
                rel['editor'] = property == 'editor'
                r = self.ee.find_authors(person)
                if not r[0]: return
                rel.set_entity(r[0][0])
                rel['author_rank'] = len(self._publ['person']) + 1
                self._publ['person'] = rel
        elif property == 'reviews':
            # TODO if real reviews would be implemented in DB schema,
            # change this: add RRSReview objects into publication
            self._publ.set("review", values, strict=False)


    def _parse_meta(self, document):
        doc_meta = document.get_meta_map()
        # transform into generalized form
        for key in doc_meta:
            property = self.generalizer.generalize(key)
            if property is None: continue
            if property in self._storage:
                for val in doc_meta[key]:
                    if val not in self._storage[property]:
                        self._storage[property].append(val)
            else:
                self._storage[property] = doc_meta[key]
        # make authors and editors disjoint sets
        if 'author' in self._storage and 'editor' in self._storage:
            to_del = []
            for a in self._storage['author']:
                if a in self._storage['editor']:
                    to_del.append(a)
            for a in to_del:
                self._storage['author'].remove(a)
        # and now just set the values into real RRS objects
        for property in self._storage:
            self._add_property(property, self._storage[property])
        self._storage= {}


    def _find_local_sequence(self, header_elem, h_func):
        # lightweight version of sequencewrapper (rrslib.web.sequencewrapper)
        # targeted to repeated sequence of tags in one level of DOM - there's no
        # analysis of data regions (we already found one), just looking for data
        # records in a very straightforward (and very stupid) way.
        # @param header_elem - lxml.html.HtmlElement representing header element
        # @param h_func - heuristic function returning true/false, has to accept
        #                 one parameter - element (instance of lxml.html.HtmlElement)
        # @return tuple (records, likelihood of data)
        tags = ListedDict()
        for elem in header_elem.iterchildren():
            if elem.tag in self._omitted_tags: continue
            tags[elem.tag] = elem
        (tag, elements) = tags.item_by_longest_value()
        if len(elements[0]) < 2:
            return (None, 0.0)
        res = []
        grouped_elements = []
        for e_group in elements:
            res.extend(filter(h_func, e_group))
            grouped_elements.extend(e_group)
        return (res, float(len(res))/float(len(grouped_elements)))


    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iter over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score*=5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [SimpleHTMLCleaner.clean(" ".join(paragraphs))]
            else:
                self._storage[hdrtext] = [ txts[max(txts.keys())] ]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return # TODO
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings(): pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iter over siblings of header a try to get keywords from its children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems, likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [kw.text_content() for kw in likelihood_to_keyword_tags[max(likelihood_to_keyword_tags.keys())][0]]

        # references
        elif hdrtext == 'references':
            pass # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100) or r.tag == 'blockquote'
            probability = ListedDict()
            # iter over siblings of header a try to get reviews from its children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts


    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
        while 42: #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else: break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean( elem.text_content() )
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization found header beeing TITLE -> data are below header
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header itself contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3: # probably more than three words
                    # is the domain name in the title? Then it is probably the
                    # general name of the website
                    if len(self.domain) > 6 and re.search(re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove bracketed chunks such as "(pdf)" or "[bib]"
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(re.escape(txt), document.name, re.I):
                        possible_titles[int(self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for prop in self._storage:
            self._add_property(prop, self._storage[prop])


    def _get_longest_string(self, l):
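        # return the longest string of the list l (None if l is empty)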
        mx = None
        maxlen = 0
        for t in l:
            if len(t) > maxlen:
                maxlen = len(t)
                mx = t
        return mx


    def _find_abstract(self, etree):
        c = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                    meta=True, page_structure=False, processing_instructions=True,
                    embedded=True, frames=False, forms=True, annoying_tags=True,
                    add_nofollow=False, remove_unknown_tags=False)
        etree_copy = deepcopy(etree)
        etree_copy = c.clean_html(etree_copy)
        html = tostring(etree_copy.getroot())
        # XXX it is probably useful to delete all <p> tags here...
        html = re.sub("</?p[^>]*>", " ", html)
        possible = []
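        # collect every raw text node, i.e. anything between a closing '>' and
        # the following '<'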
        txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
        for txt in txts:
            txt = SimpleHTMLCleaner.clean(txt)
            if len(txt) > 200:
                do_not_append = False
                for bl in self._abstract_blacklist:
                    if txt.startswith(bl):
                        do_not_append = True
                        break
                if not do_not_append:
                    possible.append(txt)
                    continue
            for st in self._abstract_startswith:
                if txt.startswith(st):
                    possible.append(txt)
                    break
        return self._get_longest_string(possible)


    def _find_unbound_entities(self, page):
        root = page.get_etree().getroot()
        # get abstract
        if 'abstract' not in self._publ:
            abst = self._find_abstract(page.get_etree())
            if abst is not None:
                self._publ['abstract'] = abst

        # find url of publication (pdf, ps, doc...)
        if 'url' not in self._publ:
            to_be_checked = []
            for (element, attribute, link, pos) in root.iterlinks():
                if re.search("(pdf|doc|odt|ps)$", link, re.I):
                    to_be_checked.append(link)
                # TODO try to get links with no suffix (queries etc.)
                # ----------------------------------------------------
                # ADD THE CODE HERE
                # ----------------------------------------------------
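                # A possible approach (an illustrative sketch only, not the
                # original behaviour): also queue query-style links whose URL
                # hints at a document download and let the MIME check below
                # filter out false positives. The keywords used are guesses.
                #
                # elif re.search("download|fulltext|attachment|[?&](file|paper|id)=",
                #                link, re.I):
                #     to_be_checked.append(link)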
            if to_be_checked:
                documents = []
                mimes = self.mime_handler.start(to_be_checked)
                for link in mimes:
                    if mimes[link] in self._accepted_mime:
                        documents.append(link)
                dl = len(documents)
                doc_link = None
                if dl == 1: # exactly one link
                    doc_link = documents[0]
                elif dl != 0: # more than one
                        # try to guess from the title of the publication
                    if 'title' in self._publ:
                        doc_link = self._most_alike_term(self._publ['title'], documents, 0.5)
                if doc_link is not None:
                    try:
                        rel = RRSRelationshipPublicationUrl()
                        u = RRSUrl(link=doc_link)
                    except (RRSDatabaseAttributeError, RRSDatabaseEntityError, RRSDatabaseValueError):
                        return
                    u['type'] = RRSUrl_type(type='publication')
                    rel.set_entity(u)
                    self._publ['url'] = rel

        # Now extract unbound entities from the plain text.
        # There is always a high probability that the relationship will be
        # mis-recognized.

        # get keywords if there are no such
        if 'keyword' not in self._publ:
            # Try to get keywords from the text. They are probably in the format:
            # Keywords: algorithm, algorithmic process, random, some other keyword
            kwre = re.search("keywords?:?[\t\n\r ]+(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}", self.pagetext, re.I)
            if kwre is not None:
                kwstr = kwre.group(0)
                kwstr = re.sub("[kK]eywords?:?[\t\n\r ]+", "", kwstr)
                keywords = [x.strip() for x in kwstr.split(",")]
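                # e.g. "Keywords: parsing, regular expressions, text mining"
                # yields ['parsing', 'regular expressions', 'text mining']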
                for kw in keywords:
                    rel = RRSRelationshipPublicationKeyword()
                    rel.set_entity(RRSKeyword(title=kw))
                    self._publ['keyword'] = rel


    def _parse_bibtex(self, page):
        # Parse all possible BibTeX records on the page.
        # First parse the plain text of the page and try to get a BibTeX string
        # out of it. Then try to find possible links to bibtex files, download
        # them and parse them.
        bibtexpubl = self.bibtexparser.parse(self.pagetext)
        if not bibtexpubl:
            # no BibTeX found in the plain text - try to get .bib or .bibtex
            # files linked from the page
            html_tag = page.get_etree().getroot()
            bibtex_links = set()
            for l in html_tag.iterlinks():
                if re.search("\.bib(tex)?$", l[2], re.I):
                    bibtex_links.add(l[2])
            if len(bibtex_links) == 1:
                r = self.crawler.start(bibtex_links)
                for link in bibtex_links:
                    bibtex_file = r[link]
                    if isinstance(bibtex_file, basestring):
                        bibtexpubl = self.bibtexparser.parse(bibtex_file)
                    else:
                        return
            else:
                # TODO handle more than one bibtex file???
                return
        # process the found bibtex record; give up if nothing was parsed
        if not bibtexpubl:
            return
        publ = bibtexpubl[0]
        for attr in publ:
            value = publ[attr]
            # not set, useless
            if value is None:
                continue
            # list of relationship attrs
            elif isinstance(value, list):
                for v in value:
                    self._publ[attr] = v
            # own attribute
            else:
                self._publ[attr] = value


    def extract_data(self, etree, url):
        """
        Extract all possible data about the publication from the web page.
        @param etree - parsed DOM tree of the web page (has to be instance of
                       lxml.etree._ElementTree)
        @param url - url of the web page
        @return RRSPublication object containing extracted data
        """
        assert isinstance(url, basestring)
        assert isinstance(etree, _ElementTree)
        #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
        #            meta=False, page_structure=False, processing_instructions=True,
        #            embedded=True, frames=False, forms=True, annoying_tags=False,
        #            add_nofollow=False, remove_unknown_tags=False)
        #etree = c.clean_html(etree)
        self.url = url
        # e.g. "http://www.example.org/page" -> "example" (second-level domain)
        self.domain = re.sub("https?://(www\.)?", "", self.url).split(".")[0]
        self._storage = {}
        self._publ = RRSPublication()
        cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
        page = HTMLDocument(cleaned_etree, url)
        self.pagetext = page.get_etree().getroot().text_content()
        # parse CSS and metadata on the page
        page.parse_document()
        # get data from <meta> tags and convert them to RRS format
        self._parse_meta(page)
        # get data on the basis of the text visibility and recognized headers
        self._parse_visibility(page)
        # and now guess :)
        self._find_unbound_entities(page)
        # and parse BibTeX
        self._parse_bibtex(page)
        return self._publ
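    # A minimal usage sketch (assuming an instance of this extractor class is
    # available as `extractor`; the URL below and the use of lxml.html.parse
    # are illustrative only):
    #
    #   from lxml.html import parse
    #   url = "http://www.example.org/~smith/publication.html"
    #   etree = parse(url)              # lxml.etree._ElementTree
    #   publication = extractor.extract_data(etree, url)
    #   print publication.get('title'), publication.get('credibility')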