Example #1
def deduplicate_sentences(raws):
    """Split raws on '.', drop consecutive duplicate sentences, and drop any
    sentence that the following sentence merely extends."""
    logger = logging.getLogger(sys._getframe().f_code.co_name)

    #logger.debug('deduplicate_sentences')
    sentences = raws.split(u'.')
    sentences = [normalize_space(sentence) for sentence in sentences if len(normalize_space(sentence))>0]
    good_sentences = []
    prev_sentence = u''
    for sentence in sentences:
        #logger.debug(u'  checking: "{0}"'.format(sentence))
        if prev_sentence == sentence:
            #logger.debug(u'    DUPLICATE: IGNORED')
            pass
        elif len(prev_sentence) > 0 and len(prev_sentence) < len(sentence):
            #logger.debug(u'    checking if "{0}" starts with "{1}"'.format(sentence, prev_sentence))
            foo = sentence[:len(prev_sentence)]
            #logger.debug(u'      foo: {0}'.format(foo))
            if foo == prev_sentence:
                #logger.debug('    STARTS WITH: PREVIOUS REMOVED')
                good_sentences = good_sentences[0:-1]
            good_sentences.append(sentence)
        else:
            #logger.debug(u'    KEEP!')
            good_sentences.append(sentence)
        prev_sentence = sentence
    #logger.debug('good sentences follow')
    #for sentence in good_sentences:
        #logger.debug(u'    {0}'.format(sentence))
    return u'. '.join(good_sentences)
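
A minimal usage sketch (assuming only that normalize_space collapses runs of whitespace and strips the ends):

text = u'Open access journal. Open access journal. Open access journal of papyrology. Founded 2001.'
deduplicate_sentences(text)
# -> u'Open access journal of papyrology. Founded 2001'
# the exact repeat is dropped, and the first sentence is dropped because the
# third sentence merely extends it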
Example #2
    def _get_language(self, *args):
        """Classify the language of the concatenated arguments.

        Returns the detected language code when the classifier's confidence
        meets LANGID_THRESHOLD; otherwise returns None.
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)
        chunks = [chunk for chunk in args if chunk is not None]
        s = normalize_space(u' '.join(chunks))
        logger.debug('s: "{0}"'.format(s.encode('utf-8')))
        if s != u'':
            language = LANGUAGE_IDENTIFIER.classify(s)
            logger.debug(repr(language))
            if language[1] >= LANGID_THRESHOLD:
                return language[0]
        return None
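
A hedged usage sketch; parser stands in for an instance of the enclosing class, and LANGUAGE_IDENTIFIER is assumed to be a langid.py-style classifier whose classify() returns a (language_code, probability) pair:

code = parser._get_language(u'Annual report of the', None, u'excavation season')
# None arguments are skipped; returns a language code (e.g. u'en') when the
# classifier's confidence meets LANGID_THRESHOLD, otherwise None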
Example #3
    def _parse_peeps(self, rx_list, content_text):
        """Extract a list of personal names from content_text, using the first
        regex in rx_list that matches and splitting on commas and ' and '."""

        cooked = []
        raw = u''
        for rx in rx_list:
            m = rx.search(content_text)
            if m:
                raw = m.groups()[-1]
                break
        if len(raw) > 0:
            if u',' in raw:
                cracked = raw.split(u',')
            else:
                cracked = [raw,]
            for chunk in cracked:
                if u' and ' in chunk:
                    cooked.extend(chunk.split(u' and '))
                else:
                    cooked.append(chunk)
            cooked = [normalize_space(peep) for peep in cooked if len(normalize_space(peep)) > 0]
        return cooked
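
A hedged sketch of a call; parser and the regular expression are illustrative, not taken from the project:

import re
rx_list = [re.compile(r'compiled by (.+)$', re.IGNORECASE)]
parser._parse_peeps(rx_list, u'Compiled by Jane Doe, John Roe and Richard Miles')
# -> [u'Jane Doe', u'John Roe', u'Richard Miles']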
Example #4
def deduplicate_lines(raws):
    """Drop consecutive duplicate (or prefix-duplicated) lines, comparing a
    punctuation-stripped lowercase form of each line, and join the survivors
    into a single space-normalized string."""
    #logger = logging.getLogger(sys._getframe().f_code.co_name)
    #logger.debug('\n\ndeduplicating!')
    prev_line = u''
    good_lines = []
    cookeds = u''
    lines = raws.split(u'\n')
    lines = [normalize_space(line) for line in lines if normalize_space(line) != u'']
    for line in lines:
        #logger.debug(u'prev_line: {0}'.format(prev_line))
        #logger.debug(u'line: {0}'.format(line))
        canary = RX_CANARY.sub(u'', line.lower())
        #logger.debug(u'canary: {0}'.format(canary))
        if canary != u'':
            prev_length = len(prev_line)
            if prev_line != u'' and prev_length < len(canary):
                toucan = unicode(canary[:prev_length])
            else:
                toucan = u''
            #logger.debug(u'toucan: {0}'.format(toucan))
            if prev_line == u'':
                good_lines.append(line)
                #logger.debug('append initial!')
            elif toucan == prev_line:
                good_lines = good_lines[0:-1]
                #logger.debug('clawback!')
                good_lines.append(line)
            elif canary != prev_line:
                #logger.debug('append!')
                good_lines.append(line)
            else:
                #logger.debug('NEIN!')
                pass
        else:
            good_lines.append(line)
        prev_line = canary
    #logger.debug('good_lines follows')
    #for line in good_lines:
        #logger.debug(u'   {0}'.format(line))
    return normalize_space(u' '.join(good_lines))
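
A sketch of the intended effect, assuming RX_CANARY strips punctuation and whitespace (as defined in Example #9) and normalize_space collapses whitespace; the title and ISSN are invented:

raws = u'Open Access Journal: Example Studies\nOpen Access Journal: Example Studies\nISSN: 1234-5678'
deduplicate_lines(raws)
# -> u'Open Access Journal: Example Studies ISSN: 1234-5678'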
Example #5
def clean_string(raw):
    """Space-normalize raw and strip wrapping bracket/quote pairs, stray
    leading or trailing separator characters, and a leading 'and '."""
    prepped = normalize_space(raw)
    if prepped == u'':
        return u''
    chopped = prepped.split(u'.')
    if len(chopped) > 2:
        cooked = u'.'.join(tuple(chopped[:2]))
        i = 2
        #while i < len(chopped) and len(cooked) < 40: why truncation?
        while i < len(chopped):
            cooked = cooked + u'.' + chopped[i]
            i = i + 1
    else:
        cooked = prepped
    junk = [
        (u'(', u')'),
        (u'[', u']'),
        (u'{', u'}'),
        (u'"', u'"'),
        (u"'", u"'"),
        (u'<', u'>'),
        (u'«', u'»'),
        (u'‘', u'’'),
        (u'‚', u'‛'),
        (u'“', u'”'),
        (u'‟', u'„'),
        (u'‹', u'›'),
        (u'〟', u'"'),
        (u'\\'),
        (u'/'),
        (u'|'),
        (u','),
        (u';'),
        (u'-'),
        (u'.'),
        (u'_'),
    ]
    for j in junk:
        try:
            if len(j) == 2:
                cooked = cooked[1:-1] if cooked[0] == j[0] and cooked[-1] == j[1] else cooked
            else:
                cooked = cooked[1:] if cooked[0] == j[0] else cooked
                cooked = cooked[:-1] if cooked[-1] == j[0] else cooked
            if cooked[0:4] == u'and ':
                cooked = cooked[4:]
        except IndexError:
            pass
        else:
            cooked = cooked.strip()
    return cooked
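
A sketch of the stripping behavior (wrapping bracket pairs, stray trailing separators, and a leading 'and ' are removed); the titles are invented:

clean_string(u'(Journal of Example Studies)')
# -> u'Journal of Example Studies'
clean_string(u'and Journal of Example Studies,')
# -> u'Journal of Example Studies'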
Example #6
def ukey(raw):
    """Build a normalized comparison key from a string, unicode string, or
    list: lowercased, punctuation-stripped, with duplicate words removed."""
    raw_type = type(raw)
    if raw_type == list:
        uraw = u' '.join([unicode(chunk) for chunk in raw])
    elif raw_type == str:
        uraw = str(raw)
    elif raw_type == unicode:
        uraw = raw
    else:
        raise TypeError(u'ukey does not support arguments of type {0}'.format(raw_type))
    cooked = normalize_space(uraw)
    cooked = RX_PUNCTSTRIP.sub(u'', cooked)
    cooked = cooked.lower().split()
    cooked = list(set(cooked))
    cooked = u''.join(cooked)
    return cooked
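
A hedged sketch, assuming RX_PUNCTSTRIP strips punctuation such as the colon; the key concatenates a set of lowercased words, so it is intended for equality checks like the one in Example #11 rather than for display:

ukey(u'Open Access Journal: Example Studies') == ukey(u'open access  journal example studies')
# -> True: case, punctuation, and extra whitespace do not affect the key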
Example #7
    def _nodesplain(self, node, gloss=u'', include_source=False):
        """Provide copious information about this XML node."""
        logger = logging.getLogger(sys._getframe().f_code.co_name)
        template = u"""
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    >>> NODESPLANATION <<<
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    type: {node_type}
    name: {name}
    xpath: /{xpath}
    attributes: {attributes}
    text: {text}
    gloss: {gloss}
    source: {source}
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        """

        name = node.name
        try:
            text = normalize_space(u' '.join([string for string in node.stripped_strings]))
        except AttributeError:
            text = u'None'
        try:
            attributes = pprint.pformat(node.attrs)
        except AttributeError:
            attributes = u'None'
        count = str(1+len([t for t in node.previous_siblings if t.name == name]))
        path = ['{name}[{count}]'.format(name=name, count=count)]
        for parent in node.parents:
            if type(parent) != NavigableString:
                parent_name = parent.name
                count = str(1+len([t for t in parent.previous_siblings if t.name == parent_name]))
                path = ['{name}[{count}]'.format(name=parent_name, count=count)] + path
        root = [p for p in node.parents][1]
        params = {
            'node_type': type(node),
            'name': name,
            'xpath': '/'.join(path),
            'attributes': attributes,
            'text': text,
            'gloss': gloss,
            'source': root.prettify() if include_source else u''
        }
        return template.format(**params)
Example #8
    def _clean_keywords(self, raw_tags):
        """Deduplicate, split, and normalize keyword tags, enforcing the
        capitalization conventions recorded in awol_title_strings.csv."""
        tags = list(set(raw_tags))
        keywords = []
        for tag in tags:
            if tag == u'':
                pass
            elif u',' in tag:
                keywords.extend(tag.split(u','))
            else:
                keywords.append(tag)
        keywords = sorted([normalize_space(kw) for kw in list(set(keywords))], key=lambda s: s.lower())
        for tag in keywords:
            if tag == tag.upper():
                pass
            elif tag.lower() in TITLE_SUBSTRING_TAGS:
                pass
            elif tag != tag.lower():
                raise ValueError(u'keyword "{0}" lacks an appropriate entry in awol_title_strings.csv'.format(tag))
        return list(set(keywords))
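
A hedged sketch; parser is an instance of the enclosing class, and the tags are all lowercase so none of them trips the capitalization check:

parser._clean_keywords([u'epigraphy, papyrology', u'epigraphy', u''])
# -> [u'epigraphy', u'papyrology'] (order not guaranteed; the result comes from a set)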
Example #9
import os
import re

import pkg_resources
import unicodecsv

from isaw.awol.normalize_space import normalize_space
from isaw.awol.resource import Resource

PATH_CURRENT = os.path.dirname(os.path.abspath(__file__))
# Build a dictionary of format {<colon prefix>:<list of cols 2,3 and 4>}
colon_prefix_csv = pkg_resources.resource_stream('isaw.awol',
                                                 'awol_colon_prefixes.csv')
dreader = unicodecsv.DictReader(
    colon_prefix_csv,
    fieldnames=['col_pre', 'omit_post', 'strip_title', 'mul_res'],
    delimiter=',',
    quotechar='"')
COLON_PREFIXES = dict()
for row in dreader:
    COLON_PREFIXES.update({
        normalize_space(row['col_pre']).lower():
        [row['omit_post'], row['strip_title'], row['mul_res']]
    })
del dreader
DOMAINS_TO_IGNORE = ['draft.blogger.com']
DOMAINS_SECONDARY = ['ancientworldonline.blogspot.com']
LANGID_THRESHOLD = 0.95
RX_CANARY = re.compile(r'[\.,:!\"“„\;\-\s]+', re.IGNORECASE)
RX_NUMERICISH = re.compile(r'^a?n?d?\s*[\.,:!\"“„\;\-\s\d\(\)\[\]]+$',
                           re.IGNORECASE)
RX_MATCH_DOMAIN = re.compile(r'^https?://([^/#]+)')
RX_IDENTIFIERS = {
    'issn': {
        'electronic': [
            re.compile(
                r'(electronic|e-|e‒|e–|e—|e|online|on-line|digital)([\s:]*issn[^\d]*[\dX-‒–—]{4}[-‒–—\s]?[\dX]{4})',
Example #10
 def digdigdig(this_node, first_node, stop_tags, skip_first_anchor, previous_urls):
     """Recursively collect text from this_node and its descendants, returning
     (stop, results); stop becomes True when a stop tag other than the starting
     node is encountered."""
     node_type = type(this_node)
     node_name = this_node.name
     try:
         node_url = this_node.get('href')
     except AttributeError:
         node_url = ''
     if node_url is None:
         node_url = ''
     if '/' in node_url:
         chunks = node_url.split('/')
         if chunks[-1] in ['index.html', 'index.php', '', None]:
             node_url = '/'.join(chunks[:-1])
     results = []
     if (
         this_node != first_node
         and node_name in stop_tags
         and (
             node_name != 'a'
             or (
                 'a' in stop_tags
                 and node_name == 'a'
                 and (
                         (
                         skip_first_anchor
                         and len(previous_urls) == 0
                         )
                     or (
                         not(skip_first_anchor)
                         and len(previous_urls) > 0
                         and node_url != previous_urls[-1]
                         )
                     )
                 )
             )
         ):
         return (True, results)
     if node_name == 'a':
         previous_urls.append(node_url)
     try:
         previous_text = normalize_space(this_node.previous_sibling.get_text())
     except AttributeError:
         previous_text = u''
     try:
         previous_last = previous_text[-1]
     except IndexError:
         previous_last = previous_text
     if node_name == 'br' and previous_last != u'.':
         results.append(u'. ')
     if node_type == NavigableString:
         results.append(unicode(this_node))
     else:
         try:
             descendants = this_node.descendants
         except AttributeError:
             pass
         else:
             if descendants is not None:
                 for child in this_node.children:
                     stop, child_results = digdigdig(child, first_node, stop_tags, skip_first_anchor, previous_urls)
                     results.extend(child_results)
                     if stop:
                         return (stop, results)
     return (False, results)
Example #11
    def _get_description(self, context=None, title=u''):
        """Walk the parsed HTML from the first content node, collecting text
        until a stop tag is reached, then deduplicate lines and sentences to
        produce a description string (or None if nothing usable is found)."""
        if context is None:
            c = self.content
            soup = c['soup']
            first_node = soup.body.contents[0]
            skip_first_anchor = True
        else:
            first_node = context
            skip_first_anchor = False

        def digdigdig(this_node, first_node, stop_tags, skip_first_anchor, previous_urls):
            node_type = type(this_node)
            node_name = this_node.name
            try:
                node_url = this_node.get('href')
            except AttributeError:
                node_url = ''
            if node_url is None:
                node_url = ''
            if '/' in node_url:
                chunks = node_url.split('/')
                if chunks[-1] in ['index.html', 'index.php', '', None]:
                    node_url = '/'.join(chunks[:-1])
            results = []
            if (
                this_node != first_node
                and node_name in stop_tags
                and (
                    node_name != 'a'
                    or (
                        'a' in stop_tags
                        and node_name == 'a'
                        and (
                                (
                                skip_first_anchor
                                and len(previous_urls) == 0
                                )
                            or (
                                not(skip_first_anchor)
                                and len(previous_urls) > 0
                                and node_url != previous_urls[-1]
                                )
                            )
                        )
                    )
                ):
                return (True, results)
            if node_name == 'a':
                previous_urls.append(node_url)
            try:
                previous_text = normalize_space(this_node.previous_sibling.get_text())
            except AttributeError:
                previous_text = u''
            try:
                previous_last = previous_text[-1]
            except IndexError:
                previous_last = previous_text
            if node_name == 'br' and previous_last != u'.':
                results.append(u'. ')
            if node_type == NavigableString:
                results.append(unicode(this_node))
            else:
                try:
                    descendants = this_node.descendants
                except AttributeError:
                    pass
                else:
                    if descendants is not None:
                        for child in this_node.children:
                            stop, child_results = digdigdig(child, first_node, stop_tags, skip_first_anchor, previous_urls)
                            results.extend(child_results)
                            if stop:
                                return (stop, results)
            return (False, results)

        def skiptomalou(first_node, stop_tags, skip_first_anchor):
            previous_urls = []
            stop, desc_lines = digdigdig(first_node, first_node, stop_tags, skip_first_anchor, previous_urls)
            node = first_node
            while True:
                previous_node = node
                node = node.next_sibling
                if node is None:
                    break
                try:
                    node_name = node.name
                except AttributeError:
                    node_name = type(node)
                try:
                    node_url = node.get('href')
                except AttributeError:
                    node_url = ''
                if node_url is None:
                    node_url = ''
                if '/' in node_url:
                    chunks = node_url.split('/')
                    if chunks[-1] in ['index.html', 'index.php', '', None]:
                        node_url = '/'.join(chunks[:-1])
                if (
                    node_name in stop_tags
                    and (
                        node_name != 'a'
                        or (
                            'a' in stop_tags
                            and node_name == 'a'
                            and (
                                    (
                                    not(skip_first_anchor)
                                    and len(previous_urls) == 0
                                    )
                                or (
                                    skip_first_anchor
                                    and len(previous_urls) > 0
                                    and node_url != previous_urls[-1]
                                    )
                                )
                            )
                        )
                    ):
                    break
                if node_name == 'a':
                    previous_urls.append(node_url)
                stop, results = digdigdig(node, first_node, stop_tags, skip_first_anchor, previous_urls)
                desc_lines.extend(results)
                if stop:
                    break
            return desc_lines

        stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'ol', 'ul', 'dl', 'dt', 'li', 'table']
        desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)

        stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        if len(desc_lines) == 0:
            desc_lines = skiptomalou(first_node, stop_tags, False)
        elif ukey(desc_lines) == ukey(title):
            desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)
        if len(desc_lines) == 0:
            desc_text = None
        else:
            desc_text = u''.join(desc_lines)
            if len(desc_text) == 0:
                desc_text = None
            else:
                desc_text = desc_text.replace(u'%IMAGEREPLACED%', u'').strip()
                desc_text = RX_PUNCT_FIX.sub(r'\1', desc_text)
                desc_text = deduplicate_sentences(desc_text)
                desc_text = RX_PUNCT_DEDUPE.sub(r'\1', desc_text)
                desc_text = normalize_space(desc_text)
                if len(desc_text) == 0:
                    desc_text = None
                elif desc_text[-1] != u'.':
                    desc_text += u'.'

        return desc_text
Example #12
    def _get_resources(self, article):
        """Override basic resource extraction."""
        if article.url == u'http://ancientworldonline.blogspot.com/2015/05/universitat-wien-theses-and.html':
            resources = []
            relatives = []
            soup = article.soup
            people = soup.find_all('span', 'person_name')
            for person in people:
                a = person.find_next_sibling('a')
                foo = a.find_next_sibling('br').next_sibling
                try:
                    bar = a.find_next_sibling('span',
                                              'person_name').previous_sibling
                except AttributeError:
                    bar = None
                description = u''
                while foo is not None and foo != bar:
                    if type(foo) == NavigableString:
                        description += u'{0} '.format(
                            clean_string(unicode(foo)))
                    else:
                        description += u'{0} '.format(
                            clean_string(foo.get_text()))
                    foo = foo.next_sibling
                if description.strip() == u'':
                    description = None
                else:
                    description = normalize_space(u'. '.join(
                        [chunk.strip() for chunk in description.split(u'.')]))
                foosball = a.find_all_previous('a')
                foosball = [f for f in foosball if 'subjects' in f.get('href')]
                if len(foosball) > 0:
                    f = foosball[0]
                    params = {
                        'domain':
                        domain_from_url(f.get('href')),
                        'keywords':
                        self._parse_keywords(
                            resource_title=clean_string(f.get_text())),
                        'languages':
                        self._get_language(clean_string(f.get_text())),
                        'title':
                        clean_string(f.get_text()),
                        'url':
                        f.get('href')
                    }
                    rr = self._make_resource(**params)
                    self._set_provenance(rr, article)
                    relatives.append(rr)
                params = {
                    'authors': [
                        clean_string(person.get_text()),
                    ],
                    'description':
                    description,
                    'domain':
                    domain_from_url(a.get('href')),
                    'keywords':
                    self._parse_keywords(post_title=rr.title,
                                         resource_title=clean_string(
                                             a.get_text())),
                    'languages':
                    self._get_language(clean_string(a.get_text())),
                    'title':
                    clean_string(a.get_text()),
                    'url':
                    a.get('href'),
                    'year':
                    clean_string(unicode(person.next_sibling)),
                }
                resource = self._make_resource(**params)

                resource.related_resources.append(rr.package())
                self._set_provenance(resource, article)
                resources.append(resource)
            relative_urls = list(set([r.url for r in relatives]))
            unique_relatives = []
            for rurl in relative_urls:
                unique_relatives.append(
                    [r for r in relatives if r.url == rurl][0])
            return resources + unique_relatives
        else:
            return AwolDomainParser._get_resources(self, article)
Example #13
File: article.py Project: JoannaAshe/COACS
    def _load_atom(self, atom_file_name):
        """Open atom file and parse for basic info.

        We attempt to set the following attributes on the class:

         * id (string): tag id for this atom entry
         * title (unicode): title of the original blog post
         * url (string): url for the original blog post
         * categories (dictionary) with the following keys:
           * 'vocabulary' (string): captures "scheme" from the entry categories
           * 'term' (string): verbatim from the entry categories
         * content (unicode): normalized unicode string containing everything
           that was in the entry content (see normalization comments below)
         * soup (bs4 BeautifulSoup object): html-parsed version of content

        All strings are space normalized (i.e., all contiguous spans of
        whitespace are collapsed to a single space and the resulting string is
        stripped of leading and trailing whitespace).

        All unicode strings (title and content) are converted to Unicode
        Normalization Form "C" (canonical composition).
        """

        logger = logging.getLogger(sys._getframe().f_code.co_name)

        with open(atom_file_name, 'r') as file_object:
            self.doc = exml.parse(file_object)
        self.root = self.doc.getroot()
        root = self.root
        self.id = root.find('{http://www.w3.org/2005/Atom}id').text.strip()
        #logger.debug('article id: "{0}"'.format(self.id))

        # title of blog post should be same as title of atom entry
        raw_title = str(root.find('{http://www.w3.org/2005/Atom}title').text)
        try:
            self.title = purify_text(
                normalize_space(unicodedata.normalize('NFC', raw_title)))
        except TypeError:
            msg = 'could not extract blog post title for article with id: "{0}"'.format(
                self.id)
            raise RuntimeWarning(msg)

        else:
            #logger.debug(u'article title: "{0}"'.format(self.title))
            pass

        # get url of blog post (html alternate)
        try:
            raw_url = str(
                root.xpath("//*[local-name()='link' and @rel='alternate']")
                [0].get('href'))
        except IndexError:
            msg = 'could not extract blog post URL for article with id: "{0}"'.format(
                self.id)
            raise RuntimeError(msg)
        else:
            try:
                raw_url = normalize_space(unicodedata.normalize(
                    'NFC', raw_url))
            except TypeError:
                msg = 'could not normalize blog post URL for article with id: "{0}"'.format(
                    self.id)
                raise RuntimeError(msg)
            else:
                if urls.valid(raw_url):
                    self.url = raw_url
                else:
                    msg = 'invalid blog post URL ({0}) for article with id: "{1}"'.format(
                        raw_url, self.id)
                    raise RuntimeError(msg)

        # capture categories as vocabulary terms
        self.categories = [{
            'vocabulary':
            c.get('scheme'),
            'term':
            normalize_space(unicodedata.normalize('NFC', str(c.get('term'))))
        } for c in root.findall('{http://www.w3.org/2005/Atom}category')]

        # extract content, normalize, and parse as HTML for later use
        raw_content = root.find('{http://www.w3.org/2005/Atom}content').text
        soup = BeautifulSoup(
            raw_content,
            'lxml')  # mainly to convert character entities to unicode
        soup_content = str(soup)
        del soup
        content = unicodedata.normalize('NFC', soup_content)
        del soup_content
        content = normalize_space(content)
        content = purify_html(
            content)  # get rid of all manner of evil, stupid stuff
        self.content = content
        try:
            html = exml.fromstring(content, XML_PARSER)
        except XMLSyntaxError:
            msg = 'XMLSyntaxError while trying to parse content of {0}; trying html5lib parser with BeautifulSoup and then lxml parser with recover=True'.format(
                atom_file_name)
            logger.warning(msg)
            soup = BeautifulSoup(raw_content, 'html5lib')
            soup_content = str(soup)
            del soup
            content = unicodedata.normalize('NFC', soup_content)
            del soup_content
            content = normalize_space(content)
            content = purify_html(content)
            self.content = content
            try:
                html = exml.fromstring(content, XML_PARSER_LENIENT)
            except XMLSyntaxError:
                msg = 'XMLSyntaxError while trying to re-parse content of {0} using html5lib parser with BeautifulSoup'.format(
                    atom_file_name)
                logger.error(msg)
                logger.error(content)
                sys.exit(-1000)

        #logger.debug('normalized html:\n\n' + exml.tostring(html, pretty_print=True))
        transform = exml.XSLT(XSL_CLEANUP)
        clean_html = transform(html)
        #logger.debug('cleaned html:\n\n' + exml.tostring(clean_html, pretty_print=True))
        self.soup = BeautifulSoup(exml.tostring(clean_html), 'lxml')
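
A hypothetical driver, assuming the enclosing class is named Article and that its constructor calls _load_atom() with the path to a Blogger Atom entry file; the file name is invented:

article = Article('awol-entry-12345.atom')      # hypothetical file name
print(article.title)                            # NFC- and space-normalized post title
print(article.url)                              # the entry's rel="alternate" link
print([c['term'] for c in article.categories])  # vocabulary terms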