示例#1
0
 def _get_primary_anchor(self):
     """Deal with OI peculiarities.

     Walks the anchors returned by the base class and returns the first
     whose domain is not in NEVER_PRIMARY_DOMAINS and whose URL is not in
     MY_SKIP_URLS. If no anchor passes the filter, the last anchor seen is
     returned (pre-existing fall-through behavior, preserved here); if the
     base class yields no anchors at all, returns None instead of raising
     UnboundLocalError as the previous version did.
     """
     pa = None  # guard: previously unbound -> NameError when no anchors exist
     for pa in AwolDomainParser._get_anchors(self):
         url = pa.get('href')
         domain = domain_from_url(url)
         if domain not in NEVER_PRIMARY_DOMAINS and url not in MY_SKIP_URLS:
             break
     return pa
示例#2
0
 def _reconcile_titles(self, anchor_title=None, article_title=None):
     """Override some ASCSA titles."""
     logger = logging.getLogger(sys._getframe().f_code.co_name)
     #logger.debug(u'anchor_title: "{0}"'.format(anchor_title))
     #logger.debug(u'article_title: "{0}"'.format(article_title))
     if anchor_title == u"newsletter's home page here" and article_title == u'ákoue News':
         return (article_title, )
     else:
         return AwolDomainParser._reconcile_titles(self, anchor_title,
                                                   article_title)
示例#3
0
    def _get_resources(self, article):
        """Override basic resource extraction.

        One specific AWOL post (a list of Universität Wien theses) gets a
        hand-rolled scrape of its HTML structure; every other article is
        delegated to AwolDomainParser._get_resources. For the special post,
        each 'person_name' span yields one thesis resource, and each
        preceding 'subjects' anchor yields a related "subject page" resource
        (deduplicated by URL before being returned).
        """
        if article.url == u'http://ancientworldonline.blogspot.com/2015/05/universitat-wien-theses-and.html':
            resources = []   # one resource per thesis author
            relatives = []   # subject-page resources, possibly duplicated
            soup = article.soup
            people = soup.find_all('span', 'person_name')
            for person in people:
                # The thesis link is the first <a> after the author's span.
                a = person.find_next_sibling('a')
                # foo: first node after the <br> that follows the link —
                # the start of the free-text description run.
                foo = a.find_next_sibling('br').next_sibling
                # bar: node just before the NEXT author's span, i.e. the
                # end sentinel for this description; None for the last
                # author (AttributeError when no further span exists).
                try:
                    bar = a.find_next_sibling('span',
                                              'person_name').previous_sibling
                except AttributeError:
                    bar = None
                description = u''
                # Accumulate text node-by-node until the sentinel (or the
                # end of the document) is reached.
                while foo is not None and foo != bar:
                    if type(foo) == NavigableString:
                        description += u'{0} '.format(
                            clean_string(unicode(foo)))
                    else:
                        description += u'{0} '.format(
                            clean_string(foo.get_text()))
                    foo = foo.next_sibling
                if description.strip() == u'':
                    description = None
                else:
                    # Re-join sentences with '. ' after trimming whitespace
                    # around each period-delimited chunk.
                    description = normalize_space(u'. '.join(
                        [chunk.strip() for chunk in description.split(u'.')]))
                # Nearest preceding anchor whose href mentions 'subjects'
                # is treated as this thesis's subject page.
                foosball = a.find_all_previous('a')
                foosball = [f for f in foosball if 'subjects' in f.get('href')]
                if len(foosball) > 0:
                    f = foosball[0]
                    params = {
                        'domain':
                        domain_from_url(f.get('href')),
                        'keywords':
                        self._parse_keywords(
                            resource_title=clean_string(f.get_text())),
                        'languages':
                        self._get_language(clean_string(f.get_text())),
                        'title':
                        clean_string(f.get_text()),
                        'url':
                        f.get('href')
                    }
                    rr = self._make_resource(**params)
                    self._set_provenance(rr, article)
                    relatives.append(rr)
                # NOTE(review): if foosball is empty on the FIRST iteration,
                # rr is unbound below (rr.title / rr.package()) -> NameError;
                # on later iterations the previous author's rr is reused.
                # Presumably every author in this post has a subject anchor
                # — confirm against the live page before relying on this.
                params = {
                    'authors': [
                        clean_string(person.get_text()),
                    ],
                    'description':
                    description,
                    'domain':
                    domain_from_url(a.get('href')),
                    'keywords':
                    self._parse_keywords(post_title=rr.title,
                                         resource_title=clean_string(
                                             a.get_text())),
                    'languages':
                    self._get_language(clean_string(a.get_text())),
                    'title':
                    clean_string(a.get_text()),
                    'url':
                    a.get('href'),
                    'year':
                    clean_string(unicode(person.next_sibling)),
                }
                resource = self._make_resource(**params)

                resource.related_resources.append(rr.package())
                self._set_provenance(resource, article)
                resources.append(resource)
            # Deduplicate subject-page resources by URL, keeping the first
            # occurrence of each.
            relative_urls = list(set([r.url for r in relatives]))
            unique_relatives = []
            for rurl in relative_urls:
                unique_relatives.append(
                    [r for r in relatives if r.url == rurl][0])
            return resources + unique_relatives
        else:
            return AwolDomainParser._get_resources(self, article)
示例#4
0
 def __init__(self):
     """Initialize a parser bound to the Universität Wien theses domain."""
     self.domain = 'othes.univie.ac.at'
     AwolDomainParser.__init__(self)
示例#5
0
 def __init__(self):
     """Initialize a parser bound to the Oriental Institute domain."""
     self.domain = 'oi.uchicago.edu'
     AwolDomainParser.__init__(self)
示例#6
0
 def reset(self, content_soup=None):
     """Reset parser state, then register one extra URL to be skipped."""
     AwolDomainParser.reset(self, content_soup)
     newsletter_url = 'http://www.ascsa.edu.gr/index.php/news/newsDetails/school-newsletter-now-online'
     self.skip_urls.append(newsletter_url)
示例#7
0
 def __init__(self):
     """Initialize a parser bound to the ASCSA domain."""
     self.domain = 'www.ascsa.edu.gr'
     AwolDomainParser.__init__(self)