示例#1
0
    def pages(self):
        """Collect the pages matched to this record by title and by DOI.

        Title matches (``self.page_matches_by_title_filtered``) are included
        only when ``self.normalized_title`` is set and is rejected by neither
        ``title_is_too_short`` nor ``title_is_too_common``. DOI matches
        (``self.page_matches_by_doi``) are always appended afterwards and get
        their ``scrape_evidence`` overwritten with the DOI-match reason.

        Returns:
            A new list of page objects. The list is empty when
            ``max_pages_from_one_repo`` reports 3 or more for the repo ids of
            the filtered title matches.
        """
        # Snapshot the title matches once, as a copy. The result list is built
        # by appending, and appending to the object handed back by the
        # attribute would mutate it in place; the repo-count check below must
        # also see title matches only, regardless of what was appended.
        title_matches = list(self.page_matches_by_title_filtered)

        my_pages = []
        title = self.normalized_title
        if title and not title_is_too_short(title) and not title_is_too_common(title):
            # titles that are too short or too common are not matched by title
            my_pages = list(title_matches)

        # do dois last, because the page objects are actually the same, not
        # copies, and then they get the doi reason
        for my_page in self.page_matches_by_doi:
            my_page.scrape_evidence = u"oa repository (via OAI-PMH doi match)"
            my_pages.append(my_page)

        # eventually only apply this filter to matches by title, once pages only includes
        # the doi when it comes straight from the pmh record
        if max_pages_from_one_repo([p.repo_id for p in title_matches]) >= 3:
            logger.info(
                u"matched too many pages in one repo, not allowing matches")
            return []

        return my_pages
示例#2
0
    def query_for_num_pub_matches(self):
        """Count the Pub rows whose normalized title equals this object's.

        Returns:
            The row count from the database query, or -1 when the title is
            reported as too common or too short and the query is skipped.
        """
        from pmh_record import title_is_too_common, title_is_too_short
        from pub import Pub

        title = self.normalized_title

        # it takes too long to query for things like "tablecontents",
        # so bail out before touching the database
        skip_query = title_is_too_common(title) or title_is_too_short(title)
        if skip_query:
            logger.info(u"title is too common or too short, not scraping")
            return -1

        same_title_ids = db.session.query(Pub.id).filter(
            Pub.normalized_title == title)
        return same_title_ids.count()
示例#3
0
    def query_for_num_pub_matches(self):
        """Return how many Pub rows share this object's normalized title.

        Returns:
            -1 without querying when the title is flagged as too common or
            too short; otherwise the count of Pub ids whose
            ``normalized_title`` equals ``self.normalized_title``.
        """
        from pmh_record import title_is_too_common, title_is_too_short
        from pub import Pub

        normalized = self.normalized_title

        # titles like "tablecontents" make the query too slow; skip them
        if title_is_too_common(normalized) or title_is_too_short(normalized):
            logger.info(u"title is too common or too short, not scraping")
            return -1

        title_query = db.session.query(Pub.id)
        title_query = title_query.filter(Pub.normalized_title == normalized)
        return title_query.count()