Example #1
    def test_01_same_fulltext(self):
        """Check duplication detection on articles with the same fulltext URL"""

        # A list of various URLs to check matching on
        ftus = [
            "http://examplejournal.telfor.rs/Published/Vol1No1/Vol1No1_A5.pdf",
            "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf",
            "http://www.ujcem.med.sumdu.edu.ua/images/sampledata/2013/4/408_412_IV-020.pdf",
            "http://www.psychologie-aktuell.com/fileadmin/download/ptam/1-2014_20140324/01_Geiser.pdf"
        ]

        for ftu in ftus:
            # make ourselves an example article
            a = models.Article()
            b = a.bibjson()
            b.title = "Example article with a fulltext url"
            b.add_url(ftu, urltype="fulltext")
            a.save(blocking=True)

            # create a replacement article
            z = models.Article()
            y = z.bibjson()
            y.title = "Replacement article for fulltext url"
            y.add_url(ftu, urltype="fulltext")

            # determine if there's a duplicate
            articleService = DOAJ.articleService()
            d = articleService.get_duplicate(z)

            assert d is not None
            assert d.bibjson().title == "Example article with a fulltext url"
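
Taken together, these snippets exercise one duplicate-detection contract. The sketch below restates it; the imports are the ones the DOAJ codebase uses elsewhere (portality.models and portality.bll.DOAJ) and the behaviour is inferred from the asserts in the examples, not from any service documentation. Note the snippets come from different revisions: test_03_retrieve_latest expects get_duplicate to return the newest of several matches, while test_03_retrieve_multiple_conflict expects ArticleMergeConflict.

from portality import models
from portality.bll import DOAJ

# Build a candidate article carrying one of the two matched identifiers
# (a DOI or a fulltext URL), then ask the service what it collides with.
candidate = models.Article()
candidate.bibjson().add_identifier('doi', "10.doi/123")

svc = DOAJ.articleService()
matches = svc.get_duplicates(candidate)  # every match, always a list
match = svc.get_duplicate(candidate)     # single best match, or None; on ties,
                                         # newest-wins or ArticleMergeConflict,
                                         # depending on the revision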
Example #2
    def test_05_full_doi(self):
        """ Test that we still detect duplicate DOIs when we have the full URI, not just the 10. """
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a DOI"
        b.add_identifier('doi', "https://doi.org/10.doi/123")
        a.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a DOI"
        not_duplicate_bibjson.add_identifier('doi', "https://doi.org/10.doi/DIFFERENT")
        not_duplicate.save(blocking=True)

        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for DOI"
        y.add_identifier('doi', "http://doi.org/10.doi/123")

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        dups = articleService.get_duplicates(z)
        assert len(dups) == 1

        # Check when we ask for one duplicate we get the most recent duplicate.
        d = articleService.get_duplicate(z)
        assert d is not None
        assert d.bibjson().title == "Example A article with a DOI", d.bibjson().title
Example #3
    def retrieve(cls, id, account):

        # is the article id valid?
        ar = models.Article.pull(id)
        if ar is None:
            raise Api404Error()

        # at this point we're happy to return the article if it's
        # meant to be seen by the public
        if ar.is_in_doaj():
            try:
                return OutgoingArticleDO.from_model(ar)
            except Exception:
                raise Api500Error()

        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None or account.is_anonymous:
            raise Api401Error()

        # Check we're allowed to retrieve this article
        articleService = DOAJ.articleService()
        if not articleService.is_legitimate_owner(ar, account.id):
            raise Api404Error()  # not found for this account

        # Return the article
        oa = OutgoingArticleDO.from_model(ar)
        return oa
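
The retrieve flow signals everything through exceptions: Api404Error for a missing or not-owned article, Api401Error for anonymous access to a withdrawn one, Api500Error for a serialisation failure. A minimal sketch of how a web layer might map those onto HTTP responses; DOAJ is Flask-based, but this wiring is illustrative, not the project's actual routing.

from flask import Flask, jsonify

app = Flask(__name__)

# Illustrative only: one handler per ApiError class from the snippets,
# returning the matching HTTP status code.
for err, status in [(Api400Error, 400), (Api401Error, 401),
                    (Api403Error, 403), (Api404Error, 404),
                    (Api500Error, 500)]:
    def _handler(e, status=status):
        return jsonify({"error": str(e)}), status
    app.register_error_handler(err, _handler)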
Example #4
    def retrieve(cls, id, account):

        # is the article id valid?
        ar = models.Article.pull(id)
        if ar is None:
            raise Api404Error()

        # at this point we're happy to return the article if it's
        # meant to be seen by the public
        if ar.is_in_doaj():
            return OutgoingArticleDO.from_model(ar)

        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # Check we're allowed to retrieve this article
        articleService = DOAJ.articleService()
        if not articleService.is_legitimate_owner(ar, account.id):
            raise Api404Error()  # not found for this account

        # Return the article
        oa = OutgoingArticleDO.from_model(ar)
        return oa
Example #5
    def test_07_both_duplication_criteria(self):
        """Check that an article is only reported once if it is duplicated by both DOI and fulltext URL"""
        # make ourselves an example article
        ftu = "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf"
        doi = "10.doi/123"

        a = models.Article()
        b = a.bibjson()
        b.title = "Example article with a fulltext url and a DOI"
        b.add_url(ftu, urltype="fulltext")
        b.add_identifier('doi', doi)
        a.save(blocking=True)

        # create another article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url and a DOI"
        y.add_url(ftu, urltype="fulltext")
        y.add_identifier('doi', doi)

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        d = articleService.get_duplicates(z)

        assert len(d) == 1
        print(len(d))
        assert d[0].bibjson().title == "Example article with a fulltext url and a DOI"
Example #6
    def test_04_with_doi_instead(self):
        """Detect a duplicate using the DOI field."""
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a DOI"
        b.add_identifier('doi', "10.doi/123")
        a.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a DOI"
        not_duplicate_bibjson.add_identifier('doi', "10.doi/DIFFERENT")
        not_duplicate.save(blocking=True)

        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for DOI"
        y.add_identifier('doi', "10.doi/123")

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        dups = articleService.get_duplicates(z)
        assert len(dups) == 1

        # Check when we ask for one duplicate we get the most recent duplicate.
        d = articleService.get_duplicate(z)
        assert d is not None
        assert d.bibjson().title == "Example A article with a DOI", d.bibjson().title
Example #7
    def test_03_retrieve_multiple_conflict(self):
        """Check that equally matched duplicates raise ArticleMergeConflict"""

        ftu = "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf"
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a fulltext url"
        b.add_url(ftu, urltype="fulltext")
        a.save(blocking=True)

        # Wait a second to ensure the timestamps are different
        time.sleep(1.01)

        a2 = models.Article()
        b2 = a2.bibjson()
        b2.title = "Example B article with a fulltext url"
        b2.add_url(ftu, urltype="fulltext")
        a2.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a fulltext url"
        not_duplicate_bibjson.add_url("http://this.is/a/different/url",
                                      urltype="fulltext")
        not_duplicate.save(blocking=True)

        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url"
        y.add_url(ftu, urltype="fulltext")

        # determine that there are multiple duplicates
        articleService = DOAJ.articleService()
        with self.assertRaises(ArticleMergeConflict):
            d = articleService.get_duplicate(z)

        # get the xwalk to determine all duplicates
        # sort both results and expectations here to avoid false alarm
        # we don't care about the order of duplicates
        expected = [a, a2]
        expected.sort(key=lambda x: datetime.strptime(x.last_updated,
                                                      "%Y-%m-%dT%H:%M:%SZ"),
                      reverse=True)
        # determine if there's a duplicate
        l = articleService.get_duplicates(z)
        assert isinstance(l, list), l
        assert l is not None
        l.sort(key=lambda x: datetime.strptime(x.last_updated,
                                               "%Y-%m-%dT%H:%M:%SZ"),
               reverse=True)
        assert expected == l
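
The time.sleep(1.01) between saves exists because last_updated carries one-second granularity (the "%Y-%m-%dT%H:%M:%SZ" format the sort keys parse); two articles saved within the same second would tie on recency and "most recent" would be ambiguous. A small helper, extracted from the inline lambdas above:

from datetime import datetime

def last_updated_key(article):
    # last_updated is a second-granularity UTC timestamp, which is why these
    # tests sleep for just over a second between saves.
    return datetime.strptime(article.last_updated, "%Y-%m-%dT%H:%M:%SZ")

# e.g. l.sort(key=last_updated_key, reverse=True)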
Example #8
    def test_04_with_doi_instead(self):
        """Detect a duplicate using the DOI field."""
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a DOI"
        b.add_identifier('doi', "10.doi/123")
        a.save(blocking=True)

        # Wait a second to ensure the timestamps are different
        time.sleep(1.01)

        a2 = models.Article()
        b2 = a2.bibjson()
        b2.title = "Example B article with a DOI"
        b2.add_identifier('doi', "10.doi/123")
        a2.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a DOI"
        not_duplicate_bibjson.add_identifier('doi', "10.doi/DIFFERENT")
        not_duplicate.save(blocking=True)

        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for DOI"
        y.add_identifier('doi', "10.doi/123")

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        dups = articleService.get_duplicates(z)
        assert len(dups) == 2

        # Check when we ask for one duplicate we get the most recent duplicate.
        d = articleService.get_duplicate(z)
        assert d is not None
        assert d.bibjson().title == "Example B article with a DOI", d.bibjson().title

        # get the xwalk to determine all duplicates
        # sort both results and expectations here to avoid false alarm
        # we don't care about the order of duplicates
        expected = sorted([a, a2])
        # determine if there's a duplicate
        l = articleService.get_duplicates(z)
        assert isinstance(l, list)
        assert l
        assert len(l) == 2
        l.sort()
        assert expected == l
Example #9
    def test_03_retrieve_latest(self):
        """Check that the most recently updated duplicate is returned"""

        ftu = "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf"
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a fulltext url"
        b.add_url(ftu, urltype="fulltext")
        a.save(blocking=True)

        # Wait a second to ensure the timestamps are different
        time.sleep(1.01)

        a2 = models.Article()
        b2 = a2.bibjson()
        b2.title = "Example B article with a fulltext url"
        b2.add_url(ftu, urltype="fulltext")
        a2.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a fulltext url"
        not_duplicate_bibjson.add_url("http://this.is/a/different/url",
                                      urltype="fulltext")
        not_duplicate.save(blocking=True)

        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url"
        y.add_url(ftu, urltype="fulltext")

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        d = articleService.get_duplicate(z)

        # Check when we ask for one duplicate we get the most recent duplicate.
        assert d is not None
        assert d.bibjson().title == "Example B article with a fulltext url", d.bibjson().title

        # get the xwalk to determine all duplicates
        # sort both results and expectations here to avoid false alarm
        # we don't care about the order of duplicates
        expected = sorted([a, a2])
        # determine if there's a duplicate
        l = articleService.get_duplicates(z)
        assert isinstance(l, list), l
        assert l is not None
        l.sort()
        assert expected == l
Example #10
    def test_03_retrieve_multiple_conflict(self):
        """Check that equally matched duplicates raise ArticleMergeConflict"""

        ftu = "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf"
        # make ourselves a couple of example articles
        a = models.Article()
        b = a.bibjson()
        b.title = "Example A article with a fulltext url"
        b.add_url(ftu, urltype="fulltext")
        a.save(blocking=True)

        # Wait a second to ensure the timestamps are different
        time.sleep(1.01)
        
        a2 = models.Article()
        b2 = a2.bibjson()
        b2.title = "Example B article with a fulltext url"
        b2.add_url(ftu, urltype="fulltext")
        a2.save(blocking=True)

        # create an article which should not be caught by the duplicate detection
        not_duplicate = models.Article()
        not_duplicate_bibjson = not_duplicate.bibjson()
        not_duplicate_bibjson.title = "Example C article with a fulltext url"
        not_duplicate_bibjson.add_url("http://this.is/a/different/url", urltype="fulltext")
        not_duplicate.save(blocking=True)
        
        # create a replacement article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url"
        y.add_url(ftu, urltype="fulltext")
        
        # determine that there are multiple duplicates
        articleService = DOAJ.articleService()
        with self.assertRaises(ArticleMergeConflict):
            d = articleService.get_duplicate(z)

        # get the xwalk to determine all duplicates
        # sort both results and expectations here to avoid false alarm
        # we don't care about the order of duplicates
        expected = sorted([a, a2])
        # determine if there's a duplicate
        l = articleService.get_duplicates(z)
        assert isinstance(l, list), l
        assert l is not None
        l.sort()
        assert expected == l
Example #11
    def create(cls, data, account):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # convert the data into a suitable article model
        am = cls.prep_article(data)

        articleService = DOAJ.articleService()
        result = articleService.create_article(am, account)

        # Check we are allowed to create an article for this journal
        if result.get("fail", 0) == 1:
            raise Api403Error()

        return am
Example #12
    def delete(cls, id, account, dry_run=False):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # now see if there's something for us to delete
        ar = models.Article.pull(id)
        if ar is None:
            raise Api404Error()

        # Check we're allowed to retrieve this article
        articleService = DOAJ.articleService()
        if not articleService.is_legitimate_owner(ar, account.id):
            raise Api404Error()  # not found for this account

        # issue the delete (no record of the delete required)
        if not dry_run:
            ar.delete()
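
Since the ownership and existence checks run before the dry_run guard, a dry run doubles as a "would this delete succeed?" probe. A usage sketch; the owning class is not named in these snippets, so ArticlesCrudApi is an assumption:

# Hypothetical caller: the 401/404 checks still run, nothing is deleted,
# so any exception means the real delete would have failed the same way.
ArticlesCrudApi.delete(article_id, account, dry_run=True)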
Example #13
    def test_02_different_fulltext(self):
        """Check that an article with different fulltext URLs is not considered a duplicate"""
        # make ourselves an example article
        a = models.Article()
        b = a.bibjson()
        b.title = "Example 2 article with a fulltext url"
        b.add_url("http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf", urltype="fulltext")
        a.save(blocking=True)

        # create another article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url"
        y.add_url("http://this.is/a/different/url", urltype="fulltext")
        
        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        d = articleService.get_duplicate(z)
        
        assert d is None
Example #14
    def create(cls, data, account):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # convert the data into a suitable article model
        am = cls.prep_article(data)

        articleService = DOAJ.articleService()
        try:
            result = articleService.create_article(am, account, add_journal_info=True)
        except ArticleMergeConflict as e:
            raise Api400Error(str(e))

        # Check we are allowed to create an article for this journal
        if result.get("fail", 0) == 1:
            raise Api403Error()

        return am
Example #15
    def test_02_different_fulltext(self):
        """Check that an article with different fulltext URLs is not considered a duplicate"""
        # make ourselves an example article
        a = models.Article()
        b = a.bibjson()
        b.title = "Example 2 article with a fulltext url"
        b.add_url(
            "http://www.sbe.deu.edu.tr/dergi/cilt15.say%C4%B12/06%20AKALIN.pdf",
            urltype="fulltext")
        a.save(blocking=True)

        # create another article
        z = models.Article()
        y = z.bibjson()
        y.title = "Replacement article for fulltext url"
        y.add_url("http://this.is/a/different/url", urltype="fulltext")

        # determine if there's a duplicate
        articleService = DOAJ.articleService()
        d = articleService.get_duplicate(z)

        assert d is None
Example #16
    def create(cls, data, account):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # convert the data into a suitable article model
        am = cls.prep_article(data)

        articleService = DOAJ.articleService()
        try:
            result = articleService.create_article(am,
                                                   account,
                                                   add_journal_info=True)
        except ArticleMergeConflict as e:
            raise Api400Error(str(e))
        except ArticleNotAcceptable as e:
            raise Api400Error("; ".join(e.errors))

        # Check we are allowed to create an article for this journal
        if result.get("fail", 0) == 1:
            raise Api403Error()

        return am
Example #17
    def update(cls, id, data, account):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # now see if there's something for us to delete
        ar = models.Article.pull(id)
        if ar is None:
            raise Api404Error()

        # Check we're allowed to edit this article
        articleService = DOAJ.articleService()
        if not articleService.is_legitimate_owner(ar, account.id):
            raise Api404Error()  # not found for this account

        # next thing to do is a structural validation of the replacement data, by instantiating the object
        try:
            ia = IncomingArticleDO(data)
        except dataobj.DataStructureException as e:
            raise Api400Error(str(e))

        # if that works, convert it to an Article object bringing over everything outside the
        # incoming article from the original article
        new_ar = ia.to_article_model(ar)

        # we need to ensure that any properties of the existing article that aren't allowed to change
        # are copied over
        new_ar.set_id(id)
        new_ar.set_created(ar.created_date)
        new_ar.bibjson().set_subjects(ar.bibjson().subjects())
        new_ar = cls.__handle_journal_info(new_ar)

        # finally save the new article, and return to the caller
        new_ar.save()
        return new_ar
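
The tail of update re-applies everything a replacement payload must not be allowed to change. Condensed into a helper for emphasis; this restates the lines above, it adds no behaviour:

def carry_over_protected_fields(new_ar, old_ar, id):
    # A PUT may not change the record id, the original creation date, or the
    # editorially assigned subject classification.
    new_ar.set_id(id)
    new_ar.set_created(old_ar.created_date)
    new_ar.bibjson().set_subjects(old_ar.bibjson().subjects())
    return new_ar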
Example #18
    def create(cls, data, account):
        # as long as authentication (in the layer above) has been successful, and the account exists, then
        # we are good to proceed
        if account is None:
            raise Api401Error()

        # convert the data into a suitable article model (raises Api400Error if doesn't conform to struct)
        am = cls.prep_article(data)

        articleService = DOAJ.articleService()
        try:
            result = articleService.create_article(am, account, add_journal_info=True)
        except ArticleMergeConflict as e:
            raise Api400Error(str(e))
        except ArticleNotAcceptable as e:
            raise Api400Error("; ".join(e.errors))
        except exceptions.DuplicateArticleException as e:
            raise Api403Error(str(e))

        # Check we are allowed to create an article for this journal
        if result.get("fail", 0) == 1:
            raise Api403Error("It is not possible to create an article for this journal. Have you included in the upload an ISSN which is not associated with any journal in your account? ISSNs must match exactly the ISSNs against the journal record.")

        return am
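
This is the most defensive create in the set: structural and content problems surface as Api400Error, duplicates and ownership problems as Api403Error. A caller-side sketch; ArticlesCrudApi and the payload name are assumptions, since neither appears in these snippets:

try:
    article = ArticlesCrudApi.create(incoming_json, account)
except Api401Error:
    pass  # no authenticated account was supplied
except Api400Error as e:
    pass  # malformed payload, unacceptable article, or merge conflict
except Api403Error as e:
    pass  # duplicate article, or ISSNs not matching the account's journals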
Example #19
    def run(self):
        job = self.background_job
        params = job.params

        # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
        outdir = self.get_param(params, "outdir",
                                "article_duplicates_" + dates.today())
        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Location for our interim CSV file of articles
        tmpdir = self.get_param(params, "tmpdir",
                                'tmp_article_duplicate_report')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        tmp_csvname = self.get_param(params, "article_csv", False)
        tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

        # Initialise our reports
        global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
        global_reportpath = os.path.join(outdir, global_reportfile)
        f = open(global_reportpath, "w", encoding="utf-8")
        global_report = csv.writer(f)
        header = [
            "article_id", "article_created", "article_doi", "article_fulltext",
            "article_owner", "article_issns", "article_in_doaj", "n_matches",
            "match_type", "match_id", "match_created", "match_doi",
            "match_fulltext", "match_owner", "match_issns", "match_in_doaj",
            "owners_match", "titles_match", "article_title", "match_title"
        ]
        global_report.writerow(header)

        noids_reportfile = 'noids_' + dates.today() + '.csv'
        noids_reportpath = os.path.join(outdir, noids_reportfile)
        g = open(noids_reportpath, "w", encoding="utf-8")
        noids_report = csv.writer(g)
        header = [
            "article_id", "article_created", "article_owner", "article_issns",
            "article_in_doaj"
        ]
        noids_report.writerow(header)

        # Record the sets of duplicated articles
        global_matches = []

        a_count = 0

        articleService = DOAJ.articleService()

        # Read back in the article csv file we created earlier
        with open(tmp_csvpath, 'r', encoding='utf-8') as t:
            article_reader = csv.reader(t)

            start = datetime.now()
            estimated_finish = ""
            for a in article_reader:
                if a_count > 1 and a_count % 100 == 0:
                    n = datetime.now()
                    diff = (n - start).total_seconds()
                    expected_total = ((diff / a_count) * total)
                    estimated_finish = dates.format(
                        dates.after(start, expected_total))
                a_count += 1

                article = models.Article(
                    _source={
                        'id': a[0],
                        'created_date': a[1],
                        'bibjson': {
                            'identifier': json.loads(a[2]),
                            'link': json.loads(a[3]),
                            'title': a[4]
                        },
                        'admin': {
                            'in_doaj': json.loads(a[5])
                        }
                    })

                # Get the global duplicates
                try:
                    global_duplicates = articleService.discover_duplicates(
                        article,
                        results_per_match_type=10000,
                        include_article=False)
                except exceptions.DuplicateArticleException:
                    # this means the article did not have any ids that could be used for deduplication
                    owner = self._lookup_owner(article)
                    noids_report.writerow([
                        article.id, article.created_date, owner,
                        ','.join(article.bibjson().issns()),
                        article.is_in_doaj()
                    ])
                    continue

                dupcount = 0
                if global_duplicates:

                    # Look up an article's owner
                    owner = self._lookup_owner(article)

                    # Deduplicate the DOI and fulltext duplicate lists
                    s = set([article.id] + [
                        d.id for d in global_duplicates.get('doi', []) +
                        global_duplicates.get('fulltext', [])
                    ])
                    # remove article's own id from global_duplicates
                    dupcount = len(s) - 1
                    if s not in global_matches:
                        self._write_rows_from_duplicates(
                            article, owner, global_duplicates, global_report)
                        global_matches.append(s)

                app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(
                    a_count, total, article.id, dupcount, len(global_matches),
                    estimated_finish))

        job.add_audit_message(
            '{0} articles processed for duplicates. {1} global duplicate sets found.'
            .format(a_count, len(global_matches)))
        f.close()
        g.close()

        # Delete the transient temporary files.
        shutil.rmtree(tmpdir)

        # Email the reports if that parameter has been set.
        send_email = self.get_param(params, "email", False)
        if send_email:
            archive_name = "article_duplicates_" + dates.today()
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")
Example #20
    def run(self):
        job = self.background_job
        params = job.params

        # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
        outdir = self.get_param(params, "outdir", "article_duplicates_" + dates.today())
        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Location for our interim CSV file of articles
        tmpdir = self.get_param(params, "tmpdir", 'tmp_article_duplicate_report')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        tmp_csvname = self.get_param(params, "article_csv", False)
        tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

        # Initialise our reports
        global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
        global_reportpath = os.path.join(outdir, global_reportfile)
        f = codecs.open(global_reportpath, "wb", "utf-8")
        global_report = UnicodeWriter(f)
        header = ["article_id", "article_created", "article_doi", "article_fulltext", "article_owner", "article_issns", "article_in_doaj", "n_matches", "match_type", "match_id", "match_created", "match_doi", "match_fulltext", "match_owner", "match_issns", "match_in_doaj", "owners_match", "titles_match", "article_title", "match_title"]
        global_report.writerow(header)

        noids_reportfile = 'noids_' + dates.today() + '.csv'
        noids_reportpath = os.path.join(outdir, noids_reportfile)
        g = codecs.open(noids_reportpath, "wb", "utf-8")
        noids_report = UnicodeWriter(g)
        header = ["article_id", "article_created", "article_owner", "article_issns", "article_in_doaj"]
        noids_report.writerow(header)

        # Record the sets of duplicated articles
        global_matches = []

        a_count = 0

        articleService = DOAJ.articleService()

        # Read back in the article csv file we created earlier
        with codecs.open(tmp_csvpath, 'rb', 'utf-8') as t:
            article_reader = UnicodeReader(t)

            start = datetime.now()
            estimated_finish = ""
            for a in article_reader:
                if a_count > 1 and a_count % 100 == 0:
                    n = datetime.now()
                    diff = (n - start).total_seconds()
                    expected_total = ((diff / a_count) * total)
                    estimated_finish = dates.format(dates.after(start, expected_total))
                a_count += 1

                article = models.Article(_source={'id': a[0], 'created_date': a[1], 'bibjson': {'identifier': json.loads(a[2]), 'link': json.loads(a[3]), 'title': a[4]}, 'admin': {'in_doaj': json.loads(a[5])}})

                # Get the global duplicates
                try:
                    global_duplicates = articleService.discover_duplicates(article, owner=None, results_per_match_type=10000)
                except exceptions.DuplicateArticleException:
                    # this means the article did not have any ids that could be used for deduplication
                    owner = self._lookup_owner(article)
                    noids_report.writerow([article.id, article.created_date, owner, ','.join(article.bibjson().issns()), article.is_in_doaj()])
                    continue

                dupcount = 0
                if global_duplicates:

                    # Look up an article's owner
                    owner = self._lookup_owner(article)

                    # Deduplicate the DOI and fulltext duplicate lists
                    s = set([article.id] + [d.id for d in global_duplicates.get('doi', []) + global_duplicates.get('fulltext', [])])
                    dupcount = len(s) - 1
                    if s not in global_matches:
                        self._write_rows_from_duplicates(article, owner, global_duplicates, global_report)
                        global_matches.append(s)

                app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(a_count, total, article.id, dupcount, len(global_matches), estimated_finish))

        job.add_audit_message('{0} articles processed for duplicates. {1} global duplicate sets found.'.format(a_count, len(global_matches)))
        f.close()
        g.close()

        # Delete the transient temporary files.
        shutil.rmtree(tmpdir)

        # Email the reports if that parameter has been set.
        send_email = self.get_param(params, "email", False)
        if send_email:
            archive_name = "article_duplicates_" + dates.today()
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")