Example #1
def content_reports(fr, to, outdir):
    report = {}

    q = ContentByDate(fr, to)
    res = models.Suggestion.query(q=q.query())
    year_buckets = res.get("aggregations", {}).get("years", {}).get("buckets", [])
    for years in year_buckets:
        ds = years.get("key_as_string")
        do = dates.parse(ds)
        year = do.year
        if year not in report:
            report[year] = {}
        country_buckets = years.get("countries", {}).get("buckets", [])
        for country in country_buckets:
            cc = country.get("key")
            cn = datasets.get_country_name(cc)
            if cn not in report[year]:
                report[year][cn] = {}
            count = country.get("doc_count")
            report[year][cn]["count"] = count

    table = _tabulate_time_entity_group(report, "Country")

    filename = "applications_by_year_by_country__" + _fft(fr) + "_to_" + _fft(to) + "__on_" + dates.today() + ".csv"
    outfiles = []
    outfile = os.path.join(outdir, filename)
    outfiles.append(outfile)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        for row in table:
            writer.writerow(row)

    return outfiles
Example #2
def migrate(test=False):
    start = datetime.now()

    journal_iterator = models.Journal.all_in_doaj()

    counter = 0
    with open(os.path.join(OUT_DIR, OUT_FILENAME), 'wb') as o:
        writer = csv.writer(o)
        writer.writerow(['Old country', 'New Country'])

        for j in journal_iterator:
            counter += 1
            oldcountry = j.bibjson().country
            j.bibjson().country = datasets.get_country_code(
                j.bibjson().country)
            newcountry = j.bibjson().country
            newcountry_name = datasets.get_country_name(newcountry)

            writer.writerow(
                [oldcountry.encode('utf-8'),
                 newcountry_name.encode('utf-8')])

            if not test:
                j.prep()
                j.save()

    end = datetime.now()

    print("Updated Journals", counter)
    print(start, end)
    print('Time taken:', end - start)
    print('You can pass -t to test the migration you just ran.')
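The closing print() hints at a -t flag; a minimal, hypothetical CLI wrapper consistent with that hint might look like the sketch below (the real script's argument handling is not part of this excerpt). Note that the example above writes its CSV Python 2-style: the file is opened in binary mode and each cell is explicitly encoded to UTF-8.

# Hypothetical entry point matching the "-t" hint printed by migrate();
# the actual script's argument parsing is not shown in this excerpt.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--test", action="store_true",
                        help="dry run: write the CSV but do not save journals")
    args = parser.parse_args()
    migrate(test=args.test)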
Example #3
    def test_01_countries(self):
        """ Use country information from our datasets """
        assert datasets.get_country_code('united kingdom') == 'GB', \
            'expected GB, received: {}'.format(datasets.get_country_code('united kingdom'))
        assert datasets.get_country_name('GB') == 'United Kingdom', \
            'expected United Kingdom, received: {}'.format(datasets.get_country_name('GB'))

        # If the country is unrecognised, we send it back unchanged.
        assert datasets.get_country_code('mordor') == 'mordor'
        assert datasets.get_country_name('mordor') == 'mordor'

        # Unless fail_if_not_found is set in get_country_code()
        assert datasets.get_country_code('united states') == 'US'
        assert datasets.get_country_code('the shire', fail_if_not_found=True) is None
        assert datasets.get_country_code('the shire', fail_if_not_found=False) == 'the shire'

        # When we have more than one option, the first alphabetically is returned
        assert datasets.get_country_name('AE') == 'United Arab Emirates'
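The behaviour asserted above pins down the lookup contract: unrecognised values round-trip unchanged unless fail_if_not_found is set. Below is a minimal sketch of that contract, assuming a simple {code: name} mapping; the real portality.datasets module is backed by a full country dataset, so the names and structure here are illustrative only.

# Illustrative stand-in for portality.datasets, not the real implementation.
_COUNTRIES = {
    "AE": "United Arab Emirates",
    "GB": "United Kingdom",
    "US": "United States",
}

def get_country_code(name, fail_if_not_found=False):
    # case-insensitive reverse lookup; unrecognised names come back
    # unchanged unless fail_if_not_found is set
    for code, cname in _COUNTRIES.items():
        if cname.lower() == name.lower():
            return code
    return None if fail_if_not_found else name

def get_country_name(code):
    # unrecognised codes are passed back unchanged
    return _COUNTRIES.get(code, code)

assert get_country_code("united kingdom") == "GB"
assert get_country_code("mordor") == "mordor"
assert get_country_name("mordor") == "mordor"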
Example #4
def do_report(out):
    with codecs.open(out, "wb", encoding="utf-8") as f:
        writer = clcsv.UnicodeWriter(f)
        writer.writerow([
            "Title", "ISSN(s)", "Country Code", "Country", "Status",
            "Date Applied", "Last Manual Update", "Last Update", "Notes"
        ])
        gen = models.Suggestion.list_by_status(
            constants.APPLICATION_STATUS_REJECTED)
        for s in gen:
            bj = s.bibjson()
            title = bj.title
            issns = bj.issns()
            cc = bj.country
            applied = s.created_date
            last_manual = s.last_manual_update
            last_update = s.last_updated
            notes = s.notes
            status = s.application_status

            if title is None:
                title = ""
            if issns is None:
                issns = []
            if cc is None:
                cc = ""
            if applied is None:
                applied = ""
            if last_manual is None:
                last_manual = "never"
            if last_update is None:
                last_update = "never"
            if notes is None:
                notes = []
            if status is None:
                status = "unknown"

            issns = ", ".join(issns)
            notes = "\n\n".join(
                ["[" + n.get("date") + "] " + n.get("note") for n in notes])

            country = datasets.get_country_name(cc)

            writer.writerow([
                title, issns, cc, country, status, applied, last_manual,
                last_update, notes
            ])
Example #5
 def country_name(self):
     if self.country is not None:
         return datasets.get_country_name(self.country)
     return None
Example #6
 def country_name(self):
     if self.country is not None:
         from portality import datasets  # delayed import because of files to be loaded
         return datasets.get_country_name(self.country)
     return None
Example #7
    def journal2question(cls, journal):
        def other_list(main_field, other_field, other_value):
            aids = forminfo.get(main_field, [])
            if aids is None or aids == "" or aids == "None":
                aids = []

            # if the xwalk has returned a single-element list like ["None"]
            # we want to strip that "None" for the purpose of the CSV
            if choices.Choices.NONE in aids:
                aids.remove(choices.Choices.NONE)

            aidother = forminfo.get(other_field)

            if other_value in aids:
                aids.remove(other_value)
            if aidother is not None and aidother != "" and aidother != "None":
                aids.append(aidother)
            return ", ".join(aids)

        def yes_or_blank(val):
            return "Yes" if val in [True, "True", "Yes", "true", "yes"] else ''

        def license_checkbox(val):
            # map each stored value to its display label
            opts = dict(choices.Choices.licence_checkbox())
            nv = [opts.get(v) for v in val]
            return ", ".join(nv)

        def languages(vals):
            keep = []
            codes = [c.lower() for c, _ in datasets.language_options]
            names = [n.lower() for _, n in datasets.language_options]
            for v in vals:
                if v.lower() in codes:
                    keep.append(datasets.name_for_lang(v))
                elif v.lower() in names:
                    keep.append(v)
            return ", ".join(keep)

        # start by converting the object to the forminfo version
        forminfo = JournalFormXWalk.obj2form(journal)

        kvs = []

        # create key/value pairs for the questions in order
        kvs.append((cls.q("title"), forminfo.get("title")))
        kvs.append((cls.q("url"), forminfo.get("url")))
        kvs.append(
            (cls.q("alternative_title"), forminfo.get("alternative_title")))
        kvs.append((cls.q("pissn"), forminfo.get("pissn")))
        kvs.append((cls.q("eissn"), forminfo.get("eissn")))
        kvs.append((cls.q("publisher"), forminfo.get("publisher")))
        kvs.append((cls.q("society_institution"),
                    forminfo.get("society_institution")))
        kvs.append((cls.q("platform"), forminfo.get("platform")))
        kvs.append((cls.q("country"),
                    datasets.get_country_name(forminfo.get("country"))))
        # Get the APC info from journal index, since this includes [yes / no / no information] rather than true / false
        kvs.append(
            (cls.q("processing_charges"), journal.data.get("index",
                                                           {}).get("has_apc")))
        kvs.append((cls.q("processing_charges_url"),
                    forminfo.get("processing_charges_url")))
        kvs.append((cls.q("processing_charges_amount"),
                    forminfo.get("processing_charges_amount")))
        kvs.append((cls.q("processing_charges_currency"),
                    datasets.get_currency_name(
                        forminfo.get("processing_charges_currency"))))
        kvs.append((cls.q("submission_charges"),
                    yes_or_blank(forminfo.get("submission_charges"))))
        kvs.append((cls.q("submission_charges_url"),
                    forminfo.get("submission_charges_url")))
        kvs.append((cls.q("submission_charges_amount"),
                    forminfo.get("submission_charges_amount")))
        kvs.append((cls.q("submission_charges_currency"),
                    datasets.get_currency_name(
                        forminfo.get("submission_charges_currency"))))
        # these fields are present in the application but not the journal
        #kvs.append((cls.q("articles_last_year"), forminfo.get("articles_last_year", "")))
        #kvs.append((cls.q("articles_last_year_url"), forminfo.get("articles_last_year_url", "")))
        kvs.append((cls.q("waiver_policy"),
                    yes_or_blank(forminfo.get("waiver_policy"))))
        kvs.append(
            (cls.q("waiver_policy_url"), forminfo.get("waiver_policy_url")))

        dap = deepcopy(forminfo.get("digital_archiving_policy", []))
        lib = choices.Choices.digital_archiving_policy_val("library")
        oth = choices.Choices.digital_archiving_policy_val("other")
        if lib in dap: dap.remove(lib)
        if oth in dap: dap.remove(oth)
        if choices.Choices.digital_archiving_policy_val('none') in dap:
            dap.remove(choices.Choices.digital_archiving_policy_val('none'))
        kvs.append((cls.q("digital_archiving_policy"), ", ".join(dap)))
        kvs.append((cls.q("digital_archiving_policy_library"),
                    forminfo.get("digital_archiving_policy_library")))
        kvs.append((cls.q("digital_archiving_policy_other"),
                    forminfo.get("digital_archiving_policy_other")))

        kvs.append((cls.q("digital_archiving_policy_url"),
                    forminfo.get("digital_archiving_policy_url")))
        kvs.append((cls.q("crawl_permission"),
                    yes_or_blank(forminfo.get("crawl_permission"))))

        article_identifiers = other_list(
            "article_identifiers", "article_identifiers_other",
            choices.Choices.article_identifiers_val("other"))
        kvs.append((cls.q("article_identifiers"), article_identifiers))

        kvs.append((cls.q("download_statistics"),
                    yes_or_blank(forminfo.get("download_statistics"))))
        kvs.append((cls.q("download_statistics_url"),
                    forminfo.get("download_statistics_url")))
        kvs.append((cls.q("first_fulltext_oa_year"),
                    forminfo.get("first_fulltext_oa_year")))

        fulltext_formats = other_list(
            "fulltext_format", "fulltext_format_other",
            choices.Choices.fulltext_format_val("other"))
        kvs.append((cls.q("fulltext_format"), fulltext_formats))

        kvs.append((cls.q("keywords"), ", ".join(forminfo.get("keywords",
                                                              []))))
        kvs.append(
            (cls.q("languages"), languages(forminfo.get("languages", []))))
        kvs.append((cls.q("editorial_board_url"),
                    forminfo.get("editorial_board_url")))
        kvs.append(
            (cls.q("review_process"), forminfo.get("review_process", '')))
        kvs.append(
            (cls.q("review_process_url"), forminfo.get("review_process_url")))
        kvs.append((cls.q("aims_scope_url"), forminfo.get("aims_scope_url")))
        kvs.append((cls.q("instructions_authors_url"),
                    forminfo.get("instructions_authors_url")))
        kvs.append((cls.q("plagiarism_screening"),
                    yes_or_blank(forminfo.get("plagiarism_screening"))))
        kvs.append((cls.q("plagiarism_screening_url"),
                    forminfo.get("plagiarism_screening_url")))
        kvs.append(
            (cls.q("publication_time"), forminfo.get("publication_time")))
        kvs.append(
            (cls.q("oa_statement_url"), forminfo.get("oa_statement_url")))
        kvs.append((cls.q("license_embedded"),
                    yes_or_blank(forminfo.get("license_embedded"))))
        kvs.append((cls.q("license_embedded_url"),
                    forminfo.get("license_embedded_url")))

        lic = forminfo.get("license")
        if lic == choices.Choices.licence_val("other"):
            lic = forminfo.get("license_other")
        kvs.append((cls.q("license"), lic))

        kvs.append((cls.q("license_checkbox"),
                    license_checkbox(forminfo.get("license_checkbox", []))))
        kvs.append((cls.q("license_url"), forminfo.get("license_url")))
        kvs.append(
            (cls.q("open_access"), yes_or_blank(forminfo.get("open_access"))))

        deposit_policies = other_list(
            "deposit_policy", "deposit_policy_other",
            choices.Choices.deposit_policy_other_val("other"))
        kvs.append((cls.q("deposit_policy"), deposit_policies))

        cr = forminfo.get("copyright")
        kvs.append((cls.q("copyright"), cr))

        kvs.append((cls.q("copyright_url"), forminfo.get("copyright_url")))

        pr = forminfo.get("publishing_rights")
        kvs.append((cls.q("publishing_rights"), pr))

        kvs.append((cls.q("publishing_rights_url"),
                    forminfo.get("publishing_rights_url")))

        return kvs
Example #8
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        classification = []
        langs = []
        country = None
        licenses = []
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # get the title of the license
        lic = cbib.get_journal_license()
        if lic is not None:
            licenses.append(lic.get("title"))

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        licenses = list(set(licenses))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title
                                  if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError as e:
            # if we can't normalise the DOI, just store it as-is.
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError as e:
                # if we can't normalise the fulltext store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"][
                "date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(licenses) > 0:
            self.data["index"]["license"] = licenses
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        # has_seal is always "Yes" or "No", so record it unconditionally
        self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext