def content_reports(fr, to, outdir):
    report = {}
    q = ContentByDate(fr, to)
    res = models.Suggestion.query(q=q.query())

    # walk the date-histogram buckets, then the per-country sub-buckets
    year_buckets = res.get("aggregations", {}).get("years", {}).get("buckets", [])
    for years in year_buckets:
        ds = years.get("key_as_string")
        do = dates.parse(ds)
        year = do.year
        if year not in report:
            report[year] = {}
        country_buckets = years.get("countries", {}).get("buckets", [])
        for country in country_buckets:
            cc = country.get("key")
            cn = datasets.get_country_name(cc)
            if cn not in report[year]:
                report[year][cn] = {}
            count = country.get("doc_count")
            report[year][cn]["count"] = count

    table = _tabulate_time_entity_group(report, "Country")

    filename = "applications_by_year_by_country__" + _fft(fr) + "_to_" + _fft(to) + "__on_" + dates.today() + ".csv"
    outfiles = []
    outfile = os.path.join(outdir, filename)
    outfiles.append(outfile)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        for row in table:
            writer.writerow(row)

    return outfiles
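# A minimal usage sketch for content_reports(), not part of the original script:
# the timestamp format is an assumption based on the dates.parse()/dates.today()
# helpers above, and the output directory is hypothetical.
#
#   outfiles = content_reports("2015-01-01T00:00:00Z", "2016-01-01T00:00:00Z", "/tmp/reports")
#   print(outfiles)  # ["/tmp/reports/applications_by_year_by_country__..._to_...__on_....csv"]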
def migrate(test=False):
    start = datetime.now()
    journal_iterator = models.Journal.all_in_doaj()
    counter = 0
    # Python 2-style CSV output: the file is opened in binary mode and each
    # cell is encoded to UTF-8 bytes before being handed to csv.writer
    with open(os.path.join(OUT_DIR, OUT_FILENAME), 'wb') as o:
        writer = csv.writer(o)
        writer.writerow(['Old country', 'New Country'])
        for j in journal_iterator:
            counter += 1
            oldcountry = j.bibjson().country
            # normalise the free-text country to an ISO country code
            j.bibjson().country = datasets.get_country_code(j.bibjson().country)
            newcountry = j.bibjson().country
            newcountry_name = datasets.get_country_name(newcountry)
            writer.writerow([oldcountry.encode('utf-8'), newcountry_name.encode('utf-8')])
            if not test:
                j.prep()
                j.save()
    end = datetime.now()
    print("Updated Journals", counter)
    print(start, end)
    print('Time taken:', end - start)
    print('You can pass -t to test the migration you just ran.')
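# A hedged sketch of how migrate() might be driven from the command line. The
# -t flag name follows the hint printed above; the rest is an assumption, not
# the script's actual entry point.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--test", action="store_true",
                        help="dry run: write the report CSV without saving journals")
    args = parser.parse_args()
    migrate(test=args.test)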
def test_01_countries(self):
    """ Use country information from our datasets """
    assert datasets.get_country_code('united kingdom') == 'GB', \
        'expected GB, received: {}'.format(datasets.get_country_code('united kingdom'))
    assert datasets.get_country_name('GB') == 'United Kingdom', \
        'expected United Kingdom, received: {}'.format(datasets.get_country_name('GB'))

    # If the country is unrecognised, we send it back unchanged.
    assert datasets.get_country_code('mordor') == 'mordor'
    assert datasets.get_country_name('mordor') == 'mordor'

    # Unless fail_if_not_found is set in get_country_code()
    assert datasets.get_country_code('united states') == 'US'
    assert datasets.get_country_code('the shire', fail_if_not_found=True) is None
    assert datasets.get_country_code('the shire', fail_if_not_found=False) == 'the shire'

    # When we have more than one option, the first alphabetically is returned
    assert datasets.get_country_name('AE') == 'United Arab Emirates'
def do_report(out):
    with codecs.open(out, "wb", encoding="utf-8") as f:
        writer = clcsv.UnicodeWriter(f)
        writer.writerow(["Title", "ISSN(s)", "Country Code", "Country", "Status",
                         "Date Applied", "Last Manual Update", "Last Update", "Notes"])

        gen = models.Suggestion.list_by_status(constants.APPLICATION_STATUS_REJECTED)
        for s in gen:
            bj = s.bibjson()
            title = bj.title
            issns = bj.issns()
            cc = bj.country
            applied = s.created_date
            last_manual = s.last_manual_update
            last_update = s.last_updated
            notes = s.notes
            status = s.application_status

            # substitute sensible defaults for any missing values
            if title is None:
                title = ""
            if issns is None:
                issns = []
            if cc is None:
                cc = ""
            if applied is None:
                applied = ""
            if last_manual is None:
                last_manual = "never"
            if last_update is None:
                last_update = "never"
            if notes is None:
                notes = []
            if status is None:
                status = "unknown"

            issns = ", ".join(issns)
            notes = "\n\n".join(["[" + n.get("date") + "] " + n.get("note") for n in notes])
            country = datasets.get_country_name(cc)

            writer.writerow([title, issns, cc, country, status, applied, last_manual, last_update, notes])
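# Usage sketch, assuming this report is run standalone; the output filename is
# hypothetical.
#
#   do_report("rejected_applications_report.csv")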
def country_name(self):
    if self.country is not None:
        return datasets.get_country_name(self.country)
    return None
def country_name(self):
    if self.country is not None:
        from portality import datasets  # delayed import because of files to be loaded
        return datasets.get_country_name(self.country)
    return None
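# An illustrative call, hedged: it assumes country_name is a plain method on the
# index object (in the real class it may equally be wrapped as a @property).
#
#   obj.country = "PL"
#   print(obj.country_name())  # -> "Poland", via datasets.get_country_name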
def journal2question(cls, journal):

    def other_list(main_field, other_field, other_value):
        aids = forminfo.get(main_field, [])
        if aids is None or aids == "" or aids == "None":
            aids = []

        # if the xwalk has returned a single-list element like ["None"]
        # we want to strip that "None" for the purpose of the CSV
        if choices.Choices.NONE in aids:
            aids.remove(choices.Choices.NONE)

        aidother = forminfo.get(other_field)
        if other_value in aids:
            aids.remove(other_value)
        if aidother is not None and aidother != "" and aidother != "None":
            aids.append(aidother)
        return ", ".join(aids)

    def yes_or_blank(val):
        return "Yes" if val in [True, "True", "Yes", "true", "yes"] else ''

    def license_checkbox(val):
        opts = dict(choices.Choices.licence_checkbox())
        nv = [opts.get(v) for v in val]
        return ", ".join(nv)

    def languages(vals):
        keep = []
        codes = [c.lower() for c, _ in datasets.language_options]
        names = [n.lower() for _, n in datasets.language_options]
        for v in vals:
            if v.lower() in codes:
                keep.append(datasets.name_for_lang(v))
            elif v.lower() in names:
                keep.append(v)
        return ", ".join(keep)

    # start by converting the object to the forminfo version
    forminfo = JournalFormXWalk.obj2form(journal)

    kvs = []

    # create key/value pairs for the questions in order
    kvs.append((cls.q("title"), forminfo.get("title")))
    kvs.append((cls.q("url"), forminfo.get("url")))
    kvs.append((cls.q("alternative_title"), forminfo.get("alternative_title")))
    kvs.append((cls.q("pissn"), forminfo.get("pissn")))
    kvs.append((cls.q("eissn"), forminfo.get("eissn")))
    kvs.append((cls.q("publisher"), forminfo.get("publisher")))
    kvs.append((cls.q("society_institution"), forminfo.get("society_institution")))
    kvs.append((cls.q("platform"), forminfo.get("platform")))
    kvs.append((cls.q("country"), datasets.get_country_name(forminfo.get("country"))))

    # Get the APC info from the journal index, since this includes [yes / no / no information] rather than true / false
    kvs.append((cls.q("processing_charges"), journal.data.get("index", {}).get("has_apc")))
    kvs.append((cls.q("processing_charges_url"), forminfo.get("processing_charges_url")))
    kvs.append((cls.q("processing_charges_amount"), forminfo.get("processing_charges_amount")))
    kvs.append((cls.q("processing_charges_currency"), datasets.get_currency_name(forminfo.get("processing_charges_currency"))))

    kvs.append((cls.q("submission_charges"), yes_or_blank(forminfo.get("submission_charges"))))
    kvs.append((cls.q("submission_charges_url"), forminfo.get("submission_charges_url")))
    kvs.append((cls.q("submission_charges_amount"), forminfo.get("submission_charges_amount")))
    kvs.append((cls.q("submission_charges_currency"), datasets.get_currency_name(forminfo.get("submission_charges_currency"))))

    # these fields are present in the application but not the journal
    #kvs.append((cls.q("articles_last_year"), forminfo.get("articles_last_year", "")))
    #kvs.append((cls.q("articles_last_year_url"), forminfo.get("articles_last_year_url", "")))

    kvs.append((cls.q("waiver_policy"), yes_or_blank(forminfo.get("waiver_policy"))))
    kvs.append((cls.q("waiver_policy_url"), forminfo.get("waiver_policy_url")))

    # strip the "library", "other" and "none" options from the archiving policy list,
    # since those are reported via their own dedicated columns below
    dap = deepcopy(forminfo.get("digital_archiving_policy", []))
    lib = choices.Choices.digital_archiving_policy_val("library")
    oth = choices.Choices.digital_archiving_policy_val("other")
    if lib in dap:
        dap.remove(lib)
    if oth in dap:
        dap.remove(oth)
    if choices.Choices.digital_archiving_policy_val('none') in dap:
        dap.remove(choices.Choices.digital_archiving_policy_val('none'))
    kvs.append((cls.q("digital_archiving_policy"), ", ".join(dap)))
    kvs.append((cls.q("digital_archiving_policy_library"), forminfo.get("digital_archiving_policy_library")))
    kvs.append((cls.q("digital_archiving_policy_other"), forminfo.get("digital_archiving_policy_other")))
    kvs.append((cls.q("digital_archiving_policy_url"), forminfo.get("digital_archiving_policy_url")))

    kvs.append((cls.q("crawl_permission"), yes_or_blank(forminfo.get("crawl_permission"))))

    article_identifiers = other_list("article_identifiers", "article_identifiers_other",
                                     choices.Choices.article_identifiers_val("other"))
    kvs.append((cls.q("article_identifiers"), article_identifiers))

    kvs.append((cls.q("download_statistics"), yes_or_blank(forminfo.get("download_statistics"))))
    kvs.append((cls.q("download_statistics_url"), forminfo.get("download_statistics_url")))
    kvs.append((cls.q("first_fulltext_oa_year"), forminfo.get("first_fulltext_oa_year")))

    fulltext_formats = other_list("fulltext_format", "fulltext_format_other",
                                  choices.Choices.fulltext_format_val("other"))
    kvs.append((cls.q("fulltext_format"), fulltext_formats))

    kvs.append((cls.q("keywords"), ", ".join(forminfo.get("keywords", []))))
    kvs.append((cls.q("languages"), languages(forminfo.get("languages", []))))
    kvs.append((cls.q("editorial_board_url"), forminfo.get("editorial_board_url")))
    kvs.append((cls.q("review_process"), forminfo.get("review_process", '')))
    kvs.append((cls.q("review_process_url"), forminfo.get("review_process_url")))
    kvs.append((cls.q("aims_scope_url"), forminfo.get("aims_scope_url")))
    kvs.append((cls.q("instructions_authors_url"), forminfo.get("instructions_authors_url")))
    kvs.append((cls.q("plagiarism_screening"), yes_or_blank(forminfo.get("plagiarism_screening"))))
    kvs.append((cls.q("plagiarism_screening_url"), forminfo.get("plagiarism_screening_url")))
    kvs.append((cls.q("publication_time"), forminfo.get("publication_time")))
    kvs.append((cls.q("oa_statement_url"), forminfo.get("oa_statement_url")))
    kvs.append((cls.q("license_embedded"), yes_or_blank(forminfo.get("license_embedded"))))
    kvs.append((cls.q("license_embedded_url"), forminfo.get("license_embedded_url")))

    lic = forminfo.get("license")
    if lic == choices.Choices.licence_val("other"):
        lic = forminfo.get("license_other")
    kvs.append((cls.q("license"), lic))

    kvs.append((cls.q("license_checkbox"), license_checkbox(forminfo.get("license_checkbox", []))))
    kvs.append((cls.q("license_url"), forminfo.get("license_url")))
    kvs.append((cls.q("open_access"), yes_or_blank(forminfo.get("open_access"))))

    deposit_policies = other_list("deposit_policy", "deposit_policy_other",
                                  choices.Choices.deposit_policy_other_val("other"))
    kvs.append((cls.q("deposit_policy"), deposit_policies))

    cr = forminfo.get("copyright")
    kvs.append((cls.q("copyright"), cr))
    kvs.append((cls.q("copyright_url"), forminfo.get("copyright_url")))

    pr = forminfo.get("publishing_rights")
    kvs.append((cls.q("publishing_rights"), pr))
    kvs.append((cls.q("publishing_rights_url"), forminfo.get("publishing_rights_url")))

    return kvs
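# A hedged sketch of consuming journal2question(): the ordered key/value pairs
# map question text to answers, so they write straight to a two-column CSV.
# Journal2QuestionXwalk is an assumed name for the class that cls refers to above.
def questions_to_csv(journal, out):
    kvs = Journal2QuestionXwalk.journal2question(journal)
    with codecs.open(out, "wb", "utf-8") as f:
        writer = clcsv.UnicodeWriter(f)
        writer.writerow(["Question", "Answer"])
        for q, a in kvs:
            writer.writerow([q, a if a is not None else ""])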
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    licenses = []
    publisher = []
    classification_paths = []
    unpunctitle = None
    asciiunpunctitle = None
    doi = None
    fulltext = None

    # the places we're going to get those fields from
    cbib = self.bibjson()
    jindex = self.data.get('index', {})
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # get the issn from the journal bibjson
    if isinstance(cbib.journal_issns, list):
        issns += cbib.journal_issns

    # de-duplicate the issns
    issns = list(set(issns))

    # now get the issns out of the historic records
    for date, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # copy the languages
    if len(cbib.journal_language) > 0:
        langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

    # copy the country
    if jindex.get('country'):
        country = jindex.get('country')
    elif cbib.journal_country:
        country = datasets.get_country_name(cbib.journal_country)

    # get the title of the license
    lic = cbib.get_journal_license()
    if lic is not None:
        licenses.append(lic.get("title"))

    # copy the publisher/provider
    if cbib.publisher:
        publisher.append(cbib.publisher)

    # deduplicate the lists
    issns = list(set(issns))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    licenses = list(set(licenses))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # work out what the date of publication is
    date = cbib.get_publication_date()

    # calculate the classification paths
    from portality.lcc import lcc  # inline import since this hits the database
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        if scheme == "LCC":
            path = lcc.pathify(term)
            if path is not None:
                classification_paths.append(path)

    # normalise the classification paths, so we only store the longest ones
    classification_paths = lcc.longest(classification_paths)

    # create an unpunctitle
    if cbib.title is not None:
        throwlist = string.punctuation + '\n\t'
        unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
        try:
            asciiunpunctitle = unidecode(unpunctitle)
        except:
            asciiunpunctitle = unpunctitle

    # determine if the seal is applied
    has_seal = "Yes" if self.has_seal() else "No"

    # create a normalised version of the DOI for deduplication
    source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
    try:
        doi = normalise.normalise_doi(source_doi)
    except ValueError:
        # if we can't normalise the DOI, just store it as-is
        doi = source_doi

    # create a normalised version of the fulltext URL for deduplication
    fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
    if len(fulltexts) > 0:
        source_fulltext = fulltexts[0]
        try:
            fulltext = normalise.normalise_url(source_fulltext)
        except ValueError:
            # if we can't normalise the fulltext, store it as-is
            fulltext = source_fulltext

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if date != "":
        self.data["index"]["date"] = date
        self.data["index"]["date_toc_fv_month"] = date  # duplicated so we can have year/month facets in fv2
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(licenses) > 0:
        self.data["index"]["license"] = licenses
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
    if len(classification_paths) > 0:
        self.data["index"]["classification_paths"] = classification_paths
    if unpunctitle is not None:
        self.data["index"]["unpunctitle"] = unpunctitle
    if asciiunpunctitle is not None:
        self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
    # has_seal is always "Yes" or "No", so it is always recorded
    self.data["index"]["has_seal"] = has_seal
    if doi is not None:
        self.data["index"]["doi"] = doi
    if fulltext is not None:
        self.data["index"]["fulltext"] = fulltext
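# A hedged sketch of why the index block matters: downstream searches filter on
# the denormalised country *name*, not the ISO code held in bibjson. The query
# shape below is an assumption modelled on the Elasticsearch-style queries and
# aggregations used elsewhere in this section.
#
#   record.prep()  # assumed to call _generate_index() before save
#   q = {"query": {"term": {"index.country.exact": "United Kingdom"}}}
#   res = models.Article.query(q=q)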