def migrate(test=False):
    start = datetime.now()

    journal_iterator = models.Journal.all_in_doaj()

    counter = 0
    with open(os.path.join(OUT_DIR, OUT_FILENAME), 'wb') as o:
        writer = csv.writer(o)
        writer.writerow(['Old country', 'New Country'])
        for j in journal_iterator:
            counter += 1
            oldcountry = j.bibjson().country
            j.bibjson().country = xwalk.get_country_code(j.bibjson().country)
            newcountry = j.bibjson().country
            newcountry_name = xwalk.get_country_name(newcountry)
            writer.writerow([oldcountry.encode('utf-8'), newcountry_name.encode('utf-8')])
            if not test:
                j.prep()
                j.save()

    end = datetime.now()

    print "Updated Journals", counter
    print start, end
    print 'Time taken:', end - start
    print 'You can pass -t to test the migration you just ran.'
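
# A minimal sketch of how migrate() might be wired up as a script entry point.
# The -t/--test flag name follows the hint printed above; OUT_DIR, OUT_FILENAME
# and the models/xwalk imports are assumed to be configured at module level, so
# this is illustrative rather than the project's actual CLI.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--test", action="store_true",
                        help="dry run: write the CSV report but do not save the journals")
    args = parser.parse_args()
    migrate(test=args.test)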
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    licenses = []
    publisher = []
    classification_paths = []
    unpunctitle = None
    asciiunpunctitle = None
    doi = None
    fulltext = None

    # the places we're going to get those fields from
    cbib = self.bibjson()
    jindex = self.data.get('index', {})
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # get the issns from the journal bibjson
    if isinstance(cbib.journal_issns, list):
        issns += cbib.journal_issns

    # de-duplicate the issns
    issns = list(set(issns))

    # now get the issns out of the historic records
    for date, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # copy the languages
    from portality import datasets  # delayed import, as it loads some stuff from file
    if len(cbib.journal_language) > 0:
        langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

    # copy the country
    if jindex.get('country'):
        country = jindex.get('country')
    elif cbib.journal_country:
        country = xwalk.get_country_name(cbib.journal_country)

    # get the title of the license
    lic = cbib.get_journal_license()
    if lic is not None:
        licenses.append(lic.get("title"))

    # copy the publisher/provider
    if cbib.publisher:
        publisher.append(cbib.publisher)

    # deduplicate the lists
    issns = list(set(issns))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    licenses = list(set(licenses))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # work out what the date of publication is
    date = cbib.get_publication_date()

    # calculate the classification paths
    from portality.lcc import lcc  # inline import since this hits the database
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        if scheme == "LCC":
            path = lcc.pathify(term)
            if path is not None:
                classification_paths.append(path)

    # normalise the classification paths, so we only store the longest ones
    classification_paths = lcc.longest(classification_paths)

    # create an unpunctitle
    if cbib.title is not None:
        throwlist = string.punctuation + '\n\t'
        unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
        try:
            asciiunpunctitle = unidecode(unpunctitle)
        except Exception:
            asciiunpunctitle = unpunctitle

    # determine if the seal is applied
    has_seal = "Yes" if self.has_seal() else "No"

    # create a normalised version of the DOI for deduplication
    source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
    try:
        doi = normalise.normalise_doi(source_doi)
    except ValueError:
        # if we can't normalise the DOI, just store it as-is
        doi = source_doi

    # create a normalised version of the fulltext URL for deduplication
    fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
    if len(fulltexts) > 0:
        source_fulltext = fulltexts[0]
        try:
            fulltext = normalise.normalise_url(source_fulltext)
        except ValueError:
            # if we can't normalise the fulltext URL, store it as-is
            fulltext = source_fulltext

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if date != "":
        self.data["index"]["date"] = date
        self.data["index"]["date_toc_fv_month"] = date  # duplicated so we can have year/month facets in fv2
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(licenses) > 0:
        self.data["index"]["license"] = licenses
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
    if len(classification_paths) > 0:
        self.data["index"]["classification_paths"] = classification_paths
    if unpunctitle is not None:
        self.data["index"]["unpunctitle"] = unpunctitle
    if asciiunpunctitle is not None:
        self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
    # has_seal is always "Yes" or "No", so record it unconditionally
    self.data["index"]["has_seal"] = has_seal
    if doi is not None:
        self.data["index"]["doi"] = doi
    if fulltext is not None:
        self.data["index"]["fulltext"] = fulltext
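
# A small, self-contained illustration of the unpunctitle/asciiunpunctitle step
# above, runnable outside the model class. It assumes only the stdlib plus the
# third-party unidecode package; the sample title is invented.
import string
from unidecode import unidecode

def _demo_unpunctitle(title):
    throwlist = string.punctuation + '\n\t'
    unpunct = "".join(c for c in title if c not in throwlist).strip()
    try:
        return unpunct, unidecode(unpunct)
    except Exception:
        return unpunct, unpunct

# _demo_unpunctitle(u"A (Test) Journal: Vol. 1!") -> (u"A Test Journal Vol 1", "A Test Journal Vol 1")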
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    titles = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    license = []
    publisher = []
    urls = {}

    # the places we're going to get those fields from
    cbib = self.bibjson()
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # get the title out of the current bibjson
    if cbib.title is not None:
        titles.append(cbib.title)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # add the keywords to the non-schema subjects (but not the classification)
    subjects += cbib.keywords

    # now get the issns and titles out of the historic records
    for date, r, irb, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)
        if hbib.title is not None:
            titles.append(hbib.title)

    # copy the languages
    if cbib.language is not None:
        langs = cbib.language

    # copy the country
    if cbib.country is not None:
        country = xwalk.get_country_name(cbib.country)

    # get the title of the license
    lic = cbib.get_license()
    if lic is not None:
        license.append(lic.get("title"))

    # copy the publisher/institution
    if cbib.publisher:
        publisher.append(cbib.publisher)
    if cbib.institution:
        publisher.append(cbib.institution)

    # extract and convert all of the urls by their type
    links = cbib.get_urls()
    for link in links:
        lt = link.get("type")
        if lt is not None:
            urls[lt + "_url"] = link.get("url")

    # deduplicate the lists
    issns = list(set(issns))
    titles = list(set(titles))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    license = list(set(license))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if len(titles) > 0:
        self.data["index"]["title"] = titles
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(license) > 0:
        self.data["index"]["license"] = license
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
    if len(urls.keys()) > 0:
        self.data["index"].update(urls)
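
# A hedged sketch of the URL conversion step above: given link objects of the
# shape that loop expects ({"type": ..., "url": ...}), each typed link becomes a
# "<type>_url" key in the index. The sample data here is invented.
def _demo_url_index(links):
    urls = {}
    for link in links:
        lt = link.get("type")
        if lt is not None:
            urls[lt + "_url"] = link.get("url")
    return urls

# _demo_url_index([{"type": "homepage", "url": "http://example.com"}])
#   -> {"homepage_url": "http://example.com"}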
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    license = []
    publisher = []

    # the places we're going to get those fields from
    cbib = self.bibjson()
    jindex = self.data.get('index', {})
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # now get the issns out of the historic records
    for date, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # copy the languages
    if cbib.journal_language is not None:
        langs = cbib.journal_language

    # copy the country
    if jindex.get('country'):
        country = jindex.get('country')
    elif cbib.journal_country:
        country = xwalk.get_country_name(cbib.journal_country)

    # get the title of the license
    lic = cbib.get_journal_license()
    if lic is not None:
        license.append(lic.get("title"))

    # copy the publisher/provider
    if cbib.publisher:
        publisher.append(cbib.publisher)

    # deduplicate the lists
    issns = list(set(issns))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    license = list(set(license))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # work out what the date of publication is
    date = cbib.get_publication_date()

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if date != "":
        self.data["index"]["date"] = date
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(license) > 0:
        self.data["index"]["license"] = license
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
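
# For orientation, a sketch of the index section this variant produces on
# self.data, assuming every optional field was populated. All values here are
# invented examples, not real records.
EXAMPLE_INDEX = {
    "issn": ["1234-5678", "8765-4321"],
    "date": "2000-01-01T00:00:00Z",
    "subject": ["Medicine"],
    "schema_subject": ["LCC:Medicine"],
    "classification": ["Medicine"],
    "publisher": ["Example Press"],
    "license": ["CC BY"],
    "language": ["English"],
    "country": "United Kingdom",
    "schema_code": ["LCC:R"],
}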