def get_normalised_fulltext(self):
    if self.data.get("index", {}).get("fulltext") is not None:
        return self.data["index"]["fulltext"]
    fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
    if len(fulltexts) == 0:
        return None
    try:
        return normalise.normalise_url(fulltexts[0])
    except ValueError:
        # can't be normalised, so we just return the url as-is
        return fulltexts[0]
def test_01_normalise_url(self, name, kwargs):
    url_arg = kwargs.get("url")
    scheme_arg = kwargs.get("scheme")
    whitespace_arg = kwargs.get("whitespace")
    raises_arg = kwargs.get("raises")

    raises = EXCEPTIONS.get(raises_arg)

    ###############################################
    ## set up

    canonicalUrl = None
    if url_arg != "none":
        canonicalUrl = "//example.com/path;p=1?query=one&two=three#frag"

    url = canonicalUrl
    if scheme_arg == "none" and url is not None:
        url = url[2:]

    if scheme_arg not in ["-", "invalid", "none", "//"]:
        url = scheme_arg + ":" + url
    elif scheme_arg == "invalid":
        url = "somerubbish:" + url
    elif scheme_arg == "unknown":
        url = "unknown:" + url

    if whitespace_arg == "yes":
        url = " " + url + "\t\n"

    ###########################################################
    # Execution

    if raises is not None:
        with self.assertRaises(raises):
            norm = normalise_url(url)
    else:
        norm = normalise_url(url)
        assert norm == canonicalUrl
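# Worked example (illustrative, not part of the test suite): this mirrors the set-up
# logic of test_01_normalise_url above for a couple of parameter combinations and shows
# the URL that would be handed to normalise_url alongside the canonical form the test
# expects back. The combinations chosen here are hypothetical, not the real test matrix;
# only the canonical URL string is copied from the test.
CANONICAL = "//example.com/path;p=1?query=one&two=three#frag"

def build_case(scheme_arg, whitespace_arg):
    url = CANONICAL
    if scheme_arg == "none":
        url = url[2:]                      # drop the leading "//"
    if scheme_arg not in ["-", "invalid", "none", "//"]:
        url = scheme_arg + ":" + url       # e.g. "http://example.com/..."
    elif scheme_arg == "invalid":
        url = "somerubbish:" + url
    if whitespace_arg == "yes":
        url = " " + url + "\t\n"
    return url

# build_case("http", "yes") -> " http://example.com/path;p=1?query=one&two=three#frag\t\n"
# build_case("-", "no")     -> "//example.com/path;p=1?query=one&two=three#frag"
# In both non-raising cases the test asserts normalise_url(url) == CANONICAL.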
def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None,
               number=None, start=None, should_match=None, size=10):
    # some input sanitisation
    urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

    # make sure that we're dealing with the normal form of the identifiers
    norm_urls = []
    for url in urls:
        try:
            norm = normalise.normalise_url(url)
            norm_urls.append(norm)
        except ValueError:
            # use the non-normal form
            norm_urls.append(url)
    urls = norm_urls

    try:
        doi = normalise.normalise_doi(doi)
    except ValueError:
        # leave the doi as it is
        pass

    q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                              doi=doi,
                              urls=urls,
                              title=title,
                              volume=volume,
                              number=number,
                              start=start,
                              should_match=should_match,
                              size=size)

    res = cls.query(q=q.query())
    return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
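# Self-contained sketch of the input sanitisation used above: accept either a single
# fulltext URL or a list of them, and fall back to the raw value whenever the normaliser
# rejects it. `sanitise_fulltexts` and `fake_normalise` are hypothetical names used only
# for illustration; `fake_normalise` stands in for normalise.normalise_url, which is not
# imported here.
def sanitise_fulltexts(fulltexts, normalise_url):
    urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []
    norm_urls = []
    for url in urls:
        try:
            norm_urls.append(normalise_url(url))
        except ValueError:
            norm_urls.append(url)          # keep the non-normal form
    return norm_urls

def fake_normalise(url):
    if "://" not in url:
        raise ValueError("no scheme")
    return "//" + url.split("://", 1)[1].strip()

# sanitise_fulltexts("http://example.com/a", fake_normalise) -> ["//example.com/a"]
# sanitise_fulltexts(["no-scheme"], fake_normalise)          -> ["no-scheme"]
# sanitise_fulltexts(None, fake_normalise)                   -> []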
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    licenses = []
    publisher = []
    classification_paths = []
    unpunctitle = None
    asciiunpunctitle = None
    doi = None
    fulltext = None

    # the places we're going to get those fields from
    cbib = self.bibjson()
    jindex = self.data.get('index', {})
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # get the issn from the journal bibjson
    if isinstance(cbib.journal_issns, list):
        issns += cbib.journal_issns

    # de-duplicate the issns
    issns = list(set(issns))

    # now get the issns out of the historic records
    for date, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # copy the languages
    from portality import datasets  # delayed import, as it loads some stuff from file
    if len(cbib.journal_language) > 0:
        langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

    # copy the country
    if jindex.get('country'):
        country = jindex.get('country')
    elif cbib.journal_country:
        country = xwalk.get_country_name(cbib.journal_country)

    # get the title of the license
    lic = cbib.get_journal_license()
    if lic is not None:
        licenses.append(lic.get("title"))

    # copy the publisher/provider
    if cbib.publisher:
        publisher.append(cbib.publisher)

    # deduplicate the lists
    issns = list(set(issns))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    licenses = list(set(licenses))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # work out what the date of publication is
    date = cbib.get_publication_date()

    # calculate the classification paths
    from portality.lcc import lcc  # inline import since this hits the database
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        if scheme == "LCC":
            path = lcc.pathify(term)
            if path is not None:
                classification_paths.append(path)

    # normalise the classification paths, so we only store the longest ones
    classification_paths = lcc.longest(classification_paths)

    # create an unpunctitle
    if cbib.title is not None:
        throwlist = string.punctuation + '\n\t'
        unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
        try:
            asciiunpunctitle = unidecode(unpunctitle)
        except:
            asciiunpunctitle = unpunctitle

    # determine if the seal is applied
    has_seal = "Yes" if self.has_seal() else "No"

    # create a normalised version of the DOI for deduplication
    source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
    try:
        doi = normalise.normalise_doi(source_doi)
    except ValueError:
        # if we can't normalise the DOI, just store it as-is
        doi = source_doi

    # create a normalised version of the fulltext URL for deduplication
    fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
    if len(fulltexts) > 0:
        source_fulltext = fulltexts[0]
        try:
            fulltext = normalise.normalise_url(source_fulltext)
        except ValueError:
            # if we can't normalise the fulltext, store it as-is
            fulltext = source_fulltext

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if date != "":
        self.data["index"]["date"] = date
        self.data["index"]["date_toc_fv_month"] = date  # duplicated so we can have year/month facets in fv2
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(licenses) > 0:
        self.data["index"]["license"] = licenses
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
    if len(classification_paths) > 0:
        self.data["index"]["classification_paths"] = classification_paths
    if unpunctitle is not None:
        self.data["index"]["unpunctitle"] = unpunctitle
    if asciiunpunctitle is not None:
        self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
    if has_seal:
        self.data["index"]["has_seal"] = has_seal
    if doi is not None:
        self.data["index"]["doi"] = doi
    if fulltext is not None:
        self.data["index"]["fulltext"] = fulltext
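# Illustration of the shape of self.data["index"] that _generate_index builds. Every key
# below is one the method can set; the values are invented for illustration and are not
# taken from any real record. The normalised doi and fulltext entries are what the
# deduplication queries match against.
EXAMPLE_INDEX = {
    "issn": ["1234-5678", "8765-4321"],
    "date": "2020-01-01T00:00:00Z",
    "date_toc_fv_month": "2020-01-01T00:00:00Z",   # duplicated for year/month facets
    "subject": ["Medicine"],
    "schema_subject": ["LCC:Medicine"],
    "classification": ["Medicine"],
    "classification_paths": ["Medicine: General works"],
    "publisher": ["Example Press"],
    "license": ["CC BY"],
    "language": ["English"],
    "country": "United Kingdom",
    "schema_code": ["LCC:R5"],
    "unpunctitle": "An example title",
    "asciiunpunctitle": "An example title",
    "has_seal": "No",
    "doi": "10.1234/example.doi",
    "fulltext": "//example.com/article/1",
}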
def duplicates(cls, issns=None, publisher_record_id=None, doi=None, fulltexts=None, title=None,
               volume=None, number=None, start=None, should_match=None, size=10):
    # some input sanitisation
    issns = issns if isinstance(issns, list) else []
    urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

    # make sure that we're dealing with the normal form of the identifiers
    norm_urls = []
    for url in urls:
        try:
            norm = normalise.normalise_url(url)
            norm_urls.append(norm)
        except ValueError:
            # use the non-normal form
            norm_urls.append(url)
    urls = norm_urls

    try:
        doi = normalise.normalise_doi(doi)
    except ValueError:
        # leave the doi as it is
        pass

    # in order to make sure we don't send too many terms to the ES query, break the issn list down into chunks
    terms_limit = app.config.get("ES_TERMS_LIMIT", 1024)
    issn_groups = []
    lower = 0
    upper = terms_limit
    while lower < len(issns):
        issn_groups.append(issns[lower:upper])
        lower = upper
        upper = lower + terms_limit

    if issns is not None and len(issns) > 0:
        duplicate_articles = []
        for g in issn_groups:
            q = DuplicateArticleQuery(issns=g,
                                      publisher_record_id=publisher_record_id,
                                      doi=doi,
                                      urls=urls,
                                      title=title,
                                      volume=volume,
                                      number=number,
                                      start=start,
                                      should_match=should_match,
                                      size=size)
            # print json.dumps(q.query())
            res = cls.query(q=q.query())
            duplicate_articles += [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return duplicate_articles
    else:
        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)
        # print json.dumps(q.query())
        res = cls.query(q=q.query())
        return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
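# Self-contained sketch of the ISSN chunking above: split a long terms list into groups
# no larger than the configured ES terms limit, so that each DuplicateArticleQuery stays
# under that limit. `chunk_terms` is a hypothetical helper written for illustration; the
# method itself reads the limit from app.config["ES_TERMS_LIMIT"] with a default of 1024.
def chunk_terms(terms, limit=1024):
    groups = []
    lower = 0
    while lower < len(terms):
        groups.append(terms[lower:lower + limit])
        lower += limit
    return groups

# chunk_terms(["1111-1111", "2222-2222", "3333-3333"], limit=2)
# -> [["1111-1111", "2222-2222"], ["3333-3333"]]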
def _read_match_set(reader, next_row):
    n_matches = -1
    match_set = MatchSet()

    while True:
        if next_row is not None:
            row = deepcopy(next_row)
            next_row = None
        else:
            try:
                row = next(reader)
            except StopIteration:
                return match_set, None

        if row is None:
            return match_set, None

        a_id = row[0]

        root = match_set.root
        if root is not None and a_id != root["id"]:
            return match_set, row

        a_created = row[1]
        try:
            a_doi = normalise.normalise_doi(row[2])
        except:
            a_doi = row[2]
        try:
            a_ft = normalise.normalise_url(row[3])
        except:
            a_ft = row[3]
        a_owner = row[4]
        a_issns = row[5]
        a_in_doaj = row[6] == "True"

        # read the expected match count the first time we see this set
        if n_matches == -1:
            n_matches = int(row[7])

        match_type = row[8]

        b_id = row[9]
        b_created = row[10]
        try:
            b_doi = normalise.normalise_doi(row[11])
        except:
            b_doi = row[11]
        try:
            b_ft = normalise.normalise_url(row[12])
        except:
            b_ft = row[12]
        b_owner = row[13]
        b_issns = row[14]
        b_in_doaj = row[15] == "True"
        title_match = row[17] == "True"

        if root is None:
            match_set.add_root(a_id, a_created, a_doi, a_ft, a_owner, a_issns, a_in_doaj, title_match)

        match_set.add_match(b_id, b_created, b_doi, b_ft, b_owner, b_issns, b_in_doaj, title_match, match_type)

        # a catch to make sure that everything is ok with the match set detection
        assert n_matches + 1 == len(match_set.matches)
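# Column layout inferred from the indices read in _read_match_set above; the report
# format itself is not shown in this snippet, so treat these names as assumptions.
# Column 16 is not read by the function, so its meaning is unknown from this code alone.
CSV_COLUMNS = [
    "a_id",          # 0
    "a_created",     # 1
    "a_doi",         # 2   normalised where possible
    "a_fulltext",    # 3   normalised where possible
    "a_owner",       # 4
    "a_issns",       # 5
    "a_in_doaj",     # 6   "True" / "False"
    "n_matches",     # 7
    "match_type",    # 8
    "b_id",          # 9
    "b_created",     # 10
    "b_doi",         # 11  normalised where possible
    "b_fulltext",    # 12  normalised where possible
    "b_owner",       # 13
    "b_issns",       # 14
    "b_in_doaj",     # 15  "True" / "False"
    "(unused here)", # 16
    "title_match",   # 17  "True" / "False"
]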