def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on just author and year.

    For most sources, you should rather use
    get_basic_score_for_input_fields -- see there for more information.

    :param result_record: a result record (dict-like, with author_norm,
        first_author_norm and year fields)
    :param hypothesis: the hypothesis being scored
    :return: an Evidences instance
    """
    fields = hypothesis.get_detail('input_fields')

    # Use pre-normalized authors when the hypothesis carries them;
    # otherwise normalize the raw author input on the fly.
    authors = hypothesis.get_detail('normalized_authors')
    if authors is None:
        authors = normalize_author_list(fields.get('author', ''))

    score = Evidences()
    add_author_evidence(score,
                        authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'],
                        has_etal=hypothesis.get_detail('has_etal'))
    add_year_evidence(score, fields.get('year'), result_record.get('year'))
    return score
def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on just author and year.

    :param result_record: a result record (dict-like, with author_norm,
        first_author_norm and year fields)
    :param hypothesis: the hypothesis being scored
    :return: an Evidences instance
    """
    in_fields = hypothesis.get_detail('input_fields')

    # Fall back to normalizing the raw author string only when the
    # hypothesis does not already carry normalized authors.
    norm_authors = hypothesis.get_detail('normalized_authors')
    if norm_authors is None:
        norm_authors = normalize_author_list(in_fields.get('author', ''))

    ev = Evidences()
    add_author_evidence(ev,
                        norm_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'],
                        has_etal=hypothesis.get_detail('has_etal'))
    add_year_evidence(ev, in_fields.get('year'), result_record.get('year'))
    return ev
def make_solr_condition_author(value):
    """
    Build a solr condition string for an author field.

    :param value: raw or already-normalized (semicolon-separated) author string
    :return: an " AND "-joined sequence of quoted author tokens
    """
    # A semicolon means the value is already normalized; leave it alone.
    if ";" not in value:
        normalized = normalize_author_list(value, initials='.' in value)
        # Strip trailing initials ... and silly "double initials".
        value = re.sub(r"\.( ?[A-Z]\.)*", "",
                       re.sub(r"-[A-Z]\.", "", normalized))
        if ";" not in value:
            # Something went wrong with normalization, so grab all last
            # names and insert semicolons between them.
            joined = '; '.join(AUTHOR_LAST_NAME.findall(value))
            if len(joined) == 0:
                # Most probably the last name is not capitalized,
                # so grab the words case-insensitively.
                joined = '; '.join(
                    AUTHOR_LAST_NAME_CASE_INSENSITIVE.findall(value))
            value = joined
    # authors fields have special serialization rules
    parts = ['"%s"' % piece.strip() for piece in value.split(";")]
    return " AND ".join(parts)
def make_solr_condition_author(value):
    """
    Build a solr condition string for an author field.

    :param value: author string; normalized, then stripped of initials
    :return: an " AND "-joined sequence of quoted author tokens
    """
    # Strip trailing initials ... and silly "double initials".
    # NOTE: the inner pattern is now a raw string; "-[A-Z]\." contained an
    # invalid escape sequence (SyntaxWarning on modern Python, future error).
    value = re.sub(
        r"\.( ?[A-Z]\.)*", "",
        re.sub(r"-[A-Z]\.", "",
               normalize_author_list(value, initials='.' in value)))
    # authors fields have special serialization rules
    return " AND ".join('"%s"' % s.strip() for s in value.split(";"))
def make_digested_record(self):
    """
    adds a digested_record attribute from field_mappings and self.ref.
    This is exclusively called by the constructor.

    Side effects: sets self.digested_record, self.normalized_authors and,
    when an author field is present, self.normalized_first_author.

    :return: None
    """
    # copy mapped, non-empty fields from the raw reference
    self.digested_record = {}
    for dest_key, src_key in self.field_mappings:
        value = self.ref.get(src_key)
        if value:
            self.digested_record[dest_key] = value

    self.normalized_authors = None
    if "author" in self.digested_record:
        # drop any "et al." marker before normalizing
        self.digested_record["author"] = self.ETAL_PAT.sub(
            '', self.digested_record["author"])
        self.normalized_authors = normalize_author_list(
            self.digested_record["author"], initials=True)
        # first author's last name, with initials (including hyphenated
        # "double initials") removed; raw strings fix the former invalid
        # escape sequence in "-[A-Z]\."
        self.normalized_first_author = re.sub(
            r"\.( ?[A-Z]\.)*", "",
            re.sub(r"-[A-Z]\.", "",
                   self.normalized_authors)).split(";")[0].strip()

    if "year" in self.digested_record and len(
            self.digested_record["year"]) > 4:
        # the extra character(s) are at the end, just to be smart about it let's go with RE
        matches = re.findall(r'^([12][089]\d\d)', self.digested_record["year"])
        # guard against IndexError: leave the year untouched when it does
        # not start with a plausible 4-digit year
        if matches:
            self.digested_record["year"] = matches[0]

    if "-" in self.digested_record.get("page", ""):
        # we are querying on page start, for now throw out the page end
        self.digested_record["page"] = self.digested_record["page"].split(
            "-")[0]

    if "volume" in self.digested_record and "pub" in self.digested_record:
        # if volume has an alpha character at the beginning, remove it and attach it to the journal
        # ie. A. Arvanitaki, S. Dimopoulos, S. Dubovsky, N. Kaloper, and J. March-Russell, "String Axiverse," "Phys. Rev.", vol. D81, p. 123530, 2010.
        # which is in fact Journal `Phys. Rev. D.` Volume `81`
        match = self.JOURNAL_LETTER_ATTACHED_VOLUME.match(
            self.digested_record["volume"])
        if match:
            self.digested_record["pub"] = '%s %s' % (
                self.digested_record["pub"],
                self.digested_record["volume"][0])
            self.digested_record["volume"] = self.digested_record[
                "volume"][1:]
def get_score_for_baas_match(result_record, hypothesis):
    """
    scores a BAAS->DDA match. For these, volume and page are hidden deep
    inside pub_raw. We also expect an expected_bibstem detail in the
    hypothesis, mainly for robustness in case this gets used to score
    something else.

    :param result_record: a result record (dict-like, with bibcode,
        author_norm, first_author_norm and pub_raw fields)
    :param hypothesis: the hypothesis being scored
    :return: an Evidences instance
    """
    evidences = Evidences()
    # Bibcodes start with a 4-digit year, then the bibstem; re.escape keeps
    # any regex metacharacters in the bibstem literal (was interpolated raw).
    expected_bibstem = re.escape(hypothesis.get_detail('expected_bibstem'))
    if not re.match(r'....%s' % expected_bibstem, result_record['bibcode']):
        evidences.add_evidence(current_app.config['EVIDENCE_SCORE_RANGE'][0],
                               'no DDA bibcode')
        return evidences

    input_fields = hypothesis.get_detail('input_fields')
    normalized_authors = hypothesis.get_detail('normalized_authors')
    if normalized_authors is None:
        normalized_authors = normalize_author_list(
            input_fields.get('author', ''))
    add_author_evidence(evidences,
                        normalized_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'])
    add_boolean_evidence(
        evidences,
        'Vol. %s' % input_fields['volume'] in result_record['pub_raw'],
        'vol in pub_raw?')
    # re.escape also protects the interpolated page value from corrupting
    # the pattern if it ever contains regex metacharacters.
    add_boolean_evidence(
        evidences,
        re.search(r'p\.\s*%s\b' % re.escape(str(input_fields['page'])),
                  result_record['pub_raw']),
        'page in pub_raw?')
    return evidences
def make_digested_record(self):
    """
    adds a digested_record attribute from field_mappings and self.ref.
    This is exclusively called by the constructor.

    Side effects: sets self.digested_record, self.normalized_authors and,
    when an author field is present, self.normalized_first_author.

    :return: None
    """
    # copy mapped, non-empty fields from the raw reference
    self.digested_record = {}
    for dest_key, src_key in self.field_mappings:
        value = self.ref.get(src_key)
        if value:
            self.digested_record[dest_key] = value

    self.normalized_authors = None
    if "author" in self.digested_record:
        # drop any "et al." marker before normalizing
        self.digested_record["author"] = self.ETAL_PAT.sub('', self.digested_record["author"])
        self.normalized_authors = normalize_author_list(self.digested_record["author"],
                                                        initials='.' in self.digested_record["author"])
        # first author's last name, with initials (incl. hyphenated "double initials") removed
        self.normalized_first_author = re.sub(r"\.( ?[A-Z]\.)*", "",
                                              re.sub(r"-[A-Z]\.", "",
                                                     self.normalized_authors)).split(";")[0].strip()
        # a too-short normalized first author is most likely parser noise
        if len(self.normalized_first_author) <= 3:
            self.digested_record.pop("author")

    if "year" in self.digested_record and len(self.digested_record["year"]) > 4:
        # the extra character(s) are at the end, just to be smart about it let's go with RE
        self.digested_record["year"] = self.YEAR_PATTERN.findall(self.digested_record["year"])[0]

    # sometimes parser identifies title where it is actually journal
    # if we have volume and page, no pub, but title, it is actually pub so switch
    if not self.digested_record.get("pub", None) and self.digested_record.get("title", None) and \
            self.digested_record.get("volume", None) and self.digested_record.get("page", None):
        self.digested_record["pub"] = self.digested_record.pop("title")

    if self.digested_record.get("page", None):
        if "-" in self.digested_record.get("page"):
            # we are querying on page start, for now throw out the page end
            self.digested_record["page"] = self.digested_record["page"].split("-")[0]
        qualifier, self.digested_record["page"] = self.tokenize_page(self.digested_record["page"])
        if qualifier is not None:
            self.digested_record["qualifier"] = qualifier

    if "volume" in self.digested_record and "pub" in self.digested_record:
        # if volume has a alpha character at the beginning, remove it and attach it to the journal
        # ie. A. Arvanitaki, S. Dimopoulos, S. Dubovsky, N. Kaloper, and J. March-Russell, "String Axiverse," "Phys. Rev.", vol. D81, p. 123530, 2010.
        # which is in fact Journal `Phys. Rev. D.` Volume `81`
        match = self.JOURNAL_LETTER_ATTACHED_VOLUME.match(self.digested_record["volume"])
        if match:
            self.digested_record["pub"] = '%s %s' % (self.digested_record["pub"],
                                                     self.digested_record["volume"][0])
            self.digested_record["volume"] = self.digested_record["volume"][1:]

    if "title" in self.digested_record:
        # remove too much information
        self.digested_record["title"] = self.TITLE_MAIN.split(self.digested_record["title"])[0]

    pub = self.digested_record.get("pub", None) or self.digested_record.get("title", None)
    if pub:
        try:
            if len(pub) <= 2:
                # too short to be a real publication name; drop it.
                # BUGFIX: dict has no `remove` method -- the original
                # `.remove(...)` raised AttributeError (not caught by the
                # KeyError handler below); `pop` is the correct call.
                if self.digested_record.get("pub", None):
                    self.digested_record.pop("pub")
                elif self.digested_record.get("title", None):
                    self.digested_record.pop("title")
            else:
                bibstem = get_best_bibstem_for(pub)
                # if bibstem is one of the multi-section journal,
                # sometimes user do not include the section char
                # so included a wildcard in bibstem
                if any([bib == bibstem for bib in self.BIBSTEM_WITH_SECTIONS]):
                    self.digested_record["bibstem"] = '%s*' % bibstem
                else:
                    self.digested_record["bibstem"] = bibstem
        except KeyError:
            # when bibstem can not be infered from pub, get_best_bibstem_for raises this exception
            self.digested_record["bibstem"] = ''

    if "arxiv" in self.digested_record:
        # authors specify arxiv id different ways,
        # sometimes for the new format they include class name, which is wrong
        # sometimes for the old format they include class name, but out of order
        # get the correct arxiv_id
        self.digested_record["arxiv"] = self.digested_record["arxiv"].split(":")[-1]
        for aie in self.ARXIV_ID_EXTRACTOR:
            match = aie.match(self.digested_record["arxiv"])
            if match:
                group_names = list(match.groupdict().keys())
                if 'new_pattern' in group_names:
                    self.digested_record["arxiv"] = match.group('new_pattern')
                elif 'old_pattern' in group_names:
                    self.digested_record["arxiv"] = "%s/%s" % (match.group('class_name'),
                                                               match.group('old_pattern'))
                break

    if "ascl" in self.digested_record:
        # remove the ascl prefix if included
        self.digested_record["ascl"] = self.digested_record["ascl"].split(":")[-1]