def read_s2_excerpt(ex):
    """Build a ``Citation`` from one flattened S2 excerpt record.

    In the json-lines format every citation excerpt is its own json line;
    it is the flattened form of the original s2 data.

    Args:
        ex: citation excerpt blob (one parsed json line)

    Returns:
        Citation object
    """
    # Pull every field out of the blob first, in a fixed order, so that a
    # missing key surfaces before the Citation is assembled.
    raw_text = ex['string']
    citing_id = ex['citingPaperId']
    cited_id = ex['citedPaperId']
    section = ex['sectionName']
    label = ex['label']
    excerpt_index = ex['excerpt_index']

    # Same text with the citation markers removed.
    text_without_markers = regex_find_citation.sub('', raw_text)

    # Paper titles, years, author ids and the cite-marker offsets are not
    # passed here; context/section-number/surrounding sentences are not
    # available for s2 data and are passed as None.
    return Citation(
        text=raw_text,
        citing_paper_id=citing_id,
        cited_paper_id=cited_id,
        extended_context=None,
        section_number=None,
        section_title=section,
        intent=label,
        sents_before=None,
        sents_after=None,
        citation_excerpt_index=excerpt_index,
        cleaned_cite_text=text_without_markers
    )
def _read(self, file_path):
    """Yield one instance per json line found in ``file_path``.

    Every json object must provide the keys ``text``, ``section_name``,
    ``citing_paper_id`` and ``cited_paper_id``. When ``self._clean_citation``
    is truthy the citation markers matched by ``regex_find_citation`` are
    removed from the text first. The intent is always passed as ``None``.

    Args:
        file_path: path of the json-lines file to read

    Yields:
        whatever ``self.text_to_instance`` returns for each line

    Raises:
        KeyError: if a json object lacks one of the required keys
    """
    # Use the reader as a context manager so the underlying file is closed
    # once iteration finishes (or the generator is closed early) instead of
    # being left open until garbage collection.
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            citation_text = obj['text']
            if self._clean_citation:
                citation_text = regex_find_citation.sub("", citation_text)
            citation_intent = None  # no label is read from the file here
            section_name = obj['section_name']
            citing_paper_id = obj['citing_paper_id']
            cited_paper_id = obj['cited_paper_id']
            yield self.text_to_instance(
                citation_text=citation_text,
                intent=citation_intent,
                citing_paper_id=citing_paper_id,
                cited_paper_id=cited_paper_id,
                section_name=section_name)
def read_s2_jsonline(ex, evaluate_mode=False, clean_citation=True, multilabel=False):
    """Convert one S2 json-lines object (citation blob) into ``Citation`` objects.

    This is a standalone function so that it can be used in the predictor.

    Args:
        ex: input example; its ``context`` entry holds the citation excerpts
        evaluate_mode: if we are evaluating, only consider annotated excerpts
            (those that carry an ``intents`` entry)
        clean_citation: if true, remove citation markers (things like [1,4],
            (Peters, et al 2018), etc) from the excerpt text
        multilabel: if true, keep every annotated intent and prepend
            ``NEGATIVE_CLASS_PREFIX`` to those whose score is not positive;
            otherwise keep only the intents with a positive score

    Returns:
        list of Citation objects, one per excerpt that was kept
    """
    paper_roles = ('citingPaper', 'citedPaper')

    # Publication year per paper; -1 when the information is missing.
    years = {}
    for role in paper_roles:
        try:
            years[role] = ex[role]['year']
        except KeyError:
            years[role] = -1

    # authors is like: [{'name': 'S Pandav', 'ids': ['2098534'], ...}]
    # First id of every author ('n/a' for an empty id list); an empty list
    # when the author information does not exist.
    author_ids = {}
    for role in paper_roles:
        try:
            author_ids[role] = [person['ids'][0] if person['ids'] else 'n/a'
                                for person in ex[role]['authors']]
        except KeyError:
            author_ids[role] = []

    results = []
    for position, excerpt in enumerate(ex['context']):
        # when evaluating, excerpts without annotation are ignored
        if evaluate_mode and 'intents' not in excerpt:
            continue

        try:
            marker_span = [excerpt['citeStart'], excerpt['citeEnd']]
        except KeyError:
            # context does not have citeStart or citeEnd
            marker_span = [-1, -1]

        if clean_citation:
            excerpt_text = regex_find_citation.sub("", excerpt['string'])
        else:
            excerpt_text = excerpt['string']

        section = excerpt['sectionName']

        if 'intents' not in excerpt:
            labels = None
        else:
            labels = []
            for annotation in excerpt['intents']:
                if annotation['score'] > 0.0:
                    labels.append(annotation['intent'])
                elif multilabel:
                    # negative labels are only emitted in the multilabel setting
                    labels.append(NEGATIVE_CLASS_PREFIX + annotation['intent'])

        # extended context, section number and the surrounding sentences are
        # not available for s2 data
        results.append(Citation(
            text=excerpt_text,
            citing_paper_id=ex['citingPaper']['id'],
            cited_paper_id=ex['citedPaper']['id'],
            citing_paper_title=ex['citingPaper']['title'],
            cited_paper_title=ex['citedPaper']['title'],
            citing_paper_year=years['citingPaper'],
            cited_paper_year=years['citedPaper'],
            citing_author_ids=author_ids['citingPaper'],
            cited_author_ids=author_ids['citedPaper'],
            extended_context=None,
            section_number=None,
            section_title=section,
            intent=labels,
            cite_marker_offset=marker_span,
            sents_before=None,
            sents_after=None,
            citation_excerpt_index=position,
            cleaned_cite_text=excerpt_text
        ))
    return results
def read(self):
    """Read the input data and yield one ``Citation`` per citation excerpt.

    Every line of ``self.data_path`` is parsed as a json object describing a
    citing/cited paper pair whose ``context`` entry lists the excerpts.

    * Missing paper years become ``-1``; missing author lists become ``[]``;
      an author with an empty ``ids`` list contributes ``'n/a'``.
    * When ``self.evaluate_mode`` is set, excerpts without an ``intents``
      entry are skipped and counted as not annotated.
    * When ``self.clean_citation`` is set, citation markers are removed from
      the excerpt text with ``regex_find_citation``.
    * When ``self.multilabel`` is set, every annotated intent is kept and
      those with a non-positive score get ``NEGATIVE_CLASS_PREFIX``
      prepended; otherwise only positively scored intents are kept.

    After the last citation has been yielded, the totals are logged.

    Yields:
        Citation objects

    Raises:
        KeyError: if a required key is missing, e.g. an excerpt without
            ``intents`` while ``self.evaluate_mode`` is false.
    """
    # Load and parse the whole file up front, inside a ``with`` block so the
    # file handle is closed right away rather than left open for the
    # lifetime of the generator.
    with open(self.data_path) as data_file:
        data = [json.loads(line) for line in data_file]

    num_returned_citations = 0
    num_not_annotated = 0
    for ex in data:
        try:
            citing_paper_year = ex['citingPaper']['year']
        except KeyError:
            citing_paper_year = -1
        try:
            cited_paper_year = ex['citedPaper']['year']
        except KeyError:
            cited_paper_year = -1

        # authors is like: [{'name': 'S Pandav', 'ids': ['2098534'], ...}]
        try:
            citing_author_ids = [author['ids'][0] if author['ids'] else 'n/a'
                                 for author in ex['citingPaper']['authors']]
        except KeyError:  # authors do not exist in the context
            citing_author_ids = []
        try:
            cited_author_ids = [author['ids'][0] if author['ids'] else 'n/a'
                                for author in ex['citedPaper']['authors']]
        except KeyError:
            cited_author_ids = []

        for excerpt_index, excerpt_obj in enumerate(ex['context']):
            if self.evaluate_mode:  # only consider excerpts that are annotated
                if 'intents' not in excerpt_obj:
                    num_not_annotated += 1
                    continue

            try:
                offsets = [excerpt_obj['citeStart'], excerpt_obj['citeEnd']]
            except KeyError:  # context does not have citeStart or citeEnd
                offsets = [-1, -1]

            if self.clean_citation:
                # remove citation markers (e.g., things like [1,4], (Peters, et al 2018), etc)
                citation_text = regex_find_citation.sub("", excerpt_obj['string'])
            else:
                citation_text = excerpt_obj['string']
            section_name = excerpt_obj['sectionName']

            # in case of multilabel add all possible labels and their negative prefix
            if self.multilabel:
                intents = [e['intent'] if e['score'] > 0.0
                           else NEGATIVE_CLASS_PREFIX + e['intent']
                           for e in excerpt_obj['intents']]
            else:
                intents = [e['intent'] for e in excerpt_obj['intents'] if e['score'] > 0.0]

            citation = Citation(
                text=citation_text,
                citing_paper_id=ex['citingPaper']['id'],
                cited_paper_id=ex['citedPaper']['id'],
                citing_paper_title=ex['citingPaper']['title'],
                cited_paper_title=ex['citedPaper']['title'],
                citing_paper_year=citing_paper_year,
                cited_paper_year=cited_paper_year,
                citing_author_ids=citing_author_ids,
                cited_author_ids=cited_author_ids,
                extended_context=None,  # Not available for s2 data
                section_number=None,  # Not available for s2 data
                section_title=section_name,
                intent=intents,
                cite_marker_offset=offsets,  # Not useful here
                sents_before=None,  # not available for s2 data
                sents_after=None,  # not available for s2 data
                citation_excerpt_index=excerpt_index,
                cleaned_cite_text=citation_text
            )
            num_returned_citations += 1
            yield citation

    logger.info(f'Total annotated citation texts returned: {num_returned_citations}; '
                f'not annotated {num_not_annotated}')