def get_documents(cls, file_descriptions):
    """
    Yields one Document per entry of ``file_descriptions``.

    Each entry must expose ``filename``, ``cord_uid`` and ``title``
    attributes.  When ``filename`` is None the entry is treated as an
    abstract-only document and its ``abstract`` attribute becomes the single
    'Abstract' section.  Otherwise ``filename`` is opened and parsed as JSON:
    the paragraphs under ``metadata.abstract`` (if that key is present) are
    joined into an 'Abstract' section, and consecutive ``body_text``
    paragraphs that share the same ``section`` value are merged into one
    Section each.

    A document whose ``body_text`` is empty gets no body sections (a
    placeholder Section named None is no longer emitted for it).
    """
    for file_description in file_descriptions:
        if file_description.filename is None:
            # Abstract-only document: nothing to load from disk
            yield Document(
                identifier=file_description.cord_uid,
                title=file_description.title,
                sections=[
                    Section(name='Abstract',
                            content=file_description.abstract)
                ],
                collection=settings.COLLECTION_NAME,
            )
            continue

        # Full-text document, loaded from filesystem
        with open(file_description.filename, 'r',
                  errors=settings.ENCODING_ERRORS) as infile:
            data = json.load(infile)

        sections = []

        # Add the abstract
        if 'abstract' in data['metadata']:
            joined_text = ' '.join(
                paragraph['text']
                for paragraph in data['metadata']['abstract'])
            sections.append(Section(name='Abstract', content=joined_text))

        # Add sections from the article body: a new Section starts whenever
        # the paragraph's section name differs from the previous one.
        # NOTE(review): paragraphs whose section is None at the very start of
        # the body are discarded once a named section begins -- confirm that
        # this is intended.
        current_section = None
        current_parts = []
        for paragraph in data['body_text']:
            if paragraph['section'] != current_section:
                if current_section is not None:
                    sections.append(
                        Section(name=current_section,
                                content=''.join(current_parts)))
                current_section = paragraph['section']
                current_parts = []
            # Every paragraph is prefixed with a single space, so the section
            # content keeps its leading space
            current_parts.append(' %s' % paragraph['text'])

        # Add the final section, but only if the body had any paragraphs
        if data['body_text']:
            sections.append(
                Section(name=current_section,
                        content=''.join(current_parts)))

        yield Document(
            identifier=file_description.cord_uid,
            title=file_description.title,
            sections=sections,
            collection=settings.COLLECTION_NAME,
        )
def get_documents(cls, filenames):
    """
    Yields one Document per entry of ``filenames``.

    Despite the parameter name, each entry is an object exposing
    ``filename``, ``cord_uid`` and ``title`` attributes.  When ``filename``
    is None the entry's ``abstract`` attribute becomes the single 'Abstract'
    section.  Otherwise ``filename`` is opened and parsed as JSON: the
    paragraphs under ``metadata.abstract`` (if that key is present) are
    joined into an 'Abstract' section, and consecutive ``body_text``
    paragraphs sharing the same ``section`` value are merged into one
    Section each.

    A document whose ``body_text`` is empty gets no body sections (a
    placeholder Section named None is no longer emitted for it).
    """
    for filedesc in filenames:
        if filedesc.filename is None:
            # Abstract-only document: nothing to load from disk
            yield Document(identifier=filedesc.cord_uid,
                           title=filedesc.title,
                           sections=[
                               Section(name='Abstract',
                                       content=filedesc.abstract)
                           ],
                           collection=settings.COLLECTION_NAME)
            continue

        with open(filedesc.filename, 'r',
                  errors=settings.ENCODING_ERRORS) as infile:
            data = json.load(infile)

        sections = []

        # Abstract paragraphs are joined with single spaces
        if 'abstract' in data['metadata']:
            paragraph_text = ' '.join(
                paragraph['text']
                for paragraph in data['metadata']['abstract'])
            sections.append(Section(name='Abstract', content=paragraph_text))

        # Body paragraphs: a new Section starts whenever the paragraph's
        # section name differs from the previous one.
        current_section = None
        current_parts = []
        for paragraph in data['body_text']:
            if paragraph['section'] != current_section:
                if current_section is not None:
                    sections.append(
                        Section(name=current_section,
                                content=''.join(current_parts)))
                current_section = paragraph['section']
                current_parts = []
            # Each paragraph is prefixed with a space and then has every
            # 'q q' occurrence removed.
            # NOTE(review): the reason for stripping 'q q' is not visible
            # here -- confirm it is still needed.
            current_parts.append(
                (' %s' % paragraph['text']).replace('q q', ''))

        # Add the final section, but only if the body had any paragraphs
        if data['body_text']:
            sections.append(
                Section(name=current_section,
                        content=''.join(current_parts)))

        yield Document(identifier=filedesc.cord_uid,
                       title=filedesc.title,
                       sections=sections,
                       collection=settings.COLLECTION_NAME)
def get_topics(topfile, section='narrative'):
    """
    Yields one Document per <topic> element of the XML file ``topfile``.

    The topic's ``number`` attribute becomes the identifier and its query
    text becomes the title.  ``section`` picks which child element supplies
    the content of the single Section: 'query', 'question', or (for any
    other value) the narrative.  The Section is named after ``section``.
    """
    with open(topfile, 'r') as handle:
        parsed = BeautifulSoup(handle.read(), 'xml')

    for node in parsed.find_all('topic'):
        topic_number = node['number']
        # All three fields are read up front, whichever one is selected
        fields = {
            'query': node.query.string,
            'question': node.question.string,
            'narrative': node.narrative.string,
        }
        # Anything other than 'query' / 'question' falls back to narrative
        if section in ('query', 'question'):
            chosen_text = fields[section]
        else:
            chosen_text = fields['narrative']

        selected = Section(name=section, content=chosen_text)
        yield Document(
            identifier=topic_number,
            title=fields['query'],
            sections=[selected],
            collection='topics',
        )
def get_topics(topfile, section='summary'):
    """
    Gets the list of topics from an XML file.

    Yields one Document per <topic> element of ``topfile``.  The topic's
    ``number`` attribute becomes the identifier and its ``type`` attribute
    becomes the title.  ``section`` picks which child element supplies the
    content of the single Section: 'note', 'description', or (for any other
    value) the summary.  The Section is named after ``section``.
    """
    with open(topfile, 'r') as handle:
        parsed = BeautifulSoup(handle.read(), 'xml')

    for node in parsed.find_all('topic'):
        topic_number = node['number']
        kind = node['type']
        # All three fields are read up front, whichever one is selected
        fields = {
            'note': node.note.string,
            'description': node.description.string,
            'summary': node.summary.string,
        }
        # Anything other than 'note' / 'description' falls back to summary
        if section in ('note', 'description'):
            chosen_text = fields[section]
        else:
            chosen_text = fields['summary']

        selected = Section(name=section, content=chosen_text)
        yield Document(
            identifier=topic_number,
            title=kind,
            sections=[selected],
            collection='topics',
        )
def get_covid_topics(topfile, topic_selector):
    """
    Gets the topics from the TREC-COVID dataset.

    Yields one Document per <topic> element of the XML file ``topfile``.
    The topic's ``number`` attribute becomes the identifier and its query
    text becomes the title.  ``topic_selector`` chooses which child element
    supplies the content of the single Section and is also used as that
    Section's name; it must be 'query', 'question' or 'narrative'.

    Raises TypeError for any other ``topic_selector`` (raised when the
    first topic is processed, since this is a generator).
    """
    with open(topfile, 'r') as infile:
        soup = BeautifulSoup(infile.read(), 'xml')

    for topic in soup.find_all('topic'):
        number = topic['number']
        query = topic.query.string
        question = topic.question.string
        narrative = topic.narrative.string

        if topic_selector == 'query':
            section_content = query
        elif topic_selector == 'question':
            section_content = question
        elif topic_selector == 'narrative':
            section_content = narrative
        else:
            raise TypeError("Unsupported topic selector: %s"
                            % topic_selector)

        yield Document(
            identifier=number,
            # These topics have no 'type' attribute; the query text is the
            # title (the previous ``topic_type`` name was never defined)
            title=query,
            # Name the section after the selector (the previous ``section``
            # name was never defined in this function)
            sections=[Section(name=topic_selector,
                              content=section_content)],
            collection='topics',
        )
def get_cds_topics(topfile, topic_selector):
    """
    Gets the topics from the TREC CDS track.

    Yields one Document per <topic> element of the XML file ``topfile``.
    The topic's ``number`` attribute becomes the identifier and its ``type``
    attribute becomes the title.  ``topic_selector`` chooses which child
    element supplies the content of the single Section and is also used as
    that Section's name; it must be 'note', 'description' or 'summary'.

    Raises TypeError for any other ``topic_selector`` (raised when the
    first topic is processed, since this is a generator).
    """
    with open(topfile, 'r') as infile:
        soup = BeautifulSoup(infile.read(), 'xml')

    for topic in soup.find_all('topic'):
        number = topic['number']
        topic_type = topic['type']
        note = topic.note.string
        description = topic.description.string
        summary = topic.summary.string

        if topic_selector == 'note':
            section_content = note
        elif topic_selector == 'description':
            section_content = description
        elif topic_selector == 'summary':
            section_content = summary
        else:
            raise TypeError("Unsupported topic selector: %s"
                            % topic_selector)

        yield Document(
            identifier=number,
            title=topic_type,
            # Name the section after the selector (the previous ``section``
            # name was never defined in this function)
            sections=[Section(name=topic_selector,
                              content=section_content)],
            collection='topics',
        )
def get_documents(cls, filenames):
    """
    Yields one Document per readable XML file in ``filenames``.

    The identifier is the file's base name without its extension.  The title
    is the text of front/article-meta/title-group/article-title, falling
    back to the identifier when any part of that path is missing.  Each
    direct <sec> child of <body> becomes a Section whose name is the text of
    the section's <title> and whose content is the concatenated text of all
    <p> elements found inside it.

    Files that cannot be found are skipped with a warning instead of raising.
    """
    for filename in filenames:
        file_id = os.path.splitext(os.path.basename(filename))[0]

        # Only opening/parsing the file is guarded, so a FileNotFoundError
        # from anywhere else is not mistaken for a missing input file.
        try:
            with open(filename, 'r',
                      errors=settings.ENCODING_ERRORS) as infile:
                soup = BeautifulSoup(infile.read(), 'xml')
        except FileNotFoundError:
            warnings.warn("Could not find file: %s" % filename)
            continue

        # Get article title (if it exists); a missing element along the
        # path surfaces as AttributeError on None
        try:
            title_tag = soup.front.find('article-meta') \
                .find('title-group') \
                .find('article-title')
            article_title = ''.join(title_tag.strings)
        except AttributeError:
            article_title = file_id

        # Get sections.  Best effort: a missing <body>, or a <sec> without a
        # <title>, stops collection and keeps whatever was gathered so far.
        sections = []
        try:
            for sec in soup.body.find_all('sec', recursive=False):
                sec_title = ''.join(sec.title.strings)
                content = ''.join(
                    ''.join(paragraph.strings)
                    for paragraph in sec.find_all('p'))
                sections.append(Section(name=sec_title, content=content))
        except AttributeError:
            pass

        yield Document(
            identifier=file_id,
            title=article_title,
            sections=sections,
            # Keyword is ``collection`` (singular), as in every other
            # Document construction in this code base
            collection=settings.COLLECTION_NAME,
        )