示例#1
0
    def get_documents(cls, file_descriptions):
        """
        Generates one Document per entry of ``file_descriptions``.

        Entries whose ``filename`` is None become abstract-only documents
        built from the entry's own ``abstract`` attribute. Every other entry
        is loaded from the JSON file at ``filename``; that file must provide
        a 'metadata' mapping (optionally holding an 'abstract' list of
        paragraphs) and a 'body_text' list of paragraphs, each paragraph
        carrying 'section' and 'text' keys.

        Consecutive body paragraphs that share the same section name are
        merged into one Section. Each paragraph is prefixed with a single
        space, so merged section content always starts with a space.

        Raises whatever ``open`` / ``json.load`` raise for unreadable or
        malformed files, and KeyError when an expected key is missing.
        """

        for file_description in file_descriptions:
            if file_description.filename is None:
                # Abstract-only document
                yield Document(
                    identifier=file_description.cord_uid,
                    title=file_description.title,
                    sections=[
                        Section(name='Abstract',
                                content=file_description.abstract)
                    ],
                    collection=settings.COLLECTION_NAME,
                )
                continue

            # Full-text document, loaded from filesystem. The file is closed
            # before anything is yielded, so no handle stays open while the
            # consumer is busy with the document (or abandons the generator).
            with open(file_description.filename,
                      'r',
                      errors=settings.ENCODING_ERRORS) as infile:
                data = json.load(infile)

            sections = []

            # Add the abstract
            if 'abstract' in data['metadata']:
                joined_text = ' '.join(
                    paragraph['text']
                    for paragraph in data['metadata']['abstract'])
                sections.append(Section(name='Abstract', content=joined_text))

            # Add sections from the article body. Paragraph texts are
            # collected in a list and joined once per section.
            body_paragraphs = data['body_text']
            current_section = None
            current_parts = []
            for paragraph in body_paragraphs:
                if paragraph['section'] != current_section:
                    if current_section is not None:
                        sections.append(
                            Section(name=current_section,
                                    content=''.join(current_parts)))
                    current_section = paragraph['section']
                    current_parts = []

                current_parts.append(' %s' % paragraph['text'])

            # Add the final section. An empty body has no final section;
            # without this guard a Section named None with empty content
            # would be appended.
            if body_paragraphs:
                sections.append(
                    Section(name=current_section,
                            content=''.join(current_parts)))

            yield Document(
                identifier=file_description.cord_uid,
                title=file_description.title,
                sections=sections,
                collection=settings.COLLECTION_NAME,
            )
示例#2
0
    def get_documents(cls, filenames):
        """
        Generates one Document per entry of ``filenames``.

        Despite the parameter name, each entry is a file description object
        exposing ``filename``, ``cord_uid``, ``title`` and ``abstract``.
        Entries whose ``filename`` is None become abstract-only documents.
        Every other entry is loaded from the JSON file at ``filename``; that
        file must provide a 'metadata' mapping (optionally holding an
        'abstract' list of paragraphs) and a 'body_text' list of paragraphs,
        each paragraph carrying 'section' and 'text' keys.

        Consecutive body paragraphs that share the same section name are
        merged into one Section. Each paragraph is prefixed with a single
        space and has every occurrence of 'q q' removed.

        Raises whatever ``open`` / ``json.load`` raise for unreadable or
        malformed files, and KeyError when an expected key is missing.
        """

        for filedesc in filenames:

            if filedesc.filename is None:
                # Abstract-only document
                yield Document(identifier=filedesc.cord_uid,
                               title=filedesc.title,
                               sections=[
                                   Section(name='Abstract',
                                           content=filedesc.abstract)
                               ],
                               collection=settings.COLLECTION_NAME)
                continue

            # The file is closed before anything is yielded, so no handle
            # stays open while the consumer is busy with the document.
            with open(filedesc.filename,
                      'r',
                      errors=settings.ENCODING_ERRORS) as infile:
                data = json.load(infile)

            sections = []

            if 'abstract' in data['metadata']:
                paragraph_text = ' '.join(
                    paragraph['text']
                    for paragraph in data['metadata']['abstract'])
                sections.append(
                    Section(name='Abstract', content=paragraph_text))

            body_paragraphs = data['body_text']
            current_section = None
            current_parts = []
            for paragraph in body_paragraphs:
                if paragraph['section'] != current_section:
                    if current_section is not None:
                        sections.append(
                            Section(name=current_section,
                                    content=''.join(current_parts)))
                    current_section = paragraph['section']
                    current_parts = []

                # The 'q q' removal is applied to each space-prefixed
                # paragraph on its own, never across paragraph boundaries.
                # NOTE(review): presumably 'q q' is an extraction artifact
                # in the source data -- confirm.
                current_parts.append(
                    (' %s' % paragraph['text']).replace('q q', ''))

            # An empty body has no final section; without this guard a
            # Section named None with empty content would be appended.
            if body_paragraphs:
                sections.append(
                    Section(name=current_section,
                            content=''.join(current_parts)))

            yield Document(identifier=filedesc.cord_uid,
                           title=filedesc.title,
                           sections=sections,
                           collection=settings.COLLECTION_NAME)
def get_topics(topfile, section='narrative'):
    """
    Yields one Document per <topic> element of the XML file ``topfile``.

    ``section`` picks which topic field becomes the content of the single
    Section in each Document: 'query' or 'question' select the field of that
    name, and any other value selects the narrative. The Section is always
    named after ``section`` as given. The Document title is the query text
    and its collection is 'topics'.
    """

    with open(topfile, 'r') as topics_file:
        parsed = BeautifulSoup(topics_file.read(), 'xml')

        for topic_tag in parsed.find_all('topic'):
            topic_number = topic_tag['number']

            query_text = topic_tag.query.string
            question_text = topic_tag.question.string
            narrative_text = topic_tag.narrative.string

            # First field whose name equals the selector wins; the narrative
            # is the fallback for every other selector value.
            selectable = (('query', query_text), ('question', question_text))
            chosen_text = next(
                (text for name, text in selectable if section == name),
                narrative_text,
            )

            only_section = Section(name=section, content=chosen_text)
            yield Document(
                identifier=topic_number,
                title=query_text,
                sections=[only_section],
                collection='topics',
            )
示例#4
0
def get_topics(topfile, section='summary'):
    """
    Yields one Document per <topic> element of the XML file ``topfile``.

    ``section`` picks which topic field becomes the content of the single
    Section in each Document: 'note' or 'description' select the field of
    that name, and any other value selects the summary. The Section is always
    named after ``section`` as given. The Document title is the topic's
    'type' attribute and its collection is 'topics'.
    """

    with open(topfile, 'r') as topics_file:
        parsed = BeautifulSoup(topics_file.read(), 'xml')

        for topic_tag in parsed.find_all('topic'):
            topic_number = topic_tag['number']
            kind_of_topic = topic_tag['type']

            note_text = topic_tag.note.string
            description_text = topic_tag.description.string
            summary_text = topic_tag.summary.string

            # First field whose name equals the selector wins; the summary
            # is the fallback for every other selector value.
            selectable = (('note', note_text),
                          ('description', description_text))
            chosen_text = next(
                (text for name, text in selectable if section == name),
                summary_text,
            )

            only_section = Section(name=section, content=chosen_text)
            yield Document(
                identifier=topic_number,
                title=kind_of_topic,
                sections=[only_section],
                collection='topics',
            )
示例#5
0
def get_covid_topics(topfile, topic_selector):
    """
    Gets the topics from the TREC-COVID dataset.

    Yields one Document per <topic> element of the XML file ``topfile``.
    ``topic_selector`` must be 'query', 'question' or 'narrative' and picks
    the topic field used as the content of the Document's single Section;
    that Section is named after the selector. The Document title is the
    query text and its collection is 'topics'.

    Raises TypeError for any other selector. Because this is a generator,
    the error surfaces when the first topic is produced, not at call time.
    """

    with open(topfile, 'r') as infile:
        soup = BeautifulSoup(infile.read(), 'xml')

    # The whole file has been read and parsed above, so it does not need to
    # stay open while topics are being yielded.
    for topic in soup.find_all('topic'):
        number = topic['number']

        query = topic.query.string
        question = topic.question.string
        narrative = topic.narrative.string

        if topic_selector == 'query':
            section_content = query
        elif topic_selector == 'question':
            section_content = question
        elif topic_selector == 'narrative':
            section_content = narrative
        else:
            # TypeError is kept (rather than ValueError) so existing callers
            # that catch it keep working.
            raise TypeError("Unsupported topic selector: %s"
                            % topic_selector)

        # These topics define no ``topic_type`` and this function has no
        # ``section`` variable; referencing them raised NameError. The query
        # text serves as title and the selector names the section.
        # NOTE(review): confirm the query is the intended title.
        yield Document(
            identifier=number,
            title=query,
            sections=[
                Section(name=topic_selector, content=section_content)
            ],
            collection='topics',
        )
示例#6
0
def get_cds_topics(topfile, topic_selector):
    """
    Gets the topics from the TREC CDS track.

    Yields one Document per <topic> element of the XML file ``topfile``.
    ``topic_selector`` must be 'note', 'description' or 'summary' and picks
    the topic field used as the content of the Document's single Section;
    that Section is named after the selector. The Document title is the
    topic's 'type' attribute and its collection is 'topics'.

    Raises TypeError for any other selector. Because this is a generator,
    the error surfaces when the first topic is produced, not at call time.
    """

    with open(topfile, 'r') as infile:
        soup = BeautifulSoup(infile.read(), 'xml')

    # The whole file has been read and parsed above, so it does not need to
    # stay open while topics are being yielded.
    for topic in soup.find_all('topic'):
        number = topic['number']
        topic_type = topic['type']

        note = topic.note.string
        description = topic.description.string
        summary = topic.summary.string

        if topic_selector == 'note':
            section_content = note
        elif topic_selector == 'description':
            section_content = description
        elif topic_selector == 'summary':
            section_content = summary
        else:
            # TypeError is kept (rather than ValueError) so existing callers
            # that catch it keep working.
            raise TypeError("Unsupported topic selector: %s"
                            % topic_selector)

        # The section is named after the selector; this function has no
        # ``section`` variable to take the name from.
        yield Document(
            identifier=number,
            title=topic_type,
            sections=[
                Section(name=topic_selector, content=section_content)
            ],
            collection='topics',
        )
示例#7
0
    def get_documents(cls, filenames):
        """
        Generates one Document per readable XML file in ``filenames``.

        The document identifier is the file's base name without extension.
        The title is the text of front / article-meta / title-group /
        article-title, falling back to the identifier when that path cannot
        be resolved. Each direct <sec> child of the body becomes a Section
        named after the section title, with the text of all <p> elements
        found anywhere inside it concatenated as content.

        Files that cannot be found are skipped with a warning instead of
        raising.
        """

        for filename in filenames:
            file_id = os.path.splitext(os.path.basename(filename))[0]

            # Only opening and reading are guarded, so FileNotFoundError is
            # caught for the missing input file alone. The file is closed
            # before parsing and before anything is yielded.
            try:
                with open(filename, 'r',
                          errors=settings.ENCODING_ERRORS) as infile:
                    raw_xml = infile.read()
            except FileNotFoundError:
                warnings.warn("Could not find file: %s" % filename)
                continue

            soup = BeautifulSoup(raw_xml, 'xml')

            # Get article title (if it exists)
            try:
                article_title = ''.join(
                    soup.front.find('article-meta')
                    .find('title-group')
                    .find('article-title')
                    .strings)
            except AttributeError:
                article_title = file_id

            # Get sections. Best effort: an AttributeError (presumably a
            # missing <body>, or a <sec> without a <title>) ends section
            # extraction, keeping the sections collected up to that point.
            sections = []
            try:
                for sec in soup.body.find_all('sec', recursive=False):
                    sec_title = ''.join(sec.title.strings)
                    content = ''.join(
                        ''.join(paragraph.strings)
                        for paragraph in sec.find_all('p'))
                    sections.append(
                        Section(name=sec_title, content=content))
            except AttributeError:
                pass

            # NOTE(review): the keyword was ``collections``; every other
            # Document(...) call visible in this file passes ``collection``.
            # Confirm against Document's signature.
            yield Document(
                identifier=file_id,
                title=article_title,
                sections=sections,
                collection=settings.COLLECTION_NAME,
            )