Exemplo n.º 1
0
    def prep_input(self, read_list):
        """Prepare the list of files or text content objects to be read.

        Every item of `read_list` is converted into an nxml file on disk and
        the path of that file is collected.

        Parameters
        ----------
        read_list : iterable
            Each item is either a file path given as a `str`, or an object
            exposing `format`, `content` and `id` attributes. For such
            objects `content` is decompressed with zlib using gzip framing,
            and `id` is formatted with `%d` into the output file name.

        Returns
        -------
        list of str
            The paths of the nxml files to read, in the order of `read_list`.

        Raises
        ------
        SparserError
            If an item is neither a `str` nor an object with the expected
            attributes, or if a content object has an unrecognized format.
        """
        logger.info('Prepping input for sparser.')

        file_list = []

        def add_nxml_file(tcid, nxml_bts):
            # Dump the nxml bytes into the input directory under a name
            # derived from the content id, and record the new path.
            fpath = path.join(self.input_dir, 'PMC%d.nxml' % tcid)
            with open(fpath, 'wb') as f:
                f.write(nxml_bts)
            file_list.append(fpath)

        for item in read_list:
            if isinstance(item, str):
                # This implies that it is a file path
                fpath = item.strip()
                fname = path.basename(fpath)
                if fpath.endswith('.nxml'):
                    # If it is already an nxml, we just need to adjust the
                    # name a bit, if anything. The prefix is checked on the
                    # file name, not the whole path, so that a file such as
                    # '/some/dir/PMC1.nxml' is not renamed 'PMCPMC1.nxml'.
                    if fname.startswith('PMC'):
                        file_list.append(fpath)
                    else:
                        new_fpath = path.join(self.tmp_dir, 'PMC' + fname)
                        shutil.copy(fpath, new_fpath)
                        file_list.append(new_fpath)
                else:
                    # Otherwise we need to frame the content in xml and put it
                    # in a new file with the appropriate name. splitext keeps
                    # the full stem even when the name has no extension
                    # (splitting on '.' would leave just 'nxml' in that case).
                    new_fname = path.splitext(fname)[0] + '.nxml'
                    new_fpath = path.join(self.tmp_dir, new_fname)
                    with open(fpath, 'r') as f_old:
                        content = f_old.read()
                    nxml_str = sparser.make_nxml_from_text(content)
                    with open(new_fpath, 'w') as f_new:
                        f_new.write(nxml_str)
                    file_list.append(new_fpath)
            elif all(hasattr(item, a) for a in ('format', 'content', 'id')):
                # This implies that it is a text content object, or something
                # with a matching API.
                if item.format == formats.XML:
                    # 16 + MAX_WBITS tells zlib to expect a gzip header.
                    add_nxml_file(
                        item.id,
                        zlib.decompress(item.content, 16 + zlib.MAX_WBITS))
                elif item.format == formats.TEXT:
                    txt_bts = zlib.decompress(item.content,
                                              16 + zlib.MAX_WBITS)
                    nxml_str = sparser.make_nxml_from_text(
                        txt_bts.decode('utf8'))
                    add_nxml_file(item.id, nxml_str.encode('utf8'))
                else:
                    raise SparserError("Unrecognized format %s." % item.format)
            else:
                raise SparserError("Unknown type of item for reading %s." %
                                   type(item))
        return file_list
Exemplo n.º 2
0
    def prep_input(self, content_iter):
        """Prepare the list of files or text content objects to be read.

        Each content object is written into `self.tmp_dir` as an nxml file
        and the resulting path is appended to `self.file_list`, which is
        reset at the start of every call. Content for which
        `self._check_content` reports a problem is skipped with a warning.

        Parameters
        ----------
        content_iter : iterable
            Content objects providing `get_text`, `get_id`, `get_filename`,
            `is_format`, `change_id` and `copy_to`.

        Returns
        -------
        None
            The prepared paths are stored on `self.file_list`.

        Raises
        ------
        SparserError
            If a content object is neither nxml nor plain text.
        """
        logger.info('Prepping input for sparser.')

        self.file_list = []

        for content in content_iter:
            # Fetch the text once; it is needed both for the quality check
            # and, for plain text content, for the nxml conversion.
            text = content.get_text()
            quality_issue = self._check_content(text)
            if quality_issue is not None:
                # Ids are not necessarily integers (this method itself builds
                # 'PMC...' string ids), so format with %s rather than %d.
                logger.warning("Skipping %s due to: %s",
                               content.get_id(), quality_issue)
                continue

            if content.is_format('nxml'):
                # If it is already an nxml, we just need to adjust the
                # name a bit, if anything.
                if not content.get_filename().startswith('PMC'):
                    content.change_id('PMC' + str(content.get_id()))
                fpath = content.copy_to(self.tmp_dir)
                self.file_list.append(fpath)
            elif content.is_format('txt', 'text'):
                # Otherwise we need to frame the content in xml and put it
                # in a new file with the appropriate name.
                nxml_str = sparser.make_nxml_from_text(text)
                new_content = Content.from_string(
                    'PMC' + str(content.get_id()), 'nxml', nxml_str)
                fpath = new_content.copy_to(self.tmp_dir)
                self.file_list.append(fpath)
            else:
                # NOTE(review): this reads `content.format` directly while the
                # rest of the method goes through accessor methods - confirm
                # that the attribute exists on the content type.
                raise SparserError("Unrecognized format %s." % content.format)
Exemplo n.º 3
0
    def prep_input(self, read_list):
        """Prepare the list of files or text content objects to be read.

        Resets `self.file_list`, then for every content object in `read_list`
        places an nxml file in `self.tmp_dir` and records its path. Objects
        that fail `self._check_content` are logged and left out.

        Parameters
        ----------
        read_list : iterable
            Content objects providing `get_text`, `get_id`, `get_filename`,
            `is_format`, `change_id` and `copy_to`.

        Returns
        -------
        None
            The prepared paths are stored on `self.file_list`.

        Raises
        ------
        SparserError
            If a content object is neither nxml nor plain text.
        """
        logger.info('Prepping input for sparser.')

        self.file_list = []

        for content in read_list:
            # The text is needed for the quality check and again when plain
            # text gets wrapped in nxml, so fetch it a single time.
            text = content.get_text()
            quality_issue = self._check_content(text)
            if quality_issue is not None:
                # %s rather than %d: ids may be strings (see the 'PMC...' ids
                # built below), and %d would raise TypeError for those.
                logger.warning("Skipping %s due to: %s",
                               content.get_id(), quality_issue)
                continue

            if content.is_format('nxml'):
                # If it is already an nxml, we just need to adjust the
                # name a bit, if anything.
                if not content.get_filename().startswith('PMC'):
                    content.change_id('PMC' + str(content.get_id()))
                fpath = content.copy_to(self.tmp_dir)
                self.file_list.append(fpath)
            elif content.is_format('txt', 'text'):
                # Otherwise we need to frame the content in xml and put it
                # in a new file with the appropriate name.
                nxml_str = sparser.make_nxml_from_text(text)
                new_content = Content.from_string('PMC' + str(content.get_id()),
                                                  'nxml', nxml_str)
                fpath = new_content.copy_to(self.tmp_dir)
                self.file_list.append(fpath)
            else:
                # NOTE(review): `content.format` is accessed as a plain
                # attribute here, unlike the accessor calls above - confirm
                # the content type actually exposes it.
                raise SparserError("Unrecognized format %s."
                                   % content.format)