def prep_input(self, read_list):
    """Prepare the list of files or text content objects to be read.

    Every item of `read_list` is turned into an nxml file on disk and the
    path of that file is collected.

    Parameters
    ----------
    read_list : iterable
        Each item is either a `str` file path, or an object exposing
        `format`, `content` and `id` attributes (a text content object or
        something with a matching API).

    Returns
    -------
    list of str
        Paths to the nxml files, in the order the items were given.

    Raises
    ------
    SparserError
        If a content object has a format other than `formats.XML` or
        `formats.TEXT`, or if an item is neither a path nor a content
        object.
    """
    logger.info('Prepping input for sparser.')

    file_list = []

    def add_nxml_file(tcid, nxml_bts):
        # Write raw nxml bytes to the input dir under a PMC-style name.
        # NOTE(review): '%d' requires a numeric id -- confirm with callers.
        fpath = path.join(self.input_dir, 'PMC%d.nxml' % tcid)
        with open(fpath, 'wb') as f:
            f.write(nxml_bts)
        file_list.append(fpath)

    for item in read_list:
        if isinstance(item, str):
            # This implies that it is a file path
            fpath = item.strip()
            fname = path.basename(fpath)
            if fpath.endswith('.nxml'):
                # If it is already an nxml, we just need to adjust the
                # name a bit, if anything. The check is made on the file
                # name, not the whole path, so that a file such as
                # '/some/dir/PMC123.nxml' is not renamed 'PMCPMC123.nxml'.
                if fname.startswith('PMC'):
                    file_list.append(fpath)
                else:
                    new_fpath = path.join(self.tmp_dir, 'PMC' + fname)
                    shutil.copy(fpath, new_fpath)
                    file_list.append(new_fpath)
            else:
                # Otherwise we need to frame the content in xml and put it
                # in a new file with the appropriate name. splitext keeps
                # the stem even when the original name has no extension.
                new_fname = path.splitext(fname)[0] + '.nxml'
                new_fpath = path.join(self.tmp_dir, new_fname)
                with open(fpath, 'r') as f_old:
                    content = f_old.read()
                nxml_str = sparser.make_nxml_from_text(content)
                with open(new_fpath, 'w') as f_new:
                    f_new.write(nxml_str)
                file_list.append(new_fpath)
        elif all(hasattr(item, a) for a in ['format', 'content', 'id']):
            # This implies that it is a text content object, or something
            # with a matching API. The content is decompressed with
            # wbits = 16 + MAX_WBITS, i.e. it is expected to be gzip data.
            if item.format == formats.XML:
                add_nxml_file(
                    item.id,
                    zlib.decompress(item.content, 16 + zlib.MAX_WBITS)
                )
            elif item.format == formats.TEXT:
                txt_bts = zlib.decompress(item.content, 16 + zlib.MAX_WBITS)
                nxml_str = sparser.make_nxml_from_text(
                    txt_bts.decode('utf8')
                )
                add_nxml_file(item.id, nxml_str.encode('utf8'))
            else:
                raise SparserError("Unrecognized format %s." % item.format)
        else:
            raise SparserError("Unknown type of item for reading %s." %
                               type(item))

    return file_list
def prep_input(self, content_iter):
    """Prepare the content objects to be read.

    Each content object that passes `self._check_content` is placed in
    `self.tmp_dir` as an nxml file, and the path returned by `copy_to` is
    recorded in `self.file_list` (which is reset at the start of the call).

    Parameters
    ----------
    content_iter : iterable
        Content objects. The methods used here are `get_text`, `get_id`,
        `get_filename`, `is_format`, `change_id` and `copy_to`.

    Returns
    -------
    None
        Results are stored on `self.file_list`.

    Raises
    ------
    SparserError
        If a content object is neither nxml nor plain text.
    """
    logger.info('Prepping input for sparser.')

    self.file_list = []

    for content in content_iter:
        quality_issue = self._check_content(content.get_text())
        if quality_issue is not None:
            # '%s' rather than '%d': ids are not guaranteed to be integers
            # (this method itself rewrites them to 'PMC...' strings below),
            # and '%d' would raise TypeError on a string id.
            logger.warning("Skipping %s due to: %s",
                           content.get_id(), quality_issue)
            continue

        if content.is_format('nxml'):
            # If it is already an nxml, we just need to adjust the
            # name a bit, if anything.
            if not content.get_filename().startswith('PMC'):
                content.change_id('PMC' + str(content.get_id()))
            fpath = content.copy_to(self.tmp_dir)
            self.file_list.append(fpath)
        elif content.is_format('txt', 'text'):
            # Otherwise we need to frame the content in xml and put it
            # in a new file with the appropriate name.
            nxml_str = sparser.make_nxml_from_text(content.get_text())
            new_content = Content.from_string(
                'PMC' + str(content.get_id()), 'nxml', nxml_str
            )
            fpath = new_content.copy_to(self.tmp_dir)
            self.file_list.append(fpath)
        else:
            raise SparserError("Unrecognized format %s." % content.format)
def prep_input(self, read_list):
    """Prepare the content objects to be read.

    `self.file_list` is reset and then filled with one path per accepted
    content object: the value returned by copying the (possibly converted)
    content into `self.tmp_dir`.

    Parameters
    ----------
    read_list : iterable
        Content objects. The methods used here are `get_text`, `get_id`,
        `get_filename`, `is_format`, `change_id` and `copy_to`.

    Returns
    -------
    None
        Results are stored on `self.file_list`.

    Raises
    ------
    SparserError
        If a content object is neither nxml nor plain text.
    """
    logger.info('Prepping input for sparser.')

    self.file_list = []

    for content in read_list:
        # Content flagged by the quality check is logged and left out.
        quality_issue = self._check_content(content.get_text())
        if quality_issue is not None:
            # The id is formatted with '%s' because it may be a string
            # (e.g. after a 'PMC' prefix has been added); '%d' would raise
            # TypeError in that case.
            logger.warning("Skipping %s due to: %s",
                           content.get_id(), quality_issue)
            continue

        if content.is_format('nxml'):
            # If it is already an nxml, we just need to adjust the
            # name a bit, if anything.
            if not content.get_filename().startswith('PMC'):
                content.change_id('PMC' + str(content.get_id()))
            to_copy = content
        elif content.is_format('txt', 'text'):
            # Otherwise we need to frame the content in xml and put it
            # in a new file with the appropriate name.
            nxml_str = sparser.make_nxml_from_text(content.get_text())
            to_copy = Content.from_string('PMC' + str(content.get_id()),
                                          'nxml', nxml_str)
        else:
            raise SparserError("Unrecognized format %s." % content.format)

        # Both accepted branches end the same way: copy, then record.
        self.file_list.append(to_copy.copy_to(self.tmp_dir))