def test_paragraph_splitting():
    """Check ``ParagraphSplitter.find_paragraphs`` against the text fixture.

    The fixture ``test_paragraph_splitting.txt`` is read from the directory
    containing this test module.  The test asserts that exactly 10
    paragraphs are found, that the i-th paragraph (1-based) contains the
    text ``'paragraph i'``, ends with a period and contains no newline, and
    that no non-whitespace characters are lost or added by the split.
    """
    splitter = ParagraphSplitter()

    # Locate the fixture relative to this file, not the working directory.
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, 'test_paragraph_splitting.txt')
    with open(fixture_path) as fixture:
        text = fixture.read()

    paragraphs = splitter.find_paragraphs(text)
    assert len(paragraphs) == 10

    for number, paragraph in enumerate(paragraphs, start=1):
        assert 'paragraph {}'.format(number) in paragraph
        assert paragraph[-1] == "."

        # Make sure newlines have been removed within each paragraph.
        assert '\n' not in paragraph

    # Make sure the total number of non-whitespace characters is the same
    # before and after splitting.
    stripped_text = re.sub(r'\s', '', text)
    stripped_paragraphs = ''.join([re.sub(r'\s', '', chunk)
                                   for chunk in paragraphs])
    assert len(stripped_text) == len(stripped_paragraphs)
def parse_document(self, doc_dict):
    """Syntactically parse a document, one paragraph at a time.

    The raw text is split into paragraphs with
    ``ParagraphSplitter.find_paragraphs``.  Each paragraph is parsed via
    the server when ``self._zpar_proxy`` is truthy, and otherwise via the
    shared library referenced by ``self._zpar_ref``.

    Parameters
    ----------
    doc_dict : dict
        Must provide the keys ``"doc_id"`` and ``"raw_text"``.

    Returns
    -------
    tuple
        ``(trees, starts_paragraph_list)``: the parse results of all
        paragraphs concatenated in order, and a parallel list of booleans
        that is ``True`` exactly where an entry of ``trees`` is the first
        one produced for its paragraph.

    Raises
    ------
    RuntimeError
        If a paragraph has to be parsed while ``self._zpar_proxy`` is
        falsy and ``self._zpar_ref`` is ``None``.
    """
    doc_id = doc_dict["doc_id"]
    logging.info('syntax parsing, doc_id = {}'.format(doc_id))

    # TODO should there be some extra preprocessing to deal with fancy
    # quotes, etc.? The tokenizer doesn't appear to handle it well
    paragraphs = ParagraphSplitter.find_paragraphs(doc_dict["raw_text"],
                                                   doc_id=doc_id)

    trees = []
    starts_paragraph_list = []
    skipped_a_paragraph = False

    for paragraph in paragraphs:
        if self._zpar_proxy:
            # Use the server when a proxy is available ...
            paragraph_trees = self._parse_document_via_server(paragraph,
                                                              doc_id)
        elif self._zpar_ref is None:
            # ... and give up if the shared library is missing as well.
            raise RuntimeError('The ZPar server is unavailable.')
        else:
            # ... otherwise use the shared library.
            paragraph_trees = self._parse_document_via_lib(paragraph,
                                                           doc_id)

        tree_count = len(paragraph_trees)
        if not tree_count > 0:
            # TODO add some sort of error flag to the dictionary for this
            # document?
            skipped_a_paragraph = True
            continue

        # Only the first tree of a paragraph is flagged as starting it.
        starts_paragraph_list.extend(
            [position == 0 for position in range(tree_count)])
        trees.extend(paragraph_trees)

    debug_message = 'starts_paragraph_list = {}, doc_id = {}'.format(
        starts_paragraph_list, doc_id)
    logging.debug(debug_message)

    # Check that either the number of True indicators in
    # starts_paragraph_list equals the number of paragraphs, or that the
    # syntax parser had to skip a paragraph entirely.
    paragraph_start_count = sum(starts_paragraph_list)
    assert (paragraph_start_count == len(paragraphs)
            or skipped_a_paragraph)
    assert len(trees) == len(starts_paragraph_list)

    return trees, starts_paragraph_list