def _parse(self, enclosing_session=None, verbose=False):
    """ Parse the article content to yield parse trees and an annotated token list """
    with SessionContext(enclosing_session) as session:
        # Convert the content soup to a token iterable (generator)
        toklist = Fetcher.tokenize_html(self._url, self._html, session)
        bp = self.get_parser()
        ip = IncrementalParser(bp, toklist, verbose=verbose)
        # List of paragraphs, each containing a list of sentence token
        # lists in string dump format
        # (1-based paragraph and sentence indices)
        pgs = []
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        # Word stem dictionary, indexed by (stem, cat)
        words = defaultdict(int)
        num_sent = 0
        for p in ip.paragraphs():
            pgs.append([])
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                # We don't attempt to parse very long sentences
                # (>100 tokens) since they are memory intensive
                # (>16 GB) and may take minutes to process
                if num_tokens <= MAX_SENTENCE_TOKENS and sent.parse():
                    # Obtain a dict representation of the tokens,
                    # annotated with the parse
                    token_dicts = TreeUtility.dump_tokens(
                        sent.tokens, sent.tree, words)
                    # Create a verbose text representation of
                    # the highest scoring parse tree
                    tree = ParseForestDumper.dump_forest(
                        sent.tree, token_dicts=token_dicts)
                    # Add information about the sentence tree's score
                    # and the number of tokens
                    trees[num_sent] = "\n".join(
                        ["C{0}".format(sent.score),
                         "L{0}".format(num_tokens), tree])
                else:
                    # Error, sentence too long or no parse:
                    # add an error index entry for this sentence
                    if num_tokens > MAX_SENTENCE_TOKENS:
                        # Set the error index at the first token
                        # outside the maximum limit
                        eix = MAX_SENTENCE_TOKENS
                    else:
                        eix = sent.err_index
                    token_dicts = TreeUtility.dump_tokens(
                        sent.tokens, None, None, eix)
                    trees[num_sent] = "E{0}".format(eix)
                pgs[-1].append(token_dicts)
        self._parsed = datetime.utcnow()
        self._parser_version = bp.version
        self._num_tokens = ip.num_tokens
        self._num_sentences = ip.num_sentences
        self._num_parsed = ip.num_parsed
        self._ambiguity = ip.ambiguity
        # Make one big JSON string for the paragraphs, sentences and tokens
        self._raw_tokens = pgs
        self._tokens = json.dumps(
            pgs, separators=(",", ":"), ensure_ascii=False)
        # Keep the bag of words (stem, category, count for each word)
        self._words = words
        # Create a tree representation string out of all
        # the accumulated parse trees
        self._tree = "".join(
            "S{0}\n{1}\n".format(key, val)
            for key, val in trees.items())
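
# --- Illustration (not part of the original module) ----------------------
# A minimal sketch of how the tree dump string assembled into self._tree
# above can be split back into per-sentence records. It relies only on the
# format produced by _parse(): an "S{n}" header line per sentence, followed
# either by "C{score}" / "L{num_tokens}" lines and the dumped parse forest,
# or by a single "E{index}" error line. The function name and the returned
# dict are hypothetical, not part of the module's API, and the sketch
# assumes that forest dump lines never match the bare S<number> header
# pattern themselves.

def read_tree_dump(tree_dump):
    """ Split a _parse() tree dump into {sentence_index: dump_lines} """
    sentences = {}
    index, lines = None, []
    for line in tree_dump.splitlines():
        if line.startswith("S") and line[1:].isdigit():
            # New sentence header: flush the previous record
            if index is not None:
                sentences[index] = "\n".join(lines)
            index, lines = int(line[1:]), []
        elif index is not None:
            lines.append(line)
    if index is not None:
        sentences[index] = "\n".join(lines)
    return sentences

# Example: one parsed sentence (score 42, 7 tokens) and one failed
# sentence whose error index was 3:
# read_tree_dump("S1\nC42\nL7\n<forest>\nS2\nE3\n")
#   -> {1: "C42\nL7\n<forest>", 2: "E3"}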