def build_input_sequence(doc, vocab_ids):
  """Builds input sequence from file.

  Splits lines on whitespace. Treats punctuation as whitespace. For
  word-level sequences, only keeps terms that are in the vocab.

  Terms are added as token in the SequenceExample. The EOS_TOKEN is also
  appended. Label and weight features are set to 0.

  Args:
    doc: Document (defined in `document_generators`) from which to build the
      sequence.
    vocab_ids: dict<term, id>.

  Returns:
    SequenceWrapper (from the `data` module).
  """
  seq = data.SequenceWrapper()
  for token in document_generators.tokens(doc):
    # Out-of-vocab tokens are silently dropped rather than mapped to UNK.
    if token in vocab_ids:
      seq.add_timestep().set_token(vocab_ids[token])

  # Add EOS token to end.
  seq.add_timestep().set_token(vocab_ids[data.EOS_TOKEN])

  return seq
def fill_vocab_from_doc(doc, vocab_freqs, doc_counts):
  """Fills vocabulary and doc counts with tokens from doc.

  Args:
    doc: Document to read tokens from.
    vocab_freqs: dict<token, frequency count>
    doc_counts: dict<token, document count>

  Returns:
    None
  """
  seen_in_doc = set()
  for tok in document_generators.tokens(doc):
    # Skip tokens outside the vocab unless this doc may add new terms.
    if not (doc.add_tokens or tok in vocab_freqs):
      continue
    vocab_freqs[tok] += 1
    # Document frequency: count each distinct token once per document.
    if tok not in seen_in_doc:
      seen_in_doc.add(tok)
      doc_counts[tok] += 1