Example #1
File: gen_data.py  Project: Jmq14/models
def build_input_sequence(doc, vocab_ids):
  """Builds input sequence from file.

  Splits lines on whitespace. Treats punctuation as whitespace. For word-level
  sequences, only keeps terms that are in the vocab.

  Terms are added as tokens in the SequenceExample.  The EOS_TOKEN is also
  appended. Label and weight features are set to 0.

  Args:
    doc: Document (defined in `document_generators`) from which to build the
      sequence.
    vocab_ids: dict<term, id>.

  Returns:
    data.SequenceWrapper.
  """
  seq = data.SequenceWrapper()
  for token in document_generators.tokens(doc):
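    # Only in-vocabulary terms become timesteps; everything else is dropped.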
    if token in vocab_ids:
      seq.add_timestep().set_token(vocab_ids[token])

  # Add EOS token to end
  seq.add_timestep().set_token(vocab_ids[data.EOS_TOKEN])

  return seq
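
A minimal usage sketch (not part of the project): it assumes the project-internal modules `data` and `document_generators` are importable as they are in gen_data.py, and that `document_generators.documents(...)` yields the Document objects referenced above; the exact keyword arguments are illustrative assumptions.

import data
import document_generators

# vocab_ids maps each in-vocabulary term to an integer id. The EOS token must be
# present, because build_input_sequence always appends an EOS timestep.
vocab_ids = {'the': 0, 'movie': 1, 'was': 2, 'great': 3, data.EOS_TOKEN: 4}

for doc in document_generators.documents(dataset='train',
                                         include_unlabeled=False,
                                         include_validation=False):
  seq = build_input_sequence(doc, vocab_ids)
  # seq is a data.SequenceWrapper; out-of-vocabulary terms were dropped and a
  # final EOS timestep was appended.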
Example #2
File: gen_vocab.py  Project: Jmq14/models
def fill_vocab_from_doc(doc, vocab_freqs, doc_counts):
  """Fills vocabulary and doc counts with tokens from doc.

  Args:
    doc: Document to read tokens from.
    vocab_freqs: dict<token, frequency count>
    doc_counts: dict<token, document count>

  Returns:
    None
  """
  doc_seen = set()

  for token in document_generators.tokens(doc):
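    # Only documents with add_tokens may introduce new terms; other documents
    # only bump counts for terms already present in vocab_freqs.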
    if doc.add_tokens or token in vocab_freqs:
      vocab_freqs[token] += 1
    if token not in doc_seen:
      doc_counts[token] += 1
      doc_seen.add(token)
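
A minimal usage sketch (not part of the project) showing how the counts might be accumulated over a corpus: it assumes `document_generators.documents(...)` yields Document objects with an `add_tokens` attribute, as used above; the keyword arguments and the threshold are illustrative assumptions. defaultdict(int) is used so that the `+= 1` updates work for previously unseen terms.

from collections import defaultdict

import document_generators

vocab_freqs = defaultdict(int)  # term -> total occurrences across the corpus
doc_counts = defaultdict(int)   # term -> number of documents containing the term

for doc in document_generators.documents(dataset='train',
                                         include_unlabeled=True,
                                         include_validation=True):
  fill_vocab_from_doc(doc, vocab_freqs, doc_counts)

# Prune rare terms: keep only those that appear in more than doc_count_threshold
# documents (the threshold value here is illustrative).
doc_count_threshold = 1
vocab_freqs = {term: freq for term, freq in vocab_freqs.items()
               if doc_counts[term] > doc_count_threshold}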