Example #1
def __init__(self, name, document, vocab, index):
    self.index = index
    self.name = name
    # Map every token in the document to its vocabulary index
    # (np is numpy). `if vocab.get(word)` skips words missing from
    # the vocab, but note it also drops any word mapped to index 0.
    self.words = np.array([
        vocab.get(word)
        for sentence in to_raw_text_markupless(document)
        for word in sentence
        if vocab.get(word)
    ], dtype='int32')
    self.size = len(self.words)
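Every example on this page relies on the same return shape: to_raw_text_markupless takes a text string and yields a list of sentences, each a list of token strings. A minimal sketch of that contract (assuming the function is imported from the xml_cleaner package, as in the Dali scripts; the exact tokenization shown is illustrative):

from xml_cleaner import to_raw_text_markupless

sentences = to_raw_text_markupless("Hello world. How are you?")
# A list of sentences, each a list of tokens, e.g.:
# [['Hello', 'world', '.'], ['How', 'are', 'you', '?']]
for sentence in sentences:
    print(" ".join(sentence))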
Example #2
File: parser.py Project: bhack/Dali
def to_token_string(text):
    # Tokenize, then flatten the list of sentences back into one
    # space-separated string of tokens.
    tokens = to_raw_text_markupless(text)
    tokens = [' '.join(sentence_tokens) for sentence_tokens in tokens]
    tokens = ' '.join(tokens)
    return tokens
Example #3
def tokenize_and_write(file, text, token):
    # Write each tokenized sentence to `file`, separated by `token`.
    for sentence in to_raw_text_markupless(text):
        file.write(" ".join(sentence))
        file.write(token)
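A hypothetical call, with a made-up file name and a newline as the separator, so each tokenized sentence lands on its own line:

with open("tokens.txt", "wt") as f:
    tokenize_and_write(f, "First sentence. Second sentence.", "\n")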
Example #4
from collections import Counter

def collect_counts(documents):
    # Count token frequencies across every document in the mapping.
    vocab = Counter()
    for value in documents.values():
        vocab.update(word for sentence in to_raw_text_markupless(value)
                     for word in sentence)
    return vocab
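A hypothetical usage sketch with made-up documents; the exact counts depend on how the tokenizer splits the text:

documents = {
    "doc1": "The cat sat on the mat.",
    "doc2": "The cat ran away.",
}
vocab = collect_counts(documents)
print(vocab.most_common(2))  # e.g. [('The', 2), ('cat', 2)]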
Example #5
def tokenize_sentences(text):
    # The input packs several sentences into one tab-separated string.
    sentences = text.strip().split("\t")
    gen_sentences = [" ".join(tsentence) for sentence in sentences
                     for tsentence in to_raw_text_markupless(sentence)]
    # The first two sentences stay tab-separated; the rest are appended
    # space-joined (note: no separator between the two parts).
    return "\t".join(gen_sentences[0:2]) + " ".join(gen_sentences[2:])
Example #6
    print("Generated %d question answer pairs" % (len(output_content) ))
    print("Skipped %d pairs because of answer shorter than %d words" % (num_too_short, MIN_ANSWER_LENGTH))
    print("Skipped %d because of encoding issues." % (num_nonascii,))

    num_valid = 0
    num_train = 0

    with open(VALIDATE_FILE, 'wt') as fvalid:
        with open(TRAIN_FILE, 'wt') as ftrain:

            for i, qa in enumerate(output_content):
                question, answer = qa
                print_progress(i, len(output_content))
                question_tokens = []
                answer_tokens = []
                for line in to_raw_text_markupless(question):
                    question_tokens.extend(line)
                for line in to_raw_text_markupless(answer):
                    answer_tokens.extend(line)

                output_line = '%s\t%s\n' % (' '.join(question_tokens), ' '.join(answer_tokens))
                if random.random() < VALIDATION_SIZE:
                    fvalid.write(output_line)
                    num_valid += 1
                else:
                    ftrain.write(output_line)
                    num_train += 1



    print("Saved %d pairs in %s" % (num_train, TRAIN_FILE))