Example #1
import pickle


def build_dev_vocab(questions, contexts):
    """Build the dev vocabulary; most of it comes from the training vocabulary."""
    # TODO: Needs train vocab
    with open('train_vocab.pkl', 'rb') as f:
        tr_vocab = pickle.load(f)
    existing_vocab = set(tr_vocab)
    glove_vocab = load_glove_vocab('./glove/glove.840B.300d.txt',
                                   300)  # returns a set of vocabulary words
    # Keep dev-only words that also have a GloVe embedding.
    new_vocab = list({
        w for doc in questions + contexts for w in doc
        if w not in existing_vocab and w in glove_vocab
    })
    vocab = tr_vocab + new_vocab
    print('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
    return vocab
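
build_dev_vocab relies on a load_glove_vocab helper that only collects the word set from the embeddings file; the vectors themselves are read later, once the final vocabulary is fixed. A minimal sketch of such a helper, assuming a standard whitespace-separated GloVe text file (the project's actual implementation may differ):

def load_glove_vocab(wv_file, wv_dim):
    """Return the set of words that appear in a GloVe text file (vectors are ignored)."""
    vocab = set()
    with open(wv_file, encoding='utf-8') as f:
        for line in f:
            # Each line is "<word> <wv_dim floats>"; everything before the last
            # wv_dim fields is the (possibly multi-token) word itself.
            elems = line.rstrip().split(' ')
            vocab.add(' '.join(elems[:-wv_dim]))
    return vocab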
Example #2
trn_file = 'CoQA/train.json'
dev_file = 'CoQA/dev.json'
wv_file = args.wv_file
wv_dim = args.wv_dim
nlp = spacy.load('vi_spacy_model', disable=['parser'])

random.seed(args.seed)
np.random.seed(args.seed)

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)

log.info('start data preparing... (using {} threads)'.format(args.threads))

glove_vocab = load_glove_vocab(wv_file, wv_dim)  # returns a set of vocabulary words
log.info('glove loaded.')


# ===============================================================
# =================== Work on training data =====================
# ===============================================================

def proc_train(ith, article):
    rows = []
    context = article['story']

    for j, (question, answers) in enumerate(zip(article['questions'], article['answers'])):
        gold_answer = answers['input_text']  # free-form answer written by the annotator
        span_answer = answers['span_text']   # supporting span copied from the passage
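
The log line above reports "(using {} threads)", which suggests that proc_train is applied to each CoQA article in parallel. A possible dispatch over a multiprocessing pool, assuming a hypothetical train_data list of parsed articles; the actual script may organize this step differently:

from multiprocessing import Pool

with Pool(args.threads) as pool:
    # proc_train(ith, article) returns a list of rows per article;
    # starmap preserves article order, and the per-article lists are then flattened.
    results = pool.starmap(proc_train, enumerate(train_data))
rows = [row for article_rows in results for row in article_rows]
log.info('got {} training rows'.format(len(rows)))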
Example #3
wv_file = args.wv_file
wv_dim = args.wv_dim

nlp = spacy.load('en', disable=['parser'])

random.seed(args.seed)
np.random.seed(args.seed)

logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.DEBUG,
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)

log.info('start data preparing... (using {} threads)'.format(args.threads))
# only builds a set-typed vocabulary; no vectors are attached at this point
glove_vocab = load_glove_vocab(wv_file, wv_dim)
log.info('glove loaded.')

# ===============================================================
# =================== Work on training data =====================
# ===============================================================


def proc_train(ith, article):
    rows = []

    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            answers = qa['orig_answer']  # the original answer-span annotation for this question
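
For reference, proc_train in this example walks a SQuAD-style JSON layout: each article holds paragraphs, each paragraph a context string and a list of question/answer pairs. A hypothetical minimal article illustrating the keys the loop accesses; the 'text' and 'answer_start' fields inside 'orig_answer' are assumptions, since the snippet is cut off before that dictionary is used:

article = {
    'paragraphs': [{
        'context': 'GloVe vectors were trained on Common Crawl.',
        'qas': [{
            'question': 'What corpus were the vectors trained on?',
            # assumed shape: raw answer span plus its character offset in the context
            'orig_answer': {'text': 'Common Crawl', 'answer_start': 30},
        }],
    }],
}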