def streaming_create_tsv_reader(func, line, polymath, seqs, num_workers, is_test=False, misc=None):
    """Convert a single TSV line into a feed dict for ``func``.

    Mirrors ``create_tsv_reader`` but processes exactly one line (a batch of
    size one) instead of streaming batches from a file.

    Args:
        func: CNTK function whose arguments are resolved via ``argument_by_name``.
        line: one raw TSV line.
        polymath: config/model object providing ``vocab``, ``chars``,
            ``wg_dim``, ``wn_dim`` and ``word_size``.
        seqs: unused here; kept for signature parity with ``create_tsv_reader``.
        num_workers: unused here; kept for signature parity.
        is_test: forwarded to ``tsv2ctf.tsv_iter``.
        misc: optional dict forwarded to ``tsv2ctf.tsv_iter``.

    Returns:
        dict mapping each of ``func``'s named arguments to its input value
        (CNTK one-hot Values for words, numpy arrays for chars and answers).
    """
    ctokens, qtokens, atokens, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
        line, polymath.vocab, polymath.chars, polymath.wg_dim, is_test, misc)

    # Single-entry "batch": one sequence per field.
    # NOTE(review): the original guarded on `len(batch['cwids']) > 0`, which
    # was always true here (an entry was appended unconditionally just
    # before), so the dead check has been removed.
    batch = {'cwids': [cwids], 'qwids': [qwids], 'baidx': [baidx],
             'eaidx': [eaidx], 'ccids': [ccids], 'qcids': [qcids]}

    # Word ids below wg_dim stay in the wg one-hot space; ids at or above
    # wg_dim are re-based (i - wg_dim) into the wn one-hot space. Each id
    # therefore appears in exactly one of the two encodings, with
    # ONE_HOT_SKIP in the other.
    context_g_words = C.Value.one_hot(
        [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in ids]
         for ids in batch['cwids']], polymath.wg_dim)
    context_ng_words = C.Value.one_hot(
        [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in ids]
         for ids in batch['cwids']], polymath.wn_dim)
    query_g_words = C.Value.one_hot(
        [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in ids]
         for ids in batch['qwids']], polymath.wg_dim)
    query_ng_words = C.Value.one_hot(
        [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in ids]
         for ids in batch['qwids']], polymath.wn_dim)

    # Character ids are right-padded with zeros up to word_size.
    context_chars = [
        np.asarray([[[c for c in cc + [0] * max(0, polymath.word_size - len(cc))]]
                    for cc in ccid], dtype=np.float32)
        for ccid in batch['ccids']]
    query_chars = [
        np.asarray([[[c for c in qc + [0] * max(0, polymath.word_size - len(qc))]]
                    for qc in qcid], dtype=np.float32)
        for qcid in batch['qcids']]
    answer_begin = [np.asarray(ab, dtype=np.float32) for ab in batch['baidx']]
    answer_end = [np.asarray(ae, dtype=np.float32) for ae in batch['eaidx']]

    return {argument_by_name(func, 'cgw'): context_g_words,
            argument_by_name(func, 'qgw'): query_g_words,
            argument_by_name(func, 'cnw'): context_ng_words,
            argument_by_name(func, 'qnw'): query_ng_words,
            argument_by_name(func, 'cc'): context_chars,
            argument_by_name(func, 'qc'): query_chars,
            argument_by_name(func, 'ab'): answer_begin,
            argument_by_name(func, 'ae'): answer_end}
def create_tsv_reader(func, tsv_file, polymath, seqs, num_workers, is_test=False, misc=None):
    """Yield feed dicts for ``func`` from batches of up to ``seqs`` TSV lines.

    NOTE(review): a second definition of ``create_tsv_reader`` appears later
    in this file and shadows this one at import time — confirm which version
    is intended and remove the other.

    Args:
        func: CNTK function whose arguments are resolved via ``argument_by_name``.
        tsv_file: path to the UTF-8 TSV data file.
        polymath: config/model object providing ``vocab``, ``chars``,
            ``wg_dim``, ``wn_dim`` and ``word_size``.
        seqs: maximum number of lines per yielded batch.
        num_workers: number of distributed workers; iteration continues past
            EOF until the batch count is a multiple of this, so every worker
            receives the same number of batches.
        is_test: forwarded to ``tsv2ctf.tsv_iter``.
        misc: optional dict; when given, the leading uid column of each line
            is appended to ``misc['uid']`` and ``misc`` is forwarded to
            ``tsv2ctf.tsv_iter``.

    Yields:
        dict mapping ``func``'s arguments to batch inputs, or an empty dict
        for the trailing padding batches needed by distributed training.
    """
    import re
    # Leading tab-delimited column (the uid). Compiled once, outside the
    # per-line loop, instead of re-imported/re-matched each iteration.
    uid_pattern = re.compile(r'^([^\t]*)')
    with open(tsv_file, 'r', encoding='utf-8') as f:
        eof = False
        batch_count = 0
        # Keep yielding (possibly empty) batches until EOF *and* batch_count
        # is divisible by num_workers, keeping distributed workers in step.
        while not (eof and (batch_count % num_workers) == 0):
            batch_count += 1
            batch = {'cwids': [], 'qwids': [], 'baidx': [],
                     'eaidx': [], 'ccids': [], 'qcids': []}
            while not eof and len(batch['cwids']) < seqs:
                line = f.readline()
                if not line:
                    eof = True
                    break
                if misc is not None:
                    misc['uid'].append(uid_pattern.match(line).groups()[0])
                ctokens, qtokens, atokens, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
                    line, polymath.vocab, polymath.chars, polymath.wg_dim, is_test, misc)
                batch['cwids'].append(cwids)
                batch['qwids'].append(qwids)
                batch['baidx'].append(baidx)
                batch['eaidx'].append(eaidx)
                batch['ccids'].append(ccids)
                batch['qcids'].append(qcids)
            if len(batch['cwids']) > 0:
                # Word ids below wg_dim stay in the wg one-hot space; ids at
                # or above wg_dim are re-based (i - wg_dim) into the wn space.
                context_g_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in cwids]
                     for cwids in batch['cwids']], polymath.wg_dim)
                context_ng_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in cwids]
                     for cwids in batch['cwids']], polymath.wn_dim)
                query_g_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in qwids]
                     for qwids in batch['qwids']], polymath.wg_dim)
                query_ng_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in qwids]
                     for qwids in batch['qwids']], polymath.wn_dim)
                # Character ids are right-padded with zeros up to word_size.
                context_chars = [
                    np.asarray([[[c for c in cc + [0] * max(0, polymath.word_size - len(cc))]]
                                for cc in ccid], dtype=np.float32)
                    for ccid in batch['ccids']]
                query_chars = [
                    np.asarray([[[c for c in qc + [0] * max(0, polymath.word_size - len(qc))]]
                                for qc in qcid], dtype=np.float32)
                    for qcid in batch['qcids']]
                answer_begin = [np.asarray(ab, dtype=np.float32) for ab in batch['baidx']]
                answer_end = [np.asarray(ae, dtype=np.float32) for ae in batch['eaidx']]
                yield {argument_by_name(func, 'cgw'): context_g_words,
                       argument_by_name(func, 'qgw'): query_g_words,
                       argument_by_name(func, 'cnw'): context_ng_words,
                       argument_by_name(func, 'qnw'): query_ng_words,
                       argument_by_name(func, 'cc'): context_chars,
                       argument_by_name(func, 'qc'): query_chars,
                       argument_by_name(func, 'ab'): answer_begin,
                       argument_by_name(func, 'ae'): answer_end}
            else:
                yield {}  # need to generate empty batch for distributed training
def create_tsv_reader(func, tsv_file, polymath, seqs, num_workers, is_test=False, misc=None):
    """Yield feed dicts for ``func`` from batches of up to ``seqs`` TSV lines.

    NOTE(review): this is a duplicate definition — an earlier
    ``create_tsv_reader`` exists in this file and is shadowed by this one.
    Confirm which version is intended and remove the other.

    Args:
        func: CNTK function whose arguments are resolved via ``argument_by_name``.
        tsv_file: path to the UTF-8 TSV data file.
        polymath: config/model object providing ``vocab``, ``chars``,
            ``wg_dim``, ``wn_dim`` and ``word_size``.
        seqs: maximum number of lines per yielded batch.
        num_workers: number of distributed workers; iteration continues past
            EOF until the batch count is a multiple of this, so every worker
            receives the same number of batches.
        is_test: forwarded to ``tsv2ctf.tsv_iter``.
        misc: optional dict; when given, the leading uid column of each line
            is appended to ``misc['uid']`` and ``misc`` is forwarded to
            ``tsv2ctf.tsv_iter``.

    Yields:
        dict mapping ``func``'s arguments to batch inputs, or an empty dict
        for the trailing padding batches needed by distributed training.
    """
    import re
    # Leading tab-delimited column (the uid), compiled once outside the loop.
    uid_pattern = re.compile(r'^([^\t]*)')
    with open(tsv_file, 'r', encoding='utf-8') as f:
        eof = False
        batch_count = 0
        # Keep yielding (possibly empty) batches until EOF *and* batch_count
        # is divisible by num_workers, keeping distributed workers in step.
        while not (eof and (batch_count % num_workers) == 0):
            batch_count += 1
            batch = {'cwids': [], 'qwids': [], 'baidx': [],
                     'eaidx': [], 'ccids': [], 'qcids': []}
            while not eof and len(batch['cwids']) < seqs:
                line = f.readline()
                if not line:
                    eof = True
                    break
                if misc is not None:
                    misc['uid'].append(uid_pattern.match(line).groups()[0])
                # FIX(review): the original call here omitted polymath.wg_dim
                # (it passed only vocab, chars, is_test, misc), unlike both
                # other tsv_iter call sites in this file, which would shift
                # is_test/misc into the wrong parameters. Made consistent —
                # confirm against tsv2ctf.tsv_iter's signature.
                ctokens, qtokens, atokens, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
                    line, polymath.vocab, polymath.chars, polymath.wg_dim, is_test, misc)
                batch['cwids'].append(cwids)
                batch['qwids'].append(qwids)
                batch['baidx'].append(baidx)
                batch['eaidx'].append(eaidx)
                batch['ccids'].append(ccids)
                batch['qcids'].append(qcids)
            if len(batch['cwids']) > 0:
                # Word ids below wg_dim stay in the wg one-hot space; ids at
                # or above wg_dim are re-based (i - wg_dim) into the wn space.
                context_g_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in cwids]
                     for cwids in batch['cwids']], polymath.wg_dim)
                context_ng_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in cwids]
                     for cwids in batch['cwids']], polymath.wn_dim)
                query_g_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i >= polymath.wg_dim else i for i in qwids]
                     for qwids in batch['qwids']], polymath.wg_dim)
                query_ng_words = C.Value.one_hot(
                    [[C.Value.ONE_HOT_SKIP if i < polymath.wg_dim else i - polymath.wg_dim for i in qwids]
                     for qwids in batch['qwids']], polymath.wn_dim)
                # Character ids are right-padded with zeros up to word_size.
                context_chars = [
                    np.asarray([[[c for c in cc + [0] * max(0, polymath.word_size - len(cc))]]
                                for cc in ccid], dtype=np.float32)
                    for ccid in batch['ccids']]
                query_chars = [
                    np.asarray([[[c for c in qc + [0] * max(0, polymath.word_size - len(qc))]]
                                for qc in qcid], dtype=np.float32)
                    for qcid in batch['qcids']]
                answer_begin = [np.asarray(ab, dtype=np.float32) for ab in batch['baidx']]
                answer_end = [np.asarray(ae, dtype=np.float32) for ae in batch['eaidx']]
                yield {argument_by_name(func, 'cgw'): context_g_words,
                       argument_by_name(func, 'qgw'): query_g_words,
                       argument_by_name(func, 'cnw'): context_ng_words,
                       argument_by_name(func, 'qnw'): query_ng_words,
                       argument_by_name(func, 'cc'): context_chars,
                       argument_by_name(func, 'qc'): query_chars,
                       argument_by_name(func, 'ab'): answer_begin,
                       argument_by_name(func, 'ae'): answer_end}
            else:
                yield {}  # need to generate empty batch for distributed training