def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10):
    dataset = json.load(open(data_path, 'r'))
    samples = []
    qn, an = 0, 0
    skipped = 0
    vocabulary = defaultdict(int)

    def __(s):
        # strip diacritics and normalise PTB-style quote tokens
        import unicodedata
        s = ''.join(c for c in unicodedata.normalize('NFKD', s)
                    if unicodedata.category(c) != 'Mn')
        return s.replace("``", '"').replace("''", '"')

    try:
        for aid, article in enumerate(tqdm(dataset['data'])):
            for pid, paragraph in enumerate(article['paragraphs']):
                context = TokenString(__(paragraph['context']), word_tokenize).delete_whitespace()
                questions = paragraph['qas']
                for token in context:
                    vocabulary[token] += 1

                for qid, qa in enumerate(questions):
                    log.debug('processing: {}.{}.{}'.format(aid, pid, qid))
                    q = TokenString(__(qa['question']), word_tokenize).delete_whitespace()
                    a = TokenString(__(qa['answers'][0]['text']),
                                    word_tokenize).delete_whitespace()  # simply ignore other answers
                    squad_id = qa['id']
                    for token in q:
                        vocabulary[token] += 1

                    # locate the answer span inside the tokenized context
                    indices = context.index(a)
                    if not indices:
                        log.debug(pformat(paragraph['context']))
                        log.debug(pformat(paragraph['qas'][qid]))
                        log.error('{}.{}.{} - "{}" not found in \n"{}"'.format(
                            aid, pid, qid, a.tokenized_string, context.tokenized_string))
                        skipped += 1
                        continue

                    a_start, a_end = indices
                    fields = (aid, pid, qid, squad_id, context, q, a,
                              list(range(a_start, a_end)))
                    _id = tuple(fields[i - 1] for i in ids)
                    samples.append(Sample(_id, *fields))
    except:
        skipped += 1
        log.exception('{}'.format(aid))

    print('skipped {} samples'.format(skipped))
    return samples, vocabulary
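# Usage sketch (illustrative, not from the original source): `ids` is assumed to be a
# 1-indexed selection over the `fields` tuple above, so ids=(1, 2, 3) keys each sample
# by (aid, pid, qid). The data path below is a placeholder for the SQuAD train JSON,
# and the _demo_ helper itself is hypothetical.
def _demo_load_squad():
    samples, vocabulary = load_squad_data('dataset/train-v1.1.json', ids=(1, 2, 3))
    log.info('loaded {} samples, vocabulary of {} tokens'.format(len(samples), len(vocabulary)))
    return samples, vocabulary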
def build_sentimentnet_sample(raw_sample):
    labels = ['negative', 'somewhat negative', 'neutral',
              'somewhat positive', 'positive']
    sentence = TokenString(raw_sample.sentence.strip(' \n\t').lower(),
                           word_tokenize).delete_whitespace()
    return Sample(raw_sample.id,
                  raw_sample.sentence_id.lower(),
                  sentence,
                  labels[int(raw_sample.sentiment.strip(' \n\t'))])
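# Usage sketch (illustrative): build_sentimentnet_sample only needs an object exposing
# id, sentence_id, sentence and sentiment attributes, so a namedtuple standing in for
# one parsed TSV row is enough to exercise it. The field values and the _demo_ helper
# are made up for this example.
from collections import namedtuple

RawSample = namedtuple('RawSample', ['id', 'sentence_id', 'sentence', 'sentiment'])

def _demo_build_sentimentnet_sample():
    raw = RawSample('1', 'S1', 'A warm , funny , engaging film .', '4')
    return build_sentimentnet_sample(raw)  # sentiment '4' maps to the label 'positive'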
def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10, max_sample_size=None):
    samples = []
    qn, an = 0, 0
    skipped = 0
    vocabulary = defaultdict(int)

    try:
        for i, file_ in enumerate(glob.glob('dataset/en-10k/qa*_train.txt')):
            dataset = open(file_).readlines()
            prev_linenum = 1000000
            for line in dataset:
                questions, answers = [], []
                linenum, line = line.split(' ', 1)
                linenum = int(linenum)

                # line numbers restart at 1 when a new story begins
                if prev_linenum > linenum:
                    story = ''

                if '?' in line:
                    # question lines are tab-separated: question, answer, supporting-fact ids
                    q, a, _ = line.split('\t')
                    samples.append(
                        Sample('{}.{}'.format(i, linenum), i, linenum,
                               TokenString(story, word_tokenize),
                               TokenString(q, word_tokenize),
                               TokenString(a, word_tokenize)))
                else:
                    story += ' ' + line

                prev_linenum = linenum
    except:
        skipped += 1
        log.exception('{}.{}'.format(i, linenum))

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: -len(x.a + x.story))
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building vocabulary...')
    for sample in samples:
        for token in sample.story + sample.q + sample.a:
            vocabulary[token] += 1

    return samples, vocabulary
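# Usage sketch (illustrative): the parser above expects numbered story lines and
# tab-separated question lines, e.g.
#
#   1 Mary moved to the bathroom.
#   2 John went to the hallway.
#   3 Where is Mary? <TAB> bathroom <TAB> 1
#
# In this variant data_path, ids, max_para_len and max_ans_len are unused; only the
# glob over 'dataset/en-10k/qa*_train.txt' matters. The arguments and the _demo_
# helper below are placeholders.
def _demo_load_story_data():
    samples, vocabulary = load_squad_data(None, None, max_sample_size=1000)
    log.info('loaded {} samples, vocabulary of {} tokens'.format(len(samples), len(vocabulary)))
    return samples, vocabulary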
net.do_train()

if args.task == 'drop-words-and-validate':
    net.drop_words_and_validate(args.epoch)

if args.task == 'dump-vocab':
    from collections import Counter
    from utilz import Sample
    counter = Counter()
    for s in dataset.trainset:
        counter.update([s.word, s.context])

    embedding = []
    words = sorted(counter.keys())
    for w in tqdm(words):
        ids, word, context = _batchop([Sample('0', w, '')], for_prediction=True)
        emb = net.__(net.embed(word), 'emb')
        embedding.append(emb)

    embedding = torch.stack(embedding).squeeze()
    dump_vocab_tsv(config, words,
                   embedding.cpu().detach().numpy(),
                   config.ROOT_DIR + '/vocab.tsv')

if args.task == 'dump-cosine-similarity':
    dump_cosine_similarity_tsv(config, dataset.input_vocab,
                               net.embed.weight.data.cpu(),
                               config.ROOT_DIR + '/cosine_similarity.tsv')
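# Illustrative stand-in (not the repo's implementation): dump_vocab_tsv is defined
# elsewhere in this codebase; the sketch below only shows the kind of word<TAB>vector
# TSV such a dump typically produces, to make the data flow above easier to follow.
# The _demo_ name and the row layout are assumptions.
def _demo_dump_vocab_tsv(words, embedding, path):
    # one row per word: the token followed by its embedding components
    with open(path, 'w') as f:
        for word, vector in zip(words, embedding):
            f.write('{}\t{}\n'.format(word, '\t'.join(str(v) for v in vector)))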
if config.CONFIG.multi_gpu and torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print('**** the model', model)

if args.task == 'train':
    model.do_train()

if args.task == 'dump-vocab':
    dump_vocab_tsv(config, dataset.input_vocab,
                   model.embed.weight.data.cpu().numpy(),
                   config.ROOT_DIR + '/vocab.tsv')

if args.task == 'predict':
    if args.starting_char:
        datapoints = [Sample(0, args.gender, [args.starting_char])]
        input_ = _batchop(datapoints)
    else:
        input_ = None

    for i in range(args.count):
        try:
            output = model.do_predict(input_, length=args.prediction_length,
                                      beam_width=args.beam_width)
        except:
            log.exception('prediction failed')

end = time.time()
if len(sys.argv) > 1:
    log.addFilter(CMDFilter(sys.argv[1]))

ROOT_DIR = initialize_task(SELF_NAME)
print('====================================')
print(ROOT_DIR)
print('====================================')

if config.CONFIG.flush or 'flush' in sys.argv:
    log.info('flushing...')
    dataset = []
    with open('../dataset/dataset.csv') as f:
        for line in tqdm(f.readlines()):
            line = line.split('|')
            dataset.append(Sample(line[0], line[1], line[2]))

    dataset, vocabulary, labels = prep_samples(dataset)
    pivot = int(config.CONFIG.split_ratio * len(dataset))
    trainset, testset = dataset[:pivot], dataset[pivot:]
    pickle.dump([trainset, testset, dict(vocabulary), dict(labels)],
                open('{}__cache.pkl'.format(SELF_NAME), 'wb'))
else:
    trainset, testset, _vocabulary, _labels = pickle.load(
        open('{}__cache.pkl'.format(SELF_NAME), 'rb'))
    vocabulary = defaultdict(int)
    labels = defaultdict(int)
    vocabulary.update(_vocabulary)
    labels.update(_labels)

log.info('trainset size: {}'.format(len(trainset)))