Example #1
def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10):
    """Load a SQuAD-style JSON file into Sample tuples and build a token-frequency vocabulary."""
    dataset = json.load(open(data_path, 'r'))
    samples = []
    qn, an = 0, 0
    skipped = 0

    vocabulary = defaultdict(int)

    def __(s):
        # strip accents via NFKD normalization and normalize PTB-style quotes
        import unicodedata
        s = ''.join(c for c in unicodedata.normalize('NFKD', s)
                    if unicodedata.category(c) != 'Mn')
        return s.replace("``", '"').replace("''", '"')

    try:
        for aid, article in enumerate(tqdm(dataset['data'])):
            for pid, paragraph in enumerate(article['paragraphs']):

                context = TokenString(__(paragraph['context']),
                                      word_tokenize).delete_whitespace()
                questions = paragraph['qas']

                for token in context:
                    vocabulary[token] += 1

                for qid, qa in enumerate(questions):
                    log.debug('processing: {}.{}.{}'.format(aid, pid, qid))
                    q = TokenString(__(qa['question']),
                                    word_tokenize).delete_whitespace()
                    # use only the first answer; the rest are ignored
                    a = TokenString(__(qa['answers'][0]['text']),
                                    word_tokenize).delete_whitespace()
                    squad_id = qa['id']
                    for token in q:
                        vocabulary[token] += 1

                    indices = context.index(a)
                    if not indices:
                        log.debug(pformat(paragraph['context']))
                        log.debug(pformat(paragraph['qas'][qid]))
                        log.error('{}.{}.{} - "{}" not found in \n"{}"'.format(
                            aid, pid, qid, a.tokenized_string,
                            context.tokenized_string))
                        skipped += 1
                        continue

                    a_start, a_end = indices
                    fields = (aid, pid, qid, squad_id, context, q, a,
                              list(range(a_start, a_end)))
                    _id = tuple(fields[i - 1] for i in ids)
                    samples.append(Sample(_id, *fields))
    except Exception:
        skipped += 1
        log.exception('{}'.format(aid))

    print('skipped {} samples'.format(skipped))
    return samples, vocabulary
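A minimal call sketch (the JSON path and the ids selection below are illustrative assumptions, not part of the original code): it loads a SQuAD training file and keys each Sample by article, paragraph and question index.

# illustrative call; the path and the ids tuple are assumptions
samples, vocabulary = load_squad_data('dataset/train-v1.1.json', ids=(1, 2, 3))
print('{} samples, {} vocabulary entries'.format(len(samples), len(vocabulary)))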
Example #2
def build_sentimentnet_sample(raw_sample):
    labels = [
        'negative', 'somewhat negative', 'neutral', 'somewhat positive',
        'positive'
    ]
    sentence = TokenString(
        raw_sample.sentence.strip(' \n\t').lower(),
        word_tokenize).delete_whitespace()
    return Sample(raw_sample.id, raw_sample.sentence_id.lower(), sentence,
                  labels[int(raw_sample.sentiment.strip(' \n\t'))])
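A usage sketch, assuming the raw rows follow a PhraseId / SentenceId / Phrase / Sentiment layout; the RawSample namedtuple and the example row below are hypothetical and only illustrate the attributes the function expects.

# hypothetical row wrapper; not part of the original code
from collections import namedtuple
RawSample = namedtuple('RawSample', 'id sentence_id sentence sentiment')

row = RawSample('1', '1', 'A quietly moving little film .', '3')
sample = build_sentimentnet_sample(row)   # sentiment '3' maps to 'somewhat positive'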
Example #3
def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10, max_sample_size=None):
    # despite the name, this variant reads bAbI-format task files:
    # numbered lines, with question and answer fields separated by tabs
    samples = []
    qn, an = 0, 0
    skipped = 0

    vocabulary = defaultdict(int)
    
    try:
        for i, file_ in enumerate(glob.glob('dataset/en-10k/qa*_train.txt')):
            dataset = open(file_).readlines()
            prev_linenum = 1000000
            for line in dataset:
                questions, answers = [], []
                linenum, line = line.split(' ', 1)

                linenum = int(linenum)
                if prev_linenum > linenum:
                    story = ''

                if '?' in line:
                    q, a, _ = line.split('\t')

                    samples.append(
                        Sample('{}.{}'.format(i, linenum),
                               i, linenum,
                               TokenString(story, word_tokenize),
                               TokenString(q,     word_tokenize),
                               TokenString(a,     word_tokenize))
                        )

                else:
                    story += ' ' + line

                prev_linenum = linenum

    except Exception:
        skipped += 1
        log.exception('{}.{}'.format(i, linenum))

    print('skipped {} samples'.format(skipped))
    samples = sorted(samples, key=lambda x: -len(x.a + x.story))
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building vocabulary...')
    for sample in samples:
        for token in sample.story + sample.q + sample.a:
            vocabulary[token] += 1
    return samples, vocabulary
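A call sketch for this variant: it ignores data_path and reads the bAbI task files from the hard-coded glob, so the first two arguments below are only placeholders.

# placeholder arguments; the function reads dataset/en-10k/qa*_train.txt regardless
samples, vocabulary = load_squad_data(data_path=None, ids=None, max_sample_size=10000)
log.info('{} samples, {} vocabulary entries'.format(len(samples), len(vocabulary)))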
Example #4
        net.do_train()

    if args.task == 'drop-words-and-validate':
        net.drop_words_and_validate(args.epoch)
        
    if args.task == 'dump-vocab':
        from collections import Counter
        from utilz import Sample
        counter = Counter()
        for s in dataset.trainset:
            counter.update([s.word, s.context])

        embedding = []
        words = sorted(counter.keys())
        # embed each vocabulary word individually and collect the resulting vectors
        for w in tqdm(words):
            ids, word, context = _batchop([Sample('0', w, '')], for_prediction=True)
            emb = net.__(net.embed(word), 'emb')
            embedding.append(emb)

        embedding = torch.stack(embedding).squeeze()
        dump_vocab_tsv(config,
                       words,
                       embedding.cpu().detach().numpy(),
                       config.ROOT_DIR + '/vocab.tsv')

        
    if args.task == 'dump-cosine-similarity':
        dump_cosine_similarity_tsv(config,
                                   dataset.input_vocab,
                                   net.embed.weight.data.cpu(),
                                   config.ROOT_DIR + '/cosine_similarity.tsv')
Example #5
        if config.CONFIG.multi_gpu and torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = nn.DataParallel(model)

    print('**** the model', model)
    if args.task == 'train':
        model.do_train()

    if args.task == 'dump-vocab':
        dump_vocab_tsv(config, dataset.input_vocab,
                       model.embed.weight.data.cpu().numpy(),
                       config.ROOT_DIR + '/vocab.tsv')

    if args.task == 'predict':
        if args.starting_char:
            datapoints = [Sample(0, args.gender, [args.starting_char])]
            input_ = _batchop(datapoints)
        else:
            input_ = None

        for i in range(args.count):
            try:

                output = model.do_predict(input_,
                                          length=args.prediction_length,
                                          beam_width=args.beam_width)
            except Exception:
                log.exception('#########')

    end = time.time()
Example #6
    if len(sys.argv) > 1 and sys.argv[1]:
        log.addFilter(CMDFilter(sys.argv[1]))

    ROOT_DIR = initialize_task(SELF_NAME)

    print('====================================')
    print(ROOT_DIR)
    print('====================================')

    if config.CONFIG.flush or 'flush' in sys.argv:
        log.info('flushing...')
        dataset = []
        with open('../dataset/dataset.csv') as f:
            for line in tqdm(f.readlines()):
                line = line.split('|')
                dataset.append(Sample(line[0], line[1], line[2]))
        dataset, vocabulary, labels = prep_samples(dataset)
        pivot = int(config.CONFIG.split_ratio * len(dataset))
        trainset, testset = dataset[:pivot], dataset[pivot:]
        pickle.dump([trainset, testset,
                     dict(vocabulary),
                     dict(labels)],
                    open('{}__cache.pkl'.format(SELF_NAME), 'wb'))
    else:
        trainset, testset, _vocabulary, _labels = pickle.load(
            open('{}__cache.pkl'.format(SELF_NAME), 'rb'))
        vocabulary = defaultdict(int)
        labels = defaultdict(int)
        vocabulary.update(_vocabulary)
        labels.update(_labels)

    log.info('trainset size: {}'.format(len(trainset)))