Пример #1
0
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    """Write the type targets of all mentions to ``<dsdir>_targets.h5py``.

    The train, dev and test mentions are concatenated in that order and one
    int32 row per mention is stored in the HDF5 dataset ``'targets'``.

    Args:
        trnMentions: training mentions; each exposes ``alltypes``.
        devMentions: development mentions, same kind of objects.
        tstMentions: test mentions, same kind of objects.
        t2idx: mapping from type name to column index.
        dsdir: path prefix; ``'_targets.h5py'`` is appended to it.

    Side effects:
        Creates (or truncates) the HDF5 file. The YAML dump of ``t2idx`` is
        stored in the dataset attribute ``'type_to_ix'`` and the
        train/dev/test row ranges in the file attribute ``'split'``.
    """
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)
    num_types = len(t2idx)

    # One row per mention, one column per type.
    # NOTE(review): convertTargetsToBinVec presumably returns a 0/1 vector of
    # length num_types -- confirm against cmn.
    targets_m = numpy.zeros(shape=(totals, num_types), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, num_types)

    # Row ranges follow the concatenation order used above.
    dev_end = nsamples_train + nsamples_dev
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, dev_end)
        },
        'test': {
            'targets': (dev_end, totals)
        }
    }

    # The context manager closes the file even if one of the writes raises,
    # so a failure no longer leaks an open HDF5 handle.
    with h5py.File(dsdir + '_targets.h5py', mode='w') as f:
        targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
        targets.attrs['type_to_ix'] = yaml.dump(t2idx)
        targets[...] = targets_m
        targets.dims[0].label = 'all_types'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
Пример #2
0
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    """Store the per-mention type targets in ``<dsdir>_targets.h5py``.

    Mentions are laid out as train, then dev, then test; each mention
    contributes one int32 row to the HDF5 dataset ``'targets'``.

    Args:
        trnMentions: training mentions; each exposes ``alltypes``.
        devMentions: development mentions, same kind of objects.
        tstMentions: test mentions, same kind of objects.
        t2idx: mapping from type name to column index.
        dsdir: path prefix; ``'_targets.h5py'`` is appended to it.

    Side effects:
        Creates (or truncates) the HDF5 file, stores the YAML dump of
        ``t2idx`` in the dataset attribute ``'type_to_ix'`` and the
        train/dev/test row ranges in the file attribute ``'split'``.
    """
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)
    n_types = len(t2idx)

    # NOTE(review): convertTargetsToBinVec presumably yields a 0/1 vector of
    # length n_types -- confirm against cmn.
    targets_m = numpy.zeros(shape=(totals, n_types), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, n_types)

    # Split boundaries mirror the train/dev/test concatenation order.
    dev_stop = nsamples_train + nsamples_dev
    split_dict = {
        'train': {'targets': (0, nsamples_train)},
        'dev': {'targets': (nsamples_train, dev_stop)},
        'test': {'targets': (dev_stop, totals)}}

    hdf5_file = dsdir + '_targets.h5py'
    # Using the file as a context manager guarantees it is closed when a
    # write fails, instead of leaking the handle as the manual close() did.
    with h5py.File(hdf5_file, mode='w') as f:
        targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
        targets.attrs['type_to_ix'] = yaml.dump(t2idx)
        targets[...] = targets_m
        targets.dims[0].label = 'all_types'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
Пример #3
0
def build_type_patterns(trnMentions, t2idx, dsdir, vectorfile, upto=-1):
    """Save the distinct type patterns of the training mentions as a matrix.

    Every mention's types are turned into a vector through
    ``cmn.convertTargetsToBinVec``; identical vectors are counted, and each
    distinct vector becomes one float32 row of the saved matrix. Rows are
    ordered by ascending frequency and scaled by
    ``sqrt(6 / (len(row) + len(t2idx)))``.

    Args:
        trnMentions: training mentions; each exposes ``alltypes``.
        t2idx: mapping from type name to column index.
        dsdir: path prefix; ``'_typeCooccurrMatrix.npy'`` is appended to it.
        vectorfile: not used by this function; kept for the call signature.
        upto: not used by this function; kept for the call signature.

    Side effects:
        Prints the number of distinct patterns and writes the matrix with
        ``numpy.save``.
    """
    out_path = dsdir + '_typeCooccurrMatrix.npy'
    num_types = len(t2idx)

    # Count each distinct pattern, keyed by its space-joined string form.
    pattern2freq = defaultdict(int)
    for men in trnMentions:
        type_ids = [t2idx[t] for t in men.alltypes]
        binvec = cmn.convertTargetsToBinVec(type_ids, num_types)
        pattern2freq[' '.join(str(v) for v in binvec)] += 1
    sorted_p2f = sorted(pattern2freq.items(), key=operator.itemgetter(1))

    label_cooccur_matrix = numpy.zeros((len(sorted_p2f), num_types),
                                       dtype='float32')
    for row, (key, _freq) in enumerate(sorted_p2f):
        pattern = numpy.asarray(
            [int(p) for p in key.split(' ')]).astype('float32')
        # NOTE(review): the scale has the form of a Glorot-style bound;
        # presumably the matrix seeds a weight layer -- confirm with callers.
        pattern *= numpy.sqrt(6. / (len(pattern) + num_types))
        label_cooccur_matrix[row] = pattern

    # Single-argument call form prints identically on Python 2 and 3.
    print(len(label_cooccur_matrix))
    numpy.save(out_path, label_cooccur_matrix)
Пример #4
0
def build_type_patterns(trnMentions, t2idx, dsdir, vectorfile, upto=-1):
    """Build and save the matrix of distinct training type patterns.

    The types of each mention are converted to a vector with
    ``cmn.convertTargetsToBinVec`` and identical vectors are tallied. The
    saved float32 matrix has one row per distinct vector, ordered by
    ascending frequency, each scaled by
    ``sqrt(6 / (len(row) + len(t2idx)))``.

    Args:
        trnMentions: training mentions; each exposes ``alltypes``.
        t2idx: mapping from type name to column index.
        dsdir: path prefix; ``'_typeCooccurrMatrix.npy'`` is appended to it.
        vectorfile: not used by this function; kept for the call signature.
        upto: not used by this function; kept for the call signature.

    Side effects:
        Prints the number of distinct patterns and writes the matrix with
        ``numpy.save``.
    """
    n_types = len(t2idx)
    target_path = dsdir + '_typeCooccurrMatrix.npy'

    # Tally patterns under their space-joined string representation.
    pattern2freq = defaultdict(int)
    for men in trnMentions:
        indices = [t2idx[t] for t in men.alltypes]
        vec = ' '.join(
            str(v) for v in cmn.convertTargetsToBinVec(indices, n_types))
        pattern2freq[vec] += 1
    sorted_p2f = sorted(pattern2freq.items(), key=operator.itemgetter(1))

    label_cooccur_matrix = numpy.zeros((len(sorted_p2f), n_types),
                                       dtype='float32')
    for idx, (encoded, _count) in enumerate(sorted_p2f):
        values = [int(p) for p in encoded.split(' ')]
        pattern = numpy.asarray(values).astype('float32')
        # NOTE(review): looks like a Glorot-style scaling factor; the
        # intended use of the matrix is not visible here -- confirm.
        pattern *= numpy.sqrt(6. / (len(pattern) + n_types))
        label_cooccur_matrix[idx] = pattern

    # Call form of print behaves the same on Python 2 and Python 3.
    print(len(label_cooccur_matrix))
    numpy.save(target_path, label_cooccur_matrix)