예제 #1
0
def build_features(ent2names, etrain2types, word2cluster, configFeatures=('shape', 'ngram'), maxnamenum=1, maxngram=5, upto=-1):
    """Build per-name feature lists for every entity in ``ent2names``.

    For each entity, up to ``maxnamenum`` names are selected via
    ``get_ent_names``; non-ASCII characters are stripped from each name and
    the featurizers enabled in ``configFeatures`` are applied in a fixed
    order (shape, tok, len, ngram, ngram_norm).

    Args:
        ent2names: Mapping from entity id to its raw names (passed as-is to
            ``get_ent_names``).
        etrain2types: Unused here; kept for interface compatibility.
        word2cluster: Unused here; kept for interface compatibility.
        configFeatures: Collection of feature-group keys to enable. Recognised
            keys: 'shape', 'tok', 'len', 'ngram', 'ngram_norm'.
        maxnamenum: Forwarded to ``get_ent_names`` as ``maxnum``.
        maxngram: Forwarded to ``ngramFeaturizer`` as ``maxn``.
        upto: Stop after this many entities have been processed; -1 means no
            limit. The check runs after an entity is processed, so a value
            of 0 never triggers and behaves like "no limit".

    Returns:
        A ``defaultdict(dict)`` mapping entity id -> {name index -> feature
        list}. Entities for which ``get_ent_names`` yields no names get no
        entry.
    """
    # Compiled once: the pattern is invariant across all names.
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    ent2features = defaultdict(dict)
    for count, mye in enumerate(ent2names, start=1):
        thenames = get_ent_names(ent2names[mye], maxnum=maxnamenum)
        for i, onename in enumerate(thenames):
            onename = non_ascii.sub('', onename)
            # The featurizers' return values are ignored; presumably each one
            # appends to ``features`` in place -- confirm against their
            # definitions.
            features = []
            if 'shape' in configFeatures:
                wordshapefeaturizer(onename, features)
            if 'tok' in configFeatures:
                tokenFeaturizer(onename, features)
            if 'len' in configFeatures:
                lengthFeaturizer(onename, features)
            if 'ngram' in configFeatures:
                ngramFeaturizer(onename, features, maxn=maxngram)
            if 'ngram_norm' in configFeatures:
                ngramFeaturizer(onename, features, maxn=maxngram, normalize=True)
            ent2features[mye][i] = features
        if upto != -1 and count == upto:
            break
    return ent2features
예제 #2
0
def gen_new_ds(e2names, e2types, max_name=1, outfile='train.txt'):
    """Write a tab-separated dataset of entity names and their types.

    One line is written per selected name of each entity, with four
    tab-separated fields: entity id, stripped name, first type, and the
    remaining types joined by single spaces (an empty field when the entity
    has only one type).

    Args:
        e2names: Mapping from entity id to its raw names (passed as-is to
            ``get_ent_names``).
        e2types: Mapping from entity id to a sequence of type strings; every
            key of ``e2names`` must be present.
        max_name: Forwarded to ``get_ent_names`` as its second argument.
        outfile: Path of the file to write; it is truncated if it exists.

    Raises:
        KeyError: If an entity in ``e2names`` is missing from ``e2types``.
        IndexError: If an entity with at least one name has no types.
    """
    # The context manager closes the file even if a lookup below raises.
    with open(outfile, 'w') as f:
        for mye in e2names:
            names = get_ent_names(e2names[mye], max_name)
            targets = e2types[mye]
            for nm in names:
                fields = [mye, nm.strip(), targets[0], ' '.join(targets[1:])]
                f.write('\t'.join(fields) + '\n')