import re
from collections import defaultdict


def build_features(ent2names, etrain2types, word2cluster,
                   configFeatures=['shape', 'ngram'], maxnamenum=1,
                   maxngram=5, upto=-1):
    """Build per-entity feature dicts from entity names.

    Returns a mapping: entity id -> {name index -> feature list}.
    word2cluster is only needed by the (currently disabled) Brown-cluster
    featurizer; upto=-1 means process all entities.
    """
    ent2features = defaultdict(dict)
    c = 0
    for mye in ent2names:
        c += 1
        thenames = get_ent_names(ent2names[mye], maxnum=maxnamenum)
        for i, onename in enumerate(thenames):
            # Drop non-ASCII characters from the name before featurizing.
            onename = re.sub(r'[^\x00-\x7F]+', '', onename)
            features = []
            if 'shape' in configFeatures:
                wordshapefeaturizer(onename, features)
            if 'tok' in configFeatures:
                tokenFeaturizer(onename, features)
            if 'len' in configFeatures:
                lengthFeaturizer(onename, features)
            if 'ngram' in configFeatures:
                ngramFeaturizer(onename, features, maxn=maxngram)
            if 'ngram_norm' in configFeatures:
                ngramFeaturizer(onename, features, maxn=maxngram, normalize=True)
            # posFeaturizer(onename, features)
            # # brownClusterFeaturizer(onename, features, word2cluster)
            # headFeaturizer(onename, features)
            ent2features[mye][i] = features
        # Optional early stop after processing `upto` entities.
        if upto != -1 and c == upto:
            break
    return ent2features
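# Illustrative usage sketch, not part of the original module: it assumes
# ent2names maps an entity id to whatever name structure get_ent_names
# accepts (a plain list of raw names here), and that the featurizer helpers
# referenced by build_features (get_ent_names, wordshapefeaturizer,
# ngramFeaturizer, ...) are defined elsewhere in this module. The toy ids,
# names, and types below are hypothetical.
def _demo_build_features():
    toy_ent2names = {'Q1': ['Barack Obama'], 'Q2': ['Berlin']}
    toy_ent2types = {'Q1': ['person', 'politician'], 'Q2': ['location', 'city']}
    feats = build_features(toy_ent2names, toy_ent2types, word2cluster={},
                           configFeatures=['shape', 'ngram'],
                           maxnamenum=1, maxngram=3)
    # feats['Q1'][0] is the feature list extracted from the first name of Q1.
    return feats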
def gen_new_ds(e2names, e2types, max_name=1, outfile='train.txt'):
    """Write a tab-separated dataset: entity id, name, primary type, remaining types."""
    with open(outfile, 'w') as f:
        for mye in e2names:
            names = get_ent_names(e2names[mye], max_name)
            targets = e2types[mye]
            for nm in names:
                # Columns: entity id, name, first type; any remaining types
                # go in a fourth column, space-separated.
                line = '\t'.join([mye, nm.strip(), targets[0]])
                line += '\t' + ' '.join(targets[1:])
                f.write(line + '\n')
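# Illustrative usage sketch, not part of the original module: it reuses the
# same toy dictionaries as above and assumes get_ent_names is available.
# Each output line is tab-separated: entity id, name, primary type, then the
# remaining types space-separated. The output path is hypothetical.
def _demo_gen_new_ds():
    toy_e2names = {'Q1': ['Barack Obama']}
    toy_e2types = {'Q1': ['person', 'politician']}
    gen_new_ds(toy_e2names, toy_e2types, max_name=1, outfile='train_demo.txt')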