def make_feature(fc):
    """Build one weighted `StringCounter` from the counters in `fc`.

    This StringCounter will define one of the targets for the
    `MultinomialNB` classifier.

    This crucial function decides the relative importance of features
    extracted by the ETL pipeline. This is essentially a form of domain
    fitting that allows us to tune the extraction to the fields that are
    important to a domain. However, if the NER for a domain is
    inadequate, then the primary purpose of these relative weightings is
    to remove bogus NER extractions.

    Args:
        fc: Mapping indexed by feature name (``'keywords'``, ``'GPE'``,
            ``'bow'``, ``'bowNP_sip'``, ``'phone'``, ``'email'``,
            ``'bowNP'``, ``'PERSON'``, ``'ORGANIZATION'``,
            ``'usernames'``). Each value must support ``.items()``
            yielding ``(string, count)`` pairs and iterate over its
            strings.

    Returns:
        A 3-tuple ``(feat, rejects, keepers)``:

        * ``feat`` -- the combined counter, with every count multiplied
          by its feature's weight and the empty string removed;
        * ``rejects`` -- set of every string found under the
          ``rejects_keys`` features;
        * ``keepers`` -- set of every string found under the
          ``keepers_keys`` features.
    """
    feat = StringCounter()
    rejects = set()
    keepers = set()

    # Features whose strings are reported back in `keepers` / `rejects`.
    keepers_keys = ('phone', 'email')
    rejects_keys = ('keywords', 'usernames', 'ORGANIZATION', 'PERSON')

    # The features used to pull the keys for the classifier, each paired
    # with the multiplier applied to its counts.
    weighted_features = (
        ('keywords', 10**4),
        ('GPE', 1),
        ('bow', 1),
        ('bowNP_sip', 10**8),
        ('phone', 10**12),
        ('email', 10**12),
        ('bowNP', 10**3),
        ('PERSON', 10**8),
        ('ORGANIZATION', 10**6),
        ('usernames', 10**12),
    )

    for f, strength in weighted_features:
        counts = fc[f]
        if strength == 1:
            feat += counts
        else:
            feat += StringCounter({key: strength * count
                                   for key, count in counts.items()})
        # `set.update` rather than `map(s.add, ...)`: on Python 3 `map`
        # is lazy, so the old form never actually added anything.
        if f in rejects_keys:
            rejects.update(counts)
        if f in keepers_keys:
            keepers.update(counts)

    # The empty string is never a meaningful feature value.
    feat.pop(u'', None)
    return feat, rejects, keepers