def __init__(self, language, window_width=2):
    """ Sets up an empty extractor.

    :param language: Language handed over to the POS tagger
    :param int window_width: Width of the context window; stored as-is for
     the feature extraction code to use
    """
    self.language = language
    self.window_width = window_width
    self.tagger = TTPosTagger(language)

    # samples accumulated so far
    self.features = []

    # dictionaries of the known features and roles
    self.feature_index = SortedSet()
    self.role_index = SortedSet()

    # 'UNK' is registered right away and its index remembered
    self.unk_index = self.feature_index.put('UNK')
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence

    The per-document counts returned by the workers are summed up, sorted
    by key, printed to standard output and finally dumped as JSON.

    :param corpus: Corpus location, forwarded to `load_corpus` together
     with the `bio` document key
    :param verbs: Readable file-like object with a JSON object whose values
     are collections of verbs
    :param processes: Forwarded as-is to `parallel.map`
    :param outfile: Writable file-like object receiving the JSON counts
    :param sub_sentences: When truthy `worker_with_sub_sentences` is used,
     `worker_with_sentences` otherwise
    """
    # module-level state, presumably read by the worker functions
    global splitter, tagger, parser, all_verbs

    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work

    # union of all the verbs, whatever key they are listed under
    all_verbs = set().union(*json.load(verbs).values())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences

    counter = defaultdict(int)
    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.items():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    # tuple parameters in lambdas and the print statement are not valid
    # on python 3, these forms behave the same on both versions
    counter = OrderedDict(sorted(counter.items(), key=lambda item: item[0]))
    for k, v in counter.items():
        print('%s %s' % (k, v))

    json.dump(counter, outfile, indent=2)
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Initializes the extractor.

    :param language: The language of the sentences that will be used
    :param window_width: how many tokens to look before and after
     each token when building its features.
    :param collapse_fes: Whether to collapse FEs to a single token
     or to keep them split.
    :param target_size: When truthy, a `TruncatedSVD(target_size)`
     reducer is created; otherwise no reducer is used.
    """
    # plain configuration
    self.language = language
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.target_size = target_size

    # dictionaries, all of them start empty
    self.unk_feature = 'UNK'
    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}

    # helper objects
    self.tagger = TTPosTagger(language)
    self.vectorizer = DictVectorizer()
    if target_size:
        self.reducer = TruncatedSVD(target_size)
    else:
        self.reducer = None
    self.stopwords = set(word.lower() for word in StopWords().words(language))

    self.start()
def __init__(self, corpus, document_key, sentences_key, language, lemma_to_token, match_base_form):
    """ Initializes the extractor.

    :param iterable corpus: The corpus, iterable of `dict`s
    :param str document_key: The key from which to retrieve the textual document
    :param str sentences_key: The key to which the extracted sentences should be stored
    :param str language: The language the text is in
    :param dict lemma_to_token: Mapping from lemma to list of tokens
    :param bool match_base_form: When truthy the mapping is used as given,
     otherwise it is first passed through `_filter_base_form`
    """
    self.corpus = corpus
    self.document_key = document_key
    self.sentences_key = sentences_key
    self.language = language

    # start from the mapping as given, it is replaced by the filtered one
    # only when base forms should not be matched
    self.lemma_to_token = lemma_to_token
    if not match_base_form:
        self.lemma_to_token = self._filter_base_form(lemma_to_token)

    self.tokenizer = Tokenizer(self.language)
    self.tagger = TTPosTagger(self.language)
class FactExtractorFeatureExtractor(BaseFeatureExtractor):
    """ Feature extractor inspired from the fact-extractor

    Sentences are processed one at a time; every token becomes a sample
    made of the (sparse) features of the tokens surrounding it and labelled
    with the role of the frame element it belongs to, or `O`.
    """

    def __init__(self, language, window_width=2):
        """ Initializes the extractor.

        :param language: Language handed over to the POS tagger
        :param int window_width: How many tokens to look before and after
         each token when building its features
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.feature_index = SortedSet()
        self.role_index = SortedSet()
        self.window_width = window_width
        self.features = []
        self.unk_index = self.feature_index.put('UNK')

    @staticmethod
    def _find_chunk(tagged, fe_tokens):
        """ Finds the first occurrence of the given tokens in the sentence

        Every possible starting position is tried, so that a partial match
        does not hide a full match beginning inside it (e.g. `a b` in
        `a a b`). The comparison is case insensitive.

        :param list tagged: Tokens of the sentence, the term comes first
        :param list fe_tokens: Tokens to look for
        :return: Index of the first matching token, -1 if there is none
        """
        needle = [token.lower() for token in fe_tokens]
        size = len(needle)
        for start in range(len(tagged) - size + 1):
            window = tagged[start:start + size]
            if all(token[0].lower() == term for token, term in zip(window, needle)):
                return start
        return -1

    def sentence_to_tokens(self, sentence, fes):
        """ Transforms a sentence into a list of tokens

        The tokens of each chunk found in the sentence are replaced by a
        single token `[chunk, pos, chunk, fe]`, whose pos is `ENT` for
        chunks made of several tokens.

        :param unicode sentence: Text of the sentence
        :param dict fes: mapping FE -> chunk
        :return: List of tokens
        """
        tagged = self.tagger.tag_one(sentence, skip_unknown=False)

        # find entities and group them into single tokens
        for fe, chunk in fes.items():
            if chunk is None:
                continue

            fe_tokens = self.tagger.tokenize(chunk)
            if not fe_tokens:
                continue

            position = self._find_chunk(tagged, fe_tokens)
            if position < 0:
                logger.debug('cunk "%s" of fe "%s" not found in sentence "%s". Overlapping chunks?',
                             chunk, fe, sentence)
                continue

            pos = 'ENT' if len(fe_tokens) > 1 else tagged[position][1]
            tagged = tagged[:position] + [[chunk, pos, chunk, fe]] + \
                tagged[position + len(fe_tokens):]

        return tagged

    def feature_for(self, term, type_, position, add_unknown):
        """ Returns the feature for the given token, i.e. the column of the
        feature in a sparse matrix

        :param str term: Actual term
        :param str type_: Type of the term, for example token, pos or lemma
        :param int position: Relative position (used for context windows)
        :param bool add_unknown: Whether to add previously unseen terms to
         the dictionary or use the UNK token instead
        :return: Column of the corresponding feature
        """
        feat = '%s_%s_%+d' % (term.lower(), type_.lower(), position)
        if add_unknown:
            return self.feature_index.put(feat)

        index = self.feature_index.index(feat)
        # -1 signals a feature which is not in the dictionary
        return self.unk_index if index == -1 else index

    def token_to_features(self, tokens, position, add_unknown, gazetteer):
        """ Extracts the features for the token in the given position

        :param list tokens: POS-tagged tokens of the sentence
        :param int position: position of the token for which features are requested
        :param bool add_unknown: Whether to add previously unseen terms to
         the dictionary or use the UNK token instead
        :param dict gazetteer: mapping chunk -> additional features
        :return: sparse set of features (i.e. numbers are indexes in a row
         of a sparse matrix)
        """
        first = max(position - self.window_width, 0)
        last = min(position + self.window_width + 1, len(tokens))

        features = set()
        for i in range(first, last):
            rel = i - position
            features.add(self.feature_for(tokens[i][0], 'TERM', rel, add_unknown))
            features.add(self.feature_for(tokens[i][1], 'POS', rel, add_unknown))
            features.add(self.feature_for(tokens[i][2], 'LEMMA', rel, add_unknown))
            for feat in gazetteer.get(tokens[i][0], []):
                features.add(self.feature_for(feat, 'GAZ', rel, add_unknown))

        return features

    def extract_features(self, sentence, fes, add_unknown, gazetteer):
        """ Extracts the features for each token of the sentence

        :param unicode sentence: Text of the sentence
        :param dict fes: mapping FE -> chunk
        :param bool add_unknown: Whether to add previously unseen terms to
         the dictionary or use the UNK token instead
        :param dict gazetteer: mapping chunk -> additional features
        :return: The tokens of the sentence and, for each of them, a pair
         with its sparse features (i.e. the indexes of the relevant columns)
         and the index of its role
        :rtype: tuple
        """
        tagged = self.sentence_to_tokens(sentence, fes)

        features = []
        for i, token in enumerate(tagged):
            feat = self.token_to_features(tagged, i, add_unknown, gazetteer)
            # tokens with three fields only are not part of any FE
            label = 'O' if len(token) == 3 else token[3]
            features.append((feat, self.role_index.put(label)))

        return tagged, features

    def process_sentence(self, sentence, fes, add_unknown, gazetteer):
        """ Extracts and accumulates the features of the given sentence

        :param unicode sentence: Text of the sentence
        :param dict fes: mapping FE -> chunk
        :param bool add_unknown: Whether to add previously unseen terms to
         the dictionary or use the UNK token instead
        :param dict gazetteer: mapping chunk -> additional features
        :return: List of tokens of the sentence
        """
        tagged, features = self.extract_features(sentence, fes, add_unknown, gazetteer)
        self.features.extend(features)
        return tagged

    def start(self):
        """ Clears the features accumulated so far and starts over. """
        self.features = []

    def get_features(self):
        """ Returns the features accumulated so far

        :return: A sparse matrix with one row per sample and one column per
         known feature, set to 1 where the sample has the feature, and an
         array with the role index of each sample
        :rtype: tuple
        """
        y = []
        data, indices, indptr = [], [], [0]
        for sample, label in self.features:
            y.append(label)
            for feature in sample:
                indices.append(int(feature))
                data.append(1.0)
            # row boundary, in the CSR format
            indptr.append(len(data))

        x = csr_matrix((data, indices, indptr),
                       shape=(len(indptr) - 1, len(self.feature_index.items)),
                       dtype=np.float32)
        return x, np.array(y)

    def __getstate__(self):
        """ State to pickle, the tagger is left out and rebuilt on loading """
        return (self.language, self.unk_index, self.window_width,
                self.role_index.items, self.feature_index.items, self.features)

    def __setstate__(self, state):
        """ Restores the extractor from the tuple built by `__getstate__` """
        # unpacked here because tuple parameters are not valid on python 3
        (language, unk_index, window_width,
         role_index, feature_index, features) = state

        self.__init__(language, window_width)
        self.feature_index.items = feature_index
        self.role_index.items = role_index
        self.features = features
        self.unk_index = unk_index
class BagOfTermsFeatureExtractor(object):
    """ Extracts features from sentences. Will process sentences one by one
    accumulating their features and finalizes them into the final
    training set.

    It should be used to extract features prior to classification,
    in which case the fe arguments can be used to group tokens of
    the same entity into a single chunk while ignoring the actual
    frame element name, e.g. `fes = dict(enumerate(entities))`
    """

    def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

        :param language: The language of the sentences that will be used
        :param window_width: how many tokens to look before and after
         each token when building its features.
        :param collapse_fes: Whether to collapse FEs to a single token
         or to keep them split.
        :param target_size: When truthy, a `TruncatedSVD(target_size)`
         reducer is applied to the vectorized features.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start()

    def start(self):
        """ Clears the samples accumulated so far and starts over. """
        self.samples = []

    def lu_column(self):
        """ Column of the `lu` feature in the vocabulary of the vectorizer,
        None when a target size is set
        """
        return self.vectorizer.vocabulary_['lu'] if not self.target_size else None

    def process_sentence(self, sentence, lu, fes, add_unknown, gazetteer):
        """ Extracts and accumulates features for the given sentence

        Only the tokens belonging to a FE (and which are not stopwords)
        become samples, each labelled with its own FE.

        :param unicode sentence: Text of the sentence
        :param unicode lu: lexical unit of the sentence
        :param dict fes: Dictionary with FEs and corresponding chunks
        :param bool add_unknown: Whether unknown tokens should be added to
         the index or treated as a special, unknown token. Set to True when
         building the training set and to False when building the features
         used to classify new sentences
        :param dict gazetteer: Additional features to add when a given chunk
         is found in the sentence. Keys should be chunks and values should
         be list of features
        :return: List of tuples whose first elements are chunks of words and
         the second ones indicate whether the chunk was used as a sample or
         skipped altogether
        :rtype: list of tuples (chunk, is_sample)
        """
        gazetteer = gazetteer or {}
        tagged = self.sentence_to_tokens(sentence, fes)

        ret = []
        for position, token in enumerate(tagged):
            is_sample = token[2] == 'ENT' and token[0].lower() not in self.stopwords
            ret.append((token[0], is_sample))
            if not is_sample:
                continue

            # add the unknown feature to every sample to trick the dict vectorizer into
            # thinking that there is a feature like that. will be useful when add_unknown
            # is false, because by default the dict vectorizer skips unseen labels
            lu_id = self.lu_index.setdefault(lu, len(self.lu_index))
            sample = {'unk': self.unk_feature, 'lu': lu_id}

            first = max(position - self.window_width, 0)
            last = min(position + self.window_width + 1, len(tagged))
            for i in range(first, last):
                rel = i - position
                term = tagged[i][0]
                self.add_feature_to(sample, 'TERM%+d' % rel, term, add_unknown)
                self.add_feature_to(sample, 'POS%+d' % rel, tagged[i][1], add_unknown)
                self.add_feature_to(sample, 'LEMMA%+d' % rel, tagged[i][2], add_unknown)
                # the key is the same for all the features of a token, so
                # only the last one is kept for a given relative position
                for feat in gazetteer.get(term, []):
                    sample['GAZ%+d' % rel] = feat

            # the label is the one of the sample token itself, not of
            # the last token of its context window
            label = 'O' if len(token) == 3 else token[3]
            self.label_index.setdefault(label, len(self.label_index))
            self.samples.append((sample, label))

        return ret

    def add_feature_to(self, sample, feature_name, feature_value, add_unknown):
        """ Sets a feature of the sample

        :param dict sample: Sample to update
        :param str feature_name: Name of the feature
        :param feature_value: Value of the feature; it is replaced with the
         unknown feature when it is not in the vocabulary and unknown
         values should not be added
        :param bool add_unknown: Whether to accept previously unseen values,
         adding them to the vocabulary
        """
        if add_unknown or feature_value in self.vocabulary:
            sample[feature_name] = feature_value
            self.vocabulary.add(feature_value)
        else:
            sample[feature_name] = self.unk_feature

    def get_features(self, refit):
        """ Returns the final features matrix

        :param bool refit: whether to refit the features or use the previous
         model. use refit=True when training and refit=False when retrieving
         features for classifying unknown samples
        :return: A matrix whose rows are samples and columns are features and
         a row vector with the sample label (i.e. the correct answer for
         the classifier)
        :rtype: tuple
        :raises ValueError: if no sample has been accumulated
        """
        samples, labels = zip(*self.samples)

        if refit:
            features = self.vectorizer.fit_transform(samples)
            if self.target_size:
                features = self.reducer.fit_transform(features)
        else:
            features = self.vectorizer.transform(samples)
            if self.target_size:
                features = self.reducer.transform(features)

        labels = np.array([self.label_index[label] for label in labels])
        return features, labels

    @staticmethod
    def _find_chunk(tagged, fe_tokens):
        """ Finds the first occurrence of the given tokens in the sentence

        Every possible starting position is tried, so that a partial match
        does not hide a full match beginning inside it (e.g. `a b` in
        `a a b`). Tokens already assigned to a FE, i.e. those with more
        than three fields, never match. The comparison is case insensitive.

        :param list tagged: Tokens of the sentence, the term comes first
        :param list fe_tokens: Tokens to look for
        :return: Index of the first matching token, -1 if there is none
        """
        needle = [token.lower() for token in fe_tokens]
        size = len(needle)
        for start in range(len(tagged) - size + 1):
            window = tagged[start:start + size]
            if all(len(token) == 3 and token[0].lower() == term
                   for token, term in zip(window, needle)):
                return start
        return -1

    def sentence_to_tokens(self, sentence, fes):
        """ Transforms a sentence into a list of tokens. Appends the FE type
        to all tokens composing a certain FE and optionally group them into
        a single token.

        :param unicode sentence: Text of the sentence
        :param dict fes: mapping FE -> chunk
        :return: List of tokens
        """
        if not sentence.strip():
            return []

        tagged = self.tagger.tag_one(sentence, skip_unknown=False)

        for fe, chunk in fes.items():
            if chunk is None:
                continue

            fe_tokens = self.tagger.tokenize(chunk)
            if not fe_tokens:
                continue

            position = self._find_chunk(tagged, fe_tokens)
            if position < 0:
                logger.debug('cunk "%s" of fe "%s" not found in sentence "%s". Overlapping chunks?',
                             chunk, fe, sentence)
                continue

            if self.collapse_fes:
                # make a single token with the whole chunk
                pos = 'ENT' if len(fe_tokens) > 1 else tagged[position][1]
                tagged = tagged[:position] + [[chunk, pos, 'ENT', fe]] + \
                    tagged[position + len(fe_tokens):]
            else:
                # set custom lemma and label for the tokens of the FE
                for k in range(position, position + len(fe_tokens)):
                    token, token_pos, _ = tagged[k]
                    tagged[k] = (token, token_pos, 'ENT', fe)

        return tagged

    def __getstate__(self):
        """ State to pickle; tagger and stopwords are left out and rebuilt
        on loading. The LU index comes last, after the fields that were
        already part of the state.
        """
        return (self.language, self.unk_feature, self.window_width, self.samples,
                self.vocabulary, self.label_index, self.vectorizer, self.collapse_fes,
                self.reducer, self.target_size, self.lu_index)

    def __setstate__(self, state):
        """ Restores the extractor from the tuple built by `__getstate__`

        Tuples with ten items only, i.e. without the LU index, are accepted
        too, in which case the LU index starts empty.
        """
        # unpacked here because tuple parameters are not valid on python 3
        (language, unk_feature, window_width, samples, vocabulary, label_index,
         vectorizer, collapse_fes, reducer, target_size) = state[:10]

        self.__init__(language, window_width, collapse_fes, target_size)
        self.samples = samples
        self.vocabulary = vocabulary
        self.unk_feature = unk_feature
        self.label_index = label_index
        self.vectorizer = vectorizer
        self.reducer = reducer
        if len(state) > 10:
            self.lu_index = state[10]