def predict_sentence(self, sent):
    """
    Return a predicted label for each word in an arbitrary length sentence.

    :param sent: a list of string tokens
    :return: a list with one entry per detected predicate:
             ((pred_index, pred_word), labels), where labels has one
             (consolidated_label, probability) list per word in sent.
    """
    ret = []
    sent_str = " ".join(sent)

    # Extract predicates by looking at verbal POS tags
    preds = [(word.i, str(word))
             for word in spacy_ws(sent_str)
             if word.tag_.startswith("V")]

    # Run the RNN once per predicate found in this sentence
    for ind, pred in preds:
        cur_sample = self.create_sample(sent, ind)
        X = self.encode_inputs([cur_sample])
        # "Flatten" the model output and truncate back to the original
        # sentence length (the model operates on padded fixed-size input)
        ret.append(((ind, pred),
                    [[(self.consolidate_label(label), float(prob))
                      for (label, prob) in label_list]
                     for label_list in
                     self.transform_output_probs(
                         self.model.predict(X),
                         get_prob=True)[0][:len(sent)]]))
    return ret
def convert_single_sent(annotated_sent, verbal):
    """
    Return our format for a single annotated sentence.
    Verbal controls whether only verbal extractions should be made.
    From Mesquita's readme: Annotated Sentence: The sentence annotated with
    the entity pair, the trigger and allowed tokens. Entities are enclosed in
    triple square brackets, triggers are enclosed in triple curly brackets and
    the allowed tokens are enclosed in arrows. ("--->" and "<---").
    """
    indexed_tokens = []
    running_ind = 0
    for tok in annotated_sent.split():
        if tok not in SPECIAL_CHARS:
            if "{{{" in tok:
                # Trigger token: plant the running index right after the
                # opening curly-brace boilerplate
                insert_at = tok.index('{{{') + 3
                tok = "{}{}_{}".format(tok[:insert_at],
                                       running_ind,
                                       tok[insert_at:])
                running_ind += 1
            elif not tok.startswith("[[["):
                # Regular token: prefix it with its running index
                tok = "{}_{}".format(running_ind, tok)
                running_ind += 1
        indexed_tokens.append(tok)
    indexed_sent = " ".join(indexed_tokens)

    pred = get_predicate_head(indexed_sent)
    raw_sent = get_raw_sent(indexed_sent)
    doc = spacy_ws(strip_word_index(raw_sent))

    # Filter out sentences with an empty predicate, and — when verbal is
    # set — those whose predicate head is not POS-tagged as a verb
    if not pred:
        return None
    if verbal and not doc[int(pred.split("_")[0])].tag_.startswith("V"):
        return None
    return map(strip_word_index, [raw_sent, pred] + get_entities(indexed_sent))
def encode_inputs(self, sents):
    """
    Given a dataframe which is already split to sentences, encode inputs for
    rnn classification.

    :param sents: list of sentence dataframes; each is expected to carry a
                  single run_id and expose index / word / run_id values.
                  (assumes pandas-like objects — TODO confirm with callers)
    :return: dict mapping input name ("word_inputs", "predicate_inputs",
             "postags_inputs") to a numpy array of encoded sample sequences,
             each padded / truncated to self.sent_maxlen.
    """
    word_inputs = []
    pred_inputs = []
    pos_inputs = []

    # Sanity check - make sure that all sents agree on run_id
    assert all([len(set(sent.run_id.values)) == 1 for sent in sents])

    # Preproc to get the head predicate word for each run_id
    run_id_to_pred = dict([(int(sent.run_id.values[0]),
                            self.get_head_pred_word(sent))
                           for sent in sents])

    # Construct a mapping from running word index to POS tag
    word_id_to_pos = {}
    for sent in sents:
        indices = sent.index.values
        words = sent.word.values
        for index, word in zip(indices, spacy_ws(" ".join(words))):
            word_id_to_pos[index] = word.tag_

    fixed_size_sents = self.get_fixed_size(sents)

    for sent in fixed_size_sents:
        assert len(set(sent.run_id.values)) == 1
        word_indices = sent.index.values
        sent_words = sent.word.values
        # POS tags not present in SPACY_POS_TAGS fall back to index 0
        pos_tags_encodings = [(SPACY_POS_TAGS.index(word_id_to_pos[word_ind])
                               if word_id_to_pos[word_ind] in SPACY_POS_TAGS
                               else 0)
                              for word_ind in word_indices]
        word_encodings = [self.emb.get_word_index(w) for w in sent_words]
        # Same pred word encodings for all words in the sentence
        pred_word = run_id_to_pred[int(sent.run_id.values[0])]
        pred_word_encodings = [self.emb.get_word_index(pred_word)
                               for _ in sent_words]
        word_inputs.append([Sample(w) for w in word_encodings])
        pred_inputs.append([Sample(w) for w in pred_word_encodings])
        pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

    # Pad / truncate to desired maximum length
    ret = defaultdict(list)
    for name, sequence in zip(
            ["word_inputs", "predicate_inputs", "postags_inputs"],
            [word_inputs, pred_inputs, pos_inputs]):
        for samples in pad_sequences(sequence,
                                     pad_func=lambda: Pad_sample(),
                                     maxlen=self.sent_maxlen):
            ret[name].append([sample.encode() for sample in samples])
    return {k: np.array(v) for k, v in ret.iteritems()}