def get_event_vector(self, event_input, include_all_pobj=True):
    if include_all_pobj:
        assert isinstance(event_input, IndexedEventMultiPobj), \
            'event_input must be a {} instance ' \
            'when include_all_pobj=True'.format(
                get_class_name(IndexedEventMultiPobj))
    else:
        assert isinstance(event_input, IndexedEvent), \
            'event_input must be a {} instance ' \
            'when include_all_pobj=False'.format(
                get_class_name(IndexedEvent))
    # initialize event vector to be all zero
    vector = np.zeros(self.embedding_model.vector_size)
    # add vector for predicate
    pred_vector = self.embedding_model.get_index_vec(
        event_input.get_predicate())
    if pred_vector is not None:
        vector += pred_vector
    else:
        return None
    # add vectors for all arguments
    for arg_input in event_input.get_all_argument():
        arg_vector = self.embedding_model.get_index_vec(arg_input)
        if arg_vector is not None:
            vector += arg_vector
    return vector

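# A minimal, self-contained sketch of the additive composition used in
# get_event_vector above: the event vector is the sum of the predicate
# vector and every argument vector that can be looked up. ToyEmbeddingModel
# below is hypothetical and only stands in for self.embedding_model
# (get_index_vec returning None for unknown indices).
import numpy as np

class ToyEmbeddingModel(object):
    vector_size = 4

    def __init__(self, vectors):
        self._vectors = vectors  # dict: index -> np.ndarray

    def get_index_vec(self, index):
        return self._vectors.get(index)  # None for an unknown index

toy_model = ToyEmbeddingModel({0: np.ones(4), 1: np.full(4, 2.0)})
event_vec = np.zeros(toy_model.vector_size)
event_vec += toy_model.get_index_vec(0)   # predicate vector
for arg_index in [1, 99]:                 # 99 is unknown, so it is skipped
    arg_vec = toy_model.get_index_vec(arg_index)
    if arg_vec is not None:
        event_vec += arg_vec
print(event_vec)  # [3. 3. 3. 3.]
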
def __init__(self, left_event, pos_event, neg_event, pos_arg_idx,
             neg_arg_idx, pos_salience, neg_salience):
    assert isinstance(left_event, IndexedEvent), \
        'left_event must be a {} instance'.format(
            get_class_name(IndexedEvent))
    self.left_event = deepcopy(left_event)
    assert isinstance(pos_event, IndexedEvent), \
        'pos_event must be a {} instance'.format(
            get_class_name(IndexedEvent))
    self.pos_event = deepcopy(pos_event)
    assert isinstance(neg_event, IndexedEvent), \
        'neg_event must be a {} instance'.format(
            get_class_name(IndexedEvent))
    self.neg_event = deepcopy(neg_event)
    assert pos_arg_idx in [1, 2, 3], \
        'pos_arg_idx must be 1 (for subj), 2 (for obj), or 3 (for pobj)'
    self.pos_arg_idx = pos_arg_idx
    assert neg_arg_idx in [1, 2, 3], \
        'neg_arg_idx must be 1 (for subj), 2 (for obj), or 3 (for pobj)'
    self.neg_arg_idx = neg_arg_idx
    # extra features for entity salience
    assert isinstance(pos_salience, EntitySalience), \
        'pos_salience must be a {} instance'.format(
            get_class_name(EntitySalience))
    self.pos_salience = pos_salience
    assert isinstance(neg_salience, EntitySalience), \
        'neg_salience must be a {} instance'.format(
            get_class_name(EntitySalience))
    self.neg_salience = neg_salience

def parse_args(self, treebank_reader, corenlp_reader):
    assert isinstance(treebank_reader, TreebankReader), \
        'treebank_reader must be a {} instance'.format(
            get_class_name(TreebankReader))
    assert isinstance(corenlp_reader, CoreNLPReader), \
        'corenlp_reader must be a {} instance'.format(
            get_class_name(CoreNLPReader))
    for label in self.imp_args:
        for arg in self.imp_args[label]:
            arg.get_treebank(treebank_reader)
            arg.parse_treebank()
            arg.parse_corenlp(corenlp_reader)
    for label, fillers in self.exp_args.items():
        for arg in fillers:
            arg.get_treebank(treebank_reader)
            arg.parse_treebank()
            arg.parse_corenlp(corenlp_reader)
        if label in core_arg_list and len(fillers) > 1:
            assert len(fillers) == 2
            new_fillers = []
            for arg in fillers:
                # remove the pointer pointing to a WH-determiner
                if arg.tree.pos()[arg.tree_pointer.wordnum][1] != 'WDT':
                    new_fillers.append(arg)
            # there should be exactly one non-WH-determiner pointer left
            assert len(new_fillers) == 1
            self.exp_args[label] = new_fillers

def __init__(self, core, salience):
    assert isinstance(core, CoreArgument), \
        'RichEntity must be initialized with a {} instance'.format(
            get_class_name(CoreArgument))
    self.core = core
    assert isinstance(salience, EntitySalience), \
        'RichEntity must be initialized with a {} instance'.format(
            get_class_name(EntitySalience))
    self.salience = salience

def __init__(self, doc_name, entities, events):
    self.doc_name = doc_name
    if not all(isinstance(entity, Entity) for entity in entities):
        raise ParseScriptError('every entity must be a {} instance'.format(
            get_class_name(Entity)))
    self.entities = entities
    if not all(isinstance(event, Event) for event in events):
        raise ParseScriptError('every event must be a {} instance'.format(
            get_class_name(Event)))
    self.events = events

def get_index(self, model, include_type=True, use_unk=True,
              pred_count_dict=None):
    assert isinstance(model, Word2VecModel), \
        'model must be a {} instance'.format(get_class_name(Word2VecModel))
    self.rich_pred.get_index(model, include_type=include_type,
                             use_unk=use_unk,
                             pred_count_dict=pred_count_dict)
    if self.rich_subj is not None:
        self.rich_subj.get_index(model, include_type=include_type,
                                 use_unk=use_unk)
    if self.rich_obj is not None:
        self.rich_obj.get_index(model, include_type=include_type,
                                use_unk=use_unk)
    for rich_pobj in self.rich_pobj_list:
        rich_pobj.get_index(model, include_type=include_type,
                            use_unk=use_unk)
    # select the first argument with an indexed positive candidate and at
    # least one indexed negative candidate from rich_pobj_list as rich_pobj
    for rich_pobj in self.rich_pobj_list:
        if rich_pobj.has_neg():
            self.rich_pobj = rich_pobj
            break

def build(cls, event, rich_entity_list, prep_vocab_list, use_lemma=True):
    assert isinstance(event, Event), 'event must be a {} instance'.format(
        get_class_name(Event))
    rich_pred = RichPredicate.build(event.pred, use_lemma=use_lemma)
    rich_subj = None
    if event.subj is not None:
        rich_subj = BaseRichArgument.build('SUBJ', event.subj,
                                           rich_entity_list,
                                           use_lemma=use_lemma)
    rich_obj = None
    if event.obj is not None:
        rich_obj = BaseRichArgument.build('OBJ', event.obj,
                                          rich_entity_list,
                                          use_lemma=use_lemma)
    rich_pobj_list = []
    for prep, pobj in event.pobj_list:
        arg_type = 'PREP_' + prep if prep in prep_vocab_list else 'PREP'
        rich_pobj = BaseRichArgument.build(arg_type, pobj,
                                           rich_entity_list,
                                           use_lemma=use_lemma)
        rich_pobj_list.append(rich_pobj)
    return cls(rich_pred, rich_subj, rich_obj, rich_pobj_list)

def add_sent(self, sent):
    assert isinstance(sent, Sentence), \
        'add_sent must be called with a {} instance'.format(
            get_class_name(Sentence))
    sent.build_dep_graph()
    sent.process_verb_prt()
    self.sents.append(sent)

def __init__(self, word2vec, event_vector_layer_sizes=None,
             pair_composition_layer_sizes=None, use_salience=True,
             salience_features=None):
    assert isinstance(word2vec, Word2VecModel), \
        'word2vec must be a {} instance'.format(
            get_class_name(Word2VecModel))
    self.word2vec = word2vec
    if event_vector_layer_sizes:
        self.event_vector_network = EventVectorNetwork(
            word_vectors=self.word2vec.get_vector_matrix(),
            vector_size=self.word2vec.vector_size,
            layer_sizes=event_vector_layer_sizes)
    else:
        self.event_vector_network = None
    if pair_composition_layer_sizes:
        self.pair_composition_network = PairCompositionNetwork(
            event_vector_network=self.event_vector_network,
            layer_sizes=pair_composition_layer_sizes,
            use_salience=use_salience,
            salience_features=salience_features)
    else:
        self.pair_composition_network = None

def set_embedding_model(self, embedding_model):
    assert isinstance(embedding_model, Word2VecModel), \
        'embedding_model must be a {} instance'.format(
            get_class_name(Word2VecModel))
    self.logger.info('set embedding model: {}'.format(
        embedding_model.name))
    self.embedding_model = embedding_model
    self.embedding_model_name = embedding_model.name

def build(cls, script, prep_vocab_list, use_lemma=True,
          filter_stop_events=False):
    assert isinstance(script, Script), \
        'script must be a {} instance'.format(get_class_name(Script))
    # FIXME: should use the token count of original document
    token_count_dict = script.get_token_count(use_lemma=use_lemma)
    rich_entity_list = []
    for entity in script.entities:
        rich_entity = RichEntity.build(entity, token_count_dict,
                                       use_lemma=use_lemma)
        rich_entity_list.append(rich_entity)
    rich_events = []
    for event in script.events:
        rich_event = RichEvent.build(event,
                                     rich_entity_list=rich_entity_list,
                                     prep_vocab_list=prep_vocab_list,
                                     use_lemma=use_lemma)
        if (not filter_stop_events) or \
                (rich_event.rich_pred.get_text(include_type=False)
                 not in consts.STOP_PREDS):
            rich_events.append(rich_event)
    return cls(script.doc_name, rich_events, rich_entity_list)

def __init__(self, mentions):
    if not mentions:
        raise ParseEntityError('must provide at least one mention')
    if not all(isinstance(mention, Mention) for mention in mentions):
        raise ParseEntityError(
            'every mention must be a {} instance'.format(
                get_class_name(Mention)))
    self.mentions = mentions
    self._rep_mention = None
    for mention in self.mentions:
        if mention.rep:
            if self._rep_mention is None:
                self._rep_mention = mention
            else:
                raise ParseEntityError(
                    'cannot have more than one representative mention')
    if self._rep_mention is None:
        raise ParseEntityError('no representative mention provided')
    # NOBUG: set self.ner to the most frequent ner among all mentions,
    # which might be different from the ner of rep_mention
    ner_counter = Counter()
    for mention in self.mentions:
        if mention.ner != '':
            ner_counter[mention.ner] += 1
    if len(ner_counter):
        self.ner = ner_counter.most_common(1)[0][0]
    else:
        self.ner = ''

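# A small sketch of the "most frequent NER" selection above, using
# collections.Counter over the non-empty labels. The mention labels here
# are made up purely for illustration.
from collections import Counter

ner_labels = ['PERSON', '', 'PERSON', 'ORGANIZATION']
ner_counter = Counter(label for label in ner_labels if label != '')
most_common_ner = ner_counter.most_common(1)[0][0] if ner_counter else ''
print(most_common_ner)  # PERSON
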
def from_coref(cls, coref):
    if not isinstance(coref, document.Coreference):
        raise ParseEntityError(
            'from_coref must be called with a {} instance'.format(
                get_class_name(document.Coreference)))
    return cls(
        [Mention.from_mention(mention) for mention in coref.mentions])

def __init__(self, node_list):
    if len(node_list) <= 1:
        raise ParseNodeError('only 1 node provided in {}'.format(
            map(str, node_list)))
    if not all(isinstance(n, Node) for n in node_list):
        raise ParseNodeError('every node must be a {} instance'.format(
            get_class_name(Node)))
    if not all(node_list[0].file_id == n.file_id for n in node_list[1:]):
        raise ParseNodeError('inconsistency in file_id in {}'.format(
            map(str, node_list)))
    if not all(node_list[0].sent_id == n.sent_id for n in node_list[1:]):
        raise ParseNodeError('inconsistency in sent_id in {}'.format(
            map(str, node_list)))
    self.node_list = sorted(node_list, key=lambda n: n.token_id)
    self.file_id = node_list[0].file_id
    self.sent_id = node_list[0].sent_id
    # Penn TreeBank related info
    self.ptb_idx_list = []
    self.ptb_surface = ''
    # Stanford CoreNLP related info
    self.corenlp_idx_list = []
    self.corenlp_word_surface = ''
    self.corenlp_lemma_surface = ''
    self.head_idx = -1
    self.head_word = ''
    self.head_node = None

def get_index(self, model, include_type=True, use_unk=True,
              pred_count_dict=None):
    # TODO: add logic to process stop predicates
    assert isinstance(model, Word2VecModel), \
        'model must be a {} instance'.format(get_class_name(Word2VecModel))
    candidates = self.get_candidates()
    # add UNK to the candidates if use_unk is set to True
    if use_unk:
        candidates.append('UNK')
    # drop the predicate (return index -1) if its frequency is too high,
    # using consts.PRED_COUNT_THRES (100,000) as the count threshold
    if candidates and pred_count_dict:
        pred_count = pred_count_dict.get(candidates[0], 0)
        if pred_count > consts.PRED_COUNT_THRES:
            if random.random() < 1.0 - math.sqrt(
                    float(consts.PRED_COUNT_THRES) / pred_count):
                self.wv = -1
                return
    if include_type:
        candidates = [candidate + '-PRED' for candidate in candidates]
    index = -1
    for text in candidates:
        index = model.get_word_index(text)
        if index != -1:
            break
    self.wv = index

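# The predicate-dropping rule above follows word2vec-style frequency
# subsampling: once a predicate's count exceeds the threshold, it is dropped
# with probability 1 - sqrt(threshold / count). A quick worked example,
# using the 100,000 threshold mentioned in the comment above:
import math

PRED_COUNT_THRES = 100000
for count in [100000, 200000, 400000, 1600000]:
    drop_prob = max(0.0, 1.0 - math.sqrt(float(PRED_COUNT_THRES) / count))
    print(count, round(drop_prob, 3))
# drop probabilities: 0.0, 0.293, 0.5, 0.75
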
def __repr__(self):
    return '%s\n%s' % (
        get_class_name(self),
        '\n'.join(' '.join(mapl(str, locs)) for locs in self._loc_matrix))

def _create_service(self, service_package, base_package, db_info):
    if not os.path.exists(service_package):
        os.makedirs(service_package)
    for table_info in db_info.items():
        # create a corresponding Service interface for each table
        table_name = table_info[0]
        columns = table_info[1]
        # find the primary key
        primary_key, primary_key_type, primary_key_type_short, \
            primary_key_jdbc_type = self._get_primary_key(columns)
        # file content
        content = ''
        class_name = util.get_class_name(table_name)
        file_name = os.path.join(service_package,
                                 class_name + 'Service.java')
        package_str = 'package ' + base_package + '.service;'
        import_str = 'import ' + base_package + '.pojo.' + class_name + ';'
        class_str = 'public interface ' + class_name + 'Service {'
        class_str += '\n'
        class_str += '\tint deleteByPrimaryKey(' + primary_key_type_short \
            + ' ' + primary_key + ');\n\n'
        class_str += '\tint insert(' + class_name + ' record);\n\n'
        class_str += '\tint insertSelective(' + class_name + ' record);\n\n'
        class_str += '\t' + class_name + ' selectByPrimaryKey(' \
            + primary_key_type_short + ' ' + primary_key + ');\n\n'
        class_str += '\tint updateByPrimaryKeySelective(' + class_name \
            + ' record);\n\n'
        class_str += '\tint updateByPrimaryKey(' + class_name + ' record);\n'
        class_str += '}'
        content += package_str
        content += '\n\n'
        content += import_str
        content += '\n\n'
        content += class_str
        self._write_file(content, file_name)

def get_sent(self, idx):
    assert 0 <= idx < len(self.sents), \
        '{} out of sentence index'.format(idx)
    result = self.sents[idx]
    assert isinstance(result, Sentence), \
        'return value of get_sent must be a {} instance'.format(
            get_class_name(Sentence))
    return result

def get_coref(self, idx):
    assert 0 <= idx < len(self.corefs), \
        '{} out of coreference index'.format(idx)
    result = self.corefs[idx]
    assert isinstance(result, Coreference), \
        'return value of get_coref must be a {} instance'.format(
            get_class_name(Coreference))
    return result

def __init__(self, arg_pointer, dice_score, core, entity_salience):
    assert isinstance(arg_pointer, RichTreePointer), \
        'arg_pointer must be a {} instance'.format(
            get_class_name(RichTreePointer))
    self.arg_pointer = arg_pointer
    self.dice_score = dice_score
    assert isinstance(core, CoreArgument), \
        'core must be a {} instance'.format(get_class_name(CoreArgument))
    self.core = deepcopy(core)
    assert isinstance(entity_salience, EntitySalience) or \
        entity_salience is None, \
        'entity_salience must be a {} instance or None'.format(
            get_class_name(EntitySalience))
    self.entity_salience = entity_salience

def get_mention(self, idx):
    assert 0 <= idx < len(self.mentions), \
        '{} out of mention index'.format(idx)
    result = self.mentions[idx]
    assert isinstance(result, Mention), \
        'return value of get_mention must be a {} instance'.format(
            get_class_name(Mention))
    return result

def add_token(self, token):
    assert isinstance(token, Token), \
        'add_token must be called with a {} instance'.format(
            get_class_name(Token))
    # set the sent_idx attrib of the token
    token.set_attrib('sent_idx', self.idx)
    # set the token_idx attrib of the token
    token.set_attrib('token_idx', len(self.tokens))
    self.tokens.append(token)

def __init__(self, pred, neg=False):
    assert pred is not None and isinstance(pred, document.Token), \
        'Predicate must be a {} instance'.format(
            get_class_name(document.Token))
    self.pred = pred
    self.neg = neg
    self.subj = None
    self.obj = None
    self.pobj_list = []

def from_token(cls, token):
    if not isinstance(token, document.Token):
        raise ParseTokenError(
            'from_token must be called with a {} instance'.format(
                get_class_name(document.Token)))
    word = token.word
    lemma = token.lemma
    pos = token.pos
    return cls(word, lemma, pos)

def __init__(self, pred, subj, obj, pobj_list):
    if not isinstance(pred, Predicate):
        raise ParseEventError('pred must be a {} instance'.format(
            get_class_name(Predicate)))
    self.pred = pred
    if not (subj is None or isinstance(subj, Argument)):
        raise ParseEventError('subj must be None or a {} instance'.format(
            get_class_name(Argument)))
    self.subj = subj
    if not (obj is None or isinstance(obj, Argument)):
        raise ParseEventError('obj must be None or a {} instance'.format(
            get_class_name(Argument)))
    self.obj = obj
    if not all(prep != '' for prep, _ in pobj_list):
        warn('some preps in pobj_list are empty')
    if not all(isinstance(pobj, Argument) for _, pobj in pobj_list):
        raise ParseEventError('every pobj must be a {} instance'.format(
            get_class_name(Argument)))
    self.pobj_list = pobj_list

def _generate_getters(columns):
    getters_str = ''
    for column in columns:
        name = column['name']
        db_type = column['type']
        method_name = util.get_class_name(name)
        field_name = util.get_field_name(name)
        java_type = util.get_java_type(db_type)
        getters_str += '\tpublic ' + java_type + ' get' + method_name \
            + '() { return ' + field_name + '; }\n'
    return getters_str

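# A hedged usage sketch for the getter generation above. It assumes
# _generate_getters is in scope (pasted into the same module) and stubs the
# project's util helpers (get_class_name, get_field_name, get_java_type)
# with assumed behaviour: snake_case column names to CamelCase/camelCase and
# a small DB-to-Java type map. The real util module may behave differently.
from types import SimpleNamespace

def _camel(name):
    return ''.join(part.capitalize() for part in name.split('_'))

util = SimpleNamespace(  # stand-in for the real util module
    get_class_name=_camel,
    get_field_name=lambda name: _camel(name)[0].lower() + _camel(name)[1:],
    get_java_type=lambda db_type: {
        'int': 'Integer', 'bigint': 'Long',
        'varchar': 'String'}.get(db_type, 'String'),
)

columns = [{'name': 'user_id', 'type': 'bigint'},
           {'name': 'user_name', 'type': 'varchar'}]
print(_generate_getters(columns))
# \tpublic Long getUserId() { return userId; }
# \tpublic String getUserName() { return userName; }
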
def construct(cls, doc):
    assert isinstance(doc, document.Document), \
        'construct must be called with a {} instance'.format(
            get_class_name(document.Document))
    script = cls(doc.doc_name)
    for sent in doc.sents:
        script.read_from_sentence(sent)
    for coref in doc.corefs:
        script.add_coref(coref)
    script.sort()
    return script

def _generate_setters(columns):
    setters_str = ''
    for column in columns:
        name = column['name']
        db_type = column['type']
        method_name = util.get_class_name(name)
        field_name = util.get_field_name(name)
        java_type = util.get_java_type(db_type)
        setters_str += '\tpublic void set' + method_name + '(' + java_type \
            + ' ' + field_name + ') { this.' + field_name + ' = ' \
            + field_name + '; }\n'
    return setters_str

def build_pred_wv_mapping(pred_list, model):
    assert isinstance(model, Word2VecModel), \
        'model must be a {} instance'.format(get_class_name(Word2VecModel))
    pred_wv_mapping = {}
    for pred in pred_list:
        index = model.get_word_index(pred + '-PRED')
        assert index != -1
        pred_wv_mapping[pred] = index
    return pred_wv_mapping

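# A small sketch of the mapping built above: each predicate lemma is looked
# up in the embedding vocabulary with a '-PRED' suffix and the resulting
# word index is recorded. Because build_pred_wv_mapping asserts a real
# Word2VecModel, the sketch reproduces the lookup inline; FakeModel is
# hypothetical and only mimics the assumed get_word_index behaviour.
class FakeModel(object):
    def __init__(self, vocab):
        self._vocab = vocab  # dict: word -> index

    def get_word_index(self, word):
        return self._vocab.get(word, -1)

fake_model = FakeModel({'say-PRED': 0, 'go-PRED': 1})
pred_wv_mapping = {
    pred: fake_model.get_word_index(pred + '-PRED')
    for pred in ['say', 'go']}
print(pred_wv_mapping)  # {'say': 0, 'go': 1}
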
def add_mention(self, mention):
    assert isinstance(mention, Mention), \
        'add_mention must be called with a {} instance'.format(
            get_class_name(Mention))
    # set the coref_idx attrib of the mention
    mention.set_attrib('coref_idx', self.idx)
    # set the mention_idx attrib of the mention
    mention.set_attrib('mention_idx', len(self.mentions))
    self.mentions.append(mention)
    if mention.rep:
        self.rep_mention = mention