def get_coefs_helper(clf, feature_names: List[str] = None, coef_func=None):
    """
    Get dataframe of classifier coefficients. By default, assumes it is a pipeline with a logistic regression
    component stored under the step name 'logreg'.

    :param clf: classifier model
    :param feature_names: list of feature names to get coefficients for; must have one entry per coefficient
    :param coef_func: function for accessing the list of coefficients from the classifier model; when None, the
        coefficients are read from clf.named_steps['logreg'].coef_[0]
    :return: DataFrame of features and coefficients, indexed by feature names and sorted by descending coefficient;
        None (after a warning) if the default coefficient getter does not apply to clf
    :raises TypeError: if feature_names is None
    :raises AssertionError: if feature_names and the coefficients differ in length
    """
    if coef_func is None:
        try:
            coefs = clf.named_steps['logreg'].coef_[0].tolist()
        except (AttributeError, KeyError):
            # AttributeError: clf has no named_steps / coef_; KeyError: there is no step called 'logreg'
            warn("Classifier is not a pipeline with a logistic regression component, so default coefficient getter "
                 "function did not work. Choose a valid coef_func argument.")
            return None
    else:
        coefs = coef_func(clf)

    if feature_names is None:
        raise TypeError("feature_names must be provided: one name per coefficient.")
    assert len(feature_names) == len(coefs), \
        "feature_names has {} entries but the classifier has {} coefficients".format(len(feature_names), len(coefs))

    feats_coefs = sorted(zip(feature_names, coefs), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(feats_coefs, columns=['feat_name', 'coef'])\
        .set_index('feat_name').sort_values('coef', ascending=False)
def __setitem__(self, key, value):
    """
    Store `value` under `key` and keep the index's type record for `key` in step.

    Non-string keys are converted with str() after a warning. The index is only consulted when
    `self.index.type_check` is truthy and `value` is not None.
    """
    if not isinstance(key, str):
        warn("Metadata attribute keys must be strings. Input key has been casted to a string.")
        key = str(key)

    # A None value never touches the index.
    if self.index.type_check and value is not None:
        if key not in self.index.indices[self.obj_type]:
            # first value recorded for this key
            self.index.update_index(self.obj_type, key=key, class_type=_optimized_type_check(value))
        else:
            recorded_types = self.index.get_index(self.obj_type)[key]
            # a key already recorded as "bin" needs no further checks
            if recorded_types != ["bin"] and str(type(value)) not in recorded_types:
                inferred = _optimized_type_check(value)
                if inferred == "bin":
                    self.index.set_index(self.obj_type, key, "bin")
                else:
                    self.index.update_index(self.obj_type, key, inferred)

    dict.__setitem__(self, key, value)
def set_id(self, value):
    """
    Set the private `_id` attribute.

    Strings and None are stored unchanged; any other value is stored as str(value) and a warning naming
    `self.obj_type` is emitted.
    """
    if value is None or isinstance(value, str):
        self._id = value
        return

    # Store the converted id first, then tell the caller about the conversion.
    self._id = str(value)
    warn("{} id must be a string. ID input has been casted to a string.".format(self.obj_type))
def __setitem__(self, key, value):
    """
    Store `value` under `key`, converting non-string keys to strings (with a warning).

    When `self.index.type_check` is truthy, ConvoKitMeta._check_type_and_update_index is called with the index,
    the object type, the (possibly converted) key and the value before the entry is written.
    """
    key_is_string = isinstance(key, str)
    if not key_is_string:
        warn("Metadata attribute keys must be strings. Input key has been casted to a string.")
        key = str(key)

    index = self.index
    if index.type_check:
        ConvoKitMeta._check_type_and_update_index(index, self.obj_type, key, value)

    dict.__setitem__(self, key, value)
def initialize_speakers_and_utterances_objects(corpus, utt_dict, utterances, speakers_dict, speakers_data):
    """
    Initialize Speaker and Utterance objects from raw utterance records.

    Each record in `utterances` becomes an Utterance stored in `utt_dict` under its id. A Speaker is created the
    first time its key is seen and stored in `speakers_dict`; a speaker key missing from `speakers_data` gets an
    empty entry there (with a warning). `utt_dict`, `speakers_dict` and `speakers_data` are mutated in place and
    nothing is returned.
    """
    if len(utterances) == 0:  # utterances might be empty for invalid corpus start/end indices
        return

    # Field names are picked from the first record, falling back to "user" / "root" when the
    # "speaker" / "conversation_id" keys are absent from it.
    first_record = utterances[0]
    speaker_field = "speaker" if "speaker" in first_record else "user"
    convo_field = "conversation_id" if "conversation_id" in first_record else "root"

    for raw_utt in utterances:
        record = defaultdict(lambda: None, raw_utt)  # absent fields read as None
        speaker_id = record[speaker_field]

        if speaker_id not in speakers_dict:
            if speaker_id not in speakers_data:
                warn("CorpusLoadWarning: Missing speaker metadata for speaker ID: {}. "
                     "Initializing default empty metadata instead.".format(speaker_id))
                speakers_data[speaker_id] = {}
            speaker_info = speakers_data[speaker_id]
            # The metadata is either nested under KeyMeta or is the whole speaker entry.
            speaker_meta = speaker_info[KeyMeta] if KeyMeta in speaker_info else speaker_info
            speakers_dict[speaker_id] = Speaker(owner=corpus, id=speaker_id, meta=speaker_meta)

        speaker = speakers_dict[speaker_id]
        speaker.vectors = speakers_data[speaker_id].get(KeyVectors, [])

        # temp fix for reddit reply_to
        reply_to_data = record["reply_to"] if "reply_to" in record else record[KeyReplyTo]

        utt = Utterance(owner=corpus,
                        id=record[KeyId],
                        speaker=speaker,
                        conversation_id=record[convo_field],
                        reply_to=reply_to_data,
                        timestamp=record[KeyTimestamp],
                        text=record[KeyText],
                        meta=record[KeyMeta])
        utt.vectors = record.get(KeyVectors, [])
        utt_dict[utt.id] = utt
def __delitem__(self, key):
    """
    Delete the metadata attribute `key`.

    For a 'corpus' object the entry is removed and then dropped from the index. For any other object type the
    entry is removed only when `self.index.lock_metadata_deletion[self.obj_type]` is falsy; otherwise nothing is
    deleted and a warning points to corpus.delete_metadata().
    """
    if self.obj_type == 'corpus':
        dict.__delitem__(self, key)
        self.index.del_from_index(self.obj_type, key)
        return

    if not self.index.lock_metadata_deletion[self.obj_type]:
        dict.__delitem__(self, key)
        return

    warn("For consistency in metadata attributes in Corpus component objects, deleting metadata attributes "
         "from component objects individually is not allowed. "
         "To delete this metadata attribute from all Corpus components of this type, "
         "use corpus.delete_metadata(obj_type='{}', attribute='{}') instead."
         .format(self.obj_type, key))
def from_dir(dirpath, matrix_name):
    """
    Initialize a ConvoKitMatrix of the specified `matrix_name` from a specified directory `dirpath`.

    The matrix is unpickled from the file 'vectors.<matrix_name>.p' inside `dirpath`. If the loaded object's
    `_sparse` flag is falsy, its `matrix` attribute is replaced by `matrix.toarray()`.

    :param dirpath: path to Corpus directory
    :param matrix_name: name of vector matrix
    :return: the initialized ConvoKitMatrix, or None (after a warning) if the file does not exist
    """
    matrix_path = os.path.join(dirpath, 'vectors.{}.p'.format(matrix_name))
    try:
        # NOTE: unpickling can execute arbitrary code; only load files from trusted sources.
        with open(matrix_path, 'rb') as matrix_file:
            loaded = pickle.load(matrix_file)
        if not loaded._sparse:
            loaded.matrix = loaded.matrix.toarray()
        return loaded
    except FileNotFoundError:
        warn("Could not find vector with name: {} at {}.".format(matrix_name, dirpath))
        return None
def __setitem__(self, key, value):
    """
    Store `value` under `key`, registering the key in the Corpus index the first time it is seen.

    Non-string keys are converted with str() after a warning. For a key not yet present in
    `self.index.indices[self.obj_type]`, the index records str(type(value)) when the value can be serialized by
    json.dumps, and "bin" otherwise. Keys already in the index are not re-registered.
    """
    if not isinstance(key, str):
        warn("Metadata keys must be strings. Input key has been casted to a string.")
        key = str(key)

    if key not in self.index.indices[self.obj_type]:  # update Corpus index
        # Only the serializability probe is guarded, so an error raised by the index itself is
        # not mistaken for an unserializable value.
        try:
            json.dumps(value)
        except (TypeError, OverflowError, ValueError):
            # unserializable; ValueError is what json.dumps raises for circular references
            class_type = "bin"
        else:
            class_type = str(type(value))
        self.index.update_index(self.obj_type, key=key, class_type=class_type)

    dict.__setitem__(self, key, value)
def check_integrity(self, verbose: bool = True) -> bool:
    """
    Check the integrity of this Conversation; i.e. do the constituent utterances form a complete reply-to chain?

    The check fails when any reply-to target other than None is not an utterance of this Conversation, when no
    utterance or more than one utterance replies to None, or when an utterance replies to itself. Cycles spanning
    several utterances are not looked for.

    :param verbose: whether to print errors indicating the problems with the Conversation
    :return: True if the conversation structure is complete else False
    """
    if verbose:
        print("Checking reply-to chain of Conversation", self.id)

    reply_to_by_id = {utt.id: utt.reply_to for utt in self.iter_utterances()}

    # Reply-to targets that are not utterances of this conversation. There should only be one: None.
    unresolved_targets = set(reply_to_by_id.values()) - set(reply_to_by_id.keys())
    if unresolved_targets != {None}:
        if verbose:
            for missing_id in unresolved_targets:
                if missing_id is not None:
                    warn("ERROR: Missing utterance {}".format(missing_id))
        return False

    # sanity check
    num_replying_to_none = sum(1 for utt in self.iter_utterances() if utt.reply_to is None)
    if num_replying_to_none > 1:
        if verbose:
            warn("ERROR: Found more than one Utterance replying to None.")
        return False

    self_replies = [utt_id for utt_id, target in reply_to_by_id.items() if utt_id == target]
    if self_replies:
        if verbose:
            warn("ERROR: Found utterances with .reply_to pointing to themselves: {}".format(self_replies))
        return False

    if verbose:
        print("No issues found.\n")
    return True
def check_integrity(self, verbose=True):
    """
    Check the integrity of this Conversation; i.e. do the constituent utterances form a complete reply-to chain?

    The check fails when any reply-to target other than None is not an utterance of this Conversation, when no
    utterance or more than one utterance replies to None, or when an utterance replies to itself.

    :param verbose: whether to print errors indicating the problems with the Conversation
    :return: True if the conversation structure is complete else False
    """
    if verbose:
        print("Checking reply-to chain of Conversation", self.id)

    utt_reply_tos = {utt.id: utt.reply_to for utt in self.iter_utterances()}
    target_utt_ids = set(utt_reply_tos.values())
    speaker_utt_ids = set(utt_reply_tos)
    root_utt_id = target_utt_ids - speaker_utt_ids  # There should only be 1 root_utt_id: None

    if root_utt_id != {None}:
        if verbose:
            for utt_id in root_utt_id:
                if utt_id is not None:
                    warn("ERROR: Missing utterance {}".format(utt_id))
        return False

    # sanity check
    utts_replying_to_none = sum(1 for utt in self.iter_utterances() if utt.reply_to is None)
    if utts_replying_to_none > 1:
        if verbose:
            warn("ERROR: Found more than one Utterance replying to None.")
        return False

    # An utterance that replies to itself is its own reply-to target, so the root check above
    # cannot see it; it has to be looked for explicitly.
    circular = [utt_id for utt_id, utt_reply_to in utt_reply_tos.items() if utt_id == utt_reply_to]
    if circular:
        if verbose:
            warn("ERROR: Found utterances with .reply_to pointing to themselves: {}".format(circular))
        return False

    if verbose:
        print("No issues found.\n")
    return True
def __init__(self, owner=None, id: Optional[str] = None, speaker: Optional[Speaker] = None,
             user: Optional[Speaker] = None, conversation_id: Optional[str] = None,
             root: Optional[str] = None, reply_to: Optional[str] = None,
             timestamp: Optional[int] = None, text: str = '',
             meta: Optional[Dict] = None):
    """
    Initialize an utterance.

    :param owner: passed through to the parent initializer
    :param id: passed through to the parent initializer
    :param speaker: the Speaker of this utterance
    :param user: fallback for `speaker`, used only when `speaker` is None
    :param conversation_id: id of the conversation this utterance belongs to; non-string values are converted
        with str() after a warning
    :param root: fallback for `conversation_id`, used only when `conversation_id` is None
    :param reply_to: stored unchanged as `self.reply_to`
    :param timestamp: stored unchanged as `self.timestamp`
    :param text: text of the utterance; None becomes '' and any other non-string value is converted with
        str(), both after a warning
    :param meta: passed through to the parent initializer
    :raises ValueError: if neither `speaker` nor `user` is given
    """
    super().__init__(obj_type="utterance", owner=owner, id=id, meta=meta)

    speaker_ = speaker if speaker is not None else user
    self.speaker = speaker_
    if self.speaker is None:
        raise ValueError("No Speaker found: Utterance must be initialized with a Speaker.")
    # For backwards compatibility. Mirror the resolved speaker so that an utterance built with
    # the `user` argument does not end up with `user` set to None.
    self.user = speaker_

    self.conversation_id = conversation_id if conversation_id is not None else root
    if self.conversation_id is not None and not isinstance(self.conversation_id, str):
        warn("Utterance conversation_id must be a string: conversation_id of utterance with ID: {} "
             "has been casted to a string.".format(self.id))
        self.conversation_id = str(self.conversation_id)
    self._root = self.conversation_id

    self.reply_to = reply_to
    self.timestamp = timestamp

    if not isinstance(text, str):
        warn("Utterance text must be a string: text of utterance with ID: {} "
             "has been casted to a string.".format(self.id))
        text = '' if text is None else str(text)
    self.text = text
def matrix(self):
    """
    Emit a warning that the internal matrix cannot be deleted; nothing else is done.

    NOTE(review): presumably registered as the deleter of the `matrix` property - the decorator is not
    visible here, confirm against the class definition.
    """
    message = "ConvoKitMatrix's internal matrix cannot be deleted. Use Corpus.delete_vector_matrix() instead."
    warn(message)