def train(self, train_ranked_lists_by_ranker: Dict[str, List[List[int]]]):
    """Fit the Snorkel generative model on the rankers' label matrix.

    Args:
        train_ranked_lists_by_ranker: mapping from ranker name to its
            ranked lists; converted into a label matrix by
            get_L_from_rankings.
    """
    # Label matrix: one row per candidate, one column per ranker/source.
    L_train = get_L_from_rankings(train_ranked_lists_by_ranker)
    # threshold=0.0 keeps every dependency the selector proposes.
    selector = DependencySelector()
    dependencies = selector.select(L_train, threshold=0.0)
    # Step size is scaled by the number of rows, per Snorkel convention.
    self.snorkel_gm.train(
        L_train,
        dependencies,
        epochs=100,
        decay=0.95,
        step_size=0.1 / L_train.shape[0],
        reg_param=1e-6,
    )
    self.is_trained = True
# Load gold labels for the dev (split=1) and test (split=2) splits.
missed = load_external_labels(session, VirusHost, annotator_name='gold', split=1)
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
missed = load_external_labels(session, VirusHost, annotator_name='gold', split=2)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

# Generative model: pick inter-LF dependencies, then fit on the label matrix.
dep_selector = DependencySelector()
selected_deps = dep_selector.select(L_train, threshold=0.1)

gen_model = GenerativeModel()
gen_model.train(
    L_train,
    epochs=100,
    decay=0.95,
    step_size=0.1 / L_train.shape[0],
    reg_param=1.00e-03,
    deps=selected_deps,
)
train_marginals = gen_model.marginals(L_train)

# Discriminative model: featurize the training split (split=0).
featurizer = FeatureAnnotator(f=hybrid_span_mention_ftrs)
F_train = featurizer.load_matrix(session, split=0)
L_train_BC L_train_BD L_train_BM L_train_BT # Labeling Function Performance - Coverage, Overlaps, Conflicts L_train_BC.lf_stats(session) L_train_BD.lf_stats(session) L_train_BM.lf_stats(session) L_train_BT.lf_stats(session) # Analyzing Dependencies Ldeps = [] for L in [L_train_BC, L_train_BD, L_train_BD, L_train_BD]: ds = DependencySelector() deps = ds.select(L, threshold=0.1) len(deps) Ldeps.append(deps) gen_model = GenerativeModel(lf_propensity=True) gen_model.train(L_train, deps=deps, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=0.0) train_marginals = gen_model.marginals(L_train) plt.hist(train_marginals, bins=20) plt.show() gen_model.learned_lf_stats() save_marginals(session, L_train, train_marginals) load_external_labels(session,
L_train[:, columns], epochs=10, decay=0.95, step_size=0.1 / L_train[:, columns].shape[0], reg_param=1e-6, threads=50, ) indep_models.append(indep_gen_model) # In[ ]: dep_models = [] for columns in lfs_columns: # select the dependancies from the label matrix ds = DependencySelector() deps = ds.select(L_train[:, columns], threshold=0.1) print(len(deps)) # Model each label function and the underlying correlation structure gen_model = GenerativeModel(lf_propensity=True) gen_model.train(L_train[:, columns], epochs=10, decay=0.95, step_size=0.1 / L_train[:, columns].shape[0], reg_param=1e-6, threads=50, deps=deps) dep_models.append(gen_model) # # Generative Model Statistics
class SnorkelCollator(Collator):
    """Collate multiple weak annotation sources into a single labeled
    dataset using Snorkel's generative label model.

    Each annotation source votes per token; votes are stacked into a label
    matrix, a generative model is fit over it, and the resulting marginals
    are converted back into per-token tags.
    """

    def __init__(
        self,
        positive_label: str,
        class_cardinality: int = 2,
        num_epochs: int = 500,
        log_train_every: int = 50,
        seed: int = 123,
        threshold: float = 0.5,
    ):
        self.positive_label = positive_label
        self.class_cardinality = class_cardinality
        self.num_epochs = num_epochs
        self.log_train_every = log_train_every
        self.seed = seed
        # Dependency selector and generative model are created once and
        # reused across train/inference calls.
        self.ds = DependencySelector()
        self.gen_model = GenerativeModel(lf_propensity=True)
        # Decision threshold on marginals for the binary case.
        self.threshold = threshold

    @classmethod
    def get_snorkel_index(cls, tag: str) -> int:
        """Map a tag to Snorkel's label convention:
        1 = positive, 0 = negative, -1 = abstain."""
        if is_positive(tag):
            return 1
        elif is_negative(tag):
            return 0
        else:
            return -1

    def get_tag(self, index: int) -> str:
        """Inverse of get_snorkel_index for binary output: 1 maps to the
        positive label, everything else to the negative label."""
        if index == 1:
            return self.positive_label
        else:
            return NEGATIVE_LABEL

    def get_index(self, prob: np.ndarray) -> int:
        """Return the argmax class index of a binary probability vector.

        FIX: the original annotated the return type as ``str``, but
        ``argmax`` returns an integer index.
        """
        assert prob.shape == (2, )
        return prob.argmax()

    def collate_np(self, annotations) -> Tuple[np.ndarray, List[str], Dict[int, Tuple[int, int]]]:
        """Stack per-source annotations into one (num_tokens x num_funcs)
        label matrix.

        Returns:
            The stacked label matrix, the flattened token list, and a map
            from entry id to its [start, end) span in the token list.
            (FIX: the original annotated the third element as ``List[int]``
            although a dict is returned.)
        """
        output_arrs: List[np.ndarray] = []
        words_list: List[str] = []
        id_to_labels: Dict[int, Tuple[int, int]] = {}
        num_funcs = len(annotations)
        # FIX: the original outer loop index `i` was shadowed by the inner
        # `i` below and never used; drop it.
        for ann_inst in tqdm(zip(*annotations)):
            ids = [inst['id'] for inst in ann_inst]
            inputs = [inst['input'] for inst in ann_inst]
            outputs = [inst['output'] for inst in ann_inst]
            input_len = len(inputs[0])
            # All sources annotate the same entry; take id/tokens from the first.
            entry_id = ids[0]
            # output arr = (sentence x num_labels)
            output_arr = np.zeros((input_len, num_funcs))
            for i, output in enumerate(outputs):
                for j, out_j in enumerate(output):
                    output_arr[j, i] = SnorkelCollator.get_snorkel_index(out_j)
            label_start = len(words_list)
            words_list.extend(inputs[0])
            output_arrs.append(output_arr)
            label_end = len(words_list)
            id_to_labels[entry_id] = (label_start, label_end)
        output_res = np.concatenate(output_arrs, axis=0)
        return output_res, words_list, id_to_labels

    def train_label_model(
        self,
        collated_labels: np.ndarray,
        descriptions: Optional[List[str]],
        train_data_np: Optional[np.ndarray],
    ):
        """Fit the generative label model on the collated label matrix.

        NOTE(review): ``train_data_np`` is accepted but currently unused —
        confirm whether it was meant to feed class-balance estimation.
        """
        sparse_labels = sparse.csr_matrix(collated_labels.astype(int))
        if descriptions is not None:
            descriptions = [(i, desc) for i, desc in enumerate(descriptions)]
            # FIX: logger.warn is a deprecated alias of logger.warning.
            logger.warning(f'labeling function order: {descriptions}')
        deps = self.ds.select(sparse_labels, threshold=0.05)
        self.gen_model.train(
            sparse_labels,
            deps=deps,
            decay=0.95,
            step_size=0.1 / sparse_labels.shape[0],
            reg_param=0.0,
            cardinality=self.class_cardinality,
        )

    def get_probabilistic_labels(self, collated_labels: np.ndarray) -> np.ndarray:
        """Return the trained model's marginals for the given label matrix."""
        sparse_labels = sparse.csr_matrix(collated_labels)
        return self.gen_model.marginals(sparse_labels)

    def convert_to_tags(
        self,
        train_probs: np.ndarray,
        word_list: List[str],
        id_to_labels: Dict[int, Tuple[int, int]],
    ) -> List[AnnotatedDataType]:
        """Convert marginal probabilities back into per-entry tag sequences."""
        output = []
        for entry_id, (label_start, label_end) in id_to_labels.items():
            words = word_list[label_start:label_end]
            prob_labels = train_probs[label_start:label_end]
            if self.class_cardinality == 2:
                # (m, ) marginals in prob labels: threshold the positive prob.
                label_ids = (prob_labels > self.threshold).astype(int)
            else:
                # (m, k) marginals in prob labels: take the argmax class.
                label_ids = prob_labels.argmax(axis=1)
            labels = [self.get_tag(i) for i in label_ids]
            output.append({
                'id': entry_id,
                'input': words,
                'output': labels,
            })
        return output

    def collate(
        self,
        annotations: List[AnnotatedDataType],
        should_verify: bool = False,
        descriptions: Optional[List[str]] = None,
        train_data: Optional[AnnotatedDataType] = None
    ) -> AnnotatedDataType:
        '''
        args:
            ``annotations``: List[AnnotatedDataType]
                given a series of annotations, collate them into a single
                series of annotations per instance
        '''
        if should_verify:
            # make sure the annotations are in the
            # proper format
            Collator.verify_annotations(annotations)
        train_data_np = None
        if train_data:
            # if train data specified, will be used by Snorkel to estimate
            # class balance
            train_data_np, word_lists, id_to_labels = self.collate_np(
                [train_data])
            train_data_np = train_data_np.astype(int)
            train_data_np = train_data_np.reshape(-1)
        collate_np, word_lists, id_to_labels = self.collate_np(annotations)
        self.train_label_model(collated_labels=collate_np,
                               descriptions=descriptions,
                               train_data_np=train_data_np)
        y_train_probs = self.get_probabilistic_labels(
            collated_labels=collate_np,
        )
        tags = self.convert_to_tags(y_train_probs,
                                    word_list=word_lists,
                                    id_to_labels=id_to_labels)
        return tags