def load_hardware_labels(
    session, candidate_class, filename, attrib, annotator_name="gold"
):
    """Create gold labels for every candidate of ``candidate_class``.

    A candidate is labelled ``1`` when the upper-cased triple
    ``(document name, part span, value span with whitespace removed)`` is
    contained in the gold data returned by ``get_gold_dict``, and ``-1``
    otherwise. Candidates that already have a label under the annotator key
    are left untouched.

    :param session: The database session to use.
    :param candidate_class: The candidate class to load labels for.
    :param filename: Path to the gold label file, forwarded to
        ``get_gold_dict``.
    :param attrib: Attribute to load labels for, forwarded to
        ``get_gold_dict`` as ``attribute``.
    :param annotator_name: Name of the ``GoldLabelKey`` the labels are
        stored under; the key is created if it does not exist yet.
    """
    ak = (
        session.query(GoldLabelKey)
        .filter(GoldLabelKey.name == annotator_name)
        .first()
    )
    if ak is None:
        ak = GoldLabelKey(name=annotator_name)
        session.add(ak)
        session.commit()

    candidates = session.query(candidate_class).all()
    gold_dict = get_gold_dict(filename, attribute=attrib)
    cand_total = len(candidates)
    logger.info("Loading {} candidate labels".format(cand_total))

    labels = []
    for c in tqdm(candidates):
        doc = c[0].span.sentence.document.name.upper()
        part = c[0].span.get_span().upper()
        # Whitespace inside the value span is dropped before matching.
        val = "".join(c[1].span.get_span().split()).upper()

        existing = (
            session.query(GoldLabel)
            .filter(GoldLabel.key == ak)
            .filter(GoldLabel.candidate == c)
            .first()
        )
        if existing is not None:
            # Never overwrite a label that is already stored for this key.
            continue

        value = 1 if (doc, part, val) in gold_dict else -1
        label = GoldLabel(candidate=c, key=ak, value=value)
        session.add(label)
        labels.append(label)

    # A single commit persists all newly added labels.
    session.commit()
    logger.info("AnnotatorLabels created: %s" % (len(labels),))
def load_hardware_labels(
    session, candidate_classes, filename, attrib, annotator_name="gold"
):
    """Bulk insert hardware GoldLabels.

    A candidate is labelled ``TRUE`` when the upper-cased triple
    ``(document name, part span, value span with whitespace removed)`` is
    contained in the gold data returned by ``get_gold_dict``, and ``FALSE``
    otherwise. Candidates that already have a GoldLabel row are skipped.

    :param session: The database session to use.
    :param candidate_classes: Which candidate_classes to load labels for.
        A single class is accepted as well as a list/tuple of classes.
    :param filename: Path to the CSV file containing gold labels.
    :param attrib: Which attributes to load labels for (e.g. "stg_temp_max").
    :param annotator_name: Name of the ``GoldLabelKey`` the labels are
        stored under; the key is created if it does not exist yet.
    """
    # Check that candidate_classes is iterable
    candidate_classes = (
        candidate_classes
        if isinstance(candidate_classes, (list, tuple))
        else [candidate_classes]
    )

    # Add the gold key
    ak = (
        session.query(GoldLabelKey)
        .filter(GoldLabelKey.name == annotator_name)
        .first()
    )
    if ak is None:
        ak = GoldLabelKey(
            name=annotator_name,
            candidate_classes=[_.__tablename__ for _ in candidate_classes],
        )
        session.add(ak)
        session.commit()

    candidates = []
    for candidate_class in candidate_classes:
        candidates.extend(session.query(candidate_class).all())

    gold_dict = get_gold_dict(filename, attribute=attrib)
    cand_total = len(candidates)
    logger.info(f"Loading {cand_total} candidate labels")

    # Fetch the ids of already-labelled candidates once instead of issuing
    # one SELECT per candidate inside the loop.
    labelled_ids = {row[0] for row in session.query(GoldLabel.candidate_id).all()}

    # Only insert the labels which were not already present
    mappings = []
    for c in tqdm(candidates):
        if c.id in labelled_ids:
            continue

        doc = c[0].context.sentence.document.name.upper()
        part = c[0].context.get_span().upper()
        # Whitespace inside the value span is dropped before matching.
        val = "".join(c[1].context.get_span().split()).upper()

        label_value = TRUE if (doc, part, val) in gold_dict else FALSE
        mappings.append(
            {
                "candidate_id": c.id,
                "keys": [annotator_name],
                "values": [label_value],
            }
        )

    session.bulk_insert_mappings(GoldLabel, mappings)
    session.commit()
    logger.info(f"GoldLabels created: {len(mappings)}")
def load_president_gold_labels(
    session, candidate_classes, filename, annotator_name="gold"
):
    """Replace all gold labels with freshly loaded president GoldLabels.

    WARNING: every existing ``GoldLabel`` and ``GoldLabelKey`` row is deleted
    before the new labels are inserted.

    A candidate is labelled ``TRUE`` when the upper-cased triple
    ``(document name, first span, second span)`` is contained in the gold
    data returned by ``get_gold_dict``, and ``FALSE`` otherwise.

    :param session: The database session to use.
    :param candidate_classes: Which candidate_classes to load labels for.
        A single class is accepted as well as a list/tuple of classes.
    :param filename: Path to the CSV file containing gold labels.
    :param annotator_name: Name of the ``GoldLabelKey`` the labels are
        stored under.
    """
    # Check that candidate_classes is iterable
    candidate_classes = (
        candidate_classes
        if isinstance(candidate_classes, (list, tuple))
        else [candidate_classes]
    )

    print("Clearing ALL Gold labels")
    session.query(GoldLabel).delete()
    session.query(GoldLabelKey).delete()

    # The key table was emptied just above, so the gold key always has to be
    # (re-)created; looking it up first could never find anything.
    session.add(
        GoldLabelKey(
            name=annotator_name,
            candidate_classes=[_.__tablename__ for _ in candidate_classes],
        )
    )
    session.commit()

    candidates = []
    for candidate_class in candidate_classes:
        candidates.extend(session.query(candidate_class).all())

    gold_dict = get_gold_dict(filename)
    cand_total = len(candidates)
    print(f"Loading {cand_total} candidate labels")

    docs_in_gold_dict = {x[0] for x in gold_dict}
    print(f"{len(docs_in_gold_dict)} different docs in gold dict")

    # All previous labels were deleted, so every candidate gets a new row.
    mappings = []
    for c in tqdm(candidates):
        doc = c[0].context.sentence.document.name.upper()
        president_name = c[0].context.get_span().upper()
        birthplace = c[1].context.get_span().upper()

        cand_tuple = (doc, president_name, birthplace)
        label_value = TRUE if cand_tuple in gold_dict else FALSE
        mappings.append(
            {
                "candidate_id": c.id,
                "keys": [annotator_name],
                "values": [label_value],
            }
        )

    session.bulk_insert_mappings(GoldLabel, mappings)
    session.commit()
    print(f"GoldLabels created: {len(mappings)}")
def load_section_heading_gold_labels(featurizer_output, annotator_name="gold"):
    """Bulk insert GoldLabels for section-heading candidates.

    A candidate is labelled ``TRUE`` when the pair
    ``(document name, span text)`` of its first mention is contained in the
    gold data returned by ``get_gold_dict``, and ``FALSE`` otherwise.
    Candidates that already have a GoldLabel row are skipped. The gold file
    path is read from ``config.gold_file_path``.

    :param featurizer_output: Mapping providing ``"session"`` (the database
        session to use) and ``"candidate_variable"`` (the candidate class, or
        a list/tuple of candidate classes, to load labels for).
    :param annotator_name: Name of the ``GoldLabelKey`` the labels are
        stored under; the key is created if it does not exist yet.
    """
    session = featurizer_output['session']
    candidate_classes = featurizer_output['candidate_variable']
    filename = config.gold_file_path

    # Check that candidate_classes is iterable
    candidate_classes = (
        candidate_classes
        if isinstance(candidate_classes, (list, tuple))
        else [candidate_classes]
    )

    # Add the gold key
    ak = (
        session.query(GoldLabelKey)
        .filter(GoldLabelKey.name == annotator_name)
        .first()
    )
    if ak is None:
        ak = GoldLabelKey(
            name=annotator_name,
            candidate_classes=[_.__tablename__ for _ in candidate_classes],
        )
        session.add(ak)
        session.commit()

    candidates = []
    for candidate_class in candidate_classes:
        candidates.extend(session.query(candidate_class).all())

    gold_dict = get_gold_dict(filename)
    cand_total = len(candidates)
    print(f"Loading {cand_total} candidate labels")

    # Fetch the ids of already-labelled candidates once instead of issuing
    # one SELECT per candidate inside the loop.
    labelled_ids = {row[0] for row in session.query(GoldLabel.candidate_id).all()}

    # Only insert the labels which were not already present
    mappings = []
    for c in tqdm(candidates):
        if c.id in labelled_ids:
            continue

        doc = c[0].context.sentence.document.name
        span_text = c[0].context.get_span()

        label_value = TRUE if (doc, span_text) in gold_dict else FALSE
        mappings.append(
            {
                "candidate_id": c.id,
                "keys": [annotator_name],
                "values": [label_value],
            }
        )

    session.bulk_insert_mappings(GoldLabel, mappings)
    session.commit()
    print(f"GoldLabels created: {len(mappings)}")