def extract_claims(data: pd.DataFrame,
                   model_path: str = MODEL_PATH,
                   weight_path: str = WEIGHT_PATH,
                   col_name: str = "sentence"):
    """
    Extract claims from the given column of a dataset.

    :param data: input DataFrame whose ``col_name`` column holds the sentences
    :param model_path: location of the model; a local path or a downloadable link
    :param weight_path: location of the model weights; a local path or a downloadable link
    :param col_name: name of the column on which claims are to be identified;
        must not be "sentences", since that name is used internally
    :return: the input DataFrame merged with a "claims" column and a 0/1
        "claim_flag" column, or None if ``col_name`` is missing
    """
    model = load_claim_extraction_model(model_path, weight_path)
    reader = CrfPubmedRCTReader()
    claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)
    df = data
    if col_name not in df.columns:
        return None
    df_sentence = df.copy()
    # NOTE(alpha_darklord): The predictor returns one label per sentence, whether
    # it is a claim or not (0 or 1); best_paths is used to get this label. We then
    # extract the sentences labeled 1 into a list held in the "claims" column.
    df_sentence["sentences"] = df_sentence[col_name]
    df_sentence["sentences"] = df_sentence.sentences.apply(sent_tokenize)
    df_sentence['pred'] = df_sentence.sentences.apply(
        lambda x: claim_predictor.predict_json({'sentences': x}))
    df_sentence['best_paths'] = df_sentence.pred.apply(
        lambda x: model.crf.viterbi_tags(
            torch.FloatTensor(x['logits']).unsqueeze(0),
            torch.LongTensor(x['mask']).unsqueeze(0)))
    df_sentence['p_claims'] = df_sentence['best_paths'].apply(
        lambda x: 100 * np.array(x[0][0]))
    df_sentence['claims'] = df_sentence.apply(
        lambda x: np.extract(x['p_claims'], x['sentences']), axis=1)
    # Keep only the rows that contain at least one predicted claim.
    df_claims = df_sentence[df_sentence.claims.str.len() > 0]
    del df_sentence
    # NOTE(alpha_darklord): explode() turns the list inside the "claims" column
    # into one row per individual claim.
    df_updated = df_claims[[col_name, "claims"]].explode("claims")
    df_updated["claim_flag"] = 1
    df_merged = df.merge(df_updated, on=[col_name], how="left")
    df_merged["claim_flag"] = df_merged["claim_flag"].fillna(0)
    return df_merged
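# Illustrative usage (a sketch, not part of the original module): the DataFrame
# below is invented for demonstration, and MODEL_PATH / WEIGHT_PATH are assumed
# to point at the pretrained discourse model and claim-extraction weights.
#
#   import pandas as pd
#   df = pd.DataFrame({"sentence": [
#       "Treatment with drug X significantly reduced mortality.",
#       "Patients were recruited from three clinical centers.",
#   ]})
#   labeled = extract_claims(df)
#   print(labeled[["sentence", "claims", "claim_flag"]])
#
# After the left merge, rows whose text contains at least one predicted claim
# carry claim_flag == 1 (one row per extracted claim); all other rows are 0.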
discourse_predictor = Predictor.from_archive(archive_, 'discourse_crf_predictor')
model = discourse_predictor._model
# Freeze the pretrained discourse model and replace its CRF head with a
# binary (claim / non-claim) tagger before loading the fine-tuned weights.
for param in list(model.parameters()):
    param.requires_grad = False
num_classes, constraints, include_start_end_transitions = 2, None, False
model.crf = ConditionalRandomField(
    num_classes, constraints,
    include_start_end_transitions=include_start_end_transitions)
model.label_projection_layer = TimeDistributed(
    Linear(2 * EMBEDDING_DIM, num_classes))
model.load_state_dict(
    torch.load(cached_path(WEIGHT_PATH), map_location='cpu'))
reader = CrfPubmedRCTReader()
claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)


def parse_pubmed_xml(pmid):
    """
    Parse article information for the given PMID.
    """
    url = PUBMED_URL % pmid
    page = urllib.request.urlopen(url).read()
    tree = html.fromstring(page)
    abstract = ''
    for e in tree.xpath('//abstract/abstracttext'):
        if e is not None:
            abstract += stringify_children(e).strip()
    title = ' '.join(
            self.vocab.get_token_from_index(label, namespace='labels')
            for label in instance_labels
        ] for instance_labels in output_dict["labels"]]
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {
            metric_name: metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }


if __name__ == '__main__':
    claim_reader = ClaimAnnotationReaderJSON()
    discourse_reader = CrfPubmedRCTReader()
    claim_train_dataset = claim_reader.read(cached_path(TRAIN_PATH))
    claim_validation_dataset = claim_reader.read(cached_path(VALIDATION_PATH))
    discourse_train_dataset = discourse_reader.read(
        cached_path(DISCOURSE_TRAIN_PATH))
    discourse_validation_dataset = discourse_reader.read(
        cached_path(DISCOURSE_VALIDATION_PATH))
    # Build a single vocabulary over both the claim and discourse datasets so
    # the two tasks share token indices.
    vocab = Vocabulary.from_instances(claim_train_dataset +
                                      claim_validation_dataset +
                                      discourse_train_dataset +
                                      discourse_validation_dataset)
    discourse_dict = {
        'RESULTS': 0,
        'METHODS': 1,
        'CONCLUSIONS': 2,
        'BACKGROUND': 3,
for param in list(model.parameters()):
    param.requires_grad = False
num_classes, constraints, include_start_end_transitions = 2, None, False
model.classifier_feedforward._linear_layers = ModuleList([
    torch.nn.Linear(2 * EMBEDDING_DIM, EMBEDDING_DIM),
    torch.nn.Linear(EMBEDDING_DIM, num_classes)
])
model.crf = ConditionalRandomField(
    num_classes, constraints,
    include_start_end_transitions=include_start_end_transitions)
model.label_projection_layer = TimeDistributed(
    Linear(2 * EMBEDDING_DIM, num_classes))
model.load_state_dict(torch.load(cached_path(WEIGHT_PATH)))
reader = CrfPubmedRCTReader()
claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)

fixture_path = os.path.join('..', 'pubmed-rct', 'PubMed_200k_RCT',
                            'fixtures_crf.json')
examples = read_json(fixture_path)
pred_list = []
for example in examples:
    # Tokenize the abstract into sentences and run CRF Viterbi decoding to
    # obtain one 0/1 claim label per sentence.
    sentences = sent_tokenize(example['abstract'])
    instance = reader.text_to_instance(sents=sentences)
    pred = claim_predictor.predict_instance(instance)
    logits = torch.FloatTensor(pred['logits'])
    best_paths = model.crf.viterbi_tags(
        logits.unsqueeze(0),
        torch.LongTensor(pred['mask']).unsqueeze(0))
    pred_list.append(best_paths[0][0])
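# Illustrative follow-up (a sketch, not part of the original script): map each
# predicted tag sequence back to its sentences to recover the claim strings.
# viterbi_tags returns [(tag_sequence, score)], so pred_list[i] is a list of
# 0/1 labels aligned with the sentences of example i.
#
#   for example, labels in zip(examples, pred_list):
#       sentences = sent_tokenize(example['abstract'])
#       claims = [s for s, tag in zip(sentences, labels) if tag == 1]
#       print(claims)
#
# Sentences tagged 1 are the predicted claims; everything else is treated as
# non-claim text.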