def reconstruct_stem_fore(model, data_loader, args):
    '''
    The foreign-language version of the stem reconstruction training.
    :param model: the (frozen) encoder whose representations are probed
    :param data_loader: loader holding the processed training sentences
    :param args: experiment configuration
    :return:
    '''
    # Build the stem vocabulary over the training sentences.
    stem_dict = {}
    num2stem = {}
    stemmer = NltkPorterStemmer()
    train_stem = []
    for sent in data_loader.processed_sent:
        sent_stem = []
        for word in sent:
            word_stem = stemmer.stem(word)
            if word_stem not in stem_dict:
                stem_dict[word_stem] = len(stem_dict)
                num2stem[stem_dict[word_stem]] = word_stem
            sent_stem.append(stem_dict[word_stem])
        train_stem.append(torch.LongTensor(sent_stem))
    stem_dict['UNK'] = len(stem_dict)
    print(len(stem_dict))

    # Training steps: freeze the encoder and train only the reconstruction head.
    for param in model.parameters():
        param.requires_grad = False

    embed_loader = Foreign_Elmo(args.elmo_model_path, args.embedding_source)
    elmo_embeds_train, elmo_type_train = embed_loader._get_embeddings(
        data_loader.processed_sent)
    args.weight_decay = args.weight_decay2
    predictor = baselines.Recon_Lemma(stem_dict, model, args)
    for e in range(args.epoch):
        out1 = predictor.train(data_loader.processed_sent, train_stem,
                               elmo_embeds_train,
                               sent_per_epoch=args.sent_per_epoch)
        print(out1)

    # Load the dev data and evaluate.
    processed_tag_dev, processed_sent_dev, processed_tree_dev, processed_tree_lab_dev, \
        processed_tree_Long_dev, processed_tree_lab_Long_dev, processed_tag_Long_dev = \
        data_loader.load_dev_verbo(args.dataset_dev)
    token_embeds_dev, elmo_type_dev = embed_loader._get_embeddings(processed_sent_dev)
    stem_dev = get_stem(processed_sent_dev, stemmer, stem_dict)
    out1, result_dict = predictor.eval_dev(processed_sent_dev, stem_dev, token_embeds_dev)
    print('dev-final-summary: {}'.format(out1))

    # Load the test data and evaluate, if requested.
    if args.test == 'yes':
        processed_tag_test, processed_sent_test, processed_tree_test, processed_tree_lab_test, \
            processed_tree_Long_test, processed_tree_lab_Long_test, processed_tag_Long_test = \
            data_loader.load_dev_verbo(args.dataset_test)
        token_embeds_test, elmo_type_test = embed_loader._get_embeddings(processed_sent_test)
        stem_test = get_stem(processed_sent_test, stemmer, stem_dict)
        out1, result_dict = predictor.eval_dev(processed_sent_test, stem_test, token_embeds_test)
        print('test-final-summary: {}'.format(out1))
def reconstruct_stem(model, data_loader, args):
    '''
    Stem reconstruction training; embeddings are loaded via Embedding_Weight
    and load_elmo (see reconstruct_stem_fore for the foreign-language variant).
    '''
    # Build the stem vocabulary over the training sentences.
    stem_dict = {}
    num2stem = {}
    stemmer = NltkPorterStemmer()
    train_stem = []
    for sent in data_loader.processed_sent:
        sent_stem = []
        for word in sent:
            word_stem = stemmer.stem(word)
            if word_stem not in stem_dict:
                stem_dict[word_stem] = len(stem_dict)
                num2stem[stem_dict[word_stem]] = word_stem
            sent_stem.append(stem_dict[word_stem])
        train_stem.append(torch.LongTensor(sent_stem))
    stem_dict['UNK'] = len(stem_dict)
    print(len(stem_dict))

    # Training: freeze the encoder and train only the reconstruction head.
    for param in model.parameters():
        param.requires_grad = False

    embed_loader = Embedding_Weight(args.embedding_source, data_loader=data_loader,
                                    num_sent=args.epoch_sent)
    args.weight_decay = args.weight_decay2
    elmo_embeds_train, elmo_type_train = load_elmo(data_loader, args, embed_loader,
                                                   mod='train',
                                                   processed_sent_dev=None,
                                                   processed_sent_test=None)
    predictor = baselines.Recon_Lemma(stem_dict, model, args)
    for e in range(args.epoch):
        out1 = predictor.train(data_loader.processed_sent, train_stem,
                               elmo_embeds_train,
                               sent_per_epoch=args.sent_per_epoch)
        print(out1)

    # Load the dev data and evaluate.
    processed_tag_dev, processed_sent_dev, processed_tree_dev, processed_tree_lab_dev, \
        processed_tree_Long_dev, processed_tree_lab_Long_dev, processed_tag_Long_dev = \
        data_loader.load_dev_verbo(args.dataset_dev)
    token_embeds_dev, elmo_type_dev = load_elmo(data_loader, args, embed_loader,
                                                mod='dev',
                                                processed_sent_dev=processed_sent_dev,
                                                processed_sent_test=None)
    stem_dev = get_stem(processed_sent_dev, stemmer, stem_dict)
    out1, result_dict = predictor.eval_dev(processed_sent_dev, stem_dev, token_embeds_dev)
    print('dev-final-summary: {}'.format(out1))

    # Load the test data and evaluate, if requested.
    if args.test == 'yes':
        processed_tag_test, processed_sent_test, processed_tree_test, processed_tree_lab_test, \
            processed_tree_Long_test, processed_tree_lab_Long_test, processed_tag_Long_test = \
            data_loader.load_dev_verbo(args.dataset_test)
        token_embeds_test, elmo_type_test = load_elmo(data_loader, args, embed_loader,
                                                      mod='test',
                                                      processed_sent_dev=None,
                                                      processed_sent_test=processed_sent_test)
        stem_test = get_stem(processed_sent_test, stemmer, stem_dict)
        out1, result_dict = predictor.eval_dev(processed_sent_test, stem_test, token_embeds_test)
        print('test-final-summary: {}'.format(out1))
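# `get_stem` is called by both training functions above but is not defined in
# this snippet. The following is a minimal sketch of such a helper, assuming it
# maps dev/test sentences to LongTensors of stem indices and falls back to the
# 'UNK' entry for stems unseen at training time; the actual implementation in
# the codebase may differ.
def get_stem(processed_sent, stemmer, stem_dict):
    all_stem = []
    for sent in processed_sent:
        sent_stem = []
        for word in sent:
            word_stem = stemmer.stem(word)
            # Map stems unseen during training to the UNK index.
            sent_stem.append(stem_dict.get(word_stem, stem_dict['UNK']))
        all_stem.append(torch.LongTensor(sent_stem))
    return all_stem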
def __init__(self):
    self.stemmer = NltkPorterStemmer()
# if the answer phrase is included in the retrieved sentence irrespective of context
"""
import argparse
import json
import csv
from collections import defaultdict
import operator
import math

import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer as NltkPorterStemmer

np.random.seed(seed=20)

stemmer = NltkPorterStemmer()  # pylint: disable=invalid-name
# stemmer for lemmatizing the words


def pre_process(row):
    """
    Gets a row, makes a complete sentence string, lower-cases it, tokenizes,
    removes stop words and does stemming.

    Returns
    -------
    The clean list of stemmed tokens.
    """
    sentence = ' '.join(word.lower() for word in row)
    sentence_tokenized = [w for w in word_tokenize(sentence)]
    clean_row = [
        w for w in sentence_tokenized if w not in stopwords.words('english')
    ]
    # Stem the remaining tokens, as described in the docstring.
    clean_row = [stemmer.stem(w) for w in clean_row]
    return clean_row
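# Illustrative usage of `pre_process` (not part of the original script); it
# assumes the NLTK 'punkt' and 'stopwords' corpora have been downloaded, e.g.
# via nltk.download('punkt') and nltk.download('stopwords'):
#
#     pre_process(["The", "quick", "brown", "fox", "jumps"])
#     # -> ['quick', 'brown', 'fox', 'jump'] under the Porter stemmer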
def __init__(self, args):
    self.vocab = Vocabulary()
    self.performance = Performance(args)
    self.stemmer = NltkPorterStemmer()
    self.nlp = spacy.load('en')
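    # Note: spacy.load('en') relies on the spaCy 2.x shortcut link; under
    # spaCy 3.x the model would instead be loaded by its full package name,
    # e.g. spacy.load('en_core_web_sm').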
def __init__(
    self,
    lazy: bool = False,
    sample: int = -1,
    lf_syntax: str = None,
    replace_world_entities: bool = False,
    align_world_extractions: bool = False,
    gold_world_extractions: bool = False,
    tagger_only: bool = False,
    denotation_only: bool = False,
    world_extraction_model: Optional[str] = None,
    skip_attributes_regex: Optional[str] = None,
    entity_bits_mode: Optional[str] = None,
    entity_types: Optional[List[str]] = None,
    lexical_cues: List[str] = None,
    tokenizer: Tokenizer = None,
    question_token_indexers: Dict[str, TokenIndexer] = None,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._question_token_indexers = question_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._entity_token_indexers = self._question_token_indexers
    self._sample = sample
    self._replace_world_entities = replace_world_entities
    self._lf_syntax = lf_syntax
    self._entity_bits_mode = entity_bits_mode
    self._align_world_extractions = align_world_extractions
    self._gold_world_extractions = gold_world_extractions
    self._entity_types = entity_types
    self._tagger_only = tagger_only
    self._denotation_only = denotation_only
    self._skip_attributes_regex = None
    if skip_attributes_regex is not None:
        self._skip_attributes_regex = re.compile(skip_attributes_regex)
    self._lexical_cues = lexical_cues

    # Recording of entities in categories relevant for tagging
    all_entities = {}
    all_entities["world"] = ["world1", "world2"]

    # TODO: Clarify this into an appropriate parameter
    self._collapse_tags = ["world"]

    self._all_entities = None
    if entity_types is not None:
        if self._entity_bits_mode == "collapsed":
            self._all_entities = entity_types
        else:
            self._all_entities = [e for t in entity_types for e in all_entities[t]]
    logger.info(f"all_entities = {self._all_entities}")

    # Base world, depending on LF syntax only
    self._knowledge_graph = KnowledgeGraph(
        entities={"placeholder"}, neighbors={}, entity_text={"placeholder": "placeholder"}
    )
    self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

    # Decide dynamic entities, if any
    self._dynamic_entities: Dict[str, str] = dict()
    self._use_attr_entities = False
    if "_attr_entities" in lf_syntax:
        self._use_attr_entities = True
        qr_coeff_sets = self._world.qr_coeff_sets
        for qset in qr_coeff_sets:
            for attribute in qset:
                if (
                    self._skip_attributes_regex is not None
                    and self._skip_attributes_regex.search(attribute)
                ):
                    continue
                # Get text associated with each entity, both from entity identifier and
                # associated lexical cues, if any
                entity_strings = [words_from_entity_string(attribute).lower()]
                if self._lexical_cues is not None:
                    for key in self._lexical_cues:
                        if attribute in LEXICAL_CUES[key]:
                            entity_strings += LEXICAL_CUES[key][attribute]
                self._dynamic_entities["a:" + attribute] = " ".join(entity_strings)

    # Update world to include dynamic entities
    if self._use_attr_entities:
        logger.info(f"dynamic_entities = {self._dynamic_entities}")
        neighbors: Dict[str, List[str]] = {key: [] for key in self._dynamic_entities}
        self._knowledge_graph = KnowledgeGraph(
            entities=set(self._dynamic_entities.keys()),
            neighbors=neighbors,
            entity_text=self._dynamic_entities,
        )
        self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

    self._stemmer = NltkPorterStemmer()

    self._world_tagger_extractor = None
    self._extract_worlds = False
    if world_extraction_model is not None:
        logger.info("Loading world tagger model...")
        self._extract_worlds = True
        self._world_tagger_extractor = WorldTaggerExtractor(world_extraction_model)
        logger.info("Done loading world tagger model!")
    # Convenience regex for recognizing attributes
    self._attr_regex = re.compile(r"""\((\w+) (high|low|higher|lower)""")
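    # Illustrative behaviour of the regex above (not in the original reader):
    # on a logical-form fragment such as "(friction low world1)",
    # self._attr_regex.search(...).groups() yields ('friction', 'low').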