Example #1
def reconstruct_stem_fore(model, data_loader, args):
    '''
    The foreign-language version of the stem reconstruction training,
    which uses Foreign_Elmo embeddings instead of the default pipeline.
    :param model: pretrained encoder; its parameters are frozen below
    :param data_loader: supplies processed_sent and load_dev_verbo()
    :param args: hyperparameters (paths, epoch, weight_decay2, test flag)
    :return: None; prints per-epoch training output and dev/test summaries
    '''
    stem_dict = {}  # stem string -> integer id
    num2stem = {}   # integer id -> stem string (inverse map)
    stemmer = NltkPorterStemmer()
    train_stem = []
    for sent in data_loader.processed_sent:
        sent_stem = []
        for word in sent:
            word_stem = stemmer.stem(word)
            if word_stem not in stem_dict:
                stem_dict[word_stem] = len(stem_dict)
                num2stem[stem_dict[word_stem]] = word_stem
            sent_stem.append(stem_dict[word_stem])
        train_stem.append(torch.LongTensor(sent_stem))
    stem_dict['UNK'] = len(stem_dict)  # reserved id for unseen stems
    print(len(stem_dict))  # stem-vocabulary size, including UNK
    ''' training steps '''
    # freeze the pretrained model so only the probe is trained
    for param in model.parameters():
        param.requires_grad = False

    embed_loader = Foreign_Elmo(args.elmo_model_path, args.embedding_source)
    elmo_embeds_train, elmo_type_train = embed_loader._get_embeddings(
        data_loader.processed_sent)

    args.weight_decay = args.weight_decay2  # use the second weight-decay setting for this stage

    predictor = baselines.Recon_Lemma(stem_dict, model, args)
    for e in range(args.epoch):
        out1 = predictor.train(data_loader.processed_sent,
                               train_stem,
                               elmo_embeds_train,
                               sent_per_epoch=args.sent_per_epoch)
        print(out1)
    #
    ''' load the dev data '''
    processed_tag_dev, processed_sent_dev, processed_tree_dev, processed_tree_lab_dev, \
    processed_tree_Long_dev, processed_tree_lab_Long_dev, processed_tag_Long_dev = data_loader.load_dev_verbo(
        args.dataset_dev)

    token_embeds_dev, elmo_type_dev = embed_loader._get_embeddings(
        processed_sent_dev)

    stem_dev = get_stem(processed_sent_dev, stemmer, stem_dict)
    out1, result_dict = predictor.eval_dev(processed_sent_dev, stem_dev,
                                           token_embeds_dev)
    print('dev-final-summary: {}'.format(out1))
    ''' load the test data '''
    if args.test == 'yes':
        processed_tag_test, processed_sent_test, processed_tree_test, processed_tree_lab_test, \
        processed_tree_Long_test, processed_tree_lab_Long_test, processed_tag_Long_test = data_loader.load_dev_verbo(
            args.dataset_test)

        token_embeds_test, elmo_type_test = embed_loader._get_embeddings(
            processed_sent_test)
        stem_test = get_stem(processed_sent_test, stemmer, stem_dict)

        out1, result_dict = predictor.eval_dev(processed_sent_test, stem_test,
                                               token_embeds_test)
        print('test-final-summary: {}'.format(out1))
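
The loop at the top of this function builds the stem vocabulary that the probe predicts into. A minimal, self-contained sketch of that mapping, assuming only NLTK's PorterStemmer (the toy corpus is invented for illustration):

from nltk.stem import PorterStemmer as NltkPorterStemmer

stemmer = NltkPorterStemmer()
stem_dict, num2stem = {}, {}
for sent in [["running", "dogs"], ["the", "dog", "ran"]]:  # toy corpus
    for word in sent:
        word_stem = stemmer.stem(word)
        if word_stem not in stem_dict:  # assign ids in first-seen order
            stem_dict[word_stem] = len(stem_dict)
            num2stem[stem_dict[word_stem]] = word_stem
stem_dict['UNK'] = len(stem_dict)  # reserved id for unseen stems
print(stem_dict)  # {'run': 0, 'dog': 1, 'the': 2, 'ran': 3, 'UNK': 4}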
Example #2
def reconstruct_stem(model, data_loader, args):
    '''
    Stem reconstruction training (default version; cf. the foreign
    variant in Example #1, which uses Foreign_Elmo embeddings).
    '''
    stem_dict = {}
    num2stem = {}
    stemmer = NltkPorterStemmer()
    train_stem = []
    for sent in data_loader.processed_sent:
        sent_stem = []
        for word in sent:
            word_stem = stemmer.stem(word)
            if word_stem not in stem_dict:
                stem_dict[word_stem] = len(stem_dict)
                num2stem[stem_dict[word_stem]] = word_stem
            sent_stem.append(stem_dict[word_stem])
        train_stem.append(torch.LongTensor(sent_stem))
    stem_dict['UNK'] = len(stem_dict)
    print(len(stem_dict))
    ''' training '''
    # freeze the pretrained model so only the probe is trained
    for param in model.parameters():
        param.requires_grad = False

    embed_loader = Embedding_Weight(args.embedding_source,
                                    data_loader=data_loader,
                                    num_sent=args.epoch_sent)
    args.weight_decay = args.weight_decay2
    elmo_embeds_train, elmo_type_train = load_elmo(data_loader,
                                                   args,
                                                   embed_loader,
                                                   mod='train',
                                                   processed_sent_dev=None,
                                                   processed_sent_test=None)

    predictor = baselines.Recon_Lemma(stem_dict, model, args)
    for e in range(args.epoch):
        out1 = predictor.train(data_loader.processed_sent,
                               train_stem,
                               elmo_embeds_train,
                               sent_per_epoch=args.sent_per_epoch)
        print(out1)
    #
    ''' load the dev data '''
    processed_tag_dev, processed_sent_dev, processed_tree_dev, processed_tree_lab_dev, \
    processed_tree_Long_dev, processed_tree_lab_Long_dev, processed_tag_Long_dev = data_loader.load_dev_verbo(
        args.dataset_dev)

    token_embeds_dev, elmo_type_dev = load_elmo(
        data_loader,
        args,
        embed_loader,
        mod='dev',
        processed_sent_dev=processed_sent_dev,
        processed_sent_test=None)

    stem_dev = get_stem(processed_sent_dev, stemmer, stem_dict)
    out1, result_dict = predictor.eval_dev(processed_sent_dev, stem_dev,
                                           token_embeds_dev)
    print('dev-final-summary: {}'.format(out1))
    ''' load the test data '''
    if args.test == 'yes':
        processed_tag_test, processed_sent_test, processed_tree_test, processed_tree_lab_test, \
        processed_tree_Long_test, processed_tree_lab_Long_test, processed_tag_Long_test = data_loader.load_dev_verbo(
            args.dataset_test)

        token_embeds_test, elmo_type_test = load_elmo(
            data_loader,
            args,
            embed_loader,
            mod='test',
            processed_sent_dev=None,
            processed_sent_test=processed_sent_test)

        stem_test = get_stem(processed_sent_test, stemmer, stem_dict)
        out1, result_dict = predictor.eval_dev(processed_sent_test, stem_test,
                                               token_embeds_test)
        print('test-final-summary: {}'.format(out1))
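
Both reconstruction functions freeze the pretrained model before training the probe. A minimal sketch of the PyTorch freezing idiom, with a throwaway linear layer standing in for the real encoder:

import torch

model = torch.nn.Linear(4, 2)  # stand-in for the pretrained encoder
for param in model.parameters():
    param.requires_grad = False  # exclude from gradient computation

# only parameters still requiring gradients would go to an optimizer
trainable = [p for p in model.parameters() if p.requires_grad]
print(len(trainable))  # 0: the model is fully frozen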
Example #3
def __init__(self):
    self.stemmer = NltkPorterStemmer()
Example #4
# if the answer phrase is included in the retrieved sentence irrespective of context
"""

import argparse
import json
import csv
from collections import defaultdict
import operator
import math
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer as NltkPorterStemmer

np.random.seed(seed=20)
stemmer = NltkPorterStemmer()  # pylint: disable=invalid-name


# stemmer for reducing words to their stems
def pre_process(row):
    """
    gets a row, makes a complete sentence string, lower cases it,
    tokenizes, removes stop words and does stemming
    Returns the clean list
    -------

    """
    sentence = ' '.join(word.lower() for word in row)
    sentence_tokenized = [w for w in word_tokenize(sentence)]
    clean_row = [
        w for w in sentence_tokenized if w not in stopwords.words('english')
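
A usage sketch for pre_process, assuming the NLTK 'punkt' and 'stopwords' data packages have been downloaded:

row = ["The", "Foxes", "are", "Jumping"]
print(pre_process(row))  # ['fox', 'jump']: stop words removed, tokens stemmed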
Example #5
def __init__(self, args):
    self.vocab = Vocabulary()
    self.performance = Performance(args)
    self.stemmer = NltkPorterStemmer()
    self.nlp = spacy.load('en')
Example #6
    def __init__(
        self,
        lazy: bool = False,
        sample: int = -1,
        lf_syntax: str = None,
        replace_world_entities: bool = False,
        align_world_extractions: bool = False,
        gold_world_extractions: bool = False,
        tagger_only: bool = False,
        denotation_only: bool = False,
        world_extraction_model: Optional[str] = None,
        skip_attributes_regex: Optional[str] = None,
        entity_bits_mode: Optional[str] = None,
        entity_types: Optional[List[str]] = None,
        lexical_cues: List[str] = None,
        tokenizer: Tokenizer = None,
        question_token_indexers: Dict[str, TokenIndexer] = None,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._question_token_indexers = question_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._entity_token_indexers = self._question_token_indexers
        self._sample = sample
        self._replace_world_entities = replace_world_entities
        self._lf_syntax = lf_syntax
        self._entity_bits_mode = entity_bits_mode
        self._align_world_extractions = align_world_extractions
        self._gold_world_extractions = gold_world_extractions
        self._entity_types = entity_types
        self._tagger_only = tagger_only
        self._denotation_only = denotation_only
        self._skip_attributes_regex = None
        if skip_attributes_regex is not None:
            self._skip_attributes_regex = re.compile(skip_attributes_regex)
        self._lexical_cues = lexical_cues

        # Recording of entities in categories relevant for tagging
        all_entities = {}
        all_entities["world"] = ["world1", "world2"]
        # TODO: Clarify this into an appropriate parameter
        self._collapse_tags = ["world"]

        self._all_entities = None
        if entity_types is not None:
            if self._entity_bits_mode == "collapsed":
                self._all_entities = entity_types
            else:
                self._all_entities = [e for t in entity_types for e in all_entities[t]]

        logger.info(f"all_entities = {self._all_entities}")

        # Base world, depending on LF syntax only
        self._knowledge_graph = KnowledgeGraph(
            entities={"placeholder"}, neighbors={}, entity_text={"placeholder": "placeholder"}
        )
        self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

        # Decide dynamic entities, if any
        self._dynamic_entities: Dict[str, str] = dict()
        self._use_attr_entities = False
        if "_attr_entities" in lf_syntax:
            self._use_attr_entities = True
            qr_coeff_sets = self._world.qr_coeff_sets
            for qset in qr_coeff_sets:
                for attribute in qset:
                    if (
                        self._skip_attributes_regex is not None
                        and self._skip_attributes_regex.search(attribute)
                    ):
                        continue
                    # Get text associated with each entity, both from entity identifier and
                    # associated lexical cues, if any
                    entity_strings = [words_from_entity_string(attribute).lower()]
                    if self._lexical_cues is not None:
                        for key in self._lexical_cues:
                            if attribute in LEXICAL_CUES[key]:
                                entity_strings += LEXICAL_CUES[key][attribute]
                    self._dynamic_entities["a:" + attribute] = " ".join(entity_strings)

        # Update world to include dynamic entities
        if self._use_attr_entities:
            logger.info(f"dynamic_entities = {self._dynamic_entities}")
            neighbors: Dict[str, List[str]] = {key: [] for key in self._dynamic_entities}
            self._knowledge_graph = KnowledgeGraph(
                entities=set(self._dynamic_entities.keys()),
                neighbors=neighbors,
                entity_text=self._dynamic_entities,
            )
            self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

        self._stemmer = NltkPorterStemmer()

        self._world_tagger_extractor = None
        self._extract_worlds = False
        if world_extraction_model is not None:
            logger.info("Loading world tagger model...")
            self._extract_worlds = True
            self._world_tagger_extractor = WorldTaggerExtractor(world_extraction_model)
            logger.info("Done loading world tagger model!")

        # Convenience regex for recognizing attributes
        self._attr_regex = re.compile(r"""\((\w+) (high|low|higher|lower)""")
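
The convenience regex at the end matches attribute expressions inside QuaRel logical forms. A quick illustration (the sample logical-form string is invented):

import re

attr_regex = re.compile(r"""\((\w+) (high|low|higher|lower)""")
m = attr_regex.search("(infer (friction low world1))")  # hypothetical LF
print(m.groups())  # ('friction', 'low')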