Example #1
 def process_files(self, input_text_file, input_da_file, skip_hello=False):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods).
     @param input_text_file: path to the input file with sentences
     @param input_da_file: path to the input file with DAs
     @param skip_hello: if True, remove hello() DAs from the output
     """
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert(len(self._das) == len(self._sents))
     # skip hello() DAs, if required
     if skip_hello:
         pos = 0
         while pos < len(self._das):
             da = self._das[pos]
             if len(da) == 1 and da[0].da_type == 'hello':
                 del self._das[pos]
                 del self._sents[pos]
             else:
                 pos += 1
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
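
A minimal usage sketch for the method above; Reader is a placeholder for whatever class defines process_files(), and the file names are made up. Judging from the loops, both input files hold one item per line, in parallel order:

    # hypothetical usage; Reader and the file names are placeholders
    reader = Reader()
    # sents.txt holds one sentence per line; das.txt holds one DA string per line
    # (e.g. "inform()"), aligned with sents.txt
    reader.process_files('sents.txt', 'das.txt', skip_hello=True)
    # the parsed data now sits in reader._das and reader._sents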
Example #2
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
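
The data_portion argument controls how much of the loaded data is kept before the extra empty instance is appended; a standalone illustration of that size arithmetic, with made-up numbers:

    # plain-Python illustration of the train_size computation above
    data_portion = 0.8
    trees = range(1000)                                   # stand-in for the loaded trees
    train_size = int(round(data_portion * len(trees)))    # 800
    train_size += 1                                       # plus the empty tree/DA instance
    print(train_size)                                     # 801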
Example #3
    def process_dataset(self, input_data):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_data: path to the input JSON file with the data
        """
        # load data from JSON
        self._das = []
        self._texts = []
        with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
            data = json.load(fh)
            for inst in data:
                da = DA.parse(inst['da'])
                da.sort()
                self._das.append(da)
                self._texts.append(self.analyze(inst['text']))

        # delexicalize DAs and sentences
        self._create_delex_texts()
        self._create_delex_das()

        # return the result
        out = []
        for da, text, delex_da, delex_text, abst in zip(
                self._das, self._texts, self._delex_das, self._delex_texts,
                self._absts):
            out.append(Inst(da, text, delex_da, delex_text, abst))
        return out
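
Judging from the key accesses in the loop ('da' and 'text'), the input is a JSON list of objects; a hypothetical two-instance file with made-up values could be written like this:

    import codecs
    import json

    # toy input for process_dataset(); the DA strings and texts are placeholders
    toy_data = [{'da': 'inform()', 'text': 'Hello .'},
                {'da': 'inform()', 'text': 'Thank you .'}]
    with codecs.open('toy.json', 'w', encoding='UTF-8') as fh:
        json.dump(toy_data, fh)
    # process_dataset('toy.json') would then return a list of two Inst objects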
Example #4
 def process_files(self, input_text_file, input_da_file, skip_hello=False):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods).
     @param input_text_file: path to the input file with sentences
     @param input_da_file: path to the input file with DAs
     @param skip_hello: if True, remove hello() DAs from the output
     """
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert (len(self._das) == len(self._sents))
     # skip hello() DAs, if required
     if skip_hello:
         pos = 0
         while pos < len(self._das):
             da = self._das[pos]
             if len(da) == 1 and da[0].da_type == 'hello':
                 del self._das[pos]
                 del self._sents[pos]
             else:
                 pos += 1
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
Example #5
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
Example #6
 def process_files(self, input_text_file, input_da_file):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods)."""
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert (len(self._das) == len(self._sents))
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
Example #7
def convert(args):
    src = lines_to_list(args.src_file)
    if args.das:
        src = [DA.parse(da_text).to_cambridge_da_string() for da_text in src]
    ref = lines_to_list(args.ref_file)
    columns = ['mr', 'orig_ref']
    df = pd.DataFrame.from_dict({'mr': src, 'orig_ref': ref})

    if args.system_output:
        sys = lines_to_list(args.system_output)
        df['system_ref'] = sys
        columns.append('system_ref')

    if args.score:
        score = [float(score) for score in lines_to_list(args.score)]
        df['quality'] = score
        columns.append('quality')

    df.to_csv(args.out_file,
              columns=columns,
              sep=b"\t",
              index=False,
              encoding='UTF-8')
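
The function only reads attributes from args, so a direct call can be sketched from those accesses alone; the file names below are placeholders and the optional columns are left out:

    import argparse

    # hypothetical invocation; attribute names follow the accesses in convert() above
    args = argparse.Namespace(src_file='das.txt', das=True,
                              ref_file='refs.txt',
                              system_output=None, score=None,
                              out_file='out.tsv')
    convert(args)   # writes a tab-separated file with 'mr' and 'orig_ref' columns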
Example #8
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language,
                                         self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language,
                                             self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(
                trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        if isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [
                da.get_delexicalized(self.delex_slots) for da in self.train_das
            ]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)
        log_info('Number of binary classes: %d.' %
                 len(self.da_vect.get_feature_names()))

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
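
As the docstring notes, das and trees may be either file names or already-loaded lists; a hedged sketch of both call styles, where clf, the file names, and the in-memory lists are placeholders:

    # read everything from files and use all of the data
    clf._init_training('train-das.txt', 'train-trees.pickle', data_portion=1.0)
    # or pass pre-loaded lists and use only half of them
    clf._init_training(list_of_das, list_of_trees, data_portion=0.5)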
Example #9
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions
    contexts = []  # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(
                    abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

        print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1

        # repeat DAs and contexts for synonymous paraphrases, except for test data in multi-ref mode
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size,
                   repeat_num)

        # write all the others just once (here, each instance is a list and will be unrolled)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)
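
A standalone illustration of the split arithmetic above: with an assumed --split 8:1:1 over 1000 items, the last two parts get their rounded shares and whatever remains goes to the first part:

    # same proportional-split logic as in convert(), on made-up numbers
    items, data_sizes = 1000, [8, 1, 1]
    total, remain = float(sum(data_sizes)), items
    for part_no in range(len(data_sizes) - 1, 0, -1):
        part_size = int(round(items * (data_sizes[part_no] / total)))
        data_sizes[part_no] = part_size
        remain -= part_size
    data_sizes[0] = remain
    print(data_sizes)   # [800, 100, 100]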
Example #10
def read_sfx_data():
    with codecs.open('data/sfrest-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/sfrest-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse(mr.strip()) for mr in fh.readlines()]
    return mrs, refs
Example #11
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        if isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)
        log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
Example #12
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    conc_das = []  # concrete DAs
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions
    contexts = []  # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            conc_das.append(conc_da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

        print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1

        # repeat DAs and contexts for synonymous paraphrases, except for test data in multi-ref mode
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-conc_das.txt', conc_das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num)

        # write all the others just once (here, each instance is a list and will be unrolled)
        write_part(part_name + '-ref.txt', concs, part_size, trunc=False, separate=True)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)