Example #1
    def read_extracted_data(self) -> None:
        """ Read existed data.

        Data consists of common data and specific mode data.

        """

        # Common data
        if isfile(DatasetConfig.common_raw_data_file):
            common_data: CommonData = load_pkl(
                DatasetConfig.common_raw_data_file)
            self.dialog_vocab = common_data.dialog_vocab
            self.glove = common_data.glove
            self.image_url_id = common_data.image_url_id
            self.image_paths = common_data.image_paths

        # Specific mode data
        if self.mode & TRAIN_MODE and isfile(
                DatasetConfig.train_raw_data_file):
            train_data = load_pkl(DatasetConfig.train_raw_data_file)
            self.train_dialogs = train_data
        if self.mode & VALID_MODE and isfile(
                DatasetConfig.valid_raw_data_file):
            valid_data = load_pkl(DatasetConfig.valid_raw_data_file)
            self.valid_dialogs = valid_data
        if self.mode & TEST_MODE and isfile(DatasetConfig.test_raw_data_file):
            test_data = load_pkl(DatasetConfig.test_raw_data_file)
            self.test_dialogs = test_data
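All of these examples call a small load_pkl utility rather than using pickle directly. The helper is project-specific, but a minimal sketch of such a pair, assuming the standard pickle module (the real utilities may differ in protocol or error handling):

import pickle

def load_pkl(path):
    # Deserialize and return the object stored in the pickle file at *path*.
    with open(path, 'rb') as f:
        return pickle.load(f)

def dump_pkl(obj, path):
    # Serialize *obj* to the pickle file at *path*.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)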
Example #2
    def train_setup(self):

        from featlists.ci import feats as flist

        assert "train_pkl" in self.params and "dev_pkl" in self.params
        logger.info(" * TRAINING: %s stage * " % self.name)

        vocab = util.load_pkl(self.params["vocab_fn"])
        train_data = util.load_pkl(self.params["train_pkl"])
        val_data = util.load_pkl(self.params["dev_pkl"])
        test_data = util.load_pkl(self.params["test_pkl"]) if "test_pkl" in self.params else None

        self.fx = Fxtractor(fx_maps=collections.defaultdict(), labels=vocab["ctypes"], flist=flist)
        self.fx.fx_dataset_ci(train_data, train=True)
        self.fx.fx_dataset_ci(val_data, train=False)
        self.fx.fx_maps["vidx"] = copy.deepcopy(vocab["vidx"])

        n_features = self.fx.fx_maps["ci_fdim"]
        n_classes = len(self.fx.fx_maps["ci_l2i"])
        self.clf = Perceptron.SparsePerceptron()
        self.clf.init_w(n_features, n_classes)

        self.ruleset = Ruleset()
        self.ruleset.load_db()

        logger.debug("Data: %d (train), %d (dev). Num feats %d" % (len(train_data), len(val_data), n_features))
        logger.debug("Used features: \n%s", "\n".join(self.fx.flist))

        return vocab, train_data, val_data, test_data
Example #3
    def train_setup(self):

        from featlists.ri import feats as flist

        assert "train_pkl" in self.params and "dev_pkl" in self.params
        logger.info(" * TRAINING: %s stage * " % self.name)

        vocab = util.load_pkl(self.params["vocab_fn"])
        train_data = util.load_pkl(self.params["train_pkl"])
        val_data = util.load_pkl(self.params["dev_pkl"])
        test_data = util.load_pkl(self.params["test_pkl"]) if "test_pkl" in self.params else None

        labels = ["<null>", "<unk>"] + vocab["edges"].keys()
        self.fx = Fxtractor(fx_maps=collections.defaultdict(), labels=labels, flist=flist)
        self.fx.fx_dataset_ri(train_data, train=True)
        self.fx.fx_dataset_ri(val_data, train=False)

        n_features = self.fx.fx_maps["ri_fdim"]
        n_classes = len(self.fx.fx_maps["ri_l2i"])
        self.clf = Perceptron.SparsePerceptron()
        self.clf.init_w(n_features, n_classes)

        self.amr_printer = Printer()

        logger.debug("Data: %d (train), %d (dev). Num feats %d" % (len(train_data), len(val_data), n_features))
        logger.debug("Used features: \n%s", "\n".join(self.fx.flist))

        return vocab, train_data, val_data, test_data
Example #4
    def save_data(self):
        """ Save QNAS data in a pickle file for logging and reloading purposes, including
            chromosomes, generation number, evaluation score and number of evaluations. Note
            that the data in the file is loaded and updated with the current generation, so that
            we keep track of the entire evolutionary process.
        """

        if self.current_gen == 0:
            data = dict()
        else:
            data = load_pkl(self.data_file)

        data[self.current_gen] = {
            'time': str(datetime.datetime.now()),
            'total_eval': self.total_eval,
            'best_so_far': self.best_so_far,
            'best_so_far_id': self.best_so_far_id,
            'fitnesses': self.fitnesses,
            'raw_fitnesses': self.raw_fitnesses,
            'lower': self.qpop_params.lower,
            'upper': self.qpop_params.upper,
            'params_pop': self.qpop_params.current_pop,
            'net_probs': self.qpop_net.probabilities,
            'num_net_nodes': self.qpop_net.chromosome.num_genes,
            'net_pop': self.qpop_net.current_pop
        }

        self.dump_pkl_data(data)
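After several generations, the pickled log is a dict keyed by generation number, each value holding the fields written above. A hedged sketch of replaying such a log (field names are taken from save_data; the data_file variable is illustrative):

log = load_pkl(data_file)  # e.g. {0: {...}, 1: {...}, ...}
for gen in sorted(log):
    entry = log[gen]
    # 'time', 'total_eval' and 'best_so_far' are keys written by save_data above.
    print(gen, entry['time'], entry['total_eval'], entry['best_so_far'])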
Example #5
    def load_qnas_data(self, file_path):
        """ Read pkl data in *file_path* and load its information to current QNAS. It also saves
            its info into the new pkl data file *self.data_file*.

        Args:
            file_path: (str) path to the pkl data file.
        """

        log_data = load_pkl(file_path)

        if not os.path.exists(self.data_file):
            self.dump_pkl_data(log_data)

        generation = max(log_data.keys())
        log_data = log_data[generation]

        self.current_gen = generation
        self.total_eval = log_data['total_eval']
        self.best_so_far = log_data['best_so_far']
        self.best_so_far_id = log_data['best_so_far_id']
        self.qpop_net.chromosome.set_num_genes(log_data['num_net_nodes'])

        self.fitnesses = log_data['fitnesses']
        self.raw_fitnesses = log_data['raw_fitnesses']
        self.qpop_params.lower = log_data['lower']
        self.qpop_params.upper = log_data['upper']
        self.qpop_net.probabilities = log_data['net_probs']

        self.qpop_params.current_pop = log_data['params_pop']
        self.qpop_net.current_pop = log_data['net_pop']
Example #6
    def load_evolved_data(self, generation=None, individual=0):
        """ Read the yaml log *self.files_spec['data_file']* and get values from the individual
            specified by *generation* and *individual*.

        Args:
            generation: (int) generation number from which data will be loaded. If None, loads
                the last generation data.
            individual: (int) number of the classical individual to be loaded. If no number is
                specified, individual 0 is loaded (the one with the highest fitness in the given
                *generation*).
        """

        log_data = load_pkl(self.files_spec['data_file'])

        if generation is None:
            generation = max(log_data.keys())

        log_data = log_data[generation]

        params_pop = log_data['params_pop']
        net_pop = log_data['net_pop']

        assert individual < net_pop.shape[0], \
            "The individual number must be smaller than the population size!"

        params = QChromosomeParams(
                params_ranges=self.QNAS_spec['params_ranges']).decode(params_pop[individual])
        net = QChromosomeNetwork(
                fn_list=self.QNAS_spec['fn_list'],
                max_num_nodes=log_data['num_net_nodes']).decode(net_pop[individual])

        self.evolved_params = {'params': params, 'net': net}
Example #7
def tfidf(data_set, space_path):
    if os.path.exists(space_path):
        tfidf_space = load_pkl(space_path)
    else:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf_space = transformer.fit_transform(
            vectorizer.fit_transform(data_set))
        dump_pkl(tfidf_space, space_path)
    print('tfidf shape:', tfidf_space.shape)
    return tfidf_space
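For reference, a hypothetical call to this cache-aware helper (corpus and path are made up for illustration): the first call fits CountVectorizer and TfidfTransformer and dumps the matrix, and later calls reload it from space_path.

corpus = ['the cat sat', 'the dog barked', 'cats and dogs']
space = tfidf(corpus, 'data/tfidf_space.pkl')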
Example #8
def load_params(exp_path, generation=None, individual=0):
    """ Load the parameters from *exp_path/log_params_evolution.txt* and the data from
        *exp_path/data_QNAS.txt*. The data loaded is the network encoded by individual
        *individual* of generation *generation*.

    Args:
        exp_path: (str) path to the directory containing evolution files.
        generation: (int) the generation number of the individual to be profiled.
            If *None*, the last generation will be used.
        individual: (int) the number of the individual in *generation* to be profiled.

    Returns:
        dict holding all the necessary parameters and data.
    """

    log_file_path = os.path.join(exp_path, 'log_params_evolution.txt')
    log_data_path = os.path.join(exp_path, 'data_QNAS.pkl')

    params = load_yaml(log_file_path)
    log_data = load_pkl(log_data_path)

    input_shape = (1, params['train_data_info']['height'],
                   params['train_data_info']['width'],
                   params['train_data_info']['num_channels'])

    # Load last generation, if it is not specified
    if generation is None:
        generation = max(log_data.keys())

    log_data = log_data[generation]
    nets = log_data['net_pop']

    net = QChromosomeNetwork(
        fn_list=params['QNAS']['fn_list'],
        max_num_nodes=params['QNAS']['max_num_nodes']).decode(nets[individual])
    loaded_params = {
        'individual_id_str':
        f"Generation {generation} - individual {individual}",
        'individual_id': (generation, individual),
        'net_list': net,
        'input_shape': input_shape,
        'num_classes': params['train_data_info']['num_classes'],
        'fn_dict': params['fn_dict'],
        'fn_list': params['QNAS']['fn_list']
    }

    return loaded_params
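A hypothetical call (the experiment directory is illustrative; the keys follow the dict built above):

loaded = load_params('experiments/qnas_run')  # last generation, individual 0
print(loaded['individual_id_str'])
print(loaded['net_list'])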
Example #9
    def __init__(self):
        self.styletips_data: StyleTipsData = None
        self.celebrity_data: CelebrityData = None
        self.attribute_data: AttributeData = None

        if isfile(DatasetConfig.knowledge_data_file):
            # Read existing extracted data files.
            knowledge_data = load_pkl(DatasetConfig.knowledge_data_file)
            self.styletips_data = knowledge_data.styletips_data
            self.celebrity_data = knowledge_data.celebrity_data
            self.attribute_data = knowledge_data.attribute_data
        else:
            # Load data from the raw data files and save them into a pkl.
            self.styletips_data = StyleTipsData.from_file()
            self.celebrity_data = CelebrityData.from_file()
            self.attribute_data = AttributeData.from_file()
            save_pkl(self, 'KnowledgeData', DatasetConfig.knowledge_data_file)
Example #10
def main(params):
    captions_file = params.captions_file
    output_file = params.output_file

    vids = util.load_pkl(captions_file)
    st_model = skipthoughts.load_model()

    skip_vectors = {}
    for vid in vids.keys():

        caps = vids[vid]
        num_caps = len(caps)

        raw_caps = ['' for x in range(num_caps)]

        for cap in caps:
            raw_caps[int(cap['cap_id'])] = cap['tokenized']

        vector = skipthoughts.encode(st_model, raw_caps, verbose=False)

        skip_vectors[vid] = vector

    util.dump_pkl(skip_vectors, output_file)
Example #11
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from algs import load_alg
import util
from feature_data import DataFeatures

data = util.load_pkl('data/ml/graph_features.pkl')
alg = load_alg('Random_Forest')

clf = alg.clf
X = data.get_joint_matrix(util.features)

train_x = X[data.train_indices]
# train_y = data.labels[data.train_indices]
val_x = X[data.val_indices]
# val_y = data.labels[data.val_indices]
y = data.labels[data.train_indices]

# Overwrite labels with dummy binary classes: indices 0-50 -> 0, the rest -> 1.
for i in range(len(y)):
    if i > 50:
        y[i] = 1
    else:
        y[i] = 0

# Fix feature values
feature_ind = [1, 2]
filler_vals = {0: 1, 3: 1, 4: 1, 5: 1}
filler_ranges = {0: 1, 3: 1, 4: 1, 5: 1}
# feature_cols = {0 : 'degree', 1: 'clustering', 2: 'comm_edge_density',
#                 3: 'comm_sz', 4: 'comm_review_count', 5: 'split'}
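The snippet stops before the actual plot. A hedged sketch of how mlxtend's plot_decision_regions is typically called with the filler-feature dicts prepared above (fitting the classifier here is an assumption; the original may have fitted it elsewhere):

import numpy as np

y = np.asarray(y, dtype=int)  # plot_decision_regions expects integer class labels
clf.fit(train_x, y)           # assumption: fit on all six features before plotting
plot_decision_regions(X=train_x, y=y, clf=clf,
                      feature_index=feature_ind,
                      filler_feature_values=filler_vals,
                      filler_feature_ranges=filler_ranges)
plt.show()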
Example #12
def train(task: int, model_file_name: str):
    """Train model.
    Args:
        task (int): Task identifier (one of INTENTION_TASK, TEXT_TASK, RECOMMEND_TASK, KNOWLEDGE_TASK).
        model_file_name (str): Model file name (saved or to be saved).
    """

    # Check if data exists.
    if not isfile(DatasetConfig.common_raw_data_file):
        raise ValueError('No common raw data.')

    # Load extracted common data.
    common_data: CommonData = load_pkl(DatasetConfig.common_raw_data_file)

    # Dialog data files.
    train_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, TRAIN_MODE)
    valid_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, VALID_MODE)
    test_dialog_data_file = DatasetConfig.get_dialog_filename(task, TEST_MODE)
    if not isfile(train_dialog_data_file):
        raise ValueError('No train dialog data file.')
    if not isfile(valid_dialog_data_file):
        raise ValueError('No valid dialog data file.')

    # Load extracted dialogs.
    train_dialogs: List[TidyDialog] = load_pkl(train_dialog_data_file)
    valid_dialogs: List[TidyDialog] = load_pkl(valid_dialog_data_file)
    test_dialogs: List[TidyDialog] = load_pkl(test_dialog_data_file)

    if task in {KNOWLEDGE_TASK}:
        knowledge_data = KnowledgeData()

    # Dataset wrap.
    train_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  #common_data.obj_id,
        train_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    valid_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  #common_data.obj_id,
        valid_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    test_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  #common_data.obj_id,
        test_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)

    print('Train dataset size:', len(train_dataset))
    print('Valid dataset size:', len(valid_dataset))
    print('Test dataset size:', len(test_dataset))

    # Get initial embedding.
    vocab_size = len(common_data.dialog_vocab)
    embed_init = get_embed_init(common_data.glove,
                                vocab_size).to(GlobalConfig.device)

    # Context model configurations.
    context_text_encoder_config = ContextTextEncoderConfig(
        vocab_size, embed_init)
    context_image_encoder_config = ContextImageEncoderConfig()
    context_encoder_config = ContextEncoderConfig()

    # Context models.
    context_text_encoder = TextEncoder(context_text_encoder_config)
    context_text_encoder = context_text_encoder.to(GlobalConfig.device)
    context_image_encoder = ImageEncoder(context_image_encoder_config)
    context_image_encoder = context_image_encoder.to(GlobalConfig.device)
    context_encoder = ContextEncoder(context_encoder_config)
    context_encoder = context_encoder.to(GlobalConfig.device)

    # Load model file.
    model_file = join(DatasetConfig.dump_dir, model_file_name)
    if isfile(model_file):
        state = torch.load(model_file)
        # if task != state['task']:
        #     raise ValueError("Task doesn't match.")
        context_text_encoder.load_state_dict(state['context_text_encoder'])
        context_image_encoder.load_state_dict(state['context_image_encoder'])
        context_encoder.load_state_dict(state['context_encoder'])

    # Task-specific parts.
    if task == INTENTION_TASK:
        intention_train(context_text_encoder, context_image_encoder,
                        context_encoder, train_dataset, valid_dataset,
                        test_dataset, model_file)
    elif task == TEXT_TASK:
        text_train(context_text_encoder, context_image_encoder,
                   context_encoder, train_dataset, valid_dataset, test_dataset,
                   model_file, common_data.dialog_vocab, embed_init)
    elif task == RECOMMEND_TASK:
        recommend_train(context_text_encoder, context_image_encoder,
                        context_encoder, train_dataset, valid_dataset,
                        test_dataset, model_file, vocab_size, embed_init)
    elif task == KNOWLEDGE_TASK:
        knowledge_attribute_train(context_text_encoder, context_image_encoder,
                                  context_encoder, train_dataset,
                                  valid_dataset, test_dataset, model_file,
                                  knowledge_data.attribute_data,
                                  common_data.dialog_vocab, embed_init)
Example #13
def load_alg(name):
    path = 'data/ml/results/' + name + '.pkl'
    if os.path.isfile(path):
        return util.load_pkl(path)
    model_name = name.split('-')[0]
    return Algorithm(name, util.model_dict[model_name])
Example #14
import config
from util import tokenize, preprocess, load_pkl, dump_pkl

bigram_path = 'data/kenlm/zhwiki_bigram.klm'
bigram = kenlm.Model(bigram_path)
print('Loaded bigram language model from {}'.format(bigram_path))

trigram_path = 'data/kenlm/zhwiki_trigram.klm'
trigram = kenlm.Model(trigram_path)
print('Loaded trigram language model from {}'.format(trigram_path))

text_path = 'data/train_input.txt'
text_counter_path = 'data/train_input_counter.pkl'
# Character frequency statistics
if os.path.exists(text_counter_path):
    char_counter = load_pkl(text_counter_path)
else:
    print('generate counter from text file:', text_path)
    char_counter = Counter((codecs.open(text_path, 'r', encoding='utf-8').read()))
    dump_pkl(char_counter, text_counter_path)


def load_same_pinyin(path, sep='\t'):
    """
    加载同音字
    :param path:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        print("file not exists:", path)
Example #15

if __name__ == "__main__":
    feature_path = 'data/ml/graph_features.pkl'
    x = DataFeatures(folder, True)
    for name in names:
        opts = {}
        if name == "Decision_Tree":
            opts = {
                'min_samples_split': 20,
                'max_features': 'log2',
                'min_samples_leaf': 20
            }
        name += "-extra"
        a = algs.load_alg(name)
        data = util.load_pkl(feature_path)

        a.run(data, util.features, clf_options=opts)
        a.to_csv()

    # folder = 'edge_rem_split_angle_norm'
    # feature_path = 'data/ml/graph_features_{}.pkl'.format(folder)
    # x = DataFeatures(folder, True)
    # for name in names:
    #     opts = {}
    #     if name == "Decision_Tree":
    #         opts = {'min_samples_split': 20, 'max_features': 'log2', 'min_samples_leaf': 20}
    #     name += "-ROUND2-angle-norm"
    #     a = algs.load_alg(name)
    #     data = util.load_pkl(feature_path)
    #
Example #16
def retrieve_data():
    X_train, X_test, y_train, y_test = util.load_pkl(
        "../cleaned_data/data-new.pkl")
    return X_train, X_test, y_train, y_test
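The inverse operation presumably pickled the split as a 4-tuple. A hedged sketch of producing such a file (names and split parameters are illustrative; util.dump_pkl follows the signature used elsewhere on this page):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
util.dump_pkl((X_train, X_test, y_train, y_test), "../cleaned_data/data-new.pkl")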
Example #17
import matplotlib.pyplot as plt
import numpy as np
import util

seed_text = util.load_pkl('./results/seed_text_gru4.pkl')
print("seed_text: ", seed_text)
theta = util.load_pkl('./results/theta_gru4.pkl')
#theta = np.exp(theta)
#theta = theta / np.sum(theta)
theta = theta[0]
print("theta: ", theta)
N = len(theta)
x = range(N)
width = 1 / 2.0
plt.bar(x, theta, width, color="blue")
plt.title("Inferred Topic Distribution from TopicGRU", fontsize=20)
plt.savefig("theta_gru4.pdf")
plt.show()
Example #18
def retrieve_model():
    model = util.load_pkl("../models/NN.pkl")
    return model