Example #1
def add_features_cross_smooth_ctr(all_data):
    '''
    Add cross smoothed-CTR features to the full data set.
    feature  in ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    feature2 in ['item_id', 'item_brand_id', 'shop_id', 'item_price_level', 'hour']
    join key: [feature, feature2, 'day']
    '''
    for feature in tqdm([
            'user_gender_id', 'user_age_level', 'user_occupation_id',
            'user_star_level'
    ]):

        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'shop_id', 'item_price_level',
                'hour'
        ]):

            feature_path = feature_data_path + feature + '_' + feature2 + '_smooth_CTR.pkl'  # path of the stored feature file

            if not os.path.exists(feature_path):
                gen_features_cross_smooth_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data,
                                ctr_data,
                                how='left',
                                on=[feature, feature2, 'day'])
    return all_data
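Every example on this page calls project-local helpers such as load_pickle, dump_pickle or save_pickle rather than the pickle module directly, and none of the repositories' helper definitions are reproduced here. A minimal sketch of what such helpers typically look like, assuming they are thin wrappers around the standard pickle module, is:

import os
import pickle


def load_pickle(path):
    # Read a single pickled object back from `path`.
    with open(path, 'rb') as f:
        return pickle.load(f)


def dump_pickle(obj, path, protocol=pickle.HIGHEST_PROTOCOL):
    # Write `obj` to `path`, creating the parent directory if needed.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol)

The real helpers differ between repositories; in particular the argument order of save_pickle varies across the examples below (some pass the path first, others the object first), so these signatures are an assumption, not a shared API.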
def compute_codes_orig_it(track_ids, maindir, clique_ids, start_idx, end_idx):
    """Computes the original features, based on Thierry and Ellis, 2012.
    Dimensionality reduction using PCA of 50, 100, and 200 components."""
    res = []
    trainedpca = utils.load_pickle("models/pca_250Kexamples_900dim_nocovers.pkl")
    pca_components = [50,100,200]

    # Init codes
    codes = []
    for n_comp in pca_components:
        codes.append(np.ones((end_idx-start_idx,n_comp)) * np.nan)

    for i, tid in enumerate(track_ids[start_idx:end_idx]):
        path = utils.path_from_tid(maindir, tid)
        feats = utils.extract_feats(path)
        if feats is None:
            continue
        med = np.median(feats, axis=0)
        for pca_idx, n_comp in enumerate(pca_components):
            tmp = dan_tools.chromnorm(med.reshape(med.shape[0], 1)).squeeze()
            codes[pca_idx][i] = trainedpca.apply_newdata(tmp, ndims=n_comp)
        if i % 1000 == 0:
            logger.info("Computed %d of %d track(s)" % (i, end_idx-start_idx))
    res = (codes, track_ids[start_idx:end_idx], clique_ids[start_idx:end_idx])
    return res
def load_codes(codesdir, lda_idx, max_files=None):
    code_files = glob.glob(os.path.join(codesdir, "*.pk"))
    if lda_idx == 0:
        n_comp = 50
    elif lda_idx == 1:
        n_comp = 100
    elif lda_idx == 2:
        n_comp = 200
    elif lda_idx == -1:
        n_comp = 2045
    feats = np.empty((0,n_comp))
    track_ids = []
    clique_ids = []
    if max_files is not None:
        code_files = code_files[:max_files]
    for code_file in code_files:
        codes = utils.load_pickle(code_file)
        feats = np.append(feats, codes[0][lda_idx], axis=0)
        track_ids += codes[1]
        clique_ids += list(codes[2])

    track_ids = np.asarray(track_ids)
    clique_ids = np.asarray(clique_ids)

    return feats, track_ids, clique_ids
Example #4
def add_app_hist_install(data):
    feature_path = feature_data_path + 'app_hist_install.pkl'
    app_hist_install = load_pickle(feature_path)
    # normalise the cumulative install count by the number of elapsed days
    # before joining, so the adjusted values end up in the returned frame
    app_hist_install['app_hist_install'] = app_hist_install[
        'app_hist_install'] / (app_hist_install['clickDay'] - 1)
    data = pd.merge(data, app_hist_install, 'left', ['appID', 'clickDay'])
    return data
Example #5
def load_titles():
    titles_train, titles_val, titles_test = load_pickle(
        os.path.join(CACHE_DIR, 'titles.pkl'))
    print(
        f'Titles: {len(titles_train)}, {len(titles_val)}, {len(titles_test)}')

    return titles_train, titles_val, titles_test
Example #6
    def __init__(self, name=None, mode='ranking',
                 # model initialization
                 load_weights_from=None, weights_file=None,
                 # neural network architecture
                 layer_sizes=None, activation='relu', dropout=0.5, freeze_embeddings=False,
                 # error penalties for heuristic ranking objective
                 FN=0.8, FL=0.5 if directories.CHINESE else 0.4, WL=1.0,
                 # learning rates
                 all_pairs_lr=0.002, top_pairs_lr=0.0002, ranking_lr=0.000002,
                 reinforce_lr=0.000002, reward_rescaling_lr=0.00002,
                 # which speaker and string-matching features to use
                 pair_features=None,
                 # mention features
                 use_length=True, use_mention_type=True,  use_position=True, use_dep_reln=False,
                 # distance and genre features
                 use_distance=True, use_genre=True,
                 # averaged word embedding features
                 use_spans=True, use_doc_embedding=True):
        if layer_sizes is None:
            layer_sizes = [1000, 500, 500]
        if pair_features is None:
            pair_features=[
               # speaker features
               "same-speaker",
               "antecedent-is-mention-speaker",
               "mention-is-antecedent-speaker",
               # string-matching features
               "relaxed-head-match",
               "exact-string-match",
               "relaxed-string-match",
           ]

        self.load_weights_from = load_weights_from
        self.weights_file = weights_file
        self.layer_sizes = layer_sizes
        self.activation = activation
        self.dropout = dropout
        self.freeze_embeddings = freeze_embeddings
        self.FN, self.FL, self.WL = FN, FL, WL
        self.ranking_lr = ranking_lr
        self.reinforce_lr = reinforce_lr
        self.reward_rescaling_lr = reward_rescaling_lr
        self.top_pairs_lr = top_pairs_lr
        self.all_pairs_lr = all_pairs_lr

        self.use_length = use_length
        self.use_mention_type = use_mention_type
        self.use_position = use_position
        self.use_dep_reln = use_dep_reln
        self.use_distance = use_distance
        self.use_genre = use_genre
        self.use_spans = use_spans
        self.use_doc_embedding = use_doc_embedding

        if os.path.exists(directories.MISC + 'pair_feature_names.pkl'):
            name_mapping = utils.load_pickle(directories.MISC + 'pair_feature_names.pkl')
            self.active_pair_features = sorted([name_mapping[f] for f in pair_features])

        self.set_name(name)
        self.set_mode(mode)
Example #7
def deal_with_postag(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warning('Unexpected (data_list, mode) combination when dealing with postag...')
        raise ValueError('unsupported (data_list, mode) combination for postag')

    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                postag.append([[
                    Converter('zh-hans').convert(word['word'].strip().replace(
                        ' ', '')), word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading with postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
Example #8
def plot_imgrun_loss(p):
    """
    Load the models and losses so we can find the best fit.

    input: params = dict (root_dir, img_run_id, txt_run_id, img_size, kl_weight, ..)
    """

    img_root = f"{p['root_dir']}/imgruns/{p['img_run_id']}/"

    print(img_root)

    epoch_mx = 0
    for filename in glob.glob(os.path.join(img_root,
                                           'saved_data/losses*.pkl')):
        epochs = filename.split(sep="_")[-1].rstrip(".pkl")  # split returns a list
        if int(epochs) > epoch_mx:
            loss_file = filename
            epoch_mx = int(epochs)

    print(loss_file)
    # loops over matching filenames in all subdirectories of `directory`.
    train, test = ut.load_pickle(
        os.path.join(img_root, f"saved_data/losses_{epoch_mx}.pkl"))

    print(len(train))
    print(len(test))
    e_train = range(0, len(train))
    e_test = range(0, len(train), int(len(train) / len(test)))
def load_bias(bias_name) -> Dict[str, np.ndarray]:
    """Load dictionary of example_id->bias where bias is a length 3 array
    of log-probabilities"""

    if bias_name == "hans":
        bias_src = config.MNLI_WORD_OVERLAP_BIAS
        if not exists(bias_src):
            raise Exception("lexical overlap bias file is not found")
        bias = utils.load_pickle(bias_src)
        for k, v in bias.items():
            # Convert from entail vs non-entail to 3-way classes by splitting non-entail
            # to neutral and contradict
            bias[k] = np.array([
                v[0] - np.log(2.),
                v[1],
                v[0] - np.log(2.),
            ])
        return bias

    if bias_name in config.BIAS_SOURCES:
        file_path = config.BIAS_SOURCES[bias_name]
        with open(file_path, "r") as hypo_file:
            all_lines = hypo_file.read()
            bias = json.loads(all_lines)
            for k, v in bias.items():
                bias[k] = np.array(v)
        return bias
    else:
        raise Exception("invalid bias name")
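The np.log(2.) subtraction above turns a two-way distribution over {non-entailment, entailment} into a three-way one by splitting the non-entailment mass evenly between neutral and contradiction in log space. A small worked check with hypothetical log-probabilities:

import numpy as np

v = np.log([0.6, 0.4])  # hypothetical 2-way log-probs: [non-entail, entail]
three_way = np.array([v[0] - np.log(2.), v[1], v[0] - np.log(2.)])
print(np.exp(three_way))  # [0.3, 0.4, 0.3] -- still sums to 1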
Example #10
def main():
    """Save figures showing the pre-existing Urban Observatory network of sensors and
    comparisons with optimised networks using our approach.
    """
    print("Saving Urban Observatory figures...")
    set_fig_style()

    config = get_config()
    lad20cd = lad20nm_to_lad20cd(config["la"])
    networks_path = get_single_obj_filepath(config)
    networks = load_pickle(networks_path)
    uo_sensors = load_uo_sensors(config)

    figs_dir = get_figures_save_dir(config)

    population_groups, all_groups = get_objectives(config)
    oa_weights = get_weights(lad20cd, population_groups)
    theta, _ = get_default_optimisation_params(config)

    uo_sensor_dict = get_uo_sensor_dict(lad20cd, uo_sensors=uo_sensors)
    uo_coverage = get_uo_coverage_oa(lad20cd, uo_sensor_dict, theta,
                                     all_groups, oa_weights)

    fig_uo_sensor_locations(lad20cd, uo_sensors, figs_dir)
    fig_uo_coverage_grid(lad20cd, uo_sensors, theta, figs_dir)
    fig_uo_coverage_grid_diff(lad20cd, uo_sensors, theta, all_groups, networks,
                              figs_dir)
    fig_uo_coverage_oa(uo_coverage, theta, all_groups, figs_dir)
    fig_uo_coverage_oa_diff(lad20cd, uo_coverage, theta, all_groups, networks,
                            figs_dir)
Example #11
def get_arguments():
    args = build_parser()
    # set random seed for reproducible experiments
    # reference: https://github.com/pytorch/pytorch/issues/7068
    random.seed(args.random_seed)
    numpy.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # these flags can affect performance, select carefully
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    os.makedirs(args.save_path, exist_ok=True)
    if args.train_flag:
        os.makedirs(os.path.join(args.save_path, 'training_log'),
                    exist_ok=True)
    else:
        loaded_args = load_pickle(
            os.path.join(os.path.dirname(args.model_load), 'argument.pickle'))
        args = update_arguments_for_eval(args, loaded_args)

    # cuda setting
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = ', '.join(map(str, args.gpu_no))

    with open(os.path.join(args.save_path, 'argument.txt'), 'w') as f:
        for key, value in sorted(vars(args).items()):
            f.write('%s: %s' % (key, value) + '\n')

    save_pickle(os.path.join(args.save_path, 'argument.pickle'), args)
    return args
Example #12
def add_features_cross_day_ctr(all_data):
    '''
    Add cross before-day CTR features to the full data set.
    feature  in ['user_id']
    feature2 in ['item_id', 'item_brand_id', 'shop_id', 'category2_label', 'item_price_level']
    join key: [feature, feature2, 'day']
    '''
    for feature in tqdm([
            'user_id',
    ]):

        for feature2 in tqdm([
                'item_id',
                'item_brand_id',
                'shop_id',
                'category2_label',
                'item_price_level',
        ]):

            I_alias = feature + '_' + feature2 + '_day_I'  # total clicks
            C_alias = feature + '_' + feature2 + '_day_C'  # purchases
            feature_path = feature_data_path + feature + '_' + feature2 + '_before_day_CTR.pkl'  # path of the stored feature file

            if not os.path.exists(feature_path):
                gen_features_cross_day_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data,
                                ctr_data,
                                how='left',
                                on=[feature, feature2, 'day'])
            all_data[I_alias] = all_data[I_alias].fillna(0)
            all_data[C_alias] = all_data[C_alias].fillna(0)
    return all_data
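gen_features_cross_day_ctr, which is called above when the pickle is missing, is not shown on this page. Purely as an assumption about what it computes, here is a sketch that accumulates clicks (I) and purchases (C) from the days strictly before each day, assuming all_data carries 'day' and 'is_trade' columns and reusing the column aliases from the loop above; the real generator may differ:

import pandas as pd


def gen_cross_day_ctr_sketch(all_data, feature, feature2):
    # Daily clicks (count) and purchases (sum of is_trade) per value pair.
    daily = (all_data.groupby([feature, feature2, 'day'])['is_trade']
             .agg(['count', 'sum'])
             .rename(columns={'count': 'I_day', 'sum': 'C_day'})
             .reset_index()
             .sort_values('day'))
    I_alias = feature + '_' + feature2 + '_day_I'
    C_alias = feature + '_' + feature2 + '_day_C'
    grp = daily.groupby([feature, feature2])
    # Cumulative totals up to, but not including, the current day.
    daily[I_alias] = grp['I_day'].cumsum() - daily['I_day']
    daily[C_alias] = grp['C_day'].cumsum() - daily['C_day']
    daily[feature + '_' + feature2 + '_day_CTR'] = (
        daily[C_alias] / daily[I_alias].replace(0, float('nan')))
    return daily[[feature, feature2, 'day', I_alias, C_alias,
                  feature + '_' + feature2 + '_day_CTR']]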
Example #13
def save_shap_val(hp_filename,
                  filename,
                  name,
                  SAVE_DIR,
                  train_data,
                  test_data,
                  test_labels,
                  use_gpu=True,
                  background_length=100,
                  padding_length=512):
    hp_d = 'models/{}.pkl'.format(hp_filename)
    hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
    d = utils.load_pickle(hp_path)
    model_d = 'models/{}.pkl'.format(filename)
    model_path = utils.get_abs_path(SAVE_DIR, model_d)
    model = init_model(train_data, d, model_path, use_gpu=use_gpu)
    features_l, importance_l = [], []
    features = 'features/{}_shap_all_features.pkl'.format(name)
    feature_path = utils.get_abs_path(SAVE_DIR, features)
    scores = 'feature_importance/{}_shap_all_scores.pkl'.format(name)
    model_path = utils.get_abs_path(SAVE_DIR, scores)
    features_l, importance_l = get_lstm_shap(
        model,
        train_data,
        test_data,
        background_length=background_length,
        padding_length=padding_length,
        feature_path=feature_path,
        model_path=model_path)
    utils.save_pickle(features_l, feature_path)
    utils.save_pickle(importance_l, model_path)
Example #14
    def __init__(self, system_id):
        super().__init__(system_id)

        self.story_model = os.environ.get(
            "CWC_STORY_MODEL_" + system_id.upper(),
            self.model_folder + "/" + self.BEAM_MODEL_FILE)
        self.story_vocab = os.environ.get(
            "CWC_STORY_VOCAB_" + system_id.upper(),
            self.model_folder + "/" + self.BEAM_VOCAB_FILE)

        torch.manual_seed(self.torch_seed)

        # Load models and vocab dictionaries, init stopping symbols for generation
        self.st_model = load_model(self.story_model, self.use_cuda)
        self.st_dict = load_pickle(self.story_vocab)
        self.st_vocab_size = len(self.st_dict)
        self.st_eot_id = self.st_dict.word2idx[self.title_end]
        self.st_eos_id = self.st_dict.word2idx[self.story_end]
        self.st_sep_id = self.st_dict.word2idx[self.story_sep]
        # self.special_chars = [self.story_end, self.story_sep, self.title_end]
        self.special_chars = SPECIAL_CHARACTERS
        self.nlp = init_nlp_model()

        self.decoder = BeamSearchDecoder(self.st_model,
                                         self.beam_size,
                                         self.st_eos_id,
                                         verbosity=False,
                                         dictionary=self.st_dict)
Example #15
def get_mean_final_score(model_result_paths):
    print('Mean score begin ...')
    test_unass = load_json(TEST_UNASS_PATH)
    # aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    # test_pub = load_json(TEST_PUB_PATH)
    # aid2orgwithyear = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    # title_feature_df = pd.read_pickle(os.path.join(TEST_FEATURE_DIR, 'test-title-distance-df.pkl'))
    # title_feature = title_feature_df.values

    # org_text_process_dict = {
    #     'my_stopwords': set(stopwords.words('english')),
    #     'num_pattern': re.compile(r'\d+'),
    #     'remove_punctuation': str.maketrans(string.punctuation, ' '*len(string.punctuation)),
    #     'lemmatizer': WordNetLemmatizer(),
    # }
    result_dict_list = [load_pickle(path) for path in model_result_paths]
    submission = defaultdict(list)
    # count = 0
    # problem_pids = []
    for pid_with_index in tqdm.tqdm(test_unass):
        candidate_aids = result_dict_list[0][pid_with_index]['candidate-aids']
        inner_data = np.zeros((len(candidate_aids), len(result_dict_list)))
        for num, result_dict in enumerate(result_dict_list):
            data = result_dict[pid_with_index]['result-score']
            inner_data[:, num] = data
        final_output = np.mean(inner_data, axis=1)
        predict_author = candidate_aids[np.argmax(final_output)]
        submission[predict_author].append(pid_with_index.split('-')[0])
    save_json(
        submission,
        os.path.join(
            FINAL_DIR,
            'name-clean-2-mean-result-%d.json' % len(result_dict_list)))
def gen_feature_click_stats(update=True):
    """生成各个分类属性日点击量的统计特征

    file_name: (feature)_click_day_stats.pkl

    example:
        user_id_click_day_mean 该用户平均每天点击多少次
        item_id_click_day_max 该物品单日最高销量

    features:
        'user_id_click_day_mean', 'user_id_click_day_max', 'user_id_click_day_min', 
        'item_id_click_day_mean', 'item_id_click_day_max', 'item_id_click_day_min',
        'item_brand_id_click_day_mean', 'item_brand_id_click_day_max', 'item_brand_id_click_day_min', 
        'shop_id_click_day_mean', 'shop_id_click_day_max', 'shop_id_click_day_min',
        'context_page_id_click_day_mean', 'context_page_id_click_day_max', 'context_page_id_click_day_min',
        'category2_label_click_day_mean', 'category2_label_click_day_max', 'category2_label_click_day_min'
        

    """

    data = load_pickle(raw_data_path + 'all_data.pkl')

    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']

    for feature in tqdm(stats_feature):
        feature_path = feature_data_path + feature + '_click_day_stats.pkl'
        if os.path.exists(feature_path) and update == False:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_stats = gen_feature_click_day_stats(data, feature)
            print(feature_stats.columns)
            dump_pickle(feature_stats, feature_path)
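gen_feature_click_day_stats is likewise not reproduced here. A plausible sketch, offered only as an assumption consistent with the feature names documented above (mean / max / min of the daily click counts per id), is:

def gen_feature_click_day_stats_sketch(data, feature):
    # Daily click counts per value of `feature`, then mean/max/min across days.
    daily = (data.groupby([feature, 'day']).size()
             .reset_index(name=feature + '_click_day'))
    stats = daily.groupby(feature)[feature + '_click_day'].agg(['mean', 'max', 'min'])
    stats.columns = [feature + '_click_day_' + s for s in ('mean', 'max', 'min')]
    return stats.reset_index()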
Example #17
 def __init__(self, bson_filepaths, transform=None, mode='train'):
     assert mode in {'train', 'valid'}
     self.transform = transform
     train_valid_data = utils.load_pickle(config.TRAIN_VALID_DATA_FILENAME)
     self.bson_filepaths = bson_filepaths
     self.dataset_index = train_valid_data[f'{mode}_index']
     self.data = train_valid_data['shuffled_train_data']
Example #18
def add_features_hour_ctr(all_data):
    for feature in tqdm([
            'user_id',
            'item_id',
            'item_brand_id',
            'category2_label',
            'category3_label',
            'context_page_id',
            'shop_id',
            'item_sales_level_bin',
            'item_price_level_bin',
            'item_collected_level_bin',
            'item_pv_level_bin',
            'shop_review_num_level_bin',
            'shop_review_positive_rate_bin',
            'shop_star_level_bin',
            'shop_score_service_bin',
            'shop_score_delivery_bin',
            'shop_score_description_bin',
    ]):
        feature_path = feature_data_path + '_2_5_' + feature + '_hour_CTR.pkl'
        if not os.path.exists(feature_path):
            gen_features_hour_ctr()
        ctr_data = load_pickle(feature_path)
        all_data = pd.merge(all_data,
                            ctr_data,
                            how='left',
                            on=[feature, 'day', 'hour_bin'])

    return all_data
Example #19
def add_features_cross_history_ctr(all_data):
    for feature in tqdm([
            'user_id',
    ]):

        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'category2_label',
                'category3_label', 'shop_id', 'item_sales_level_bin',
                'item_price_level_bin'
        ]):

            I_alias = feature + '_' + feature2 + '_history_I'  # total clicks
            C_alias = feature + '_' + feature2 + '_history_C'  # purchases
            feature_path = feature_data_path + '_2_5_' + feature + '_' + feature2 + '_before_history_CTR.pkl'  # path of the stored feature file

            if not os.path.exists(feature_path):
                gen_features_cross_history_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data,
                                ctr_data,
                                how='left',
                                on=[feature, feature2, 'day'])
            all_data[I_alias] = all_data[I_alias].fillna(0)
            all_data[C_alias] = all_data[C_alias].fillna(0)
    return all_data
Example #20
def main(unused_argv):

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    vocab = utils.load_pickle(FLAGS.vocab)
    caption_model = model.Captioner(vocab)
    caption_model.build_estimator(config=run_config,
                                  model_dir=FLAGS.model_dir,
                                  params=params)

    if FLAGS.mode == 'train':
        coco_data_train = utils.load_coco(FLAGS.data_dir, 'train')
        coco_data_val = utils.load_coco(FLAGS.data_dir, 'val')
        print('Successfully loading data')

        for _ in range(FLAGS.num_epochs // FLAGS.epochs_per_eval):
            caption_model.train(captions=coco_data_train.captions,
                                features=coco_data_train.features,
                                batch_size=FLAGS.batch_size,
                                epochs=FLAGS.epochs_per_eval)
            caption_model.eval(captions=coco_data_val.captions,
                               features=coco_data_val.features,
                               batch_size=FLAGS.batch_size)
    elif FLAGS.mode == 'inference':

        assert FLAGS.predict_image is not None
        caption_model.predict(FLAGS.predict_image)
Example #21
def gen_user_start_installed_cateA():
    """
    计算用户初始安装的各大类app的的数量
    拼接键['userID',]
    """
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(
        lambda x: x // 100 if x > 100 else x)
    user_install = user_install.merge(app_cate, 'left', 'appID')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).sum()
            dump_pickle(user_install_cate, feature_path)
Example #22
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # scan email and define if msg is SPAM or HAM
        # first check if sender occurs in sender Blacklist
        # then count spamicity of the word using the Bayes approach
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [
                list(i) for i in set(map(tuple, spamicity_list))
            ]  # remove duplicates from list
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(
                spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9 or sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
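bs.word_spamicity and bs.bayes_pred belong to the Bayesian class, which is not shown here. A common way to combine per-word spamicities, and only a guess at what bayes_pred might do, is the naive Bayes combination of the individual probabilities:

from functools import reduce


def bayes_pred_sketch(spamicity_list):
    # Hypothetical combiner for [word, spamicity] pairs.
    probs = [p for _, p in spamicity_list]
    prod_spam = reduce(lambda a, b: a * b, probs, 1.0)
    prod_ham = reduce(lambda a, b: a * b, (1.0 - p for p in probs), 1.0)
    return prod_spam / (prod_spam + prod_ham)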
Example #23
def cal_test_additional_chars(
    test_data_path,
    label_additional_chars,
    test_save_path,
):
    test_data_file_names = os.listdir(test_data_path)
    lengths = len(test_data_file_names)
    test_data_additional_chars = set()

    # new_extra_chars = set("/﹒–é/▲‧♥♡∩×『2〉×.è◆……①&")

    extra_chars = set(
        "!#$%&\()*+,-./:;<=>?@[\\]^_`{|}~!#¥%&?《》{}“”,:‘’。()·、;【】/……﹒–")
    for index in range(lengths):
        test_data_dir = os.path.join(test_data_path, str(index) + '.txt')

        with open(test_data_dir, 'r', encoding='utf-8') as f1:
            lines_text = f1.readlines()
            raw_text = ''
            for line_text in lines_text:
                raw_text += line_text
            test_data_additional_chars.update(
                re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', str(raw_text)))

    additional_chars = test_data_additional_chars.difference(
        label_additional_chars)  # drop special characters that appear in the labels
    additional_chars = additional_chars.difference(extra_chars)  # drop some extra punctuation
    # additional_chars = additional_chars.difference(new_extra_chars)  # drop some extra punctuation
    save_pickle(additional_chars, test_save_path)  # save as a pickle
    additional_chars = load_pickle(test_save_path)
    return additional_chars, test_data_additional_chars, label_additional_chars
Example #24
def gen_user_search_time(file_name):
    '''
    Time gap between the current search and, within the same day:
      - the first / last search of the same item
      - the first / last search of the same shop
      - the first / last search of the same brand
      - the first / last search of the same category
    '''
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    
    cols = ['item_id','shop_id', 'item_brand_id','second_cate']
    for col in cols:
        data_filter = data[['user_id', col,'day','context_timestamp']].groupby(['user_id', col,'day'])
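        # max_time / min_time hold the latest and earliest context_timestamp per
        # (user_id, col, day) group; .loc with each row's key tuple broadcasts the
        # group extremes back onto every row so per-row time gaps can be taken below.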
        max_time = data_filter.agg(max)
        min_time = data_filter.agg(min)
        x = data.loc[:, ('user_id', col, 'day')].values
        m = max_time.loc[[tuple(i) for i in x]]
        n = min_time.loc[[tuple(i) for i in x]]
        data_select['sub_maxtime_'+col] = data['context_timestamp'].values - np.squeeze(m.values)
        data_select['sub_mintime_'+col] = data['context_timestamp'].values - np.squeeze(n.values)
        
        data_select['sub_maxtime_'+col] = data_select['sub_maxtime_'+col].apply(lambda x: x.total_seconds())
        data_select['sub_mintime_'+col] = data_select['sub_mintime_'+col].apply(lambda x: x.total_seconds())
    dump_pickle(data_select, feature_data_path +file_name + '_user_search_time')
Example #25
def test(model_props=None,
         model_name=None,
         weights_file='best_weights',
         dataset_name='test',
         save_output=True,
         save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(
            name=model_name,
            load_weights_from=model_name,
            weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name,
                                              model_props,
                                              with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name +
                             '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset,
                   docs,
                   model,
                   model_props,
                   stats,
                   save_output=save_output,
                   save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
Example #26
def add_user_start_installed_cateA(data):
    for cate in tqdm([0, 1, 2, 3, 4, 5]):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate) + '.pkl'
        user_start_installed_cateA = load_pickle(feature_path)
        data = pd.merge(data, user_start_installed_cateA, 'left', 'userID')
    return data
Example #27
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # scan email and define if msg is SPAM or HAM
        # first check if sender occurs in sender Blacklist
        # then count spamicity of the word using the Bayes approach
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9 or sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
def gen_user_feature_click_hour():
    """生成用户对所有分类属性的当前小时点击量
    """

    data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    feature_list = [
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
        'item_price_level_bin',
        'item_sales_level_bin',
        'item_property_topic_k_15',
    ]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)

            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
Example #29
def gen_feature_click_day_hour(update=True):
    '''
    Count clicks per day and per hour for feature in ['user_id', 'item_id', 'item_brand_id',
    'category2_label', 'category3_label', 'context_page_id', 'shop_id', 'item_property_topic_k_15'].

    file name: [feature]_click_day_hour.pkl
    '''

    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    for feature in tqdm([
            'user_id', 'item_id', 'item_brand_id', 'category2_label',
            'category3_label', 'context_page_id', 'shop_id',
            'item_property_topic_k_15'
    ]):
        feature_path = feature_data_path + '_2_7_' + feature + '_click_day_hour.pkl'  # path of the stored feature file
        if os.path.exists(feature_path) and update == False:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_click_day_hour = all_data.groupby(
                [feature, 'day', 'hour']).size().reset_index().rename(
                    columns={0: feature + '_click_hour'})
            dump_pickle(feature_click_day_hour, feature_path)  # save
Example #30
def deal_with_text(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warning('Unexpected (data_list, mode) combination when dealing with text...')
        raise ValueError('unsupported (data_list, mode) combination for text')

    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading with text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
def process_babi_dataset(save, print_dict=False):
    file = open('dialog-bAbI-tasks/dialog-babi-task5-full-dialogs-trn.txt',
                'r')
    text = file.readlines()
    file.close()
    system_acts = load_pickle('system_acts.pickle')

    def show_uttr_dict():
        # debugging helper; named so it does not shadow the `print_dict` argument
        for key in uttr_dict:
            print(key)
            print(uttr_dict[key])
            print()

    uttr_dict = {'<BEGIN>': [set()]}
    for act in system_acts:
        uttr_dict[act] = [set()]

    prev_uttr = '<BEGIN>'
    for uttr in text:
        if uttr == '\n':
            prev_uttr = '<BEGIN>'
        for act in system_acts:
            if prev_uttr == '':
                prev_uttr = act
                continue
            if act in uttr:
                user_uttr = re.sub(r'\d+', '', uttr.split(act)[0]).strip()
                uttr_dict[prev_uttr][0].add(user_uttr)
                prev_uttr = act

    if save:
        save_pickle(uttr_dict, 'simulator_uttrs.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
Example #32
def gen_user_basic_info(file_name='train', test_day=24):
    data_select = pd.DataFrame()

    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data_select['user_id'] = data['user_id']
    data_select['user_gender_id'] = data['user_gender_id']
    data_select['user_age_level'] = data['user_age_level']
    data_select['user_occupation_id'] = data['user_occupation_id']
    data_select['user_star_level'] = data['user_star_level']

    # time-of-day of the search: morning / afternoon / evening / before dawn
    data_select['is_morning'] = (data['hour'].values >=
                                 8) & (data['hour'].values <= 12)
    data_select['is_afternoon'] = (data['hour'].values >
                                   12) & (data['hour'].values <= 17)
    data_select['is_evening'] = (data['hour'].values >
                                 17) & (data['hour'].values <= 23)
    data_select['is_before_dawn'] = (data['hour'].values < 8)

    if file_name == 'train':
        '''
        is_trade is added here for the later sampling step; remember to drop it before training.
        '''
        data_select['is_trade'] = data['is_trade']
    dump_pickle(data_select,
                feature_data_path + file_name + '_user_basic_info')
Example #33
def add_feature_click_stats(data, ):
    """添加分类属性日点击量的统计特征

    join_key: ['feature_id',]

    """

    feature_list = [
        'user_id',
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
    ]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_2_' + feature + '_click_day_mean.pkl'
        if not os.path.exists(feature_path):
            gen_feature_click_stats()

        feature_click_day_stats = load_pickle(feature_path)
        data = pd.merge(data, feature_click_day_stats, 'left', [
            feature,
        ])

    return data
def add_user_feature_click_day(data):
    """添加用户对所有分类属性的当天点击量

    join_key: ['user_id', 'feature_id', 'day']

    """

    feature_list = [
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
        'item_price_level_bin',
        'item_sales_level_bin',
        'item_property_topic_k_15',
    ]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_day.pkl'
        if not os.path.exists(feature_path):
            gen_user_feature_click_day()

        user_feature_click_day = load_pickle(feature_path)
        data = pd.merge(data, user_feature_click_day, 'left',
                        [feature, 'day', 'user_id'])

    return data
 def __init__(self):
     os.makedirs(SPACY_DIR, exist_ok=True)
     self.text_tokens_path = os.path.join(SPACY_DIR, 'text.tokens.json')
     self.token_vector_path = os.path.join(SPACY_DIR, 'token.vector.pkl')
     self.text_tokens = load_json(self.text_tokens_path, {})
     self.token_vector = load_pickle(self.token_vector_path, {})
     self.text_tokens_len = len(self.text_tokens)
     self.token_vector_len = len(self.token_vector)
     self.nlp = spacy.load('en_core_web_lg')
     self.n_calls = 0
Example #36
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = utils.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name + ("/tune" if tune_fraction > 0 else "")
    p = utils.Progbar(target=(2 if reduced else utils.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(utils.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(), key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
Example #37
def merge_switch_ssm_fitness_sims(input_fnames, output_fname):
    """
    Combine all the switch SSM fitness simulations into a single pickle file.
    """
    ### combine all the simulations into one pickle file.
    all_sim_data = OrderedDict()
    for fname in input_fnames:
        sim_name = os.path.basename(fname).split(".data")[0]
        curr_sim_data = utils.load_pickle(fname)
        all_sim_data[sim_name] = curr_sim_data
    extra = {}
    utils.save_as_pickle(output_fname, all_sim_data, extra)
def process(statsfile, k, optfile=None):
    stats = utils.load_pickle(statsfile)
    track_ar = average_rank_per_track(stats)
    clique_ar = average_rank_per_clique(stats)
    ma_p = mean_average_precision(stats)
    #k_p = average_precision(stats, k, ver=True)
    k_p = average_precision_at_k(stats, k)

    # Set up logger
    logger = utils.configure_logger()

    # print results
    logger.info("Number of queries: %d" % len(stats))
    logger.info("Average Rank per Track: %.3f" % track_ar)
    logger.info("Average Rank per Clique: %.3f" % clique_ar)
    logger.info("Mean Average Precision: %.2f %%" % (ma_p * 100))
    logger.info("Precision at %d: %.2f %%" % (k, k_p * 100))
    
    if optfile is not None:
        stats2 = utils.load_pickle(optfile)
        #plot_rank_histograms(stats, stats2, test=False) 
        plot_precision_at_k_histograms(stats, stats2, K=[1,3,5,10], test=False)
    else:
        plot_rank_histogram(stats)
def compute_codes(args):
    """Computes maximum 10,000 x 10 tracks. N is the index in the MSD:
        e.g. 
            if N = 1: tracks computed: from 100,000 to 199,999
            if N = 5: tracks computed: from 500,000 to 599,999
    """

    track_ids = args["track_ids"]
    maindir = args["maindir"]
    d = args["d"]
    N = args["N"]
    clique_ids = args["clique_ids"]
    outdir = args["outdir"]
    origcodesdir = args["origcodesdir"]
    pca_n = args["pca_n"]
    norm = args["norm"]

    MAX = 1e5
    ITER = 1e4

    for it in xrange(10):
        logger.info("Computing %d of 10 iteration" % it)
        start_idx = int(N*MAX + it*ITER)
        end_idx = int(start_idx + ITER)
        codes = []
        strN = str(N)
        if N < 10:
            strN = "0" + str(N)
        out_file = os.path.join(outdir, strN) + str(it) + "-msd-codes.pk"
        if origcodesdir is None:
            origcodes = None
        else:
            origcodes_file = os.path.join(origcodesdir, strN) + str(it) + \
                "-msd-codes.pk"
            origcodes = utils.load_pickle(origcodes_file)[0][0]
            #origcodes = utils.load_pickle(origcodes_file)[0]
        if d == "":
            codes = compute_codes_orig_it(track_ids, maindir, clique_ids,
                start_idx, end_idx)
        else:
            codes = compute_codes_it(track_ids, maindir, d, clique_ids,
                start_idx, end_idx, origcodes=origcodes, norm=norm)
        
        utils.save_pickle(codes, out_file)
Example #40
def test(model_props=None, model_name=None, weights_file='best_weights', dataset_name='test',
         save_output=True, save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(name=model_name,
                                                           load_weights_from=model_name,
                                                           weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name, model_props, with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset, docs, model, model_props, stats,
                   save_output=save_output, save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
Example #41
 def __init__(self):
     self.spams_dict = load_pickle("spams.pickle")
     self.hams_dict = load_pickle("hams.pickle")
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Cover song ID on the training Second Hand Song dataset",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None,0], 
                        help="LDA file and version", metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None, dest="codesfile",
                        help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), 
                        default=("", 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # read clique ids and track ids
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = all_tracks.keys()
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # read LDA file
    lda_file = args.lda[0]
    if lda_file is not None:
        lda_file = utils.load_pickle(lda_file)
        logger.info("LDA file read")

    # read codes file
    codesfile = args.codesfile
    if codesfile is not None:
        codesfile = utils.load_pickle(codesfile)
        logger.info("Codes file read")

    # Compute features if needed
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
            lda_file=lda_file, lda_n=int(args.lda[1]), codes=codesfile,
            pca=args.pca[0], pca_n=int(args.pca[1]))
    else:  
        feats = utils.load_pickle(args.featfile)

    # Apply PCA
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])
    if pcafile != "" and False:
        trainedpca = utils.load_pickle(pcafile)
        assert pcadim > 0
        logger.info('trained pca loaded')
        pcafeats = np.zeros((feats.shape[0], pcadim))
        for i,feat in enumerate(feats):
            pcafeats[i] = trainedpca.apply_newdata(feat, ndims=pcadim)
        feats = pcafeats

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry" # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) + ".pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                    anst.average_rank_per_clique(stats),
                    anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
Example #43
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Loads the dataset

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :param sort_by_len: Sort by the sequence length for the train,
        valid and test set. This allows faster execution as it causes
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.

    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    # path = get_dataset_file(
        # path, "imdb.pkl",
        # "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    # if path.endswith(".gz"):
        # f = gzip.open(path, 'rb')
    # else:
        # f = open(path, 'rb')

    train_set = utils.load_pickle("encode_train_reviews.pickle")
    test_set = utils.load_pickle("encode_test_reviews.pickle")
    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # split training set into validation set
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)

    def remove_unk(x):
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test
def compute_feats(track_ids, maindir, d, lda_file=None, lda_n=0, codes=None, 
        ver=True, pca="", pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist, 
     computes them using Thierry's method.

     The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    Original method by Thierry doesn't include steps 5,6,7,8,11.
     """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN
    
    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids),K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]
    if lda_file is not None:
        if lda_n == 0: n_comp = 50
        elif lda_n == 1: n_comp = 100
        elif lda_n == 2: n_comp = 200
    else:
        n_comp = K 

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0],n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            #orig_feats.append(feats)    # Store orig feats
            if feats is None:
                continue
            
            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats
            #. 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            #H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" % \
                            (cnt/float(len(track_ids)) * 100))

    if d == "":
        d = "orig" # For saving purposes
    
    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes, "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    #utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")

    logger.info("Features Computed")
    return final_feats
Example #45
def write_links(model_path, dataset_name):
    links = utils.load_pickle(model_path + dataset_name + '_links.pkl')
    with open(model_path + dataset_name + "_links", "w") as f:
        for did in links:
            f.write(str(did) + "\t" + " ".join(
                map(lambda (m1, m2): str(m1) + "," + str(m2), links[did])) + "\n")
Example #46
 def __init__(self, columns=None):
     self.columns = columns
     self.mention_inds = DatasetColumn('dmi', columns)
     self.pair_inds = DatasetColumn('dpi', columns)
     self.features = DatasetColumn('df', columns)
     self.genres = utils.load_pickle(directories.MISC + 'genres.pkl')
SAVE = True
LOWE = False

descriptor = "SIFT"
descriptor = "spSIFT"

if TEST:
    prefix = "%s_%s_" % (descriptor, "test")
else:
    prefix = "%s_%s_" % (descriptor, "full")

train_images, train_labels, test_images, test_labels = get_train_test(TEST)

if descriptor == "SIFT":
    if os.path.isfile(prefix + "kmeans.pkl"):
        kmeans = load_pickle(prefix + "kmeans.pkl")
    else:
        pool = Pool(getNumberOfCPUs() - 2)

        if LOWE:
            print " [!] Lowe's SIFT"
            train_sift_with_null = pool.map(get_sift_lowe, train_images)
            test_sift_with_null = pool.map(get_sift_lowe, test_images)
        else:
            print " [!] OpenCV2's SIFT"
            train_sift_with_null = pool.map(get_sift, train_images)
            test_sift_with_null = pool.map(get_sift, test_images)

        pool.close()
        pool.join()
        pool.terminate()
Example #48
def load_docs(dataset_name, word_vectors):
    return (datasets.Dataset(dataset_name, model_properties.MentionRankingProps(), word_vectors),
            zip(utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl'),
                utils.load_pickle(directories.ACTION_SPACE + dataset_name + '_action_space.pkl')))
Example #49
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the 500 binary queries from the SHS data set",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("msd_dir", action="store", help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="", help="Pickle to the learned dictionary")
    parser.add_argument(
        "-lda", action="store", nargs=2, default=[None, 0], help="LDA file and version", metavar=("lda.pkl", "n")
    )
    parser.add_argument(
        "-pca",
        nargs=2,
        metavar=("f.pkl", "n"),
        default=("", 0),
        help="pca model saved in a pickle file, " "use n dimensions",
    )
    # Parse
    args = parser.parse_args()

    # Track time
    start_time = time.time()

    maindir = args.msd_dir
    queriesf = "SHS/list_500queries.txt"
    shsf = "SHS/shs_dataset_train.txt"
    lda = args.lda[0]
    lda_n = int(args.lda[1])
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(queriesf)
    utils.assert_file(shsf)
    utils.assert_file(pcafile)

    # read queries
    queries = read_query_file(queriesf)

    # load pca
    trainedpca = None
    if pcafile != "":
        with open(pcafile, "rb") as f:
            trainedpca = cPickle.load(f)
        assert pcadim > 0
        logger.info("trained pca loaded")

    # load lda
    if lda is not None:
        lda = utils.load_pickle(lda)

    # to keep stats
    results = []

    # iterate over queries
    logger.info("Starting the binary task...")

    # Get the dictionary transform
    td = load_transform(args.dictfile)

    for triplet in queries:
        # get features
        filenames = map(lambda tid: utils.path_from_tid(maindir, tid), triplet)
        triplet_feats = map(lambda f: extract_feats(f, td=td, lda_file=lda, lda_n=lda_n), filenames)
        if None in triplet_feats:
            continue

        # Apply pca if needed
        if trainedpca:
            triplet_feats = map(lambda feat: trainedpca.apply_newdata(feat, ndims=pcadim), triplet_feats)
            assert triplet_feats[np.random.randint(3)].shape[0] == pcadim

        # Compute result
        res1 = triplet_feats[0] - triplet_feats[1]
        res1 = np.sum(res1 * res1)
        res2 = triplet_feats[0] - triplet_feats[2]
        res2 = np.sum(res2 * res2)
        if res1 < res2:
            results.append(1)
        else:
            results.append(0)

        # verbose
        if len(results) % 5 == 0:
            logger.info(" --- after %d queries, accuracy: %.1f %%" % (len(results), 100.0 * np.mean(results)))
    # done
    logger.info("After %d queries, accuracy: %.1f %%" % (len(results), 100.0 * np.mean(results)))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
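
The decision rule in the loop above counts a query as correct when its squared Euclidean distance to the second track of the triplet (assumed to be the true cover) is smaller than to the third. The same rule as a standalone helper, for illustration only:

import numpy as np

def triplet_correct(query, cover, non_cover):
    """Return 1 if the query is closer (squared L2) to the cover than to the non-cover (sketch)."""
    d_cover = np.sum((query - cover) ** 2)
    d_other = np.sum((query - non_cover) ** 2)
    return 1 if d_cover < d_other else 0
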
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Evaluates the average rank and mean AP for the test SHS " \
                "over the entire MSD",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing " \
                        "the codes for 1M tracks")
    parser.add_argument("-lda", action="store", default=None, 
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), 
                        default=(None, 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2, default=[None,0], 
                        dest="codesdir", metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                            "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None, 
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                            "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm", default=False, 
                        help="Normalize before LDA/PCA or not")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    global lda
    global pca

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # read cliques and all tracks
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(codesdir, 
                                                lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        print feats.shape
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare Multiprocessing computation
        input = []
        pool = Pool(processes=args.N)
        for n in xrange(args.N):
            arg = {}
            arg["track_ids"] = track_ids
            arg["maindir"] = maindir
            arg["d"] = args.dictfile
            arg["N"] = n
            arg["clique_ids"] = clique_ids
            arg["outdir"] = args.outdir
            arg["origcodesdir"] = args.origcodesdir
            arg["pca_n"] = int(args.pca[1])
            arg["norm"] = args.norm
            input.append(arg)

        # Start computing the codes
        pool.map(compute_codes, input)

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                    anst.average_rank_per_clique(stats),
                    anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
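
utils.clean_feats is not shown in this snippet; since tracks whose extraction fails never receive a valid feature row, it presumably drops rows containing NaN along with the matching ids. A sketch under that assumption only:

import numpy as np

def clean_feats_sketch(feats, clique_ids, track_ids):
    """Drop feature rows containing NaN and their ids (a guess at what utils.clean_feats does)."""
    keep = ~np.isnan(feats).any(axis=1)
    return feats[keep], np.asarray(clique_ids)[keep], np.asarray(track_ids)[keep]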
Example #51
def main(model_path, dataset_name):
    docs = utils.load_pickle(model_path + dataset_name + '_processed_docs.pkl')

    for doc_data in utils.load_json_lines(directories.RAW + dataset_name):
        sentences = doc_data["sentences"]
        mid_to_mention = {int(m["mention_id"]): m for m in doc_data["mentions"].values()}
        mid_to_position = {mid: int(m["mention_num"]) for mid, m in mid_to_mention.iteritems()}

        doc = docs[doc_data["document_features"]["doc_id"]]
        clusters = [c for c in doc.clusters if len(c) > 1]

        cluster_to_endpoints = {}
        for c in clusters:
            positions = [mid_to_position[mid] for mid in c]
            cluster_to_endpoints[c] = (min(positions), max(positions))
        sorted_clusters = sorted(clusters, key=lambda c: cluster_to_endpoints[c])

        color_last_usage = {i: -1 for i in range(len(COLORS))}
        active_clusters = []
        cluster_to_color = {}
        for c in sorted_clusters:
            start, end = cluster_to_endpoints[c]
            for a in list(active_clusters):
                if cluster_to_endpoints[a][1] < start:
                    active_clusters.remove(a)

            used_colors = [cluster_to_color[a] for a in active_clusters]
            sorted_colors = sorted((u, i) for i, u in color_last_usage.iteritems())
            next_color = None
            for u, i in sorted_colors:
                if i not in used_colors:
                    next_color = i
                    break
            if next_color is None:
                next_color = sorted_colors[0][1]

            color_last_usage[next_color] = start
            cluster_to_color[c] = next_color
            active_clusters.append(c)

        annotations = defaultdict(lambda: defaultdict(list))
        for i, c in enumerate(sorted_clusters):
            color = COLORS[cluster_to_color[c]]
            for m in c:
                mention = mid_to_mention[m]
                start, end = mention["start_index"], mention["end_index"] - 1
                annotations[mention["sent_num"]][start].append(
                    (color + "[" + ENDC, 1 + end))
                annotations[mention["sent_num"]][end].append(
                    (color + "]" + subscript(i) + ENDC, -1 - start))

        for i, s in enumerate(sentences):
            for j, sentence_annotations in annotations[i].iteritems():
                sentence_annotations = sorted(sentence_annotations, key=itemgetter(1))
                for (annotation, priority) in sentence_annotations:
                    if priority > 0:
                        s[j] = annotation + s[j]
                    else:
                        s[j] = s[j] + annotation
            print " ".join(s)

        print
        print 80 * "="
        print
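
The color bookkeeping above is a greedy interval-coloring pass: clusters are taken in order of their first mention, clusters whose last mention has already passed are retired, and the least recently used color not held by an active cluster is reused. The same idea isolated into a small function, as a sketch only:

def assign_colors(endpoints, n_colors):
    """Greedy interval coloring; endpoints is a list of (start, end) spans sorted by start.
    Returns a color index in [0, n_colors) for each span."""
    last_used = {c: -1 for c in range(n_colors)}
    active, colors = [], {}
    for idx, (start, end) in enumerate(endpoints):
        # retire spans that ended before this one starts
        active = [a for a in active if endpoints[a][1] >= start]
        in_use = {colors[a] for a in active}
        # pick the least recently used color no active span holds, else the overall LRU color
        ordered = sorted((u, c) for c, u in last_used.items())
        chosen = next((c for u, c in ordered if c not in in_use), ordered[0][1])
        last_used[chosen] = start
        colors[idx] = chosen
        active.append(idx)
    return [colors[i] for i in range(len(endpoints))]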
Example #52
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    if model_props.ranking or \
            model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset("train_reduced" if reduced else "train",
                                            model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(dev_set_name + "_reduced" if reduced else dev_set_name,
                                          model_props, with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")
        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(), model_weights)]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(model_props.path + "weights_{:}.hdf5".format(
                write_every * (epoch / write_every)), overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print

    timer.clear()
    print(t)
    assert False

print('Tokenization:')
t0 = time.clock()
train_tokens = [tokenize(s, token_vector) for s in train[COMMENT]]
print('train_tokens: %.1f sec, %.2f sec / comment' % (time.clock() - t0, (time.clock() - t0) / len(train_tokens)))
t0 = time.clock()
test_tokens = [tokenize(s, token_vector) for s in test[COMMENT]]
print('test_tokens: %.1f sec, %.2f sec / comment' % (time.clock() - t0, (time.clock() - t0) / len(test_tokens)))

save_pickle('token.vector.pkl', token_vector)
save_json('train.tokens.json', train_tokens)
save_json('test.tokens.json', test_tokens)

token_vector = load_pickle('token.vector.pkl')
train_tokens = load_json('train.tokens.json')
test_tokens = load_json('test.tokens.json')


def compute_ngram_vector(token_list, n):
    """Compute an embedding vector for all n-grams in token_list:
    slot j holds the average vector of the j-th token over every n-gram,
    concatenated into a single n * SPACY_VECTOR_SIZE vector.
    Assumes len(token_list) >= n (otherwise the division by n_vecs fails).
    """
    vec = np.zeros((n, SPACY_VECTOR_SIZE), dtype=np.float64)
    n_vecs = len(token_list) - n + 1
    for i in range(n_vecs):
        for j in range(n):
            vec[j] += token_vector[token_list[i + j]]
    vec /= n_vecs
    return np.reshape(vec, n * SPACY_VECTOR_SIZE)
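
A plausible way to use compute_ngram_vector is to turn every tokenized comment into one fixed-length row. The helper below is illustrative only: it reuses the train_tokens/test_tokens lists from above and skips documents with fewer than n tokens, which the function itself does not guard against:

def compute_ngram_matrix(token_lists, n=2):
    """Stack n-gram vectors for every document with at least n tokens (sketch)."""
    return np.vstack([compute_ngram_vector(tokens, n)
                      for tokens in token_lists if len(tokens) >= n])

# X_train = compute_ngram_matrix(train_tokens, n=2)
# X_test = compute_ngram_matrix(test_tokens, n=2)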