def re_generator(files: Dict[str, tuple], args):
    """Generates files for RE"""
    for filename, data in files.items():
        generate_re_input_files(ehr_records=data[0],
                                ade_records=data[1],
                                filename=args.target_dir + filename + '.' +
                                args.ext,
                                max_len=args.max_seq_len,
                                sep=args.sep,
                                is_test=data[2],
                                is_label=data[3])

    save_pickle(args.target_dir + 'train', {
        "EHR": files['train'][0],
        "ADE": files['train'][1]
    })
    save_pickle(args.target_dir + 'test', {
        "EHR": files['test'][0],
        "ADE": files['test'][1]
    })

    print("\nGenerating files successful. Files generated: ",
          'train.tsv,',
          'dev.tsv,',
          'test.tsv,',
          'test_labels.tsv,',
          'train_rel.pkl,',
          'test_rel.pkl,',
          'test_labels_rel.pkl',
          sep=' ')
def pred_error(f_pred,
               prepare_data,
               data,
               iterator,
               verbose=False,
               is_test_phase=False):
    """
    Just compute the error
    f_pred: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.
    """
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index],
                                  maxlen=None)
        preds = f_pred(x, mask)
        if is_test_phase:
            print(preds)
            utils.save_pickle(("%0.3f" % np.random.rand()) + "pred.pickle",
                              preds)
        targets = np.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - np_floatX(valid_err) / len(data[0])

    return valid_err
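# A minimal numpy-only sketch of the error bookkeeping used in pred_error above:
# correct predictions are accumulated per mini-batch and turned into an error
# rate at the end. The helper below is hypothetical and skips Theano entirely.
import numpy as np


def batch_error(pred_batches, target_batches, n_samples):
    """Return 1 - accuracy over a list of per-batch predictions/targets."""
    correct = 0
    for preds, targets in zip(pred_batches, target_batches):
        correct += (np.asarray(preds) == np.asarray(targets)).sum()
    return 1.0 - float(correct) / n_samples


# two mini-batches over 6 samples, 4 correct -> error = 1 - 4/6 ~= 0.333
print(batch_error([[1, 0, 1], [0, 0, 1]], [[1, 0, 0], [1, 0, 1]], n_samples=6))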
Example #3
def main(nlp, glove_dir):
    """Filter out sentences that are too short to be meaningful or far longer
    than the rest of our data.

    Parameters
    -----------
    nlp: spacy.lang.en.English
        Spacy parser used for tokenization.
    glove_dir: str
        Location to load glove vectors from.
    """
    # Load and split data.
    dtypes = dict(text=object, sex='category', age=np.int8)
    df = pd.read_csv('data/sentences.csv', dtype=dtypes, usecols=dtypes.keys())
    df['sex'] = (df.sex == 'male') * 1
    lengths = df.text.str.split().str.len()
    df = df[(lengths >= 5) & (lengths <= 50)]
    data = train_val_test_split(df.text,
                                df[['sex', 'age']],
                                train_p=.99,
                                val_p=.005,
                                state=1,
                                shuffle=True)
    # Order: x_train, x_val, x_test, y_train, y_val, y_test
    save_pickle(data, 'split_data')

    # w2count, w2idx, i2w, and w2vec will be pickled for easy access.
    build_word_mappings(data[0], nlp, glove_dir)
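# A small pandas sketch of the length filter used in main() above: sentences are
# kept only if they contain between 5 and 50 whitespace tokens (toy data below).
import pandas as pd

toy = pd.DataFrame({'text': ['too short',
                             'this sentence has exactly six tokens',
                             'word ' * 60]})
lengths = toy.text.str.split().str.len()      # token count per row
toy = toy[(lengths >= 5) & (lengths <= 50)]   # only the middle row survives
print(toy)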
Example #4
def cal_test_additional_chars(
    test_data_path,
    label_additional_chars,
    test_save_path,
):
    test_data_file_names = os.listdir(test_data_path)
    lengths = len(test_data_file_names)
    test_data_additional_chars = set()

    # new_extra_chars = set("/﹒–é/▲‧♥♡∩×『2〉×.è◆……①&")

    extra_chars = set(
        "!#$%&\()*+,-./:;<=>?@[\\]^_`{|}~!#¥%&?《》{}“”,:‘’。()·、;【】/……﹒–")
    for index in range(lengths):
        test_data_dir = os.path.join(test_data_path, str(index) + '.txt')

        with open(test_data_dir, 'r', encoding='utf-8') as f1:
            lines_text = f1.readlines()
            raw_text = ''
            for line_text in lines_text:
                raw_text += line_text
            test_data_additional_chars.update(
                re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', str(raw_text)))

    additional_chars = test_data_additional_chars.difference(
        label_additional_chars)  # drop special characters that appear in the labels
    additional_chars = additional_chars.difference(extra_chars)  # drop some extra punctuation marks
    # additional_chars = additional_chars.difference(new_extra_chars)  # drop some extra punctuation marks
    save_pickle(additional_chars, test_save_path)  # save as a pickle
    additional_chars = load_pickle(test_save_path)
    return additional_chars, test_data_additional_chars, label_additional_chars
def save_imdb_tfidf():
    """
     For comparison purposes only.
     apply tfidf to imdb data and save the resulting dataset.
     Not used in this project.
     
    """
    train_pos_files = glob.glob('data/imdb/train/pos/*')
    train_neg_files = glob.glob('data/imdb/train/neg/*')
    test_pos_files = glob.glob('data/imdb/test/pos/*')
    test_neg_files = glob.glob('data/imdb/test/neg/*')

    vocab = get_imdb_vocab()
    tfidf = TfidfVectorizer(input='filename',
                            stop_words='english',
                            max_df=0.5,
                            vocabulary=vocab,
                            sublinear_tf=True)

    total_train = train_pos_files + train_neg_files
    x_train = tfidf.fit_transform(total_train)
    y_train = np.concatenate(
        (np.ones(len(train_pos_files)), np.zeros(len(train_neg_files))))

    total_test = test_pos_files + test_neg_files
    x_test = tfidf.transform(total_test)
    y_test = np.concatenate(
        (np.ones(len(test_pos_files)), np.zeros(len(test_neg_files))))

    train_data = (x_train, y_train)
    test_data = (x_test, y_test)
    data = {'train': train_data, 'test': test_data}

    save_pickle('data/imdb_tfidf.pkl', data)
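# A hedged usage sketch for the pickle written by save_imdb_tfidf(): assuming
# save_pickle(path, obj) simply wraps pickle.dump, the saved dict can feed a
# plain linear classifier. The load_pickle helper below is an assumption, not
# the project's own utility.
import pickle

from sklearn.linear_model import LogisticRegression


def load_pickle(path):
    # assumed counterpart to save_pickle(path, obj) used above
    with open(path, 'rb') as f:
        return pickle.load(f)


data = load_pickle('data/imdb_tfidf.pkl')
x_train, y_train = data['train']
x_test, y_test = data['test']

clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)            # sparse TF-IDF matrices work directly
print('test accuracy:', clf.score(x_test, y_test))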
Example #6
 def _commit(self, thot=None):
     if self._persist and thot:
         save_pickle(thot,
                     '%s%s.obj' % (constants.MEMORY_PATH, thot.t_name))
     if self._persist and not thot:
         save_pickle(self, '%s%s.obj' % (constants.MEMORY_PATH, self._name))
     return
Example #7
def read_data_e2(data_dir):

    main_dir = glob.glob(data_dir+'/*/*')

    print(main_dir)
    for fl in main_dir:
        # print("Participant id is: ",fl.strip().split('/')[-2])
        participant = fl.strip().split("/")[-2]
        exp = fl.strip().split("/")[-3]
        print(fl.split('/')[-1])

        if 'example' in fl.split('/')[-1]:
            ff = spio.loadmat(fl,squeeze_me=True)
            ff_2 = spio.loadmat(fl,squeeze_me=False)
            disc_pr()
            sents = ff['keySentences']
            
            part_topic_id = ff['labelsPassageForEachSentence']
            topic_id = ff['labelsPassageCategory']
            topics = ff['keyPassageCategory']
            part_of_topics = ff['keyPassages']
            vxl = ff['examples']
            mtd = ff_2['meta']
            # repeat each passage-category id 4 times
            topic_id = [x for x in topic_id for _ in range(4)]
            data_dict = {}
            for idx, el in enumerate(part_topic_id):
                data_dict[(sents[idx], part_of_topics[el - 1], topics[topic_id[idx] - 1])] = vxl[idx]

        
            # (Sentence,subtopic(Apple),topic(Fruit)): voxels
            save_pickle(data_dict, '../data_processed/' + exp + '_proc/' + participant + '/' + fl.strip().split("/")[-1])
            save_pickle(mtd, '../data_processed/' + exp + '_proc/' + participant + '/' + fl.strip().split("/")[-1] + '_meta')
Example #8
def save_shap_val(hp_filename,
                  filename,
                  name,
                  SAVE_DIR,
                  train_data,
                  test_data,
                  test_labels,
                  use_gpu=True,
                  background_length=100,
                  padding_length=512):
    hp_d = 'models/{}.pkl'.format(hp_filename)
    hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
    d = utils.load_pickle(hp_path)
    model_d = 'models/{}.pkl'.format(filename)
    model_path = utils.get_abs_path(SAVE_DIR, model_d)
    model = init_model(train_data, d, model_path, use_gpu=use_gpu)
    features_l, importance_l = [], []
    features = 'features/{}_shap_all_features.pkl'.format(name)
    feature_path = utils.get_abs_path(SAVE_DIR, features)
    scores = 'feature_importance/{}_shap_all_scores.pkl'.format(name)
    model_path = utils.get_abs_path(SAVE_DIR, scores)
    features_l, importance_l = get_lstm_shap(
        model,
        train_data,
        test_data,
        background_length=background_length,
        padding_length=padding_length,
        feature_path=feature_path,
        model_path=model_path)
    utils.save_pickle(features_l, feature_path)
    utils.save_pickle(importance_l, model_path)
Example #9
def to_sequence(df,
                attrs=['sku_ID', 'if_order', 'request_time', 'brand_ID'],
                num_clicks=1000000):
    '''
    Convert a sorted click dataframe into one sequence per user, with one list
    per attribute in `attrs` order:
    [[user_ID1, [sku_ID_1, sku_ID_2, ...], [if_order1, ...], [time1, ...], [brand1, ...]],
     [user_ID2, [sku_ID_3, sku_ID_4, ...], [if_order3, ...], [time3, ...], [brand3, ...]]]
    '''
    df = df.query('user_ID != "-"')
    num_clicks = min(num_clicks, len(df))
    same_user_indicator = df['user_ID'].shift(1) == df['user_ID']
    same_user_indicator.iloc[0] = True
    sequences = []
    sequence = [None] + [[] for attr in attrs]
    sequence[0] = df['user_ID'].iloc[0]
    for i in tqdm(range(0, num_clicks)):
        user = df['user_ID'].iloc[i]
        if_same_user = same_user_indicator.iloc[i]
        if not if_same_user:
            sequences.append(sequence)
            sequence = [None] + [[] for attr in attrs]
            sequence[0] = user
        for j, attr in enumerate(attrs):
            attr_value = df[attr].iloc[i]
            sequence[j + 1].append(attr_value)
    sequences.append(sequence)  # append the last user's sequence before saving
    utils.save_pickle(sequences, 'click_sequence.pk')
    return sequences
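# A toy illustration of the sequence format produced by to_sequence() above.
# Column names follow the snippet; the values are made up.
import pandas as pd

clicks = pd.DataFrame({
    'user_ID': ['u1', 'u1', 'u2'],
    'sku_ID': ['s1', 's2', 's3'],
    'if_order': [False, True, False],
    'request_time': ['t1', 't2', 't3'],
    'brand_ID': ['b1', 'b1', 'b2'],
})
# Expected grouping, one list per attribute in attrs order:
# ['u1', ['s1', 's2'], [False, True], ['t1', 't2'], ['b1', 'b1']]
# ['u2', ['s3'], [False], ['t3'], ['b2']]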
Example #10
def deal_with_postag(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warning('Unexpected data format when dealing with postag...')

    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                postag.append([[
                    Converter('zh-hans').convert(word['word'].strip().replace(
                        ' ', '')), word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading with postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
Example #11
def deal_with_text(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warning('Unexpected data format when dealing with text...')

    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading with text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
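# Both deal_with_postag() and deal_with_text() follow the same cache-or-compute
# pattern. A stripped-down sketch of that pattern, using the standard pickle
# module in place of the project's save_pickle/load_pickle helpers:
import os
import pickle


def cached(cache_path, compute_fn):
    """Load a pickled result if the cache file exists, otherwise compute and cache it."""
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:   # cache hit: reuse the previous run
            return pickle.load(f)
    result = compute_fn()                   # cache miss: do the work once
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    return result


# e.g. postag = cached(cache_postag, lambda: build_postag(data_list))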
Example #12
def read_data_e1(data_dir):

    main_dir = glob.glob(data_dir+'/*/*')

    for fl in main_dir:
        # print("Participant id is: ",fl.strip().split('/')[-2])
        ff = spio.loadmat(fl,squeeze_me=True)
        ff_nv2 = spio.loadmat(fl,squeeze_me=False)
        assert check_list(ff['labelsConcept']), "False ordered data"
        mtd = ff_nv2['meta']
        # print(mtd.dtype)
        participant = fl.strip().split("/")[-2]
        exp = fl.strip().split("/")[-3]
        print(fl.split('/')[-1])
        if 'data' in fl.split('/')[-1]:
            ff['labelsPOS']=[ff['keyPOS'][x-1] for x in ff['labelsPOS']]
            pos = ff['labelsPOS']
            wds = ff['keyConcept']
            vxl = ff['examples']
            cnc = ff['labelsConcreteness']
            mtd = ff['meta']
            data_dict = {}

            for el in ff['labelsConcept']:
                idx = el - 1
                data_dict[(wds[idx], pos[idx], cnc[idx])] = vxl[idx]
                # print((wds[idx], pos[idx], cnc[idx]))
            save_pickle(data_dict, '../data_processed/' + exp + '_proc/' + participant + '/' + fl.strip().split("/")[-1])
            save_pickle(mtd, '../data_processed/' + exp + '_proc/' + participant + '/' + fl.strip().split("/")[-1] + '_meta')
def transform(zip_file, save_dir=None):
    """Refactor file directories, rename images and partition the train/val/test 
  set.
  """

    train_test_split_file = osp.join(save_dir, 'train_test_split.pkl')
    train_test_split = save_images(zip_file, save_dir, train_test_split_file)
    # train_test_split = load_pickle(train_test_split_file)

    # partition train/val/test set

    trainval_ids = list(
        set([
            parse_new_im_name(n, 'id')
            for n in train_test_split['trainval_im_names']
        ]))
    # Sort ids, so that id-to-label mapping remains the same when running
    # the code on different machines.
    trainval_ids.sort()
    trainval_ids2labels = dict(zip(trainval_ids, range(len(trainval_ids))))
    partitions = partition_train_val_set(train_test_split['trainval_im_names'],
                                         parse_new_im_name,
                                         num_val_ids=100)
    train_im_names = partitions['train_im_names']
    train_ids = list(
        set([parse_new_im_name(n, 'id')
             for n in partitions['train_im_names']]))
    # Sort ids, so that id-to-label mapping remains the same when running
    # the code on different machines.
    train_ids.sort()
    train_ids2labels = dict(zip(train_ids, range(len(train_ids))))

    # A mark is used to denote whether the image is from
    #   query (mark == 0), or
    #   gallery (mark == 1), or
    #   multi query (mark == 2) set

    val_marks = [0, ] * len(partitions['val_query_im_names']) \
                + [1, ] * len(partitions['val_gallery_im_names'])
    val_im_names = list(partitions['val_query_im_names']) \
                   + list(partitions['val_gallery_im_names'])

    test_im_names = list(train_test_split['q_im_names']) \
                    + list(train_test_split['gallery_im_names'])
    test_marks = [0, ] * len(train_test_split['q_im_names']) \
                 + [1, ] * len(train_test_split['gallery_im_names'])

    partitions = {
        'trainval_im_names': train_test_split['trainval_im_names'],
        'trainval_ids2labels': trainval_ids2labels,
        'train_im_names': train_im_names,
        'train_ids2labels': train_ids2labels,
        'val_im_names': val_im_names,
        'val_marks': val_marks,
        'test_im_names': test_im_names,
        'test_marks': test_marks
    }
    partition_file = osp.join(save_dir, 'partitions.pkl')
    save_pickle(partitions, partition_file)
    print('Partition file saved to {}'.format(partition_file))
def save_lime_coef(filename,
                   model_name,
                   SAVE_DIR,
                   train_dev_tokens,
                   test_tokens,
                   d_file=None):
    model = 'models/{}.pkl'.format(filename)
    path = utils.get_abs_path(SAVE_DIR, model)
    if 'svm' in model_name:
        model = utils.load_pickle(path, encoding=False)
    else:
        if model_name == 'lstm_att':
            hp_d = 'models/{}.pkl'.format(d_file)
            hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
            d = utils.load_pickle(hp_path)
            model = init_model(train_dev_tokens, d, path)
        else:
            model = utils.load_pickle(path)
    features_l, importance_l = get_lime(model, test_tokens, model_name)
    features = 'features/{}_lime_all_features.pkl'.format(model_name)
    path = utils.get_abs_path(SAVE_DIR, features)
    utils.save_pickle(features_l, path)
    scores = 'feature_importance/{}_lime_all_scores.pkl'.format(model_name)
    path = utils.get_abs_path(SAVE_DIR, scores)
    utils.save_pickle(importance_l, path)
Example #15
def train(model_info):
    os.makedirs(STACK_MODEL_DIR_v2, exist_ok=True)
    df = pd.read_pickle(
        os.path.join(NEW_DATA_V3_DIR,
                     'data-last1year-withsetinfo-extend1.pkl'))
    data_title_distance = pd.read_pickle(
        os.path.join(NEW_DATA_V3_DIR, 'data-title-distance-df.pkl'))
    all_data = np.concatenate(
        (df.values, data_title_distance.values.reshape(-1, 1)), axis=1)
    df = pd.DataFrame(data=all_data,
                      columns=list(df.columns) +
                      list(data_title_distance.columns))
    print(df.head())
    df = shuffle(df, random_state=RANDOM_SEED)
    print(df.head())

    train_data = df[model_info['cols']].values
    train_y = df['label'].values
    print(train_data.shape)

    ss = StandardScaler()
    train_data = ss.fit_transform(train_data)
    save_pickle(ss, model_info['ss_path'])

    models = model_info['model']
    params = model_info['model_param']
    sm = StackModel(models, params)
    sm.fit(train_data, train_y)
    save_pickle(sm, model_info['model_path'])
    return sm
Example #16
def train(
    name,
    model_init_fn,
    train_data,
    dev_data,
    test_data,
    prep_fn=prepare_minibatch,
):
    orig_name = name

    print("----------------------")
    print(f"TRAINING: {name}")
    print("----------------------")
    total_results = train_model(
        name,
        model_init_fn,
        optimizer_fn,
        num_iterations=NUM_ITERATIONS,
        patience=None,
        eval_every=EVAL_EVERY,
        prep_fn=prep_fn,
        eval_fn=evaluate,
        batch_fn=get_minibatch,
        batch_size=BATCH_SIZE,
        eval_batch_size=BATCH_SIZE,
        train_data=train_data,
        dev_data=dev_data,
        test_data=test_data,
    )
    utils.save_pickle(f"{name}_results.pkl", total_results)
Example #17
def build_word_mappings(x_train, nlp, glove_dir):
    """Generate word to count, word to index, and word to vector mappings."""
    # Map each token to the # of times it appears in the corpus.
    tokens = [
        item for t in nlp(' '.join(x_train.values),
                          disable=['parser', 'tagger', 'ner'])
        for item in [t.text.strip()] if item
    ]
    w2count = dict(filter(lambda x: x[1] > 4, Counter(tokens).items()))
    save_pickle(tokens, 'tokens')
    save_pickle(w2count, 'w2count')

    # Construct w2idx dict and i2w list.
    w2idx = {
        k: i
        for i, (k, v) in enumerate(
            sorted(w2count.items(), key=lambda x: x[1], reverse=True), 2)
    }
    w2idx['<PAD>'] = 0
    w2idx['<UNK>'] = 1
    i2w = [k for k, v in sorted(w2idx.items(), key=lambda x: x[1])]
    save_pickle(w2idx, 'w2idx')
    save_pickle(i2w, 'i2w')

    # Load word vectors and filter to include words in our vocab.
    w2vec = load_glove(300, glove_dir)
    w2vec = {k: v for k, v in w2vec.items() if k in w2idx}
    save_pickle(w2vec, 'w2vec')
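# A small sketch of the w2count -> w2idx -> i2w construction in
# build_word_mappings() above, on a toy vocabulary (ids 0 and 1 are reserved
# for padding and unknown words, as in the snippet).
from collections import Counter

toy_tokens = ['the', 'the', 'the', 'cat', 'cat', 'sat']
w2count = dict(Counter(toy_tokens))          # toy corpus, no min-count filter
w2idx = {k: i for i, (k, v) in enumerate(
    sorted(w2count.items(), key=lambda x: x[1], reverse=True), 2)}
w2idx['<PAD>'] = 0
w2idx['<UNK>'] = 1
i2w = [k for k, v in sorted(w2idx.items(), key=lambda x: x[1])]
print(w2idx)  # {'the': 2, 'cat': 3, 'sat': 4, '<PAD>': 0, '<UNK>': 1}
print(i2w)    # ['<PAD>', '<UNK>', 'the', 'cat', 'sat']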
Example #18
def get_arguments():
    args = build_parser()
    # set random seed for reproducible experiments
    # reference: https://github.com/pytorch/pytorch/issues/7068
    random.seed(args.random_seed)
    numpy.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # these flags can affect performance, select carefully
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    os.makedirs(args.save_path, exist_ok=True)
    if args.train_flag:
        os.makedirs(os.path.join(args.save_path, 'training_log'),
                    exist_ok=True)
    else:
        loaded_args = load_pickle(
            os.path.join(os.path.dirname(args.model_load), 'argument.pickle'))
        args = update_arguments_for_eval(args, loaded_args)

    # cuda setting
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = ', '.join(map(str, args.gpu_no))

    with open(os.path.join(args.save_path, 'argument.txt'), 'w') as f:
        for key, value in sorted(vars(args).items()):
            f.write('%s: %s' % (key, value) + '\n')

    save_pickle(os.path.join(args.save_path, 'argument.pickle'), args)
    return args
Example #19
def ner_generator(files: Dict[str, tuple], args) -> None:
    """Generates files for NER"""
    # Generate train, dev, test files
    for filename, data in files.items():
        generate_input_files(ehr_records=data[0],
                             ade_records=data[1],
                             filename=args.target_dir + filename + '.' +
                             args.ext,
                             max_len=args.max_seq_len,
                             sep=args.sep)
        save_pickle(args.target_dir + filename, {
            "EHR": data[0],
            "ADE": data[1]
        })

    # Generate labels file
    with open(args.target_dir + 'labels.txt', 'w') as file:
        output_labels = map(lambda x: x + '\n', labels)
        file.writelines(output_labels)

    filenames = [
        name for pair in map(lambda x: [x + '.' + args.ext, x + '.pkl'],
                             list(files.keys())) for name in pair
    ]

    print("\nGenerating files successful. Files generated: ",
          ', '.join(filenames),
          ', labels.txt',
          sep='')
Example #20
def create_w2ts(w2hs, path):
    '''
    Create a dictionary that maps a whale id to its training samples.
    '''
    w2ts_path = path['root'] + path['w2ts']
    train_ps_path = path['root'] + path['train_ps']
    if isfile(w2ts_path):
        print(w2ts_path, 'exists! Load it!')
        w2ts = load_pickle(w2ts_path)
        train = load_pickle(train_ps_path)
    else:
        train = []  # A list of training image ids
        for hs in w2hs.values():
            if len(hs) > 1:
                train += hs
        random.shuffle(train)
        train_set = set(train)

        w2ts = {}  # Associate the image ids from train to each whale id.
        for w, hs in w2hs.items():
            for h in hs:
                if h in train_set:
                    if w not in w2ts:
                        w2ts[w] = []
                    if h not in w2ts[w]:
                        w2ts[w].append(h)
        for w, ts in w2ts.items():
            w2ts[w] = np.array(ts)
        save_pickle(w2ts, w2ts_path)
        save_pickle(train, train_ps_path)
    return w2ts, train
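# A toy illustration of the mapping built by create_w2ts() above: only whales
# with more than one image contribute training samples, and w2ts keeps the
# training image ids per whale id (identifiers below are made up).
import random

import numpy as np

w2hs_toy = {'whale_a': ['h1', 'h2'], 'whale_b': ['h3'], 'whale_c': ['h4', 'h5', 'h6']}

train_toy = [h for hs in w2hs_toy.values() if len(hs) > 1 for h in hs]  # drop singletons
random.shuffle(train_toy)
train_set = set(train_toy)

w2ts_toy = {}
for w, hs in w2hs_toy.items():
    for h in hs:
        if h in train_set:
            w2ts_toy.setdefault(w, []).append(h)
w2ts_toy = {w: np.array(ts) for w, ts in w2ts_toy.items()}
print(sorted(w2ts_toy))  # ['whale_a', 'whale_c']  (whale_b has a single image)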
Example #21
def merge_loop(double_set, list_name, file=None):
    """
    进行团合并操作,循环直到不能合并
    :param double_set:
    :return:团成员最大数,最终的团
    """
    bestSet = set()
    oldSet = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(oldSet) > 0:
        print('members:', len(list(oldSet)[0]))
        print('count:', len(oldSet))
        print(oldSet)
        num_list.append(len(list(oldSet)[0]))
        count_list.append(len(oldSet))
        group_list.append(oldSet)
        bestSet = oldSet
        oldSet = merge_group(oldSet, double_set)
    if file is not None:
        group_list = utils.num_2_word(list_name, group_list)
        utils.write_csv(['members', 'count', 'clique'], file, num_list,
                        count_list, group_list)
        utils.save_pickle(file + '.pkl', group_list)
    return len(list(bestSet)[0]), bestSet
Example #22
def create_training_labels(type_,
                           path=os.path.join(path_train, "semcor+omsti.json")):
    ###############################################################################
    # This function creates a txt file with sentences with a specific label
    # and a vocabulary with all the seen labels.
    #
    # Input:
    #   type_: index used to choose the type of label
    #   path: path of the json file
    #
    # Output:
    #   None
    ###############################################################################

    # create a list of sentences with the considered labels for the whole training set
    dictionary = create_labels_words()
    data = utils.load_json(path)
    data = list(
        map(partial(sentence_from_dictionaries, training_sentence=False),
            data))

    sentences = []
    labels = set()

    for sentence in data:

        single_sentence = []

        for word in sentence.split():

            # insert the current word
            if type(dictionary.get(word, word)) != list:
                single_sentence.append(word)

            # insert the corresponding label for the current word
            else:
                single_sentence.append(str(dictionary.get(word, word)[type_]))
                labels.add(str(dictionary.get(word, word)[type_]))

        sentences.append(single_sentence)

    # create the vocabulary of seen labels, adding the ids for the padding, the unseen labels and the unlabelled words
    vocabulary = {
        value: key
        for key, value in dict(enumerate(labels, 3)).items()
    }
    vocabulary["<PAD>"] = "0"
    vocabulary["<UNSEEN>"] = "1"
    vocabulary["<WORD>"] = "2"

    # exchange strings with ids
    sentences = list(
        map(
            lambda sentence: ' '.join(
                str(vocabulary.get(word, word)) for word in sentence),
            sentences))

    utils.save_txt(sentences, path_train + name_training_file[type_][0])
    utils.save_pickle(vocabulary,
                      "../resources/" + name_training_file[type_][1])
def process_babi_dataset(save, print_dict=False):
    file = open('dialog-bAbI-tasks/dialog-babi-task5-full-dialogs-trn.txt',
                'r')
    text = file.readlines()
    file.close()
    system_acts = load_pickle('system_acts.pickle')

    def _print_uttr_dict():  # renamed so it does not shadow the print_dict flag
        for key in uttr_dict:
            print(key)
            print(uttr_dict[key])
            print()

    uttr_dict = {'<BEGIN>': [set()]}
    for act in system_acts:
        uttr_dict[act] = [set()]

    prev_uttr = '<BEGIN>'
    for uttr in text:
        if uttr == '\n':
            prev_uttr = '<BEGIN>'
        for act in system_acts:
            if prev_uttr == '':
                prev_uttr = act
                continue
            if act in uttr:
                user_uttr = re.sub(r'\d+', '', uttr.split(act)[0]).strip()
                uttr_dict[prev_uttr][0].add(user_uttr)
                prev_uttr = act

    if save:
        save_pickle(uttr_dict, 'simulator_uttrs.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
Example #24
 def get_entity_word_dict(self):
     data_path = r'./similarity/entity_word_dict.pkl'
     if os.path.exists(data_path):
         word_vector = load_pickle(data_path)
     else:
         word_vector = self.get_dict_key_num(self.data_dict)
         save_pickle(data_path, word_vector)
     return word_vector
Example #25
 def get_all_sentence_vector(self):
     data_path = r'./similarity/all_sentence_vector.pkl'
     if os.path.exists(data_path):
         all_sentence_vector = load_pickle(data_path)
     else:
         all_sentence_vector = self.get_data_dict_vector(self.data_dict)
         save_pickle(data_path, all_sentence_vector)
     return all_sentence_vector
Example #26
def main(_):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, downstream_loader = setup()
    model.to(device).eval()
    goal_emb, distance_scale = embed(model, downstream_loader, device)
    utils.save_pickle(FLAGS.experiment_path, goal_emb, "goal_emb.pkl")
    utils.save_pickle(FLAGS.experiment_path, distance_scale,
                      "distance_scale.pkl")
Example #27
def save_att_weights(word_score_ds, save_dir):
    features_l, importance_l = get_att_weights(word_score_ds)
    features_file_name = 'features/lstm_att_weights_all_features.pkl'
    path = utils.get_abs_path(save_dir, features_file_name)
    utils.save_pickle(features_l, path)
    scores_file_name = 'feature_importance/lstm_att_weights_all_scores.pkl'
    path = utils.get_abs_path(save_dir, scores_file_name)
    utils.save_pickle(importance_l, path)
def process_example_phrases(save, print_dict=False):
    from openpyxl import load_workbook

    def cell(row, col):
        return sh[ALPH[col - 1] + str(row)].value

    uttr_dict = {}
    uttr_dict['<SILENT>'] = [set()]
    uttr_dict['any preference on a type of cuisine'] = [set(), set()]
    uttr_dict['api_call'] = [set()]
    uttr_dict['great let me do the reservation'] = [set()]
    uttr_dict['hello what can i help you with today'] = [set()]
    uttr_dict['here it is '] = [set(), set()]
    uttr_dict['how many people would be in your party'] = [set()]
    uttr_dict["i'm on it"] = [set()]
    uttr_dict['is there anything i can help you with'] = [set()]
    uttr_dict['ok let me look into some options for you'] = [set()]
    uttr_dict['sure is there anything else to update'] = [set()]
    uttr_dict['sure let me find an other option for you'] = [set()]
    uttr_dict['what do you think of this option: '] = [set()]
    uttr_dict['where should it be'] = [set()]
    uttr_dict['which price range are looking for'] = [set(), set()]
    uttr_dict["you're welcome"] = [set()]
    uttr_dict['<BEGIN>'] = [set()]

    wb = load_workbook(filename='user_simulator_phrases.xlsx')
    sh = wb['Phrases']

    col = 0
    for phrase, phrase_sets in uttr_dict.items():
        col += 1
        row = 2
        while True:
            phrase = cell(row, col)
            if phrase is None:
                break
            else:
                phrase_sets[0].add(phrase)
                row += 1

        # only necessary when context_vector can be != [1, 1, 1, 1]
        if len(phrase_sets) > 1:
            row = 2
            col += 1
            while True:
                if cell(row, col) is None:
                    break
                else:
                    phrase_sets[1].add(cell(row, col))
                    row += 1
    if save:
        save_pickle(uttr_dict, 'example_phrases_dict.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
Example #29
def dset2dict(dset, name):
    dset_dict = {}
    for idx, question in enumerate(dset):
        dset_dict[question['qid']] = question
    save_pickle(
        dset_dict,
        os.path.expanduser("~/kable_management/data/tvqa/" + name +
                           "_dict.pickle"))
    print(name, "Done")
def save_xgb_impt(file, name, SAVE_DIR):
    model = 'models/{}.pkl'.format(file)
    path = utils.get_abs_path(SAVE_DIR, model)
    print('model path: {}'.format(path))
    pipeline = utils.load_pickle(path)
    xgb_impt_d = get_xgb_impt_d(pipeline)
    features = 'features/{}_impt_all_features.pkl'.format(name)
    path = utils.get_abs_path(SAVE_DIR, features)
    utils.save_pickle(xgb_impt_d, path)
Example #31
 def from_file(cls, faces, data_file_name, n_eigs):
     pickle = get_pickle(data_file_name)
     if pickle is not None:
         logging.info('using previously calculated facespace')
         return cls(faces, n_eigs=n_eigs, face_space=pickle)
     else:
         logging.info('No previous facespace was found')
         eig_face = cls(faces, n_eigs=n_eigs)
         save_pickle(eig_face.entire_face_space, data_file_name)
         return eig_face
 def _save(self, min_delta=0):
     if self.text_tokens_len + min_delta < len(self.text_tokens):
         print('_save 1: %7d = %7d + %4d %s' % (len(self.text_tokens),
             self.text_tokens_len, len(self.text_tokens) - self.text_tokens_len,
             self.text_tokens_path))
         save_json(self.text_tokens_path, self.text_tokens)
         self.text_tokens_len = len(self.text_tokens)
     if self.token_vector_len + 2 * min_delta < len(self.token_vector):
         print('_save 2: %7d = %7d + %4d %s' % (len(self.token_vector),
             self.token_vector_len, len(self.token_vector) - self.token_vector_len,
             self.token_vector_path))
         save_pickle(self.token_vector_path, self.token_vector)
         self.token_vector_len = len(self.token_vector)
def compute_codes(args):
    """Computes maximum 10,000 x 10 tracks. N is the index in the MSD:
        e.g. 
            if N = 1: tracks computed: from 100,000 to 199,999
            if N = 5: tracks computed: from 500,000 to 599,999
    """

    track_ids = args["track_ids"]
    maindir = args["maindir"]
    d = args["d"]
    N = args["N"]
    clique_ids = args["clique_ids"]
    outdir = args["outdir"]
    origcodesdir = args["origcodesdir"]
    pca_n = args["pca_n"]
    norm = args["norm"]

    MAX = 1e5
    ITER = 1e4

    for it in range(10):
        logger.info("Computing iteration %d of 10" % it)
        start_idx = int(N*MAX + it*ITER)
        end_idx = int(start_idx + ITER)
        codes = []
        strN = str(N)
        if N < 10:
            strN = "0" + str(N)
        out_file = os.path.join(outdir, strN) + str(it) + "-msd-codes.pk"
        if origcodesdir is None:
            origcodes = None
        else:
            origcodes_file = os.path.join(origcodesdir, strN) + str(it) + \
                "-msd-codes.pk"
            origcodes = utils.load_pickle(origcodes_file)[0][0]
            #origcodes = utils.load_pickle(origcodes_file)[0]
        if d == "":
            codes = compute_codes_orig_it(track_ids, maindir, clique_ids,
                start_idx, end_idx)
        else:
            codes = compute_codes_it(track_ids, maindir, d, clique_ids,
                start_idx, end_idx, origcodes=origcodes, norm=norm)
        
        utils.save_pickle(codes, out_file)
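# The slicing arithmetic described in the docstring of compute_codes() can be
# checked with a short loop: with MAX = 1e5 and ITER = 1e4, worker N covers
# tracks N*100,000 through N*100,000 + 99,999 in ten slices of 10,000.
MAX, ITER = 1e5, 1e4
N = 1
for it in range(10):
    start_idx = int(N * MAX + it * ITER)
    end_idx = int(start_idx + ITER)
    print(start_idx, end_idx)  # 100000 110000, 110000 120000, ..., 190000 200000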
def pred_error(f_pred, prepare_data, data, iterator, verbose=False, is_test_phase=False):
    """
    Just compute the error
    f_pred: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.
    """
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index],
                                  maxlen=None)
        preds = f_pred(x, mask)
        if is_test_phase:
            print(preds)
            utils.save_pickle(("%0.3f" % np.random.rand())+"pred.pickle", preds)
        targets = np.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - np_floatX(valid_err) / len(data[0])

    return valid_err
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Evaluates the average rank and mean AP for the test SHS " \
                "over the entire MSD",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing " \
                        "the codes for 1M tracks,")
    parser.add_argument("-lda", action="store", default=None, 
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), 
                        default=(None, 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2, default=[None,0], 
                        dest="codesdir", metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                            "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None, 
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                            "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm", default=False, 
                        help="Normalize before LDA/PCA or not")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    global lda
    global pca

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # read cliques and all tracks
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(codesdir, 
                                                lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        print(feats.shape)
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare Multiprocessing computation
        input = []
        pool = Pool(processes=args.N)
        for n in range(args.N):
            arg = {}
            arg["track_ids"] = track_ids
            arg["maindir"] = maindir
            arg["d"] = args.dictfile
            arg["N"] = n
            arg["clique_ids"] = clique_ids
            arg["outdir"] = args.outdir
            arg["origcodesdir"] = args.origcodesdir
            arg["pca_n"] = int(args.pca[1])
            arg["norm"] = args.norm
            input.append(arg)

        # Start computing the codes
        pool.map(compute_codes, input)

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                    anst.average_rank_per_clique(stats),
                    anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Cover song ID on the training Second Hand Song dataset",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None,0], 
                        help="LDA file and version", metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None, dest="codesfile",
                        help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), 
                        default=("", 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # read clique ids and track ids
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = list(all_tracks.keys())
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # read LDA file
    lda_file = args.lda[0]
    if lda_file is not None:
        lda_file = utils.load_pickle(lda_file)
        logger.info("LDA file read")

    # read codes file
    codesfile = args.codesfile
    if codesfile is not None:
        codesfile = utils.load_pickle(codesfile)
        logger.info("Codes file read")

    # Compute features if needed
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
            lda_file=lda_file, lda_n=int(args.lda[1]), codes=codesfile,
            pca=args.pca[0], pca_n=int(args.pca[1]))
    else:  
        feats = utils.load_pickle(args.featfile)

    # Apply PCA
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])
    if pcafile != "" and False:
        trainedpca = utils.load_pickle(pcafile)
        assert pcadim > 0
        logger.info('trained pca loaded')
        pcafeats = np.zeros((feats.shape[0], pcadim))
        for i,feat in enumerate(feats):
            pcafeats[i] = trainedpca.apply_newdata(feat, ndims=pcadim)
        feats = pcafeats

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry" # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) + ".pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                    anst.average_rank_per_clique(stats),
                    anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def compute_feats(track_ids, maindir, d, lda_file=None, lda_n=0, codes=None, 
        ver=True, pca="", pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist, 
     computes them using Thierry's method.

     The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    Original method by Thierry doesn't include steps 5,6,7,8,11.
     """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN
    
    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids),K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]
    if lda_file is not None:
        if lda_n == 0: n_comp = 50
        elif lda_n == 1: n_comp = 100
        elif lda_n == 2: n_comp = 200
    else:
        n_comp = K 

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0],n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            #orig_feats.append(feats)    # Store orig feats
            if feats is None:
                continue
            
            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats
            # 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            #H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" % \
                            (cnt/float(len(track_ids)) * 100))

    if d == "":
        d = "orig" # For saving purposes
    
    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes, "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    #utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")

    logger.info("Features Computed")
    return final_feats
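# Steps 9 and 11 of the pipeline in compute_feats() (median aggregation and the
# final norm) sketched with plain numpy. dan_tools.chromnorm is assumed here to
# behave like an L2 normalization; that is an assumption about the helper, not
# its documented contract.
import numpy as np

H_patch = np.random.rand(75, 12)                    # toy shingle: 75 frames x 12 chroma bins
H_agg = np.median(H_patch, axis=0)                  # 9.- median aggregation over frames
H_norm = H_agg / np.linalg.norm(H_agg)              # 11.- L2-norm (chromnorm assumed L2-like)
print(H_norm.shape, float(np.linalg.norm(H_norm)))  # (12,) 1.0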
Example #38
    for key, value in wavs.items():
        if key not in test_wavs:
            test_wavs[key] = []
        test_wavs[key].extend(value)
test_nums = [len(test_wavs[each]) for each in test_wavs.keys()]
print("test total words:{}".format(np.asarray(test_nums).sum()))


print("merging")
joint = dict()
for key in train_wavs.keys():
    for word in train_wavs[key]:
        if key not in joint:
            joint[key] = []
        joint[key].append(word)

for key in test_wavs.keys():
    for word in test_wavs[key]:
        if key not in joint:
            joint[key] = []
        joint[key].append(word)
total_nums = 0
for key in joint.keys():
    total_nums = total_nums + len(joint[key])
    for word in joint[key]:
        if len(word) == 0:
            print(key, word)
print("merge total words:{}".format(total_nums))

utils.save_pickle('merge.pkl', joint)
        reduced_train_sift = concatenate(train_sift, axis=0)

        test_sift = removing_null(test_sift_with_null, test_labels)
        reduced_test_sift = concatenate(test_sift, axis=0)

        all_sift = concatenate((reduced_train_sift, reduced_test_sift), axis=0)
        nfeatures = all_sift.shape[0]
        k = 1000

        kmeans = MiniBatchKMeans(
            n_clusters=k, init="k-means++", n_init=10, max_iter=100, init_size=1000, batch_size=1000
        )

        kmeans.fit(all_sift)
        if SAVE:
            save_pickle(prefix + "kmeans.pkl", kmeans)

    # train_predicted = kmeans.predict(reduced_train_sift)
    # test_predicted = kmeans.predict(reduced_test_sift)

    # train_hist_features = get_histogram(k, train_sift, train_predicted)
    # test_hist_features = get_histogram(k, test_sift, test_predicted)
elif descriptor == "spSIFT":
    k = 1000

    if os.path.isfile(prefix + "train_sift.pkl"):
        kmeans = load_pickle(prefix + "kmeans.pkl")
        train_sift = load_pickle(prefix + "train_sift.pkl")
        test_sift = load_pickle(prefix + "test_sift.pkl")

        reduced_train_sift = concatenate(train_sift, axis=0)
Example #40
                sys.exit(1)
            MARGIN_SIZE = n
        except ValueError:
            print('margin size must be an integer!')
            sys.exit(1)

    utils.ensure_dir(outputdir)

    for d in os.listdir(inputdir):
        path = os.path.join(inputdir, d)
        if not os.path.isdir(path):
            continue

        pngs = [fn for fn in os.listdir(path) if fn.endswith('.png')]
        data = np.ndarray(shape=(len(pngs),
                          IMG_SIZE+2*MARGIN_SIZE,
                          IMG_SIZE+2*MARGIN_SIZE))
        start = time()
        for idx, png in enumerate(pngs):
            resized = utils.read_resize_image(os.path.join(path, png), IMG_SIZE)
            data[idx, :, :] = utils.add_margins(resized, MARGIN_SIZE)
        end = time()

        print('{}: {} images read and resized to {} in {:.3f}s. Saving...'.format(
            path, len(data), data[0].shape, end - start))

        utils.save_pickle({'data': data,
                           'image_size': IMG_SIZE,
                           'margin_size': MARGIN_SIZE},
                          d + '.pickle', outputdir)
Example #41
print("------------------Request and Answer DataFrame Generating!")
request=training_order_df.groupby(['date','time_of_day','start_dist_id'])['order_id'].count().reset_index()
request.columns=['date','time_of_day','start_dist_id','request']

answer_training_order_df=training_order_df.dropna(axis=0,subset=['driver_id'])
answer=answer_training_order_df.groupby(['date','time_of_day','start_dist_id'])['order_id'].count().reset_index()
answer.columns=['date','time_of_day','start_dist_id','answer']

request_answer=pd.merge(request,answer,how='left',on=['date','time_of_day','start_dist_id'])
request_answer.fillna(0,inplace=True)
request_answer['gap']=request_answer['request']-request_answer['answer']

request_answer['date']=request_answer['date'].apply(lambda x:datetime.strptime(x,"%Y-%m-%d"))
request_answer['day_of_week']=request_answer['date'].apply(lambda x:datetime.weekday(x))

utils.save_pickle(request_answer,"request_answer")

print("------------------Model Training!")
data_X=request_answer[['time_of_day','start_dist_id','day_of_week']]
data_Y=request_answer['gap']

X_train,X_test,Y_train,Y_test=cross_validation.train_test_split(data_X,data_Y,test_size=0.2,random_state=0)

#training_order_df=utils.generate_order_df(aim="predict")
pred_df=pd.read_csv("./data/test_set_1/read_me_1.txt",sep='\t', names=['origin'])
pred_df['time_of_day']=pred_df['origin'].apply(lambda x: int(x[11:]))
pred_df['date']=pred_df['origin'].apply(lambda x: datetime.strptime(x[:10],"%Y-%m-%d"))
pred_df['day_of_week']=pred_df['date'].apply(lambda x:datetime.weekday(x))

temp=[]
for i in np.arange(start=1,stop=67,step=1):
        print('`' * 80)
        for k, shape, score, columns in results:
            f1 = score[1]
            col = set(columns)
            d_f1 = f1 - last_f1
            d_col = list(col - last_col)
            print('%4d: %-20s %g' % (k, d_col, d_f1))
            last_f1 = f1
            last_col = col
    if True:
        # X = X[X.columns[:5]]
        beam_size = 3
        max_items = -1
        (X_train, y_train), (X_test, y_test) = resample(X, y, sample_fraction=1.0)
        m_scores = beam_search_feature(X_train, y_train, X_test, y_test, beam_size=3, max_items=-1)
        save_pickle('m_scores.pkl', m_scores)

        print('*' * 80)
        print('beam_size=%d, max_items=%d' % (beam_size, max_items))
        last_f1 = 0
        for m, cols_scores in m_scores:
            f1 = cols_scores[0][1][1]
            print('%3d: f1=%.3f improvement=%+.3f' % (m, f1, f1 - last_f1))
            last_f1 = f1
            for i, (c, s) in enumerate(cols_scores[:3]):
                print('%5d: %s %s' % (i, s, c))
    if False:
        columns = ['Parent Region', 'Reseller Tier', 'resellerDiscountPercentage', 'type']
        show_splits(X, y, columns)

Example #43
for review in clean_test_reviews:
    for word in review:
        if d.check(word) and word not in dicts:
            dicts[word] = len(dicts)


encode_train_reviews = []
for review in clean_train_reviews:
    each = []
    for word in review:
        if word in dicts:
            each.append(dicts[word])
    if len(each) != 0:
        encode_train_reviews.append(each)

encode_test_reviews = []
for review in clean_test_reviews:
    each = []
    for word in review:
        if word in dicts:
            each.append(dicts[word])
    if len(each) != 0:
        encode_test_reviews.append(each)

assert len(encode_train_reviews) == len(train_labels)
train = (encode_train_reviews, train_labels)

utils.save_pickle("encode_train_reviews.pickle", train)
utils.save_pickle("encode_test_reviews.pickle", encode_test_reviews)
utils.save_pickle("dicts.pickle", dicts)
    nlabels = len(label_map)
    train, train_lbl = reshape(train, train_lbl, nlabels)
    valid, valid_lbl = reshape(valid, valid_lbl, nlabels)
    test, test_lbl = reshape(test, test_lbl, nlabels)

    # store all in a dict and pickle it
    data = {
        'train': train,
        'train_lbl': train_lbl,
        'valid': valid,
        'valid_lbl': valid_lbl,
        'test': test,
        'test_lbl': test_lbl,
    }

    utils.save_pickle(data, outfile)

    print()
    print('Train: {}'.format(len(train)))
    print('Valid: {}'.format(len(valid)))
    print('Test: {}'.format(len(test)))
    print('Classes: {}'.format(len(label_map)))
    print()
    print('Dataset written to {}'.format(outfile))

    # save label map, mean image and sizes
    meta = {
        'label_map': label_map,
        'mean_image': mean_image,
        'image_size': image_size,
        'margin_size': margin_size
# ngrams, as suggested in the NBSVM paper.
if False:
    s = '''It turns out "why is" that's using. Doesn't it? Can't i'''
    t = tokenize(s)
    print(t)
    assert False

print('Tokenization:')
t0 = time.clock()
train_tokens = [tokenize(s, token_vector) for s in train[COMMENT]]
print('train_tokens: %.1f sec %.2f sec / token' % (time.clock() - t0, (time.clock() - t0) / len(train_tokens)))
t0 = time.clock()
test_tokens = [tokenize(s, token_vector) for s in test[COMMENT]]
print('test_tokens: %.1f sec %.2f sec / token' % (time.clock() - t0, (time.clock() - t0) / len(test_tokens)))

save_pickle('token.vector.pkl', token_vector)
save_json('train.tokens.json', train_tokens)
save_json('test.tokens.json', test_tokens)

token_vector = load_pickle('token.vector.pkl')
train_tokens = load_json('train.tokens.json')
test_tokens = load_json('test.tokens.json')


def compute_ngram_vector(token_list, n):
    """Compute an embedding vector for all n-grams in token_list
    """
    vec = np.zeros((n, SPACY_VECTOR_SIZE), dtype=np.float64)
    n_vecs = len(token_list) - n + 1
    for i in range(n_vecs):
        for j in range(n):