Example #1
File: run.py Project: sully90/spider
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):

        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(
                json.dumps(clusters, indent=2,
                           ensure_ascii=False).encode('utf8'))
Example #2
def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type)

    # === Result ===
    print('=== Result ===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)
Example #3
def main():
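    # Load this script's records via utils and print each (header, sequence) pair, longest sequence first.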
    data = utils.load_data(__file__)
    output = generate_output(data)

    output.sort(key=lambda pair: len(pair[1]), reverse=True)

    for (header, sequence) in output:
        print header.encode('utf-8')
        print sequence.encode('utf-8')
Example #4
def predict(data):
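    # Fit a 3-nearest-neighbour classifier on the dating dataset and return decoded class labels for the given samples.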
    X, y = load_data('datingTestSet.txt')

    scale = StandardScaler()
    X = scale.fit_transform(X)
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)

    result = model.predict(scale.transform(data))
    result = encoder.inverse_transform(result)

    return result
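A minimal usage sketch for the snippet above, assuming datingTestSet.txt has the usual three numeric feature columns; the sample values below are made up and purely illustrative:

import numpy as np

# Hypothetical query: one row with the same three numeric columns as the training file.
sample = np.array([[40920, 8.32, 0.95]])
labels = predict(sample)
print(labels)  # decoded class label(s)

Note that predict() reloads the dataset and refits the scaler, encoder and classifier on every call; for repeated queries you would normally fit them once and reuse them.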
Example #5
def main():
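    # Count residues across all active interfaces and print each residue's percentage share of the total.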
    config = utils.load_config(__file__)
    data = utils.load_data(__file__)

    count = {}

    for code in data:
        for interface in data[code]['interfaces']:
            is_active = config['active'][code][interface]
            residues = data[code]['interfaces'][interface]['residues']

            if is_active:
                for r in residues:
                    acid = r['resn'].encode('utf-8')
                    if acid in count:
                        count[acid] += 1
                    else:
                        count[acid] = 1

    total = sum(count.values())

    for acid in sorted(count.keys()):
        print acid, '{:1.1f}'.format(100 * count[acid] / float(total)), u'({0} / {1})'.format(count[acid], total)
Example #6
from model import *
import os
from tqdm import tqdm
from hparam import hparam
import lib.utils as utils
from lib.logging import init_logger, logger

init_logger(utils.get_log(hparam.eval))

rnn = utils.load_data(hparam.model,logger)
vocab = utils.load_data(hparam.vocab,logger)
n_hidden=1024
# Just return an output given a line
def evaluate(line_tensor):
    hidden = rnn.initHidden(n_hidden)
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output

def predict(line,label):
    li = utils.stoi(line, vocab)
    char_tensor=utils.lineToTensor(vocab.size,li)
    output = evaluate(char_tensor)
    output=output.cpu()
    # Get top N categories
    topv, topi = output.data.topk(1, 1, True)
    out=vocab.label["iton"][topi.item()]
    return out

def eavl():
    for id,filename in enumerate(utils.findFiles(os.path.join(hparam.eval, hparam.files))):
Example #7
loss_bce = torch.nn.BCELoss().cuda()
loss_nll = torch.nn.NLLLoss().cuda()

# GPU mode
if config['CUDA'] == True:
    G.cuda(), D.cuda(), AE.cuda()
    loss_bce.cuda(), loss_nll.cuda()

# define optimizers
G_solver = optim.RMSprop(theta_G, lr=lr)
D_solver = optim.RMSprop(theta_D_gan + theta_D_aux, lr=lr)
AE_solver = optim.Adam(AE.parameters(), lr=lr)

# ##### Load dataset
# define dataloader
load_data = utils.load_data()

# ##### train loop
for ex_fold in range(num_fold):
    for in_fold in range(num_fold):
        X_train, y_train, X_valid, y_valid, X_test, y_test = next(load_data)
        load_minibatch = utils.load_minibatch(X_train, y_train)
        num_batch = int(np.ceil(np.shape(X_train)[0] / batch_size))
        for epoch in range(num_epoch):
            for batch in range(num_batch):
                # load data batch
                x_mb, y_mb, z_mb, zy_mb = next(load_minibatch)
                X_real = Variable(x_mb).cuda()  # input features of real data
                y = Variable(y_mb).cuda()  # class targets of real data
                z = Variable(z_mb, volatile=True).cuda()  # inference mode
                z_y = Variable(zy_mb, volatile=True).cuda()
Example #8
def load_data_per_interview(doc_num):
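    # Load a single interview transcript selected by its document number.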
    print('Interview:', doc_num)
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    return utils.load_data(path)
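A minimal usage sketch, assuming each returned row carries a speaker label at index 0 and the utterance text at index 1 (as the other interview examples consume it); the document number '01' is illustrative:

data = load_data_per_interview('01')  # hypothetical document number
for row in data[:3]:
    print(row[0], row[1])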
Example #9
def main():
    # Prepare parameters
    args = parse_args()
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    data_path = args.data_path
    logdir = args.logdir
    checkpoint_dir = args.checkpoint_dir
    rdm = np.random.RandomState(13)

    # Prepare data
    x_train, y_train, x_test, y_test = load_data(data_path)
    #print(np.shape(x_train))
    #print(np.shape(x_test))

    x_train, y_train = shuffle(x_train, y_train)

    num_train_data = x_train.shape[0] // 100  # integer division; only 1/100 of the training samples are used

    input_data = tf.placeholder(tf.float32,
                                shape=[None, 32, 32, 32],
                                name='input')
    net_input = input_data[..., np.newaxis]

    CAE_3D = conv_autoencoder_3d(net_input, args=args, is_training=True)

    with tf.name_scope('training_summary'):
        tf.summary.scalar('train_loss', CAE_3D.loss)
    sum_op = tf.summary.merge_all()

    # Start Session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver(max_to_keep=10)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        for epoch in range(num_epoch):
            print('epoch :', epoch)
            x_train = x_train[rdm.permutation(num_train_data)]

            average_loss = 0
            for i in range(0, num_train_data, batch_size):
                feed_dict = {input_data: x_train[i:i + batch_size]}
                fetch = {
                    'optimizer': CAE_3D.optimizer,
                    'loss': CAE_3D.loss,
                    'summary': sum_op
                }

                results = sess.run(fetches=fetch, feed_dict=feed_dict)
                average_loss += results['loss']

            print('train loss : ',
                  average_loss / int(num_train_data / batch_size))

            # save summary and checkpoint by epoch
            writer.add_summary(summary=results['summary'], global_step=epoch)
            saver.save(sess,
                       os.path.join(checkpoint_dir, 'model_{0}'.format(epoch)))
Example #10
from lib import model, utils,graph
import numpy as np
import os
import time
from scipy import sparse
import random
# GPU control
#os.environ["CUDA_VISIBLE_DEVICES"] = '2'
# os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'

print('start preparing data')
x0_train,x1_train,y_train,x0_test,x1_test,y_test = utils.prepair_data()

# save and load data
#utils.save_data(x0_train,x1_train,y_train,x0_test,x1_test,y_test)
x0_train,x1_train,y_train,x0_test,x1_test,y_test = utils.load_data()

print('start build graph')
# Calculate Laplacians
g0=sparse.csr_matrix(utils.build_graph('./data/content_10_knn_graph.txt')).astype(np.float32)
print('graph_size:',g0.shape)
graphs0 = []
for i in range(3):
    graphs0.append(g0)
L0 = [graph.laplacian(A, normalized=True) for A in graphs0]
L1 = 1

# Graph Conv-net
f0,f1,features,K=1,1,1,3
params = dict()
params['num_epochs']     = 50
Example #11
def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type,
                                           [stems(doc) for doc in docs])

    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)

    # Figure: cosine-similarity plot
    save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/img/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(
        segmentation_model.window_size) + '_' + str(datetime.date.today())

    fig = plt.figure()
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')

    # Segments: write segmented text to file
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)

    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('=== Evaluation ===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model,
                                                    segmentation_type,
                                                    model_type, doc_type,
                                                    doc_num)

    return count, res.index.values, label_for_eval, f_score
Example #12
def main():
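    # Load this script's data file and generate its config via the utils helpers.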
    data = utils.load_data(__file__)
    utils.generate_config(__file__, data)
Example #13
File: label.py Project: sully90/spider
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    #   above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url

    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)

    for id, url in enumerate(urls):
        if not url.strip():
            continue

        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page],
                                         tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []

    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  # [text_length, text_density] currently disabled
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes'])
                    if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    # scale features once the full feature matrix has been assembled
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels),
                                n_folds=4,
                                shuffle=False,
                                random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index),
                                                         len(test_index))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        print clf.n_support_
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])

        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])

        stats(negatives, positives)
        """

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label])

    return
Example #14
def main():
    # Prepare parameters
    args = parse_args()
    checkpoint_dir = args.checkpoint_dir
    data_path = args.data_path
    num_top_similarity = args.num_top_similarity
    num_search_sample = args.num_search_sample
    modelout_save = args.modelout_save
    use_exist_modelout = args.use_exist_modelout
    modeleval_out_dir = args.modeleval_out_dir

    # Prepare Data
    _, _, x_test, y_test = load_data(data_path=data_path)

    input_data = tf.placeholder(tf.float32,
                                shape=[None, 32, 32, 32],
                                name='input')
    net_input = input_data[:, :, :, :, np.newaxis]

    CAE_3D = conv_autoencoder_3d(net_input, args=args, is_training=False)

    if use_exist_modelout:
        data = np.load(modeleval_out_dir)
        idx = data['idx']
        sims = data['sims']
        encoded = data['encoded']
        decoded = data['decoded']
    else:
        with tf.Session() as sess:
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
            feed_dict = {input_data: x_test}

            # extract encoded features and vectorize them
            encoded = CAE_3D.encoded.eval(session=sess, feed_dict=feed_dict)
            nd, k1, k2, k3, k4 = encoded.shape
            encoded = np.reshape(encoded, (nd, k1 * k2 * k3 * k4))

            decoded = CAE_3D.decoded.eval(session=sess, feed_dict=feed_dict)

            idx, sims = similarity_search(encoded, num_top_similarity)

            if modelout_save:
                np.savez_compressed(modeleval_out_dir,
                                    idx=idx,
                                    sims=sims,
                                    encoded=encoded,
                                    decoded=decoded)

    # visualize encoded data with t-SNE
    # visualize_tsne(encoded, y_test)

    # add self-index as the first column
    self_idx = np.arange(encoded.shape[0]).reshape((encoded.shape[0], 1))
    idx = np.concatenate([self_idx, idx], axis=1)

    # select samples to visualize randomly
    sample_idx = np.random.randint(0, x_test.shape[0], num_search_sample)

    # visualize similar search result
    visualize(x_test, y_test, idx[sample_idx])

    # visualize input and its decoded data
    # visualize_3d_iodata(x_test[sample_idx], decoded[sample_idx], y_test[sample_idx])

    # calculate average precision
    ap = calculate_average_precision(y_test, idx[sample_idx], sims[sample_idx],
                                     num_search_sample)
    print('Average Precision per sample : ', ap)
Example #15
        if not(args[1] == 'tfidf' or args[1] == 'doc2vec' or args[1] == 'word2vec'):
            print('Argument is invalid')
            exit()
        if args[-1] == 'update':
            update = True
    else:
        print('Arguments are too short')
        exit()

    model_type = args[1]

    # docs: the entire interview text
    print('Load data')
    # Train the model
    path = './data/interview/interview-text_01-26_all.txt'
    data = utils.to_sentence(utils.load_data(path))
    docs = [row[1] for row in data]

    # max_characters: single sentences of XX or more characters are excluded from summarization
    # docs = utils.polish_docs(docs, max_characters=1000)
    sw = stopwords()
    docs_for_train = [stems(doc, polish=True, sw=sw) for doc in docs]
    print(docs_for_train[:10])
    total_tokens = sum(len(arr) for arr in docs_for_train)
    print(total_tokens)
    """
    Data like the following is being built:
    edocs_for_train = [
    ['出身は', 'どこ', 'ですか' ...
Example #16
def main():
    # Prepare args
    args = parse_args()

    num_labeled_train = args.num_labeled_train
    num_test = args.num_test
    ramp_up_period = args.ramp_up_period
    ramp_down_period = args.ramp_down_period
    num_class = args.num_class
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    weight_max = args.weight_max
    learning_rate = args.learning_rate
    alpha = args.alpha
    weight_norm_flag = args.weight_norm_flag
    augmentation_flag = args.augmentation_flag
    whitening_flag = args.whitening_flag
    trans_range = args.trans_range

    # Data Preparation
    train_x, train_y, test_x, test_y = load_data(args.data_path)
    ret_dic = split_supervised_train(train_x, train_y, num_labeled_train)

    ret_dic['test_x'] = test_x
    ret_dic['test_y'] = test_y
    ret_dic = make_train_test_dataset(ret_dic, num_class)

    unsupervised_target = ret_dic['unsupervised_target']
    supervised_label = ret_dic['supervised_label']
    supervised_flag = ret_dic['train_sup_flag']
    unsupervised_weight = ret_dic['unsupervised_weight']
    test_y = ret_dic['test_y']

    train_x, test_x = normalize_images(ret_dic['train_x'], ret_dic['test_x'])

    # pre-process
    if whitening_flag:
        train_x, test_x = whiten_zca(train_x, test_x)

    if augmentation_flag:
        train_x = np.pad(train_x, ((0, 0), (trans_range, trans_range),
                                   (trans_range, trans_range), (0, 0)),
                         'reflect')

    # make the whole data and labels for training
    # x = [train_x, supervised_label, supervised_flag, unsupervised_weight]
    y = np.concatenate((unsupervised_target, supervised_label, supervised_flag,
                        unsupervised_weight),
                       axis=1)

    num_train_data = train_x.shape[0]

    # Build Model
    if weight_norm_flag:
        from lib.model_WN import build_model
        from lib.weight_norm import AdamWithWeightnorm
        optimizer = AdamWithWeightnorm(lr=learning_rate,
                                       beta_1=0.9,
                                       beta_2=0.999)
    else:
        from lib.model_BN import build_model
        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999)

    model = build_model(num_class=num_class)
    model.compile(optimizer=optimizer, loss=semi_supervised_loss(num_class))

    model.metrics_tensors += model.outputs
    model.summary()

    # prepare weights and arrays for updates
    gen_weight = ramp_up_weight(
        ramp_up_period, weight_max * (num_labeled_train / num_train_data))
    gen_lr_weight = ramp_down_weight(ramp_down_period)
    idx_list = [v for v in range(num_train_data)]
    ensemble_prediction = np.zeros((num_train_data, num_class))
    cur_pred = np.zeros((num_train_data, num_class))

    # Training
    for epoch in range(num_epoch):
        print('epoch: ', epoch)
        idx_list = shuffle(idx_list)

        if epoch > num_epoch - ramp_down_period:
            weight_down = next(gen_lr_weight)
            K.set_value(model.optimizer.lr, weight_down * learning_rate)
            K.set_value(model.optimizer.beta_1, 0.4 * weight_down + 0.5)

        ave_loss = 0
        for i in range(0, num_train_data, batch_size):
            target_idx = idx_list[i:i + batch_size]

            if augmentation_flag:
                x1 = data_augmentation_tempen(train_x[target_idx], trans_range)
            else:
                x1 = train_x[target_idx]

            x2 = supervised_label[target_idx]
            x3 = supervised_flag[target_idx]
            x4 = unsupervised_weight[target_idx]
            y_t = y[target_idx]

            x_t = [x1, x2, x3, x4]
            tr_loss, output = model.train_on_batch(x=x_t, y=y_t)
            cur_pred[idx_list[i:i + batch_size]] = output[:, 0:num_class]
            ave_loss += tr_loss

        print('Training Loss: ', (ave_loss * batch_size) / num_train_data,
              flush=True)

        # Update phase
        next_weight = next(gen_weight)
        y, unsupervised_weight = update_weight(y, unsupervised_weight,
                                               next_weight)
        ensemble_prediction, y = update_unsupervised_target(
            ensemble_prediction, y, num_class, alpha, cur_pred, epoch)

        # Evaluation
        if epoch % 5 == 0:
            print('Evaluate epoch :  ', epoch, flush=True)
            evaluate(model, num_class, num_test, test_x, test_y)
Example #17
def main(model,
         auxiliary=True,
         model_label='rcnn',
         rnn_type='gru',
         padding='pre',
         reg='s',
         prefix="crawl",
         embedding_file_type="word2vec",
         train_fname="./data/train.csv",
         test_fname="./data/test.csv",
         embeds_fname="./data/GoogleNews-vectors-negative300.bin",
         logger_fname="./logs/log-aws",
         mode="all",
         wrong_words_fname="./data/correct_words.csv",
         format_embeds="binary",
         config="./config.json",
         output_dir="./out",
         norm_prob=False,
         norm_prob_koef=1,
         gpus=0,
         char_level=False,
         random_seed=2018,
         num_folds=5):

    embedding_type = prefix + "_" + embedding_file_type

    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    tokinizer = RegexpTokenizer(r'\S+')
    regexps = [
        re.compile("([a-zA-Z]+)([0-9]+)"),
        re.compile("([0-9]+)([a-zA-Z]+)")
    ]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    if model != 'mvcnn':
        embed_dim = 300
        embeds = Embeds(embeds_fname,
                        embedding_file_type,
                        format=format_embeds)

    if mode in ('preprocess', 'all'):
        logger.info('Generating indirect features...')
        # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
        # Word count in each comment:
        train_df['count_word'] = train_df["comment_text"].apply(
            lambda x: len(str(x).split()))
        test_df['count_word'] = test_df["comment_text"].apply(
            lambda x: len(str(x).split()))
        # Unique word count
        train_df['count_unique_word'] = train_df["comment_text"].apply(
            lambda x: len(set(str(x).split())))
        test_df['count_unique_word'] = test_df["comment_text"].apply(
            lambda x: len(set(str(x).split())))
        # Letter count
        train_df['count_letters'] = train_df["comment_text"].apply(
            lambda x: len(str(x)))
        test_df['count_letters'] = test_df["comment_text"].apply(
            lambda x: len(str(x)))
        # punctuation count
        train_df["count_punctuations"] = train_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        test_df["count_punctuations"] = test_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        # upper case words count
        train_df["count_words_upper"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        test_df["count_words_upper"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        # title case words count
        train_df["count_words_title"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        test_df["count_words_title"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        # Word count percent in each comment:
        train_df['word_unique_pct'] = train_df[
            'count_unique_word'] * 100 / train_df['count_word']
        test_df['word_unique_pct'] = test_df[
            'count_unique_word'] * 100 / test_df['count_word']
        # Punct percent in each comment:
        train_df['punct_pct'] = train_df[
            'count_punctuations'] * 100 / train_df['count_word']
        test_df['punct_pct'] = test_df['count_punctuations'] * 100 / test_df[
            'count_word']
        # Average length of the words
        train_df["mean_word_len"] = train_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        test_df["mean_word_len"] = test_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        # upper case words percentage
        train_df["words_upper_pct"] = train_df[
            "count_words_upper"] * 100 / train_df['count_word']
        test_df["words_upper_pct"] = test_df[
            "count_words_upper"] * 100 / test_df['count_word']
        # title case words count
        train_df["words_title_pct"] = train_df[
            "count_words_title"] * 100 / train_df['count_word']
        test_df["words_title_pct"] = test_df[
            "count_words_title"] * 100 / test_df['count_word']
        # remove columns
        train_df = train_df.drop('count_word', 1)
        train_df = train_df.drop('count_unique_word', 1)
        train_df = train_df.drop('count_punctuations', 1)
        train_df = train_df.drop('count_words_upper', 1)
        train_df = train_df.drop('count_words_title', 1)
        test_df = test_df.drop('count_word', 1)
        test_df = test_df.drop('count_unique_word', 1)
        test_df = test_df.drop('count_punctuations', 1)
        test_df = test_df.drop('count_words_upper', 1)
        test_df = test_df.drop('count_words_title', 1)

        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    tokinizer,
                                                    wrong_words_dict,
                                                    regexps,
                                                    autocorrect=False)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   tokinizer,
                                                   wrong_words_dict,
                                                   regexps,
                                                   autocorrect=False)
        if reg == 'w':
            # remove all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear_w.csv'),
                            index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear_w.csv'),
                           index=False)
            train_df = pd.read_csv(
                os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            # split by S+ keep all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                            index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'),
                           index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    if mode == 'preprocess':
        return

    if mode == 'processed':
        if reg == 'w':
            train_df = pd.read_csv(
                os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000
    if char_level:
        max_seq_len = 1200

    train_df['comment_seq'], test_df[
        'comment_seq'], word_index = convert_text2seq(
            train_df['comment_text_clear'].tolist(),
            test_df['comment_text_clear'].tolist(),
            max_words,
            max_seq_len,
            embeds,
            lower=True,
            char_level=char_level,
            uniq=True,
            use_only_exists_words=True,
            position=padding)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    if model != 'mvcnn':
        embedding_matrix, words_not_found = get_embedding_matrix(
            embed_dim, embeds, max_words, word_index)

    logger.debug('Embedding matrix shape = {}'.format(
        np.shape(embedding_matrix)))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    # ====Train/test split data====
    # train/val
    x_aux = np.matrix([
        train_df["word_unique_pct"].tolist(), train_df["punct_pct"].tolist(),
        train_df["mean_word_len"].tolist(),
        train_df["words_upper_pct"].tolist(),
        train_df["words_title_pct"].tolist()
    ],
                      dtype='float32').transpose((1, 0))
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, x_aux_train_nn, x_aux_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = \
        split_data(x, np.squeeze(np.asarray(x_aux)),y,test_size=0.2,shuffle=True,random_state=2018)
    # test set
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    test_aux = np.matrix([
        test_df["word_unique_pct"].tolist(), test_df["punct_pct"].tolist(),
        test_df["mean_word_len"].tolist(),
        test_df["words_upper_pct"].tolist(),
        test_df["words_title_pct"].tolist()
    ],
                         dtype='float32').transpose((1, 0))
    test_df_seq_aux = np.squeeze(np.asarray(test_aux))
    y_nn = []
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====
    params = Params(config)
    if model_label == None:
        logger.warn('Should choose a model to train')
        return

    if model_label == 'dense':
        model = dense(
            embedding_matrix,
            num_classes,
            max_seq_len,
            dense_dim=params.get('dense').get('dense_dim'),
            n_layers=params.get('dense').get('n_layers'),
            concat=params.get('dense').get('concat'),
            dropout_val=params.get('dense').get('dropout_val'),
            l2_weight_decay=params.get('dense').get('l2_weight_decay'),
            pool=params.get('dense').get('pool'),
            train_embeds=params.get('dense').get('train_embeds'),
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'cnn':
        model = cnn(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    num_filters=params.get('cnn').get('num_filters'),
                    l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
                    dropout_val=params.get('cnn').get('dropout_val'),
                    dense_dim=params.get('cnn').get('dense_dim'),
                    train_embeds=params.get('cnn').get('train_embeds'),
                    n_cnn_layers=params.get('cnn').get('n_cnn_layers'),
                    pool=params.get('cnn').get('pool'),
                    add_embeds=params.get('cnn').get('add_embeds'),
                    auxiliary=auxiliary,
                    add_sigmoid=True,
                    gpus=gpus)
    if model_label == 'cnn2d':
        model = cnn2d(
            embedding_matrix,
            num_classes,
            max_seq_len,
            num_filters=params.get('cnn2d').get('num_filters'),
            l2_weight_decay=params.get('cnn2d').get('l2_weight_decay'),
            dropout_val=params.get('cnn2d').get('dropout_val'),
            dense_dim=params.get('cnn2d').get('dense_dim'),
            train_embeds=params.get('cnn2d').get('train_embeds'),
            add_embeds=params.get('cnn2d').get('add_embeds'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus)

    if model_label == 'lstm':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
            rnn_dim=params.get('lstm').get('rnn_dim'),
            dropout_val=params.get('lstm').get('dropout_val'),
            dense_dim=params.get('lstm').get('dense_dim'),
            n_branches=params.get('lstm').get('n_branches'),
            n_rnn_layers=params.get('lstm').get('n_rnn_layers'),
            n_dense_layers=params.get('lstm').get('n_dense_layers'),
            train_embeds=params.get('lstm').get('train_embeds'),
            mask_zero=params.get('lstm').get('mask_zero'),
            kernel_regularizer=params.get('lstm').get('kernel_regularizer'),
            recurrent_regularizer=params.get('lstm').get(
                'recurrent_regularizer'),
            activity_regularizer=params.get('lstm').get(
                'activity_regularizer'),
            dropout=params.get('lstm').get('dropout'),
            recurrent_dropout=params.get('lstm').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='lstm')
    if model_label == 'gru':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('gru').get('l2_weight_decay'),
            rnn_dim=params.get('gru').get('rnn_dim'),
            dropout_val=params.get('gru').get('dropout_val'),
            dense_dim=params.get('gru').get('dense_dim'),
            n_branches=params.get('gru').get('n_branches'),
            n_rnn_layers=params.get('gru').get('n_rnn_layers'),
            n_dense_layers=params.get('gru').get('n_dense_layers'),
            train_embeds=params.get('gru').get('train_embeds'),
            mask_zero=params.get('gru').get('mask_zero'),
            kernel_regularizer=params.get('gru').get('kernel_regularizer'),
            recurrent_regularizer=params.get('gru').get(
                'recurrent_regularizer'),
            activity_regularizer=params.get('gru').get('activity_regularizer'),
            dropout=params.get('gru').get('dropout'),
            recurrent_dropout=params.get('gru').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='gru')

    if model_label == 'charrnn':
        model = charrnn(
            len(word_index),
            num_classes,
            max_seq_len,
            rnn_dim=params.get('charrnn').get('rnn_dim'),
            dropout_val=params.get('charrnn').get('dropout_val'),
            auxiliary=auxiliary,
            dropout=params.get('charrnn').get('dropout'),
            recurrent_dropout=params.get('charrnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'cnn2rnn':
        model = cnn2rnn(embedding_matrix,
                        num_classes,
                        max_seq_len,
                        rnn_type=rnn_type)
    if model_label == 'dpcnn':
        model = dpcnn(embedding_matrix,
                      num_classes,
                      max_seq_len,
                      num_filters=params.get('dpcnn').get('num_filters'),
                      dense_dim=params.get('dpcnn').get('dense_dim'),
                      add_sigmoid=True,
                      gpus=gpus)

    if model_label == 'rcnn':
        model = rcnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            rnn_dim=params.get('rcnn').get('rnn_dim'),
            dropout_val=params.get('rcnn').get('dropout_val'),
            dense_dim=params.get('rcnn').get('dense_dim'),
            train_embeds=params.get('rcnn').get('train_embeds'),
            auxiliary=auxiliary,
            dropout=params.get('rcnn').get('dropout'),
            recurrent_dropout=params.get('rcnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'capsule':
        model = capsule(
            embedding_matrix,
            num_classes,
            max_seq_len,
            auxiliary=auxiliary,
            Num_capsule=params.get('capsule').get('Num_capsule'),
            Routings=params.get('capsule').get('Routing'),
            add_sigmoid=params.get('capsule').get('add_sigmoid'),
            mask_zero=params.get('capsule').get('mask_zero'),
            gpus=gpus,
            rnn_type='gru')  # lstm may diverge but gru works better

    if model == 'mvcnn':
        embeds_fname1 = "./data/crawl-300d-2M.vec"  # "./data/crawl-300d-2M.vec  word2vec-raw.txt
        embeds_fname2 = "./data/glove.840B.300d.txt"
        embeds_fname3 = "./data/GoogleNews-vectors-negative300.bin"
        embed_dim = 300
        embeds1 = Embeds(embeds_fname1, "glove", format='file')
        embeds2 = Embeds(embeds_fname2, "fasttext", format='file')
        embeds3 = Embeds(embeds_fname3, "word2vec", format='binary')
        embedding_matrix1, words_not_found1 = get_embedding_matrix(
            embed_dim, embeds1, max_words, word_index)
        embedding_matrix2, words_not_found2 = get_embedding_matrix(
            embed_dim, embeds2, max_words, word_index)
        #embedding_matrix3, words_not_found3 = get_embedding_matrix(embed_dim, embeds3, max_words, word_index)
        model = mvcnn(embedding_matrix1,
                      embedding_matrix2,
                      num_classes,
                      max_seq_len,
                      auxiliary=auxiliary,
                      gpus=gpus)

    # ====k-fold cross validations split data====
    logger.info('Run k-fold cross validation...')
    params = Params(config)
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    oof_train = np.zeros((x.shape[0], num_classes))
    oof_test_skf = []

    for i, (train_index, test_index) in enumerate(kf.split(x, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_aux_train, x_test, x_aux_test = x[train_index], x_aux[
            train_index], x[test_index], x_aux[test_index]
        y_train, y_test = y[train_index], y[test_index]
        logger.info('Start training {}-th fold'.format(i))
        if auxiliary:
            inputs = [x_train, x_aux_train]
            inputs_val = [x_test, x_aux_test]
            output = [test_df_seq, test_df_seq_aux]
        else:
            inputs = x_train
            inputs_val = x_test
            output = test_df_seq
        hist = train(
            x_train=
            inputs,  # [x_train, x_aux_train] when auxiliary input is allowed.
            y_train=y_train,
            x_val=inputs_val,  # [x_test, x_aux_test],
            y_val=y_test,
            model=model,
            batch_size=params.get(model_label).get('batch_size'),
            num_epochs=params.get(model_label).get('num_epochs'),
            learning_rate=params.get(model_label).get('learning_rate'),
            early_stopping_delta=params.get(model_label).get(
                'early_stopping_delta'),
            early_stopping_epochs=params.get(model_label).get(
                'early_stopping_epochs'),
            use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
            lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
            epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
            model_checkpoint_dir=os.path.join('.', 'model_checkpoint', reg,
                                              model_label, embedding_type,
                                              padding, str(i)),
            logger=logger)

        model.load_weights(
            os.path.join('.', 'model_checkpoint', reg, model_label,
                         embedding_type, padding, str(i), 'weights.h5'))
        oof_train[test_index, :] = model.predict(
            inputs_val)  # model.predict([x_test, x_aux_test])
        proba = model.predict(
            output)  # model.predict([test_df_seq, test_df_seq_aux])
        oof_test_skf.append(proba)
        result = pd.read_csv("./data/sample_submission.csv")
        result[target_labels] = proba
        ithfold_path = "./cv/{}/{}/{}/{}/{}".format(reg, model_label,
                                                    embedding_type, padding, i)
        if not os.path.exists(ithfold_path):
            os.makedirs(ithfold_path)

        result.to_csv(os.path.join(ithfold_path, 'sub.csv'), index=False)
        # model.save(os.path.join(ithfold_path,'weights.h5'))

    # dump oof_test and oof_train for later slacking
    # oof_train:
    oof_train_path = "./cv/{}/{}/{}/{}/oof_train".format(
        reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_train_path):
        os.makedirs(oof_train_path)

    np.savetxt(os.path.join(oof_train_path, "oof_train.csv"),
               oof_train,
               fmt='%.24f',
               delimiter=' ')
    # oof_test: stacking version
    oof_test = np.array(oof_test_skf).mean(axis=0)
    oof_test_path = "./cv/{}/{}/{}/{}/oof_test".format(reg, model_label,
                                                       embedding_type, padding)
    if not os.path.exists(oof_test_path):
        os.makedirs(oof_test_path)

    np.savetxt(os.path.join(oof_test_path, "oof_test.csv"),
               oof_test,
               fmt='%.24f',
               delimiter=' ')
    # oof_test: submission version
    result[target_labels] = oof_test
    oof_test_bag_path = "./cv/{}/{}/{}/{}/bagged".format(
        reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_bag_path):
        os.makedirs(oof_test_bag_path)

    result.to_csv(os.path.join(oof_test_bag_path, "sub.csv"), index=False)
Example #18
    if 2 <= len(args):
        if not (args[1] == 'sentence' or args[1] == 'segmentation'
                or args[1] == 'utterance' or args[1] == 'segmentation/ans'):
            print('Argument is invalid')
            exit()
    else:
        print('Arguments are too short')
        exit()

    doc_type = args[1]

    doc_num = 'all'
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    if doc_type == 'sentence':
        data = utils.load_data(path)
        # to sentence
        data = utils.to_sentence(data)
        docs = [row[1] for row in data]

    if doc_type == 'utterance':
        data = utils.load_data(path)
        docs = [row[1] for row in data]

    elif doc_type == 'segmentation' or doc_type == 'segmentation/ans':
        ans = False
        if doc_type == 'segmentation/ans':
            ans = True
        if doc_num == 'all':
            doc_num = '26'
        data_arr = []
Example #19
File: run.py Project: sully90/spider
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data,
                                     tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels),
                                n_folds=4,
                                shuffle=False,
                                random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index),
                                                         len(test_index))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        print clf.n_support_

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label])

    return

    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])

    stats(negatives, positives)

    return
    """