def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):
        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))

def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type)

    # === Result ===
    print('=== Result ===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)

def main():
    data = utils.load_data(__file__)
    output = generate_output(data)
    output.sort(key=lambda pair: len(pair[1]), reverse=True)
    for (header, sequence) in output:
        print header.encode('utf-8')
        print sequence.encode('utf-8')

def predict(data):
    X, y = load_data('datingTestSet.txt')
    scale = StandardScaler()
    X = scale.fit_transform(X)
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)
    result = model.predict(scale.transform(data))
    result = encoder.inverse_transform(result)
    return result

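# Usage sketch for predict() above -- an assumption for illustration, not part of
# the original source: 'datingTestSet.txt' is taken to have three numeric feature
# columns, so a single query is one row of three placeholder values.
sample = [[40000.0, 8.3, 0.95]]   # hypothetical feature row, same width as the training data
print(predict(sample))            # prints the decoded class label(s)
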
def main():
    config = utils.load_config(__file__)
    data = utils.load_data(__file__)

    count = {}
    for code in data:
        for interface in data[code]['interfaces']:
            is_active = config['active'][code][interface]
            residues = data[code]['interfaces'][interface]['residues']
            if is_active:
                for r in residues:
                    acid = r['resn'].encode('utf-8')
                    if acid in count:
                        count[acid] += 1
                    else:
                        count[acid] = 1

    total = sum(count.values())
    for acid in sorted(count.keys()):
        print acid, '{:1.1f}'.format(100 * count[acid] / float(total)), u'({0} / {1})'.format(count[acid], total)

from model import *
import os

from tqdm import tqdm

from hparam import hparam
import lib.utils as utils
from lib.logging import init_logger, logger

init_logger(utils.get_log(hparam.eval))

rnn = utils.load_data(hparam.model, logger)
vocab = utils.load_data(hparam.vocab, logger)
n_hidden = 1024


# Just return an output given a line
def evaluate(line_tensor):
    hidden = rnn.initHidden(n_hidden)
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output


def predict(line, label):
    li = utils.stoi(line, vocab)
    char_tensor = utils.lineToTensor(vocab.size, li)
    output = evaluate(char_tensor)
    output = output.cpu()
    # Get top N categories
    topv, topi = output.data.topk(1, 1, True)
    out = vocab.label["iton"][topi.item()]
    return out


def eavl():
    for id, filename in enumerate(utils.findFiles(os.path.join(hparam.eval, hparam.files))):

loss_bce = torch.nn.BCELoss().cuda()
loss_nll = torch.nn.NLLLoss().cuda()

# GPU mode
if config['CUDA'] == True:
    G.cuda(), D.cuda(), AE.cuda()
    loss_bce.cuda(), loss_nll.cuda()

# define optimizers
G_solver = optim.RMSprop(theta_G, lr=lr)
D_solver = optim.RMSprop(theta_D_gan + theta_D_aux, lr=lr)
AE_solver = optim.Adam(AE.parameters(), lr=lr)

# ##### Load dataset
# define dataloader
load_data = utils.load_data()

# ##### train loop
for ex_fold in range(num_fold):
    for in_fold in range(num_fold):
        X_train, y_train, X_valid, y_valid, X_test, y_test = next(load_data)
        load_minibatch = utils.load_minibatch(X_train, y_train)
        num_batch = int(np.ceil(np.shape(X_train)[0] / batch_size))

        for epoch in range(num_epoch):
            for batch in range(num_batch):
                # load data batch
                x_mb, y_mb, z_mb, zy_mb = next(load_minibatch)
                X_real = Variable(x_mb).cuda()              # input features of real data
                y = Variable(y_mb).cuda()                   # class targets of real data
                z = Variable(z_mb, volatile=True).cuda()    # inference mode
                z_y = Variable(zy_mb, volatile=True).cuda()

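# Note (not from the original source): Variable(..., volatile=True) comes from the
# pre-0.4 PyTorch API and has no effect on current releases. On modern PyTorch the
# same "no gradients during inference" intent is written with torch.no_grad(), roughly:
#
#     with torch.no_grad():
#         z = z_mb.cuda()
#         z_y = zy_mb.cuda()
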
def load_data_per_interview(doc_num):
    print('Interview:', doc_num)
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    return utils.load_data(path)

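# Usage sketch (assumption, not from the original source): doc_num is a zero-padded
# interview id such as '01', matching the path pattern above, and utils.load_data
# returns (label, text) rows as the other snippets in this collection assume.
data = load_data_per_interview('01')
labels = [row[0] for row in data]   # speaker / section label
docs = [row[1] for row in data]     # utterance text; optionally split further with utils.to_sentence(data)
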
def main():
    # Prepare parameters
    args = parse_args()
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    data_path = args.data_path
    logdir = args.logdir
    checkpoint_dir = args.checkpoint_dir
    rdm = np.random.RandomState(13)

    # Prepare data
    x_train, y_train, x_test, y_test = load_data(data_path)
    #print(np.shape(x_train))
    #print(np.shape(x_test))
    x_train, y_train = shuffle(x_train, y_train)
    # integer division so the value can be used as a range bound / permutation size
    num_train_data = x_train.shape[0] // 100

    input_data = tf.placeholder(tf.float32, shape=[None, 32, 32, 32], name='input')
    net_input = input_data[..., np.newaxis]

    CAE_3D = conv_autoencoder_3d(net_input, args=args, is_training=True)

    with tf.name_scope('training_summary'):
        tf.summary.scalar('train_loss', CAE_3D.loss)
    sum_op = tf.summary.merge_all()

    # Start Session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver(max_to_keep=10)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        for epoch in range(num_epoch):
            print('epoch :', epoch)
            x_train = x_train[rdm.permutation(num_train_data)]
            average_loss = 0
            for i in range(0, num_train_data, batch_size):
                feed_dict = {input_data: x_train[i:i + batch_size]}
                fetch = {
                    'optimizer': CAE_3D.optimizer,
                    'loss': CAE_3D.loss,
                    'summary': sum_op
                }
                results = sess.run(fetches=fetch, feed_dict=feed_dict)
                average_loss += results['loss']

            print('train loss : ', average_loss / int(num_train_data / batch_size))

            # save summary and checkpoint by epoch
            writer.add_summary(summary=results['summary'], global_step=epoch)
            saver.save(sess, os.path.join(checkpoint_dir, 'model_{0}'.format(epoch)))

from lib import model, utils, graph
import numpy as np
import os
import time
from scipy import sparse
import random

# GPU control
#os.environ["CUDA_VISIBLE_DEVICES"] = '2'
# os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'

print('start preparing data')
x0_train, x1_train, y_train, x0_test, x1_test, y_test = utils.prepair_data()

# save and load data
#utils.save_data(x0_train,x1_train,y_train,x0_test,x1_test,y_test)
x0_train, x1_train, y_train, x0_test, x1_test, y_test = utils.load_data()

print('start build graph')
# Calculate Laplacians
g0 = sparse.csr_matrix(utils.build_graph('./data/content_10_knn_graph.txt')).astype(np.float32)
print('graph_size:', g0.shape)
graphs0 = []
for i in range(3):
    graphs0.append(g0)
L0 = [graph.laplacian(A, normalized=True) for A in graphs0]
L1 = 1

# Graph Conv-net
f0, f1, features, K = 1, 1, 1, 3
params = dict()
params['num_epochs'] = 50

def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type, [stems(doc) for doc in docs])

    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)

    # Image (similarity plot)
    save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/img/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    fig = plt.figure()
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')

    # Segments
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)

    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('=== Evaluation ===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model, segmentation_type, model_type, doc_type, doc_num)

    return count, res.index.values, label_for_eval, f_score

def main():
    data = utils.load_data(__file__)
    utils.generate_config(__file__, data)

def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    # above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url
    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  #text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # scale features
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False, random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return

def main():
    # Prepare parameters
    args = parse_args()
    checkpoint_dir = args.checkpoint_dir
    data_path = args.data_path
    num_top_similarity = args.num_top_similarity
    num_search_sample = args.num_search_sample
    modelout_save = args.modelout_save
    use_exist_modelout = args.use_exist_modelout
    modeleval_out_dir = args.modeleval_out_dir

    # Prepare Data
    _, _, x_test, y_test = load_data(data_path=data_path)

    input_data = tf.placeholder(tf.float32, shape=[None, 32, 32, 32], name='input')
    net_input = input_data[:, :, :, :, np.newaxis]

    CAE_3D = conv_autoencoder_3d(net_input, args=args, is_training=False)

    if use_exist_modelout:
        data = np.load(modeleval_out_dir)
        idx = data['idx']
        sims = data['sims']
        encoded = data['encoded']
        decoded = data['decoded']
    else:
        with tf.Session() as sess:
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

            feed_dict = {input_data: x_test}

            # extract encoded features and vectorize them
            encoded = CAE_3D.encoded.eval(session=sess, feed_dict=feed_dict)
            nd, k1, k2, k3, k4 = encoded.shape
            encoded = np.reshape(encoded, (nd, k1 * k2 * k3 * k4))

            decoded = CAE_3D.decoded.eval(session=sess, feed_dict=feed_dict)

        idx, sims = similarity_search(encoded, num_top_similarity)
        if modelout_save:
            np.savez_compressed(modeleval_out_dir, idx=idx, sims=sims, encoded=encoded, decoded=decoded)

    # visualize encoded data with t-SNE
    # visualize_tsne(encoded, y_test)

    # add self-index as the first column
    self_idx = np.arange(encoded.shape[0]).reshape((encoded.shape[0], 1))
    idx = np.concatenate([self_idx, idx], axis=1)

    # select samples to visualize randomly
    sample_idx = np.random.randint(0, x_test.shape[0], num_search_sample)

    # visualize similar search result
    visualize(x_test, y_test, idx[sample_idx])

    # visualize input and its decoded data
    # visualize_3d_iodata(x_test[sample_idx], decoded[sample_idx], y_test[sample_idx])

    # calculate average precision
    ap = calculate_average_precision(y_test, idx[sample_idx], sims[sample_idx], num_search_sample)
    print('Average Precision per sample : ', ap)

if not (args[1] == 'tfidf' or args[1] == 'doc2vec' or args[1] == 'word2vec'):
    print('Argument is invalid')
    exit()

if args[-1] == 'update':
    update = True
else:
    print('Arguments are too short')
    exit()

model_type = args[1]

# docs: the whole interview
print('Load data')
# train the model
path = './data/interview/interview-text_01-26_all.txt'
data = utils.to_sentence(utils.load_data(path))
docs = [row[1] for row in data]

# max_characters: single sentences with XX or more characters are excluded from summarization
# docs = utils.polish_docs(docs, max_characters=1000)
sw = stopwords()
docs_for_train = [stems(doc, polish=True, sw=sw) for doc in docs]
print(docs_for_train[:10])
sum = 0
for arr in docs_for_train:
    sum += len(arr)
print(sum)
"""
The data being built looks like this:
docs_for_train = [
    ['出身は', 'どこ', 'ですか' ...

def main():
    # Prepare args
    args = parse_args()
    num_labeled_train = args.num_labeled_train
    num_test = args.num_test
    ramp_up_period = args.ramp_up_period
    ramp_down_period = args.ramp_down_period
    num_class = args.num_class
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    weight_max = args.weight_max
    learning_rate = args.learning_rate
    alpha = args.alpha
    weight_norm_flag = args.weight_norm_flag
    augmentation_flag = args.augmentation_flag
    whitening_flag = args.whitening_flag
    trans_range = args.trans_range

    # Data Preparation
    train_x, train_y, test_x, test_y = load_data(args.data_path)
    ret_dic = split_supervised_train(train_x, train_y, num_labeled_train)
    ret_dic['test_x'] = test_x
    ret_dic['test_y'] = test_y
    ret_dic = make_train_test_dataset(ret_dic, num_class)

    unsupervised_target = ret_dic['unsupervised_target']
    supervised_label = ret_dic['supervised_label']
    supervised_flag = ret_dic['train_sup_flag']
    unsupervised_weight = ret_dic['unsupervised_weight']
    test_y = ret_dic['test_y']

    train_x, test_x = normalize_images(ret_dic['train_x'], ret_dic['test_x'])

    # pre-process
    if whitening_flag:
        train_x, test_x = whiten_zca(train_x, test_x)

    if augmentation_flag:
        train_x = np.pad(train_x, ((0, 0), (trans_range, trans_range), (trans_range, trans_range), (0, 0)), 'reflect')

    # make the whole data and labels for training
    # x = [train_x, supervised_label, supervised_flag, unsupervised_weight]
    y = np.concatenate((unsupervised_target, supervised_label, supervised_flag, unsupervised_weight), axis=1)

    num_train_data = train_x.shape[0]

    # Build Model
    if weight_norm_flag:
        from lib.model_WN import build_model
        from lib.weight_norm import AdamWithWeightnorm
        optimizer = AdamWithWeightnorm(lr=learning_rate, beta_1=0.9, beta_2=0.999)
    else:
        from lib.model_BN import build_model
        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999)

    model = build_model(num_class=num_class)
    model.compile(optimizer=optimizer, loss=semi_supervised_loss(num_class))
    model.metrics_tensors += model.outputs
    model.summary()

    # prepare weights and arrays for updates
    gen_weight = ramp_up_weight(ramp_up_period, weight_max * (num_labeled_train / num_train_data))
    gen_lr_weight = ramp_down_weight(ramp_down_period)
    idx_list = [v for v in range(num_train_data)]
    ensemble_prediction = np.zeros((num_train_data, num_class))
    cur_pred = np.zeros((num_train_data, num_class))

    # Training
    for epoch in range(num_epoch):
        print('epoch: ', epoch)
        idx_list = shuffle(idx_list)

        if epoch > num_epoch - ramp_down_period:
            weight_down = next(gen_lr_weight)
            K.set_value(model.optimizer.lr, weight_down * learning_rate)
            K.set_value(model.optimizer.beta_1, 0.4 * weight_down + 0.5)

        ave_loss = 0
        for i in range(0, num_train_data, batch_size):
            target_idx = idx_list[i:i + batch_size]

            if augmentation_flag:
                x1 = data_augmentation_tempen(train_x[target_idx], trans_range)
            else:
                x1 = train_x[target_idx]

            x2 = supervised_label[target_idx]
            x3 = supervised_flag[target_idx]
            x4 = unsupervised_weight[target_idx]
            y_t = y[target_idx]

            x_t = [x1, x2, x3, x4]
            tr_loss, output = model.train_on_batch(x=x_t, y=y_t)
            cur_pred[idx_list[i:i + batch_size]] = output[:, 0:num_class]
            ave_loss += tr_loss

        print('Training Loss: ', (ave_loss * batch_size) / num_train_data, flush=True)

        # Update phase
        next_weight = next(gen_weight)
        y, unsupervised_weight = update_weight(y, unsupervised_weight, next_weight)
        ensemble_prediction, y = update_unsupervised_target(ensemble_prediction, y, num_class, alpha, cur_pred, epoch)

        # Evaluation
        if epoch % 5 == 0:
            print('Evaluate epoch : ', epoch, flush=True)
            evaluate(model, num_class, num_test, test_x, test_y)

def main(model,
         auxiliary=True,
         model_label='rcnn',
         rnn_type='gru',
         padding='pre',
         reg='s',
         prefix="crawl",
         embedding_file_type="word2vec",
         train_fname="./data/train.csv",
         test_fname="./data/test.csv",
         embeds_fname="./data/GoogleNews-vectors-negative300.bin",
         logger_fname="./logs/log-aws",
         mode="all",
         wrong_words_fname="./data/correct_words.csv",
         format_embeds="binary",
         config="./config.json",
         output_dir="./out",
         norm_prob=False,
         norm_prob_koef=1,
         gpus=0,
         char_level=False,
         random_seed=2018,
         num_folds=5):
    embedding_type = prefix + "_" + embedding_file_type
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    wrong_words_dict = load_data(wrong_words_fname, func=lambda x: {val[0]: val[1] for val in x})

    tokinizer = RegexpTokenizer(r'\S+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    if model != 'mvcnn':
        embed_dim = 300
        embeds = Embeds(embeds_fname, embedding_file_type, format=format_embeds)

    if mode in ('preprocess', 'all'):
        logger.info('Generating indirect features...')
        # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
        # Word count in each comment:
        train_df['count_word'] = train_df["comment_text"].apply(lambda x: len(str(x).split()))
        test_df['count_word'] = test_df["comment_text"].apply(lambda x: len(str(x).split()))
        # Unique word count
        train_df['count_unique_word'] = train_df["comment_text"].apply(lambda x: len(set(str(x).split())))
        test_df['count_unique_word'] = test_df["comment_text"].apply(lambda x: len(set(str(x).split())))
        # Letter count
        train_df['count_letters'] = train_df["comment_text"].apply(lambda x: len(str(x)))
        test_df['count_letters'] = test_df["comment_text"].apply(lambda x: len(str(x)))
        # punctuation count
        train_df["count_punctuations"] = train_df["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
        test_df["count_punctuations"] = test_df["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
        # upper case words count
        train_df["count_words_upper"] = train_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
        test_df["count_words_upper"] = test_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
        # title case words count
        train_df["count_words_title"] = train_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
        test_df["count_words_title"] = test_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
        # Word count percent in each comment:
        train_df['word_unique_pct'] = train_df['count_unique_word'] * 100 / train_df['count_word']
        test_df['word_unique_pct'] = test_df['count_unique_word'] * 100 / test_df['count_word']
        # Punct percent in each comment:
        train_df['punct_pct'] = train_df['count_punctuations'] * 100 / train_df['count_word']
        test_df['punct_pct'] = test_df['count_punctuations'] * 100 / test_df['count_word']
        # Average length of the words
        train_df["mean_word_len"] = train_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
        test_df["mean_word_len"] = test_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
        # upper case words percentage
        train_df["words_upper_pct"] = train_df["count_words_upper"] * 100 / train_df['count_word']
        test_df["words_upper_pct"] = test_df["count_words_upper"] * 100 / test_df['count_word']
        # title case words count
        train_df["words_title_pct"] = train_df["count_words_title"] * 100 / train_df['count_word']
        test_df["words_title_pct"] = test_df["count_words_title"] * 100 / test_df['count_word']
        # remove columns
        train_df = train_df.drop('count_word', 1)
        train_df = train_df.drop('count_unique_word', 1)
        train_df = train_df.drop('count_punctuations', 1)
        train_df = train_df.drop('count_words_upper', 1)
        train_df = train_df.drop('count_words_title', 1)
        test_df = test_df.drop('count_word', 1)
        test_df = test_df.drop('count_unique_word', 1)
        test_df = test_df.drop('count_punctuations', 1)
        test_df = test_df.drop('count_words_upper', 1)
        test_df = test_df.drop('count_words_title', 1)

        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'], tokinizer, wrong_words_dict, regexps, autocorrect=False)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'], tokinizer, wrong_words_dict, regexps, autocorrect=False)

        if reg == 'w':
            # remove all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear_w.csv'), index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear_w.csv'), index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            # split by S+ keep all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'), index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

        if mode == 'preprocess':
            return

    if mode == 'processed':
        if reg == 'w':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() + 3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000
    if char_level:
        max_seq_len = 1200

    train_df['comment_seq'], test_df['comment_seq'], word_index = convert_text2seq(
        train_df['comment_text_clear'].tolist(),
        test_df['comment_text_clear'].tolist(),
        max_words,
        max_seq_len,
        embeds,
        lower=True,
        char_level=char_level,
        uniq=True,
        use_only_exists_words=True,
        position=padding)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    if model != 'mvcnn':
        embedding_matrix, words_not_found = get_embedding_matrix(embed_dim, embeds, max_words, word_index)
        logger.debug('Embedding matrix shape = {}'.format(np.shape(embedding_matrix)))
        logger.debug('Number of null word embeddings = {}'.format(np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    # ====Train/test split data====
    # train/val
    x_aux = np.matrix([
        train_df["word_unique_pct"].tolist(),
        train_df["punct_pct"].tolist(),
        train_df["mean_word_len"].tolist(),
        train_df["words_upper_pct"].tolist(),
        train_df["words_title_pct"].tolist()
    ], dtype='float32').transpose((1, 0))
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, x_aux_train_nn, x_aux_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = \
        split_data(x, np.squeeze(np.asarray(x_aux)), y, test_size=0.2, shuffle=True, random_state=2018)

    # test set
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    test_aux = np.matrix([
        train_df["word_unique_pct"].tolist(),
        train_df["punct_pct"].tolist(),
        train_df["mean_word_len"].tolist(),
        train_df["words_upper_pct"].tolist(),
        train_df["words_title_pct"].tolist()
    ], dtype='float32').transpose((1, 0))
    test_df_seq_aux = np.squeeze(np.asarray(test_aux))
    y_nn = []
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====
    params = Params(config)

    if model_label == None:
        logger.warn('Should choose a model to train')
        return
    if model_label == 'dense':
        model = dense(
            embedding_matrix,
            num_classes,
            max_seq_len,
            dense_dim=params.get('dense').get('dense_dim'),
            n_layers=params.get('dense').get('n_layers'),
            concat=params.get('dense').get('concat'),
            dropout_val=params.get('dense').get('dropout_val'),
            l2_weight_decay=params.get('dense').get('l2_weight_decay'),
            pool=params.get('dense').get('pool'),
            train_embeds=params.get('dense').get('train_embeds'),
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'cnn':
        model = cnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            num_filters=params.get('cnn').get('num_filters'),
            l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
            dropout_val=params.get('cnn').get('dropout_val'),
            dense_dim=params.get('cnn').get('dense_dim'),
            train_embeds=params.get('cnn').get('train_embeds'),
            n_cnn_layers=params.get('cnn').get('n_cnn_layers'),
            pool=params.get('cnn').get('pool'),
            add_embeds=params.get('cnn').get('add_embeds'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'cnn2d':
        model = cnn2d(
            embedding_matrix,
            num_classes,
            max_seq_len,
            num_filters=params.get('cnn2d').get('num_filters'),
            l2_weight_decay=params.get('cnn2d').get('l2_weight_decay'),
            dropout_val=params.get('cnn2d').get('dropout_val'),
            dense_dim=params.get('cnn2d').get('dense_dim'),
            train_embeds=params.get('cnn2d').get('train_embeds'),
            add_embeds=params.get('cnn2d').get('add_embeds'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'lstm':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
            rnn_dim=params.get('lstm').get('rnn_dim'),
            dropout_val=params.get('lstm').get('dropout_val'),
            dense_dim=params.get('lstm').get('dense_dim'),
            n_branches=params.get('lstm').get('n_branches'),
            n_rnn_layers=params.get('lstm').get('n_rnn_layers'),
            n_dense_layers=params.get('lstm').get('n_dense_layers'),
            train_embeds=params.get('lstm').get('train_embeds'),
            mask_zero=params.get('lstm').get('mask_zero'),
            kernel_regularizer=params.get('lstm').get('kernel_regularizer'),
            recurrent_regularizer=params.get('lstm').get('recurrent_regularizer'),
            activity_regularizer=params.get('lstm').get('activity_regularizer'),
            dropout=params.get('lstm').get('dropout'),
            recurrent_dropout=params.get('lstm').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='lstm')
    if model_label == 'gru':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('gru').get('l2_weight_decay'),
            rnn_dim=params.get('gru').get('rnn_dim'),
            dropout_val=params.get('gru').get('dropout_val'),
            dense_dim=params.get('gru').get('dense_dim'),
            n_branches=params.get('gru').get('n_branches'),
            n_rnn_layers=params.get('gru').get('n_rnn_layers'),
            n_dense_layers=params.get('gru').get('n_dense_layers'),
            train_embeds=params.get('gru').get('train_embeds'),
            mask_zero=params.get('gru').get('mask_zero'),
            kernel_regularizer=params.get('gru').get('kernel_regularizer'),
            recurrent_regularizer=params.get('gru').get('recurrent_regularizer'),
            activity_regularizer=params.get('gru').get('activity_regularizer'),
            dropout=params.get('gru').get('dropout'),
            recurrent_dropout=params.get('gru').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='gru')
    if model_label == 'charrnn':
        model = charrnn(
            len(word_index),
            num_classes,
            max_seq_len,
            rnn_dim=params.get('charrnn').get('rnn_dim'),
            dropout_val=params.get('charrnn').get('dropout_val'),
            auxiliary=auxiliary,
            dropout=params.get('charrnn').get('dropout'),
            recurrent_dropout=params.get('charrnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'cnn2rnn':
        model = cnn2rnn(embedding_matrix, num_classes, max_seq_len, rnn_type=rnn_type)
    if model_label == 'dpcnn':
        model = dpcnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            num_filters=params.get('dpcnn').get('num_filters'),
            dense_dim=params.get('dpcnn').get('dense_dim'),
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'rcnn':
        model = rcnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            rnn_dim=params.get('rcnn').get('rnn_dim'),
            dropout_val=params.get('rcnn').get('dropout_val'),
            dense_dim=params.get('rcnn').get('dense_dim'),
            train_embeds=params.get('rcnn').get('train_embeds'),
            auxiliary=auxiliary,
            dropout=params.get('rcnn').get('dropout'),
            recurrent_dropout=params.get('rcnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'capsule':
        model = capsule(
            embedding_matrix,
            num_classes,
            max_seq_len,
            auxiliary=auxiliary,
            Num_capsule=params.get('capsule').get('Num_capsule'),
            Routings=params.get('capsule').get('Routing'),
            add_sigmoid=params.get('capsule').get('add_sigmoid'),
            mask_zero=params.get('capsule').get('mask_zero'),
            gpus=gpus,
            rnn_type='gru')  # lstm may diverge but gru works better
    if model == 'mvcnn':
        embeds_fname1 = "./data/crawl-300d-2M.vec"  # "./data/crawl-300d-2M.vec word2vec-raw.txt
        embeds_fname2 = "./data/glove.840B.300d.txt"
        embeds_fname3 = "./data/GoogleNews-vectors-negative300.bin"
        embed_dim = 300
        embeds1 = Embeds(embeds_fname1, "glove", format='file')
        embeds2 = Embeds(embeds_fname2, "fasttext", format='file')
        embeds3 = Embeds(embeds_fname3, "word2vec", format='binary')
        embedding_matrix1, words_not_found1 = get_embedding_matrix(embed_dim, embeds1, max_words, word_index)
        embedding_matrix2, words_not_found2 = get_embedding_matrix(embed_dim, embeds2, max_words, word_index)
        #embedding_matrix3, words_not_found3 = get_embedding_matrix(embed_dim, embeds3, max_words, word_index)
        model = mvcnn(embedding_matrix1, embedding_matrix2, num_classes, max_seq_len, auxiliary=auxiliary, gpus=gpus)

    # ====k-fold cross validations split data====
    logger.info('Run k-fold cross validation...')
    params = Params(config)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    oof_train = np.zeros((x.shape[0], num_classes))
    oof_test_skf = []
    for i, (train_index, test_index) in enumerate(kf.split(x, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_aux_train, x_test, x_aux_test = x[train_index], x_aux[train_index], x[test_index], x_aux[test_index]
        y_train, y_test = y[train_index], y[test_index]

        logger.info('Start training {}-th fold'.format(i))
        if auxiliary:
            inputs = [x_train, x_aux_train]
            inputs_val = [x_test, x_aux_test]
            output = [test_df_seq, test_df_seq_aux]
        else:
            inputs = x_train
            inputs_val = x_test
            output = test_df_seq

        hist = train(
            x_train=inputs,  # [x_train, x_aux_train] when auxiliary input is allowed.
            y_train=y_train,
            x_val=inputs_val,  # [x_test, x_aux_test],
            y_val=y_test,
            model=model,
            batch_size=params.get(model_label).get('batch_size'),
            num_epochs=params.get(model_label).get('num_epochs'),
            learning_rate=params.get(model_label).get('learning_rate'),
            early_stopping_delta=params.get(model_label).get('early_stopping_delta'),
            early_stopping_epochs=params.get(model_label).get('early_stopping_epochs'),
            use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
            lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
            epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
            model_checkpoint_dir=os.path.join('.', 'model_checkpoint', reg, model_label, embedding_type, padding, str(i)),
            logger=logger)
        model.load_weights(os.path.join('.', 'model_checkpoint', reg, model_label, embedding_type, padding, str(i), 'weights.h5'))

        oof_train[test_index, :] = model.predict(inputs_val)  # model.predict([x_test, x_aux_test])
        proba = model.predict(output)  # model.predict([test_df_seq, test_df_seq_aux])
        oof_test_skf.append(proba)

        result = pd.read_csv("./data/sample_submission.csv")
        result[target_labels] = proba
        ithfold_path = "./cv/{}/{}/{}/{}/{}".format(reg, model_label, embedding_type, padding, i)
        if not os.path.exists(ithfold_path):
            os.makedirs(ithfold_path)
        result.to_csv(os.path.join(ithfold_path, 'sub.csv'), index=False)
        # model.save(os.path.join(ithfold_path,'weights.h5'))

    # dump oof_test and oof_train for later stacking
    # oof_train:
    oof_train_path = "./cv/{}/{}/{}/{}/oof_train".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_train_path):
        os.makedirs(oof_train_path)
    np.savetxt(os.path.join(oof_train_path, "oof_train.csv"), oof_train, fmt='%.24f', delimiter=' ')

    # oof_test: stacking version
    oof_test = np.array(oof_test_skf).mean(axis=0)
    oof_test_path = "./cv/{}/{}/{}/{}/oof_test".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_path):
        os.makedirs(oof_test_path)
    np.savetxt(os.path.join(oof_test_path, "oof_test.csv"), oof_test, fmt='%.24f', delimiter=' ')

    # oof_test: submission version
    result[target_labels] = oof_test
    oof_test_bag_path = "./cv/{}/{}/{}/{}/bagged".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_bag_path):
        os.makedirs(oof_test_bag_path)
    result.to_csv(os.path.join(oof_test_bag_path, "sub.csv"), index=False)

if 2 <= len(args):
    if not (args[1] == 'sentence' or args[1] == 'segmentation' or args[1] == 'utterance' or args[1] == 'segmentation/ans'):
        print('Argument is invalid')
        exit()
else:
    print('Arguments are too short')
    exit()

doc_type = args[1]
doc_num = 'all'
path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

if doc_type == 'sentence':
    data = utils.load_data(path)
    # to sentence
    data = utils.to_sentence(data)
    docs = [row[1] for row in data]
if doc_type == 'utterance':
    data = utils.load_data(path)
    docs = [row[1] for row in data]
elif doc_type == 'segmentation' or doc_type == 'segmentation/ans':
    ans = False
    if doc_type == 'segmentation/ans':
        ans = True
    if doc_num == 'all':
        doc_num = '26'
    data_arr = []

def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False, random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return

    # unreachable after the return above: split texts by predicted label for stats()
    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])
    stats(negatives, positives)
    return
    """