def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Build a {year: classification-topic result} mapping over all yearly files.

    For each selected year the document/topic probability object is loaded,
    every document is assigned its arg-max topic, and the result is combined
    with that year's classification data via ``classificationOfDocument`` and
    ``topicOfClassification``.

    Args:
        probDir: directory scanned with ``fileSys.traverseDirectory`` for the
            per-year probability objects.
        modelDir: directory scanned with ``fileSys.traverseTopicDirecotry``;
            its files are only used for the file-count consistency check.
        classDir: directory scanned for the per-year classification files.
        clf_dict: passed through unchanged to the two helper functions.
        fun: mode selector. ``0`` processes every file; ``1`` skips the first
            five files (original note: "acm-class start from 1998").

    Returns:
        dict keyed by ``probFiles[i][-8:-4]`` (the four characters before the
        last four of each probability file path, used as the year).

    Raises:
        ValueError: if ``fun`` is neither 0 nor 1.
        SystemExit: if the three directories do not hold the same number of
            files.
    """
    # Validate the mode before doing any I/O. Previously an unexpected value
    # left the index range unbound and only failed later with a NameError.
    if fun == 0:
        first = 0
    elif fun == 1:
        # acm-class start from 1998
        first = 5
    else:
        raise ValueError('fun must be 0 or 1, got %r' % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first, N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])
        year = probFiles[i][-8:-4]

        # One topic index per row of prob: the column with the highest value.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)

    return all_clf_topic
def skip_grams_with_label(index2word, sentences, window, vocab_size, nb_negative_samples=5.):
    """Generate skip-gram couples and pair their labels with stored labels.

    Each sentence is passed to ``keras.preprocessing.sequence.skipgrams`` with
    a window size of ``np.random.randint(window - 1) + 1``, drawn anew per
    sentence. All couples and labels are concatenated in sentence order.

    Args:
        index2word: not used by this function.
        sentences: iterable of sentences accepted by ``skipgrams``.
        window: bound used when drawing the per-sentence window size.
        vocab_size: vocabulary size forwarded to ``skipgrams``.
        nb_negative_samples: forwarded as ``negative_samples``.

    Returns:
        ``(couples, labels)`` where ``couples`` is ``np.asarray`` of all
        couples and ``labels`` is the transpose of
        ``np.asarray([labels, true_label])``; ``true_label`` is loaded from
        ``'labels.pkl'``.
        NOTE(review): presumably ``true_label`` has one entry per couple --
        confirm against whatever wrote ``labels.pkl``.
    """
    import keras.preprocessing.sequence as seq

    print('building skip-grams and labels...')

    couples = []
    labels = []
    # concat all skipgrams
    for sentence in sentences:
        span = np.random.randint(window - 1) + 1
        pairs, flags = seq.skipgrams(sentence,
                                     vocab_size,
                                     window_size=span,
                                     negative_samples=nb_negative_samples)
        couples.extend(pairs)
        labels.extend(flags)

    true_label = load_object('labels.pkl')
    stacked = np.asarray([labels, true_label])
    return np.asarray(couples), stacked.T
def skip_grams_with_cilin(index2word, sentences, window, vocab_size, nb_negative_samples=5.):
    """Generate skip-gram couples and pair each label with a similarity score.

    Couples and labels are produced as in a plain skip-gram pass: one
    ``skipgrams`` call per sentence with a window size of
    ``np.random.randint(window - 1) + 1``. Every couple is then scored with
    the ``similarity`` method of the object loaded from ``'object.pkl'``,
    using the two words looked up in ``index2word``.

    Args:
        index2word: indexable mapping from word id to the word handed to
            ``similarity``.
        sentences: iterable of sentences accepted by ``skipgrams``.
        window: bound used when drawing the per-sentence window size.
        vocab_size: vocabulary size forwarded to ``skipgrams``.
        nb_negative_samples: forwarded as ``negative_samples``.

    Returns:
        ``(couples, targets)`` where ``couples`` is ``np.asarray`` of all
        couples and ``targets`` is the transpose of
        ``np.asarray([labels, cilin_dist])``.
    """
    import keras.preprocessing.sequence as seq
    import numpy as np

    print('building skip-grams and labels...')

    couples = []
    labels = []
    # concat all skipgrams
    for sentence in sentences:
        span = np.random.randint(window - 1) + 1
        pairs, flags = seq.skipgrams(sentence,
                                     vocab_size,
                                     window_size=span,
                                     negative_samples=nb_negative_samples)
        couples.extend(pairs)
        labels.extend(flags)

    cs = load_object('object.pkl')
    cilin_dist = []
    for done, (word, context_word) in enumerate(couples, 1):
        cilin_dist.append(
            cs.similarity(index2word[word], index2word[context_word]))
        # Progress report after every 10000 scored couples.
        if done % 10000 == 0:
            print(done)

    targets = np.asarray([labels, cilin_dist])
    return np.asarray(couples), targets.T
def train_model_for_big_data(data_dir='data'):
    """Train the model chunk by chunk from pickled couples/labels files.

    Every file found under ``<data_dir>/couples`` is loaded, the matching
    ``<data_dir>/labels/labels_<index>.pkl`` is loaded, and the same model
    instance is fitted on that chunk, so training continues from one chunk to
    the next.

    Relies on module-level names defined elsewhere in the file:
    ``input_pvt``, ``input_ctx``, ``predictions``, ``loss_with_cilin``,
    ``metric_with_cilin``, ``batch_size``, ``nb_epoch``, ``batch_generator``,
    ``traverseDirectory`` and ``load_object``.

    Args:
        data_dir: directory containing the ``couples`` and ``labels``
            sub-directories.

    Returns:
        The trained model (previously nothing was returned, so the trained
        weights were unreachable by the caller).
    """
    model = Model(inputs=[input_pvt, input_ctx], outputs=predictions)
    model.compile(optimizer='rmsprop', loss=loss_with_cilin,
                  metrics=[metric_with_cilin])

    prefix = 'couples_'
    suffix = '.pkl'

    for couples_fname in traverseDirectory(path.join(data_dir, 'couples')):
        print(couples_fname)
        couples = load_object(couples_fname)

        # Extract <index> from ".../couples_<index>.pkl".
        # str.lstrip/rstrip remove *character sets*, not a prefix/suffix, so
        # the old lstrip('couples_').rstrip('.pkl') could eat characters of
        # the index itself. Searching the basename also avoids matching a
        # 'couples_' that happens to appear in a directory name.
        index = path.basename(couples_fname)
        start = index.find(prefix)
        if start != -1:
            index = index[start + len(prefix):]
        if index.endswith(suffix):
            index = index[:-len(suffix)]

        labels_fname = path.join(data_dir, 'labels', 'labels_' + index + '.pkl')
        print(labels_fname)
        labels = load_object(labels_fname)

        # metrics
        nb_batch = len(labels) // batch_size
        samples_per_epoch = batch_size * nb_batch

        # The same model object is reused for every chunk, so its weights
        # carry over by themselves. The former isFirst/set_weights/get_weights
        # bookkeeping never ran (isFirst was never cleared) and was removed.
        #
        # NOTE(review): steps_per_epoch counts generator batches in Keras 2;
        # passing the sample count makes each epoch batch_size times longer
        # than one pass over the chunk. Left unchanged -- confirm whether
        # nb_batch was intended.
        model.fit_generator(
            generator=batch_generator(couples, labels, nb_batch),
            steps_per_epoch=samples_per_epoch,
            epochs=nb_epoch,
            verbose=1,
            workers=1)

    return model
dest='output', help='fileName', default=None) (options, args) = optparser.parse_args() if options.input is None: fname = sys.stdin elif options.input is not None: fname = options.input else: print 'No filename(.pkl) specified, system with exit\n' sys.exit('System will exit') if options.num is None: n_all_term = sys.stdin elif options.num is not None: n_all_term = int(options.num) else: print 'No number of conversion specified, system with exit\n' sys.exit('System will exit') if options.output is None: outFile = 'convert_prob.pkl' elif options.output is not None: outFile = options.output prob = ioFile.load_object(fname) convert_prob = convertProb(prob, n_all_term) #print convert_prob.shape ioFile.save_object(convert_prob, outFile)
help='filename', default=None) (options, args) = optparser.parse_args() if options.distance is None: print 'No distance directory specified, system with exit\n' sys.exit('System will exit') elif options.distance is not None: distanceDir = options.distance if options.clf is None: print 'No classification filename specified, system with exit\n' sys.exit('System will exit') elif options.clf is not None: clf_list = ioFile.load_object(options.clf) if options.clf_topic is None: print 'No clf_topic filename specified, system with exit\n' sys.exit('System will exit') elif options.clf_topic is not None: clf_topic = ioFile.load_object(options.clf_topic) if options.output is None: output = 'topic_glm.csv' elif options.output is not None: output = options.output start_time = datetime.datetime.now() distance_list = {}
sys.exit('System will exit') else: if options.class_name == 'arxiv-category': fun = 0 elif options.class_name == 'acm-class': fun = 1 else: print 'Name of the category is incorrect, system with exit\n' sys.exit('System will exit') if options.clf_dict is None: if options.class_name == 'acm-class': print 'No class dict filename specified, system with exit\n' sys.exit('System will exit') else: acm_class_dict = ioFile.load_object(options.clf_dict) if options.output is None: year = options.input[-8:-4] if fun == 0: outFile = 'arxiv-category_' + year + '.txt' elif fun == 1: outFile = 'acm-class_' + year + '.txt' elif options.output is not None: outFile = options.output data_iterator = inFile_ref clf_dict = dict() for line in data_iterator: line = line.split('\t')
fname_f = sys.stdin elif options.input_f is not None: fname_f = options.input_f else: print 'No filename(.pkl) specified, system with exit\n' sys.exit('System will exit') if options.input_g is None: fname_g = sys.stdin elif options.input_g is not None: fname_g = options.input_g else: print 'No filename(.pkl) specified, system with exit\n' sys.exit('System will exit') if options.output is None: outFile = 'distance.pkl' elif options.output is not None: outFile = options.output prob_f = ioFile.load_object(fname_f) prob_g = ioFile.load_object(fname_g) all_distance, count = distanceBetweenTwoYears(prob_f, prob_g) print count, len(all_distance) ioFile.save_object(all_distance, outFile)
# Script tail: load a topic/term probability matrix, take the top terms of
# every topic and save the resulting list.

# Source of the probability object: options.prob when given, otherwise stdin.
# The former third branch (error message + exit) could never run because
# "is None" / "is not None" already cover every case, so it was removed.
# NOTE(review): sys.stdin is handed to ioFile.load_object below -- confirm
# that helper accepts a file object as well as a filename.
if options.prob is None:
    inFile = sys.stdin
else:
    inFile = options.prob

# Optional vocabulary; without it topNTerm receives all_term=None.
if options.vocabulary is not None:
    fname = options.vocabulary
    all_term = allTerm(fname)
else:
    fname = None
    all_term = None

if options.output is None:
    outFile = 'topic.pkl'
else:
    outFile = options.output

prob = ioFile.load_object(inFile)
nTopic, nTerm = prob.shape

all_topic = []
for i in range(nTopic):
    # Row i is reshaped to a column (nTerm x 1) before being handed to topNTerm.
    topic = topNTerm(5, prob[i, :].reshape(nTerm, 1), 1, all_term)
    all_topic.append(topic)

ioFile.save_object(all_topic, outFile)
inFile = sys.stdin elif options.prob is not None: inFile = options.prob else: print 'No filename specified, system with exit\n' sys.exit('System will exit') if options.vocabulary is not None: fname = options.vocabulary all_term = allTerm(fname) else: fname = None all_term = None if options.output is None: outFile = 'topic.pkl' elif options.output is not None: outFile = options.output prob = ioFile.load_object(inFile) all_topic = [] nTopic, nTerm = prob.shape for i in range(0, nTopic): topic = topNTerm(5, prob[i,:].reshape(nTerm,1), 1, all_term) all_topic.append(topic) ioFile.save_object(all_topic, outFile)
# Script tail: load two probability objects, compute the distances between
# them and save the result.

# Each input falls back to stdin when its option is missing. The former
# "else" branches (error message + exit) were unreachable because
# "is None" / "is not None" already cover every case, so they were removed.
# NOTE(review): sys.stdin is handed to ioFile.load_object below -- confirm
# that helper accepts a file object as well as a filename.
if options.input_f is None:
    fname_f = sys.stdin
else:
    fname_f = options.input_f

if options.input_g is None:
    fname_g = sys.stdin
else:
    fname_g = options.input_g

if options.output is None:
    outFile = 'distance.pkl'
else:
    outFile = options.output

prob_f = ioFile.load_object(fname_f)
prob_g = ioFile.load_object(fname_g)

all_distance, count = distanceBetweenTwoYears(prob_f, prob_g)
# Same output as the Python 2 statement "print count, len(all_distance)".
print('%s %s' % (count, len(all_distance)))

ioFile.save_object(all_distance, outFile)
nb_batch = len(labels) // batch_size samples_per_epoch = batch_size * nb_batch if not isFirst: model.set_weights(weights) model.fit_generator(generator=batch_generator(couples, labels, nb_batch), steps_per_epoch=samples_per_epoch, epochs=nb_epoch, verbose=1, workers=1) weights = model.get_weights() # load data # - sentences: list of (list of word-id) # - index2word: list of string #sentences, index2word = utils.load_sentences(data_file) #sentences = load_object('sentences.pkl') index2word = load_object('index2word.pkl') # params nb_epoch = 3 # learn `batch_size words` at a time batch_size = 60 vec_dim = 50 # half of window window_size = 5 vocab_size = len(index2word) # create input #data_size = skip_grams_with_cilin_for_big_data(index2word, sentences, window, vocab_size) #print data_size data_size = 268350544