Example #1
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)
    
    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print "numbers of files are not same"
        sys.exit('System will exit')
    
    all_clf_topic = {}
    if fun == 0:
        irange = range(0, N)
    # acm-class starts from 1998
    elif fun == 1:
        irange = range(5, N)
    for i in irange:
        prob = ioFile.load_object(probFiles[i])
        topics = ioFile.load_object(topicFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])
        
        year = probFiles[i][-8:-4]
        topic_index = np.squeeze(np.array(prob.argmax(1)))
        doc_topic = topic_index
        #doc_topic = []
        #[doc_topic.append(' '.join(topics[index])) for index in topic_index]
 
        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        clf_topic = topicOfClassification(unique_clf, all_clf, clf_dict, doc_topic, fun)
        
        all_clf_topic[year] = clf_topic
    
    return all_clf_topic
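
For context, a minimal usage sketch of the function above; the directory paths and the pickle filename are hypothetical, and the fun argument follows the convention visible in the code (0 for arxiv-category, 1 for acm-class):

# hypothetical paths and pickle name, for illustration only
clf_dict = ioFile.load_object('acm_class_dict.pkl')
all_clf_topic = topicOfClassificationForAllYear('prob', 'model', 'class',
                                                clf_dict, 1)  # 1 = acm-class
for year in sorted(all_clf_topic):
    print(year, len(all_clf_topic[year]))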
Example #2
def skip_grams_with_label(index2word,
                          sentences,
                          window,
                          vocab_size,
                          nb_negative_samples=5.):
    import keras.preprocessing.sequence as seq
    import numpy as np

    print('building skip-grams and labels...')

    def sg(sentence):
        return seq.skipgrams(sentence,
                             vocab_size,
                             window_size=np.random.randint(window - 1) + 1,
                             negative_samples=nb_negative_samples)

    couples = []
    labels = []

    # concat all skipgrams
    for cpl, lbl in map(sg, sentences):
        couples.extend(cpl)
        labels.extend(lbl)

    true_label = load_object('labels.pkl')

    return np.asarray(couples), np.asarray([labels, true_label]).T
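
To make the data layout concrete: keras.preprocessing.sequence.skipgrams returns (word, context) index pairs together with 1/0 labels for observed and negatively sampled pairs, which is exactly what the couples/labels lists above accumulate. A small self-contained demo:

import keras.preprocessing.sequence as seq

# token ids must be smaller than vocabulary_size; 0 is reserved for padding
sentence = [1, 2, 3, 4, 5]
couples, labels = seq.skipgrams(sentence, vocabulary_size=6,
                                window_size=2, negative_samples=1.0)
for (w, c), label in zip(couples, labels):
    print(w, c, label)  # label 1 = observed pair, 0 = sampled negative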
Example #3
def skip_grams_with_cilin(index2word,
                          sentences,
                          window,
                          vocab_size,
                          nb_negative_samples=5.):
    import keras.preprocessing.sequence as seq
    import numpy as np

    print('building skip-grams and labels...')

    def sg(sentence):
        return seq.skipgrams(sentence,
                             vocab_size,
                             window_size=np.random.randint(window - 1) + 1,
                             negative_samples=nb_negative_samples)

    couples = []
    labels = []

    # concat all skipgrams
    for cpl, lbl in map(sg, sentences):
        couples.extend(cpl)
        labels.extend(lbl)

    cs = load_object('object.pkl')

    cilin_dist = []
    for word, context_word in couples:
        sim = cs.similarity(index2word[word], index2word[context_word])
        cilin_dist.append(sim)
        if len(cilin_dist) % 10000 == 0:
            print(len(cilin_dist))

    return np.asarray(couples), np.asarray([labels, cilin_dist]).T
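
The cs object unpickled from object.pkl only needs to expose a similarity(word1, word2) method returning a float; the class below is a hypothetical stand-in that sketches this assumed interface, not the real Cilin implementation:

class CilinSimilarity(object):
    # hypothetical stand-in; only the call contract used above is kept
    def __init__(self, code_dict):
        # code_dict: word -> Cilin sense codes (details not shown in the snippet)
        self.code_dict = code_dict

    def similarity(self, word1, word2):
        if word1 == word2:
            return 1.0
        if word1 not in self.code_dict or word2 not in self.code_dict:
            return 0.0
        # placeholder: a real implementation compares Cilin codes
        # level by level to produce a graded similarity
        return 0.5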
Example #4
def train_model_for_big_data(data_dir='data'):
    model = Model(inputs=[input_pvt, input_ctx], outputs=predictions)
    model.compile(optimizer='rmsprop', loss=loss_with_cilin, metrics=[metric_with_cilin])
    fname_list = traverseDirectory(path.join(data_dir, 'couples'))
    isFirst = True
    for fname in fname_list:
        print(fname)
        couples = load_object(fname)
        # extract the numeric index from 'couples_<index>.pkl';
        # lstrip/rstrip strip character sets, not substrings, so slice instead
        index = fname[fname.find('couples_') + len('couples_'):-len('.pkl')]
        fname = path.join(data_dir, 'labels', 'labels_' + index + '.pkl')
        print(fname)
        labels = load_object(fname)
        nb_batch = len(labels) // batch_size
        if not isFirst:
            model.set_weights(weights)
        model.fit_generator(generator=batch_generator(couples, labels, nb_batch),
                            steps_per_epoch=nb_batch,
                            epochs=nb_epoch, verbose=1, workers=1)
        weights = model.get_weights()
        isFirst = False
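
batch_generator is not shown in the snippet; below is a minimal sketch of a generator compatible with the fit_generator call above, assuming couples is an (n, 2) array of (pivot, context) indices and batch_size is the module-level constant. It loops forever, as Keras generators must:

def batch_generator(couples, labels, nb_batch):
    # hypothetical sketch of the undisclosed generator
    couples = np.asarray(couples)
    labels = np.asarray(labels)
    while True:
        for i in range(nb_batch):
            batch = slice(i * batch_size, (i + 1) * batch_size)
            yield [couples[batch, 0], couples[batch, 1]], labels[batch]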
Example #6
                      dest='output',
                      help='fileName',
                      default=None)

(options, args) = optparser.parse_args()

if options.input is None:
    fname = sys.stdin
else:
    fname = options.input

if options.num is None:
    # read the term count from stdin when no option is given
    n_all_term = int(sys.stdin.readline())
else:
    n_all_term = int(options.num)

if options.output is None:
    outFile = 'convert_prob.pkl'
else:
    outFile = options.output

prob = ioFile.load_object(fname)
convert_prob = convertProb(prob, n_all_term)
# print(convert_prob.shape)
ioFile.save_object(convert_prob, outFile)
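
The option definitions above are truncated; judging from the options used later (input, num, output), the elided optparse setup presumably resembles the hypothetical sketch below (the flag letters are guesses):

import optparse

optparser = optparse.OptionParser()
# hypothetical reconstruction of the truncated definitions
optparser.add_option('-i', '--input', dest='input',
                     help='fileName', default=None)
optparser.add_option('-n', '--num', dest='num',
                     help='number of terms', default=None)
optparser.add_option('-o', '--output', dest='output',
                     help='fileName', default=None)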
Example #7
                         help='filename',
                         default=None)

    (options, args) = optparser.parse_args()

    if options.distance is None:
        print('No distance directory specified, system will exit\n')
        sys.exit('System will exit')
    else:
        distanceDir = options.distance

    if options.clf is None:
        print('No classification filename specified, system will exit\n')
        sys.exit('System will exit')
    else:
        clf_list = ioFile.load_object(options.clf)

    if options.clf_topic is None:
        print('No clf_topic filename specified, system will exit\n')
        sys.exit('System will exit')
    else:
        clf_topic = ioFile.load_object(options.clf_topic)

    if options.output is None:
        output = 'topic_glm.csv'
    else:
        output = options.output

    start_time = datetime.datetime.now()

    distance_list = {}
Example #8
    if options.class_name is None:
        print('No name of the category specified, system will exit\n')
        sys.exit('System will exit')
    else:
        if options.class_name == 'arxiv-category':
            fun = 0
        elif options.class_name == 'acm-class':
            fun = 1
        else:
            print('Name of the category is incorrect, system will exit\n')
            sys.exit('System will exit')
                
    if options.clf_dict is None:
        if options.class_name == 'acm-class':
            print('No class dict filename specified, system will exit\n')
            sys.exit('System will exit')
    else:
        acm_class_dict = ioFile.load_object(options.clf_dict)

    if options.output is None:
        year = options.input[-8:-4]
        if fun == 0:
            outFile = 'arxiv-category_' + year + '.txt'
        elif fun == 1:
            outFile = 'acm-class_' + year + '.txt'
    else:
        outFile = options.output
        
    data_iterator = inFile_ref

    clf_dict = dict()
    for line in data_iterator:
        line = line.split('\t')
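
The loop above is cut off after the split; a plausible completion, assuming each tab-separated line maps a class code to its name (both field positions are assumptions):

clf_dict = dict()
for line in data_iterator:
    fields = line.rstrip('\n').split('\t')
    # hypothetical: first field is the class code, second its name
    clf_dict[fields[0]] = fields[1]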
Example #10
    if options.prob is None:
        inFile = sys.stdin
    else:
        inFile = options.prob

    if options.vocabulary is not None:
        fname = options.vocabulary
        all_term = allTerm(fname)
    else:
        fname = None
        all_term = None

    if options.output is None:
        outFile = 'topic.pkl'
    else:
        outFile = options.output

    prob = ioFile.load_object(inFile)

    all_topic = []
    nTopic, nTerm = prob.shape
    for i in range(0, nTopic):
        topic = topNTerm(5, prob[i, :].reshape(nTerm, 1), 1, all_term)
        all_topic.append(topic)

    ioFile.save_object(all_topic, outFile)
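
topNTerm is defined elsewhere; below is a hypothetical sketch consistent with the call above, returning the n most probable entries of an (nTerm, 1) probability column, mapped through all_term when a vocabulary is supplied (the real function's third argument is not documented in the snippet, so it is accepted but unused):

def topNTerm(n, prob, axis, all_term):
    # hypothetical sketch: indices of the n largest probabilities, descending
    idx = np.argsort(prob.ravel())[::-1][:n]
    if all_term is None:
        return list(idx)
    return [all_term[i] for i in idx]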
Example #12
    if options.input_f is None:
        fname_f = sys.stdin
    else:
        fname_f = options.input_f

    if options.input_g is None:
        fname_g = sys.stdin
    else:
        fname_g = options.input_g

    if options.output is None:
        outFile = 'distance.pkl'
    else:
        outFile = options.output

    prob_f = ioFile.load_object(fname_f)
    prob_g = ioFile.load_object(fname_g)

    all_distance, count = distanceBetweenTwoYears(prob_f, prob_g)

    print(count, len(all_distance))

    ioFile.save_object(all_distance, outFile)
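
distanceBetweenTwoYears is not shown; the sketch below is one plausible reading, assuming prob_f and prob_g are (nTopic, nTerm) topic-term matrices and using the Hellinger distance between topic rows (the actual metric is an assumption):

import numpy as np

def distanceBetweenTwoYears(prob_f, prob_g):
    # hypothetical: pairwise Hellinger distance between all topic pairs
    all_distance = {}
    count = 0
    for i in range(prob_f.shape[0]):
        for j in range(prob_g.shape[0]):
            d = np.sqrt(0.5 * np.sum(
                (np.sqrt(prob_f[i]) - np.sqrt(prob_g[j])) ** 2))
            all_distance[(i, j)] = d
            count += 1
    return all_distance, count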
Example #13
        nb_batch = len(labels) // batch_size
        if not isFirst:
            model.set_weights(weights)
        model.fit_generator(generator=batch_generator(couples, labels, nb_batch),
                            steps_per_epoch=nb_batch,
                            epochs=nb_epoch, verbose=1, workers=1)
        weights = model.get_weights()
        isFirst = False


# load data
# - sentences: list of (list of word-id)
# - index2word: list of string
#sentences, index2word = utils.load_sentences(data_file)
#sentences = load_object('sentences.pkl')
index2word = load_object('index2word.pkl')
# params
nb_epoch = 3
# learn `batch_size words` at a time
batch_size = 60
vec_dim = 50
# context radius on each side of the pivot, i.e. half of the full window
window_size = 5
vocab_size = len(index2word)

# create input
#data_size = skip_grams_with_cilin_for_big_data(index2word, sentences, window, vocab_size)
#print data_size

data_size = 268350544
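
Example #4 and Example #13 reference input_pvt, input_ctx and predictions without defining them; the sketch below shows one conventional skip-gram wiring that matches those names, assuming the usual two-embedding dot product (the custom loss_with_cilin and metric_with_cilin are not reproduced here):

from keras.layers import Input, Embedding, Flatten, dot
from keras.models import Model

# hypothetical wiring for the undefined names used in Example #4
input_pvt = Input(shape=(1,), dtype='int32')
input_ctx = Input(shape=(1,), dtype='int32')

embedded_pvt = Embedding(input_dim=vocab_size, output_dim=vec_dim,
                         input_length=1)(input_pvt)
embedded_ctx = Embedding(input_dim=vocab_size, output_dim=vec_dim,
                         input_length=1)(input_ctx)

# one similarity score per (pivot, context) pair
merged = dot([embedded_pvt, embedded_ctx], axes=-1)
predictions = Flatten()(merged)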