Пример #1
0
def call_matlab(stat, m, kappa):
    """Run MATLAB's ``approx_cca`` on ``stat`` and sort the result by frequency.

    The output directory is ``output/<name>.m<m>.kappa<kappa>.matlab.out``,
    where ``<name>`` is the last path component of ``complete_path(stat)``.
    It is created if missing. After MATLAB finishes, the ``Ur`` file in
    that directory is rewritten in place with its rows ordered by
    descending frequency.

    Args:
        stat: Directory holding the statistics; ``wordmap`` and ``X`` are
            read from it, and it is passed verbatim to ``approx_cca``.
        m: First numeric argument forwarded to ``approx_cca``; must not be
            None.
        kappa: Second numeric argument forwarded to ``approx_cca``; must not
            be None.

    Returns:
        The path of the output directory.
    """
    assert(m is not None and kappa is not None)

    # complete_path() presumably returns the path with a trailing '/', which
    # is stripped before taking the last component -- TODO confirm.
    outdirname = 'output/{}.m{}.kappa{}.matlab.out'.format(
        complete_path(stat)[:-1].rsplit('/', 1)[1], m, kappa)
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)

    # NOTE(review): the command is assembled by string concatenation and run
    # through the shell, and its exit status is discarded; `stat` must come
    # from a trusted source.
    commandstr = (matlab + ' -nojvm -nodisplay -nosplash -r ' +
                  '\"approx_cca(\'' + stat + '\',' + str(m) + ',' +
                  str(kappa) + ',\'' + outdirname + '\')\"')
    os.system(commandstr)

    say('Postprocessing to sort rows by frequency...')
    wordmap = read_wordmap(os.path.join(stat, 'wordmap'))
    freqmap = read_freqmap(os.path.join(stat, 'X'))
    # sorted() is stable, so ties keep wordmap's iteration order, exactly as
    # when sorting explicit (index, frequency) pairs on the frequency.
    sorted_indices = sorted(wordmap, key=lambda i: freqmap[i], reverse=True)

    ur_path = os.path.join(outdirname, 'Ur')
    # Read everything and close the handle before reopening the same path
    # for writing; the original left the read handle open.
    with open(ur_path) as inf:
        lines = inf.readlines()
    with open(ur_path, 'wb') as outf:
        for i in sorted_indices:
            # wordmap keys double as 0-based line numbers into Ur.
            write_row(outf, freqmap[i], wordmap[i], lines[i].split())

    return outdirname
Пример #2
0
def call_matlab(stat, m, kappa):
    """Invoke MATLAB's ``approx_cca`` and reorder its ``Ur`` output by frequency.

    Results go to ``output/<name>.m<m>.kappa<kappa>.matlab.out`` (created on
    demand), with ``<name>`` taken from the last component of
    ``complete_path(stat)``. Once MATLAB returns, ``Ur`` in that directory
    is rewritten in place so that its rows appear in order of descending
    frequency.

    Args:
        stat: Statistics directory; supplies the ``wordmap`` and ``X`` files
            and is passed unchanged to ``approx_cca``.
        m: First numeric argument handed to ``approx_cca``; must not be None.
        kappa: Second numeric argument handed to ``approx_cca``; must not be
            None.

    Returns:
        The output directory path.
    """
    assert(m is not None and kappa is not None)

    # The [:-1] drops what is presumably a trailing '/' added by
    # complete_path() -- TODO confirm against that helper.
    outdirname = 'output/{}.m{}.kappa{}.matlab.out'.format(
        complete_path(stat)[:-1].rsplit('/', 1)[1], m, kappa)
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)

    # NOTE(review): this is a shell command built by concatenation and its
    # exit status is ignored; only pass trusted values for `stat`.
    commandstr = (matlab + ' -nojvm -nodisplay -nosplash -r ' +
                  '\"approx_cca(\'' + stat + '\',' + str(m) + ',' +
                  str(kappa) + ',\'' + outdirname + '\')\"')
    os.system(commandstr)

    say('Postprocessing to sort rows by frequency...')
    wordmap = read_wordmap(os.path.join(stat, 'wordmap'))
    freqmap = read_freqmap(os.path.join(stat, 'X'))
    # Stable sort on frequency: equal frequencies keep wordmap's iteration
    # order, matching a sort over explicit (index, frequency) pairs.
    sorted_indices = sorted(wordmap, key=lambda i: freqmap[i], reverse=True)

    ur_path = os.path.join(outdirname, 'Ur')
    # Close the reader before truncating the same file for writing; the
    # previous one-liner never closed it.
    with open(ur_path) as inf:
        lines = inf.readlines()
    with open(ur_path, 'wb') as outf:
        for i in sorted_indices:
            # Each wordmap key is also the 0-based row number in Ur.
            write_row(outf, freqmap[i], wordmap[i], lines[i].split())

    return outdirname
Пример #3
0
    def get_stat(self, stat):
        """Load the count statistics stored under ``stat`` onto this object.

        Reads the ``XY``, ``X``, ``Y`` and ``wordmap`` files whose paths are
        formed by appending those names to ``complete_path(stat)``. Integer
        ids found in the files are shifted down by one before use.

        Sets:
            self.stat: The completed path prefix.
            self.wordmap: Dict mapping shifted id -> second token of each
                ``wordmap`` line.
            self.countXY: ``csc_matrix`` of the counts from ``XY``.
            self.countX, self.countY: Arrays of per-id counts from ``X``
                and ``Y``.
            self.num_samples: Float sum of all counts in ``X``.

        If a ``pickle`` file already exists next to the statistics, the four
        count values are loaded from it and the text files are not parsed;
        otherwise they are parsed and the pickle is written for next time.

        Args:
            stat: Location of the statistics, passed to ``complete_path``.
        """
        # The names are appended with no separator, so complete_path()
        # presumably returns a prefix ending in '/' -- TODO confirm.
        self.stat = complete_path(stat)
        XYstats = self.stat + 'XY'
        Xstats = self.stat + 'X'
        Ystats = self.stat + 'Y'

        assert(os.path.isfile(XYstats) and
               os.path.isfile(Xstats) and
               os.path.isfile(Ystats))

        say('XYstats: {}'.format(XYstats))
        say('Xstats: {}'.format(Xstats))
        say('Ystats: {}'.format(Ystats))
        self.wordmap = {}
        wordmapf = self.stat + 'wordmap'
        with open(wordmapf) as f:
            for line in f:
                toks = line.split()
                self.wordmap[int(toks[0])-1] = toks[1]

        pickle_file = self.stat + 'pickle'
        if os.path.isfile(pickle_file):
            # The cache is written in binary mode with a binary protocol, so
            # it must be read in binary mode as well.
            # NOTE(security): unpickling can execute arbitrary code; only
            # load cache files from a trusted statistics directory.
            with open(pickle_file, 'rb') as f:
                self.countXY, self.countX, self.countY, self.num_samples = \
                    cPickle.load(f)
            return

        self.countXY = Counter()
        self.countX = Counter()
        self.countY = Counter()
        self.num_samples = 0.

        num_lines = wc_l(XYstats)
        with open(XYstats) as f:
            for linenum, line in enumerate(f, 1):
                toks = line.split()
                x, y, count = int(toks[0])-1, int(toks[1])-1, int(toks[2])
                self.countXY[x, y] = count
                # Compare by value: `is 0` only worked through CPython's
                # small-integer caching.
                if linenum % 1000 == 0:
                    inline_print('Processing line %i of %i' %
                                 (linenum, num_lines))

        with open(Xstats) as f:
            for line in f:
                toks = line.split()
                x, count = int(toks[0])-1, int(toks[1])
                self.countX[x] = count
                self.num_samples += count

        with open(Ystats) as f:
            for line in f:
                toks = line.split()
                y, count = int(toks[0])-1, int(toks[1])
                self.countY[y] = count

        inline_print('\nConstructing matrices\n')
        # The matrix shape is the number of distinct ids seen in X and Y, and
        # the arrays are indexed 0..len-1; ids missing from that range read
        # as 0 because Counter returns 0 for absent keys.
        self.countXY = csc_matrix((self.countXY.values(),
                                   zip(*self.countXY.keys())),
                                  shape=(len(self.countX), len(self.countY)))
        self.countX = array([self.countX[i] for i in range(len(self.countX))])
        self.countY = array([self.countY[i] for i in range(len(self.countY))])

        with open(pickle_file, 'wb') as outf:
            cPickle.dump((self.countXY, self.countX, self.countY,
                          self.num_samples), outf,
                         protocol=cPickle.HIGHEST_PROTOCOL)
Пример #4
0
    def get_stat(self, stat):
        """Populate this object with the count statistics found under ``stat``.

        The ``XY``, ``X``, ``Y`` and ``wordmap`` files are located by
        appending those names to ``complete_path(stat)``. Every integer id
        read from them has one subtracted before it is used as a key.

        Sets:
            self.stat: The completed path prefix.
            self.wordmap: Dict from shifted id to the second token on each
                ``wordmap`` line.
            self.countXY: ``csc_matrix`` built from the ``XY`` counts.
            self.countX, self.countY: Arrays of per-id counts from ``X``
                and ``Y``.
            self.num_samples: Float total of the counts in ``X``.

        When a ``pickle`` file is present alongside the statistics, the four
        count values come from it and parsing is skipped; otherwise the text
        files are parsed and the pickle is written as a cache.

        Args:
            stat: Location of the statistics, handed to ``complete_path``.
        """
        # File names are concatenated directly, so complete_path() presumably
        # yields a prefix that ends with '/' -- TODO confirm.
        self.stat = complete_path(stat)
        XYstats = self.stat + 'XY'
        Xstats = self.stat + 'X'
        Ystats = self.stat + 'Y'

        assert(os.path.isfile(XYstats) and
               os.path.isfile(Xstats) and
               os.path.isfile(Ystats))

        say('XYstats: {}'.format(XYstats))
        say('Xstats: {}'.format(Xstats))
        say('Ystats: {}'.format(Ystats))
        self.wordmap = {}
        wordmapf = self.stat + 'wordmap'
        with open(wordmapf) as f:
            for line in f:
                toks = line.split()
                self.wordmap[int(toks[0])-1] = toks[1]

        pickle_file = self.stat + 'pickle'
        if os.path.isfile(pickle_file):
            # Read in binary mode to match the binary-protocol dump below.
            # NOTE(security): pickle can run arbitrary code on load; the
            # statistics directory must be trusted.
            with open(pickle_file, 'rb') as f:
                self.countXY, self.countX, self.countY, self.num_samples = \
                    cPickle.load(f)
            return

        self.countXY = Counter()
        self.countX = Counter()
        self.countY = Counter()
        self.num_samples = 0.

        num_lines = wc_l(XYstats)
        with open(XYstats) as f:
            for linenum, line in enumerate(f, 1):
                toks = line.split()
                x, y, count = int(toks[0])-1, int(toks[1])-1, int(toks[2])
                self.countXY[x, y] = count
                # Value comparison; identity (`is 0`) relied on CPython's
                # small-integer cache.
                if linenum % 1000 == 0:
                    inline_print('Processing line %i of %i' %
                                 (linenum, num_lines))

        with open(Xstats) as f:
            for line in f:
                toks = line.split()
                x, count = int(toks[0])-1, int(toks[1])
                self.countX[x] = count
                self.num_samples += count

        with open(Ystats) as f:
            for line in f:
                toks = line.split()
                y, count = int(toks[0])-1, int(toks[1])
                self.countY[y] = count

        inline_print('\nConstructing matrices\n')
        # Dimensions equal the number of distinct ids read from X and Y; the
        # arrays are filled for indices 0..len-1, and any id absent from a
        # Counter contributes 0.
        self.countXY = csc_matrix((self.countXY.values(),
                                   zip(*self.countXY.keys())),
                                  shape=(len(self.countX), len(self.countY)))
        self.countX = array([self.countX[i] for i in range(len(self.countX))])
        self.countY = array([self.countY[i] for i in range(len(self.countY))])

        with open(pickle_file, 'wb') as outf:
            cPickle.dump((self.countXY, self.countX, self.countY,
                          self.num_samples), outf,
                         protocol=cPickle.HIGHEST_PROTOCOL)