def call_matlab(stat, m, kappa):
    """Run the MATLAB ``approx_cca`` routine and sort its ``Ur`` output by frequency.

    An output directory named after the last component of ``stat``, ``m`` and
    ``kappa`` is created under ``output/`` if it does not exist. MATLAB is
    then invoked through the shell, and the ``Ur`` file in that directory is
    rewritten with its rows ordered by descending frequency.

    Args:
        stat: Path of the statistics directory; must contain the ``wordmap``
            and ``X`` files read during post-processing.
        m: Value forwarded as the second argument of ``approx_cca``.
        kappa: Value forwarded as the third argument of ``approx_cca``.

    Returns:
        The path of the output directory.

    Raises:
        AssertionError: If ``m`` or ``kappa`` is None.
    """
    assert(m is not None and kappa is not None)

    # complete_path presumably returns the path with a trailing separator
    # (TODO confirm); strip it and keep the last component. Taking [-1]
    # instead of [1] also works when the path has no parent component.
    stat_name = complete_path(stat)[:-1].rsplit('/', 1)[-1]
    outdirname = 'output/{}.m{}.kappa{}.matlab.out'.format(stat_name, m, kappa)
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)

    # NOTE(review): the command is assembled by string concatenation and run
    # through the shell, so quotes or shell metacharacters in `stat` or
    # `outdirname` will break or alter it. Only pass trusted paths.
    commandstr = (matlab + ' -nojvm -nodisplay -nosplash -r ' +
                  '\"approx_cca(\'' + stat + '\',' + str(m) + ',' +
                  str(kappa) + ',\'' + outdirname + '\')\"')
    os.system(commandstr)

    say('Postprocessing to sort rows by frequency...')
    wordmap = read_wordmap(os.path.join(stat, 'wordmap'))
    freqmap = read_freqmap(os.path.join(stat, 'X'))

    # Stable sort: indices with equal frequency keep wordmap iteration order.
    sorted_indices = sorted(wordmap, key=lambda i: freqmap[i], reverse=True)

    ur_path = os.path.join(outdirname, 'Ur')
    # Read every row and close the handle before the same file is truncated
    # for rewriting.
    with open(ur_path) as inf:
        lines = inf.readlines()
    with open(ur_path, 'wb') as outf:
        for i in sorted_indices:
            write_row(outf, freqmap[i], wordmap[i], lines[i].split())
    return outdirname
def get_stat(self, stat):
    """Load co-occurrence statistics from ``stat`` into this object.

    Reads the ``XY``, ``X``, ``Y`` and ``wordmap`` files found under the
    completed ``stat`` path. Sets ``self.stat``, ``self.wordmap``,
    ``self.countXY`` (a ``csc_matrix``), ``self.countX`` and ``self.countY``
    (arrays) and ``self.num_samples`` (the sum of the ``X`` counts, as a
    float). The count structures are cached in a ``pickle`` file next to the
    statistics; when that file exists it is loaded instead of re-parsing.

    Indices in all files have 1 subtracted when read (presumably they are
    1-based on disk -- TODO confirm).

    Args:
        stat: Path of the statistics directory.

    Raises:
        AssertionError: If any of the ``XY``, ``X`` or ``Y`` files is missing.
    """
    self.stat = complete_path(stat)
    XYstats = self.stat + 'XY'
    Xstats = self.stat + 'X'
    Ystats = self.stat + 'Y'
    assert(os.path.isfile(XYstats) and os.path.isfile(Xstats) and
           os.path.isfile(Ystats))
    say('XYstats: {}'.format(XYstats))
    say('Xstats: {}'.format(Xstats))
    say('Ystats: {}'.format(Ystats))

    # The word map is always re-read; it is not part of the pickle cache.
    self.wordmap = {}
    wordmapf = self.stat + 'wordmap'
    with open(wordmapf) as f:
        for line in f:
            toks = line.split()
            self.wordmap[int(toks[0]) - 1] = toks[1]

    pickle_file = self.stat + 'pickle'
    if os.path.isfile(pickle_file):
        # The cache is written in binary mode with a binary protocol, so it
        # must be read back in binary mode as well.
        # NOTE(review): unpickling executes arbitrary code; only use
        # statistics directories from a trusted source.
        with open(pickle_file, 'rb') as f:
            self.countXY, self.countX, self.countY, self.num_samples = \
                cPickle.load(f)
        return

    count_xy = Counter()
    count_x = Counter()
    count_y = Counter()
    num_samples = 0.

    num_lines = wc_l(XYstats)
    with open(XYstats) as f:
        for linenum, line in enumerate(f, 1):
            toks = line.split()
            x, y, count = int(toks[0]) - 1, int(toks[1]) - 1, int(toks[2])
            count_xy[x, y] = count
            # Compare by value: `is 0` only works by accident of CPython's
            # small-integer caching.
            if linenum % 1000 == 0:
                inline_print('Processing line %i of %i' % (linenum, num_lines))

    with open(Xstats) as f:
        for line in f:
            toks = line.split()
            x, count = int(toks[0]) - 1, int(toks[1])
            count_x[x] = count
            num_samples += count

    with open(Ystats) as f:
        for line in f:
            toks = line.split()
            y, count = int(toks[0]) - 1, int(toks[1])
            count_y[y] = count

    inline_print('\nConstructing matrices\n')
    # values() and keys() are taken from the same unmodified Counter, so the
    # counts line up with their (row, column) coordinates.
    self.countXY = csc_matrix((count_xy.values(), zip(*count_xy.keys())),
                              shape=(len(count_x), len(count_y)))
    # Indices absent from the X / Y files read as 0 from the Counter.
    self.countX = array([count_x[i] for i in range(len(count_x))])
    self.countY = array([count_y[i] for i in range(len(count_y))])
    self.num_samples = num_samples

    with open(pickle_file, 'wb') as outf:
        cPickle.dump((self.countXY, self.countX, self.countY,
                      self.num_samples),
                     outf, protocol=cPickle.HIGHEST_PROTOCOL)