def getFreqRepr(X, D):
    """Split a word collection into frequency and feature look-up tables.

    X must expose three parallel members: ``words`` (iterated in order),
    ``freq`` (indexed by word position) and ``features`` (indexed as
    ``features[word position, feature index]``).  D is the number of feature
    columns copied for every word.

    Returns a pair ``(freqX, reprX)`` of OrderedDicts keyed by word, in the
    iteration order of ``X.words``: ``freqX[w]`` is the entry of ``X.freq``
    at the word's position and ``reprX[w]`` maps every feature index
    ``0 .. D-1`` to the corresponding entry of ``X.features``.
    """
    freqX = OrderedDict()
    reprX = OrderedDict()
    # range() instead of xrange(): same iteration on Python 2, and the
    # function also runs on Python 3.
    columns = range(D)
    for i, w in enumerate(X.words):
        freqX[w] = X.freq[i]
        reprX[w] = {j: X.features[i, j] for j in columns}
    return freqX, reprX


# read input parameters
if __name__ == '__main__':
    # Fail with a usage line instead of a bare IndexError when arguments
    # are missing.
    if len(sys.argv) < 4:
        sys.exit('usage: %s N D Nseed' % sys.argv[0])
    N = int(sys.argv[1])
    D = int(sys.argv[2])
    Nseed = int(sys.argv[3])

    # make X,Y mock data (the third value returned by make() is not used
    # in this script)
    (X, Y, pi) = make(N, Nseed, D)
    # seed pairs match the i-th entry with itself for the first Nseed indices
    seed = [(i, i) for i in range(Nseed)]

    # write the pickled word files, the plain word files and the seed file
    freqX, reprX = getFreqRepr(X, D)
    freqY, reprY = getFreqRepr(Y, D)
    IO.writePickledWords('pockX.txt', freqX, reprX)
    IO.writePickledWords('pockY.txt', freqY, reprY)
    IO.writeWords('mockX.txt', X)
    IO.writeWords('mockY.txt', Y)
    IO.writeSeed('seedXY.txt', seed)

    # single-argument print(...) produces the same output on Python 2 and 3
    print(X.asTuple())
    # now need to save
freq = top_nouns[word] sys.stdout.write(str(freq)) sys.stdout.write(',') V = [context_features[word][other_word] for other_word in noun_keys] print ','.join([str(v) for v in V]) if __name__ == '__main__': global verbosity verbosity = 0 filename_text = sys.argv[1] filename_tags = sys.argv[2] N = int(sys.argv[3]) lang = (sys.argv[4]) assert lang == 'en' or lang == 'es' if lang == 'en': accept_tags = ['NN', 'NNS', 'NP', 'NPS'] elif lang == 'es': accept_tags = ['NC', 'NP'] out_filename = lang + '_' + 'pickled_N='+str(N)+'.txt' common.log(100, 'Extracting', N, 'top nouns', '-- accepted tags:', accept_tags) top_nouns_freq, context_features, feature_names = extract(filename_text, filename_tags, accept_tags, N) # sort by frequency (descending) context_features0 = OrderedDict() for noun in top_nouns_freq: context_features0[noun] = context_features[noun] IO.writePickledWords(out_filename, top_nouns_freq, context_features0, feature_names)