# NOTE(review): this chunk is a whitespace-mangled paste; formatting below is
# reconstructed. The first four statements are the tail of an option-parsing
# function whose `def` line is outside this chunk.

    # --KNN: K used for nearest-neighbour sparsification of the graph;
    # <= 0 appears to disable it (see the filename logic below) -- TODO confirm in makeGraph
    parser.add_option('--KNN', dest='KNN', type="int", action='store', default=10)
    # --normalize: 1 = l1 normalization, 2 = l2, anything else = none (see branches below)
    parser.add_option('--normalize', dest='normalize', type="int", action='store', default=1)
    (options, args) = parser.parse_args()
    return options


if __name__ == '__main__':
    # parse arguments
    filename_wordsX = (sys.argv[1])  # path to a pickled words object

    # read input
    wordsX = IO.readPickledWords(filename_wordsX)
    options = parseOptions()

    # make graph
    G = makeGraph(wordsX, options)
    G = G.todense()
    # Optionally normalize the dense graph matrix; options.sym and
    # options.stochastic are declared by add_option calls outside this chunk.
    if options.normalize == 1:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l1')
    elif options.normalize == 2:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l2')

    msk = MSK(None, wordsX.words, wordsX.words)
    # save the matrix.
    # This is hacky, since we're trusting that G is generated with rows/columns
    # that match the order of wordsX.words
    msk.M = G
    # Derive the output name from the input name, e.g. "words.pkl" -> "words_WG.pkl"
    # (then "words_WG_KNN10.pkl" when KNN sparsification is on).
    # NOTE(review): str.replace substitutes the FIRST "." only -- a filename with
    # more than one dot (e.g. "a.b.pkl") gets the suffix in the wrong place;
    # os.path.splitext would be safer. Left as-is to preserve existing output names.
    graphFilename = filename_wordsX.replace(".", "_WG.")
    if options.KNN > 0:
        graphFilename = graphFilename.replace(".", "_KNN"+str(options.KNN)+".")
    IO.pickle(graphFilename, msk)
# NOTE(review): whitespace-mangled chunk, formatting reconstructed. This is the
# tail of one `graph_mode` branch plus the whole `graph_mode == 2` (PMI) branch;
# the opening `if` is outside this chunk, as are M, wordsX, options, graph_mode
# and output_edges.

    # --- frequent-context-word filtering: drop columns of overly common words ---
    L = M.M.todense()
    I = (L > 0).sum(axis=0) >= options.M  # find words that co-occur with at least M distinct words
    J = np.nonzero(np.array(I)[0])[0]     # column indices of those frequent context words
    # pi_f = [M.features[i] for i in M.strings]
    # pi_s = [M.strings[i] for i in M.strings]
    # P = L[pi_s, pi_f]
    FCW = set([M.reverseFeatures[j] for j in J])  # frequent context words, by name
    # Python 2 print-to-stderr statement syntax
    print >> sys.stderr, 'FCW length:', len(FCW)
    #too_frequent = FCW.intersection(wordsX.words)
    L = np.array(L)
    # Zero out each frequent context word's column so it contributes no edges.
    for w in FCW:
        i = M.features[w]
        L[:, i] = 0
    #L *= (L > options.minCoFreq)
    output_edges(M, L, M.reverseFeatures)
elif graph_mode == 2:
    # --- PMI (pointwise mutual information) edge weighting ---
    M.M = M.M.todense()
    L = M.materialize(wordsX.words, wordsX.words)  # L's rows/columns are ordered by wordsX.words
    # NOTE(review): magic constant -- assumes the first 1500 columns correspond to
    # the most common words, i.e. that wordsX.words is frequency-ordered; confirm.
    L[:, :1500] = 0  # remove common words
    L = np.array(L) * np.array(L > options.minCoFreq)  # remove low-occurring bigrams
    #L = normalize(L, norm, axis=1) # normalize rows
    # Symmetrize via the upper triangle, then normalize to a joint distribution.
    P = np.triu(L)
    unigram = np.mat(wordsX.freq * 1.0 / np.sum(wordsX.freq))  # unigram distribution (row vector)
    P -= np.diag(np.diag(P))  # remove diagonal
    P += P.T
    P /= P.sum()  # P now contains the joint probability P[i,j]
    Q = np.array(unigram.T * unigram)  # Q_ij = Ui*Uj, the independence baseline
    PMI = P / Q  # pointwise mutual information
    output_edges(M, PMI, M.reverseStrings)