def foldin_item(self, itemid, text):
    global users_items, uu_enh, ii_enh, docids
    # TODO check if itemid already exists???

    # First, fold the new item into the existing LSI factorization:
    # project its tf-idf vector into the latent space and append it to VT.
    ttdm = tfidf.transform([text])
    dT = ttdm.todense()
    d_hat = np.dot(dT, self.U).dot(self.SI)
    self.VT = np.hstack((self.VT, d_hat.T))

    # Next, calculate this document's similarity vs. each user's reading
    # history and insert the score into the users_items sparse matrix
    # IFF it exceeds the threshold.
    # Grow the matrix by one column (hack: resize the lil_matrix in place).
    A = sps.lil_matrix(users_items)
    A._shape = (A.shape[0], A.shape[1] + 1)
    for user in userids:
        ui = np.where(userids == user)[0].item()
        # <<< including similar, but unread items! >>>
        dd = users_items.getrow(ui).nonzero()[1]
        # build a "virtual document" from the terms of everything in the
        # user's history
        tids = []
        for did in docids[dd]:
            tids.extend(terms[corpus[did]])
        vd = " ".join(tids)
        tmp = tfidf.transform([vd])
        qT = tmp.todense()
        q_hat = np.dot(qT, self.U).dot(self.SI)
        s = cossim(q_hat, d_hat)
        # >>> threshold also used when folding in new items <<<
        if s > 0.2:
            A[ui, -1] = s

    # TODO is this really the right thing to do??? !!!!!
    # Recomputing the full similarity matrices here is O(n^2) !!!!!
    users_items = A.tocsr()
    uu_enh = cossim(users_items)
    ii_enh = cossim(users_items.transpose())

    docids = np.append(docids, itemid)
    corpus[itemid] = ttdm.indices
    print("Docids: ", docids)
    print(corpus)
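# --- Illustrative sketch (not part of the original module) ----------------
# The fold-in projection used in foldin_item, shown standalone. Assumes an
# SVD factorization X ~= U * S * VT is already available; `U` and
# `singular_values` below stand in for self.U and the diagonal of S, and
# `tfidf_vectorizer` for the fitted tf-idf vectorizer.
def _example_foldin_projection(new_text, tfidf_vectorizer, U, singular_values):
    # tf-idf row for the new document (1 x n_terms)
    d = tfidf_vectorizer.transform([new_text]).toarray()
    # d_hat = d . U . S^{-1} projects the document into the k-dim LSI space
    S_inv = np.diag(1.0 / singular_values)
    return d.dot(U).dot(S_inv)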
def graphAll(df, max_features=1000, keep_unconnected=False,
             lowerThresh=.9, upperThresh=1.1):
    """
    Option for graphing the similarity of stories. If keep_unconnected == True,
    nodes with no connecting edges will be included.

    df -- contains the stories (newspaper name, headlines, body text)
    max_features -- dimension of the TF-IDF vectorization of the body text
                    from the collection of the day's stories
    lowerThresh -- lower bound on the cosine similarity for connecting story nodes
    upperThresh -- upper bound on the cosine similarity for connecting story nodes
    """
    corpus = df.loc[:, 'body'].fillna('').str.lower().values
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(corpus)

    G = nx.Graph()
    edges = []
    for x in range(0, len(df)):
        # cosine similarity of story x against every story
        simMeasures = cossim(X[x], X)
        matches = df.loc[(simMeasures[0] >= lowerThresh) &
                         (simMeasures[0] <= upperThresh), :].index
        if not keep_unconnected:
            # drop the self-match so isolated stories stay out of the graph
            matches = [elm for elm in matches if elm != x]
        for elm in matches:
            edges.append((x, elm))
    G.add_edges_from(edges)
    return G, vectorizer
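# --- Illustrative usage (not part of the original code) -------------------
# Minimal sketch of calling graphAll; the DataFrame contents and threshold
# values are made up, and pandas is assumed to be imported as pd (as it is
# elsewhere in this code).
def _example_graph_all():
    df = pd.DataFrame({
        'filename': ['a.txt', 'b.txt', 'c.txt'],
        'headline': ['Headline A', 'Headline B', 'Headline C'],
        'body': ['storm hits the coast', 'coastal storm damage', 'election results'],
    })
    G, vectorizer = graphAll(df, max_features=100, lowerThresh=0.3, upperThresh=1.1)
    return G.number_of_nodes(), G.number_of_edges()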
def summarize(self, document, sum_len=5):
    """
    The centroid is defined as the sum of all tf-idf vectors of the document.
    Summaries are made by greedily adding sentences such that the cosine
    similarity between the summed summary vector and the centroid is maximized
    at each iteration, until sum_len is reached.

    Input:
        document: Document instance containing a list of Sentence instances
        sum_len: desired summary length in sentences (int)
    Output:
        list of sentences that 'summarize' the input document.
    """
    self.document = document
    self.submatrix = self.matrix[document.i1:document.i2]
    self.centroid = sum(self.submatrix[:])
    self.submatrix = self.method()

    summary, sum_idx = [], []
    while len(summary) < sum_len:
        # score each candidate sentence by the cosine between
        # (current summary + candidate) and the centroid
        sims = np.ravel([cossim(self.centroid, sum(summary + [i]))
                         for i in self.submatrix])
        best = np.argmax(sims)
        summary.append(self.submatrix[best])
        sum_idx.append(best)
    return [self.document.text[i] for i in sum_idx]
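# --- Illustrative sketch (not part of the original class) -----------------
# The greedy selection loop from summarize, shown standalone.
# `sentence_vectors` is a hypothetical list of 1 x n_terms row vectors and
# `centroid` their sum; neither name exists in the original code.
def _example_greedy_centroid_selection(sentence_vectors, centroid, sum_len=5):
    chosen_idx, chosen_vecs = [], []
    while len(chosen_idx) < min(sum_len, len(sentence_vectors)):
        # score each candidate by the cosine between (current summary + candidate)
        # and the document centroid
        sims = np.ravel([cossim(centroid, sum(chosen_vecs + [v]))
                         for v in sentence_vectors])
        best = int(np.argmax(sims))
        chosen_idx.append(best)
        chosen_vecs.append(sentence_vectors[best])
    return chosen_idx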
def getsimmatrix(embvectors, N, embchars):
    # N is unused in the body; kept for call-site compatibility
    corrmat = np.zeros((len(embchars), len(embchars)))
    for i, e in enumerate(embvectors):
        if i not in embchars:
            continue
        for j, d in enumerate(embvectors):
            if j not in embchars:
                continue
            corrmat[i][j] = cossim(e.reshape(1, -1), d.reshape(1, -1))
    return corrmat
def update_user_reading(user, item):
    # TODO isn't there some other way to do this right?
    global users_items, uu_enh, ii_enh

    ui = np.where(userids == user)[0]
    ii = np.where(docids == item)[0]
    if ui.size == 0:
        raise Exception("User %s not found!" % user)
    if ii.size == 0:
        raise Exception("Item %s not found!" % item)
    ui = ui.item()
    ii = ii.item()

    # mark the item as read
    tmp = sps.lil_matrix(users_items)
    tmp[ui, ii] = 1.0
    # TODO is this really the right thing to do??? !!!!!
    # Recomputing the full similarity matrices here is O(n^2) !!!!!
    users_items = tmp.tocsr()
    uu_enh = cossim(users_items)
    ii_enh = cossim(users_items.transpose())

    # TODO is this really the right thing to do??? !!!!!
    # Rebuild the user's "virtual document" from everything marked as read
    # and store the user's top terms by tf-idf weight.
    ur = users_items.getrow(ui)
    ri = ur.indices[np.where(ur.data == 1.0)]
    tids = []
    for did in docids[ri]:
        tids.extend(terms[corpus[did]])
    vd = " ".join(tids)
    _t = tfidf.transform([vd])
    ti = np.argsort(-_t.data)
    users_terms[user] = list(zip(_t.indices[ti], _t.data[ti]))
def get_similarity_matrix(embedded_vectors, embedded_chars):
    """
    :param embedded_vectors: sequence of embedding vectors, indexed by character id
    :param embedded_chars: collection of character ids to include in the matrix
    :return: square matrix of pairwise cosine similarities
    """
    correlation_matrix = np.zeros((len(embedded_chars), len(embedded_chars)))
    for i, embedded_vector1 in enumerate(embedded_vectors):
        if i not in embedded_chars or embedded_vector1 is None:
            continue
        for j, embedded_vector2 in enumerate(embedded_vectors):
            if j not in embedded_chars or embedded_vector2 is None:
                continue
            correlation_matrix[i][j] = cossim(embedded_vector1.reshape(1, -1),
                                              embedded_vector2.reshape(1, -1))
    return correlation_matrix
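# --- Illustrative usage (not part of the original code) -------------------
# Toy call to get_similarity_matrix; the embeddings and character ids are made
# up. Note the result is allocated as len(embedded_chars) x len(embedded_chars),
# so the ids here are kept within that range.
def _example_similarity_matrix():
    rng = np.random.RandomState(0)
    vectors = [rng.rand(8) for _ in range(3)]   # one 8-d embedding per char id
    chars = {0, 1, 2}                           # ids to include
    return get_similarity_matrix(vectors, chars)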
def cosine_similarity(v1, v2):
    if len(v1) == 0 or len(v2) == 0:
        return 0
    score = cossim(array(v1).reshape(1, -1), array(v2).reshape(1, -1))
    return score[0][0]
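# --- Illustrative usage (not part of the original code) -------------------
# cosine_similarity wraps sklearn's pairwise cosine similarity for two plain
# lists; empty inputs short-circuit to 0. The values below are just examples.
def _example_cosine_similarity():
    half_overlap = cosine_similarity([1, 0, 1], [1, 1, 0])   # ~0.5
    empty_case = cosine_similarity([], [1, 2, 3])            # == 0
    return half_overlap, empty_case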
def query(self, qT):
    # project the query into LSI space: q_hat = q^T . U . S^{-1}
    q_hat = np.dot(qT, self.U).dot(self.SI)
    # cosine similarity of the projected query against every document vector
    res = cossim(q_hat, self.VT.transpose()).flatten()
    return res
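# --- Illustrative usage (not part of the original class) ------------------
# Sketch of querying the LSI model with free text; `lsi` and `tfidf` are the
# fitted model and vectorizer this code builds elsewhere, and the top-10 cut
# is arbitrary.
def _example_lsi_query(query_text):
    qT = tfidf.transform([query_text]).todense()   # 1 x n_terms tf-idf row
    scores = lsi.query(qT)                         # cosine similarity per document
    return np.argsort(-scores)[:10]                # best-matching doc indices first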
res = lsi.query(qT)
# don't include values for docs already read!
res[d] = 0
# TODO parameterize this arbitrary threshold?
# >>> threshold also used when folding in new items <<<
ti = np.where(res > 0.20)[0]
tmpi.extend(ti)
tmpu.extend(np.tile(i, ti.size))
tmpd.extend(res[ti])

tmpm = sps.csr_matrix((tmpd, (tmpu, tmpi)), shape=users_items.shape)
users_items = users_items + tmpm

# User-user similarity matrix, enhanced / raw
uu_enh = cossim(users_items)
uu_raw = cossim(users_items_raw)

# Item-item similarity matrix, enhanced / raw
ii_enh = cossim(users_items.transpose())
ii_raw = cossim(users_items_raw.transpose())


def __get_user_row(user):
    uid = np.where(userids == user)[0]
    if len(uid) > 0:
        uid = uid.item()
        return users_items.getrow(uid)
    else:
        return None
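# --- Illustrative usage (not part of the original code) -------------------
# Sketch of reading the enhanced user-user matrix computed above: the row for
# a user, sorted descending, gives that user's nearest neighbours.
# `some_user` is a placeholder id.
def _example_top_similar_users(some_user, n=5):
    uid = np.where(userids == some_user)[0].item()
    order = np.argsort(-uu_enh[uid])               # most similar users first
    return [userids[j] for j in order if j != uid][:n]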
def graphAddedEdges(G, df, vectorizer, cosSim_thresh=.5):
    """
    Deals with the issue that the same topic was split into a few different
    clusters thanks to language nuance (and potentially the different amount
    of space each editor gave the story's body text). Links groups with a
    similar average cosine similarity score.
    """
    # NOTE: nx.connected_component_subgraphs was removed in NetworkX 2.4;
    # newer versions need [G.subgraph(c).copy() for c in nx.connected_components(G)]
    connectedStories = sorted(nx.connected_component_subgraphs(G),
                              key=len, reverse=True)
    connectedStoriesCount = [len(elm.nodes()) for elm in connectedStories]

    # finding subgraphs: keep the components with more than two nodes
    # (everything before the first 2-node component)
    subGraphs = range(0, connectedStoriesCount.index(2))
    dfSub = pd.DataFrame(columns=['filename', 'headline', 'body', 'GraphGroup'])
    for i in subGraphs:
        dfSub0 = df.loc[list(connectedStories[i].nodes()), :]
        dfSub0['GraphGroup'] = i
        dfSub = pd.concat([dfSub, dfSub0], sort=True)

    lsa = decomposition.TruncatedSVD(n_components=5, algorithm='randomized', n_iter=5)
    Xsub = vectorizer.transform(dfSub.loc[:, 'body'].values)
    XsubLsa = lsa.fit_transform(Xsub)

    # finding the mean tf-idf vector for each subgroup
    groupMeans = np.array(
        np.mean(Xsub[(dfSub['GraphGroup'] == subGraphs[0]).values], axis=0))
    for i in subGraphs[1:]:
        groupMeans = np.concatenate([
            groupMeans,
            np.array(np.mean(Xsub[(dfSub['GraphGroup'] == subGraphs[i]).values], axis=0))
        ])

    # linking subgroups with a relatively high cosine similarity score
    addedEdges = []
    combined_groups = []
    for i in subGraphs:
        simScores = cossim(groupMeans[i].reshape(1, -1), groupMeans)
        possibleMissedconnections = [
            idx for idx, above in enumerate((simScores > cosSim_thresh)[0]) if above
        ]
        combined_groups.append(possibleMissedconnections)
        if len(possibleMissedconnections) > 1:
            for j in [elm for elm in possibleMissedconnections if elm != i]:
                # connect the two components through one representative node each
                u = list(connectedStories[i].nodes)[0]
                v = list(connectedStories[j].nodes)[0]
                G.add_edge(u, v)
                addedEdges.append((u, v))

    # finding the clusters of larger groups
    metaGrouped = []
    for i in range(0, len(combined_groups)):
        grouped = []
        for elm in combined_groups[i]:
            for group in combined_groups[i:]:
                if elm in group:
                    grouped.extend(group)
        metaGrouped.append(sorted(list(set(grouped))))

    newGroups = []
    for i in range(0, len(metaGrouped)):
        if metaGrouped[i][0] == i:
            newGroups.append(metaGrouped[i])

    return dfSub, newGroups, addedEdges
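# --- Illustrative usage (not part of the original code) -------------------
# Typical flow: build the story graph first, then bridge clusters that ended
# up split across components. Threshold values are illustrative.
def _example_link_similar_clusters(df):
    G, vectorizer = graphAll(df, max_features=1000, lowerThresh=0.9)
    dfSub, newGroups, addedEdges = graphAddedEdges(G, df, vectorizer, cosSim_thresh=0.5)
    return newGroups, addedEdges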
def _n_best(self):
    """ Only consider the N most similar sentences to the centroid """
    sims = np.ravel([cossim(self.centroid, i) for i in self.submatrix])
    args = np.argsort(sims)[::-1]
    return self.submatrix[args[:self.N]]