import codecs
from collections import Counter

import distance
import numpy as np
from scipy.sparse import dok_matrix
from tqdm import tqdm


def sorensen_plus(a: str, b: str) -> float:
    """Average Sorensen-Dice similarity over n-gram sizes 1..min(len(a), len(b))."""
    length = min(len(a), len(b))
    # Sorensen distance between the n-gram sequences for each size n
    ng = [
        distance.sorensen(ngrams(a, n), ngrams(b, n))
        for n in range(1, length + 1)
    ]
    # np.sum(ng) / length is the mean distance; convert it to a similarity
    return 1 - np.sum(ng) / length
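# Every function in this module calls an `ngrams` helper that is not shown
# here. A minimal sketch of what it is assumed to look like (the ordered list
# of all contiguous length-n substrings):


def ngrams(x: str, n: int) -> list:
    """Return the character n-grams of ``x``, in order of occurrence."""
    return [x[i:i + n] for i in range(len(x) - n + 1)]


# Usage sketch under that assumption; strings sharing most short n-grams
# score close to 1, disjoint strings close to 0:
# >>> sorensen_plus("abc", "abd")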
def p_spectrum(x, y, p: int = 2) -> float:
    """Hashmap algorithm for p-spectrum similarity.

    Counts the p-grams of each string and returns the dot product of the
    two count vectors.
    """
    x_count = Counter(ngrams(x, p))
    y_count = Counter(ngrams(y, p))
    # Only p-grams present in x_count can contribute a nonzero product;
    # Counter returns 0 for keys missing from y_count
    return np.sum([x_count[k] * y_count[k] for k in x_count])
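# Worked example (values assume the `ngrams` sketch above): "ababc" has
# bigram counts {ab: 2, ba: 1, bc: 1} and "abab" has {ab: 2, ba: 1}, so the
# dot product is 2 * 2 + 1 * 1 = 5.
# >>> p_spectrum("ababc", "abab", p=2)
# 5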
def ngram_sim(x, y, n: int = 2) -> float:
    """Binary version of n-gram similarity.

    Computes the longest common subsequence of the two n-gram sequences by
    dynamic programming and normalizes it by the longer sequence's length.
    Both strings must have at least n characters, or the normalizer is zero.
    """
    ng_a = ngrams(x, n)
    ng_b = ngrams(y, n)
    x_len = len(ng_a)
    y_len = len(ng_b)
    mem_table = np.zeros([x_len + 1, y_len + 1], dtype=np.intc)
    for i in range(1, x_len + 1):
        for j in range(1, y_len + 1):
            # Standard LCS recurrence; the comparison adds 1 on a match
            mem_table[i][j] = max(
                mem_table[i][j - 1],
                mem_table[i - 1][j],
                mem_table[i - 1][j - 1] + (ng_a[i - 1] == ng_b[j - 1]),
            )
    return float(mem_table[x_len][y_len]) / float(max(x_len, y_len))
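# Worked example (again assuming the `ngrams` sketch above): the bigram
# sequences are [ab, ba, ab, bc] and [ab, ba, ab]; the second is a
# subsequence of the first, so the LCS length is 3 and the result is 3 / 4.
# >>> ngram_sim("ababc", "abab", n=2)
# 0.75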
# NOTE: the opening of this snippet was truncated; the first read is
# reconstructed by analogy with the re-read below (assumption), collecting
# each line's characters as a set. `seq_path` is assumed to be defined
# elsewhere. The first line of the file is skipped as a header.
with codecs.open(seq_path, "r") as file:
    seqs = [set(s[:-1]) for s in file][1:]

# Union of all characters seen, plus the space character
vocab = seqs[0]
for s in seqs:
    vocab |= s
vocab |= {" "}

# Index the single characters and all possible character trigrams
aminoacid_dict = {}
for a in vocab:
    aminoacid_dict[a] = len(aminoacid_dict)

trigram_dict = {}
for x in vocab:
    for y in vocab:
        for z in vocab:
            trigram_dict[x + y + z] = len(trigram_dict)

# Re-read the sequences as plain strings, again skipping the header line
with codecs.open(seq_path, "r") as f_in:
    seqs = [s[:-1] for s in f_in][1:]

# One-hot trigram matrix: V[i, j] = 1 iff sequence i contains trigram j
v_size = len(trigram_dict)
V = np.zeros([len(seqs), v_size], dtype=int)
for i in np.arange(V.shape[0]):
    for t in ngrams(seqs[i], 3):
        V[i, trigram_dict[t]] = 1
np.save("seqs_sparse_trigram_vectors.npy", V)
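# For scale: with, say, a 21-symbol vocabulary (20 amino acids plus the
# space), trigram_dict has 21 ** 3 = 9,261 entries, so V is wide and mostly
# zeros. A minimal, illustrative sketch of consuming the saved matrix to get
# shared-trigram counts between sequences (not part of the original script):

V = np.load("seqs_sparse_trigram_vectors.npy")
# shared[i, j] = number of distinct trigrams that sequences i and j have in common
shared = V @ V.T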
def compute_similarity_matrix_ngram_sparse(
    *,
    repr_vocab,
    full_vocab,
    ngram_to_index,
    n: int,
    use_tqdm: bool = True,
):
    # EMOJI is assumed to be a module-level constant prefixing progress bars
    v_size = len(ngram_to_index)

    # R is the transposed matrix of the one-hot representations of the
    # representative vocab; R_ng[i] holds the number of n-grams of entry i
    R = dok_matrix((v_size, len(repr_vocab)), dtype=int)
    R_ng = np.zeros([len(repr_vocab)], dtype=int)
    it_1 = np.arange(R.shape[1])
    if use_tqdm:
        it_1 = tqdm(
            it_1,
            desc=f"{EMOJI} compute one hot representation of representative vocabulary",
        )
    for i in it_1:
        ng = ngrams(repr_vocab[i], int(n))
        R_ng[i] = len(ng)
        for t in ng:
            R[ngram_to_index[t], i] = 1
    # tocsr() returns a new matrix rather than converting in place
    R = R.tocsr()

    # V holds the one-hot representations of the full vocab, one row per entry
    V = dok_matrix((len(full_vocab), v_size), dtype=int)
    V_ng = np.zeros([len(full_vocab)], dtype=int)
    it_2 = np.arange(V.shape[0])
    if use_tqdm:
        it_2 = tqdm(
            it_2,
            desc=f"{EMOJI} compute one hot representation of full vocabulary",
        )
    for i in it_2:
        ng = ngrams(full_vocab[i], int(n))
        V_ng[i] = len(ng)
        for t in ng:
            # n-grams missing from the index are simply skipped
            if t in ngram_to_index:
                V[i, ngram_to_index[t]] = 1
    V = V.tocsr()

    # L[i, j] = max(#n-grams of full_vocab[i], #n-grams of repr_vocab[j]),
    # used to normalize the raw overlap counts into [0, 1]
    L = np.empty([len(V_ng), len(R_ng)], dtype=int)
    it_3 = range(len(V_ng))
    if use_tqdm:
        it_3 = tqdm(
            it_3,
            desc=f"{EMOJI} Compute normalization matrix with maximum number of n-grams for the proteins",
        )
    for i in it_3:
        for j in range(len(R_ng)):
            L[i, j] = max(V_ng[i], R_ng[j])

    # (full x ngram) @ (ngram x repr) gives pairwise shared n-gram counts
    return V.dot(R).toarray() / L
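# A minimal sketch of calling the function; the toy vocabularies and the
# index construction are illustrative, not from the original code. Passing
# use_tqdm=False also avoids needing the EMOJI constant.

repr_vocab = ["MKTAY", "MKTVW"]
full_vocab = ["MKTAY", "MKTVW", "MATAY"]
ngram_to_index = {}
for seq in repr_vocab + full_vocab:
    for t in ngrams(seq, 3):
        ngram_to_index.setdefault(t, len(ngram_to_index))

sim = compute_similarity_matrix_ngram_sparse(
    repr_vocab=repr_vocab,
    full_vocab=full_vocab,
    ngram_to_index=ngram_to_index,
    n=3,
    use_tqdm=False,
)
# sim has shape (len(full_vocab), len(repr_vocab)); each entry is the number
# of shared trigrams over the larger trigram count, so sim[0, 0] == 1.0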
def _get_ngram_elements_helper(strings, *, ngram_to_index, n: int, desc=None):
    """Map each string to the list of index positions of its n-grams.

    Assumes every n-gram that occurs is present in ``ngram_to_index``;
    unknown n-grams raise a ``KeyError``.
    """
    return [
        [ngram_to_index[t] for t in ngrams(string, n)]
        for string in tqdm(strings, desc=desc)
    ]
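# Example with a hypothetical two-entry index (assumes the `ngrams` sketch
# above): "abab" yields the bigram sequence [ab, ba, ab].
# >>> _get_ngram_elements_helper(["abab"], ngram_to_index={"ab": 0, "ba": 1}, n=2)
# [[0, 1, 0]]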