def googlenews(allowed_str):
    """Build or refresh the Word2vec (GoogleNews) Analyst.

    First tries ``an.load(fnames[2])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    GoogleNews binary is loaded through gensim, restricted to the words in
    ``allowed_str`` that the model knows, and a new Analyst is constructed.

    Args:
        allowed_str: iterable of candidate words. Words the model does not
            contain are dropped; the remaining ones keep their given order.

    Returns:
        None. The Analyst is built for its side effects only
        (NOTE(review): ``auto_save=2`` presumably persists it to
        ``fnames[2]`` -- confirm against the Analyst API).
    """
    # Word2vec (GoogleNews):
    #   non-normalized.
    #   unordered, from gensim's dict-like structure.
    an_w = an.load(fnames[2], verbosity=1)
    if an_w is not None:
        an_w.add_evaluators(get_e())
        an_w.analysis(print_report=False)
        an_w.save()
    else:
        # Imported lazily so gensim is only needed when the cache is missing.
        import gensim
        model_w = gensim.models.KeyedVectors.load_word2vec_format(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "GoogleNews-vectors-negative300.bin", binary=True)
        # KeyedVectors supports ``in`` directly. This avoids going through
        # ``model_w.vocab.keys()``, which is a linear scan on Python 2 and
        # an attribute that newer gensim releases no longer provide.
        common_w = [w for w in allowed_str if w in model_w]
        embed_w = [model_w.get_vector(w) for w in common_w]
        an_w = an.Analyst(embeddings=embed_w, strings=common_w,
                          metric=metric, auto_print=printing,
                          desc="GoogleNews", parallel_count=cpus,
                          evaluators=get_e(), auto_save=2,
                          file_name=fnames[2], over_write=True)
def glove(allowed_str):
    """Build or refresh the GloVe Analyst.

    First tries ``an.load(fnames[3])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    GloVe text table is read (at most ``MAX_LINES`` rows), restricted to the
    words in ``allowed_str`` that appear in it, and a new Analyst is
    constructed.

    Args:
        allowed_str: iterable of candidate words. Words missing from the
            table are dropped; the remaining ones keep their given order.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # GloVe:
    #   ordered by frequency.
    #   non-normalized.
    an_g = an.load(fnames[3], verbosity=1)
    if an_g is not None:
        an_g.add_evaluators(get_e())
        an_g.analysis(print_report=False)
        an_g.save()
    else:
        str_g, embed_g = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "glove.6B.300d.txt", firstline=False, limit_lines=MAX_LINES)
        # Map every token to the row of its FIRST occurrence -- the same row
        # list.index() would return -- in a single pass, instead of scanning
        # the whole vocabulary list twice per allowed word.
        row_of = {}
        for i, w in enumerate(str_g):
            row_of.setdefault(w, i)
        common = [w for w in allowed_str if w in row_of]
        indeces = [row_of[w] for w in common]
        # Index with a list of rows; assumes embed_g is a numpy array
        # as returned by read_text_table -- TODO confirm.
        embed_g = embed_g[indeces]
        an_g = an.Analyst(embeddings=embed_g, strings=common, metric=metric,
                          auto_print=printing, desc="GloVe",
                          parallel_count=cpus, evaluators=get_e(),
                          auto_save=2, file_name=fnames[3], over_write=True)
def fasttext(allowed_str):
    """Build or refresh the Fasttext Analyst.

    First tries ``an.load(fnames[0])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    pickled fasttext data is loaded, truncated to ``MAX_LINES`` entries,
    restricted to the words in ``allowed_str`` that appear in it, and a new
    Analyst is constructed.

    Args:
        allowed_str: iterable of candidate words. Words missing from the
            pickled vocabulary are dropped; the rest keep their given order.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # Fasttext:
    #   ordered by frequency.
    #   non-normalized.
    an_fnc = an.load(fnames[0], verbosity=1)
    if an_fnc is not None:
        an_fnc.add_evaluators(get_e())  # + get_e_freq())
        an_fnc.analysis(print_report=False)
        an_fnc.save()
    else:
        # NOTE: unpickling executes arbitrary code; only ever point this at
        # a trusted, locally produced file.
        with open(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "fasttext.en.py2.pkl", 'rb') as f:
            data_ft = pkl.load(f)
            str_f = data_ft['tokens'][:MAX_LINES]
            str_f = list(map(str, str_f))
            embed_f = data_ft['vectors'][:MAX_LINES]
        # Map every token to the row of its FIRST occurrence -- the same row
        # list.index() would return -- in a single pass, instead of scanning
        # the whole vocabulary list twice per allowed word.
        row_of = {}
        for i, w in enumerate(str_f):
            row_of.setdefault(w, i)
        common = [w for w in allowed_str if w in row_of]
        indeces = [row_of[w] for w in common]
        # Index with a list of rows; assumes data_ft['vectors'] is a
        # numpy array -- TODO confirm.
        embed_f = embed_f[indeces]
        an_fnc = an.Analyst(embeddings=embed_f, strings=common,
                            auto_print=printing, metric=metric,
                            desc="Fasttext",
                            evaluators=get_e(),  # + get_e_freq())
                            auto_save=2, file_name=fnames[0],
                            over_write=True, parallel_count=cpus)
def numberbatch(allowed_str):
    """Build or refresh the ConceptNet Numberbatch Analyst.

    First tries ``an.load(fnames[1])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    Numberbatch text table is read (its first line is a header, hence
    ``firstline=True``), restricted to the words in ``allowed_str`` that
    appear in it, and a new Analyst is constructed.

    Args:
        allowed_str: iterable of candidate words. Words missing from the
            table are dropped; the remaining ones keep their given order.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # ConceptNet Numberbatch:
    #   alphanumeric order.
    #   normalized.
    an_nb = an.load(fnames[1], verbosity=1)
    if an_nb is not None:
        an_nb.add_evaluators(get_e())
        an_nb.analysis(print_report=False)
        an_nb.save()
    else:
        str_nb, embed_nb = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "numberbatch-en-17.06.txt", firstline=True)
        # Map every token to the row of its FIRST occurrence -- the same row
        # list.index() would return -- in a single pass, instead of scanning
        # the whole vocabulary list twice per allowed word.
        row_of = {}
        for i, w in enumerate(str_nb):
            row_of.setdefault(w, i)
        common_nb = [w for w in allowed_str if w in row_of]
        indeces_nb = [row_of[w] for w in common_nb]
        # Index with a list of rows; assumes embed_nb is a numpy array
        # as returned by read_text_table -- TODO confirm.
        embed_nb = embed_nb[indeces_nb]
        an_nb = an.Analyst(embeddings=embed_nb, strings=common_nb,
                           metric=metric, auto_print=printing,
                           parallel_count=cpus,
                           desc="ConceptNet Numberbatch",
                           evaluators=get_e(), auto_save=2,
                           file_name=fnames[1], over_write=True)
def deps(allowed_str):
    """Build or refresh the Dependency-Based Word Embeddings Analyst.

    First tries ``an.load(fnames[8])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    ``deps.words`` text table is read (at most ``MAX_LINES`` rows),
    restricted to the words in ``allowed_str`` that appear in it, and a new
    Analyst is constructed.

    Args:
        allowed_str: iterable of candidate words. Words missing from the
            table are dropped; the remaining ones keep their given order.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # Dependency-Based Word Embeddings:
    #   appears to be ordered by frequency.
    #   Normalized.
    a = an.load(fnames[8], verbosity=1)
    if a is not None:
        a.add_evaluators(get_e())
        a.analysis(print_report=False)
        a.save()
    else:
        strings, embed_g = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "dependency_based_word_embeddings/deps.words",
            firstline=False, limit_lines=MAX_LINES)
        # Map every token to the row of its FIRST occurrence -- the same row
        # list.index() would return -- in a single pass, instead of scanning
        # the whole vocabulary list twice per allowed word.
        row_of = {}
        for i, w in enumerate(strings):
            row_of.setdefault(w, i)
        common = [w for w in allowed_str if w in row_of]
        indeces = [row_of[w] for w in common]
        # Index with a list of rows; assumes embed_g is a numpy array
        # as returned by read_text_table -- TODO confirm.
        embed_g = embed_g[indeces]
        a = an.Analyst(embeddings=embed_g, strings=common, metric=metric,
                       auto_print=printing, desc="DEPS",
                       parallel_count=cpus, evaluators=get_e(),
                       auto_save=2, file_name=fnames[8], over_write=True)
def word2vec_analysis():
    """Return an Analyst for the canonical Word2Vec test.

    No embeddings are passed in; the Analyst is handed the vocabulary of the
    enclosing ``model`` together with ``model.__getitem__`` as its encoder.

    Returns:
        The newly constructed ``an.Analyst``.
    """
    settings = dict(
        embeddings=None,
        strings=model.vocab,
        encoder=model.__getitem__,
        auto_print=True,
        metric=metric,
        desc="Word2Vec Canonical Test",
        evaluators=get_e(),
        # Careful! End up saving twice if you forget...
        auto_save=True,
        over_write=True,
    )
    return an.Analyst(**settings)
def sense_2_vec(allowed_str):
    """Build or refresh the Sense2Vec Analyst.

    First tries ``an.load(fnames[4])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise the
    reddit sense2vec vectors are loaded and, for every word in
    ``allowed_str``, one vector is produced by averaging the vectors of all
    part-of-speech variants that can be looked up, each weighted by the
    first element of its lookup result.

    Args:
        allowed_str: iterable of candidate words. Words for which no
            part-of-speech variant can be looked up are dropped.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # Sense2Vec:
    #   originally from reddit, then through sense2vec, I modify sense2vec
    #   by doing a weighted average of all the parts of speech of each word
    #   I seek, since they are often close in the space.
    #   NOT normalized.
    #   128 dimensions.
    a = an.load(fnames[4], verbosity=1)
    if a is not None:
        a.add_evaluators(get_e())
        a.analysis(print_report=False)
        a.save()
    else:
        # Imported lazily so sense2vec is only needed on a cache miss.
        import sense2vec
        s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
                             'analyst_embeddings/reddit_vectors-1.1.0/')
        strings = []
        vectors = []
        endings = [
            '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
            '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
            '|SCONJ', '|SYM', '|VERB', '|X'
        ]
        for s in allowed_str:
            senses = []
            freq_sum = 0
            for e in endings:
                try:
                    # t[0] is used as the weight (presumably a frequency)
                    # and t[1] as the vector -- confirm against sense2vec.
                    t = s2v[s + e]
                    senses.append(t[1] * t[0])
                    freq_sum += t[0]
                except Exception:
                    # Best effort: a word/POS pair that cannot be looked up
                    # simply contributes nothing. Unlike a bare ``except``,
                    # this lets KeyboardInterrupt/SystemExit propagate.
                    pass
            if senses:
                strings.append(s)
                vectors.append(np.sum(senses, axis=0) / freq_sum)
        a = an.Analyst(embeddings=np.array(vectors), strings=strings,
                       metric=metric, auto_print=printing,
                       desc="Sense2Vec", parallel_count=cpus,
                       evaluators=get_e(), auto_save=2,
                       file_name=fnames[4], over_write=True)
def run_analyst(lines, pts, tag=TAG, save=True):
    """Analyze an embedding space and optionally save the Analyst.

    Args:
        lines: the strings of the space; only the first ``MAX_LINES`` are
            given to the Analyst.
        pts: the embeddings matching ``lines``; truncated the same way.
        tag: prefix used in the description and in the saved file name.
        save: when true, the Analyst is saved under ``experiments/`` and
            the result of the save call is printed.

    Returns:
        The constructed ``an.Analyst``.
    """
    print("Analyzing space...")
    points = pts[:MAX_LINES]
    texts = lines[:MAX_LINES]
    # The description carries the untruncated length of ``lines``.
    description = tag + "_" + str(len(lines))
    a = an.Analyst(
        embeddings=points,
        strings=texts,
        metric=METRIC,
        auto_print=True,
        desc=description,
        evaluators=[CLUSTERS_TYPE],
        calculate=True,
    )
    if not save:
        return a

    # Save a copy of the analyst:
    count = str(len(a.strings))
    analyst_file = "experiments/" + tag + "_" + count + "_analyst"
    outcome = an.Analyst.save(a, analyst_file)
    print("Success at pickling utterance clusters Analyst: " + str(outcome))
    return a
def use_large(allowed_str):
    """Build or refresh the Universal Sentence Encoder (large) Analyst.

    First tries ``an.load(fnames[7])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise
    every string in ``allowed_str`` is encoded with the TF-Hub module, in
    batches of 10000, and a new Analyst is built from the stacked results.

    Args:
        allowed_str: sliceable sequence of strings to encode; all of them
            are used as the Analyst's strings.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # Universal Sentence Encoder:
    #   embeddings must be found by hand from things to encode.
    #   normalized.
    #   512 dimensions.
    cached = an.load(fnames[7], verbosity=1)
    if cached is not None:
        cached.add_evaluators(get_e())
        cached.analysis(print_report=False)
        cached.save()
        return

    # Heavy dependencies are only imported on a cache miss.
    import tensorflow as tf
    import tensorflow_hub as hub

    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
    encoder = hub.Module(module_url)
    tf.logging.set_verbosity(tf.logging.ERROR)

    batch_size = 10000
    chunks = []
    for start in range(0, len(allowed_str), batch_size):
        chunks.append(allowed_str[start:start + batch_size])

    encoded = []
    with tf.Session() as sess:
        initializers = [
            tf.global_variables_initializer(),
            tf.tables_initializer(),
        ]
        sess.run(initializers)
        for chunk in chunks:
            encoded.append(sess.run(encoder(chunk)))
    stacked = np.vstack(encoded)

    an_u = an.Analyst(
        embeddings=stacked,
        strings=allowed_str,
        metric=metric,
        auto_print=printing,
        desc="USE Large",
        parallel_count=cpus,
        evaluators=get_e(),
        auto_save=2,
        file_name=fnames[7],
        over_write=True,
    )
def run_analyst(lines, pts, tag=TAG, save=True):
    """Analyze an embedding space with a nucleus clusterizer.

    Args:
        lines: the strings of the space; only the first ``MAX_LINES`` are
            given to the Analyst.
        pts: the embeddings matching ``lines``; truncated the same way.
        tag: prefix used in the description and in the saved file name.
        save: when true, the Analyst is saved under ``experiments/`` and
            the result of the save call is printed.

    Returns:
        The constructed ``an.Analyst``.
    """
    print("Analyzing space...")
    # The clusterizer is tied to the hub category named after HUB_THRESHOLD.
    hub_label = u"Nodal " + str(HUB_THRESHOLD) + u"-Hubs"
    nucleizer = an.evaluators.nucleus_clusterizer.NucleusClusterizer(
        hub_category=hub_label)
    points = pts[:MAX_LINES]
    texts = lines[:MAX_LINES]
    # The description carries the untruncated length of ``lines``.
    description = tag + "_" + str(len(lines))
    a = an.Analyst(
        embeddings=points,
        strings=texts,
        metric=METRIC,
        auto_print=True,
        desc=description,
        evaluators=[nucleizer],
    )
    if not save:
        return a

    # Save a copy of the analyst:
    count = str(len(a.strings))
    analyst_file = "experiments/" + tag + "_" + count + "_analyst"
    outcome = an.Analyst.save(a, analyst_file)
    print("Success at pickling utterance clusters Analyst: " + str(outcome))
    return a
# Deduplicate the utterances: keep the first occurrence of each line together
# with the point at the same position in pts.
l = set()
for i, line in enumerate(lines):
    if line not in l:
        l.add(line)
        unique_lines.append(line)
        unique_pts.append(pts[i])
#print("unique things gathered")
#print(len(unique_lines))
#assert len(set([str(unique_pts[i]) for i in range(len(unique_pts))])) == len(unique_pts)
#print("asserted uniqueness of vectors")

# Analyze at most MAX_LINES of the unique utterances, then save the Analyst
# and print what the save call returned.
an_ccc = an.Analyst(
    embeddings=unique_pts[:MAX_LINES],
    strings=unique_lines[:MAX_LINES],
    metric=metric,
    auto_print=True,
    desc="ChitChatChallenge Utterance Hubs",
    #evaluators=["Nodal 4-Hubs"],
    calculate=True)
print("Success at saving ChitChatChallenge Utterance Hubs: " +
      str(an.Analyst.save(an_ccc, filename)))

# Reload the saved Analyst and pull the clusters of its "Nodal 4-Hubs"
# evaluator.
a = an.load(filename)
hubber = a.find_evaluator("Nodal 4-Hubs")
hubs = hubber.get_clusters()
# Reorder the hubs from largest to smallest.
sizes = [len(h) for h in hubs]
order = np.argsort(sizes)[::-1]
#order = np.argsort([h.stats_dict["Dispersion"] for h in hubs])
hubs = np.array(hubs)[order] #.tolist()
return strings, embeddings # Fasttext: # ordered by frequency, I think. # non-normalized. #with open("embeddings/fasttext.en.pkl", 'rb') as f: with open("embeddings/fasttext.en.py2.pkl", 'rb') as f: data_ft = pkl.load(f) str_f = data_ft['tokens'][:MAX_LINES] str_f = list(map(str, str_f)) # Universal Sentence Encoder: # embeddings must be found by hand from things to encode. # normalized. # 512 dimensions. module_url = "https://tfhub.dev/google/universal-sentence-encoder/1" embed = hub.Module(module_url) tf.logging.set_verbosity(tf.logging.ERROR) with tf.Session() as sess: sess.run([tf.global_variables_initializer(), tf.tables_initializer()]) embed_u = sess.run(embed(str_f)) an_u = an.Analyst(embeddings=embed_u, strings=str_f, metric=metric, auto_print=True, desc="Universal Sentence Encoder with words") print("Success at saving Universal Sentence Encoder with words: " + str( an.Analyst.save( an_u, "saved_analyses/an" + str(MAX_LINES) + "_universal_sentence_encoder_with_words")))
import nearest
import analyst
import preview as pv

# Load the "test_batch" data, run the Nearest predictor over all of it, and
# print both the raw predictions and the score the Analyst gives them.
n = nearest.Nearest()
test_img_data = pv.load_batch("test_batch")
# NOTE(review): this Analyst comes from the local `analyst` module and is
# constructed directly from the loaded batch -- confirm its expected input.
a = analyst.Analyst(test_img_data)
predictions = n.predict_all(test_img_data)
print(predictions)
print(a.score(predictions))
row = lines[i + firstline].split(" ") strings.append(row[0])#str(row[0])) embeddings[i] = row[1:] return strings, embeddings # Fasttext: # ordered by frequency, I think. # non-normalized. #with open("embeddings/fasttext.en.pkl", 'rb') as f: with open("embeddings/fasttext.en.py2.pkl", 'rb') as f: data_ft = pkl.load(f) str_f = data_ft['tokens'][:MAX_LINES] str_f = list(map(str, str_f)) embed_f = data_ft['vectors'][:MAX_LINES] embed_fn = embed_f#np.array([normalize(v) for v in embed_f]) anag = an.evaluators.analogizer.Analogizer( analogies_path="/mnt/pccfs/backed_up/zac/zac_desktop/zac_docs/Corpora/" "subcorp_analogy_storage/analogy_subcorp5_family_relations") anagc = an.evaluators.analogizer_combiner.AnalogizerCombiner() an_fnc = an.Analyst( embeddings=embed_fn, strings=str_f, auto_print=True, metric=metric, desc="Fasttext Non-Normalized Euclidean", evaluators=[u"All", anag, anagc]) file_name = "saved_analyses/an" + str(MAX_LINES) + \ "_fasttext_non-normalized_euclidean" print("Success at saving: " + str(an.Analyst.save(an_fnc, file_name)))
lines = open(path, 'rt').readlines() if firstline: numvecs, dim = map(int, lines[0].split(" ")) else: numvecs = len(lines) if limit_lines == None \ else min(len(lines), limit_lines) dim = len(lines[0].split(" ")) - 1 strings = [] embeddings = np.empty(shape=(numvecs, dim)) for i in tqdm(range(numvecs), desc="Reading " + path): row = lines[i + firstline].split(" ") strings.append(row[0]) #str(row[0])) embeddings[i] = row[1:] return strings, embeddings # GloVe: # ordered by frequency, I think. # non-normalized. str_g, embed_g = read_text_table("embeddings/glove.6B.300d.txt", firstline=False, limit_lines=MAX_LINES) embed_g = [normalize(v) for v in embed_g] an_g = an.Analyst(embeddings=embed_g, strings=str_g, metric=metric, auto_print=True, desc="GloVe Normalized") print("Success at saving GloVe Normalized: " + str( an.Analyst.save( an_g, "saved_analyses/an" + str(MAX_LINES) + "_glove_normalized")))
def use_lite(allowed_str):
    """Build or refresh the Universal Sentence Encoder (lite) Analyst.

    First tries ``an.load(fnames[6])``. If that yields an Analyst, the
    current evaluators from ``get_e()`` are added, the analysis is re-run
    without printing a report, and the Analyst is saved again. Otherwise
    every string in ``allowed_str`` is tokenized with the SentencePiece
    model shipped inside the TF-Hub module, encoded in batches of 10000,
    and a new Analyst is built from the stacked results.

    Args:
        allowed_str: sliceable sequence of strings to encode; all of them
            are used as the Analyst's strings.

    Returns:
        None. The Analyst is built for its side effects only.
    """
    # Universal Sentence Encoder:
    #   embeddings must be found by hand from things to encode.
    #   normalized.
    #   512 dimensions.
    cached = an.load(fnames[6], verbosity=1)
    if cached is not None:
        cached.add_evaluators(get_e())
        cached.analysis(print_report=False)
        cached.save()
        return

    # Heavy dependencies are only imported on a cache miss.
    import tensorflow as tf
    import tensorflow_hub as hub
    import sentencepiece as spm

    def to_sparse_ids(processor, sentences):
        # Encode the sentences with the SentencePiece processor and return
        # them in a tf.SparseTensor-like layout:
        # (values, indices, dense_shape).
        encoded = [processor.EncodeAsIds(text) for text in sentences]
        widest = max(len(piece_ids) for piece_ids in encoded)
        values = []
        indices = []
        for row, piece_ids in enumerate(encoded):
            for col, piece_id in enumerate(piece_ids):
                values.append(piece_id)
                indices.append([row, col])
        return values, indices, (len(encoded), widest)

    with tf.Session() as sess:
        module = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
        )
        # spm_path is the path to the SentencePiece model stored inside the
        # TF-Hub module.
        spm_path = sess.run(module(signature="spm_path"))
        sp = spm.SentencePieceProcessor()
        sp.Load(spm_path)

        input_placeholder = tf.sparse_placeholder(tf.int64,
                                                  shape=[None, None])
        module_inputs = dict(
            values=input_placeholder.values,
            indices=input_placeholder.indices,
            dense_shape=input_placeholder.dense_shape)
        embedder = module(inputs=module_inputs)

        initializers = [
            tf.global_variables_initializer(),
            tf.tables_initializer(),
        ]
        sess.run(initializers)

        batch_size = 10000
        chunks = []
        for start in range(0, len(allowed_str), batch_size):
            chunks.append(allowed_str[start:start + batch_size])

        encoded_batches = []
        for chunk in chunks:
            values, indices, dense_shape = to_sparse_ids(sp, chunk)
            feed = {
                input_placeholder.values: values,
                input_placeholder.indices: indices,
                input_placeholder.dense_shape: dense_shape,
            }
            encoded_batches.append(sess.run(embedder, feed_dict=feed))
    stacked = np.vstack(encoded_batches)

    an_u = an.Analyst(
        embeddings=stacked,
        strings=allowed_str,
        metric=metric,
        auto_print=printing,
        desc="USE Lite",
        parallel_count=cpus,
        evaluators=get_e(),
        auto_save=2,
        file_name=fnames[6],
        over_write=True,
    )
"4_city_state", "5_family_relations", "6_adj_adverb", "7_opposites", "8_comparative", "9_superlative", "10_present_participle", "11_nationality_adj", "12_past_tense", "13_plural", "14_plural_verbs", ] corpora = [ an.evaluators.analogizer.Analogizer(category="Analogies_" + p, analogies_path=path_start + p) for p in path_ends ] anagc = an.evaluators.analogizer_combiner.AnalogizerCombiner() an_fnc = an.Analyst( embeddings=None, strings=model.vocab, encoder=model.__getitem__, auto_print=True, metric=metric, desc="Word2Vec Analogies", evaluators=[anagc] + corpora, # + ["all"], auto_save=True, over_write=True, )
for i in tqdm(range(numvecs), desc="Reading " + path): row = lines[i + firstline].split(" ") strings.append(row[0])#str(row[0])) embeddings[i] = row[1:] return strings, embeddings # Fasttext: # ordered by frequency, I think. # non-normalized. #with open("embeddings/fasttext.en.pkl", 'rb') as f: with open("embeddings/fasttext.en.py2.pkl", 'rb') as f: data_ft = pkl.load(f) str_f = data_ft['tokens'][:MAX_LINES] str_f = list(map(str, str_f)) # Word2vec (GoogleNews): # non-normalized. # unordered, from gensim's dict-like structure. model_w = gensim.models.KeyedVectors.load_word2vec_format( 'embeddings/GoogleNews-vectors-negative300.bin', binary=True) #common_w = list(filter(lambda w: w in model_w.vocab.keys() \ # or bytes(w) in model_w.vocab.keys(), str_f)) common_w = [w for w in str_f if w in model_w.vocab.keys()] embed_w = [normalize(model_w.get_vector(w)) for w in common_w] an_w = an.Analyst(embeddings=embed_w, strings=common_w, metric=metric, auto_print=True, desc="Word2Vec GoogleNews Normalized") print("Success at saving Word2Vec GoogleNews Normalized: " + str(an.Analyst.save(an_w, "saved_analyses/an" + str(MAX_LINES) + "_word2vec_googlenews_normalized")))
raise ValueError("No matching vector") def encode_real(word): return vectors_real[words.index(word)] def encode_fake(word): return vectors_fake[words.index(word)] def metric(vec1, vec2): #return s.angle(vec1, vec2)*180/np.pi return sp.distance.cosine(vec1, vec2) * 180 / np.pi an_real = an.Analyst( embeddings=vectors_real, strings=words, metric=metric, #encoder=encode_real, decoder=decode_real, auto_print=True, desc="real scholar words") an_fake = an.Analyst( embeddings=vectors_fake, strings=words, metric=metric, #encoder=encode_fake, decoder=decode_fake, auto_print=True, desc="fake scholar words") worked_r = an_real.save(an_real, "analyst_project/an_scholar400_real") worked_f = an_fake.save(an_fake, "analyst_project/an_scholar400_fake") assert worked_r