import logging
import string
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from scipy.io import loadmat

# Project-internal helpers (DenseVectors, FeatureExtractor, get_all_document_features,
# get_tokenized_data, cluster_vectors, run_experiment) and the *_file path constants
# are assumed to be imported elsewhere in the original module.


def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have
    not been processed: there are punctuation-only tokens, uppercased words and non-lemmatized words.
    There isn't any PoS tag filtering either, so function words like "to", "while" and "there" are included.

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the same
    canonical form. I select the shortest original entry (ties are broken by giving preference to words
    that are already lowercased). This could have been done better. Only vectors for the selected entries
    are kept.

    There are 33k canonical forms left, many of which are not nouns/adjectives/verbs. We don't have a PoS
    tag for the canonical forms, so I get around the problem by creating three copies of each canonical
    form, e.g. expanding "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    # todo this can be done based on frequency or something

    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word- contains digits or punctuation
            continue
        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)

    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])

    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
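
# A minimal self-contained sketch (not part of the pipeline) illustrating the tie-breaking rule
# used above: among entries sharing a lemma, min() with the key (len(w), not w.islower()) picks
# the shortest form and, on equal length, prefers one that is already lowercased. The word list
# here is made up purely for illustration.
def _example_pick_canonical_entry():
    dirty_list = ['Cats', 'cats', 'CATS', 'catting']
    best = min(dirty_list, key=lambda w: (len(w), not w.islower()))
    assert best == 'cats'  # 'cats' and 'Cats' tie on length; the lowercased form wins
    return best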
def generate(output, dim):
    """Generate reproducible random dense vectors of dimensionality `dim` for all document
    features (including unigrams) and write them to `output` via DenseVectors."""
    np.random.seed(0)
    feats = ['rand%d' % i for i in range(dim)]
    phrases = list(get_all_document_features(include_unigrams=True))
    vectors = np.random.random((len(phrases), dim))
    v = DenseVectors(pd.DataFrame(vectors, index=phrases, columns=feats))
    v.to_tsv(output, dense_hd5=True)
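
# Minimal self-contained sketch of the DataFrame that generate() builds, using a made-up
# feature list in place of get_all_document_features(); the real function writes the frame
# out through DenseVectors.to_tsv rather than returning it.
def _example_random_vectors(dim=3):
    np.random.seed(0)
    phrases = ['cat/N', 'dog/N', 'red/J cat/N']  # hypothetical features for illustration
    feats = ['rand%d' % i for i in range(dim)]
    return pd.DataFrame(np.random.random((len(phrases), dim)), index=phrases, columns=feats)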
def test_distributional_with_vector_clusters(conf, tmpdir):
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])

    vectors = np.random.random((len(feats), 10))
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases,
    # so there is no point in checking whether they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)
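
# Standalone sketch of the clustering step this test relies on, assuming cluster_vectors
# performs k-means over the rows of the vector matrix (as the KmeansVectorizer class name
# suggests). It uses scikit-learn directly and skips the project's TSV/HDF5 file handling,
# so it only illustrates the idea, not the actual implementation.
from sklearn.cluster import KMeans


def _example_cluster_random_vectors(n_feats=20, dim=10, n_clusters=5):
    vectors = np.random.random((n_feats, dim))
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(vectors)
    # map each (hypothetical) feature to its cluster id, mirroring the clusters_file the test points to
    return pd.DataFrame({'cluster': labels}, index=['feat%d' % i for i in range(n_feats)])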