def generate(output, dim):
    """
    Writes one random dense vector per document feature to `output`.

    The global numpy random state is seeded with 0 before sampling, and the features
    (unigrams included) are obtained from `get_all_document_features`.

    :param output: destination handed to `DenseVectors.to_tsv`
    :param dim: number of columns per vector; columns are named rand0, rand1, ...
    :type dim: int
    """
    np.random.seed(0)

    column_names = []
    for column in range(dim):
        column_names.append('rand%d' % column)

    row_names = list(get_all_document_features(include_unigrams=True))
    shape = (len(row_names), dim)
    frame = pd.DataFrame(np.random.random(shape), index=row_names, columns=column_names)

    # stored in the dense HDF format rather than as plain text
    DenseVectors(frame).to_tsv(output, dense_hd5=True)
def _build_composer(composer_class, unigram_vectors, pretrained_Baroni_composer_file,
                    pretrained_Guevara_composer_file, pretrained_Gref_composer_file,
                    categorical_vector_matrix_file):
    """
    Instantiates one composer, passing along the extra resource its constructor takes.

    :param composer_class: the composer class to instantiate
    :param unigram_vectors: unigram vectors that every composer receives
    :param pretrained_Baroni_composer_file: model file, required for BaroniComposer
    :param pretrained_Guevara_composer_file: model file, required for GuevaraComposer
    :param pretrained_Gref_composer_file: model file, required for GrefenstetteMultistepComposer
    :param categorical_vector_matrix_file: first constructor argument of CopyObject,
    FrobeniusAdd and FrobeniusMult
    :raise ValueError: if the pre-trained file needed by `composer_class` is None
    :return: the composer instance
    """
    # Explicit checks instead of `assert`, which is stripped when Python runs with -O
    if composer_class == BaroniComposer:
        if pretrained_Baroni_composer_file is None:
            raise ValueError('pretrained_Baroni_composer_file is required to use BaroniComposer')
        return BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
    if composer_class == GuevaraComposer:
        if pretrained_Guevara_composer_file is None:
            raise ValueError('pretrained_Guevara_composer_file is required to use GuevaraComposer')
        return GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
    if composer_class == GrefenstetteMultistepComposer:
        if pretrained_Gref_composer_file is None:
            raise ValueError('pretrained_Gref_composer_file is required to use '
                             'GrefenstetteMultistepComposer')
        return GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
    if composer_class in (CopyObject, FrobeniusAdd, FrobeniusMult):
        # note the different argument order: the matrix file comes first for these composers
        return composer_class(categorical_vector_matrix_file, unigram_vectors)
    return composer_class(unigram_vectors)


def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes, remove_pos=False,
                              pretrained_Baroni_composer_file=None, pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None, categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed
    vector for each of them to disk. The output file will also contain all unigram vectors that were
    passed in, and only unigrams!

    One file named `composed_<short_vector_dataset_name>_<composer.name>.events.filtered.strings`
    is written to `output_dir` per composer. If a composer raises ValueError while composing or
    writing, the error is logged (with traceback) and the remaining composers are still run.

    :param unigram_vectors: a file in Byblo events format that contain vectors for all unigrams OR
    a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param short_vector_dataset_name: identifier of the vector set, used in the output file name
    :param composer_classes: what composers to use
    :type composer_classes: list
    :param remove_pos: forwarded to `get_all_document_features`
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni AN/NN composer file.
    Required if BaroniComposer is in `composer_classes`
    :param pretrained_Guevara_composer_file: pre-trained model file. Required if GuevaraComposer
    is in `composer_classes`
    :param pretrained_Gref_composer_file: pre-trained model file. Required if
    GrefenstetteMultistepComposer is in `composer_classes`
    :param categorical_vector_matrix_file: passed to CopyObject, FrobeniusAdd and FrobeniusMult
    :param output_dir: directory the output files are written to
    :param gzipped: forwarded to `write_vectors_to_disk`; unused when `dense_hd5` is true
    :param dense_hd5: if true, write with `write_vectors_to_hdf` instead of `write_vectors_to_disk`
    :param row_filter: forwarded to `Vectors.from_tsv` when `unigram_vectors` is not a Vectors object
    :raise ValueError: if a requested composer needs a pre-trained file that was not provided
    """
    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)

    # if this isn't a Vectors object assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there's only unigrams in the set of unigram vectors
        # composers do not need any ngram vectors contained in this file, they may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors, row_filter=row_filter)
    logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # loop-invariant: which feature types make it into the sparse events file
    writable_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'}

    def is_writable(feature):
        return feature.type in writable_types

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        # built outside the try block so that a missing pre-trained file is not swallowed below
        composer = _build_composer(composer_class, unigram_vectors,
                                   pretrained_Baroni_composer_file,
                                   pretrained_Guevara_composer_file,
                                   pretrained_Gref_composer_file,
                                   categorical_vector_matrix_file)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                                   composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                # invert the mapping: row index -> parsed feature
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=is_writable,
                                      gzipped=gzipped)
        except ValueError as e:
            # best effort: report the failure and carry on with the next composer
            logging.error('RED ALERT, RED ALERT')
            logging.exception(e)  # same message as before, plus the traceback