def apply_model(in_file, out_dir, model, trained_model, arg_space_files,
                alpha, beta, lambda_, out_format):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    if model is not None:
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)
    model_descr = type(model_obj).__name__

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Applying composition model:%s" % model_descr
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))

    print "Printing..."
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])
    io_utils.save(composed_space, "%s.pkl" % out_file)

    if out_format is not None:
        composed_space.export(out_file, format=out_format)
def save_pkl_files(dsm_prefix, dsm, save_in_one_file=False):
    """
    Save the space to pkl files.
    :param dsm_prefix: the prefix for the output files
    :param dsm: the semantic space
    :param save_in_one_file: whether to save the whole Space as a single pkl file
                             or as separate files (npz for the matrix, pkl for the rest)
    """
    # Save in a single file (for small spaces)
    if save_in_one_file:
        io_utils.save(dsm, dsm_prefix + '.pkl')
    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + 'cooc.npz',
                            data=mat.data, row=mat.row, col=mat.col, shape=mat.shape)
        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)
        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)
        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)
        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)
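# For reference, a minimal loader counterpart for the multi-file layout written
# above. This is a sketch, not part of the original code: it assumes DISSECT's
# SparseMatrix wrapper and the Space(matrix, id2row, id2column) constructor, and
# it mirrors the '<prefix>cooc.npz' / '<prefix>_*.pkl' names used above.
import pickle

import numpy as np
from scipy.sparse import coo_matrix

from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space


def load_pkl_files(dsm_prefix):
    # Rebuild the co-occurrence matrix from the compressed npz archive
    loader = np.load(dsm_prefix + 'cooc.npz')
    mat = coo_matrix((loader['data'], (loader['row'], loader['col'])),
                     shape=tuple(loader['shape']))
    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)
    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)
    # Space rebuilds the *2id dicts from the id2* lists when they are omitted
    return Space(SparseMatrix(mat.tocsr()), id2row, id2column)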
def train_core(rows_file, cols_file, sm_file, ppmi=False, top_features=None,
               svd=None, save_location=None):
    """
    Takes co-occurrence files and trains the model.

    @rows_file : All the entries which label the rows of the matrix.
    @cols_file : All the entries which label the columns of the matrix.
    @sm_file : All the co-occurrence entries in the corpus.
    @ppmi : Whether to apply PPMI weighting.
    @top_features : How many features to keep in total.
                    None signifies that all features are kept.
    @svd : Number of dimensions to reduce to. Not advised though.
           None signifies that no dimensionality reduction is applied.
    """
    global final_model
    core_space = MySpace.xbuild(data=sm_file, rows=rows_file, cols=cols_file,
                                format="sm")
    if ppmi:
        core_space = core_space.apply(PpmiWeighting())
    if top_features:
        core_space = core_space.apply(TopFeatureSelection(int(top_features)))
    if svd:
        core_space = core_space.apply(Svd(int(svd)))
    final_model = core_space
    if save_location:
        io_utils.save(final_model, save_location)
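# Example call (illustrative file names): build a PPMI-weighted space restricted
# to the top 5000 features, reduce it to 300 dimensions, and save it.
train_core("corpus.rows", "corpus.cols", "corpus.sm",
           ppmi=True, top_features=5000, svd=300,
           save_location="core_space.pkl")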
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file,
                regression, crossvalid, intercept, param, param_range,
                export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {
        "weighted_add": WeightedAdditive,
        "full_add": FullAdditive,
        "lexical_func": LexicalFunction,
        "dilation": Dilation
    }
    learner_dict = {
        "ridge": RidgeRegressionLearner,
        "lstsq": LstsqRegressionLearner
    }

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    phrase_space = io_utils.load(phrase_space_file, Space)

    if model not in model_dict:
        raise ValueError("Invalid model:%s for training" % model)

    model_cls = model_dict[model]
    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](
                crossvalidation=crossvalid,
                intercept=intercept,
                param=param,
                param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)
        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
def save_space(space, space_type):
    current_dir = os.getcwd()
    outfile_location = current_dir + '/space'
    if not os.path.exists(outfile_location):
        os.makedirs(outfile_location)
    outfilename = outfile_location + "/" + space_type + ".pkl"
    io_utils.save(space, outfilename)
def print_space(space, out_dir, op_list, out_format):
    ops = [op for op in op_list if op and op != "none"]
    space_descr = ".".join(ops)
    out_file = out_dir + "/" + space_descr

    io_utils.save(space, out_file + ".pkl")
    if out_format is not None:
        space.export(out_file, format=out_format)
def save_pkl_files(dsm, dsm_prefix, save_in_one_file=False, save_as_w2v=False):
    """
    Save semantic space (from DISSECT package) to different formats.
    :param dsm: the semantic space
    :param dsm_prefix: the prefix for the output files
    :param save_in_one_file: whether to save as one file (pkl or w2v) or separate files
                             (npz for matrix and pkl for rows and columns)
    :param save_as_w2v: given save_in_one_file=True, whether to save it in w2v format or pkl
    """
    # Save in a single file (for small spaces)
    if save_in_one_file:
        # w2v export is only useful for dense spaces
        if save_as_w2v:
            rows = np.array(dsm.cooccurrence_matrix.get_mat()).astype(object)
            id2row = np.array([word.decode('utf-8') for word in dsm.get_id2row()])
            r, d = rows.shape
            id2row = id2row.reshape(-1, 1)
            rows = np.concatenate((id2row, rows), axis=1)
            np.savetxt(dsm_prefix + '.w2v', rows,
                       fmt=["%s"] + ['%.16g'] * d, delimiter=' ', newline='\n',
                       header='%d %d' % (r, d), comments='', encoding='utf-8')
        else:
            io_utils.save(dsm, dsm_prefix + '.pkl')
    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + '.npz',
                            data=mat.data, row=mat.row, col=mat.col, shape=mat.shape)
        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)
        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)
        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)
        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)
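# Sketch (not part of the original): reload the .w2v text file written above
# into a dense Space, assuming the single "<rows> <dims>" header line and the
# Space(matrix, id2row, id2column) constructor used elsewhere in this collection.
import io

import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space


def load_w2v_file(path):
    words, vectors = [], []
    with io.open(path, encoding='utf-8') as f_in:
        f_in.readline()  # skip the "<rows> <dims>" header
        for line in f_in:
            parts = line.rstrip('\n').split(' ')
            words.append(parts[0])
            vectors.append([float(x) for x in parts[1:]])
    return Space(DenseMatrix(np.array(vectors)), words, [])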
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format,
                            core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix,
                            raw_per_space.id2row, raw_per_space.row2id)

    print "Printing..."
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")

    if out_format is not None:
        space.export(out_file_prefix, format=out_format)
def main(): """ Compute the PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix. """ # Get the arguments args = docopt( '''Compute the PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix. Usage: apply_ppmi_plmi.py <dsm_prefix> [-p | -l] <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi) Options: -p, --ppmi weight the matrice entries via PPMI -l, --plmi weight the matrice entries via PLMI ''') dsm_prefix = args['<dsm_prefix>'] is_ppmi = args['--ppmi'] is_plmi = args['--plmi'] postfix = "" # Create a space from co-occurrence counts in sparse format dsm = Space.build(data=dsm_prefix + '.sm', rows=dsm_prefix + '.rows', cols=dsm_prefix + '.cols', format='sm') if is_ppmi: # Apply ppmi weighting dsm = dsm.apply(PpmiWeighting()) postfix = "_ppmi" elif is_plmi: # Apply plmi weighting dsm = dsm.apply(PlmiWeighting()) postfix = "_plmi" # Save the Space object in pickle format io_utils.save(dsm, dsm_prefix + postfix + '.pkl')
def write_pkl(self):
    """ Create spaces from co-occurrence counts in sparse format (.sm) """

    # For direction DE-EN
    my_space_1 = Space.build(data=OUTPUT_FILE_DE_DE_EN_SM,
                             rows=OUTPUT_FILE_DE_WORDS_ROW,
                             cols=OUTPUT_FILE_DE_EN_WORDS_COL,
                             format="sm")

    # For direction EN-DE
    my_space_2 = Space.build(data=OUTPUT_FILE_EN_EN_DE_SM,
                             rows=OUTPUT_FILE_EN_WORDS_ROW,
                             cols=OUTPUT_FILE_DE_EN_WORDS_COL,
                             format="sm")

    # Save the space objects in pickle format
    io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
    io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

    print >> stderr, "Pickle file 1 written out:", OUTPUT_FILE_DE_DE_EN_PKL
    print >> stderr, "Pickle file 2 written out:", OUTPUT_FILE_EN_EN_DE_PKL
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('spaces_dir')
    ap.add_argument('words_list_dir')
    args = ap.parse_args()

    spaces_dir = args.spaces_dir
    words_list_dir = args.words_list_dir
    # '/mnt/8tera/shareclic/lucaNgrams/5grams/ITA_5grams/matrices/pkl_matrices/'
    # space_filename = '../spaces/cbow1_wind5_hs0_neg10_size400_smpl1e-05.pkl'

    output_dir = os.path.join('output', os.path.basename(words_list_dir))
    mkdir_p(output_dir)

    all_words = set(l.strip()
                    for words_filename in glob.glob(os.path.join(words_list_dir, '*'))
                    for l in file(words_filename))

    for words_filename in glob.glob(os.path.join(words_list_dir, '*')):
        space_filename = os.path.join(
            spaces_dir,
            os.path.splitext(os.path.basename(words_filename))[0] + '.pkl')
        if not os.path.isfile(space_filename):
            logging.error('{0} not found: ignoring'.format(space_filename))
            continue

        context_filename = hashlib.md5(spaces_dir).hexdigest() + '.txt'
        context_words = load_context_vocab(context_filename, spaces_dir)

        logging.debug('Processing {0}'.format(space_filename))
        sp = io_utils.load(space_filename)
        # words = [l.strip() for l in file(words_filename)]
        filtered_words = [w for w in all_words if w in sp.row2id]
        words_vectors = sp.get_rows(filtered_words)
        context_vectors = sp.get_rows(context_words)
        m = words_vectors * context_vectors.transpose()
        sp2 = Space(m, filtered_words, context_words)
        io_utils.save(sp2, os.path.join(output_dir, os.path.basename(space_filename)))
# (fragment: `words` holds one whitespace-split recipe line, name first)
    recipes[words[0]] = words[1:]
    if len(words) - 1 > max_size:
        max_size = len(words) - 1

WA = WeightedAdditive(alpha=1, beta=1)
last_space = None
number = count()
for size in xrange(max_size, 1, -1):
    relevant = (rec for rec in recipes if len(recipes[rec]) == size)
    print(size)
    composition = []
    for recipe in relevant:
        old = recipes[recipe]
        if size == 2:
            name = recipe
        else:
            name = "comp_" + str(next(number))
        if old[-2] in stacked_space.id2row:
            composition.append((old[-1], old[-2], name))
            recipes[recipe].pop(-1)
            recipes[recipe].pop(-1)
            recipes[recipe].append(name)
        else:
            recipes[recipe].pop(-2)
    if composition:
        last_space = WA.compose(composition, stacked_space)
        if size != 2:
            stacked_space = Space.vstack(stacked_space, last_space)

io_utils.save(last_space, "recicomp.pkl")
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print my_space.id2row
print my_space.cooccurrence_matrix

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha = 1, beta = 1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")],
                                 my_space)
print composed_space.id2row
print composed_space.cooccurrence_matrix

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print my_space.cooccurrence_matrix

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print my_space2.cooccurrence_matrix
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

holspace = Space.build(data="/home/luka/ThLi/cooccurrence/spm1.sm",
                       rows="/home/luka/ThLi/cooccurrence/rows1.rows",
                       cols="/home/luka/ThLi/cooccurrence/cols1.cols",
                       format="sm")

#%%
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#%%
from composes.utils import io_utils
io_utils.save(holspace, "/home/luka/ThLi/cooccurrence/weighted")

#%%
holspace.export("/home/luka/ThLi/cooccurrence/weighted_sm", format="sm")
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")
print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al.'s multistep regression VO/SVO model.
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    #             entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file, entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file, entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))
    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(),
                               min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
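# The two composers written above can be reloaded later with io_utils, mirroring
# how trained models are loaded elsewhere in this collection (sketch; the paths
# assume root_dir was the current directory):
from composes.composition.lexical_function import LexicalFunction
from composes.utils import io_utils

vo_model = io_utils.load('vo_comp.pkl', LexicalFunction)
v_model = io_utils.load('svo_comp.pkl', LexicalFunction)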
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical
                                         # to those in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
import sys

from composes.semantic_space.space import Space
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.utils import io_utils

print("Loading ingredient space...", end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = Space.build(data="../corpus_collection/corpus.sm",
                        rows="../corpus_collection/corpus.rows",
                        cols="../corpus_collection/corpus.cols",
                        format="sm")
print("done.", file=sys.stderr)
io_utils.save(gastrovec, "gastrovec.pkl")

print("Applying PPMI... ", end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(PpmiWeighting())
print("Applying SVD (20)... ", end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(Svd(20))
print("done.", file=sys.stderr)
io_utils.save(gastrovec, "gastrovec.ppmi.svd20.pkl")

print("Loading recipe peripheral space...", end="", file=sys.stderr)
sys.stderr.flush()
recipes = PeripheralSpace.build(gastrovec,
                                data="../corpus_collection/recipes.sm",
if __name__ == '__main__':
    # set constants
    data_path = sys.argv[0] + "/" + sys.argv[1] + "_"
    log_file = data_path + "all.log"
    core_cooccurrence_file = data_path + "GemmaData_sm"
    core_row_file = data_path + "GemmaData_rows"
    core_col_file = data_path + "GemmaData_cols"
    core_space_file = data_path + "core.pkl"

    # config log file
    log_utils.config_logging(log_file)

    print "Building semantic space from co-occurrence counts"
    core_space = Space.build(data=core_cooccurrence_file,
                             rows=core_row_file,
                             cols=core_col_file,
                             format="sm")

    print "Applying ppmi weighting"
    core_space = core_space.apply(PpmiWeighting())

    print "Applying feature selection"
    core_space = core_space.apply(TopFeatureSelection(5000))

    print "Applying svd 100"
    core_space = core_space.apply(Svd(100))

    print "Saving the semantic space"
    io_utils.save(core_space, core_space_file)

    #print "Finding 10 neighbors of " + sys.argv[1]
    #neighbors = core_space.get_neighbours(sys.argv[1], 10, CosSimilarity())
    #print neighbors
#ex11.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha = 1, beta = 1)

#save it to pickle
io_utils.save(my_comp, "./data/out/model01.pkl")

#print its parameters
my_comp.export("./data/out/model01.params")
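# The pickled model can be loaded back and reused later, mirroring apply_model
# above (sketch; assumes WeightedAdditive exposes its alpha/beta parameters):
my_comp2 = io_utils.load("./data/out/model01.pkl", WeightedAdditive)
print my_comp2.alpha, my_comp2.beta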
# Uses dissect toolkit to import the sparse matrix from sort-cooccur-matrix.
# Applies ppmi weighting and exports the result to ./cooccurrence/weighted/
# Note that this file is in python 2, not 3.

import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

#pathnames
path = '/home/luka/ThLi/cooccurrence/'

#import matrix
holspace = Space.build(data=path + "spm1.sm",
                       rows=path + "rows1.rows",
                       cols=path + "cols1.cols",
                       format="sm")

#apply ppmi weighting
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#export matrix
from composes.utils import io_utils
io_utils.save(holspace, path + "weighted")
holspace.export(path + "weighted_sm", format="sm")
input_file = "vectors.bin" output_file = "vectors.pkl" argv = sys.argv[1:] def readWord(i_stream): word = "" c = i_stream.read(1) while (c != ' ' and c != '\n'): word = word + c c = i_stream.read(1) return word with open(input_file, 'r') as i_stream: vocab_size = int(readWord(i_stream)) vector_size = int(readWord(i_stream)) print vocab_size, vector_size cooc_mat = np.zeros((vocab_size, vector_size)) vocabs = [] for i in range(vocab_size): vocabs.append(readWord(i_stream)) for j in range(vector_size): cooc_mat[i, j] = struct.unpack("<f", i_stream.read(4))[0] i_stream.read(1) print cooc_mat[0, 0], cooc_mat[0, 1] print cooc_mat[1, 0], cooc_mat[1, 1] space = Space(DenseMatrix(cooc_mat), vocabs, []) io_utils.save(space, output_file)
try:
    my_space = io_utils.load("my_space.pkl")
except FileNotFoundError:
    my_space = Space.build(data="./data/in/spacew.sm",
                           rows="./data/in/spacew.rows",
                           cols="./data/in/spacew.cols",
                           format="sm")
    print("Applying PPMI...")
    my_space = my_space.apply(PpmiWeighting())
    print("Applying SVD...")
    my_space = my_space.apply(Svd(350))
    io_utils.save(my_space, "my_space.pkl")

print("Loading pairs...")
with open('./data/out/dpair.pkl', 'rb') as f:
    pair_data = pickle.load(f)

train_data = []
vocab = set(my_space.id2row)
for tup in pair_data:
    if tup[1] in vocab and tup[2] in vocab:
        train_data.append(tup)

'''
try:
    with open('temp_func.pkl', 'rb') as file:
        print("Loading model...")
        my_comp = pickle.load(file)
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="../data/" + sys.argv[1] + ".sm",
                       rows="../data/" + sys.argv[1] + ".rows",
                       cols="../data/" + sys.argv[1] + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#export the space in dense format and pkl format
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd
import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="../data/" + sys.argv[1] + ".sm",
                       rows="../data/" + sys.argv[1] + ".rows",
                       cols="../data/" + sys.argv[1] + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#apply svd reduction
my_space = my_space.apply(Svd(1500))

#export the space in dense format and pkl format
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
#Convert .dm file to .pkl
#Usage: python dm2pkl bnc.dm
from composes.semantic_space.space import Space
from composes.utils import io_utils
import sys

space = Space.build(data=sys.argv[1], format='dm')
name = sys.argv[1][0:-3]
io_utils.save(space, name + ".pkl")
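#Hypothetical counterpart: convert .pkl back to .dm with Space.export,
#as in the export calls above.
#Usage: python pkl2dm bnc.pkl
from composes.semantic_space.space import Space
from composes.utils import io_utils
import sys

space = io_utils.load(sys.argv[1], Space)
name = sys.argv[1][0:-4]
space.export(name, format='dm')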