Example #1
def apply_model(in_file, out_dir, model, trained_model, arg_space_files,
                alpha, beta, lambda_, out_format):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1] 
    
    if not model is None: 
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)
        
    model_descr = type(model_obj).__name__
     
    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)
    
    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])
    
    print "Applying composition model:%s" % model_descr
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))
    
    print "Printing..."
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])    
    io_utils.save(composed_space, "%s.pkl" % out_file)
    
    if out_format is not None:
        composed_space.export(out_file, format=out_format)
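
For reference, a hypothetical invocation of apply_model; every path and value below is a placeholder, not taken from the original project:

#load a previously trained model and compose the phrases listed in the input file
apply_model("phrases.txt", "out", None, "trained_model.pkl",
            ["noun_space.pkl"], alpha=None, beta=None, lambda_=None,
            out_format="dm")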
Example #2
def save_pkl_files(dsm_prefix, dsm, save_in_one_file=False):
    """
    Save the space to separate pkl files.
    :param dsm_prefix:
    :param dsm:
    """
    
    # Save in a single file (for small spaces)
    if save_in_one_file:
        io_utils.save(dsm, dsm_prefix + '.pkl')

    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + 'cooc.npz', data=mat.data, row=mat.row, col=mat.col, shape=mat.shape)

        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)

        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)

        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)

        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)
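
For reference, a minimal counterpart that loads these files back into a Space; this is a sketch that assumes DISSECT's Space(matrix, id2row, id2column) constructor and SparseMatrix wrapper, and it mirrors the file names used above (including the npz name dsm_prefix + 'cooc.npz', written without a separator, exactly as in the save code):

import pickle

import numpy as np
from scipy.sparse import coo_matrix

from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space


def load_pkl_files(dsm_prefix):
    # Rebuild the co-occurrence matrix from the compressed npz archive
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        mat = coo_matrix((loader['data'], (loader['row'], loader['col'])),
                         shape=loader['shape']).tocsr()

    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)

    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)

    return Space(SparseMatrix(mat), id2row, id2column)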
Example #3
def train_core(rows_file, cols_file, sm_file, ppmi=False, top_features=None, svd=None, save_location=None):
    """
    Takes co-occurrence files and trains the model.

    @rows_file     : All the entries which label the rows of the matrix.
    @cols_file     : All the entries which label the columns of the matrix.
    @sm_file       : All the co-occurrence entries in the corpus.
    @ppmi          : Whether to apply PPMI weighting.
    @top_features  : Restricts the total number of features to be selected.
                     None means all features are kept.
    @svd           : Target dimensionality if we want to reduce the dimensions
                     (not advised, though). None means the dimensions are not
                     reduced.
    """
    global final_model

    core_space = MySpace.xbuild(data=sm_file, rows=rows_file, 
                                cols=cols_file, format="sm")

    if ppmi:
        core_space = core_space.apply(PpmiWeighting())
    
    if top_features:
        core_space = core_space.apply(TopFeatureSelection(int(top_features)))

    if svd:
        core_space = core_space.apply(Svd(int(svd)))
    
    final_model = core_space

    if save_location:
        io_utils.save(final_model, save_location)
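
A hypothetical call, with placeholder file names:

train_core("corpus.rows", "corpus.cols", "corpus.sm",
           ppmi=True, top_features=2000, svd=300,
           save_location="core_space.pkl")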
Example #4
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file,
                regression, crossvalid, intercept, param, param_range,
                export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {
        "weighted_add": WeightedAdditive,
        "full_add": FullAdditive,
        "lexical_func": LexicalFunction,
        "dilation": Dilation
    }
    learner_dict = {
        "ridge": RidgeRegressionLearner,
        "lstsq": LstsqRegressionLearner
    }

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    phrase_space = io_utils.load(phrase_space_file, Space)

    if model not in model_dict:
        raise ValueError("Invalid model:%s for training" % model)

    model_cls = model_dict[model]
    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](
                crossvalidation=crossvalid,
                intercept=intercept,
                param=param,
                param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)

        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
Example #5
def apply_model(in_file, out_dir, model, trained_model, arg_space_files, alpha,
                beta, lambda_, out_format):

    print("Reading in data...")
    in_descr = in_file.split("/")[-1]

    if model is not None:
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)

    model_descr = type(model_obj).__name__

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print("Applying composition model:%s" % model_descr)
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))

    print("Printing...")
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])
    io_utils.save(composed_space, "%s.pkl" % out_file)

    if out_format is not None:
        composed_space.export(out_file, format=out_format)
Example #6
def save_space(space, space_type):
    current_dir = os.getcwd()
    outfile_location = current_dir + '/space'
    if not os.path.exists(outfile_location):
        os.makedirs(outfile_location)

    outfilename = outfile_location + "/" + space_type + ".pkl"
    io_utils.save(space, outfilename)
Example #7
def print_space(space, out_dir, op_list, out_format):

    ops = [op for op in op_list if op and op != "none"]
    space_descr = ".".join(ops)
    out_file = out_dir + "/" + space_descr
    
    io_utils.save(space, out_file + ".pkl")
    if out_format is not None:
        space.export(out_file, format=out_format)
Example #8
def print_space(space, out_dir, op_list, out_format):

    ops = [op for op in op_list if op and op != "none"]
    space_descr = ".".join(ops)
    out_file = out_dir + "/" + space_descr

    io_utils.save(space, out_file + ".pkl")
    if out_format is not None:
        space.export(out_file, format=out_format)
Example #9
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file, regression,
                crossvalid, intercept, param, param_range, export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {"weighted_add": WeightedAdditive,
                  "full_add": FullAdditive,
                  "lexical_func": LexicalFunction,
                  "dilation": Dilation
                  }
    learner_dict = {"ridge": RidgeRegressionLearner,
                    "lstsq": LstsqRegressionLearner
                    }

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    phrase_space = io_utils.load(phrase_space_file, Space)

    if model not in model_dict:
        raise ValueError("Invalid model:%s for training" % model)

    model_cls = model_dict[model]
    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](crossvalidation=crossvalid,
                                                       intercept=intercept,
                                                       param=param,
                                                       param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)

        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
Example #10
def save_pkl_files(dsm, dsm_prefix, save_in_one_file=False, save_as_w2v=False):
    """
    Save semantic space (from DISSECT package) to different formats.
    :param dsm: the semantic space
    :param dsm_prefix: the prefix for the output files
    :param save_in_one_file: whether to save as one file (pkl or w2v) or separate files (npz for matrix and pkl for rows and columns)
    :param save_as_w2v: given save_in_one_file=True, whether to save it in w2v format or pkl
    """

    # Save in a single file (for small spaces)
    if save_in_one_file:
        # only useful for dense spaces
        if save_as_w2v:
            rows = np.array(dsm.cooccurrence_matrix.get_mat()).astype(object)
            id2row = np.array([word.decode('utf-8') if isinstance(word, bytes)
                               else word for word in dsm.get_id2row()])
            r, d = rows.shape
            id2row = id2row.reshape(-1, 1)
            rows = np.concatenate((id2row, rows), axis=1)
            np.savetxt(dsm_prefix + '.w2v',
                       rows,
                       fmt=["%s"] + [
                           '%.16g',
                       ] * d,
                       delimiter=' ',
                       newline='\n',
                       header='%d %d' % (r, d),
                       comments='',
                       encoding='utf-8')
        else:
            io_utils.save(dsm, dsm_prefix + '.pkl')

    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + '.npz',
                            data=mat.data,
                            row=mat.row,
                            col=mat.col,
                            shape=mat.shape)

        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)

        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)

        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)

        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)
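
Since the .w2v file written above follows the plain-text word2vec format, it can be read back with, for example, gensim (gensim is an assumption here; it is not used elsewhere on this page):

from gensim.models import KeyedVectors

#'dsm_prefix.w2v' stands for whatever prefix was passed to save_pkl_files
vectors = KeyedVectors.load_word2vec_format('dsm_prefix.w2v', binary=False)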
Example #11
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format, core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix, raw_per_space.id2row, raw_per_space.row2id)

    print "Printing..."
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if out_format is not None:
        space.export(out_file_prefix, format=out_format)
Example #12
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format,
                            core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix,
                            raw_per_space.id2row, raw_per_space.row2id)

    print("Printing...")
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if out_format is not None:
        space.export(out_file_prefix, format=out_format)
Example #13
def main():
    """
    Compute the PPMI/PLMI matrix from a co-occurrence matrix; by default, pickle the raw matrix.
    """

    # Get the arguments
    args = docopt(
        '''Compute the PPMI/PLMI matrix from a co-occurrence matrix; by default, pickle the raw matrix.

    Usage:
        apply_ppmi_plmi.py <dsm_prefix> [-p | -l]

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)

    Options:
    -p, --ppmi  weight the matrix entries via PPMI
    -l, --plmi  weight the matrix entries via PLMI

    ''')

    dsm_prefix = args['<dsm_prefix>']
    is_ppmi = args['--ppmi']
    is_plmi = args['--plmi']

    postfix = ""

    # Create a space from co-occurrence counts in sparse format
    dsm = Space.build(data=dsm_prefix + '.sm',
                      rows=dsm_prefix + '.rows',
                      cols=dsm_prefix + '.cols',
                      format='sm')

    if is_ppmi:
        # Apply ppmi weighting
        dsm = dsm.apply(PpmiWeighting())
        postfix = "_ppmi"
    elif is_plmi:
        # Apply plmi weighting
        dsm = dsm.apply(PlmiWeighting())
        postfix = "_plmi"

    # Save the Space object in pickle format
    io_utils.save(dsm, dsm_prefix + postfix + '.pkl')
Example #14
    def write_pkl(self):
        """
        Create spaces from co-occurrence counts in sparse format (.sm)
        """

        # For direction DE-EN
        my_space_1 = Space.build(
            data=OUTPUT_FILE_DE_DE_EN_SM, rows=OUTPUT_FILE_DE_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # For direction EN-DE
        my_space_2 = Space.build(
            data=OUTPUT_FILE_EN_EN_DE_SM, rows=OUTPUT_FILE_EN_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # Save the space objects in pickle format
        io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
        io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

        print >> stderr, "Pickle file 1 written out:", OUTPUT_FILE_DE_DE_EN_PKL
        print >> stderr, "Pickle file 2 written out:", OUTPUT_FILE_EN_EN_DE_PKL
Example #15
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('spaces_dir')
    ap.add_argument('words_list_dir')
    args = ap.parse_args()
    spaces_dir = args.spaces_dir
    words_list_dir = args.words_list_dir
    #    '/mnt/8tera/shareclic/lucaNgrams/5grams/ITA_5grams/matrices/pkl_matrices/'
    #space_filename = '../spaces/cbow1_wind5_hs0_neg10_size400_smpl1e-05.pkl'

    output_dir = os.path.join('output', os.path.basename(words_list_dir))
    mkdir_p(output_dir)
    all_words = set(l.strip() for words_filename in glob.glob(os.path.join(words_list_dir, '*'))
        for l in open(words_filename))

    for words_filename in glob.glob(os.path.join(words_list_dir, '*')):
        space_filename = os.path.join(spaces_dir,
            os.path.splitext(os.path.basename(words_filename))[0] + '.pkl')
        if not os.path.isfile(space_filename):
            logging.error('{0} not found: ignoring'.format(space_filename))
            continue

        context_filename = hashlib.md5(spaces_dir.encode('utf-8')).hexdigest() + '.txt'
        context_words = load_context_vocab(context_filename, spaces_dir)

        logging.debug('Processing {0}'.format(space_filename))
        sp = io_utils.load(space_filename)

        #words = [l.strip() for l in file(words_filename)]
        filtered_words = [w for w in all_words if w in sp.row2id]
        words_vectors = sp.get_rows(filtered_words)
        context_vectors = sp.get_rows(context_words)

        m = words_vectors * context_vectors.transpose()

        sp2 = Space(m, filtered_words, context_words)

        io_utils.save(sp2,
            os.path.join(output_dir,os.path.basename(space_filename)))
Example #16
        # snippet begins mid-loop: `recipes` maps each recipe name to its ingredient list
        recipes[words[0]] = words[1:]
        if len(words) - 1 > max_size:
            max_size = len(words) - 1

WA = WeightedAdditive(alpha=1, beta=1)
last_space = None
number = count()
for size in range(max_size, 1, -1):
    relevant = (rec for rec in recipes if len(recipes[rec]) == size)
    print(size)
    composition = []
    for recipe in relevant:
        old = recipes[recipe]
        if size == 2:
            name = recipe
        else:
            name = "comp_" + str(next(number))
        if old[-2] in stacked_space.id2row:
            composition.append((old[-1], old[-2], name))
            recipes[recipe].pop(-1)
            recipes[recipe].pop(-1)
            recipes[recipe].append(name)
        else:
            recipes[recipe].pop(-2)
    if composition:
        last_space = WA.compose(composition, stacked_space)
        if size != 2:
            stacked_space = Space.vstack(stacked_space, last_space)

io_utils.save(last_space, "recicomp.pkl")
Example #17
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")], 
                                 my_space)

print(composed_space.id2row)
print(composed_space.cooccurrence_matrix)

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")


Example #18
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print(my_space.cooccurrence_matrix)

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print(my_space2.cooccurrence_matrix)
Example #19
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print(my_space.cooccurrence_matrix)

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print(my_space2.cooccurrence_matrix)

Example #20
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)
    
from composes.semantic_space.space import Space

holspace = Space.build(data="/home/luka/ThLi/cooccurrence/spm1.sm",
                       rows="/home/luka/ThLi/cooccurrence/rows1.rows",
                       cols="/home/luka/ThLi/cooccurrence/cols1.cols",
                       format="sm")

#%%

from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#%%

from composes.utils import io_utils
io_utils.save(holspace, "/home/luka/ThLi/cooccurrence/weighted")

#%%

holspace.export("/home/luka/ThLi/cooccurrence/weighted_sm", format="sm")
Example #21
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting


#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
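
A possible follow-up query on the weighted space (a sketch: CosSimilarity and get_neighbours also appear in other examples on this page; "book" is just a placeholder row):

from composes.similarity.cos import CosSimilarity

#find the 5 nearest neighbours of a row in the ppmi-weighted space
print(my_space.get_neighbours("book", 5, CosSimilarity()))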

Example #22
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    # entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")

    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
Example #23
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
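
A possible follow-up, sketched with the names from the function above (not part of the original): a trained composer can be applied like the other composition models on this page:

composed_space = baroni.compose(all_data, my_space)
io_utils.save(composed_space, join(ROOT_DIR, 'composed_NPs.pkl'))  # hypothetical output path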
Example #24
import sys
from composes.semantic_space.space import Space
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.utils import io_utils

print("Loading ingredient space...",end="",file=sys.stderr)
sys.stderr.flush()
gastrovec = Space.build(data="../corpus_collection/corpus.sm",
                        rows="../corpus_collection/corpus.rows",
                        cols="../corpus_collection/corpus.cols",
                        format="sm")
print("done.", file=sys.stderr)

io_utils.save(gastrovec, "gastrovec.pkl")

print("Applying PPMI... ",end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(PpmiWeighting())
print("Applying SVD (20)... ",end="",file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(Svd(20))
print("done.", file=sys.stderr)

io_utils.save(gastrovec, "gastrovec.ppmi.svd20.pkl")

print("Loading recipe peripheral space...",end="",file=sys.stderr)
sys.stderr.flush()
recipes = PeripheralSpace.build(gastrovec,
                                  data = "../corpus_collection/recipes.sm",
Example #25
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")],
                                 my_space)

print(composed_space.id2row)
print(composed_space.cooccurrence_matrix)

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")


Example #26
if __name__ == '__main__':
    # set constants
    data_path = sys.argv[0] + "/" + sys.argv[1] + "_"

    log_file = data_path + "all.log"
    core_cooccurrence_file = data_path + "GemmaData_sm"
    core_row_file = data_path + "GemmaData_rows"
    core_col_file = data_path + "GemmaData_cols"
    core_space_file = data_path + "core.pkl"
    
    # config log file
    log_utils.config_logging(log_file)
    
    print "Building semantic space from co-occurrence counts"
    core_space = Space.build(data=core_cooccurrence_file, rows=core_row_file,
                             cols=core_col_file, format="sm")
    
    print "Applying ppmi weighting"
    core_space = core_space.apply(PpmiWeighting())
    print "Applying feature selection"
    core_space = core_space.apply(TopFeatureSelection(5000))
    print "Applying svd 500"
    core_space = core_space.apply(Svd(100))
    
    print "Saving the semantic space"
    io_utils.save(core_space, core_space_file)
    
    #print "Finding 10 neighbors of " + sys.argv[1]
    #neighbors = core_space.get_neighbours(sys.argv[1], 10, CosSimilarity())
    #print neighbors
Example #27
#ex11.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

#save it to pickle
io_utils.save(my_comp, "./data/out/model01.pkl")

#print its parameters
my_comp.export("./data/out/model01.params")
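
A small follow-up sketch (assuming WeightedAdditive exposes its alpha and beta parameters): the pickled model can be loaded back the same way spaces are loaded elsewhere on this page:

#load the model back from pickle
my_comp2 = io_utils.load("./data/out/model01.pkl", WeightedAdditive)
print(my_comp2.alpha, my_comp2.beta)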

Example #28
# Uses the dissect toolkit to import the sparse matrix from sort-cooccur-matrix.
# Applies ppmi weighting and exports the result to ./cooccurrence/weighted/
# Note that this file is in python 2, not 3.

import sys
import os
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)
from composes.semantic_space.space import Space

#pathnames
path = '/home/luka/ThLi/cooccurrence/'

#import matrix
holspace = Space.build(data=path + "spm1.sm",
                       rows=path + "rows1.rows",
                       cols=path + "cols1.cols",
                       format="sm")

#apply ppmi weighting
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#export matrix
from composes.utils import io_utils
io_utils.save(holspace, path + "weighted")
holspace.export(path + "weighted_sm", format="sm")
input_file = "vectors.bin"
output_file = "vectors.pkl"

argv = sys.argv[1:]


def readWord(i_stream):
    word = ""
    c = i_stream.read(1)
    while (c != ' ' and c != '\n'):
        word = word + c
        c = i_stream.read(1)
    return word


with open(input_file, 'r') as i_stream:
    vocab_size = int(readWord(i_stream))
    vector_size = int(readWord(i_stream))
    print vocab_size, vector_size
    cooc_mat = np.zeros((vocab_size, vector_size))
    vocabs = []
    for i in range(vocab_size):
        vocabs.append(readWord(i_stream))
        for j in range(vector_size):
            cooc_mat[i, j] = struct.unpack("<f", i_stream.read(4))[0]
        i_stream.read(1)
    print cooc_mat[0, 0], cooc_mat[0, 1]
    print cooc_mat[1, 0], cooc_mat[1, 1]
    space = Space(DenseMatrix(cooc_mat), vocabs, [])
    io_utils.save(space, output_file)
Example #30
import pickle

from composes.semantic_space.space import Space
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.utils import io_utils

try:
    my_space = io_utils.load("my_space.pkl")
except FileNotFoundError:

    my_space = Space.build(data="./data/in/spacew.sm",
                           rows="./data/in/spacew.rows",
                           cols="./data/in/spacew.cols",
                           format="sm")

    print("Applying PPMI...")
    my_space = my_space.apply(PpmiWeighting())

    print("Applying SVD...")
    my_space = my_space.apply(Svd(350))
    io_utils.save(my_space, "my_space.pkl")

print("Loading pairs...")
with open('./data/out/dpair.pkl', 'rb') as f:
    pair_data = pickle.load(f)

train_data = []
vocab = set(my_space.id2row)
for tup in pair_data:
    if tup[1] in vocab and tup[2] in vocab:
        train_data.append(tup)
'''
try:
    with open('temp_func.pkl', 'rb') as file:
        print("Loading model...")
        my_comp = pickle.load(file)
Example #31
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting


#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")

Example #32
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization

import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="../data/" + sys.argv[1] + ".sm",
                       rows="../data/" + sys.argv[1] + ".rows",
                       cols="../data/" + sys.argv[1] + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#export the space in dense format and pkl format
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
Example #33
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd

import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="../data/" + sys.argv[1] + ".sm",
                       rows="../data/" + sys.argv[1] + ".rows",
                       cols="../data/" + sys.argv[1] + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#apply svd reduction
my_space = my_space.apply(Svd(1500))

#export the space in dense format and pkl format
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
Example #34
#Convert .dm file to .pkl
#Usage: python dm2pkl bnc.dm

from composes.semantic_space.space import Space
from composes.utils import io_utils
import sys

space = Space.build(data=sys.argv[1], format='dm')
name = sys.argv[1][0:-3]
io_utils.save(space, name+".pkl")