def embed(file='data/mml.txt'):
    """Embed the strings listed in a text file into two dimensions.

    Every line of ``file`` is read and stripped of trailing whitespace.
    The resulting strings are wrapped in ``StringCharFeatures`` (with the
    ``DNA`` alphabet constant), compared through a ``KernelDistance``
    built on a ``WeightedDegreeStringKernel``, and that distance is handed
    to ``MultidimensionalScaling`` with a target dimension of 2.

    Args:
        file: Path of the text file holding one string per line.

    Returns:
        A tuple ``(matrix, strings)`` where ``matrix`` is the feature
        matrix of the embedding and ``strings`` is the list of lines that
        were embedded, in file order.
    """
    with open(file) as handle:
        strings = [line.rstrip() for line in handle]

    string_features = StringCharFeatures(strings, DNA)
    wd_kernel = WeightedDegreeStringKernel(10)

    # The distance is computed between the string set and itself.
    kernel_distance = KernelDistance(1.0, wd_kernel)
    kernel_distance.init(string_features, string_features)

    mds = MultidimensionalScaling()
    mds.set_target_dim(2)

    embedded = mds.embed_distance(kernel_distance)
    return embedded.get_feature_matrix(), strings
def converter_multidimensionalscaling_modular(data_fname):
    """Check that 2-D multidimensional scaling preserves pairwise distances.

    Real-valued features are loaded from the CSV file ``data_fname`` and
    embedded into two dimensions with ``MultidimensionalScaling`` (landmark
    mode switched off). The Euclidean distance matrices of the data before
    and after the embedding are then compared.

    Args:
        data_fname: Path of the CSV file containing the input features.

    Returns:
        ``True`` when the norm of the difference between the two distance
        matrices, divided by the norm of the original distance matrix, is
        below ``1e-6``; ``False`` otherwise. ``None`` is returned (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        import numpy
        from modshogun import (
            RealFeatures,
            MultidimensionalScaling,
            EuclideanDistance,
            CSVFile,
        )

        original = RealFeatures(CSVFile(data_fname))

        dist_original = EuclideanDistance()
        dist_original.init(original, original)

        mds = MultidimensionalScaling()
        mds.set_target_dim(2)
        mds.set_landmark(False)
        embedded = mds.apply(original)

        dist_embedded = EuclideanDistance()
        dist_embedded.init(embedded, embedded)

        matrix_embedded = dist_embedded.get_distance_matrix()
        matrix_original = dist_original.get_distance_matrix()

        # Relative change of the distance matrix caused by the embedding.
        residual = numpy.linalg.norm(matrix_embedded - matrix_original)
        scale = numpy.linalg.norm(matrix_original)
        return residual / scale < 1e-6
    except ImportError:
        print('No Eigen3 available')
def embed(file='mml.pickle', N=500):
    """Embed a random sample of positively labelled strings into 2-D.

    ``load(file)`` must return a mapping with an ``'examples'`` sequence
    and a parallel ``'labels'`` sequence. Indices whose label is greater
    than zero are collected, up to ``N`` of them are drawn at random
    without replacement, and the corresponding examples are embedded with
    ``MultidimensionalScaling`` using a ``KernelDistance`` built on a
    ``WeightedDegreeStringKernel``.

    Args:
        file: Path handed to ``load``.
        N: Number of positive examples to sample. When fewer than ``N``
            positives exist, all of them are used instead of raising
            ``ValueError``.

    Returns:
        A tuple ``(matrix, strings)`` where ``matrix`` is the feature
        matrix of the embedding and ``strings`` is the list of sampled
        examples, in sampling order.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('%s reading %s' % (datetime.datetime.now(), file))
    file_contents = load(file)
    print('%s there are %d strings in %s' % (
        datetime.datetime.now(), len(file_contents['examples']), file))

    # random.sample() needs a real sequence: Python 3 rejects numpy arrays,
    # so convert the index array to a plain list of ints first.
    positives = numpy.where(
        numpy.array(file_contents['labels']) > 0)[0].tolist()

    # Never ask for more items than there are positives.
    sample_size = min(N, len(positives))
    selected_idxs = random.sample(positives, sample_size)
    strings = [file_contents['examples'][i] for i in selected_idxs]

    features = StringCharFeatures(strings, DNA)
    kernel = WeightedDegreeStringKernel(10)
    distance = KernelDistance(1.0, kernel)
    distance.init(features, features)

    converter = MultidimensionalScaling()
    converter.set_target_dim(2)
    return converter.embed_distance(distance).get_feature_matrix(), strings
def embed(file='mml.pickle', N=1000):
    """Embed the first positively labelled strings of a data set into 2-D.

    ``load(file)`` must return a mapping with an ``'examples'`` sequence
    and a parallel ``'labels'`` sequence. Examples whose label is greater
    than ``0.0`` are collected in order until ``N`` have been gathered,
    then embedded with ``MultidimensionalScaling`` using a
    ``KernelDistance`` built on a ``WeightedDegreeStringKernel``.

    Args:
        file: Path handed to ``load``.
        N: Maximum number of positive examples to keep. Defaults to 1000,
            the cap that used to be hard-coded.

    Returns:
        A tuple ``(matrix, strings)`` where ``matrix`` is the feature
        matrix of the embedding and ``strings`` is the list of selected
        examples.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('%s reading %s' % (datetime.datetime.now(), file))
    file_contents = load(file)
    print('%s there are %d strings in %s' % (
        datetime.datetime.now(), len(file_contents['examples']), file))

    strings = []
    for i, label in enumerate(file_contents['labels']):
        # Stop scanning as soon as the cap is reached instead of walking
        # through the remaining labels for nothing.
        if len(strings) >= N:
            break
        if label > 0.0:
            strings.append(file_contents['examples'][i])

    features = StringCharFeatures(strings, DNA)
    kernel = WeightedDegreeStringKernel(10)
    distance = KernelDistance(1.0, kernel)
    distance.init(features, features)

    converter = MultidimensionalScaling()
    converter.set_target_dim(2)
    return converter.embed_distance(distance).get_feature_matrix(), strings
def embed(file='mml.pickle', N=1000):
    """Compute a 2-D embedding of the leading positive examples in ``file``.

    The object returned by ``load(file)`` is indexed with ``'examples'``
    and ``'labels'``. Walking the labels in order, each example whose
    label exceeds ``0.0`` is kept until ``N`` examples have been
    collected. The kept strings are turned into ``StringCharFeatures``,
    compared with a ``KernelDistance`` over a
    ``WeightedDegreeStringKernel``, and passed to
    ``MultidimensionalScaling`` with a target dimension of 2.

    Args:
        file: Path handed to ``load``.
        N: Upper bound on the number of positive examples used. The
            default of 1000 matches the limit that was previously fixed
            in the code.

    Returns:
        A tuple ``(matrix, strings)``: the feature matrix of the embedding
        and the list of examples that were embedded.
    """
    # print(...) with one pre-formatted argument behaves identically under
    # Python 2 and Python 3.
    print('%s reading %s' % (datetime.datetime.now(), file))
    file_contents = load(file)
    print('%s there are %d strings in %s' % (
        datetime.datetime.now(), len(file_contents['examples']), file))

    strings = []
    for i, label in enumerate(file_contents['labels']):
        if len(strings) >= N:
            # Cap reached: no need to look at the remaining labels.
            break
        if label > 0.0:
            strings.append(file_contents['examples'][i])

    features = StringCharFeatures(strings, DNA)
    kernel = WeightedDegreeStringKernel(10)
    distance = KernelDistance(1.0, kernel)
    distance.init(features, features)

    converter = MultidimensionalScaling()
    converter.set_target_dim(2)
    return converter.embed_distance(distance).get_feature_matrix(), strings
def converter_multidimensionalscaling_modular(data_fname):
    """Verify that embedding CSV data with MDS keeps Euclidean distances.

    The features read from ``data_fname`` are mapped to two dimensions by
    ``MultidimensionalScaling`` with landmarks disabled. The pairwise
    Euclidean distance matrix of the embedding is compared against the
    one of the input features.

    Args:
        data_fname: Path of the CSV file with the input features.

    Returns:
        A boolean telling whether the relative deviation between the two
        distance matrices (norm of the difference over the norm of the
        input distance matrix) is smaller than ``1e-6``. If an
        ``ImportError`` occurs, a message is printed and ``None`` is
        returned.
    """
    try:
        import numpy
        from modshogun import RealFeatures, MultidimensionalScaling
        from modshogun import EuclideanDistance, CSVFile

        input_features = RealFeatures(CSVFile(data_fname))

        input_distance = EuclideanDistance()
        input_distance.init(input_features, input_features)

        scaler = MultidimensionalScaling()
        scaler.set_target_dim(2)
        scaler.set_landmark(False)
        output_features = scaler.apply(input_features)

        output_distance = EuclideanDistance()
        output_distance.init(output_features, output_features)

        after = output_distance.get_distance_matrix()
        before = input_distance.get_distance_matrix()

        relative_error = (
            numpy.linalg.norm(after - before) / numpy.linalg.norm(before)
        )
        return relative_error < 1e-6
    except ImportError:
        print('No Eigen3 available')