Example #1
def save(dataset_name, write_separator=";"):
    """
    Compute the mappings <relationship name -> list of types> for a specific dataset, and save them in a file.

    :param write_separator: the separator to use when writing the file
    :param dataset_name: the name of the dataset for which to compute the mappings
    """

    dataset = datasets.Dataset(dataset_name)
    relation_2_types = compute(dataset)

    lines = []
    for relation in sorted(dataset.relationships):
        lines.append(
            write_separator.join(
                [relation, ",".join(relation_2_types[relation])]) + "\n")

    output_filepath = os.path.join(dataset.home, FOLDER, FILENAME)

    print(
        "Writing the mappings <relationship name -> list of types> for %s training set in %s..."
        % (dataset_name, output_filepath))

    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)
Example #2
def save(dataset_name, write_separator=";"):
    """
    Compute the mapping <relation name -> number of mentions>
        from the training set of a specific dataset
        and save it in a specific file in the home folder for that dataset
    :param dataset_name: the name of the dataset to compute and save the mappings for
    :param write_separator: the separator to use when writing the mappings
    """

    dataset = datasets.Dataset(dataset_name)
    relation_2_mentions = compute(dataset)

    lines = []
    for relation in relation_2_mentions:
        lines.append(write_separator.join([relation, str(relation_2_mentions[relation])]) + "\n")

    output_filepath = os.path.join(dataset.home, FOLDER, FILENAME)

    print("Writing the mapping <relation name -> number of mentions> for %s training set in %s..." % (dataset_name, output_filepath))

    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)

#save(datasets.FB15K)
#save(datasets.WN18)
#save(datasets.FB15K_237)
#save(datasets.WN18RR)
# save(datasets.YAGO3_10)
Example #3
    def __init__(
        self,
        *args,
        identifier: Identifier = None,
        dataset_fmt: str = None,
        **kwargs,
    ):

        logger.debug("Creating Dataset.")

        # Internally, keep track of data inside a dataset
        self._dataset = None

        # Set a default dataset_fmt
        if len(args) == 1 and isinstance(args[0], datasets.Dataset):
            # If no `dataset_fmt` is passed in with a datasets.Dataset, just assume
            # `dataset_fmt` should be 'datasets'
            dataset_fmt = "datasets" if dataset_fmt is None else dataset_fmt
        else:
            # Default to `in_memory`, unless a dataset_fmt is passed in explicitly
            dataset_fmt = "in_memory" if dataset_fmt is None else dataset_fmt

        # InMemoryDataset
        if dataset_fmt == "in_memory":
            if len(args) == 1 and isinstance(args[0], InMemoryDataset):
                # Assign the dataset directly
                self._dataset = args[0]
            else:
                # Assign the dataset after converting to an InMemoryDataset
                self._dataset = InMemoryDataset(*args, **kwargs)

        # datasets.Dataset
        elif dataset_fmt == "datasets":
            if len(args) == 1 and isinstance(args[0], datasets.Dataset):
                # Assign the dataset directly
                self._dataset = args[0]
            else:
                # Assign the dataset after converting to a datasets.Dataset
                self._dataset = datasets.Dataset(*args, **kwargs)

        else:
            raise NotImplementedError(
                "`dataset_fmt` must be one of ['in_memory', 'datasets'].")

        # Store the dataset format
        self._dataset_fmt = dataset_fmt

        # Call the InteractionTapeHierarchyMixin constructor
        InteractionTapeHierarchyMixin.__init__(self)

        # Create an identifier
        self._identifier = (self._autobuild_identifier()
                            if not identifier else identifier)

        # Create logging directory
        self._create_logdir()

        # Add an index to the dataset
        if not self.has_index:
            self._add_index()
Example #4
def save(dataset_name, write_separator=";"):
    """
    Compute the mappings
            <entity name -> in degree>
            <entity name -> out degree>
            <entity name -> overall degree>
    and save them in a file.

    :param write_separator: the separator to use when writing the file
    :param dataset_name: the name of the dataset for which to compute the mappings
    """

    dataset = datasets.Dataset(dataset_name)
    entity_in_degrees, entity_out_degrees, entity_degrees = compute(dataset)

    lines = []
    for entity in entity_degrees:
        lines.append(write_separator.join([entity,
                                           str(entity_in_degrees[entity]),
                                           str(entity_out_degrees[entity]),
                                           str(entity_degrees[entity])]) + "\n")

    output_filepath = os.path.join(dataset.home, FOLDER, FILENAME)

    print("Writing the mappings <entity name -> in, out and overall degree> for %s training set in %s..." % (dataset_name, output_filepath))

    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)
Example #5
def elmo_emb_dataset(task: str, split: str):

    dataset = ds.Dataset()
    if task == "secondary_structure":
        dataset = ds.SecondaryStructureDataset(SOURCE_DATA_PATH, split)
    elif task == "remote_homology":
        dataset = ds.RemoteHomologyDataset(SOURCE_DATA_PATH, split)
    elif task == "stability":
        dataset = ds.StabilityDataset(SOURCE_DATA_PATH, split)
    elif task == "fluorescence":
        dataset = ds.FluorescenceDataset(SOURCE_DATA_PATH, split)
    else:
        print("no data set for task " + task)
        return

    # set up file
    filename = task + "_" + split
    filepath = "./elmo/" + task + "/" + filename + ".p"
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # do emb
    print("Embedding dataset " + filename)
    seq_dict = data_set_to_seq_dict(dataset)
    emb_dict = emb.get_embeddings(seq_dict, verbose=True)

    print('Writing embeddings to: {}'.format(filepath))
    with open(filepath, 'wb') as f:
        pickle.dump(emb_dict, f)
Example #6
def load_docs(dataset_name, word_vectors):
    return (datasets.Dataset(dataset_name,
                             model_properties.MentionRankingProps(),
                             word_vectors),
            zip(
                utils.load_pickle(directories.DOCUMENTS + dataset_name +
                                  '_docs.pkl'),
                utils.load_pickle(directories.ACTION_SPACE + dataset_name +
                                  '_action_space.pkl')))
Example #7
    def SetData(self, name, val, symerr=None, negerr=None, poserr=None):
        """Set dataset with name with values (and optionally errors)."""

        data = datasets.Dataset(val, symerr, negerr, poserr)
        op = operations.OperationDatasetSet(name, data)
        self.document.applyOperation(op)

        if self.verbose:
            print "Set dataset '%s':" % name
            print " Values = %s" % str(data.data)
            print " Symmetric errors = %s" % str(data.serr)
            print " Negative errors = %s" % str(data.nerr)
            print " Positive errors = %s" % str(data.perr)
Example #8
    def do(self, document):
        """Create the dataset."""
        OperationDatasetCreate.do(self, document)

        p = self.parts.copy()
        p['parametric'] = self.parametric
        ds = datasets.DatasetExpression(**p)
        ds.document = document

        if not self.link:
            # copy these values if we don't want to link
            ds = datasets.Dataset(data=ds.data,
                                  serr=ds.serr,
                                  perr=ds.perr,
                                  nerr=ds.nerr)

        document.setData(self.datasetname, ds)
        return ds
Example #9
    def setData(self, document, linkedfile=None):
        """Set the read-in datasets in the document."""

        # iterate over each read-in dataset
        dsnames = []

        for name in self.data.iterkeys():
            # skip error data here, they are used below
            # error data name contains \0
            if name.find('\0') >= 0:
                continue

            dsnames.append(name)

            # get data and errors (if any)
            data = []
            for k in (name, name + '\0+-', name + '\0+', name + '\0-'):
                data.append(self.data.get(k, None))

            # make them have a maximum length by adding NaNs
            maxlen = max([len(x) for x in data if x is not None])
            for i in range(len(data)):
                if data[i] is not None and len(data[i]) < maxlen:
                    data[i] = N.concatenate(
                        (data[i], N.zeros(maxlen - len(data[i])) * N.nan))

            # create dataset
            dstype = self.nametypes[name]
            if dstype == 'string':
                ds = datasets.DatasetText(data=data[0], linked=linkedfile)
            elif dstype == 'date':
                ds = datasets.DatasetDateTime(data=data[0], linked=linkedfile)
            else:
                ds = datasets.Dataset(data=data[0],
                                      serr=data[1],
                                      perr=data[2],
                                      nerr=data[3],
                                      linked=linkedfile)

            document.setData(name, ds)

        dsnames.sort()
        return dsnames
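
The padding step above appends NaNs to any shorter error column so that all four arrays handed to datasets.Dataset have the same length. The same NumPy idiom in isolation (a standalone illustration, not part of the original module):

import numpy as np

values = np.array([1.0, 2.0, 3.0, 4.0])
errors = np.array([0.1, 0.2])          # a shorter column read from the file

# pad the shorter column with NaNs up to the longest length
maxlen = max(len(values), len(errors))
errors = np.concatenate((errors, np.zeros(maxlen - len(errors)) * np.nan))

print(errors)                          # [0.1 0.2 nan nan]
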
Example #10
def main():
    parser = argparse.ArgumentParser(description='Behavioral Cloning Training Program')
    parser.add_argument('-d', help='data directory',      dest='data_dir',      type=str,   default='./data/drive_data.csv')
    parser.add_argument('-t', help='train size fraction', dest='train_size',    type=float, default=0.8)
    parser.add_argument('-e', help='number of epochs',    dest='nb_epoch',      type=int,   default=10)
    parser.add_argument('-b', help='batch size',          dest='batch_size',    type=int,   default=64)
    parser.add_argument('-l', help='learning rate',       dest='learning_rate', type=float, default=1.0e-4)
    args = parser.parse_args()
    args = vars(args)

    #print parameters
    print('-' * 30)
    print('Parameters')
    print('-' * 30)
    for key, value in args.items():
        print('{:<20} := {}'.format(key, value))
    print('-' * 30)

    # Set device
    if torch.cuda.is_available():
        print('Using GPU !!!')
        device = torch.device("cuda:0")
        torch.backends.cudnn.benchmark = True
    else:
        print('Using CPU !!!')
        device = torch.device("cpu")

    # Create Dataset
    drivingData = datasets.Dataset(args['data_dir'], Transforms())

    # Split Dataset
    train_size = int(len(drivingData) * args['train_size'])
    training_set, val_set = random_split(drivingData, [train_size, len(drivingData) - train_size])

    trainLoader = DataLoader(training_set, batch_size=args["batch_size"], num_workers=3, shuffle=True)
    valLoader = DataLoader(val_set, batch_size=args["batch_size"], num_workers=3, shuffle=False)

    # Initialize Model
    model = Driver(batch_size=args['batch_size'])

    # Start Training
    train(model, device, lr=args['learning_rate'], epochs=args['nb_epoch'],
          trainingLoader=trainLoader, validationLoader=valLoader)
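
The split above divides the driving data into training and validation subsets by element counts and wraps each in a DataLoader. A minimal standalone sketch of the same random_split pattern on a toy TensorDataset (illustrative only, not part of the original script):

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split

full = TensorDataset(torch.arange(100, dtype=torch.float32).unsqueeze(1))
train_size = int(len(full) * 0.8)
train_set, val_set = random_split(full, [train_size, len(full) - train_size])

train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False)
print(len(train_set), len(val_set))    # 80 20
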
Example #11
    def train(self, args):
        noise = locate('noises.{0}'.format(args.noise))(args)
        dataset = datasets.Dataset(args)

        x_real = dataset.training_data[:args.n_test_data].copy()
        z_test = noise.test_data

        config_proto = tf.ConfigProto()
        config_proto.gpu_options.allow_growth = True

        with tf.Session(config=config_proto) as sess:
            sess.run(tf.global_variables_initializer())
            for step in range(1, args.n_iters + 1):
                # Update discriminator
                x = dataset.get_training_data(args.batch_size)
                z = noise.sample(args.batch_size)
                D_loss, _ = sess.run(
                    [self.D_loss, self.D_opt], {
                        'x_real:0': x,
                        'z_noise:0': z,
                        'train_D:0': True,
                        'train_G:0': True,
                    })

                # Update generator
                z = noise.sample(args.batch_size)
                G_loss, _ = sess.run([self.G_loss, self.G_opt], {
                    'z_noise:0': z,
                    'train_D:0': True,
                    'train_G:0': True,
                })

                if step % args.log_interval == 0:
                    print 'step:{0:>6}, Ld:{1:>9.6f}, Lg:{2:>9.6f}'.format(
                        step, D_loss, G_loss)

                if step % args.plot_interval == 0:
                    x_fake = self.sample_data(args, sess, z_test)
                    plotutil.save_plot(args, step, x_real, x_fake, z_test)
Example #12
    def _import1d(self, hdu):
        """Import 1d data from hdu."""

        data = hdu.data
        datav = None
        symv = None
        posv = None
        negv = None

        # read the columns required
        p = self.params
        if p.datacol is not None:
            datav = data.field(p.datacol)
        if p.symerrcol is not None:
            symv = data.field(p.symerrcol)
        if p.poserrcol is not None:
            posv = data.field(p.poserrcol)
        if p.negerrcol is not None:
            negv = data.field(p.negerrcol)

        # actually create the dataset
        return datasets.Dataset(data=datav, serr=symv, perr=posv, nerr=negv)
Example #13
    def do(self, document):
        """Create dataset using range."""

        OperationDatasetCreate.do(self, document)
        data = self.parts['data']
        serr = self.parts.get('serr', None)
        perr = self.parts.get('perr', None)
        nerr = self.parts.get('nerr', None)

        ds = datasets.DatasetRange(self.numsteps,
                                   data,
                                   serr=serr,
                                   perr=perr,
                                   nerr=nerr)
        if not self.linked:
            # copy these values if we don't want to link
            ds = datasets.Dataset(data=ds.data,
                                  serr=ds.serr,
                                  perr=ds.perr,
                                  nerr=ds.nerr)

        document.setData(self.datasetname, ds)
        return ds
Example #14
    def get_mask(self, geometry, shape):

        # create an ogr datasource
        driver = ogr.GetDriverByName('Memory')
        source = driver.CreateDataSource('')
        sr = osr.SpatialReference(self.projection)
        layer = source.CreateLayer('', sr)
        defn = layer.GetLayerDefn()
        feature = ogr.Feature(defn)
        feature.SetGeometry(geometry)
        layer.CreateFeature(feature)

        # burn where data should be
        mask = np.zeros(shape, dtype='u1')
        geo_transform = self.geo_transform.shifted(geometry)
        kwargs = {
            'geo_transform': geo_transform,
            'projection': self.projection
        }
        with datasets.Dataset(mask, **kwargs) as dataset:
            gdal.RasterizeLayer(dataset, (1, ), layer, burn_values=(1, ))

        return mask.astype('b1').repeat(3, axis=0)
Example #15
def top_k_recommendation(file_with_user_ratings, test_og_df, k):
    name = 'ml_gui'
    data_utility_dir = '../datasets/ml-100k/utility-matrix/'
    results_folder = '../wnmf/'
    ds = datasets.TrainingAndTest('ml_gui')
    ds.training = datasets.Dataset(
        'ml_gui',  #name
        file_with_user_ratings,  #original source
        data_utility_dir + name + '_um.csv',  #utility matrix
        results_folder + name + '_',  #similarity matrix
        'ml',  #data source
        'wnmf',  #algorithm
        'wnmf',
        2,  #latent factors
        26)  #iterations
    # build the test set
    ds.test = datasets.TestSet(
        str(name) + ' test set',  #name
        'gui_user_test.csv',
        'ml')
    ds.build_ml_wnmf_predictions_df(results_folder + str(name) +
                                    '_wnmf_predictions.csv',
                                    cap=False)
    predictions = ds.test.predictions_df.copy()
    recommendations = predictions.sort_values('prediction',
                                              ascending=False).head(k)
    recommendations = list(recommendations['item'])
    movie_dict, titles = movie_titles.get_movie_info()
    recommendations = [movie_dict[i] for i in recommendations]

    #delete source files to ensure not reused
    os.remove('../wnmf/ml_gui__u.csv')
    os.remove('../wnmf/ml_gui__v.csv')
    os.remove('../wnmf/ml_gui_wnmf_predictions.csv')
    os.remove('../datasets/ml-100k/utility-matrix/ml_gui_um.csv')
    # return the list of top-k recommended movie titles
    return recommendations
Example #16
def main():
    params = parser.Parser().get_arguments()
    prod = np.prod(params["image_dims"])
    print(params)

    # Get datasets
    if params["dataset"] == "mnist":
        params["image_dims"] = [32, 32, 1]
        x_train, x_test = datasets.Dataset(params).mnist()
        x_train = x_train.reshape([-1, 32, 32, 1])
        x_test = x_test.reshape([-1, 32, 32, 1])
    elif params["dataset"] == "cifar":
        params["image_dims"] = [32, 32, 3]
        x_train, x_test = datasets.Dataset(params).cifar()
        x_train = x_train.reshape([-1, 32, 32, 3])
        x_test = x_test.reshape([-1, 32, 32, 3])
    elif params["dataset"] == "stl10":
        params["image_dims"] = [96, 96, 3]
        if params["colors_output"] == "rgb":
            x_test = datasets.Dataset(params).stl10(colors="rgb")
            x_test = x_test.reshape([-1, 96, 96, 3])
        elif params["colors_output"] == "cbcr":
            x_test = datasets.Dataset(params).stl10(colors="ycbr")
            x_test = x_test.reshape([-1, 96, 96, 3])

    # Import test image
    test_image = x_test[:10]

    # Get noise
    adv_dict = {
        'psnr_input': 0,
        'rand-linf': 1,
        'rand-l2': 2,
        'linear-linf-1': 3,
        'linear-linf-10': 4,
        'linear-linf-20': 5,
        'linear-l2-1': 6,
        'linear-l2-10': 7,
        'linear-l2-20': 8,
        'quadratic-linf-1': 9,
        'quadratic-linf-10': 10,
        'quadratic-l2-1': 11,
        'quadratic-l2-10': 12,
        'linear-pixel-1': 13,
        'linear-pixel-10': 14,
        'quadratic-pixel-1': 15,
        'rand-pixel-1': 16,
        'rand-pixel-10': 17
    }
    img_mtx = {}

    if params["norm"] == "l2":
        # Epsilon
        epsilon_range = np.array(np.logspace(-0.5, 1.0, 5))  # for l2
        adv_mtx = np.zeros([len(epsilon_range), len(adv_dict)])

        # L2
        adv_mtx[:, adv_dict['psnr_input']] = -20. * np.log10(
            epsilon_range / np.sqrt(prod))  # only true for l2
        adv_mtx[:, adv_dict['rand-l2']], img_mtx['rand-l2'] = \
            adv_noise(test_image, params, epsilon_range, "rand", "l2", False)
        adv_mtx[:, adv_dict['linear-l2-1']], img_mtx['linear-l2-1'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "l2", False, 1)
        adv_mtx[:, adv_dict['linear-l2-10']], img_mtx['linear-l2-10'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "l2", False, 10)
        adv_mtx[:, adv_dict['linear-l2-20']], img_mtx['linear-l2-20'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "l2", False, 20)
        if params['dataset'] == 'mnist' or params['dataset'] == 'cifar':
            adv_mtx[:, adv_dict['quadratic-l2-1']], img_mtx['quadratic-l2-1'] = \
                adv_noise(test_image, params, epsilon_range, "quadratic", "l2", False, 1)
            # adv_mtx[:, adv_dict['quadratic-l2-10']], img_mtx['quadratic-l2-10'] = \
            #     adv_noise(test_image, params, epsilon_range, "quadratic", "l2", True, 10)
            # helpers.save_image_fig(img_mtx, params, adv_mtx[:, adv_dict['psnr_input']],
            #     ["rand-l2", 'linear-l2-1', 'linear-l2-10', 'linear-l2-20', 'quadratic-l2-1', 'quadratic-l2-10'], "l2",
            #                          img_size=params["image_dims"])
        else:
            print('No image plots for l2')
            # helpers.save_image_fig(img_mtx, params, adv_mtx[:, adv_dict['psnr_input']],
            #                          ["rand-l2", 'linear-l2-1', 'linear-l2-10', 'linear-l2-20'],
            #                          "l2", img_size=params["image_dims"])

    elif params["norm"] == "linf":
        # Epsilon
        epsilon_range = np.array(np.logspace(-2, -0.5, 5))  # for linf
        adv_mtx = np.zeros([len(epsilon_range), len(adv_dict)])

        # Linf
        adv_mtx[:, adv_dict['psnr_input']] = -20. * np.log10(epsilon_range)
        adv_mtx[:, adv_dict['rand-linf']], img_mtx['rand-linf'] = \
            adv_noise(test_image, params, epsilon_range, "rand", "linf", False)
        adv_mtx[:, adv_dict['linear-linf-1']], img_mtx['linear-linf-1'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "linf", False, 1)
        adv_mtx[:, adv_dict['linear-linf-10']], img_mtx['linear-linf-10'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "linf", False, 10)
        adv_mtx[:, adv_dict['linear-linf-20']], img_mtx['linear-linf-20'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "linf", False, 20)

        helpers.save_image_fig(
            img_mtx,
            params,
            adv_mtx[:, adv_dict['psnr_input']],
            ["rand-linf", "linear-linf-1", "linear-linf-10", "linear-linf-20"],
            "linf",
            img_size=params["image_dims"])

    elif params["norm"] == "pixel":
        # Epsilon
        epsilon_range = np.array([0.1, 0.2, 0.3, 0.5, 0.7])
        adv_mtx = np.zeros([len(epsilon_range), len(adv_dict)])

        adv_mtx[:, adv_dict['psnr_input']] = epsilon_range  # for the pixel norm, store epsilon itself (not a PSNR)

        adv_mtx[:, adv_dict['rand-pixel-1']], img_mtx['rand-pixel-1'] = \
            adv_noise(test_image, params, epsilon_range, "rand", "pixel", False, 1)

        adv_mtx[:, adv_dict['linear-pixel-1']], img_mtx['linear-pixel-1'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "pixel", False, 1)

        adv_mtx[:, adv_dict['rand-pixel-10']], img_mtx['rand-pixel-10'] = \
            adv_noise(test_image, params, epsilon_range, "rand", "pixel", False, 100)

        adv_mtx[:, adv_dict['linear-pixel-10']], img_mtx['linear-pixel-10'] = \
            adv_noise(test_image, params, epsilon_range, "linear", "pixel", False, 100)

        helpers.save_image_fig(
            img_mtx,
            params,
            adv_mtx[:, adv_dict['psnr_input']],
            ['linear-pixel-1', 'linear-pixel-10', 'quadratic-pixel-1'],
            "pixel",
            img_size=params["image_dims"])

    np.savetxt(params['output_dir'] + '/summary/' + params['model'] +
               '_psnr_summary_' + params['dataset'] + '.csv',
               adv_mtx,
               delimiter=";")
    helpers.save_psnr_fig(adv_mtx,
                          './results/images/' + params['figs_dir'] + '/' +
                          params['model'] + '_fig_' + params['dataset'] + '_' +
                          params["norm"] + '.png',
                          adv_dict,
                          legend=True)
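
In the l2 branch above, the psnr_input column is filled with -20. * np.log10(epsilon_range / np.sqrt(prod)). For images with pixel values in [0, 1] (an assumption about the data scaling), a perturbation whose total l2 norm is epsilon spread over prod pixels has per-pixel RMSE epsilon / sqrt(prod), so its PSNR reduces to exactly that expression, which is why the code marks it "only true for l2". A small standalone check of that identity:

import numpy as np

num_pixels = 32 * 32 * 1                        # prod for the MNIST setting above
epsilon = 1.0                                   # total l2 norm of the perturbation

rng = np.random.default_rng(0)
noise = rng.normal(size=num_pixels)
noise *= epsilon / np.linalg.norm(noise)        # rescale so ||noise||_2 == epsilon

rmse = np.sqrt(np.mean(noise ** 2))             # equals epsilon / sqrt(num_pixels)
psnr_from_rmse = 20.0 * np.log10(1.0 / rmse)    # PSNR with MAX = 1, i.e. pixels assumed in [0, 1]
psnr_direct = -20.0 * np.log10(epsilon / np.sqrt(num_pixels))

print(np.isclose(psnr_from_rmse, psnr_direct))  # True
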
Example #17
def generate_100B_dataset(num_examples: int,
                          chunk_size: int) -> datasets.Dataset:
    table = pa.Table.from_pydict({"col": [0] * chunk_size})
    table = pa.concat_tables([table] * (num_examples // chunk_size))
    return datasets.Dataset(table, fingerprint="table_100B")
Example #18
def run_test(data_source,
             test_source,
             name,
             data,
             algo,
             sim,
             latent_factors,
             iterations,
             rebuild=False,
             sklearn_wnmf=False,
             elementwise=False):
    t1 = time.time()
    results_folder = str(algo)
    data_utility_dir = None
    filetype = None
    if data == 'ml':
        data_utility_dir = 'datasets/ml-100k/utility-matrix/'
        filetype = 'csv'
        ds = datasets.TrainingAndTest(data + ' training/test sets')
        ds.training = datasets.Dataset(
            name,  #name
            data_source,  #original source
            data_utility_dir + str(name) + '_' + str(algo) + '_um.' +
            filetype,  #utility matrix
            results_folder + str(name) + '_' + str(algo) + '_' + str(sim) +
            '_sm.' +
            filetype,  #similarity matrix (or u and v matrices filename after u_ and v_ respectively)
            data,  #data source
            algo,  #algorithm
            sim,
            latent_factors,
            iterations,
            rebuild_files=rebuild,
            sklearn=sklearn_wnmf,
            elwise=elementwise)
        ds.test = datasets.TestSet(
            str(name) + ' test set',  #name
            test_source,
            data)
    elif data == 'yelp':
        data_utility_dir = None
        filetype = None
        um_location = None
        if algo == 'wnmf':
            data_utility_dir = 'datasets/yelp_dataset/'
            filetype = 'csv'
            um_location = data_source
        else:
            data_utility_dir = 'datasets/yelp_dataset/utility-matrix/'
            filetype = 'json'
            um_location = data_utility_dir + 'yelp_review_uc_training_um' + str(
                algo) + '_um.' + filetype
        ds = datasets.TrainingAndTest(data + ' training/test sets',
                                      latent_factors=latent_factors,
                                      iterations=iterations)
        ds.training = datasets.Dataset(
            name,  #name
            data_source,  #original source
            um_location,  #utility matrix
            results_folder + str(name) + '_' + str(algo) + '_' + str(sim) +
            '_sm.' +
            filetype,  #similarity matrix (or u and v matrices filename after u_ and v_ respectively)
            data,  #data source
            algo,  #algorithm
            sim,
            latent_factors,
            iterations)
        ds.test = datasets.TestSet(
            str(name) + ' test set',  #name
            test_source,
            data)

    if algo == 'user' or algo == 'item':
        results_folder += '_similarity/'
    elif algo == 'wnmf':
        results_folder += '/'
    if data == 'ml':
        if algo == 'item':
            ds.build_ml_item_predictions_df('item_similarity/' + str(name) +
                                            '_' + str(algo) + '_' + str(sim) +
                                            '_predictions.csv',
                                            rebuild=True)
        elif algo == 'user':
            ds.build_ml_user_predictions_df('user_similarity/' + str(name) +
                                            '_' + str(algo) + '_' + str(sim) +
                                            '_predictions.csv',
                                            rebuild=True)
        elif algo == 'wnmf':
            if sklearn_wnmf:
                ds.build_ml_wnmf_predictions_df('wnmf/' + str(name) + '_' +
                                                str(algo) + '_' + str(sim) +
                                                '_predictions.csv',
                                                sklearn=True)
            else:
                ds.build_ml_wnmf_predictions_df('wnmf/' + str(name) + '_' +
                                                str(algo) + '_' + str(sim) +
                                                '_predictions.csv')
    elif data == 'yelp':
        if algo == 'item':
            ds.build_yelp_item_predictions_df('item_similarity/' + str(name) +
                                              '_' + str(algo) + '_' +
                                              str(sim) + '_predictions.csv',
                                              rebuild=True)
        elif algo == 'user':
            ds.build_yelp_user_predictions_df('user_similarity/' + str(name) +
                                              '_' + str(algo) + '_' +
                                              str(sim) + '_predictions.csv',
                                              rebuild=True)
        elif algo == 'wnmf':
            ds.build_yelp_wnmf_predictions_df('wnmf/' + str(name) + '_' +
                                              str(algo) + '_' + str(sim) +
                                              '_predictions.csv')
    print('Predictions: ')
    print(ds.test.predictions_df)
    if data == 'ml':
        ds.test.calculate_ml_mae()
        ds.test.calculate_ml_rmse()
    elif data == 'yelp':
        ds.test.calculate_yelp_mae()
        ds.test.calculate_yelp_rmse()
    print("MAE: " + str(ds.test.mae))
    print("RMSE: " + str(ds.test.rmse))
    print('Run time: ' + str(time.time() - t1) + ' sec')
    log_entry = str(
        t1) + ',' + name + ',' + data + ',' + algo + ',' + sim + ',' + str(
            latent_factors) + ',' + str(ds.training.predictor_log) + ',' + str(
                time.time() - t1) + ',' + str(ds.test.mae) + ',' + str(
                    ds.test.rmse) + ',\n'
    return ds, log_entry
Example #19
                args=(i, head, rel, tail, dataset, entity_2_train_facts,
                      entity_pair_2_train_facts, all_paths, all_relations,
                      relation_2_tfidf_vec, path_2_df, return_dict))
            process_list.append(p)
            evaluation_fact_index += 1
            p.start()

        for p in process_list:
            p.join()
        end = time.time()

        print(end - start)

        batch_lines = []
        for key in return_dict.keys():
            batch_lines.append(";".join([
                return_dict[key]["head"], return_dict[key]["relation"],
                return_dict[key]["tail"],
                str(return_dict[key]["head_rank"]),
                str(return_dict[key]["head_ties"]),
                str(return_dict[key]["tail_rank"]),
                str(return_dict[key]["tail_ties"])
            ]) + "\n")
            print(return_dict[key])

        with open("results.csv", "a") as outfile:
            outfile.writelines(batch_lines)


compute(datasets.Dataset(datasets.FB15K))
Example #20
import datasets
from dataset_analysis.degrees import relation_mentions
from dataset_analysis.paths import relation_paths

rel_2_mentions = relation_mentions.read(datasets.FB15K)
rel_2_paths_counts = relation_paths.read(datasets.Dataset(datasets.FB15K))

rel_2_mentions_items = sorted(rel_2_mentions.items(),
                              key=lambda x: x[1],
                              reverse=True)
for relation_and_mentions in rel_2_mentions_items[0:1]:
    relation = relation_and_mentions[0]
    mentions = relation_and_mentions[1]
    print(relation)
    print("Mentions: " + str(mentions))
    path_2_count = rel_2_paths_counts[relation]
    for path in path_2_count:
        print("\t" + path + ":" + str(path_2_count[path]))
    print()
Example #21
result_dir = project_dir + 'Result/'

train_file = 'train_list.txt'
test_file = 'test_list.txt'
result_file = 'result.txt'

input_size = 64
batch_size = 20
max_epochs = 25
class_num = 66

if not os.path.exists(result_dir):
    os.makedirs(result_dir)

if not os.path.exists(train_dir):
    os.makedirs(train_dir)

dataset = datasets.Dataset(train_dir,
                           train_file,
                           test_dir,
                           test_file,
                           result_dir,
                           result_file,
                           batch_size=batch_size,
                           input_size=input_size)
net = models.ResNet50(dataset, class_num)
net.train(model_file,
          max_epoches=max_epochs,
          load_weight=True,
          should_train=False)
net.test()
Example #22
    def doImport(self, document):
        """Do import."""

        pluginnames = [p.name for p in plugins.importpluginregistry]
        plugin = plugins.importpluginregistry[pluginnames.index(
            self.params.plugin)]

        # if the plugin is a class, make an instance
        # the old API is for the plugin to be instances
        if isinstance(plugin, type):
            plugin = plugin()

        # strip out parameters for plugin itself
        p = self.params

        # stick back together the plugin parameter object
        plugparams = plugins.ImportPluginParams(p.filename, p.encoding,
                                                p.pluginpars)
        results = plugin.doImport(plugparams)

        # make link for file
        LF = None
        if p.linked:
            LF = linked.LinkedFilePlugin(p)

        customs = []

        # convert results to real datasets
        names = []
        for d in results:
            if isinstance(d, plugins.Dataset1D):
                ds = datasets.Dataset(data=d.data,
                                      serr=d.serr,
                                      perr=d.perr,
                                      nerr=d.nerr)
            elif isinstance(d, plugins.Dataset2D):
                ds = datasets.Dataset2D(data=d.data,
                                        xrange=d.rangex,
                                        yrange=d.rangey)
            elif isinstance(d, plugins.DatasetText):
                ds = datasets.DatasetText(data=d.data)
            elif isinstance(d, plugins.DatasetDateTime):
                ds = datasets.DatasetDateTime(data=d.data)
            elif isinstance(d, plugins.Constant):
                customs.append(['constant', d.name, d.val])
                continue
            elif isinstance(d, plugins.Function):
                customs.append(['function', d.name, d.val])
                continue
            else:
                raise RuntimeError("Invalid data set in plugin results")

            # set any linking
            if LF is not None:
                ds.linked = LF

            # construct name
            name = p.prefix + d.name + p.suffix

            # actually make dataset
            document.setData(name, ds)

            names.append(name)

        # add constants, functions to doc, if any
        self.addCustoms(document, customs)

        self.outdatasets = names
        self.outcustoms = list(customs)
Example #23
    def _import1dimage(self, hdu):
        """Import 1d image data from hdu."""
        return datasets.Dataset(data=hdu.data)
Example #24
          (dataset.name, output_filepath))
    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)


def read(dataset):
    test_fact_2_support = dict()

    input_filepath = os.path.join(dataset.home, FOLDER,
                                  TEST_FACT_2_SUPPORT_FILENAME)
    print(
        "Reading relation paths support for each test fact in dataset %s from location %s"
        % (dataset.name, input_filepath))

    with open(input_filepath, "r") as input_file:
        for line in input_file:
            line = html.unescape(line)
            head, rel, tail, support_str = line.strip().split(SEPARATOR)
            support = float(support_str)

            test_fact_2_support[SEPARATOR.join([head, rel, tail])] = support

    return test_fact_2_support


#save(datasets.Dataset(datasets.FB15K))
#save(datasets.Dataset(datasets.FB15K_237))
#save(datasets.Dataset(datasets.WN18))
save(datasets.Dataset(datasets.WN18RR))
#save(datasets.Dataset(datasets.YAGO3_10))
Example #25
import datasets as dt
import classifiers as clf
import preprocess as pr
import numpy as np
import pandas as pd

if __name__ == '__main__':
    lstm_cv = False
    np.random.seed(123)

    # Load in data objects
    blur = dt.Dataset('data/Oasis_lyrics.pickle', 'Oasis')
    oasis = dt.Dataset('data/Blur_lyrics.pickle', 'Blur', True)
    analysis = pr.Analyser(blur, oasis, 0)
    analysis.get_summaries()
    analysis.train_test()
    analysis.get_tfidf()

    # # Determine optimal number of trees
    # print('Random Forest:')
    # tree_count = clf.test_random_forest(analysis, 50, 10, True)
    # print('Optimal Tree Count: {}.'.format(tree_count))

    results = []
    for i in np.round(np.linspace(0, 0.9, 10), 1).tolist():
        print('Noise Amount: {}'.format(i))
        # Preprocess
        analysis = pr.Analyser(blur, oasis, i)
        # analysis.get_summaries()
        analysis.train_test()
        analysis.get_tfidf()
Example #26
    train_fact_to_two_step_paths = _read_two_step_paths_from_file(input_filepath)

    input_filepath = os.path.join(dataset.home, FOLDER, TRAIN_FACTS_WITH_THREE_STEP_GRAPH_PATHS_FILENAME)
    print("Reading three-step graph paths for train facts of dataset %s from location %s" % (dataset.name, input_filepath))
    train_fact_to_three_step_paths = _read_three_step_paths_from_file(input_filepath)

    return train_fact_to_one_step_paths, train_fact_to_two_step_paths, train_fact_to_three_step_paths


def read_test(dataset):

    input_filepath = os.path.join(dataset.home, FOLDER, TEST_FACTS_WITH_ONE_STEP_GRAPH_PATHS_FILENAME)
    print("Reading one-step graph paths for test facts of dataset %s from location %s" % (dataset.name, input_filepath))
    test_fact_to_one_step_paths = _read_one_step_paths_from_file(input_filepath)

    input_filepath = os.path.join(dataset.home, FOLDER, TEST_FACTS_WITH_TWO_STEP_GRAPH_PATHS_FILENAME)
    print("Reading two-step graph paths for test facts of dataset %s from location %s" % (dataset.name, input_filepath))
    test_fact_to_two_step_paths = _read_two_step_paths_from_file(input_filepath)

    input_filepath = os.path.join(dataset.home, FOLDER, TEST_FACTS_WITH_THREE_STEP_GRAPH_PATHS_FILENAME)
    print("Reading three-step graph paths for test facts of dataset %s from location %s" % (dataset.name, input_filepath))
    test_fact_to_three_step_paths = _read_three_step_paths_from_file(input_filepath)

    return test_fact_to_one_step_paths, test_fact_to_two_step_paths, test_fact_to_three_step_paths

compute_and_progressively_save(datasets.Dataset(datasets.FB15K))
#compute_and_progressively_save(datasets.Dataset(datasets.FB15K_237))
#compute_and_progressively_save(datasets.Dataset(datasets.WN18))
#compute_and_progressively_save(datasets.Dataset(datasets.WN18RR))
#compute_and_progressively_save(datasets.Dataset(datasets.YAGO3_10))
Example #27
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 24 19:08:25 2019

@author: jonathan
"""
import numpy as np
import pandas as pd
import math
import sys
sys.path.insert(0, '../../')
import os
import datasets

temp = datasets.Dataset()
def build_item_pearson_sim(dest_filename, dataset=temp, source_filename='not_entered'):

    #if a Dataset object is passed as a parameter
    if dataset is not temp and source_filename == 'not_entered':
        source_filename = dataset.item_utility_source
        if dataset.item_utility_df is None:
            dataset.build_item_utility_df()
    #if utility matrix not built yet
    if dataset.item_utility_df is None and source_filename == 'not_entered':
        print("You must provide either an object containing a source utility matrix, or the location of the source itself")
    else:
        utility_np = dataset.item_utility_df.to_numpy()
        similarity_np = np.zeros((len(utility_np[0]), len(utility_np[0])), dtype=float) 
        
        #PEARSON CORRELATION FUNCTION
        def pearson_corr(col1, col2):
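
The snippet above is cut off right at the nested pearson_corr(col1, col2) helper. A rough, self-contained sketch of what an item-to-item Pearson correlation over utility-matrix columns might look like (assuming zero entries mark unrated items; that is an assumption made for illustration, not something the truncated code shows):

import numpy as np

def pearson_corr(col1, col2):
    # assumption: zeros mark items a user has not rated
    mask = (col1 != 0) & (col2 != 0)
    if mask.sum() < 2:
        return 0.0
    r1, r2 = col1[mask], col2[mask]
    d1, d2 = r1 - r1.mean(), r2 - r2.mean()
    denom = np.sqrt((d1 ** 2).sum()) * np.sqrt((d2 ** 2).sum())
    return float((d1 * d2).sum() / denom) if denom != 0 else 0.0

# toy utility matrix: rows are users, columns are items
utility_np = np.array([[5, 4, 0],
                       [3, 2, 1],
                       [4, 5, 2]], dtype=float)
print(pearson_corr(utility_np[:, 0], utility_np[:, 1]))
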
Example #28
def main(cfg):
    if cfg.training.resume is not None:
        log_dir = cfg.training.log_dir
        checkpoint_dir = os.path.dirname(cfg.training.resume)
    else:
        timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S.%f')
        log_dir = os.path.join(cfg.training.logs_dir, '{}_{}'.format(timestamp, cfg.training.experiment_name))
        checkpoint_dir = os.path.join(cfg.training.checkpoints_dir, '{}_{}'.format(timestamp, cfg.training.experiment_name))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        print('log_dir: {}'.format(log_dir))
        print('checkpoint_dir: {}'.format(checkpoint_dir))

    single_model = models.DRNSeg(cfg.arch, cfg.data.classes, None, pretrained=True)
    model = torch.nn.DataParallel(single_model).cuda()
    cudnn.benchmark = True
    criterion = nn.NLLLoss().cuda()
    optimizer = torch.optim.SGD(single_model.optim_parameters(),
                                cfg.optimizer.lr,
                                momentum=cfg.optimizer.momentum,
                                weight_decay=cfg.optimizer.weight_decay)
    start_epoch = 0
    if cfg.training.resume is not None:
        if os.path.isfile(cfg.training.resume):
            print("=> loading checkpoint '{}'".format(cfg.training.resume))
            checkpoint = torch.load(cfg.training.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(cfg.training.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(cfg.training.resume))

    crop_transform = transforms.CropTransform(shape=(640, 480))
    zoom_generator = transforms.RandomIntGenerator(480, 540)
    zoom_bilinear_transform = transforms.ZoomTransform(interpolation="bilinear", generator=zoom_generator)
    zoom_nearest_transform = transforms.ZoomTransform(interpolation="nearest", generator=zoom_generator)
    rotate_freq_generator = transforms.RandomFloatGenerator()
    rotate_angle_generator = transforms.RandomFloatGenerator()
    rotate_bilinear_transform = transforms.FrequencyTransform(
        freq=0.5,
        transform=transforms.RotateTransform(interpolation="bilinear", generator=rotate_angle_generator),
        generator=rotate_freq_generator
    )
    rotate_nearest_transform = transforms.FrequencyTransform(
        freq=0.5,
        transform=transforms.RotateTransform(interpolation="nearest", generator=rotate_angle_generator),
        generator=rotate_freq_generator
    )
    brightness_generator = transforms.RandomFloatGenerator()
    gamma_transform = transforms.BrightnessTransform(0.5, 1.5, brightness_generator)
    train_image_transforms = (zoom_bilinear_transform, rotate_bilinear_transform, crop_transform, gamma_transform, transforms.ToTensorTransform(torch.FloatTensor))
    label_transforms = (zoom_nearest_transform, rotate_nearest_transform, crop_transform, transforms.ToTensorTransform(torch.LongTensor))

    train_transforms = transforms.ParallelTransform([train_image_transforms, label_transforms])
    val_transforms = transforms.Compose([transforms.ToTensor()])

    if cfg.data.train_all:
        train_dataset = datasets.Dataset(cfg.data.root, cfg.data.ann_file, 'train', train_transforms)
    else:
        train_dataset = datasets.Dataset(cfg.data.root, 'train_' + cfg.data.ann_file, 'train', train_transforms)
    val_dataset = datasets.Dataset(
        cfg.data.root, 'val_' + cfg.data.ann_file, 'val', val_transforms)
    print(train_dataset.__len__())
    print(val_dataset.__len__())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=cfg.data.batch_size, shuffle=True, num_workers=cfg.data.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=cfg.data.batch_size, shuffle=True, num_workers=cfg.data.workers, pin_memory=True)

    train_summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'train'))
    val_summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'val'))
    visualization_summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'visualization'))
    for epoch in range(start_epoch, cfg.training.epochs):
        lr = adjust_learning_rate(optimizer, epoch)
        train_summary_writer.add_scalar('learning_rate', lr, epoch + 1)

        train_batch_time, train_data_time, train_loss = train(train_loader, model, criterion, optimizer, epoch)
        train_summary_writer.add_scalar('batch_time', train_batch_time, epoch + 1)
        train_summary_writer.add_scalar('data_time', train_data_time, epoch + 1)
        train_summary_writer.add_scalar('loss', train_loss, epoch + 1)

        val_batch_time, val_data_time, val_loss, val_accuracy, val_ious = validate(val_loader, model, criterion)
        val_summary_writer.add_scalar('batch_time', val_batch_time, epoch + 1)
        val_summary_writer.add_scalar('data_time', val_data_time, epoch + 1)
        val_summary_writer.add_scalar('loss', val_loss, epoch + 1)
        val_summary_writer.add_scalar('accuracy', val_accuracy, epoch + 1)
        for i, iou in enumerate(val_ious):
            if not np.isnan(iou) and iou != 0:
                val_summary_writer.add_scalar('iou_{}'.format(cfg.data.class_names[i]), iou, epoch + 1)

        first_input_batch, first_target_batch = next(iter(val_loader))
        rendered = visualize_batch(utils.visualize, model, first_input_batch, first_target_batch)
        visualization_summary_writer.add_image('segmentation', torch.from_numpy(rendered).permute(2, 0, 1), epoch + 1)

        if (epoch + 1) % cfg.training.checkpoint_epochs == 0:
            checkpoint_path = save_checkpoint(checkpoint_dir, {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer' : optimizer.state_dict(),
            }, epoch + 1)
            cfg.training.log_dir = log_dir
            cfg.training.resume = checkpoint_path
            with open(os.path.join(log_dir, 'config.yml'), 'w') as f:
                f.write(cfg.toYAML())
Example #29
video_rate = 3
conv = {'50Salads':25, "JIGSAWS":20, "MERL":5, "GTEA":25}[dataset]

# Which features for the given dataset
features = "SpatialCNN"
bg_class = 0 if dataset != "JIGSAWS" else None

if dataset == "50Salads":
    features = "SpatialCNN_" + granularity

# ------------------------------------------------------------------
# Evaluate using different filter lengths
if 1:
# for conv in [5, 10, 15, 20]:
    # Initialize dataset loader & metrics
    data = datasets.Dataset(dataset, base_dir)
    trial_metrics = metrics.ComputeMetrics(overlap=.1, bg_class=bg_class)

    # Load data for each split
    for split in data.splits:
        if sensor_type=="video":
            feature_type = "A" if model_type != "SVM" else "X"
        else:
            feature_type = "S"

        X_train, y_train, X_test, y_test = data.load_split(features, split=split, 
                                                            sample_rate=video_rate, 
                                                            feature_type=feature_type)

        if trial_metrics.n_classes is None:
            trial_metrics.set_classes(data.n_classes)
Example #30
    max_change_angle = (2 * 3.14159) / 500
    eye_sensor.position = (
        eye_sensor.position[0] + random.gauss(1, .75),
        eye_sensor.position[1] + random.gauss(1, .75),
    )
    eye_sensor.orientation += random.uniform(-max_change_angle,
                                             max_change_angle)
    eye_sensor.scale = 1


if __name__ == '__main__':
    eye = Eye()

    import datasets, random
    # data = datasets.Dataset('./datasets/small_items')
    data = datasets.Dataset('./datasets/textures')
    print("Num Images:", len(data))
    data.shuffle()
    for z in range(len(data)):
        eye.reset()
        data.next_image()
        img_path = data.current_image
        print("Loading image %s" % img_path)
        img = np.asarray(PIL.Image.open(img_path))
        eye.new_image(img)
        eye.scale = 1

        for i in range(10):
            sdr = eye.compute()
            eye.show_view()
            small_random_movement(eye)