def pre_processing(self):
        print 'preprocessing...'

        # uniformization
        raw = numpy.concatenate([
            self.train_set.sparse_matrix.data,
            self.valid_set.sparse_matrix.data, self.test_set.sparse_matrix.data
        ])

        len_train = len(self.train_set.sparse_matrix.data)
        len_valid = len(self.valid_set.sparse_matrix.data)
        len_test = len(self.test_set.sparse_matrix.data)

        out = data_processing.uniformization(raw, False)

        # write the uniformized values back into each split's sparse matrix
        self.train_set.sparse_matrix.data = out[0:len_train]
        self.valid_set.sparse_matrix.data = out[len_train:(len_train +
                                                           len_valid)]
        self.test_set.sparse_matrix.data = out[-len_test:]

        self.full_train = scipy.sparse.vstack(
            [self.train_set.sparse_matrix, self.valid_set.sparse_matrix],
            'csr')

        # shuffling train set
        self.full_train = self.full_train[
            numpy.random.permutation(self.full_train.shape[0]), :]

        # feature subset selection
        self.full_train = self.full_train[:, self.features_selected]
        self.valid_set = self.valid_set.sparse_matrix[:,
                                                      self.features_selected]
        self.test_set = self.test_set.sparse_matrix[:, self.features_selected]

        # whitening
        std = numpy.std(self.full_train.data)
        self.full_train /= std
        self.valid_set /= std
        self.test_set /= std

        # finally
        self.trainset = SparseDataset(
            from_scipy_sparse_dataset=self.full_train)
        self.validset = SparseDataset(from_scipy_sparse_dataset=self.valid_set)
        self.testset = SparseDataset(from_scipy_sparse_dataset=self.test_set)
    def torch_loader(dataset,
                     data_path,
                     batch_size,
                     shuffle=True,
                     cuda_device=None,
                     num_workers=1):
        # load_data_func is assumed to be defined at module scope; it returns
        # ((train_data, val_data), (train_labels, val_labels), label_names)
        (train_data, val_data), (train_labels, val_labels), label_names = \
            load_data_func(dataset, data_path)

        kwargs = {
            'num_workers': num_workers,
            'pin_memory': True
        } if cuda_device is not None else {}
        kwargs['drop_last'] = True

        if isinstance(train_data, numpy.ndarray):
            train_dataset = TensorDataset(torch.from_numpy(train_data),
                                          torch.from_numpy(train_labels))
            val_dataset = TensorDataset(torch.from_numpy(val_data),
                                        torch.from_numpy(val_labels))
        elif isinstance(train_data, scipy.sparse.csr_matrix):
            from sklearn.feature_extraction.text import TfidfTransformer
            tfidf_trans = TfidfTransformer(norm=None)
            tfidf_trans.fit(train_data)
            train_dataset = SparseDataset(train_data, tfidf_trans.idf_)
            val_dataset = SparseDataset(val_data, tfidf_trans.idf_)
        else:
            train_dataset = torchvision.datasets.ImageFolder(train_data)
            val_dataset = torchvision.datasets.ImageFolder(val_data)

        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  **kwargs)
        val_loader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                **kwargs)

        return train_loader, val_loader, label_names
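
A hypothetical usage sketch for torch_loader, not part of the original snippet: it assumes torch_loader and load_data_func are resolvable at module scope, that 'mnist' is a dataset name load_data_func understands, and that a CUDA device is available.

# build the loaders; passing cuda_device enables pin_memory and worker
# processes via the kwargs constructed inside torch_loader
train_loader, val_loader, label_names = torch_loader(
    'mnist', './data', batch_size=64, shuffle=True,
    cuda_device=0, num_workers=2)

for inputs, targets in train_loader:
    # pinned host memory allows an asynchronous copy to the GPU
    inputs = inputs.cuda(non_blocking=True)
    targets = targets.cuda(non_blocking=True)
    # ... forward pass / loss / backward pass / optimizer step
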
    def __init__(self):

        self.trainset_path = '/data/lisa/data/UTLC/sparse/terry_train.npy.gz'
        self.validset_path = '/data/lisa/data/UTLC/sparse/terry_valid.npy.gz'
        self.testset_path = '/data/lisa/data/UTLC/sparse/terry_test.npy.gz'
        self.use_features_path = '/data/lisa/data/UTLC/sparse/terry_testvalid_activefeat.npy'

        self.features_selected = numpy.load(self.use_features_path)
        # these are sets before preprocessing
        self.train_set = SparseDataset(load_path=self.trainset_path)
        self.valid_set = SparseDataset(load_path=self.validset_path)
        self.test_set = SparseDataset(load_path=self.testset_path)
        # these are sets after preprocessing
        self.trainset = None
        self.validset = None
        self.testset = None
        #fullset = scipy.sparse.vstack((scipy.sparse.vstack((self.train_set.data, self.valid_set.data)),
        #                              self.test_set.data))
        #self.full_set = SparseDataset(from_sparse_dataset=fullset)

        self.pre_processing()
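
The pre_processing method above relies on the fact that a CSR matrix exposes its nonzero values as a flat .data array, so the train/valid/test values can be concatenated, transformed jointly, and written back in place without touching the sparsity pattern. A minimal sketch of that pattern, using a rank-based stand-in for data_processing.uniformization (which is not shown in the source):

import numpy
import scipy.sparse

train = scipy.sparse.random(100, 50, density=0.1, format='csr')
valid = scipy.sparse.random(20, 50, density=0.1, format='csr')

# concatenate the nonzero values of both splits and transform them together
raw = numpy.concatenate([train.data, valid.data])
len_train = len(train.data)

# stand-in "uniformization": map each value to its empirical quantile
out = numpy.argsort(numpy.argsort(raw)).astype(numpy.float64) / (len(raw) - 1)

# write the transformed values back; only the stored values change
train.data = out[:len_train]
valid.data = out[len_train:]
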
Example #4
def create_folds(args):
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description=
        'creates a set of data partitions for evaluating with cross-fold validation'
    )
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='overwrite the output directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'num_folds',
        help='the number of folds into which the dataset should be divided')
    parser.add_argument(
        'fold_dir',
        help=
        'a (non-existent) directory that will contain the information on the cross-validation folds'
    )

    args = parser.parse_args(args)

    # Confirm that the output directory doesn't exist
    if not os.path.exists(args.fold_dir):  #and not args.force:
        #raise Exception, 'output fold_dir cannot already exist'
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception, 'The number of folds must be at least two'

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by users (one at a time) and then
    # stream the user's gold standard posts (if any) to the output streams
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i
        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".post-ids.txt"), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(
            os.path.join(args.fold_dir, fold_name + ".gold-locations.tsv"),
            'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        gold_users_fh = gzip.open(
            os.path.join(args.fold_dir, fold_name + ".users.json.gz"), 'w')
        output_posts_file_handles.append(gold_users_fh)
        cf_info_fh.write(
            "%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n" %
            (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')
    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])
        for post in user['posts']:
            if "geo" in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)
        # If this user had any gold locations, add them as folds
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write(
                "%s\n" % user['user_id'])

            for post_id, loc in gold_post_id_to_loc.iteritems():
                output_held_out_post_ids_file_handles[fold_to_use].write(
                    "%d\n" % post_id)
                output_gold_loc_file_handles[fold_to_use].write(
                    "%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))
            # Lazily mutate the existing user object and the dump
            # that object to the fold's user.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write(
                "%s\n" % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug(
                'Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                % (num_users, num_gold_users, num_gold_posts, num_posts,
                   float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total' %
                 (num_gold_users, num_users))
Example #5
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='overwrite the output file if it already exists')
    parser.add_argument(
        '-s',
        '--settings',
        help='a json file of settings to be passed to the model',
        nargs=1)
    parser.add_argument('method_name',
                        help='the type of method to use for inference')
    parser.add_argument(
        'model_dir',
        help=
        'the directory of a model that was constructed using the train procedure'
    )
    parser.add_argument(
        'dataset',
        help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file',
                        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        # nargs=1 makes args.settings a one-element list
        with open(args.settings[0], 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception, 'output infer_file cannot exist'

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready
    outfh = open(args.infer_file, 'w')

    # write settings to the first line
    outfh.write('%s\n' % json.dumps({
        'method': args.method_name,
        'settings': settings,
        'dataset': args.dataset,
        'by_user': by_user
    }))

    # locate all the posts
    logger.info('inferring locations for posts')
    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            assert len(locs) == len(posts)
            num_users_seen += 1

            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                if num_posts_seen % 10000 == 0:
                    logger.debug(
                        "Saw %d users, %d posts, %d of which were located" %
                        (num_users_seen, num_posts_seen, num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug("Saw %d posts, %d of which were located" %
                             (num_posts_seen, num_posts_located))

    outfh.close()
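
The infer_file written above therefore contains a single JSON header line followed by tab-separated post_id, latitude, longitude rows. A small sketch of reading it back (the file name is hypothetical):

import json

with open('inferences.tsv') as fh:
    header = json.loads(fh.readline())  # method, settings, dataset, by_user
    locations = {}
    for line in fh:
        post_id, lat, lon = line.rstrip('\n').split('\t')
        locations[post_id] = (float(lat), float(lon))
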
Example #6
def train(args):
    parser = argparse.ArgumentParser(
        prog='geoinf train',
        description='train a geoinference method on a specific dataset')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='overwrite the output model directory if it already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument(
        'method_settings',
        help='a json file containing method-specific configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'model_dir',
        help='a (non-existing) directory where the trained model will be stored'
    )
    parser.add_argument('--location-source',
                        nargs=1,
                        help='specifies the source of ground-truth locations')

    args = parser.parse_args(args)

    # confirm that the output directory doesn't exist
    if os.path.exists(args.model_dir) and not args.force:
        raise Exception, 'output model_dir cannot exist'

    # load the method
    method = get_method_by_name(args.method_name)

    # load the data
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    location_source = args.location_source
    if location_source:
        location_source = location_source[0]
        logger.debug('Using %s as the source of ground truth location' %
                     location_source)
        settings['location_source'] = location_source

    # load the dataset
    ds = None  #Dataset(args.dataset_dir)
    if location_source is not None:
        ds = SparseDataset(args.dataset_dir,
                           default_location_source=location_source)
    else:
        ds = SparseDataset(args.dataset_dir)

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()

    start_time = time.time()
    method_inst.train_model(settings, ds, args.model_dir)
    end_time = time.time()
    logger.info('Trained model %s on dataset %s in %f seconds' %
                (args.method_name, args.dataset_dir, end_time - start_time))

    # drop some metadata into the run method
    # run the method
    # gi_inst = method()
    # gi_inst.train(settings,ds,args.model_dir)

    return
Example #7
def cross_validate(args):
    parser = argparse.ArgumentParser(
        prog='geoinf cross_validate',
        description='evaluate a geoinference method using cross-validation')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='overwrite the output results directory if it already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument(
        'method_settings',
        help='a json file containing method-specific configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'fold_dir',
        help=
        'the name of the directory containing information on the cross-validation folds'
    )
    parser.add_argument(
        'results_dir',
        help=
        'a (non-existent) directory where the evaluation results will be stored'
    )
    parser.add_argument('--fold',
                        nargs=1,
                        help='runs just that fold from the cross-fold dataset')
    parser.add_argument('--location-source',
                        nargs=1,
                        help='specifies the source of ground-truth locations')

    args = parser.parse_args(args)

    # confirm that the output directory doesn't exist
    #	if os.path.exists(args.results_dir) and not args.force:
    #		raise Exception, 'output results_dir cannot already exist'

    if not os.path.exists(args.results_dir):  #and not args.force:
        #raise Exception, 'output fold_dir cannot already exist'
        os.mkdir(args.results_dir)

    # load the method
    method = get_method_by_name(args.method_name)

    # load the data
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    specific_fold_to_run = args.fold
    if specific_fold_to_run:
        specific_fold_to_run = specific_fold_to_run[0]
    location_source = args.location_source
    if location_source:
        logger.debug('Using %s as the source of ground truth location' %
                     location_source)
        location_source = location_source[0]
        settings['location_source'] = location_source

    print "running fold %s" % (specific_fold_to_run)

    # Load the folds to be used in the dataset
    cfv_fh = open(os.path.join(args.fold_dir, 'folds.info.tsv'))

    # Each line contains two files specifying the post IDs to be held out
    # from the full dataset (for that fold) and the corresponding file in
    # the fold_dir containing the testing data for that fold
    for line in cfv_fh:
        line = line.strip()
        fold_name, testing_post_ids_file, testing_user_ids_file, testing_users_file = line.split(
            "\t")

        # Skip this fold if the user has told us to run only one fold by name
        if specific_fold_to_run is not None and fold_name != specific_fold_to_run:
            continue

        logger.debug('starting processing of fold %s' % fold_name)

        # Read in the post IDs to exclude
        testing_post_ids = set()
        tpi_fh = open(
            os.path.join(args.fold_dir,
                         testing_post_ids_file.replace('held-out-', '')))
        for id_str in tpi_fh:
            testing_post_ids.add(id_str.strip())
        tpi_fh.close()

        # Read in the user IDs to exclude
        testing_user_ids = set()
        tpi_fh = open(
            os.path.join(args.fold_dir,
                         testing_user_ids_file.replace('held-out-', '')))
        for id_str in tpi_fh:
            testing_user_ids.add(id_str.strip())
        tpi_fh.close()

        logger.debug('Loaded %d users whose location data will be held out' %
                     len(testing_user_ids))

        # load the dataset
        training_data = None
        if location_source is not None:
            training_data = SparseDataset(
                args.dataset_dir,
                excluded_users=testing_user_ids,
                default_location_source=location_source)
        else:
            training_data = SparseDataset(args.dataset_dir,
                                          excluded_users=testing_user_ids)

        # load the method
        method = get_method_by_name(args.method_name)
        method_inst = method()

        # Create the temporary directory that will hold the model for
        # this fold
        model_dir = os.path.join(args.results_dir, fold_name)
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Train on the dataset, holding out the testing post IDs
        model = method_inst.train_model(settings, training_data, None)

        logger.debug('Finished training during fold %s; beginning testing' %
                     fold_name)

        logger.debug("Reading testing data from %s" %
                     (os.path.join(args.fold_dir, testing_users_file)))

        testing_data = Dataset(args.fold_dir,
                               users_file=os.path.join(args.fold_dir,
                                                       testing_users_file))

        logger.debug(
            "Writing results to %s" %
            (os.path.join(args.results_dir, fold_name + ".results.tsv.gz")))

        out_fh = gzip.open(
            os.path.join(args.results_dir, fold_name + ".results.tsv.gz"), 'w')

        num_tested_users = 0
        num_tested_posts = 0
        num_located_posts = 0
        seen_ids = set()
        for user in testing_data.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            if len(locs) != len(posts):
                print "#WUT %d != %d" % (len(locs), len(posts))

            num_tested_posts += len(posts)
            for loc, post in zip(locs, posts):
                pid = post['id']
                if pid in seen_ids:
                    continue
                seen_ids.add(pid)
                if loc is not None:
                    out_fh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                    num_located_posts += 1
            num_tested_users += 1
            if num_tested_users % 10000 == 0:
                logger.debug(
                    'During testing of fold %s, processed %d users, %d posts, %d located'
                    % (fold_name, num_tested_users, num_tested_posts,
                       num_located_posts))

        out_fh.close()
        logger.debug('Finished testing of fold %s' % fold_name)
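
Each fold's results.tsv.gz can then be scored against the corresponding .gold-locations.tsv written by create_folds, which stores post_id, user_id, lat and lon per line. A hypothetical Python 3 scoring sketch; the haversine helper is an assumption, not part of the source:

import gzip
import math


def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance in kilometres
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2 +
         math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * math.asin(math.sqrt(a))


# gold locations for one fold: post_id -> (lat, lon)
gold = {}
with open('fold_0.gold-locations.tsv') as fh:
    for line in fh:
        post_id, user_id, lat, lon = line.rstrip('\n').split('\t')
        gold[post_id] = (float(lat), float(lon))

# distance error for every inferred post that has a gold location
errors = []
with gzip.open('fold_0.results.tsv.gz', 'rt') as fh:
    for line in fh:
        post_id, lat, lon = line.rstrip('\n').split('\t')
        if post_id in gold:
            glat, glon = gold[post_id]
            errors.append(haversine_km(float(lat), float(lon), glat, glon))
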