Example #1
def create_folds(args):
    parser = argparse.ArgumentParser(prog='geoinf create_folds', description='creates a set of data partitions for evaluating with cross-fold validation')
    parser.add_argument('-f', '--force', action='store_true', help='overwrite the output fold directory if it already exists')
    parser.add_argument('dataset_dir', help='a directory containing a geoinference dataset')
    parser.add_argument('num_folds', help='the number of folds into which the dataset should be divided')
    parser.add_argument('fold_dir', help='a directory that will contain the information on the cross-validation folds (created if it does not exist)')

    args = parser.parse_args(args)

    # Refuse to clobber an existing output directory unless --force is given
    if os.path.exists(args.fold_dir) and not args.force:
        raise Exception('output fold_dir cannot already exist')
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception('The number of folds must be at least two')

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by user (one at a time) and then
    # stream each user's gold standard posts (if any) to the output streams
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i

        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".post-ids.txt"), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(os.path.join(args.fold_dir, fold_name + ".gold-locations.tsv"), 'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        gold_users_fh = gzip.open(os.path.join(args.fold_dir, fold_name + ".users.json.gz"), 'wt')
        output_posts_file_handles.append(gold_users_fh)

        cf_info_fh.write("%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n"
                         % (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')
    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])
        for post in user['posts']:
            if "geo" in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)
        # If this user had any gold posts, assign the user to a fold
        # round-robin so that gold users are spread evenly across folds
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write("%s\n" % user['user_id'])

            for post_id, loc in gold_post_id_to_loc.items():
                output_held_out_post_ids_file_handles[fold_to_use].write("%d\n" % post_id)
                output_gold_loc_file_handles[fold_to_use].write("%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))

            # Lazily mutate the existing user object and then dump that
            # object to the fold's users.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write("%s\n" % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug('Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                         % (num_users, num_gold_users, num_gold_posts, num_posts,
                            float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total' % (num_gold_users, num_users))
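A minimal usage sketch, assuming this function is dispatched from a `geoinf` command-line front end so that `args` carries what would otherwise be `sys.argv[1:]`; the paths and fold count below are hypothetical:

# Hypothetical invocation: partition the dataset in ./twitter_ds into
# five cross-validation folds written under ./folds.
create_folds(['./twitter_ds', '5', './folds'])
# Re-running over an existing ./folds requires --force:
create_folds(['--force', './twitter_ds', '5', './folds'])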
Example #2
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f', '--force', action='store_true', help='overwrite the output file if it already exists')
    parser.add_argument('-s', '--settings', help='a json file of settings to be passed to the model')
    parser.add_argument('method_name', help='the type of method to use for inference')
    parser.add_argument('model_dir', help='the directory of a model that was constructed using the train procedure')
    parser.add_argument('dataset', help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file', help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        with open(args.settings, 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception('output infer_file cannot already exist (use --force to overwrite)')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready
    outfh = open(args.infer_file, 'w')

    # write the settings to the first line
    outfh.write('%s\n' % json.dumps({'method': args.method_name,
                                     'settings': settings,
                                     'dataset': args.dataset,
                                     'by_user': by_user}))

    # locate all the posts
    logger.info('inferring locations for posts')
    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            # The method must return exactly one location (or None) per post
            assert len(locs) == len(posts)
            num_users_seen += 1

            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                if num_posts_seen % 10000 == 0:
                    logger.debug("Saw %d users, %d posts, %d of which were located" % (num_users_seen, num_posts_seen, num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug("Saw %d posts, %d of which were located" % (num_posts_seen, num_posts_located))

    outfh.close()
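A short usage sketch under the same assumption that `args` mirrors `sys.argv[1:]`; the method name, paths, and settings file here are hypothetical:

# Hypothetical: per-post inference with a previously trained model.
infer(['-s', 'settings.json', 'my_method', './model_dir', 'dataset.json', 'inferred.tsv'])
# The same arguments with by_user=True run grouped-by-user inference.
infer(['my_method', './model_dir', 'dataset.json', 'inferred_by_user.tsv'], by_user=True)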
Example #3
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='overwrite the output file if it already exists')
    parser.add_argument('-s',
                        '--settings',
                        help='a json file of settings to be passed to the model')
    parser.add_argument('method_name',
                        help='the type of method to use for inference')
    parser.add_argument('model_dir',
                        help='the directory of a model that was constructed '
                        'using the train procedure')
    parser.add_argument('dataset',
                        help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file',
                        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        with open(args.settings, 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception('output infer_file cannot already exist (use --force to overwrite)')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready
    outfh = open(args.infer_file, 'w')

    # write settings to the first line
    outfh.write('%s\n' % json.dumps({
        'method': args.method_name,
        'settings': settings,
        'dataset': args.dataset,
        'by_user': by_user
    }))

    # locate all the posts
    logger.info('inferring locations for posts')
    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            assert len(locs) == len(posts)
            num_users_seen += 1

            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                if num_posts_seen % 10000 == 0:
                    logger.debug(
                        "Saw %d users, %d posts, %d of which were located" %
                        (num_users_seen, num_posts_seen, num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug("Saw %d posts, %d of which were located" %
                             (num_posts_seen, num_posts_located))

    outfh.close()
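The calls above pin down the interface an inference method must expose: a zero-argument constructor, a load_model(model_dir, settings) method returning a model, and on that model infer_post_location(post) plus infer_posts_locations_by_user(user_id, posts). A minimal no-op sketch of that contract, with the class names hypothetical and the return shapes inferred from the call sites:

class NullMethod(object):
    """Hypothetical method illustrating the interface infer() relies on."""

    def load_model(self, model_dir, settings):
        return NullModel()


class NullModel(object):
    def infer_post_location(self, post):
        # Return a (lat, lon) pair, or None if the post cannot be located.
        return None

    def infer_posts_locations_by_user(self, user_id, posts):
        # Must return exactly one location (or None) per post, in order,
        # since infer() asserts len(locs) == len(posts).
        return [None] * len(posts)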
Example #4
def create_folds(args):
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description='creates a set of data partitions for evaluating with '
        'cross-fold validation')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='overwrite the output fold directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'num_folds',
        help='the number of folds into which the dataset should be divided')
    parser.add_argument(
        'fold_dir',
        help='a directory that will contain the information on the '
        'cross-validation folds (created if it does not exist)')

    args = parser.parse_args(args)

    # Refuse to clobber an existing output directory unless --force is given
    if os.path.exists(args.fold_dir) and not args.force:
        raise Exception('output fold_dir cannot already exist')
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception('The number of folds must be at least two')

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by user (one at a time) and then
    # stream each user's gold standard posts (if any) to the output streams
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i
        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".post-ids.txt"), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(
            os.path.join(args.fold_dir, fold_name + ".gold-locations.tsv"),
            'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        gold_users_fh = gzip.open(
            os.path.join(args.fold_dir, fold_name + ".users.json.gz"), 'wt')
        output_posts_file_handles.append(gold_users_fh)
        cf_info_fh.write(
            "%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n" %
            (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')
    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])
        for post in user['posts']:
            if "geo" in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)
        # If this user had any gold posts, assign the user to a fold
        # round-robin so that gold users are spread evenly across folds
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write(
                "%s\n" % user['user_id'])

            for post_id, loc in gold_post_id_to_loc.items():
                output_held_out_post_ids_file_handles[fold_to_use].write(
                    "%d\n" % post_id)
                output_gold_loc_file_handles[fold_to_use].write(
                    "%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))
            # Lazily mutate the existing user object and then dump that
            # object to the fold's users.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write(
                "%s\n" % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug(
                'Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                % (num_users, num_gold_users, num_gold_posts, num_posts,
                   float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total' %
                 (num_gold_users, num_users))
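For downstream tools, the folds.info.tsv manifest written above is the entry point to a fold directory. A small reader sketch, assuming the module-level os import used throughout these examples (the column layout comes from the write above; the function itself is hypothetical):

def read_fold_manifest(fold_dir):
    # Each manifest row: fold name, post-ids file, user-ids file, users.json.gz
    folds = []
    with open(os.path.join(fold_dir, "folds.info.tsv")) as fh:
        for line in fh:
            fold_name, post_ids, user_ids, users_gz = line.rstrip('\n').split('\t')
            folds.append({'name': fold_name,
                          'post_ids': os.path.join(fold_dir, post_ids),
                          'user_ids': os.path.join(fold_dir, user_ids),
                          'users': os.path.join(fold_dir, users_gz)})
    return folds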
Example #5
def create_folds(args):
    parser = argparse.ArgumentParser(prog='geoinf create_folds', description='creates a set of data partitions for evaluating with cross-fold validation')
    parser.add_argument('-f', '--force', action='store_true', help='overwrite the output fold directory if it already exists')
    parser.add_argument('dataset_dir', help='a directory containing a geoinference dataset')
    parser.add_argument('fold_dir', help='a directory that will contain the information on the cross-validation folds (created if it does not exist)')
    parser.add_argument('test_case', help='the type of test to run: rural vs. urban ("county"), gender ("gender"), or random (any other string)')

    args = parser.parse_args(args)

    # Refuse to clobber an existing output directory unless --force is given
    if os.path.exists(args.fold_dir) and not args.force:
        raise Exception('output fold_dir cannot already exist')
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    ground_truth_file = "filtered_user_groundtruth_locfield.tsv"
    ground_truth_locs = "users.home-locations.loc-field.tsv.gz"

    # Decide on the number of folds from the requested test case
    if args.test_case == "gender":
        num_folds = NUM_MALE_FOLDS + NUM_FEMALE_FOLDS + NUM_UNKNOWN_FOLDS
    elif args.test_case == "county":
        # six urban/rural levels, each split into NUM_URBAN_FOLDS folds
        num_folds = NUM_URBAN_FOLDS * 6
    else:
        num_folds = NUM_RANDOM_FOLDS

    if num_folds <= 1:
        raise Exception('The number of folds must be at least two')

    idToGender = {}
    idToUrbanLevel = {}
    with open(os.path.join(args.dataset_dir, ground_truth_file), "r") as gt_file:
        next(gt_file)  # skip the header row
        for line in gt_file:
            try:
                uid, gender, urbanLevel = line.split('\t')
                idToGender[uid] = gender
                if urbanLevel != "\r\n":
                    idToUrbanLevel[uid] = int(urbanLevel)
            except ValueError:
                print(line)

    idToLoc = {}
    with gzip.open(os.path.join(args.dataset_dir, ground_truth_locs), "rt") as gt_file:
        next(gt_file)  # skip the header row
        for line in gt_file:
            uid, lat, lon = line.rstrip('\n').split('\t')
            idToLoc[uid] = (lat, lon)

    # Initialize the output streams: one user-ids file per fold, plus a
    # manifest listing the per-fold files
    output_user_ids_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i

        # All the IDs of the users with gold locations are written here
        fold_users_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_user_ids_file_handles.append(fold_users_ids_fh)

        cf_info_fh.write("%s\t%s.user-ids.txt\n" % (fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    if args.test_case == "gender":
        female_users = []
        male_users = []
        unknown_users = []

        for user in ds.user_iter():                             
            user_id = user['user_id']
            usergender = idToGender.get(str(user_id), -1)
            # If this user had any gold locations, add them as folds
            if usergender != -1:
                #determine fold to use
                if userGender == "m":
                    male_users.append(user_id)
                elif userGender == "f":
                    female_users.append(user_id)
                else:
                    unknown_users.append(user_id)
        currentFold = 0

        male_folds = generate_folds(male_users, NUM_MALE_FOLDS)
        for fold in male_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        
        female_folds = generate_folds(female_users, NUM_FEMALE_FOLDS)
        for fold in female_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        
        unknown_folds = generate_folds(unknown_users, NUM_UNKNOWN_FOLDS)
        for fold in unknown_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
    elif args.test_case == "county":
        usersAtLevel = []
        for i in range(1, 7):
            usersAtLevel[i] = []
        for user in ds.user_iter():
            user_id = user['user_id']
            urbanRuralLevel = idToUrbanLevel.get(str(user,id) -1)
            # If this user had any gold locations, add them as folds
            if urbanRuralLevel != -1:
                usersAtLevel[urbanRuralLevel].append(user_id)
        currentFoldIndex = 0
        for i in range(1,7):        
            currentFolds = generate_folds(usersAtLevel[i], NUM_URBAN_FOLDS)
            for fold in currentFolds:
                write_fold(fold, currentFoldIndex, idToLoc, output_users_ids_file_handles)
                currentFoldIndex += 1
    else:
        # Collect all users with ground-truth data and split them into
        # random folds
        gold_users = []
        for user in ds.user_iter():
            user_id = user['user_id']
            gender = idToGender.get(str(user_id), -1)
            if gender != -1:
                gold_users.append(user_id)

        currentFoldIndex = 0
        currentFolds = generate_folds(gold_users, NUM_RANDOM_FOLDS)
        for fold in currentFolds:
            write_fold(fold, currentFoldIndex, idToLoc, output_user_ids_file_handles)
            currentFoldIndex += 1
            
    for fh in output_user_ids_file_handles:
        fh.close()
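This variant leans on generate_folds, write_fold, and the NUM_*_FOLDS constants, none of which are shown here. A minimal sketch of what the two helpers might look like, inferred purely from the call sites (the signatures and round-robin behavior are assumptions):

def generate_folds(user_ids, num_folds):
    # Deal the user IDs into num_folds buckets, round-robin.
    folds = [[] for _ in range(num_folds)]
    for i, user_id in enumerate(user_ids):
        folds[i % num_folds].append(user_id)
    return folds


def write_fold(fold, fold_index, id_to_loc, file_handles):
    # Write one user ID per line to the fold's user-ids file; id_to_loc is
    # available for also emitting gold locations, but is unused in this sketch.
    for user_id in fold:
        file_handles[fold_index].write("%s\n" % user_id)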