def main():
    """Build baseline simulation submissions by resampling historical events.

    For each simulation period, events are sampled from the weekly extract
    files spanning the window of equal length immediately before the period
    (the "historical" data).  Files older than that window are used only to
    collect (nodeUserID, informationID) pairs already seen, so the sampler
    can distinguish new users from returning ones.

    Relies on module-level `glob`, `re`, `pd`, `time`, `ss` (socialsim) and
    `sample_from_historical_data` -- assumed imported/defined elsewhere in
    this module.
    """
    path = './'
    n_runs = 1  # number of independent baseline samples per period
    simulation_periods = [['2019-02-01', '2019-02-15'],
                          ['2019-02-08', '2019-02-22'],
                          ['2019-02-15', '2019-03-01'],
                          ['2019-02-22', '2019-03-01']]

    # files are in weekly subsets, e.g. venezuela_v2_extracted_twitter_2019-02-01_2019-02-08.json
    all_files = glob.glob(path + 'venezuela_v2_extracted*.json')

    # extract dates and platforms from file names
    # (raw strings: '\d' in a plain literal is an invalid escape sequence)
    date_re = r'(20\d\d-\d\d-\d\d)_(20\d\d-\d\d-\d\d)'
    dates = [re.search(date_re, fn) for fn in all_files]
    start_dates = [d.group(1) for d in dates]
    end_dates = [d.group(2) for d in dates]
    platforms = [re.search(r'twitter|youtube', fn).group(0) for fn in all_files]

    # create data frame with files, dates, and platforms
    fn_df = pd.DataFrame({
        'fn': all_files,
        'start': start_dates,
        'end': end_dates,
        'platform': platforms
    })

    fn_df['start'] = pd.to_datetime(fn_df['start'])
    fn_df['end'] = pd.to_datetime(fn_df['end'])

    fn_df = fn_df.sort_values('start')

    # loop over simulation periods
    for sim_period in simulation_periods:
        # start and end time of the simulation
        start = pd.to_datetime(sim_period[0])
        end = pd.to_datetime(sim_period[1])

        period_length = end - start

        # select files to sample from based on dates: the window of the same
        # length just before the simulation period (hist_files), and
        # everything older than that (previous_files, user tracking only)
        hist_files = fn_df[(fn_df['start'] < start)
                           & (fn_df['start'] >= start - period_length)]
        previous_files = fn_df[fn_df['start'] < start - period_length]

        print(start, end)
        print('Historical Data to Sample From')
        print(hist_files)
        print('Prior Data to Track Users From')
        print(previous_files)

        previous_history_data = list(previous_files['fn'].values)
        history_data = list(hist_files['fn'].values)

        # load historical events to sample from, ordered by event time
        hist = pd.concat([
            ss.load_data(data, ignore_first_line=False, verbose=False)
            for data in history_data
        ])
        hist = hist.sort_values('nodeTime')

        # distinct (user, narrative) pairs seen before the historical window
        previous_hist = pd.concat([
            ss.load_data(data, ignore_first_line=False, verbose=False)
            for data in previous_history_data
        ])
        previous_hist = previous_hist[['nodeUserID',
                                       'informationID']].drop_duplicates()

        # multiple runs of the baseline sampling
        for i in range(n_runs):
            dfs = []
            # for each platform and information ID
            for (plat,
                 info), grp in hist.groupby(['platform', 'informationID']):

                print(plat, info)

                starting = time.time()
                sampled_df = sample_from_historical_data(
                    grp,
                    info,
                    plat,
                    hist['nodeTime'].min(),
                    hist['nodeTime'].max(),
                    start,
                    end,
                    previous_hist=previous_hist,
                    new_users=True)
                ending = time.time()
                elapsed = (ending - starting) / 60.0
                print(f'Time elapsed: {elapsed} minutes')

                dfs.append(sampled_df)

            baseline = pd.concat(dfs).reset_index(drop=True)

            # save generated baseline, one file per run and period
            start_str = start.strftime('%Y-%m-%d')
            end_str = end.strftime('%Y-%m-%d')
            baseline.to_json(f'baseline_{start_str}_{end_str}_{i}.json',
                             orient='records',
                             lines=True)
# ---------- Example 2 ----------
import socialsim as ss

# Parse the evaluation configuration.
config = ss.load_config('data/cp4_configuration.json')

# Metadata needed by the measurements.
metadata = ss.MetaData()

# Build the evaluation runner around the specified ground truth; the first
# line of the ground-truth file is a metadata header and is skipped.
ground_truth_filepath = 'data/test_dataset.json'
ground_truth = ss.load_data(ground_truth_filepath,
                            ignore_first_line=True,
                            verbose=False)
eval_runner = ss.EvaluationRunner(ground_truth, config, metadata=metadata)

# Score each submission file; each carries submission metadata on its
# first line (submission_meta=True).
submission_filepaths = ['data/test_dataset.json']
for simulation_filepath in submission_filepaths:
    # Run measurements and metrics on the simulation data.
    results, logs = eval_runner(simulation_filepath,
                                verbose=True,
                                submission_meta=True)
import socialsim as ss

# Load the example dataset and keep only the first 2000 twitter events.
dataset = ss.load_data('data/test_dataset.txt')
dataset = dataset[dataset['platform'] == 'twitter'].head(n=2000)

# Read the configuration and select the twitter social-structure task.
config = ss.load_config('cp1_configuration.json')
config = config['twitter']['social_structure']

# Build the measurement object for the twitter platform.
social_structure_measurements = ss.SocialStructureMeasurements(
    dataset, config, None, 'twitter')

# Run every measurement listed in the config.
results = social_structure_measurements.run(verbose=True)
# ---------- Example 4 ----------
def check_records(submission_filepath, nodelist, simulation_period):
    """Validate a challenge submission file and summarize any problems.

    Checks, in order: the file loads; platforms, informationIDs (only when
    `nodelist` is provided) and actionTypes match the valid options for the
    current challenge; required event columns contain no NaN values; the
    user-to-user network implied by parentID/nodeID is non-empty; and all
    nodeTime values fall inside the configured simulation window.

    NOTE(review): depends on module-level `ss`, `pd`, `VALID_OPTIONS`,
    `challenge` and `check_all_present` -- assumed defined elsewhere in
    this module.

    Returns 'success' when no issue is found, otherwise a string listing
    the collected ERRORS and WARNINGS.
    """
    errors, warnings = [], []
    try:
        # test that submission file can be loaded
        subm = ss.load_data(submission_filepath,
                            ignore_first_line=True,
                            verbose=False)
        loaded = True
    except Exception as e:
        errors.append('Submission could not be loaded: ' + str(e))
        loaded = False

    if loaded:
        # platform tests
        valid_items = VALID_OPTIONS[challenge]['platforms']
        subm_items = set(subm['platform'].unique())
        platform_errors, platform_warnings = check_all_present(
            valid_items, subm_items, 'platforms')
        errors.extend(platform_errors)
        warnings.extend(platform_warnings)

        if nodelist is not None:
            # informationID tests
            valid_items = VALID_OPTIONS[challenge]['informationID']
            subm_items = set(subm['informationID'].unique())
            informationID_errors, informationID_warnings = check_all_present(
                valid_items, subm_items, 'informationIDs')
            errors.extend(informationID_errors)
            warnings.extend(informationID_warnings)

        # test that there are no NaN items in required event details
        for c in [
                'informationID', 'nodeTime', 'nodeID', 'parentID', 'rootID',
                'platform', 'actionType', 'nodeUserID'
        ]:
            if len(subm[c]) != len(subm[c].dropna()):
                errors.append(f'{c} can not be NaN values.')
                print(c)
                print(subm[c].astype(str).unique())
        # check for empty user-user network
        parentID_nodeID_overlap = set(subm['parentID']).intersection(
            set(subm['nodeID']))
        if len(parentID_nodeID_overlap) == 0:
            warnings.append(
                'There is no overlap between nodeID values and parentID values -- the user-to-user network created from this submission will be empty.'
            )

        # check that nodeTimes fall within simulation window
        try:
            simulation_window = VALID_OPTIONS[challenge]['simulation_windows'][
                simulation_period]
            minday = f'2019-{simulation_window[0]}'
            # nodeTimes are compared as 'YYYY-MM-DD HH:MM:SS' strings, so the
            # window end must include seconds: with '23:59' alone, an event at
            # e.g. '... 23:59:30' sorts lexicographically after the bound and
            # would be wrongly flagged as outside the simulation period.
            maxday = f'2019-{simulation_window[1]} 23:59:59'
            maxday_str = maxday.split(' ')[0]
            subm['nodeTime'] = pd.to_datetime(subm['nodeTime']).astype(str)
            subm_minday, subm_maxday = subm['nodeTime'].min(
            ), subm['nodeTime'].max()
            # elif chain: only the first applicable issue is reported per run
            if subm_maxday <= minday:
                errors.append(
                    f'There is no data within the simulation period, all nodeTime values occur before the simulation period ({minday} - {maxday_str}).\n\tSubmission nodeTime values -- Min: {subm_minday} Max: {subm_maxday}'
                )
            elif subm_minday > maxday:
                errors.append(
                    f'There is no data within the simulation period, all nodeTime values occur after the simulation period ({minday} - {maxday_str}).\n\tSubmission nodeTime values -- Min: {subm_minday} Max: {subm_maxday}'
                )
            elif subm_minday < minday:
                warnings.append(
                    f'Some events occur before the simulation period, earliest nodeTime value is {subm_minday}'
                )
            elif subm_maxday > maxday:
                warnings.append(
                    f'Some events occur after the simulation period, latest nodeTime value is {subm_maxday}'
                )
        except Exception as e:
            # best-effort check: malformed window config or nodeTime values
            # become a warning rather than a crash
            warnings.append(
                'Could not validate nodeTimes occur within the simulation period: '
                + str(e))

        # actionType tests
        valid_items = VALID_OPTIONS[challenge]['actiontypes']
        subm_items = set(subm['actionType'].unique())
        platform_errors, platform_warnings = check_all_present(
            valid_items, subm_items, 'actionType')
        errors.extend(platform_errors)
        warnings.extend(platform_warnings)

    result = ''

    if len(errors) > 0:
        result = result + 'ERRORS:\n\t'
        result = result + '\n\n\t'.join(errors) + '\n\n'
    if len(warnings) > 0:
        result = result + 'WARNINGS:\n\t'
        result = result + '\n\n\t'.join(warnings)

    if result == '': result = 'success'
    return result
import socialsim as ss

# Simulation output and ground truth (the same example file serves as both).
simulation = ss.load_data('data/test_dataset.txt')
ground_truth = ss.load_data('data/test_dataset.txt')

# Measurement configuration.
config = ss.load_config('data/cp2_configuration.json')

# Metadata, including the community definitions stored on disk.
metadata = ss.MetaData(community_directory='data/communities/')

# Build the task runner in test mode around the ground truth ...
task_runner = ss.TaskRunner(ground_truth, config, metadata=metadata, test=True)

# ... then run measurements and metrics on the simulation data.
results = task_runner(simulation, verbose=True)
# ---------- Example 6 ----------
from pprint import pprint
import socialsim as ss

# Simulation and ground truth both come from the example dataset; the first
# line of each file is submission metadata and is skipped on load.
simulation = ss.load_data('data/test_dataset.json',
                          ignore_first_line=True,
                          verbose=False)
ground_truth = ss.load_data('data/test_dataset.json',
                            ignore_first_line=True,
                            verbose=False)

# Measurement configuration for CP3 scenario 1.
config = ss.load_config('data/cp3_s1_configuration.json')

# Default metadata object.
metadata = ss.MetaData()

# Build the task runner around the ground truth, then score the simulation.
task_runner = ss.TaskRunner(ground_truth, config, metadata=metadata)
results, logs = task_runner(simulation, verbose=True)