Example #1
0
def load_datasets_tira_evaluation(test_dataset_main_directory, preset_key):
    """Load the PAN dataset for **Tira** evaluation.

    This function loads the PAN training dataset and truth, and the test
    dataset, by calling the *ProcessDataFiles* module twice, then returns
    the training documents/labels together with the test documents and the
    Author IDs of the test dataset.

    :param test_dataset_main_directory: Root directory of the test dataset
        (supplied by the TIRA environment).
    :param preset_key: Key into PRESETS_DICTIONARY, e.g. 'PAN18_English'.
    :return: Tuple (docs_train, docs_test, y_train, author_ids_test).
    :raises KeyError: If *preset_key* is not a known preset.
    """

    # Define the dictionary of presets. Each “preset” is a dictionary of some values.
    PRESETS_DICTIONARY = {
        'PAN18_English': {
            'dataset_name': 'PAN 2018 English',
            'xmls_subdirectory': '/en/',
            'truth_subpath': '/en-truth/truth.txt',
        }
    }
    PRESET = PRESETS_DICTIONARY[preset_key]

    # Define the constant and the paths
    TRAINING_DATASET_MAIN_DIRECTORY =\
        "U:/TA"

    # # TEMP (TIRA): For local testing on TIRA
    # TRAINING_DATASET_MAIN_DIRECTORY = "E:/author-profiling/pan18-author-profiling-training-dataset-2018-02-27"

    # NOTE(review): the subpaths begin with '/', so on POSIX os.path.join()
    # discards the directory prefix entirely (ntpath keeps the drive letter).
    # This presumably only ever runs on Windows — TODO confirm.
    xmls_directory_train = os.path.join(TRAINING_DATASET_MAIN_DIRECTORY,
                                        PRESET['xmls_subdirectory'])
    truth_path_train = os.path.join(TRAINING_DATASET_MAIN_DIRECTORY,
                                    PRESET['truth_subpath'])
    xmls_directory_test = os.path.join(test_dataset_main_directory,
                                       PRESET['xmls_subdirectory'])
    # ↳ Note: truth_path_test will not be provided to the participants.

    # Load the PAN 2018 training dataset and truth from the files into lists
    # BUG FIX: print() does not %-interpolate extra arguments (that is a
    # logging-module idiom); the original printed the raw format string and
    # the dataset name as two separate values.
    print("Loading the %s training dataset and truth..."
          % PRESET['dataset_name'])
    docs_train, y_train, author_ids_train, original_tweet_lengths_train = \
        process_data_files.load_pan_data(xmls_directory_train, truth_path_train, False, None)

    # Load the PAN 2018 test dataset from the files into lists
    print("Loading the %s test dataset..." % PRESET['dataset_name'])
    docs_test, y_test, author_ids_test, original_tweet_lengths_test = \
        process_data_files.load_pan_data(xmls_directory_test, None, False, None)
    # ↳ Note: truth_path_test will not be provided to the participants. As a result, *truths_test* will be empty.

    return docs_train, docs_test, y_train, author_ids_test
Example #2
0
def load_datasets_development(preset_key):
    """Load the PAN dataset for the development phase.

    This function loads the PAN training dataset and truth by calling the
    *ProcessDataFiles* module, then splits the dataset into stratified
    training and test sets, each sorted by Author ID.

    :param preset_key: Key into PRESETS_DICTIONARY, e.g. 'PAN18_English'.
    :return: Tuple (docs_train, docs_test, y_train, y_test).
    :raises KeyError: If *preset_key* is not a known preset.
    """
    directory = 'U:/TA'
    # Define the dictionary of presets. Each “preset” is a dictionary of some values.
    PRESETS_DICTIONARY = {
        'PAN18_English': {
            'dataset_name': 'PAN 2018 English',
            'xmls_directory': directory + '/en/',
            'truth_path': directory + '/truth/truth.txt',
            'txts_destination_directory': directory,
        }
    }
    PRESET = PRESETS_DICTIONARY[preset_key]

    # Load the PAN 2018 training dataset and the truth from the files into lists
    # BUG FIX: print() does not %-interpolate extra arguments (that is a
    # logging-module idiom); the original printed the raw format string and
    # the dataset name as two separate values.
    print("Loading the %s training dataset and the truth..."
          % PRESET['dataset_name'])
    merged_tweets_of_authors, truths, author_ids, original_tweet_lengths =\
        process_data_files.load_pan_data(PRESET['xmls_directory'], PRESET['truth_path'],
                                         False, PRESET['txts_destination_directory'])

    # Split the dataset into balanced (stratified) training and test sets:
    docs_train, docs_test, y_train, y_test, author_ids_train, author_ids_test,\
    original_tweet_lengths_train, original_tweet_lengths_test =\
        train_test_split(merged_tweets_of_authors, truths, author_ids, original_tweet_lengths,
                         test_size=0.4, random_state=42, stratify=truths)
    # ↳ *stratify=truths* selects a balanced sample from the data, with the same class proportion as the *truths* list.

    # • Sort all lists in the ascending order of *author_ids* (separately, for the training and test set)
    # This is only done for the sakes of consistency between the *load_datasets_development()* and
    # *load_datasets_tira_evaluation()* functions, because the output of the latter is sorted by *author_ids*, while the
    # former is shuffled by the *train_test_split()* function.
    # FIX: the loop variable was named *tuple*, shadowing the builtin — renamed.
    # Sort the training set
    author_ids_train, docs_train, y_train, original_tweet_lengths_train = [
        list(grouped) for grouped in zip(*sorted(
            zip(author_ids_train, docs_train, y_train,
                original_tweet_lengths_train)))
    ]
    # Sort the test set
    author_ids_test, docs_test, y_test, original_tweet_lengths_test = [
        list(grouped) for grouped in zip(*sorted(
            zip(author_ids_test, docs_test, y_test,
                original_tweet_lengths_test)))
    ]

    # # TEMP: Used for producing a mimic of the **TIRA** environment
    # ProcessDataFiles.split_train_and_test_files(author_ids_train, author_ids_test, y_train, y_test, preset_key)

    return docs_train, docs_test, y_train, y_test