Exemplo n.º 1
0
def store_input_data_frame(data_frame = None, collection = None, survey = None, table = None):
    """Store a data frame as a table of a survey and dump the collection metadata.

    The collection is loaded by name; if loading fails for any reason, a new
    collection with that name is created instead. The survey is taken from the
    collection when a survey with that name is listed there; otherwise a new
    one is created with an HDF5 file named ``<survey>.h5`` located in
    ``os.path.dirname`` of the configured ``data/output_directory``.

    Args:
        data_frame: Data to insert; must not be None.
        collection: Name of the survey collection; must not be None.
        survey: Name of the survey receiving the table; must not be None.
        table: Name of the table; "input" is used when None.

    Raises:
        AssertionError: If ``data_frame``, ``collection`` or ``survey`` is None.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(collection = collection)
    except Exception:
        # Deliberate best effort: any loading failure falls back to a new collection.
        openfisca_survey_collection = SurveyCollection(name = collection)

    log.debug("In collection {} the following survey are present: {}".format(collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    if table is None:
        table = "input"

    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    available_survey_names = [survey_.name for survey_ in openfisca_survey_collection.surveys]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
    survey.insert_table(name = table, data_frame = data_frame)

    # Replace any entry with the same name instead of blindly appending, so that
    # a survey that was already listed does not end up twice in the collection.
    openfisca_survey_collection.surveys = [
        kept_survey for kept_survey in openfisca_survey_collection.surveys
        if kept_survey.name != survey_name
        ]
    openfisca_survey_collection.surveys.append(survey)

    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    log.debug("In collection {} the following surveyx are present: {}".format(collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def run_all(year = None, filename = "test", check = False):
    """Run every data-building step for ``year`` and save the result in the "openfisca" collection.

    The data frame returned by ``final.final`` is inserted as table "input" of
    a survey named ``openfisca_data_<year>``; the collection is then dumped to
    ``openfisca.json`` in the configured collections directory.

    Args:
        year: Year to process; must not be None.
        filename: Not used by this function.
        check: Forwarded to ``final.final``.
    """
    assert year is not None

    # Building steps, in order (the imputation_loyer step is disabled).
    pre_processing.create_indivim_menage_en_mois(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    final_data_frame = final.final(year = year, check = check)

    # Persist the resulting data frame.
    collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    name = "openfisca_data_{}".format(year)
    openfisca_survey = Survey(
        name = name,
        hdf5_file_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(name)),
        )
    openfisca_survey.insert_table(name = "input", data_frame = final_data_frame)
    collection.surveys.append(openfisca_survey)
    json_path = os.path.join(
        collection.config.get('collections', 'collections_directory'),
        'openfisca.json',
        )
    collection.dump(json_file_path = json_path)
Exemplo n.º 3
0
def build_survey_collection(name=None,
                            erase_collection_json=False,
                            overwrite_surveys=False,
                            data_directory_path_by_year=None,
                            source_format='sas'):
    """Build or update a survey collection with one survey per year.

    For every ``(year, directory)`` pair, the data files found in the directory
    are registered as survey ``<name>_<year>``, the collection metadata is
    dumped to ``<name>.json`` and the HDF5 files of the surveys whose name ends
    with the year are filled.

    Args:
        name: Name of the collection; must not be None.
        erase_collection_json: When true, start from a new collection instead
            of loading the existing one.
        overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_year: Mapping from year to data directory. A
            path that is not a directory is looked up inside the configured
            ``data/input_directory``. Must not be None.
        source_format: Key selecting the files in the result of
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection.

    Raises:
        AssertionError: If a required argument is None or a data directory
            cannot be resolved.
    """
    assert name is not None
    assert data_directory_path_by_year is not None

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)

    for year, data_directory_path in data_directory_path_by_year.iteritems():
        if not os.path.isdir(data_directory_path):
            # Not a directory as given: resolve it against the input directory.
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory,
                                               data_directory_path)
            # Check the resolved path itself (the input directory was
            # already validated just above).
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        # NOTE(review): the files are passed as sas_files whatever
        # source_format is -- confirm this is intended.
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format[source_format],
        )
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        survey_collection.fill_hdf(source_format=source_format,
                                   surveys=surveys,
                                   overwrite=overwrite_surveys)
    return survey_collection
def build_survey_collection(
        config_files_directory: str,
        collection_name = None,
        replace_metadata = False,
        replace_data = False,
        data_directory_path_by_survey_suffix = None,
        source_format = 'sas',
        ):
    """Build or update a survey collection with one survey per suffix.

    For every ``(suffix, directory)`` pair, the SAS and Stata files found in
    the directory are registered as survey ``<collection_name>_<suffix>``, the
    metadata is dumped to ``<collection_name>.json`` and the HDF5 files of the
    surveys whose name ends with the suffix are filled.

    Args:
        config_files_directory: Directory given to ``SurveyCollection``.
        collection_name: Name of the collection; must not be None.
        replace_metadata: When true, start from a new collection instead of
            loading the existing one.
        replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_survey_suffix: Mapping from survey suffix to an
            existing data directory; must not be None.
        source_format: Overwritten inside the loop by the first format that
            actually has data files.

    Returns:
        The survey collection.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = list(data_directory_path_by_survey_suffix.keys())
    assert surveys_name is not None, "A list of surveys to process is needed"

    def new_collection():
        return SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)

    if replace_metadata:
        survey_collection = new_collection()
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except configparser.NoOptionError:
            survey_collection = new_collection()

    for suffix, directory in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(directory), '{} is not a valid directory path'.format(directory)

        files_by_format = create_data_file_by_format(directory)
        add_survey_to_collection(
            survey_name = '{}_{}'.format(collection_name, suffix),
            survey_collection = survey_collection,
            sas_files = files_by_format.get('sas'),
            stata_files = files_by_format.get('stata'),
            )

        # Keep only the formats for which files were found.
        valid_source_format = [
            candidate for candidate in files_by_format.keys()
            if files_by_format.get(candidate)
            ]
        log.info("Valid source formats are: {}".format(valid_source_format))
        source_format = valid_source_format[0]
        log.info("Using the following format: {}".format(source_format))

        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), (
            "{} who should be the collections' directory does not exist.\n"
            "Fix the option collections_directory in the collections section of your config file."
            ).format(collections_directory)
        survey_collection.dump(
            json_file_path = os.path.join(collections_directory, "{}.json".format(collection_name)))

        suffix_text = str(suffix)
        matching_surveys = [
            candidate for candidate in survey_collection.surveys
            if candidate.name.endswith(suffix_text)
            ]
        survey_collection.fill_hdf(
            source_format = source_format, surveys = matching_surveys, overwrite = replace_data)
    return survey_collection
Exemplo n.º 5
0
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Store a data frame as a table of a survey and dump the collection metadata.

    Args:
        input_dataframe: Data to insert into the survey.
        entity: Entity name, used to build the default table name and label.
        period: Period, normalised with ``periods.period``.
        collection: Name of the survey collection to load or create.
        survey_name: Name of the survey receiving the table.
        survey_label: Label given to the survey when it has to be created.
        table_label: Label of the table; defaults to a description built from
            the entity and the period.
        table_name: Name of the table; defaults to ``<entity>_<period>``.

    Raises:
        AssertionError: If the configured collections directory does not exist.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # presumably get_survey asserts that the survey exists -- verify against its definition
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # Logger.warn is a deprecated alias (removed in Python 3.13): use warning.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any previous entry for this survey so it is listed only once.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Insert a data frame as a survey table and save the collection metadata.

    Args:
        input_dataframe: Data to insert into the survey.
        entity: Entity name, used to build the default table name and label.
        period: Period, normalised with ``periods.period``.
        collection: Name of the survey collection to load or create.
        survey_name: Name of the survey receiving the table.
        survey_label: Label given to the survey when it has to be created.
        table_label: Label of the table; defaults to a description built from
            the entity and the period.
        table_name: Name of the table; defaults to ``<entity>_<period>``.

    Raises:
        AssertionError: If the configured collections directory does not exist.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # presumably raised by get_survey when the survey is unknown -- TODO confirm
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # warning() replaces the deprecated warn() alias, which Python 3.13 removed.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Drop any earlier entry with the same name before adding the updated survey.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Build or update the "budget_des_familles" survey collection.

    For every year, the Stata files found in ``budget_des_familles/<year>``
    are registered as survey ``budget_des_familles_<year>``, the metadata is
    dumped to ``budget_des_familles.json`` and the HDF5 files of the surveys
    whose name ends with the year are filled.

    Args:
        years: Iterable of years to process.
        erase: When true, start from a new collection instead of loading the
            existing one.
        overwrite: Forwarded to ``fill_hdf``.

    Returns:
        The survey collection.
    """
    if years is None:
        # Only logged: execution goes on and iterating over None below raises.
        log.error("A list of years to process is needed")

    def new_collection():
        return SurveyCollection(
            name="budget_des_familles",
            config_files_directory=config_files_directory)

    if erase:
        bdf_survey_collection = new_collection()
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection='budget_des_familles',
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            bdf_survey_collection = new_collection()

    # Data live in the parent of the configured input directory, under an
    # extra 'INSEE' level for one specific user account.
    base_directory = os.path.dirname(
        bdf_survey_collection.config.get('data', 'input_directory'))
    if getpass.getuser() == 'benjello':
        base_directory = os.path.join(base_directory, 'INSEE')

    for year in years:
        files_by_format = create_data_file_by_format(
            os.path.join(base_directory,
                         'budget_des_familles/{}'.format(year)))

        add_survey_to_collection(
            survey_name='budget_des_familles_{}'.format(year),
            survey_collection=bdf_survey_collection,
            stata_files=files_by_format['stata'],
        )

        json_path = os.path.join(
            bdf_survey_collection.config.get('collections',
                                             'collections_directory'),
            "budget_des_familles.json")
        bdf_survey_collection.dump(json_file_path=json_path)

        year_suffix = str(year)
        year_surveys = [
            candidate for candidate in bdf_survey_collection.surveys
            if candidate.name.endswith(year_suffix)
        ]
        bdf_survey_collection.fill_hdf(source_format='stata',
                                       surveys=year_surveys,
                                       overwrite=overwrite)
    return bdf_survey_collection
Exemplo n.º 8
0
def store_input_data_frame(data_frame = None, collection = None, survey = None):
    """Store a data frame as table "input" of a new survey and dump the collection.

    A new collection named ``collection`` is created, a survey named ``survey``
    is added to it, and the metadata is dumped to ``openfisca_erfs_fpr.json``
    in the configured collections directory.

    Args:
        data_frame: Data to insert; must not be None.
        collection: Name of the survey collection; must not be None.
        survey: Name of the survey to create; must not be None.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None

    target_collection = SurveyCollection(name = collection, config_files_directory = config_files_directory)
    output_directory = target_collection.config.get('data', 'output_directory')
    name = survey
    new_survey = Survey(
        name = name,
        hdf5_file_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(name)),
        )
    new_survey.insert_table(name = "input", data_frame = data_frame)
    target_collection.surveys.append(new_survey)

    json_path = os.path.join(
        target_collection.config.get('collections', 'collections_directory'),
        'openfisca_erfs_fpr.json',
        )
    target_collection.dump(json_file_path = json_path)
def build_erfs_survey_collection(years = None, erase = False, overwrite = False):
    """Build or update the "erfs" survey collection.

    For every year, the SAS files found in ``ERF/ERFS_<year>`` are registered
    as survey ``erfs_<year>``, the metadata is dumped to ``erfs.json`` and the
    HDF5 files of the surveys whose name ends with the year are filled.

    Args:
        years: Iterable of years to process.
        erase: When true, start from a new collection instead of loading the
            existing one.
        overwrite: Forwarded to ``fill_hdf``.

    Returns:
        The survey collection.
    """
    if years is None:
        # Only logged: execution goes on and iterating over None below raises.
        log.error("A list of years to process is needed")

    def new_collection():
        return SurveyCollection(
            name = "erfs", config_files_directory = config_files_directory)

    if erase:
        erfs_survey_collection = new_collection()
    else:
        try:
            erfs_survey_collection = SurveyCollection.load(
                collection = 'erfs', config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            erfs_survey_collection = new_collection()

    # Data live in the parent of the configured input directory, under an
    # extra 'INSEE' level for one specific user account.
    base_directory = os.path.dirname(erfs_survey_collection.config.get('data', 'input_directory'))
    if getpass.getuser() == 'benjello':
        base_directory = os.path.join(base_directory, 'INSEE')

    for year in years:
        files_by_format = create_data_file_by_format(
            os.path.join(base_directory, 'ERF/ERFS_{}'.format(year)))

        add_survey_to_collection(
            survey_name = 'erfs_{}'.format(year),
            survey_collection = erfs_survey_collection,
            sas_files = files_by_format['sas'],
            )

        json_path = os.path.join(
            erfs_survey_collection.config.get('collections', 'collections_directory'),
            "erfs.json",
            )
        erfs_survey_collection.dump(json_file_path = json_path)

        year_suffix = str(year)
        year_surveys = [
            candidate for candidate in erfs_survey_collection.surveys
            if candidate.name.endswith(year_suffix)
            ]
        erfs_survey_collection.fill_hdf(source_format = 'sas', surveys = year_surveys, overwrite = overwrite)
    return erfs_survey_collection
Exemplo n.º 10
0
def build(year=None, check=False):
    """Run every data-building step for ``year`` and save the result.

    Once the steps have run, the data frame stored under ``input_<year>`` in
    the 'erfs' temporary store is inserted as table "input" of a survey named
    ``openfisca_data_<year>``; the "openfisca" collection is then dumped to
    ``openfisca.json`` in the configured collections directory.

    Args:
        year: Year to process; must not be None.
        check: Forwarded to ``final.final``.
    """
    assert year is not None

    # Building steps, in order (the imputation_loyer step is disabled).
    pre_processing.create_indivim_menagem(year=year)
    pre_processing.create_enfants_a_naitre(year=year)
    fip.create_fip(year=year)
    famille.famille(year=year)
    foyer.sif(year=year)
    foyer.foyer_all(year=year)
    rebuild.create_totals_first_pass(year=year)
    rebuild.create_totals_second_pass(year=year)
    rebuild.create_final(year=year)
    invalides.invalide(year=year)
    final.final(year=year, check=check)

    # Fetch the result from the temporary store.
    input_data_frame = get_store(file_name='erfs')['input_{}'.format(year)]

    # Persist it in the "openfisca" collection.
    collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    name = "openfisca_data_{}".format(year)
    openfisca_survey = Survey(
        name=name,
        hdf5_file_path=os.path.join(os.path.dirname(output_directory),
                                    "{}.h5".format(name)),
    )
    openfisca_survey.insert_table(name="input", data_frame=input_data_frame)
    collection.surveys.append(openfisca_survey)
    json_path = os.path.join(
        collection.config.get('collections', 'collections_directory'),
        'openfisca.json')
    collection.dump(json_file_path=json_path)
def build_survey_collection(name = None, erase_collection_json = False, overwrite_surveys = False,
        data_directory_path_by_year = None, source_format = 'sas'):
    """Build or update a survey collection with one survey per year.

    For every ``(year, directory)`` pair, the data files found in the directory
    are registered as survey ``<name>_<year>``, the collection metadata is
    dumped to ``<name>.json`` and the HDF5 files of the surveys whose name ends
    with the year are filled.

    Args:
        name: Name of the collection; must not be None.
        erase_collection_json: When true, start from a new collection instead
            of loading the existing one.
        overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_year: Mapping from year to data directory. A
            path that is not a directory is looked up inside the configured
            ``data/input_directory``. Must not be None.
        source_format: Key selecting the files in the result of
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection.

    Raises:
        AssertionError: If a required argument is None or a data directory
            cannot be resolved.
    """
    assert name is not None
    assert data_directory_path_by_year is not None

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name = name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name = name, config_files_directory = config_files_directory)

    for year, data_directory_path in data_directory_path_by_year.iteritems():
        if not os.path.isdir(data_directory_path):
            # Not a directory as given: resolve it against the input directory.
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            # Check the resolved path itself (the input directory was already validated above).
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        # NOTE(review): the files are passed as sas_files whatever source_format is -- confirm this is intended.
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(year))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = overwrite_surveys)
    return survey_collection
Exemplo n.º 12
0
def store_input_data_frame(data_frame=None,
                           collection=None,
                           survey=None,
                           table=None):
    """Store a data frame as a table of a survey and dump the collection metadata.

    The collection is loaded by name; if loading fails for any reason, a new
    collection with that name is created instead. The survey is taken from the
    collection when a survey with that name is listed there; otherwise a new
    one is created with an HDF5 file named ``<survey>.h5`` located in
    ``os.path.dirname`` of the configured ``data/output_directory``.

    Args:
        data_frame: Data to insert; must not be None.
        collection: Name of the survey collection; must not be None.
        survey: Name of the survey receiving the table; must not be None.
        table: Name of the table; "input" is used when None.

    Raises:
        AssertionError: If ``data_frame``, ``collection`` or ``survey`` is None.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:
        # Deliberate best effort: any loading failure falls back to a new
        # collection.
        openfisca_survey_collection = SurveyCollection(name=collection)

    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"

    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
    ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
    survey.insert_table(name=table, data_frame=data_frame)

    # Replace any entry with the same name instead of blindly appending, so
    # that a survey that was already listed does not end up twice.
    openfisca_survey_collection.surveys = [
        kept_survey for kept_survey in openfisca_survey_collection.surveys
        if kept_survey.name != survey_name
    ]
    openfisca_survey_collection.surveys.append(survey)

    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
                                  '{}.json'.format(collection))
    log.debug("In collection {} the following surveyx are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_survey_collection(collection_name = None, replace_metadata = False, replace_data = False,
        data_directory_path_by_survey_suffix = None, source_format = 'sas'):
    """Build or update a survey collection with one survey per suffix.

    For every ``(suffix, directory)`` pair, the files of format
    ``source_format`` found in the directory are registered as survey
    ``<collection_name>_<suffix>``, the metadata is dumped to
    ``<collection_name>.json`` and the HDF5 files of the surveys whose name
    ends with the suffix are filled.

    Args:
        collection_name: Name of the collection; must not be None.
        replace_metadata: When true, start from a new collection instead of
            loading the existing one.
        replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_survey_suffix: Mapping from survey suffix to an
            existing data directory; must not be None.
        source_format: Key selecting the files in the result of
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = data_directory_path_by_survey_suffix.keys()
    assert surveys_name is not None, "A list of surveys to process is needed"

    def new_collection():
        return SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)

    if replace_metadata:
        survey_collection = new_collection()
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = new_collection()

    for suffix, directory in data_directory_path_by_survey_suffix.iteritems():
        assert os.path.isdir(directory)

        files_by_format = create_data_file_by_format(directory)
        print(files_by_format)
        add_survey_to_collection(
            survey_name = '{}_{}'.format(collection_name, suffix),
            survey_collection = survey_collection,
            sas_files = files_by_format[source_format],
            )

        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), (
            "{} who should be the collections' directory does not exist.\n"
            "Fix the option collections_directory in the collections section of your config file."
            ).format(collections_directory)
        survey_collection.dump(
            json_file_path = os.path.join(collections_directory, "{}.json".format(collection_name)))

        suffix_text = str(suffix)
        matching_surveys = [
            candidate for candidate in survey_collection.surveys
            if candidate.name.endswith(suffix_text)
            ]
        survey_collection.fill_hdf(
            source_format = source_format, surveys = matching_surveys, overwrite = replace_data)
    return survey_collection
Exemplo n.º 14
0
def run_all(year = None, check = False):
    """Run every data-building step for ``year`` and save the result.

    Once the steps have run, the data frame stored under ``input_<year>`` in
    the 'erfs' temporary store is inserted as table "input" of a survey named
    ``openfisca_data_<year>``; the "openfisca" collection is then dumped to
    ``openfisca.json`` in the configured collections directory.

    Args:
        year: Year to process; must not be None.
        check: Forwarded to ``final.final``.
    """
    assert year is not None

    # Building steps, in order (the imputation_loyer step is disabled).
    pre_processing.create_indivim_menagem(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals_first_pass(year = year)
    rebuild.create_totals_second_pass(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    final.final(year = year, check = check)

    # Fetch the result from the temporary store.
    input_data_frame = get_store(file_name = 'erfs')['input_{}'.format(year)]

    # Persist it in the "openfisca" collection.
    collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    name = "openfisca_data_{}".format(year)
    openfisca_survey = Survey(
        name = name,
        hdf5_file_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(name)),
        )
    openfisca_survey.insert_table(name = "input", data_frame = input_data_frame)
    collection.surveys.append(openfisca_survey)
    json_path = os.path.join(
        collection.config.get('collections', 'collections_directory'),
        'openfisca.json',
        )
    collection.dump(json_file_path = json_path)
            collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)
    except:
        openfisca_survey_collection = SurveyCollection(
            name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)

    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()


def run(years_calage):
    """Call ``run_all`` for every calibration year and log how long each call took.

    Args:
        years_calage: Iterable of calibration years; each one is passed to
            ``run_all`` together with the fixed list of data years.
    """
    from time import time as now

    data_years = [1995, 2000, 2005, 2011]
    for calibration_year in years_calage:
        started_at = now()
        run_all(calibration_year, data_years)
        elapsed = now() - started_at
        log.info("Finished {}".format(elapsed))

if __name__ == '__main__':
    # Script entry point: send INFO-level logs to stdout, then process the calibration years.
    import sys
    logging.basicConfig(level = logging.INFO, stream = sys.stdout)
    years_calage = [2000, 2005, 2011]  # each year is handed to run_all by run()
    run(years_calage)
def build_survey_collection(collection_name=None,
                            replace_metadata=False,
                            replace_data=False,
                            data_directory_path_by_survey_suffix=None,
                            source_format='sas'):

    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = data_directory_path_by_survey_suffix.keys()
    assert surveys_name is not None, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name=collection_name,
            config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name=collection_name,
                config_files_directory=config_files_directory)

    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.iteritems(
    ):
        assert os.path.isdir(
            data_directory_path), '{} is not a valid directory path'.format(
                data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format.get('sas'),
            stata_files=data_file_by_format.get('stata'),
        )

        valid_source_format = [
            _format for _format in data_file_by_format.keys()
            if data_file_by_format.get((_format))
        ]
        log.info("Valid source formats are: {}".format(valid_source_format))
        source_format = valid_source_format[0]
        log.info("Using the following format: {}".format(source_format))
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        assert os.path.isdir(
            collections_directory
        ), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(
            collections_directory)
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(collection_name))
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(survey_suffix))
        ]
        survey_collection.fill_hdf(source_format=source_format,
                                   surveys=surveys,
                                   overwrite=replace_data)
    return survey_collection