示例#1
0
def build_survey_collection(name=None,
                            erase_collection_json=False,
                            overwrite_surveys=False,
                            data_directory_path_by_year=None,
                            source_format='sas'):
    """Build or update the survey collection ``name``, one survey per year.

    For every ``(year, directory)`` pair, the data files returned by
    ``create_data_file_by_format`` are registered as the survey
    ``"<name>_<year>"``, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the surveys whose name ends
    with the year are passed to ``fill_hdf``.

    Args:
        name: Name of the collection. Required.
        erase_collection_json: When true, start from a new
            ``SurveyCollection`` instead of loading the existing one.
        overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_year: Mapping from year to data directory.
            A value that is not an existing directory is resolved relative
            to the ``input_directory`` option of the ``data`` config section.
            Required.
        source_format: Key used to select files in the mapping returned by
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection that was built.

    Raises:
        AssertionError: If ``name`` or ``data_directory_path_by_year`` is
            None, or if a data directory cannot be resolved.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # dict.keys() never returns None, so the emptiness of the mapping is what
    # has to be tested for this message to ever be emitted.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)

    # items() works on both Python 2 and Python 3 mappings, unlike iteritems().
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Treat the value as relative to the configured input directory.
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory), \
                '{} is not a valid directory path'.format(input_data_directory)
            data_directory_path = os.path.join(input_data_directory,
                                               data_directory_path)
            # Check the resolved path itself, not the input directory again.
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(name, year)
        # Register the files under the keyword matching their format so that
        # stata files are not recorded as sas files.
        files_keyword = ('stata_files'
                         if source_format == 'stata' else 'sas_files')
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
        )
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(name))
        # Persist the metadata before the data conversion starts.
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        survey_collection.fill_hdf(source_format=source_format,
                                   surveys=surveys,
                                   overwrite=overwrite_surveys)
    return survey_collection
def build_survey_collection(
        config_files_directory: str,
        collection_name = None,
        replace_metadata = False,
        replace_data = False,
        data_directory_path_by_survey_suffix = None,
        source_format = 'sas',
        ):
    """Build or update a survey collection from per-survey data directories.

    For every ``(suffix, directory)`` pair, the data files returned by
    ``create_data_file_by_format`` are registered as the survey
    ``"<collection_name>_<suffix>"``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the surveys whose
    name ends with the suffix are passed to ``fill_hdf``.

    Args:
        config_files_directory: Directory of the configuration files, passed
            to ``SurveyCollection``.
        collection_name: Name of the collection. Required.
        replace_metadata: When true, start from a new ``SurveyCollection``
            instead of loading the existing one.
        replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_survey_suffix: Non-empty mapping from survey
            suffix to an existing data directory. Required.
        source_format: Kept for backward compatibility. The format passed to
            ``fill_hdf`` is the first one for which data files were found.

    Returns:
        The survey collection that was built.

    Raises:
        AssertionError: If a required argument is missing or empty, if a
            directory does not exist, or if no data file is found for a
            survey.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # A key list is never None; emptiness is the condition worth rejecting.
    assert data_directory_path_by_survey_suffix, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except configparser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    # The destination does not depend on the survey: validate it once, before
    # any survey is added to the collection.
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), (
        "{} who should be the collections' directory does not exist.\n"
        "Fix the option collections_directory in the collections section of your config file."
        ).format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))

    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format.get('sas'),
            stata_files = data_file_by_format.get('stata'),
            )

        # Keep only the formats for which at least one data file was found.
        valid_source_formats = [
            _format for _format, data_files in data_file_by_format.items()
            if data_files
            ]
        log.info("Valid source formats are: {}".format(valid_source_formats))
        assert valid_source_formats, "No data file found in {}".format(data_directory_path)
        detected_format = valid_source_formats[0]
        log.info("Using the following format: {}".format(detected_format))

        # Persist the metadata before the data conversion starts.
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = detected_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Build or update the ``budget_des_familles`` survey collection.

    For every year, the stata files returned by ``create_data_file_by_format``
    for ``<input root>/budget_des_familles/<year>`` are registered as the
    survey ``"budget_des_familles_<year>"``, the collection is dumped to
    ``<collections_directory>/budget_des_familles.json`` and the surveys
    whose name ends with the year are passed to ``fill_hdf``.

    Args:
        years: Iterable of years to process. Required.
        erase: When true, start from a new ``SurveyCollection`` instead of
            loading the existing one.
        overwrite: Forwarded to ``fill_hdf``.

    Returns:
        The survey collection that was built.

    Raises:
        ValueError: If ``years`` is None.
    """
    if years is None:
        # Fail here with an explicit message instead of logging and then
        # crashing later when iterating over None.
        message = "A list of years to process is needed"
        log.error(message)
        raise ValueError(message)

    collection_name = 'budget_des_familles'
    if erase:
        bdf_survey_collection = SurveyCollection(
            name=collection_name,
            config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            bdf_survey_collection = SurveyCollection(
                name=collection_name,
                config_files_directory=config_files_directory)

    input_data_directory = bdf_survey_collection.config.get(
        'data', 'input_directory')
    # The data root is the parent of the configured input directory, except
    # for the user 'benjello', for whom it is the sibling 'INSEE' directory.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(
            os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'budget_des_familles_{}'.format(year)

        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
        )

        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(
            collections_directory, '{}.json'.format(collection_name))
        # Persist the metadata before the data conversion starts.
        bdf_survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        bdf_survey_collection.fill_hdf(source_format='stata',
                                       surveys=surveys,
                                       overwrite=overwrite)
    return bdf_survey_collection
def build_erfs_survey_collection(years = None, erase = False, overwrite = False):
    """Build or update the ``erfs`` survey collection.

    For every year, the sas files returned by ``create_data_file_by_format``
    for ``<input root>/ERF/ERFS_<year>`` are registered as the survey
    ``"erfs_<year>"``, the collection is dumped to
    ``<collections_directory>/erfs.json`` and the surveys whose name ends
    with the year are passed to ``fill_hdf``.

    Args:
        years: Iterable of years to process. Required.
        erase: When true, start from a new ``SurveyCollection`` instead of
            loading the existing one.
        overwrite: Forwarded to ``fill_hdf``.

    Returns:
        The survey collection that was built.

    Raises:
        ValueError: If ``years`` is None.
    """
    if years is None:
        # Fail here with an explicit message instead of logging and then
        # crashing later when iterating over None.
        message = "A list of years to process is needed"
        log.error(message)
        raise ValueError(message)

    collection_name = 'erfs'
    if erase:
        erfs_survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            erfs_survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            erfs_survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    input_data_directory = erfs_survey_collection.config.get('data', 'input_directory')
    # The data root is the parent of the configured input directory, except
    # for the user 'benjello', for whom it is the sibling 'INSEE' directory.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory,
            'ERF/ERFS_{}'.format(year)
            )
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'erfs_{}'.format(year)

        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = erfs_survey_collection,
            sas_files = data_file_by_format['sas'],
            )

        collections_directory = erfs_survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, '{}.json'.format(collection_name))
        # Persist the metadata before the data conversion starts.
        erfs_survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in erfs_survey_collection.surveys if survey.name.endswith(str(year))]

        erfs_survey_collection.fill_hdf(source_format = 'sas', surveys = surveys, overwrite = overwrite)
    return erfs_survey_collection
def build_survey_collection(name = None, erase_collection_json = False, overwrite_surveys = False,
        data_directory_path_by_year = None, source_format = 'sas'):
    """Build or update the survey collection ``name``, one survey per year.

    For every ``(year, directory)`` pair, the data files returned by
    ``create_data_file_by_format`` are registered as the survey
    ``"<name>_<year>"``, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the surveys whose name ends
    with the year are passed to ``fill_hdf``.

    Args:
        name: Name of the collection. Required.
        erase_collection_json: When true, start from a new
            ``SurveyCollection`` instead of loading the existing one.
        overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_year: Mapping from year to data directory.
            A value that is not an existing directory is resolved relative
            to the ``input_directory`` option of the ``data`` config section.
            Required.
        source_format: Key used to select files in the mapping returned by
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection that was built.

    Raises:
        AssertionError: If ``name`` or ``data_directory_path_by_year`` is
            None, or if a data directory cannot be resolved.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # dict.keys() never returns None, so the emptiness of the mapping is what
    # has to be tested for this message to ever be emitted.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name = name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            survey_collection = SurveyCollection(
                name = name, config_files_directory = config_files_directory)

    # Registering under the keyword matching the format keeps stata files
    # from being recorded as sas files.
    files_keyword = 'stata_files' if source_format == 'stata' else 'sas_files'

    # items() works on both Python 2 and Python 3 mappings, unlike iteritems().
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Treat the value as relative to the configured input directory.
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory), \
                '{} is not a valid directory path'.format(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            # Check the resolved path itself, not the input directory again.
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        # Persist the metadata before the data conversion starts.
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(year))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = overwrite_surveys)
    return survey_collection
def build_survey_collection(collection_name = None, replace_metadata = False, replace_data = False,
        data_directory_path_by_survey_suffix = None, source_format = 'sas'):
    """Build or update a survey collection from per-survey data directories.

    For every ``(suffix, directory)`` pair, the data files returned by
    ``create_data_file_by_format`` are registered as the survey
    ``"<collection_name>_<suffix>"``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the surveys whose
    name ends with the suffix are passed to ``fill_hdf``.

    Args:
        collection_name: Name of the collection. Required.
        replace_metadata: When true, start from a new ``SurveyCollection``
            instead of loading the existing one.
        replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_survey_suffix: Non-empty mapping from survey
            suffix to an existing data directory. Required.
        source_format: Key used to select files in the mapping returned by
            ``create_data_file_by_format``; also forwarded to ``fill_hdf``.

    Returns:
        The survey collection that was built.

    Raises:
        AssertionError: If a required argument is missing or empty, or if a
            directory does not exist.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # The result of keys() is never None; emptiness is what must be rejected.
    assert data_directory_path_by_survey_suffix, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    # The destination does not depend on the survey: validate it once, before
    # any survey is added to the collection.
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), (
        "{} who should be the collections' directory does not exist.\n"
        "Fix the option collections_directory in the collections section of your config file."
        ).format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))

    # Registering under the keyword matching the format keeps stata files
    # from being recorded as sas files.
    files_keyword = 'stata_files' if source_format == 'stata' else 'sas_files'

    # items() works on both Python 2 and Python 3 mappings, unlike iteritems().
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
            )
        # Persist the metadata before the data conversion starts.
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def build_survey_collection(collection_name=None,
                            replace_metadata=False,
                            replace_data=False,
                            data_directory_path_by_survey_suffix=None,
                            source_format='sas'):
    """Build or update a survey collection from per-survey data directories.

    For every ``(suffix, directory)`` pair, the data files returned by
    ``create_data_file_by_format`` are registered as the survey
    ``"<collection_name>_<suffix>"``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the surveys whose
    name ends with the suffix are passed to ``fill_hdf``.

    Args:
        collection_name: Name of the collection. Required.
        replace_metadata: When true, start from a new ``SurveyCollection``
            instead of loading the existing one.
        replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
        data_directory_path_by_survey_suffix: Non-empty mapping from survey
            suffix to an existing data directory. Required.
        source_format: Kept for backward compatibility. The format passed to
            ``fill_hdf`` is the first one for which data files were found.

    Returns:
        The survey collection that was built.

    Raises:
        AssertionError: If a required argument is missing or empty, if a
            directory does not exist, or if no data file is found for a
            survey.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # The result of keys() is never None; emptiness is what must be rejected.
    assert data_directory_path_by_survey_suffix, \
        "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name=collection_name,
            config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not referenced in the config yet: create it.
            survey_collection = SurveyCollection(
                name=collection_name,
                config_files_directory=config_files_directory)

    # The destination does not depend on the survey: validate it once, before
    # any survey is added to the collection.
    collections_directory = survey_collection.config.get(
        'collections', 'collections_directory')
    assert os.path.isdir(collections_directory), (
        "{} who should be the collections' directory does not exist.\n"
        "Fix the option collections_directory in the collections section of your config file."
    ).format(collections_directory)
    collection_json_path = os.path.join(collections_directory,
                                        "{}.json".format(collection_name))

    # items() works on both Python 2 and Python 3 mappings, unlike iteritems().
    for survey_suffix, data_directory_path in \
            data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), \
            '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format.get('sas'),
            stata_files=data_file_by_format.get('stata'),
        )

        # Keep only the formats for which at least one data file was found.
        valid_source_formats = [
            _format for _format, data_files in data_file_by_format.items()
            if data_files
        ]
        log.info("Valid source formats are: {}".format(valid_source_formats))
        assert valid_source_formats, \
            "No data file found in {}".format(data_directory_path)
        detected_format = valid_source_formats[0]
        log.info("Using the following format: {}".format(detected_format))

        # Persist the metadata before the data conversion starts.
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(survey_suffix))
        ]
        survey_collection.fill_hdf(source_format=detected_format,
                                   surveys=surveys,
                                   overwrite=replace_data)
    return survey_collection