Exemplo n.º 1
0
    def test_open_xls(self):
        """pandas_open should read an .xls workbook back as a DataFrame."""
        # work inside a self-cleaning temporary directory
        with tempfile.TemporaryDirectory() as tmpdirname:
            datapath = pathlib.Path(tmpdirname) / "breed.xls"

            # dump the in-memory workbook to disk so it can be re-read
            self.workbook.save(str(datapath))

            result = pandas_open(datapath)
            self.assertIsInstance(result, pd.DataFrame)
Exemplo n.º 2
0
def main(species, dataset, datafile, code_column, breed_column, fid_column,
         country_column):
    """Import breed records from a tabular datafile into the database.

    Args:
        species: species the breeds belong to.
        dataset: archive name passed to ``fetch_and_check_dataset``.
        datafile: file inside the dataset archive holding breed data.
        code_column: column with the breed code.
        breed_column: column with the breed name.
        fid_column: column with the family id (defaults to ``code_column``
            when falsy).
        country_column: optional column with the breed country.

    Raises:
        SmarterDBException: when a breed cannot be inserted because it is
            not unique.
    """
    logger.info(f"{Path(__file__).name} started")

    # custom method to check a dataset and ensure that needed stuff exists
    dataset, [datapath] = fetch_and_check_dataset(archive=dataset,
                                                  contents=[datafile])

    # by default, fid is equal to code. This is loop-invariant, so resolve
    # it once instead of rebinding the parameter on every iteration
    if not fid_column:
        fid_column = code_column

    # read breed into data
    data = pandas_open(datapath)

    for index, row in data.iterrows():
        code = row.get(code_column)
        name = row.get(breed_column)
        fid = row.get(fid_column)

        logger.debug(f"Got code: '{code}', breed_name: '{name}', "
                     f"fid: '{fid}'")

        # deal with multi countries dataset
        country = row.get(country_column) if country_column else None

        # need to define also an alias in order to retrieve such breed when
        # dealing with original file
        alias = BreedAlias(fid=fid, dataset=dataset, country=country)

        try:
            breed, modified = get_or_create_breed(species=species,
                                                  name=name,
                                                  code=code,
                                                  aliases=[alias])

            if modified:
                logger.info(f"{breed} added to database")

        except NotUniqueError as e:
            logger.error(e)
            # chain the original error so the traceback keeps its cause
            raise SmarterDBException(
                f"Got an error while inserting '{name}'. '{code}'") from e

    logger.info(f"{Path(__file__).name} ended")
Exemplo n.º 3
0
def main(src_dataset, dst_dataset, datafile, sheet_name, breed_column,
         id_column, purpose_column, chest_girth_column, height_column,
         length_column, additional_column, na_values):
    """Load phenotype records from a datafile and attach them to samples.

    Phenotypes are assigned per breed when ``breed_column`` is given,
    otherwise per sample when ``id_column`` is given.
    """
    logger.info(f"{Path(__file__).name} started")

    if additional_column:
        logger.debug(f"Got {additional_column} as additional phenotype")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset, contents=[datafile])

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset, contents=[])

    # a numeric sheet name is really a positional sheet index
    if sheet_name and sheet_name.isnumeric():
        sheet_name = int(sheet_name)

    # open data with pandas
    data = pandas_open(datapath, na_values=na_values, sheet_name=sheet_name)

    # collect columns in a dictionary
    columns = dict(
        breed_column=breed_column,
        id_column=id_column,
        purpose_column=purpose_column,
        chest_girth_column=chest_girth_column,
        height_column=height_column,
        length_column=length_column,
        additional_column=additional_column,
    )

    # breed-level assignment takes precedence over per-sample assignment
    if breed_column:
        add_phenotype_by_breed(data, dst_dataset, columns)
    elif id_column:
        add_phenotype_by_sample(data, dst_dataset, columns)

    logger.info(f"{Path(__file__).name} ended")
Exemplo n.º 4
0
def main(src_dataset, dst_dataset, datafile, breed_column, id_column,
         latitude_column, longitude_column, metadata_column, na_values):
    """Load GPS coordinates and extra metadata and attach them to samples.

    Metadata is assigned per breed when ``breed_column`` is given,
    otherwise per sample when ``id_column`` is given.

    Args:
        src_dataset: archive holding the metadata datafile.
        dst_dataset: archive whose samples receive the metadata.
        datafile: file inside ``src_dataset`` with the metadata.
        breed_column: column used to assign metadata by breed.
        id_column: column used to assign metadata by sample.
        latitude_column: column with the latitude value.
        longitude_column: column with the longitude value.
        metadata_column: optional column(s) with additional metadata.
        na_values: extra values pandas should treat as NA.
    """
    logger.info(f"{Path(__file__).name} started")

    if metadata_column:
        # debug level: purely informational, consistent with the analogous
        # additional_column message in the phenotype import script
        logger.debug(f"Got {metadata_column} as additional metadata")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset,
        contents=[datafile]
    )

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset,
        contents=[]
    )

    # open data with pandas
    data = pandas_open(datapath, na_values=na_values)

    # collect columns in a dictionary
    columns = {
        'breed_column': breed_column,
        'id_column': id_column,
        'latitude_column': latitude_column,
        'longitude_column': longitude_column,
        'metadata_column': metadata_column,
    }

    if breed_column:
        add_metadata_by_breed(data, dst_dataset, columns)

    elif id_column:
        add_metadata_by_sample(data, dst_dataset, columns)

    logger.info(f"{Path(__file__).name} ended")
Exemplo n.º 5
0
 def test_open_csv(self):
     """A plain CSV manifest should also load as a DataFrame."""
     frame = pandas_open(SCRIPTS_DATA_DIR / "test_manifest.csv")
     self.assertIsInstance(frame, pd.DataFrame)
Exemplo n.º 6
0
def main(
        src_dataset, dst_dataset, datafile, code_column, country_column,
        id_column, sex_column, chip_name):
    """Create sample records in the database from a tabular datafile.

    Each row supplies a breed code, a country and an original id; the
    breed is looked up through its dataset alias and a sample is created
    (or fetched) for it.
    """
    logger.info(f"{Path(__file__).name} started")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset, contents=[datafile])

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset, contents=[])

    # mind dataset species
    SampleSpecie = get_sample_species(dst_dataset.species)

    # read datafile
    data = pandas_open(datapath)

    logger.info(f"Got columns: {data.columns.to_list()}")

    for index, row in data.iterrows():
        logger.debug(f"Got: {row.to_list()}")

        code = row.get(code_column)
        country = row.get(country_column)
        original_id = row.get(id_column)

        sex = None
        if sex_column:
            sex = SEX.from_string(str(row.get(sex_column)))

            # treat an unknown sex as missing
            if sex == SEX.UNKNOWN:
                sex = None

        logger.debug(
            f"Got code: {code}, country: {country}, "
            f"original_id: {original_id}, sex: {sex}"
        )

        # process a country by doing a fuzzy search
        # HINT: this function cache results relying arguments using lru_cache
        # see find country implementation for more informations
        country = find_country(country)

        # get breed from database
        breed = Breed.objects(
            aliases__match={'fid': code, 'dataset': dst_dataset}).get()

        logger.debug(f"found breed '{breed}'")

        # get or create a new Sample Obj
        sample, created = get_or_create_sample(
            SampleSpecie, original_id, dst_dataset, breed,
            country.name, chip_name, sex)

        if created:
            logger.info(f"Sample '{sample}' added to database")

    logger.info(f"{Path(__file__).name} ended")