Exemplo n.º 1
0
    def test_load_glove_file(self):
        """
        Verify that the glove embeddings file can be loaded into a DataFrame.

        Examined test case: the resulting DataFrame must not be empty.
        """
        properties = {
            "output_folder": "output",
            "resources_folder": "resources",
            "embeddings_file": "glove.6B.50d.txt",
        }
        logger = utils.config_logger(properties)
        glove_df = utils.load_glove_file(properties, logger)
        # assertFalse(df.empty) is equivalent to assertTrue(not df.empty)
        self.assertFalse(glove_df.empty)
Exemplo n.º 2
0
def main():
    """
    Program entry point for the movie recommendation pipeline.

    Loads the yaml configuration into ``properties`` and configures logging.
    When the ``setup_folders`` property is true, the setup scripts are run to
    download the glove vectors and the dataset files.

    The csv files of the selected dataset are then read into DataFrames. Based
    on the ``methods`` property, the collaborative and/or the content-based
    recommendation pipelines are executed on those DataFrames. When the
    ``execution_kind`` property equals ``"test"``, the test run is executed as
    well. Finally a notification email is sent.
    """
    # Configuration comes from the yaml properties file.
    properties = utils.load_properties()
    logger = utils.config_logger(properties)
    logger.info("Configuration file is loaded")

    # Optionally download embeddings and datasets before anything else runs.
    if properties["setup_folders"]:
        logger.info(
            "Set up folders is true. Glove vectors and datasets will be downloaded"
        )
        utils.setup_folders(properties=properties, logger=logger)

    # Read every csv of the selected dataset into a DataFrame.
    logger.info("Collect the dataset filenames")
    dataset_files = utils.get_filenames(properties)
    logger.info("Creating dataframes from the csvs in the selected dataset")
    preprocessor = DataPreprocessing()
    preprocessor.read_csv(dataset_files)
    dataframes = preprocessor.datasets

    # Run whichever recommendation methods the configuration selects.
    selected_methods = properties["methods"]
    if Methods.collaborative.value in selected_methods:
        run_collaborative(properties=properties, csvs=dataframes, logger=logger)
    if Methods.content_based.value in selected_methods:
        run_content_based(properties=properties, csvs=dataframes, logger=logger)
    if properties["execution_kind"] == "test":
        run_test(properties=properties, csvs=dataframes, logger=logger)

    # Notify that the run finished.
    utils.send_email(properties=properties, logger=logger)
Exemplo n.º 3
0
    def test_preprocess_text(self):
        """
        Test the _preprocess_text function: for a given movie and user id the
        movie title, its genres and the user's tags are merged into one text,
        which is then cleaned of symbols/numbers and split into words.

        Examined test case: the returned word list matches the expected list.
        """
        logger = utils.config_logger(properties=load_test_properties())
        movies = pd.DataFrame(
            data=[[1, "Toy Story (1995)", "Adventure|Animation|Children|Comedy|Fantasy"]],
            columns=["movieId", "title", "genres"],
        )
        tags = pd.DataFrame(data=[[1, 1, "funny"]], columns=["userId", "movieId", "tag"])
        preprocessor = ContentBasedPreprocessing()
        words = preprocessor._preprocess_text(
            movies_df=movies, tags_df=tags, movie_id=1, user_id=1, logger=logger
        )
        expected_words = [
            "Toy", "Story", "Adventure", "Animation", "Children", "Comedy", "Fantasy", "funny",
        ]
        self.assertEqual(words, expected_words)
# Exposure output locations; '{}' is presumably filled with the country ISO3
# code by the caller — TODO confirm against main().
EXP_DIR = os.path.join('Outputs', '{}', 'Exposure_SADD')
EXP_FILE = '{}_Exposure.geojson'

# COVID
# Directory holding the downloaded COVID-19 data.
COVID_DIR = 'COVID'

# maybe we can move this to the yml file?
# HXL-style hashtags used to identify columns in the COVID dataset
# (NOTE(review): assumed HXL tagging convention — confirm with the data source).
HLX_TAG_TOTAL_CASES = '#affected+infected+confirmed+total'
HLX_TAG_TOTAL_DEATHS = '#affected+infected+dead+total'
HLX_TAG_DATE = '#date'
HLX_TAG_ADM1_NAME = '#adm1+name'
HLX_TAG_ADM2_NAME = '#adm2+name'
HLX_TAG_ADM1_PCODE = '#adm1+pcode'
HLX_TAG_ADM2_PCODE = '#adm2+pcode'

# Configure logging once at import time; module-level logger per convention.
utils.config_logger()
logger = logging.getLogger(__name__)

def parse_args():
    """Parse the command-line arguments of the script.

    Returns:
        argparse.Namespace: holds ``country_iso3`` (required positional) and
        ``download_covid`` (bool flag, set via ``-d``/``--download-covid``).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('country_iso3', help='Country ISO3')
    arg_parser.add_argument(
        '-d',
        '--download-covid',
        action='store_true',
        help='Download the COVID-19 data',
    )
    return arg_parser.parse_args()


def main(country_iso3, download_covid=False):
    # Get config file
Exemplo n.º 5
0
 def test_send_email(self):
     """Check that sending the notification email reports success."""
     props = load_test_properties()
     logger = utils.config_logger(properties=props)
     outcome = utils.send_email(properties=props, logger=logger)
     self.assertEqual(outcome, ResultStatus.success.value)