예제 #1
0
def main():
    """Prepare the data splits, run training and prediction, print the scores.

    For each of the ``val`` and ``train`` splits:

    1. ``preprocess.execute`` is called with ``../data/<split>.csv`` as
       ``input_file`` and ``../data/<split>_bf.csv`` as ``output_file``.
    2. ``build_features.execute`` is called with ``../data/<split>_bf.csv``
       as both ``input_file`` and ``output_file``.

    Afterwards ``train.execute()`` and ``predict.execute()`` are called and
    their results are placed next to four fixed model labels in a DataFrame.

    Returns:
        None. The comparison table is only printed, not returned.
    """
    for split in ["val", "train"]:
        raw_path = "../data/{input}.csv".format(input=split)
        prepared_path = '../data/{input}_bf.csv'.format(input=split)

        # Preprocessing step: raw split in, "_bf" file out.
        preprocess.execute(input_file=raw_path, output_file=prepared_path)

        # Feature-building step: same path is used as input and output.
        build_features.execute(input_file=prepared_path,
                               output_file=prepared_path)
        print("{input}_bf.csv file has been created".format(input=split))

    train_acc = train.execute()
    pred_acc = predict.execute()

    # NOTE(review): assumes both results hold one score per model label, in
    # the same order as the 'Model' list -- confirm against train/predict.
    score_columns = {
        'Model': ['RandForest', 'ExtTree', 'GraBoo', 'AdaBoo'],
        'Train Score': train_acc,
        'Prediction Score': pred_acc,
    }
    compare_models = pd.DataFrame(score_columns)

    print(compare_models)
    return None
예제 #2
0
    def test_DistancePoint100FromCentre(self):
        """Check ``DistanceCentre`` at positional row 99 of the preprocessed data."""
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")

        # Positional index 99 is the 100th row of the column.
        distance_column = frame['DistanceCentre']
        observed = distance_column.iloc[99]

        print("Distance:", observed)
        self.assertAlmostEqual(observed, 36.1062682861314)
예제 #3
0
    def test_locationDensity(self):
        """Check the LocationDensity feature against a hand-computed value.

        The expected value is the sum of six hand-written pairwise distances
        divided by the number of pairs (6).
        """
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_locationDensity.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev).
        # Only the segment data is needed here.
        dt_first_segment, _, _ = cs.getSegment(df, 7, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(
            enums.eFeature.LocationDensity)

        SumDistanceBetweenEachPairPoints = 1 + math.sqrt(1**2 + 2**2) + 2 \
                                            + 2 + math.sqrt(1**2 + 2**2) \
                                            + 1

        nCr = 6  # number of point pairs contributing to the sum above

        LocationDensity_TrueValue = SumDistanceBetweenEachPairPoints / nCr

        # The expected value involves square roots, so compare with a
        # tolerance instead of exact float equality: a different (but equally
        # valid) summation order in the implementation must not fail the test.
        self.assertAlmostEqual(feat.value, LocationDensity_TrueValue)
예제 #4
0
    def test_calcCentralDisplacement_withinCorrectRange(self):
        """Check CentralDisplacement bounds over 20 consecutive segments.

        For every segment the feature value must be in (0, 1] and must be at
        least the ellipse-centre offset from the arena centre, scaled by
        ``2 / arena.diameter``, along each of the two axes.
        """
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(frame)

        # The initial segment only provides the starting cumulative distance.
        _, cum_dist, _ = cs.getSegment(frame, 10, 0, 0)

        for _ in range(20):
            segment_data, cum_dist, _ = cs.getSegment(frame, 10, 0.3, cum_dist)
            segment_length = cs.getSegmentLength(segment_data)
            segment = sg.Segment(segment_data, segment_length, arena, 0)
            displacement = segment.getFeature(
                enums.eFeature.CentralDisplacement).value

            # Never bigger than the arena size, and strictly positive.
            self.assertLessEqual(displacement, 1)
            self.assertGreater(displacement, 0)

            # At least as large as the scaled ellipse-centre offset along
            # x (index 0) and then y (index 1).
            x_offset = (segment.ellipse.centre[0] - arena.centre_x) * 2 / \
                arena.diameter
            self.assertGreaterEqual(displacement, x_offset)

            y_offset = (segment.ellipse.centre[1] - arena.centre_y) * 2 / \
                arena.diameter
            self.assertGreaterEqual(displacement, y_offset)
예제 #5
0
def _categorize_column(data, column, upper_bounds):
    """Replace the values of ``column`` in place with ordinal bin codes.

    Bin ``k`` covers ``(upper_bounds[k-1], upper_bounds[k]]``; the first bin
    has no lower bound and values above the last bound get code
    ``len(upper_bounds)``. Masks are computed on a snapshot of the column so
    that freshly written codes can never be re-binned.
    """
    original = data[column].copy()
    lower = None
    for code, upper in enumerate(upper_bounds):
        mask = original <= upper
        if lower is not None:
            mask &= original > lower
        data.loc[mask, column] = code
        lower = upper
    data.loc[original > lower, column] = len(upper_bounds)


def execute(data_dir, data_file, categorize=False):
    """Build features from the preprocessed data and write them to CSV.

    Args:
        data_dir (str): relative path to data subdirectory
        data_file (str): name of csv data file
        categorize (bool): set to True if Age and Fare should be categorized

    Returns:
        None. The result is written to
        ``data_dir + data_file + processed_suffix + "_" + str(categorize)``
        using ``;`` as separator and without the index column.
    """
    # Read preprocessed data:
    data = preprocess.execute(data_dir + data_file)

    # Replace sex strings with binary value:
    data["Sex"] = data["Sex"].replace({"male": 0, "female": 1})

    if categorize:
        # Convert age into categories 0..4:
        data["Age"] = data["Age"].astype(int)
        _categorize_column(data, "Age", (19, 25, 32, 42))

        # Convert fare into categories 0..4:
        _categorize_column(data, "Fare", (7.854, 10.5, 22.225, 39.688))

    # Embarked: C = Cherbourg, Q = Queenstown, S = Southampton
    # Number the labels from 1 upwards in order of first appearance.
    embarked_codes = {
        label: code
        for code, label in enumerate(data["Embarked"].unique(), start=1)
    }
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: the chained in-place form does not update the
    # frame under pandas Copy-on-Write. A single dict replace also maps all
    # labels at once, so a value is never re-mapped by a later replacement.
    data["Embarked"] = data["Embarked"].replace(embarked_codes)

    # Add columns FamilySize and IsAlone:
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    data["IsAlone"] = (data["FamilySize"] == 1).astype(int)

    data.to_csv(data_dir + data_file + processed_suffix + "_" +
                str(categorize),
                sep=";",
                index=False)
예제 #6
0
    def test_getSegmentLength(self):
        """Check the length reported for the second segment of the test data."""
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")

        # The first segment is only needed for its cumulative end distance,
        # which seeds the request for the second segment.
        _, first_end_distance, _ = cs.getSegment(frame, 10, 0, 0)
        second_segment, _, _ = cs.getSegment(frame, 10, 0.3,
                                             first_end_distance)

        observed_length = cs.getSegmentLength(second_segment)
        self.assertAlmostEqual(observed_length, 10.4350527761)
예제 #7
0
    def test_FindingCorrectSecondSegment(self):
        """Check the cumulative distance at both ends of the second segment."""
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")

        # Chain two getSegment calls: the second one starts from the
        # cumulative end distance returned by the first.
        _, first_end_distance, _ = cs.getSegment(frame, 10, 0, 0)
        second_segment, _, _ = cs.getSegment(frame, 10, 0.3,
                                             first_end_distance)

        cumulative = second_segment['CumulativeDistance']
        self.assertAlmostEqual(cumulative.iloc[0], 7.0969949712)
        self.assertAlmostEqual(cumulative.iloc[-1], 17.5320477473)
예제 #8
0
    def test_iQRangeDistanceCentre(self):
        """Check the IQRange feature computed for the second segment."""
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(frame)

        # Only the end distance of the first segment is used; it seeds the
        # second getSegment call.
        _, first_end_distance, _ = cs.getSegment(frame, 10, 0, 0)
        second_data, _, _ = cs.getSegment(frame, 10, 0.3, first_end_distance)

        second_length = cs.getSegmentLength(second_data)
        second_segment = sg.Segment(second_data, second_length, arena, 0)

        iq_range = second_segment.getFeature(enums.eFeature.IQRange)
        self.assertAlmostEqual(iq_range.value, 0.0164803959758471)
예제 #9
0
    def test_checkCorrectingRotation(self):
        """Check the MeanSpeed feature of the first segment of the maxloop data.

        NOTE(review): the test name mentions rotation correction, but the
        only assertion is on MeanSpeed -- confirm this is the intended check.
        """
        frame = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_maxloop.csv")
        arena = classArena.classArena(frame)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev).
        segment_data, _, _ = cs.getSegment(frame, 20, 0, 0)
        segment_length = cs.getSegmentLength(segment_data)
        segment = sg.Segment(segment_data, segment_length, arena, 0)

        mean_speed = segment.getFeature(enums.eFeature.MeanSpeed)

        self.assertEqual(mean_speed.value, 118.75)
예제 #10
0
    def test_sumAbsAngles(self):
        """Check that SumAbsoluteAngles of the first segment equals 2*pi."""
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev).
        # Only the segment data is needed here.
        dt_first_segment, _, _ = cs.getSegment(df, 7, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(
            enums.eFeature.SumAbsoluteAngles)

        # 2*pi is irrational and the feature is a floating-point sum, so
        # compare with a tolerance rather than exact equality.
        self.assertAlmostEqual(feat.value, 2 * math.pi)
예제 #11
0
    def test_MedianDistanceCentre(self):
        """Check MedianDistanceFromCentre computed for the second segment."""
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(frame)

        # The first segment only supplies the cumulative end distance that
        # the second getSegment call starts from.
        _, first_end_distance, _ = cs.getSegment(frame, 10, 0, 0)
        second_data, _, _ = cs.getSegment(frame, 10, 0.3, first_end_distance)
        second_length = cs.getSegmentLength(second_data)

        second_segment = sg.Segment(second_data, second_length, arena, 0)
        median_distance = second_segment.getFeature(
            enums.eFeature.MedianDistanceFromCentre)

        self.assertAlmostEqual(median_distance.value, 0.8628325515)
예제 #12
0
    def test_pathEfficiency(self):
        """Check that PathEfficiency of the first segment equals 1/7."""
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev).
        # Only the segment data is needed here.
        dt_first_segment, _, _ = cs.getSegment(df, 20, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(enums.eFeature.PathEfficiency)

        PathEfficiency_TrueValue = 1.0 / 7.0

        # 1/7 is not exactly representable in binary floating point and the
        # feature is itself a computed ratio, so compare with a tolerance
        # rather than exact equality.
        self.assertAlmostEqual(feat.value, PathEfficiency_TrueValue)
def segmentIndividualFilenames(df, exp_name, arena, segment_length, overlap):
    """Segment the rows of each distinct ``filename`` value separately.

    For every unique value in ``df.filename`` (in order of first appearance)
    the matching rows are re-indexed from zero, passed through
    ``preprocess.execute`` and then through ``sendSectionDfSegment``; the
    segments of all files are collected in one flat list.

    Args:
        df: DataFrame with a ``filename`` column.
        exp_name: forwarded to ``preprocess.execute`` as ``experiment_name``.
        arena: forwarded to both ``preprocess.execute`` and
            ``sendSectionDfSegment``.
        segment_length: forwarded to ``sendSectionDfSegment``.
        overlap: forwarded to ``sendSectionDfSegment``.

    Returns:
        list: the segments of all files, in file order.
    """
    list_segments = []
    for file_name in df["filename"].unique():
        # Lazy %-formatting: also works when the filename is not a string.
        logging.info(
            ".....................Filename: %s.....................",
            file_name)

        # Build a fresh, re-indexed frame rather than resetting the index of
        # a filtered slice in place (which can trigger SettingWithCopy
        # warnings and is not reliable under Copy-on-Write).
        df_file = df[df["filename"] == file_name].reset_index(drop=True)
        df_file = preprocess.execute(df_file,
                                     experiment_name=exp_name,
                                     arena=arena)

        # extend() appends in place; ``list = list + temp`` re-copied the
        # whole accumulator on every iteration.
        list_segments.extend(
            sendSectionDfSegment(df=df_file,
                                 using_light=True,
                                 arena=arena,
                                 segment_length=segment_length,
                                 overlap=overlap))
    return list_segments
예제 #14
0
    def test_areaFormula(self):
        """Check the enclosing-ellipse area for four points on the unit circle.

        The segment object is built from the test data only to get access to
        its ellipse helpers; the points used for the assertion are fixed.
        """
        frame = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(frame)

        _, first_end_distance, _ = cs.getSegment(frame, 10, 0, 0)
        second_data, _, _ = cs.getSegment(frame, 10, 0.3, first_end_distance)
        second_length = cs.getSegmentLength(second_data)

        second_segment = sg.Segment(second_data, second_length, arena, 0)

        # Rows after transposing: (-1, 0), (0, 1), (0, -1), (1, 0).
        points = np.array([[-1, 0, 0, 1], [0, 1, -1, 0]]).T

        enclosing = second_segment.findMinEnclosingEllipse(points)
        area = second_segment.calcMinEnclosingEllipseArea(enclosing.radii)

        # Expected area is that of a unit circle.
        self.assertAlmostEqual(area, math.pi)
예제 #15
0
# To run this code you need the following folder structure:
#     ./
#      |
#      |---- [to_process]   --> this must contain your json files
#      |---- [CSV]          --> after pre-processing files
#      |---- [Bulk]         --> one grouped file containing all csv
#      |---- [Final_files]  --> summary result files (Tweets & Users)
#

if __name__ == '__main__':

    # Pre processing
    # Will read all json files from input directory and will produce new files
    # after data cleaning.
    # Will also produce a Bulkfile containing all data.
    pre.execute()

    # Processing
    # Will read bulk file and calculate the number of tweets and users
    # found in each day.
    # The result is two csv files. Tweets and Users csv
    pro.execute()

    # Final merge of Tweets and Users Datasets
    # todo: Implement a for loop to cover all tweets and users csv files

    rootdir = './Final-files'
    file = 'processed-tweets.csv'
    filepath = rootdir + os.sep + file
    dfTweets = pd.read_csv(filepath, names=['Date', 'Hour', 'Tweets'])
    rootdir = './Final-files'