Example #1
 def __iter__(self):
     return stream.iter_csv(
         self.path,
         target='passengers',
         converters={'passengers': int},
         parse_dates={'month': '%Y-%m'}
     )
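
Every snippet on this page wraps stream.iter_csv from creme (later river), which yields one (x, y) pair per CSV row: x is a dict of feature values and y is the target, with converters and parse_dates applied on the fly. A minimal consumption sketch mirroring the call above; the file path is an assumption:

from river import stream  # 'from creme import stream' in older versions

# Hypothetical CSV with 'month' and 'passengers' columns, as in Example #1.
for x, y in stream.iter_csv('airline-passengers.csv',
                            target='passengers',
                            converters={'passengers': int},
                            parse_dates={'month': '%Y-%m'}):
    print(x['month'], y)  # x['month'] is a datetime, y is an int
    break
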
Example #2
 def _stream_X_y(self, directory):
     return stream.iter_csv(
         f'{directory}/trec07p.csv',
         target='y',
         delimiter=',',
         quotechar='"',
         field_size_limit=1_000_000,
     )
Example #3
 def _iter(self):
     return stream.iter_csv(
         self.path,
         target='y',
         delimiter=',',
         quotechar='"',
         field_size_limit=1_000_000,
     )
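
The field_size_limit keyword is handed to the standard library's csv.field_size_limit, lifting the default per-field cap (128 KiB) so rows with very long text fields, such as raw e-mail bodies in trec07p, parse without error. The stdlib call it maps to, for reference:

import csv

# Raise the csv module's per-field size cap before parsing long fields.
csv.field_size_limit(1_000_000)
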
Example #4
 def _stream_X_y(self, directory):
     return stream.iter_csv(f'{directory}/smtp.csv',
                            target='service',
                            converters={
                                'duration': float,
                                'src_bytes': float,
                                'dst_bytes': float,
                                'service': int
                            })
Example #5
 def __iter__(self):
     return stream.iter_csv(self.path,
                            target='weight',
                            converters={
                                'time': int,
                                'weight': int,
                                'chick': int,
                                'diet': int
                            })
Example #6
 def _iter(self):
     return stream.iter_csv(self.path,
                            target='service',
                            converters={
                                'duration': float,
                                'src_bytes': float,
                                'dst_bytes': float,
                                'service': int
                            })
Example #7
 def _iter(self):
     return stream.iter_csv(self.path,
                            target='rating',
                            converters={
                                'timestamp': int,
                                'release_date': int,
                                'age': float,
                                'rating': float
                            },
                            delimiter='\t')
Example #8
 def _iter(self):
     return stream.iter_csv(self.path,
                            target='visitors',
                            converters={
                                'latitude': float,
                                'longitude': float,
                                'visitors': int,
                                'is_holiday': ast.literal_eval
                            },
                            parse_dates={'date': '%Y-%m-%d'})
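
ast.literal_eval is the right converter for the boolean column because bool would not work: every non-empty string is truthy, so bool('False') is True. literal_eval parses the literal itself:

import ast

assert bool('False') is True             # why plain bool is a trap here
assert ast.literal_eval('False') is False
assert ast.literal_eval('True') is True
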
Example #9
    def _iter(self):

        converters = {f'V{i}': float for i in range(1, 29)}
        converters['Class'] = int
        converters['Time'] = float
        converters['Amount'] = float

        return stream.iter_csv(self.path,
                               target='Class',
                               converters=converters)
Example #10
 def __iter__(self):
     return stream.iter_csv(self.path,
                            target='five_thirty_eight',
                            converters={
                                'ordinal_date': int,
                                'gallup': float,
                                'ipsos': float,
                                'morning_consult': float,
                                'rasmussen': float,
                                'you_gov': float,
                                'five_thirty_eight': float
                            })
Example #11
 def _iter(self):
     return stream.iter_csv(self.path,
                            target='bikes',
                            converters={
                                'clouds': int,
                                'humidity': int,
                                'pressure': float,
                                'temperature': float,
                                'wind': float,
                                'bikes': int
                            },
                            parse_dates={'moment': '%Y-%m-%d %H:%M:%S'})
Example #12
 def _stream_X_y(self, directory):
     return stream.iter_csv(
         os.path.join(directory, 'trump_approval.csv.gz'),
         target='five_thirty_eight',
         converters={
             'ordinal_date': int,
             'gallup': float,
             'ipsos': float,
             'morning_consult': float,
             'rasmussen': float,
             'you_gov': float,
             'five_thirty_eight': float
         }
     )
Example #13
 def _iter(self):
     return stream.iter_csv(self.path,
                            target='class',
                            converters={
                                'date': float,
                                'day': int,
                                'period': float,
                                'nswprice': float,
                                'nswdemand': float,
                                'vicprice': float,
                                'vicdemand': float,
                                'transfer': float,
                                'class': lambda x: x == 'UP'
                            })
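
Converters apply to the target column like any other field, so the lambda above turns the raw 'UP'/'DOWN' strings of the electricity dataset into booleans before y is yielded; a binary classifier then sees True/False directly. The mapping in isolation (raw values assumed from the dataset):

to_bool = lambda x: x == 'UP'
assert to_bool('UP') is True and to_bool('DOWN') is False
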
Example #14
 def _stream_X_y(self, directory):
     return stream.iter_csv(
         f'{directory}/toulouse_bikes.csv',
         target='bikes',
         converters={
             'clouds': int,
             'humidity': int,
             'pressure': float,
             'temperature': float,
             'wind': float,
             'bikes': int
         },
         parse_dates={'moment': '%Y-%m-%d %H:%M:%S'}
     )
Example #15
 def _iter(self):
     return stream.iter_csv(
         self.path,
         target='trip_duration',
         converters={
             'passenger_count': int,
             'pickup_longitude': float,
             'pickup_latitude': float,
             'dropoff_longitude': float,
             'dropoff_latitude': float,
             'trip_duration': int
         },
         parse_dates={'pickup_datetime': '%Y-%m-%d %H:%M:%S'},
         drop=['dropoff_datetime', 'id'])
Example #16
 def _stream_X_y(self, directory):
     return stream.iter_csv(
         f'{directory}/train.csv',
         target='trip_duration',
         converters={
             'passenger_count': int,
             'pickup_longitude': float,
             'pickup_latitude': float,
             'dropoff_longitude': float,
             'dropoff_latitude': float,
             'trip_duration': int
         },
         parse_dates={'pickup_datetime': '%Y-%m-%d %H:%M:%S'},
         drop=['dropoff_datetime', 'id']
     )
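
The drop argument removes the listed columns before each row is yielded, which is how these taxi examples keep the leakage-prone dropoff_datetime (it would give away trip_duration) and the uninformative id away from the model. A quick check, sketched with a hypothetical path:

from river import stream  # or creme, depending on version

for x, y in stream.iter_csv('train.csv',
                            target='trip_duration',
                            drop=['dropoff_datetime', 'id']):
    assert 'dropoff_datetime' not in x and 'id' not in x
    break
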
Example #17
 def __iter__(self):
     return stream.iter_csv(
         self.path,
         target='is_phishing',
         converters={
             'empty_server_form_handler': float,
             'popup_window': float,
             'https': float,
             'request_from_other_domain': float,
             'anchor_from_other_domain': float,
             'is_popular': float,
             'long_url': float,
             'age_of_domain': int,
             'ip_in_url': int,
             'is_phishing': lambda x: x == '1'
         }
     )
Example #18
    def _iter(self):

        features = [
            'lepton pT', 'lepton eta', 'lepton phi',
            'missing energy magnitude', 'missing energy phi', 'jet 1 pt',
            'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag', 'jet 2 pt', 'jet 2 eta',
            'jet 2 phi', 'jet 2 b-tag', 'jet 3 pt', 'jet 3 eta', 'jet 3 phi',
            'jet 3 b-tag', 'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag',
            'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb'
        ]

        return stream.iter_csv(self.path,
                               fieldnames=['is_signal', *features],
                               target='is_signal',
                               converters={
                                   'is_signal': lambda x: x.startswith('1'),
                                   **{f: float
                                      for f in features}
                               })
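
fieldnames supplies column names when the CSV has no header row, as with the raw HIGGS file; the target column is listed first to match the file layout. A self-contained sketch with an in-memory, headerless buffer (iter_csv also accepts file-like objects; the two columns here are illustrative):

import io
from river import stream  # or creme, depending on version

buf = io.StringIO('1,0.5\n0,0.7\n')
rows = stream.iter_csv(buf,
                       fieldnames=['is_signal', 'lepton pT'],
                       target='is_signal',
                       converters={'is_signal': lambda v: v == '1',
                                   'lepton pT': float})
for x, y in rows:
    print(y, x)  # True {'lepton pT': 0.5}, then False {'lepton pT': 0.7}
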
Example #19
 def __iter__(self):
     return stream.iter_csv(
         self.path,
         target=['c-class-flares', 'm-class-flares', 'x-class-flares'],
         converters={
             'zurich-class': str,
             'largest-spot-size': str,
             'spot-distribution': str,
             'activity': int,
             'evolution': int,
             'previous-24h-flare-activity': int,
             'hist-complex': int,
             'hist-complex-this-pass': int,
             'area': int,
             'largest-spot-area': int,
             'c-class-flares': int,
             'm-class-flares': int,
             'x-class-flares': int
         })
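
When target is a list, each yielded y is a dict keyed by the target names rather than a single value, which is how this solar-flare example exposes all three counts per row for multi-output learning:

# Shape of one y under a list-valued target (values hypothetical):
y = {'c-class-flares': 0, 'm-class-flares': 0, 'x-class-flares': 1}
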
Example #20
 def __iter__(self):
     return stream.iter_csv(self.path,
                            target='category',
                            converters={
                                'region-centroid-col': int,
                                'region-centroid-row': int,
                                'short-line-density-5': float,
                                'short-line-density-2': float,
                                'vedge-mean': float,
                                'vegde-sd': float,
                                'hedge-mean': float,
                                'hedge-sd': float,
                                'intensity-mean': float,
                                'rawred-mean': float,
                                'rawblue-mean': float,
                                'rawgreen-mean': float,
                                'exred-mean': float,
                                'exblue-mean': float,
                                'exgreen-mean': float,
                                'value-mean': float,
                                'saturation-mean': float,
                                'hue-mean': float
                            })
Example #21
BATCH_SIZE = 256
write_dataset('train.csv', db['features'][:SPLIT_INDEX],
              db['labels'][:SPLIT_INDEX], BATCH_SIZE)
write_dataset('test.csv', db['features'][SPLIT_INDEX:],
              db['labels'][SPLIT_INDEX:], BATCH_SIZE)

FEATURE_SIZE = db['features'].shape[1]
types = {f'feature_{i}': float for i in range(FEATURE_SIZE)}
types['class'] = int

model = StandardScaler()
model |= OneVsRestClassifier(LogisticRegression())

metric = Accuracy()
dataset = stream.iter_csv('train.csv', target_name='class', converters=types)
print('Training started...')
for i, (X, y) in enumerate(dataset):
    predictions = model.predict_one(X)
    model = model.fit_one(X, y)
    metric = metric.update(y, predictions)

    if i % 100 == 0:
        print(f'Update {i} - {metric}')

print(f'Final - {metric}')

metric = Accuracy()
test_dataset = stream.iter_csv('test.csv',
                               target_name='class',
                               converters=types)
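
The evaluation pass over test_dataset is cut off above; a hedged completion that mirrors the training loop, predicting without any further fitting:

for X, y in test_dataset:
    predictions = model.predict_one(X)
    metric = metric.update(y, predictions)

print(f'Test - {metric}')
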
Example #22
 def _stream_X_y(self, directory):
     return stream.iter_csv(os.path.join(directory,
                                         'airline-passengers.csv'),
                            target='passengers',
                            converters={'passengers': int},
                            parse_dates={'month': '%Y-%m'})
Example #23
                             required=True,
                             help='Path to test CSV file.')
argument_parser.add_argument(
    '-n',
    '--num-cols',
    type=int,
    required=True,
    help='Number of columns in the feature CSV file (excluding label).')
arguments = vars(argument_parser.parse_args())

print('[INFO] Building column names...')
types = {f'feature_{i}': float
         for i in range(arguments['num_cols'])}  # Data type per feature
types['class'] = int

dataset = stream.iter_csv(arguments['train'], target_name='class', types=types)

model = Pipeline([('scaler', StandardScaler()),
                  ('learner',
                   OneVsRestClassifier(binary_classifier=PAClassifier()))])

metric = Accuracy()

print('[INFO] Training started...')
for index, (X, y) in enumerate(dataset):
    try:
        predictions = model.predict_one(X)
        model = model.fit_one(X, y)
        metric = metric.update(y, predictions)

        if index % 10 == 0:
Example #24
ap.add_argument("-c", "--csv", required=True,
	help="path to features CSV file")
ap.add_argument("-n", "--cols", type=int, required=True,
	help="# of feature columns in the CSV file (excluding class column")
args = vars(ap.parse_args())



# construct our data dictionary which maps the data types of the
# columns in the CSV file to built-in data types
print("[INFO] building column names...")
types = {"feat_{}".format(i): float for i in range(0, args["cols"])}
types["class"] = int

# create a CSV data generator for the extracted Keras features
dataset = stream.iter_csv(args["csv"], target_name="class", types=types)

# construct our pipeline
model = Pipeline([
    ("scale", StandardScaler()),
    ("learn", OneVsRestClassifier(binary_classifier=LogisticRegression()))])

# initialize our metric
print("[INFO] starting training...")
metric = Accuracy()

# loop over the dataset
for (i, (X, y)) in enumerate(dataset):
    # make predictions on the current set of features, train the
    # model on the features, and then update our metric
    preds = model.predict_one(X)
Example #25
PATH_TO_CSV = 'bbc-text.csv'

start_time = time.time()
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logging.warning('\tLoading word embeddings and data streamer...')
nlp = spacy.load('en_core_web_md')
encodings = {
    'tech': 0,
    'business': 1,
    'sport': 2,
    'entertainment': 3,
    'politics': 4
}
types = {"category": str}
dataset = stream.iter_csv(PATH_TO_CSV, target_name="category", types=types)
stop_time = time.time()
elapsed_time = stop_time - start_time
logging.info('\tFinished in {0} seconds.'.format(elapsed_time))

classifier = MLPClassifier(activation='tanh',
                           learning_rate='constant',
                           alpha=1e-4,
                           hidden_layer_sizes=(15, ),
                           random_state=1,
                           batch_size=16,
                           verbose=False,
                           max_iter=20,
                           warm_start=True)

predictions = []
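
The loop driving this setup is cut off after predictions = []; a hedged sketch of the usual pattern, feeding spaCy document vectors to the scikit-learn MLP out of core. The 'text' column name and the per-row batching are assumptions, not part of the original snippet:

classes = list(encodings.values())
for x, y in dataset:
    vec = nlp(x['text']).vector.reshape(1, -1)  # assumes a 'text' column
    if hasattr(classifier, 'classes_'):         # predict once warmed up
        predictions.append(classifier.predict(vec)[0])
    classifier.partial_fit(vec, [encodings[y]], classes=classes)
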
    "-n",
    "--num_cols",
    type=int,
    required=True,
    help="# of feature columns in the CSV file (excluding class column")
args = vars(ap.parse_args())

# construct our data dictionary which maps the data types of the
# columns in the CSV file to built-in data types
print("[INFO] building column names...")
types = {f'feat_{i}': float for i in range(args['num_cols'])}
types["class"] = int

# create a CSV data generator for the extracted Keras features
dataset = stream.iter_csv(filepath_or_buffer=args["csv"],
                          target_name="class",
                          converters=types)

# construct our pipeline
model = Pipeline(StandardScaler(),
                 OneVsRestClassifier(binary_classifier=PAClassifier()))

# initialize our metric
print("[INFO] starting training...")
metric = ClassificationReport()

# loop over the dataset
for i, (X, y) in enumerate(dataset):
    # make predictions on the current set of features, train the
    # model on the features, and then update our metric
    preds = model.predict_one(X)
Example #27
ap.add_argument(
    "-n",
    "--cols",
    type=int,
    required=True,
    help="# of feature columns in the CSV file (excluding class column")
args = vars(ap.parse_args())

# construct our data dictionary which maps the data types of the
# columns in the CSV file to built-in data types
print("[INFO] building column names...")
types = {"feat_{}".format(i): float for i in range(0, args["cols"])}
types["class"] = int

# create a CSV data generator for the extracted Keras features
dataset = stream.iter_csv(args["csv"], target="class", converters=types)
# construct our pipeline (maybe set to .0000003)
model = Pipeline(StandardScaler(),
                 LogisticRegression(optimizer=optim.SGD(.0000001)))

# initialize our metric
print("[INFO] starting training...")
metric = Accuracy()

# loop over the dataset
for (i, (X, y)) in enumerate(dataset):
    # make predictions on the current set of features, train the
    # model on the features, and then update our metric
    preds = model.predict_one(X)
    model = model.fit_one(X, y)
    metric = metric.update(y, preds)
Example #28
 def _stream_X_y(self, directory):
     return stream.iter_csv(directory,
                            target=[
                                'amazed-suprised', 'happy-pleased',
                                'relaxing-clam', 'quiet-still',
                                'sad-lonely', 'angry-aggresive'
                            ],
                            converters={
                                'amazed-suprised': lambda x: x == '1',
                                'happy-pleased': lambda x: x == '1',
                                'relaxing-clam': lambda x: x == '1',
                                'quiet-still': lambda x: x == '1',
                                'sad-lonely': lambda x: x == '1',
                                'angry-aggresive': lambda x: x == '1',
                                'Mean_Acc1298_Mean_Mem40_Centroid': float,
                                'Mean_Acc1298_Mean_Mem40_Rolloff': float,
                                'Mean_Acc1298_Mean_Mem40_Flux': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_0': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_1': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_2': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_3': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_4': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_5': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_6': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_7': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_8': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_9': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_10': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_11': float,
                                'Mean_Acc1298_Mean_Mem40_MFCC_12': float,
                                'Mean_Acc1298_Std_Mem40_Centroid': float,
                                'Mean_Acc1298_Std_Mem40_Rolloff': float,
                                'Mean_Acc1298_Std_Mem40_Flux': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_0': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_1': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_2': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_3': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_4': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_5': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_6': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_7': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_8': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_9': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_10': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_11': float,
                                'Mean_Acc1298_Std_Mem40_MFCC_12': float,
                                'Std_Acc1298_Mean_Mem40_Centroid': float,
                                'Std_Acc1298_Mean_Mem40_Rolloff': float,
                                'Std_Acc1298_Mean_Mem40_Flux': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_0': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_1': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_2': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_3': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_4': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_5': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_6': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_7': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_8': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_9': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_10': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_11': float,
                                'Std_Acc1298_Mean_Mem40_MFCC_12': float,
                                'Std_Acc1298_Std_Mem40_Centroid': float,
                                'Std_Acc1298_Std_Mem40_Rolloff': float,
                                'Std_Acc1298_Std_Mem40_Flux': float,
                                'Std_Acc1298_Std_Mem40_MFCC_0': float,
                                'Std_Acc1298_Std_Mem40_MFCC_1': float,
                                'Std_Acc1298_Std_Mem40_MFCC_2': float,
                                'Std_Acc1298_Std_Mem40_MFCC_3': float,
                                'Std_Acc1298_Std_Mem40_MFCC_4': float,
                                'Std_Acc1298_Std_Mem40_MFCC_5': float,
                                'Std_Acc1298_Std_Mem40_MFCC_6': float,
                                'Std_Acc1298_Std_Mem40_MFCC_7': float,
                                'Std_Acc1298_Std_Mem40_MFCC_8': float,
                                'Std_Acc1298_Std_Mem40_MFCC_9': float,
                                'Std_Acc1298_Std_Mem40_MFCC_10': float,
                                'Std_Acc1298_Std_Mem40_MFCC_11': float,
                                'Std_Acc1298_Std_Mem40_MFCC_12': float,
                                'BH_LowPeakAmp': float,
                                'BH_LowPeakBPM': int,
                                'BH_HighPeakAmp': float,
                                'BH_HighPeakBPM': int,
                                'BH_HighLowRatio': int,
                                'BHSUM1': float,
                                'BHSUM2': float,
                                'BHSUM3': float
                            })
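
With six list-valued targets each converted via x == '1', every y from this emotions dataset is a dict of six booleans, the usual shape for multi-label learning:

# Shape of one y (values hypothetical):
y = {'amazed-suprised': False, 'happy-pleased': True, 'relaxing-clam': False,
     'quiet-still': False, 'sad-lonely': False, 'angry-aggresive': True}
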