def init(data_src_dir, dataset_src_dir, dst_dir, model_dst_dir,
         training_plot_dst_dir, test_plot_dst_dir, img_format, input_suffix,
         output_suffix, gt_suffix, transformation, downsample_rate,
         samples_per_second, multiplier, load):
    if not load:
        shutil.rmtree(dst_dir, ignore_errors=True)
        # A short delay avoids races between deleting and recreating the folder
        time.sleep(1)

        # Create necessary folders
        os.makedirs(model_dst_dir)
        os.makedirs(training_plot_dst_dir)
        os.makedirs(test_plot_dst_dir)

        # Preprocess data if necessary
        if not os.path.isdir(dataset_src_dir):
            preprocessor = Preprocessor(data_src_dir, dataset_src_dir,
                                        img_format, input_suffix,
                                        output_suffix, gt_suffix,
                                        downsample_rate, samples_per_second)
            preprocessor.preprocess(transformation=transformation,
                                    duration_multiplier=multiplier)

    # Reset the TF1-style default graph and return a fresh session
    tf.reset_default_graph()
    return tf.Session()
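This snippet uses the TensorFlow 1.x graph/session API. Below is a minimal calling sketch, assuming a TensorFlow 2.x install with the v1 compat shim enabled; every argument value is made up for illustration:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restore the graph-mode semantics init() expects

# All paths and parameters below are placeholders, not values from the original
sess = init("raw/", "dataset/", "out/", "out/models/",
            "out/plots/train/", "out/plots/test/", "png",
            "_in", "_out", "_gt", "stft", 4, 22050, 1, load=False)
with sess:
    pass  # build and run the TF1 graph here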
Example #2
def main(data_src_path, dataset_dst_path, img_format, input_suffix,
         output_suffix, gt_suffix, transformation, downsample_rate,
         samples_per_second, duration_multiplier):
    preprocessor = Preprocessor(data_src_path, dataset_dst_path, img_format,
                                input_suffix, output_suffix, gt_suffix,
                                downsample_rate, samples_per_second)
    preprocessor.preprocess(
        gen_input=True,
        gen_output=True,
        transformation=transformation,
        duration_multiplier=duration_multiplier,
    )
Example #3
def main():
    csv_handler = CSVHandler(data_dir)
    preprocessor = Preprocessor()
    # visualizer = Visualizer()
    logger = Logger()

    # print "load train data and test data"
    try:
        train = csv_handler.load_csv(train_filename)
        test = csv_handler.load_csv(test_filename)
    except Exception as e:
        logger.show_exception(e)

    # print "preprocess the both data"
    t_train = train["SalePrice"].values
    train, test = preprocessor.preprocess(train, test, except_num=True)

    # print "extract target column and feature column for both data"
    x_train = train.values
    x_test = test.values

    # print "save test ids"
    test_ids = test.index

    # print "design training"
    tuned_parameters = [{'C': [1000, 10000, 100000], 'epsilon': [1000, 100, 10]}]
    reg = GridSearchCV(
        SVR(),
        tuned_parameters,
        cv=5
    )

    # print "train"
    reg.fit(x_train, t_train)
    logger.show_training_result(reg)

    # print "prediction"
    y_train = reg.predict(x_train).astype(int)
    y_test = reg.predict(x_test).astype(int)

    # print "save"
    output = zip(test_ids, y_test)
    csv_handler.save_csv(output, 'support_vector_regression')

    # print "show difference between true distribution and prediction"
    # visualizer.show_result(t_train, y_train)

    # print "everything works well"
    return 0
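SVR is sensitive to feature scale, so a common refinement of the grid search above pipes a StandardScaler in front of it. A sketch, not part of the original (note the svr__ prefix that Pipeline adds to parameter names):

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Scale features before fitting; grid keys gain the step-name prefix
pipeline = Pipeline([("scaler", StandardScaler()), ("svr", SVR())])
param_grid = {"svr__C": [1000, 10000, 100000], "svr__epsilon": [1000, 100, 10]}
reg = GridSearchCV(pipeline, param_grid, cv=5)
# reg.fit(x_train, t_train) and the rest proceed exactly as above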
Example #4
def main(args):
    # Embed the preprocessed review with the pretrained word vectors
    word2vec = Word2Vec("data/GoogleNews-vectors-negative300.bin",
                        args.word2vec_limit, MAX_WORDS, WORD_VEC_DIMENSIONS)
    data = {}
    data["vectors"] = np.array(
        word2vec.convert([Preprocessor().preprocess_line(args.review, True)]))

    model = load_model("model.h5")
    result = model.predict(data["vectors"])
    print(f"Review is positive: {result[0][0] * 100:.2f}%")
    print(f"Review is negative: {100 - result[0][0] * 100:.2f}%")
Example #5
def main():
    modelType = sys.argv[1]  # model type can be 'sk' or 'xgb'

    preprocess = Preprocessor()
    train = preprocess.load_data(TRAIN_FILE)
    # Keep only the numeric feature columns and impute missing values
    trainX = np.array(
        preprocess.fill_missing_values(train.drop(
            ['Id', 'SalePrice'], axis=1)).select_dtypes(exclude=['object']))
    trainY = np.array(preprocess.fill_missing_values(train[['SalePrice']]))

    # modelScore = get_k_fold_cross_validation(trainX, trainY, modelType)
    # modelScores = grid_search_model(trainX, trainY, modelType)

    test = preprocess.load_data(TEST_FILE)
    testX = np.array(
        preprocess.fill_missing_values(test.drop(
            ['Id'], axis=1)).select_dtypes(exclude=['object']))
    outputDf = model(trainX, trainY, test, testX, modelType)
    outputDf.to_csv('predictions.csv', index=False)
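Since modelType comes straight from sys.argv, a small guard makes the failure mode clearer than a bare IndexError. A hypothetical entry point, not part of the original:

if __name__ == "__main__":
    # Validate the CLI argument before running the pipeline
    if len(sys.argv) < 2 or sys.argv[1] not in ("sk", "xgb"):
        sys.exit("usage: python main.py [sk|xgb]")
    main()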
Example #6
class ObjectiveClassifier:
    def __init__(self, model_path, senti_path, stop_words, ngrams_path):
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.features_calculator = FeaturesCalculator(ngrams_path)
        self.classifier = Classifier(jobject=serialization.read(model_path))
        self.normalizer = Preprocessor(senti_path)
        self.stop_words = stop_words

    def classify_tweet(self, tweet, polarity='"positive"'):
        tweet_normalized = self.normalizer.preprocess(tweet, self.stop_words,
                                                      "")
        self.features_calculator.calculateFeatures(
            tweet_normalized, "output/tweet_features_objective.arff", polarity)
        tweet_features = self.loader.load_file(
            "output/tweet_features_objective.arff")
        tweet_features.class_is_last()
        for index, inst in enumerate(tweet_features):
            pred = self.classifier.classify_instance(inst)
            dist = self.classifier.distribution_for_instance(inst)
            label = inst.class_attribute.value(int(pred))
            print("%d - %s - %s" % (index + 1, label, dist.tolist()))
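python-weka-wrapper code like this only works while a JVM is running, so callers typically bracket the classifier with jvm.start()/jvm.stop(). A usage sketch with placeholder paths and stop words:

import weka.core.jvm as jvm

jvm.start()
try:
    # All file paths and the stop-word list below are illustrative
    clf = ObjectiveClassifier("models/objective.model", "lexicon/senti.txt",
                              ["a", "an", "the"], "ngrams/ngrams.txt")
    clf.classify_tweet("I really enjoyed this movie!")
finally:
    jvm.stop()  # always shut the JVM down, even on errors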
Example #7
def load_data(word2vec_limit):
    data = {"text": [], "is_positive": []}

    preprocessor = Preprocessor()

    for line in preprocessor.preprocess_file("data/neg.txt"):
        data["text"].append(line)
        data["is_positive"].append(0)

    for line in preprocessor.preprocess_file("data/pos.txt"):
        data["text"].append(line)
        data["is_positive"].append(1)

    preprocessor.visualize()

    data["text"], data["is_positive"] = shuffle(np.array(data["text"]), np.array(data["is_positive"]))
    data["vectors"] = Word2Vec("data/GoogleNews-vectors-negative300.bin", word2vec_limit, MAX_WORDS, WORD_VEC_DIMENSIONS).convert(
        data["text"]
    )
    data["vectors"] = np.array(data["vectors"])

    return data
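The returned dictionary holds aligned vectors and labels, so it can be split directly for training. A sketch with assumed values; the word2vec_limit here is arbitrary:

from sklearn.model_selection import train_test_split

data = load_data(word2vec_limit=500000)
# Hold out 20% of the already-shuffled samples for evaluation
x_train, x_test, y_train, y_test = train_test_split(
    data["vectors"], data["is_positive"], test_size=0.2)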