def plot_accuracies(file_train, features):
    """
    Plots the accuracies of a basic Multinomial Naive Bayes classifier as a function of the number
    of features in the Vectorizer
    :param file_train: The file to read the train data from
    :param features: a list consisting of different number of features
    """
    df_train = pd.read_csv(file_train)  # reads the file into df
    # creates random train-test-split
    X_train, X_test, y_train, y_test = train_test_split(df_train['sample'],
                                                        df_train['label'],
                                                        test_size=0.2)
    # apply preprocessing
    X_train = pre.preprocessing(X_train.to_numpy())
    X_test = pre.preprocessing(X_test.to_numpy())
    acc = np.zeros(len(features))
    # loop on the number of features
    for i in range(len(features)):
        acc[i] = calculate_accuracies(X_train, X_test, y_train, y_test,
                                      features[i])
    # plot the results
    df_1 = DataFrame({"features": features, "acc": acc})
    plot = (ggplot(df_1) + geom_line(aes(x="features", y="acc")) +
            labs(title="accuracies on train test based on num_feat",
                 x="features",
                 y="accuracy"))
    print(plot)
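The calculate_accuracies helper is not shown above; the following is a minimal sketch of what it could look like for this 5-argument call, assuming scikit-learn's CountVectorizer and MultinomialNB (names and parameters here are assumptions, not the original implementation):

# Hypothetical helper matching the 5-argument call above (an assumption, not the original code)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def calculate_accuracies(X_train, X_test, y_train, y_test, max_features):
    # cap the vocabulary at `max_features` terms, fit on train, evaluate on test
    vectorizer = CountVectorizer(max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    clf = MultinomialNB()
    clf.fit(X_train_vec, y_train)
    return accuracy_score(y_test, clf.predict(X_test_vec))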
Example #2
def main(yf):
    parser = ParseYaml(yf)

    to_preprocessing = parser.get_item("to_preprocessing")
    to_train = parser.get_item("to_train")
    to_demo = parser.get_item("to_demo")
    to_plots = parser.get_item("to_plots")

    preprocessing_config = parser.get_item("preprocessing")
    if to_preprocessing:
        preprocessing.preprocessing(preprocessing_config)

    train_config = parser.get_item("train")
    if to_train:
        train.train(train_config)

    demo_config = parser.get_item("demo")
    if to_demo:
        demo.demo(demo_config)

    plot_config = parser.get_item("plots")
    if to_plots:
        plots.line_plot(plot_config.get("line_plot"))
        plots.day_scatter_plot(plot_config.get("day_scatter_plot"))
        plots.multi_day_scatter_plot(plot_config.get("multi_day_scatter_plot"))
        plots.hist_plot(plot_config.get("hist_plot"))
        plots.day_map_plot(plot_config.get("day_map_plot"))
        plots.mutli_day_map_plot(plot_config.get("multi_day_map_plot"))
Example #3
def pipeline(video, length):
    preprocessing.preprocessing(video, length)
    os.system("rm -rf Anomaly-Detection/C3D_Features")
    os.system("sh mk.sh")
    os.system("sh C3D/C3D-v1.0/examples/c3d_feature_extraction/c3d_sport1m_feature_extraction_video.sh")
    os.system("rm -rf Anomaly-Detection/C3D_Features_Avg")
    os.system("sh mk1.sh")
    os.system("python convert.py")
    os.system("python run.py")
    print("Pipeline ended")
Example #4
def main(args):
    # Time setting
    total_start_time = time.time()

    if args.preprocessing:
        preprocessing(args)

    if args.training:
        training(args)

    # Time calculate
    print(f'Done! ; {round((time.time()-total_start_time)/60, 3)}min spent')
Example #5
def read_test_data():
    print('Load testing data:')
    with open('data/123456789/test-data.csv', 'r') as test_csv:
        reader = csv.reader(test_csv, delimiter=';')
        for index, row in enumerate(reader):
            if index == 0:
                X_test = np.array([preprocessing(row[0])])
                Y_test = np.array([get_index_of_category(row[1])])
            else:
                X_test = np.vstack((X_test, preprocessing(row[0])))
                Y_test = np.vstack((Y_test, get_index_of_category(row[1])))
    return X_test, Y_test
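Because np.vstack copies the whole array on every iteration, a variant that accumulates rows in Python lists and converts once at the end scales better on large files; a sketch under the assumption that the same preprocessing() and get_index_of_category() helpers are available:

import csv
import numpy as np

def read_test_data_listwise(path='data/123456789/test-data.csv'):
    xs, ys = [], []
    with open(path, 'r') as test_csv:
        for row in csv.reader(test_csv, delimiter=';'):
            xs.append(preprocessing(row[0]))          # assumed helper from above
            ys.append(get_index_of_category(row[1]))  # assumed helper from above
    # reshape labels to a column vector to mirror the vstack-based version
    return np.array(xs), np.array(ys).reshape(-1, 1)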
Example #6
def main(args):
    start_time = time.time()

    if args.preprocessing:
        preprocessing(args)

    if args.training:
        training(args)
    
    if args.testing:
        testing(args)

    end_time = round((time.time() - start_time) / 60, 4)
    print(f'Done!; {end_time}min spent')
Example #7
def predict_status(img_height,
                   img_width,
                   n_images,
                   path_test,
                   preprocess=True):
    test_data_dir = path_test

    # if preprocessing is needed, the preprocessed images are stored in the preprocessed folder
    if preprocess:
        preprocessing(img_width, img_height, test_data_dir)

    nb_test_samples = n_images

    # number of images in each batch
    batch_size = 10

    # this is the augmentation configuration we will use for testing:
    # only rescaling & default preprocessing function for InceptionV3
    test_datagen = ImageDataGenerator(rescale=1. / 255,
                                      preprocessing_function=preprocess_input)

    test_generator = test_datagen.flow_from_directory(test_data_dir,
                                                      target_size=(img_width,
                                                                   img_height),
                                                      batch_size=batch_size,
                                                      class_mode='binary',
                                                      shuffle=False)

    # load saved model for predictions
    model = load_model('Best_Model_Checkpoint.hdf5')

    # generate predictions
    pred = model.predict_generator(test_generator,
                                   steps=nb_test_samples / batch_size)

    # with 0.5 as threshold get Normal/ abnormal class
    y_pred = (pred >= 0.5).astype(int).ravel()

    # store predictions along with labels in dataframe
    predictions = pd.DataFrame()
    predictions['filename'] = test_generator.filenames
    predictions['pred'] = y_pred

    predictions.loc[predictions['pred'] == 1, 'pred'] = 'Normal'
    predictions.loc[predictions['pred'] == 0, 'pred'] = 'Abnormal'

    predictions.to_csv('prediction_file_testset.csv', index=False)
Example #8
def Myprediction(df, features, clf, name, item_category_list_unique):
    testdf = pd.read_csv('data/test/round1_ijcai_18_test_a_20180301.txt',sep=' ')
    testdf.context_timestamp += 8*60*60
    testdf = preprocessing.preprocessing(testdf)
    testdf.item_category_list.replace(item_category_list_unique, list(np.arange(len(item_category_list_unique))), inplace=True)
    listfeature = [
               'item_category_list-user_id_query_day_hour_item_brand_id', 
               'item_category_list+item_pv_level',
               'item_price_level+shop_review_num_level',
               'item_price_level+item_category_list',
               'item_price_level-item_collected_level',
               ]
    for i in listfeature:
        if '+' in i:
            x = i.split('+')
            testdf[i] = add(testdf[x[0]],testdf[x[1]])
        elif '-' in i:
            x = i.split('-')
            testdf[i] = substract(testdf[x[0]],testdf[x[1]])
        elif '*' in i:
            x = i.split('*')
            testdf[i] = times(testdf[x[0]],testdf[x[1]])
        elif '/' in i:
            x = i.split('/')
            testdf[i] = divide(testdf[x[0]], testdf[x[1]])
    prediction_format = pd.read_csv('data/output/0203.txt',sep=' ')
    train, predict = df, testdf
    clf.fit(train[features], train.is_trade, eval_set = [(train[features], train.is_trade)], eval_metric='logloss', verbose=True)
    predict['predicted_score'] = clf.predict_proba(predict[features])[:,1]
    print(predict[['instance_id', 'predicted_score']])
    prediction_file = pd.merge(prediction_format[['instance_id']], predict[['instance_id', 'predicted_score']], on = 'instance_id', how = 'left')
    prediction_file.to_csv('data/output/{}.txt'.format(name), sep=' ',index = None)
    return clf
Example #9
def test_model():
    """
    test the model on test set
    """
    df_train = pd.read_csv("train_set.csv")
    df_test = pd.read_csv("test_set.csv")
    x_train, y_train = df_train['sample'], df_train['label']
    x_test, y_test = df_test['sample'], df_test['label']
    x_train = pre.preprocessing(x_train.to_numpy())
    x_test = pre.preprocessing(x_test.to_numpy())
    model = LinearSVC(C=LAMBDA)
    max_features = int(len(x_train)/2)
    test_acc, train_acc, p = calculate_accuracies(x_train, x_test, y_train,
                                                  y_test, model, max_features)
    print("number of features: ", len(p))
    print("test Accuracy: ", test_acc, " train Accuracy: ", train_acc)
Example #10
def Myprediction(df, features, clf, name, item_category_list_unique):
    testdf = pd.read_csv('data/test/round1_ijcai_18_test_a_20180301.txt',
                         sep=' ')
    testdf.context_timestamp += 8 * 60 * 60
    testdf = preprocessing(testdf)  #convert_time(testdf)
    testdf.item_category_list.replace(
        item_category_list_unique,
        list(np.arange(len(item_category_list_unique))),
        inplace=True)
    prediction_format = pd.read_csv('data/output/0203.txt', sep=' ')
    train, predict = df, testdf
    clf.fit(train[features],
            train.is_trade,
            eval_set=[(train[features], train.is_trade)],
            eval_metric='logloss',
            verbose=True)
    predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1]
    print(predict[['instance_id', 'predicted_score']])
    prediction_file = pd.merge(prediction_format[['instance_id']],
                               predict[['instance_id', 'predicted_score']],
                               on='instance_id',
                               how='left')
    prediction_file.to_csv('data/output/{}.txt'.format(name),
                           sep=' ',
                           index=None)
    return clf
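The snippet does not show how clf is built; judging from the eval_set/eval_metric='logloss' arguments it is a gradient-boosting estimator with the scikit-learn interface. A hedged example of a compatible classifier (the library choice and parameter values are assumptions, not part of the original):

import xgboost as xgb  # assumption: any estimator accepting eval_set/eval_metric would do

clf = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6)
# hypothetical usage: clf = Myprediction(df, features, clf, 'submission', item_category_list_unique)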
Example #11
def predict():

    data = {"success": False}

    if request.method == "POST":
        user_input = request.json["text"]      
        preprocessed_input = preprocessing(user_input)
        array_dtype = get_dtype(preprocessed_input)            
        encoded_input = base64_encoding(preprocessed_input)
        # make the array C-contiguous here if needed before encoding
        # encoded_input = encoded_input.copy(order="C")
        
        k = str(uuid.uuid4())
        d = {"id": k, "shape": preprocessed_input.shape, "dtype": array_dtype, "data": encoded_input}
        rdb.rpush(DATA_QUEUE, json.dumps(d))    # dump the preprocessed input as a numpy array

        while True:
            output = rdb.get(k)

            if output is not None:
                output = output.decode("utf-8")
                data["predictions"] = json.loads(output)

                rdb.delete(k)
                break
            
            time.sleep(CLIENT_SLEEP)
        data["success"] = True
    
    return jsonify(data)    
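The Redis key written here is polled until a separate worker fills it in. A sketch of what that consumer side might look like, under the assumption of a model object with a predict() method and the same DATA_QUEUE constant (none of this is shown in the original):

import base64
import json
import time

import numpy as np

def classify_loop(rdb, model, poll_delay=0.25):
    while True:
        raw = rdb.lpop(DATA_QUEUE)          # FIFO with the rpush() above
        if raw is None:
            time.sleep(poll_delay)
            continue
        req = json.loads(raw)
        arr = np.frombuffer(base64.b64decode(req["data"]), dtype=req["dtype"])
        arr = arr.reshape(req["shape"])
        preds = model.predict(arr)          # assumed model interface
        rdb.set(req["id"], json.dumps(np.asarray(preds).tolist()))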
Example #12
def train_doc2vec(texts, filename='doc2vec'):
    '''
    Trains and saves a doc2vec model on the given data.

    Args:
        texts: dict, where key (str) is an id of a text, value (str) is the text itself
        filename: str
    '''
    preprocessed_texts = []
    for id_ in texts:
        preprocessed_texts.append(
            TaggedDocument(words=preprocessing(texts[id_], stopwords_),
                           tags=[id_]))
    print('preprocessing is done')
    model = Doc2Vec(vector_size=300,
                    min_count=3,
                    alpha=0.25,
                    min_alpha=0.025,
                    epochs=100,
                    workers=4,
                    dm=1)
    model.build_vocab(preprocessed_texts)
    print(len(model.wv.vocab))

    model.train(preprocessed_texts,
                total_examples=model.corpus_count,
                epochs=model.epochs,
                report_delay=60)
    model.save(filename + '.model')
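A short usage sketch for the saved model: reload it and infer a vector for an unseen, preprocessed query (assumes the same preprocessing() helper and stopwords_ list, and the gensim 3.x API implied by model.wv.vocab above):

from gensim.models.doc2vec import Doc2Vec

d2v = Doc2Vec.load('doc2vec.model')
query_vec = d2v.infer_vector(preprocessing('some new text', stopwords_))
print(d2v.docvecs.most_similar([query_vec], topn=5))  # ids of the closest training documents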
Example #13
def mean_GPP_nn():
    preds = mlp_predictions()
    m = np.mean(preds, axis=0)

    X, Y, Y_Preles = preprocessing.preprocessing()

    years = [
        2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
        2012
    ]
    arr = np.zeros((365, 13))
    for i in range(len(years)):
        x, y, y_nn = preprocessing.split_by_year(X, Y, m, years=[years[i]])
        arr[:, i] = y_nn[:365]

    fig, ax = plt.subplots(figsize=(8, 6), dpi=100)
    CI = np.quantile(arr, (0.05, 0.95), axis=1)
    fig.tight_layout(pad=1.5)
    ax.fill_between(np.arange(365),
                    CI[0],
                    CI[1],
                    color="lightgreen",
                    alpha=0.5)
    ax.plot(np.arange(365),
            np.mean(arr, axis=1),
            color="green",
            label="$\widehat{p2}_{m2} - \widehat{p2}_{m1}$",
            linewidth=1.0)
    ax.set_ylabel("GPP [g C m$^{-2}$ day$^{-1}$] ")
    ax.set_xlabel("Day of year")
    ax.set_ylim(-1, 20)
Example #14
def load_casual_data(data_path):
    question_signal = u'Người báo tin:'
    answer_signal = u'Chatbot:'

    QA_dict = {}

    with open(data_path, 'r', encoding='utf-8') as fp:
        flag = False
        question = None
        answer = None
        for sen in fp:
            sen = sen.strip()
            if question_signal in sen:
                flag = True
                question = sen.split(question_signal)[1]
                question = preprocessing(question, tokenize=False)
            elif answer_signal in sen and flag:
                flag = False
                answer = sen.split(answer_signal)[1]
                # answer = preprocessing(answer).lower()
                QA_dict.update({question.lower(): answer.strip()})
            else:
                continue

    return QA_dict
Example #15
def one_sample():

    import pandas as pd
    import numpy as np

    from preprocessing import preprocessing

    data = pd.read_csv("Churn_Modelling.csv")
    pre = preprocessing(data)

    scaler = pre[-1]

    geo = input("Geography: ")
    score = int(input("Credit Score:"))
    gender = input("Gender (m/f): ")
    age = int(input("Age: "))
    tenure = int(input("Tenure (year): "))
    balance = int(input("Balance: "))
    num_pro = int(input("Number of Products: "))
    credit = input("Does the customer have a credit card (y/n): ")
    active = input("Is the customer an active member (y/n): ")
    salary = int(input("Estimated Salary: "))

    geo = geo.lower()
    gender = gender.lower()
    credit = credit.lower()
    active = active.lower()

    if geo == "france":
        geo = 0
        geo1 = 0
    elif geo == "spain":
        geo = 1
        geo1 = 0
    elif geo == "germany":
        geo = 0
        geo1 = 1

    if gender == "f":
        gender = 0
    elif gender == "m":
        gender = 1

    if credit == "y":
        credit = 1
    elif credit == "n":
        credit = 0

    if active == "y":
        active = 1
    elif active == "n":
        active = 0

    arr = np.array([[
        float(geo), geo1, score, gender, age, tenure, balance, num_pro, credit,
        active, salary
    ]])
    arr = scaler.transform(arr)

    return arr
Example #16
def run(source, method, k):
	tags = [1,1,1,1,1,1]
	if method=='knn' and k is None:
		raise ValueError('Argument k is mandatory for KnnClassifier')
	else:
		if source=='chemistry':
			if tags[5]:
				glossary = get_chemistry()
			else:
				glossary = None
			filepath = 'files/chemistry.xml'
		else:
			if tags[5]:
				glossary = get_graphicdesign()
			else:
				glossary = None
			filepath = 'files/graphic-design.xml'
			
		matrix, tag_names = preprocessing(filepath, tags, glossary)

		print(tag_names)

		if method=='knn':
			k = int(math.fabs(int(k) or 5))
			r = knn(matrix, k)
		else:
			r = logistic_regression(matrix)

		print(r)
Example #17
def get_corpus(path, meta=True):
    '''
    Collects corpus and its meta from file.

    Args:
        path: str
        meta: bool, if meta is needed

    Returns:
        dict (doc_id, text)
        AND dict (doc_id, dict of meta) (if meta)
    '''
    counter = 0
    texts = {}
    metadata = {}
    for file in os.listdir(path):
        with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
            d = json.load(f)
        if d['text'] is not None:
            counter += 1
            texts[d['id']] = d['text']
            metadata[d['id']] = d
            metadata[d['id']]['text'] = preprocessing(metadata[d['id']]['text'])
    if meta:
        return texts, metadata
    return texts
Example #18
def Analysis():
    
    while True:
        folder_name = input("分析結果を格納するフォルダ名を入力してください。resultフォルダの中に新しく作成されます")
        try:
            os.mkdir("result/{}".format(folder_name))
            break
        except FileExistsError:
            print("同じ名前のフォルダがすでに存在しています。名前を変えてください")
            continue
    tweetdata_name = input("分析するtwitterデータファイルをdataフォルダに入れて、そのファイル名を入力してください(.csvまで)")
    tweet_path = "data/{}".format(tweetdata_name)
    df = preprocessing(tweet_path)
    df = sentiment_predict(df,folder_name)
    sentiment_plot(df,folder_name)
    posi_corpus,posi_dictionary,nega_corpus,nega_dictionary = get_topic_num(df)
    os.mkdir("result/{}/positive_group".format(folder_name))
    os.mkdir("result/{}/negative_group".format(folder_name))
    plot_topics(posi_corpus,posi_dictionary,nega_corpus,nega_dictionary,folder_name)
    while True:
        finish_or_continue = int(input("グループ数を変更して分析し直す場合は[1]を、終了する場合は[2]を押してください"))
        if finish_or_continue == 1:
            plot_topics(posi_corpus,posi_dictionary,nega_corpus,nega_dictionary,folder_name)
        elif finish_or_continue == 2:
            break
        else:
            print("半角で1か2を押してください")
            continue
Example #19
def processing():
    # Reading the config file
    specifications = readConfig()
    number = specifications[0]
    filenames = specifications[1]
    species = specifications[2]
    horizontalGridLat = specifications[3]
    horizontalGridLon = specifications[4]
    verticalGrid = specifications[5]
    output = specifications[6]
    filename = specifications[7]
    airden = specifications[8]
    airdenfiles = specifications[9]
    apriori = specifications[10]
    aks = specifications[11]
    modeloutput = []
    modeloutputFilenames = []
    airdenFilenames = []
    cache = []

    if(airden == "False" or airden == "false" or airden == "0" or airden == "no" or airden == "No"):
        airden = False
    else:
        airden = True

    aksNew = []
    apriori = pr.preprocessing(apriori)
    for elem in aks:
        aksNew.append(pr.preprocessing(elem))
    for i in range(number):
        if(airden):
            print("Processing Air density..")
            airdenFilenames.append(density.airden(airdenfiles[i], verticalGrid, output))
            print("Processing Air density: Done!")
        print("Processing ASCII-Files: {}/{}".format(i+1,number))
        nameOfFile = a2d.ipAscii(filenames[i], species, horizontalGridLat, horizontalGridLon, verticalGrid, output)
        print("Interpolating the model..")
        cache = pm.interpolateModel(filename, species, nameOfFile, verticalGrid, output)
        modeloutputFilenames.append(cache)
        print("Interpolation: Done!")
    print("Processing ASCII-Files: Done!")
    print("Computing total column...")
    for i in range(number):
        tc.createTC(filenames[i], apriori, species, aksNew[i], output, modeloutputFilenames[i], airdenFilenames)
    print("Computing total column: Done!")
    return
Example #20
def test_preprocessing():
    source = os.getcwd()
    big_folder = os.path.dirname(source) + '/Raman_Data'
    with os.scandir(big_folder) as entries:
        for entry in entries:
            foldername = entry.name
            print(foldername)
            subig_folder = os.path.dirname(
                source) + '/Raman_Data/' + foldername
            with os.scandir(subig_folder) as entries:
                pnumber = 0
                for entry in entries:
                    filename = entry.name
                    if filename.endswith(".txt"):
                        pnumber = pnumber + 1
    assert pnumber != 0, 'There is no txt file in ' + subig_folder
    pp.preprocessing(big_folder)
Example #21
 def run(self):
     preprocessing(self.axial_images, self.coronal_images, self.sagittal_images)
     try:
         self.examine_result_abnormal = Model(key = 'abnormal').get_prediction()
         self.examine_result_acl = Model(key = 'acl').get_prediction()
         self.examine_result_men = Model(key = 'men').get_prediction()
     except Exception:
         print("can't find models")
     
     self.get_data(self.examine_result_abnormal,
                   self.examine_result_acl,
                   self.examine_result_men)
     
     self.save_to_database()
     self.db.close()
     self.signals.finished.emit()
     return
Example #22
 def setUpClass(cls):
     cls.pp = preprocessing('insight_testsuite/test_1/input/complaints.csv')
     cls.processed_data = cls.pp.input_data
     with open(cls.pp.ipath, 'r') as file:
         reader = csv.reader(file, delimiter=',')
         cls.items = [row for row in reader]
     cls.data = cls.items[1:]
     cls.header = cls.items[0]
Example #23
def execute():
    chat = preprocessing()
    # msg = request.args.get('msg')
    msg = request.get_json()
    # return msg
    # kata = "apakah stok ada"
    res = chat.predict(inputtxt=msg['msg'])
    # print(msg['msg'])
    return jsonify(res)
Example #24
def filter(all_news_accounts):
    preprocessor = preprocessing()
    files = os.listdir('tweets')
    all_urls = {}
    all_texts = {}
    bad_urls = []
    for file in files:
        urls = {}
        texts = {}
        str_data = ''
        all_json = []
        with gzip.open('tweets/' + file, 'rb') as f:
            # data = json.loads(f.read())
            # data = json.loads(f.read(), cls=ConcatJSONDecoder)
            str_data = f.readlines()

        for line in str_data:
            line = json.dumps(line)
            all_json.append(json.loads(line))
        for tweet in all_json:
            if not tweet.find('media_url') == -1:
                try:
                    tweet = ast.literal_eval(tweet)
                    if not str(tweet['user']["id"]) in all_news_accounts:
                        continue
                    if tweet['is_quote_status']:
                        continue
                    if 'media' in tweet['entities'] and tweet['entities'][
                            'media'][-1]['type'] == 'photo':
                        text = preprocessor.clean(tweet['text'])
                        texts[tweet['id']] = text
                        urls[tweet['id']] = tweet['entities']['media'][-1][
                            'media_url']
                    elif 'media' in tweet['extended_entities'] and tweet[
                            'extended_entities']['media'][-1][
                                'type'] == 'photo':
                        text = preprocessor.clean(tweet['text'])
                        texts[tweet['id']] = text
                        urls[tweet['id']] = tweet['extended_entities'][
                            'media'][-1]['media_url']
                    else:
                        continue
                except KeyboardInterrupt:
                    print str(datetime.now()) + " program exit\n"
                    raise SystemExit
                except KeyError as e:
                    print str(datetime.now()) + ' ' + str(e) + '\n'
                    traceback.print_exc()
                except Exception as e:
                    print str(datetime.now()) + ' ' + str(e) + '\n'
                    traceback.print_exc()
                    raise SystemExit

        all_urls[file] = urls
        all_texts[file] = texts
    get_photos(all_urls, bad_urls)
    save_texts(all_texts, bad_urls)
Example #25
def main(args):
    # Time setting
    total_start_time = time.time()

    # Preprocessing
    if args.preprocessing:
        preprocessing(args)

    # Augmentation by NER_Masking
    if args.augmenting:
        augmenting(args)

    # training
    if args.training:
        training(args)

    # Time calculate
    print(f'Done! ; {round((time.time()-total_start_time)/60, 3)}min spent')
Example #26
def form():

    resp = []

    if request.method == 'POST':
        text_news = request.form['text']
        audio = request.form['audio']

        print(text_news)
        print(audio)

        if text_news == '':
            text_news = speech_to_text(audio)

        resp = preprocessing(text_news)

    return render_template('result.html', respuesta=resp)
Example #27
def search_d2v(raw_query):
    query = preprocessing(raw_query)
    query_vec = d2v_model.infer_vector(query)
    results = {}
    for item in d2v_index:
        this_doc_vec = d2v_index[item]
        results[item] = similarity(query_vec, this_doc_vec)
    sort_res = sorted(results.items(), key=lambda kv: kv[1], reverse=True)[:10]
    return [res[0] for res in sort_res]
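The similarity() helper is not shown; a minimal cosine-similarity sketch that would fit this usage:

import numpy as np

def similarity(vec_a, vec_b):
    a, b = np.asarray(vec_a, dtype=float), np.asarray(vec_b, dtype=float)
    # small epsilon avoids division by zero for degenerate vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))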
Example #28
File: main.py  Project: afrodev/tcc
def allResults(stockName, filename, rangeSizeDataset, rangeTrainPercent):
    originalDataset = files.getDataset(filename='../original-data/' + filename)

    preprocessedDataset1Day = ppc.preprocessing(daysAhead=1,
                                                dataset=originalDataset)
    preprocessedDataset5Days = ppc.preprocessing(daysAhead=5,
                                                 dataset=originalDataset)
    preprocessedDataset22Days = ppc.preprocessing(daysAhead=22,
                                                  dataset=originalDataset)

    # print('------ FIXED WINDOW 1 DAY -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset1Day, daysAhead=1,
    # 	rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)

    # print('------ FIXED WINDOW 5 DAYS -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset5Days, daysAhead=5,
    # 	rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)

    # print('------ FIXED WINDOW 22 DAYS -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset22Days, daysAhead=22,
    # 	rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)

    print('------ SLIDING WINDOW 1 DAY -------')
    rs.getResultsSlidingWindow(stockName=stockName,
                               dataset=preprocessedDataset1Day,
                               daysAhead=1,
                               rangeSizeDataset=rangeSizeDataset,
                               rangeTrainPercent=rangeTrainPercent)

    print('------ SLIDING WINDOW 5 DAYS -------')
    rs.getResultsSlidingWindow(stockName=stockName,
                               dataset=preprocessedDataset5Days,
                               daysAhead=5,
                               rangeSizeDataset=rangeSizeDataset,
                               rangeTrainPercent=rangeTrainPercent)

    print('------ SLIDING WINDOW 22 DAYS -------')
    rs.getResultsSlidingWindow(stockName=stockName,
                               dataset=preprocessedDataset22Days,
                               daysAhead=22,
                               rangeSizeDataset=rangeSizeDataset,
                               rangeTrainPercent=rangeTrainPercent)

    print('------ ALL DATA FROM ' + stockName + ' FINISHED -------')
Example #29
def main(clf_name, scoring_function):
    classifier = None
    random_grid = None
    nelement = 10000
    assert clf_name == 'RandomForest' or clf_name == 'AdaBoost'
    assert scoring_function == 'accuracy' or scoring_function == 'f1' or scoring_function == 'AMS'

    if clf_name == 'RandomForest':
        print("Using random forest")
        classifier = RandomForestClassifier()

        # Number of trees in random forest
        n_estimators = [
            int(x) for x in np.linspace(start=200, stop=2000, num=10)
        ]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]

        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
    else:
        print("Using AdaBoost")
        classifier = AdaBoostClassifier()

        random_grid = {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.05, 0.1, 0.3, 1]
        }

    print("Preprocessing data... ({} samples)".format(nelement))
    X, y = preprocessing("data.csv", nelement=nelement)

    X, y = shuffle(X, y, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y)

    randomized_grid_search(classifier, random_grid, X_train, y_train, X_test,
                           y_test, scoring_function)
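randomized_grid_search() is not included in the snippet; a plausible sketch assuming it wraps scikit-learn's RandomizedSearchCV (the custom 'AMS' scorer would need its own make_scorer and is left out here):

from sklearn.model_selection import RandomizedSearchCV

def randomized_grid_search(classifier, random_grid, X_train, y_train,
                           X_test, y_test, scoring_function, n_iter=20):
    # 'accuracy' and 'f1' are built-in scorer names; anything else falls back to the estimator default
    scoring = scoring_function if scoring_function in ('accuracy', 'f1') else None
    search = RandomizedSearchCV(classifier, random_grid, n_iter=n_iter,
                                scoring=scoring, cv=3, n_jobs=-1, random_state=0)
    search.fit(X_train, y_train)
    print("Best parameters:", search.best_params_)
    print("Test-set score:", search.score(X_test, y_test))
    return search.best_estimator_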
Example #30
def main():

    result_file = "test_salaries.csv"

    model_data = preprocessing(os.path.join(data_path, train_feature_file),
                               os.path.join(data_path, train_salary_file),
                               os.path.join(data_path, test_feature_file),
                               remove_cols=["companyId", "jobType"])

    generate_results(model_data, result_file)
Example #31
 def __init__(self):
     self.prePro = preprocessing()
     self.model = model()
     try:
         pickleIn = open("model.p", "rb")
         self.classifier = pickle.load(pickleIn)
     except (OSError, IOError) as e:
         self.model.main()
         pickleIn = open("model.p", "rb")
         self.classifier = pickle.load(pickleIn)
Example #32
File: main.py  Project: rrrokhtar/whoami
    def preprocessingAndFeatures(self, img, writer):
        preProcessing, lines = preprocessing(img)
        features = extractLBPLines(lines)

        writerList = [writer] * len(features)
        if writer is not None:
            self.featuresLabled.extend(features)
            self.yTrain.extend(writerList)
        else:
            self.featuresTest.extend(features)
Example #33
 def __init__(self):
     print "Reached init"
     preproc = preprocessing()
     self.trend_filter = trend_filtering()
     print "did stuff 1"
     self.in_data = preproc.get_data()
     print "did stuff 2"
     self.program_map = preproc.get_programs()
     print "did stuff 3"
     self.kmeans = kmeansclutering(self.in_data, self.program_map)
     print "Done with init"
Example #34
 def readAndTokenize(self,file_list,listindex):
     for file in file_list:
         fp=open(file,"r")
         word_list=preprocessing(fp.read())
         for w in word_list:
             if w in self.dictionary.keys():
                 self.dictionary[w][listindex] += 1
             else:
                 newlist=[1]*4
                 self.dictionary[w]=newlist
                 self.dictionary[w][listindex] += 1
Example #35
def main():  
    fpw=open("nboutput.txt","w")
    classify=nbclassify()
    for root, direc, filelist in os.walk(sys.argv[1]):
        for file in filelist:
            if file.endswith('.txt') and len(direc) == 0:
                fp = open(os.path.join(root, file), "r")
                word_list = preprocessing(fp.read())
                result = classify.compute(word_list)
                result += os.path.join(root, file)
                fpw.write(result + "\n")
Example #36
def main():

    ##training
    labels_filename = ["obama_labels.txt", "romney_labels.txt"]
    book = xlrd.open_workbook("training-Obama-Romney-tweets.xlsx")
    words_filename = ["obama_words.txt", "romney_words.txt"]

    for i in range(0, 2):
        sheet = book.sheet_by_index(i)

        # extract data and store it in file featurelist.txt
        preprocessing().dataExtraction(2, sheet.nrows, sheet, labels_filename[i], words_filename[i])

    ##testing
    book_test = xlrd.open_workbook("testing-Obama-Romney-tweets-3labels.xlsx")
    labels_filename_test = ["obama_labels_test.txt", "romney_labels_test.txt"]
    words_filename_test = ["obama_words_test.txt", "romney_words_test.txt"]

    for i in range(0, 2):
        sheet = book_test.sheet_by_index(i)

        # extract data and store it in file featurelist.txt
        preprocessing().dataExtraction(2, sheet.nrows, sheet, labels_filename_test[i], words_filename_test[i])
Example #37
def test_knn(filepath, glossary, k=5):
	bits = functions.gen_bitlist(6)[1:]

	total = len(bits)
	best_score = 0
	best_tags = []
	counter = 1
	for tags in bits:
		matrix, tag_names = preprocessing(filepath, tags, glossary)
		r = knn(matrix, k)
		print(str(counter) + "/" + str(total), end='\r')
		if r > best_score:
			best_score = r
			best_tags = tag_names
		counter += 1
	print("---Mejor---")
	print("Valoracion: " + str(best_score))
	print("Tag names: " + str(best_tags))
Example #38
def task03_pre(k):
	preprocessing('task03', 'english', k)
Example #39
import os, sys, json
dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../../libraries')
sys.path.append(filename)

dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../imageProcess')
sys.path.append(filename)

import preprocessing, proportion, centroid

dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../../../img/generated_canvas/verify_11.png')
img = preprocessing.preprocessing(filename)

proportion = proportion.proportion(img)
centroidX = centroid.centroid(img)[0]
centroidY = centroid.centroid(img)[1]

result = {
	'prop' : proportion,
	'centX' : centroidX,
	'centY' : centroidY,
}

print json.dumps(result)
Example #40
File: lda.py  Project: sagieske/NLP1
	def __init__(self, alpha, beta, nr_topics, skip_lda=False, orig_lda=False, remove_poprock=False):
		""" Initialize
		"""
		
		self.alpha = alpha
		self.beta = beta
		self.nr_topics = nr_topics
		self.skiplda = skip_lda
		self.orig_lda = orig_lda
		self.fold = 0
		# Dict to save recall/precision/f1 scores for every fold
		self.metric_folds = {}
		self.metric_folds_orig_lda = {}

		# Preprocess data
		prep = preprocessing.preprocessing(dump_files=False, load_files=True, dump_clean=False, load_clean=True)
		# Get lyrics
		self.total_dataset = prep.get_dataset()
		self.remove_poprock = remove_poprock

		# Possibly remove pop/rock
		if self.remove_poprock:
			
			print "Remove pop/rock: %s" %(str(remove_poprock))
			self.total_dataset_temp = []
			for i in range(0, len(self.total_dataset)):
				if self.total_dataset[i]['genre'] == 'pop/rock':
					continue
				else:
					self.total_dataset_temp.append(self.total_dataset[i])
			self.total_dataset = self.total_dataset_temp

		# Note: to use smaller dataset add [:set]
		print "total nr of lyrics:", len(self.total_dataset)

		labels = []
		#labels_subgenre = []
		label_count = {}
		# Get all labels of dataset
		for item in self.total_dataset:
			labels.append(item['genre'])
			label_count[item['genre']] = label_count.get(item['genre'], 0) +1
		

		# Set instance variable to list of set of all labels
		self.all_genres =  list(set(labels))

		#""" UNCOMMENT TO CREATE NEW FOLD INDICES
		# Get kfold training and test indices (folds: 10 so that it uses 90% for training data)
		# Stratified 10-fold cross-validation
		#skf = cross_validation.StratifiedKFold(labels, n_folds=5)
		#self.train_indices_folds = []
		#self.test_indices_folds = []
		#for train_index, test_index in skf:
		#	self.train_indices_folds.append(train_index)
		#	self.test_indices_folds.append(test_index)
		#file_indices = 'train_test_indices_stratified'
		#if remove_poprock:
		#	file_indices += '_notpoprock'
		#	print "not poprock"
		#pickle.dump((self.train_indices_folds, self.test_indices_folds), open(file_indices,"wb+"))
		#sys.exit()

		# OR LOAD FROM PICKLE FILE:
		if self.remove_poprock:
			self.train_indices_folds, self.test_indices_folds = pickle.load(open('train_test_indices_stratified_notpoprock',"r"))
		else:
			self.train_indices_folds, self.test_indices_folds = pickle.load(open('train_test_indices_stratified',"r"))

		# Create the training and test set. Both are set as instance variables
		self.create_train_test_set(0)

		# Initialize counts
		self.genre_count = np.zeros(len(self.all_genres), dtype=int)
		self.topic_count = np.zeros(nr_topics, dtype=int)

		# Counts for original LDA
		if self.orig_lda:
			self.topic_count_orig_lda = np.zeros(nr_topics, dtype=int)
			self.doc_word_count_orig_lda = np.zeros(len(self.dataset), dtype=int)

		# Initialization of matrices and dictionaries 
		self._initialize_lists()
		# Initialize counts for matrices
		self._initialize_counts(load=False)
Example #41
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('alexnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.train_root_dir): os.mkdir(FLAGS.train_root_dir)
    if not os.path.isdir(train_dir): os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir): os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir): os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir): os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir): os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()
    
    adlamb=tf.placeholder(tf.float32,name='adlamb')
    num_update=tf.placeholder(tf.float32,name='num_update')
    decay_learning_rate=tf.placeholder(tf.float32)
    dropout_keep_prob = tf.placeholder(tf.float32)
    is_training=tf.placeholder(tf.bool)    
    time=tf.placeholder(tf.float32,[1])

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = LeNetModel(num_classes=NUM_CLASSES, image_size=28,is_training=is_training,dropout_keep_prob=dropout_keep_prob)
    # Placeholders
    x_s = tf.placeholder(tf.float32, [None, 32, 32, 3],name='x')
    x_t = tf.placeholder(tf.float32, [None, 28, 28, 1],name='xt')
    x=preprocessing(x_s,model)
    xt=preprocessing(x_t,model)
    tf.summary.image('Source Images',x)
    tf.summary.image('Target Images',xt)
    print 'x_s ',x_s.get_shape()
    print 'x ',x.get_shape()
    print 'x_t ',x_t.get_shape()
    print 'xt ',xt.get_shape()
    y = tf.placeholder(tf.float32, [None, NUM_CLASSES],name='y')
    yt = tf.placeholder(tf.float32, [None, NUM_CLASSES],name='yt')
    loss = model.loss(x, y)
    # Training accuracy of the model
    source_correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    source_correct=tf.reduce_sum(tf.cast(source_correct_pred,tf.float32))
    source_accuracy = tf.reduce_mean(tf.cast(source_correct_pred, tf.float32))
    
    G_loss,D_loss,sc,tc=model.adloss(x,xt,y,yt)
    
    # Testing accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(yt, 1))
    correct=tf.reduce_sum(tf.cast(correct_pred,tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    update_op = model.optimize(decay_learning_rate,train_layers,adlamb,sc,tc)
	
    D_op=model.adoptimize(decay_learning_rate,train_layers)
    optimizer=tf.group(update_op,D_op)
    
    train_writer=tf.summary.FileWriter('./log/tensorboard')
    train_writer.add_graph(tf.get_default_graph())
    config=projector.ProjectorConfig()
    embedding=config.embeddings.add()
    embedding.tensor_name=model.feature.name
    embedding.metadata_path='domain.csv'
    projector.visualize_embeddings(train_writer,config)
    tf.summary.scalar('G_loss',model.G_loss)
    tf.summary.scalar('D_loss',model.D_loss)
    tf.summary.scalar('C_loss',model.loss)
    tf.summary.scalar('SA_loss',model.Semanticloss)
    tf.summary.scalar('Training Accuracy',source_accuracy)
    tf.summary.scalar('Testing Accuracy',accuracy)
    merged=tf.summary.merge_all()




    print '============================GLOBAL TRAINABLE VARIABLES ============================'
    print tf.trainable_variables()
    #print '============================GLOBAL VARIABLES ======================================'
    #print tf.global_variables()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # saver.restore(sess, 'log/checkpoint')
        # Load the pretrained weights
        # model.load_original_weights(sess, skip_layers=train_layers)
        train_writer.add_graph(sess.graph)
        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        # print("{} Open Tensorboard at --logdir {}".format(datetime.datetime.now(), tensorboard_dir))
        gd = 0
        step = 1
        for epoch in range(300000):
            # Start training
            gd += 1
            lamb = adaptation_factor(gd * 1.0 / MAX_STEP)
            # rate = decay(FLAGS.learning_rate, gd, MAX_STEP)
            power = gd / 10000
            rate = FLAGS.learning_rate
            tt = pow(0.1, power)
            batch_xs, batch_ys = TRAIN.next_batch(FLAGS.batch_size)
            Tbatch_xs, Tbatch_ys = VALID.next_batch(FLAGS.batch_size)
            # print batch_xs.shape
            # print Tbatch_xs.shape
            summary, _, closs, gloss, dloss, smloss = sess.run(
                [merged, optimizer, model.loss, model.G_loss, model.D_loss, model.Semanticloss],
                feed_dict={x_s: batch_xs, x_t: Tbatch_xs, time: [1.0 * gd],
                           decay_learning_rate: rate, adlamb: lamb, is_training: True,
                           y: batch_ys, dropout_keep_prob: 0.5, yt: Tbatch_ys})
            train_writer.add_summary(summary, gd)

            step += 1
            if gd % 250 == 0:
                epoch = gd / (72357 / 100)
                print 'lambda: ', lamb
                print 'rate: ', rate
                print 'Epoch {5:<10} Step {3:<10} C_loss {0:<10} G_loss {1:<10} D_loss {2:<10} Sem_loss {4:<10}'.format(closs, gloss, dloss, gd, smloss, epoch)
                print("{} Start validation".format(datetime.datetime.now()))
                test_acc = 0.
                test_count = 0
                print 'test_iter ', len(TEST.labels)
                for _ in xrange(len(TEST.labels) / 5000):
                    batch_tx, batch_ty = TEST.next_batch(5000)
                    # print TEST.pointer, '   ', TEST.shuffle
                    acc = sess.run(correct, feed_dict={x_t: batch_tx, yt: batch_ty, is_training: True, dropout_keep_prob: 1.})
                    test_acc += acc
                    test_count += 5000
                print test_acc, test_count
                test_acc /= test_count
                if epoch == 300:
                    return

                # batch_tx, batch_ty = TEST.next_batch(len(TEST.labels))
                # test_acc = sess.run(accuracy, feed_dict={x_t: batch_tx, y: batch_ty, is_training: False, dropout_keep_prob: 1.0})
                print len(batch_tx)
                print("{} Validation Accuracy = {:.4f}".format(datetime.datetime.now(), test_acc))

                if gd % 10000 == 0 and gd > 0:
                    # saver.save(sess, './log/mstn2model' + str(gd) + '.ckpt')
                    # print 'tensorboard --logdir ./log/tensorboard'
                    # return
                    pass
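adaptation_factor() and MAX_STEP are referenced above but not defined in the snippet; a common choice in domain-adaptation training loops is a sigmoid ramp of the adversarial weight, sketched here as an assumption rather than this project's actual code:

import numpy as np

MAX_STEP = 10000.0  # assumed schedule length, not taken from the original

def adaptation_factor(p):
    # smoothly ramps the adversarial loss weight from 0 to 1 as p goes from 0 to 1
    p = min(p, 1.0)
    return 2.0 / (1.0 + np.exp(-10.0 * p)) - 1.0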
Example #42
sys.path.append(filename)

import preprocessing, proportion, centroid, meanStDev

times = json.loads(sys.argv[1])
NUMDEVS = 3
imgArray = ['filename_1.png', 'filename_2.png', 'filename_3.png', 'filename_4.png', 'filename_5.png', 'filename_6.png', 'filename_7.png', 'filename_8.png', 'filename_9.png', 'filename_10.png']
dirr = os.path.dirname(sys.argv[0])
filename = os.path.abspath(os.path.join(dirr, '../../../img/generated_canvas'))
centroidsX = []
centroidsY = []
proportions = []

for img in imgArray:
	fileTemp = os.path.join(filename, img)
	imgTemp = preprocessing.preprocessing(fileTemp)
	imgTemp.save('fart.png')
	proportions.append(proportion.proportion(imgTemp))
	centroidsX.append(centroid.centroid(imgTemp)[0])
	centroidsY.append(centroid.centroid(imgTemp)[1])


meanProps, devProps = meanStDev.meanStDev(proportions)
meanCentsX, devCentsX = meanStDev.meanStDev(centroidsX)
meanCentsY, devCentsY = meanStDev.meanStDev(centroidsY)
meanTimes, devTimes = meanStDev.meanStDev(times)

minProps = meanProps - (NUMDEVS * devProps)
maxProps = meanProps + (NUMDEVS * devProps)
resultsProps = [minProps, maxProps]
Example #43
File: geoschem.py  Project: Encesat/HiWi
def processing():
    # Reading the configfile
    specifications = readConfig()
    number = specifications[0]
    filenames = specifications[1]
    species = specifications[2]
    horizontalGridLat = specifications[3]
    horizontalGridLon = specifications[4]
    verticalGrid = specifications[5]
    output = specifications[6]
    retrieval = specifications[7]
    airden = specifications[8]
    airdenfiles = specifications[9]
    apriori = specifications[10]
    aks = specifications[11]
    try:
        akssza = specifications[12]
    except IndexError:
        akssza = None
    modeloutputFilenames = []
    airdenFilenames = []
    cache = []

    if airden == "False" or airden == "false" or airden == "0" or airden == "no" or airden == "No":
        airden = False
    else:
        airden = True

    aksNew = []
    apriori = pr.preprocessing(apriori)
    for elem in aks:
        print("Preprocessing {}".format(elem))
        aksNew.append(pr.preprocessing(elem))
    for i in range(number):
        print("Processing ASCII-Files: {}/{}".format(i + 1, number))
        if airden:
            print("Processing Air density..")
            airdenFilenames.append(density.airden(airdenfiles[i], verticalGrid, output))
            print("Processing Air density: Done!")
        nameOfFile = a2d.ipAscii(filenames[i], species, horizontalGridLat, horizontalGridLon, verticalGrid, output)
        print("Interpolating the model..")
        cache = pm.interpolateModel(retrieval, species, nameOfFile, verticalGrid, output)
        modeloutputFilenames.append(cache)
        print("Interpolation: Done!")
    print("Processing ASCII-Files: Done!")
    print("Computing total column...")
    for i in range(number):
        print(aksNew[i])
        tc.createTC(
            filenames[i], apriori, species, output, modeloutputFilenames[i][0], airdenFilenames, akssza, aksNew[i]
        )
    print("Computing total column: Done!")

    print("Delete temporary files ...")
    f = open("./config.dat", "r")
    directory = f.readlines()[-2].split(":")[1].rstrip()
    for temporary in os.listdir(directory):
        if "PROCESSED" in temporary:
            os.remove("{}{}".format(directory, temporary))
    print("Delete temporary files: Done!")
    return
Example #44
__author__ = 'root'

import numpy as np
import theano
import theano.tensor as T
import os.path
import matplotlib.pyplot as plot
import time

import RNN
import preprocessing

preproc = preprocessing.preprocessing()

sequenceLengthFileName = "data/out/sequence.csv"
sortNoShuffleTrainFileName = "data/out/trainNoShuffle.csv"
map48FileName = "data/out/map.csv"

if not os.path.isfile(sequenceLengthFileName):
    print("Generating sequence file...")
    preproc.generatingSequence(sortNoShuffleTrainFileName,sequenceLengthFileName)

preproc.loadTrainFile(sortNoShuffleTrainFileName)
preproc.load48Map(map48FileName)

layers = [69,128,48]
learningRate = 0.0001

x = T.matrix("x")
y = T.matrix("y")
memoryInitail = T.vector("memoryInitail")