Example #1
def main():
    print("Starting run")
    scrapping.main()
    preprocessing.main()
    k_means.main()
    hierarchical.main()
    print("Run complete")
Example #2
def main(workdir, dataset, identifier, numtopics, passes, lang):
    print("==", "starting", "==", "\n==", helpers.get_time(), "==")   
    helpers.make_dirs(workdir, identifier)
    preprocessing.main(workdir, dataset, identifier, lang)
    build_corpus.main(workdir, identifier)
    modeling.main(workdir, identifier, numtopics, passes)
    postprocessing.main(workdir, dataset, identifier, numtopics)
    make_overview.main(workdir, identifier) 
Example #3
def main(workdir, dataset, identifier, numtopics, passes):
    helpers.make_dirs(workdir, identifier)
    preprocessing.main(workdir, dataset, identifier)
    build_corpus.main(workdir, identifier)
    modeling.main(workdir, identifier, numtopics, passes)
    postprocessing.main(workdir, dataset, identifier, numtopics)
    make_overview.main(workdir, identifier)
    make_heatmap.main(workdir, identifier)
    make_wordclouds.main(workdir, identifier, numtopics)
    evaluation.main(workdir, identifier, numtopics)
Example #4
def univariate_arima():
    '''
    Reads the data and fits the ARIMA model
    Prints the Accuracy Score

    Inputs:
        None

    Outputs:
        None
    '''

    data = preprocessing.main()
    n_train_hours = 52 * 3
    train = data.iloc[:n_train_hours, :]
    test = data.iloc[n_train_hours:, :]

    model = pf.ARIMA(data=train, ar=9, ma=0, integ=1, target='milk')

    x = model.fit("MLE")
    x.summary()

    # model.plot_fit(figsize=(15,5))
    model.plot_predict(h=38, past_values=20, figsize=(15, 5))
    #import pdb; pdb.set_trace()

    yhat = model.predict(h=38)
    pred_chg = yhat > 0
    actual_chg = test.iloc[:-1, 0].diff() > 0
    print(accuracy_score(actual_chg, pred_chg))
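
The last lines of Example #4 score directional accuracy: the model's predicted changes are compared against the sign of the actual period-over-period differences. A toy illustration of that comparison, with made-up values rather than the example's data, is:

from sklearn.metrics import accuracy_score
import pandas as pd

# Hypothetical forecast differences and actual prices, for illustration only.
yhat = pd.Series([0.4, -0.1, 0.3])
actual = pd.Series([10.0, 10.5, 10.2, 10.9])

pred_chg = yhat > 0                      # True where a rise is predicted
actual_chg = actual.diff().dropna() > 0  # True where the price actually rose
print(accuracy_score(actual_chg, pred_chg))  # fraction of directions matched
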
Example #5
def multivariate_arima():
    '''
    Reads the data and fits the ARIMAX model
    Prints the Accuracy Score

    Inputs:
        None

    Outputs:
        None
    '''

    data = preprocessing.main()
    n_train_hours = 52 * 3
    train = data.iloc[:n_train_hours, :]
    test = data.iloc[n_train_hours:, :]

    model = pf.ARIMAX(data=train, formula='milk~1+cheese+dry+corn+Value',
                      ar=9, ma=0, integ=1)
    x = model.fit("MLE")
    x.summary()

    # model.plot_fit(figsize=(15,5))
    # model.plot_predict(h=38,past_values=20,figsize=(15,5), oos_data=test)

    yhat = model.predict(h=38, oos_data=test)
    pred_chg = yhat > 0
    actual_chg = test.iloc[:-1, 0].diff() > 0
    print(accuracy_score(actual_chg, pred_chg))
Example #6
def looper():
	file_results =[]
	for cat in cat_list:
		for n_size in n_size_list:
			for stpwords in stopwords_options:
				print("in it")
				cleaning.main(cat,n_size,stpwords)
				cleaned_eda.main()
				train_test_split.main()
				preprocessing.main()
				#preprocessed_eda.main()
				modelselection.main()
				training.main()
				accuracy = testing.main()
				results.main(cat,n_size,stpwords,'balanced',accuracy,file_results)
	df = pd.DataFrame(file_results,columns=cols)
	df.to_csv('balanced_new_results.csv')
Example #7
def preprocess(
    radio_cat,
    imbin_file,
    img_size=(2, 150, 150),
    tile_cutout_path="",
    remove_tile_cutouts=False,
    overwrite=False,
):
    # /// Preprocess Data \\\
    if not os.path.exists(imbin_file) or overwrite:
        preprocessing.main(radio_cat, imbin_file, img_size, tile_cutout_path)

    if remove_tile_cutouts:
        shutil.rmtree(tile_cutout_path)

    imgs = pu.ImageReader(imbin_file)
    return imgs
Example #8
def load_data(filename=None):
    directory = r'/home/oskar/PycharmProjects/Poverty prediction data'
    if filename is None:
        filename = 'train.csv'
    print("Preprocessing data")
    X, y, ids = pre.main(filenames=[filename],
                         directory=directory,
                         to_binarize=False,
                         to_select_feats=False,
                         to_aggregate=False)
    return X, y, ids
Example #9
def save_model():
    train_X, train_y, test_X, test_y = preprocessing.main()

    clf = models.create_sklearn_MLP()
    clf.fit(train_X, train_y)
    with open("sklean_MLP.pkl", "wb") as file:
        pickle.dump(clf, file)

    svm = models.create_svm()
    svm.fit(train_X, train_y)
    with open("svm.pkl", "wb") as file:
        pickle.dump(svm, file)

    keras = models.create_keras_mlp(train_X, train_y)
    keras.fit(train_X, train_y, batch_size=32, epochs=300)
    keras.save('keras')
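
Example #9 only writes the models out. A matching load step, sketched here under the assumption that the same filenames are used (including the 'sklean_MLP.pkl' spelling from the example), would be:

import pickle
from tensorflow import keras  # assuming the Keras model was saved via keras.save('keras')

def load_models():
    # Reload the pickled scikit-learn models written by save_model().
    with open("sklean_MLP.pkl", "rb") as file:
        clf = pickle.load(file)
    with open("svm.pkl", "rb") as file:
        svm = pickle.load(file)
    # Reload the Keras model from the 'keras' save directory.
    keras_model = keras.models.load_model('keras')
    return clf, svm, keras_model
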
Example #10
def metric_evaluation():

	ticker_ds = ['WMT']
	N = len(ticker_ds)
	S = 3 #sample size
	rmse = [[] for i in range(S)]
	r2_values = [[] for i in range(S)]


	for ticker in ticker_ds:
		print("Ticker value: %s" %(ticker))
		
		#local array rmse_test stores average rmse values of models per ticker
		rmse_test = [0 for i in range(S)]
		r2_test = []
		residual = [0 for i in range(4)]


		#represents samples per ticker
		for n in range(S):
			sample,r2,residual = preprocessing.main(ticker)
			rmse_test[0] += sample['pred_lin']/S
			rmse_test[1] += sample['pred_ridge']/S
			rmse_test[2] += sample['pred_svr_lin']/S

			r2_test.append(r2[n]/S)


		#store rmse values per ticker in final rmse table
		for i in range(S):
			rmse[i].append(rmse_test[i])
			r2_values[i].append(r2_test[i])

		#graphs residual fit plot of final sample for each ticker
		residual_fit_plot(residual)

	r2_linear = sum(r2_values[0])/len(r2_values)
	r2_ridge = sum(r2_values[1])/len(r2_values)
	r2_svr = sum(r2_values[2])/len(r2_values)

		
	print(rmse)
	print(r2_linear,r2_ridge,r2_svr)
Example #11
def storeInDb():
    request_file = request.files['data_file']
    if not request_file:
        return "No file"
    df = pd.read_csv(request_file)  #csv file which you want to import
    records_ = df.to_dict(orient='records')
    result = db[request_file.filename]
    result.drop()
    result = db[request_file.filename].insert_many(records_)
    filename = request_file.filename
    #Preprocessing
    df_new = preprocessing.main(filename)
    records_new = df_new.to_dict(orient='records')
    result_new = db[filename]
    result_new.drop()
    result_new = db[filename].insert_many(records_new)
    return jsonify({
        "result": "File sucessfully uploaded!",
        "filename": filename
    })
Example #12
import tools.helper
import preprocessing
import optimization
import postprocessing
import plot_single_scenario
import join_scenarios
import plot_combination

if __name__ == '__main__':
    scenario_assumptions = tools.helper.get_scenario_assumptions()

    selected_id = scenario_assumptions.index

    for i in selected_id:
        print(
            f"Running scenario {i} with the name '{scenario_assumptions.loc[i]['scenario']}'"
        )

        preprocessing.main(**scenario_assumptions.loc[i])

        optimization.main(**scenario_assumptions.loc[i])

        postprocessing.main(**scenario_assumptions.loc[i])

        plot_single_scenario.main(**scenario_assumptions.loc[i])

    join_scenarios.main(**scenario_assumptions)

    plot_combination.main()
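
Example #12 passes each row of the scenario table to the pipeline steps as keyword arguments. A minimal, self-contained illustration of that pattern, with invented column names, is:

import pandas as pd

def main(scenario, capacity):
    # Stand-in for the scripts' main(**row) signature; names are invented.
    print(scenario, capacity)

# Hypothetical assumptions table shaped like the one used in the example.
scenario_assumptions = pd.DataFrame(
    [{'scenario': 'base', 'capacity': 100},
     {'scenario': 'high_re', 'capacity': 250}]
)

for i in scenario_assumptions.index:
    # Each row is a pandas Series; ** unpacks its index labels as keyword arguments.
    main(**scenario_assumptions.loc[i])
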
Example #13
def process_data_function():
    preprocessing.main()
Example #14
	If you have access to a GPU or a large amount of RAM, you can:
		1. Set p = None and q = None
		2. Set p = 0 and q = (n + 1), where n is the number of pictures to read per iteration
			Make sure to update p and q by replacing 10 with n
	"""
	# Training in batches because of the dataset size
	p = 0 
	q = 21
	iteration = 0
	top = 125 # The total pics in the class with fewest images + 1

	logger.info('Reading DF...')
	theModel = sys.argv[1]

	if (p is None) or (q is None):
		df = preprocessing.main('train', theModel, None, None) #Read all data

		X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y
		df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
		Y = np.array(df['vector_labels'].tolist())

		logger.info('X.shape: {}'.format(X.shape))
		logger.info('Y.shape: {}'.format(Y.shape))
		logger.info('Starting training...')

		train(X,  Y, iteration, theModel)
	
	else:
		while (p < top):		
			df = preprocessing.main('train', theModel, p, q) #Let's do (q - p) pics from all classes per iter
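
Example #14 is cut off inside its batch loop. Purely as an illustration of the p/q batching described in the docstring (not the original loop body), a sketch might look like the following; it assumes preprocessing.main('train', theModel, p, q) returns a DataFrame with 'image' and 'label' columns holding pictures p through q-1 of each class:

import numpy as np
import pandas as pd
import preprocessing  # local module used by the example

def train_in_batches(the_model, batch_size=20, top=125):
    # Hypothetical sketch only; the original loop body is not shown above.
    p, q, iteration = 0, batch_size + 1, 0
    while p < top:
        df = preprocessing.main('train', the_model, p, q)
        X = np.array(df['image'].tolist())
        Y = np.array(pd.get_dummies(df['label']).values.tolist())
        train(X, Y, iteration, the_model)  # train() as defined in the source repository
        p += batch_size
        q += batch_size
        iteration += 1
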
Example #15
	Due to RAM constraints, I'm reading the images in batches of 10, i.e. 10 images per class per iteration.

	If you have access to a GPU or a large amount of RAM, you can:
		1. Set p = None and q = None
		2. Set p = 0 and q = (n + 1), where n is the number of pictures to read per iteration
			Make sure to update p and q by replacing 10 with n
	"""
	# Training in batches because of the dataset size
	# p = 0 
	# q = 20
	# iteration = 0
	# top = 125 # The total pics in the class with fewest images + 1

	theModel = sys.argv[1]

	df = preprocessing.main('train', theModel, None, None) #Read all data

	X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y
	df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
	Y = np.array(df['vector_labels'].tolist())

	logger.info('X.shape: {}'.format(X.shape))
	logger.info('Y.shape: {}'.format(Y.shape))
	logger.info('Starting training...')

	train(X, Y, iteration, theModel)

	# while (p < top):
	# 	logger.info('Reading DF...')
	# 	df = preprocessing.main('train', theModel, p, q) #Let's do (q - p) pics from all classes per iter
Example #16
    validation_set_size = 0.8
    seed = 17

    models = {'rand_forest' : RandomForestClassifier(n_estimators=500, verbose=2, max_depth=4, max_features=50, random_state=seed),
              'boosted_trees' : GradientBoostingClassifier(n_estimators=1600, verbose=2, max_features=40, subsample=0.5, random_state=seed)}
    model = models.get(model_selector)
    reductors = {'pca' : PCA(n_components=len(X.columns), random_state=seed),
                 'lda' : LinearDiscriminantAnalysis(n_components=len(X.columns)),
                 'none' : None}
    red = reductors.get(reductor_selector)

    if red:
        print("Feature space transformation")
        red.fit(X,y)
        X_r = red.transform(X)
        msg = lambda x: "Accuracy of trained model with feature space transformation is {:.4f}%".format(100*x)
    else:
        X_r = X
        msg = lambda x: "Accuracy of trained model without feature space transformation is {:.4f}%".format(100*x)

    print("Model selected. Starting training {}".format(time.asctime()))
    X_train, X_test, y_train, y_test = train_test_split(X_r, y, train_size=validation_set_size, random_state=seed)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(msg(score))
    return score


if __name__ == '__main__':
    X, y = prp.main()
    score = main(X, y)
Example #17
    def preProcess(self):
        self.textBrowser.setText("Preprocessing...")
        preprocessing.main()
        self.textBrowser.setText("Preprocessing Done")
Example #18
def run(im):
    # binarized = binarize_image(im)
    binarized = preprocessing.main()
    cv_binarized = skimage.img_as_ubyte(binarized)

    lines = line_segmentation(cv_binarized)
    lines = [skimage.img_as_float(i) for i in lines]
    # lines = [skimage.img_as_float(cv_binarized)]
    bounding_boxes = []
    for i, line in enumerate(lines):
        # Find the contours
        contours = skimage.measure.find_contours(line, 0.8)
        for contour in contours:
            # Get the contour's bounding box (x, y, w, h)
            bound = (int(contour[:, 0].min()), int(contour[:, 1].min()),
                     int(contour[:, 0].max() - contour[:, 0].min()),
                     int(contour[:, 1].max() - contour[:, 1].min()), i)
            # Only keep it if it's large enough
            if bound[2] > 5 and bound[3] > 5:
                bounding_boxes.append(bound)

    bounding_boxes = sorted(bounding_boxes, key=lambda x: x[0])
    merged = []
    done = []
    # Attempt to merge the bounding boxes
    for i1, bound in enumerate(bounding_boxes):
        if i1 in done:
            continue
        for i2, other in enumerate(bounding_boxes):
            if (other[0] >= bound[0] and other[0] <= bound[0] + bound[2]) and (
                    other[1] >= bound[1] and other[1] <= bound[1] + bound[3]):
                merged.append(
                    (min(bound[0],
                         other[0]), min(bound[1],
                                        other[1]), bound[2] + other[2],
                     bound[3] + other[3], min(bound[4], other[4])))
                done.append(i1)
                done.append(i2)

    boxes = merged + bounding_boxes
    boxes = [box for box in boxes if box[2] < 80 and box[3] < 80]
    windows = np.zeros((len(boxes), 70, 70))
    for i, box in enumerate(boxes):
        windows[i, :, :] = skimage.transform.resize(
            binarized[box[0]:box[0] + box[2], box[1]:box[1] + box[3]],
            (70, 70))
    line_ns = [box[4] for box in boxes]
    # Draw the rectangles
    # for rect in boxes:
    #     lines[0][rect[0]:rect[0]+rect[2], rect[1]] = 0
    #     lines[0][rect[0]:rect[0]+rect[2], rect[1]+rect[3]] = 0
    #     lines[0][rect[0], rect[1]:rect[1]+rect[3]] = 0
    #     lines[0][rect[0]+rect[2], rect[1]:rect[1]+rect[3]] = 0

    return (windows, line_ns)

    # _fig, ax = plt.subplots()
    # ax.imshow(line, interpolation='nearest', cmap=plt.cm.gray)
    # for n, _contour in enumerate(contours):
    #     ax.plot(contours[n][:, 1], contours[n][:, 0], linewidth=2)
    # plt.show()
Example #19
if __name__ == '__main__':
    path= '//home/timom/git/DeepBeliefBird/SongFeatures/Motifs/3718/'
    plotting=True
    os.chdir(path)
    for files in os.listdir("."):
        if files.endswith(".xls"):
            xlsfile=files 
            
    nfft=1024
    songs,fs,filenames=readSongs(path)        
    labels= createLabelArray(path,PEGGY_DRIFT=0.01,fs=fs,windowL=nfft,hopF=2)  
    print(filenames)
    print(len(songs))
    songnumber = 0
    print(filenames[songnumber][:-4])
    newspec, invD, mu, sigma = pp.main(songs[songnumber], fs, hpFreq=250, nfft=nfft, hopfactor=2, filterbank=False, numCoeffs=30, plotting=False)
    print(newspec.shape)
    
    if plotting:
        oldspec = np.dot(invD,((newspec*sigma)+mu).T)
        print(oldspec.shape)
        pl.figure()
        pl.subplot(211)
        pl.imshow(oldspec,origin='lower',aspect='auto')
        label=labels[filenames[songnumber][:-4]]
        pl.plot(label*oldspec.shape[0]/20.0,'xb')
        pl.xlim(0,oldspec.shape[1])
        
        pl.subplot(212)
        pl.imshow(newspec.T,origin='lower',aspect='auto')
        pl.show()
Example #20
import gym
import preprocessing
env = gym.make('CarRacing-v0')
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        preprocessing.main(observation)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

Example #21
    Inputs:
        None

    Outputs:
        None
    '''

    data = preprocessing.main()
    n_train_hours = 52 * 3
    train = data.iloc[:n_train_hours, :]
    test = data.iloc[n_train_hours:, :]

    model = pf.ARIMAX(data=train, formula='milk~1+cheese+dry+corn+Value',
                      ar=9, ma=0, integ=1)
    x = model.fit("MLE")
    x.summary()

    # model.plot_fit(figsize=(15,5))
    # model.plot_predict(h=38,past_values=20,figsize=(15,5), oos_data=test)

    yhat = model.predict(h=38, oos_data=test)
    pred_chg = yhat > 0
    actual_chg = test.iloc[:-1, 0].diff() > 0
    print(accuracy_score(actual_chg, pred_chg))


if __name__ == '__main__':
    #    multivariate_arima()
    data = preprocessing.main()
    test_stationarity(data.milk)
Example #22
def rnn_multivariate(epochs=50, batch_size=72, neurons=50):

    # load dataset
    dataset = preprocessing.main()
    #dataset.drop(['Value'], axis=1, inplace=True)
    values = dataset.values
    # integer encode direction
    #encoder = LabelEncoder()
    #values[:,4] = encoder.fit_transform(values[:,4])
    # ensure all data is float
    values = values.astype('float32')
    # normalize features
    scaler_x = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    scaled_x = scaler_x.fit_transform(values[:, :-1])
    scaled_y = scaler_y.fit_transform(values[:, -1].reshape(-1,1))
    scaled = concatenate((scaled_x, scaled_y), axis=1)
    # frame as supervised learning
    reframed = series_to_supervised(scaled, 1, 1)
    #print reframed.head()
    # drop columns we don't want to predict
    reframed.drop(reframed.columns[[6, 7, 8, 9]], axis=1, inplace=True)
    #print(reframed.head())

    # split into train and test sets
    values = reframed.values
    n_train_hours = 52*3
    train = values[:n_train_hours, :]
    test = values[n_train_hours:, :]
    # split into input and outputs
    train_X, train_y = train[:, :-1], train[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    #print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    # design network
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # fit network
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, validation_data=(test_X, test_y), verbose=2, shuffle=False)
    # plot history
    #print history
    # pyplot.plot(history.history['loss'], label='train')
    # pyplot.plot(history.history['val_loss'], label='test')
    # pyplot.legend()
    # pyplot.show()

    # make a prediction
    yhat = model.predict(test_X)
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    # invert scaling for forecast
    #inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
    inv_yhat = scaler_y.inverse_transform(yhat)
    inv_yhat = inv_yhat[:,0]
    # invert scaling for actual
    test_y = test_y.reshape((len(test_y), 1))
    # inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
    inv_y = scaler_y.inverse_transform(test_y)
    inv_y = inv_y[:,0]
    # calculate RMSE
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    print('Test RMSE: %.3f' % rmse)

    fig, ax = plt.subplots()
    ax.plot(test_y, label='Actual')
    ax.plot(yhat, label='Predicted', c='r')
    ax.plot(yhat+1.96*rmse, '--', c='r', alpha=.5)
    ax.plot(yhat-1.96*rmse, '--', c='r', alpha=.5)
    #ax.set_xticklabels(dataset.index[-42:].date, rotation=45)
    #pyplot.fill_between(yhat + rmse, yhat - rmse)
    pyplot.ylabel('Normalized Milk Futures Prices')
    pyplot.xlabel('Week')
    pyplot.legend()
    pyplot.savefig('rnn_act_v_pred.png')
    pyplot.show()


    test_output = pd.concat([pd.DataFrame(test_y), pd.DataFrame(yhat)], axis=1, keys=['actual','predicted'])

    test_output['actual_chg'] = test_output.actual.diff().fillna(0).values > 0
    test_output['predicted_chg'] = test_output.predicted.diff().fillna(0).values > 0
    test_output['comparison'] = test_output.actual_chg == test_output.predicted_chg

    acc = accuracy_score(test_output['actual_chg'], test_output['predicted_chg'])
    print('Accuracy: %.3f' % acc)
    return acc
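
Example #22 calls a series_to_supervised helper that is not shown. A common shift-based implementation of that framing, given here as an assumption rather than the example's own code, is:

import pandas as pd

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    # Frame a (multivariate) time series as a supervised-learning table
    # by concatenating lagged copies of the columns.
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n_in, ..., t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, ..., t+n_out-1)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        names += ['var%d(t)' % (j + 1) if i == 0 else 'var%d(t+%d)' % (j + 1, i)
                  for j in range(n_vars)]
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
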
Example #23
# ===============================================================================
# inputfile= 'test1.wav'
# [fs, songwhole]=wavfile.read(inputfile)
# if len(songwhole.shape)==2:
#    song=np.mean(songwhole[:],axis=1)
# else:
#    song = songwhole[5*fs:15*fs]
# ===============================================================================
inputpath = "//home/timom/git/DeepBeliefBird/SongFeatures/Motifs/1189"

songs, fs, filenames = bsu.readSongs(inputpath)

savednamed = "512_12_1189_concat"
test_data, invD, mu, sigma, triF = pp.main(
    songs, fs, hpFreq=250, nfft=512, hopfactor=2, filterbank=True, numCoeffs=12, DCT=True
)

print "time slizes: %i || input dimensions: %i || window size:%i" % (
    test_data.shape[0],
    test_data.shape[1],
    (invD.shape[0] - 1) * 2,
)

oldspec = np.dot(invD, ((test_data * sigma) + mu).T)
pl.figure()
pl.imshow(oldspec, origin="lower", aspect="auto")
pl.show()


batchdata = np.asarray(test_data, dtype=theano.config.floatX)
Example #24
def main():
    xTrain, yTrain, xTest, yTest = preprocessing.main()
    yHat = xgboost(xTrain, yTrain, xTest, yTest)
    fpr, tpr, _ = roc_curve(yTest, yHat)
    roc_auc = auc(fpr, tpr)
    print("AUC for XGBoost", roc_auc)
Example #25
def main():
    """
    Main function
    :return:
    """
    # set loggers
    set_logger(stream_level="error", file_level="info", log_filename="file1.log")
    preprocessing.set_logger(stream_level="error", file_level="info", log_filename="file2.log")

    try:
        _, ELS_DOMAIN, SQL_DOMAIN, WARC_FILE = sys.argv
    except Exception as e:
        print('Usage: python elasticsearch.py ELS_DOMAIN SQL_DOMAIN WARC_FILE')
        sys.exit(0)

    # for each word in each document find the potential candidates by using elastic search.
    # For each candidate query trident KB and keep only the english abstracts from the results
    for warc_id, document_results in preprocessing.main(WARC_FILE):
        logger.info("============  DOCUMENT  ==============")
        for doc_entity in document_results:
            logger.debug("===============  Elastic search ==================")
            logger.debug("Candidates for [{}]".format(doc_entity))
            candidates = find_candidates(ELS_DOMAIN, doc_entity)
            log_candidates(candidates, "debug")
            logger.debug("================End of ES -- Start of Trident=================")
            for candidate in candidates:
                logger.debug("QUERY Trident for candidate: {} with id: {}".format(candidate.name, candidate.freebase_id))
                trident_response = get_kb_info_by_candidate(SQL_DOMAIN, candidate.freebase_id)
                #logger.info(json.dumps(trident_response, indent=2))
                # extract only English abstract
                candidate.kb_abstract = get_only_english_abstract_from_json(trident_response)
                logger.debug("Abstract from trident: {}\n".format(candidate.kb_abstract))
            logger.debug("===============  END of Trident ==================")
            candidates = remove_candidates_without_abstracts(candidates)
            # if candidates not found (or removed) move to the next word
            if not candidates:
                continue
            logger.info("===============  Candidates ==================")
            # initialise the best candidate
            candidate_with_best_score = candidates[0]
            for candidate in candidates:
                # concatenate the english abstract of one candidate
                abstract = " ".join(candidate.kb_abstract)
                # extract the nouns from the abstract
                candidate.kb_nouns = preprocessing.extract_nouns_from_text(abstract)
                candidate.similarity_score = similarity_measure(document_results, candidate.kb_nouns)
                logger.info("Candidate_id: {},   label: {},   Abstract:  \n{}\n\n Nouns: {}\n\n Score: {}\n\n\n".format(
                    candidate.freebase_id,
                    candidate.freebase_label,
                    candidate.kb_abstract,
                    candidate.kb_nouns,
                    candidate.similarity_score))
                # check the best score from candidates
                if candidate.similarity_score > candidate_with_best_score.similarity_score:
                    # change best candidate
                    candidate_with_best_score = candidate

            logger.info(" -------------   Candidate with BEST score for {} -------------  ".format(doc_entity))
            logger.info("Candidate_id: {},   label: {},   Abstract:  \n{}\n\n Nouns: {}\n\n Score: {}\n\n\n".format(
                candidate_with_best_score.freebase_id,
                candidate_with_best_score.freebase_label,
                candidate_with_best_score.kb_abstract,
                candidate_with_best_score.kb_nouns,
                candidate_with_best_score.similarity_score))

            # if the candidate has a similarity score lower than 0.2, it is considered an Unlinkable Mention Entity;
            # after many experiments we concluded that results with such a low score are false positives
            if candidate_with_best_score.similarity_score < THRESHOLD_FOR_UNLINKABLE_MENTION:
                continue

            print "{}\t{}\t{}".format(warc_id, doc_entity, candidate_with_best_score.freebase_id)
Example #26
def setup():
    preprocessing.main()
    build_datasets.build_datasets(reduced=True)
    build_datasets.build_datasets(reduced=False)
    document.main()
Example #27
import preprocessing
import vsm_similarity
import token_matching
import stack_trace
import semantic_similarity
import fixed_bug_reports
import evaluation

print('Parsing & Preprocessing...')
preprocessing.main()

print('Token Matching...')
token_matching.main()

print('VSM Similarity...')
vsm_similarity.main()

print('Stack Trace...')
stack_trace.main()

print('Semantic Similarity...')
semantic_similarity.main()

print('Fixed Bug Reports...')
fixed_bug_reports.main()

print('Evaluating...')
evaluation.main()
Example #28
def main():
    # for year in range(2003, 2004):
        preprocessing.main(all=True)
        k_means.main(all=True)
        wordcloudy.main(all=True)
Example #29
    def run(self):
        hists = {symbol: self.get_bars(symbol) for symbol in self.symbols.keys()}
        hists = {symbol: temp.result() for symbol, temp in hists.items()}
        filenames = []
        for symbol, hist in hists.items():
            if hist is None:
                filenames.append(self.save_data(symbol))
                del self.symbols[symbol]
                continue
            old_hist = self.symbols[symbol]['hist']
            self.symbols[symbol]['hist'] = pd.concat([old_hist, hist[[idx not in old_hist for idx in hist.index]]])

        # image_files = [draw_chart(symbol, values['hist']) for symbol, values in self.symbols.items()]

        # labels, probs = self.predict(image_files)
        data = []
        prices = []
        for symbol, hist in hists.items():
            if hist is not None:
                hist['Local time'] = [idx.isoformat().replace('T', ' ') for idx in hist.index]
                df = main(hist)
                for c in ['Unnamed: 0', 'Date', 'Class']:
                    if c in df.columns:
                        del df[c]

                df['Minute'] = df['Minute'].apply(get_time)

                actual_x, _ = df.iloc[-self.lookback:].values, df.iloc[-self.predict_size:]['Close'].values
                norm_x, _ = normalize(df.iloc[-self.lookback:], df.iloc[-self.predict_size:]['Close'])
                data.append(norm_x)
                prices.append(actual_x)

        data = np.array(data)
        preds = self.model_tf.predict(data)

        for symbol, pred, current, price in zip(self.symbols.keys(), preds, data, prices):
            print(symbol, pred)
            self.place_order(symbol, pred, current[0], price)

        if sum([True if values['entered_trade'] else False for symbol, values in self.symbols.items()]) == 0:
            self.delay = 30

        if self.run_update_symbols:
            self.update_symbols()

        now = datetime.datetime.now(tz)

        if now.hour >= 15 or now.hour < 9:
            if now.hour == 15 and now.minute > 50:
                self.delay = 60*60
                print(f'Market end. Selling all shares.')
                for order in self.wb.get_account()['positions']:
                    symbol = order['ticker']['symbol']
                    if self.symbols[symbol]['entered_trade']:
                        order = self.wb.place_order(stock=symbol, action='SELL', orderType='MKT', enforce='DAY', quant=1)
                        print(f"SOLD: {symbol} --- Order#: {order.get('orderId')} --- EOD")
                        self.symbols[symbol]['entered_trade'] = False
            elif now.hour == 15 and now.minute > 29:
                self.delay = 5
                print(f'Market is about to close for the day. Making delay {self.delay} seconds.')

            elif now.hour == 8:
                self.delay = (now.replace(hour=9, minute=20, second=0) + datetime.timedelta(hours=24) - now).seconds
                print(f'Market is closed for the day. Sleeping for {self.delay/60/60} hours.')
            else:
                self.delay = 60*60
                print(f'Market is closed for the day. Sleeping for {1} hour.')

        self.s.enter(self.delay, 1, self.run)