def createBigramTrigram(infile, outfile, type):
    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # create bigrams or trigrams for each tweet
    plt = []
    for tweet in items:
        if type == 'b':
            plt.append(bigramas(tweet['text']))
        else:
            plt.append(trigramas(tweet['text']))

    # flatten all n-gram lists into a single string
    palavras = ''
    for i in range(len(plt)):
        for j in range(len(plt[i])):
            palavras += plt[i][j].replace('\n', ' ').replace('\t', '') + ' '

    # count the occurrences of each n-gram, skipping empty tokens
    count = {}
    for word in palavras.split(" "):
        if len(word) == 0:
            continue
        if word not in count:
            count[word] = 0
        count[word] += 1

    # sort by frequency, descending, and write the result
    l = sorted(count.items(), key=lambda x: -x[1])
    write_file(infile, outfile, l)
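# The bigramas/trigramas helpers and write_file are project functions that are not
# defined in this snippet. Below is a minimal, hypothetical sketch of the n-gram
# helpers, assuming each returns a list of n-gram strings whose tokens are joined
# with '_' so that the space-split counting loop above keeps every n-gram intact;
# the real helpers may differ.
def _ngrams(text, n):
    # slide a window of n tokens over the tweet and join each window with '_'
    tokens = text.split()
    return ['_'.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def bigramas(text):
    return _ngrams(text, 2)


def trigramas(text):
    return _ngrams(text, 3)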
def createDict(infile):
    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # count tweets per user; defaultdict(int) makes the manual key check unnecessary
    dic = collections.defaultdict(int)
    for tweet in items:
        dic[tweet['user_name']] += 1

    # sort the (user, count) pairs by count, descending
    list_x = sorted(dic.items(), key=lambda kv: kv[1], reverse=True)
    return list_x
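# Example usage (sketch; 'tweets.json' is a hypothetical input file): print the
# ten most active authors.
top_users = createDict('tweets.json')
for user_name, tweet_count in top_users[:10]:
    print(user_name, tweet_count)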
def sanitize(infile, outfile, stopwords, emoji, rt):
    # initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(stopwords)

    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # remove stopwords and emoji from tweets
    for tweet in items:
        tweet['text'] = cleaner.standardize_quotes(tweet['text'])
        tweet['text'] = cleaner.clean_apostrophe_s(tweet['text'])
        tweet['text'] = cleaner.remove_urls(tweet['text'])
        tweet['text'] = cleaner.remove_symbols(tweet['text'])
        tweet['text'] = cleaner.remove_stopwords(tweet['text'], stopwords)
        if not emoji:
            tweet['text'] = cleaner.remove_emoji(tweet['text'])
        if rt:
            cleaner.remove_rts(items, tweet)

    write_file(infile, outfile, items)
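# Example call (sketch; the file names are hypothetical, the stopword paths mirror
# the ones used in report() below): keep emoji, drop retweets.
sanitize('tweets_raw.json', 'tweets_clean.json',
         ['stopwords/stopwords_en.txt', 'stopwords/stopwords_pt-br.txt'],
         emoji=True, rt=True)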
def load_pays():
    load = Loader(
        "https://simplonline-v3-prod.s3.eu-west-3.amazonaws.com/media/file/csv/25d9c746-3622-4c48-835e-d7ccafa311f5.csv",
        "../datas/RAW/"
    )
    csv_path = load.ensure_data_loaded()

    pec.clean_csv('../datas/RAW/' + csv_path, "../datas/CURATED/pays_en_chiffre.csv")
    pec.jsonify_csv(
        "../datas/CURATED/pays_en_chiffre.csv",
        "../datas/CURATED/pays_en_chiffre.json"
    )

    with open("../datas/CURATED/pays_en_chiffre.json") as f:
        pays = json.load(f)

    # drop() is a no-op if the collection does not exist yet; truth-testing a
    # PyMongo Collection raises an error, so no existence check is needed
    mongo.db.pays.drop()
    db_pays = mongo.db["pays"]
    db_pays.insert_many(pays)

    return jsonify(etat="success")
from utils.constants import VILLE_NAME
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
from modules.saver import Saver
import datetime as dt
import pandas as pd

save_path = 'D:\\Users\\Yuan.ZHANG\\PycharmProjects\\compa0516\\data_save'

CURRENT_TIME_AP = '2018-05-15'
CURRENT_TIME_INT = '2018_05_15'
Intfilename_lst = [
    "BDDExportInterventions-{} du 01_01_2013 au 15_05_2018.xlsx".format(CURRENT_TIME_INT)
]

loader = Loader(datadir="D:\\Users\\Yuan.ZHANG\\PycharmProjects\\data")
saver = Saver()
cleaner = Cleaner()
analyzer = Analyzer()

# # standardize the format of the dataframe
# for ville in VILLE_NAME:
#     # rename the dataframe, remove redundant info and save
#     data_Arm = loader.load_ArmPL(foldername=ville, filename="BDDExport_ArmoireBt_{}_{}.xlsx".format(ville, CURRENT_TIME_AP), NAME_LIST=Armoire_NAME)
#     data_PL = loader.load_ArmPL(foldername=ville, filename="BDDExport_PointLumineux_{}_{}.xlsx".format(ville, CURRENT_TIME_AP), NAME_LIST=PL_NAME)
#     data_Int = loader.load_Intervention(foldername=ville, filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
#
#     data_Arm = cleaner.rv_dupRow(data_Arm)
#     data_Ar = cleaner.rep_dur(data_Arm, Var_lst=Armoire_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))
#     data_PL = cleaner.rv_dupRow(data_PL)
from utils.constants import Armoire_NAME, Armoire_ARM_CAT, Armoire_DEPART_CAT, Armoire_TIME, Armoire_ARM_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(
    datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

# load the data
data_Ar = loader.load_ArmPL(
    filename="BDDExport_ArmoireBt_NOUMEA_2018-05-15.xlsx",
    NAME_LIST=Armoire_NAME)

# remove the duplicated rows and replace the date with the duration
data_Ar = cleaner.rv_dupRow(data_Ar)
data_Ar = cleaner.rep_dur(data_Ar, Var_lst=Armoire_TIME,
                          currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

# generate the count of NaN values for all the variables
analyser.gen_NAN_excel(data_Ar.iloc[:, 0:43], 'Armoire_arm', 'Armoire_arm_or')
analyser.gen_NAN_excel(data_Ar.iloc[:, 43:], 'Armoire_depart', 'Armoire_depart_or')

# pick the variables and regroup
data_Ar_arm = analyser.pick_Var(data=data_Ar, Var_lst=Armoire_ARM_CAT + Armoire_ARM_DIST)
with open('body_face_sample.pickle', 'rb') as f:
    body_face_samples: {str: Sample} = pickle.load(f)

with open('car_sample.pickle', 'rb') as f:
    car_samples: {str: Sample} = pickle.load(f)

# merge the two sample dictionaries and shuffle the keys
custom_samples = body_face_samples.copy()
custom_samples.update(car_samples)
keys = list(custom_samples.keys())
random.shuffle(keys)
custom_samples = {key: custom_samples[key] for key in keys}

settings = ProjectSettings("settings.yaml")

# Load the label mapping.
loader = Loader()
loader.load_labels(settings.LABELS_FILE)

body_face_labels = [
    '/m/04yx4', '/m/03bt1vf', '/m/01g317', '/m/05r655', '/m/01bl7v',
    '/m/0dzct', '/m/04hgtk'
]
car_labels = ['/m/01prls']

for key, value in custom_samples.items():
    labelled_image = value.get_visualized_image_custom_label(
        label_map_function=loader.get_label,
        custom_label=car_labels + body_face_labels)
    cv2.imwrite(
        ProjectSettings.instance().CUSTOM_LABELLED_DIRECTORY + key + '.jpg',
from utils.constants import PL_NAME, PL_TIME, PL_PL_CAT, PL_PL_DIST, PL_LAN_CAT, PL_LAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

# load the data, remove duplicated rows and replace the dates with durations
data_PL = loader.load_ArmPL(filename="BDDExport_PointLumineux_NOUMEA_2018-05-15.xlsx", NAME_LIST=PL_NAME)
data_PL = cleaner.rv_dupRow(data_PL)
data_PL = cleaner.rep_dur(data_PL, Var_lst=PL_TIME, currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

# pick the "point lumineux" and "lanterne" variable groups
data_PL_PL = analyser.pick_Var(data=data_PL, Var_lst=PL_PL_CAT + PL_PL_DIST)
data_PL_LAN = analyser.pick_Var(data=data_PL, Var_lst=PL_LAN_CAT + PL_LAN_DIST)
data_PL_PL = cleaner.rv_dupRow(data_PL_PL)

# NaN counts, histograms/pies and distributions for both variable groups
analyser.gen_NAN_excel(data_PL.iloc[:, 0:60], 'PL_PL', 'PL_PL_or')
analyser.gen_NAN_excel(data_PL.iloc[:, 60:], 'PL_LAN', 'PL_LAN_or')
analyser.gen_histogram_Pie(data_PL_PL, 'PL_PL', Var_lst=PL_PL_CAT)
analyser.gen_histogram_Pie(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_CAT)
analyser.gen_Dist(data_PL, 'PL_PL', Var_lst=PL_PL_DIST)
analyser.gen_Dist(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_DIST)
analyser.gen_NAN_excel(data_PL_PL, 'PL_PL', 'PL_PL')
analyser.gen_NAN_excel(data_PL_LAN, 'PL_LAN', 'PL_LAN')
from modules.analyser import Analyzer
from modules.loader import Loader
import numpy as np

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(date_str)

# ArmInt_cluster = loader.load_excel(filename='ArmInt_cluster', foldername='Cluster')
# ArmInt_cluster.drop(['PanneDelai_1'], axis=1, inplace=True)
# feature_names = np.array(list(ArmInt_cluster.columns))
#
# clf = loader.load_pickle('Randomforest_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_, featurenames=feature_names, title='Randomforest_featureimportance_Armoire', top_n=40)
#
# clf = loader.load_pickle('GradientBoosting_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_, featurenames=feature_names, title='GradientBoosting_featureimportance_Armoire', top_n=40)

PL_cluster = loader.load_excel(filename='PL_cluster', foldername='Cluster')
PL_cluster.drop(['PanneDelai_1'], axis=1, inplace=True)
feature_names = np.array(list(PL_cluster.columns))

clf = loader.load_pickle('Randomforest_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='Randomforest_featureimportance_PL', top_n=40)

clf = loader.load_pickle('GradientBoosting_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='GradientBoosting_featureimportance_PL', top_n=40)
from modules.preprocessor import Processor
import datetime as dt
import seaborn as sns
import os
import pandas as pd
from utils.constants import Armoire_MERGE, Int_MERGE, PL_MERGE
import sklearn.feature_selection
from sklearn.feature_selection import f_regression, mutual_info_regression
from scipy.cluster.hierarchy import linkage
# The following imports are missing from the original snippet. The module paths for
# Analyzer, Cleaner, Loader, Saver and Plotter follow the rest of the project; those
# for Cluster and Modeler are assumptions and may need adjusting.
from modules.analyser import Analyzer
from modules.cleaner import Cleaner
from modules.loader import Loader
from modules.saver import Saver
from modules.plotter import Plotter
from modules.cluster import Cluster
from modules.modeler import Modeler

# the date of saving the data
date_str = '0723'
analyzer = Analyzer(datestr=date_str)
cleaner = Cleaner()
loader = Loader(datestr=date_str)
saver = Saver(datestr=date_str)
processor = Processor(datestr=date_str)
plotter = Plotter(datestr=date_str)
cluster = Cluster(datestr=date_str)
modeler = Modeler(datestr=date_str)

data_downloadtime = dt.datetime(2018, 5, 15, 0, 0, 0, 0)
data_starttime = dt.datetime(2013, 1, 1, 0, 0, 0, 0)
day_difference = (data_downloadtime - data_starttime).days

CURRENT_TIME_AP = '2018-05-15'
CURRENT_TIME_INT = '2018_05_15'
Intfilename_lst = ["BDDExportInterventions-{} du 01_01_2013 au 15_05_2018.xlsx".format(CURRENT_TIME_INT)]

"""Attention: for this project, I dump the data of BOGOR
import sys
import json
import numpy as np
from modules.loader import Loader
# NOTE: EarlyStop's module is an assumption; adjust the import to wherever the
# callback actually lives in the project.
from modules.neural import Neural, EarlyStop

# Configuration
with open('./config.json') as configfile:
    config = json.load(configfile)

data_dir = config["data_dir"]
neural_network_config = config["neural-network"]
reload_weights = neural_network_config["reload_weights"]
weight_dir = neural_network_config["weight_dir"]

if len(sys.argv) > 1:
    epochs = int(sys.argv[1])
else:
    epochs = neural_network_config["epochs"]

# Utility classes
loader = Loader(neural_network_config)

# Load the features and labels
X = np.load(data_dir + "/features.npy")
Y = np.load(data_dir + "/labels.npy")

# Training data: split, then drop the first column of the last axis
x_train, y_train, x_test, y_test = loader.load(X, Y)
x_train, y_train, x_test, y_test = x_train[:, :, 1:], y_train[:, :, 1:], x_test[:, :, 1:], y_test[:, :, 1:]

input_dim = x_train.shape[2]
neural = Neural(input_dim, neural_network_config)
neural.set_callbacks(EarlyStop(5))

if reload_weights:
    neural.load(weight_dir + "/weights")
help="How many do we want to visualize?") return parser.parse_args() args = get_args() set_index = args.set_index sample_count = args.sample_count if __name__ == "__main__": # Load the project settings and required modules. Logger.log_special("Running Sample Loader", with_gap=True) settings = ProjectSettings("settings.yaml") # Load the label mapping. loader = Loader() loader.load_labels(settings.LABELS_FILE) Logger.log_field("Labels Loaded", len(loader.label_map)) # Load the samples from the set that we want. samples = Loader.load_sample_set(set_index) loaded_samples = [ s for s in samples if (s.is_locally_loaded and len(s.detect_regions) > 0) ] # How many samples loaded? n_loaded_samples = len(loaded_samples) Logger.log_field("Samples with Images", n_loaded_samples) if n_loaded_samples == 0: raise Exception(
from utils.constants import Int_NAME, Int_TIME, Int_INT_CAT, Int_INT_DIST, Int_PAN_CAT, Int_PAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

# load the interventions, remove duplicated rows and replace the dates with durations
Intfilename_lst = ["BDDExportInterventions-2018_05_15 du 01_01_2013 au 15_05_2018.xlsx"]
data_Int = loader.load_Intervention(filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
data_Int = cleaner.rv_dupRow(data_Int)
data_Int = cleaner.rep_dur(data_Int, Var_lst=Int_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))

# pick the "panne" and "intervention" variable groups
data_Int_PAN = analyser.pick_Var(data=data_Int, Var_lst=Int_PAN_CAT + Int_PAN_DIST + ['pan_Code'])
data_Int_PAN = cleaner.rv_dupRow(data=data_Int_PAN, Var_lst=['pan_Code'])
data_Int_INT = analyser.pick_Var(data=data_Int, Var_lst=Int_INT_CAT + Int_INT_DIST)
data_Int_INT = cleaner.rv_dupRow(data_Int_INT)

# NaN counts, histograms/pies and distributions for both variable groups
analyser.gen_NAN_excel(data_Int.iloc[:, 0:23], 'Intervention_int', 'Intervention_int_or')
analyser.gen_NAN_excel(data_Int.iloc[:, 23:], 'Intervention_pan', 'Intervention_pan_or')
analyser.gen_histogram_Pie(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_CAT)
analyser.gen_histogram_Pie(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_CAT)
analyser.gen_Dist(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_DIST)
analyser.gen_Dist(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_DIST)
analyser.gen_NAN_excel(data_Int_INT, 'Intervention_int', 'Intervention_int')
# Load the project settings and required modules.
Logger.log_special("Running Sample Loader", with_gap=True)
settings = ProjectSettings("settings.yaml")

set_path = os.path.join(settings.SAMPLES_DIRECTORY, f"sample_set_{set_index}.json")
if not os.path.exists(set_path):
    Logger.log_field(
        "Error",
        "No file found at {}. Have you created the samples using cmd_create_samples yet?".format(set_path))
    exit(1)

Logger.log_special("Begin Sample Image Download", with_gap=True)
samples = Loader.load_sample_set_from_file(set_path)
unloaded_samples = [s for s in samples if not s.is_locally_loaded]
n_unloaded_samples = len(unloaded_samples)
n_samples = len(samples)
Logger.log_field("Samples Loaded",
                 "{}/{}".format(n_samples - n_unloaded_samples, n_samples))

i = 0
for sample in unloaded_samples:
    while True:
        if threading.active_count() <= max_threads:
            thread = threading.Thread(target=sample.load)
            thread.start()
            break
        else:
from modules.preprocessing import Processor
from modules.loader import Loader
from modules.analyzer import Analyzer
from utils.constants import Var_NAME, STOP_LIST
from modules.cleaner import Cleaner
from modules.plotter import Plotter
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

savepath = "/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/Call0502/save_data/"

# ================== load the data ==================
loader = Loader()
Call_file = "Reporting Call Freshmile.xlsx"
raw_data = loader.load_Callfile(filename=Call_file, NAME_LIST=Var_NAME)

# ================ merge "problem" and "action" ====================
processor = Processor()
# raw_data = processor.merge_col(data, Var_lst=['Problem', 'Action'])

# ======== remove stop words, numbers, punctuation, operator; tokenize; stemming ========
cleaner = Cleaner()
data = cleaner.remove_digits_dataframe(raw_data, var='Problem')
data.to_excel(savepath + 'tp_rv_digits.xlsx')
data = cleaner.remove_punctuation_dataframe(data, var='Problem')
data.to_excel(savepath + 'tp_rv_punctuation.xlsx')
data = cleaner.remove_stop_words_dataframe(data, stopwords_to_add=STOP_LIST, var='Problem')
data.to_excel(savepath + 'tp_rv_stopwords.xlsx')
def train(labels_array, nb_epochs, nb_patience):
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D, AveragePooling2D, Activation, Dropout, Flatten, Dense
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.preprocessing import image
    import mlflow
    import mlflow.tensorflow

    # Download the ZIP archive
    from modules.loader import Loader
    loader = Loader(
        "https://stdatalake010.blob.core.windows.net/public/cifar-100.zip",
        '../datas/ZIP/',
        extraction_target='../datas/RAW/'
    )
    loader.ensure_data_loaded()

    # Extract the dataset (note: this overrides the labels_array argument)
    from modules.splitting import Splitting
    labels_array = ['apple', 'bee']
    TRAIN_DATA_DIR = Splitting.copie_dossiers(
        '../datas/RAW/train',
        labels_array,
        500,
        explorer=False
    )
    print(TRAIN_DATA_DIR)

    # Load the images
    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(validation_split=0.2)

    TRAIN_IMAGE_SIZE = 32
    TRAIN_BATCH_SIZE = 64

    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')

    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,  # same directory as the training data
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')

    with mlflow.start_run():
        model = Sequential()
        model.add(Conv2D(32, kernel_size=3, activation='elu', kernel_initializer='he_uniform',
                         padding='same', input_shape=(32, 32, 3)))

        # Always at the end
        model.add(Flatten())
        model.add(Dense(2, activation='softmax'))

        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        es_callback = EarlyStopping(monitor='val_loss', patience=nb_patience)

        training = model.fit(train_generator,
                             epochs=nb_epochs,
                             callbacks=[es_callback],
                             validation_data=validation_generator,
                             shuffle=False)
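# Example call (sketch): train for up to 10 epochs with early stopping after 3 epochs
# without val_loss improvement. Note that train() currently overrides the labels_array
# argument with ['apple', 'bee'] internally.
train(['apple', 'bee'], nb_epochs=10, nb_patience=3)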
def report(infile, outfile, displaycount):
    # initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(['stopwords/stopwords_en.txt', 'stopwords/stopwords_pt-br.txt'])

    # read file with loader module
    print('Reading file. This may take a while...')
    loader = Loader()
    items = loader.read_file(infile)
    print('File read successfully!\nProcessing the summary...')

    if 'text' not in items[0]:
        print("Warning: 'text' key is required.\nTerminating...")
        sys.exit(0)

    tweet_count = len(items)
    summary = "File name: " + infile + '\n'
    summary += "Tweet count: " + str(tweet_count) + "\n\n"

    # date range information, if available
    if 'created_at' in items[0]:
        date_upper = items[0]['created_at']
        date_lower = items[tweet_count - 1]['created_at']
        summary += "Most recent tweet: " + date_upper + "\n"
        summary += "Oldest tweet: " + date_lower + "\n"
    elif 'date' in items[0]:
        date_upper = items[0]['date']
        date_lower = items[tweet_count - 1]['date']
        summary += "Most recent tweet: " + date_upper + "\n"
        summary += "Oldest tweet: " + date_lower + "\n"
    else:
        summary += "Warning: 'created_at' or 'date' key does not exist. Date range information cannot be fetched."

    username_key = get_username_key(items[0])

    # top retweeted tweets, if retweet counts are available
    if 'retweets' in items[0]:
        summary += '\nTop retweeted tweets:\n'
        cont = 0
        for tweet in sorted(items, reverse=True, key=lambda i: i['retweets']):
            if 'RT @' not in tweet['text'] and cont < displaycount:
                summary += format_print_tweet(tweet, username_key)
                cont += 1
            if cont >= displaycount:
                break

    # clean the tweet text before counting words, users and hashtags
    word_list = []
    hashtag_list = []
    user_list = []
    for tweet in items:
        tweet['text'] = cleaner.standardize_quotes(tweet['text'])
        tweet['text'] = cleaner.clean_apostrophe_s(tweet['text'])
        tweet['text'] = cleaner.remove_urls(tweet['text'])
        tweet['text'] = cleaner.remove_symbols(tweet['text'])
        tweet['text'] = cleaner.remove_stopwords(tweet['text'], stopwords)
        tweet['text'] = cleaner.remove_emoji(tweet['text'])
        tweet['text'] = tweet['text'].lower()

    for tweet in items:
        # print(re.findall(r'#\w+', tweet['text']))
        hashtag_list += re.findall(r'#\w+', tweet['text'])
        user_list += re.findall(r'@\w+', tweet['text'])
        word_list += re.findall(r'\b\w+', tweet['text'])

    # count occurrences of each word, user and hashtag
    word_dict = {}
    hashtag_dict = {}
    user_dict = {}
    for hashtag in hashtag_list:
        hashtag_dict[hashtag] = hashtag_dict.get(hashtag, 0) + 1
    for user in user_list:
        user_dict[user] = user_dict.get(user, 0) + 1
    for word in word_list:
        word_dict[word] = word_dict.get(word, 0) + 1

    summary += '\n\nWord ranking:\n\n'
    count = 0
    for key, value in sorted(list(word_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
            count += 1

    summary += '\nUser ranking:\n\n'
    count = 0
    for key, value in sorted(list(user_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
            count += 1

    summary += '\nHashtag ranking:\n\n'
    count = 0
    for key, value in sorted(list(hashtag_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
            count += 1

    with open(outfile, 'w', encoding='utf8') as f:
        f.write(summary)
    print('Successfully wrote file to ' + outfile + '!')
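# Example call (sketch; file names are hypothetical): summarize the 20 most frequent
# words, users and hashtags of a tweet dump and write the report to report.txt.
report('tweets_clean.json', 'report.txt', 20)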
# Check the MUL CAT vars and NL vars, find proper methods for processing and generate related constants
from modules.analyser import Analyzer
from modules.loader import Loader
from modules.saver import Saver
from modules.preprocessor import Processor
from utils.constants import VILLE_NAME, Armoire_PICK, Int_PICK, PL_PICK
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(datestr=date_str)
saver = Saver(datestr=date_str)
processor = Processor(datestr=date_str)

"""
MUL CAT vars
    PL: lampe_Type
    INT: pan_Solde, int_Solde, int_ElemDefaut, int_TypeTnt, int_TypeEqt, pan_TypeEqt, pan_Defaut, int_Defaut
NL vars
"""

## lampe_Type: {}
## int_ElemDefaut: {'cover': ['Crosse', 'Vasque', 'Enveloppe exterieure', 'Support', 'Coffret'],
##                  'electricity': ['Armorceur', 'Platine', 'Lampe', 'Câbles', 'Appareillage', 'Ballast', 'Protection électrique'],
##                  'else': ['NA', 'Luminaire', 'Armoire départ', 'Horloge', 'Alimentation générale']}
# This is the maximum number of samples that a single 'set' will contain.
MAX_SAMPLE_SET_SIZE = 5000

# Remote URLs
REMOTE_IMAGE_URL_FILE = "https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train-images" \
                        "-boxable.csv"
REMOTE_GROUND_TRUTH_FILE = "https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train" \
                           "-annotations-bbox.csv"

if __name__ == "__main__":

    # Load the project settings and required modules.
    Logger.log_special("Running Sample Creator", with_gap=True)
    settings = ProjectSettings("settings.yaml")
    loader: Loader = Loader()

    # Read in the source data, and create our own sample data.
    Logger.log_special("Begin Sample Initialization", with_gap=True)
    loader.check_and_load(settings.IMAGE_URL_FILE, REMOTE_IMAGE_URL_FILE)
    samples = loader.create_samples(settings.IMAGE_URL_FILE)

    # Now that we have sample IDs and URLs, we can associate them with the GT annotations.
    Logger.log_special("Begin Sample Association", with_gap=True)
    loader.check_and_load(settings.GROUND_TRUTH_FILE, REMOTE_GROUND_TRUTH_FILE)
    loader.associate_boxes_with_samples(samples, settings.GROUND_TRUTH_FILE)

    # Export the created samples, MAX_SAMPLE_SET_SIZE per set.
    Logger.log_special("Begin Sample Export", with_gap=True)
    pather.create(settings.SAMPLES_DIRECTORY)
    loader.export_samples(samples, path=settings.SAMPLES_DIRECTORY, size=MAX_SAMPLE_SET_SIZE)
body_labels = ['/m/04yx4', '/m/03bt1vf', '/m/01g317', '/m/05r655', '/m/01bl7v']

# face_labels = ['HUMAN FACE', 'HUMAN HEAD']
face_labels = ['/m/0dzct', '/m/04hgtk']

# car_labels = ['Land vehicle']
car_labels = ['/m/01prls']

if __name__ == "__main__":

    # Load the project settings and required modules.
    Logger.log_special("Running Sample Analysis", with_gap=True)
    settings = ProjectSettings("settings.yaml")

    # Load the class labels.
    loader = Loader()
    loader.load_labels(settings.LABELS_FILE)

    # Get ALL of the samples in the directory.
    samples = []
    sample_files = os.listdir(settings.SAMPLES_DIRECTORY)
    for i in sample_files[:20]:
        file_path = os.path.join(settings.SAMPLES_DIRECTORY, i)
        samples += Loader.load_sample_set_from_file(file_path)

    class_instances = {}
    class_appearances = {}
    for key in loader.label_map:
        class_instances[key] = 0
        class_appearances[key] = 0
import math
import json
import numpy as np

from modules.loader import Loader
from modules.neural import Neural

# Configuration
with open('./config.json') as configfile:
    config = json.load(configfile)

data_dir = config["data_dir"]
neural_network_config = config["neural-network"]
weight_dir = neural_network_config["weight_dir"]

# Utility classes
loader = Loader(neural_network_config)


def to_python_list(array):
    # flatten to 1-D and convert NumPy scalars to plain Python values, mapping NaN to None
    array = np.reshape(array, (-1))
    return [
        np.asscalar(a) if not math.isnan(np.asscalar(a)) else None
        for a in np.array(array)
    ]


class Predicter:
    def __init__(self, config):
        self.time_frame_size = config["time_frame_size"]

    def predict(self, dates):