Example #1
def download_binary(self, binary_name, download_url):
    if not os.path.exists(self.binary_path):
        utilities.mkdir(self.binary_path)
    binary_file_path = self.get_required_binary_path(binary_name)
    utilities.log_info("Download url: %s" % download_url)
    with open(binary_file_path, 'wb') as file:
        response = requests.get(download_url, stream=True)
        total = response.headers.get('content-length')
        # A missing or tiny content-length usually means the requested
        # binary version does not exist on the server.
        if total is None or int(total) < 100000:
            utilities.log_error(
                "Download binary %s failed, please check the existence of the binary version %s"
                % (binary_name, self.version))
            return False
        utilities.log_info(
            "* Download %s from %s\n* size: %fMB, dst_path: %s" %
            (binary_name, download_url, float(total) / float(1000000),
             binary_file_path))
        downloaded = 0
        total = int(total)
        # Stream the download in chunks and draw a 50-character progress bar.
        for data in response.iter_content(
                chunk_size=max(int(total / 1000), 1024 * 1024)):
            downloaded += len(data)
            file.write(data)
            done = int(50 * downloaded / total)
            utilities.log_info("Download percent: %d%%" %
                               (downloaded / total * 100))
            sys.stdout.write('\r[{}{}]'.format('█' * done,
                                               '.' * (50 - done)))
            sys.stdout.flush()
    sys.stdout.write('\n')
    utilities.log_info("* Download %s from %s success" %
                       (binary_name, download_url))
    return True
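Every example on this page calls a project-local mkdir helper (utilities.mkdir above, u.mkdir below) whose implementation is not shown in these snippets. A minimal sketch, assuming it simply wraps os.makedirs and returns the path (column_analysis below uses the return value as a path prefix):

import os

def mkdir(path):
    # Create the directory (and any missing parents) if it does not
    # already exist; return the path so callers can use it as a prefix.
    os.makedirs(path, exist_ok=True)
    return path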
Example #2
def main():
    # load and transform data to lda format
    samples = preprocessing.load_dataset_from_disk('dataset/the_thao/2',
                                                   remove_tags=True)
    lda_data, vocab = preprocessing.build_lda_data(samples)

    # Model output folder and `tops` parameter
    model_folder = 'models/ml-ope'
    tops = 10

    # Create model folder if it doesn't exist
    utilities.mkdir('models')
    utilities.mkdir(model_folder)

    # Build settings
    settings = build_setting(len(lda_data), len(vocab))

    # Run the ML-OPE algorithm
    runmlope = run_ML_OPE.runMLOPE(lda_data, settings, model_folder, tops)
    theta, beta = runmlope.run()

    duplicate_topics = get_duplicate_topics(beta, topn=20)

    new_theta = rebuild_theta(theta, duplicate_topics)

    unique_idx, unique_samples = remove_duplicate(new_theta, samples)
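The snippet does not show how main() is invoked; a standard entry-point guard would run the pipeline when the file is executed directly:

if __name__ == '__main__':
    main()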
def sim_mat_dist(**kwargs):
	'''
	Saves the similarity matrix to a CSV file with the median and max of
	each row appended as the last two columns, then plots histograms and
	box plots of the per-topic median and maximum cosine similarity.
	'''
	data = np.load(kwargs.get("file", "sim_mat.npz"))
	labels = data['labels']
	sim_mat = data['sim_mat']
	sim_mat = np.round(sim_mat, decimals=6)
	medians = np.median(sim_mat, axis=1)
	maxs = []
	sim_mat[sim_mat > 1] = 1  # clamp numerical noise above 1
	sim_mat_sorted = np.copy(sim_mat)
	# find the second-highest value in each row, since the max
	# (self-similarity) is always 1
	for row in sim_mat_sorted:
		row.sort()
		maxs.append(row[-2])
	
	print "saving sim_mat in stats_data/sim_mat/"+c.query_name+".csv"
	u.mkdir("stats_data/sim_mat/")
	f= open("stats_data/sim_mat/"+c.query_name+"_sim_mat.csv","w")
	f.write(","+",".join(labels)+",median"+","+"max"+"\n")
	for i,med in enumerate(medians):
		f.write(labels[i]+","+",".join(map("{:.6f}".format,sim_mat[i]))+","+str(medians[i])+","+str(maxs[i])+"\n")

	u.box_plot(maxs, "stats_data/sim_mat/box_plot_" + c.query_name + "_maxcosim.png")
	u.hist_plot(maxs, "stats_data/sim_mat/hist_plot_" + c.query_name + "_maxcosim.png",
	            xlabel='Maximum cosine similarity', ylabel='Number of topics', xticks=0)

	u.box_plot(medians, "stats_data/sim_mat/box_plot_" + c.query_name + "_median.png")
	u.hist_plot(medians, "stats_data/sim_mat/hist_plot_" + c.query_name + "_median.png",
	            xlabel='Median cosine similarity', ylabel='Number of topics', xticks=0)
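sim_mat_dist() expects an .npz archive holding two arrays under the keys 'labels' and 'sim_mat'. A minimal sketch of producing such a file, with made-up toy values:

import numpy as np

labels = np.array(["topic_0", "topic_1", "topic_2"])
sim_mat = np.array([[1.0, 0.4, 0.2],
                    [0.4, 1.0, 0.5],
                    [0.2, 0.5, 1.0]])
# np.savez appends the .npz extension automatically
np.savez("sim_mat", labels=labels, sim_mat=sim_mat)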
def column_analysis(**kwargs):
	# Box and distribution plots of the inter-topic median cosine
	# similarity, one pair of plots per domain.
	fpath = kwargs.get("file")
	path = u.mkdir("./stats_data/desc/")
	df = pd.read_csv(fpath)
	for domain in c.domains:
		df2 = df[df.domain == domain]
		u.box_plot(df2["med.cosim.inter"], path + "box_med.cosim.inter_" + domain + ".png")
		u.dist_plot(df2["med.cosim.inter"], path + "dist_med.cosim.inter_" + domain + ".png")
def desc_stat_data(**kwargs):
	for fpath in kwargs.get("files"):
		# path to where the files will be saved
		path = u.mkdir("./stats_data/desc/")
		df = pd.read_csv(fpath)
		r = re.compile(r'/(.*?)\.csv')
		fname = r.search(fpath).group(1).split("-")[0]
		# correlation between columns of stat_all
def create_domain_df():
	'''
	Builds one data frame per domain, saves each to CSV, then merges
	them all with the inter-network data into stats_data/all_data.csv.
	'''
	start_time = time.time()
	print("Creating dataframe")
	u.mkdir("./stats_data")
	stat_data = pd.DataFrame()
	for d in c.domains:
		df = collect_data(d)
		print(d, df.shape)
		df.to_csv('stats_data/' + d + '_data.csv', sep=',', index=False)
		stat_data = pd.concat([stat_data, df], ignore_index=True)

	df = collect_network_inter()
	stat_data = pd.merge(stat_data, df, how="inner")
	stat_data.to_csv('stats_data/all_data.csv', sep=',', index=False)
	print("--- time for dataframe generation " + str((time.time() - start_time) / 60) + " minutes ---")
def reg_analysis(**kwargs):
	for fpath in kwargs.get("files"):
		# path to where the files will be saved
		path = u.mkdir("./stats_data/desc/")
		df = pd.read_csv(fpath)
		r = re.compile(r'/(.*?)\.csv')
		fname = r.search(fpath).group(1).split("-")[0]
		fname = fname[:3] + "." + fname[3:]
		# every column except the target and the id is a feature
		cols = [col for col in df.columns if col != fname and col != 'id']
		print("Running Regression Analysis for", fname)
		y = np.array(df[fname])
		X = np.array(df[cols])
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
		regr = RandomForestRegressor(max_depth=2, random_state=0)
		regr.fit(X_train, y_train)
		y_pred = regr.predict(X_test)
		# rank features by importance, ascending; zip() must be
		# materialized before sorting in Python 3
		zipped = sorted(zip(regr.feature_importances_, df[cols].columns), key=lambda t: t[0])
		for imp, f in zipped:
			print(f, ":", imp)
		print("-------- R2 score", regr.score(X_test, y_test), "----------")