def read_dataset(self):
    # train
    train_pre = Preprocess()
    train_pre.read_corpus(corpus_dir=self.TRAIN_CORPUS_DIR,
                          label_dir=self.TRAIN_LABEL_DIR,
                          first_k_char_stem=self.FIRST_K_CHAR_STEM)
    train_data = DatasetText(corpus=train_pre.get_corpus(),
                             label=train_pre.get_label(),
                             word2idx=Preprocess.get_word2idx(),
                             max_sentence_len=self.MAX_SENTENCE_LEN)
    train_loader = DataLoader(dataset=train_data, batch_size=self.BATCH_SIZE, shuffle=True)

    # valid
    valid_pre = Preprocess()
    valid_pre.read_corpus(corpus_dir=self.VALID_CORPUS_DIR,
                          label_dir=self.VALID_LABEL_DIR,
                          first_k_char_stem=self.FIRST_K_CHAR_STEM)
    valid_data = DatasetText(corpus=valid_pre.get_corpus(),
                             label=valid_pre.get_label(),
                             word2idx=Preprocess.get_word2idx(),
                             max_sentence_len=self.MAX_SENTENCE_LEN)
    valid_loader = DataLoader(dataset=valid_data, batch_size=self.BATCH_SIZE, shuffle=True)

    self.VOCAB_SIZE = len(Preprocess.get_vocab())
    MainTextClass.save_dict(Preprocess.get_word2idx())
    return train_loader, valid_loader
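# A minimal sketch of the truncation stemming that the first_k_char_stem
# parameter above appears to control: keep only the first k characters of each
# token so inflected forms collapse to a shared prefix. The helper name and
# behaviour are assumptions, not part of the original Preprocess class.
def first_k_char_stem(tokens, k):
    """Truncate every token to its first k characters."""
    return [token[:k] for token in tokens]

# e.g. first_k_char_stem(["running", "runner", "runs"], 3) -> ["run", "run", "run"]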
def preprocessData():
    preprocessing = Preprocess(data="fundamental_ratios")
    print("retrieving fundamental ratios...")
    fr_train, fr_validate = preprocessing.get_data(dataType="scaled", dset="train_validate")
    print("retrieving returns...")
    ar = preprocessing.retrieve_return()
    print("split returns...")
    ar_train = ar[ar.index.isin(fr_train.index)]
    ar_validate = ar[ar.index.isin(fr_validate.index)]
    print("trim fundamental ratios...")
    fr_train = fr_train[fr_train.index.isin(ar_train.index)]
    fr_validate = fr_validate[fr_validate.index.isin(ar_validate.index)]
    # remove boundary values
    print(ar_train)
    ar_train.drop(ar_train.nlargest(250, "return").index, axis=0, inplace=True)
    ar_train.drop(ar_train.nsmallest(250, "return").index, axis=0, inplace=True)
    # re-order train set for visualization
    ar_train = ar_train.sort_values("return")
    fr_train = fr_train.loc[ar_train.index]
    train = (fr_train, ar_train)
    # re-order validation set for visualization
    ar_validate = ar_validate.sort_values("return")
    fr_validate = fr_validate.loc[ar_validate.index]
    validate = (fr_validate, ar_validate)
    return train, validate
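# A self-contained sketch of the boundary-value trimming used above: the 250
# largest and 250 smallest returns are dropped before training. The helper name
# and toy frame are illustrative; only the "return" column name is taken from
# preprocessData() itself.
import pandas as pd

def trim_extremes(returns: pd.DataFrame, n: int = 250) -> pd.DataFrame:
    """Drop the n largest and n smallest rows by the 'return' column."""
    trimmed = returns.drop(returns.nlargest(n, "return").index)
    return trimmed.drop(trimmed.nsmallest(n, "return").index)

# toy usage: with n=1 the single best and worst return are removed
toy = pd.DataFrame({"return": [-0.9, -0.02, 0.01, 0.03, 1.5]})
print(trim_extremes(toy, n=1))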
def classify(self, img_test):
    """
    Execute the neural network model and return True if the image is a guinea pig.

    Args:
        img_test (numpy array): Array of image data.

    Returns:
        True if the highest-scoring prediction is "guinea pig".
    """
    save_path = os.path.join(h5_path, "guineapig.h5")
    preprocess = Preprocess()
    img = preprocess.prepare_image(img_test, 60)
    if not os.path.exists(save_path):
        raise FileNotFoundError("File guineapig.h5 was deleted. :(")
    model = keras.models.load_model(save_path)
    predictions = model.predict(img)
    max_idx = np.argmax(predictions)
    categories = ["guinea pig", "not guinea pig"]
    result = categories[max_idx]
    return result == "guinea pig"
def test():
    import pandas as pd
    import numpy as np
    from Preprocess import Preprocess

    train = pd.read_csv('~/Downloads/ds-project-train.csv',
                        dtype={'SHIPPER.ADDRESS': str, 'ZIPCODE': str},
                        parse_dates=['ARRIVAL.DATE'])
    test = pd.read_csv('~/Downloads/ds-project-test.csv',
                       dtype={'SHIPPER.ADDRESS': str, 'ZIPCODE': str},
                       parse_dates=['ARRIVAL.DATE'])

    p = Preprocess()
    X_train = p.run(df=train)
    X_test = p.run(df=test, test=True)

    y_train = X_train['COUNTRY.OF.ORIGIN']
    X_train = X_train.drop(['COUNTRY.OF.ORIGIN'], axis=1)
    y_test = X_test['COUNTRY.OF.ORIGIN']
    X_test = X_test.drop(['COUNTRY.OF.ORIGIN'], axis=1)

    fe = FeatureEngineering()
    X_train = fe.run(df=X_train)
    X_test = fe.run(df=X_test, test=True)
    print('!')
def form():
    file_path = ""
    # load the model
    cnn = keras.models.load_model('dr.h5')
    try:
        # get the image
        if request.method == "POST":
            image = request.files['image']
            img_name = image.filename
            file_path = os.path.join('./static/uploaded_images', img_name)
            image.save(file_path)
            # preprocess the image to make it similar to training data
            a = Preprocess()
            a.preprocess(file_path)
            # image is converted to grayscale and then to numpy array
            image = Image.open('./static/uploaded_images/preprocessed.jpeg')
            image = ImageOps.grayscale(image)
            img_arr = img.img_to_array(image)
            img_arr = img_arr.astype("float32")
            img_arr = img_arr / 255.0
            img_arr = np.expand_dims(img_arr, axis=0)
            # prediction
            predict = cnn.predict(img_arr)
            pred = np.argmax(predict[0])
            os.remove(file_path)
            os.remove('./static/uploaded_images/preprocessed.jpeg')
            return render_template("index.html", image_name=pred)
        else:
            return render_template("index.html", image_name="None")
    except Exception:
        return render_template("index.html", image_name="No proper image file selected")
def classify(self, img_mdf):
    """
    Execute the neural network model and return whether the image is an ostrich.

    Args:
        img_mdf (numpy array): Array of image data.

    Returns:
        True if the highest-scoring prediction is "ostrich".
    """
    if not os.path.exists(os.path.join(h5_path, "ostrich.h5")):
        print("Invalid path")
        return
    model = keras.models.load_model(os.path.join(h5_path, "ostrich.h5"))
    preprocess = Preprocess()
    img_mdf = preprocess.prepare_image(img_mdf, 60)
    predictions = model.predict(img_mdf)
    max_idx = np.argmax(predictions)
    categories = ["ostrich", "not an ostrich"]
    result = categories[max_idx]
    return result == "ostrich"
def __init__(self, p, q, lag):
    self.preprocess = Preprocess(lag=lag)
    self.p = p  # order of residual term
    self.q = q  # order of variance term
    self.omega = np.array([])
    self.alpha = np.empty(shape=(0, self.p))  # residual term parameter
    self.beta = np.empty(shape=(0, self.q))  # variance term parameter
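# A minimal sketch of the GARCH(p, q) recursion that the orders above refer to:
# sigma2[t] = omega + sum_i alpha[i]*eps[t-1-i]**2 + sum_j beta[j]*sigma2[t-1-j].
# This standalone helper is illustrative only, not part of the original class;
# the parameter names mirror the attributes omega, alpha and beta.
import numpy as np

def garch_variance(eps, omega, alpha, beta):
    """Compute the conditional variance series for a residual series eps."""
    p, q = len(alpha), len(beta)
    sigma2 = np.full(len(eps), np.var(eps))  # initialise with the sample variance
    for t in range(max(p, q), len(eps)):
        arch = sum(alpha[i] * eps[t - 1 - i] ** 2 for i in range(p))
        garch = sum(beta[j] * sigma2[t - 1 - j] for j in range(q))
        sigma2[t] = omega + arch + garch
    return sigma2

# toy usage with a GARCH(1, 1) parameterisation
rng = np.random.default_rng(0)
print(garch_variance(rng.normal(size=50), omega=0.1, alpha=[0.05], beta=[0.9])[:5])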
def test_retrieve_mkt_caps(self):
    self.preprocess = Preprocess(lag=7)
    try:
        df = self.preprocess.retrieve_mkt_caps(["GE", "MMM", "APPL"])
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_dividends(self):
    self.preprocess = Preprocess(lag=7)
    try:
        df = self.preprocess.retrieve_dividends()
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_benchmark(self):
    self.preprocess = Preprocess(lag=30)
    try:
        df = self.preprocess.retrieve_benchmark("snp500")
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_fundamental_ratios(self):
    self.preprocess = Preprocess()
    try:
        df = self.preprocess.retrieve_fundamental_ratios()
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_benchmark_change(self):
    self.preprocess = Preprocess(lag=7)
    try:
        change = self.preprocess.retrieve_benchmark_change("snp500")
        if not isinstance(change, float):
            raise Exception
    except Exception:
        self.fail()
def __init__(self, asset, risk_free=0):
    self.preprocess = Preprocess()
    self.asset = asset
    self.risk_free = risk_free
    self.covariance = None  # type: pd.DataFrame
    self.mean = None  # pd.Series(index=asset)
    self.max_sharpe_comp = None  # maximum sharpe portfolio composition
    self.min_vol_comp = None  # minimum volatility portfolio composition
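# A small standalone sketch of the quantity behind max_sharpe_comp above: given
# per-asset mean returns, a covariance matrix and a risk-free rate, the Sharpe
# ratio of a weight vector is (w @ mean - risk_free) / sqrt(w @ Cov @ w). The
# helper name and toy numbers are illustrative assumptions.
import numpy as np

def portfolio_sharpe(weights, mean_returns, covariance, risk_free=0.0):
    """Sharpe ratio of a portfolio defined by a weight vector."""
    expected = weights @ mean_returns
    volatility = np.sqrt(weights @ covariance @ weights)
    return (expected - risk_free) / volatility

# toy usage with two assets
mu = np.array([0.08, 0.12])
cov = np.array([[0.04, 0.01], [0.01, 0.09]])
print(portfolio_sharpe(np.array([0.6, 0.4]), mu, cov, risk_free=0.02))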
def test_scale_data(self):
    self.preprocess = Preprocess(lag=7)
    data = [('x', [1, 2, 3, 4]), ('y', [51, -6, 43, -8])]
    df = pd.DataFrame(dict(data))  # pd.DataFrame.from_items was removed in recent pandas
    scaled = self.preprocess.scale_data(df)
    self.assertTrue(scaled['x'].max() <= 1)
    self.assertTrue(scaled['y'].max() <= 1)
    self.assertTrue(scaled['x'].min() >= 0)
    self.assertTrue(scaled['y'].min() >= 0)
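# The assertions above only pin scale_data() to the [0, 1] range, which is
# consistent with column-wise min-max scaling. A minimal sketch of that
# behaviour (an assumption about the real implementation) is:
import pandas as pd

def min_max_scale(df: pd.DataFrame) -> pd.DataFrame:
    """Rescale every column of df linearly into [0, 1]."""
    return (df - df.min()) / (df.max() - df.min())

print(min_max_scale(pd.DataFrame({'x': [1, 2, 3, 4], 'y': [51, -6, 43, -8]})))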
def test_retrieve_return(self):
    self.preprocess = Preprocess(lag=7)
    try:
        # non-split is a super set of split returns
        df1 = self.preprocess.retrieve_return()
        if not isinstance(df1, pd.DataFrame) or df1.empty:
            raise Exception
    except Exception:
        self.fail()
def __init__(self, topics_filename, dir_name):
    self.parser = TopicsParser()
    self.preProcess = Preprocess()
    self.preprocessed = True
    self.topics_parsed = self.parser.get_data(topics_filename)
    self.topics = dict()
    for topic in self.topics_parsed:
        self.topics[topic['num']] = " ".join(
            self.preProcess.preprocess(topic['title'] + ' ' + topic['narr'] + ' ' + topic['desc']))
def __init__(self, load, dir_name, files):
    self.preProcess = Preprocess()
    self.documentParser = DocumentParser()
    self.preprocessed = True
    if not load:
        if not os.path.isdir(dir_name):
            os.mkdir(dir_name)
        schema = Schema(id=TEXT(stored=True), content=TEXT(stored=True))
        self.ix = create_in(dir_name, schema)
        self.index(files)
    else:
        self.ix = open_dir(dir_name)
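# A hedged usage sketch of querying the Whoosh index that __init__ creates or
# opens above; it relies only on the schema fields (id, content) defined there.
# The function name, index directory and query string are placeholders.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_index(dir_name, query_text, limit=10):
    """Return (id, score) pairs for documents matching query_text."""
    ix = open_dir(dir_name)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(query_text)
        results = searcher.search(query, limit=limit)
        return [(hit["id"], hit.score) for hit in results]

# e.g. search_index("index_dir", "grain exports")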
def test_filter_column(self):
    self.preprocess = Preprocess(density=0.5, lag=7)
    data = [('symbol', ['A', 'B', 'C', 'D']),
            ('index', [150, 200, 50, 10]),
            ('date', [200, 210, 90, 20]),
            ('currency', [140, 215, 95, 30]),
            ('latestadate', [140, 215, 95, 40]),
            ('dense', [140, 215, np.nan, 50]),
            ('sparse', [np.nan, np.nan, np.nan, 60])]
    df = pd.DataFrame(dict(data))  # pd.DataFrame.from_items was removed in recent pandas
    filtered = self.preprocess.filter_column(df)
    self.assertEqual(len(filtered.columns), 2)  # only symbol and dense will survive
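# The test above expects filter_column() to drop bookkeeping columns (index,
# date, currency, latestadate) and columns whose non-null density is at or
# below the configured threshold. A standalone sketch of that assumed rule:
import pandas as pd

def filter_columns(df: pd.DataFrame, density: float = 0.5,
                   drop_always=("index", "date", "currency", "latestadate")):
    """Keep columns that are not bookkeeping fields and are dense enough."""
    df = df.drop(columns=[c for c in drop_always if c in df.columns])
    keep = [c for c in df.columns if df[c].notna().mean() > density]
    return df[keep]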
def main():
    '''
    Main function to preprocess data. This function uses the Preprocess class.
    '''
    dataDirectory = './Data/ToProcessData'
    preprocessor = Preprocess(dataDirectory)
    print('Object created')
    st = time.time()
    preprocessor.PreprocessData()
    # preprocessor.PrepareTrainTestSet()
    print('Preprocess data execution ended')
    en = time.time()
    print('Time taken to process data = ', en - st, ' sec')
def __init__(self, df=None, continuous_features=None,
             unordered_categorical_features=None, ordered_categorical_features=None):
    # use None defaults to avoid sharing mutable default lists between instances
    self.data = df
    self.continuous_features = continuous_features if continuous_features is not None else []
    self.unordered_categorical_features = unordered_categorical_features if unordered_categorical_features is not None else []
    self.ordered_categorical_features = ordered_categorical_features if ordered_categorical_features is not None else []
    self.prprcs = Preprocess()
    self.fs = FieldStatistics()
    self.trnsfrmr = Transformer()
    self.imptr = ImputeData()
    self.pltr = Plotting()
def VectorizerParseDoc(doc):
    tokenizer = Preprocess()
    parser = DocumentParser()
    parsedDoc = parser.parse(doc.replace('d_train', 'D_train')
                                .replace('d_test', 'D_test')
                                .replace('newsml.xml', 'newsML.xml'))
    words = []
    if parsedDoc.get('title') is not None:
        words += tokenizer.preprocess(parsedDoc['title'])
    if parsedDoc.get('text') is not None:
        words += tokenizer.preprocess(parsedDoc['text'])
    if parsedDoc.get('byline') is not None:
        words += tokenizer.preprocess(parsedDoc['byline'])
    if parsedDoc.get('dateline') is not None:
        words += tokenizer.preprocess(parsedDoc['dateline'])
    return words
def __init__(self):
    '''
    Initialization; runs by default when a new object is created.
    '''
    # ## Windows
    # main window
    self.mainWindow = MainWindow()
    # message window
    self.informationMessageApp = QtWidgets.QWidget()
    self.informationMessageWindow = MessageWindow()
    # services
    self.preprocess = Preprocess()
    self.process = Process()
    self.postprocess = Postprocess()
    self.attachmentMatch = AttachmentMatch()
    self.jsonService = JsonService()
    self.settingJsonService = JsonService('settings.json')
    # ## Connect slots and signals
    # express process / start: pressed --> express processing
    self.mainWindow.expressProcessButton.pressed.connect(
        self.expressProcess)
    # general process / start: pressed --> general processing
    self.mainWindow.generalProcessButton.pressed.connect(
        self.generalProcess)
    # general process / attachment-match checkbox: state changed --> toggle attachment matching
    self.mainWindow.shouldMatchAttachmentCheckBox.stateChanged.connect(
        self.shouldEnableAttachmentMatch)
    # general process / browse original data: pressed --> choose the original data file
    self.mainWindow.generalProcessOriginalDataExploreButton.pressed.connect(
        self.exploreOriginalDataFile)
    # general process / browse attachment directory: pressed --> choose the attachment directory
    self.mainWindow.generalProcessAttachmentLocationExploreButton.pressed.connect(
        self.exploreAttachmentDirectory)
    # general process / browse export file: pressed --> choose the export data file
    self.mainWindow.generalProcessExportFileExploreButton.pressed.connect(
        self.exploreExportDataFile)
def __init__(self, name, topics_filename, relevance_filename, preprocessed):
    super().__init__(name)
    self.topicIndex = TopicsIndex('Topic Index', self, topics_filename, relevance_filename, preprocessed)
    self.documentParser = DocumentParser()
    self.docLength = 0
    self.dictionary = {}
    self.size = 0
    self.preProcess = Preprocess()
    self.evalDocs = self.calcEvaluatedDocs()
    self.preprocessed = preprocessed
    self.processingTime = 0
    self.indxingTime = 0
    self.processingMemory = 0
    self.indexingMemory = 0
    self.document_lengths = dict()
def parseDoc(doc):
    tokenizer = Preprocess()
    parser = DocumentParser()
    doc = doc.replace('d_', 'D_')
    doc = doc.replace('newsml', 'newsML')
    parsedDoc = parser.parse(doc)
    words = []
    if parsedDoc.get('title') is not None:
        words += tokenizer.preprocess(parsedDoc['title'])
    if parsedDoc.get('text') is not None:
        words += tokenizer.preprocess(parsedDoc['text'])
    if parsedDoc.get('byline') is not None:
        words += tokenizer.preprocess(parsedDoc['byline'])
    if parsedDoc.get('dateline') is not None:
        words += tokenizer.preprocess(parsedDoc['dateline'])
    return words
def __init__(self, lag=30, density=0.8, groupNum=21, scoreOrder=4,
             retMin=-0.25, retMax=0.25, p_value=0.05):
    self.db = DBConnect()
    self.preprocess = Preprocess(data='fundamental_ratios', lag=lag, density=density)
    self.groupNum = groupNum
    self.scoreOrder = scoreOrder
    self.retMin = retMin
    self.retMax = retMax
    self.p_value = p_value
    self.coefficient = pd.DataFrame()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indir", default=None, type=str, required=True,
                        help="directory of json files containing comments")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="directory to store test.tsv file")
    args = parser.parse_args()

    pr = Preprocess()
    reader = pd.read_json(args.indir, lines=True, compression=None)
    comments = list(reader['body'])
    violated_rule = list(reader['violated_rule'])

    writer_addr = os.path.join(args.outdir, 'test.tsv')
    writer = open(writer_addr, 'w')
    writer.write('label comments\n')
    test_ah = 0
    test_none = 0
    for i in tqdm(range(len(comments)), unit=" comments", desc="comments processed"):
        label = 'NONE'
        test_none += 1
        if violated_rule[i] == 2:
            label = 'AH'
            test_ah += 1
            test_none -= 1
        cur = pr.preprocess(comments[i])
        cur = ' '.join(cur)
        cur = label + ' ' + cur + '\n'
        writer.write(cur)
    writer.close()
    print(f'Test set: AH: {test_ah} - NONE: {test_none}')
def __init__(self):
    data = Preprocess()
    data.load_data()
    data.preprocess_images()
    data.one_hot_encode_labels()
    self.train_x, self.train_y = data.get_training_data()
    self.test_x, self.test_y = data.get_testing_data()
    _, _, self.row, self.col, self.channel, self.classes = data.metadata()
    self.batch_size = 256
    self.keep_probability = 0.4
    self.name = "Model-A"
def test_cap_outlier(self):
    self.preprocess = Preprocess(lag=7, limit=3, outlier=4)
    data = [('x', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                   1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 8]),
            ('y', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                   1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 50]),
            ('z', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                   1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 10000])]
    df = pd.DataFrame(dict(data))  # pd.DataFrame.from_items was removed in recent pandas
    # cap_outlier operates on the same memory location, so keep a copy for comparison
    dfc = df.copy()
    capped = self.preprocess.cap_outlier(df)
    self.assertEqual(capped['x'].max(), dfc['x'].max())
    self.assertTrue(capped['z'].max() < dfc['z'].max())
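# A standalone sketch of one capping rule consistent with the test above:
# values more than `outlier` standard deviations from the column mean are
# clipped to a `limit`-sigma band, so the moderate maximum in x survives while
# the extreme maximum in z shrinks. The exact rule used by Preprocess is an
# assumption; only the limit/outlier parameter names come from the test.
import pandas as pd

def cap_outliers(df: pd.DataFrame, limit: float = 3, outlier: float = 4) -> pd.DataFrame:
    """Clip values that lie beyond `outlier` sigmas into a `limit`-sigma band."""
    capped = df.copy()
    for col in capped.columns:
        mean, std = capped[col].mean(), capped[col].std()
        mask = (capped[col] - mean).abs() > outlier * std
        capped.loc[mask, col] = capped[col].clip(mean - limit * std, mean + limit * std)[mask]
    return capped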
def create_unclassified():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indir", default=None, type=str, required=True,
                        help="path of json file")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="folder to write .log file")
    args = parser.parse_args()

    pr = Preprocess()
    reader = pd.read_json(args.indir, lines=True, compression=None)
    comments = list(reader['body'])
    violated_rule = list(reader['violated_rule'])

    writer_addr = os.path.join(args.outdir, 'comments_0.log')
    writer = open(writer_addr, 'wb')
    for i in tqdm(range(len(comments)), unit=' comments', desc='Comments processed: '):
        label = 'none'
        if violated_rule[i] == 2:
            label = 'ah'
        cur = ' '.join(pr.preprocess(comments[i]))
        e = Comment(i, cur, label)
        print(e)
        pickle.dump(e, writer)
    writer.close()
def main_fuction(readmidfile=False):
    log = ProcessLog('ClusterModel.log', 1, 'cluster model')
    ProcessLog.loggerName = 'cluster model'
    dataBasePaths = []
    datatemp = dataBasePaths[1:]
    dataBasePaths.append('D:/NJ-KING-CAO3.db')
    distance_matrix = []
    dm = Distance_Matrix()
    V = Visualize()
    moduleLabelDict = {}
    ctkm = None
    ft = lambda x: x.nodesCount <= 25
    # mt = MemoryTest()
    # print('test begin!')
    # mt.test()
    # print('sleep begin!')
    # time.sleep(500)
    # print('sleep end!')

    pp = Preprocess()
    begin = time.time()
    pp.extractFromFiles_Robert(True, True, treeNumberLimit=5000, nodeLimit=25)
    end = time.time()
    log.getlog().debug("Read file took %f seconds." % (end - begin))
    pp.generateTrees()

    if readmidfile:
        try:
            modulelabelfile = open("ModuleLabel.txt", 'r')
            for l in modulelabelfile:
                row = [x for x in l.split(',')]
                if len(row) == 2:
                    moduleLabelDict[row[0]] = int(row[1])
            modulelabelfile.close()
            # mc = ModuleCluster()
            # feature_names, feature_matrix = mc.moduleStatistic_feature_extraction(pp.moduleStatistic)
            # for epsNumber in range(100):
            #     eps = float((epsNumber + 1) / 100)
            #     labels = mc.cluster(eps)
            #     file_open = open("ModuleLabel_eps=" + str(eps) + ".txt", 'w')
            #     i = 0
            #     for moduleName in mc.moduleNameList:
            #         file_open.write("%s,%d\n" % (moduleName, labels[i]))
            #         i += 1
            #     file_open.close()
            #     print("eps=%f is done." % eps)
            # V.cluster_result_linechart(eps_list, cluster_num_list, noise_num_list)
            # labels = mc.cluster()
            distancefile = open("Distancenew.txt", 'r')
            colnum = 0
            rownum = 0
            row = []
            for l in distancefile:
                row = [float(x) for x in l.split()]
                if len(row) > 0:
                    distance_matrix.append(row)
                    rownum += 1
            print("Distance matrix read. %d trees" % rownum)
            distancefile.close()
            if rownum != len(row):
                raise Exception("Distance file format wrong!")
        except Exception as e:
            print(e)
            # readmidfile = False

    if readmidfile == False:
        mc = ModuleCluster()
        feature_names, feature_matrix = mc.moduleStatistic_feature_extraction(pp.moduleStatistic)
        for epsNumber in range(100):
            eps = float((epsNumber + 1) / 100)
            labels = mc.cluster(eps)
            file_open = open("ModuleLabel_eps=" + str(eps) + ".txt", 'w')
            i = 0
            for moduleName in mc.moduleNameList:
                file_open.write("%s,%d\n" % (moduleName, labels[i]))
                i += 1
            file_open.close()
            print("eps=%f is done." % eps)
        labels = mc.cluster()
        moduleLabelDict = mc.moduleLabelDict
        file_open = open("ModuleLabel5000.txt", 'w')
        i = 0
        for moduleName in mc.moduleNameList:
            file_open.write("%s,%d\n" % (moduleName, labels[i]))
            i += 1
        file_open.close()

        ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)
        begin = time.time()
        distance_matrix = dm.compute([x for j in pp.allTrees for x in filter(ft, j)], ctkm)
        end = time.time()
        log.getlog().debug("Distance computation took %f seconds." % (end - begin))
        file_open = open("Distancenew5000.txt", 'w')
        for r, row in enumerate(distance_matrix):
            for c, col in enumerate(row):
                file_open.write("%.6f " % (col))
            file_open.write("\n")
        file_open.close()

    ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)
    print("test begin!")
    eps_list = []
    cluster_num_list = []
    noise_num_list = []
    for epsNumber in range(100):
        eps = float((epsNumber + 1) / 100)
        tc = TreeCluster(eps=eps, min_samples=5, metric="precomputed", n_jobs=4)
        tc.Train(distance_matrix)
        eps_list.append(eps)
        cluster_num_list.append(tc.clusterNumber)
        noise_num_list.append(tc.noiseNumber)
    # V.cluster_result_linechart(eps_list, cluster_num_list, noise_num_list)

    tc = TreeCluster(eps=0.5, n_jobs=4)
    begin = time.time()
    tc.Train(distance_matrix)
    end = time.time()
    log.getlog().debug("Clustering took %f seconds." % (end - begin))
    V.cluster_result_linechart(range(2000), tc.DB.labels_, tc.DB.labels_)
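# TreeCluster above sweeps eps over a precomputed tree-to-tree distance matrix,
# which matches how scikit-learn's DBSCAN consumes such a matrix. A minimal,
# self-contained sketch of that pattern (toy data, not the real Distance_Matrix
# output; the helper name is illustrative):
import numpy as np
from sklearn.cluster import DBSCAN

def cluster_precomputed(distance_matrix, eps=0.5, min_samples=5):
    """Run DBSCAN on a square distance matrix; label -1 marks noise points."""
    db = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed").fit(distance_matrix)
    n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
    n_noise = int(np.sum(db.labels_ == -1))
    return db.labels_, n_clusters, n_noise

# toy usage: two tight groups and one distant point
points = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2], [20.0]])
dist = np.abs(points - points.T)
print(cluster_precomputed(dist, eps=0.3, min_samples=2))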