Example #1
    def read_dataset(self):
        # train
        train_pre = Preprocess()
        train_pre.read_corpus(corpus_dir=self.TRAIN_CORPUS_DIR,
                              label_dir=self.TRAIN_LABEL_DIR,
                              first_k_char_stem=self.FIRST_K_CHAR_STEM)

        train_data = DatasetText(corpus=train_pre.get_corpus(),
                                 label=train_pre.get_label(),
                                 word2idx=Preprocess.get_word2idx(),
                                 max_sentence_len=self.MAX_SENTENCE_LEN)

        train_loader = DataLoader(dataset=train_data,
                                  batch_size=self.BATCH_SIZE,
                                  shuffle=True)

        # valid
        valid_pre = Preprocess()
        valid_pre.read_corpus(corpus_dir=self.VALID_CORPUS_DIR,
                              label_dir=self.VALID_LABEL_DIR,
                              first_k_char_stem=self.FIRST_K_CHAR_STEM)

        valid_data = DatasetText(corpus=valid_pre.get_corpus(),
                                 label=valid_pre.get_label(),
                                 word2idx=Preprocess.get_word2idx(),
                                 max_sentence_len=self.MAX_SENTENCE_LEN)

        valid_loader = DataLoader(dataset=valid_data,
                                  batch_size=self.BATCH_SIZE,
                                  shuffle=True)

        self.VOCAB_SIZE = len(Preprocess.get_vocab())
        MainTextClass.save_dict(Preprocess.get_word2idx())

        return train_loader, valid_loader
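
A minimal driver for this example; "Trainer" is a hypothetical name for the enclosing class, whose __init__ is assumed to set the uppercase configuration attributes used above:

# Hypothetical usage; Trainer stands in for the class defining read_dataset().
trainer = Trainer()
train_loader, valid_loader = trainer.read_dataset()
for tokens, labels in train_loader:
    # each batch holds up to BATCH_SIZE sentences padded to MAX_SENTENCE_LEN
    print(tokens.shape, labels.shape)
    break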
Example #2
def preprocessData():
    preprocessing = Preprocess(data="fundamental_ratios")
    print("retrieving fundamental ratios...")
    fr_train, fr_validate = preprocessing.get_data(dataType="scaled",
                                                   dset="train_validate")
    print("retrieving returns...")
    ar = preprocessing.retrieve_return()
    print("split returns...")
    ar_train = ar[ar.index.isin(fr_train.index)]
    ar_validate = ar[ar.index.isin(fr_validate.index)]
    print("trim fundamental ratios...")
    fr_train = fr_train[fr_train.index.isin(ar_train.index)]
    fr_validate = fr_validate[fr_validate.index.isin(ar_validate.index)]
    # remove boundary values
    print(ar_train)
    ar_train.drop(ar_train.nlargest(250, "return").index, axis=0, inplace=True)
    ar_train.drop(ar_train.nsmallest(250, "return").index,
                  axis=0,
                  inplace=True)
    # re-order train set for visualization
    ar_train = ar_train.sort_values("return")
    fr_train = fr_train.loc[ar_train.index]
    train = (fr_train, ar_train)
    # re-order validation set for visualization
    ar_validate = ar_validate.sort_values("return")
    fr_validate = fr_validate.loc[ar_validate.index]
    validate = (fr_validate, ar_validate)
    return train, validate
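
A sketch of consuming the return value, relying only on what the function above guarantees: two (features, returns) pairs of index-aligned DataFrames.

# Hedged usage sketch for preprocessData().
train, validate = preprocessData()
fr_train, ar_train = train
fr_validate, ar_validate = validate
# each pair was re-indexed to the same sorted index above
assert fr_train.index.equals(ar_train.index)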
Example #3
    def classify(self, img_test):
        """ Run the neural network model and return True if the image is a
            guinea pig.

            Args:
                img_test (numpy array): Array of image data.

            Returns:
                True if the highest-scoring prediction is "guinea pig".
        """

        save_path = os.path.join(h5_path, "guineapig.h5")

        preprocess = Preprocess()

        img = preprocess.prepare_image(img_test, 60)

        if not os.path.exists(save_path):
            raise FileNotFoundError("File guineapig.h5 was deleted. :(")

        model = keras.models.load_model(save_path)

        predictions = model.predict(img)

        # argmax returns the index of the highest-probability class
        best = np.argmax(predictions)

        categories = ["guinea pig", "not guinea pig"]

        return categories[best] == "guinea pig"
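
A hypothetical call site: the class name GuineaPigClassifier and the PIL-based loading are assumptions; only classify()'s signature comes from the example.

import numpy as np
from PIL import Image

img_test = np.asarray(Image.open("some_image.jpg"))  # hypothetical input file
if GuineaPigClassifier().classify(img_test):
    print("It's a guinea pig!")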
Example #4
def test():
    import pandas as pd
    import numpy as np
    from Preprocess import Preprocess
    train = pd.read_csv('~/Downloads/ds-project-train.csv',
                        dtype={
                            'SHIPPER.ADDRESS': str,  # np.str was removed in NumPy 1.24
                            'ZIPCODE': str
                        },
                        parse_dates=['ARRIVAL.DATE'])
    test = pd.read_csv('~/Downloads/ds-project-test.csv',
                       dtype={
                           'SHIPPER.ADDRESS': str,
                           'ZIPCODE': str
                       },
                       parse_dates=['ARRIVAL.DATE'])
    p = Preprocess()
    X_train = p.run(df=train)
    X_test = p.run(df=test, test=True)

    y_train = X_train['COUNTRY.OF.ORIGIN']
    X_train = X_train.drop(['COUNTRY.OF.ORIGIN'], axis=1)

    y_test = X_test['COUNTRY.OF.ORIGIN']
    X_test = X_test.drop(['COUNTRY.OF.ORIGIN'], axis=1)

    fe = FeatureEngineering()
    X_train = fe.run(df=X_train)
    X_test = fe.run(df=X_test, test=True)

    print('!')
Example #5
def form():
    file_path = ""
    # load the model
    cnn = keras.models.load_model('dr.h5')
    try:
        # get the image
        if request.method == "POST":
            image = request.files['image']
            img_name = image.filename
            file_path = os.path.join('./static/uploaded_images', img_name)
            image.save(file_path)
            # preprocess the image to make it similar
            # to training data
            a = Preprocess()
            a.preprocess(file_path)
            # image is converted to grayscale
            # and then to numpy array
            image = Image.open('./static/uploaded_images/preprocessed.jpeg')
            image = ImageOps.grayscale(image)
            img_arr = img.img_to_array(image)
            img_arr = img_arr.astype("float32")
            img_arr = img_arr / 255.0
            img_arr = np.expand_dims(img_arr, axis=0)
            # prediction
            predict = cnn.predict(img_arr)
            pred = np.argmax(predict[0])
            os.remove(file_path)
            os.remove('./static/uploaded_images/preprocessed.jpeg')
            return render_template("index.html", image_name=pred)
        else:
            return render_template("index.html", image_name="None")
    except Exception:
        return render_template("index.html",
                               image_name="No proper image file selected")
Example #6
    def classify(self, img_mdf):
        """ Run the neural network model and return whether the image is an
            ostrich.

            Args:
                img_mdf (numpy array): Array of image data.

            Returns:
                True if the highest-scoring prediction is "ostrich".
        """

        if not os.path.exists(os.path.join(h5_path, "ostrich.h5")):
            print("Invalid path")
            return

        model = keras.models.load_model(os.path.join(h5_path, "ostrich.h5"))
        preprocess = Preprocess()
        img_mdf = preprocess.prepare_image(img_mdf, 60)
        predictions = model.predict(img_mdf)

        # argmax returns the index of the highest-probability class
        best = np.argmax(predictions)

        categories = ["ostrich", "not an ostrich"]

        return categories[best] == "ostrich"
Example #7
 def __init__(self, p, q, lag):
     self.preprocess = Preprocess(lag=lag)
     self.p = p  # order of residual term
     self.q = q  # order of variance term
     self.omega = np.array([])
     self.alpha = np.empty(shape=(0, self.p))  # residual term parameter
     self.beta = np.empty(shape=(0, self.q))  # variance term parameter
Example #8
 def test_retrieve_mkt_caps(self):
     self.preprocess = Preprocess(lag=7)
     try:
         df = self.preprocess.retrieve_mkt_caps(["GE", "MMM", "APPL"])
         # fail unless we got back a non-empty DataFrame
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except Exception:
         self.fail()
Example #9
 def test_retrieve_dividends(self):
     self.preprocess = Preprocess(lag=7)
     try:
         df = self.preprocess.retrieve_dividends()
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except Exception:
         self.fail()
Example #10
 def test_retrieve_benchmark(self):
     self.preprocess = Preprocess(lag=30)
     try:
         df = self.preprocess.retrieve_benchmark("snp500")
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except Exception:
         self.fail()
Example #11
 def test_retrieve_fundamental_ratios(self):
     self.preprocess = Preprocess()
     try:
         df = self.preprocess.retrieve_fundamental_ratios()
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except Exception:
         self.fail()
Example #12
 def test_retrieve_benchmark_change(self):
     self.preprocess = Preprocess(lag=7)
     try:
         change = self.preprocess.retrieve_benchmark_change("snp500")
         if not isinstance(change, float):
             raise Exception
     except Exception:
         self.fail()
Example #13
 def __init__(self, asset, risk_free=0):
     self.preprocess = Preprocess()
     self.asset = asset
     self.risk_free = risk_free
     self.covariance = None  # type: pd.DataFrame
     self.mean = None  # pd.Series(index=asset)
     self.max_sharpe_comp = None  # maximum sharpe portfolio composition
     self.min_vol_comp = None  # minimum volatility portfolio composition
Example #14
 def test_scale_data(self):
     self.preprocess = Preprocess(lag=7)
     data = [('x', [1, 2, 3, 4]), ('y', [51, -6, 43, -8])]
     df = pd.DataFrame(dict(data))  # from_items was removed in pandas 1.0
     scaled = self.preprocess.scale_data(df)
     self.assertTrue(scaled['x'].max() <= 1)
     self.assertTrue(scaled['y'].max() <= 1)
     self.assertTrue(scaled['x'].min() >= 0)
     self.assertTrue(scaled['y'].min() >= 0)
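
The assertions only pin the output to the [0, 1] range, which matches min-max scaling; a sketch of what scale_data plausibly does, inferred from the test rather than the verified implementation:

def scale_data_sketch(df):
    # column-wise min-max scaling to [0, 1]
    return (df - df.min()) / (df.max() - df.min())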
Example #15
 def test_retrieve_return(self):
     self.preprocess = Preprocess(lag=7)
     try:
         # non-split is a superset of split returns
         df1 = self.preprocess.retrieve_return()
         if not isinstance(df1, pd.DataFrame) or df1.empty:
             raise Exception
     except Exception:
         self.fail()
Example #16
 def __init__(self, topics_filename, dir_name):
     self.parser = TopicsParser()
     self.preProcess = Preprocess()
     self.preprocessed = True
     self.topics_parsed = self.parser.get_data(topics_filename)
     self.topics = dict()
     for topic in self.topics_parsed:
         self.topics[topic['num']] = " ".join(
             self.preProcess.preprocess(topic['title'] + ' ' +
                                        topic['narr'] + ' ' +
                                        topic['desc']))
Example #17
 def __init__(self, load, dir_name, files):
     self.preProcess = Preprocess()
     self.documentParser = DocumentParser()
     self.preprocessed = True
     if not load:
         if not os.path.isdir(dir_name):
             os.mkdir(dir_name)
         schema = Schema(id=TEXT(stored=True), content=TEXT(stored=True))
         self.ix = create_in(dir_name, schema)
         self.index(files)
     else:
         self.ix = open_dir(dir_name)
Example #18
 def test_filter_column(self):
     self.preprocess = Preprocess(density=0.5, lag=7)
     data = [('symbol', ['A', 'B', 'C', 'D']),
             ('index', [150, 200, 50, 10]),
             ('date', [200, 210, 90, 20]),
             ('currency', [140, 215, 95, 30]),
             ('latestadate', [140, 215, 95, 40]),
             ('dense', [140, 215, np.nan, 50]),
             ('sparse', [np.nan, np.nan, np.nan, 60])]
     df = pd.DataFrame(dict(data))  # from_items was removed in pandas 1.0
     filtered = self.preprocess.filter_column(df)
     # only 'symbol' and 'dense' should survive the density filter
     self.assertEqual(len(filtered.columns), 2)
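
The expected survivor count implies two filters: bookkeeping columns ('index', 'date', 'currency', 'latestadate') are dropped by name, and columns whose share of non-NaN values falls below the density threshold are dropped as too sparse. A sketch of the density half, as an assumption rather than the verified Preprocess.filter_column:

def filter_by_density(df, density=0.5):
    # keep columns whose non-NaN share meets the threshold
    return df.loc[:, df.notna().mean() >= density]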
Example #19
def main():
    '''
    Main function to preprocess data. This function uses the Preprocess class.
    '''
    dataDirectory = './Data/ToProcessData'
    preprocessor = Preprocess(dataDirectory)
    print('Object created')
    st = time.time()
    preprocessor.PreprocessData()
    # preprocessor.PrepareTrainTestSet()
    print('Preprocess data execution ended')
    en = time.time()
    print('Time taken to process data =', en - st, 'sec')
Example #20
	def __init__(self, df=None,
				continuous_features=None,
				unordered_categorical_features=None,
				ordered_categorical_features=None):
		# None defaults avoid sharing one mutable list across instances
		self.data = df
		self.continuous_features = continuous_features or []
		self.unordered_categorical_features = unordered_categorical_features or []
		self.ordered_categorical_features = ordered_categorical_features or []
		self.prprcs = Preprocess()
		self.fs = FieldStatistics()
		self.trnsfrmr = Transformer()
		self.imptr = ImputeData()
		self.pltr = Plotting()
Example #21
def VectorizerParseDoc(doc):
    tokenizer = Preprocess()
    parser = DocumentParser()
    # normalize path casing before parsing
    parsedDoc = parser.parse(doc.replace('d_train', 'D_train')
                                .replace('d_test', 'D_test')
                                .replace('newsml.xml', 'newsML.xml'))
    words = []
    for field in ('title', 'text', 'byline', 'dateline'):
        if parsedDoc.get(field) is not None:
            words += tokenizer.preprocess(parsedDoc[field])
    return words
Example #22
    def __init__(self):
        '''
        Initialization.
        Runs by default when a new object is created.
        '''

        # ## Windows

        # Main window
        self.mainWindow = MainWindow()

        # Message window
        self.informationMessageApp = QtWidgets.QWidget()
        self.informationMessageWindow = MessageWindow()

        # Services
        self.preprocess = Preprocess()
        self.process = Process()
        self.postprocess = Postprocess()
        self.attachmentMatch = AttachmentMatch()
        self.jsonService = JsonService()

        self.settingJsonService = JsonService('settings.json')

        # ## Connect slots and signals

        # Express process / Start: pressed --> express processing
        self.mainWindow.expressProcessButton.pressed.connect(
            self.expressProcess)

        # General process / Start: pressed --> general processing
        self.mainWindow.generalProcessButton.pressed.connect(
            self.generalProcess)

        # General process / attachment-match checkbox: state changed --> toggle attachment matching
        self.mainWindow.shouldMatchAttachmentCheckBox.stateChanged.connect(
            self.shouldEnableAttachmentMatch)

        # General process / browse original data: pressed --> choose the original data file
        self.mainWindow.generalProcessOriginalDataExploreButton.pressed.connect(
            self.exploreOriginalDataFile)

        # General process / browse attachment directory: pressed --> choose the attachment directory
        self.mainWindow.generalProcessAttachmentLocationExploreButton.pressed.connect(
            self.exploreAttachmentDirectory)

        # General process / browse export data: pressed --> choose the export data file
        self.mainWindow.generalProcessExportFileExploreButton.pressed.connect(
            self.exploreExportDataFile)
Example #23
 def __init__(self, name, topics_filename, relevance_filename, preprocessed):
     super().__init__(name)
     self.topicIndex = TopicsIndex('Topic Index', self, topics_filename,
                                   relevance_filename, preprocessed)
     self.documentParser = DocumentParser()
     self.docLength = 0
     self.dictionary = {}
     self.size = 0
     self.preProcess = Preprocess()
     self.evalDocs = self.calcEvaluatedDocs()
     self.preprocessed = preprocessed
     self.processingTime = 0
     self.indxingTime = 0
     self.processingMemory = 0
     self.indexingMemory = 0
     self.document_lengths = dict()
Example #24
def parseDoc(doc):
    tokenizer = Preprocess()
    parser = DocumentParser()
    # normalize path casing before parsing
    doc = doc.replace('d_', 'D_')
    doc = doc.replace('newsml', 'newsML')
    parsedDoc = parser.parse(doc)
    words = []
    for field in ('title', 'text', 'byline', 'dateline'):
        if parsedDoc.get(field) is not None:
            words += tokenizer.preprocess(parsedDoc[field])
    return words
Example #25
 def __init__(self,
              lag=30,
              density=0.8,
              groupNum=21,
              scoreOrder=4,
              retMin=-0.25,
              retMax=0.25,
              p_value=0.05):
     self.db = DBConnect()
     self.preprocess = Preprocess(data='fundamental_ratios',
                                  lag=lag,
                                  density=density)
     self.groupNum = groupNum
     self.scoreOrder = scoreOrder
     self.retMin = retMin
     self.retMax = retMax
     self.p_value = p_value
     self.coefficient = pd.DataFrame()
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indir",
                        default=None,
                        type=str,
                        required=True,
                        help="directory of json files containing comments")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        required=True,
                        help="directory to store test.tsv file")
    args = parser.parse_args()
    pr = Preprocess()

    reader = pd.read_json(args.indir, lines=True, compression=None)
    comments = list(reader['body'])
    violated_rule = list(reader['violated_rule'])

    writer_addr = os.path.join(args.outdir, 'test.tsv')
    writer = open(writer_addr, 'w')
    writer.write('label comments\n')

    test_ah = 0
    test_none = 0

    for i in tqdm(range(len(comments)),
                  unit=" comments",
                  desc="comments processed"):
        label = 'NONE'
        test_none += 1
        if violated_rule[i] == 2:
            label = 'AH'
            test_ah += 1
            test_none -= 1
        cur = pr.preprocess(comments[i])
        cur = ' '.join(cur)
        cur = label + ' ' + cur + '\n'
        writer.write(cur)

    writer.close()

    print(f'Test set: AH: {test_ah} - NONE: {test_none}')
Example #27
    def __init__(self):
        data = Preprocess()

        data.load_data()

        data.preprocess_images()

        data.one_hot_encode_labels()

        self.train_x, self.train_y = data.get_training_data()

        self.test_x, self.test_y = data.get_testing_data()

        _, _, self.row, self.col, self.channel, self.classes = data.metadata()

        self.batch_size = 256

        self.keep_probability = 0.4

        self.name = "Model-A"
Example #28
 def test_cap_outlier(self):
     self.preprocess = Preprocess(lag=7, limit=3, outlier=4)
     data = [('x', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                    1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 8]),
             ('y', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                    1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 50]),
             ('z', [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                    1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 10000])]
     df = pd.DataFrame(dict(data))  # from_items was removed in pandas 1.0
     # cap_outlier operates on df in place, so keep a copy for comparison
     dfc = df.copy()
     capped = self.preprocess.cap_outlier(df)
     self.assertEqual(capped['x'].max(), dfc['x'].max())
     self.assertTrue(capped['z'].max() < dfc['z'].max())
Example #29
def create_unclassified():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indir", default=None, type=str, required=True, help="path of json file")
    parser.add_argument("--outdir", default=None, type=str, required=True, help="folder to write .log file")
    args = parser.parse_args()
    
    pr = Preprocess()

    reader = pd.read_json(args.indir, lines=True, compression=None)
    comments = list(reader['body'])
    violated_rule = list(reader['violated_rule'])

    writer_addr = os.path.join(args.outdir, 'comments_0.log')
    writer = open(writer_addr, 'wb')

    for i in tqdm(range(len(comments)), unit=' comments', desc='Comments processed: '):
        label = 'none'
        if violated_rule[i] == 2:
            label = 'ah'
        cur = ' '.join(pr.preprocess(comments[i]))
        e = Comment(i, cur, label)
        print(e)
        pickle.dump(e, writer)

    writer.close()
Example #30
def main_function(readmidfile=False):
    log = ProcessLog('ClusterModel.log', 1, 'cluster model')
    ProcessLog.loggerName = 'cluster model'
    dataBasePaths = []
    dataBasePaths.append('D:/NJ-KING-CAO3.db')
    distance_matrix = []
    dm = Distance_Matrix()

    V = Visualize()
    moduleLabelDict = {}
    ctkm = None
    ft = lambda x: x.nodesCount <= 25  # keep only trees with at most 25 nodes


    pp = Preprocess()
    begin=time.time()
    pp.extractFromFiles_Robert(True, True, treeNumberLimit=5000, nodeLimit=25)
    end=time.time()
    log.getlog().debug("Read file took %f seconds."%(end-begin))
    pp.generateTrees()
    if readmidfile:
        try:
            modulelabelfile = open("ModuleLabel.txt", 'r')
            for l in modulelabelfile:
                row = l.split(',')
                if len(row) == 2:
                    moduleLabelDict[row[0]] = int(row[1])
            modulelabelfile.close()

            distancefile = open("Distancenew.txt", 'r')
            rownum = 0
            row = []
            for l in distancefile:
                row = [float(x) for x in l.split()]
                if len(row) > 0:
                    distance_matrix.append(row)
                    rownum += 1
            print("Distance matrix read. %d trees" % rownum)
            distancefile.close()
            # a square distance matrix must have as many columns as rows
            if rownum != len(row):
                raise Exception("Distance file format wrong!")
        except Exception as e:
            print(e)
            #readmidfile=False



    if not readmidfile:
        mc = ModuleCluster()
        feature_names, feature_matrix = mc.moduleStatistic_feature_extraction(pp.moduleStatistic)

        for epsNumber in range(100):
            eps = float((epsNumber + 1) / 100)
            labels = mc.cluster(eps)
            file_open = open("ModuleLabel_eps=" + str(eps) + ".txt", 'w')
            i = 0
            for moduleName in mc.moduleNameList:
                file_open.write("%s,%d\n" % (moduleName, labels[i]))
                i += 1
            file_open.close()
            print("eps=%f is done." % eps)
        labels = mc.cluster()
        moduleLabelDict = mc.moduleLabelDict
        file_open = open("ModuleLabel5000.txt", 'w')
        i = 0
        for moduleName in mc.moduleNameList:
            file_open.write("%s,%d\n" % (moduleName, labels[i]))
            i += 1
        file_open.close()
        ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)
        begin = time.time()
        distance_matrix = dm.compute([x for j in pp.allTrees for x in filter(ft, j)], ctkm)
        end = time.time()
        log.getlog().debug("Distance computation took %f seconds." % (end - begin))
        file_open = open("Distancenew5000.txt", 'w')
        for row in distance_matrix:
            for col in row:
                file_open.write("%.6f " % col)
            file_open.write("\n")
        file_open.close()



    ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)
    print("test begin!")
    eps_list = []
    cluster_num_list = []
    noise_num_list = []
    for epsNumber in range(100):
        eps = float((epsNumber + 1) / 100)
        tc = TreeCluster(eps=eps, min_samples=5, metric="precomputed", n_jobs=4)
        tc.Train(distance_matrix)
        eps_list.append(eps)
        cluster_num_list.append(tc.clusterNumber)
        noise_num_list.append(tc.noiseNumber)
    # V.cluster_result_linechart(eps_list, cluster_num_list, noise_num_list)
    tc = TreeCluster(eps=0.5, n_jobs=4)
    begin = time.time()
    tc.Train(distance_matrix)
    end = time.time()
    log.getlog().debug("Clustering took %f seconds." % (end - begin))
    V.cluster_result_linechart(range(2000), tc.DB.labels_, tc.DB.labels_)