Example No. 1
    def __init__(self):
        Preprocess.__init__(self)

        self.equal_regex = re.compile(r'=+[^=]+=+')
        self.pars_regex = re.compile(r'([^()]+)|\([^\(\)]+\)')

        self.chartype = Chartype()
Example No. 2
    def __init__(self):

        classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"

        # scenario 1
        # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
        # scenario 2
        # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
        # scenario 3
        # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
        ner_jar_path = "stanford/stanford-ner.jar"

        # increase the Java heap size to avoid nltk.internals memory errors
        nltk.internals.config_java(options='-Xmx5g')

        self.pre = Preprocess()
        self.scp = StanfordParser(
            './stanford/stanford-parser.jar',
            './stanford/stanford-parser-3.9.1-models.jar',
            encoding='utf8')
        self.ner_tagger = StanfordNERTagger(classifier_path1,
                                            ner_jar_path,
                                            encoding='utf8')  # for scenario 3
        self.pos_tagger = StanfordPOSTagger(
            './stanford/english-bidirectional-distsim.tagger',
            './stanford/stanford-postagger.jar',
            encoding='utf8')
        # combining classifier from Stanford with custom classifier
        # self.com_tagger = NERComboTagger(classifier_path1,ner_jar_path,stanford_ner_models=classifier_path1+","+classifier_path2) #for scenario 1 and 2
        self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
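A hedged usage sketch for the taggers configured above; it assumes the same Stanford model files exist at the listed paths and that Java is installed (the sentence is a placeholder):

from nltk.tag import StanfordNERTagger, StanfordPOSTagger

ner = StanfordNERTagger("stanford/english.muc.7class.distsim.crf.ser.gz",
                        "stanford/stanford-ner.jar", encoding='utf8')
pos = StanfordPOSTagger("./stanford/english-bidirectional-distsim.tagger",
                        "./stanford/stanford-postagger.jar", encoding='utf8')

tokens = "Barack Obama visited Jakarta last week".split()
print(ner.tag(tokens))  # list of (token, entity label) pairs
print(pos.tag(tokens))  # list of (token, POS tag) pairs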
Example No. 3
    def __init__(self, image=False, images=[], GTPath=""):
        if images:
            try:
                self.homogenized = numpy.array(Image.open(images[0]))
                self.vesselEnhanced = numpy.array(Image.open(images[1]))
                self.images = images
            except IndexError:
                print("""`images` parameter must include the homogenized image 
                    at `images[0]` and vessel enhanced image at `images[1]`""")
                raise
        else:
            self.preprocess          = Preprocess(image)
            self.homogenized    = self.preprocess.process(enhance=False).image_array
            self.vesselEnhanced = self.preprocess.process(onlyEnhance=True).image_array
            self.mask           = self.preprocess.mask
            self.source         = image
            self.image          = Image.open(image)
            self.loaded         = self.image.load()
        if len(GTPath):
            self.gt             = True
            self.groundtruth    = Image.open(GTPath)
        else:
            self.gt             = False

        self.feature_array      = numpy.empty(0)
Example No. 4
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    data = pd.read_csv(file,
                       encoding='latin-1',
                       names=['sentiment', 'id', 'date', 'q', 'nick', 'tweet'])
    data = data.sample(frac=1)
    data = data[:100000]

    data_x = []
    data_y = []
    for index in data.index:
        row = data.loc[index, :]
        if row['sentiment'] != 2:
            row['tweet'] = preprocess.preprocess(row['tweet'])
            tweet = []
            for word in row['tweet'].split():
                if word in mapping:
                    word_embedding = mapping[word]
                    tweet.append(word_embedding)
                else:
                    tweet.append(np.zeros(50))
            tweet = add_padding(tweet, 20)
            data_x.append(tweet)
            data_y.append(one_hot_encoding(row['sentiment']))
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    return data_x, data_y
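A hedged usage sketch for this loader; 'tweets.csv' is a placeholder for the Sentiment140-style CSV the column names suggest, and 'glove.6B.50d.txt' must be available to load_embeddings:

# Hypothetical invocation of the loader above.
data_x, data_y = load_data('tweets.csv')
print(data_x.shape)  # expected (n_samples, 20, 50): 20 tokens x 50-dim GloVe vectors
print(data_y.shape)  # one-hot encoded sentiment labels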
Example No. 5
def main(args):
    pp = Preprocess()

    # ---- Run Spotlight ----
    # graphs = pp.parse_tcp_dump(args.tcp_folder, args.csv_file)

    # tcp = ad.read_csv_file(args.csv_file)
    # graphs = tcp.iloc[:, [0, 1, 3]]
    # graphs.columns = ['source', 'destination',  'hours_past']
    #
    # run_spotlight(args, np.array(graphs))

    # # ---- Run Shingle Sketch -----
    # graph_utils.create_graphs(args.csv_file, args.graph_folder)
    is_gexf = False
    graphs = pp.preprocess_gfiles(args.graph_folder)

    # #--- For Muta or Chemical Data ----
    # graphs = pp.preprocess_gexf(args.gexffile)
    # is_gexf = True
    #
    # #---For DOS Attack Data ---
    # graphs = pp.preprocess_single_gfile("data/dos.g")

    run_shingle(args, graphs, is_gexf)

    ad = AnomalyDetection()
    skvector = ad.read_sketch(args.sketch_vector)
    print(skvector.shape)
    ad.anomaly_detection(skvector, args)
Example No. 6
    def __init__(self, feature_list, **kwargs):
        """create a policy object that preprocesses according to feature_list and uses
		a neural network specified by keyword arguments (see create_network())
		"""
        self.preprocessor = Preprocess(feature_list)
        kwargs["input_dim"] = self.preprocessor.output_dim
        self.model = CNNPolicy.create_network(**kwargs)
        self.forward = self._model_forward()
Example No. 7
    def test_feature_match(self, tresh, retest, img1, imgref):

        print("Test_feature_match")

        process = Preprocess("NA", "NA")
        fonte = cv2.FONT_HERSHEY_SIMPLEX

        try:
            x_detect, y_detect, score = process.feature_match(img1, imgref)
            print("Score of Feature Match: " + str(score))
        except Exception:
            score = 0
            print("feature_match raised an exception")
        finally:
            url, CustomerName, Division, SerialNumber, AssemblyNumber, TesterName, ProcessStep, Operator = get_data_to_test()
            print("Serial under test: " + str(SerialNumber))
            now = datetime.now()
            dt_string = now.strftime("%d_%m_%Y_%H%M%S")
            if SerialNumber == "":
                SerialNumber = "No_Serial" + str(dt_string)
                print(SerialNumber)

            # ----- Serial Number insertion -----

            self.Set_Serial_TestTime_List(SerialNumber)
            
            # RESULT OF TEST
            if score > int(tresh):
                cv2.putText(img1, "PASS - LABEL DETECTED", (50, 400), fonte, 3, (0, 255, 0), 3, cv2.LINE_AA)
                cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
                send_test_result("P")
                # cv2.imwrite("./logs/" + str(SerialNumber) + "_pass.jpg", img1)

            elif score < int(tresh) and score >= 0:
                cv2.putText(img1, "FAIL - NO LABEL", (50, 400), fonte, 3, (0, 0, 255), 3, cv2.LINE_AA)
                cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
                
                #send_test_result("F")

                print("retest numbers:")
                print(str(self.Count_Serial_TestTime_Occurence(SerialNumber)))
                print(str(self.Get_Retest_Times_Before_Fail()))
                
                
                if (self.Count_Serial_TestTime_Occurence(SerialNumber) > self.Get_Retest_Times_Before_Fail()):
                    send_test_result("F")
                    #send_test_result_parser(ResultMes="F",Fail_Description=str("FAIL FIRMWARE VERSION "+ str(string)))
                    cv2.putText(img1, "MES REJECTION"+ str(self.Count_Serial_TestTime_Occurence(SerialNumber)), (50, 680), fonte, 1.5, (0,0,255), 2, cv2.LINE_AA)
                else:
                    cv2.putText(img1, "RETEST NUMBER:"+ str(self.Count_Serial_TestTime_Occurence(SerialNumber)), (50, 680), fonte, 1.5, (0,0,255), 2, cv2.LINE_AA)

                #cv2.imwrite("./logs/" + str(SerialNumber) +"_fail.jpg",img1)  

            #cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125,255,255), 1, cv2.LINE_AA)
        
        return score
Example No. 8
def main(argv):
    pre = Preprocess(argv[0], argv[1])
    pre.build_vectors()
    dataset = ToxicityDataset(pre.vectors, pre.targets)
    # Without sentiment
    # gru = GRU(360).double()
    # With sentiment
    gru = GRU(373).double()
    if use_GPU:
        gru.cuda()
    training.train(gru, dataset, 2, 4, 0.1, use_gpu=use_GPU)
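A hedged entry point for the script above; the meaning of the two arguments is inferred only from their use in Preprocess(argv[0], argv[1]):

import sys

if __name__ == '__main__':
    # Both command-line arguments are forwarded verbatim to Preprocess.
    main(sys.argv[1:])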
Example No. 9
 def preprocess(self, new, vector, chi2):
     self.prepro = Preprocess(self.data_set, new)
     if new == 'True':
         if vector == 'hashing':
             self.prepro.hashVector()
         if vector == 'tfidf':
             self.prepro.tfidfVector()
         # print self.preprocess.y_train
     else:
         self.prepro.vectorize(vector)
     if chi2:
         self.prepro.chisquare()
Example No. 10
class Project:
    def __init__(self, num_rows, wrt_feature):
        self.db = DB(db='major_2')
        data = Data(self.db)
        self.data_set = data.getData(num_rows, wrt_feature)

    def preprocess(self, new, vector, chi2):
        self.prepro = Preprocess(self.data_set, new)
        if new == 'True':
            if vector == 'hashing':
                self.prepro.hashVector()
            if vector == 'tfidf':
                self.prepro.tfidfVector()
            # print self.preprocess.y_train
        else:
            self.prepro.vectorize(vector)
        if chi2:
            self.prepro.chisquare()

    def run_classifier(self, method, classifier):
        if method == 'classifier':
            self.classifier = Classifier(
                self.prepro.severity.keys(), self.prepro.X_train,
                self.prepro.y_train, self.prepro.X_test, self.prepro.y_test,
                self.prepro.train_size, self.prepro.test_size)
            self.classifier.classify(classifier)
        if method == 'pipeline':
            self.classifier = PipeLineClassifier(
                self.prepro.severity.keys(), self.prepro.train_corpus,
                self.prepro.y_train, self.prepro.X_test, self.prepro.y_test,
                self.prepro.train_size, self.prepro.test_size)
            self.classifier.setVariables(classifier)
            self.classifier.benchmark()
Example No. 11
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    file = open(file, "r")
    data_x = []
    data_y = []
    sentence = []
    categories = []
    for line in file:
        if len(line.split()) == 0:
            sentence, categories = add_padding(sentence, categories, 20)
            data_x.append(sentence)
            data_y.append(categories)
            sentence = []
            categories = []
        else:
            if (line.split()[0]).lower() in mapping:
                word_embedding = mapping[(line.split()[0]).lower()]
                word_category = one_hot_encoding(line.split()[2])
                sentence.append(word_embedding)
                categories.append(word_category)
            else:
                sentence.append(np.zeros(50))
                categories.append(one_hot_encoding(line.split()[2]))
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    return data_x, data_y
Example No. 12
    def queryProcess(self):
        preprocess = Preprocess()
        self.query = self.query.lower()
        self.query = preprocess.preprocess(self.query)

        tokenizer = RegexpTokenizer(r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w-]+|'.*'")
        self.query_tokens = tokenizer.tokenize(self.query)

        if self.query_tokens[0] in wh_qstn_words:
            self.query_type = 1
        elif self.query_tokens[0] in ab_qstn_words:
            self.query_type = 2
        elif self.query_tokens[0] in desc_qstn_words:
            self.query_type = 3
        else:
            self.query_type = 4
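The question-word lists referenced above are defined elsewhere; a purely illustrative guess at their contents (hypothetical, not taken from the original source):

# Hypothetical module-level lists assumed by queryProcess.
wh_qstn_words = ['who', 'what', 'when', 'where', 'which', 'whom', 'whose', 'why', 'how']
ab_qstn_words = ['is', 'are', 'was', 'were', 'do', 'does', 'did', 'can', 'could', 'will']
desc_qstn_words = ['describe', 'define', 'explain']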
Example No. 13
   def parse(self, response):
      '''This method is called repeatedly to process documents from the URL frontier.

      Scrapy handles compliance with politeness policies.
      '''

      url = response.request.url

      # Remove html tags from the document
      raw_text = GetText(response.body)

      # Preprocess the document's content
      tokens = Preprocess(raw_text)

      # Add document to be stored in local storage
      if self.count < LIMIT:
         self.dstore.add_document(tokens, response.body, url)

      # Extract url references and add them to the url frontier
      for a in response.css('a'):
         if 'href' in a.attrib:
            yield response.follow(a, callback=self.parse)

      # Limit of pages to crawl
      if self.count > LIMIT:
         raise CloseSpider(reason='reached_limit')    # Force spider to close

      print(str(self.count) + '\n\n')     # IGNORE/COMMENT THIS
      
      self.count += 1
Example No. 14
def lambda_handler(event, context):
    # TODO implement

    json_data = json.loads(event['body'])
    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)

    pose_objects = preprocess.new_pose_objects

    features = []

    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    # pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    s3 = boto3.resource('s3')

    svm_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("SVM_model.pkl").get()
        ['Body'].read())

    logreg_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LogReg_model.pkl").get()
        ['Body'].read())

    lda_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LDA_model.pkl").get()
        ['Body'].read())

    random_forest_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("RForest_model.pkl").get()
        ['Body'].read())

    prediction_rf = random_forest_classifier.predict(features)
    prediction_svm = svm_classifier.predict(features)
    prediction_lda = lda_classifier.predict(features)
    prediction_logreg = logreg_classifier.predict(features)

    data = {
        "1": prediction_svm[0],
        "2": prediction_logreg[0],
        "3": prediction_lda[0],
        "4": prediction_rf[0]
    }
    return {'statusCode': 200, 'body': json.dumps(data)}
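A hedged local test of the handler above; 'pose.json' is a placeholder file with the keypoint structure Preprocess expects, and valid AWS credentials plus the pickled models in the gesture-recognition bucket are required:

if __name__ == '__main__':
    # Hypothetical local invocation that mimics an API Gateway event.
    with open('pose.json') as f:
        event = {'body': f.read()}
    response = lambda_handler(event, context=None)
    print(response['statusCode'], response['body'])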
Example No. 15
	def __init__(self, feature_list, **kwargs):
		"""create a policy object that preprocesses according to feature_list and uses
		a neural network specified by keyword arguments (see create_network())
		"""
		self.preprocessor = Preprocess(feature_list)
		kwargs["input_dim"] = self.preprocessor.output_dim
		self.model = CNNPolicy.create_network(**kwargs)
		self.forward = self._model_forward()
Example No. 16
def pipeline(imgpath):
    img = io.imread(imgpath)
    try:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        print("Image already in Grayscale")
        gray = img
    processed = Preprocess.pre_process_image(gray)
    corners = Preprocess.find_corners_of_largest_polygon(processed)
    cropped = Preprocess.crop_and_warp(img, corners)
    resized = Preprocess.resize(cropped)
    inverted = Preprocess.invert(resized)
    #cv2.imshow('Inverted', inverted)

    # Press q on keyboard to  exit
    #cv2.waitKey(25) & 0xFF == ord('q')

    cells = Preprocess.boxes(inverted)

    new_cells = []
    for cell in cells:
        new_cell = Preprocess.process_cells(cell)
        new_cells.append(new_cell)

    return inverted, new_cells
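Hedged usage of the pipeline above; 'puzzle.jpg' is a placeholder image path:

# Hypothetical call to the preprocessing pipeline.
inverted, cells = pipeline('puzzle.jpg')
print(inverted.shape, len(cells))  # warped, inverted grid image and its cell crops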
Example No. 17
def algorithm(df, params):
    """
    wrapper function to put each individual algorithm inside
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """

    output = {}

    # algorithm specific code
    # construct sentiment analysis
    PP = Preprocess(df, params['column'])

    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot
    index = []
    counts = []
    for common in processed_most_common[1:51]:
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
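A hedged call to algorithm() with a toy DataFrame; the accepted values for 'process' and 'tagger' are assumptions, not documented in this excerpt:

import pandas as pd

# Toy input; the column name and parameter values are illustrative only.
df = pd.DataFrame({'text': ['I love this product', 'This is the worst service ever']})
params = {'column': 'text', 'process': 'lemmatization', 'tagger': 'nltk'}
result = algorithm(df, params)
print(result.keys())  # phrases, filtered, processed, tagged, most_common, div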
Example No. 18
class CombineNews(object):
    def __init__(self):
        self.pre = Preprocess()

    def cleansingText(self, text):
        text = self.pre.eliminatePunctuation(text)
        return self.pre.normalizePunctuation(text)

    def combineToCsvFromFolder(self):
        path_to_json = "datasets/"

        data = pd.DataFrame()

        for filename in os.listdir(path_to_json):
            if filename.endswith('.json'):
                with open(os.path.join(path_to_json, filename)) as json_data:
                    json_result = json.load(json_data)
                    for key, value in json_result['fiveWoneH'].items():  # .iteritems() is Python 2 only
                        json_result[key] = value
                    del json_result['fiveWoneH']
                    data = data.append(json_result, ignore_index=True)
        data['text'] = data['text'].apply(lambda x: self.cleansingText(x))
        data['title'] = data['title'].apply(lambda x: self.cleansingText(x))
        data = data.dropna(axis=1, how="any")
        data.to_csv('golden_data.csv', sep=';', index=False, encoding='utf-8')

    def combineToCsvFromFile(self):
        filename = "news_crawler/page_contents.json"

        data = pd.DataFrame()

        with open(filename) as json_data:
            json_result = json.load(json_data)
            temp = pd.DataFrame()
            temp = data.append(json_result, ignore_index=True)
            temp['body'] = temp['body'].apply(lambda x: self.cleansingText(x))
            temp['title'] = temp['title'].apply(
                lambda x: self.cleansingText(x))
            temp = temp.dropna(axis=0, how="any")
            data = temp

        data.to_csv('test.csv', sep=';', index=False, encoding='utf-8')
Example No. 19
 def preprocess(self, new, vector, chi2):
     self.prepro = Preprocess(self.data_set, new)
     if new == 'True':
         if vector == 'hashing':
             self.prepro.hashVector()
         if vector == 'tfidf':
             self.prepro.tfidfVector()
         # print self.preprocess.y_train
     else:
         self.prepro.vectorize(vector)
     if chi2:
         self.prepro.chisquare()
Example No. 20
 def define_model(self, name):
     if self.is_trained == False:
         if name == 'preprocInc':
             #self.mod = MultinomialNB()
             self.mod = Pipeline([('what', Preprocess()),
                                  ('a pain',
                                   MultinomialNB(alpha=0.05,
                                                 fit_prior=False,
                                                 class_prior=None))])
         else:
             print(
                 'Error selecting the model; falling back to MultinomialNB')
             self.mod = MultinomialNB()
     else:
         print("Model already load")
Example No. 21
class Project:
    def __init__(self, num_rows, wrt_feature):
        self.db = DB(db='major_2')
        data = Data(self.db)
        self.data_set = data.getData(num_rows, wrt_feature)

    def preprocess(self, new, vector, chi2):
        self.prepro = Preprocess(self.data_set, new)
        if new == 'True':
            if vector == 'hashing':
                self.prepro.hashVector()
            if vector == 'tfidf':
                self.prepro.tfidfVector()
            # print self.preprocess.y_train
        else:
            self.prepro.vectorize(vector)
        if chi2:
            self.prepro.chisquare()

    def run_classifier(self, method, classifier):
        if method == 'classifier':
            self.classifier = Classifier(
                self.prepro.severity.keys(), self.prepro.X_train,
                self.prepro.y_train, self.prepro.X_test,
                self.prepro.y_test, self.prepro.train_size,
                self.prepro.test_size)
            self.classifier.classify(classifier)
        if method == 'pipeline':
            self.classifier = PipeLineClassifier(
                self.prepro.severity.keys(), self.prepro.train_corpus,
                self.prepro.y_train, self.prepro.X_test,
                self.prepro.y_test, self.prepro.train_size,
                self.prepro.test_size
            )
            self.classifier.setVariables(classifier)
            self.classifier.benchmark()
Example No. 22
def get_data():
    files = os.listdir('./MealNoMealData')
    meal_data_files = []
    no_meal_data_files = []
    for file in files:
        if 'Nomeal' in file:
            no_meal_data_files.append(os.path.join('./MealNoMealData', file))
        else:
            meal_data_files.append(os.path.join('./MealNoMealData', file))

    data = []

    labels = []
    for meal_data_file, no_meal_data_file in zip(meal_data_files,
                                                 no_meal_data_files):

        preprocess_obj = Preprocess(meal_data_file)
        meal_df = preprocess_obj.get_dataframe()
        meal_features = Features(meal_df)
        meal_features.compute_features()
        # temp_meal_features = meal_features.pca_decomposition().tolist()
        temp_meal_features = meal_features.get_features()
        labels += [1] * len(temp_meal_features)

        preprocess_obj_ = Preprocess(no_meal_data_file)
        no_meal_df = preprocess_obj_.get_dataframe()
        no_meal_features = Features(no_meal_df)
        no_meal_features.compute_features()
        no_meal_features_ = no_meal_features.get_features()
        # no_meal_final_features = meal_features.pca.transform(no_meal_features_).tolist()
        no_meal_final_features = no_meal_features_
        labels += [0] * len(no_meal_features_)

        for no_meal_feature in no_meal_final_features:
            temp_meal_features.append(no_meal_feature)

        for meal_no_meal_feature in temp_meal_features:
            data.append(meal_no_meal_feature)

    return data, labels
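A hedged sketch of consuming get_data() with scikit-learn; the classifier choice and split are assumptions, not part of the original:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

data, labels = get_data()
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))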
Example No. 23
	rht = Removing HTML tags
	rurls = Removing URLs
	rn = Removing numbers
	ntw = Converting numbers to words
	sc = Spelling correction
	ata = Converting accented characters to ASCII
	sto = short_to_original
	ec = Expanding contractions
	ps = Stemming (Porter stemmer)
	l = Lemmatization
	re = Removing emojis
	ret = Removing emoticons
	ew = Converting emojis to words
	etw = Converting emoticons to words
	rp = Removing punctuation
	rs = Removing stopwords
	rfw = Removing frequent words
	rrw = Removing rare words
	rsc = Removing single characters
	res = Removing extra spaces
"""
print(f"******** Before preprocessing technique ******* ")
for sent in sentences[:5]:
    print(sent)
preprocessing = Preprocess()

preprocessed_text = preprocessing.preprocessing(sentences, techniques)
print(f"******** After preprocessing ****************")
for sent in preprocessed_text[:5]:
    print(sent)
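The techniques argument is not shown in this excerpt; a hypothetical value built from the codes in the legend above might look like this (whether a list or some other container is expected is an assumption):

# Hypothetical selection of technique codes from the legend above:
# strip HTML tags, URLs, numbers, punctuation and stopwords, then lemmatize
# and remove extra spaces.
techniques = ['rht', 'rurls', 'rn', 'rp', 'rs', 'l', 'res']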
Example No. 24
class FeatureExtraction:
    """
        @param image {string} Source of the raw image
        @param images {list} Sources of preprocessed images, where images[0] is 
        the homogenized image and images[1] is the vessel-enhanced image.
        @param GTPath {string} Whether the image has a ground truth. If it does, 
        GTPath is the path to the ground-truth image with the correct labels.
    """
    def __init__(self, image=False, images=[], GTPath=""):
        if images:
            try:
                self.homogenized = numpy.array(Image.open(images[0]))
                self.vesselEnhanced = numpy.array(Image.open(images[1]))
                self.images = images
            except IndexError:
                print("""`images` parameter must include the homogenized image 
                    at `images[0]` and vessel enhanced image at `images[1]`""")
                raise
        else:
            self.preprocess          = Preprocess(image)
            self.homogenized    = self.preprocess.process(enhance=False).image_array
            self.vesselEnhanced = self.preprocess.process(onlyEnhance=True).image_array
            self.mask           = self.preprocess.mask
            self.source         = image
            self.image          = Image.open(image)
            self.loaded         = self.image.load()
        if len(GTPath):
            self.gt             = True
            self.groundtruth    = Image.open(GTPath)
        else:
            self.gt             = False

        self.feature_array      = numpy.empty(0)

    def __getHomogenized(self, forceNew=False):
        raise NotImplementedError
    """
        `exportCSV` exports `self.feature_array` to `filename` unless the `array` 
        parameter is set. If `balanced`, the exported features will have an 
        equal amount of class 0 and class 1. 
        The parameter `delim` can be used to change the separator from commas to
        some other character.

        @method exportCSV
        @param filename {string} Name of the file, including its path, that the 
            features will be exported to
        @param array {numpy array} The feature array to export
        @param delim {string} The delimiter 
        @default ","
        @param balanced {bool} Whether to export the full feature array or a 
            balanced version with equal class representation
        @default False
    """
    def exportCSV(
            self, 
            filename="", 
            array=numpy.empty(0), 
            delim=",", 
            balanced=False
        ):

        if not array.any():
            array = self.feature_array
        if balanced:
            zeros   = array[numpy.less(array[:,0], 1)] 
            ones    = array[numpy.greater(array[:,0], 0)]
            if len(zeros) > len(ones):
                indices = numpy.random.choice(
                    len(zeros), 
                    size=len(ones), 
                    replace=False
                )
                array = numpy.concatenate(
                    (ones, zeros[indices]), 
                    axis=0
                )
            if len(ones) > len(zeros):
                indices = numpy.random.choice(
                    len(ones), 
                    size=len(zeros), 
                    replace=False
                )
                array = numpy.concatenate(
                    (zeros, ones[indices]), 
                    axis=0
                )
        if not len(filename):
            if hasattr(self, "source"):
                filename = "extracted_" + self.source
            else:
                filename = "extracted_" + self.images[1]
        if self.gt:
            formatting = ['%d', '%.0f', '%.0f', '%f', '%f', '%.0f', '%f', '%f']
            header = """label,\tfeat. 1,\tfeat. 2,\tfeat. 3,\tfeat. 4,\tfeat. 5,
                     \tHu mom. 1,\tHu mom. 2"""
        else:
            formatting = ['%.0f', '%.0f', '%f', '%f', '%.0f', '%f', '%f']
            header = """feat. 1,\tfeat. 2,\tfeat. 3,\tfeat. 4,\tfeat. 5,
                     \tHu mom. 1,\tHu mom. 2"""
        numpy.savetxt(
            filename,               
            array,                  
            fmt=formatting,         # formatting
            delimiter=',\t',        # column delimiter
            newline='\n',           # new line character
            footer='end of file',   # file footer
            comments='# ',          # character to use for comments
            header=header)          # file header
    """
        `normalize` is used to normalize the feature_array. If comp_only 
        (compute only) is set to `True` then only `self.std_vector` and 
        `self.mean_vector` will be set but the value of `self.feature_array` 
        will not be set. This can be useful if computing an accumulated mean and 
        standard deviation and then using the `mean` and `std` parameter later 
        to normalize with the accumulated mean and average standard deviation 
        vectors. 

        @method normalize 
        @param array {numpy array} The feature array if not set then 
            `self.feature_array`.
        @param mean {numpy array} The mean to use in the normalization. If not 
            set then it will be computed over the inside FOV pixels of the 
            `array` using the `self.mask`. 
        @param std {numpy array} The standard deviation to be used in 
            normalization. 
        @param comp_only {bool} If true, the mean, sample variance and standard 
            deviation will be computed and saved to `self.var_vector`, 
            `self.std_vector` and `self.mean_vector` respectively, but they won't
            be used to normalize the feature array. 
        @default False
    """
    def normalize(
            self, 
            array=numpy.empty(0), 
            mean=numpy.empty(0), 
            std=numpy.empty(0), 
            comp_only=False
        ):
        if not array.any():
            array = self.feature_array
        # preserve label column
        # compute mean and std excluding out of FOV pixels
        indices = numpy.greater(self.mask.flatten(), 0)
        FOV = array[indices]

        # Since mean should only be computed on the training set
        # the assumption of ignoring the first column is made, since
        # this is the label column.
        if not mean.any():
            mean    = FOV.mean(axis=0)[1:]
        if not std.any():
            std     = FOV.std(axis=0)[1:]
            var     = FOV.var(axis=0)[1:]
        if comp_only:
            self.var_vector     = var
            self.std_vector     = std
            self.mean_vector    = mean
        else:
            if self.gt:
                labels = array[:,0]
                array[:,1:] = (array[:,1:] - mean) / std
            else:
                array = (array - mean) / std
            if self.gt:
                array[:,0] = labels
                # since there is a groundtruth, the first column
                # will be the label column; the rest are the actual features.
            self.feature_array = array
        return self

    def computeFeatures(self, forceNew=False):
        if forceNew:
            return self._extract()

        elif self.feature_array.any():
            return self
        else:
            return self._extract()

    """
        `_extract` is responsible for extracting the feature array for every 
        pixel in the preprocessed image. If the optional parameters 
        `homogenized_array` and `ve_array` are not provided, then 
        `self.homogenized` and `self.vesselEnhanced` are used instead. 

        @method _extract 
        @param homogenized_array {numpy array} The homogenized image from 
            preprocessing
        @param ve_array {numpy array} The vessel enhanced image from 
            preprocessing
    """
    def _extract(
            self, 
            homogenized_array=numpy.empty(0), 
            ve_array=numpy.empty(0)
        ):
        if not homogenized_array.any():
            homogenized_array = self.homogenized
        if not ve_array.any():
            ve_array = self.vesselEnhanced
        # erode image using an eroded mask 
        mask = binary_erosion(self.mask, square(10)) 
        homogenized_array = homogenized_array * mask
        # # # # # # # # # # # # # # # # # # # # #
        print("Extracting features ", end=" ")
        print("\t\t[", end="")
        self.feature_array = []
        for x in range(len(homogenized_array)):
            for y in range(len(homogenized_array[0])):
                if self.mask[x,y] or True: # disabled for now
                    #########################################
                    xstart  = x - 8 if x-8 >= 0 else 0
                    ystart  = y - 8 if y-8 >= 0 else 0

                    xend    = x + 8 if x+8 < len(ve_array) else len(ve_array) -1
                    yend    = y + 8 if y+8 < len(ve_array[0]) else len(ve_array[0]) -1
                    # 1 is added to the right and bottom boundary because
                    # Python slicing excludes the upper bound
                    xend += 1
                    yend += 1
                    
                    subarea = ve_array[xstart:xend, ystart:yend]

                    if subarea.max() != 0:
                    

                        Hu0, Hu1 = self.__moments(subarea)

                        ########################################
                        xstart  = x-4 if x-4 >= 0 else 0
                        ystart  = y-4 if y-4 >= 0 else 0

                        xend    = (x+4 
                            if x+4 < len(homogenized_array) 
                            else len(homogenized_array) -1)
                        yend    = (y+4 
                            if y+4 < len(homogenized_array[0]) 
                            else len(homogenized_array[0]) -1)
                        # 1 is added to the right and bottom boundary because
                        # Python slicing excludes the upper bound
                        xend += 1
                        yend += 1

                        subarea = homogenized_array[xstart:xend, ystart:yend]
                        FOV     = numpy.greater(subarea, 0)
                        subarea = (subarea[FOV] 
                            if FOV.any() and homogenized_array[x,y] > 0 
                            else numpy.array([0]))
                        # equation 5 from Marin et al.
                        f1      = homogenized_array[x,y] - subarea.min()
                        # equation 6 from Marin et al.
                        f2      = subarea.max() - homogenized_array[x,y]
                        # equation 7 from Marin et al.
                        f3      = homogenized_array[x,y] - subarea.mean()
                        # equation 8 from Marin et al.
                        f4      = subarea.std()
                        # equation 9 from Marin et al. 
                        # inverting the background, so setting zero to 255
                        f5      = homogenized_array[x,y]
                        ########################################

                        if self.gt:
                            # values in groundtruth are either 255 or 0
                            gtval = self.groundtruth.getpixel((x,y))
                            label = gtval if gtval == 0 else 1
                            features = [label, f1, f2, f3, f4, f5, Hu0, Hu1]
                        else:
                            features = [f1, f2, f3, f4, f5, Hu0, Hu1]

                    elif not self.gt:
                        features = [0, 0, 0.0, 0.0, 0, 0.0, 0.0]

                    else:
                        # values in groundtruth are either 255 or 0
                        gtval = self.groundtruth.getpixel((x,y))
                        label = gtval if gtval == 0 else 1
                        features = [label, 0, 0, 0.0, 0.0, 0, 0.0, 0.0]

                    self.feature_array.append(features)
            if x % (len(homogenized_array) * 0.05) < 1: 
                print("#", end="")

        self.feature_array = numpy.array(self.feature_array)
        print("]")
        return self

    """
        `__moments` computes the first two Hu moment over some array given by 
        the parameter `subarray`. 

        @private
        @method __moments
        @param subarray {numpy array} The area which the Hu moments are computed 
            over. 
    """
    def __moments(self, subarray):
        """
            I_HU(x,y) = subarray(x,y) * gaussian_matrix(x,y)

            returns absolute value of the log of the first two Hu moments
        """
        I_HU = self.__gausMatrix(subarray)
        h1, h2 = cv2.HuMoments(cv2.moments(I_HU))[0:2]
        h1 = numpy.log(h1) if h1 != 0 else h1
        h2 = numpy.log(h2) if h2 != 0 else h2
        return numpy.absolute( [h1[0], h2[0]] )

    def __gausMatrix(self, array, mu=0.0, sigma=1.7):
        # weight the window with a Gaussian before computing moments (mu is unused)
        return scipy.ndimage.filters.gaussian_filter(array, sigma)
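A hedged end-to-end sketch of the class above; the file names are placeholders:

# Hypothetical usage; 'retina.png' and 'retina_gt.png' are placeholder paths.
fe = FeatureExtraction(image='retina.png', GTPath='retina_gt.png')
fe.computeFeatures()                                  # fills fe.feature_array pixel by pixel
fe.normalize()                                        # z-score the features inside the FOV mask
fe.exportCSV(filename='features.csv', balanced=True)  # balanced class-0/class-1 export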
Example No. 25
class CNNPolicy(object):
    """uses a convolutional neural network to evaluate the state of the game
	and compute a probability distribution over the next action
	"""
    def __init__(self, feature_list, **kwargs):
        """create a policy object that preprocesses according to feature_list and uses
		a neural network specified by keyword arguments (see create_network())
		"""
        self.preprocessor = Preprocess(feature_list)
        kwargs["input_dim"] = self.preprocessor.output_dim
        self.model = CNNPolicy.create_network(**kwargs)
        self.forward = self._model_forward()

    def _model_forward(self):
        """Construct a function using the current keras backend that, when given a batch
		of inputs, simply processes them forward and returns the output

		This is as opposed to model.compile(), which takes a loss function
		and training method.

		c.f. https://github.com/fchollet/keras/issues/1426
		"""
        model_input = self.model.get_input(train=False)
        model_output = self.model.get_output(train=False)
        forward_function = K.function([model_input], [model_output])

        # the forward_function returns a list of tensors
        # the first [0] gets the front tensor.
        # this tensor, however, has dimensions (1, width, height)
        # and we just want (width,height) hence the second [0]
        return lambda inpt: forward_function(inpt)[0][0]

    def batch_eval_state(self, state_gen, batch=16):
        """Given a stream of states in state_gen, evaluates them in batches
		to make best use of GPU resources.

		Returns: TBD (stream of results? that would break zip(). 
			streaming pairs of pre-zipped (state, result)?)
		"""
        raise NotImplementedError()

    def eval_state(self, state):
        """Given a GameState object, returns a list of (action, probability) pairs
		according to the network outputs
		"""
        tensor = self.preprocessor.state_to_tensor(state)

        # run the tensor through the network
        network_output = self.forward([tensor])

        # get network activations at legal move locations
        # note: may not be a proper distribution by ignoring illegal moves
        return [((x, y), network_output[x, y])
                for (x, y) in state.get_legal_moves()]

    @staticmethod
    def create_network(**kwargs):
        """construct a convolutional neural network.

		Keyword Arguments:
		- input_dim:         depth of features to be processed by first layer (no default)
		- board:             width of the go board to be processed (default 19)
		- filters_per_layer: number of filters used on every layer (default 128)
		- layers:            number of convolutional steps (default 12)
		- filter_width_K:    (where K is between 1 and <layers>) width of filter on 
							 layer K (default 3 except 1st layer which defaults to 5).
							 Must be odd.
		"""
        defaults = {
            "board": 19,
            "filters_per_layer": 128,
            "layers": 12,
            "filter_width_1": 5
        }
        # copy defaults, but override with anything in kwargs
        params = defaults
        params.update(kwargs)

        # create the network:
        # a series of zero-paddings followed by convolutions
        # such that the output dimensions are also board x board
        network = Sequential()

        # create first layer
        network.add(
            convolutional.Convolution2D(input_shape=(params["input_dim"],
                                                     params["board"],
                                                     params["board"]),
                                        nb_filter=params["filters_per_layer"],
                                        nb_row=params["filter_width_1"],
                                        nb_col=params["filter_width_1"],
                                        init='uniform',
                                        activation='relu',
                                        border_mode='same'))

        # create all other layers
        for i in range(2, params["layers"] + 1):
            # use filter_width_K if it is there, otherwise use 3
            filter_key = "filter_width_%d" % i
            filter_width = params.get(filter_key, 3)
            network.add(
                convolutional.Convolution2D(
                    nb_filter=params["filters_per_layer"],
                    nb_row=filter_width,
                    nb_col=filter_width,
                    init='uniform',
                    activation='relu',
                    border_mode='same'))

        # the last layer maps each <filters_per_layer> feature to a number
        network.add(
            convolutional.Convolution2D(nb_filter=1,
                                        nb_row=1,
                                        nb_col=1,
                                        init='uniform',
                                        border_mode='same'))
        # reshape output to be board x board
        network.add(Reshape((params["board"], params["board"])))
        # softmax makes it into a probability distribution
        network.add(Activation('softmax'))

        return network

    def load_model(self, json_file):
        """load the architecture specified in json_file into 'self'
		"""
        raise NotImplementedError()

    def save_model(self, json_file):
        """write the network model and preprocessing features to the specified file
		"""
        raise NotImplementedError()

    def load_params(self, h5_file):
        """load model parameters (weights) in the specified file
		"""
        raise NotImplementedError()

    def save_params(self, h5_file):
        """save model parameters (weights) to the specified file
		"""
        raise NotImplementedError()
Example No. 26
from preprocessing import Preprocess
from classification import Classification
import pickle
from features import Features
import json
import os

preprocess = Preprocess()
preprocess.scale_points()

pose_objects = preprocess.new_pose_objects

features = []

features_obj = Features(pose_objects=pose_objects)
features_obj.compute_features()
# reduced_feature_matrix = features_obj.compute_pca()

# print(reduced_feature_matrix)
# print(len(reduced_feature_matrix),len(reduced_feature_matrix[0]))

# X = reduced_feature_matrix
X = features_obj.get_features()
Y = [obj.label for obj in pose_objects]

print(len(X), len(Y))
clf_rforest = Classification('RForest', X, Y)
clf_rforest.get_classifier_object()
clf_rforest.get_metrics()
pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
print()
Example No. 27
def main():
    
    opt = parse()
    model_path = "RESULT/"+ opt.save + "/model"
    vocab_path = "RESULT/" + opt.save + "/vocab"
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(vocab_path, exist_ok=True)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    device = torch.device("cuda:0")

    opt.log = "RESULT/" + opt.save + "/log"
    opt.save_model = model_path

    # write the settings
    with open(opt.log, "a") as f:
        f.write("-----setting-----\n")
        f.write("MAX ITERATION : %d \
                \nCHECK INTERVAL : %d \
                \nBATCH SIZE : %d \
                \nACCUMULATION STEPS : %d \
                \nWORD CUT : %d \
                \nD_MODEL : %d \
                \nN_LAYERS : %d \
                \nN_HEAD : %d \
                \nDROPOUT : %.1f \
                \nMODE : %s \
                \nSAVE_MODEL : %s \
                \nLOG_PATH : %s \
                \nGPU NAME: %s \
                \nGPU NUM %s \
                \nDATASET : \n%s\n%s\n%s\n%s\n%s\n%s" \
                    %(opt.max_steps, \
                    opt.check_interval, \
                    opt.batch_size, \
                    opt.accumulation_steps, \
                    opt.word_cut, \
                    opt.d_model, \
                    opt.n_layers, \
                    opt.n_head, \
                    opt.dropout, \
                    opt.mode, \
                    opt.save, \
                    opt.log, \
                    torch.cuda.get_device_name(), \
                    opt.gpu, \
                    opt.train_src, \
                    opt.train_trg, \
                    opt.valid_src, \
                    opt.valid_trg, \
                    opt.test_src, \
                    opt.test_trg))

    #gradient accumulation
    opt.batch_size = int(opt.batch_size/opt.accumulation_steps)
    opt.batch_max_token = int(opt.batch_max_token/opt.accumulation_steps)
    opt.check_interval = int(opt.check_interval * opt.accumulation_steps)
    opt.max_steps = int(opt.max_steps * opt.accumulation_steps)

    # preprocessing
    source_vocab_path = "RESULT/" + opt.save + "/vocab/source_vocab"
    target_vocab_path = "RESULT/" + opt.save + "/vocab/target_vocab"

    SRC = Preprocess()
    TRG = Preprocess()

    train_source, valid_source, test_source = \
        SRC.load(train=opt.train_src,
                valid=opt.valid_src, 
                test = opt.test_src, 
                mode=1, 
                vocab_file=source_vocab_path)
    
    train_target, valid_target, test_target = \
        TRG.load(train=opt.train_trg,
                valid=opt.valid_trg, 
                test = opt.test_trg, 
                mode=1, 
                vocab_file=target_vocab_path)

    #SrcDict = SRC.reverse_dict
    TrgDict = TRG.reverse_dict
    src_size = len(SRC.dict)
    trg_size = len(TRG.dict)
    pad_idx = SRC.dict["<pad>"]
    trg_sos_idx = TRG.dict["<sos>"]
    trg_eos_idx = TRG.dict["<eos>"]

    # create batch samplers based on the number of sentences
    train_batch_sampler = create_sentence_batch_sampler(train_source, train_target, opt.batch_size)
    valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)

    # create batch samplers based on the number of tokens
    #train_batch_sampler = create_token_batch_sampler(train_source, train_target, opt.batch_max_token)
    #valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)
    
    #create dataset and dataloader
    train_data_set = MyDataset(train_source, train_target)
    valid_data_set = MyDataset(valid_source, valid_target)
    valid_data_loader = DataLoader(valid_data_set, batch_sampler=valid_batch_sampler, collate_fn=valid_data_set.collater)
    test_data_set = MyDataset(test_source, test_target)
    test_data_loader = DataLoader(test_data_set, batch_size=1, collate_fn=test_data_set.collater, shuffle=False)

    #train
    if opt.mode == "full" or opt.mode == "train":
        model = Transformer(src_size, trg_size, opt.d_model, opt.n_layers, opt.n_head, opt.dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9)
        scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt.level)

        trainer = Trainer(
            model = model,
            optimizer = optimizer,
            train_data_set = train_data_set,
            train_batch_sampler = train_batch_sampler,
            valid_data_loader = valid_data_loader,
            lr_scheduler = scheduler,
            device = device,
            TrgDict = TrgDict,
            pad_idx = pad_idx
            )
            
        trainer.train(opt.epoch, opt)

    #test
    if opt.mode == "full" or opt.mode == "test":
        load_point = opt.max_steps//opt.check_interval
        model = average_model(load_point, opt, src_size, trg_size, device)

        torch.cuda.empty_cache()
        beam_size = 4
        max_seq_len = 410
        translator = Translator(
            model = model,
            test_data_loader = test_data_loader,
            TrgDict = TrgDict,
            device = device,
            beam_size =  beam_size,
            max_seq_len = max_seq_len,
            src_pad_idx = pad_idx,
            trg_pad_idx = pad_idx,
            trg_bos_idx = trg_sos_idx,
            trg_eos_idx = trg_eos_idx)
        
        translator.test(opt.save)
Example No. 28
from preprocessing import Preprocess
from activity import Activity
from threshold import Threshold
import pandas as pd
import os

# Read raw data from file
raw_data_frame = Preprocess("raw_data/girlbosskaty_tweets.csv", header=0)

# Select the Time and username column from raw data
data_time_uid = raw_data_frame.get_columns(["Screen_Name", "Time"])

# print(data_time_uid)

# Calculate Activity
act = Activity(data_time_uid)
dic_act = act.export_times()

# print(dic_act)

myThresh = Threshold(dic_act)
# print(myThresh.apply_clock_threshold(start_time="01:00:00", stop_time="05:00:00"),
#       "tweets between %s and %s" % (myThresh.start_time, myThresh.stop_time))
# print(myThresh.ckeck_day_tweets())

WeekDay_counter = myThresh.ckeck_day_tweets()
night_tweet_counter = myThresh.apply_clock_threshold(start_time="01:00:00",
                                                     stop_time="05:00:00")

print(WeekDay_counter)
Example No. 29
import torch
from preprocessing import Preprocess
import json
from dataset import QADataset
from transformers import BertModel
import os
from model import Answer
from solver import Solver
import sys

arg = sys.argv
ctx_max_len = 475
question_max = 30
pre = Preprocess(ctx_max_len=ctx_max_len, question_max=question_max)
data = {}
if not os.path.isdir('processed_data'):
    os.mkdir('processed_data')
if not os.path.isdir('ckpt'):
    os.mkdir('ckpt')
#
if arg[1] == 'train':
    for name in ['dev', 'train']:

        if not os.path.isfile(f'processed_data/{name}.pkl'):
            print(f"Start {name}......")
            with open(f"data/{name}.json") as f:
                file = json.load(f)
                file = [data for data in file['data']]

            pre_data = pre.preprocess_data(file,
                                           train=not (name == 'test'),
Example No. 30
#
# print(len(X), len(Y))
# clf_rforest = Classification('RForest', X, Y)
# clf_rforest.get_classifier_object()
# clf_rforest.get_metrics()
# pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
# print()
files = os.listdir('./data/gift')
for file in files:
    file_path = os.path.join('./data/gift', file)
    with open(file_path, encoding="utf-8") as data:
        json_data = json.load(data)

    # print(json_data)

    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)

    pose_objects = preprocess.new_pose_objects

    features = []

    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    random_forest_classifier = pickle.load(open('RForest_model.pkl', 'rb'))

    # prediction = random_forest_classifier.predict(reduced_feature_matrix)
Example No. 31
parser.add_argument('--chd_hcmp', type=str, default='chd', help='chd or hcmp')
parser.add_argument('--epochs',
                    type=int,
                    default=1000,
                    help='Number of epochs for training')
parser.add_argument('--init_learning_rate',
                    type=float,
                    default=0.02,
                    help='Initial learning rate')

FLAGS, unparsed = parser.parse_known_args()
if FLAGS.chd_hcmp != "chd" and FLAGS.chd_hcmp != 'hcmp':
    raise NotImplementedError('choose "chd" or "hcmp" model, got {}'.format(
        FLAGS.chd_hcmp))

process = Preprocess()

print('data directory :', FLAGS.data_dir)
print('checkpoint directory :', FLAGS.checkpoint_dir)

train_data_dir = FLAGS.data_dir
image_filelist = []
for file in os.listdir(train_data_dir + '/image/'):
    if FLAGS.image_filename in file:
        image_filelist.append(os.path.join(train_data_dir, 'image', file))
print(image_filelist)

num_labels = 1
num_channels = 1
input_shape = (None, None, None, num_channels)
output_shape = (None, None, None, num_labels)
Example No. 32
class CNNPolicy(object):
	"""uses a convolutional neural network to evaluate the state of the game
	and compute a probability distribution over the next action
	"""

	def __init__(self, feature_list, **kwargs):
		"""create a policy object that preprocesses according to feature_list and uses
		a neural network specified by keyword arguments (see create_network())
		"""
		self.preprocessor = Preprocess(feature_list)
		kwargs["input_dim"] = self.preprocessor.output_dim
		self.model = CNNPolicy.create_network(**kwargs)
		self.forward = self._model_forward()

	def _model_forward(self):
		"""Construct a function using the current keras backend that, when given a batch
		of inputs, simply processes them forward and returns the output

		This is as opposed to model.compile(), which takes a loss function
		and training method.

		c.f. https://github.com/fchollet/keras/issues/1426
		"""
		model_input = self.model.get_input(train=False)
		model_output = self.model.get_output(train=False)
		forward_function = K.function([model_input], [model_output])

		# the forward_function returns a list of tensors
		# the first [0] gets the front tensor.
		# this tensor, however, has dimensions (1, width, height)
		# and we just want (width,height) hence the second [0]
		return lambda inpt: forward_function(inpt)[0][0]

	def batch_eval_state(self, state_gen, batch=16):
		"""Given a stream of states in state_gen, evaluates them in batches
		to make best use of GPU resources.

		Returns: TBD (stream of results? that would break zip(). 
			streaming pairs of pre-zipped (state, result)?)
		"""
		raise NotImplementedError()

	def eval_state(self, state):
		"""Given a GameState object, returns a list of (action, probability) pairs
		according to the network outputs
		"""
		tensor = self.preprocessor.state_to_tensor(state)

		# run the tensor through the network
		network_output = self.forward([tensor])

		# get network activations at legal move locations
		# note: may not be a proper distribution by ignoring illegal moves
		return [((x,y), network_output[x,y]) for (x,y) in state.get_legal_moves()]

	@staticmethod
	def create_network(**kwargs):
		"""construct a convolutional neural network.

		Keyword Arguments:
		- input_dim:         depth of features to be processed by first layer (no default)
		- board:             width of the go board to be processed (default 19)
		- filters_per_layer: number of filters used on every layer (default 128)
		- layers:            number of convolutional steps (default 12)
		- filter_width_K:    (where K is between 1 and <layers>) width of filter on 
							 layer K (default 3 except 1st layer which defaults to 5).
							 Must be odd.
		"""
		defaults = {
			"board": 19,
			"filters_per_layer": 128,
			"layers": 12,
			"filter_width_1": 5
		}
		# copy defaults, but override with anything in kwargs
		params = defaults
		params.update(kwargs)

		# create the network:
		# a series of zero-paddings followed by convolutions
		# such that the output dimensions are also board x board
		network = Sequential()

		# create first layer
		network.add(convolutional.Convolution2D(
			input_shape=(params["input_dim"], params["board"], params["board"]),
			nb_filter=params["filters_per_layer"],
			nb_row=params["filter_width_1"],
			nb_col=params["filter_width_1"],
			init='uniform',
			activation='relu',
			border_mode='same'))

		# create all other layers
		for i in range(2,params["layers"]+1):
			# use filter_width_K if it is there, otherwise use 3
			filter_key = "filter_width_%d" % i
			filter_width = params.get(filter_key, 3)
			network.add(convolutional.Convolution2D(
				nb_filter=params["filters_per_layer"],
				nb_row=filter_width,
				nb_col=filter_width,
				init='uniform',
				activation='relu',
				border_mode='same'))

		# the last layer maps each <filters_per_layer> feature to a number
		network.add(convolutional.Convolution2D(
			nb_filter=1,
			nb_row=1,
			nb_col=1,
			init='uniform',
			border_mode='same'))
		# reshape output to be board x board
		network.add(Reshape((params["board"],params["board"])))
		# softmax makes it into a probability distribution
		network.add(Activation('softmax'))

		return network

	def load_model(self, json_file):
		"""load the architecture specified in json_file into 'self'
		"""
		raise NotImplementedError()

	def save_model(self, json_file):
		"""write the network model and preprocessing features to the specified file
		"""
		raise NotImplementedError()

	def load_params(self, h5_file):
		"""load model parameters (weights) in the specified file
		"""
		raise NotImplementedError()

	def save_params(self, h5_file):
		"""save model parameters (weights) to the specified file
		"""
		raise NotImplementedError()
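
A minimal usage sketch (illustrative only) for the builder and evaluation pieces above. The class name `CNNPolicy`, the `input_dim` value of 48, Keras 1.x (the `Convolution2D`/`border_mode` API used above), and channels-first (Theano-style) dim ordering are all assumptions, since the imports and class header are not part of this excerpt; `renormalize` is a hypothetical helper showing how the possibly-unnormalized pairs returned by `eval_state` could be rescaled into a proper distribution over legal moves.

import numpy as np

# Hypothetical usage: "CNNPolicy" stands in for the class defined above and
# input_dim=48 is an assumed feature depth -- both are placeholders.
network = CNNPolicy.create_network(input_dim=48, board=19, layers=12)

# one fake channels-first input stack of shape (batch, input_dim, board, board)
dummy = np.random.rand(1, 48, 19, 19).astype('float32')
out = network.predict(dummy)
print(out.shape)  # expected: (1, 19, 19)


def renormalize(move_probs):
    """Rescale (move, probability) pairs, e.g. the output of eval_state,
    so that the probabilities over the legal moves sum to one."""
    total = sum(p for _, p in move_probs)
    return [(move, p / total) for move, p in move_probs]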
Exemplo n.º 33
0
class Model:
    def __init__(self, info, test_timestamp, pred_timestamp):
        self.info = info
        self.primary_timestamp = info['primary_timestamp']
        self.primary_id = info['primary_id']
        self.primary_agg = None
        self.label = info['label']
        self.schema = info['schema']
        self.schema.pop(self.label)
        self.origin_feat = list(self.schema.keys())
        print(f"\ninfo: {self.info}")

        self.dtype_cols = {
            'cat':
            [col for col, types in self.schema.items() if types == 'str'],
            'num':
            [col for col, types in self.schema.items() if types == 'num']
        }

        self.test_timestamp = test_timestamp
        self.pred_timestamp = pred_timestamp

        self.n_test_timestamp = len(pred_timestamp)

        self.split_num = 5
        self.update_interval = int(self.n_test_timestamp / self.split_num)

        self.lgb_model = LGBMRegressor()
        self.linear_model = LinearRegressor()

        self.use_Linear = True
        self.use_sample_weight = False
        self.use_exp_y = True

        self.tmpControlType = 4

        self.time_seg = 0

        self.linear_weight = 0
        self.lgb_weight = 0

        self.n_predict = 0
        self.isfirst_predict = True
        self.last_drop_col = []
        self.history = pd.DataFrame()

        self.new_model_n_predict = 0
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []

        self.train_time_num = 0
        self.preprocess = None
        self.featParamsad = None
        self.feat_engine = None
        self.data = pd.DataFrame()
        self.train_time = 0

    def update_data(self, df):
        self.data = df

    def train(self, train_data, time_info):
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []
        self.new_model_n_predict = 0

        self.data = train_data
        gc.collect()

        self.data['changed_y'] = self.data[self.label].copy()
        self.preprocess = Preprocess()
        self.preprocess.train_preprocess(self)

        if self.n_predict == 0:
            tt, interval, na_num = time_interval(
                self.data[self.primary_timestamp])
            with time_limit("featParamsad"):
                self.featParamsad = FeatParams(copy.deepcopy(self), tt,
                                               interval, na_num)
                self.featParamsad.fit_transform()

        gc.collect()

        self.feat_engine = Feat_engine(self.featParamsad)
        self.feat_engine.same_feat_train(self)
        self.feat_engine.history_feat_train(self)

        if self.use_sample_weight:
            TransExponentialDecay(self.primary_timestamp,
                                  init=1.0,
                                  finish=0.75,
                                  offset=0).fit(train_data)

        gc.collect()

        # keep only columns that contain at least one truthy value
        col = self.data.any()
        col = col[col].index
        self.data = self.data[col]
        gc.collect()

        X = self.data

        categorical_feature = []
        self.last_drop_col.append(self.primary_timestamp)

        if self.n_predict == 0:
            y = self.data.pop(self.label)
            y1 = self.data['changed_y']
            X_train, y_train, X_eval, y_eval = time_train_test_split(
                X, y, self.primary_timestamp, shuffle=False)
            if self.time_seg:
                seg_num = len(X_train) // self.time_seg
                X_train['time_seg'] = [
                    (((i // seg_num) + 1) if
                     ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                    for i in range(len(X_train))
                ]
                X_eval['time_seg'] = self.time_seg

            self.lgb_model.param_opt_new(X_train, y_train, X_eval, y_eval,
                                         categorical_feature, self.primary_id,
                                         self.primary_agg,
                                         self.primary_timestamp)
            X_train.drop(self.last_drop_col, axis=1, inplace=True)

            _, sc1 = self.lgb_model.valid_fit(X_train,
                                              y_train,
                                              X_eval,
                                              y_eval,
                                              categorical_feature,
                                              self.use_sample_weight,
                                              round=100)
            if (y != y1).any():
                y_train = y1[:len(y_train)]
                mod1 = self.lgb_model.model
                self.lgb_model.model = None
                _, sc2 = self.lgb_model.valid_fit(X_train,
                                                  y_train,
                                                  X_eval,
                                                  y_eval,
                                                  categorical_feature,
                                                  self.use_sample_weight,
                                                  round=100)
                if sc2 < sc1:
                    gc.collect()
                    self.use_exp_y = False
                    y = y1
                else:
                    y_train = y[:len(y_train)]
                    self.lgb_model.model = mod1
            lgb_preds, _ = self.lgb_model.valid_fit(X_train, y_train, X_eval,
                                                    y_eval,
                                                    categorical_feature,
                                                    self.use_sample_weight)

            col = X_train.any()
            col = col[col].index
            X_train = X_train[col]
            X_eval = X_eval[col]
            gc.collect()
            linear_preds = self.linear_model.valid_fit(X_train, y_train,
                                                       X_eval, y_eval,
                                                       self.use_sample_weight)
            gc.collect()
            if self.tmpControlType == 1:
                self.linear_weight, self.lgb_weight = 1, 0
            elif self.tmpControlType == 2:
                self.linear_weight, self.lgb_weight = 0, 1
            else:
                self.linear_weight, self.lgb_weight = serch_best_fusion_proportion(
                    linear_preds, lgb_preds, y_eval)
        else:
            if not self.use_exp_y:
                self.data[self.label] = self.data['changed_y'].copy()
            y = self.data.pop(self.label)
            self.data.pop('changed_y')

        X.drop(self.last_drop_col, axis=1, inplace=True)

        if self.time_seg:
            seg_num = len(X) // self.time_seg
            X['time_seg'] = [
                (((i // seg_num) + 1) if
                 ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                for i in range(len(X))
            ]

        with time_limit("linear_fit"):
            self.linear_model.fit(X, y, self.use_sample_weight)

        with time_limit("fit"):
            self.lgb_model.fit(X, y, categorical_feature,
                               self.use_sample_weight)
        next_step = 'predict'
        return next_step

    def after_train(self):
        pass

    def predict(self, new_history, pred_record, time_info):
        if (time_info['predict'] < 5) and not new_history.empty:
            if self.primary_id:
                lab_list = pred_record.join(new_history.set_index(
                    self.primary_id)[self.label],
                                            how='left',
                                            on=self.primary_id)
                lab_list = lab_list[self.label].fillna(
                    new_history[self.label].mean())
            else:
                # no primary_id: repeat the most recent observed label for
                # every row that needs a prediction
                lab_list = pred_record.shape[0] * list(
                    new_history[self.label])[-1:]
            return list(lab_list), 'predict'
        self.data = pred_record

        if not new_history.empty:
            y = new_history[self.label]
            self.history[self.label] = y
            if len(self.linear_predict_list):
                self.new_model_history_label.extend(
                    list(new_history[self.label]))

        if self.tmpControlType == 4:
            # periodically re-estimate the fusion weights from the labels seen
            # so far and blend them 50/50 with the current weights
            if ((self.new_model_n_predict >= 50) and
                ((self.new_model_n_predict % 50)
                 == 0)) or (self.new_model_n_predict == 15):
                linear_weight, lgb_weight = serch_best_fusion_proportion(
                    pd.Series(self.linear_predict_list),
                    pd.Series(self.lgb_predict_list),
                    pd.Series(self.new_model_history_label))
                self.linear_weight = self.linear_weight * 0.5 + linear_weight * 0.5
                self.lgb_weight = self.lgb_weight * 0.5 + lgb_weight * 0.5
            self.new_model_n_predict += 1

        # preprocess
        self.preprocess.test_preprocess(self)

        # feat_engine
        self.feat_engine.same_feat_test(self)
        hh = self.data.copy()
        self.feat_engine.history_feat_test(self)
        # keep the copy taken before history features were added; it becomes
        # the history used on the next predict call
        self.history = hh
        self.n_predict += 1

        self.data.drop(self.last_drop_col, axis=1, inplace=True)

        if self.time_seg:
            self.data['time_seg'] = self.time_seg

        linear_preds = self.linear_model.predict(self.data)
        lgb_preds = self.lgb_model.predict(self.data)
        predictions = self.linear_weight * linear_preds + self.lgb_weight * lgb_preds
        self.lgb_predict_list.extend(list(lgb_preds))
        self.linear_predict_list.extend(list(linear_preds))

        if (self.n_predict % self.update_interval == 0) and (
                self.n_predict < self.split_num * self.update_interval) and (
                    time_info['update'] > self.train_time * 1.25):
            next_step = 'update'
            self.feat_engine = None
            self.preprocess = None
            self.history = pd.DataFrame()
            self.isfirst_predict = True
            self.new_model_history_label = None
            self.lgb_predict_list = None
            self.linear_predict_list = None
            gc.collect()
        else:
            self.isfirst_predict = False
            next_step = 'predict'
        if self.n_predict == self.n_test_timestamp:
            self.feat_engine = None
            self.preprocess = None
            self.history = pd.DataFrame()
            self.isfirst_predict = True
            self.new_model_history_label = None
            self.lgb_predict_list = None
            self.linear_predict_list = None
            gc.collect()
        return list(predictions), next_step

    def update(self, train_data, test_history_data, time_info):
        t1 = time.time()
        print(f"\nUpdate time budget: {time_info['update']}s")

        total_data = pd.concat([train_data, test_history_data])

        total_data.drop_duplicates(subset=[self.primary_timestamp] +
                                   self.primary_id,
                                   inplace=True)
        total_data.reset_index(drop=True, inplace=True)
        self.train(total_data, time_info)

        print("Finish update\n")
        self.train_time = time.time() - t1
        next_step = 'predict'
        return next_step

    def save(self, model_dir, time_info):
        print(f"\nSave time budget: {time_info['save']}s")
        self.data = pd.DataFrame()
        gc.collect()
        pkl_list = []

        # pickle every remaining attribute to its own file, skipping dunders
        # and the harness-facing methods
        for attr in dir(self):
            if attr.startswith('__') or attr in [
                    'train', 'predict', 'update', 'save', 'load'
            ]:
                continue

            pkl_list.append(attr)
            pickle.dump(getattr(self, attr),
                        open(os.path.join(model_dir, f'{attr}.pkl'), 'wb'))

        pickle.dump(pkl_list,
                    open(os.path.join(model_dir, 'pkl_list.pkl'), 'wb'))

        print("Finish save\n")

    def load(self, model_dir, time_info):
        print(f"\nLoad time budget: {time_info['load']}s")

        pkl_list = pickle.load(
            open(os.path.join(model_dir, 'pkl_list.pkl'), 'rb'))

        for attr in pkl_list:
            setattr(
                self, attr,
                pickle.load(open(os.path.join(model_dir, f'{attr}.pkl'),
                                 'rb')))

        print("Finish load\n")
Exemplo n.º 34
0
import cv2

from preprocessing import Preprocess
# Camera and Testplan are project-specific classes; their modules are not part
# of this excerpt and are assumed to be imported alongside Preprocess.

if __name__ == '__main__':

    #os.system('sudo raspivid -br 80')
    cam = Camera(1280, 1080, dispositivo=1, camera_type='WEBCAM')
    cam.set_focus(25)
    cam.set_exposure(100)
    cam.set_exposure_auto(3)

    # Initialize the Testplan
    testplan = Testplan(produto='solo', posto=1)
    imReference = testplan.get_imgRef()

    # Initialize the preprocessing model
    preprocess = Preprocess(produto='solo', posto=1)

    while True:

        ret, frame1 = cam.camera_read()
        frame1 = cv2.resize(frame1, (640, 480), interpolation=cv2.INTER_CUBIC)

        preprocess.executa_preprocessamento(imgFrame=frame1,
                                            imgRef=imReference)
        #preprocess.segmentation(frame1)
        imReg, frame2, Result = preprocess.custom_processing(
            imReference, frame1)

        if Result:

            testplan.executa_teste(imReg)
Exemplo n.º 35
0
    def train(self, train_data, time_info):
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []
        self.new_model_n_predict = 0

        self.data = train_data
        gc.collect()

        self.data['changed_y'] = self.data[self.label].copy()
        self.preprocess = Preprocess()
        self.preprocess.train_preprocess(self)

        if self.n_predict == 0:
            tt, interval, na_num = time_interval(
                self.data[self.primary_timestamp])
            with time_limit("featParamsad"):
                self.featParamsad = FeatParams(copy.deepcopy(self), tt,
                                               interval, na_num)
                self.featParamsad.fit_transform()

        gc.collect()

        self.feat_engine = Feat_engine(self.featParamsad)
        self.feat_engine.same_feat_train(self)
        self.feat_engine.history_feat_train(self)

        if self.use_sample_weight:
            TransExponentialDecay(self.primary_timestamp,
                                  init=1.0,
                                  finish=0.75,
                                  offset=0).fit(train_data)

        gc.collect()

        col = self.data.any()
        col = col[col].index
        self.data = self.data[col]
        gc.collect()

        X = self.data

        categorical_feature = []
        self.last_drop_col.append(self.primary_timestamp)

        if self.n_predict == 0:
            y = self.data.pop(self.label)
            y1 = self.data['changed_y']
            X_train, y_train, X_eval, y_eval = time_train_test_split(
                X, y, self.primary_timestamp, shuffle=False)
            if self.time_seg:
                seg_num = len(X_train) // self.time_seg
                X_train['time_seg'] = [
                    (((i // seg_num) + 1) if
                     ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                    for i in range(len(X_train))
                ]
                X_eval['time_seg'] = self.time_seg

            self.lgb_model.param_opt_new(X_train, y_train, X_eval, y_eval,
                                         categorical_feature, self.primary_id,
                                         self.primary_agg,
                                         self.primary_timestamp)
            X_train.drop(self.last_drop_col, axis=1, inplace=True)

            _, sc1 = self.lgb_model.valid_fit(X_train,
                                              y_train,
                                              X_eval,
                                              y_eval,
                                              categorical_feature,
                                              self.use_sample_weight,
                                              round=100)
            if (y != y1).any():
                y_train = y1[:len(y_train)]
                mod1 = self.lgb_model.model
                self.lgb_model.model = None
                _, sc2 = self.lgb_model.valid_fit(X_train,
                                                  y_train,
                                                  X_eval,
                                                  y_eval,
                                                  categorical_feature,
                                                  self.use_sample_weight,
                                                  round=100)
                if sc2 < sc1:
                    gc.collect()
                    self.use_exp_y = False
                    y = y1
                else:
                    y_train = y[:len(y_train)]
                    self.lgb_model.model = mod1
            lgb_preds, _ = self.lgb_model.valid_fit(X_train, y_train, X_eval,
                                                    y_eval,
                                                    categorical_feature,
                                                    self.use_sample_weight)

            col = X_train.any()
            col = col[col].index
            X_train = X_train[col]
            X_eval = X_eval[col]
            gc.collect()
            linear_preds = self.linear_model.valid_fit(X_train, y_train,
                                                       X_eval, y_eval,
                                                       self.use_sample_weight)
            gc.collect()
            if self.tmpControlType == 1:
                self.linear_weight, self.lgb_weight = 1, 0
            elif self.tmpControlType == 2:
                self.linear_weight, self.lgb_weight = 0, 1
            else:
                self.linear_weight, self.lgb_weight = serch_best_fusion_proportion(
                    linear_preds, lgb_preds, y_eval)
        else:
            if not self.use_exp_y:
                self.data[self.label] = self.data['changed_y'].copy()
            y = self.data.pop(self.label)
            self.data.pop('changed_y')

        X.drop(self.last_drop_col, axis=1, inplace=True)

        if self.time_seg:
            seg_num = len(X) // self.time_seg
            X['time_seg'] = [
                (((i // seg_num) + 1) if
                 ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                for i in range(len(X))
            ]

        with time_limit("linear_fit"):
            self.linear_model.fit(X, y, self.use_sample_weight)

        with time_limit("fit"):
            self.lgb_model.fit(X, y, categorical_feature,
                               self.use_sample_weight)
        next_step = 'predict'
        return next_step
Exemplo n.º 36
0
import pickle
import math
import bisect

import unidecode

import preprocessing  # local module providing replace_dates() and lemma_stop()

with open('collection.pickle','rb') as f:
    collection = pickle.load(f)
with open('max_tf.pickle','rb') as f:
    max_tf = pickle.load(f)
with open('documentRoot.pickle','rb') as f:
    documentRoot = pickle.load(f)
with open('objs.pickle','rb') as f:
    documentLength, subset, get_index, getReference = pickle.load(f)

while True:
    query = input("Enter a query: ")
    final_query = preprocessing.replace_dates(query)
    final_query = preprocessing.lemma_stop(final_query)
    final_query = [unidecode.unidecode(w).lower() for w in final_query]
    
    print(final_query)
    
    tf_query = {}
    for w in final_query:
        if w not in tf_query:
            tf_query[w] = 1
        else:
            tf_query[w] += 1
            
    scores = {}