Example #1
 def __str__(self):
     import encoding # import late to avoid circular imports
     code = encoding.encoding(self)
     types = code["types"]
     args = {a.name: pretty_print(a, types.get(a.name, ""))
             for a in self.args}
     return code["format"].format(**args)
Example #2
class ScoringService(object):
    env = {
        'GRAPH_BUCKET': kg_path,
        'KG_DBPEDIA_KEY': dbpedia_key,
        'KG_ENTITY_KEY': entity_key,
        'KG_RELATION_KEY': relation_key,
        'KG_ENTITY_INDUSTRY_KEY': entity_industry_key,
        'KG_VOCAB_KEY': vocab_key,
        'DATA_INPUT_KEY': data_input_key,
        'TRAIN_OUTPUT_KEY': train_output_key
    }

    graph = kg.Kg(env)  # Where we keep the model when it's loaded
    model = encoding.encoding(graph, env)

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model is None:
            # import kg
            # import encoding
            cls.model = model
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""
        clf = cls.get_model()
        return clf[input]
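
A minimal way to exercise this service (a sketch only: the DataFrame contents below are hypothetical, since the snippet does not show the expected input schema):

import pandas as pd

# hypothetical feature rows; real column names depend on the project's data
df = pd.DataFrame({"entity": ["acme"], "relation": ["industry"]})
predictions = ScoringService.predict(df)  # one prediction per row, per the docstring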
Example #3
 def _encoding(self):
     """helper method to lookup the encoding in the font"""
     c = cursor(self.data1, "/Encoding")
     token1 = c.gettoken()
     token2 = c.gettoken()
     if token1 == "StandardEncoding" and token2 == "def":
         self.encoding = encoding.adobestandardencoding
     else:
         encvector = [None] * 256
         while True:
             self.encodingstart = c.pos
             if c.gettoken() == "dup":
                 break
         while True:
             i = c.getint()
             glyph = c.gettoken()
             if 0 <= i < 256:
                 encvector[i] = glyph[1:]
             token = c.gettoken()
             assert token == "put"
             self.encodingend = c.pos
             token = c.gettoken()
             if token == "readonly" or token == "def":
                 break
             assert token == "dup"
         self.encoding = encoding.encoding(encvector)
Example #4
class ScoringService(object):
    import kg
    import encoding
    graph = kg.Kg(
        kg_folder=kg_path)  # Where we keep the model when it's loaded
    model = encoding.encoding(graph)

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model is None:
            # import kg
            # import encoding
            cls.model = model
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""
        clf = cls.get_model()
        return clf[input]
Example #5
def _read_txt(root=None):
    path = TXT_FILE
    result = {}
    for txt in res.get_texts(path, locale=True, root=root):
        lines = txt.split("\n")
        encoding_name = encoding.encoding(txt)
        for line in lines:
            try:
                line = line.strip()
                if line:
                    k, v = line.split(None, 1)
                    if v:
                        try:
                            v = unicode(v, encoding_name)
                        except ValueError:
                            v = unicode(v, encoding_name, "replace")
                            warning("in '%s', encoding error: %s", path, line)
                        result[k] = v
                    else:
                        warning("in '%s', line ignored: %s", path, line)
            except Exception:  # e.g. unpacking fails: line has no value part
                warning("in '%s', syntax error: %s", path, line)
    result["9998"] = u","
    result["9999"] = u"."
    return result
Example #6
    def goClick(self, key, sentence):
        for k in key:
            if k in '~!@#$%^&*()_+`1234567890-=<>?,./':
                # message: "Error" / "Special characters are not allowed."
                tkinter.messagebox.showinfo("오류", "특수문자는 입력 불가입니다.")
                self.root.destroy()
                encodeGUI()
                return  # stop here; otherwise encoding would still run below

        for s in sentence:
            if s in '~!@#$%^&*()_+`1234567890-=<>?,./':
                tkinter.messagebox.showinfo("오류", "특수문자는 입력 불가입니다.")
                self.root.destroy()
                encodeGUI()
                return
        from GUI.result import ResultGUI
        self.root.destroy()
        encoding.encoding(key, sentence)
        ResultGUI()
Example #7
def encode(block_size,
           imagePath="test.jpg",
           encodedFile="encoded",
           probabilityFile="probability.npy",
           float_type='float64'):
    img = cv2.imread(imagePath, cv2.IMREAD_GRAYSCALE).flatten()
    img = numpy.append(img, [0] * (-len(img) % block_size))  # pad to a multiple of block_size
    codes = numpy.array([])
    probability = collections.Counter(img)
    print("Encoding Started")
    # per-symbol frequencies for all 256 gray levels (0 for absent levels)
    prob = [probability[i] for i in range(256)]

    # fall back to double precision for unrecognized dtypes
    if float_type not in ('float16', 'float32', 'float64'):
        float_type = 'float64'

    prob = numpy.asarray(prob)
    prob = numpy.true_divide(prob, len(img))

    for i in range(0, len(img), block_size):
        x = e.encoding(prob, img[i:(block_size + i)])
        if not (0 <= x <= 1):
            print("encoding error")
        codes = numpy.append(codes, [x])
    numpy.save('original.npy', img)
    print("Encoding Done:")
    numpy.save(encodedFile, codes.astype(float_type))  # save
    print(" ->" + encodedFile + " is created")
    numpy.save(probabilityFile, prob)  # save
    print(" ->" + probabilityFile + " is created\n")

    return prob
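
To run this encoder end to end (a sketch; it assumes cv2 can read test.jpg and that e is the project's own coding module, as imported in the source file):

prob = encode(block_size=1024, imagePath="test.jpg", float_type='float32')
# side effects: original.npy, encoded.npy and probability.npy are written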
Example #8
File: asm.py Project: turbana/cpu
def label_apply(labels, tok, pos, signed=True, pc_relative=False):
    if tok.name == "jmp":
        pc_relative = True
    enc = encoding.encoding(tok)
    for j, arg in enumerate(tok.args):
        if isinstance(arg, tokens.Expression):
            label_apply(labels, arg, pos, False, pc_relative)
        elif isinstance(arg, tokens.Label):
            bits = 64
            signed = True
            # If we found an encoding then we're materializing a label within
            # an instruction and we want to ensure the proper size/sign.
            # Otherwise, we're inside an expression, so use a bit value that
            # won't truncate as expressions do their own bit checking.
            if enc:
                syntax = [x[1] for x in enc["ast"] if len(x) == 2][j]
                signed = syntax.startswith("s")
                bits = int(syntax[1:])
            tok.args[j] = label_find(labels, arg, pos, bits, signed,
                                     pc_relative)
Example #9
    def run(self):
        # initializing the vocabularies
        trainData = dataSet(self.training_file, 'train')
        # print(trainData.getIntentLabels())
        testData = dataSet(self.test_file, 'test', trainData.getWordVocab(),
                           trainData.getTagVocab(), trainData.getIntentVocab(),
                           trainData.getIndex2Word(), trainData.getIndex2Tag(),
                           trainData.getIntentLabels())

        intent_target_file = self.result_path + '/' + 'intent.list'
        with open(intent_target_file, 'w') as f:
            for intent in trainData.getIntentLabels():
                f.write(f"{intent}\n")

        tag_target_file = self.result_path + '/' + 'tag.list'
        with open(tag_target_file, 'w') as f:
            for tag in trainData.getIndex2Tag():
                f.write(f"{tag}\n")

        # preprocessing by padding 0 until maxlen
        X_train = sequence.pad_sequences(trainData.dataset['utterances'],
                                         maxlen=self.time_length,
                                         dtype='int32',
                                         padding='pre')
        X_test = sequence.pad_sequences(testData.dataset['utterances'],
                                        maxlen=self.time_length,
                                        dtype='int32',
                                        padding='pre')

        y_intent_train = trainData.dataset['intents']
        pad_y_tags_train = sequence.pad_sequences(trainData.dataset['tags'],
                                                  maxlen=self.time_length,
                                                  dtype='int32',
                                                  padding='pre')

        y_intent_test = testData.dataset['intents']
        pad_y_tags_test = sequence.pad_sequences(testData.dataset['tags'],
                                                 maxlen=self.time_length,
                                                 dtype='int32',
                                                 padding='pre')

        num_sample_train, max_len = np.shape(X_train)
        num_sample_test, _ = np.shape(X_test)

        if not self.nodev:
            validData = dataSet(self.validation_file, 'val',
                                trainData.getWordVocab(),
                                trainData.getTagVocab(),
                                trainData.getIntentVocab(),
                                trainData.getIndex2Word(),
                                trainData.getIndex2Tag(),
                                trainData.getIntentLabels())
            X_dev = sequence.pad_sequences(validData.dataset['utterances'],
                                           maxlen=self.time_length,
                                           dtype='int32',
                                           padding='pre')
            y_intent_dev = validData.dataset['intents']
            pad_y_tag_dev = sequence.pad_sequences(validData.dataset['tags'],
                                                   maxlen=self.time_length,
                                                   dtype='int32',
                                                   padding='pre')
            num_sample_dev, _ = np.shape(X_dev)

        # encoding input vectors
        self.input_vocab_size = trainData.getWordVocabSize()
        self.output_intent_size = trainData.getIntentVocabSize()
        self.output_vocab_size = trainData.getTagVocabSize()

        print('Building model architecture!!!!')
        self.build()
        print(self.model.summary())

        # data generation
        sys.stderr.write("Vectorizing the input.\n")
        y_intent_train = to_categorical(y_intent_train,
                                        num_classes=self.output_intent_size)
        y_tags_train = encoding(pad_y_tags_train, '1hot', self.time_length,
                                self.output_vocab_size)

        if not self.nodev:
            y_intent_dev = to_categorical(y_intent_dev,
                                          num_classes=self.output_intent_size)
            y_tags_dev = encoding(pad_y_tag_dev, '1hot', self.time_length,
                                  self.output_vocab_size)

        # encode history for memory network
        H_train = sequence.pad_sequences(history_build(trainData, X_train),
                                         maxlen=(self.time_length *
                                                 self.his_length),
                                         dtype='int32',
                                         padding='pre')
        H_test = sequence.pad_sequences(history_build(testData, X_test),
                                        maxlen=(self.time_length *
                                                self.his_length),
                                        dtype='int32',
                                        padding='pre')
        if not self.nodev:
            H_dev = sequence.pad_sequences(history_build(validData, X_dev),
                                           maxlen=(self.time_length *
                                                   self.his_length),
                                           dtype='int32',
                                           padding='pre')

        if self.record_epoch != -1 and self.load_weight is None:
            total_epochs = self.max_epochs
            self.max_epochs = self.record_epoch
            for i in range(1, total_epochs // self.record_epoch + 1):
                num_iter = i * self.record_epoch
                self.train(H_train=H_train,
                           X_train=X_train,
                           y_train=[y_intent_train, y_tags_train],
                           H_dev=H_dev,
                           X_dev=X_dev,
                           y_dev=[y_intent_dev, y_tags_dev])
                if not self.nodev:
                    self.test(H=H_dev,
                              X=X_dev,
                              data_type='dev.' + str(num_iter),
                              tagDict=trainData.dataset['id2tag'],
                              pad_data=pad_X_dev)
                self.test(H=H_test,
                          X=X_test,
                          data_type='test.' + str(num_iter),
                          tagDict=trainData.dataset['id2tag'],
                          pad_data=pad_X_test)
                # save weights for the current model
                whole_path = self.mdl_path + '/' + self.model_arch + '.' + str(
                    num_iter) + '.h5'
                sys.stderr.write("Writing model weight to %s...\n" %
                                 whole_path)
                self.model.save_weights(whole_path, overwrite=True)
        else:
            self.train(H_train=H_train,
                       X_train=X_train,
                       y_train=[y_intent_train, y_tags_train],
                       H_dev=H_dev,
                       X_dev=X_dev,
                       y_dev=[y_intent_dev, y_tags_dev])

            # if not self.nodev:
            #     self.test(H=H_dev, X=X_dev, data_type='dev', tagDict=trainData.dataSet['id2tag'], pad_data=pad_X_dev)

            # self.test(H=H_test, X=X_test, data_type='test', tagDict=trainData.dataSet['id2tag'], pad_data=pad_X_test)

            with open('model.json', 'w') as f:
                f.write(self.model.to_json())

            if self.load_weight is None:
                whole_path = self.mdl_path + '/' + self.model_arch + '.final-' + str(
                    self.max_epochs) + '.h5'
                sys.stderr.write("Writing model weight to %s...\n" %
                                 whole_path)
                self.model.save_weights(whole_path, overwrite=True)
Example #10
                continue
            writer.writerow([
                round(float(line[index] * 100),
                      3), route, cmd[i][dico_pivot['acheteur.dateCreation']],
                cmd[i][dico_pivot['id']], cmd[i][dico_pivot['typeParcours']],
                cmd[i][dico_pivot['modeLivraison.type']],
                cmd[i][dico_pivot['caracteristiquesLigne.techno']],
                cmd[i][dico_pivot['acte']],
                cmd[i][dico_pivot['historiques.historiqueStatut.valeur']]
            ])


if __name__ == '__main__':

    TRAIN, PREDICT, EPOCH, BATCH, RESTORE, PATH = gestion_arg()
    encode = encoding(PATH)
    by_model = ByModel()
    save_data = encode.recover_data(encode)
    pivot = save_data[0]
    dico_pivot = {}
    for i, d in enumerate(pivot):
        for a in LISTPIVOT:
            if a == d:
                dico_pivot[a] = i
    result = encode.encoding_list_ascii(encode)
    del result[0]
    if PREDICT:
        by_model.load_weights(RESTORE)
        result = scaler.fit_transform(result)
        try:
            prediction = by_model(result)
Example #11
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from missing_data import imputate_missing_data, drop_col_with_missing
from encoding import encoding
from model import xgboost_model, forest_model

train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
y = train_data.SalePrice
X = train_data.drop(['SalePrice'], axis=1)
encoded_train, encoded_test = encoding(X, test_data)

# handle missing data
final_data = []
final_data.append(
    imputate_missing_data(encoded_train.copy(), encoded_test.copy()))
final_data.append(
    drop_col_with_missing(encoded_train.copy(), encoded_test.copy()))

# split the data set
for data in final_data:
    (data['train_X'], data['test_X'], data['train_y'],
     data['test_y']) = train_test_split(data['final_train'],
                                        y,
                                        random_state=43)

# train the models
for data in final_data:
    model = xgboost_model(data['train_X'], data['train_y'],
                          data['test_X'], data['test_y'])
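
mean_absolute_error is imported above but never used; one plausible completion (a sketch, assuming xgboost_model returns a fitted estimator with a predict method, which the snippet does not show) is to score each variant on its held-out split:

for data in final_data:
    model = xgboost_model(data['train_X'], data['train_y'],
                          data['test_X'], data['test_y'])
    predictions = model.predict(data['test_X'])
    print('MAE:', mean_absolute_error(data['test_y'], predictions))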
Example #12
    "batch_size": 1000,
    "target": "raw",  # "raw" or "edges"
    "total_epochs": 50,
    "type_n2v": "n2v",  # "n2v" or "nn2v"
    "n_hidden": [128, 32, 8, 2],
    "data_type": "npy"  # "npy" or "pickle"
}
input_dir = configs["basedir"]
for i in [
        "encoded_vector", configs["type_n2v"],
        str(configs["h_size"]), configs["data_type"]
]:
    input_dir += i + "/"
    if not os.path.isdir(input_dir):
        os.mkdir(input_dir)

output_dir = "../output/"
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
for i in [configs["type_n2v"], str(configs["h_size"]), configs["data_type"]]:
    output_dir += i + "/"
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

preprocessing(configs)
encoding(configs)
if configs["data_type"] == "npy":
    pickle2npy(configs)
training(configs)
testing(configs)