예제 #1
0
def load_utterance_dataset(utterance_path="../data/utterances.txt",
                           model_path="../data/G.bin",
                           train_size=1000,
                           valid_end=1110,
                           embedding_dim=300):
    """Load the utterance threads, embed each sentence and split the data.

    The threads are read with ``gen_structured_data_from_utterances``; every
    record's token list (index 1) is turned into a
    ``(len(tokens), embedding_dim)`` matrix of word vectors that is stored
    back into the record at index 2. ``gen_structured_xy`` then produces the
    inputs/targets, which are sliced into train / validation / test parts.

    Args:
        utterance_path: file handed to ``gen_structured_data_from_utterances``.
        model_path: file handed to ``Vectorize.train_google_model``.
        train_size: number of leading samples used for training.
        valid_end: end index (exclusive) of the validation slice, which
            starts at ``train_size``; everything after it is the test set.
        embedding_dim: width of a word vector. Must match the loaded model
            (presumably 300 for the bundled model -- confirm).

    Returns:
        ``(train_x, train_y, valid_x, valid_y, test_x, test_y)``.
    """
    thread_data = gen_structured_data_from_utterances(utterance_path)
    print("load_utterance_dataset")
    # The flattened sentences are not used below; the call is kept in case
    # the helper has side effects -- TODO confirm and drop if it is pure.
    train_sen_flatten(thread_data)

    # Load the pretrained word-vector model.
    vectorize = Vectorize()
    logging.info("training the sentence model")
    vectorize.train_google_model(model_path)
    logging.info("the google model Done!")

    # Attach a word-vector matrix to every record of every thread.
    for sen_dict in thread_data:
        records = list(sen_dict.values())

        # Diagnostic output only: longest sentence of this thread
        # (0 for an empty thread instead of crashing in max()).
        lengths = [len(record[1]) for record in records]
        print(max(lengths) if lengths else 0)

        for record in records:
            tokens = record[1]
            sen_vec = np.zeros((len(tokens), embedding_dim))
            for i, token in enumerate(tokens):
                try:
                    sen_vec[i] = vectorize.google_model[token]
                except KeyError:
                    # Out-of-vocabulary token: its row stays all zeros.
                    # Assumes the model raises KeyError for unknown words
                    # (gensim-style lookup) -- confirm against Vectorize.
                    continue
            record[2] = sen_vec

    # Build the samples and cut them into the three splits.
    data_x, data_y = gen_structured_xy(thread_data)
    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    valid_x = data_x[train_size:valid_end]
    valid_y = data_y[train_size:valid_end]
    test_x = data_x[valid_end:]
    test_y = data_y[valid_end:]
    return train_x, train_y, valid_x, valid_y, test_x, test_y
예제 #2
0
def get_embedding_matrix_and_vectorizer(conversations):
    """Fit a vectorizer on *conversations* and build its embedding matrix.

    The fitted tokenizer is saved to ``TOKENIZER_PATH`` as a side effect.

    Returns:
        ``(embedding_matrix, vectorizer)``.
    """
    from vectorize import Vectorize

    fitted = Vectorize(conversations, MAX_VOCAB_SIZE)
    vocab_index = fitted.word2idx
    fitted.save_tokenizer(TOKENIZER_PATH)

    from embedding import Embedding

    matrix = Embedding(vocab_index).get_embedding_matrix()
    return matrix, fitted
예제 #3
0
def create_atari_env(env_id):
    """Create the gym environment *env_id* and apply the wrapper stack.

    Wrappers are applied innermost-first: ``Vectorize``,
    ``AtariRescale42x42``, ``DiagnosticsInfo``, ``Unvectorize``.
    """
    wrapped = gym.make(env_id)
    for wrapper in (Vectorize, AtariRescale42x42, DiagnosticsInfo,
                    Unvectorize):
        wrapped = wrapper(wrapped)
    return wrapped
예제 #4
0
def check_embedding_quality(conversations):
    """Return the out-of-vocabulary words reported by the embedding.

    A vectorizer is fitted on *conversations*; its word counts are compared
    against the embedding vocabulary via ``Embedding.check_coverage``.
    """
    from vectorize import Vectorize

    tokenizer = Vectorize(conversations, MAX_VOCAB_SIZE)
    vocab_index = tokenizer.word2idx

    from embedding import Embedding

    embedding = Embedding(vocab_index)
    missing = embedding.check_coverage(tokenizer.word_counts,
                                       embedding.get_embedding_vocab())
    print('Collected oov words.')
    return missing
    def __init__(self, data_infile, fasttext_model_path):
        """Load the name clusters and embed every name.

        Args:
            data_infile: path to a JSON file mapping each label to a list
                of strings (names) belonging to it.
            fasttext_model_path: passed unchanged to ``Vectorize``.
        """
        self.loss_weights = {'siamese': 1, 'grounding': 1}

        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding (which breaks non-ASCII names on some systems).
        with open(data_infile, 'r', encoding='utf-8') as f:
            self.clusters = json.load(f)

        # Reverse index: name -> label. A name listed under several labels
        # keeps the last one, as before.
        self.lookup = {
            string: label
            for label, strings in self.clusters.items()
            for string in strings
        }

        # embed all names
        all_names = set(self.lookup)
        self.vectorize = Vectorize(fasttext_model_path)
        self.pretrained_name_embeddings = self.vectorize.create_reach_object(
            all_names)

        self.amount_negative_names = 1
        self.triplet_margin = 0.1
        self.anchor_margin = 0

        # NOTE: this switches on autograd anomaly detection globally, not
        # just for this object.
        torch.autograd.set_detect_anomaly(True)
예제 #6
0
def Learn():
    """Fit an ``SGDClassifier(loss='log')`` on ``myds.csv`` and dump it.

    The ``Sentence`` column is vectorised with ``Vectorize`` and the
    ``Category`` column is label-encoded. The fitted classifier and the two
    extra objects returned by ``Vectorize`` are written to
    ``model.joblib``, ``vectorw.joblib`` and ``vectorc.joblib``.
    """
    classifier = SGDClassifier(loss='log', n_jobs=-1)
    dataset = read_csv('myds.csv', delimiter=';')
    print(
        "Датасет загружен в память\nПриводим загруженные данные к необходимому виду..."
    )

    features, vectorw, vectorc = Vectorize(dataset.Sentence)
    labels = LabelEncoder().fit_transform(dataset.Category)
    print("Начинаем процесс обучения...")

    # Only the training half of the split is used in this function.
    train_features, _, train_labels, _ = train(features, labels)
    classifier.fit(train_features, train_labels)
    print("Обучение закончено")

    artifacts = (
        (classifier, 'model.joblib'),
        (vectorw, 'vectorw.joblib'),
        (vectorc, 'vectorc.joblib'),
    )
    for obj, target in artifacts:
        dump(obj, target)
    print("Дамп модели сохранен в файл 'model.joblib'")
예제 #7
0
    def format_vars(self):
        """Read the input widgets and build ``self.model``.

        The warehouse and region selections are each converted to a list of
        codes: a numeric selection becomes ``[int(selection)]`` and ``'All'``
        becomes every distinct value of the matching data-frame column.

        Raises:
            ValueError: if a selection is neither numeric nor ``'All'``.
        """
        def to_scope(selection, column):
            # Explicit check instead of ``assert`` so that validation still
            # happens when Python runs with -O.
            try:
                return [int(selection)]
            except ValueError:
                if selection != 'All':
                    raise ValueError('Unexpected {} selection: {!r}'.format(
                        column, selection))
                return self.df[column].unique()

        field_var = [x.get() for x in self.field_var]

        segment_var = self.segment_var.get()
        wh_var = to_scope(self.wh_var.get(), 'legacy_division_cd')
        region_var = to_scope(self.region_var.get(), 'legacy_system_cd')
        level_var = self.level_var.get()
        obj_var = self.objective.get()
        natl_acct_var = self.natl_acct.get()

        params = [
            obj_var, segment_var, field_var, natl_acct_var, self.field_options,
            self.cutoff, self.weights, self.df, self.fname
        ]

        if level_var == 'warehouse':
            self.model = Vectorize(level_var, wh_var, *params)

        elif level_var == 'region':
            self.model = Vectorize(level_var, region_var, *params)

        else:
            # Any other level (e.g. 'enterprise') is run as all regions.
            self.model = Vectorize('region', 'All', *params)
예제 #8
0
class GUI:
    """Tkinter front end around ``Vectorize``.

    Flow: home window -> file dialog -> loading window (reads the file and
    prepares the option lists) -> input page -> second loading window (runs
    the model) -> output page with export / rerun buttons.

    NOTE: every page starts its own ``mainloop()``, so event loops are
    nested; that design is kept as it was.
    """

    def __init__(self):
        """Create the root window and show the home page (blocks)."""
        self.root = Tk()
        # Must happen before home(): home() blocks inside mainloop() until
        # the root is destroyed, after which geometry() would raise.
        self.root.geometry("500x200")
        self.home()

    def home(self):
        """Build the UPLOAD / QUIT page and enter the main loop."""
        self.root.title("Click UPLOAD to begin")
        self.root.config(bg="white")

        frame = Frame(self.root)

        Button(frame, text="UPLOAD", width=8,
               command=self.get_filename).grid(column=1, row=3, sticky=W)
        Button(frame, text="QUIT", width=8,
               command=self.root.destroy).grid(column=2, row=3, sticky=W)

        frame.grid(row=3, column=2)

        self.root.mainloop()

    def get_filename(self):
        """Ask for an input file; continue to loading, or go back home."""
        try:
            self.root.withdraw()
            self.loading.withdraw()
        except (AttributeError, TclError):
            # No loading window yet (first upload) or it is already gone.
            pass

        from tkinter.filedialog import askopenfilename
        self.fname = askopenfilename()

        if not self.fname:
            # Dialog cancelled ('' or an empty tuple depending on platform):
            # bring the hidden home window back instead of rebuilding it
            # and nesting another main loop.
            self.root.deiconify()
        else:
            self.loading1()

    def loading1(self):
        """Show a 'Loading...' window and schedule the file read."""
        loading = Toplevel()
        self.loading = loading
        loading.title("Loading...")
        loading.config(bg="white")
        frame = Frame(loading)

        Label(frame, text='Loading...').grid(row=2,
                                             column=2,
                                             pady=100,
                                             padx=100)
        frame.grid(row=4, column=4)

        self.loading.after(200, self.check_read)
        loading.mainloop()

    def check_read(self):
        """Read the file and prepare options; re-prompt on any failure.

        ``read_file`` and ``set_vars`` return a non-empty message on failure
        and ``None`` on success, so truthiness (not ``is True``) is tested.
        """
        if self.read_file() or self.set_vars():
            self.get_filename()
        else:
            self.input_page()

    def read_file(self):
        """Load ``self.fname`` into ``self.df``.

        Column names are lower-cased with spaces replaced by underscores,
        rows are limited to the 'Warehouse' sales channel when that column
        exists, and only the required columns are kept.

        Returns:
            ``None`` on success, otherwise a message describing the problem.
        """
        import os

        # Extension without the dot; robust to dots elsewhere in the path
        # and to files that have no extension at all.
        self.ext = os.path.splitext(self.fname)[1].lstrip('.').lower()

        if self.ext == 'xlsx':
            try:
                frame = pd.read_excel(self.fname, sheet_name='Sheet1')
            except Exception:
                # 'Sheet1' could not be read: fall back to the default sheet.
                frame = pd.read_excel(self.fname)
        elif self.ext == 'csv':
            frame = pd.read_csv(self.fname)
        else:
            return 'File extension not valid. Select another file.'

        frame.columns = [str(c).replace(' ', '_').lower()
                         for c in frame.columns]

        if 'sales_channel' in frame.columns:
            frame = frame[frame['sales_channel'] == 'Warehouse']

        keep = [
            'legacy_division_cd',
            'legacy_system_cd',
            'segment',
            'legacy_product_cd',
            'sales_channel',
            'sales_6_mos',
            'cogs_6mos',
            'qty_6mos',
            'picks_6mos',
            'net_oh',
            'legacy_customer_cd',
            'core_item_flag',
            'margin_%',
            'net_oh_$',
            'dioh',
            'national_acct_flag',
            'item_poi_days',
        ]

        missing = [c for c in keep if c not in frame.columns]
        if missing:
            # Report instead of raising KeyError inside a Tk callback.
            return 'Missing required column(s): ' + ', '.join(missing)

        # Placeholder values in the export are treated as zero.
        self.df = frame[keep].fillna(0).replace('-', 0).replace(
            'No Venloc', 0)
        return None

    def set_vars(self):
        """Derive option lists from ``self.df`` and create the Tk variables.

        Returns:
            ``None`` on success, otherwise a message describing what is
            missing (the caller then asks for another file).
        """
        options = [
            'sales_6_mos',
            'cogs_6mos',
            'qty_6mos',
            'picks_6mos',
            'net_oh_$_6mos',
            'margin_%'
        ]

        try:
            columns = self.df.columns
        except AttributeError:
            # No data frame was loaded; let the caller re-prompt rather than
            # carrying on and failing on self.df below.
            return 'no data loaded'

        self.field_options = ['turn_6mos', 'profit_6mos', 'customers_per_product'] + \
                             [x for x in columns if x in options]

        # unique() never returns None, so emptiness is what has to be tested.
        wh_options = self.df.legacy_division_cd.unique().astype(str)
        if len(wh_options) == 0:
            return 'no warehouses'

        self.segment_options = self.df.segment.unique()
        if len(self.segment_options) == 0:
            return 'no segments'

        self.region_options = self.df['legacy_system_cd'].unique()
        if len(self.region_options) == 0:
            return 'no regions'

        self.level_options = ['warehouse', 'region', 'enterprise']
        self.region_options = np.append(['All'], self.region_options)
        self.wh_options = np.append(['All'], wh_options)

        self.wh_var = StringVar(self.root, value='All')
        self.segment_var = StringVar(self.root, value=self.segment_options[0])
        self.cutoff_var = StringVar(self.root, value='20')
        self.field_var = []
        # The first three fields start selected with equal weights.
        self.weight_var = [
            StringVar(self.root, value='33.3'),
            StringVar(self.root, value='33.3'),
            StringVar(self.root, value='33.3')
        ]
        self.region_var = StringVar(self.root, value='All')
        self.level_var = StringVar(self.root, value='warehouse')
        self.objective = StringVar(self.root, value='Identify core products')
        self.natl_acct = IntVar(self.root, value=1)
        return None

    def input_page(self):
        """Build the page where the model inputs are edited."""
        self.loading.withdraw()

        try:
            # A previous analysis may have left its output window open.
            self.outputs.withdraw()
        except (AttributeError, TclError):
            pass

        define = Toplevel()
        self.define = define
        define.title("Define inputs")
        define.config(bg="white")
        frame = Frame(define)
        self.input_frame = frame

        Label(frame,
              text="Modify model inputs below and click RUN. ").grid(row=0,
                                                                     column=1,
                                                                     pady=10)
        Button(frame, text="RUN", width=8,
               command=self.check_inputs).grid(row=0, column=2, pady=10)

        Label(frame, text="Select model goal: ").grid(row=1, column=0, pady=10)
        OptionMenu(frame, self.objective, 'Identify core products', 'Identify products to remove')\
            .grid(row=1, column=1, pady=10)

        Label(frame, text="Select segment: ").grid(row=2, column=0, pady=10)
        OptionMenu(frame, self.segment_var,
                   *self.segment_options).grid(row=2, column=1, pady=10)

        Label(frame,
              text="Exclude products ordered by national account(s)? ").grid(
                  row=3, column=0, pady=10)
        Checkbutton(frame, text='', variable=self.natl_acct).grid(row=3,
                                                                  column=1)

        Label(frame,
              text="Select scope level and press REFRESH: ").grid(row=4,
                                                                  column=0,
                                                                  pady=10)
        OptionMenu(frame, self.level_var, *self.level_options).grid(row=4,
                                                                    column=1,
                                                                    pady=10)
        Button(frame, text='REFRESH',
               command=self.popup_level_options).grid(row=4, column=2, pady=10)

        Label(frame, text="Set % to identify: ").grid(row=5, column=0, pady=10)
        Entry(frame, textvariable=self.cutoff_var).grid(row=5,
                                                        column=1,
                                                        pady=10)

        Label(frame,
              text="Select field(s) to consider and enter weights: ").grid(
                  row=6, column=0, pady=10)

        self.btns = []
        self.entries = []
        self.rows = []

        # Rebuild field_var with exactly one variable per field. On a rerun
        # the previous variables are reused (keeping the user's choices)
        # instead of appending a second set, which used to make field_var
        # and weight_var longer than the list of widgets.
        previous_fields = self.field_var
        self.field_var = []

        for idx, field_name in enumerate(self.field_options):
            if idx >= len(self.weight_var):
                # StringVar rather than IntVar so fractional weights typed
                # by the user are not truncated.
                self.weight_var.append(StringVar(self.root, value='0'))

            if idx < len(previous_fields):
                var = previous_fields[idx]
            else:
                # A field starts checked when its weight is non-zero.
                weight = float(self.weight_var[idx].get())
                var = IntVar(self.root, value=1 if weight else 0)
            self.field_var.append(var)

            row = 6 + idx
            btn = Checkbutton(frame, text=field_name, variable=var)
            btn.grid(row=row, column=1, pady=10)

            entry = Entry(frame, textvariable=self.weight_var[idx])
            entry.grid(row=row, column=2, pady=10, padx=10)

            self.btns.append(btn)
            self.entries.append(entry)
            self.rows.append(row)

        self.last_row = 6 + len(self.field_options)

        Button(frame, text="Set equal weights among checked fields", command=self.reset_weights)\
            .grid(row=6, column=3, pady=10)

        frame.grid(row=4, column=len(self.field_options))

        define.mainloop()

    def popup_level_options(self):
        """Show the warehouse or region selector for the chosen level."""
        try:
            self.wh_label.destroy()
            self.wh_optionmenu.destroy()
        except AttributeError:
            # Selector was never created.
            pass

        try:
            self.region_label.destroy()
            self.region_optionmenu.destroy()
        except AttributeError:
            pass

        level = self.level_var.get()

        if level == 'warehouse':
            self.wh_label = Label(self.input_frame,
                                  text="Select warehouse(s): ")
            self.wh_label.grid(row=4, column=3, pady=10, padx=10)
            self.wh_optionmenu = OptionMenu(self.input_frame, self.wh_var,
                                            *self.wh_options)
            self.wh_optionmenu.grid(row=4, column=4, pady=10)

        elif level == 'region':
            self.region_label = Label(self.input_frame,
                                      text="Select region(s): ")
            self.region_label.grid(row=4, column=3, pady=10)
            self.region_optionmenu = OptionMenu(self.input_frame,
                                                self.region_var,
                                                *self.region_options)
            self.region_optionmenu.grid(row=4, column=4, pady=10)

    def reset_weights(self):
        """Give every checked field an equal share of 100, others 0."""
        checked = sum(var.get() for var in self.field_var)
        for var, weight in zip(self.field_var, self.weight_var):
            # The Entry widgets follow their textvariable, so updating the
            # variable is enough; no widget has to be recreated.
            if var.get() == 1:
                weight.set(round(100 / checked, 2))
            else:
                weight.set(0)

    def check_inputs(self):
        """Validate cutoff and weights; run the model or show an error."""
        previous = getattr(self, 'ErrorLabel', None)
        if previous is not None:
            previous.destroy()
            self.ErrorLabel = None

        err = ''

        try:
            self.cutoff = float(self.cutoff_var.get())
        except ValueError:
            err = 'ERROR. Enter a numeric value for % core products.'

        try:
            self.weights = [float(w.get()) for w in self.weight_var]
        except (TclError, ValueError):
            # float() raises ValueError for non-numeric text in a StringVar.
            err = 'ERROR. Enter a numeric value for each of the field weights.'
        else:
            # Explicit check: an assert would vanish under ``python -O``.
            if not 99 <= sum(self.weights) <= 101:
                err = 'ERROR. Sum of field weights must equal 100.'

        if err:
            # Keep the widget itself (grid() returns None) so it can be
            # removed on the next attempt.
            self.ErrorLabel = Label(self.input_frame, text=err)
            self.ErrorLabel.grid(row=self.last_row + 1, column=0)
            return

        self.format_vars()
        self.loading_page2()

    def _scope_values(self, selection, column):
        """Convert a drop-down selection into the list of codes to use.

        A numeric selection yields ``[int(selection)]``; ``'All'`` yields
        every distinct value of ``self.df[column]``.

        Raises:
            ValueError: if the selection is neither numeric nor ``'All'``.
        """
        try:
            return [int(selection)]
        except ValueError:
            if selection != 'All':
                raise ValueError('Unexpected {} selection: {!r}'.format(
                    column, selection))
            return self.df[column].unique()

    def format_vars(self):
        """Read the input widgets and build ``self.model``."""
        field_var = [x.get() for x in self.field_var]

        segment_var = self.segment_var.get()
        wh_var = self._scope_values(self.wh_var.get(), 'legacy_division_cd')
        region_var = self._scope_values(self.region_var.get(),
                                        'legacy_system_cd')
        level_var = self.level_var.get()
        obj_var = self.objective.get()
        natl_acct_var = self.natl_acct.get()

        params = [
            obj_var, segment_var, field_var, natl_acct_var, self.field_options,
            self.cutoff, self.weights, self.df, self.fname
        ]

        if level_var == 'warehouse':
            self.model = Vectorize(level_var, wh_var, *params)

        elif level_var == 'region':
            self.model = Vectorize(level_var, region_var, *params)

        else:
            # Any other level (e.g. 'enterprise') is run as all regions.
            self.model = Vectorize('region', 'All', *params)

    def loading_page2(self):
        """Show a 'Loading...' window and schedule the model run."""
        self.define.withdraw()

        loading2 = Toplevel()
        self.loading2 = loading2
        loading2.title("Loading...")
        loading2.config(bg="white")
        frame = Frame(loading2)

        Label(frame, text='Loading...').grid(row=2,
                                             column=2,
                                             pady=100,
                                             padx=100)
        frame.grid(row=4, column=4)

        self.loading2.after(200, self.move_on)
        loading2.mainloop()

    def move_on(self):
        """Run the model, then show its output."""
        self.model.run()
        self.output_page()

    def output_page(self):
        """Show the model's text output with export / rerun buttons."""
        self.redo = True
        self.loading2.withdraw()

        outputs = Toplevel()
        self.outputs = outputs
        outputs.title("Outputs")
        outputs.config(bg="white")
        self.output_frame = Frame(outputs)

        Button(self.output_frame, text="Export to Excel",
               command=self.export).grid(row=1, column=0, pady=10)
        Button(self.output_frame,
               text="Rerun with new parameters",
               command=self.input_page).grid(row=2, column=0, pady=10)

        text = Text(self.output_frame)
        text.grid(row=3, column=0, pady=0)
        text.insert(INSERT, self.model.string_output())

        self.output_frame.grid(row=3, column=1)

        outputs.mainloop()

    def export(self):
        """Ask for a target file and write ``self.model.df`` to Excel."""
        import os
        from tkinter.filedialog import asksaveasfilename

        addon = '_' + self.level_var.get()
        if self.objective.get() == 'Identify core products':
            addon += '_new_core'
        else:
            addon += '_rationalized'

        stem = os.path.splitext(os.path.basename(self.fname))[0]

        fout = asksaveasfilename(
            # The input file's folder (joining the path parts with '' used
            # to produce a non-existent directory name).
            initialdir=os.path.dirname(self.fname),
            initialfile=stem + addon,
            # Without an extension to_excel() cannot pick a writer.
            defaultextension='.xlsx',
            filetypes=[('Excel spreadsheet', '.xlsx')])

        if not fout:
            # Dialog cancelled.
            return

        self.model.df.to_excel(fout)

        Label(self.output_frame,
              text="Exported successfully!").grid(row=0, column=0, pady=10)
class EncoderBase:
    """Shared data loading and cosine-based loss terms for name encoders."""

    def __init__(self, data_infile, fasttext_model_path):
        """Load the name clusters and embed every name.

        Args:
            data_infile: path to a JSON file mapping each label to a list
                of strings (names) belonging to it.
            fasttext_model_path: passed unchanged to ``Vectorize``.
        """
        self.loss_weights = {'siamese': 1, 'grounding': 1}

        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding.
        with open(data_infile, 'r', encoding='utf-8') as f:
            self.clusters = json.load(f)

        # Reverse index: name -> label (last label wins for duplicates).
        self.lookup = {
            string: label
            for label, strings in self.clusters.items()
            for string in strings
        }

        # embed all names
        all_names = set(self.lookup)
        self.vectorize = Vectorize(fasttext_model_path)
        self.pretrained_name_embeddings = self.vectorize.create_reach_object(
            all_names)

        self.amount_negative_names = 1
        self.triplet_margin = 0.1
        self.anchor_margin = 0

        # NOTE: this switches on autograd anomaly detection globally, not
        # just for this object.
        torch.autograd.set_detect_anomaly(True)

    def preprocess(self, name):
        """Return *name* tokenized, re-joined with spaces and lower-cased."""
        return ' '.join(tokenize(name)).lower()

    def triplet_loss(self,
                     positive_distance,
                     negative_similarity,
                     override_margin=False,
                     new_margin=0):
        """Return ``relu(positive_distance - negative_similarity + margin)``.

        The margin is ``self.triplet_margin`` unless ``override_margin`` is
        truthy, in which case ``new_margin`` is used.
        """
        margin = new_margin if override_margin else self.triplet_margin
        return F.relu(positive_distance - negative_similarity + margin)

    @staticmethod
    def _row_cosines(left, right):
        """Cosine similarity between matching rows of two 2-D batches.

        ``F.normalize`` guards the division, so an all-zero row yields a
        cosine of 0 instead of NaN.
        """
        return (F.normalize(left, dim=1) * F.normalize(right, dim=1)).sum(
            dim=1)

    def positive_distance(self, anchor_batch, positive_batch):
        """Return 1 minus the mean row-wise cosine similarity of the batches."""
        return 1 - self._row_cosines(anchor_batch, positive_batch).mean()

    def negative_distance(self,
                          anchor_batch,
                          negatives_batch,
                          override_amount_negative=0):
        """Return 1 minus the mean cosine between anchors and their negatives.

        Args:
            anchor_batch: tensor reshaped here to ``(-1, 1, D)`` where ``D``
                is the last dimension of ``negatives_batch``.
            negatives_batch: 3-D tensor; the second axis holds the negatives
                of one anchor.
            override_amount_negative: kept for compatibility. The number of
                negatives is now taken from ``negatives_batch`` itself, so
                a mismatching value can no longer corrupt the normalisation.
        """
        reference = anchor_batch.reshape(-1, 1, negatives_batch.shape[-1])
        ref = F.normalize(reference, dim=2)
        neg = F.normalize(negatives_batch, dim=2)

        # Cosine of each anchor with each of its negatives, averaged per
        # anchor first and then over the batch.
        per_anchor = (ref * neg).sum(dim=2).mean(dim=1)
        return 1 - per_anchor.mean()

    def batch_cosines(self, anchor_batch, distance_batch):
        """Return row-wise cosine similarities with shape ``(batch, 1, 1)``."""
        return self._row_cosines(anchor_batch,
                                 distance_batch).reshape(-1, 1, 1)

    def pretrained_loss(self, online_batch, pretrained_batch):
        """Return ``relu(1 - mean cosine + self.anchor_margin)``."""
        mean_cosine = self._row_cosines(online_batch, pretrained_batch).mean()
        return F.relu(1 - mean_cosine + self.anchor_margin)