Example #1
 def __init__(self, epochs, batch_size, seq_length):
     self.epochs = epochs
     self.config = self.get_config()
     self.model: TaModel = TaModel()
     self.reader = TaDataReader(self.config['data'], batch_size, seq_length)
     self.timer = Timer()
     # Network Information
     self.criterion = nn.MSELoss()  #nn.CrossEntropyLoss() - nn.NLLLoss()
     self.optimizer = optim.Adam(self.model.parameters(), lr=0.003)
     print(self.reader.get_train_count())
     print(self.reader.get_test_count())
Example #2
 def __init__(self, path=None, vector_size=100, word_processing=None):
     self.timer = Timer()
     self.manager = multiprocessing.Manager()
     WordEmbedding.Words = self.manager.dict()
     self.vector_size = vector_size
     if path is None:
         self.path = 'glove.6B.100d.txt'
     else:
         self.path = path
     if word_processing is None:
         self.word_processing = PreProcessing()
     else:
         self.word_processing = word_processing
     self.__read_embeddings()
Example #3
    def __init__(self,
                 config,
                 epochs=None,
                 batch_size=None,
                 seq_length=None,
                 use_gpu=None,
                 lr=None,
                 hidden_size=None):
        self.config = config
        if epochs is None:
            self.epochs = config["networkConfig"]["epochs"]
        else:
            self.epochs = epochs
        if batch_size is None:
            self.batch_size = config["networkConfig"]["batch_size"]
        else:
            self.batch_size = batch_size
        if seq_length is None:
            self.seq_length = self.config["networkConfig"]["sequence_length"]
        else:
            self.seq_length = seq_length
        if use_gpu is None:
            self.use_gpu = self.config["networkConfig"]["useGPU"]
        else:
            self.use_gpu = use_gpu
        if hidden_size is None:
            if self.config["networkConfig"]["hidden_size"] < 0:
                self.hidden_size = None
            else:
                self.hidden_size = self.config["networkConfig"]["hidden_size"]
        else:
            self.hidden_size = hidden_size
        if lr is None:
            self.lr = self.config["networkConfig"]["learning_rate"]
        else:
            self.lr = lr
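        # The config dict is assumed to carry a "networkConfig" section shaped roughly like:
        # {"networkConfig": {"epochs": 10, "batch_size": 32, "sequence_length": 50,
        #                    "useGPU": true, "hidden_size": -1, "learning_rate": 0.001}}
        # where a negative hidden_size means "fall back to the model's default".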

        self.timer = Timer()
        self.current_date = DateHelper.get_current_date()
        self.criterion = self.load_criterion()
Example #4
import multiprocessing
from datetime import timedelta

import gensim
import numpy as np
from scipy import spatial

# Project-local helpers (Timer, PreProcessing, Mongo) are assumed to be importable
# from elsewhere in this repository.


class WordEmbedding(object):

    Words = {}

    def __init__(self, path=None, vector_size=100, word_processing=None):
        self.timer = Timer()
        self.manager = multiprocessing.Manager()
        WordEmbedding.Words = self.manager.dict()
        self.vector_size = vector_size
        if path is None:
            self.path = 'glove.6B.100d.txt'
        else:
            self.path = path
        if word_processing is None:
            self.word_processing = PreProcessing()
        else:
            self.word_processing = word_processing
        self.__read_embeddings()

    def __read_embeddings(self):
        WordEmbedding.Words = self.manager.dict()
        self.timer.start()
        with open(self.path, 'r', encoding="utf-8") as f:
            for line in f:
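                # Each line of a GloVe text file is "<word> <v1> <v2> ... <vN>" with N = vector_size floats.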
                values = line.split()
                word = values[0]
                if self.word_processing.is_stop_word_or_punctuation(word):
                    continue
                vector = np.asarray(values[1:], dtype=np.float32)
                WordEmbedding.Words[word] = vector
        self.timer.stop(time_for='Word Embedding Loading')

    def __read_embeddings_gensim(self):
        self.timer.start()
        WordEmbedding.Words = gensim.models.KeyedVectors.load_word2vec_format(
            self.path, binary=True)
        self.timer.stop(time_for='Word Embedding Loading')

    @staticmethod
    def vec(w):
        return WordEmbedding.Words[w]

    @staticmethod
    def find_closest_embeddings(embedding):
        return sorted(WordEmbedding.Words.keys(),
                      key=lambda word: spatial.distance.cosine(
                          WordEmbedding.Words[word], embedding))

    @staticmethod
    def multi_cosine_distance_word_embedding(count, date, news_title):
        cpu_count = int((multiprocessing.cpu_count() / 2))
        p = multiprocessing.Pool(cpu_count)
        numbers = list()
        total = int(count / cpu_count)
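        # Split `count` tweets into one skip/limit window per worker; the last worker also
        # takes the remainder. E.g. count=10 on 4 workers gives windows of 2, 2, 2 and 4
        # tweets starting at offsets 0, 2, 4 and 6.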
        for a in range(cpu_count):
            if a == cpu_count - 1:
                info = {
                    "skip": total * a,
                    "to": (total + (count % cpu_count)),
                    "date": date,
                    "news_title": news_title
                }
                numbers.append(info)
            else:
                info = {
                    "skip": total * a,
                    "to": total,
                    "date": date,
                    "news_title": news_title
                }
                numbers.append(info)
        result = p.map(WordEmbedding.calculate_distance_for_tweet, numbers)
        p.close()
        p.join()
        return sum(result)
        #p.map(mp_worker, data)

    @staticmethod
    def calculate_distance_for_tweet(info):
        skip = info["skip"]
        get = info["to"]
        date = info["date"]
        title = info["news_title"]
        db = Mongo(test=2)
        pre = PreProcessing()
        tweets = WordEmbedding.get_tweets_before_date(
            db, date).skip(skip).limit(get)
        tweetcount = 0
        count = 0
        print(get)
        vector = WordEmbedding.get_vector_list(title)
        for tweet in tweets:
            tweetcount += 1
            try:
                cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                    vector, pre.preprocess(tweet["tweet_text"]))
                percentage = round((1 - cosine) * 100, 2)
            except Exception as exception:
                print("Exception:", exception)
                percentage = 0

            if percentage > 80:
                count += 1
                if tweet["tweet_user_verified"]:
                    count += 1
        print("count" + str(count))
        return count

    @staticmethod
    def get_tweets_before_date(db, date, collection="Tweet", days=5):
        start = date - timedelta(days=days)
        end = date
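        # With days=5, this returns tweets from `date - 5 days` (inclusive) up to `date` (exclusive).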
        query = {"tweet_created_at": {"$gte": start, "$lt": end}}
        fields = {
            "tweet_text": 1,
            "tweet_user_fallowers_count": 1,
            "tweet_user_verified": 1,
            "tweet_created_at": 1,
            "_id": 0
        }
        return db.get_data(collection, query,
                           fields).sort([("tweet_created_at", 1)])

    @staticmethod
    def cosine_distance_word_embedding_with_vector(vector, s2):
        vector2 = WordEmbedding.get_vector_list(s2)
        if vector2 is np.NaN:
            return 0.99
        else:
            mean = np.mean(vector, axis=0)
            mean2 = np.mean(vector2, axis=0)
            cosine = spatial.distance.cosine(mean, mean2)
            return cosine

    @staticmethod
    def cosine_distance_word_embedding(s1, s2):
        try:
            vector_1 = np.mean(WordEmbedding.get_vector_list(s1), axis=0)
            vector_2 = np.mean(WordEmbedding.get_vector_list(s2), axis=0)
        except Exception:
            return 0.99
        cosine = spatial.distance.cosine(vector_1, vector_2)
        return cosine

    @staticmethod
    def get_vector_list(paragraph):
        word_to_vector_list = []
        for word in paragraph:
            if word in WordEmbedding.Words:
                word_to_vector_list.append(WordEmbedding.vec(word))
        if len(word_to_vector_list) == 0:
            return np.NaN
        return word_to_vector_list

    def _similarity_query(self, word_vec, number):
        words = list(WordEmbedding.Words.keys())
        words_matrix = np.array(list(WordEmbedding.Words.values()))
        # Cosine similarity between word_vec and every stored embedding
        dst = (np.dot(words_matrix, word_vec) /
               np.linalg.norm(words_matrix, axis=1) / np.linalg.norm(word_vec))
        word_ids = np.argsort(-dst)

        return [(words[x], dst[x]) for x in word_ids[:number]]

    # return [(self.inverse_dictionary[x], dst[x]) for x in word_ids[:number]
    #            if x in self.inverse_dictionary]
    # https://github.com/maciejkula/glove-python/blob/749494290fdfd24379dcc2e244c583ee61808634/glove/glove.py#L273
    # https://stats.stackexchange.com/questions/242863/how-does-python-glove-compute-most-similar
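    # Hypothetical usage: embedding._similarity_query(WordEmbedding.vec('stock'), 5) would
    # return the five vocabulary words whose embeddings have the highest cosine similarity
    # to the vector of 'stock' (assuming 'stock' survives stop-word filtering and is in the file).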

    def get_weight_matrix(self, article):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size),
                                    dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector
        return embedding_matrix

    def get_weight_matrix_with_wiki_tweet(self, article, wiki, tweet):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size + 2),
                                    dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                # Append the wiki score and the tweet score to the word vector
                embedding_matrix[index] = np.append(
                    np.append(embedding_vector, wiki / 100), tweet)
        return embedding_matrix

    def get_weight_matrix_with_wiki(self, article, wiki):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size + 1),
                                    dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                # Add Wiki Info
                embedding_matrix[index] = np.append(embedding_vector,
                                                    wiki / 100)
        return embedding_matrix

    def get_weight_matrix_all(self,
                              article,
                              wiki=None,
                              wiki_multiply_factors=0,
                              tweet=None,
                              tweet_multiply_factors=0):
        vocabulary_size = len(article)
        vector_size = self.vector_size + wiki_multiply_factors + tweet_multiply_factors
        embedding_matrix = np.zeros((vocabulary_size, vector_size),
                                    dtype=np.double)
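        # Each row is laid out as [word vector | wiki score repeated wiki_multiply_factors
        # times | tweet score repeated tweet_multiply_factors times].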
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                combined = embedding_vector
                if wiki is not None:
                    wiki_array = np.full(wiki_multiply_factors, wiki / 100)
                    combined = np.append(combined, wiki_array)
                if tweet is not None:
                    tweet_array = np.full(tweet_multiply_factors, tweet)
                    combined = np.append(combined, tweet_array)
                embedding_matrix[index] = combined
        return embedding_matrix
Example #5
import datetime as dt
import json
import os

import numpy as np
import pandas
import torch
import torch.nn as nn
import torch.optim as optim

# Project-local helpers (TaModel, TaDataReader, Timer, DateHelper, Export, DateTimeDecoder)
# are assumed to be importable from elsewhere in this repository.


class TaMain(object):
    """ Initializer

            Arguments
            ---------
            epochs: Number of epochs to train
            batch_size: Number of mini-sequences per mini-batch, aka batch size
            seq_length: Number of character steps per mini-batch
    """
    def __init__(self, epochs, batch_size, seq_length):
        self.epochs = epochs
        self.config = self.get_config()
        self.model: TaModel = TaModel()
        self.reader = TaDataReader(self.config['data'], batch_size, seq_length)
        self.timer = Timer()
        # Network Information
        self.criterion = nn.MSELoss()  #nn.CrossEntropyLoss() - nn.NLLLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.003)
        print(self.reader.get_train_count())
        print(self.reader.get_test_count())

    def train(self, lr=0.001, clip=5, val_frac=0.1, print_every=10):
        """ Training a network

            Arguments
            ---------
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(
            columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
        self.timer.start()
        self.model.train()

        if self.model.train_on_gpu:
            self.model.cuda()

        counter = 0
        h = None
        for e in range(self.epochs):
            if h is None:  # initialize hidden state
                h = self.model.init_hidden(self.reader.batch_size)

            for x, y in self.reader.get_train_data(
            ):  # get_batches(data, batch_size, seq_length):
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.train_on_gpu:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                self.model.zero_grad()
                # get the output from the model -
                output, h = self.model(
                    inputs, h
                )  # Input Should Be 3-Dimensional: seq_len, batch, input_size
                # calculate the loss and perform back propagation
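                # MSELoss compares element-wise, so targets are flattened to
                # (batch_size * seq_length,); the model output is assumed to have the same shape.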
                loss = self.criterion(
                    output,
                    targets.view(self.reader.batch_size *
                                 self.reader.sequence_length))
                loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # loss stats
                if counter % print_every == 0:
                    # Get validation loss
                    val_h = self.model.init_hidden(self.reader.batch_size)
                    val_losses = []
                    self.model.eval()
                    for x, y in self.reader.get_test_data(
                    ):  # get_batches(val_data, batch_size, seq_length):

                        x, y = torch.from_numpy(x), torch.from_numpy(y)

                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                        val_h = tuple([each.data for each in val_h])

                        inputs, targets = x, y
                        if self.model.train_on_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        output, val_h = self.model(inputs, val_h)
                        val_loss = self.criterion(
                            output,
                            targets.view(self.reader.batch_size *
                                         self.reader.sequence_length))

                        val_losses.append(val_loss.item())

                    self.model.train()  # reset to train mode after iterating through validation data
                    print("Epoch: {}/{}...".format(e + 1, self.epochs),
                          "Step: {}...".format(counter),
                          "Loss: {:.4f}...".format(loss.item()),
                          "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    df = df.append(
                        {
                            'Epoch': "{}/{}".format(e + 1, self.epochs),
                            'Step': counter,
                            'Last Train Loss': loss.item(),
                            'Mean Test Loss': np.mean(val_losses)
                        },
                        ignore_index=True)
        self.timer.stop()
        self.save_model()
        date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, date)
        Export.append_df_to_excel(self.get_info(), date)

    def test(self):
        # Test the network
        for data in self.reader.get_test_data():
            # Format Data
            print(data)
            # Train

    def get_info(self):
        info = pandas.DataFrame(columns=[
            'Database', 'Key', 'Batch Size', 'Sequence Length', 'Input Size',
            'Hidden', 'Number of Layers', 'Dropout Prob', 'Learning Rate'
        ])
        info = info.append(
            {
                'Database': self.config["data"]["db"],
                'Key': self.config["data"]["train_query"]["Key"],
                'Batch Size': self.reader.batch_size,
                'Sequence Length': self.reader.sequence_length,
                'Input Size': self.model.input_size,
                'Hidden': self.model.hidden,
                'Number of Layers': self.model.num_layers,
                'Dropout Prob': self.model.drop_prob,
                'Learning Rate': self.model.lr
            },
            ignore_index=True)
        return info

    def get_save_file_name(self):
        # serialize model to JSON
        save_file_name = os.path.join(
            self.config["model"]["save_dir"],
            '%s-e%s(%s-%s).pth' % (dt.datetime.now().strftime('%d%m%Y-%H%M%S'),
                                   str(self.epochs), self.config["data"]["db"],
                                   self.config["data"]["train_query"]["Key"]))

        return save_file_name

    def save_model(self):
        # serialize model to JSON
        save_file_name = self.get_save_file_name()
        checkpoint = {
            'model': TaModel(),
            'model_state_dict': self.model.state_dict(),
            'optimizer': optim.Adam(self.model.parameters(), lr=0.003),
            'optimizer_state_dict': self.optimizer.state_dict()
        }

        torch.save(checkpoint, save_file_name)
        print("Model Saved to disk")

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.model = checkpoint['model']
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer = checkpoint['optimizer']
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print("Model loaded from disk")

    @staticmethod
    def get_config():
        pwd = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(pwd, 'config.json'), 'r') as config_file:
            return json.load(config_file, cls=DateTimeDecoder)
Example #6
    def train(self, clip=5, val_frac=0.1, print_every=20):
        """ Training a network

            Arguments
            ---------
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(
            columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
        self.timer.start()
        self.model.train()

        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            self.model.cuda()

        counter = 0
        h = None
        for e in range(self.epochs):
            h = self.model.init_hidden(self.reader.batch_size)

            print(self.config["options"]["network_type"])
            print(NewsDnnGeneralDataReader.DictDataType[self.config["options"]
                                                        ["network_type"]])
            # Batch Loop
            for x, y in self.reader.get_data(
                    fetch_type=NewsDnnGeneralDataReader.DictDataTerm["Train"],
                    data_type=NewsDnnGeneralDataReader.DictDataType[
                        self.config["options"]["network_type"]]):
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.can_use_gpu and self.config["networkConfig"][
                        "useGPU"]:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                self.model.zero_grad()

                # get the output from the model -
                output, h = self.model(
                    inputs, h
                )  # Input Should Be 3-Dimensional: seq_len, batch, input_size

                # calculate the loss and perform back propagation
                loss = self.criterion(output.squeeze(), targets.long())
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # Validate
                if counter % print_every == 0:
                    timer = Timer()
                    timer.start()
                    df = self.validate(df, e, counter, loss)
                    timer.stop(time_for="Validate")
                self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
Example #7
    def train(self, clip=5, val_frac=0.1, print_every=20):
        """ Training a network

            Arguments
            ---------
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(columns=['Epoch', 'Step',
                                       'Train Mean Loss Cumulative', 'Train Accuracy',
                                       'Val Mean Loss', 'Val Accuracy'])
        self.timer.start()
        self.model.train()

        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            self.model.cuda()

        counter = 0
        for e in range(self.epochs):

            print(self.config["options"]["network_type"])
            print(NewsDnnBaseDataReader.DictDataType[
                      self.config["options"]["network_type"]])
            train_accuracy = 0
            losses = []
            # Batch Loop
            for x, y in self.reader.get_data(fetch_type=NewsDnnBaseDataReader.DictDataTerm["Train"],
                                             data_type=NewsDnnBaseDataReader.DictDataType[self.config["options"]["network_type"]]):
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                    inputs, targets = inputs.cuda(), targets.cuda()



                # zero accumulated gradients
                self.optimizer.zero_grad()
                # self.model.zero_grad()

                # get the output from the model -
                output = self.model(inputs)  # Input Should Be 3-Dimensional: seq_len, batch, input_size

                # calculate the loss and perform back propagation
                loss = self.criterion(output, targets.long())
                loss.backward()
                losses.append(loss.item())
                train_accuracy += self.calculate_accuracy(output, targets)

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # Validate In Steps
                if counter % print_every == 0:
                    timer = Timer()
                    timer.start()
                    df = self.validate(df, e, counter, losses, train_accuracy, print_every)
                    train_accuracy = 0  # Clear Train Accuracy
                    timer.stop(time_for="Validate")
                    self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
Example #8
    def train(self, print_every=20):
        df = pandas.DataFrame(columns=['Epoch', 'Step',
                                       'Train Mean Loss Cumulative', 'Train Accuracy',
                                       'Val Mean Loss', 'Val Accuracy'])
        self.timer.start()
        self.model.train()  # Set mode of model
        losses = []
        train_set = self.reader.get_data(fetch_type=NewsCateDataReader.DictDataTerm["Train"],
                                         data_type=NewsCateDataReader.DictDataType[
                                             self.config["options"]["network_type"]])
        for e in range(self.epochs):
            print(self.config["options"]["network_type"])
            print(NewsCateDataReader.DictDataType[
                      self.config["options"]["network_type"]])
            self.model.train()  # Set to Train Mode
            total_loss_for_epoch = 0

            epoch_timer = Timer()
            epoch_timer.start()
            for step, batch in enumerate(train_set): # For each batch of training data...
                # Progress update every print_every batches.
                if step % print_every == 0:
                    # Report progress.
                    print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_set)))
                # Get Data
                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                self.model.zero_grad()

                # Perform a forward pass (evaluate the model on this training batch).
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
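                # With `labels` supplied, BertForSequenceClassification (transformers v2.x)
                # returns (loss, logits, ...), so outputs[0] is the training loss.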
                loss = outputs[0]
                total_loss_for_epoch += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # modified based on their gradients, the learning rate, etc.
                self.optimizer.step()

                # Update the learning rate.
                self.scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss_for_epoch / len(train_set)

            # Store the loss value for plotting the learning curve.
            losses.append(avg_train_loss)
            LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
            epoch_timer.stop(time_for="Epoch")

            timer = Timer(start=True)
            df = self.validate(df, e, losses)
            timer.stop(time_for="Validate")
            self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)