Example #1
    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
            model_path=model_path,
            label_path=label_path,  # location for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=False)
        self.preprocessor = TextPreprocessor()
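
The fragment above configures fast-bert's BertClassificationPredictor. A minimal usage sketch, assuming a trained model directory and a labels.csv file (both paths below are placeholders, not from the original project):

from fast_bert.prediction import BertClassificationPredictor

predictor = BertClassificationPredictor(
    model_path='models/bert_out',  # hypothetical path to a trained model
    label_path='data/',            # hypothetical directory containing labels.csv
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

# fast-bert returns a list of (label, confidence) pairs, best match first.
print(predictor.predict("great product, works as expected"))
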
Example #2

def get_tf_idf_model(citations=None):
    if citations is None:
        citations = TextPreprocessor()
        citations.preprocess()

    documents = [
        citation['title'] + ' \n' + citation['abstract']
        for citation in list(citations.values())
    ]
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigrams = bigram_vectorizer.fit_transform(documents)

    tfidf = TfidfTransformer().fit_transform(bigrams)

    return citations, bigram_vectorizer, tfidf
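
The two-step CountVectorizer + TfidfTransformer pipeline above is equivalent to scikit-learn's single TfidfVectorizer; a minimal sketch of the one-step form:

from sklearn.feature_extraction.text import TfidfVectorizer

# One step instead of CountVectorizer(ngram_range=(1, 2)) + TfidfTransformer();
# keeps the fitted vocabulary and IDF weights in a single object.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(documents)
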
Example #3
class SentimentAnalyzer(object):
    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
                        model_path=model_path,
                        label_path=label_path, # location for labels.csv file
                        multi_label=False,
                        model_type='bert',
                        do_lower_case=False)
        self.preprocessor = TextPreprocessor()


    def predict_sentiment(self, tweet):
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        for label, confidence in prediction:
            if label == "0" and confidence >= 0.7:
                return "Negative"

            if label == "4" and confidence >= 0.7:
                return "Positive"

        return "Neutral"

    def batch_predict_sentiment(self, tweets):
        processed_tweets = []

        for tweet in tweets:
            processed_tweets.append(self.preprocessor.process(tweet))

        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        results = []

        for prediction in predictions:
            label_to_prob = dict(prediction)

            if label_to_prob["0"] >= 0.7:
                results.append("Negative")
            elif label_to_prob["4"] >= 0.7:
                results.append("Positive")
            else:
                results.append("Neutral")

        return results
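
A minimal usage sketch, assuming a trained fast-bert model directory (paths are placeholders). The "0" and "4" labels follow the Sentiment140 convention, where 0 is negative and 4 is positive:

analyzer = SentimentAnalyzer(model_path='models/bert_out', label_path='data/')
print(analyzer.predict_sentiment("I love this!"))          # e.g. "Positive"
print(analyzer.batch_predict_sentiment(["meh", "awful"]))  # e.g. ["Neutral", "Negative"]
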
Example #4
    def predict(self, model):
        tqdm.pandas()
        print('preprocessing test data...')
        tp = TextPreprocessor()
        self.dftest['clean_text'] = self.dftest['text'].progress_apply(tp.pre_process_text)
        # collapse Sentiment140-style positive label 4 to 1
        self.dftest['label'] = self.dftest['label'].replace(4, 1)

        print('bag of words test data...')
        X_test = self.vect.transform(self.dftest['clean_text'])
        Y_test = self.dftest['label']

        print('predict...')
        preds = self.evaluate_test(X_test, Y_test, model)
        return preds
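
evaluate_test is defined elsewhere in the project; a plausible minimal sketch of what it does (the body below is an assumption, not the original implementation):

from sklearn.metrics import accuracy_score, classification_report

def evaluate_test(self, X_test, Y_test, model):
    # Hypothetical reconstruction: predict on the test split and report
    # standard classification metrics.
    preds = model.predict(X_test)
    print('accuracy:', accuracy_score(Y_test, preds))
    print(classification_report(Y_test, preds))
    return preds
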
Example #5
    def predict(self, model):
        tqdm.pandas()
        print('preprocessing test data...')
        tp = TextPreprocessor()
        self.dftest['clean_text'] = self.dftest['text'].progress_apply(
            tp.pre_process_text_no_stemming)

        print('word embeddings test data...')
        sequences = self.vect.texts_to_sequences(
            self.dftest['clean_text'].values)

        X_test = pad_sequences(sequences, maxlen=self.max_len)
        y_test = self.dftest['label'].values
        print('predict...')

        preds = model.predict(X_test)
        y_preds = [self.prob_to_sentiment_label(pred) for pred in preds]

        prob_map = ['negative', 'neutral', 'positive']

        probs = []
        for pred in preds:
            di = {}
            for i, prob in enumerate(pred):
                di[prob_map[i]] = prob
            probs.append(di)

        ##probs = ["{}:{}".format(prob_map[i[0]], prob) for i, prob in enumerate(preds)]

        self.dftest['pred'] = y_preds
        self.dftest['prob'] = probs

        submission = self.dftest[['text', 'label', 'pred', 'prob']]

        submission.to_csv('data/predictions_3_categories.csv')

        # print(classification_report(y_test, y_preds))

        score, acc = model.evaluate(X_test,
                                    np_utils.to_categorical(
                                        self.dftest['label'].values),
                                    verbose=2,
                                    batch_size=128)
        print("score: %.2f" % (score))
        print("acc: %.2f" % (acc))

        return y_preds
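
prob_to_sentiment_label is project code not shown here; given prob_map above, a plausible sketch is an argmax over the three class probabilities (an assumption, not the original):

import numpy as np

def prob_to_sentiment_label(self, pred):
    # Hypothetical reconstruction: index of the most probable class,
    # matching prob_map = ['negative', 'neutral', 'positive'].
    return int(np.argmax(pred))
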
Example #6
def init_sentence_encoder():
    global session, graph, text_preprocessor, encoding_ops, messages_plh
    graph = tf.Graph()
    text_preprocessor = TextPreprocessor()

    print(" Start initializing Tensorflow hub")
    os.environ["TFHUB_CACHE_DIR"] = '/tf_hub_cache'
    # Create and intialize the Tensorflow session
    with graph.as_default():
        module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
        embed = hub.Module(module_url)
        session = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))
        session.run(
            [tf.global_variables_initializer(),
             tf.tables_initializer()])
        messages_plh = tf.placeholder(dtype=tf.string, shape=[None])
        encoding_ops = embed(messages_plh)
    print(" Model Sentence Encoder is loaded")
Example #7
def main(hparams: HParams):
    '''
    generate captions from images
    '''
    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')
    text_preprocessor = TextPreprocessor.load(hparams.text_preprocessor_path)

    transform = transforms.Compose([
        transforms.Resize([hparams.crop_size, hparams.crop_size]),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # build model
    encoder = EncoderCNN(hparams.hidden_dim).eval()
    decoder = FactoredLSTM(hparams.embed_dim, text_preprocessor.vocab_size, hparams.hidden_dim,
                           hparams.style_dim, hparams.num_layers, train=False, device=device)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    checkpoints = torch.load(hparams.checkpoint_path, map_location=device)
    encoder.load_state_dict(checkpoints['encoder'])
    decoder.load_state_dict(checkpoints['decoder'])

    img_names, img_list = load_images(hparams.img_dir, transform)
    for img_name, img in zip(img_names, img_list):
        img = img.to(device)
        features = encoder(img)

        if hparams.decoder == 'greedy':
            output = decoder.sample_greedy(features, hparams.gen_max_len, hparams.mode,
                                           text_preprocessor.SOS_ID, text_preprocessor.EOS_ID)
            output = output[0].cpu().tolist()
        else:
            output = decoder.sample_beam(features, hparams.beam_width, hparams.gen_max_len, hparams.mode,
                                         text_preprocessor.SOS_ID, text_preprocessor.EOS_ID)

        # drop SOS and truncate at EOS (if the model emitted one)
        if text_preprocessor.EOS_ID in output:
            output = output[:output.index(text_preprocessor.EOS_ID)]
        output = output[1:]
        caption = text_preprocessor.indice2tokens(output)

        print(img_name)
        print(' '.join(caption))
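
Since this script only runs inference, the generation step can be wrapped in torch.no_grad() to avoid building autograd graphs; a sketch of the pattern applied to the greedy branch:

with torch.no_grad():  # inference only: no gradients needed
    features = encoder(img)
    output = decoder.sample_greedy(features, hparams.gen_max_len, hparams.mode,
                                   text_preprocessor.SOS_ID, text_preprocessor.EOS_ID)
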
Example #8

def get_tf_idf_model(citations=None):
    if citations is None:
        citations = TextPreprocessor()
        citations.preprocess()

    documents = [
        citation['title'] + ' \n' + citation['abstract']
        for citation in list(citations.values())
    ]
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigrams = bigram_vectorizer.fit_transform(documents)

    tfidf = TfidfTransformer().fit_transform(bigrams)

    return citations, bigram_vectorizer, tfidf


def get_most_similar_documents(tfidf_matrix, vectorizer, query):
    # Note: fitting a fresh TfidfTransformer on the single query recomputes IDF
    # from that one document alone; reusing the corpus-fitted transformer keeps
    # the weighting consistent with tfidf_matrix (see the variant below).
    query_tfidf = TfidfTransformer().fit_transform(
        vectorizer.transform([query])
    )
    document_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
    return document_similarities.argsort()[::-1]
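
A variant that reuses the transformer fitted on the corpus, so query weights stay consistent with tfidf_matrix (a sketch; get_tf_idf_model would have to return the fitted TfidfTransformer for this to work):

def get_most_similar_documents_consistent(tfidf_matrix, vectorizer, transformer, query):
    # 'transformer' is the TfidfTransformer fitted on the corpus bigrams.
    query_tfidf = transformer.transform(vectorizer.transform([query]))
    document_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
    return document_similarities.argsort()[::-1]
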


if __name__ == '__main__':
    citations = TextPreprocessor()
    citations.preprocess()

    input("Hit enter to continue...")

    count = 0
    skipped_count = 0
    for article, attributes in citations.items():
        # Skip neighboring articles
        if len(attributes['neighbors']) == 0:
            skipped_count += 1
            continue

        candidate_terms = get_candidates(citations, article)

        # Make features
        # Just printing them for now
Example #9
def main(hparams: HParams):
    '''
    setup training.
    '''
    if torch.cuda.is_available() and not hparams.gpus:
        warnings.warn(
            'WARNING: you have a CUDA device, so you should probably run with -gpus 0'
        )

    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')

    # data setup
    print(f"Loading vocabulary...")
    text_preprocessor = TextPreprocessor.load(hparams.preprocessor_path)

    transform = transforms.Compose([
        transforms.Resize([hparams.img_size, hparams.img_size]),
        transforms.RandomCrop([hparams.crop_size, hparams.crop_size]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # create dataloader
    print('Creating DataLoader...')
    normal_data_loader = get_image_caption_loader(
        hparams.img_dir,
        hparams.normal_caption_path,
        text_preprocessor,
        hparams.normal_batch_size,
        transform,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    style_data_loader = get_caption_loader(
        hparams.style_caption_path,
        text_preprocessor,
        batch_size=hparams.style_batch_size,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    if hparams.train_from:
        # loading checkpoint
        print('Loading checkpoint...')
        checkpoint = torch.load(hparams.train_from)

    # Build the optimizers unconditionally; when resuming, their state is
    # restored from the checkpoint after set_parameters() below.
    normal_opt = Optim(
        hparams.optimizer,
        hparams.normal_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )
    style_opt = Optim(
        hparams.optimizer,
        hparams.style_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )

    print('Building model...')
    encoder = EncoderCNN(hparams.hidden_dim)
    decoder = FactoredLSTM(hparams.embed_dim,
                           text_preprocessor.vocab_size,
                           hparams.hidden_dim,
                           hparams.style_dim,
                           hparams.num_layers,
                           hparams.random_init,
                           hparams.dropout_ratio,
                           train=True,
                           device=device)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=text_preprocessor.PAD_ID)
    normal_params = list(encoder.parameters()) + list(
        decoder.default_parameters())
    style_params = list(decoder.style_parameters())
    normal_opt.set_parameters(normal_params)
    style_opt.set_parameters(style_params)

    if hparams.train_from:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        normal_opt.load_state_dict(checkpoint['normal_opt'])
        style_opt.load_state_dict(checkpoint['style_opt'])

    # training loop
    print('Start training...')
    for epoch in range(hparams.num_epoch):

        # result
        sum_normal_loss, sum_style_loss, sum_normal_ppl, sum_style_ppl = 0, 0, 0, 0

        # normal caption
        for i, (images, in_captions, out_captions,
                lengths) in enumerate(normal_data_loader):
            images = images.to(device)
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(in_captions, features, mode='default')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            normal_opt.step()

            # print log
            sum_normal_loss += loss.item()
            sum_normal_ppl += np.exp(loss.item())
            if i % hparams.normal_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Normal Step: [{i}/{len(normal_data_loader)}] '
                    f'Normal Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}'
                )

        # style caption
        for i, (in_captions, out_captions,
                lengths) in enumerate(style_data_loader):
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize
            outputs = decoder(in_captions, None, mode='style')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)

            decoder.zero_grad()
            loss.backward()
            style_opt.step()

            sum_style_loss += loss.item()
            sum_style_ppl += np.exp(loss.item())
            # print log
            if i % hparams.style_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Style Step: [{i}/{len(style_data_loader)}] '
                    f'Style Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}'
                )

        model_params = {
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'epoch': epoch,
            'normal_opt': normal_opt.optimizer.state_dict(),
            'style_opt': style_opt.optimizer.state_dict(),
        }

        avg_normal_loss = sum_normal_loss / len(normal_data_loader)
        avg_style_loss = sum_style_loss / len(style_data_loader)
        avg_normal_ppl = sum_normal_ppl / len(normal_data_loader)
        avg_style_ppl = sum_style_ppl / len(style_data_loader)
        print(f'Epoch [{epoch}/{hparams.num_epoch}] statistics')
        print(
            f'Normal Loss: {avg_normal_loss:.4f} Normal ppl: {avg_normal_ppl:5.4f} '
            f'Style Loss: {avg_style_loss:.4f} Style ppl: {avg_style_ppl:5.4f}'
        )

        torch.save(
            model_params,
            f'{hparams.model_path}/n-loss_{avg_normal_loss:.4f}_s-loss_{avg_style_loss:.4f}_'
            f'n-ppl_{avg_normal_ppl:5.4f}_s-ppl_{avg_style_ppl:5.4f}_epoch_{epoch}.pt'
        )
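
Optim is a project-specific wrapper whose constructor arguments appear above; a minimal sketch of what it plausibly does, matching the call sites (set_parameters, step, load_state_dict, and the .optimizer attribute). This is a reconstruction under assumptions, not the original class:

class Optim:
    """Hypothetical minimal optimizer wrapper matching the usage above."""

    def __init__(self, method, lr, max_grad_norm, lr_decay=1.0, start_decay_at=None):
        self.method = method
        self.lr = lr
        self.max_grad_norm = max_grad_norm
        self.lr_decay = lr_decay
        self.start_decay_at = start_decay_at

    def set_parameters(self, params):
        self.params = [p for p in params if p.requires_grad]
        if self.method == 'adam':
            self.optimizer = torch.optim.Adam(self.params, lr=self.lr)
        else:
            self.optimizer = torch.optim.SGD(self.params, lr=self.lr)

    def step(self):
        # Clip gradients before updating, as hparams.max_grad_norm suggests.
        if self.max_grad_norm:
            torch.nn.utils.clip_grad_norm_(self.params, self.max_grad_norm)
        self.optimizer.step()

    def load_state_dict(self, state):
        self.optimizer.load_state_dict(state)
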
Example #10
    def __init__(self) -> None:

        self.model_id = time.strftime("%Y_%m_%d-%H_%M_%S-") + str(uuid.uuid4())[:8]

        self.update_frequency = 10
        self.log_frequency = 1000
        self.gamma = 0.9

        self.use_cuda = False
        self.device = 'cpu'

        # load config & vocab
        with open("./vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)

        self.max_vocab_size = len(self.word_vocab)
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        self.EOS_id = self.word2id["</S>"]

        # Set the random seed manually for reproducibility.
        np.random.seed(self.config['general']['random_seed'])
        torch.manual_seed(self.config['general']['random_seed'])
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print("WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml")
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.config['general']['random_seed'])
                self.use_cuda = True
                self.device = 'cuda:0'
        else:
            self.use_cuda = False

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.nb_epochs = self.config['training']['nb_epochs']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.model_checkpoint_path = self.config['checkpoint']['model_checkpoint_path']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.update_per_k_game_steps = self.config['general']['update_per_k_game_steps']    # update_frequency ?
        self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']

        self._initialized = False
        self._episode_has_started = False
        self.current_episode = 0
        self.best_avg_score_so_far = 0.0

        # model_init
        self.model = CommandScorerModel(input_size=self.max_vocab_size,
                                        hidden_size=128,
                                        device=self.device,
                                        verbose=False)
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        self.optimizer = torch.optim.Adam(parameters, lr=self.config['training']['optimizer']['learning_rate'])
        self.model.to(self.device)

        # using checkpoint
        if self.config['checkpoint']['load_pretrained']:
            self.load_pretrained_model(
                self.model_checkpoint_path + '/' + self.config['checkpoint']['pretrained_experiment_tag'] + '.pt')
        if self.use_cuda:
            self.model.cuda()

        # tokenizer load
        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.preposition_map = {"take": "from",
                                "chop": "with",
                                "slice": "with",
                                "dice": "with",
                                "cook": "with",
                                "insert": "into",
                                "put": "on"}
        self.single_word_verbs = set(["inventory", "look"])

        self.mode = "test"

        # TODO
        self.rng = RandomState()
        self.text_processor = TextPreprocessor(self.nlp,
                                               self.device,
                                               self.word_vocab,
                                               self.single_word_verbs,
                                               self.EOS_id,
                                               self.preposition_map,
                                               self.word2id)
Example #11
class CustomAgent:

    def __init__(self) -> None:

        self.model_id = time.strftime("%Y_%m_%d-%H_%M_%S-") + str(uuid.uuid4())[:8]

        self.update_frequency = 10
        self.log_frequency = 1000
        self.gamma = 0.9

        self.use_cuda = False
        self.device = 'cpu'

        # load config & vocab
        with open("./vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)

        self.max_vocab_size = len(self.word_vocab)
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        self.EOS_id = self.word2id["</S>"]

        # Set the random seed manually for reproducibility.
        np.random.seed(self.config['general']['random_seed'])
        torch.manual_seed(self.config['general']['random_seed'])
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print("WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml")
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.config['general']['random_seed'])
                self.use_cuda = True
                self.device = 'cuda:0'
        else:
            self.use_cuda = False

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.nb_epochs = self.config['training']['nb_epochs']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.model_checkpoint_path = self.config['checkpoint']['model_checkpoint_path']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.update_per_k_game_steps = self.config['general']['update_per_k_game_steps']    # update_frequency ?
        self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']

        self._initialized = False
        self._episode_has_started = False
        self.current_episode = 0
        self.best_avg_score_so_far = 0.0

        # model_init
        self.model = CommandScorerModel(input_size=self.max_vocab_size,
                                        hidden_size=128,
                                        device=self.device,
                                        verbose=False)
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        self.optimizer = torch.optim.Adam(parameters, lr=self.config['training']['optimizer']['learning_rate'])
        self.model.to(self.device)

        # using checkpoint
        if self.config['checkpoint']['load_pretrained']:
            self.load_pretrained_model(
                self.model_checkpoint_path + '/' + self.config['checkpoint']['pretrained_experiment_tag'] + '.pt')
        if self.use_cuda:
            self.model.cuda()

        # tokenizer load
        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.preposition_map = {"take": "from",
                                "chop": "with",
                                "slice": "with",
                                "dice": "with",
                                "cook": "with",
                                "insert": "into",
                                "put": "on"}
        self.single_word_verbs = set(["inventory", "look"])

        self.mode = "test"

        # TODO
        self.rng = RandomState()
        self.text_processor = TextPreprocessor(self.nlp,
                                               self.device,
                                               self.word_vocab,
                                               self.single_word_verbs,
                                               self.EOS_id,
                                               self.preposition_map,
                                               self.word2id)

    def infos_to_request(self) -> EnvInfos:
        request_infos = EnvInfos()
        request_infos.description = True
        request_infos.inventory = True
        request_infos.entities = True
        request_infos.verbs = True
        request_infos.extras = ["recipe"]
        return request_infos

    def tokenize(self, text):
        text = preprocessing(text, tokenizer=self.nlp)
        word_ids = [get_word_id(t, self.word2id, self.max_vocab_size) for t in text]
        return word_ids

    def discount_rewards(self, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(self.transitions))):
            rewards, _, _, values = self.transitions[t]
            R = rewards + self.gamma * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)

        return returns[::-1], advantages[::-1]

    def select_additional_infos(self) -> EnvInfos:
        return EnvInfos(description=True,
                        inventory=True,
                        admissible_commands=True,
                        has_won=True,
                        extras=["recipe"],
                        has_lost=True)

    def load_pretrained_model(self, load_from):
        print("loading model from %s\n" % (load_from))
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.model.load_state_dict(state_dict)
        except Exception as e:
            print("Failed to load checkpoint:", e)

    def finish(self) -> None:
        """
        All games in the batch are finished. One can choose to save checkpoints,
        evaluate on validation set, or do parameter annealing here.

        """
        # Game has finished (either win, lose, or exhausted all the given steps).

        self.final_rewards = np.array(self.scores[-1], dtype='float32')  # batch
        dones = []
        for d in self.dones:
            d = np.array([float(dd) for dd in d], dtype='float32')
            dones.append(d)
        dones = np.array(dones)
        step_used = 1.0 - dones
        self.step_used_before_done = np.sum(step_used, 0)  # batch

        # save checkpoint
        if self.mode == "train" and self.current_episode % self.save_frequency == 0:
            avg_score = np.mean(self.final_rewards)
            if avg_score > self.best_avg_score_so_far:
                self.best_avg_score_so_far = avg_score

                save_to = self.model_checkpoint_path + '/' + self.experiment_tag + "_episode_" + str(
                    self.current_episode) + ".pt"
                if not os.path.isdir(self.model_checkpoint_path):
                    os.mkdir(self.model_checkpoint_path)
                torch.save(self.model.state_dict(), save_to)
                print("========= saved checkpoint =========")

        self.current_episode += 1

    def train(self):
        self.mode = "train"
        self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
        self.transitions = []
        self.model.reset_hidden(1)
        self.last_score = 0
        self.no_train_step = 0

        self.dones = []
        self.scores = []

    def eval(self):
        self.mode = "test"
        self.model.reset_hidden(1)

    def act(self, obs: List[str], scores: List[int], dones: List[bool],
            infos: Dict[str, List[Any]]) -> Optional[List[str]]:

        input_tensor, _, commands_tensor = self.text_processor.get_game_step_info(obs, infos)
        outputs, indexes, values = self.model(input_tensor, commands_tensor)

        print('outputs:', outputs)
        print('indexes:', indexes[0])
        print('values:', values)

        actions_per_batch = []
        for cmds_i in range(self.batch_size):
            action = None
            try:
                action = infos["admissible_commands"][cmds_i][indexes[0][cmds_i]]
            except IndexError:
                # TODO torch.Size([3, max_seq_len, max_commands_number])
                cmds = infos["admissible_commands"][cmds_i]
                action = self.rng.choice(cmds)
                warnings.warn("Model chose a padded index: %s (len %d, index %s)"
                              % (cmds, len(cmds), indexes[0][cmds_i]))
            actions_per_batch.append(action)
        print('*' * 100)

        if self.mode == "eval":
            if all(dones):
                self.model.reset_hidden(1)
            return actions_per_batch

        self.no_train_step += 1

        if self.transitions:
            # Reward is the per-game gain/loss in score since the last step.
            step_scores = np.array(scores, dtype='float32')
            reward = step_scores - self.last_score
            self.last_score = step_scores
            # has_won / has_lost arrive as one flag per game in the batch.
            reward += 100 * np.array(infos["has_won"], dtype='float32')
            reward -= 100 * np.array(infos["has_lost"], dtype='float32')

            self.transitions[-1][0] = reward  # Update reward information.

        if not self._episode_has_started:
            self.start_episode(obs, infos)

        if all(dones):
            self.end_episode(obs, scores, infos)
            return  # Nothing to return.

        if self.current_step > 0:
            # Append scores / dones from the previous step into memory.
            self.scores.append(scores)
            self.dones.append(dones)

        return [self.rng.choice(cmds) for cmds in infos["admissible_commands"]]
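
discount_rewards returns per-step returns and advantages; in an actor-critic agent these would typically feed a policy-gradient loss plus a value-regression term. A sketch of that pattern (the (reward, log_prob, entropy, value) transition layout is an assumption, not code from the project):

def compute_a2c_loss(self, last_values):
    # Hypothetical consumer of self.transitions and discount_rewards.
    returns, advantages = self.discount_rewards(last_values)
    loss = 0.0
    for (_, log_prob, entropy, value), ret, adv in zip(self.transitions, returns, advantages):
        loss = loss - log_prob * adv.detach()      # policy gradient term
        loss = loss + 0.5 * (value - ret).pow(2)   # value regression term
        loss = loss - 0.01 * entropy               # exploration bonus
    return loss.mean()
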
Example #12
words = list(set(data["word"].values))
n_words = len(words)
print('Number of words:', n_words)

# calculate tags
tags = list(set(data["tag"].values))
n_tags = len(tags)
print('Number of tags:', n_tags)
print('Tags:', tags)

# create output folder for x and y
gfile.MakeDirs(os.path.dirname(args.output_x_path))
gfile.MakeDirs(os.path.dirname(args.output_y_path))

# preprocess text
processor = TextPreprocessor(140)
processor.fit(sentences_list)
processor.labels = list(set(data["tag"].values))

X = processor.transform(sentences_list)

# preprocess tags
tag2idx = {t: i for i, t in enumerate(tags)}
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

# export features and labels for training
with gfile.GFile(args.output_x_path, 'wb') as output_X:  # binary mode for pickle
    pickle.dump(X, output_X)
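
The snippet ends before the labels are written; presumably a matching block pickles y to args.output_y_path. A sketch of the assumed counterpart:

with gfile.GFile(args.output_y_path, 'wb') as output_y:
    pickle.dump(y, output_y)
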
Example #13
def build_vocabulary(sequences, max_words=None):
    words = []
    for token_sequence in sequences:
        words.extend(token_sequence)

    word_counts = dict(Counter(words).most_common(max_words))

    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))

    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')

prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()
vocabulary = build_vocabulary(tokens)

unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)

word_context_pairs = prep.preprocess()
target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]

np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
Example #14
from text_preprocessor import TextPreprocessor
from qit_cleaner import QITEmailBodyCleaner
from integer_encoder import IntegerEncoder
from tokenizer import Tokenizer
from padder import Padder

from collections import Counter
import ast
import numpy as np

sentences = np.genfromtxt('../upsampled/x_QIT.txt', delimiter='\n', dtype=str)
language = 'italian'
max_words = None
max_length = 25

# Text preprocessor with no functionalities whatsoever
prep = TextPreprocessor(sentences)

# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)

# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Load vocabulary (a dict literal on disk); literal_eval avoids executing
# arbitrary code, unlike eval
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = ast.literal_eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
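
The preprocessing classes above follow the decorator pattern: each stage wraps another preprocessor and post-processes the result of its preprocess() call. A minimal sketch of that structure (the class names match the imports, but the bodies are assumptions, not the project's code):

class TextPreprocessor:
    """Base stage: holds raw sentences and returns them unchanged."""

    def __init__(self, sentences):
        self.sentences = list(sentences)

    def preprocess(self):
        return self.sentences


class Tokenizer:
    """Decorator stage: wraps another preprocessor and tokenizes its output."""

    def __init__(self, wrapped, language='english'):
        self.wrapped = wrapped
        self.language = language

    def preprocess(self):
        # Hypothetical body: whitespace tokenization stands in for the
        # project's language-aware tokenizer.
        return [sentence.split() for sentence in self.wrapped.preprocess()]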