Example #1
class BaseParser(Base):
    """
    base class for parser,
    """
    def __init__(self, config):
        super(BaseParser, self).__init__(config)
        self._init_vocab()

    @classmethod
    def load_default_config(cls) -> ConfigDict:
        config = ConfigDict(vocab_path=None)
        return config

    @Base.log
    def _init_vocab(self):
        self.vocab = Vocab()
        self.vocab.load(self.config.vocab_path)

    def parse_train(self):
        pass

    def parse_test(self):
        pass

    def parse_predump(self):
        pass

    def parse_predump_train(self):
        pass
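A concrete parser would subclass BaseParser, extend the default config, and fill in the parse_* hooks. Base, Vocab, and ConfigDict belong to the surrounding project and are not shown; the subclass below is only a hypothetical sketch of that usage, and text2index is assumed to be the inverse of the index2text mapping seen in Example #2.

# Hypothetical sketch of a concrete parser built on the BaseParser above.
class LineParser(BaseParser):

    @classmethod
    def load_default_config(cls) -> ConfigDict:
        # Extend the base config with a (hypothetical) path to training data.
        return ConfigDict(vocab_path=None, train_path=None)

    def parse_train(self):
        # Map every whitespace token of the training file to a vocab index
        # (assumes Vocab exposes text2index, the inverse of index2text).
        with open(self.config.train_path) as f:
            return [[self.vocab.text2index[tok] for tok in line.split()]
                    for line in f]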
Example #2
def main():
    # Configure logging before the first logger.info call below; otherwise
    # the INFO messages emitted during caching are dropped by the default handler.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    # Load the trained model and its configuration.
    model = torch.load('CNNmodel/model.pt')
    model.eval()

    config = torch.load('CNNmodel/config.pt')

    audio2mfcc = torchaudio.transforms.MFCC(sample_rate=config.sample_rate,
                                            n_mfcc=config.n_mfcc,
                                            log_mels=False,
                                            melkwargs={
                                                'n_fft': config.n_fft_size
                                            }).to(config.device)

    logger.info('Start cache for test data')
    if not os.path.isfile("./mfcc/test_input.pt"):
        os.makedirs("./mfcc", exist_ok=True)
        val_root = os.path.join(config.data_path, 'test')
        val_files = [
            p for p in os.listdir(val_root)
            if 'pcm' in p
        ]

        val_mfccs = {}
        for p in tqdm(val_files):
            sound_path = os.path.join(val_root, p)
            data, samplerate = sf.read(sound_path,
                                       channels=1,
                                       samplerate=16000,
                                       format='raw',
                                       subtype='PCM_16')

            mfcc = audio2mfcc(torch.Tensor(data).to(config.device))
            audio_array = torch.zeros(config.n_mfcc, config.input_max_len)
            sel_ix = min(mfcc.shape[1], config.input_max_len)
            audio_array[:, :sel_ix] = mfcc[:, :sel_ix]

            val_mfccs[sound_path] = audio_array.transpose(0, 1)
        torch.save(val_mfccs, './mfcc/test_input.pt')
    logger.info('Done cache for test data')

    # sentence index
    vocab = Vocab(config)
    test_loader, test_label_path = data_loader(config, 'test', vocab)

    with open('prediction.txt', 'w') as file_writer:

        for tst_step, (file_name, mfcc, target_index) in enumerate(
                tqdm(test_loader, desc="Evaluating")):
            with torch.no_grad():

                logit, feature = model(mfcc.to(config.device))
                y_max = logit.max(dim=1)[1]
                pred = [vocab.index2text[i] for i in y_max.cpu().numpy()]
                for f, line in zip(file_name, pred):
                    print(f)
                    file_writer.write(f + " " + str(line) + '\n')
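Inside the cache loop above, every MFCC matrix is cropped or zero-padded along the time axis to config.input_max_len frames before it is stored. Pulled out as a standalone helper (a sketch for clarity, not part of the original script), that step is:

import torch

def pad_or_crop(mfcc: torch.Tensor, max_len: int) -> torch.Tensor:
    """Fix the time dimension of an (n_mfcc, time) tensor to max_len frames."""
    n_mfcc = mfcc.shape[0]
    out = torch.zeros(n_mfcc, max_len)
    sel_ix = min(mfcc.shape[1], max_len)   # keep at most max_len frames
    out[:, :sel_ix] = mfcc[:, :sel_ix]     # shorter clips stay zero-padded
    return out.transpose(0, 1)             # (time, n_mfcc), as cached above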
Example #3
File: loader.py  Project: phvan2312/tf-OCR
    def __init__(self,
                 image_fodler,
                 image_fn,
                 label_fn,
                 min_freq=1,
                 vocab=None):
        self.image_fns = [
            os.path.join(image_fodler, _fn.strip())
            for _fn in open(image_fn, 'r').readlines()
        ]
        self.labels = [_fn.strip() for _fn in open(label_fn, 'r').readlines()]

        assert len(self.image_fns) == len(self.labels)

        if vocab is None:
            dict_of_unique_words = TextUtils.get_dict_of_unique_words(
                self.labels, min_freq=min_freq)
            self.vocab = Vocab(dict_of_unique_words)
        else:
            self.vocab = vocab
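A hypothetical way this constructor might be used (only __init__ is shown, so the class name Loader and the paths below are assumptions; image_fn and label_fn are text files listing one image name and one label string per line, as the constructor expects):

# Hypothetical usage of the loader above; paths are illustrative only.
train_loader = Loader(image_fodler='data/images',
                      image_fn='data/train_images.txt',
                      label_fn='data/train_labels.txt',
                      min_freq=1)

# A second split can reuse the vocabulary built from the training labels.
val_loader = Loader(image_fodler='data/images',
                    image_fn='data/val_images.txt',
                    label_fn='data/val_labels.txt',
                    vocab=train_loader.vocab)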
Example #4
def main():
    args = argparse.ArgumentParser()
    args.add_argument("--input_max_len",
                      default=400,
                      type=int,
                      help="Maximum sequence length for audio")
    args.add_argument("--num_epochs", default=300, type=int, help="num_epochs")
    args.add_argument("--data_path", default='data', type=str, help="root")
    args.add_argument("--sample_rate",
                      default=16000,
                      type=int,
                      help="sampling rate for audio")
    args.add_argument("--n_fft_size",
                      default=400,
                      type=int,
                      help="time widnow for fourier transform")
    args.add_argument("--n_mfcc",
                      default=40,
                      type=int,
                      help="low frequency range (from 0 to n_mfcc)")
    args.add_argument("--max_len",
                      default=30,
                      type=int,
                      help="target_max_length")
    args.add_argument("--batch_size",
                      default=128,
                      type=int,
                      help="target_max_length")
    args.add_argument("--warmup_percent",
                      default=0.1,
                      type=float,
                      help="Linear warmup over warmup_percent.")
    args.add_argument("--when",
                      type=int,
                      default=5,
                      help="when to decay learning rate (default: 20)")
    args.add_argument("--clip",
                      type=float,
                      default=0.8,
                      help="gradient clip value (default: 0.8)")
    args.add_argument("--lr",
                      type=float,
                      default=1e-4,
                      help="initial learning rate (default: 1e-3)")
    args.add_argument("--seed", type=int, default=1234, help="random seed")
    args.add_argument("--logging_steps",
                      type=int,
                      default=50,
                      help="frequency of result logging (default: 30)")
    config = args.parse_args()
    set_seed(config)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    config.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # get mfcc
    createMFCC(config)

    # sentence index
    vocab = Vocab(config)

    # data loaders
    train_loader, train_label_path = data_loader(config, 'train', vocab)
    validate_loader, validate_label_path = data_loader(config, 'validate',
                                                       vocab)

    # build model (+1 output class for unknown sentences)
    model = CNN2D(len(vocab) + 1).to(config.device)

    # loss function
    loss_fct = torch.nn.CrossEntropyLoss()

    train(model, train_loader, validate_loader, loss_fct, config, vocab)
    logger.info('Done Training')
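set_seed is not included in the snippet; a common implementation (an assumption, not the project's actual code) seeds Python, NumPy, and PyTorch from config.seed:

import random
import numpy as np
import torch

def set_seed(config):
    # Assumed helper: make runs reproducible across the libraries used above.
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)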
Example #5
        conversation_length = [
            min(len(conv["lines"]), max_conv_len) for conv in conv_objects
        ]

        sentences, sentence_length = pad_sentences(
            conversations,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len,
        )

        print("Saving preprocessed data at", split_data_dir)
        to_pickle(conversation_length,
                  split_data_dir.joinpath("conversation_length.pkl"))
        to_pickle(sentences, split_data_dir.joinpath("sentences.pkl"))
        to_pickle(sentence_length,
                  split_data_dir.joinpath("sentence_length.pkl"))

        if split_type == "train":

            print("Save Vocabulary...")
            vocab = Vocab(tokenizer)
            vocab.add_dataframe(conversations)
            vocab.update(max_size=max_vocab_size, min_freq=min_freq)

            print("Vocabulary size: ", len(vocab))
            vocab.pickle(convai2_dir.joinpath("word2id.pkl"),
                         convai2_dir.joinpath("id2word.pkl"))

    print("Done!")
Example #6
def _init_vocab(self):
    self.vocab = Vocab()
    self.vocab.load(self.config.vocab_path)