def process_train_data(v: int, n: int, delta: float, vocab_size: int,
                       train_file: str) -> Ngram:
    """
    Wrapper function for the training data processing.

    Either fetch or generate the necessary Ngrams based on the training
    information. When ``ds.data_ser_exists(v, n, delta)`` reports a stored
    model, it is loaded with ``ds.data_ser_load`` and returned as-is.
    Otherwise ``train_file`` is read as a tab-separated file, passed through
    ``transform_to_vocab``, used to generate a new ``Ngram(n)``, and the
    result is stored with ``ds.data_ser_save`` before being returned.

    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param vocab_size: The size of the vocabulary
    :param train_file: Path to training data
    :return: Ngram
    """
    # Cache hit: return the stored model without building a throwaway Ngram.
    if ds.data_ser_exists(v, n, delta):
        print("Model with parameters already stored. Retrieving")
        return ds.data_ser_load(v, n, delta)

    print(
        "Model with parameters not stored. Generating model from provided training data"
    )
    # The file carries no header row, so column names are supplied explicitly.
    column_names = [
        DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET
    ]
    train_data = pd.read_csv(train_file, delimiter='\t', names=column_names)
    # Return value is ignored, so this presumably modifies train_data in
    # place -- NOTE(review): confirm against transform_to_vocab.
    transform_to_vocab(train_data, v)
    print("Shape of Training Data (Rows, Columns) => {}".format(
        train_data.shape))

    ngrams = Ngram(n)
    ngrams.generate(train_data, delta, vocab_size)
    # Persist the freshly generated model so the next call with the same
    # (v, n, delta) takes the cache-hit path above.
    ds.data_ser_save(ngrams, v, n, delta)
    return ngrams
def main(args):
    """Train an n-gram model on a corpus, sample text, and score the test set.

    Steps, in order:

    1. Load a ``Corpus`` from ``args.data``.
    2. Train an ``Ngram`` model of order ``args.order`` on ``corpus.train``.
    3. Optionally save the model when ``args.save_arpa`` is set.
    4. Generate text and write it to ``generated.<name>.txt`` in ``args.out``.
    5. If the model is smoothed, compute the test NLL and perplexity, print
       them, and write them to ``result.<name>.txt`` in ``args.out``.
       Otherwise exit with an explanatory message.

    :param args: Object exposing the attributes ``data``, ``order``,
        ``lower``, ``max_lines``, ``name``, ``add_k``, ``interpolate``,
        ``backoff``, ``save_arpa`` and ``out`` (presumably an
        ``argparse.Namespace`` -- confirm against the caller).
    :raises SystemExit: When the trained model is not smoothed.
    """
    # Local import keeps this block self-contained; the builtin ``exit`` is
    # injected by the ``site`` module and is not guaranteed to exist.
    import sys

    print(f'Loading corpus from `{args.data}`...')
    corpus = Corpus(args.data,
                    order=args.order,
                    lower=args.lower,
                    max_lines=args.max_lines)
    model = Ngram(order=args.order)
    name = f'{args.name}.{args.order}gram'

    print('Example data:')
    print('Train:', corpus.train[:20])
    print('Valid:', corpus.valid[:20])

    print('Training model...')
    model.train(corpus.train,
                add_k=args.add_k,
                interpolate=args.interpolate,
                backoff=args.backoff)
    print(f'Vocab size: {len(model.vocab):,}')

    if args.save_arpa:
        print(f'Saving model to `{name}`...')
        model.save_arpa(name)

    # Sanity check on the trained model before it is used any further.
    assert model.sum_to_one(n=10)

    print('Generating text...')
    text = ' '.join(model.generate(100))
    generated_path = os.path.join(args.out, f'generated.{name}.txt')
    print(text)
    with open(generated_path, 'w') as f:
        print(text, file=f)

    if not model.is_smoothed:
        # Same message and exit status as before, but via sys.exit so it also
        # works when the ``site`` builtins are unavailable.
        sys.exit(
            'No evaluation with unsmoothed model: probability is probably 0 anyways.'
        )

    print('\nPredicting test set NLL...')
    logprob = model(corpus.test)
    # Average negative log-probability per element of the test set.
    nll = -logprob / len(corpus.test)
    # Format once so the console and the result file always agree.
    summary = f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}'
    print(summary)
    result_path = os.path.join(args.out, f'result.{name}.txt')
    with open(result_path, 'w') as f:
        print(summary, file=f)