def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, tags = [], []
    with open(sys.argv[1], 'rb') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories", "tags"])
        count = 0
        for row in reader:
            count += 1
            text, tag_set = row['title'], row['tags'].split(' ')[:-1]
            texts.append(text)
            tags.append(tag_set)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from NER
    ner = ProductNER()
    labels = ner.get_labels(tags)

    # Compile NER network and train
    ner.compile(tokenizer)
    ner.train(data, labels)
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(sys.argv[1], 'rb') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            text, category = (row['title'] + ' ' + row['description'],
                              row['categories'].split(' / ')[0])
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
def main(argv):
    if len(argv) < 3:
        usage()
    model_dir = sys.argv[1]
    data_file = sys.argv[2]

    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    with open(data_file, 'rb') as f:
        reader = csv.DictReader(f)
        outfile = open('.'.join(data_file.split('.')[:-1] + ['processed', 'csv']), 'wb')
        writer = csv.DictWriter(outfile,
                                fieldnames=reader.fieldnames + ['category', 'brand'])
        writer.writeheader()
        count = 0
        for row in reader:
            count += 1
            processed_row = process(row, tokenizer, classifier, ner)
            print(processed_row)
            writer.writerow(processed_row)
def load_models(model_dir):
    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    return tokenizer, classifier, ner
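# Usage sketch (added, not from the original module): wiring load_models() to a single
# prediction. It assumes the './models' layout used elsewhere in this collection and only
# calls methods the surrounding snippets already use (tokenize, classify); the sample
# product title is made up.
if __name__ == '__main__':
    tokenizer, classifier, ner = load_models('./models')
    data = tokenizer.tokenize(["Stainless steel water bottle 750ml"])
    scores = classifier.classify(data)[0]   # dict mapping category -> score
    print(max(scores, key=scores.get))      # highest-scoring category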
def poetic_preprocessing(self, text, remove_tek=False, tek_string=None):
    '''Extra preprocessing step for poetic corpora.

    Use this together with the compulsory preprocessing to get reasonably
    accurate results on poetic text.'''
    text = re.sub(r'।', '.', text)
    text = re.sub(' ।।[૧૨૩૪૫૬૭૮૯૦]।।', '.', text)
    if remove_tek:
        text = self.remove_tek(text, tek_string)
    tokens = WordTokenizer(text, keep_punctuations=False)

    # Remove purely poetic filler words
    poetic_words = ['જી', 'રે', 'હો', 'હોજી', 'લોલ', 'હે', 'હેજી', '...', 'સંતો']
    tokens = [token for token in tokens if token not in poetic_words]

    for i in range(len(tokens)):
        # Rule 1: drop the poetic suffix 'જી'
        if tokens[i].endswith('જી'):
            tokens[i] = tokens[i][:-len('જી')]
        # Rule 2: replace a trailing 'ૈ' with 'ે'
        if tokens[i].endswith('ૈ'):
            tokens[i] = tokens[i][:-len('ૈ')] + 'ે'
        # Rule 3: rewrite 'િર' as 'ૃ' when 'ર' is preceded by 'િ' and is not the last character
        index = tokens[i].find('ર')
        if 0 < index < len(tokens[i]) - 1 and tokens[i][index - 1] == 'િ':
            tokens[i] = re.sub('િર', 'ૃ', tokens[i])
    return ' '.join(tokens)
class TfIdfEvaluator(SentencesEvaluator):
    def __init__(self, language_params):
        super(TfIdfEvaluator, self).__init__(language_params, "TF-IDF Evaluator")
        self.tokenizer = WordTokenizer(self.language_params)
        self.tf_idf = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)

    def train(self, training_set):
        self.tf_idf.fit(training_set)

    def evaluate(self, sentences):
        words_weights = self.__get_words_weights(sentences)
        sentences_weights = []
        for i, s in enumerate(sentences):
            words = self.tokenizer.tokenize(s)
            weights_sum = sum([words_weights.get(w, 0) for w in words])
            if len(words) > 0:
                sentences_weights.append((i, float(weights_sum)))
        # rank the best-scoring sentences first
        return sorted(sentences_weights, key=lambda iw: iw[1], reverse=True)

    def __get_words_weights(self, test_set):
        # join with spaces so words do not merge across sentence boundaries
        weights = self.tf_idf.transform([' '.join(test_set)]).toarray()[0]
        features = self.tf_idf.get_feature_names()
        f_weights = zip(features, weights)
        return dict(f_weights)

    def encode_list(self, items):
        return [self.__encode_text(a) for a in items]

    def __encode_text(self, text):
        return text.encode(sys.stdout.encoding, errors='replace')
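# Usage sketch (added, not part of the original project): how the evaluator above is
# typically driven. `language_params` is whatever object the surrounding project passes
# to WordTokenizer; it is treated as opaque here, and this driver function is hypothetical.
def rank_sentences(language_params, training_docs, sentences):
    evaluator = TfIdfEvaluator(language_params)
    evaluator.train(training_docs)        # fit TF-IDF on the training corpus
    return evaluator.evaluate(sentences)  # [(sentence_index, weight), ...], best first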
def main(argv): if len(argv) < 2: usage() # Fetch data texts, categories = [], [] with open(sys.argv[1], 'rb') as f: reader = csv.DictReader( f, fieldnames=["title", "brand", "description", "categories"]) count = 0 for row in reader: count += 1 # TODO change here what we train on, and what categories are used text, category = row['title'], row['categories'].split(' / ')[0] texts.append(text) categories.append(category) if count >= MAX_TEXTS: break print(('Processed %s texts.' % len(texts))) tmpx, tmpy = [], [] c = Counter(categories) for x, y in zip(texts, categories): if c[y] > 200: tmpx.append(x) tmpy.append(y) texts = tmpx categories = tmpy print(Counter(tmpy)) # Tokenize texts tokenizer = WordTokenizer() tokenizer.load() data = tokenizer.tokenize(texts) # Get labels from classifier classifier = ProductClassifier() labels = classifier.get_labels(categories) # Compile classifier network and train classifier.compile(tokenizer) classifier.train(data, labels)
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(sys.argv[1], encoding='ISO8859') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            text, category = (row['title'] + ' ' + row['description'],
                              row['categories'].split(' / ')[0])
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.train(texts)
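    # Sketch (added, not in the original script): sanity-check the freshly trained
    # tokenizer and persist it. `tokenizer.save(...)` is an assumed counterpart to the
    # `tokenizer.load(...)` calls used in the other snippets; swap in whatever
    # persistence method WordTokenizer actually exposes.
    print(tokenizer.tokenize(texts[:1]))  # quick look at the first tokenized text
    # tokenizer.save(os.path.join('models', 'tokenizer'))  # hypothetical; mirrors tokenizer.load()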
import sys, os, csv
import numpy as np
from operator import itemgetter
from tokenizer import WordTokenizer
from classifier import ProductClassifier

model_dir = './models'

# Load tokenizer
tokenizer = WordTokenizer()
tokenizer.load(os.path.join(model_dir, 'tokenizer'))

# Load classifier
classifier = ProductClassifier()
classifier.load(os.path.join(model_dir, 'classifier'))

data = tokenizer.tokenize(["Cambridge wall calendar"])
classScores = classifier.classify(data)[0]
print(classScores)

# Pick the best-scoring class (dict views are not indexable on Python 3)
classNames = list(classScores.keys())
classValues = list(classScores.values())
bestValIdx = np.argmax(classValues)
bestVal = classValues[bestValIdx]
bestClass = classNames[bestValIdx]
print(bestVal, bestClass)
        # update vocab
        vocab = vocab.union(tokenized_text)
        # add to lists
        all_tokenized_text.append(tokenized_text)
        all_labels_a.append(labels_a)
        all_labels_o.append(labels_o)
    # return
    return vocab, all_tokenized_text, all_labels_a, all_labels_o


# load data and update vocab
vocab, train_text_tokens, train_labels_a, train_labels_o = load_data(train_data_file)
_, test_text_tokens, test_labels_a, test_labels_o = load_data(test_data_file)
vocab = list(vocab.union(['[UNK]', '[PAD]']))

# create tokenizer and model
print("Create Tokenizer and Model...")
tokenizer = WordTokenizer(vocab, do_lower_case=do_lower_case)
model = CMLA(50, 3, len(tokenizer), embedding_dim, 3, 20, 2, pad_id=tokenizer.pad_token_id)

# load pretrained embeddings
if gensim_embeddings_file is not None:
    print("Loading Pretrained Embeddings...")
    vocab = tokenizer.vocab
    if "german_deepset.bin" in gensim_embeddings_file:
        # match tokens in german_deepset embeddings
        vocab = [("b'" + t + "'") for t in vocab]
    n_loaded = model.load_gensim_embeddings(gensim_embeddings_file, vocab,
                                            limit=100_000, binary=True)
    print("Loaded %i/%i vectors from pretrained embedding." % (n_loaded, len(tokenizer)))

# move model to device
model.to(device)

# optimizer and criterion
print("Create Optimizer and Criterion...")
optim = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
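# Sketch (added): the fragment stops after the optimizer, although the log line above also
# announces a criterion. A typical choice for the two tagging heads of CMLA would be
# cross-entropy; the ignore_index value below is an assumption and must match how the
# label sequences are actually padded in this project.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)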
def main(cli_args):
    # Read from config file and make args
    with open("./tasks/korquad/config.json", "r") as f:
        args = AttrDict(json.load(f))
    args.seed = cli_args.seed
    args.tokenizer = cli_args.tokenizer
    args.output_dir = args.output_dir.format(args.tokenizer)
    args.resource_dir = cli_args.resource_dir
    args.data_dir = cli_args.data_dir
    logger.info(f"Training/evaluation parameters {args}")

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    init_logger()
    set_seed(args.seed)

    logging.getLogger("transformers.data.metrics.squad_metrics").setLevel(
        logging.WARN)  # Reduce model loading logs

    # custom tokenizers
    tokenizer_dir = os.path.join(args.resource_dir, args.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    if args.tokenizer.startswith("mecab-"):
        custom_tokenizer = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    elif args.tokenizer.startswith("sp-"):
        custom_tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
    elif args.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        custom_tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif args.tokenizer.startswith("char-"):
        custom_tokenizer = CharTokenizer()
    elif args.tokenizer.startswith("word-"):
        custom_tokenizer = WordTokenizer()
    elif args.tokenizer.startswith("jamo-"):
        custom_tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Load pretrained model and tokenizer
    config = BertConfig.from_json_file(
        os.path.join(args.resource_dir, args.tokenizer, args.bert_config_file_name))
    tokenizer = BertTokenizer(os.path.join(tokenizer_dir, "tok.vocab"), custom_tokenizer)
    model = KorQuADModel(config)
    model.bert = load_pretrained_bert(
        config,
        os.path.join(args.resource_dir, args.tokenizer, args.pretrained_bert_file_name))

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(args.device)
    logger.info(f"Training/evaluation parameters {args}")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(f" global_step = {global_step}, average loss = {tr_loss}")

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            model = KorQuADModel.from_pretrained(checkpoint)
            model.to(args.device)

            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + (f"_{global_step}" if global_step else ""), v)
                          for k, v in result.items())
            results.update(result)

        # Write the official evaluation results
        output_dir = os.path.join(args.output_dir, "eval")
        with open(os.path.join(output_dir, "eval_result.txt"), "w", encoding="utf-8") as f:
            official_eval_results = eval_during_train(args)
            for key in sorted(official_eval_results.keys()):
                logger.info(f" {key} = {official_eval_results[key]}")
                f.write(f" {key} = {official_eval_results[key]}\n")
import os
import torch

# import model and tokenizer
from model import CMLA
from tokenizer import WordTokenizer

# model directory
model_dir = "results/SemEval2015"

# sample text
# text = "The ambience is nice for conversation."
text = "The staff was really nice but the food was disgusting!"
# text = "In the summer months, the back garden area is really nice."
# text = "Das Essen war sehr lecker."  # (German: "The food was very tasty.")

# load tokenizer and model
print("Loading Tokenizer and Model...")
tokenizer = WordTokenizer(os.path.join(model_dir, 'vocab.txt'))
model = CMLA.load(model_dir)
model.eval()

# tokenize text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# convert to tensor and pass through model
token_ids = torch.LongTensor([token_ids])
aspect_logits, opinion_logits = model(token_ids)

# get predictions from logits
aspect_predicts = aspect_logits[0, :].max(dim=-1)[1]
opinion_predicts = opinion_logits[0, :].max(dim=-1)[1]

print(tokens)
print(aspect_predicts)
print(opinion_predicts)
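# Sketch (added): map the predicted class indices back to readable tags. The real tag
# inventory ships with the trained model; the 3-label BIO scheme below is only a
# hypothetical placeholder to show the lookup.
id2tag = ['O', 'B', 'I']
print([(tok, id2tag[p]) for tok, p in zip(tokens, aspect_predicts.tolist())])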
def evaluate(self, sentences):
    tokenizer = WordTokenizer(self.language_params)
    tokenized_sents = [tokenizer.tokenize(s) for s in sentences]
    return self._get_lengths(tokenized_sents)
def pos_tag(self, sentence):
    stemmer = Stemmer()
    sent = stemmer.stem(sentence)
    sent = WordTokenizer(sent)
    tags = self.tag(sent)
    return tags
class EbooksQuotes(object):

    def __init__(self, keywords=None, probability=0.001,
                 minimum_quote_size=8, maximum_quote_size=140,
                 wrap_at=30, truncate_chance=1.0 / 4):
        keywords = keywords or []
        self.keywords = [x.lower() for x in keywords]
        self.probability = probability
        self.minimum_quote_size = minimum_quote_size
        self.maximum_quote_size = maximum_quote_size
        self.wrap_at = wrap_at
        self.truncate_chance = truncate_chance
        self._blobs = {}

    COMMON_STARTING_WORDS = [
        "I", "How", "The", "You", "What", "A", "Why",
        "And", "This", "It", "Do", "In", "We", "Learn", "If",
        "But", "Don't", "Your", "When", "Discover",
        "Are", "Get", "There", "My", "Have", "To", "That",
        "As", "Make", "Let", "One"]

    # Quotes that end in certain parts of speech get higher ratings.
    PART_OF_SPEECH_SCORE_MULTIPLIERS = {
        "NNP": 3.2,
        "NNS": 2.7,
        "NN": 2.5,
        "VGD": 1.9,
        "VBG": 1.9,
        "PRP": 1.8,
        "VB": 1.6,
        "JJR": 1.3,
        "CD": 1.2,
        "RB": 1.2,
        "VBP": 1}

    PUNCTUATION_AND_COMMON_STARTING_WORD = re.compile(
        '[.!?"] (%s) ' % ("|".join(COMMON_STARTING_WORDS)))
    SEVERAL_CAPITALIZED_WORDS = re.compile(
        r"(([A-Z][a-zA-Z]+,? ){2,}[A-Z][a-zA-Z]+[!?.]?)")
    ONE_LETTER = re.compile("[A-Za-z]")
    ONE_WORD = re.compile(r"\W+")

    data = ['" ', "' ", "--", r'\)', ']', ',', r'\.', '-']
    BEGINNING_CRUFT = re.compile("^(%s)" % "|".join(data))

    TOKENIZER = WordTokenizer()

    @classmethod
    def rate(cls, s, base_score=1.0, frequencies=None, obscurity_cutoff=None):
        "Rate a string's suitability as an _ebook quote."
        s = s.strip()
        score = float(base_score)
        # print s
        # print " Starting rating: %.2f" % score

        # People like very short or very long quotes.
        # if len(s) < 40:
        #     score *= 2
        if len(s) > 128:
            score *= 2
            # print " Length bonus: %.2f" % score

        blob = TextBlob(s.decode("utf8"))
        try:
            words = blob.words
        except Exception:
            # TODO: I'm sick of trying to get TextBlob to parse
            # strings that include things like ". . . ". Just return
            # the current score.
            return score

        if frequencies:
            contains_known_word = False
            contains_obscure_word = False
            for word in words:
                l = word.lower()
                if l in frequencies:
                    contains_known_word = True
                    if frequencies[l] < obscurity_cutoff:
                        contains_obscure_word = True
                if contains_known_word and contains_obscure_word:
                    break

            # A string that contains no words that appear in the
            # frequency list is heavily penalized. It's probably
            # gibberish.
            if not contains_known_word:
                score *= 0.1
                # print " No known word: %.2f" % score

            # A string that contains no obscure words is even more
            # heavily penalized. It's almost certainly boring.
            if not contains_obscure_word:
                score *= 0.01
                # print " No obscure word: %.2f" % score

        if s[0].upper() == s[0]:
            # We like quotes that start with uppercase letters.
            score *= 2.5
            # print " Starts with uppercase letter: %.2f" % score

        # Let's take a look at the first and last words.
        first_word, ignore = blob.tags[0]
        if first_word.capitalize() in cls.COMMON_STARTING_WORDS:
            score *= 2.5
            # print " Starts with common starting word: %.2f" % score

        last_word, last_tag = blob.tags[-1]
        if last_tag in cls.PART_OF_SPEECH_SCORE_MULTIPLIERS:
            score *= cls.PART_OF_SPEECH_SCORE_MULTIPLIERS[last_tag]
            # print " Bonus for part of speech %s: %.2f" % (last_tag, score)

        if last_tag != 'NNP' and last_word[0].upper() == last_word[0]:
            score *= 1.25
            # print " Bonus for ending with a capitalized word: %.2f" % score

        # print "Final score: %.2f" % score
        return score
def __text_to_vector(self, sentence):
    tokenizer = WordTokenizer(self.language_params)
    words = tokenizer.tokenize(sentence)
    return Counter(words)
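# Sketch (added, not from the original class): Counter vectors like the one built above
# are commonly compared with cosine similarity; this standalone helper shows the usual
# formula and is independent of the surrounding evaluator.
import math
from collections import Counter

def cosine_similarity(vec_a: Counter, vec_b: Counter) -> float:
    # dot product over the words the two vectors share
    common = set(vec_a) & set(vec_b)
    dot = sum(vec_a[w] * vec_b[w] for w in common)
    norm_a = math.sqrt(sum(c * c for c in vec_a.values()))
    norm_b = math.sqrt(sum(c * c for c in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)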
import os, time
from multiprocessing import Pool, Manager

import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from tokenizer import WordTokenizer

tokenizer = WordTokenizer('models/tokenizer/tokenizer.json')


class TextDataset(Dataset):

    def __init__(self, txt_file, block_len=64, mlm_percentage=0.15):
        with open(txt_file, 'r', encoding='utf-8') as f:
            textlines = f.readlines()
        self.examples = []
        self.block_len = block_len
        for line in textlines:
            new_tokens = tokenizer.encode(line)
            if len(new_tokens) < self.block_len:
                # center-pad short sequences with 0s up to block_len
                pad_total = self.block_len - len(new_tokens)
                new_tokens = ([0] * (pad_total // 2) + new_tokens
                              + [0] * (pad_total - pad_total // 2))
            self.examples.append(new_tokens)
        self.mlm_percentage = mlm_percentage

    def __len__(self):
        return len(self.examples)
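    # Sketch (added): the snippet stops before __getitem__. A minimal version is sketched
    # below, assuming id 0 doubles as the pad id and using a hypothetical mask id that
    # would have to match whatever models/tokenizer/tokenizer.json actually defines.
    def __getitem__(self, idx):
        import random
        mask_token_id = 4  # hypothetical [MASK] id; placeholder only
        tokens = list(self.examples[idx])
        labels = [-100] * len(tokens)  # -100 is ignored by torch CrossEntropyLoss
        for i, tok in enumerate(tokens):
            if tok != 0 and random.random() < self.mlm_percentage:
                labels[i] = tok            # learn to recover the original token
                tokens[i] = mask_token_id  # replace the input token with the mask id
        return (torch.tensor(tokens, dtype=torch.long),
                torch.tensor(labels, dtype=torch.long))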
def main(args):
    # config
    config = TrainConfig(**args)
    config = config._replace(
        log_dir=config.log_dir.format(config.tokenizer),
        summary_dir=config.summary_dir.format(config.tokenizer),
        # checkpoint_dir=config.checkpoint_dir.format(config.tokenizer),
    )
    set_seed(config.seed)

    os.makedirs(config.log_dir, exist_ok=True)
    os.makedirs(config.summary_dir, exist_ok=True)
    # os.makedirs(config.checkpoint_dir, exist_ok=True)

    # logger
    logger = get_logger(log_path=os.path.join(config.log_dir, "logs.txt"))
    logger.info(config)

    # Create the basic modules (vocab, tokenizer)
    tokenizer_dir = os.path.join(config.resource_dir, config.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    vocab = Vocab(os.path.join(tokenizer_dir, "tok.vocab"))
    if config.tokenizer.startswith("mecab-"):
        tokenizer = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    elif config.tokenizer.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
    elif config.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif config.tokenizer.startswith("char-"):
        tokenizer = CharTokenizer()
    elif config.tokenizer.startswith("word-"):
        tokenizer = WordTokenizer()
    elif config.tokenizer.startswith("jamo-"):
        tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Prepare the data to feed the model
    # label-to-index
    label_to_index = {"0": 0, "1": 1}

    # Train
    logger.info(f"read training data from {config.train_path}")
    train_sentence_as, train_sentence_bs, train_labels = load_data(config.train_path, label_to_index)
    # Dev
    logger.info(f"read dev data from {config.dev_path}")
    dev_sentence_as, dev_sentence_bs, dev_labels = load_data(config.dev_path, label_to_index)
    # Test
    logger.info(f"read test data from {config.test_path}")
    test_sentence_as, test_sentence_bs, test_labels = load_data(config.test_path, label_to_index)

    # Build data loaders from the data
    # Train
    logger.info("create data loader using training data")
    train_dataset = PAWSDataset(train_sentence_as, train_sentence_bs, train_labels,
                                vocab, tokenizer, config.max_sequence_length)
    train_random_sampler = RandomSampler(train_dataset)
    train_data_loader = DataLoader(train_dataset, sampler=train_random_sampler,
                                   batch_size=config.batch_size)
    # Dev
    logger.info("create data loader using dev data")
    dev_dataset = PAWSDataset(dev_sentence_as, dev_sentence_bs, dev_labels,
                              vocab, tokenizer, config.max_sequence_length)
    dev_data_loader = DataLoader(dev_dataset, batch_size=1024)
    # Test
    logger.info("create data loader using test data")
    test_dataset = PAWSDataset(test_sentence_as, test_sentence_bs, test_labels,
                               vocab, tokenizer, config.max_sequence_length)
    test_data_loader = DataLoader(test_dataset, batch_size=1024)

    # Prepare the summary writer
    summary_writer = SummaryWriter(log_dir=config.summary_dir)

    # Prepare the model
    logger.info("initialize model and convert bert pretrained weight")
    bert_config = BertConfig.from_json_file(
        os.path.join(config.resource_dir, config.tokenizer, config.bert_config_file_name))
    model = PAWSModel(bert_config, config.dropout_prob)
    model.bert = load_pretrained_bert(
        bert_config,
        os.path.join(config.resource_dir, config.tokenizer, config.pretrained_bert_file_name))

    trainer = Trainer(config, model, train_data_loader, dev_data_loader,
                      test_data_loader, logger, summary_writer)
    trainer.train()
def train(args):
    train_data_path = args["--train-src"]
    val_data_path = args["--val-src"]
    glove_file = args["--glove-file"]

    print("Reading and parsing CoNLL 2003 data...\nTrain data from {}\nValidation data from {}"
          .format(train_data_path, val_data_path))
    train_sents, train_tags = conll_parse(train_data_path)
    val_sents, val_tags = conll_parse(val_data_path)

    print("Initializing tokenizers for words, NER tags and word cases...\n")
    tok_x = WordTokenizer(from_pre=True)
    train_sents_p = tok_x.initialize(train_sents, glove_file)
    print("Vocab size for tok_x: {}".format(tok_x.vocab_size))
    tok_c = CaseTokenizer()
    print("Vocab size for tok_c: {}".format(tok_c.vocab_size))
    tok_y = WordTokenizer(oov_tok=None)
    _ = tok_y.initialize(train_tags)
    print("Vocab size for tok_y: {}\n".format(tok_y.vocab_size))

    train_sents_i = tok_x.text_to_indices(train_sents_p)
    train_cases_i = tok_c.text_to_indices(train_sents)
    train_tags_i = tok_y.text_to_indices(train_tags)

    val_sents_p = tok_x.pre_process(val_sents)
    val_sents_i = tok_x.text_to_indices(val_sents_p)
    val_cases_i = tok_c.text_to_indices(val_sents)
    val_tags_i = tok_y.text_to_indices(val_tags)

    train_data = [train_sents_i, train_cases_i, train_tags_i]
    val_data = [val_sents_i, val_cases_i, val_tags_i]

    print("Initializing NER model and beginning training...")
    ner_tagger = NERTagger(tok_x, tok_c, tok_y)
    ner_tagger.build()
    ner_tagger.train(train_data, val_data)
    if args["--model-tgt"] and args["--tok-tgt"]:
        ner_tagger.save(aux_file=args["--tok-tgt"], model_file=args["--model-tgt"])
    else:
        ner_tagger.save()