def generate_answers_attention(session, model, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer + prob which is product of start and end prob.

    Inputs:
      session: TensorFlow session
      model: QAModel
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to
        [predicted answer (string; detokenized), span probability, attention distribution]
    """
    uuid2ans = {}  # maps uuid to [answer string, maxprob, attention dist]
    data_size = len(qn_uuid_data)
    # BUG FIX: this block uses Python 3 print(); "/" is true division there and
    # would make num_batches a float. Use floor division for the ceil-divide idiom.
    num_batches = ((data_size - 1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print("Generating answers...")

    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data,
                                     qn_token_data, model.FLAGS.batch_size,
                                     model.FLAGS.context_len, model.FLAGS.question_len):

        # Predicted spans, their probability, and the attention distribution
        pred_start_batch, pred_end_batch, maxprob_batch = model.get_start_end_pos(session, batch)
        attn_distribution = model.get_attention_dist(session, batch)

        # Convert the per-batch arrays to plain Python lists of length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        maxprob_batch = maxprob_batch.tolist()
        attn_distribution_batch = attn_distribution.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end, maxprob, attn_dist) in enumerate(
                zip(pred_start_batch, pred_end_batch, maxprob_batch, attn_distribution_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens (span end is inclusive)
            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]  # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = [detokenizer.detokenize(pred_ans_tokens, return_str=True),
                              maxprob, attn_dist]

        batch_num += 1
        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))

    print("Finished generating answers for dataset.")
    return uuid2ans
def parsedToDB(data, cid, catID, fileid):
    """Sanitize parsed document text and persist it as a botanswers row.

    The raw text (NUL bytes replaced) is stored as the answer body; a
    lowercased, stopword-free token string prefixed with the file name is
    stored as the searchable 'entities' field.

    Inputs:
      data: raw parsed document text
      cid: course id (foreign key value)
      catID: category object; its .id is stored
      fileid: id of the source document row
    """
    file = document.objects.get(id=fileid)
    fileName = file.file_name
    fileName = fileName.split(".")[0]
    # Split the base name on digit runs (e.g. "week12notes" -> "week", "12", "notes")
    # and rejoin with spaces so numbers become separate searchable tokens.
    tempName = re.split('(\d+)', fileName)
    fileName = ""
    for part in tempName:
        fileName += (' ' + part)
    regex = re.compile('[^a-zA-Z0-9 \n\.]')
    data = data.replace('\x00', ' ')
    dbAnswer = data  # keep the pre-sanitized text as the stored answer body
    data = fileName + ' ' + data
    data = data.replace('<br>', ' ')
    data = regex.sub('', data)
    data_list = nltk.word_tokenize(data)
    # Hoisted: build the stopword set once instead of calling
    # stopwords.words('english') for every token in the comprehension.
    stop_set = set(stopwords.words('english'))
    data = [word for word in data_list if word not in stop_set]
    # (Removed a MosesDetokenizer().detokenize(...) call whose result was
    # discarded — it had no effect on the stored values.)
    dbInfo = " ".join(data).lower()
    botanswers.objects.create(answer=dbAnswer, rating=0, category_id=catID.id,
                              entities=dbInfo, course_id=cid, file_id=fileid)
def detokenize(line):
    """Detokenize a tokenized line into a capitalized sentence.

    Rejoins split "n't" contractions, maps bracket placeholder tokens back via
    map_brackets_bw, runs Moses detokenization, and upper-cases the first
    character.

    Inputs:
      line: space-separated token string
    Returns:
      detokenized string (empty string stays empty)
    """
    tokens = line.replace(" n't", "n't").split(' ')
    tokens = [map_brackets_bw(tok) for tok in tokens]
    detokenizer = MosesDetokenizer()
    res = detokenizer.detokenize(tokens, return_str=True)
    # BUG FIX: res[0] raised IndexError when the detokenized result was empty.
    if not res:
        return res
    return res[0].upper() + res[1:]
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers.

    :param raw_text: text (not divided into sentences)
    :param get_questions: if True keep only sentences ending in '?',
        otherwise keep only non-question sentences
    :return: list of sanitized, detokenized sentences
    """
    # Tokenize text into sentences.
    raw_text = delete_parenthesis(raw_text)
    sentences = nltk.sent_tokenize(raw_text)

    # Hoisted out of the loop: one tokenizer instance for all sentences.
    # Use Moses instead of nltk.word_tokenize — better with apostrophes:
    # cant -> (can + 't) but not (ca + 'n't)
    tokenizer = MosesTokenizer()

    sanitized_sentences = []
    for s in sentences:
        s_tokens = tokenizer.tokenize(s)
        # Guard: skip sentences that tokenize to nothing (s_tokens[-1] would raise).
        if not s_tokens:
            continue
        if (not get_questions and s_tokens[-1] != '?') or (get_questions and s_tokens[-1] == '?'):
            sanitized_sentences.append(sanitize(s_tokens))

    # Sanitized tokens joined using detokenizer
    detokenizer = MosesDetokenizer()
    return [detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences]
def generate_answers_with_start_end(model_flags, word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, pred_start_batches, pred_end_batches):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.

    Inputs:
      model_flags: QAModel flags, batch size, must be the same for all models.
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists
      pred_start_batches, pred_end_batches: list of list, size is model_flags.batch_size

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    # NOTE(review): Python 2 code (print statements); "/" below is integer division.
    uuid2ans = {} # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    # Ceil-divide: number of batches needed to cover data_size examples.
    num_batches = ((data_size-1) / model_flags.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print "Generating %d answers..." % len(qn_uuid_data)
    for batch in get_batch_generator(word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, model_flags.batch_size, model_flags.context_len, model_flags.question_len, model_flags.word_len):
        # Spans were predicted up front by the caller; batches are consumed
        # in the same order they are generated, indexed by batch_num.
        pred_start_batch = pred_start_batches[batch_num]
        pred_end_batch = pred_end_batches[batch_num]
        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(zip(pred_start_batch, pred_end_batch)):
            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx] # list of strings
            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))
            # Predicted answer tokens (span end is inclusive)
            pred_ans_tokens = context_tokens[pred_start : pred_end +1] # list of strings
            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)
        batch_num += 1
        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (batch_num, num_batches, batch_num*100.0/num_batches)
    print "Finished generating answers for dataset."
    return uuid2ans
def run():
    """Similarity baseline for QA: answer each question with the context
    sentence whose embedding is most cosine-similar to the question's.

    Reads contexts/questions and the word2vec table from data.json; returns
    the list of detokenized predicted sentences.
    """
    detokenizer = MosesDetokenizer()
    with open('data.json') as f:
        data = json.load(f)
    word2vec = data['word2vec']
    contexts = data['contexts']
    questions = data['questions']

    predictions = []
    for context, context_questions in tqdm(zip(contexts, questions), total=len(contexts)):
        # Single-sentence contexts are skipped entirely.
        if len(context) == 1:
            continue

        # Embed every sentence of the context once.
        sentence_embeddings = [sent_embed(sentence, word2vec) for sentence in context]

        # For each question, pick the most similar context sentence.
        for question in context_questions:
            question_embedding = sent_embed(question, word2vec)
            similarities = [cosine_similarity(question_embedding, emb)
                            for emb in sentence_embeddings]
            best_idx = similarities.index(max(similarities))
            predictions.append(detokenizer.detokenize(context[best_idx], return_str=True))

    return predictions
def __init__(self, endings, stop_phrases, only_bulleted_lines=True, confidence=95, *args, **kwargs):
    """Configure the extractor.

    Args:
        endings: accepted sentence endings.
        stop_phrases: phrases that terminate extraction.
        only_bulleted_lines: if True, restrict processing to bulleted lines.
        confidence: confidence threshold (default 95).
    """
    super().__init__(*args, **kwargs)
    # One detokenizer instance shared across calls on this object.
    self.detokenizer = MosesDetokenizer()
    self.confidence = confidence
    self.endings = endings
    self.stop_phrases = stop_phrases
    self.only_bulleted_lines = only_bulleted_lines
def detokenize(line):
    """Detokenize a tokenized line into a capitalized sentence.

    Rejoins split "n't" contractions, maps bracket placeholder tokens back via
    map_brackets_bw, runs Moses detokenization, and upper-cases the first
    character.

    Inputs:
      line: space-separated token string
    Returns:
      detokenized string (empty string stays empty)
    """
    tokens = line.replace(" n't", "n't").split(' ')
    tokens = [map_brackets_bw(tok) for tok in tokens]
    detokenizer = MosesDetokenizer()
    res = detokenizer.detokenize(tokens, return_str=True)
    # BUG FIX: res[0] raised IndexError when the detokenized result was empty.
    if not res:
        return res
    return res[0].upper() + res[1:]
def __init__(self):
    """Create a Moses detokenizer, preferring sacremoses over NLTK.

    Tries sacremoses first; on ImportError (not installed) or TypeError
    (sacremoses' detokenizer only instantiates under Python 3) it warns and
    falls back to NLTK's nltk.tokenize.moses implementation, downloading the
    'perluniprops' data package if it is missing.

    Raises:
        ImportError: if the fallback NLTK import also fails.
    """
    try:
        from sacremoses import MosesDetokenizer
        self._detokenizer = MosesDetokenizer()
    except (ImportError, TypeError) as err:
        if isinstance(err, TypeError):
            # sacremoses was importable but failed to instantiate (Python 2).
            warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                          ' currently only supported in python3.'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        else:
            warnings.warn('sacremoses is not installed. '
                          'To install sacremoses, use pip install -U sacremoses'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        try:
            import nltk
            try:
                # 'perluniprops' is required by NLTK's Moses (de)tokenizer.
                nltk.data.find('perluniprops')
            except LookupError:
                nltk.download('perluniprops')
            from nltk.tokenize.moses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except ImportError:
            raise ImportError('NLTK is not installed. '
                              'You must install NLTK <= 3.2.5 in order to use the '
                              'NLTKMosesDetokenizer. You can refer to the official '
                              'installation guide in https://www.nltk.org/install.html .')
def rm_addresses(doc):
    """Redact street-address components from a document, line by line.

    Tokens that usaddress labels as anything other than a small whitelist of
    non-address labels are replaced by '█' characters of the same length.

    Inputs:
      doc: multi-line document string
    Returns:
      (redacted_doc, removed_tokens): the redacted document and the list of
      tokens that were blacked out
    """
    # Labels that are kept verbatim (not considered sensitive address parts).
    KEEP_LABELS = ('BuildingName', 'Recipient', 'OccupancyType',
                   'OccupancyIdentifier', 'LandmarkName')
    # Hoisted: one detokenizer for all sentences instead of one per sentence.
    deto = MosesDetokenizer()
    doc_l = []
    rm_st = []
    for sent in doc.split("\n"):
        sent_l = []
        # (Removed a duplicate usaddress.parse(sent) call whose result was discarded.)
        for token, label in usaddress.parse(sent):
            if label in KEEP_LABELS:
                sent_l.append(token)
            else:
                sent_l.append("█" * len(token))
                rm_st.append(token)
        doc_l.append(deto.detokenize(sent_l, return_str=True))
    doc = "\n".join(doc_l)
    return doc, rm_st
def __init__(self):
    """Create a Moses detokenizer, preferring sacremoses over NLTK.

    Tries sacremoses first; on ImportError (not installed) or TypeError
    (sacremoses' detokenizer only instantiates under Python 3) it warns and
    falls back to NLTK's nltk.tokenize.moses implementation, downloading the
    'perluniprops' data package if it is missing.

    Raises:
        ImportError: if the fallback NLTK import also fails.
    """
    try:
        from sacremoses import MosesDetokenizer
        self._detokenizer = MosesDetokenizer()
    except (ImportError, TypeError) as err:
        if isinstance(err, TypeError):
            # sacremoses was importable but failed to instantiate (Python 2).
            warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                          ' currently only supported in python3.'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        else:
            warnings.warn('sacremoses is not installed. '
                          'To install sacremoses, use pip install -U sacremoses'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        try:
            import nltk
            try:
                # 'perluniprops' is required by NLTK's Moses (de)tokenizer.
                nltk.data.find('perluniprops')
            except LookupError:
                nltk.download('perluniprops')
            from nltk.tokenize.moses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except ImportError:
            raise ImportError('NLTK is not installed. '
                              'You must install NLTK <= 3.2.5 in order to use the '
                              'NLTKMosesDetokenizer. You can refer to the official '
                              'installation guide in https://www.nltk.org/install.html .')
def spellCheck(s):
    """Spell-correct every non-punctuation token of *s*.

    Cleans the input, tokenizes it, applies the spell() corrector to each
    word (punctuation passes through unchanged), and rejoins with Moses
    detokenization.
    """
    corrected = []
    for token in nltk.word_tokenize(cleaner(s)):
        # Leave punctuation alone; only real words go through the corrector.
        corrected.append(token if token in string.punctuation else spell(token))
    mose = MosesDetokenizer()
    return mose.detokenize(corrected, return_str=True)
def __init__(
        self,
        separate_sentences=" ",  # moses can be suitable for prose if we don't insert newlines
        separate_words=" "):
    """Joiner backed by NLTK's English Moses detokenizer.

    Args:
        separate_sentences: separator inserted between sentences.
        separate_words: separator inserted between words.
    """
    super(JoinerNLTK, self).__init__(separate_sentences=separate_sentences,
                                     separate_words=separate_words)
    # English-specific detokenizer instance reused for all joins.
    self.detokenizer = MosesDetokenizer(lang="en")
def generate_answers(session, model, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.

    Inputs:
      session: TensorFlow session
      model: QAModel
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    # NOTE(review): Python 2 code (print statements); "/" below is integer division.
    uuid2ans = {} # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    # Ceil-divide: number of batches needed to cover data_size examples.
    num_batches = ((data_size-1) / model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print "Generating answers..."
    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, model.FLAGS.batch_size, model.FLAGS.context_len, model.FLAGS.question_len):
        # Get the predicted spans
        pred_start_batch, pred_end_batch = model.get_start_end_pos(session, batch)
        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(zip(pred_start_batch, pred_end_batch)):
            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx] # list of strings
            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))
            # Predicted answer tokens (span end is inclusive)
            pred_ans_tokens = context_tokens[pred_start : pred_end +1] # list of strings
            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)
        batch_num += 1
        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (batch_num, num_batches, batch_num*100.0/num_batches)
    print "Finished generating answers for dataset."
    return uuid2ans
def toDataFrame(tweets):
    """Build a pandas DataFrame from a list of tweet objects.

    Adds tweet/user metadata columns, a stop-word-free text column
    ('tweetNoSW'), and VADER sentiment score columns derived from it.

    Inputs:
      tweets: iterable of tweet objects exposing id, created_at,
        retweet_count, favorite_count, source, full_text, and a .user object
        (presumably tweepy Status objects — TODO confirm against caller)

    Returns:
      DataSet: the assembled DataFrame
    """
    # convert to dataframe
    DataSet = pd.DataFrame()
    # add parameters
    DataSet['tweetID'] = [tweet.id for tweet in tweets]
    DataSet['datetime'] = [tweet.created_at for tweet in tweets]
    DataSet['date'] = DataSet.datetime.dt.date
    DataSet['hour'] = DataSet.datetime.dt.hour
    DataSet['minute'] = DataSet.datetime.dt.minute
    # NOTE(review): Series.dt.weekday_name was removed in newer pandas
    # (replaced by dt.day_name()) — confirm the pinned pandas version supports it.
    DataSet['dayofweek'] = DataSet.datetime.dt.weekday_name
    DataSet['tweetRetweetCt'] = [tweet.retweet_count for tweet in tweets]
    DataSet['tweetFavoriteCt'] = [tweet.favorite_count for tweet in tweets]
    DataSet['tweetSource'] = [tweet.source for tweet in tweets]
    DataSet['userID'] = [tweet.user.id for tweet in tweets]
    DataSet['userScreen'] = [tweet.user.screen_name for tweet in tweets]
    DataSet['userName'] = [tweet.user.name for tweet in tweets]
    DataSet['userCreateDt'] = [tweet.user.created_at for tweet in tweets]
    DataSet['userDesc'] = [tweet.user.description for tweet in tweets]
    DataSet['userFollowerCt'] = [tweet.user.followers_count for tweet in tweets]
    DataSet['userFriendsCt'] = [tweet.user.friends_count for tweet in tweets]
    DataSet['userLocation'] = [tweet.user.location for tweet in tweets]
    DataSet['userTimezone'] = [tweet.user.time_zone for tweet in tweets]
    DataSet['tweetText'] = [tweet.full_text for tweet in tweets]
    # tokenize tweetsText, and filter for stop words
    detokenizer = MosesDetokenizer()
    noStopWords = []
    for i in tweets:
        word_tokens = word_tokenize(i.full_text)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        noStopWords.append(
            detokenizer.detokenize(filtered_sentence, return_str=True))
    DataSet['tweetNoSW'] = noStopWords
    # sentiment analysis: VADER polarity scores on the stop-word-free text
    analyzer = SentimentIntensityAnalyzer()
    DataSet['sentimentPos'] = [
        analyzer.polarity_scores(tweet)['pos'] for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentNeut'] = [
        analyzer.polarity_scores(tweet)['neu'] for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentNeg'] = [
        analyzer.polarity_scores(tweet)['neg'] for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentComp'] = [
        analyzer.polarity_scores(tweet)['compound'] for tweet in DataSet['tweetNoSW']
    ]
    return DataSet
def __init__(self):
    """Instantiate NLTK's Moses detokenizer, raising with installation
    guidance when NLTK (or its Moses module) is unavailable."""
    try:
        from nltk.tokenize.moses import MosesDetokenizer
    except ImportError:
        raise ImportError(
            'NLTK or relevant packages are not installed. You must install NLTK '
            'in order to use the NLTKMosesTokenizer. You can refer to the '
            'official installation guide in https://www.nltk.org/install.html .'
        )
    else:
        # Import succeeded: keep one detokenizer instance for the object's lifetime.
        self._detokenizer = MosesDetokenizer()
def generate_answers_from_dist(sess, model, total_dict, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.

    Inputs:
      session: TensorFlow session
      total_dict: dict uuid -> distributions
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    # NOTE(review): Python 2 code (print statements); "/" below is integer division.
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    # Ceil-divide: number of batches needed to cover data_size examples.
    num_batches = ((data_size - 1) / model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print "Generating answers..."
    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data,
                                     qn_token_data, model.FLAGS.batch_size,
                                     model.FLAGS.context_len, model.FLAGS.question_len):
        # For each example in the batch:
        for (ex_idx, uuid) in enumerate(batch.uuids):
            # Greedy decode: take the argmax of the precomputed start (index 0)
            # and end (index 1) distributions for this uuid.
            pred_start = np.argmax(total_dict[uuid][0])
            pred_end = np.argmax(total_dict[uuid][1])
            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings
            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))
            # Predicted answer tokens (span end is inclusive)
            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]  # list of strings
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)
        batch_num += 1
        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (
                batch_num, num_batches, batch_num * 100.0 / num_batches)
    print "Finished generating answers for dataset."
    return uuid2ans
def __init__(self, sess, model_name, dataset_name, checkpoint, char_emb=False, fix_problems=False):
    """Prepare the model's dataset and trained model.

    Args:
        sess: TensorFlow session used by the predictor.
        model_name: directory name of the trained model under training/model/.
        dataset_name: directory name of the dataset under training/data/dataset/.
        checkpoint: checkpoint file name to load.
        char_emb: whether character embeddings are used.
        fix_problems: if True, additionally load accent/pandiwa dictionaries,
            a hyphenation table, a spell corrector, and a text normalizer for
            post-processing.
    """
    os.makedirs(os.path.join('training', 'data', 'dataset', dataset_name), exist_ok=True)
    CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
    TRAINING_PATH = os.path.join(CURRENT_PATH, 'training')
    data_dir = os.path.join(TRAINING_PATH, 'data', 'dataset', dataset_name)
    model_dir = os.path.join(TRAINING_PATH, 'model', model_name)
    # Hyperparameters are stored alongside the trained model.
    self.hparams = utils.load_hparams(
        os.path.join(model_dir, 'hparams.json'))
    self.detokenizer = MosesDetokenizer()
    self.char_emb = char_emb
    self.normalizer = predictor.Predictor(sess,
                                          dataset_dir=data_dir,
                                          output_dir=model_dir,
                                          output_file=checkpoint,
                                          hparams=self.hparams)
    self.fix_problems = fix_problems
    if self.fix_problems:
        ACCENT_PATH = os.path.join(TRAINING_PATH, 'data', 'accented_words.dic')
        PANDIWA_PATH = os.path.join(TRAINING_PATH, 'data', 'pandiwa.dic')
        accent_words_dict = csv_to_dict(ACCENT_PATH)
        # Invert the mapping: each accented variant becomes a key pointing
        # back to its base form.
        accent_words_dict = {
            v2: k
            for k, v in accent_words_dict.items() for v2 in v
        }
        pprint(accent_words_dict)
        with open(PANDIWA_PATH, 'r') as pandiwa_file:
            pandiwa_words_dict = pandiwa_file.read().splitlines()
        # TeX hyphenation patterns for Filipino.
        with open(os.path.join(TRAINING_PATH, 'data', 'hyph_fil.tex'), 'r') as f:
            hyphenator_dict = f.read()
        self.spell_corrector = SpellCorrector(dict_path=os.path.join(
            TRAINING_PATH, 'data', 'corpus', 'merged_bicol.txt'))
        self.t_normalizer = TextNormalizer(
            accent_words_dict=accent_words_dict,
            hyphenator_dict=hyphenator_dict,
            pandiwa_words_dict=pandiwa_words_dict,
            spell_corrector=self.spell_corrector)
def detokenize(tokens, start, end):
    """Detokenize the inclusive token span tokens[start:end].

    Returns the empty string for an empty span (end < start); otherwise the
    UTF-8-decoded tokens joined by the Moses detokenizer.
    """
    if end < start:
        return ''
    # Decode each byte token before handing the span to Moses.
    span = [token.decode('utf-8') for token in tokens[start:end + 1]]
    detokenizer = MosesDetokenizer()
    return detokenizer.detokenize(span, return_str=True)
def spacyMethod(nq):
    """Lowercase and tokenize a query, dropping spaCy stop words.

    The literal token 'name' is always kept even though it is a stop word.

    Inputs:
      nq: natural-language query string
    Returns:
      list of remaining tokens
    """
    # NLTK/SPACY METHOD
    nq = nq.replace('-', ' ').lower()
    tokens = nltk.word_tokenize(nq)
    # BUG FIX: the original deleted items from ent_list while enumerating it,
    # which skips the token immediately following every removed stop word
    # (consecutive stop words survived). Filter into a new list instead.
    ent_list = [tok for tok in tokens if tok == 'name' or tok not in STOP_WORDS]
    # (Removed a MosesDetokenizer().detokenize(...) call whose result was discarded.)
    return ent_list
def __init__(self, opt):
    """Set up the translation pipeline: BPE codes, Moses (de)tokenizers,
    and the ONMT translator, optionally pinned to a CUDA device.

    Args:
        opt: options object carrying seprator, cuda/gpu, and bpe_codes path.
    """
    self.opt = opt
    # NOTE: "seprator" spelling comes from the options object's attribute name.
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # BPE model loaded from the codes file; merges use the same separator.
    self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"), opt.seprator, None, None)
    self.tokenizer = MosesTokenizer()
    self.detokenizer = MosesDetokenizer()
    self.translator = onmt.Translator(opt)
def normalize(post):
    """Normalize a post into its stop-word-free message plus dominant reaction.

    Returns a dict with 'message' (English stop words removed, Moses-rejoined)
    and 'reaction' (the most common reaction on the post).
    """
    reaction = get_most_common_reaction(post['reactions'])
    stop_list = stopwords.words('english')
    tokens = word_tokenize(post['message'])
    kept = [token for token in tokens if token not in stop_list]
    detokenizer = MosesDetokenizer()
    cleaned_message = detokenizer.detokenize(kept, return_str=True)
    return {'message': cleaned_message, 'reaction': reaction}
def generate_answers(config, model, processor, qn_uuid_data, context_token_data, qn_token_data): uuid2ans = {} # maps uuid to string containing predicted answer data_size = len(qn_uuid_data) num_batches = ((data_size - 1) / config.batch_size) + 1 batch_num = 0 detokenizer = MosesDetokenizer() print "Generating answers..." for batch in get_batch_generator(processor.word2id, qn_uuid_data, context_token_data, qn_token_data, config.batch_size, config.context_len, config.question_len): # Get the predicted spans pred_start_batch, pred_end_batch = processor.test_one_batch( batch, model) # Convert pred_start_batch and pred_end_batch to lists length batch_size pred_start_batch = pred_start_batch.tolist() pred_end_batch = pred_end_batch.tolist() # For each example in the batch: for ex_idx, (pred_start, pred_end) in enumerate( zip(pred_start_batch, pred_end_batch)): # Original context tokens (no UNKs or padding) for this example context_tokens = batch.context_tokens[ex_idx] # list of strings # Check the predicted span is in range assert pred_start in range(len(context_tokens)) assert pred_end in range(len(context_tokens)) # Predicted answer tokens pred_ans_tokens = context_tokens[pred_start:pred_end + 1] # list of strings # Detokenize and add to dict uuid = batch.uuids[ex_idx] uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True) batch_num += 1 if batch_num % 10 == 0: print "Generated answers for %i/%i batches = %.2f%%" % ( batch_num, num_batches, batch_num * 100.0 / num_batches) print "Finished generating answers for dataset." return uuid2ans
def tokenize_text(text):
    """Normalize text by round-tripping it through Moses tokenize/detokenize,
    then preprocess and rejoin with spaces.

    The detokenize step restores symbols the tokenizer split apart.
    """
    # Tokenizers are basically an advanced split.
    moses_tok = MosesTokenizer()
    moses_detok = MosesDetokenizer()
    tokens = moses_tok.tokenize(text)
    # Need to detokenize to get all the weird symbols back as symbols.
    round_tripped = moses_detok.detokenize(tokens)
    cleaned = preprocess(round_tripped)
    return " ".join(cleaned)
def extract(sent, ind):
    """Build a summary from the sentences of *sent* selected by indices *ind*.

    Appends the source sentences to the module-level `trial` list, joins the
    selected sentences with the Moses detokenizer, records the result in the
    module-level `main` list, and forwards it to construct().

    Inputs:
      sent: list of sentences (token lists or strings)
      ind: list of indices into sent, in summary order
    """
    trial.append(sent)
    # Idiom fix: select via comprehension (removes the unused j=0/length locals
    # and the manual index loop).
    summary = [sent[i] for i in ind]
    detokenizer = MosesDetokenizer()
    joined = detokenizer.detokenize(summary, return_str=True)
    main.append(joined)
    construct(joined)
def generate_answers(session, model, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary
    mapping each unique ID to the generated answer (string; detokenized).
    """
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    # BUG FIX: this block uses Python 3 print(); "/" is true division there and
    # would make num_batches a float. Use floor division for the ceil-divide idiom.
    num_batches = ((data_size - 1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print("Generating answers...")
    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data,
                                     qn_token_data, model.FLAGS.batch_size,
                                     model.FLAGS.context_len,
                                     model.FLAGS.question_len):
        # Predicted spans for this batch.
        pred_start_batch, pred_end_batch = model.get_start_end_pos(
            session, batch)
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):
            # Original context tokens (no UNKs or padding) for this example.
            context_tokens = batch.context_tokens[ex_idx]
            # Check the predicted span is in range.
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))
            # Span end is inclusive.
            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens,
                                                    return_str=True)
        batch_num += 1
        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))
    print("Finished generating answers for dataset.")
    return uuid2ans
def preprocessing(data, row_name):
    """
    Reads reviews in from csv and preprocesses them.

    Parameters
    --------------------
        data -- data frame
        row_name -- name of row containing reviews

    Returns
    --------------------
        df -- dataframe with preprocessed reviews
    """
    # create tokenizer (keeps word characters only, so punctuation is dropped)
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    stop_words = set(stopwords.words('english'))

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # Hoisted out of the loop: one detokenizer instance for all rows instead
    # of constructing a new MosesDetokenizer per review.
    detokenizer = MosesDetokenizer()

    # create list to store preprocessed text
    new_data = []
    for index, row in data.iterrows():
        # lower case the text
        lower_case = row[row_name].lower()
        # tokenize the text (removes punctuation)
        tokens = tokenizer.tokenize(lower_case)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in stop_words]
        # stemming
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # put it back into a string
        detokenized_text = detokenizer.detokenize(stemmed_tokens, return_str=True)
        new_data.append(detokenized_text)

    # data frame of reviews
    df = pd.DataFrame({'review': new_data})
    return df
def build_corpus(model_data):
    """Build a cleaned, stemmed text corpus from the 'Text' column of model_data.

    Per tweet: strip URLs and @usernames, drop non-letter characters,
    lowercase, remove the 'makeovermonday' tag and English stop words, and
    Porter-stem the remaining words.

    Inputs:
      model_data: DataFrame with a 'Text' column
    Returns:
      corpus: list of cleaned strings, one per row
    """
    # import libraries
    import nltk
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords  # to use stopwords function and list
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize.moses import MosesDetokenizer

    # initialize corpus
    corpus = []

    # create instances of tweettokenizer, detokenizer, and porterstemmer;
    # initialize patterns to filter urls and usernames
    twtoken = TweetTokenizer()
    detokenizer = MosesDetokenizer()
    ps = PorterStemmer()
    url_pattern = re.compile(r'https\S+')
    user_pattern = re.compile(r'@\S+')

    # Hoisted out of the loop: the original rebuilt
    # set(stopwords.words('english')) for every word of every tweet.
    stop_set = set(stopwords.words('english'))

    # build corpus
    for i in range(model_data.shape[0]):
        text = model_data['Text'][i]
        urls = re.findall(url_pattern, text)
        users = re.findall(user_pattern, text)
        users = [re.sub('[^@a-zA-z]', '', user) for user in users]
        text = twtoken.tokenize(text)
        for url in urls:
            if url in text:
                text.remove(url)
        for user in users:
            if user in text:
                text.remove(user)
        text = detokenizer.detokenize(text, return_str=True)
        text = re.sub('[^a-zA-z]', ' ', text)
        text = text.lower()
        text = text.split()
        # the campaign hashtag itself carries no signal
        try:
            text.remove('makeovermonday')
        except Exception:
            pass
        text = [ps.stem(word) for word in text if not word in stop_set]
        text = ' '.join(text)
        corpus.append(text)
    return corpus
def process_lines(cmd_args, src, ref, hyps, truecase_dict):
    """ Traverse all sentences from all sources.

    For each source line (up to cmd_args.head lines): print the processed
    source, the reference (if given), each hypothesis, and optionally a
    Google Translate hypothesis.

    Inputs:
      cmd_args: parsed CLI options (google, detok, head, ...)
      src: iterable of source lines
      ref: reference file object or None
      hyps: list of hypothesis file objects read in lockstep
      truecase_dict: truecasing dictionary passed to process_line
    """
    re_bpe = re.compile("@@ ")
    if cmd_args.google:
        try:
            import mtranslate
        except ImportError:
            print(
                "Error: Install package 'mtranslate': pip install --user -U mtranslate"
            )
            exit(2)
    # BUG FIX: detok was referenced below even when --detok is off, which
    # raised NameError. Default it to None (process_line receives None).
    detok = None
    if cmd_args.detok:
        try:
            from nltk.tokenize.moses import MosesDetokenizer
            detok = MosesDetokenizer(lang=cmd_args.detok)
        except ImportError:
            # NLTK isn't installed
            print("Error: Install package 'nltk': pip install --user -U nltk")
            exit(3)
        except LookupError:
            # NLTK's data package perluniprops isn't installed
            print("Error: Install NLTK data package 'perluniprops': \
import nltk; nltk.download('perluniprops')")
            exit(4)

    line_num = 0
    for src_line in src:
        line_num += 1
        if line_num > cmd_args.head:
            break
        src_line = process_line(cmd_args, src_line, re_bpe, detok, truecase_dict)
        print("Src: %s" % src_line)
        ref_line = ''
        if ref is not None:
            ref_line = ref.readline()
            ref_line = process_line(cmd_args, ref_line, re_bpe, detok, truecase_dict)
            print("Ref: ", ref_line)
        hyp_num = 1
        # Hypothesis files are read one line per source line, in lockstep.
        for hyp in hyps:
            hyp_line = hyp.readline()
            hyp_line = process_line(cmd_args, hyp_line, re_bpe, detok, truecase_dict)
            print_hyp(hyp_num, ref_line, hyp_line)
            hyp_num += 1
        if cmd_args.google:
            google_line = mtranslate.translate(src_line, cmd_args.google)
            # Don't truecase Google output
            google_line = process_line(cmd_args, google_line, re_bpe, detok, {})
            print_hyp(hyp_num, ref_line, google_line)
        print()
def run():
    """Random baseline for QA: answer each question with a randomly chosen
    sentence from its context (detokenized).

    Reads contexts/questions from data.json; returns the prediction list.
    """
    detokenizer = MosesDetokenizer()
    with open('data.json') as f:
        data = json.load(f)
    contexts = data['contexts']
    questions = data['questions']

    predictions = []
    for context, context_questions in tqdm(zip(contexts, questions), total=len(contexts)):
        # Single-sentence contexts are skipped entirely.
        if len(context) == 1:
            continue
        # One random context sentence per question.
        for _question in context_questions:
            chosen = random.choice(context)
            predictions.append(detokenizer.detokenize(chosen, return_str=True))
    return predictions
def preprocess(cls, to_dir='../../data/billion', max_line=None):
    """Split the corpus files under cls.path into train/val/test files.

    Lines are distributed round-robin by counter: every 20th line goes to
    test, every other 10th to val, the rest to train.

    Args:
        cls: class providing .path, the corpus root directory.
        to_dir: output directory for train.src / val.src / test.src.
        max_line: optional cap on the number of lines processed.
    """
    count = 0
    with MosesDetokenizer('en') as detokenize, \
            open(os.path.join(to_dir, 'train.src'), 'w+') as train_f, \
            open(os.path.join(to_dir, 'val.src'), 'w+') as val_f, \
            open(os.path.join(to_dir, 'test.src'), 'w+') as test_f:
        for root, dirs, files in os.walk(cls.path):
            for file in files:
                with open(os.path.join(root, file), 'r') as in_f:
                    for line in in_f:
                        # Rebuild the original sentence (for the subword
                        # tokenizer) — currently disabled:
                        # line = detokenize(line.rstrip().split(' '))
                        # line += '\n'
                        if count % 20 == 0:
                            test_f.write(line)
                        elif count % 10 == 0:
                            val_f.write(line)
                        else:
                            train_f.write(line)
                        # BUG FIX: the original used `break`, which only left
                        # the innermost loop — os.walk kept feeding files.
                        # `return` stops the whole traversal (the `with`
                        # block still closes all files).
                        if max_line is not None and count > max_line:
                            return
                        if count % 100000 == 0:
                            print(count)
                        count += 1
def normalize_text(html):
    """Extract and normalize the article text from an HTML page.

    Pipeline: ASCII-fold, extract body text with newspaper, strip escape
    artifacts, Moses tokenize/detokenize round-trip, remove URLs and
    repeated whitespace, then apply preProcess.

    Returns the cleaned text, or "" if any step fails (broad catch is the
    intended best-effort behavior for malformed pages).
    """
    try:
        url_re = re.compile("https{0,1}://[^\s]+")
        url2_re = re.compile("[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile("[\s]{2,}")
        # Drop non-ASCII before extraction.
        html = html.encode("ascii", errors="ignore")
        text = newspaper.fulltext(html)
        sent = text.encode('ascii', errors='ignore')
        # Scrub stray escape-sequence fragments left over from str(bytes).
        sent = str(sent).replace("r\\", "")
        sent = str(sent).replace("n\\", "")
        sent = str(sent).replace("\\", "")
        text = sent
        # Moses round-trip normalizes token spacing.
        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
        # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)
        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception as e:
        # Best-effort: any failure yields an empty result rather than a crash.
        return ""
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens using NLTK wrapper over MosesTokenizer

    Attributes:
        escape: whether escape characters for use in html markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether escape characters for use in html markup
    """

    def __init__(self, escape: bool = False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples
        """
        # A batch of raw strings means "tokenize"; anything else is a batch
        # of token lists and means "detokenize".
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(sample, escape=self.escape)
                    for sample in batch]
        return [self.detokenizer.detokenize(sample, return_str=True, unescape=self.escape)
                for sample in batch]
class NLTKMosesDetokenizer(object):
    r"""Apply the Moses Detokenizer implemented in NLTK.

    Users of this class are required to `install NLTK
    <https://www.nltk.org/install.html>`_ and install relevant NLTK packages,
    such as :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`

    Examples
    --------
    >>> detokenizer = gluonnlp.data.NLTKMosesDetokenizer()
    >>> detokenizer(['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of',
    ...              'text', 'processing', 'tools', '.'], return_str=True)
    'Gluon NLP toolkit provides a suite of text processing tools.'
    >>> detokenizer(['Das', 'Gluon','NLP-Toolkit','stellt','eine','Reihe','von',
    ...              'Textverarbeitungstools','zur','Verfügung','.'], return_str=True)
    'Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools zur Verfügung.'
    """

    def __init__(self):
        # Prefer the (deprecated) NLTK Moses detokenizer; fall back to
        # sacremoses when NLTK no longer ships it.
        try:
            from nltk.tokenize.moses import MosesDetokenizer as detok_cls
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesDetokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesDetokenizer using sacremoses ...')
            try:
                from sacremoses import MosesDetokenizer as detok_cls
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._detokenizer = detok_cls()
        except ValueError:
            raise ValueError('The instantiation of MosesDetokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: list(str)
            The sentence to detokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of words

        Returns
        -------
        ret : list of strs or str
            List of words or detokenized text
        """
        detok = self._detokenizer.detokenize
        return detok(sample, return_str=return_str)
def __init__(self):
    """Locate a Moses detokenizer implementation and instantiate it.

    Tries the (deprecated) NLTK implementation first, then falls back
    to sacremoses; raises ImportError when neither is available.
    """
    try:
        from nltk.tokenize.moses import MosesDetokenizer as detok_cls
    except ImportError:
        warnings.warn('NLTK or relevant packages are not installed. '
                      'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                      'You must install NLTK <= 3.2.5 in order to use the '
                      'NLTKMosesDetokenizer. You can refer to the official '
                      'installation guide in https://www.nltk.org/install.html .'
                      ' Now try SacreMosesDetokenizer using sacremoses ...')
        try:
            from sacremoses import MosesDetokenizer as detok_cls
        except ImportError:
            raise ImportError('sacremoses is also not installed. '
                              'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                              'To install sacremoses, use pip install -U sacremoses')
    try:
        self._detokenizer = detok_cls()
    except ValueError:
        # sacremoses only instantiates under python3.
        raise ValueError('The instantiation of MosesDetokenizer in sacremoses is'
                         ' currently only supported in python3.')
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nltk

data = ["Hi", ",", "my", "name", "is", "Bob", "!"]

nltk_version = nltk.__version__
if nltk_version == "3.2.2":
    # nltk 3.2.2 still ships the Moses detokenizer.
    from nltk.tokenize.moses import MosesDetokenizer
    detokenizer = MosesDetokenizer()
    sent = detokenizer.detokenize(data, return_str=True)
elif nltk_version == "3.3":
    # Moses was dropped in nltk 3.3; use the Treebank detokenizer instead.
    from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    detokenizer = TreebankWordDetokenizer()
    sent = detokenizer.detokenize(data)
else:
    # Unsupported nltk version: bail out silently.
    exit()

print(sent)
class SacreMosesDetokenizer(object):
    r"""Apply the Moses Detokenizer implemented in sacremoses.

    Users of this class are required to `install sacremoses
    <https://github.com/alvations/sacremoses>`_. For example, one can use
    :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> detokenizer = gluonnlp.data.SacreMosesDetokenizer()
    >>> detokenizer(['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of',
    ...              'text', 'processing', 'tools', '.'], return_str=True)
    'Gluon NLP toolkit provides a suite of text processing tools.'
    >>> detokenizer(['Das', 'Gluon','NLP-Toolkit','stellt','eine','Reihe','von',
    ...              'Textverarbeitungstools','zur','Verfügung','.'], return_str=True)
    'Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools zur Verfügung.'
    """

    def __init__(self):
        # Prefer sacremoses; fall back to the deprecated NLTK Moses
        # detokenizer (downloading its data package when missing).
        try:
            from sacremoses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                # sacremoses refuses to instantiate under python2.
                msg = ('The instantiation of MosesDetokenizer in sacremoses is'
                       ' currently only supported in python3.'
                       ' Now try NLTKMosesDetokenizer using NLTK ...')
            else:
                msg = ('sacremoses is not installed. '
                       'To install sacremoses, use pip install -U sacremoses'
                       ' Now try NLTKMosesDetokenizer using NLTK ...')
            warnings.warn(msg)
            try:
                import nltk
                try:
                    nltk.data.find('perluniprops')
                except LookupError:
                    nltk.download('perluniprops')
                from nltk.tokenize.moses import MosesDetokenizer
                self._detokenizer = MosesDetokenizer()
            except ImportError:
                raise ImportError('NLTK is not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesDetokenizer. '
                                  'You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: list(str)
            The sentence to detokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of words

        Returns
        -------
        ret : list of strs or str
            List of words or detokenized text
        """
        detok = self._detokenizer.detokenize
        return detok(sample, return_str=return_str)
def detokenize(line):
    """Detokenize a whitespace-tokenized sentence and capitalize it.

    Args:
        line: space-separated token string (e.g. "i do n't know .").

    Returns:
        The detokenized sentence with its first character upper-cased;
        an empty detokenization result is returned unchanged as "".
    """
    # Re-attach the "n't" clitic before splitting into tokens.
    tokens = line.replace(" n't", "n't").split(' ')
    detokenizer = MosesDetokenizer()
    res = detokenizer.detokenize(tokens, return_str=True)
    # Guard against an empty result (e.g. blank input): the original
    # code raised IndexError on res[0] here.
    if res:
        res = res[0].upper() + res[1:]
    return res