def debate_text_process(text):
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(str(text))
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = set(stopwords.words('english'))
    STOP_WORDS.update(stop_words)
    STOP_WORDS.update({
        'nt', 'okay', 'ha', 'thank', 'wa', 'got', 'oh', 'said', 'going',
        'want', 'let', 'know'
    })
    words = [w for w in words if w not in STOP_WORDS]
    # print(len(STOP_WORDS))
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return words
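# Usage sketch (not part of the original snippet): calling debate_text_process
# on one sentence. Assumes the NLTK resources 'punkt', 'stopwords' and
# 'wordnet' have been downloaded beforehand.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
sample = "We aren't going to let the debate end, okay?"
# Expect lowercased, lemmatized tokens with stop words removed,
# e.g. something like ['debate', 'end'].
print(debate_text_process(sample))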
import hashlib
import string

import spacy
from pytorch_pretrained_bert import BertTokenizer
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
STOP_WORDS.update(string.punctuation)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_tokenization_length(context, question, reference, candidate):
    context_len = len(tokenizer.tokenize(context))
    question_len = len(tokenizer.tokenize(question))
    candidate_len = len(tokenizer.tokenize(candidate))
    reference_len = len(tokenizer.tokenize(reference))
    return max(context_len + question_len + candidate_len,
               context_len + question_len + reference_len)


def check_data_and_return_hash(context, question, reference, candidate):
    assert type(context) == type(question) == type(reference) == type(
        candidate) == str
    if context == '' or question == '' or reference == '' or candidate == '':
        return None
    sample = context + question + reference + candidate
    hash_object = hashlib.md5(sample.encode())
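# Usage sketch (assumed, not from the original source): computing the longer
# of the two BERT input lengths for a toy context/question/answer sample.
# BertTokenizer.from_pretrained fetches the bert-base-uncased vocabulary on
# first use, so this needs network access or a cached vocab file.
context = "The Eiffel Tower is located in Paris."
question = "Where is the Eiffel Tower?"
reference = "Paris"
candidate = "In Paris, France"
print(bert_tokenization_length(context, question, reference, candidate))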
# build preprocess tokenizer
remove_strs = ['<br />', '(', ')', '"']
nlp = spacy.load('en')


# def tokenizer(text):
#     text = utils.remove_str_from_sentence(text, remove_strs)
#     return [token.text for token in nlp.tokenizer(text)]
def tokenizer(text):
    # text = utils.remove_str_from_sentence(text, remove_strs)
    return text.split()


user_stop_words = {'.', ','}
STOP_WORDS.update(user_stop_words)
stop_words = STOP_WORDS

# Pretrain Model
PRE_TRAIN_MODEL_BASE_PATH = '/home/ubuntu/likun/nlp_vectors'
PRE_TRAIN_MODEL_DIR = 'glove'
PRE_TRAIN_MODEL_NAME = 'glove.6B.200d.txt'
USE_PRE_TRAIN_MODEL = True
cache = '.vector_cache'
vector_path = os.path.join(PRE_TRAIN_MODEL_BASE_PATH, PRE_TRAIN_MODEL_DIR,
                           PRE_TRAIN_MODEL_NAME)
vectors = Vectors(name=vector_path,
                  cache=cache) if USE_PRE_TRAIN_MODEL else None

# Build Dataset
TEXT = data.Field(unk_token=UNK_TOKEN,
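# Sketch (an assumption, not part of the original snippet): the whitespace
# tokenizer defined above combined with the extended stop-word set. Because
# '.' and ',' were added to STOP_WORDS, standalone punctuation tokens are
# dropped along with ordinary stop words.
example = "the glove vectors , loaded from disk ."
filtered = [tok for tok in tokenizer(example) if tok not in stop_words]
print(filtered)  # e.g. ['glove', 'vectors', 'loaded', 'disk']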
LEMMA: "not", NORM: "not", TAG: "RB" }] } TOKENIZER_EXCEPTIONS = update_exc(TOKENIZER_EXCEPTIONS) # updating the stopset calfresh_stopwords = { "Calfresh", "CalFresh", "calfresh", "CALFRESH", "foodstamps", "sar7", "sar", "sr7", "sr", "SAR7", "SR7", "SAR", "SR", "Sar", "Sar7", "ebt" } calfresh_placeholders = { "PERSON", "ORG", "GPE", "LOC", "DATE", "MONEY", "CARDINAL" } stopset = STOP_WORDS.update(calfresh_stopwords, calfresh_placeholders) regex = re.compile(r'\W|\d', flags=re.UNICODE) def clean_words(text): try: text = regex.sub('', text) except: pass return text def enchant_spellchecker(doc): for token in doc: word = token.text
non_empty_data = [
    article for article in dataset['data'][:100]
    if article and not article.isspace()
]

# Process the articles with spaCy (tokenization only needed)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'textcat', 'ner'])
print('Running spaCy processing')
id_to_tokens = {
    i: nlp(article)
    for i, article in tqdm(enumerate(non_empty_data))
}
print('Done processing')

# Remove the stop words and lemmatize
STOP_WORDS.update(
    ['think', 'know', 'people', 'like', 'thing', 'good', 'use', 'come'])
id_to_tokens = {
    i: preprocess_spacy_doc(article, STOP_WORDS)
    for i, article in id_to_tokens.items()
}
unique_words = set().union(*id_to_tokens.values())
vocabulary = list(unique_words)

# Remove rare and overly common words from corpus
filtered_tokens = filter_extremes(id_to_tokens.values(), vocabulary,
                                  more_than=10)
id_to_filtered = {i: tokens for i, tokens in enumerate(filtered_tokens)}
unique_words = set().union(*id_to_filtered.values())
vocabulary = list(unique_words)
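# Possible shape of the preprocess_spacy_doc helper referenced above (an
# assumption: the real helper is not shown in this snippet). Given the
# surrounding comment, it presumably lemmatizes each token and drops stop
# words and non-alphabetic tokens from the spaCy Doc.
def preprocess_spacy_doc(doc, stop_words):
    return [
        token.lemma_.lower() for token in doc
        if token.is_alpha and token.lower_ not in stop_words
    ]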