def tokenize():
    text = request.json["text"]
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        # Download tokenizer data on first use, then retry.
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
async def tokenize(request: Request):
    body = await request.json()
    text = body["text"]
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        # Download tokenizer data on first use, then retry.
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
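# A minimal sketch of what span_tokenize returns, independent of the web
# frameworks above (assumes only that nltk is installed):
from nltk.tokenize.treebank import TreebankWordTokenizer

sample = "Good muffins cost $3.88."
sample_spans = list(TreebankWordTokenizer().span_tokenize(sample))
# Each span is a (start, end) character offset into the original string,
# e.g. sample_spans[0] == (0, 4) and sample[0:4] == "Good".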
def get_tf_idf_score(self, sentence, mode, ngram=1):
    if ngram not in range(1, 4):
        raise ValueError("Only unigrams, bigrams and trigrams are supported.")
    if mode != "lex" and mode != "pos":
        raise ValueError("Only lexical and POS distinctness supported.")
    if len(self.document_freqs_lex) == 0 or len(self.document_freqs_pos) == 0:
        raise AttributeError(
            "Document frequency dictionaries not initialized. "
            "Call load_doc_freqs() on the LM object.")
    tokenizer = TreebankWordTokenizer()
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    tokens = self.__fix_tokens(tokens)
    tags = nltk.pos_tag(tokens)
    tags = self.__add_start_end_tags(tags)
    if mode == "lex":
        return self.__get_lex_tf_idf(tags, ngram)
    return self.__get_pos_tf_idf(tags, ngram)
def term_frequency(sentence, ngrams=4):
    """Given a sentence, calculates term frequency of tuples.

    Parameters
    ----------
    sentence : str
        Sentence whose term frequency has to be calculated.
    ngrams : int
        Number of n-grams for which term frequency is calculated.

    Returns
    -------
    dict
        {tuple : int} key-value pairs representing term frequency.
    """
    sentence = sentence.lower().strip()
    for punc in PUNCTUATIONS:
        sentence = sentence.replace(punc, "")
    words = TreebankWordTokenizer().tokenize(sentence)
    counts = {}
    for i in range(ngrams):
        for j in range(len(words) - i):
            ngram = tuple(words[j:(j + i + 1)])
            if ngram in counts:
                counts[ngram] += 1
            else:
                counts[ngram] = 1
    return counts
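# A minimal usage sketch for term_frequency (assumption: PUNCTUATIONS is a
# module-level string of punctuation characters, e.g. string.punctuation):
import string
PUNCTUATIONS = string.punctuation

tf = term_frequency("the cat sat on the mat", ngrams=2)
assert tf[("the",)] == 2         # unigram count
assert tf[("the", "cat")] == 1   # bigram count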
def __init__(self, *args, **kwargs):
    if 'tokenize' in kwargs:
        raise TypeError(
            '``TreebankEncoder`` does not take keyword argument ``tokenize``.')
    if 'detokenize' in kwargs:
        raise TypeError(
            '``TreebankEncoder`` does not take keyword argument ``detokenize``.')

    try:
        import nltk

        # Required for moses
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    super().__init__(*args,
                     tokenize=TreebankWordTokenizer().tokenize,
                     detokenize=TreebankWordDetokenizer().detokenize,
                     **kwargs)
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [
            word for word in TreebankWordTokenizer().tokenize(line.lower())
            if word not in stopwords
        ]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens", self.dictionary)
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
    """Generate syntactically similar sentences for each sentence in the dataset.

    For PaInv-Replace.
    Returns a dictionary mapping each original sentence to the list of
    generated sentences. num_of_perturb is the number of perturbations to
    make for a word in a sentence.
    """
    # Use nltk treebank tokenizer and detokenizer
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()
    # Stopwords from nltk
    stopWords = list(set(stopwords.words('english')))
    # File from which sentences are read
    infile = open(dataset, "r")
    # BERT masked language model used to propose replacements
    berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
    bertmodel.eval()

    dic = {}
    num_sent = 0
    for line in infile:
        source_sent = line.split("\n")[0]
        # Generating new sentences using BERT
        new_sents = perturb(source_sent, bertmodel, num_of_perturb)
        dic[line] = new_sents
        if new_sents != []:
            num_sent += 1
    return dic
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.

    Uses NLTK's TreebankWordTokenizer. Note that we first split into
    sentences using NLTK's sent_tokenize. We additionally call a filtering
    function to remove unwanted tokens.

    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []

    ## split text into sentences
    sentences = sent_tokenize(text, language=language)

    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()

    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence and add tokens to list of tokens
        sent_tokenized = tokenizer.tokenize(sent)
        list_tokens += sent_tokenized

    list_tokens = filter_tokens(list_tokens)
    return list_tokens
def tokenize(documents):
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        # Collect tokens per document (a single list shared across the loop
        # would leak tokens from earlier documents into later ones).
        real_tokens = []
        text = doc["text"]
        file_id = doc["id"]
        text = text.replace("\"", "'")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        tokens = tbw.span_tokenize(text)
        for token in tokens:
            token_txt = text[token[0]:token[1]]
            # Default to "O" (outside any annotation) unless a tag span
            # fully covers this token.
            token_tag = "O"
            token_tag_type = "O"
            for tag in doc["tags"]:
                if int(tag["start"]) <= token[0] and int(tag["end"]) >= token[1]:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
            real_tokens.append({
                "token": token_txt,
                "start": token[0],
                "end": token[1],
                "tag": token_tag,
                "tag_type": token_tag_type,
            })
        documents2.append({
            "id": file_id,
            "text": text,
            "tags": doc["tags"],
            "tokens": real_tokens,
        })
    return documents2
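# A hypothetical input illustrating the schema this function expects; the
# tag offsets are character positions into "text":
example_docs = [{
    "id": "doc1",
    "text": "John lives in Paris.",
    "tags": [{"start": 0, "end": 4, "tag": "B-PER", "type": "PERSON"}],
}]
# tokenize(example_docs)[0]["tokens"][0] should look like:
# {"token": "John", "start": 0, "end": 4, "tag": "B-PER", "tag_type": "PERSON"}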
def __init__(self, data, tokenizer):
    self._text = to_unicode(data).strip()
    self._tokenizer = tokenizer
    self._treebank_word_tokenize = TreebankWordTokenizer().tokenize
    self.formdocument()
    self.extractsentences()
    self.extractwords()
def __init__(self):
    filename = 'Models/CRF_crfsuite_dict.crfsuite'
    self.crf_model = pycrfsuite.Tagger()
    self._treebank_word_tokenizer = TreebankWordTokenizer()

    # Gazetteers: one entry per line; strip the trailing newline.
    with open("Dictionaries/Countries.txt", 'r', encoding='utf-8') as country_file:
        self.dictionary_country = set(line[:-1] for line in country_file)
    with open("Dictionaries/Cities.txt", 'r', encoding='utf-8') as city_file:
        self.dictionary_city = set(line[:-1] for line in city_file)
    with open("Dictionaries/dictionary_first_names.txt", 'r', encoding='utf-8') as first_name_file:
        self.dictionary_first_name = set(line[:-1].lower() for line in first_name_file)
    with open("Dictionaries/dictionary_surnames.txt", 'r', encoding='utf-8') as surname_file:
        self.dictionary_surname = set(line[:-1].lower() for line in surname_file)

    if os.path.exists(filename):
        self.crf_model.open(filename)
    else:
        self.crf_model = None

    # Job-title gazetteer: keep words longer than two characters from rows
    # marked 'assignedrole'.
    self.dictionary_job_titles = []
    with open('Dictionaries/job_title_dictionary.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            if row[2] == 'assignedrole':
                candidates = row[0].lower().split(' ')
                for can in candidates:
                    if len(can) > 2:
                        self.dictionary_job_titles.append(can)
    self.dictionary_job_titles = set(self.dictionary_job_titles)
def __init__(self, *args, **kwargs):
    if 'tokenize' in kwargs:
        raise TypeError(
            'TreebankEncoder defines a tokenize callable TreebankWordTokenizer')

    try:
        import nltk

        # Required for moses
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    self.detokenizer = TreebankWordDetokenizer()
    super().__init__(*args, **kwargs, tokenize=TreebankWordTokenizer().tokenize)
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        text = ''.join(
            BeautifulSoup(text, features="lxml").findAll(text=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in range(len(sections)):
            # Keep tokens that are not stopwords and consist only of
            # lowercase ASCII letters.
            docs["%s-%i" % (story.story_title, jj)] = \
                [x for x in tokenizer.tokenize(sections[jj])
                 if (x not in stop) and
                 all(y in ascii_lowercase for y in x)]
    return docs
def __init__(self):
    '''
    Constructor
    '''
    self.__tokenizer = TreebankWordTokenizer()
    # Matches sentence-final punctuation.
    self.__r_end_sentence = re.compile(r"\.|\?|!")
def word_tokenize(text, language="spanish"):
    """
    It splits the text into words

    Args:
        text: text to be split
        language: language of the tokenizer to be used

    Returns:
        List of words
    """
    # Try to use the local Treebank tokenizer; if the required resources
    # are unavailable, fall back to nltk.word_tokenize.
    try:
        from nltk.tokenize.treebank import TreebankWordTokenizer
        _treebank_word_tokenize = TreebankWordTokenizer().tokenize
        return [
            token for sent in sent_tokenize(text)
            for token in _treebank_word_tokenize(sent)
        ]
    except (IOError, ImportError, LookupError):
        from nltk import word_tokenize
        return word_tokenize(text, language)
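# A rough usage sketch (assumes the punkt sentence models are available via
# nltk.download('punkt'); exact output may vary with the NLTK version):
# word_tokenize("Hola mundo. ¿Cómo estás?")
# -> ['Hola', 'mundo', '.', '¿Cómo', 'estás', '?']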
def __init__(self, language):
    """Take language as argument to the class. Check availability and
    set up class variables."""
    self.language = language
    self.available_languages = [
        'akkadian',
        'arabic',
        'french',  # defaults to old_french
        'greek',
        'latin',
        'middle_english',
        'middle_french',
        'middle_high_german',
        'old_french',
        'old_norse',
        'sanskrit',
        'multilingual',
    ]
    assert self.language in self.available_languages, \
        "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
            self.language, self.available_languages)

    # raise language-specific warnings
    if self.language == 'french':
        self.language = 'old_french'
        LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long

    # Note: 'french' was remapped to 'old_french' above, so it is handled
    # by the 'old_french' branch below.
    if self.language == 'arabic':
        self.toker = BaseArabyWordTokenizer('arabic')
    elif self.language == 'greek':
        self.toker = BasePunktWordTokenizer('greek', GreekRegexSentenceTokenizer)
    elif self.language == 'latin':
        self.toker = LatinWordTokenizer()
    elif self.language == 'old_norse':
        self.toker = BaseRegexWordTokenizer('old_norse', OldNorseTokenizerPatterns)
    elif self.language == 'middle_english':
        self.toker = BaseRegexWordTokenizer(
            'middle_english', MiddleEnglishTokenizerPatterns)
    elif self.language == 'middle_french':
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    elif self.language == 'middle_high_german':
        self.toker = BaseRegexWordTokenizer(
            'middle_high_german', MiddleHighGermanTokenizerPatterns)
    elif self.language == 'old_french':
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    else:
        LOG.warning(
            "Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
        self.toker = TreebankWordTokenizer()
def normalize(text):
    # Accept raw bytes for backwards compatibility with the original
    # Python 2 version of this helper.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = strip_accents_ascii(text)
    text = ' '.join(
        x.lower() for x in TreebankWordTokenizer().tokenize(text))
    text = str(TextBlob(text).correct())
    return text
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    tokenizer = TreebankWordTokenizer()
    # Translation table that removes numbers & punctuation.
    table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789'))
    with open(path, encoding='utf8') as f:
        for l in f:
            sentences.append(tokenizer.tokenize(l.translate(table).lower()))
    return sentences
def treebank_tokenizer(sentence):
    # Splits 's etc. as usual, but protects <...> markers: the angle
    # brackets are temporarily rewritten so the tokenizer keeps each marker
    # as a single token, then restored afterwards.
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(
        sentence.lower().replace("<", "LAB_").replace(">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    return ret
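# A quick illustration of the angle-bracket round trip: the placeholder
# rewriting keeps markers such as "<unk>" intact through tokenization.
# treebank_tokenizer("The <unk> token survives.")
# -> ['the', '<unk>', 'token', 'survives', '.']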
def tokenize(self, text: str):
    """
    :param text: text to be tokenized into sentences
    :type text: str
    :rtype: list
    """
    sents = self.sent_tokenizer.tokenize(text)
    tokenizer = TreebankWordTokenizer()
    # Flatten the per-sentence token lists into a single list.
    return [token for sent_tokens in tokenizer.tokenize_sents(sents)
            for token in sent_tokens]
def __init__(self):
    filename = 'Models/CRF_crfsuite.crfsuite'
    self.crf_model = pycrfsuite.Tagger()
    self._treebank_word_tokenizer = TreebankWordTokenizer()
    if os.path.exists(filename):
        self.crf_model.open(filename)
    else:
        self.crf_model = None
def normalize(text):
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Strip URLs and IPv4 addresses before tokenizing.
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = ' '.join(
        x.lower() for x in TreebankWordTokenizer().tokenize(text))
    return text
def treebank_tokenizer(sentence):
    # German sentence splitter (punkt model), then Treebank word tokenizer.
    tokenizer = load('data/german.pickle')
    treebank_word_tokenize = TreebankWordTokenizer().tokenize
    tokens = []
    for s in tokenizer.tokenize(sentence):
        tokens.extend(treebank_word_tokenize(s))
    # Strip punctuation characters and drop tokens that end up empty.
    tokens = [''.join(c for c in s if c not in string.punctuation)
              for s in tokens]
    tokens = list(filter(None, tokens))
    return tokens
def english_tokenization(term):
    word_tokenizer = TreebankWordTokenizer()
    # Rejoin the tokens with single spaces.
    tokenized_term = ""
    for word in word_tokenizer.tokenize(term):
        if tokenized_term != "":
            tokenized_term += " "
        tokenized_term += word
    return tokenized_term
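# A hypothetical one-line equivalent using str.join (same output, since
# join inserts exactly one space between tokens):
def english_tokenization_join(term):
    return " ".join(TreebankWordTokenizer().tokenize(term))

# english_tokenization("don't stop") -> "do n't stop"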
def get_words_locations(txt):
    sents = PunktSentenceTokenizer().tokenize(txt)
    words = [TreebankWordTokenizer().tokenize(i) for i in sents]
    words = [i for word in words for i in word]
    span_sents = [i for i in PunktSentenceTokenizer().span_tokenize(txt)]
    span_words = [[j for j in TreebankWordTokenizer().span_tokenize(i)]
                  for i in PunktSentenceTokenizer().tokenize(txt)]
    # Word spans are sentence-relative; shift them by each sentence's offset
    # so they index into the full text.
    new_span_words = []
    for i, j in zip(span_sents, span_words):
        new_span_words_in_sent = []
        for k in j:
            new_span_words_in_sent.append((k[0] + i[0], k[1] + i[0]))
        new_span_words.append(new_span_words_in_sent)
    new_span_words = [i for span_word in new_span_words for i in span_word]

    _words = []
    _span_words = []
    for i, j in zip(words, new_span_words):
        if i not in string.punctuation:
            punkt_list = re.split(r'\W', i.lower())
            if len(punkt_list) >= 2 and not all(
                    [ii.isnumeric() for ii in punkt_list]):
                mm = j[0]
                for k in punkt_list:
                    if k == '':
                        mm += 1
                    elif re.match(r'^\d+[a-zA-Z]+$', k):
                        # Split a digits+letters token into its two parts.
                        k0 = re.search(r'(^\d+)', k).group()
                        k1 = re.search(r'([a-zA-Z]+$)', k).group()
                        _words.extend([k0, k1])
                        _span_words.append((mm, mm + len(k0)))
                        mm += len(k0)
                        _span_words.append((mm, mm + len(k1)))
                        mm += len(k1)
                        mm += 1
                    else:
                        _words.append(k)
                        _span_words.append((mm, mm + len(k)))
                        mm += len(k)
                        mm += 1
            else:
                _words.append(i.lower())
                _span_words.append(j)
    return _words, _span_words
def transform_texts(art, period, site, ngrams=1, mod=None, text_column='text',
                    text_token_column='text_token',
                    remain_columns=('author', 'site', 'link')):
    """Transform dataframe with texts, create tokenized lists in columns.

    Save dataframe to mod directory, if mod is not None."""
    text_column_paragraphs = text_column + '_paragraphs'
    text_token_column_lower = text_token_column + '_lower'
    text_token_column_stemmed = text_token_column + '_stemmed'
    text_token_column_count = text_token_column + '_count'

    st = SnowballStemmer('english')
    art.dropna(subset=[text_column], inplace=True)  # maketrans fails if there are nans
    art_sh = art[list((text_column, ) + remain_columns)].copy()  # we don't need more columns
    del art
    gc.collect()

    additional_punctuation = string.punctuation + '«»…—’‘“”–•'  # a few additional, non-ascii chars

    # gigaom
    tt = TreebankWordTokenizer()
    art_sh[text_column] = art_sh[text_column].apply(
        lambda x: x.replace('Tweet\nShare\nPost\n', '').replace(
            '“', '').replace('”', '').replace('’', '\''))
    # sent_tokenize tokenizes by paragraphs
    art_sh[text_column_paragraphs] = art_sh[text_column].apply(
        lambda x: x.split('\n\n'))
    art_sh[text_token_column] = art_sh[text_column_paragraphs].apply(
        lambda x: [flatten([tt.tokenize(z) for z in sent_tokenize(y)])
                   for y in x])
    # to lower, stem
    art_sh[text_token_column_lower] = art_sh[text_token_column].apply(
        lambda x: [[word.lower() for word in paragraph] for paragraph in x])
    art_sh[text_token_column_stemmed] = art_sh[text_token_column_lower].apply(
        lambda x: [[st.stem(word) for word in paragraph] for paragraph in x])

    if ngrams == 2:  # convert to bigrams
        art_sh[text_token_column] = art_sh[text_token_column_lower].apply(
            to_bigram)
        art_sh[text_token_column_lower] = art_sh[
            text_token_column_lower].apply(to_bigram)
        art_sh[text_token_column_stemmed] = art_sh[
            text_token_column_stemmed].apply(to_bigram)

    art_sh[text_token_column_count] = art_sh[text_token_column_stemmed].apply(
        lambda x: dict(Counter(FreqDist(flatten(x)))))

    if mod is not None:
        art_sh.to_csv(mod + 'dfs_articles' + period + site + '.csv')
    return art_sh
def tokenize(review: str) -> list:
    """Tokenize string based on NLTK TreebankWordTokenizer.

    Args:
        review: The raw review content.

    Returns:
        A list of tokens found by the NLTK tokenizer.
    """
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(review)
def tokenize(sents):
    """Identify the tokens in the input sentences.

    Returns:
        A list of sentences, where each sentence is a list of tokens.
    """
    tokenizer = TreebankWordTokenizer()
    sent_tokens = [tokenizer.tokenize(sent) for sent in sents]
    return sent_tokens
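# A minimal usage sketch (Treebank rules also handle simple Spanish
# sentences reasonably):
# tokenize(["El gato duerme.", "Hola mundo."])
# -> [['El', 'gato', 'duerme', '.'], ['Hola', 'mundo', '.']]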
def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
    self.token2idx = {}
    self.tokenizer = None
    if tokenizer_method == "TreebankWordTokenizer":
        self.tokenizer = TreebankWordTokenizer()
    else:
        raise NotImplementedError(
            "tokenizer_method {} doesn't exist".format(tokenizer_method))
    self.add_token(UNK_TOKEN)  # Add UNK token
def __init__(self):
    # nltk.download('punkt')
    self.tk = TreebankWordTokenizer()
    self.dtk = TreebankWordDetokenizer()
    self.BAD_CAT_REMOVE = re.compile('^Cat_')
    # Replace accented vowels with their unaccented counterparts.
    self.A_TILDE_REMOVE = re.compile('[á]')
    self.E_TILDE_REMOVE = re.compile('[é]')
    self.I_TILDE_REMOVE = re.compile('[í]')
    self.O_TILDE_REMOVE = re.compile('[ó]')
    self.U_TILDE_REMOVE = re.compile('[ú]')
    # Insert a space after a period that is glued to the next word.
    self.POINT_FOLLOWING_LETTER = re.compile(r'(?<=\S)\.(?=\w)')
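# A small demonstration of two of the compiled patterns in isolation
# (standard-library re only; the surrounding class is not needed):
import re

a_tilde = re.compile('[á]')
assert a_tilde.sub('a', 'más') == 'mas'

point_following = re.compile(r'(?<=\S)\.(?=\w)')
assert point_following.sub('. ', 'e.g.word') == 'e. g. word'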