def similarity(self, context, relation):
    try:
        relation_label = self.prop_map[relation]
        txt_in_brackets = re.findall(r'\(.*\)', relation_label)
        for txt in txt_in_brackets:
            relation_label = relation_label.replace(txt, '').strip()
    except:
        return 0.0
    tokenizer = WordPunctTokenizer()
    rel_tokens_size = len(tokenizer.tokenize(relation_label.lower()))
    context_tokens = tokenizer.tokenize(context.lower())
    if rel_tokens_size > 1:
        context_ngram_tokens = [
            context_tokens[i:i + rel_tokens_size]
            for i in range(len(context_tokens) - rel_tokens_size + 1)
        ]
        context_tokens = context_ngram_tokens
    max_similarity = 0
    for c_token in context_tokens:
        sim = fuzz.token_sort_ratio(relation_label, c_token)
        if sim > max_similarity:
            max_similarity = sim
    if max_similarity == 100:
        return max_similarity * rel_tokens_size / 100
    return 0
def tfIdf():
    TFIDF_MIN_SCORE = 100
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    tfidf = []
    idfMap = create_idf_map()
    docs = collection.find()
    for d in docs:
        # Count term frequency over the full token list; iterating over a set
        # would leave every count at 1.
        tfMap = {}
        for word in tokenizer.tokenize(d['content'].lower()):
            if word not in tfMap:
                tfMap[word] = 1
            else:
                tfMap[word] += 1
        tfIdfValues = []
        for word in set(tokenizer.tokenize(d['content'].lower())):
            score = tfMap[word] * 1000 / idfMap[word]
            if score > TFIDF_MIN_SCORE:
                tfIdfValues.append((word, score))
        tfIdfValues = sorted(tfIdfValues, key=lambda x: x[1], reverse=True)
        d['tfidf'] = tfIdfValues
        tfidf.append({'d': d, 'tfidf': tfIdfValues})
        collection.save(d)
    genFreq = generaral_frequency(idfMap)
    return render_template("tfidf.html", documents=tfidf)
def eng_seg():
    # Punctuation-based word segmentation
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    print(tokenizer.tokenize("don't do that!"))
    path = "D:\\nlp语料\\机器翻译语料\\english.raw.sample.txt"
    f = open(path, "r")
    '''
    text = f.read()
    splChars = set()
    for ch in text:
        if (ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z'):
            pass
        else:
            splChars.add(ch)
    print(splChars)
    '''
    lines = f.readlines()
    print(len(lines))
    line_tokenized = []
    split_char = " "
    for line in lines:
        line_tokenized.append(split_char.join(tokenizer.tokenize(line)))
    f2 = open("D:\\nlp语料\\机器翻译语料\\english.raw.sample.seg.txt", "w")
    for line in line_tokenized:
        f2.write(line + "\n")
    f.close()
    f2.close()
def class1():
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    docId = request.args.get('d')
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    featuresets = []
    tagSet = set()
    for d in collection.find():
        bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
        if 'tags' not in d:
            continue
        for tag in d['tags']:
            featuresets.append((bagOfWords, tag))
            tagSet.add(tag)
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    d = collection.find_one({'_id': ObjectId(docId)})
    # classifier.show_most_informative_features(100)
    cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
    probs = []
    for tag in tagSet:
        probs.append((tag, round(cl.prob(tag) * 100)))
    classifier.show_most_informative_features(n=20)
    probs = sorted(probs, key=lambda x: x[1], reverse=True)
    return render_template('class1.html', probs=probs, d=d)
class AbstractStemmer(object):

    def __init__(self):
        super(AbstractStemmer, self).__init__()
        self.tokenizer = WordPunctTokenizer()
        self.vocab = set()
        self.basename = 'nostemmer'

    def stem_query(self, q):
        # isword = re.compile('[a-z0-9]+')
        q = utils.clean(q)
        curr_words = self.tokenizer.tokenize(q)
        clean_words = [word.lower() for word in curr_words]
        processed_words = self.process(clean_words)
        self.vocab.update(processed_words)
        return ' '.join(processed_words)

    def stem(self, files):
        # We write files to a -[stemmer].txt file
        filename_mod = files[0].split('.')[0]
        wf = codecs.open('{1}-{0}.txt'.format(self.basename, filename_mod),
                         'w', encoding='utf-8')
        isword = re.compile('[a-z0-9]+')
        # We can work with both gzip and non-gzip
        for fname in files:
            if fname.endswith('gz'):
                f = gzip.open(fname, 'r')
            else:
                f = open(fname)
            for no, line in enumerate(f):
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                # We drop empty lines
                if len(line.strip()) == 0:
                    continue
                # Clean and process words
                curr_words = self.tokenizer.tokenize(line)
                clean_words = [word.lower() for word in curr_words]
                processed_words = self.process(clean_words)
                # Keep track of vocab size
                self.vocab.update(processed_words)
                # We output according to the one-doc-per-line format for Mallet
                current_line = u' '.join(processed_words)
                line_fmt = '{0}\n'.format(current_line)
                wf.write(line_fmt)
            f.close()
        print('Resulting vocab size: {0}'.format(len(self.vocab)))
        wf.close()

    def process(self, words):
        raise NotImplementedError("No stemmer here!")
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
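# Illustrative input, not part of the original snippet: as written, the function
# expects the input file to hold a JSON array of records with 'content' and
# 'abstract' fields, e.g.
#   [{"content": "full document text ...", "abstract": "short summary ..."}, ...]
# and it writes the sorted vocabulary (words seen at least 5 times, plus the
# PADDING and UNKNOWN markers) as a single JSON line to the output file.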
def w2v_training_sents(self, dataList, trainID):
    word_punct_tokenizer = WordPunctTokenizer()
    x = []
    for currentId in trainID:
        currentData = dataList[currentId]
        currentSent = currentData[2]
        currentPreList = currentData[3]
        currentLatList = currentData[4]
        x.append(' '.join(word_punct_tokenizer.tokenize(currentSent)))
        for item in currentPreList:
            x.append(' '.join(word_punct_tokenizer.tokenize(item)))
        for item in currentLatList:
            x.append(' '.join(word_punct_tokenizer.tokenize(item)))
    return x
def data_cleaner(text):
    tokenizer = WordPunctTokenizer()
    pat_1 = r"(?:\@|https?\://)\S+"
    pat_2 = r'#\w+ ?'
    combined_pat = r'|'.join((pat_1, pat_2))
    www_pat = r'www.[^ ]+'
    html_tag = r'<[^>]+>'
    negations_ = {"isn't": "is not", "can't": "can not", "couldn't": "could not",
                  "hasn't": "has not", "hadn't": "had not", "won't": "will not",
                  "wouldn't": "would not", "aren't": "are not", "haven't": "have not",
                  "doesn't": "does not", "didn't": "did not", "don't": "do not",
                  "shouldn't": "should not", "wasn't": "was not", "weren't": "were not",
                  "mightn't": "might not", "mustn't": "must not"}
    negation_pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
    try:
        stripped = re.sub(combined_pat, '', text)
        stripped = re.sub(www_pat, '', stripped)
        cleantags = re.sub(html_tag, '', stripped)
        lower_case = cleantags.lower()
        neg_handled = negation_pattern.sub(lambda x: negations_[x.group()], lower_case)
        # remove_punctuation is expected to be defined at module level
        if remove_punctuation:
            letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
            tokens = tokenizer.tokenize(letters_only)
        else:
            tokens = tokenizer.tokenize(neg_handled)
        return (" ".join(tokens)).strip()
    except:
        return 'NC'
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    # review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message numbers
    message_text = re.sub(">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)
            data.append([cleaned_sentences, label])
    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        # Undo basic HTML escaping
        word = (word.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
                    .replace('&quot;', '"').replace('&#39;', "'"))
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        word = word.encode('ascii', 'ignore')
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()
    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))
                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
def number_of_different_words(self):
    # TODO: Stemming, then move to language specific classes
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(self.text.strip())
    only_textual_words = filter(unicode.isalpha, words)
    return len(set(only_textual_words))
def test_name():
    filename = "name.txt"
    name_file = 'tests/test_files/' + filename
    output_dir = 'tests/test_files/redacted/'
    main.init_stats(name_file, 0, None)
    # Get test file
    content = main.get_file_contents(name_file)
    # Used to split the file for POS analysis
    word_punct_tokenizer = WordPunctTokenizer()
    tagged_content = nltk.pos_tag(word_punct_tokenizer.tokenize(content))
    # Redact
    content = main.redact_names(content, tagged_content, name_file)
    # X named words in file
    assert (main.num_names[name_file] == 22)
    # Create path
    if not os.path.isdir(output_dir):
        sys.stderr.write("Output directory did not exist...creating " + output_dir + "/\n")
        os.makedirs(output_dir)
    # Write out the redacted test file for reference
    main.write_redacted(content, name_file, output_dir)
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    # review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message numbers
    message_text = re.sub(">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, re.UNICODE)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if word and word.isalpha():
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
def tokenize_words(sentence):
    """
    :param sentence: sentence string
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
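# Illustrative usage, not part of the original snippet: WordPunctTokenizer splits on
# the regex \w+|[^\w\s]+, so apostrophes and punctuation come out as separate tokens.
# >>> tokenize_words("Don't panic, it's fine!")
# ['Don', "'", 't', 'panic', ',', 'it', "'", 's', 'fine', '!']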
def test_concept():
    filename = "concept.txt"
    concept_file = 'tests/test_files/' + filename
    output_dir = 'tests/test_files/redacted/'
    main.init_stats(concept_file, 0, None)
    # Get test file
    content = main.get_file_contents(concept_file)
    # Used to split the file for POS analysis
    word_punct_tokenizer = WordPunctTokenizer()
    tagged_content = nltk.pos_tag(word_punct_tokenizer.tokenize(content))
    # Make required dot structure.
    # See https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    arg = {"concept": ["child"]}
    args = temp(arg)
    # Redact
    content = main.redact_concept(content, concept_file, args)
    # X concept words in file
    assert (main.num_concept[concept_file] == 12)
    # Create path
    if not os.path.isdir(output_dir):
        sys.stderr.write("Output directory did not exist...creating " + output_dir + "/\n")
        os.makedirs(output_dir)
    # Write out the redacted test file for reference
    main.write_redacted(content, concept_file, output_dir)
def _tokenize(self, text):
    tk = WordPunctTokenizer()
    result = tk.tokenize(text)
    if DEBUG:
        print("Result after tokenizing: "),
        print(result)
    return result
def to_index(vocab, texts, add_os=True):
    words_indices = []
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    # maxlen = 0
    for text in texts:
        words = tokenizer.tokenize(text)
        lemmas = [lemmatizer.lemmatize(w) for w in words]
        # maxlen = max(maxlen, len(lemmas))
        words_index = []
        if add_os is True:
            words_index.append(SOS_ID)  # start of sentence
        for lemma in lemmas:
            if lemma in vocab:
                words_index.append(vocab[lemma])
            else:
                words_index.append(UNK_ID)  # <unk>
        if add_os is True:
            words_index.append(EOS_ID)  # end of sentence
        words_indices.append(words_index)
    # print maxlen
    return words_indices
def get_vocab(data_list, vocab_size=None):
    # Process the data and count word frequencies
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    all_words = []
    for record in data_list:
        for passage in record['passages']:  # use all passages, not only the selected ones
            passage_words = tokenizer.tokenize(passage['passage_text'])  # tokenize; whether to lowercase is still open
            passage_lemma_words = [lemmatizer.lemmatize(w) for w in passage_words]  # lemmatize
            all_words.extend(passage_lemma_words)
    vocab_dict = collections.Counter(all_words)
    # Keep the vocab_size most frequent words. No extra <unk> entry is needed here:
    # out-of-vocabulary words are simply mapped to the UNK id at index time.
    # (This holds when embeddings are trained jointly; with pre-trained embeddings,
    # words missing from the table get a zero vector.)
    if vocab_size is not None:
        vocab_list = vocab_dict.most_common(vocab_size)  # already sorted in descending order
    else:
        vocab_list = vocab_dict.most_common()  # all elements
    print len(vocab_list)
    # Note: '<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3 are not actually inserted;
    # they are represented directly by those ids at index time.
    vocab = {}
    i = EOS_ID + 1
    for v in vocab_list:
        vocab[v[0]] = i  # keep only the word and its position index
        i += 1
    # print vocab
    # Common words dominate; consider removing stopwords and punctuation.
    return vocab
def clean_tweets(tweet):
    """
    Clean tweets before sending them to the sentiment analysis API.

    Note: the Google API is fairly tolerant when doing sentiment analysis,
    so not all of these cleanup steps are strictly necessary.

    Args:
        tweet: tweet (or text) to clean.

    Returns:
        clean_tweet: cleaned tweet, ready for sentiment analysis.
    """
    # Remove the user mention from the tweet
    user_removed = re.sub(r'@[A-Za-z0-9]+', '', tweet.decode('utf-8'))
    # Remove any link present in the tweet
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', user_removed)
    # Lowercase everything
    lower_case_tweet = link_removed.lower()
    # Instantiate a tokenizer and build the token list according to its rules
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    # Join the tokens back into a single string to be sent
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet
def get_feeds(name):
    tweets_list = []
    if consumer_key == '':
        f = open("example.txt")
        ttt = f.readlines()
        for t in ttt:
            tweets_list.append(t)  # keep each line as one tweet string
        return tweets_list
    else:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)
        api = tweepy.API(auth)
        new_tweet = api.user_timeline(screen_name=name, count=50)
        tweets_list.extend(new_tweet)

    # Processing the tweets
    cleaned_tweets_list = []  # All cleaned tweets are stored in this list
    for status in tweets_list:
        tweet_i = status.text.encode('utf-8')
        removed = re.sub(r'@[A-Za-z0-9]+', '', tweet_i.decode('utf-8'))
        link_rm = re.sub('https?://[A-Za-z0-9./]+', '', removed)
        number_rm = re.sub('[^a-zA-Z]', ' ', link_rm)
        lower = number_rm.lower()
        tok = WordPunctTokenizer()
        words = tok.tokenize(lower)
        cleaned = (' '.join(words)).strip()
        cleaned_tweets_list.append(cleaned)
    return cleaned_tweets_list
def normalize(cls, input_doc, language="english"):
    '''
    Normalize given input.
    '''
    # Remove special chars (note: the same NON_ALPHA_GER pattern is currently
    # used for both languages)
    if language == "german":
        processed_doc = re.sub(cls.NON_ALPHA_GER, '', input_doc)
    else:
        processed_doc = re.sub(cls.NON_ALPHA_GER, '', input_doc)
    # Lowercase and strip whitespace
    processed_doc = processed_doc.lower().strip()
    # Tokenize
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(processed_doc)
    # Remove stopwords
    if language == "german":
        stop = stopwords.words("german")
        cleaned_tokens = [token for token in tokens if token not in stop]
    else:
        stop = stopwords.words("english")
        cleaned_tokens = [token for token in tokens if token not in stop]
    processed_doc = ' '.join(cleaned_tokens)
    return processed_doc
def clean_text(text):
    """
    A function to pre-process text

    Parameters
    ----------
    text : string
        the string to be processed

    Returns
    -------
    text : string
        a clean string
    """
    tok = WordPunctTokenizer()
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
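# Illustrative example, not part of the original snippet: with the patterns above,
# mentions and URLs are stripped and anything non-alphabetic becomes whitespace, so a
# short tweet reduces to lowercase words (assuming BeautifulSoup returns the plain
# input text unchanged):
# >>> clean_text("@user Check https://t.co/abc :) #happy")
# 'check happy'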
def compute_word_context_matrix(self, window=5):
    tokenizer = WordPunctTokenizer()
    self.word_context_matrix = np.zeros(
        (len(self.corpus.vocabulary), len(self.corpus.vocabulary)))
    for doc_id in range(self.corpus.size):
        print doc_id
        document = self.corpus.full_content(doc_id)
        terms = tokenizer.tokenize(document)
        nb_terms = len(terms)
        for i in range(nb_terms):
            row_index = self.corpus.id_for_word(terms[i])
            if row_index != -1:
                start = i - window
                if start < 0:
                    start = 0
                end = i + window
                if end >= nb_terms:
                    end = nb_terms - 1
                context0 = terms[start:i]
                context1 = terms[i + 1:end + 1]
                context0.extend(context1)
                for term in context0:
                    column_index = self.corpus.id_for_word(term)
                    if column_index != -1:
                        self.word_context_matrix[row_index][column_index] += 1
def tag(self, sent):
    times = self.find_time(sent)
    intervals = dict([(time[0], time[1]) for time in times])
    tag_dict = dict([(time[2], time[3]) for time in times])
    tokenizer = WordPunctTokenizer()
    # for a in [time[2] for time in times]:
    #     tokenizer.add_mwe(a.split())
    # --- FIXED ---
    original_tokens = tokenizer.tokenize(sent)
    original_tags = pos_tag(original_tokens)
    # --- END FIXED ---
    tokens = []
    current = 0
    for span in tokenizer.span_tokenize(sent):
        if span[0] < current:
            continue
        if span[0] in intervals:
            tokens.append(f'__{sent[span[0]: intervals[span[0]]]}')
            current = intervals[span[0]]
        else:
            tokens.append(sent[span[0]:span[1]])
            current = span[1]
    tags = pos_tag(tokens)
    new_tags = []
    for word, tag in tags:
        if word[:2] == '__':
            new_tags.append((word[2:], tag_dict[word[2:]]))
        else:
            tag = [t[1] for t in original_tags if t[0] == word][0]  # FIXED
            new_tags.append((word, tag))
    return new_tags
def transferDataw2v(self, allLabeledList, trainID, alpha=0.1):
    word_punct_tokenizer = WordPunctTokenizer()
    total = [0] * len(self.avilableLabels)
    xtrain = []
    pretrain = []
    lattrain = []
    ytrain = []
    for currentId in trainID:
        currentData = allLabeledList[currentId]
        currentLabel = currentData[1]
        currentSent = currentData[2]
        currentPreList = currentData[3]
        currentLatList = currentData[4]
        if currentLabel in self.avilableLabels:
            idx = self.avilableLabels.index(currentLabel)
            total[idx] += 1
            binLabel = label_binarize([currentLabel], self.avilableLabels).tolist()[0]
            w2vList, fofeCode = self.sentW2v(
                word_punct_tokenizer.tokenize(currentSent), self.ebd_size)
            xtrain.append(w2vList)
            prefofe = self.w2vEncoding(currentPreList, word_punct_tokenizer,
                                       sentLevelAlpha=alpha)
            latfofe = self.w2vEncoding(currentPreList, word_punct_tokenizer,
                                       reverse=True, sentLevelAlpha=alpha)
            pretrain.append(prefofe)
            lattrain.append(latfofe)
            ytrain.append(binLabel)
    return xtrain, pretrain, lattrain, ytrain, total
def get_matrix_of_concatenated_document_embeddings(embeddings, n_dim, texts,
                                                   token_limit=20, stop_words=[''],
                                                   scale=False):
    """
    :param embeddings: mapping from token to embedding vector
    :param n_dim: dimensionality of the embeddings
    :param texts: list of documents
    :param token_limit: number of tokens per document to concatenate
    :param stop_words: tokens to skip
    :param scale: whether to scale each embedding
    :return: matrix of shape (len(texts), token_limit * n_dim)
    """
    scaler = preprocessing.MaxAbsScaler()
    # scaler = preprocessing.MinMaxScaler()
    tokenizer = WordPunctTokenizer()
    matrix = np.zeros((len(texts), token_limit * n_dim))
    for i_texts in range(0, len(texts)):
        tokens = tokenizer.tokenize(texts[i_texts])
        tmp = []
        for i_token in range(0, token_limit):
            cur_embedding = [0] * n_dim
            # if the text still has tokens left, the current token is in the embeddings,
            # and it is not on the stop word list
            if (i_token < len(tokens) and tokens[i_token] in embeddings.keys()
                    and tokens[i_token] not in stop_words):
                tmp_embedding = (scaler.fit_transform(embeddings[tokens[i_token]])
                                 if scale else embeddings[tokens[i_token]])
                cur_embedding = tmp_embedding.tolist()
            tmp += cur_embedding
        matrix[i_texts] = np.array(tmp)
    return matrix
def lemmatize(text):
    word_punct_tokenizer = WordPunctTokenizer()
    tokens = word_punct_tokenizer.tokenize(text)
    lem = WordNetLemmatizer()
    ps = PorterStemmer()
    return [lem.lemmatize(w.lower()) for w in tokens]
    # [lem.lemmatize(ps.stem(w.lower())) for w in tokens]
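# Illustrative behaviour, not part of the original snippet: WordNetLemmatizer defaults
# to noun lemmas, so plural nouns are reduced while most verb forms pass through.
# >>> lemmatize("The cats are running")
# ['the', 'cat', 'are', 'running']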
class w2vModel:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        # Load the pre-trained word2vec model
        self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin', binary=True)

    # Compute the distance between two short texts
    def getWordDistance(self, word1, word2):
        if word1 == '' or word2 == '':
            return 0
        A = self.tokenizer.tokenize(word1)  # tokenize
        B = self.tokenizer.tokenize(word2)
        scores = []
        for w1 in A:
            ss = []
            for w2 in B:
                try:
                    ss.append(self.word2VecModel.similarity(w1, w2))
                except:
                    if w1 == w2:
                        ss.append(1)
                    else:
                        ss.append(0)
            scores.append(ss)
        La = 0
        Lb = 0
        for i in range(len(A)):
            La += max(scores[i])
        La /= len(A)
        for i in range(len(B)):
            maxnum = 0
            for j in range(len(A)):
                maxnum = scores[j][i] if scores[j][i] > maxnum else maxnum
            Lb += maxnum
        Lb /= len(B)
        return (La + Lb) / 2
class CocoDataset(Dataset):
    def __init__(self, image_dir, caption_dir, n_samples=5000, transform=None):
        self.image_dir = image_dir
        self.caption_dir = caption_dir
        self.transform = transform
        self.file_names = os.listdir(self.image_dir)[:n_samples]
        self.word_tokenizor = WordPunctTokenizer()
        self.id_to_captions = {}
        for id_caption in open(caption_dir, encoding='utf-8').read().strip().split('\n'):
            if len(id_caption.split('\t')) == 2:
                id, caption = id_caption.split('\t')
                self.id_to_captions[id] = caption.lower()

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        image = Image.open(os.path.join(self.image_dir, self.file_names[idx]))
        image = image.convert('RGB')
        caption = [
            word_to_id[word] if word in word_to_id.keys() else word_to_id['UNK']
            for word in self.word_tokenizor.tokenize(
                self.id_to_captions[self.file_names[idx]])
        ]
        caption = torch.Tensor(caption + [word_to_id['<EOS>']], device=device).view(-1, 1)
        caption = caption.long()
        if self.transform:
            image_new = self.transform(image)
        sample = {'image': image_new, 'caption': caption}
        return sample
class fred_language_analyser(language_analyser):
    '''
    A custom analyser based on nltk with a crude brute-force algorithm
    '''

    def __init__(self, language='french'):
        '''Initialisation
        language : 'french'
        '''
        self.tokenizer = WordPunctTokenizer()
        self.stopwords = set(stopwords.words(language))
        self.stopwords.add(u"'")

    def text_to_vector(self, text):
        tokens = self.tokenizer.tokenize(text)
        tokens = [token for token in tokens if token.lower() not in self.stopwords]
        return tokens

    def distance(self, text1, text2):
        v1 = self.text_to_vector(text1)
        v2 = self.text_to_vector(text2)
        # Pending optimisation, limit to 6 words
        v1 = v1[0:6]
        v2 = v2[0:6]
        n = max(len(v1), len(v2))
        if len(v1) > len(v2):
            v1, v2 = v2, v1
        v1_1 = v1 + [None] * (n - len(v1))
        distance = 99
        for v1_2 in itertools.permutations(v1_1):  # a bit crude: Nones get permuted with Nones too
            # Distance between words
            d_mot = 0
            for i in range(n):
                try:
                    d_mot += (6 - min(6, edit_distance(v1_2[i], v2[i]))) ** 2
                except:
                    d_mot += 1  # if None
            d_mot = 6 * (n ** 0.5) - d_mot ** 0.5  # distance for this permutation
            # Number of inserted Nones = number of Nones that are neither at the start nor at the end
            v1_3 = []
            debut = True
            for m in v1_2:
                if m or not debut:
                    debut = False
                    v1_3.append(m)
            v1_4 = []
            debut = True
            for i in range(len(v1_3) - 1, -1, -1):
                if v1_3[i] or not debut:
                    debut = False
                    v1_4.append(v1_3[i])
            d_perm = len(v1_4) - len(v1)
            # Word swaps: cost 3 per swap
            l = []
            for m in list(filter(lambda x: x, v1_4)):
                l.append(v1.index(m))
            for i in range(len(l) - 1):
                if l[i] < l[i + 1]:
                    d_perm += 3
            distance = min(distance, (d_mot ** 2 + d_perm ** 2) ** 0.5)
        return distance
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    try:
        iterator = elt.getiterator()
    except:
        iterator = elt.iter()
    out = []
    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
def TextProcessor(src, tgt, low=True, num=True):
    print "processing " + src
    if low == True:
        print "lowercasing.."
    if num == True:
        print "removing numeric.."
    srcfile = codecs.open(src, "r", "utf-8")
    tgtfile = codecs.open(tgt, "w", "utf-8")
    word_punct_tokenizer = WordPunctTokenizer()
    linecount = 0
    for line in srcfile:
        linecount += 1
        line = word_punct_tokenizer.tokenize(line)
        if low == True:
            for i in range(0, len(line)):
                line[i] = line[i].lower()
        if num == True:
            for i in range(0, len(line)):
                if line[i].isnumeric() == True:
                    line[i] = "<number>"
        tgtfile.write(listtostring(line))
    srcfile.close()
    tgtfile.close()
    print "done processing " + str(linecount) + " lines!!"
class SentencesIterator(object):
    def __init__(self, dirname):
        self.dirname = dirname
        self.tokenizer = WordPunctTokenizer()

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            date = get_date(fname)
            with open(os.path.join(self.dirname, fname), 'r') as f:
                text = f.read()
            text = text.replace('Donald Trump', 'Donald_Trump')
            text = text.replace('Melania Trump', 'Melania_Trump')
            text = text.replace('Ivanka Trump', 'Ivanka_Trump')
            text = text.replace('Eric Trump', 'Eric_Trump')
            if date and date < ANNOUNCEMENT_DATE:
                text = text.replace('Trump', 'Trump_Pre_Campaign')
            elif date and date < ELECTION_DATE:
                text = text.replace('Trump', 'Trump_Pre_Election')
            elif date and date >= ELECTION_DATE:
                text = text.replace('Trump', 'Trump_Post_Election')
            text = text.replace("\xa0", " ").replace('“', '"').replace('”', '"')
            sents = sent_tokenize(text)
            for sent in sents:
                yield self.tokenizer.tokenize(sent)
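# Illustrative usage, not part of the original snippet: an iterator that yields token
# lists like this one can be passed straight to gensim, which re-iterates it for each
# training pass (the directory name below is hypothetical).
# from gensim.models import Word2Vec
# model = Word2Vec(SentencesIterator('articles/'), min_count=5)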
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif inside_xml_privileges:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            inside_xml_privileges = False
            continue
        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
def main_wrapper():
    model, enc, device = init_model(42, "gpt2-xl")
    messages = []
    questions = np.load('all_questions.npy')[-30000:]
    answers = np.load('gpt_answers.npy').tolist()
    tokenizer = WordPunctTokenizer()
    for i in range(len(questions) // 30 + 1):
        for j, question in enumerate(questions[30 * i:30 * (i + 1)]):
            print("\n")
            # input_text = input("Enter your message here: ")
            output_text = produce_answer(question, messages, 30, 10, 1.0, False,
                                         model, enc, device, insert_intro=True,
                                         wrap_type='QA')
            output_text = " ".join(tokenizer.tokenize(output_text)[:30])
            print(j + 30 * i)
            print(question)
            print(output_text)
            answers.append(output_text)
        np.save('gpt_answers.npy', np.array(answers))
def get_words_without_stopwords(self, text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    tokenizer = WordPunctTokenizer()
    tokens = [token.lower().strip(string.punctuation)
              for token in tokenizer.tokenize(text)
              if token.lower().strip(string.punctuation) not in stopwords]
    return tokens
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in 'section//p' text nodes.
    """
    elt = self.xml(fileid).iterfind('.//section//p')
    word_tokenizer = WordPunctTokenizer()
    return [val
            for subl in [word_tokenizer.tokenize(nodetext)
                         for nodetext in [''.join(el.itertext()) for el in elt]]
            for val in subl]
def extract_words(text):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def get_similarity_score(a, b):
    """Check how closely a and b match, using Jaccard similarity."""
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    tokenizer = WordPunctTokenizer()
    tokens_a = [token.lower().strip(string.punctuation)
                for token in tokenizer.tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation)
                for token in tokenizer.tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    # Calculate Jaccard similarity
    ratio = 0
    if len(set(tokens_a).union(tokens_b)) > 0:
        ratio = len(set(tokens_a).intersection(tokens_b)) / float(len(set(tokens_a).union(tokens_b)))
    return ratio
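# Worked example, not part of the original snippet: after stopword and punctuation
# removal the two sentences below share {'quick', 'brown'} out of a union of four
# distinct tokens, so the Jaccard score is 2/4.
# >>> get_similarity_score("The quick brown fox", "A quick brown dog")
# 0.5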
def get_tokens(sentence):
    """
    Tokenizes a sentence into words
    :param sentence: sentence string
    :return: list of tokens
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
def getBigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    bcf = BigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf.apply_word_filter(filter_stops)
    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
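# Illustrative usage, not part of the original snippet: the finder scores adjacent
# word pairs by likelihood ratio and, after filtering stopwords and words shorter
# than three characters, returns at most the four highest-scoring collocations as
# (word, word) tuples (the file path below is hypothetical).
# top_pairs = getBigram(open('corpus.txt').read())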
def tokenize(text):
    tokens = tokenizer.tokenize(text)
    wordtokenizer = WordPunctTokenizer()
    wlist = []
    for token in tokens:
        wtoken = wordtokenizer.tokenize(token)
        wlist = wlist + wtoken
    stems = stem_tokens(wlist, stemmer)
    return stems
def extract_words(text):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def get_bigrams(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = []
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    return tokens
def you_collocations(raw):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(raw)
    bigrams = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
    collocations = [(t1, t2) for (t1, t2) in bigrams if t1 == "you" or t1 == 'your']
    trigrams = [(tokens[i], tokens[i + 1], tokens[i + 2]) for i in range(len(tokens) - 2)]
    trilocations = [(t1, t2, t3) for (t1, t2, t3) in trigrams if t1 == "you" or t1 == 'your']
    return collocations, trilocations
def extract_bigrams(text):
    text = remove_stopwords(text)
    tokenizer = WordPunctTokenizer()
    tokens = [token for token in set(tokenizer.tokenize(text))
              if not is_number(token) and (is_valid_token(token) or is_name(token))]
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.dice, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [x.lower() for x in tokens
              if x not in stopwords.words("english") and len(x) > 3]
    return result
def decisionTreeClassifier():
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    docId = request.args.get('d')
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    featuresets = []
    tagSet = set()
    for d in collection.find():
        bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
        if 'tags' not in d:
            continue
        for tag in d['tags']:
            featuresets.append((bagOfWords, tag))
            tagSet.add(tag)
    classifier = nltk.DecisionTreeClassifier.train(featuresets)
    print classifier.pseudocode(depth=4)
    d = collection.find_one({'_id': ObjectId(docId)})
    print classifier.classify(bag_of_words(tokenizer.tokenize(d['content'])))
    return 'hello'
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with gzip.open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            # dictionary.update(text.split())
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 3)) + ["PADDING", "UNKNOWN"]
    with open(output_file_name, "w") as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def extract_words(text):
    stemmer = PorterStemmer()
    if type(text) == str:
        text = unicode(text, "utf-8", errors="ignore")
    else:
        text = unicode(text)
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def analyze(tweets):
    classifier = cache.get('classifier')
    if classifier is None:
        classifier = train_classifier()
        cache.set('classifier', classifier, None)
    tokenizer = WordPunctTokenizer()
    analyzed_tweets = []
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.lower())
        featureset = word_feats(tokens)
        sentiment = classifier.prob_classify(featureset)
        analyzed_tweets.append(AnalyzedTweet(tweet,
                                             round(sentiment.prob('pos'), 2),
                                             round(sentiment.prob('neg'), 2)))
    return analyzed_tweets
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    # dictionary = list(sorted(w for w, c in dictionary.most_common(3000))) + ['PADDING', 'UNKNOWN']
    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print 'Something went wrong when opening/parsing an xml file, or opening the output file'
        return

    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)

    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print 'entity mention head span not found', span, apf_path

    for t in ts:
        # print t.text, t.span
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
def OnButtonClick():
    file = tkFileDialog.askopenfile(parent=root, mode='rb', title='Select a file')
    if file != None:
        print "Initializing... Please Wait"
        ini_db()
        file_list = file.readlines()
        for line in file_list:
            line = line.strip()
            fp1 = open(line, "r")
            document_count()
            text = fp1.read()
            # dictionary to store word frequency in text (temporary)
            doc_word_freq = {}
            # Tokenize
            from nltk.tokenize import WordPunctTokenizer
            tokenizer = WordPunctTokenizer()
            text2 = tokenizer.tokenize(text)
            # removing stopwords
            from nltk.corpus import stopwords
            eng_stop = set(stopwords.words('english'))
            text3 = [word for word in text2 if word not in eng_stop]
            # pos tag
            import nltk
            text4 = nltk.pos_tag(text3)
            text5 = filter_for_tags(text4)
            # calculate frequency of word in the text
            for word in text5:
                if word in doc_word_freq:
                    doc_word_freq[word] += 1
                else:
                    if word != "'":
                        doc_word_freq[word] = 1
            # update occurrence of word in global table
            for (word, freq) in doc_word_freq.items():
                if check(word):
                    update_record(word)
                else:
                    add_new_word(word)
        print "Initialization Done...\n\n"
        file.close()
def word_tokenizePT(self, text, tokenizer):
    """
    Tokenize a Portuguese sentence into words

    @input params:
        text - a sentence or phrase
        tokenizer - "TB" for TreebankWordTokenizer
                    "WP" for WordPunctTokenizer
    @returns word list or error
    """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"
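# Illustrative comparison, not part of the original snippet: the two tokenizers differ
# mainly on clitics and contractions, e.g. for an English-style contraction:
# >>> TreebankWordTokenizer().tokenize("don't")
# ['do', "n't"]
# >>> WordPunctTokenizer().tokenize("don't")
# ['don', "'", 't']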