def number_of_different_words(self):
    # TODO: Stemming, then move to language specific classes
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(self.text.strip())
    only_textual_words = filter(unicode.isalpha, words)
    return len(set(only_textual_words))
def tokens(self):
    """Tokenize the text."""
    tokenizer = WordPunctTokenizer()
    # Get token character spans.
    spans = list(tokenizer.span_tokenize(self.text))
    # Materialize the token stream.
    tokens = [self.text[c1:c2] for c1, c2 in spans]
    tags = pos_tag(tokens)
    return [
        Token(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )
        for (c1, c2), token, (_, pos) in zip(spans, tokens, tags)
    ]
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif inside_xml_privileges:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            inside_xml_privileges = False
            continue
        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
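# A rough usage sketch for extract_nl_text(), assuming the function above is in
# scope and the NLTK 'wordnet' corpus is downloaded. The MalwareSample stand-in
# below is purely hypothetical; only the `strings` attribute is modelled.
from collections import namedtuple

FakeSample = namedtuple("FakeSample", ["strings"])

sample = FakeSample(strings=[
    "GetProcAddress",                           # API name: no WordNet hit longer than 3 chars, dropped
    "your files have been encrypted",           # natural-language string: kept
    "This program cannot be run in DOS mode",   # explicitly skipped by the filter
])
print(extract_nl_text(sample))
# roughly: [['your', 'files', 'have', 'been', 'encrypted']]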
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Convert a document to a sequence of words, optionally removing
    # stop words. Returns a list of words.
    #
    # 1. Remove HTML
    # review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message-number references
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, flags=re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
def TextProcessor(src, tgt, low=True, num=True):
    print "processing " + src
    if low == True:
        print "lowercasing.."
    if num == True:
        print "removing numeric.."
    srcfile = codecs.open(src, "r", "utf-8")
    tgtfile = codecs.open(tgt, "w", "utf-8")
    word_punct_tokenizer = WordPunctTokenizer()
    linecount = 0
    for line in srcfile:
        linecount += 1
        line = word_punct_tokenizer.tokenize(line)
        if low == True:
            for i in range(0, len(line)):
                line[i] = line[i].lower()
        if num == True:
            for i in range(0, len(line)):
                if line[i].isnumeric() == True:
                    line[i] = "<number>"
        tgtfile.write(listtostring(line))
    srcfile.close()
    tgtfile.close()
    print "done processing " + str(linecount) + " lines!!"
def tokenize_words(sentence):
    """
    :param sentence:
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
def class1():
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    docId = request.args.get('d')
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    featuresets = []
    tagSet = set()
    for d in collection.find():
        bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
        if 'tags' not in d:
            continue
        for tag in d['tags']:
            featuresets.append((bagOfWords, tag))
            tagSet.add(tag)
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    d = collection.find_one({'_id': ObjectId(docId)})
    # classifier.show_most_informative_features(100)
    cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
    probs = []
    for tag in tagSet:
        probs.append((tag, round(cl.prob(tag) * 100)))
    classifier.show_most_informative_features(n=20)
    probs = sorted(probs, key=lambda x: x[1], reverse=True)
    return render_template('class1.html', probs=probs, d=d)
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        # Decode common HTML entities back to their characters.
        word = word.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', "'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        word = word.encode('ascii', 'ignore')
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()
    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))
                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)
            data.append([cleaned_sentences, label])
    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
def tfIdf():
    TFIDF_MIN_SCORE = 100
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    docs = collection.find()
    tfidf = []
    idfMap = create_idf_map()
    docs = collection.find()
    for d in docs:
        tfMap = {}
        for word in set(tokenizer.tokenize(d['content'].lower())):
            if word not in tfMap:
                tfMap[word] = 1
            else:
                tfMap[word] += 1
        tfIdfValues = []
        for word in set(tokenizer.tokenize(d['content'].lower())):
            if (tfMap[word] * 1000 / idfMap[word]) > TFIDF_MIN_SCORE:
                tfIdfValues.append((word, tfMap[word] * 1000 / idfMap[word]))
        tfIdfValues = sorted(tfIdfValues, key=lambda x: x[1], reverse=True)
        d['tfidf'] = tfIdfValues
        tfidf.append({'d': d, 'tfidf': tfIdfValues})
        collection.save(d)
    genFreq = generaral_frequency(idfMap)
    return render_template("tfidf.html", documents=tfidf)
def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()
    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))
    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]
    # Tag parts-of-speech.
    tags = pos_tag(tokens)
    return [
        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )
        for (c1, c2), token, (_, pos) in zip(spans, tokens, tags)
    ]
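# A small self-contained sketch of the span-based pattern used above: take the
# character spans from span_tokenize() and slice the original string. The sample
# sentence is illustrative only.
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
text = "Dogs bark."
print(list(tok.span_tokenize(text)))                         # [(0, 4), (5, 9), (9, 10)]
print([text[c1:c2] for c1, c2 in tok.span_tokenize(text)])   # ['Dogs', 'bark', '.']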
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in 'section//p' text nodes.
    """
    elt = self.xml(fileid).iterfind('.//section//p')
    word_tokenizer = WordPunctTokenizer()
    return [val for subl in
            [word_tokenizer.tokenize(nodetext)
             for nodetext in [''.join(el.itertext()) for el in elt]]
            for val in subl]
def get_words_without_stopwords(self, text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    tokenizer = WordPunctTokenizer()
    tokens = [token.lower().strip(string.punctuation)
              for token in tokenizer.tokenize(text)
              if token.lower().strip(string.punctuation) not in stopwords]
    return tokens
def extract_words(text):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def getBigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    bcf = BigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf.apply_word_filter(filter_stops)
    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
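# Illustrative call of getBigram() (assumes the NLTK 'stopwords' corpus and the
# imports used by the snippet above). The sample text is made up and the top
# collocations depend entirely on the input; at most 4 bigrams come back.
text = ("machine learning is fun and machine learning is useful, "
        "machine learning powers many products")
print(getBigram(text))
# e.g. [('machine', 'learning'), ...]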
def get_tokens(sentence):
    """
    Tokenizes a sentence into words

    :param sentence: sentence string
    :return: list of tokens in the sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
def change_db2(text, origin_dict, id):
    print origin_dict
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print "MARKUP", markup_error_line
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
def tokenize(text):
    tokens = tokenizer.tokenize(text)
    wordtokenizer = WordPunctTokenizer()
    wlist = []
    for token in tokens:
        wtoken = wordtokenizer.tokenize(token)
        wlist = wlist + wtoken
    stems = stem_tokens(wlist, stemmer)
    return stems
def extract_words(text):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def you_collocations(raw):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(raw)
    bigrams = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
    collocations = [(t1, t2) for (t1, t2) in bigrams if t1 == "you" or t1 == 'your']
    trigrams = [(tokens[i], tokens[i + 1], tokens[i + 2]) for i in range(len(tokens) - 2)]
    trilocations = [(t1, t2, t3) for (t1, t2, t3) in trigrams if t1 == "you" or t1 == 'your']
    return collocations, trilocations
def extract_bigrams(text):
    text = remove_stopwords(text)
    tokenizer = WordPunctTokenizer()
    tokens = [token for token in set(tokenizer.tokenize(text))
              if not is_number(token) and (is_valid_token(token) or is_name(token))]
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.dice, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [x.lower() for x in tokens
              if x not in stopwords.words("english") and len(x) > 3]
    return result
def get_bigrams(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = []
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    return tokens
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def extract_words(text):
    stemmer = PorterStemmer()
    if type(text) == str:
        text = unicode(text, "utf-8", errors="ignore")
    else:
        text = unicode(text)
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopwords.words('english') and len(x) > 1]
    return result
def analyze(tweets):
    classifier = cache.get('classifier')
    if classifier is None:
        classifier = train_classifier()
        cache.set('classifier', classifier, None)
    tokenizer = WordPunctTokenizer()
    analyzed_tweets = []
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.lower())
        featureset = word_feats(tokens)
        sentiment = classifier.prob_classify(featureset)
        analyzed_tweets.append(AnalyzedTweet(tweet,
                                             round(sentiment.prob('pos'), 2),
                                             round(sentiment.prob('neg'), 2)))
    return analyzed_tweets
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    # dictionary = list(sorted(w for w, c in dictionary.most_common(3000))) + ['PADDING', 'UNKNOWN']
    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with gzip.open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            # dictionary.update(text.split())
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 3)) + ["PADDING", "UNKNOWN"]
    with open(output_file_name, "w") as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def OnButtonClick():
    file = tkFileDialog.askopenfile(parent=root, mode='rb', title='Select a file')
    if file != None:
        print "Initializing... Please Wait"
        ini_db()
        file_list = file.readlines()
        for line in file_list:
            line = line.strip()
            fp1 = open(line, "r")
            document_count()
            text = fp1.read()
            # dictionary to store word frequency in text (temporary)
            doc_word_freq = {}
            # Tokenize
            from nltk.tokenize import WordPunctTokenizer
            tokenizer = WordPunctTokenizer()
            text2 = tokenizer.tokenize(text)
            # removing stopwords
            from nltk.corpus import stopwords
            eng_stop = set(stopwords.words('english'))
            text3 = [word for word in text2 if word not in eng_stop]
            # pos tag
            import nltk
            text4 = nltk.pos_tag(text3)
            text5 = filter_for_tags(text4)
            # calculate frequency of each word in the text
            for word in text5:
                if word in doc_word_freq:
                    doc_word_freq[word] += 1
                else:
                    if word != "'":
                        doc_word_freq[word] = 1
            # update occurrence of word in global table
            for (word, freq) in doc_word_freq.items():
                if check(word):
                    update_record(word)
                else:
                    add_new_word(word)
        print "Initialization Done...\n\n"
        file.close()
def convert(sgm_path, apf_path, bio_path=None): xml_parser = etree.XMLParser(recover=True) try: sgm_tree = etree.parse(sgm_path, xml_parser) apf_tree = etree.parse(apf_path, xml_parser) if not bio_path: bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio' output = open(bio_path, 'w') except: print 'Something wrong when opening/parsing xml file, or opening output file' return init_offset = get_init_offset(sgm_path) text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n') tokenizer = WordPunctTokenizer() tokens = tokenizer.tokenize(text) spans = list(tokenizer.span_tokenize(text)) pos = pos_tag(tokens) ts = [] for i in range(len(tokens)): t = token() t.text = tokens[i] t.pos = pos[i][1] t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset) t.bio = 'O' ts.append(t) entits = apf_tree.xpath('/source_file/document/entity') for enty in entits: enty_type = enty.get('TYPE') mentions = enty.xpath('entity_mention') for m in mentions: head = m.xpath('head')[0] span = (int(head[0].get('START')), int(head[0].get('END'))) found = False for t in ts: if t.span[0] == span[0]: t.bio = 'B-' + enty_type found = True if t.span[0] > span[0] and t.span[1] <= span[1]: t.bio = 'I-' + enty_type found = True if not found: print 'entity mention head span not found', span, apf_path for t in ts: #print t.text, t.span output.write('\t'.join([t.text, t.pos, t.bio]) + '\n') output.close()
def word_tokenizePT(self, text, tokenizer):
    """
    tokenize a portuguese sentence in words

    @input params:
        text - a sentence, a phrase
        tokenizer - "TB" for TreebankWordTokenizer
                    "WP" for WordPunctTokenizer
    @returns word's list or error
    """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"
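# A quick illustration of the two branches of word_tokenizePT(); the Portuguese
# phrase is made up, and the tokenizers are called directly here instead of
# through whatever class defines the method in the original project.
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

text = "Olá, tudo bem?"
print(TreebankWordTokenizer().tokenize(text))  # "TB" branch
print(WordPunctTokenizer().tokenize(text))     # "WP" branch: ['Olá', ',', 'tudo', 'bem', '?']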
def wordtokenizer(sentence):
    words = WordPunctTokenizer().tokenize(sentence)
    return words
def __init__(self):
    self.tokenizer = WordPunctTokenizer()
    # Load the pre-trained word2vec model
    self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
                 "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
                 "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
                 "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
                 "mightn't": "might not", "mustn't": "must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
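# A small self-contained check of the negation-expansion step used in
# tweet_cleaner() above, with a trimmed-down dictionary; the sample text is
# illustrative only.
import re

negations = {"isn't": "is not", "don't": "do not", "can't": "can not"}
pattern = re.compile(r'\b(' + '|'.join(negations.keys()) + r')\b')

print(pattern.sub(lambda m: negations[m.group()], "i don't know, it isn't fair"))
# i do not know, it is not fair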
def main(): tokenizer = WordPunctTokenizer() posts = [] # 'questions-textonly.txt' with open(sys.argv[1], 'r') as f: for line in f: line = line[:-1] posts.append(line) # 'answers-textonly.txt' with open(sys.argv[2], 'r') as f: for line in f: line = line[:-1] posts.append(line) #print(len(posts)) posts_lengths = [] for post in posts: tokens = tokenizer.tokenize(post) tokenCount = len(tokens) posts_lengths.append(tokenCount) #print(len(posts_lengths)) posts_lengths.sort() posts_lengths_unique = set(posts_lengths) #print(len(posts_lengths_unique)) posts_lengths_unique_list = list(posts_lengths_unique) posts_lengths_count = [] prevCount = posts_lengths[0] currCount = posts_lengths[0] n = 0 for i in range(len(posts_lengths)): currCount = posts_lengths[i] if (currCount == prevCount): n += 1 else: posts_lengths_count.append(n) n = 1 prevCount = currCount posts_lengths_count.append(n) #print(len(posts_lengths_count)) #posts_lengths_unique_list.index(21) #posts_lengths_unique_list.index(101) #posts_lengths_unique_list.index(502) posts_lengths_counts = np.array(posts_lengths_count) posts_lengths_Counts = np.array([]) posts_lengths_Counts = np.append(posts_lengths_Counts, posts_lengths_counts[0]) posts_lengths_Counts = np.append(posts_lengths_Counts, posts_lengths_counts[1:21].sum()) posts_lengths_Counts = np.append(posts_lengths_Counts, posts_lengths_counts[21:101].sum()) posts_lengths_Counts = np.append(posts_lengths_Counts, posts_lengths_counts[101:387].sum()) posts_lengths_Counts = np.append(posts_lengths_Counts, posts_lengths_counts[387:].sum()) posts_labels = np.char.array(['0', '1~20', '21~100', '101~500', '>500']) percents = 100. * posts_lengths_Counts / posts_lengths_Counts.sum() labels = [ '{0} : {1:1.2f} % '.format(label, percentage) for label, percentage in zip(posts_labels, percents) ] patches, texts = plt.pie(posts_lengths_Counts, shadow=True, startangle=90) plt.legend(patches, labels, bbox_to_anchor=(0.2, 0.27), loc=1, fontsize='medium', borderaxespad=1.0) plt.title('Distribution of posts having X number of tokens') plt.savefig('posts_distribution.png') plt.show()
import flair
import torch
from flair.models import SequenceTagger
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer

flair.device = torch.device('cpu')
word_tokenizer = WordPunctTokenizer()
tagger = SequenceTagger.load('fr-ner')
sent_tokenizer = PunktSentenceTokenizer(
    "nltk_data/tokenizers/punkt/french.pickle")
class Pipeline: def __init__(self): self.tokenizer = WordPunctTokenizer() self.vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=50000) # self.classifier = LinearSVC(random_state=seed) self.classifier = LogisticRegression(random_state=seed, multi_class='multinomial') # self.classifier = RidgeClassifier(random_state=seed) # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=1) # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=3) # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=5) self.classifier = Perceptron(random_state=seed) # Raw file self.train_file = "raw/train_tweets.txt" self.test_file = "raw/test_tweets_unlabeled.txt" # Cleaned file self.train_file_cleaned = "data/train_tweets_cleaned.txt" self.test_file_cleaned = "data/test_tweets_cleaned.txt" self.total_file_cleaned = "data/total_tweets_cleaned.txt" # Vector File self.train_vector = "vector/train.vec" self.test_vector = "vector/test.vec" # Label File self.train_label = "label/train_label.txt" self.test_label = "label/test_label.csv" def tokenize(self): print("Tokenizing...") train_file_cleaned = open(self.train_file_cleaned, 'w') test_file_cleaned = open(self.test_file_cleaned, 'w') total_file_cleaned = open(self.total_file_cleaned, 'w') train_label = open(self.train_label, 'w') with open(self.train_file) as train_data: for line in train_data: label, tweet = line.strip().split('\t', 1)[:2] train_label.write(label + '\n') tokenized_tweet = " ".join(self.tokenizer.tokenize(tweet)) train_file_cleaned.write(tokenized_tweet + '\n') total_file_cleaned.write(tokenized_tweet + '\n') with open(self.test_file) as test_data: for line in test_data: tokenized_tweet = " ".join(self.tokenizer.tokenize(line)) test_file_cleaned.write(tokenized_tweet + '\n') total_file_cleaned.write(tokenized_tweet + '\n') def vectorize(self): print("Fitting vectorizer...") self.vectorizer.fit(open(self.total_file_cleaned)) print("Vectorizing train file...") train_vector = self.vectorizer.transform(open(self.train_file_cleaned)) print("Train vector: ", train_vector.shape) print("Vectorizing test file...") test_vector = self.vectorizer.transform(open(self.test_file_cleaned)) print("Test vector: ", test_vector.shape) print("Saving...") pickle.dump(train_vector, open(self.train_vector, 'wb')) pickle.dump(test_vector, open(self.test_vector, 'wb')) def evaluate(self): train_vector = pickle.load(open(self.train_vector, 'rb')) train_label = [] with open(self.train_label) as file: for line in file: train_label.append(int(line)) print("Total Data: ", train_vector.shape) X_train, X_evl, y_train, y_evl = train_test_split(train_vector, train_label, test_size=0.5, random_state=seed) _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=0.1, random_state=seed) _, X_evl, _, y_evl = train_test_split(X_evl, y_evl, test_size=0.1, random_state=seed) print( "Training set has {} instances. 
Test set has {} instances.".format( X_train.shape[0], X_evl.shape[0])) start = time.time() print("Training Classifier...") self.classifier.fit(X_train, y_train) pred_labels = self.classifier.predict(X_evl) print("Training successfully in %s seconds " % int(time.time() - start)) print("Evaluate Accuracy: %0.2f" % (accuracy_score(y_evl, pred_labels) * 100)) def classify(self): train_vector = pickle.load(open(self.train_vector, 'rb')) train_label = [] with open(self.train_label) as file: for line in file: train_label.append(int(line)) print("Total Data: ", train_vector.shape) start = time.time() print("Training Classifier...") self.classifier.fit(train_vector, train_label) print("Training successfully in %s seconds " % int(time.time() - start)) print("Predicting...") test_vector = pickle.load(open(self.test_vector, 'rb')) test_label = self.classifier.predict(test_vector) df = pd.DataFrame(test_label, columns=['Predicted']) df.index += 1 df.index.name = 'Id' df.to_csv(self.test_label)
from polyglot.text import Text
import nltk
import re
import spacy
import pymongo
from tqdm import tqdm
import json
from joblib import Parallel, delayed
from fuzzywuzzy import process
import gensim
import threading
import gensim.corpora as corpora
import pandas as pd
import config
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
sent_detector = nltk.tokenize.punkt.PunktSentenceTokenizer()
nlp = spacy.load('en_core_web_sm')
nlp_spacy = spacy.load('en_core_web_sm')
from config import nlp_corenlp

nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


class AsyncNLPProcess(threading.Thread):
    def __init__(self, Task_Complete):
        super().__init__()
        self.Task_Complete = Task_Complete
def run(): # Sentences From Text _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle") word_tokenizer = WordPunctTokenizer() abbreviations = set() with open("./tokenizer/abbreviations-long.txt") as f: for l in f: abbreviations.add(l.split(':')[0]) _sentence_tokenizer._params.abbrev_types = abbreviations def sentences_from_text(text): return _sentence_tokenizer.tokenize(text.strip()) def tokens_from_sentence(sentence): return sentence.split() # nltk.word_tokenize(sentence) def ngrams(obj, n): tokens = [] sentences = (sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"])) for sentence in sentences: tokens += tokens_from_sentence(sentence) pairs = nltk.ngrams(tokens, n) return [" ".join(pair) for pair in pairs] def convertToJsonObj(jsonText): return simplejson.loads(jsonText) def convertToObject(jsonObj): x = jsonObj obj = { "title": x.get("properties", {}).get("title", {}).get("stringValue", ""), "link": x.get("properties", {}).get("link", {}).get("stringValue", ""), "published": x.get("properties", {}).get("published", {}).get("stringValue", ""), "description": x.get("properties", {}).get("description", {}).get("stringValue", ""), "content": x.get("properties", {}).get("content", {}).get("stringValue", ""), } obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4()) return obj # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string def cleanhtml(raw_html): cleanr = re.compile('<.*?>') cleantext = re.sub(cleanr, '', raw_html) return cleantext def removeHTMLFromStrings(obj): for key in obj.keys(): obj[key] = cleanhtml(obj[key]) return obj def tokenize_to_sentences(obj): obj["sentences"] = (sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"])) return obj def tokenize_to_words(obj): obj["tokens"] = [] for sentence in obj["sentences"]: obj["tokens"] += tokens_from_sentence(sentence) for token in obj["tokens"]: yield (obj["key"], token) def get_named_entities(mdl, tokens): stemmer = TurkishStemmer() res = mdl.analyze(tokens) entities = [] for entity in res["entities"]: for entity2 in entity["text"].split(", "): ne = stemmer.stem(entity2).split("'")[0] entities.append((entity["type"], ne, entity["score"])) return entities options = PipelineOptions() options.view_as(StandardOptions).runner = 'DirectRunner' p = beam.Pipeline(options=options) pairs = ( p | "Read From Text" >> ReadFromText("news.json", coder=beam.coders.coders.StrUtf8Coder()) # line by line | "Convert to Json Object" >> beam.Map(convertToJsonObj) | "Convert to Python Object" >> beam.Map(convertToObject) | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)) tokens_1gram = ( pairs | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences) | 'Word Tokenization' >> beam.FlatMap( tokenize_to_words) # also convert to key value pairs ) tokens = tokens_1gram def process_tokens_last(doc, tokens): return (doc, get_named_entities(tokens)) doc_named_entities = ( tokens | beam.GroupByKey() # | beam.Map(lambda (doc, tokens): process_tokens_last(mdl, tokens)) ) (doc_named_entities | "Write Results" >> WriteToText("doc_tokens")) p.run()
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer, TreebankWordTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from konlpy.tag import Okt, Kkma

tokenizer = TreebankWordTokenizer()
okt = Okt()
kkma = Kkma()

print(
    word_tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # everything is split into tokens

print(WordPunctTokenizer().tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))  # apostrophe (') units are split off as separate tokens

print(
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # "don't" is kept as a single token

text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
# "home-based" is kept as one token and "does n't" is split --> same behaviour as the standard word tokenizer
print(tokenizer.tokenize(text))

sentence = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print(sent_tokenize(sentence))

korean_sentence = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"
def get_tokensForBigData(self, text):
    tokens = WordPunctTokenizer().tokenize(text)
    words = [x for x in tokens
             if x not in string.punctuation
             and x not in ['.', '."', '".', '?"', '!"', '%"', '%.', '@']]
    return words
__author__ = 'mdenil'
def get_tokens(self, text):
    textLow = text.lower()
    tokens = WordPunctTokenizer().tokenize(textLow)
    words = [x for x in tokens
             if x not in string.punctuation
             and x not in ['."', '".', '?"', '!"', '%"', '%.']]
    return words
open('../data/parent', 'rb') ) # parent is a dict(), which stores the ids of each query's duplicate questions querys = read_data.methods_to_classes(read_data.read_querys_from_file()) #querys = querys[0:100] print 'loading data finished' mrr = 0.0 map = 0.0 for item in querys: query = item[0] true_apis = item[1] query_words = WordPunctTokenizer().tokenize(query.lower()) query_words = [ SnowballStemmer('english').stem(word) for word in query_words ] query_matrix = similarity.init_doc_matrix(query_words, w2v) query_idf_vector = similarity.init_doc_idf_vector(query_words, idf) top_questions = recommendation.get_topk_questions(query, query_matrix, query_idf_vector, questions, 50, parent) recommended_api = recommendation.recommend_api_class( query_matrix, query_idf_vector, top_questions, questions, javadoc, javadoc_dict_classes, -1) #recommended_api = recommendation.recommend_api_class_baseline(query_matrix,query_idf_vector,javadoc,-1)
re.compile('^' + l.strip() + '$') for l in open(os.path.join(LIWC_dir, '%s' % (c)), 'r') if l.strip() not in stopwords ] for c in LIWC_categories } # replace positive/negative affect LIWC_categories += ['positive', 'negative'] LIWC_categories.remove('positive_affect') LIWC_categories.remove('negative_affect') LIWC_category_wordlists['positive'] = LIWC_category_wordlists.pop( 'positive_affect') LIWC_category_wordlists['negative'] = LIWC_category_wordlists.pop( 'negative_affect') TKNZR = WordPunctTokenizer() full_slice_list = set(range(N_SLICES)) # we count either the total number of tokens # or the number of unique tokens # count_option = 'total' count_option = 'unique' data = pd.read_csv(sub_file, sep='\t', index_col=False) data.sort_values('slice', ascending=True) fname = os.path.basename(sub_file).replace('.tsv', '') out_dir = os.path.dirname(sub_file) empty_slices = full_slice_list - set(data['slice'].unique()) if (len(empty_slices) > 0): print('filling %s with empty slices %s' % (e_name, empty_slices)) empty_slice_rows = pd.DataFrame([{ 'slice': c, 'dialogue': ''
def run(): import pickle import sys import math import numpy as np import apache_beam as beam reload(sys) sys.setdefaultencoding('utf8') import argparse import simplejson from gensim.models import KeyedVectors from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions from apache_beam.io.textio import ReadFromText, WriteToText import nltk.data from nltk.tokenize import WordPunctTokenizer import re import uuid import perceptron # Sentences From Text _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle") word_tokenizer = WordPunctTokenizer() abbreviations = set() with open("./tokenizer/abbreviations-long.txt") as f: for l in f: abbreviations.add(l.split(':')[0]) _sentence_tokenizer._params.abbrev_types = abbreviations model_file = "perceptron_word2vec_stemmed_normalized.pickle" with open(model_file, 'rb') as model: w, b = pickle.load(model) def sentences_from_text(text): return _sentence_tokenizer.tokenize(text.strip()) def tokens_from_sentence(sentence): return nltk.word_tokenize(sentence) def ngrams(obj, n): tokens = [] sentences = (sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"])) for sentence in sentences: tokens += tokens_from_sentence(sentence) pairs = nltk.ngrams(tokens, n) return [" ".join(pair) for pair in pairs] def convertToJsonObj(jsonText): return simplejson.loads(jsonText) def convertToObject(jsonObj): x = jsonObj obj = { "title": x.get("properties", {}).get("title", {}).get("stringValue", ""), "link": x.get("properties", {}).get("link", {}).get("stringValue", ""), "published": x.get("properties", {}).get("published", {}).get("stringValue", ""), "description": x.get("properties", {}).get("description", {}).get("stringValue", ""), "content": x.get("properties", {}).get("content", {}).get("stringValue", ""), } obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4()) return obj # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string def cleanhtml(raw_html): cleanr = re.compile('<.*?>') cleantext = re.sub(cleanr, '', raw_html) return cleantext def removeHTMLFromStrings(obj): for key in obj.keys(): obj[key] = cleanhtml(obj[key]) return obj def tokenize_to_sentences(obj): obj["sentences"] = (sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"])) return obj def tokenize_to_words(obj): obj["tokens"] = [] for sentence in obj["sentences"]: obj["tokens"] += tokens_from_sentence(sentence) for token in obj["tokens"]: yield (obj["key"], token) options = PipelineOptions() options.view_as(StandardOptions).runner = 'DirectRunner' p = beam.Pipeline(options=options) pairs = ( p | "Read From Text" >> ReadFromText("news.json", coder=beam.coders.coders.StrUtf8Coder()) # line by line | "Convert to Json Object" >> beam.Map(convertToJsonObj) | "Convert to Python Object" >> beam.Map(convertToObject) | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)) tokens_1gram = ( pairs | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences) | 'Word Tokenization' >> beam.FlatMap( tokenize_to_words) # also convert to key value pairs ) """ tokens_2gram = (pairs | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)]) ) """ tokens = tokens_1gram """ vocabulary = (tokens | "Get words only" >> beam.Values() | "Remove duplicate words" >> beam.RemoveDuplicates() ) vocabulary_size = (vocabulary | "Count 
Vocabulary elements" >> beam.combiners.Count.Globally() ) doc_total_words = (tokens | "Count Words of Doc" >> beam.combiners.Count.PerKey() ) """ tokens_paired_with_1 = ( tokens | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1))) """ token_counts_per_doc = (tokens_paired_with_1 | "Group by Doc,Word" >> beam.GroupByKey() | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts)))) | "Group by Doc" >> beam.GroupByKey() ) num_docs = (token_counts_per_doc | "Get Docs" >> beam.Keys() | "Count Docs" >> beam.combiners.Count.Globally() ) word_tf_pre = ( { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc } | "CoGroup By Document" >> beam.CoGroupByKey() ) def calc_tf((doc, count)): [token_count] = count['token_counts_per_doc'] [tokens_total] = count['total_tokens'] for token, cnt in token_count: yield token, (doc, float(cnt) / tokens_total) doc_word_tf = (word_tf_pre | "Compute Term Frequencies" >> beam.FlatMap(calc_tf) ) word_occurrences = (tokens | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates() | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1)) | "Group by Word" >> beam.GroupByKey() | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts))) ) token_df = ( word_occurrences | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs))) token_tf_df = ( { 'term_frequency': doc_word_tf, 'document_frequency': token_df} | "CoGroup By Token" >> beam.CoGroupByKey()) def calc_tfidf((token, tfdf)): [df] = tfdf['document_frequency'] for doc, tf in tfdf['term_frequency']: yield (doc, token), tf * math.log(1.0 / df) token_tf_idf = (token_tf_df | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf) ) """ word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True) def get_vec(word2vec, token): try: x = word2vec.get_vector(token) x = x.reshape(400) except: x = np.zeros(400) return x def analyze_sentiment(x): res = perceptron.f(x, w, b) return res doc_sentiment = ( tokens_paired_with_1 | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt): (doc, get_vec(word2vec, token))) | "Group Word2Vec Vectors By Document" >> beam.CombinePerKey(sum) | "Sum Word2Vec Vectors" >> beam.Map(lambda (doc, vec): (doc, analyze_sentiment(vec)[0]))) result = (doc_sentiment | "Format Results" >> beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens))) (result | "Write Results" >> WriteToText("sentiments")) p.run()
class TextDatasetReader(DatasetReader): """ Reads raw text, finds replaceable words, generates instance: word with context """ @classmethod def read_dict(cls, file_path, limit_words=-1, limit_freq=0): word_dict = {} with open(file_path) as fd: for idx, line in enumerate(fd): word, *freq = line.strip().split() if idx == limit_words: break if len(freq) > 0: freq = freq[0] freq = int(freq) if freq < limit_freq: break else: freq = 1 word_dict[word] = freq return word_dict def __init__(self, dict_path, limit_words=-1, limit_freq=0, max_context_size: int = 4, token_indexers: Dict[str, TokenIndexer] = None, target_indexers: Dict[str, TokenIndexer] = None): """ :param dict_path: path to the dict of acceptable fords to change :param limit_words: Max word count from dictionary :param limit_freq: Minimum frequency of words :param max_context_size: """ super().__init__(lazy=True) self.max_context_size = max_context_size self.word_dict = self.read_dict(dict_path, limit_words, limit_freq) self.tokenizer = WordPunctTokenizer() self.token_indexers = token_indexers or { "tokens": SingleIdTokenIndexer() } self.target_indexer = target_indexers or { "target": SingleIdTokenIndexer(namespace='target', lowercase_tokens=True), "tokens": SingleIdTokenIndexer() } self.left_padding = 'BOS' self.right_padding = 'EOS' def text_to_instance(self, tokens, idx) -> Instance: target_word = tokens[idx] left_context, right_context = self.get_context(tokens, idx, self.max_context_size) if len(left_context) < self.max_context_size: left_context = [self.left_padding] + left_context if len(right_context) < self.max_context_size: right_context = right_context + [self.right_padding] left_context = TextField([Token(token) for token in left_context], self.token_indexers) right_context = TextField([Token(token) for token in right_context], self.token_indexers) target_token_field = TextField([Token(target_word)], self.target_indexer) return Instance({ "left_context": left_context, "right_context": right_context, "word": target_token_field }) @classmethod def get_context(cls, tokens, idx, size): """ >>> TextDatasetReader.get_context([1,2,3,4,5,7], 1, 2) ([1], [3, 4]) >>> TextDatasetReader.get_context([1,2,3,4,5,7], 4, 2) ([3, 4], [7]) :param tokens: :param idx: :param size: :return: """ return tokens[max(idx - size, 0):idx], tokens[idx + 1:idx + size + 1] def _read(self, file_path: str) -> Iterable[Instance]: with open(file_path) as fd: for line in fd: tokens = self.tokenizer.tokenize(line) for idx, token in enumerate(tokens): if token in self.word_dict: yield self.text_to_instance(tokens, idx)
import time
import re
from string import punctuation
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

from conf.configure import Configure
from utils import data_utils
from utils.text.preprocessor import TextPreProcessor
from utils import jobs
from optparse import OptionParser

# english_stopwords = set(stopwords.words('english'))
word_tokenize = WordPunctTokenizer().tokenize
preprocessor = TextPreProcessor()

stop_words = ['the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what', 'which', 'this', 'that', 'these',
              'those', 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while',
              'during', 'to', 'What', 'Which', 'Is', 'If', 'While', 'This']


def get_unigram_words(que):
    """ Extract the individual meaningful words (unigrams) from a question """
    return [word for word in word_tokenize(que.lower()) if word not in stop_words]


def generate_unigram_words_features(df):
    df['unigrams_ques1'] = df['question1'].apply(lambda x: get_unigram_words(str(x)))
# The WordPunctTokenizer below splits every punctuation symbol off as a separate token.
import sys
import argparse
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer

# word tokenizing for English with NLTK library
# Written by Ye Kyaw Thu, LST, NECTEC, Thailand
# Date: 12 July 2021
# Reference: Python 3 Text Processing with NLTK 3 Cookbook
# The NLTK cookbook also discusses PunktWordTokenizer, but it is no longer available in recent NLTK versions.
# Reference: https://stackoverflow.com/questions/44238864/importerror-cannot-import-name-punktwordtokenizer/53923708
# How to run:
# $ echo "Don't do it! I can't stand it!" | python ./en-tokenization-on-punctuation.py

parser = argparse.ArgumentParser()
parser.add_argument('inputFile', default=sys.stdin, type=argparse.FileType('r'), nargs='?')
args = parser.parse_args()
textLines = args.inputFile.readlines()

tb_tokenizer = TreebankWordTokenizer()
wp_tokenizer = WordPunctTokenizer()

count = 0
for line in textLines:
    count += 1
    print("Treebank: ", tb_tokenizer.tokenize(line))
    print("WordPunct", wp_tokenizer.tokenize(line))
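# What the two tokenizers do with the sentence from the "How to run" note above
# (a rough sketch; run the script itself for the stdin-driven version).
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

line = "Don't do it! I can't stand it!"
print(TreebankWordTokenizer().tokenize(line))
# roughly: ['Do', "n't", 'do', 'it', '!', 'I', 'ca', "n't", 'stand', 'it', '!']
print(WordPunctTokenizer().tokenize(line))
# ['Don', "'", 't', 'do', 'it', '!', 'I', 'can', "'", 't', 'stand', 'it', '!']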
"wouldn't": "would not", "aren't": "are not", "haven't": "have not", "doesn't": "does not", "didn't": "did not", "don't": "do not", "shouldn't": "should not", "wasn't": "was not", "weren't": "were not", "mightn't": "might not", "mustn't": "must not" } negation_pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b') from nltk.tokenize import WordPunctTokenizer tokenizer1 = WordPunctTokenizer() tokenizer2 = WordPunctTokenizer() corpus_summary = [] for i in range(0, 3000): stripped = re.sub(combined_pat, '', dataset2['summary'][i]) stripped = re.sub(www_pat, '', stripped) cleantags = re.sub(html_tag, '', stripped) #lower_case = cleantags.lower() neg_handled = negation_pattern.sub(lambda x: negations_[x.group()], cleantags) letters_only = re.sub("[^a-zA-Z]", " ", neg_handled) tokens = tokenizer1.tokenize(letters_only) tokens = ' '.join(tokens) corpus_summary.append(tokens)
import torch
import nltk
from nltk import tokenize
from nltk.tokenize import TweetTokenizer
import json
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from tqdm import *
from collections import defaultdict
import operator
import random
from nltk.tokenize import WordPunctTokenizer
import h5py

wpt = WordPunctTokenizer()

min_context_len = 20
max_context_len = 350
min_question_len = 2
max_question_len = 30
max_answer_len = 30


def helper(data_path, voc_path, number_data=None):
    data = json.load(open(data_path))
    voc = json.load(open(voc_path))
    p_set = []
    p_len_set = []
    p_c_s_e_set = []
def nltkSplit(testDt_List):
    seg_Dt1_List = []
    for i in range(0, len(testDt_List) - 1):
        seg_Dt1 = WordPunctTokenizer().tokenize(testDt_List[i])
        seg_Dt1_List.append(seg_Dt1)
    return seg_Dt1_List
def train_Bayes(): ripple = pd.read_table('ripple_train.csv', sep=',') btc = pd.read_table('btc_train.csv', sep=',') bitcoin = pd.read_table('bitcoin_train.csv', sep=',') cryptocurrency = pd.read_table('cryptocurrency_train.csv', sep=',') cryptomarkets = pd.read_table('cryptomarkets_train.csv', sep=',') ethereum = pd.read_table('ethereum_train.csv', sep=',') iota = pd.read_table('iota_train.csv', sep=',') litecoin = pd.read_table('litecoin_train.csv', sep=',') neo = pd.read_table('neo_train.csv', sep=',') stellar = pd.read_table('stellar_train.csv', sep=',') headlines = ripple['headline'] headlines.append(btc['headline']) headlines.append(bitcoin['headline']) headlines.append(cryptocurrency['headline']) headlines.append(cryptomarkets['headline']) headlines.append(ethereum['headline']) headlines.append(iota['headline']) headlines.append(litecoin['headline']) headlines.append(neo['headline']) headlines.append(stellar['headline']) labels = ripple['label'] labels.append(btc['label']) labels.append(bitcoin['label']) labels.append(cryptocurrency['label']) labels.append(cryptomarkets['label']) labels.append(ethereum['label']) labels.append(iota['label']) labels.append(litecoin['label']) labels.append(neo['label']) labels.append(stellar['label']) reformat = token_format(headlines) train = list(zip(reformat, labels)) dictionary = set(word.lower() for passage in train for word in WordPunctTokenizer().tokenize(passage[0])) print("First couple of titles and their associated values:") print(train[0]) print(train[1]) print(train[2]) print(train[3]) t = [({ word: (word in WordPunctTokenizer().tokenize(x[0])) for word in dictionary }, x[1]) for x in train] classifier = nltk.NaiveBayesClassifier.train(t) model = open('bayes_model.pickle', 'wb') words = open('dictionary.pickle', 'wb') pickle.dump(classifier, model) pickle.dump(dictionary, words) model.close() words.close()
# coding=utf-8
import numpy as np
import json
import pickle
import nltk
from nltk.tokenize import WordPunctTokenizer
from collections import defaultdict

# NLTK sentence and word tokenizers
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# print(type(sent_tokenizer))
word_tokenizer = WordPunctTokenizer()
# print(type(word_tokenizer))

# Record each word and how often it occurs
word_freq = defaultdict(int)
# print(word_freq)

# Read the dataset, tokenize it, count how often each word appears, and store the counts in word_freq
with open('yelp_academic_dataset_review.json', 'rb') as f:
    for line in f:
        review = json.loads(line.decode('utf-8'))
        words = word_tokenizer.tokenize(review['text'])
        # print(type(words))  # list
        # print(len(words))   # includes punctuation
        for word in words:
            word_freq[word] += 1
        # print(review)
        # print(type(review))

print(word_freq[','])
print(word_freq['.'])
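# A tiny self-contained version of the frequency-counting loop above, using
# in-line example reviews instead of the Yelp review file.
from collections import defaultdict
from nltk.tokenize import WordPunctTokenizer

word_freq = defaultdict(int)
for text in ["Great food, great service.", "Food was cold."]:
    for word in WordPunctTokenizer().tokenize(text):
        word_freq[word] += 1
print(word_freq["food"], word_freq["Food"], word_freq[","])  # 1 1 1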
yield eval(l) def getDF(path): i = 0 df = {} for d in parse(path): df[i] = d i += 1 return pd.DataFrame.from_dict(df, orient='index') df = getDF('reviews_Musical_Instruments_5.json.gz') df.columns i = 0 word_punct_tokenizer = WordPunctTokenizer() punct = list(string.punctuation) stopword_list = stopwords.words('english') + punct + ['rt', 'via'] filt3 = [] uniq = [] i = 0 for index, rows in df.iterrows(): reviews = rows['reviewText'] reviews = reviews.lower() tokens = nltk.word_tokenize(reviews) tokens2 = word_punct_tokenizer.tokenize(reviews) filter = [ word for word in tokens2 if word not in stopwords.words('english') ] post = nltk.pos_tag(filter)
parser.add_argument("--normquotes", help="Normalize any quotes to quote single type.", default=1) parser.add_argument("--wptokenizer", help="Additionally apply treebank tokenizer.", default=1) pa = parser.parse_args() sentid = int(pa.sentid) normquotes = int(pa.normquotes) wptokenizer = int(pa.wptokenizer) if __name__ == "__main__": st = PunktSentenceTokenizer() wtw = WordPunctTokenizer() if wptokenizer == 1 else None wtt = TreebankWordTokenizer() for line in sys.stdin: line = line.decode("utf-8") if sentid == 1: m = textid_re.search(line) if m: sys.stdout.write(u".\n{{{%s}}}!!!\n" % m.group(1)) continue if line == "\n": continue if normquotes == 1:
def tokenize_text(text, punct=False):
    text = WordPunctTokenizer().tokenize(text)
    text = [word for word in text if punct or word.isalnum()]
    text = ' '.join(text)
    text = text.strip()
    return text
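# Quick illustration of the punct flag of tokenize_text() above (sample
# sentence made up):
print(tokenize_text("Hello, world! It's 2021."))              # 'Hello world It s 2021'
print(tokenize_text("Hello, world! It's 2021.", punct=True))  # "Hello , world ! It ' s 2021 ."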
def init_word_tokenizer():
    global word_tokenizer
    if word_tokenizer is None:
        word_tokenizer = WordPunctTokenizer()
from nltk.tokenize import WordPunctTokenizer from nltk.tokenize import TreebankWordTokenizer parser = argparse.ArgumentParser() parser.add_argument("--treebank", help="Additionally apply treebank tokenizer.", default=1) pa = parser.parse_args() treebank = int(pa.treebank) if __name__ == "__main__": st = PunktSentenceTokenizer() wtw = TreebankWordTokenizer() wtt = WordPunctTokenizer() for line in sys.stdin: if line[0:7] == "TEXTID(": sys.stdout.write(line) continue if line == "\n": sys.stdout.write(line) continue if treebank == 0: line = line.replace("«", " ' ") line = line.replace("»", " ' ") line = line.replace("“", " ' ")
class SQUADSupportingFactsProcessor(JiantSupportingFactsProcessor): DOC_ID = "squad_sup_facts" word_tokenizer = WordPunctTokenizer() sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') def process_file(self) -> List: """ Converts a SQuAD dataset file into samples for the Supporting Facts Probing task in Jiant format :return: A list of samples in jiant edge probing format. """ squad_data = self.json_from_file(self.input_path)['data'] samples = [] for article in squad_data: pars = article["paragraphs"] for par in pars: context = par["context"] tokenized_context = self.word_tokenizer.tokenize(context) sentences = list( self.sentence_tokenizer.tokenize(context.strip())) if len( sentences ) < 2: # There must be at least two sentences in the paragraph continue for qa in par["qas"]: targets = [] answer = qa["answers"][0] question = qa["question"] question_id = qa["id"] tokenized_question = self.word_tokenizer.tokenize(question) question_length = len(tokenized_question) sample_text = " ".join(tokenized_question) + " " answer_char_position = answer["answer_start"] answer_sentence_index = self.get_sentence_index_from_char_position( answer_char_position, sentences) found_answer_sentence_in_context = False # go through all sentences in context for sentence_index, sentence in enumerate(sentences): tokenized_sentence = self.word_tokenizer.tokenize( sentence) sample_text += " ".join(tokenized_sentence) + " " # get token start position for sentence in context sentence_pos = self.find_sentence_position_in_context( tokenized_context, tokenized_sentence) if sentence_pos is None: continue # define sentence token span for jiant target start_index = sentence_pos + question_length end_index = start_index + len(tokenized_sentence) sentence_span = [start_index, end_index] # if sentence contains answer, set label to "1" if sentence_index == answer_sentence_index: label = "1" found_answer_sentence_in_context = True else: label = "0" targets.append( self.create_target(question_length, sentence_span, label)) if not found_answer_sentence_in_context: # could not find answer in context, skip this example continue sample = { "info": { "doc_id": self.DOC_ID, "q_id": question_id }, "text": sample_text.strip(), "targets": targets } samples.append(sample) return samples @staticmethod def find_sentence_position_in_context(context: List, sentence_tokens: List) -> int: """ Goes through a list of context tokens and tries to find the sentence tokens. If sentence tokens are found, the start index is returned. :param context: List of tokens in a context document. :param sentence_tokens: List of tokens in a sentence, that is supposed to be within the context. :return: The start token position of the sentence in the context. If not found returns None. """ for token_index, token in enumerate(context): # check if current token equals the first sentence token if token == sentence_tokens[0]: match = True # go through all sentence tokens to see if they match with the following context tokens for i in range(1, len(sentence_tokens)): if len(context) > token_index + i and context[ token_index + i] == sentence_tokens[i]: continue match = False break if match: return token_index @staticmethod def get_sentence_index_from_char_position(char_pos: int, sentences: List) -> int: """ Gets a list of sentences from a paragraph and returns the index of the sentence that contains a certain character position. 
        :param char_pos: Character position in paragraph
        :param sentences: List of paragraph sentences
        :return: Index of the sentence that contains the character
        """
        char_count = 0
        for sentence_index, sentence in enumerate(sentences):
            char_count += len(sentence)
            if char_count >= char_pos:
                return sentence_index
import re
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from utilites import dump, load
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import WordPunctTokenizer
from pymorphy2 import MorphAnalyzer

pst = PunktSentenceTokenizer()
wpt = WordPunctTokenizer()
ma = MorphAnalyzer()


def find_stop_words(text):
    rez = []
    for word in wpt.tokenize(text):
        tags = ma.parse(word)[0].tag
        if 'UNKN' in tags or \
           'LATN' in tags or \
           'PNCT' in tags or \
           'NUMB' in tags or \
           'ROMN' in tags:
            rez += [word]
    return rez


try:
    word_normal_form = load('word_normal_form.json')
except FileNotFoundError: