class WhitespaceTokenizer(BaseTokenizer):
    """Split only by whitespace."""

    tokenizer = tokenize.WhitespaceTokenizer()
    name = 'Whitespace'
class WhitespaceTokenizer(BaseTokenizer):
    """Split by whitespace. "This example." → (This), (example.)"""

    tokenizer = tokenize.WhitespaceTokenizer()
    name = 'Whitespace'
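A minimal, self-contained sketch of the split behavior the docstring above describes, using only NLTK's public API:

from nltk import tokenize

# WhitespaceTokenizer splits on runs of whitespace and leaves punctuation
# attached to the adjacent word.
tok = tokenize.WhitespaceTokenizer()
print(tok.tokenize("This example."))  # ['This', 'example.']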
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk import tokenize

dados = pd.read_csv('imdb-reviews-pt-br.csv')
palavras = " ".join([texto for texto in dados.text_pt])

# Tokenization
tokenEspaco = tokenize.WhitespaceTokenizer()
token = tokenEspaco.tokenize(palavras)
frequencia = nltk.FreqDist(token)
dataframe = pd.DataFrame({
    "Palavra": list(frequencia.keys()),
    "Frequência": list(frequencia.values())
})
print(dataframe)

# Plot
dataframeMaiores = dataframe.nlargest(columns="Frequência", n=10)
plt.figure(figsize=(12, 8))
# The original snippet is truncated mid-call; the call is closed here with
# only the arguments present in the source.
ax = sns.barplot(data=dataframeMaiores, x="Palavra", y="Frequência")
plt.show()
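For reference, a tiny self-contained sketch of what nltk.FreqDist produces for whitespace tokens (the sample string is illustrative, not from the snippet above):

import nltk
from nltk import tokenize

sample = "a rose is a rose"
tokens = tokenize.WhitespaceTokenizer().tokenize(sample)
freq = nltk.FreqDist(tokens)
print(freq.most_common(2))  # [('a', 2), ('rose', 2)]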
import os
import argparse

import numpy as np
import nnabla as nn
import nnabla.solvers as S
import nnabla.functions as F
import nnabla.logger as logger
from nnabla.ext_utils import get_extension_context
from nltk import tokenize

import src.model as model
from .dataset import Dataset
from .grammar import Grammar, Rule, NodeType
from .python.grammar import to_ast
from .annotation import to_encoder_input, Annotation
from .decoder import Decoder

tokenizer = tokenize.WhitespaceTokenizer()

parser = argparse.ArgumentParser()
parser.add_argument('--context', '-c', type=str, default="cpu")
parser.add_argument('--max-query-length', type=int, default=70)
parser.add_argument('--max-action-length', type=int, default=100)
parser.add_argument('--embedding-size', type=int, default=128)
parser.add_argument('--node-type-embedding-size', type=int, default=64)
parser.add_argument('--lstm-state-size', type=int, default=256)
parser.add_argument('--hidden-state-size', type=int, default=50)
parser.add_argument('--result', type=str, default=os.path.join("result", "django"))
parser.add_argument('--dropout', type=float, default=0.2)
parser.add_argument('--beam-size', type=int, default=15)
args = parser.parse_args()
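A hedged sketch of how the parsed --context flag is typically turned into an NNabla execution context; the snippet above stops at parse_args(), so this continuation is an assumption:

import nnabla as nn
from nnabla.ext_utils import get_extension_context

# Select the compute backend ("cpu", "cudnn", ...) requested on the command line.
ctx = get_extension_context(args.context)
nn.set_default_context(ctx)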
import nltk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk import tokenize

# Display the word cloud built in the preceding (not shown) step.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

real_data = data[data["target"] == "true"]
all_words = ' '.join([text for text in real_data.text])
wordcloud = WordCloud(width=800, height=500, max_font_size=110,
                      collocations=False).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

token_space = tokenize.WhitespaceTokenizer()

def counter(text, column_text, quantity):
    all_words = ' '.join([text for text in text[column_text]])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                 "Frequency": list(frequency.values())})
    df_frequency = df_frequency.nlargest(columns="Frequency", n=quantity)
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(data=df_frequency, x="Word", y="Frequency", color='blue')
    ax.set(ylabel="Count")
    plt.xticks(rotation='vertical')
    plt.show()

counter(data[data["target"] == "fake"], "text", 20)
import os
import tempfile

from nltk import tokenize

# config_tnt is assumed to be defined elsewhere in this module.


def pos_tag(sentence, model_path=None, verbose=False):
    """
    Use TnT to parse a sentence.

    @param sentence: Input sentence to parse
    @type sentence: L{str}
    @return: C{DepGraph} the dependency graph representation of the sentence
    """
    tnt_bin = config_tnt(verbose=verbose)
    if not model_path:
        model_path = '%s/models/wsj' % tnt_bin[:-4]
    input_file = '%s/tnt_in.txt' % tnt_bin[:-4]
    output_file = '%s/tnt_out.txt' % tempfile.gettempdir()
    execute_string = '%s %s %s > %s'
    if not verbose:
        execute_string += ' 2> %s/tnt.out' % tempfile.gettempdir()
    tagged_words = []
    f = None
    try:
        if verbose:
            print('Begin input file creation')
            print('input_file=%s' % input_file)
        # Write one token per line, as TnT expects.
        f = open(input_file, 'w')
        words = tokenize.WhitespaceTokenizer().tokenize(sentence)
        for word in words:
            f.write('%s\n' % word)
        f.write('\n')
        f.close()
        if verbose:
            print('End input file creation')
            print('tnt_bin=%s' % tnt_bin)
            print('model_path=%s' % model_path)
            print('output_file=%s' % output_file)
        execute_string = execute_string % (tnt_bin, model_path, input_file, output_file)
        if verbose:
            print('execute_string=%s' % execute_string)
            print('Begin tagging')
        tnt_exit = os.system(execute_string)
        if verbose:
            print('End tagging (exit code=%s)' % tnt_exit)
        # Read back TnT's output and collect (word, tag) pairs, skipping comments.
        f = open(output_file, 'r')
        lines = f.readlines()
        f.close()
        tagged_words = []
        tokenizer = tokenize.WhitespaceTokenizer()
        for line in lines:
            if not line.startswith('%%'):
                tokens = tokenizer.tokenize(line.strip())
                if len(tokens) == 2:
                    tagged_words.append((tokens[0], tokens[1]))
        if verbose:
            for tag in tagged_words:
                print(tag)
    finally:
        if f:
            f.close()
    return tagged_words
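A sketch of how this function would be called, assuming a working TnT installation and the WSJ model referenced above (not part of the original source):

# Hypothetical call; requires the TnT binary discovered by config_tnt().
tags = pos_tag('John saw the book on the table')
print(tags)  # -> list of (word, tag) tuples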
            # Fragment: the surrounding class definition and the fit() header are cut off in the source.
            self.svd_transformer = TruncatedSVD(n_components=k)
        except Exception as ex:
            print(ex)
        return self.svd_transformer.fit(X)

    def transform(self, X, Y=None):
        return self.svd_transformer.transform(X)

    def get_params(self, deep=True):
        return {}


punctuation_token = tokenize.WordPunctTokenizer()
space_token = tokenize.WhitespaceTokenizer()
list_punctuation = [point for point in punctuation]
punctuation_stopwords = list_punctuation + stop_words
without_accents = []
without_accents_stop_words = []


@app.route('/')
def index():
    return flask.render_template('index.html')


def tokenize(df):
    # Fragment: the remainder of this loop body is cut off in the source.
    processed_sentence = list()
    for sentence in df.sentence:
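Since the module above sets up both a WordPunctTokenizer and a WhitespaceTokenizer, here is a small self-contained sketch of how their outputs differ (the sample sentence is illustrative):

from nltk import tokenize

sample = "Hello, world! It's fine."
print(tokenize.WhitespaceTokenizer().tokenize(sample))
# ['Hello,', 'world!', "It's", 'fine.']
print(tokenize.WordPunctTokenizer().tokenize(sample))
# ['Hello', ',', 'world', '!', 'It', "'", 's', 'fine', '.']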