def find_ml(self, td):
    f_tokenizer = TreebankWordTokenizer()
    query_words = f_tokenizer.tokenize(td)
    genres = self.sentiment_analysis(query_words)
    weighted_genres = []
    genre_weights = {}
    for x in genres:
        if x[1] is not None:
            weighted_genres.append(x[0])
            genre_weights[x[0]] = x[1]
    d_score_updates = {}
    for movie in self.movies:
        g = self.genre_dict[movie][0]
        total_genre_score = 0
        if u'Comedy' in g and 'comedy' in weighted_genres:
            total_genre_score += genre_weights['comedy']
        if u'Action' in g and 'action' in weighted_genres:
            total_genre_score += genre_weights['action']
        if u'Crime' in g and 'crime' in weighted_genres:
            total_genre_score += genre_weights['crime']
        if u'Drama' in g and 'drama' in weighted_genres:
            total_genre_score += genre_weights['drama']
        d_score_updates[self.movies.index(movie)] = total_genre_score * .1
    return d_score_updates
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner, self).__init__(input_dir, output_dir, u"-\n'", punctuation + digits)
        self.t = TreebankWordTokenizer()

    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)

    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking') +
                     dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(' ', ' ')]
        return ' '.join(arr)

    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
def pos_titles_from(input_path, output_path=None, options=None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
class TreebankWordTokenizerWrapper:
    """Wrapper around TreebankWordTokenizer that splits a trailing ',' or \"'s\"
    left attached to the preceding token back out into its own token."""

    PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

    def __init__(self):
        self.word_tokenizer = TreebankWordTokenizer()

    def tokenize(self, s):
        temp = self.word_tokenizer.tokenize(s)
        if temp:
            it = []
            for t0 in temp:
                t = [t0]
                while True:
                    m = self.PAT_NLTK_BUG.search(t[0])
                    if m:
                        t.insert(0, m.group(1))
                        t[1] = m.group(2)
                    else:
                        break
                it += t
                # sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
        else:
            it = temp
        return it
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence)
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
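A quick usage sketch (not part of the original snippet) showing the shape of what tokenize_en returns; the sample sentence is made up and NLTK's punkt model is assumed to be installed:

# Hypothetical example, assuming tokenize_en above is in scope.
sample = "NLTK is fun. It tokenizes text."
print(tokenize_en(sample))
# expected shape: [['NLTK', 'is', 'fun', '.'], ['It', 'tokenizes', 'text', '.']]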
class DssgUnigramExtractor(object):
    """
    An instance of this is used to obtain a list of unigrams, given a text.

    Usage:
    unigramExtractor = DssgUnigramExtractor()
    tokenList = unigramExtractor.extract("here is a text as a string")  # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            # decode common HTML entities before stripping markup
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)
            newTokens = []
            for tok in tokens:
                # lowercase and strip surrounding punctuation
                tok = tok.lower().strip("`'.,-_*/:;\\!@#$%^&*()=\"")
                # drop stopwords, one-character words, and pure numbers
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue
                # apply stemming
                # oldTok = copy.deepcopy(tok)  # for debug
                tok = self._stemmer.stem(tok)
                # sometimes a token like 'theres' becomes a stopword after stemming
                if tok in self._stopwordSet:
                    continue
                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        # POS tagger
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            # print " ".join([" ".join(token) for token in tagger.tag(tokens)])
            print " ".join([token[1] for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
def getNoun(self, parser, sentence):
    # mysent = sentence.encode('ascii', 'ignore')
    # sent = mysent.decode()
    penn = TreebankWordTokenizer()
    tags = parser.tag(penn.tokenize(sentence))
    nouns = []
    for t in tags:
        if t[1].startswith('NN'):
            nouns.append(t[0])
    return ' '.join(nouns)
def genLexicon(data):
    tok = TreebankWordTokenizer()
    texts = []
    for doc in data:
        for sent in doc:
            texts.append(tok.tokenize(sent[1].lower()))
    dictionary = corpora.Dictionary(texts)
    pickle.dump(dictionary, open("lex/toy.lex", "w"))
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
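A hedged usage sketch (not from the original source): because MorphyStemmer is a callable that maps a document string to normalized tokens, it can be plugged in as the tokenizer of a scikit-learn vectorizer. The corpus below is invented for illustration and assumes the class above, NLTK's WordNet data, and scikit-learn are available:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["The cats were running", "A cat ran"]            # hypothetical example corpus
vectorizer = CountVectorizer(tokenizer=MorphyStemmer())  # morphy maps 'cats' -> 'cat', 'running' -> 'run'
X = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())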
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
def section_02_02(datDIR):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join(datDIR, "the-great-gatsby.txt")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file=textfile, mode='r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print("%5d: %s" % (i, tempLine))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print(mySentence)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+", mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None, tokens))
    print("\ntemp")
    print(temp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print(tokens[-10:])

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None, tokens))
    print("\ntemp")
    print(temp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print(myRegexpTokenizer.tokenize(mySentence))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print(myTreebankWordTokenizer.tokenize(mySentence))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return(None)
def word_tokenizePT(self, text, tokenizer):
    """
    Tokenize a Portuguese sentence into words.
    @input params:
        text - a sentence or phrase
        tokenizer - "TB" for TreebankWordTokenizer
                    "WP" for WordPunctTokenizer
    @returns: a list of words, or an error message
    """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t] / max_value
    return tf
def _compute_unigram_frequency(self):
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    fdist = nltk.FreqDist()
    for fl in wordlists.fileids():
        count += 1
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            fdist.update(words)
        print 'freqdist: %s of %s' % (count, total)
    with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
        f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
    return None
class Tokenizer(object):
    def __init__(self, language='english'):
        self.paragraph_tokenizer = nltk.data.load('tokenizers/punkt/%s.pickle' % language)
        self.sentence_tokenizer = TreebankWordTokenizer()
        self.english_stops = set(stopwords.words(language))

    def tokenize(self, text, remove_stopwords=False):
        sentences = self.paragraph_tokenizer.tokenize(text)
        token = []
        for sentence in sentences:
            words = self.sentence_tokenizer.tokenize(sentence)
            if remove_stopwords:
                token.append([word for word in words if word not in self.english_stops])
            else:
                token.append(words)
        return token
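A short usage sketch (not part of the original) showing the Tokenizer class above in action; the sample text is made up and NLTK's punkt model and stopwords corpus are assumed to be available:

tok = Tokenizer()
text = "This is the first sentence. And here is another one."
print(tok.tokenize(text))                         # one token list per sentence
print(tok.tokenize(text, remove_stopwords=True))  # same, with stopwords removed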
def eiminar_stopwords(words):
    a = open('english.txt')
    result = []
    english_stops = []
    for f in a:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        english_stops += temp
    resultado = []
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for w in words:
        if w not in english_stops:
            resultado.append(stemmer.stem(w))
    return resultado
def _compute_biagram_frequency(self):
    if not os.path.exists(self.bigram_frequency_dir):
        os.mkdir(self.bigram_frequency_dir)
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    for fl in wordlists.fileids():
        count += 1
        print 'freqdist: %s of %s' % (count, total)
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            bi_words = nltk.bigrams(words)
            fdist = nltk.FreqDist(bi_words)
        with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
            f.writelines(['%s %s %s\n' % (word[0], word[1], freq) for (word, freq) in fdist.items()])
    return None
def text_fdist(text, min_occurence):
    from nltk.probability import FreqDist
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    # tokenize into words
    tokens = tokenizer.tokenize(text)
    # remove stopwords and lowercase
    tokens = [token.lower() for token in tokens if token.lower() not in stopwords_fr]
    print(tokens)
    fdist_in = FreqDist(tokens)
    # keep only words with at least min_occurence occurrences
    fdist = list(filter(lambda x: x[1] >= min_occurence, fdist_in.items()))
    return fdist
def tokenize(text, stopword=False, punct=False, lower=False, stem=False,
             num=False, single=False, link=False):
    """
    num: True, exclude numbers
    single: True, exclude single-character tokens
    todo: handle unicode input more robustly
    """
    token = []
    tokenizer = TreebankWordTokenizer()
    token_temp = tokenizer.tokenize(text)
    for elt in token_temp:
        # temp = i.decode('unicode-escape')
        # temp = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
        #               lambda m: m.group(0).encode('latin1').decode('utf8'), temp)
        temp = unicode(elt)
        temp = unicodedata.normalize('NFKD', temp).encode('ascii', 'ignore')
        # get rid of empty strings
        if temp:
            token.append(temp)
    token = [clean_front_end(word) for word in token if clean_front_end(word)]
    if lower:
        token = [word.lower() for word in token]
    if stem:
        token = [stemmer.stem(word) for word in token]
    if num:
        token = [word for word in token if not is_number(word)]
    if single:
        token = [word for word in token if len(word) > 1]
    if stopword:
        token = [word for word in token if word not in STOPWORD]
    if punct:
        token = [word for word in token if word not in PUNCT]
    if link:
        token = [word for word in token if not is_link(word)]
    # exclude empty strings
    token = [word for word in token if word]
    return token
class nlp:
    def __init__(self):
        self.tb = tb
        self.porter = nltk.PorterStemmer()
        self.tk = TreebankWordTokenizer()
        self.stopwords = set(stopwords.words())

    def tag(self, text):
        blob = self.tb(text)
        return blob.tags

    # clean=True stems the words and strips non-alphabetic characters
    def noun(self, text, clean=True):
        text = text.replace('\\n', ' ')
        text = text.replace('\\t', ' ')
        blob = self.tb(text)
        tags = blob.tags
        result = []
        for (aword, atag) in tags:
            if atag == "NNP" or atag == "NNS" or atag == "NN":
                result.append(aword.lower())
        if clean == True:
            clean_result = []
            for word in result:
                nword = self.porter.stem(remove_non_chap(word))
                # nword = small_stem(remove_non_chap(word))
                if len(nword) > 2:
                    clean_result.append(nword)
            return clean_result
        return result

    # Noun-phrase extraction may not work well; avoid using it for now
    def noun_p(self, text):
        blob = self.tb(text)
        return blob.noun_phrases

    def token(self, text):
        result, clean_result = self.tk.tokenize(text), []
        for word in result:
            nword = word.lower()
            nword = small_stem(nword)
            if len(nword) <= 30:
                clean_result.append(nword)
        return ' '.join(clean_result)
def preprocessing(para):
    print "\n\n\nStep 1: Preprocessing"
    print "Involves Processing of text"
    print "\n\nTokenizing Text into Sentences"
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sent = sent_tokenizer.tokenize(para)
    print "The sentences are:"
    for s in sent:
        print s
    print "\n\nTokenizing Sentences into Words"
    tokenizer = TreebankWordTokenizer()
    tokens = []
    for s in sent:
        tokens.extend(tokenizer.tokenize(s))
    print "The words are:"
    print tokens
    return tokens
def prepro_sent(text, word_map):
    # tokenizer
    word_tokenizer = TreebankWordTokenizer()

    # tokenize sentence into words
    sentence = word_tokenizer.tokenize(text)[:word_limit]

    # number of words in sentence
    words_per_sentence = len(sentence)
    words_per_sentence = torch.LongTensor([words_per_sentence]).to(device)  # (1)

    # encode sentence with indices from the word map
    encoded_sent = list(
        map(lambda w: word_map.get(w, word_map['<unk>']), sentence)
    ) + [0] * (word_limit - len(sentence))
    encoded_sent = torch.LongTensor(encoded_sent).unsqueeze(0).to(device)

    return encoded_sent, words_per_sentence
def stopwords(filename):
    """Builds a stoplist and token counts from the file with the given filename.

    Inputs: filename - the name of a plaintext file with a document on each line
    Outputs: A list of stopwords, a dictionary mapping tokens to counts, and
        the total number of tokens seen.
    """
    # We track the number of times a word shows up (term frequency) with a
    # Counter, which is exactly like a dictionary except
    # - the values can only be ints
    # - any key it hasn't seen yet is assumed to already have a value of 0
    # This means we don't have to check whether we've used a key before when
    # we use the "+= 1" operation.
    term_frequency_dict = Counter()
    word_total = 0
    tokenizer = TreebankWordTokenizer()
    with open(filename, 'r') as f:
        for line in f:
            words = tokenizer.tokenize(line.lower())
            # For the programmer types: there are several more efficient
            # ways to write this section using dictionaries or sets. You're
            # welcome to rewrite this part to exercise that.
            for word in words:
                term_frequency_dict[word] += 1
                word_total += 1
    # A fun feature of Counters is that they have a built-in function that
    # gives you the n keys with the biggest values, or the "most common"
    # things being counted. We can use this to find the most common words.
    # This comes out as a list of pairs of key and value, like
    # [('foo', 10), ('bar', 7), ... , ('rare', 1)]
    stoplist_pairs = term_frequency_dict.most_common(100)
    stoplist = [word for (word, freq) in stoplist_pairs]
    return stoplist, term_frequency_dict, word_total
def tokenizarPorTipo():
    cadena = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    num = input("Enter a tokenizer: ")
    if num == "1":
        tokenizer = TreebankWordTokenizer()
    elif num == "2":
        tokenizer = WhitespaceTokenizer()
    elif num == "3":
        tokenizer = SpaceTokenizer()
    elif num == "4":
        tokenizer = WordPunctTokenizer()
    else:
        return
    tokens = tokenizer.tokenize(cadena)
    print(tokens)
def post(self):
    args = post_args.parse_args()
    word = args.word
    res = []
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            filePath = root + "/" + str(file)
            fileOpen = open(filePath, "r", encoding="utf8")
            tokenizer = TreebankWordTokenizer()
            text = nltk.Text(tokenizer.tokenize(fileOpen.read()))
            ttokens = self.n_concordance_tokenised(text=text, phrase=word)
            for t in ttokens:
                ans = t.partition(word)
                res.append(ans)
    return jsonify(res)
def compute_embeddings(self, texts, embedding_index):
    tokenizer = TreebankWordTokenizer()
    embeddings = []
    for text in texts:
        embedding = []
        for word in tokenizer.tokenize(text):
            word_embedding = self.compute_word_embedding(word, embedding_index)
            if word_embedding is not None:
                embedding.append(np.array(word_embedding))
            else:
                # pad with 0s when the word has no embedding
                zero_arr = np.zeros(25, )
                embedding.append(zero_arr)
        embeddings.append(embedding)
    return embeddings
def sentences(self, lowercase=False, strip_punct=[], num_placeholder=None):
    word_tokenizer = TreebankWordTokenizer()
    sent_tokenizer = nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
    token_sents = [word_tokenizer.tokenize(sent)
                   for sent in sent_tokenizer.tokenize(self.response)]
    if lowercase:
        token_sents = [[token.lower() for token in sent] for sent in token_sents]
    if len(strip_punct) > 0:
        token_sents = [[token for token in sent if token not in strip_punct]
                       for sent in token_sents]
    if num_placeholder is not None:
        def replace_num(token, placeholder):
            try:
                float(token.replace(',', ''))
                return placeholder
            except ValueError:
                return token
        token_sents = [[replace_num(token, num_placeholder) for token in sent]
                       for sent in token_sents]
    return token_sents
def build(path):
    for filename in glob.glob(os.path.join(path, '*.txt')):
        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                s = line.lower().strip('\n')
                # tokenizer = RegexpTokenizer('[a-z]\w+')
                tokenizer = TreebankWordTokenizer()
                tokens = tokenizer.tokenize(s)
                for t in tokens:
                    if t in invert_index:
                        files = invert_index[t]
                        # Update the word count by 1
                        if filename in files:
                            files[filename] += 1
                        # A new file contains this word
                        else:
                            invert_index[t][filename] = 1
                    else:
                        invert_index[t] = {filename: 1}
    pickle.dump(invert_index, open("invert_index.p", "wb"))
def main(argv):
    if 'run_test' in argv:
        run_test = True
    else:
        run_test = False
    if 'print_runtime' in argv:
        print_runtime = True
        start_time = time.time()
    else:
        print_runtime = False
    file_names = find_files()
    stop_word_set = set(stopwords.words('english'))
    if 'short_test' in argv:
        files_to_process = ['xin_eng_200201.xml.gz']
    else:
        files_to_process = file_names
    tokenizer = TreebankWordTokenizer()
    wnl = WordNetLemmatizer()
    output_filename = 'xinhua-om-lema.txt'
    output = open(output_filename, "wt")
    for downloaded_file in files_to_process:
        print('Working on {}'.format(downloaded_file))
        for paragraph in [paragraphs.text for paragraphs in
                          ET.fromstring(gzip.open(downloaded_file).read())
                          .findall(".//*[@type='story']//P")]:
            if not paragraph:
                continue
            for sentence in sent_tokenize(paragraph):
                filtered_words = [word for word in tokenizer.tokenize(sentence)
                                  if word.lower() not in stop_word_set
                                  and re.search("^[a-zA-Z]+$", word)]
                if not filtered_words:
                    continue
                output.write(' '.join(
                    [wnl.lemmatize(word).lower() for word in filtered_words]) + '\n')
    output.close()
    if print_runtime:
        run_time = time.time() - start_time
        print('Total Processing Time: {0:.2f} minutes'.format(run_time / 60))
    return None
def index_search(query, index, idf, doc_norms):
    treebank_tokenizer = TreebankWordTokenizer()
    query_toks = treebank_tokenizer.tokenize(query.lower())
    scores = {}
    query_tf = Counter(query_toks)
    for term, term_tf in query_tf.items():
        if term in index:
            for (doc, shoe_id, tf) in index[term]:
                scores[doc] = scores.get(doc, 0) + term_tf * idf[term] * tf * idf[term]
    q_norm = 0
    for term, tf in query_tf.items():
        if term in index:
            q_norm += math.pow(tf * idf[term], 2)
    q_norm = math.sqrt(q_norm)
    res = []
    for doc, score in scores.items():
        res.append((score / (q_norm * doc_norms[doc]), doc))
    return sorted(res, key=lambda tup: (-tup[0], tup[1]))
def rate_sentence(sentence, Vfrase, listaNegativas, listaPositivas):
    negadores = []
    valor = 0
    tokenizer = TreebankWordTokenizer()
    tagger = nltk.data.load(_POS_TAGGER)
    tags = tagger.tag(tokenizer.tokenize(sentence))
    for i in tags:
        if i[1] in ('NN', 'NNS', 'NNP', 'NNPS'):
            valor += calcularValorPalabra(i[0], "sust", "N", Vfrase, listaNegativas, listaPositivas)
        if i[1] in ('JJ', 'JJR', 'JJS'):
            valor += calcularValorPalabra(i[0], "adj", "N", Vfrase, listaNegativas, listaPositivas)
        if i[1] in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
            valor += calcularValorPalabra(i[0], "verb", "N", Vfrase, listaNegativas, listaPositivas)
        if i[1] in ('RB', 'RBR', 'RBS'):
            valor += calcularValorPalabra(i[0], "adv", "N", Vfrase, listaNegativas, listaPositivas)
    return valor
def prepro_doc(document, word_map):
    # tokenizers
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()

    # a list to store the document tokenized into words
    doc = list()

    # tokenize document into sentences
    sentences = list()
    for paragraph in get_clean_text(document).splitlines():
        sentences.extend([s for s in sent_tokenizer.tokenize(paragraph)])

    # tokenize sentences into words
    for s in sentences[:sentence_limit_per_doc]:
        w = word_tokenizer.tokenize(s)[:word_limit_per_sentence]
        if len(w) == 0:
            continue
        doc.append(w)

    # number of sentences in the document
    sentences_per_doc = len(doc)
    sentences_per_doc = torch.LongTensor([sentences_per_doc]).to(device)  # (1)

    # number of words in each sentence
    words_per_each_sentence = list(map(lambda s: len(s), doc))
    words_per_each_sentence = torch.LongTensor(words_per_each_sentence).unsqueeze(0).to(device)  # (1, n_sentences)

    # encode document with indices from the word map
    encoded_doc = list(
        map(lambda s: list(
            map(lambda w: word_map.get(w, word_map['<unk>']), s)
        ) + [0] * (word_limit_per_sentence - len(s)), doc)
    ) + [[0] * word_limit_per_sentence] * (sentence_limit_per_doc - len(doc))
    encoded_doc = torch.LongTensor(encoded_doc).unsqueeze(0).to(device)

    return encoded_doc, sentences_per_doc, words_per_each_sentence
def _preprocess(self, listlikeobj, stop_lists=None):
    """Applies the pre-processing pipeline to a list-like object of strings."""
    numeric = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
               'ten', 'Eleven', 'Twelve', 'Thirteen', 'Fourteen', 'Fifteen', 'Sixteen', 'Seventeen',
               'Eighteen', 'Nineteen', 'Twenty', 'Twenty-one', 'Twenty-two', 'Twenty-three',
               'Twenty-four', 'Twenty-five', 'Twenty-six', 'Twenty-seven', 'Twenty-eight',
               'Twenty-nine', 'Thirty', 'Thirty-one']
    ordinal = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eight', 'ninth',
               'tenth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth',
               'seventeenth', 'eighteenth', 'nineteenth', 'twentieth', 'twenty-first', 'twenty-second',
               'twenty-third', 'twenty-fourth', 'twenty-fifth',
               'twenty-sixth', 'twenty-seventh', 'twenty eighth', 'twenty-ninth', 'thirtieth', 'thirty-first']
    en_stop = get_stop_words('en')
    tokenizer = TreebankWordTokenizer()
    p_stemmer = PorterStemmer()
    listlikeobj = listlikeobj.apply(lambda row: row.lower())
    listlikeobj = listlikeobj.apply(lambda row: tokenizer.tokenize(row))
    listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in en_stop])
    listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in string.punctuation])
    listlikeobj = listlikeobj.apply(lambda row: [p_stemmer.stem(i) for i in row])
    if stop_lists:
        for sw_dict in stop_lists:
            listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in sw_dict])
    # listlikeobj = listlikeobj.apply(lambda row: [re.sub(r'\d', "#", i) for i in row])
    # listlikeobj = listlikeobj.apply(lambda row: ["#" for i in row if i in numeric])
    # listlikeobj = listlikeobj.apply(lambda row: ["#th" for i in row if i in ordinal])
    # print(listlikeobj)
    # listlikeobj = listlikeobj.apply(lambda row: [spell(i) for i in row if len(i) > 6])
    return listlikeobj
def GetWordCount2(self, data):
    # print(data)
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stopwords.words('english'))
    words = []
    POSVals = {}
    wordcount = defaultdict(int)
    words = tokenizer.tokenize(data)
    for j in set(words):
        wordcount[j] = wordcount[j] + words.count(j)
    for (k, v) in list(wordcount.items()):
        if (k.lower() in stop_words or k.lower() in list(string.punctuation)):
            del wordcount[k]
        else:
            # print(PosTags(k))
            POSVals[k] = self.PosTags(k)
    # print(POSVals)
    return {
        'WORDS': [k for k in sorted(wordcount.keys())],
        'COUNTS': [wordcount[k] for k in sorted(wordcount.keys())],
        'POS': [POSVals[k] for k in sorted(wordcount.keys())]
    }
def make_word_set(context):
    """
    Computes the set of all words used in a list of strings.

    Arguments
    =========
    context: a list of strings

    Returns
    =======
    word_set: set of distinct words
    """
    tokenizer = TreebankWordTokenizer()
    sw = stopwords.words('english')
    word_list = []
    for string in context:
        tkns = tokenizer.tokenize(string)
        for tk in tkns:
            if tk not in sw:
                word_list.append(tk)
    word_set = set(word_list)
    return word_set
def vectorize(self, dataset):
    print("vectorizing")
    if not self.embedding:
        GLOVE_DIR = "/media/D/data/glove/"
        GLOVE_W2V_FILE = "glove.840B.300d.w2vformat.txt"
        GLOVE_W2V_PATH = os.path.join(GLOVE_DIR, GLOVE_W2V_FILE)
        glove_model = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_W2V_PATH)
        # print("time taken loading glove: {}".format(time.time() - t))
        self.embedding = glove_model.wv
    wv = self.embedding
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sentence in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sentence):
            try:
                sample_vecs.append(wv[token])
            except KeyError:
                # print(token, "not in wv")
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data
class TextTokenizer(object):
    """Tokenization: strip punctuation while keeping @, $, &, and ', then tokenize with TreebankWordTokenizer."""

    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.puncts = []  # characters to preserve: @, $, &

    def _clean_punct(self, string):
        # remove punctuation marks
        new_string = re.sub(r'[,.;+=<>()/:_?!$@&%*|{}\-\[\]\"\']', ' ', string)
        # keep @, $, &
        return new_string

    def _extract_punct(self, string):
        # keep @, $, &
        """
        new_string = string
        for punct in self.puncts:
            if punct == '$':
                new_string = re.sub('\$', ' ' + punct + ' ', new_string)
            else:
                new_string = re.sub(punct, ' ' + punct + ' ', new_string)
        return new_string
        """
        return string

    def _tokenize(self, string):
        # tokenize with TreebankWordTokenizer
        word_list = self.tokenizer.tokenize(string)
        return word_list

    def tokenize(self, string):
        # full pipeline
        new_string = self._extract_punct(self._clean_punct(string))
        word_list = self._tokenize(new_string)
        return word_list
def GetWordCount2(data):
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stopwords.words('english'))
    words = []
    POSVals = {}
    wordcount = defaultdict(int)
    for i in data:
        if i == '\n':
            continue
        else:
            # i = i.encode('utf-8')
            words = tokenizer.tokenize(i)
            # print(words)
            for j in set(words):
                # j = j.decode('utf-8').strip()
                wordcount[j] = wordcount[j] + words.count(j)
    # print(wordcount)
    # print 'WORD::::::::::COUNT'
    for (k, v) in wordcount.items():
        if k.lower() in stop_words:
            del wordcount[k]
        else:
            # print(PosTags(k))
            POSVals[k] = PosTags(k)
    # print(POSVals)
    return {
        'WORDS': [k for k in sorted(wordcount.keys())],
        'COUNTS': [wordcount[k] for k in sorted(wordcount.keys())],
        'POS': [POSVals[k] for k in sorted(wordcount.keys())]
    }
def write_out(infile, out_folder):
    if not os.path.exists(out_folder):
        os.mkdir(out_folder)
    qfile = open(os.path.join(out_folder, 'a.toks'), 'w')
    afile = open(os.path.join(out_folder, 'b.toks'), 'w')
    lfile = open(os.path.join(out_folder, 'sim.txt'), 'w')
    qids = []
    questions = []
    answers = []
    labels = []
    tokenizer = TreebankWordTokenizer()
    qid_count = 0
    qid_old = None
    with open(infile) as inf:
        inf.readline()  # header
        for line in inf:
            fields = line.lower().strip().split('\t')
            qid = fields[0]
            question = ' '.join(tokenizer.tokenize(fields[1]))
            sentence = ' '.join(tokenizer.tokenize(fields[5]))
            label = fields[6]
            if qid != qid_old:
                qid_old = qid
                qid_count += 1
            qids.append(str(qid_count))
            questions.append(question)
            answers.append(sentence)
            labels.append(label)
    dump(questions, os.path.join(out_folder, 'a.toks'))
    dump(answers, os.path.join(out_folder, 'b.toks'))
    dump(labels, os.path.join(out_folder, 'sim.txt'))
    dump(qids, os.path.join(out_folder, 'id.txt'))
stemer = nltk.SnowballStemmer("russian")
inp_str = u"сегодня правительством ирана было объявлено о подавлении митингов благодаря напалму"
name_obj = u'правительство ирана'

# def find_object(name_obj, inp_str, inp_doc_id):
otvet = []
res = {}
flag = False
name_obj = name_obj.split(",")
# ton_doc = TonDocuments.query.get(inp_doc_id)
ton_sents_list = []
for obj in name_obj:
    for sent in punkt_sent_token.tokenize(inp_str):
        tokens = tokenizer.tokenize(sent.lower())
        if obj in sent.lower():
            flag = True
        # if obj.lower().strip().encode("utf-8") in tokens:  # for sistem
        # if obj.lower().strip() in tokens:
        #     # sentiment_val = get_sentiment(sent, dics, tokenizer, punkt_sent_token, stemer)
        #     # otvet.append(sentiment_val)
        #     # ton_sents_list.append(TonSentences(sent.decode("utf-8"), sentiment_val[0][1], sentiment_val[0][0]))
        #     flag = True
        else:
            stem_tokens = []
            for t in tokens:
                # stem_tokens.append(stemer.stem(t.decode("utf-8")))  # for sistem
                stem_tokens.append(stemer.stem(t))
            stem_obj = stemer.stem(obj.strip())
            if stem_obj in ' '.join(stem_tokens):
mag_1 = math.sqrt(sum([x**2 for x in vec1]))
mag_2 = math.sqrt(sum([x**2 for x in vec2]))
return dot_prod / (mag_1 * mag_2)

# Documents 1, 2 and 3 form the corpus; build the vocabulary
docs = ["The faster Harry got to the store,the faster and faster Harry would get home."]
docs.append("Harry is hairy and faster than Jill.")
docs.append("Jill is not as hairy as Harry.")
print(docs)

# 17, 8, 8 tokens per document
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]  # tokenize and lowercase, duplicates not removed yet
print(len(doc_tokens[0]))
all_doc_tokens = sum(doc_tokens, [])
print(len(all_doc_tokens))
lexicon = sorted(set(all_doc_tokens))  # deduplicate to get the 18-word vocabulary
print(len(lexicon))
print(lexicon)

# Build a vector template: a zero vector over the vocabulary, so every document
# vector has the same dimensions; counts are filled in and missing words stay zero
zero_vector = OrderedDict((token, 0) for token in lexicon)
print(zero_vector)

# vector representation of each document
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from collections import defaultdict
from pprint import pprint
import nltk
from nltk.tokenize import TreebankWordTokenizer
import string
import os

from similarity import is_ci_stem_stopword_set_match

os.chdir(os.path.dirname(__file__))
documents = open("lsi_data.txt", "r").read().splitlines()
stop_words = stopwords.words('english')
tokenizer = TreebankWordTokenizer()
word_list = [[
    x.lower() for x in tokenizer.tokenize(sentence)
    if (x not in stop_words and x not in string.punctuation)
] for sentence in documents]
print(word_list)

frequency = defaultdict(int)
for sent in word_list:
    for token in sent:
        frequency[token] += 1

word_list = [[x for x in sent if frequency[x] > 1] for sent in word_list]
pprint(word_list)

dictionary = corpora.Dictionary(documents=word_list)
dictionary.save("LSA/doc1.dict")
print(dictionary.token2id)
nltk.download()

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

text = "this is a block of text. I am writing a piece to explain the use of nlp packages."
text = 'Feet wolves talked cats'

###### tokenize
tokenizer1 = WhitespaceTokenizer()   # splits on whitespace
tokenizer2 = WordPunctTokenizer()    # splits on whitespace as well as punctuation
tokenizer3 = TreebankWordTokenizer()

tokens1 = tokenizer1.tokenize(text)
tokens2 = tokenizer2.tokenize(text)
tokens3 = tokenizer3.tokenize(text)

######
# best practice: lemmatize first, then stem
from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lem = WordNetLemmatizer()

lemmatized_tokens = []
for token in tokens3:
    lemmatized_tokens.append(lem.lemmatize(token))

# lemmatized and stemmed
lemmatized_tokens = []
for token in tokens3:

"""
##############################################################
# Tokenize and tag a text:
# nltk.download("maxent_ne_chunker")

# Whenever we define a string in code, we will do it with the (u) prefix
cadena = u"—¡Joven «emponzoñado» con el whisky, qué fin… te aguarda exhibir!\nEl veloz murciélago hindú comía feliz cardillo y kiwi.\nLa cigüena tocaba el saxofón detrás del palenque de paja.\nEl pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.\nExhíbanse politiquillos zafios,\ncon orejas kilométricas\n\ty unas de gavilán."

print u"Cadena:"
print "\t", cadena

# Plain word-tokenizer example (words are captured together with adjacent punctuation marks)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(cadena)
# print u"\nPalabras:"
# print "\t", "\n\t".join([addslashes(t) for t in tokens])

# Tokenizer that separates the words and then the punctuation marks
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
palabras = word_punct_tokenizer.tokenize(cadena)
# print u"\nPalabras/Puntuación:"
# print "\t", "\n\t".join([addslashes(t) for t in palabras])

# Spanish version of the sentence tokenizer
import nltk.data
spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
frases = spanish_tokenizer.tokenize(cadena)
# print u"\nFrases:"
from nltk.corpus import brown

text = "Are you curious about tokenization? Let's see how it " \
       "works! We need to analyze a couple of sentences with punctuations " \
       "to see it in action."

sent_tokenize_list = sent_tokenize(text)
print "\nSentence tokenizer:"
print sent_tokenize_list

print "\nWord tokenizer:"
print word_tokenize(text)

treebank_word_tokenizer = TreebankWordTokenizer()
print "\nTreebank word tokenizer:"
print treebank_word_tokenizer.tokenize(text)

word_punct_tokenizer = WordPunctTokenizer()
print "\nWord punct tokenizer:"
print word_punct_tokenizer.tokenize(text)

words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the',
         'beaches', 'grounded', 'dreamt', 'envision']

# Compare different stemmers
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')
formatted_row = '{:>16}' * (len(stemmers) + 1)
file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
kpe_file = open(file_kpe, "w")
kp_list = []
projections_list = kpc.get_document_content_ann(dirname, f[:-4] + ".ann")
for projection in projections_list:
    index_list = projection[1].split()
    start = int(index_list[1])
    end = int(index_list[2])
    prev_token = False
    if start > 0:
        prev_text = raw_text[0:start]
        prev_text_tokens = tokenizer.tokenize(prev_text)
        if prev_text_tokens:
            prev_token = prev_text_tokens[-1]
        else:
            prev_token = False
    next_text = raw_text[end:]
    next_text_tokens = tokenizer.tokenize(next_text)
    if next_text_tokens:
        next_token = next_text_tokens[0]
    else:
        next_token = False
    projection_tokens = tokenizer.tokenize(projection[2])
    test_tokens = []
def tokenize(doc):
    tokenizer = TreebankWordTokenizer()
    token = tokenizer.tokenize(doc)
    # token = grams(token)
    return token
import ujson

wordsTokenizer = TreebankWordTokenizer()
stopWords = set(stopwords.words('english'))
sentencesTokenizer = load('tokenizers/punkt/english.pickle')

arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()

acertos = 0
sentimentos = {}
comeco = datetime.now()
for resposta in classificados:
    texto = resposta['corpo']
    frases = sentencesTokenizer.tokenize(texto)
    palavras = []
    for frase in frases:
        palavrasTemp = wordsTokenizer.tokenize(frase)
        palavras.extend([palavra for palavra in palavrasTemp if palavra not in stopWords])
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith('J'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith('V'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith('N'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith('R'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
nltk.download('punkt')
nltk.download('treebank')

from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

tb_tokenizer = TreebankWordTokenizer()

text1 = "Love looks not with the eyes, but with the mind. And therefore is wing'd Cupid painted blind."
text2 = "South Korea population is 48,750,000"

word_tok = word_tokenize(text1)
word_tok2 = word_tokenize(text2)
wordpunct_tok = WordPunctTokenizer().tokenize(text1)
wordpunct_tok2 = WordPunctTokenizer().tokenize(text2)
tb_tok = tb_tokenizer.tokenize(text1)
tb_tok2 = tb_tokenizer.tokenize(text2)

print("Results with word_tokenize:")
print(word_tok)
print(word_tok2)
print("Results with wordpunct_tokenize:")
print(wordpunct_tok)
print(wordpunct_tok2)
print("Results with TreebankWordTokenizer:")
print(tb_tok)
print(tb_tok2)
for ann in ann_file:
    ann = unicode(ann, encoding="utf-8")
    if ann[0] not in ["R", "*"]:
        ann_items = ann.strip().split("\t")
        if ann_items[1].find(";") >= 0:
            type_indexes_tmp = ann_items[1].split(" ")
            type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:]
        else:
            type_indexes = ann_items[1].split(" ")
        type_indexes[1] = int(type_indexes[1])
        type_indexes[2] = int(type_indexes[2])
        indexes_kp_tmp.setdefault(type_indexes[1], -1)
        if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]:
            indexes_kp_tmp[type_indexes[1]] = type_indexes[2]
        ann_text = ann_items[2]
        tokens = tokenizer.tokenize(ann_text)
        if without_types:
            annotation_type = 'KeyPhrase'
        else:
            annotation_type = type_indexes[0]
        pos_tags = [t + (annotation_type,) for t in tagger.tag(tokens)]
        if pos_tags:
            pos_tags[0] = pos_tags[0][0:2] + ("B-" + pos_tags[0][2],)
        if debug:
            print >> sys.stderr, pos_tags
        annotations[" ".join([str(ti) for ti in type_indexes[1:]])] = pos_tags
        # print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags
ann_file.close()
# ann_ext_file.close()
if debug:
tokenizerPalavras = TreebankWordTokenizer()

arquivoClassificador = open('classificador.pickle', 'rb')
classificador = _pickle.load(arquivoClassificador)
arquivoClassificador.close()

arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()

sentimentos = {}
featuresClassificados = []
comeco = datetime.now()
for resposta in classificados:
    texto = resposta['corpo']
    frases = tokenizerFrases.tokenize(texto)
    feature = {}
    for frase in frases:
        palavras = tokenizerPalavras.tokenize(frase)
        palavras = [palavra for palavra in palavras if palavra not in stopWords]
        for palavra in palavras:
            feature[palavra] = True
    sentimentos[texto] = (resposta, classificador.classify(feature))
    featuresClassificados.append((feature, resposta['sentimento']))
tempo = datetime.now() - comeco

arquivoMedicoes = open('medicoes_analise_sequencial.txt', 'w')
arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) +
                      '\nPrecisão = {0:.2f}%'.format(accuracy(classificador, featuresClassificados) * 100))
arquivoMedicoes.close()

arquivoResultados = open('resultados_sem_stopwords.csv', 'w', newline='')
w = writer(arquivoResultados, delimiter=',')
linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
for texto in sentimentos.keys():
    tupla = sentimentos[texto]
    resposta = tupla[0]
class WordTokenizer(object):
    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def tokenize(self, document):
        return self._word_tokenizer.tokenize(document)
from nltk.tokenize import word_tokenize
print(word_tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))

from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))

from tensorflow.keras.preprocessing.text import text_to_word_sequence
print(text_to_word_sequence(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
print(tokenizer.tokenize(text))
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

PHRASE = 'Cats pants and wolves'

tokenizer = TreebankWordTokenizer()
porter = PorterStemmer()
word_net = WordNetLemmatizer()

tokens = tokenizer.tokenize(PHRASE)
print(tokens)
print("Porter   : ", ' '.join(porter.stem(token) for token in tokens))
print("Word Net : ", ' '.join(word_net.lemmatize(token) for token in tokens))
# test_sents = []
for (dirname, _, filenames) in os.walk(dir_corpus):
    for f in filenames:
        ext = f[-4:]
        if ext == '.ann':
            file_count += 1
            if debug and file_count > debug_tests:
                break
            file_text = os.path.join(dirname, f[:-4] + ".txt")
            text_file = open(file_text, "r")
            file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
            kpe_file = open(file_kpe, "w")
            raw_text = unicode(text_file.read(), encoding="utf-8")
            tokens = tokenizer.tokenize(raw_text)
            tagged_text = [t + ("None",) for t in tagger.tag(tokens)]
            text_file.close()
            # test_sents.append(tagged_text)
            if extra_features:
                X_test = kpc.sent2features_extra(tagged_text, qr)
            else:
                X_test = kpc.sent2features(tagged_text)
            is_not_kp = "None"
            tmp_label = is_not_kp
            new_kp = []
            kp_list = []
            for kp in zip(crftagger.tag(X_test), [tt[0] for tt in tagged_text]):
                if debug and False:
                    print >> sys.stderr, " ---- ", kp
#!/usr/bin/env python
import sys
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

stop_words_english = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                      'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                      'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                      'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                      'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                      'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                      'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
                      'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                      'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
                      'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                      'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
                      'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
                      'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn',
                      'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']

# Input : subreddit, created_utc, subreddit_id, link_id, name, id, gilded, author, score, body, controversiality, parent_id, compound, neg, neu, pos, sentiment_class
# Output : subreddit, sentiment_class, gilded, score, body, compound, neg, neu, pos
for line in sys.stdin:
    line = line.strip()
    row = line.split(",")
    subreddit = row[0]
    body = row[9]
    sentiment = row[16]
    tokenizer = TreebankWordTokenizer()
    word_tokens = tokenizer.tokenize(body.lower())
    filtered_words = [word for word in word_tokens if word not in stop_words_english]
    new_comment = ''
    for word in filtered_words:
        new_comment += ''.join([i if i.isalpha() or ord(i) == 32 else '' for i in word]) + ' '
    print "%s,%s,%s,%s,%s,%s,%s,%s,%s" % (subreddit, sentiment, row[6], row[8], new_comment, row[12], row[13], row[14], row[15])
def tokenizer(text):
    tbwt = TreebankWordTokenizer()
    text_out = tbwt.tokenize(text)
    return text_out