def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbI tasks format.

    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data
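# --- Hedged usage sketch (added; not part of the original snippet) ---
# parse_stories expects bAbI-format byte lines: a numeric sentence id followed by
# either a statement or a tab-separated "question \t answer \t supporting ids"
# triple. With a word tokenizer bound to `tokenize`, a call would look roughly like:
#
#   sample = [
#       b"1 Mary moved to the bathroom.",
#       b"2 John went to the hallway.",
#       b"3 Where is Mary?\tbathroom\t1",
#   ]
#   parse_stories(sample, only_supporting=True)
#   # -> [([<tokens of sentence 1>], <tokens of the question>, 'bathroom')]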
def generation(cluster_matrix):
    # Alternative distance metric:
    # for each storyline-word, find most similar words;
    # filter for words that are reasonably close to other storyline-words
    tagdict = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
               'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    allpool = []
    for cluster in cluster_matrix:
        pool = []
        try:
            for word in cluster:
                # find candidate words; skip storyline-words that are not content words
                try:
                    thisword_tk = nltk.word_tokenize(word)
                    thistag_tk = nltk.pos_tag(thisword_tk)
                    if thistag_tk[0][1] not in tagdict:
                        continue
                except Exception:
                    continue
                cand = [tup[0] for tup in glove_model.most_similar(word, topn=200)]
                # calculate distances from candidate words to other storyline-words
                cand2clust_dists = np.sum(
                    [glove_model.distances(x, cand) for x in cluster if x != word], axis=0)
                # indexes of qualified words in cand (comparing among themselves)
                indexes = cand2clust_dists.argsort()[:200]
                keep = set()
                print(cand)
                smallest = min(len(cand), 200)
                # keep up to 25 candidates that carry a content-word POS tag
                for i in range(0, smallest):
                    try:
                        word_tk = nltk.word_tokenize(cand[i])
                        tag_tk = nltk.pos_tag(word_tk)
                        if tag_tk[0][1] in tagdict:
                            keep.add(cand[i])
                    except Exception:
                        continue
                    if len(keep) == 25:
                        break
                # OR, comparing with all vocab:
                # indexes of words whose total distance to other storyline-words
                # is among the top 1% of all vocab
                # top_dist = np.percentile(np.sum([glove_model.distances(x) for x in cluster if x != word], axis=0), 1)
                # keep = [cand[i] for i in range(len(cand2clust_dists)) if cand2clust_dists[i] <= top_dist]
                pool = pool + list(keep)
            print(pool)
            allpool = allpool + pool
        except Exception:
            print("Sad!!")
            return None
    return allpool
def jaccard_similarity(a, b, threshold=0.5):
    """Return the Jaccard similarity ratio of a and b."""
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    # Calculate Jaccard similarity
    ratio = len(set(tokens_a).intersection(tokens_b)) / float(len(set(tokens_a).union(tokens_b)))
    return ratio
def compute_similarity(text1, text2):
    w1, w2 = tokenize(text1), tokenize(text2)
    # create a sorted common vocabulary for the two lists of words
    common_vocab = sorted(set(w1) | set(w2))
    # create count vectors of the same length for the two lists of words
    v1 = count_vectorize(w1, common_vocab)
    v2 = count_vectorize(w2, common_vocab)
    distance = compute_distance(v1, v2)
    return distance
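# --- Hedged sketch (added): one plausible shape for the helpers assumed by
# compute_similarity above. The original count_vectorize / compute_distance are
# not shown here, so this illustrates the idea (raw counts + Euclidean distance)
# rather than the author's actual implementation.
import math

def count_vectorize(words, vocab):
    # one count per vocabulary entry, in vocabulary order
    return [words.count(term) for term in vocab]

def compute_distance(v1, v2):
    # Euclidean distance between two equal-length count vectors
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v1, v2)))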
def chat_with_robo():
    parser = Parser()
    flag = True
    print("The instructions for talking with me: \n",
          "If you want to finish the conversation, please type thanks or bye.\n")
    print("ROBO: Hi, my name is Robo.")
    while flag:
        message = input()
        message = message.lower()
        if message != 'bye':
            # Analyzing the input
            print('\nvocabulary: ', nltk.word_tokenize(message))
            print('\nword frequency: ',
                  nltk.FreqDist(nltk.word_tokenize(message)).most_common(10))
            # -----------
            # add part-of-speech tags to text
            # -----------
            # Tagging message with basic nltk tokenization
            print(nltk.pos_tag(nltk.word_tokenize(message)))
            # It has trouble identifying the pronoun 'I', which it tags as a noun
            # Tagging message
            # trace = 1: the parser will report the steps that it takes as it parses a text.
            # rd_parser = nltk.RecursiveDescentParser(, trace = 1)  # Review grammar
            # rd_parser = nltk.RecursiveDescentParser(nltk.ChartParser)
            rd_parser = parser.parse(message)
            i = 1
            wrong_syntax = 1
            for tree_struc in rd_parser:
                print(str(i) + ' tree_struc: ', tree_struc)
                s = tree_struc
                wrong_syntax = 0
                print("\n Correct Grammar")
                i += 1
            if wrong_syntax == 1:
                print("\n Wrong Grammar")
            # write_output_file(...
        else:
            flag = False
            print("ROBO: Bye! Take care.")
def is_text_initial(term, text, start_within=5, ignore_case=True):
    if type(term) != type(list()):
        if ignore_case:
            term = term.lower()
        term_tokens = tokenize(term)
    else:
        term_tokens = [token.lower() for token in term] if ignore_case else term
    if type(text) != type(list()):
        if ignore_case:
            text = text.lower()
        text_tokens = tokenize(text)
    else:
        text_tokens = [token.lower() for token in text] if ignore_case else text
    spacey_text = ' ' + (' '.join(text_tokens[:start_within - 1 + len(term_tokens)])) + ' '
    spacey_term = ' ' + (' '.join(term_tokens)) + ' '
    return spacey_term in spacey_text
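# --- Hedged usage note (added): is_text_initial reports whether `term` occurs
# within roughly the first `start_within` tokens of `text` (case-insensitive by
# default). Assuming a plain word tokenizer is bound to `tokenize`:
#   is_text_initial("data science", "Data science methods for text")      # True
#   is_text_initial("text", "Data science methods for mining raw text")   # False ("text" is the 7th token)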
def predict(text):
    tokens = tokenize(text)
    i = 0
    prep, origin = prepare_tokens(tokens[i:i + 200])
    pred = model.predict(prep)
    origin = fix_holes(origin, pred)
    while i * 200 < len(origin):
        p, o = prepare_tokens(tokens[i:i + 200])
        i += 1
        pred = model.predict(p)
        o = fix_holes(o, pred)
        origin = origin + o
    return origin, avg(origin)
    # print(prepare_text(a), model.predict(pa))
def process_line(num, line):
    global docs
    docs.append(doc2vec.LabeledSentence(words=nltk.word_tokenize(line),
                                        labels=["SENT_" + str(num)]))
    if len(docs) > 100:
        doc2vec_model.build_vocab(docs)
        random.shuffle(docs)  # shuffle in place, then train on the shuffled docs
        doc2vec_model.train(docs)
        docs = []
def load_lists():
    truelist = set()
    phrase_truelist = defaultdict(set)
    module_file = inspect.getfile(inspect.currentframe())
    module_dir = os.path.dirname(os.path.abspath(module_file))
    truelist_file = os.path.join(module_dir, "truelist")
    for line in open(truelist_file):
        line = line.split("#")[0].strip()
        if line == "":
            continue
        assert not any(is_hyphen(c) for c in line), \
            f'Truelist entries should not contain hyphens: {line}'
        if ' ' not in line:
            truelist.add(line)
        else:
            toks = tuple(tokenize(line))
            phrase_truelist[len(toks)].add(toks)  # group phrases by number of tokens
    phrase_truelist = sorted(phrase_truelist.items(), reverse=True)  # bins sorted by phrase length
    special_file = os.path.join(module_dir, "special-case-titles")
    with open(special_file) as inF:
        special_titles = {line.strip().lower(): line.strip() for line in inF if line.strip()}
    amodifiers = (
        'North', 'South', 'East', 'West', 'Northeast', 'Northwest', 'Southeast', 'Southwest',
        'Central', 'Northern', 'Southern', 'Eastern', 'Western',
        'Northeastern', 'Northwestern', 'Southeastern', 'Southwestern',
        'Modern', 'Ancient',
    )  # use subsequent word to determine fixed-case; will miss hyphenated modifiers (e.g. South-East)
    ndescriptors = (
        'Bay', 'Coast', 'Gulf', 'Island', 'Isle', 'Lake', 'Republic', 'University',
    )  # use preceding word to determine fixed-case
    return truelist, phrase_truelist, special_titles, amodifiers, ndescriptors
def junk_count(html):
    tokens = tokenize(BS(html).get_text())
    tokens = [token for token in tokens
              if not token.isalpha() and token not in string.punctuation]
    return len(tokens)
def predict(text):
    tokens = tokenize(text)
    i = 0
    prep, origin = prepare_tokens(tokens[i:i + 200])
    pred = model.predict(prep)
    origin = fix_holes(origin, pred)
    score = overall_model.predict(prep)[0][0]
    cnt = 1
    while i * 200 < len(origin):
        p, o = prepare_tokens(tokens[i:i + 200])
        i += 1
        pred = model.predict(p)
        o = fix_holes(o, pred)
        origin = origin + o
        score += overall_model.predict(p)[0][0]  # score the current chunk
        cnt += 1
    return origin, score / cnt
    # print(prepare_text(a), model.predict(pa))
def prepare_text(text):
    tokens = tokenize(text)
    original = []
    filteredTokens = []
    for x in tokens:
        if x in w2v.vocab:
            filteredTokens.append(x)
            original.append(0.0)
        else:
            original.append(None)
    # for word in filteredTokens:
    #     output.append(w2v.word_vec(word))
    output = list(map(lambda word: w2v.word_vec(word), filteredTokens))
    if len(output) > 200:
        output = output[:200]
    else:
        while len(output) < 200:
            output.append([0.0] * 300)
    return np.array([output]), original
def get_tokenized_dialog_lines(iterable_dialog_lines):
    """
    Tokenizes with the nltk tokenizer, adds START_TOKEN and EOS_SYMBOL.
    :param iterable_dialog_lines: IterableSentences
    :return: IterableSentences
    """
    return iterable_dialog_lines.add_postprocessing(
        lambda x: [START_TOKEN] + tokenize(x) + [EOS_SYMBOL])
def rank_sentences(tagdict, tags, topK, cooccurances, probs):
    stop_lst = set(filter_reviews.get_stop_lst())
    punctuation = set(['.', ',', '?', '!', '\'', '\"', '`', '``', '*', '-', '/', '+'])
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer2 = RegexpTokenizer(r'(\w|\')+')
    tag_sentence_rank = {}
    stemmer = PorterStemmer()
    tokenize = tokenizer2.tokenize
    stem = stemmer.stem
    for tag in tags:
        candidates = tagdict[tag]
        if len(candidates) <= topK:
            tag_sentence_rank[tag] = candidates
            continue
        scores = []
        for (reviewIdx, sentence) in candidates:
            score = 0
            tokens = tokenize(sentence)
            clean_line = [stem(token) for token in tokens
                          if token not in stop_lst
                          and token not in punctuation
                          and token.isalpha()]
            score = (score + score_sentence_tag(clean_line, tag, cooccurances, probs) + 1) / (1.0 * len(clean_line) + 1.0)
            scores.append((score, (reviewIdx, sentence)))
        # max(scores)
        ret = sorted(scores, key=lambda score_sent: score_sent[0], reverse=True)[:topK]
        tag_sentence_rank[tag] = ret
        # for i, pair in enumerate(candidates):
        #     if scores[i] >= lowest_score
    return tag_sentence_rank
def post(self):
    text = self.get_argument("rawtext")
    relations = []
    entities = []
    tokens = []
    IN = re.compile(r'.*\bin\b')
    doc.headline = ['a']

    def tokenize(text):
        for sentence in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence))):
                if hasattr(chunk, 'node'):
                    if chunk.node != 'GPE':
                        tmp_tree = nltk.Tree(chunk.node, [(' '.join(c[0] for c in chunk.leaves()))])
                    else:
                        tmp_tree = nltk.Tree('LOCATION', [(' '.join(c[0] for c in chunk.leaves()))])
                    tokens.append(tmp_tree)
                    entities.append(tmp_tree)
                else:
                    tokens.append(chunk[0])
        return tokens

    def extract_people_in_locations():
        for rel in nltk.sem.extract_rels('PERSON', 'LOCATION', doc, corpus='ieer', pattern=IN):
            filler_tokens = dict(nltk.pos_tag(nltk.word_tokenize(rel['filler'])))
            tmp = rel['subjtext'] + " is in " + rel['objtext']
            relations.append(tmp)

    doc.text = tokenize(text)
    # print doc.text
    extract_people_in_locations()
    self.render("extractor_post.html", text=text, entities=entities, relations=relations)
def comparison(p, plist):
    # print(p, plist)
    plisttok = tokenize_sents(plist)
    ptok = tokenize(p)
    # print(plisttok)
    data = rank_one.article2queries(plisttok, ptok, 1)
    return data[0]['driver']
def load_file(path):
    text = ''
    with open(path, 'r') as f:
        text = f.read()
    tokens = tokenize(text)
    filteredTokens = filter(lambda x: x in w2v.vocab, tokens)
    filteredTokens = list(filteredTokens)
    output = []
    # for word in filteredTokens:
    #     output.append(w2v.word_vec(word))
    output = list(map(lambda word: w2v.word_vec(word), filteredTokens))
    if len(output) > timesteps:
        output = output[:timesteps]
    else:
        while len(output) < timesteps:
            output.append([0.0] * 300)
    return np.array(output)
def __iter__(self):
    with open(self.file, 'r') as fp:
        line = fp.readline()
        while line:
            if line != '':
                tockenLine = ''.join(tokenize(line))
                word_sentences = [word for word in tockenLine.split()]
                yield word_sentences
            line = fp.readline()  # advance to the next line
def preProcess(df):
    df.sentence = tokenize(df)
    df.sentence = removePunctuation(df)
    df.sentence = textNormalize(df)
    df.sentence = toLower(df)
    df.sentence = stemming(df)
    # return tfidf(df)
    return df.sentence
def __iter__(self):
    with open(self.file, 'r') as fp:
        line = fp.readline()
        while line:
            if line != '':
                tockenLine = ''.join(tokenize(line))
                word_sentences = [word for word in tockenLine.split()]
                yield word_sentences
            line = fp.readline()  # advance to the next line
def train(self):
    with open("../../LSTM/data/sentiment/trainsentence_and_label_binary.txt", 'r') as filedata:
        data = filedata.readlines()
    tokenized_sentences_with_labels = []
    for sent in data:
        tokenized = nltk.word_tokenize(sent.lower())
        tokenized_sentences_with_labels.append((int(tokenized[0]), tokenized[1:]))
def generateTextObservations(text):
    X = []
    text_tokens = nltk.word_tokenize(text)
    for word in text_tokens:
        X.append(vec[word])
    return X
def process_text():
    try:
        for words in tokenized[:5]:
            tokenized_words = nltk.word_tokenize(words)
            part_of_speech_tag = nltk.pos_tag(tokenized_words)
            print(part_of_speech_tag)
    except Exception as e:
        print(str(e))
def shinglize(s, n):
    """Return size-n shingles for the string s."""
    shingles = set()
    tokens = tokenize(s)
    for i in range(len(tokens) - n + 1):
        shingles.add('_'.join(tokens[i:i + n]))
    return shingles
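# --- Hedged usage note (added): with a whitespace-style `tokenize` in scope,
# 2-shingles of a short string would look like:
#   shinglize("the quick brown fox", 2)
#   # -> {'the_quick', 'quick_brown', 'brown_fox'}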
def shinglize(s, n):
    """Return size-n shingles for the string s."""
    shingles = set()
    tokens = tokenize(s)
    for i in range(len(tokens) - n + 1):
        shingles.add('_'.join(tokens[i:i + n]))
    return shingles
def word_counts(html):
    lem = WordNetLemmatizer()
    tokens = tokenize(BS(html).get_text())
    tokens = [lem.lemmatize(token.lower()) for token in tokens
              if token not in string.punctuation]
    return make_dict(tokens)
def get_tokens(string):
    global stop_words
    rtn = []
    tokens = tokenize(string)
    for token in tokens:
        if token.isalnum() and token not in stop_words:
            rtn.append(token)
    return rtn
def _processline(self, line):
    tokens = ["__START"] + tokenize(line) + ["__END"]
    previous = "__END"
    for token in tokens:
        self.unigram[token] = self.unigram.get(token, 0) + 1
        current = self.bigram.get(previous, {})
        current[token] = current.get(token, 0) + 1
        self.bigram[previous] = current
        previous = token
def getBagOfWords(categories, stars, maxNumberOfReviewToUse):
    reviews = getSetOfReviews(categories, stars, maxNumberOfReviewToUse)
    output = {}  # string: int
    for currentReview in reviews:
        for token in nltk.word_tokenize(currentReview):
            if token in output:
                output[token] += 1
            else:
                output[token] = 1
    return output
def compute_prob_line(self, line, methodparams={}):
    # this will add __START to the beginning of a line of text,
    # compute the probability of the line according to the desired model,
    # and return the probability together with the number of tokens
    tokens = ["__START"] + tokenize(line) + ["__END"]
    acc = 0
    for i, token in enumerate(tokens[1:]):
        acc += math.log(self.get_prob(token, tokens[:i + 1], methodparams))
    return acc, len(tokens[1:])
def count_tokens_in_chunk(idx, chunk):
    print("Processing chunk", idx)
    counts = Counter()
    for dialog in chunk:
        for utterance in dialog:
            tokens = tokenize(utterance["text"])
            counts += Counter(tokens)
    return counts
def word_tokens(text_or_stream):
    def tokenize(text):
        hold_back = None
        skip = False
        for word in nltk.tokenize.word_tokenize(text):
            if hold_back is not None:
                if word == hold_back[0]:
                    yield Token(hold_back[0])
                    yield Token(hold_back[1])
                    yield Token(word)
                    skip = True
                else:
                    yield Token(hold_back[0] + hold_back[1])
                hold_back = None
            if not skip:
                if word.startswith(Token.APOSTROPHE):
                    # Use hold_back to fix tokenization errors of the form:
                    # | input  | output  | expected |
                    # | ------ | ------- | -------- |
                    # | 'word' | 'word ' | ' word ' |
                    hold_back = (word[0], word[1:])
                else:
                    hold_back = None
                if hold_back is None:
                    yield Token(word)
            skip = False
        if hold_back is not None:
            yield Token(hold_back[0] + hold_back[1])

    if isinstance(text_or_stream, str):
        for token in tokenize(text_or_stream):
            yield token
    else:
        for text in text_or_stream:
            for token in tokenize(text):
                yield token
def punc_count(html):
    tokens = tokenize(BS(html).get_text())
    count = 0
    for token in tokens:
        if len(token) > 1:
            for char in token:
                if char in string.punctuation:
                    count += 1
                    break
    return count
def compute_yules_k_for_text(sentence):
    tokens = tokenize(sentence)
    counter = Counter(token.upper() for token in tokens)
    # compute number of word forms in a given sentence/text
    m1 = sum(counter.values())
    m2 = sum([frequency ** 2 for frequency in counter.values()])
    # compute Yule's K measure and return the value
    yules_k = 10000 / ((m1 * m1) / (m2 - m1))
    return yules_k
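# --- Added note (hedged): the expression above is algebraically Yule's
# characteristic K,
#     K = 10^4 * (M2 - M1) / M1^2,
# where M1 is the total number of tokens and M2 is the sum of squared token
# frequencies. A text in which every token occurs exactly once gives M2 == M1
# and would raise ZeroDivisionError here.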
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
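# --- Hedged sketch (added): one plausible shape for the helpers assumed by
# tfidf() above. The original sublinear_term_frequency / inverse_document_frequencies
# are not shown, so this is an illustration, not the author's code.
import math

def sublinear_term_frequency(term, tokenized_document):
    # 1 + log(count), or 0 when the term does not appear
    count = tokenized_document.count(term)
    return 1 + math.log(count) if count > 0 else 0

def inverse_document_frequencies(tokenized_documents):
    # smoothed idf over the whole collection
    all_terms = {term for doc in tokenized_documents for term in doc}
    n_docs = len(tokenized_documents)
    return {term: math.log(n_docs / (1 + sum(1 for doc in tokenized_documents if term in doc)))
            for term in all_terms}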
def is_match(a, b):
    """Check if a and b are matches."""
    pos_a = map(get_wordnet_pos, nltk.pos_tag(tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
                for token, pos in pos_a
                if pos == wordnet.NOUN
                and token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
                for token, pos in pos_b
                if pos == wordnet.NOUN
                and token.lower().strip(string.punctuation) not in stopwords]
    # Calculate Jaccard similarity
    intersect = set(lemmae_a).intersection(lemmae_b)
    union = set(lemmae_a).union(lemmae_b)
    return len(intersect) / float(len(union))
def get_lines_for_validation(validation_set_path, index_to_token):
    with codecs.open(validation_set_path, 'r', 'utf-8') as dataset_fh:
        lines = dataset_fh.readlines()
        lines = [tokenize(line.strip()) for line in lines]
    screened_lines = get_transformed_dialog_lines(lines, index_to_token.values())
    # return a true array, not an iterator
    lines_for_validation = []
    for line in screened_lines:
        lines_for_validation.append(line)
    return lines_for_validation
def parse_text_to_stems(language, text, min_length=3):
    """
    Parse a text attribute performing cleanup, tokenization, stemmization and removal of stop-words.

    :param language: The text language, relevant for stemmization.
    :param text: The text to be stemmized.
    :param min_length: The minimum number of characters that a word must have; otherwise it is discarded.
    :returns: A list of terms.
    """
    text = re.sub(" +", " ", text).lower()
    tokens = tokenize(text)
    stems = get_stems(tokens, language)
    return remove_stopwords(stems, language, min_length)
def file2tokenized_list(files, lower=True, encoding="utf-8"):
    """
    Takes a filepath, or a list of filepaths, and returns lists of tokenized strings.

    You can also feed it a dictionary of file path lists, where each key of the
    dictionary represents some category. If you feed it a dictionary, the output
    will be a tuple with 3 values:

        tokenized_list = the usual lists of tokenized strings.
        labels = a list of integer labels corresponding to the category that each
                 element of tokenized_list belongs to.
        cats = a list of the unique category names. Indices of the names correspond
               to the integer values used in labels, such that cats[labels[i]] gives
               the original name of the category that the ith training example
               belongs to.

    :param files: (str, list of strings, or dict) A single file path string, a list
        of file path strings, or a dict of lists of file paths keyed by category.
    :param lower: (bool)(default = True) Convert all text to lowercase?
    :param encoding: (str)(default = "utf-8") Encoding used in the text files.
    :return: A list of lists of tokenized strings (if files is a string or a list
        of strings), or a tuple of 3 items if files is a dictionary containing
        lists of strings (see the description above for details of the 3 elements
        returned).
    """
    # ==========================================================================
    # TODO: add argument `replacements`, a dictionary of regex replacements:
    #       whenever some pattern is encountered, replace it with other text.
    print("Generating a tokenised list from files")
    if isinstance(files, dict):
        return dict_file2tokenized_list(files, lower, encoding)
    if isinstance(files, str):
        files = [files]
    num_items = len(files)
    tokenized_list = ["MISSING"] * num_items  # Will store the tokenised text
    for i in range(num_items):
        with open(files[i], "r") as textFile:
            text = textFile.read()
            text = text.decode(encoding)  # Python 2 style: decode the raw str
            if lower:
                text = text.lower()
            tokenized_list[i] = tokenize(text)
    print("---Done!")
    return tokenized_list
def main():
    data = "data/top_100_entities.txt"
    pathToData = "data/funnyReviews/"
    fileName = "rev_data_"
    suffix = ".txt"
    lines = []
    with open(data, 'r') as f:
        lines = f.readlines()
    entity_weight = []
    wn_entities = []
    for line in lines:
        key = line.split(",")[0].split(":")[1].strip()
        value = line.split(",")[1].split(":")[1].strip()
        #print wn.synsets(key, pos=wn.NOUN)
        wn_entities.append(wn.synsets(key, pos=wn.NOUN)[0])
        entity_weight.append((key, value))
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer2 = RegexpTokenizer(r'(\w|\')+')
    stemmer = PorterStemmer()
    tokenize = tokenizer2.tokenize
    stem = stemmer.stem
    stop_lst = get_stop_lst()
    punctuation = set(['.', ',', '?', '!', '\'', '\"', '`', '``', '*', '-', '/', '+'])
    review = ""
    with open(pathToData + fileName + str(1) + suffix, 'r') as f:
        review = f.read()
    review_lines = tokenizer.tokenize(review.lower())
    scores = []
    for sentence in review_lines:
        tokens = tokenize(sentence)
        clean_line = [stem(token) for token in tokens
                      if token not in stop_lst
                      and token not in punctuation
                      and token.isalpha()]
        #print clean_line
        for item in clean_line:
            word1 = wn.synsets(item, pos=wn.NOUN)
            if len(word1) > 0:
                if word1[0] in wn_entities:
                    print word1[0]
                    # score = (score + score_sentence_tag(clean_line, tag, cooccurances, probs) + 1) / (1.0 * len(clean_line) + 1.0)
    return 0
def mark_possible_duplicates(dict_list, key):
    """Mark the possible duplicate strings."""
    number_of_titles = len(dict_list)
    full_match_criteria = 0.9
    # Build a list of token sets for the strings in the key
    token_list = []
    for i in range(number_of_titles):
        # Remove stop words to create the token set
        value = tokenize(dict_list[i][key])
        value = remove_stop_words(value)
        token_list.append(value)
    # Dict of objects with the structure {(i, j): score}
    similarity_map = {}
    score_threshold = 0.5
    # Indexes of tokens will match indexes in dict_list
    for i in range(number_of_titles):
        # Brute-force comparison of all items (O(n^2)/2); seems fast enough for this
        for j in range(i + 1, number_of_titles):
            score = get_token_set_match_ratio(token_list[i], token_list[j])
            if score >= score_threshold:
                similarity_map[(i, j)] = score
    partial_matches = 0
    full_matches = 0
    for k, val in sorted(similarity_map.iteritems()):
        # print(k)
        # print(val)
        if val < full_match_criteria:
            # print(dict_list[k[0]][key]).encode('utf-8')
            # print(dict_list[k[1]][key]).encode('utf-8')
            # print('Similarity score: ' + str(val))
            partial_matches += 1
        else:
            mark_database_for_full_match(k, dict_list)
            full_matches += 1
    add_match_clusters(similarity_map, dict_list)
    remove_match(similarity_map, dict_list, threshold=full_match_criteria)
    remove_ids_for_corrected_clusters(dict_list)
    print('Partial matches: ' + str(partial_matches))
    print('Full matches: ' + str(full_matches))
    return dict_list
def dict_file2tokenized_list(files, lower=True, encoding="utf-8"):
    """
    Takes a dictionary of file path lists, where each key of the dictionary
    represents some category. The output will be a tuple with 3 values:

        tokenized_list = lists of tokenized strings.
        labels = a list of integer labels corresponding to the category that each
                 element of tokenized_list belongs to.
        cats = a list of the unique category names. Indices of the names correspond
               to the integer values used in labels, such that cats[labels[i]] gives
               the original name of the category that the ith training example
               belongs to.

    :param files: (dict) Dictionary mapping each category name to a list of file
        path strings.
    :param lower: (bool)(default = True) Convert all text to lowercase?
    :param encoding: (str)(default = "utf-8") Encoding used in the text files.
    :return: A tuple of 3 items (see the description above for details of the 3
        elements returned).
    """
    # ==========================================================================
    cats = files.keys()
    num_per_category = {cat: len(files[cat]) for cat in cats}
    num_items = sum(num_per_category.values())
    tokenized_list = ["MISSING"] * num_items  # Will store the tokenised text
    labels = ["MISSING"] * num_items  # Will store the labels
    running_index = 0
    for cat_i, cat in enumerate(cats):
        for example_i in range(num_per_category[cat]):
            with open(files[cat][example_i], "r") as textFile:
                text = textFile.read()
                text = text.decode(encoding)  # Python 2 style: decode the raw str
                if lower:
                    text = text.lower()
                tokenized_list[running_index] = tokenize(text)
                labels[running_index] = cat_i
                running_index += 1
    print("---Done!")
    return (tokenized_list, labels, cats)
def get_prediction(inp):
    tokens = tokenize(inp)
    if len(tokens) > timesteps:
        import sys
        sys.stderr.write("Exceeding allowed input length. "
                         "Cutting off after {} tokens.".format(timesteps))
        tokens = tokens[:timesteps]  # cut off after max model timesteps
    vecs = vectorizer.transform([tokens])  # list of seqs
    vecs = vecs.repeat(batch_size, axis=0)
    input_lengths = np.array([len(tokens)] * batch_size)
    pred, _ = model.step(session, task, vecs, input_lengths, mode="decode")
    pred = pred.reshape([batch_size, timesteps, -1])

    # Find all candidate words per timestep (all words from top k clusters)
    candidates = {}
    for t in range(timesteps):
        if np.argmax(pred[0][t]) == dio.PAD_ID:
            break
        candidates[t] = set()
        # get the top k clusters
        # (http://stackoverflow.com/questions/6910641/)
        topclusters = np.argpartition(pred[0][t], -k_clusters)[-k_clusters:]
        for c in topclusters:
            candidates[t].update(task.i2l[c])  # expand with this cluster

    # Find the optimal sequence from the candidate words using beam search
    global lm
    if not lm:
        lm = kenlm.Model('data/lms/en-70k-0.2-pruned.lm')
    hypos = ["<s>"]
    for t in range(len(candidates)):
        t_hypos = []
        content_words = [w for w, t in nltk.pos_tag(tokens) if t in CONTENT_POS]
        if prune:
            copies = candidates[t].intersection(set(content_words))
            if copies:
                candidates[t] = copies
        for cand in candidates[t]:
            for h in hypos:
                cand_t = h + " " + cand
                score = lm.score(cand_t)  # get language model score
                t_hypos.append((cand_t, score))
        # keep the beam_width highest-scoring hypotheses
        hypos = [h for h, s in sorted(t_hypos, key=lambda x: x[1])[-beam_width:]]
    return hypos[:-k_best:-1]  # k highest-scoring hypos, reversed
def get_input_sequence(sentence):
    """
    Prepare chatbot's input by tokenizing the sentence and adding necessary punctuation marks.

    Input: "So what's up, buddy"
    Output: ["so", "what", "'", "s", "up", ",", "buddy", ".", "$$$"]
    """
    if not sentence:
        return [START_TOKEN, EOS_SYMBOL]
    # add a dot to the end of the sentence in case there is no punctuation mark
    if sentence[-1] not in _PUNKT_MARKS:
        sentence += '.'
    sequence = [START_TOKEN] + tokenize(sentence) + [EOS_SYMBOL]
    return sequence
def json_converter_dfd_orig():
    file_list = make_filelist(raw_doc_path_dfd_orig)
    for filepath in file_list:
        dfd_orig_one_doc_map = {}
        # split the labels out of the filename
        label_list = (os.path.basename(filepath)).split('_')[:-1]
        # tokenized_document is a list
        tokenized_document = tokenize(filepath)
        # not sure about Dutch, but lowercase everything just in case
        tokenized_document = [t.lower() for t in tokenized_document]
        dfd_orig_one_doc_map['labels'] = label_list
        dfd_orig_one_doc_map['doc_str'] = tokenized_document
        print filepath
        with codecs.open(json_doc_path_dfd_orig + os.path.basename(filepath), 'w', 'utf-8') as json_content:
            json.dump(dfd_orig_one_doc_map, json_content, ensure_ascii=False, indent=4)
def train(self):
    for line in self:
        self.i += 1
    self.minimum = int(round(math.log(self.i, 10)))
    i = 0
    for line in self:
        tokens = tokenize(decode(line).lower())
        targets = self.get_targets(tokens)
        for gram in targets:
            self.posterior[gram] += 1
            self.grams_by_line[i].add(gram)
            for token in gram:
                self.prior[token] += 1
        i += 1
    # print self.prior.most_common(10)
    # print self.posterior.most_common(10)
    self.crunch()
def calculate_tf(lang="", doc=""):
    """
    Returns a map with all non-stopwords and their respective frequencies.

    Ex: {"work": 1, "going": 1}
    """
    tf_by_stem = {}
    # Cleaning document
    doc = re.sub(" +", " ", doc).lower()
    tokens = remove_stopwords(tokenize(doc), lang, min_len=3, max_len=30)
    stems = get_stems(tokens, lang)
    for stem in stems:
        tf_by_stem[stem] = tf_by_stem.get(stem, 0) + 1
    return tf_by_stem
def main():
    try:
        reader = csv.reader(open(args.files[0]))
        next(reader)  # skip header row
    except:
        print "Error: could not read ", args.files[0]
    try:
        writer = open(args.files[1], "w")
    except:
        print "Error: could not write to file", args.files[1]

    # stopwords = nltk.corpus.stopwords.words('english')
    POS_TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'LS', 'MD',
                'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
                'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBG', 'VBN',
                'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

    # First column (label) for each input line should be either 0 or 1 (train) or row number (test)
    for line in reader:
        if line[0] < 0:  # Labels in test and train should be at least 0
            continue
        else:
            label = line[0]
            if label == "0":
                label = "-1"
            lineout = label  # Initialize lineout to be the label
            if NAMESPACE == "C":
                # No parts of speech
                tokens = tokenize(line[2])
                lineout = lineout + " |C " + (''.join(str(x) for x in tokens))
            elif NAMESPACE == "POS":
                # Use each part of speech as a namespace for VW
                tokens = tokenize_pos(line[2])
                for key in tokens:
                    if key not in POS_TAGS:
                        continue
                    else:
                        lineout = lineout + " |" + key + " " + (''.join(str(x) for x in tokens[key])) + " "
            else:
                print "ERROR - only two namespace options defined"
                exit()
            writer.write(lineout + "\n")
    writer.close()
def find_sentences_from_reviews(tag_review_dict):
    d = {}
    print "looking for sentences"
    stop_lst = set(get_stop_lst())
    tokenize = tokenizer2.tokenize
    stem = stemmer.stem
    for tag, reviews in tag_review_dict.iteritems():
        print "current tag: ", tag
        sentences = []
        for (idx, review) in reviews:
            lines = tokenizer.tokenize(review)
            for line in lines:
                #tokens = nltk.wordpunct_tokenize(line)
                tokens = tokenize(line)
                clean_line = [stem(token) for token in tokens
                              if token not in stop_lst
                              and token not in punctuation
                              and token.isalpha()]
                if tag in clean_line:
                    sentences.append((idx, line))
        d[tag] = sentences
    return d
# <codecell>

nltk.download()

# <markdowncell>

# Density
# =======

# <codecell>

from nltk import word_tokenize as tokenize

# <codecell>

nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog."))

# <codecell>

nltk.pos_tag(tokenize("If I were you I wouldn't do that with these."))

# <markdowncell>

# Create a density checker

# <codecell>

import re

matches = lambda x, re_parts: any([re.findall(y, x) for y in re_parts])
def get_tokenized_dialog_lines(iterable_dialog_lines):
    for line in iterable_dialog_lines:
        tokenized_dialog_line = tokenize(line)
        tokenized_dialog_line = [START_TOKEN] + tokenized_dialog_line + [EOS_SYMBOL]
        yield tokenized_dialog_line
thisTweet['minute_utc'] = float(created_utc.minute)
thisTweet['day_time_utc'] = thisTweet['day_of_week_utc'] + float(created_utc.hour) / 24.0
created_est = created_utc.replace(tzinfo=pytz.utc).astimezone(localTz)
thisTweet['date_est'] = str(created_est.date())
thisTweet['day_of_week_est'] = created_est.weekday()
thisTweet['weekend_est'] = thisTweet['day_of_week_est'] >= 5
thisTweet['time_est'] = float(created_est.hour) + float(created_est.minute) / 60
thisTweet['minute_est'] = float(created_est.minute)
thisTweet['day_time_est'] = thisTweet['day_of_week_est'] + float(created_est.hour) / 24.0

# Org features.
thisTweet['org'] = org
thisTweet['org_category'] = orgData['category'].lower()
thisTweet['social_flow_user'] = orgData['socialFlow']
thisTweet['followers_count'] = twAccounts[org]['followers_count']

# Sentiment features.
thisTweet['word_count'] = len([t for t in tokenize(cleanMessage(tweet['text'])) if t not in skipTokens])
sent = sentiment.get(str(tweet['id']))
if sent:
    thisTweet['sentiment_class'] = sent['class']
    thisTweet['sentiment_score_positive'] = sent['meanScorePosSig']
    thisTweet['sentiment_score_negative'] = sent['meanScoreNegSig']
else:
    thisTweet['sentiment_class'] = None
    thisTweet['sentiment_score_positive'] = None
    thisTweet['sentiment_score_negative'] = None

# Tweet outcomes.
thisTweet['favorites'] = tweet['favorite_count']
thisTweet['retweets'] = tweet['retweet_count']

# Bitly features. This is going to be tougher.
# coding: utf-8
import sys; print('Python %s on %s' % (sys.version, sys.platform))
sys.path.extend(['/home/rharriso/Code/Python/NLTKWorkspace'])
# Page 79, Natural Language Processing with Python
from __future__ import division
import nltk, re, pprint
f = open('ASOIAF/A Clash of Kings A Song of Ice and Fire Book 2_nodrm.txt')
txt = f.read()
# Earlier attempts from the session that raise errors (nltk.text and
# nltk.tokenize are modules, not callables, and `text` was not yet defined):
# text = nltk.text(text)
# text = nltk.text(nltk.tokenize(text))
# text = nltk.text(nltk.word_tokenize(text))
nltk.word_tokenize(txt)
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)
text.concordance("Arya")

#
# Load a feed
#
import feedparser
llog = feedparser.parse("http://rharriso.github.io/feed.xml")
llog['feed']
llog['feed']['title']
len(llog.entries)
# post = llog[2]  # session misstep: the parsed feed is not indexable this way
llog.entries[2]
post = llog.entries[2]
post.content