def __init__(self):
    self.split_dash = True
    self.split_single_quote = False
    self.split_period = False
    self.split_comma = False

    # Unicode character classes to split on
    resplit = r"\p{Pd}\p{Po}\p{Pe}\p{S}\p{Pc}"
    # Optional exceptions: for these characters we trust nltk
    # to split correctly
    dont_split = ""
    if not self.split_dash:
        dont_split += r"\-"
    if not self.split_single_quote:
        dont_split += "'"
    if not self.split_period:
        dont_split += r"\."
    if not self.split_comma:
        dont_split += ","

    resplit = "([" + resplit + "]|'')"
    if len(dont_split) > 0:
        split_regex = r"(?![" + dont_split + "])" + resplit
    else:
        split_regex = resplit

    self.split_regex = regex.compile(split_regex)

    try:
        self.sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    except LookupError:
        logging.info("Downloading NLTK punkt tokenizer")
        nltk.download('punkt')
        self.sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    self.word_tokenizer = nltk.TreebankWordTokenizer()
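# A self-contained sketch of what the default pattern built above splits on
# (assumption: the third-party `regex` module, which supports \p{...}
# Unicode categories). With the default flags, dashes are split out while
# periods, commas, and single quotes are left for nltk to handle:
import regex

pattern = regex.compile(r"(?!['\.,])([\p{Pd}\p{Po}\p{Pe}\p{S}\p{Pc}]|'')")
print(pattern.split("re-ran"))  # ['re', '-', 'ran']
print(pattern.split("U.S."))    # ['U.S.'] -- the period is in dont_split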
def __init__(self, lang):
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        for i in range(3):
            signal.signal(signal.SIGALRM, timeout)
            signal.alarm(120)
            try:
                nltk.download('punkt', quiet=True)
                signal.alarm(0)
                break
            except myTimeout:
                pass
        else:
            raise Exception(
                "Unable to download 'punkt' NLTK data after 3 retries: "
                "try to download it manually or check your internet connection."
            )
    langname = self.getLanguageName(lang.lower())
    try:
        self.segmenter = load('tokenizers/punkt/{0}.pickle'.format(langname))
    except LookupError:
        self.segmenter = load('tokenizers/punkt/english.pickle')
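# The retry loop above assumes a SIGALRM handler named `timeout` and a
# custom exception `myTimeout`, neither of which appears in the excerpt;
# a minimal sketch of what they might look like:
import signal


class myTimeout(Exception):
    """Raised when the alarm fires before nltk.download() returns."""


def timeout(signum, frame):
    raise myTimeout()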
def Q1():
    text = []
    # nltk.load() returns a .txt resource as one string, so split it into
    # a word list before doing membership tests
    stopwords = nltk.load('stopwords.txt').split()
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
        '#', '$', '%', '"', '\'s', '``', "''"
    ]
    for i in range(1, 11):
        path = '/txt/' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [word.lower() for word in temp]
        temp = [word for word in temp if word not in english_punctuations]
        temp = [word for word in temp if word not in stopwords]
        text.append(temp)
    # print(text)

    wordlist = []
    for sens in text:
        for word in sens:
            if word not in wordlist:
                wordlist.append(word)
    print(len(wordlist))

    wb = xlwt.Workbook()
    ws = wb.add_sheet('TF')
    for i, word in enumerate(wordlist):
        ws.write(i + 1, 0, label=word)
    for i in range(10):
        ws.write(0, i + 1, label='text' + str(i + 1))
    for i in range(10):
        for j, word in enumerate(wordlist):
            ctr = 0
            for item in text[i]:
                if word == item:
                    ctr += 1
            ws.write(j + 1, i + 1, label=ctr)
    wb.save('data.xls')
def tokenize_corpus(self, corpus):
    """Read the corpus as a list of sentences, each of which is a list of
    tokens and the spans in which they occur in the text."""
    if os.path.isdir(corpus):
        corpus_dir = corpus
        corpus = [
            os.path.join(corpus_dir, fn) for fn in os.listdir(corpus_dir)
        ]
    else:
        corpus = [corpus]

    tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    for filename in corpus:
        with open(filename) as fin:
            print(filename)
            data = fin.read()
            segment_start = 0
            for span in ps.split_quoted_quotes(data):
                for sent_tokens in split_sentences(span, tokenizer,
                                                   segment_start):
                    yield sent_tokens
                segment_start += len(span)
def tokenize_corpus(self, corpus):
    """Read the corpus as a list of sentences, each of which is a list of
    tokens and the spans in which they occur in the text."""
    if os.path.isdir(corpus):
        corpus_dir = corpus
        corpus = [
            os.path.join(corpus_dir, fn) for fn in os.listdir(corpus_dir)
        ]
    else:
        corpus = [corpus]

    tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    for filename in corpus:
        with open(filename) as fin:
            data = fin.read()
        for start, end in tokenizer.span_tokenize(data):
            sent = data[start:end]
            sent_tokens = []
            matches = re.finditer(
                r'\w+|[\'\"\/^/\,\-\:\.\;\?\!\(0-9]', sent
            )
            for match in matches:
                mstart, mend = match.span()
                sent_tokens.append(
                    (match.group(0).lower().replace('_', ''),
                     (mstart + start, mend + start))
                )
            yield sent_tokens
def Q1():
    text = nltk.load('text_0.txt', encoding='gbk')

    # code for Q1a
    # token_sentlist = nltk.sent_tokenize(text)
    #
    # token_list = []
    #
    # for sent in token_sentlist:
    #     token_list.append(nltk.word_tokenize(sent))

    token_list = nltk.word_tokenize(text)  # code for Q1a
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
        '#', '$', '%', '"', '\'s', '``', "''"
    ]
    token_list = [
        word for word in token_list if word not in english_punctuations
    ]
    token_list1 = nltk.pos_tag(token_list)
    print(len(token_list1))  # code for Q1a
    print(token_list1)       # code for Q1a

    token_list2 = [w.lower() for w in token_list]
    token_list2 = nltk.pos_tag(token_list2)
    print(token_list2)
def run(fpath):
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)

    # number of sentences that include each noun
    freq = {}
    # will hold all 4-grams with the structure:
    # not <any word> <pos/neg word> <noun>
    matched4gramsPerSent = []

    # for each sentence
    for sentence in sentences:
        matched4grams = processSentence(sentence, freq, posLex, negLex, tagger)
        matched4gramsPerSent.append(matched4grams)

    freqNouns = getTop3(freq)
    # atts=None#getAtts() #['bike','size']

    final4grams = set()  # final result
    for fgrams in matched4gramsPerSent:  # for each sentence
        for fg in fgrams:  # for each matched 4-gram in this sentence
            if fg[3] in freqNouns:
                final4grams.add(' '.join(fg))

    return final4grams
def __init__(self):
    self._word_tokenizer = nltk.TreebankWordTokenizer()
    if FLAGS.punkt_tokenizer_file is not None:
        self._sent_tokenizer = py_utils.load_pickle(FLAGS.punkt_tokenizer_file)
    else:
        self._sent_tokenizer = nltk.load("tokenizers/punkt/english.pickle")
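# A minimal usage sketch of the punkt/treebank pair configured above,
# exercised directly (the wrapper class itself is not shown here):
import nltk

sent_tok = nltk.load("tokenizers/punkt/english.pickle")
word_tok = nltk.TreebankWordTokenizer()
for sent in sent_tok.tokenize("Dr. Who called. He left a message."):
    print(word_tok.tokenize(sent))
# roughly: ['Dr.', 'Who', 'called', '.'] then ['He', 'left', 'a', 'message', '.']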
def run(fpath):
    # hard-coded noun frequencies (renamed from `dict`, which shadows the builtin)
    freq = {
        'idea': 30,
        'thing': 70,
        'vitaan': 9,
        'good': 7,
        'might': 88,
        'end': 99
    }
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)

    # for each sentence
    for sentence in sentences:
        print(processSentence(sentence, posLex, negLex, tagger))

    freqNouns = getTop3(freq)
    return freqNouns
def Q2b():
    # nltk.download('wordnet')
    text = nltk.load('text.txt', encoding='gbk')  # code for Q2a
    token_list = nltk.sent_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
        '#', '$', '%', '"', '\'s', '``', "''", "-"
    ]
    token_list = [nltk.word_tokenize(sen) for sen in token_list]
    new_token = []
    for sens in token_list:
        sens = [word for word in sens if word not in english_punctuations]
        new_token.append(sens)
    new_token = [nltk.pos_tag(sen) for sen in new_token]
    print(new_token)

    # create the lemmatizer once instead of once per word
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for sen in new_token:
        for word in sen:
            if "V" in word[1]:
                w = lemmatizer.lemmatize(word[0].lower(), 'v')
            else:
                w = lemmatizer.lemmatize(word[0], 'n')
            lemmatized.append(w.lower())

    # test = [WordNetLemmatizer().lemmatize(new_token)]
    # print(new_token[1])
    print(lemmatized)
def main():
    print("Loading word2vec")
    global word2vec
    word2vec = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
    tagger = load("taggers/maxent_treebank_pos_tagger/english.pickle")
    f_sentences = codecs.open(sys.argv[1], encoding="utf-8")
    invalid = list()
    valid = list()
    on = False
    for line in f_sentences:
        if line.startswith("#"):
            continue
        if line.startswith("VALID"):
            on = True
            continue
        sentence = Sentence(line.strip(), "ORG", "LOC", 6, 1, 2, tagger)
        for rel in sentence.relationships:
            t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between,
                      rel.after)
            if on is True:
                valid.append(t)
            elif on is False:
                invalid.append(t)
    f_sentences.close()

    for v in valid:
        for i in invalid:
            score = similarity_3_contexts(v, i)
            print("VALID", v.e1, v.e2, "\t", v.bet_words)
            print("INVALID", i.e1, i.e2, "\t", i.bet_words)
            print(score)
def run(fpath):
    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    adjAfterAdv = []

    # for each sentence
    for sentence in sentences:
        # tokenize the sentence
        terms = nltk.word_tokenize(sentence)

        POStags = ['JJ', 'RB']  # POS tags of interest
        POSterms = getPOSterms(terms, POStags, tagger)

        adjectives = POSterms['JJ']
        adverbs = POSterms['RB']

        # get the results for this sentence
        adjAfterAdv += getAdvAdjTwograms(terms, adjectives, adverbs)

    return adjAfterAdv
def main(grammar_filename, sentence_filename, output_filename):
    # Load CNF grammar
    grammar = load(grammar_filename)

    # Generate parser based on grammar
    parser = CKYParser(grammar=grammar)

    # Iterate over sentences in sentence_filename, produce parses, and
    # write them to output_filename
    with open(sentence_filename, 'r') as infile:
        number_parses = []
        with open(output_filename, 'w') as outfile:
            for line in infile.readlines():
                # Strip any trailing whitespace from line (including newlines)
                line = line.rstrip()
                print(line)
                outfile.write(line + '\n')
                valid_parses = parser.parse_sentence(sentence=line)
                for tree in valid_parses:
                    print(tree)
                    outfile.write(str(tree) + '\n')
                print('Number of parses: %d' % len(valid_parses))
                print()
                number_parses.append(len(valid_parses))
                outfile.write('Number of parses: %d\n\n' % len(valid_parses))

    avg_number_parses = np.mean(number_parses)
    print('Average number of parses: %.3f' % avg_number_parses)
def run(fpath):
    # load the positive and negative lexicons
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)

    structList = []
    for sentence in sentences:  # for each sentence
        # get the results for this sentence
        structList += processSentence(sentence, posLex, negLex, tagger)

    return structList
def reset_model(self, model_path):
    """
    Reset the base model

    :param model_path: Model path for sentence tokenization
    """
    self.__tokenizer = nltk.load(model_path)
def ngrammer(text):
    # load a pre-trained POS tagging model
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # split into sentences; a plain '.' split is unreliable (e.g. "6.78"
    # contains a point, and a sentence may end with '?'), so use a trained
    # sentence tokenizer instead
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    # holds the adjective-noun pairs found in the text (list of 2-grams)
    nounAfterAdj = []

    # for each sentence
    for sentence in sentences:
        # tokenize the sentence (splitting on spaces alone is unreliable,
        # so use the trained word tokenizer)
        terms = nltk.word_tokenize(sentence)

        # do POS tagging on the tokenized sentence
        tagged_terms = tagger.tag(terms)

        for i in range(len(tagged_terms) - 1):  # for every tagged term
            term1 = tagged_terms[i]      # current term
            term2 = tagged_terms[i + 1]  # following term
            # re.match anchors at the start of the string, so 'JJ' also
            # matches JJR/JJS and 'NN' matches NNS/NNP
            # current term is an adjective, next one is a noun
            if re.match('JJ', term1[1]) and re.match('NN', term2[1]):
                # add the adjective-noun pair to the list
                nounAfterAdj.append((term1[0].lower(), term2[0].lower()))

    return nounAfterAdj
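# Usage sketch for ngrammer(); the exact output depends on the tagger, but
# on a simple sentence it should be roughly:
print(ngrammer("The quick brown fox jumped over the lazy dog."))
# roughly: [('brown', 'fox'), ('lazy', 'dog')]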
def transformText(self, textData, pcaModelPath):
    self.loadNltk()
    num_texts = textData.size
    columns = []
    tagdict = nltk.load('help/tagsets/upenn_tagset.pickle')
    for key in tagdict.keys():
        columns.append(str(key))
    df = pd.DataFrame(columns=columns)
    for i in range(0, num_texts):
        if i % 500 == 0:
            print("ShallowSyntax: Processed ", i, "/", num_texts)
        new_row = pd.DataFrame(index=[i], columns=columns)
        for key in tagdict.keys():
            new_row.at[i, key] = 0
        # strip quoted spans before tagging; str.replace treats the pattern
        # as a literal, so re.sub is needed here
        text = nltk.tokenize.word_tokenize(
            re.sub(r'"(.*?)"', '', str(textData[i])))
        tagged_text = nltk.pos_tag(text)
        tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_text)
        for (key, value) in tag_fd.items():
            if key in tagdict.keys():
                new_row.at[i, key] = 100 * (value / tag_fd.N()) ** 2
        df = pd.concat([df, new_row])

    self.pca = pickle.load(open(pcaModelPath, "rb"))
    transformed_features = self.pca.transform(df.values)
    self.shallow_syntax_features = pd.DataFrame(transformed_features)
def run(fpath):
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)

    output = []
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # for each sentence
    for sentence in sentences:
        # replace chars that are not letters or numbers with a space
        sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)
        # remove duplicate spaces
        sentence = re.sub(' +', ' ', sentence).strip()
        terms = nltk.word_tokenize(sentence.lower())
        c = 'e'
        output += processSentence(terms, posLex, negLex, tagger, c)

    return output
def initializer(self):
    # Use Encoder class as a container for global data
    Encoder.tokenizer = get_nmt_tokenizer(
        library=self.args.tokenizer_library,
        model_name=self.args.tokenizer_type,
        tokenizer_model=self.args.tokenizer_model,
        vocab_file=self.args.vocab_file,
        merges_file=self.args.merge_file,
        delimiter=self.args.delimiter,
    )
    if self.args.split_sentences:
        if not nltk_available:
            print("NLTK is not available to split sentences.")
            exit()
        splitter = nltk.load("tokenizers/punkt/english.pickle")
        if self.args.keep_newlines:
            # this prevents punkt from eating newlines after sentences
            Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                train_text=splitter._params, lang_vars=CustomLanguageVars())
        else:
            Encoder.splitter = splitter
    else:
        Encoder.splitter = IdentitySplitter()
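# The keep_newlines branch above assumes a PunktLanguageVars subclass that
# keeps trailing whitespace attached to each sentence; one possible
# definition (the regex tweak below is an assumption, not the verbatim
# original):
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
    _period_context_fmt = r"""
        \S*                # some word material
        %(SentEndChars)s   # a potential sentence ending
        \s*                # keep trailing whitespace with the sentence
        (?=(?P<after_tok>
            %(NonWord)s    # either other punctuation
            |
            (?P<next_tok>\S+)  # or the start of the next token
        ))"""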
def run(fpath):
    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)
    # print('NUMBER OF SENTENCES: ', len(sentences))

    reqdString = []

    # for each sentence
    for sentence in sentences:
        # replace chars that are not letters or numbers with a space
        sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)
        # remove duplicate spaces
        sentence = re.sub(' +', ' ', sentence).strip()
        reqdString += processSentence(sentence, posLex, negLex, tagger)

    return reqdString
def run(fpath): posLex = loadLexicon("positive-words.txt") negLex = loadLexicon("negative-words.txt") #make a new tagger _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle' tagger = load(_POS_TAGGER) #read the input f = open(fpath) text = f.read().strip() f.close() #split sentences sentences = sent_tokenize(text) fgram = [] # for each sentence for sentence in sentences: fgram += processSentence(sentence, posLex, negLex, tagger) #adjAfterAdv+=getAdvAdjTwograms(terms, adjectives, adverbs) return fgram
def run(fpath):
    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    with open(fpath) as f:
        text = f.read().strip()

    # split sentences
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    adjAfterAdv = []  # holds the adverb-adjective pairs found in the text

    # for each sentence
    for sentence in sentences:
        terms = nltk.word_tokenize(sentence)  # tokenize the sentence

        # do POS tagging on the tokenized sentence
        tagged_terms = tagger.tag(terms)

        for i in range(len(tagged_terms) - 1):  # for every tagged term
            term1 = tagged_terms[i]      # current term
            term2 = tagged_terms[i + 1]  # following term
            # current term is an adverb, next one is an adjective
            if re.match('RB', term1[1]) and re.match('JJ', term2[1]):
                # add the adverb-adj pair to the list
                adjAfterAdv.append((term1[0], term2[0]))

    return adjAfterAdv
def setup(self):
    self.tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(self.vocab_model_file, "rb").read())
    self.sentence_tokenizer = nltk.load(SENTENCE_TOKENIZER_PATH)
    self.delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(
        task=self.task,
        delimiter_type=self.delimiter_type,
    )
def run(path):
    # initialize list
    adjWithNoun = []

    # make a tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # load lexicon of stop words
    stopLex = set(stopwords.words('english'))

    # get raw review text from file
    with open(path) as f:
        reader = csv.reader(f)
        for row in reader:
            review = row[2]
            print(review)

            try:
                # split sentences
                sentences = sent_tokenize(review)
                # print(sentences)
                print('NUMBER OF SENTENCES: ', len(sentences))
            except:
                # skip reviews that cannot be tokenized
                print("Oops! That was not tokenizable. Try again...")
                continue

            # for each sentence
            for sentence in sentences:
                print(sentence)

                # replace chars that are not letters or numbers with a space
                sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)

                # remove duplicate spaces
                sentence = re.sub(' +', ' ', sentence).strip()

                # tokenize the lowercase sentence
                terms = nltk.word_tokenize(sentence.lower())
                print(terms)

                # POS tags of interest
                POStags = ['JJ', 'NN']
                POSterms = getPOSterms(terms, POStags, tagger)

                # get the set of adjectives and nouns
                adjectives = POSterms['JJ']
                nouns = POSterms['NN']

                # get the results for this sentence
                # call function to get ngrams
                n = 2
                adjWithNoun += getNounAdjNgrams(terms, nouns, adjectives, n)

    return adjWithNoun
def initialize_hardcoded():
    global pos_tags
    global ner_tags
    pos_tags = nltk.load('help/tagsets/upenn_tagset.pickle')
    ner_tags = [
        'GSP', 'LOCATION', 'GPE', 'ORGANIZATION', 'PERSON', 'O', 'PERSON',
        'FACILITY'
    ]
def __call__(self, text):
    if self._sent_tokenizer is None:
        self._tokenizer = nltk.load('tokenizers/punkt/english.pickle')
        self._sent_tokenizer = self._tokenizer.tokenize
    sentences = self._sent_tokenizer(text)
    tokens = []
    for sent in sentences:
        tokens.extend(nltk.word_tokenize(sent, preserve_line=True))
    return tokens
def get_lhs_terminal(grammar=load(grammar_url)):
    """
    Return a list of lhs (left-hand side) symbols whose productions are
    terminal.

    :param grammar: an nltk grammar (note: the default is loaded once, at
        function definition time)
    :return: list of lhs nonterminals
    """
    lhs_list = []
    for p in grammar.productions():
        if p.lhs() not in lhs_list and is_terminal(p.rhs()[0]):
            lhs_list.append(p.lhs())
    return lhs_list
def ent():
    random = []
    spam = []
    # nltk.load() returns a .txt resource as one string, so split it into
    # a word list before doing membership tests
    stopwords = nltk.load('stopwords.txt').split()
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
        '#', '$', '%', '"', '\'s', '``', "''"
    ]
    for i in range(1, 11):
        path = '/txt/random/tweet' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [word.lower() for word in temp]
        temp = [word for word in temp if word not in english_punctuations]
        temp = [word for word in temp if word not in stopwords]
        random.append(temp)
    for i in range(1, 11):
        path = '/txt/spam/tweet' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [word.lower() for word in temp]
        temp = [word for word in temp if word not in english_punctuations]
        temp = [word for word in temp if word not in stopwords]
        spam.append(temp)

    # random = np.array(random)
    # spam = np.array(spam)
    # print(random)
    # print(spam)

    random = [' '.join(tweet) for tweet in random]
    spam = [' '.join(tweet) for tweet in spam]
    print(calcShannonEnt(random))
    print(calcShannonEnt(spam))
def run(fpath):
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    with open(fpath) as f:
        text = f.read().strip()

    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    fourword = processSentence(sentences, posLex, negLex, tagger)
    return fourword
def PMI():
    text = []
    # nltk.load() returns a .txt resource as one string, so split it into
    # a word list before doing membership tests
    stopwords = nltk.load('stopwords.txt').split()
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
        '#', '$', '%', '"', '\'s', '``', "''"
    ]
    for i in range(1, 11):
        path = '/txt/' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [word.lower() for word in temp]
        temp = [word for word in temp if word not in english_punctuations]
        temp = [word for word in temp if word not in stopwords]
        text.append(temp)

    pairs = []
    words = nltk.load('word1.txt')
    words = nltk.word_tokenize(words)

    wb = xlwt.Workbook()
    ws = wb.add_sheet('TF')
    # print(words)

    i = 0
    for word in words:
        for sens in text:
            if word in sens:
                for item in sens:
                    if item in words and item != word:
                        temp1 = item + ' ' + word
                        temp2 = word + ' ' + item
                        if temp1 not in pairs and temp2 not in pairs:
                            pairs.append(temp2)
                            ws.write(i + 1, 0, word)
                            ws.write(i + 1, 1, item)
                            i += 1
    print(pairs)
    wb.save('PMI.xls')
def lemmatize(article):
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # map a Penn Treebank tag to a WordNet POS: 'a'djective, 'n'oun,
    # 'r' (adverb), or 'v'erb; 'j' must be in the list, otherwise
    # adjectives fall through to the noun default
    wnpos = lambda e: (('a' if e[0].lower() == 'j' else e[0].lower())
                       if e[0].lower() in ['n', 'r', 'v', 'j'] else 'n')

    lemmatizer = WordNetLemmatizer()
    words = article.split(' ')
    tagged = tagger.tag(words)
    words2 = [lemmatizer.lemmatize(t[0], wnpos(t[1])) for t in tagged]

    ret = ""
    for i in words2:
        ret += i + ' '
    return ret
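# Usage sketch; lemmatize() expects a space-separated string and returns
# the lemmatized text with a trailing space:
print(lemmatize("the cats were running"))  # roughly: "the cat be run "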
def __preloaded_nltk_tokenizer(self):
    # Code pulled out of nltk == 3.2.5
    tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    sent_tokenizer = tokenizer.tokenize

    def word_tokenize(text):
        sentences = sent_tokenizer(text)
        tokens = []
        for sent in sentences:
            tokens.extend(nltk.word_tokenize(sent, preserve_line=True))
        return tokens

    return word_tokenize
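# Why preserve_line=True matters here: it stops nltk.word_tokenize() from
# running its own sentence splitting on text that the preloaded punkt
# model has already segmented. A quick self-contained check:
import nltk

print(nltk.word_tokenize("e.g. it stays one line", preserve_line=True))
# tokenized as a single line, with no sentence re-splitting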
def process_sentence(review):
    # replace chars that are not letters or numbers with a space
    review = re.sub(r'[^a-zA-Z\d]', ' ', review)
    # remove duplicate spaces
    review = re.sub(' +', ' ', review).strip()

    # tokenize the lowercased sentence
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    terms = nltk.word_tokenize(review.lower())

    # POS tags of interest
    POStags = ["JJ", "JJR", "JJS", "RB", "RP", "NN", "NNS", "NNP", "RB",
               "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
               "CC", "DT", "TO", "UH", "WDT"]
    FinalPostTerms = get_attributes(terms, POStags, tagger)

    # remove unnecessary words, filtering the running result each time
    Particles = FinalPostTerms["RP"]
    cleanterms = clean(terms, Particles)
    ProperNames = FinalPostTerms["NNP"]
    cleanterms = clean(cleanterms, ProperNames)
    Conjunctions = FinalPostTerms["CC"]
    cleanterms = clean(cleanterms, Conjunctions)
    Determiners = FinalPostTerms["DT"]
    cleanterms = clean(cleanterms, Determiners)
    TOERS = FinalPostTerms["TO"]
    cleanterms = clean(cleanterms, TOERS)
    Interjections = FinalPostTerms["UH"]
    cleanterms = clean(cleanterms, Interjections)
    WhichDeterminer = FinalPostTerms["WDT"]
    cleanterms = clean(cleanterms, WhichDeterminer)
    cleanterms = clean_uselesswords(cleanterms)

    # get all nouns
    Nouns = FinalPostTerms["NN"]
    # get all adjectives
    AdjectivesSimp = FinalPostTerms["JJ"]
    # get all verbs
    VerbsBasic = FinalPostTerms["VB"]
    # get all adverbs
    AdverbsSimp = FinalPostTerms["RB"]

    notanyword = []
    # get the results for this sentence
    notanyword += getAdjNoun(cleanterms, Nouns, AdjectivesSimp)
    notanyword += getAdjNounAnyNoun(cleanterms, Nouns, AdjectivesSimp)
    notanyword += getNounNoun(cleanterms, Nouns)
    notanyword += getNounVerbAdj(cleanterms, Nouns, AdjectivesSimp, VerbsBasic)
    notanyword += getNounVerbAdvAdj(cleanterms, Nouns, AdjectivesSimp,
                                    VerbsBasic, AdverbsSimp)
    notanyword += getNounAnyNounVerbAdvAdj(cleanterms, Nouns, AdjectivesSimp,
                                           VerbsBasic, AdverbsSimp)
    return notanyword
def load_stopwords(bitextfn):
    """Load the stopword set for the source language, which is determined
    from the input filename."""
    langs = bitextfn.split(".")[1]
    sl = langs.split("-")[0]
    assert sl in ["en", "es"], "wrong sl {0}".format(sl)
    sl = "english" if sl == "en" else "spanish"
    wordtext = nltk.load("corpora/stopwords/{0}".format(sl), format="text")
    wordlist = wordtext.split()
    out = set(wordlist)
    ## XXX: remove some common verbs from the set.
    out.difference_update({"estar", "have", "be", "do"})
    return out
def word_token_gen(text):
    """
    Parse the text into a series of WordTokens.

    I don't use the default nltk word tokenizer here because it doesn't
    include offsets. Instead I am using a RegexpTokenizer, which does not
    do a good job on things like contractions. The sentence tokenizer is
    probably not necessary at the moment, but if the word tokenizer is
    replaced with something more complex it might be needed.
    """
    sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    for sent_offsets in sent_tokenizer.span_tokenize(text):
        for word_offsets in word_tokenizer.span_tokenize(
                text[sent_offsets[0]:sent_offsets[1]]):
            yield WordToken(
                text,
                sent_offsets[0] + word_offsets[0],
                sent_offsets[0] + word_offsets[1]
            )
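# word_token_gen() relies on a module-level `word_tokenizer` and a
# WordToken type that the excerpt doesn't show; a minimal sketch of both
# (the exact regexp pattern is an assumption):
from collections import namedtuple

import nltk.tokenize

word_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
WordToken = namedtuple("WordToken", ["text", "start", "end"])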
def split_sentences(text, tokenizer=None, offset=0):
    """\
    Splits text into lists of lists. Each list contains a sentence, which
    is a list of normalized tokens, including the token's indexes in the
    original text.
    """
    if tokenizer is None:
        tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    for start, end in tokenizer.span_tokenize(text):
        sent = text[start:end]
        sent_tokens = []
        matches = re.finditer(
            r'\w+|[\'\"\/^/\,\-\:\.\;\?\!\(0-9]', sent
        )
        for match in matches:
            mstart, mend = match.span()
            seg_start = start + offset
            sent_tokens.append(
                (match.group(0).lower().replace('_', ''),
                 (mstart + seg_start, mend + seg_start))
            )
        yield sent_tokens
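# Usage sketch for split_sentences(); each yielded sentence is a list of
# (normalized_token, (start, end)) pairs:
for sent in split_sentences("Hi there. Bye now."):
    print(sent)
# roughly:
# [('hi', (0, 2)), ('there', (3, 8)), ('.', (8, 9))]
# [('bye', (10, 13)), ('now', (14, 17)), ('.', (17, 18))]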
import csv
import os
import time

import matplotlib
import numpy as np
import nltk
import enchant
import plotly.plotly as ply
import plotly.graph_objs as pgo
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tag.perceptron import PerceptronTagger
from nltk.util import ngrams
from enchant.checker import SpellChecker

TAGS = sorted(nltk.load('help/tagsets/upenn_tagset.pickle').keys())
SEP = '|'
MAXNG = 3

tagger = PerceptronTagger()

F_WD = os.path.dirname(__file__)
F_INP = os.path.join('.', 'inputs', 'chs')
F_OUT = os.path.join('.', 'outputs', 'chs')
F_SRC = '/chs/'


class Doc:
    raws = {}
    tokenlists = {}
    datasets = {}
def __init__(self, min_count=0, min_len=4):
    self.stopwords = nltk.corpus.stopwords.words("english")
    self.tokenizer = nltk.load("tokenizers/punkt/english.pickle")
    self.min_count = min_count
    self.min_len = min_len
def chunker():
    """Return an instance of ne_chunker by loading a stored model."""
    return nltk.load(MULTICLASS_NE_CHUNKER)
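# MULTICLASS_NE_CHUNKER is not defined in this excerpt; NLTK ships its
# multiclass NE model under this resource path (worth verifying against
# your NLTK version):
MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'

# Usage sketch:
#   tree = chunker().parse(nltk.pos_tag(nltk.word_tokenize("I saw Paris.")))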
from nltk.stem.snowball import SpanishStemmer
import unicodedata
import pickle


def stopwords_from_file(stopwords_filepath="data/stopwords/spa.txt"):
    stopwords = codecs.open(stopwords_filepath, "r", "utf-8")
    ret = set()
    for line in stopwords:
        word = line.rstrip("\n")
        word = regex.sub(r" *\|.*$", "", word)
        if regex.search(r"[^\s]", word):
            word = unicodedata.normalize("NFD", word)
            ret.add(word)
    return ret


tokenizer = nltk.load("tokenizers/punkt/spanish.pickle")
stopwords = stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")
stemmer = SpanishStemmer()

doc = ""
doc_id = 0
sentence_id = 0
sentences_for_stem = collections.defaultdict(set)
docs_for_stem = collections.defaultdict(set)

for l in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    l = l.strip()
    l = l.decode("utf-8")
    l = unicodedata.normalize("NFD", l)
""" NOTE: This works, but it's a mess, so I'm planning on cleaning it up and refactoring it shortly. """ import nltk import string from collections import namedtuple from subprocess import call import sys tokenizer = nltk.load('tokenizers/punkt/english.pickle') Story = namedtuple('Story', ['text', 'queries']) Query = namedtuple('Query', ['text', 'choices', 'answer']) def load_answers(filename): with open(filename, 'r') as f: answers = f.read().split('\n')[:-1] answers = [a.split('\t') for a in answers] answers = [[c.strip() for c in a] for a in answers] answers = {i:j for i, j in enumerate(answers)} return answers def load_stories(filename, answerfile): stories = [] with open(filename, 'r') as f: data = f.read()
def demo():
    """
    A demonstration of the probabilistic parsers. The user is prompted to
    select which demo to run and how many parses should be found; then
    each parser is run on the same demo, and a summary of the results is
    displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize, Tree
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.draw.tree import draw_trees
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    # Define two demos. Each demo has a sentence and a grammar.
    # demos = [('move the green sphere to the bottom left corner', learned_pcfg),
    #          ('move the green ball over the red block', learned_pcfg),
    #          ('take the green pyramid and put it in the top left corner', learned_pcfg),
    #          ('put the green pyramid on the red block', learned_pcfg),
    #          ('move the red cylinder and place it on top of the blue cylinder that is on top of a green cylinder', learned_pcfg)]

    # Ask the user which demo they want to use.
    # print()
    # for i in range(len(demos)):
    #     print('%3s: %s' % (i + 1, demos[i][0]))
    #     print('     %r' % demos[i][1])
    # print()
    # print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    # try:
    #     snum = int(sys.stdin.readline().strip()) - 1
    #     sent, grammar = demos[snum]
    # except:
    #     print('Bad sentence number')
    #     return

    max_scene = 1
    sc = str(max_scene).zfill(5)  # zero-pad the scene number to five digits

    g = 'grammar_' + sc + '.txt'
    learned_pcfg = load('/home/omari/Dropbox/robot_modified/AR/grammar/' + g)
    grammar = learned_pcfg

    file1 = open('/home/omari/Dropbox/robot_modified/AR/hypotheses/matched_commands.txt', 'r')
    g1 = [i for i in file1.readlines()]

    for line in g1:
        sent = line.split('\n')[0].split('-')[-1]
        scene = line.split('\n')[0].split('-')[0]
        sent_num = line.split('\n')[0].split('-')[1]
        print(line)
        if scene == '239' and sent_num == '0':
            continue

        # Tokenize the sentence.
        tokens = sent.split()

        parser = ViterbiParser(grammar)
        all_parses = {}

        # print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        parses = parser.parse_all(tokens)
        average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
                   if parses else 0)
        num_parses = len(parses)
        for p in parses:
            all_parses[p.freeze()] = 1

        # Print some summary statistics
        # print()
        # print('Time (secs)   # Parses   Average P(parse)')
        # print('-----------------------------------------')
        # print('%11.4f%11d%19.14f' % (time, num_parses, average))

        parses = all_parses.keys()
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        # print('------------------------------------------')
        # print('%11s%11d%19.14f' % ('n/a', len(parses), p))

        # Ask the user if we should draw the parses.
        # print()
        # print('Draw parses (y/n)? ', end=' ')
        # if sys.stdin.readline().strip().lower().startswith('y'):
        #     print('  please wait...')
        #     draw_trees(*parses)

        cf = CanvasFrame()
        # t = Tree(parses)
        t = Tree.fromstring(
            '(S (CH_POS_PREPOST move) (PRE_POST (PRE (the the) '
            '(_entity (F_HSV green) (F_SHAPE sphere))) '
            '(PREPOST_connect (to to) (the the)) '
            '(POST (_F_POS (F_POS (_bottom_left (bottom bottom) (left left)))) '
            '(corner corner))))')
        tc = TreeWidget(cf.canvas(), t, draggable=1,
                        node_font=('helvetica', -14),
                        leaf_font=('helvetica', -12),
                        roof_fill='white', roof_color='black',
                        leaf_color='green4', node_color='blue4')
        cf.add_widget(tc, 10, 10)
        # tc = TreeWidget(cf.canvas(), t)
        # cf.add_widget(tc, 10, 10)  # (10, 10) offsets
        cf.print_to_file('/home/omari/Dropbox/robot_modified/trees/scene-' +
                         scene + '-' + sent_num + '.ps')
        cf.destroy()
def tokenize_into_sentences(content):
    # nltk.load() caches the parsed pickle in memory, so repeated calls
    # are cheap after the first one.
    # When creating a new post, tokenize the content with this function.
    detector = nltk.load('tokenizers/punkt/english.pickle')
    return detector.tokenize(content)
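# Usage sketch:
print(tokenize_into_sentences("First post! Second thought?"))
# roughly: ['First post!', 'Second thought?']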
def load_resource(self):
    """Loads the resource needed for part-of-speech tagging."""
    # Load the resource using the NLTK protocol; nltk.load() searches for
    # the resource URL in the directories listed in nltk.data.path.
    nltk.load('taggers/maxent_treebank_pos_tagger/english.pickle')