def parseTextToSentences(text):
    """Split ``text`` into sentences, handling one line (paragraph) at a time.

    A space is inserted between sentence-final punctuation and a directly
    following double quote before tokenizing.  Empty lines are skipped.

    :param text: the raw text to split.
    :return: flat list of sentence strings in document order.
    """
    params = PunktParameters()
    params.abbrev_types = {'dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'}
    splitter = PunktSentenceTokenizer(params)

    spaced = text
    for mark in ('?', '!', '.'):
        spaced = spaced.replace(mark + '"', mark + ' "')

    return [
        sent
        for paragraph in spaced.split('\n')
        if paragraph
        for sent in splitter.tokenize(paragraph)
    ]
def get_todo_items(text):
    """Extract phrase strings ("todo items") from ``text``.

    The text is split into sentences; each sentence is word-tokenized and
    tagged with the module-level ``tagger``.  Words whose tag is ``tagVB`` or
    ``tagVBP`` (presumably verb tags -- confirm against their definitions)
    act as phrase starts, and the tokens between consecutive starts are
    joined with spaces into one item.

    :param text: raw input text.
    :return: list of phrase strings collected over all sentences.
    """
    all_items = list()
    tokenizer = PunktSentenceTokenizer()
    sen_tokens = tokenizer.tokenize(text)
    for sen_token in sen_tokens:
        todo_items = list()
        tokens = nltk.word_tokenize(sen_token)
        tags = tagger.tag(tokens)
        # Words that start a new item.
        stop_words = [word for (word, tag) in tags if tag in (tagVB, tagVBP)]
        # Index of the previous phrase start; -1 means "none seen yet".
        ind = -1
        for word in stop_words:
            # NOTE(review): list.index returns the FIRST occurrence, so a word
            # repeated in the sentence always resolves to its first position.
            curr_ind = tokens.index(word)
            # If the preceding token is tagged tagCC/tagRB, the previous item
            # ends one token earlier.
            if curr_ind != 0 and tags[curr_ind - 1][1] in (tagCC, tagRB):
                to_ind = curr_ind - 1
            else:
                to_ind = curr_ind
            if ind != -1 and abs(to_ind - ind) > 1:
                # get_punctuation_index is defined elsewhere; presumably it
                # returns a cut-off index between ind and to_ind -- confirm.
                todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, to_ind)]))
            elif ind != -1 and len(todo_items) > 0:
                # Span too short for its own item: append one token to the
                # most recent item instead.
                last_ind = len(todo_items)
                todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[to_ind - 1]])
            ind = curr_ind
        # Flush the span from the last phrase start to the end of the sentence.
        if ind != -1 and abs(len(tokens) - ind) > 1:
            todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, len(tokens))]))
        elif ind != -1 and len(todo_items) > 0:
            last_ind = len(todo_items)
            todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[len(tokens) - 1]])
        all_items.extend(todo_items)
    return all_items
def sentence_tokenizer(self, untokenized_string, language):
    """Split ``untokenized_string`` into sentences using a pickled Punkt model.

    :param untokenized_string: text to split.
    :param language: ``'greek'`` or ``'latin'``; selects the pickle file under
        ``~/cltk_data`` and the punctuation settings.
    :return: list of sentence strings.
    :raises ValueError: if ``language`` is not supported.  (Previously the
        message was printed and the function then crashed because
        ``pickle_path`` was never assigned.)
    """
    # NOTE(review): the assignments below mutate attributes on the
    # PunktLanguageVars CLASS, not on an instance, so the settings stay in
    # effect for every Punkt tokenizer created afterwards in this process.
    if language == 'greek':
        pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', ';')
        language_punkt_vars.internal_punctuation = (',', '·')
    elif language == 'latin':
        pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', '?', ':')
        language_punkt_vars.internal_punctuation = (',', ';')
    else:
        raise ValueError("No sentence tokenizer for this language available.")

    # SECURITY NOTE: pickle.load executes arbitrary code from the file; only
    # load tokenizer pickles from a trusted source.
    with open(pickle_path, 'rb') as open_pickle:
        tokenizer = pickle.load(open_pickle)
    tokenizer.INCLUDE_ALL_COLLOCS = True
    tokenizer.INCLUDE_ABBREV_COLLOCS = True
    params = tokenizer.get_params()
    sbd = PunktSentenceTokenizer(params)
    return list(sbd.sentences_from_text(untokenized_string, realign_boundaries=True))
def get_key_sentences(self, n=5):
    '''
    Uses a simple implementation of TextRank to extract the top N sentences
    from a document.

    Sources:
    - Original paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
    - Super useful blog post: http://joshbohde.com/blog/document-summarization
    - Wikipedia: http://en.wikipedia.org/wiki/Automatic_summarization#Unsupervised_keyphrase_extraction:_TextRank

    :param n: maximum number of sentences to return.
    :return: list of up to ``n`` ``(score, sentence)`` tuples, highest score
        first.
    '''
    # Tokenize the document into sentences. More NLP preprocessing should
    # also happen here.
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(self.doc)

    # Calculate word counts and TFIDF vectors
    word_counts = CountVectorizer(min_df=0).fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(word_counts)

    # Normalized graph * its transpose yields a sentence-level similarity matrix
    similarity_graph = normalized * normalized.T

    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Slice rather than index: ``[n]`` returned only the single item at
    # position n and raised IndexError for documents with <= n sentences.
    return ranked[:n]
def fractal_representation(self):
    """Build the paragraph/sentence hierarchy with frequency weights.

    For every paragraph in ``self.paragraphs`` a ``paragraph`` object is
    filled with its tokens and word frequencies; each of its sentences gets a
    ``sentence`` object with weights relative to the sentence, the paragraph
    and the document.  Results are appended to ``self.fractal.paragraphs``;
    ``self.stotal``, ``self.s_weight`` and ``self.pindex`` are updated along
    the way.
    """
    punkt_param = PunktParameters()
    for each_paragraph in self.paragraphs:
        buffer_p = paragraph()
        buffer_p.paragraph = each_paragraph
        buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
        buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
        buffer_p.weights['total'] = {'words':0, 'sentences':0}
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        sentences = sentence_splitter.tokenize(each_paragraph)
        for each_sentence in sentences:
            # Counted before the emptiness check below, so sentences with no
            # tokens are included in stotal.
            self.stotal += 1
            buffer_s = sentence()
            buffer_s.sentence = each_sentence
            buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
            if len(buffer_s.tokens) > 0:
                buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                # calculate_relative_frequence is defined elsewhere; it is
                # used here as returning a mapping whose values can be summed
                # -- confirm exact semantics against its definition.
                buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                buffer_s.weights['total'] = {}
                buffer_s.weights['total']['sentence'] = 1
                buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                # Running totals at document and paragraph level.
                self.s_weight += buffer_s.weights['total']['document']
                buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
                buffer_p.sentences.append(buffer_s)
        buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
        self.fractal.paragraphs.append(buffer_p)
        self.pindex += 1
def textrank(document):
    """Print the highest-ranked sentence of ``document`` using TextRank.

    Sentences are vectorized with TF-IDF, a sentence-by-sentence similarity
    matrix is built from the vectors, and PageRank scores the sentences.

    :param document: raw text.
    :return: None; the top sentence is printed.
    """
    # Imports stay local, as in the original block.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    import networkx as nx

    pst = PunktSentenceTokenizer()
    sentences = pst.tokenize(document)

    # Bag of words, then TF-IDF weighting.
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

    # Rows and columns correspond to sentences; the entries describe how
    # similar two sentences are.  (The previous ``.toarray()`` call built a
    # dense copy and threw it away.)
    similarity_graph = normalized_matrix * normalized_matrix.T

    # PageRank: mapping of sentence index to score.
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)

    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Function-call form is valid on both Python 2 and Python 3.
    print(ranked[0][1])
def featureize(F, observation_files):
    """Compute two size features for each file.

    :param F: unused by this function; kept for interface compatibility.
    :param observation_files: sequence of file names passed to ``read_file``.
    :return: float array of shape ``(len(observation_files), 2)``;
        column 0 is the number of sentences, column 1 the number of
        whitespace-separated tokens.
    """
    sent_tokenizer = PunktSentenceTokenizer()

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    X = np.zeros((len(observation_files), 2), dtype=float)

    for i, filename in enumerate(observation_files):
        # NOTE(review): the 'string_escape' codec exists on Python 2 only.
        file_text = read_file(filename).decode('string_escape')
        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            # Fallback value used by the original code when the text cannot
            # be tokenized.
            num_sents = 2
        num_tokens = len(file_text.split())

        X[i][0] = num_sents
        X[i][1] = num_tokens
    return X
def summarize(self):
    """Select up to ``self.quota`` sentences of ``self.text`` as a summary.

    Word frequencies over the whole text are re-weighted by sentence
    position, every sentence is scored against the re-weighted frequencies,
    and the best-scoring sentences are appended to ``self.summary`` in their
    original document order.  Also sets ``self.sentencecount`` and
    ``self.sentenceIndex``.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(self.text)
    structure = {}
    sentence_objects = []
    # Wrap every sentence with its position so the order can be restored.
    for idx in range(len(sentences)):
        obj = {'text' : sentences[idx], 'index' : idx , 'data': {}}
        sentence_objects.append(obj)
    structure['sentences'] = sentence_objects
    self.sentencecount = len(structure['sentences'])
    structure['ordered'] = []
    structure['weights'] = {'words' : FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
    structure['weights']['total'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
        # Position factor: 2 for the first sentence, falling towards 1 in the
        # middle of the text and rising again towards the end.
        each_sent['data']['sinTransform'] = (1-math.sin(self.sentenceIndex*(math.pi/self.sentencecount)))+1
        # Multiplies the global word weight once per sentence containing the
        # word, so the factors compound across sentences.
        for each_word in structure['weights']['words']:
            if each_word in each_sent['data']['tokens']:
                structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
        self.sentenceIndex += 1
    structure['weights']['transformed'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        # calculate_relative_frequence is defined elsewhere; used here as
        # returning a mapping with summable values -- confirm semantics.
        each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
        each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
        self.sentenceIndex += 1
    # Rank by score, keep the quota, then restore document order.
    structure['ordered'] = sorted(structure['sentences'], key=lambda x:x['data']['weights']['total'], reverse=True)
    structure_keep = structure['ordered'][:self.quota]
    structure_keep.sort(key=lambda x:x['index'])
    for eac_sen in structure_keep:
        self.summary.append(eac_sen['text'])
def tokenize_sentences(self, untokenized_string: str):
    """Split a string into sentences according to ``self.language``.

    Latin delegates to the parent class tokenizer; Greek and the languages
    in ``INDIAN_LANGUAGES`` use a regex split on their sentence-ending
    characters; anything else falls back to a default
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'

    language = self.language
    regex_vars = None
    tokenizer = None
    if language == 'latin':
        tokenizer = super()
    elif language == 'greek':
        # Workaround for regex tokenizer
        regex_vars = GreekLanguageVars
    elif language in INDIAN_LANGUAGES:
        regex_vars = SanskritLanguageVars
    else:
        # Warn that NLTK Punkt is being used by default???
        tokenizer = PunktSentenceTokenizer()

    if regex_vars is None:
        return tokenizer.tokenize(untokenized_string)

    # Regex path: split on whitespace that follows a sentence-end character.
    self.sent_end_chars = regex_vars.sent_end_chars
    self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
    self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    return re.split(self.pattern, untokenized_string)
def preprocess(phys):
    '''
    Split text into sentences and normalize each one for searching.

    :param phys: UTF-8 encoded byte string containing the document text
    :return: list of cleaned sentences (digits and punctuation removed,
        hyphens and runs of spaces collapsed to a single space); sentences
        that end up empty or as a single space are dropped
    '''
    phys = phys.decode('utf-8')
    # Treat every run of newlines as a sentence boundary.
    phys = re.sub('(\n)+', '. ', phys)
    sentences = PunktSentenceTokenizer().tokenize(phys)

    punctuation = set(string.punctuation)
    cleaned = []
    for sentence in sentences:
        sentence = sentence.replace('\n', ' ')
        sentence = re.sub(' +', ' ', sentence)
        sentence = re.sub(r'\d+', '', sentence)
        # Hyphens become spaces before the remaining punctuation is removed.
        sentence = sentence.replace("-", " ")
        sentence = ''.join(ch for ch in sentence if ch not in punctuation)
        sentence = re.sub(' +', ' ', sentence)
        # Filter while building the result.  The previous version popped
        # entries from the list it was iterating over, which skipped the
        # element after each removal and left some blank sentences behind.
        if sentence not in ('', ' '):
            cleaned.append(sentence)
    return cleaned
def preprocessin(self, cell_value):
    """Clean one tweet for downstream processing.

    The text is split into sentences and re-joined with newlines, passed
    through ``self.remTags``, lower-cased, and then run through a fixed
    sequence of regex substitutions that remove mentions, URLs, non-ASCII
    characters and assorted punctuation.

    :param cell_value: raw tweet text.
    :return: the cleaned, lower-cased string.
    """
    # Regex literals containing backslash escapes are raw strings: sequences
    # such as '\s' and '\.' in a normal string are invalid escapes and
    # trigger warnings on current Python versions.  Pattern values are
    # unchanged.
    # NOTE(review): the text is lower-cased before the 'RT ' pattern runs, and
    # every '@' and every non-ASCII character is replaced before the later
    # '@' and curly-quote/ellipsis patterns run, so those alternatives can no
    # longer match.  Order is kept as-is to preserve behavior.
    tweet = PunktSentenceTokenizer().tokenize(cell_value)
    tweet = '\n'.join(tweet)
    # remove html tags
    tweet = self.remTags(tweet)
    tweet = tweet.lower()

    # remove junk
    tweet = re.sub(u'(RT |\\\\|\u201c)"?@.*?[: ]', ' ', tweet)
    tweet = re.sub('@', ' ', tweet)
    tweet = re.sub(r'[^\x00-\x7F]', ' ', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub('_', ' ', tweet)
    tweet = re.sub(r'((www\.[\s]+)|(https?://[^\s]+))', '', tweet)
    tweet = re.sub(r'\\([^\s]+)', ' ', tweet)
    tweet = re.sub(u'[\u2018\u2019]', '\'', tweet)
    tweet = re.sub('(^|)?http?s?:?/?/?.*?( |$)', ' ', tweet)
    tweet = re.sub(u'\u2026', ' ', tweet)
    tweet = re.sub('---', ' ', tweet)
    tweet = re.sub(u'[\u201c\u201d]', '"', tweet)
    tweet = re.sub(r'\.?@.*?( |:|$)', ' ', tweet)
    tweet = re.sub(r"\.\.+", ' ', tweet)
    tweet = re.sub('&', ' ', tweet)
    tweet = re.sub(r'\.\.\.', ' ', tweet)
    tweet = tweet.strip('\'"')
    tweet = re.sub(r'(, |\.( |$))', ' ', tweet)
    tweet = re.sub('[][!"$*,/;<=>?@\\\\^_`{|}~]', ' ', tweet)
    tweet = re.sub('( - )', ' ', tweet)
    return tweet
def _split_sentences(self, text):
    """Return the sentences of ``text`` as a list of strings.

    Uses a Punkt tokenizer configured with a small set of English
    abbreviations so that e.g. "dr." does not end a sentence.
    """
    from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

    params = PunktParameters()
    params.abbrev_types = {'dr', 'vs', 'mr', 'mrs', 'prof', 'inc'}
    return PunktSentenceTokenizer(params).tokenize(text)
def sentences(self):
    """Return the sentences of ``self.corpus``, tokenizing on first use.

    The result is cached on the instance as ``sentences_list``; later calls
    return the cached list.
    """
    missing = object()
    cached = getattr(self, 'sentences_list', missing)
    if cached is not missing:
        return cached

    self.sentences_list = SentenceTokenizer().tokenize(self.corpus)
    return self.sentences_list
def _punkt_sent_tokenize(text):
    '''
    Segment ``text`` into sentences with nltk's PunktSentenceTokenizer,
    using the abbreviations listed in ``config.tokenize_abbrev``.
    '''
    params = PunktParameters()
    params.abbrev_types = set(config.tokenize_abbrev)
    return PunktSentenceTokenizer(params).tokenize(text)
def transform(self,documents):
    """Attach a list of stripped sentences to every document.

    Documents whose ``ext`` mapping already has a ``'sentences'`` entry are
    left untouched.

    :param documents: iterable of objects with ``text`` and ``ext``.
    :return: the same ``documents`` object.
    """
    splitter = PunktSentenceTokenizer()
    for doc in documents:
        if 'sentences' in doc.ext:
            continue
        doc.ext['sentences'] = [piece.strip() for piece in splitter.tokenize(doc.text)]
    return documents
def tokenize(self):
    """
    Returns ``self.text`` as a list of sentences, each a list of words.

    Words are split on whitespace and have leading/trailing ``,.?!``
    characters stripped.
    """
    splitter = PunktSentenceTokenizer()
    return [
        [word.strip(",.?!") for word in sentence.split()]
        for sentence in splitter.sentences_from_text(self.text)
    ]
def parse (text):
    """Convert ``text`` into a list of English-language sentences.

    The text is passed through ``preprocess`` and then split with nltk's
    PunktSentenceTokenizer, configured with the ``ABBREVIATIONS`` list.
    """
    params = PunktParameters()
    params.abbrev_types = set(ABBREVIATIONS)
    cleaned = preprocess(text)
    return PunktSentenceTokenizer(params).tokenize(cleaned)
def bayesSentiment(self, text):
    """Score the sentiment of ``text`` with ``senti_classifier``.

    The text is split into sentences, which are scored together.

    :return: ``[pos_score, neg_score]``.
    """
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    from senti_classifier import senti_classifier

    sentence_list = PunktSentenceTokenizer().tokenize(text)
    positive, negative = senti_classifier.polarity_scores(sentence_list)
    return [positive, negative]
def split_into_sentences(input_file_name, output_file_name):
    """Sentence-split every review in a gzipped JSON-lines file.

    Each input line is a JSON object with ``'text'`` and ``'score'`` keys.
    For every line, ``[sentences, score]`` is written as one JSON line to the
    gzipped output file.

    :param input_file_name: path of the gzipped input file.
    :param output_file_name: path of the gzipped output file (overwritten).
    """
    tokenizer = PunktSentenceTokenizer()
    with gzip.open(input_file_name) as input_file, \
            gzip.open(output_file_name, 'wb') as sentence_file:
        for line in input_file:
            labelled_review = json.loads(line)
            tokenized_text = tokenizer.tokenize(labelled_review['text'])
            record = json.dumps([tokenized_text, labelled_review['score']])
            # The gzip stream is binary: encode explicitly.  ``json.dump``
            # writes text and fails on this stream under Python 3.
            sentence_file.write((record + "\n").encode('utf-8'))
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    """Run frequency, part-of-speech, entity and interjection analysis.

    The file is parsed with ``parse_hansard`` and the extracted text is
    analysed in four steps: word frequencies (NLTK), part-of-speech tags
    (NLTK), named entities (spaCy) and a count of interjections per party.
    Results are handed to ``display_freq`` / ``pos_analysis`` and the logger;
    nothing is returned.

    :param filename: path of the Hansard XML file.
    """
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking.
    splitter = PunktSentenceTokenizer()
    # Stop breaking sentences at "No."
    splitter._params.abbrev_types.add('no')
    # TODO: improve sentence tokenizer - still far from good
    sentences = splitter.tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [nltk.pos_tag(words) for words in tokenized_sentences]
    # The chunking result is not used further in this function.
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences, ignoring stop words.
    tokens = [
        word
        for words in tokenized_sentences
        for word in words
        if word.lower() not in stoplist
    ]
    display_freq(tokens)

    # Part-of-speech analysis over the flattened (word, tag) pairs.
    tags = [pair for tagged in tagged_sentences for pair in tagged]
    pos_analysis(tags, my_abbrev)

    # spaCy NER: group entity texts by entity label.
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    ne_spacy = {}
    for entity in doc.ents:
        ne_spacy.setdefault(entity.label_, []).append(entity.text)
    logger.debug("Entity number per type: %s" % {k:len(v) for k,v in ne_spacy.items()})
    for label, texts in ne_spacy.items():
        display_freq(texts, 'Named entities (%s)' % (label,), top=20)

    # Interjection analysis: count interjections per party or role.
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        parties[party] = parties.get(party, 0) + 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
def __init__(self, document):
    """Store ``document`` and its lower-cased sentences.

    :param document: raw text to summarize.
    """
    self.document = document
    self.sumLength = 10
    # Working tables start empty.
    self.weights = {}
    self.invWeights = {}
    self.sumIndex = {}
    self.summary = {}

    splitter = PunktSentenceTokenizer()
    lowered = []
    for sent in splitter.tokenize(document):
        lowered.append(sent.lower())
    self.sentences = lowered
def textrank(self, document):
    """Rank the sentences of ``document`` with TextRank.

    :param document: raw text.
    :return: list of ``(score, sentence)`` tuples, highest score first.
    """
    sentences = PunktSentenceTokenizer().tokenize(document)

    counts = CountVectorizer().fit_transform(sentences)
    tfidf = TfidfTransformer().fit_transform(counts)

    # Sentence-by-sentence similarity matrix, scored with PageRank.
    graph = nx.from_scipy_sparse_matrix(tfidf * tfidf.T)
    scores = nx.pagerank(graph)

    scored = [(scores[idx], sent) for idx, sent in enumerate(sentences)]
    scored.sort(reverse=True)
    return scored
def preprocess_doc(doc):
    """Turn ``doc`` into a flat list of tokens without English stop words.

    Each sentence is passed through ``neg_scope`` (defined elsewhere;
    presumably returns a list of word strings -- confirm), stop words are
    dropped case-insensitively, and a single trailing ``'.'`` is removed from
    every remaining token.

    :param doc: raw document text.
    :return: list of tokens.
    """
    sentences = PunktSentenceTokenizer().sentences_from_text(doc)

    # Build the stop-word set once; the original re-read the list from the
    # corpus for every token.
    stop_words = set(stopwords.words("english"))

    tokens = []
    for sentence in sentences:
        tokens.extend(w for w in neg_scope(sentence) if w.lower() not in stop_words)

    # ``endswith`` also copes with empty tokens, where indexing ``[-1]``
    # raised IndexError.
    return [tok[:-1] if tok.endswith('.') else tok for tok in tokens]
def myNLTKParser(document, tagger):
    """Return the most frequent tagged unigram, bigram and trigram.

    The document is split into sentences and words (stop words removed);
    each sentence's tokens are extended with its bigrams and trigrams and
    the combined list is tagged with ``tagger``.

    :param document: raw text.
    :param tagger: object with a ``tag(tokens)`` method returning
        ``(token, tag)`` pairs.
    :return: dict with keys ``"uni_fd"``, ``"bi_fd"``, ``"tri_fd"`` holding
        the most frequent item of each kind.
    """
    # NOTE(review): ``FreqDist.max()`` on an empty distribution may raise,
    # depending on the NLTK version -- callers should expect that for inputs
    # without matching tags.
    punkt_param = PunktParameters()
    # if any customized abbrev
    # punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentences
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace("'s", "_s"))

    # tokenize sentences to words, dropping stop words
    word_tokens = [
        [w.strip() for w in nltk.word_tokenize(s) if w.strip().lower() not in stopwords]
        for s in sentences
    ]

    # Extend tokens with bigrams and trigrams.  ``list(...)`` is required
    # because nltk.bigrams/trigrams return generators in NLTK 3, and
    # ``list + generator`` raises TypeError.
    extended_tokens = [
        token_list + list(nltk.bigrams(token_list)) + list(nltk.trigrams(token_list))
        for token_list in word_tokens
    ]

    # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]
    tags_of_verbs = ["NN", "VB", "VBP", "VBG"]
    tags_of_interest = ["JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "RB", "RBR", "RBS"]

    # n-grams are tuples; plain words are not.
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_verbs and not isinstance(word, tuple)
    ]
    merged_tags_bi = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]

    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)
    return {"uni_fd": uni_tags_fd.max(), "bi_fd": bi_tags_fd.max(), "tri_fd": tri_tags_fd.max()}
def keyword_sentiment():
    """Print the average sentiment of recent phrases containing a keyword.

    Reads the keyword from ``sys.argv[1]`` and a look-back window in days
    from ``sys.argv[2]``, fetches phrases newer than that window from the
    ``Phrases`` table, and prints a JSON array
    ``[{"sentiment": <average>, "count": <n>}]``.
    """
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

    ## take in the input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])
    keyword = word.lower()

    ## create a sentence tokenizer
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    ## calculate the barrier date
    barrier_date = datetime.datetime.now() - datetime.timedelta(days=date_diff)

    ## fetch phrases (and their sentiment) newer than the barrier date
    # SECURITY NOTE(review): database credentials are hard-coded in source;
    # they should be moved to configuration and rotated.
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    try:
        cur = db.cursor()
        # Parameterized query instead of string concatenation.
        cur.execute(
            "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>%s;",
            (str(barrier_date),),
        )
        rows = cur.fetchall()
    finally:
        # Close the connection even when the query fails.
        db.close()

    total_sentiment = 0
    total_count = 0
    for row in rows:
        if keyword not in row[0].lower():
            continue
        sentences = sent_tokenizer.tokenize(row[0])
        if len(sentences) == 1:
            ## a single sentence: take the sentiment stored in the db
            total_sentiment = total_sentiment + float(row[1])
            total_count = total_count + 1
        else:
            ## otherwise score each sentence separately
            # NOTE(review): the polarity of EVERY sentence is added, but only
            # sentences containing the keyword are counted.
            for sentence in sentences:
                blob = TextBlob(sentence)
                total_sentiment = total_sentiment + int(blob.sentiment.polarity*1000)/1000.0
                if keyword in sentence.lower():
                    total_count = total_count + 1

    ## average truncated to three decimals
    if total_count != 0:
        json_array = [{"sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count}]
    else:
        json_array = [{"sentiment": 0, "count": 0}]

    print(json.dumps(json_array))
def splitIntoSentences2(file_name):
    """Read ``file_name`` and return its sentences.

    A space is inserted between sentence-final punctuation and a directly
    following double quote; the text is then tokenized line by line and
    empty lines are skipped.

    :param file_name: path of the text file to read.
    :return: flat list of sentence strings in file order.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    # Context manager closes the file; the original never closed it.
    with open(file_name) as fp:
        data = fp.read()
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def build_doc2vec_model(save_file=False):
    """Train a Doc2Vec model on user reviews from the ``steam_games`` collection.

    Every review is reduced to ASCII, stripped of punctuation, lower-cased
    and labelled with its game name.

    :param save_file: if true, the model is pickled to ``data/model.pkl`` and
        nothing is returned; otherwise the model is returned.
    :return: the trained model when ``save_file`` is false, else ``None``.
    """
    client = MongoClient()
    db = client['metacritic']
    coll = db['steam_games']
    # Games that have user reviews and whose name does not contain "Demo".
    all_games = list(coll.find({'user_review': {"$exists": "true"}, 'total_user_reviews': {'$ne': 0}, 'game_name': {'$not': re.compile("Demo")} }))
    plv = PunktSentenceTokenizer()
    # stemmer = PorterStemmer()
    labeled_sentences = []
    for game in all_games:
        game_name = game['game_name']
        user_data = game['user_review']
        # critic_data = game['critic_review']
        user_reviews = user_data['reviews']
        for user_review in user_reviews:
            review = user_review['review']
            review = review.encode('ascii', 'replace')
            # NOTE(review): two-argument str.translate with string.maketrans
            # is the Python 2 idiom for deleting characters.
            review = str(review).translate(string.maketrans("",""), string.punctuation)
            review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]
            if len(review_sentence) == 0:
                continue
            else:
                # Only the first tokenized sentence of a review is kept.
                review_sentence = review_sentence[0]
            # stemmed_sentence = []
            # for word in review_sentence[0]:
            #     stemmed_sentence.append(stemmer.stem(word))
            sentence = doc2vec.LabeledSentence(words=review_sentence, labels=[game_name])
            labeled_sentences.append(sentence)
    model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=4)#, train_words=False, train_lbls=True)
    model.build_vocab(labeled_sentences)
    # Ten passes with a manually lowered learning rate.
    for epoch in range(10):
        model.train(labeled_sentences)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
    if save_file:
        with open('data/model.pkl', 'wb') as f_model:
            pickle.dump(model, f_model)
    else:
        return model
def test_tokenize(self):
    """Train Punkt on the first ten speeches and check sentence splitting.

    The asserted count comes from ``tokenize_sents``; the tokenizer trained
    here is only exercised to make sure it runs.
    """
    # A generator expression replaces itertools.imap, which exists on
    # Python 2 only.
    train = "\n".join(
        strip_tags(text)
        for text in itertools.chain(*(speech['text'] for speech in self.speeches[0:10]))
    )
    print(train)

    tokenizer = PunktSentenceTokenizer(train)
    first_text = strip_tags(self.speeches[0]['text'][0])
    # The original bound this result to ``sents`` and immediately overwrote
    # it; the call is kept so a failure in the trained tokenizer still
    # surfaces.
    tokenizer.tokenize(first_text)

    sents = tokenize_sents(first_text)
    self.assertEqual(len(sents), 3)
def loadCorpus(self, path): for encoding in self.__encodings: try: self.__path = path fileName = codecs.open( self.__path,'r', encoding=encoding ) self.__rawText = fileName.read() break except UnicodeDecodeError: encoding = '' continue if encoding!='': self.initFields() #SENTENCES # more abbreviations with dots punkt_param = PunktParameters() punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag']) punkt_param = PunktParameters() sentence_splitter = PunktSentenceTokenizer(punkt_param) text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText) #text = re.sub('(\d+)', r' \1 ', text) sentences = sentence_splitter.tokenize(text) #TOKENS self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))] wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+') #wordTokenizer = RegexpTokenizer('[\w]+') sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0] words = list(itertools.chain(*sentences)) self.__words = words self.__sentences = sentences self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3) self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3) self.__freqDist = FreqDist(words) self.__wordCount = len(words) self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5) ### resetting members self.__concordanceIndex = None self.__bigrams = None return encoding
def getSentences(paragraph):
    """Split a UTF-8 encoded paragraph into filtered sentences.

    Non-ASCII characters are dropped before tokenizing, and the sentences
    are passed through ``filter_list`` (defined elsewhere).

    :param paragraph: UTF-8 encoded byte string.
    :return: whatever ``filter_list`` returns for the sentence list.
    """
    unicode_data = paragraph.decode("utf-8")
    # Keep ASCII characters only.
    data = "".join(ch for ch in unicode_data if ord(ch) < 128)

    # The original also loaded 'tokenizers/punkt/english.pickle' here and
    # never used it; that load is removed.
    punkt_params = PunktParameters()
    punkt_params.abbrev_types = set(['al', "inc", "mr", "dr", "mrs", "prof"])
    splitter = PunktSentenceTokenizer(punkt_params)

    return filter_list(splitter.tokenize(data))
def __init__(self):
    """Load the cached Punkt tokenizer, or train and cache a new one.

    If ``punket_tokenizer.pk`` exists it is loaded through
    ``self.punkt_tokenize_load()``.  Otherwise a tokenizer is trained on all
    Gutenberg corpus texts, extended with title and month abbreviations, and
    pickled to that file.
    """
    self.modelfile = 'punket_tokenizer.pk'
    if os.path.exists(self.modelfile):
        self.tokenizer = self.punkt_tokenize_load()
        return

    self.trainer = PunktTrainer()
    # join() avoids the quadratic cost of repeated ``+=`` on a large string.
    text = "".join(gutenberg.raw(file_id) for file_id in gutenberg.fileids())
    self.trainer.INCLUDE_ALL_COLLOCS = True
    self.trainer.train(text)
    self.tokenizer = PunktSentenceTokenizer(self.trainer.get_params())

    # Abbreviations that must not end a sentence.  'jul' was missing from
    # the original month list (jan..dec), apparently by oversight.
    self.tokenizer._params.abbrev_types.update([
        'dr', 'mr', 'mrs', 'miss', 'ms', 'no',
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    ])

    with open(self.modelfile, mode='wb') as fout:
        pickle.dump(self.tokenizer, fout, protocol=pickle.HIGHEST_PROTOCOL)
def tokenize_text(seq):
    '''Tokenize a string of one or more sentences into lists of words.

    Returns a list of lists: the outer list represents sentences, the inner
    lists the tokenized words of each sentence.  Stop words are not removed
    and no more advanced NL processing is done.'''

    def keep_plausible(words):
        # Keep only the items that is_word accepts.
        return [w for w in words if is_word(w)]

    def strip_stray_punct(words):
        # Remove stray punctuation from every word.
        return [re.sub(_stray_punct, '', w) for w in words]

    # Replace common contractions that are safe to replace.
    text = RegexpReplacer(_common_contractions).replace(seq)
    # Compress multiple blank lines into one.
    text = re.sub(r'\n+', '\n', text)
    # Remove URLs.
    text = re.sub(url_compiled_regex, '', text)
    # Split words at certain characters that are not used in normal writing.
    text = str.translate(text, _odd_char_splitter)

    # Sentence splitter with custom language variables and abbreviations.
    lang_vars = ModifiedPunktLanguageVars()
    params = PunktParameters()
    params.abbrev_types = _common_abbrevs
    splitter = PunktSentenceTokenizer(params, lang_vars=lang_vars)

    result = []
    for raw_sentence in splitter.tokenize(text, realign_boundaries=True):
        # Word-tokenize, drop implausible items, then strip stray punctuation.
        words = strip_stray_punct(keep_plausible(nltk.word_tokenize(raw_sentence)))
        # Sentences that end up empty are left out.
        if words:
            result.append(words)
    return result
def setup_model(self, model_path, config=None, label_file=None, no_cuda=False):
    """Load labels, BERT config, tokenizers and the MT-DNN model.

    :param model_path: path or name passed to ``MTDNNModelV2.from_pretrained``;
        a path containing ``".ckpt"`` is loaded as a TensorFlow checkpoint.
    :param config: path or name passed to ``BertConfig.from_pretrained``.
    :param label_file: file loaded with ``torch.load`` into ``self.labels_list``.
    :param no_cuda: if true, stay on the CPU even when CUDA is available.
    """
    use_cuda = torch.cuda.is_available() and not no_cuda
    self.device = torch.device("cuda" if use_cuda else "cpu")

    # load labels
    self.labels_list = torch.load(label_file)

    bert_config = BertConfig.from_pretrained(
        config,
        num_labels=2,
        cache_dir=None,
        output_hidden_states=True,
    )
    word_tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased",
        do_lower_case=True,
        cache_dir=None,
    )
    loaded_model = MTDNNModelV2.from_pretrained(
        model_path,
        from_tf=".ckpt" in model_path,
        config=bert_config,
        labels_list=self.labels_list,
        task_list=TASK_LIST,
        do_task_embedding=False,
        do_alpha=False,
        do_adapter=False,
        num_adapter_layers=2,
    )

    self.model_config = bert_config
    self.tokenizer = word_tokenizer
    self.sent_tokenizer = PunktSentenceTokenizer()
    self.model = loaded_model
    self.model.to(self.device)
def sentoken(self, data):
    """Split ``data`` into sentences.

    ``data`` is converted with ``str()``, newlines become spaces, and a
    space is inserted after ``?``, ``!`` and after every period that follows
    a non-digit.  ``i.e.`` is restored after spacing and registered as an
    abbreviation before Punkt tokenization.

    :param data: text (or any object convertible with ``str``).
    :return: list of stripped sentence strings.
    """
    text = re.sub('\n', ' ', str(data))
    # The former ``isinstance(text, str)`` check was unreachable: ``text`` is
    # always a str after the conversion above.

    # Space after a period that follows a non-digit, so decimals such as
    # "3.14" stay intact.  Raw string avoids the invalid '\g' escape.
    text = re.sub(r'(\D)\.', r'\g<1>. ', text)
    text = re.sub(r'\?', '? ', text)
    text = re.sub(r'!', '! ', text)
    # Undo the spacing inside "i.e." introduced above.
    text = re.sub(r'i\. e\. ', 'i.e.', text)
    text = text.strip()

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['i.e'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return [sent.strip() for sent in tokenizer.tokenize(text)]
class GCBlockExtractor(ExtractionMapper):
    """Extraction mapper that turns a page into a list of sentence lengths."""

    def __init__(self):
        super(GCBlockExtractor, self).__init__(
            extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        """Return the character length of every non-blank sentence.

        Newlines are removed from ``page.text`` before sentence splitting.
        """
        flattened = page.text.replace('\n', '')
        # maybe count tokens? or non-spaces?
        return [
            len(sentence)
            for sentence in self.tokenizer.sentences_from_text(flattened)
            if sentence.strip()
        ]
def sentoken(self, data):
    """Split ``data`` into sentences.

    ``data`` is converted with ``str()``, newlines become spaces, and a
    space is inserted after every ``.``, ``?`` and ``!``.  ``i.e.`` is
    restored after spacing and registered as an abbreviation before Punkt
    tokenization.

    :param data: text (or any object convertible with ``str``).
    :return: list of stripped sentence strings.
    """
    # Replace newlines in the data with spaces.
    text = re.sub('\n', ' ', str(data))
    # The former ``isinstance(text, str)`` check was unreachable: ``text`` is
    # always a str after the conversion above.

    # Space after a period that follows a non-digit; \g<1> re-inserts the
    # matched character.  Raw string avoids the invalid '\g' escape.
    text = re.sub(r'(\D)\.', r'\g<1>. ', text)
    # NOTE(review): this second substitution spaces EVERY period, decimals
    # included, which makes the non-digit rule above redundant.
    text = re.sub(r'\.', '. ', text)
    text = re.sub(r'\?', '? ', text)
    text = re.sub(r'\!', '! ', text)
    # Restore the abbreviation that had spaces inserted into it.
    text = re.sub(r'i\. e\. ', 'i.e.', text)
    # Remove leading and trailing whitespace.
    text = text.strip()

    punkt_param = PunktParameters()
    # Custom abbreviation list.
    punkt_param.abbrev_types = set(['i.e'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return [sent.strip() for sent in tokenizer.tokenize(text)]
def lda(document):
    """Fit a two-topic LDA model on the sentences of ``document``.

    Prints the shape of the fitted components and of the per-sentence topic
    matrix; nothing is returned.
    """
    # Split into sentences.
    sentences = PunktSentenceTokenizer().tokenize(document)

    # Count word occurrences per sentence.
    vectorizer = CountVectorizer()
    bow_matrix = vectorizer.fit_transform(sentences)

    # All words of the bag-of-words model (not used further below).
    all_words = vectorizer.get_feature_names()
    # Column index -> word lookup (not used further below).
    index2words = dict((idx, term) for term, idx in vectorizer.vocabulary_.items())

    topic_model = LatentDirichletAllocation(n_topics=2, max_iter=5)
    topic_model.fit(bow_matrix)
    print(topic_model.components_.shape)
    print(topic_model.transform(bow_matrix).shape)
def __init__(self,
             strip_accents="unicode",
             lowercase=True,
             remove_html=True,
             join_urls=True,
             use_bigrams=True,
             use_ner=True,
             stanford_ner_path="",
             use_lemmatizer=False,
             use_stemmer=False):
    """Store the preprocessing options and build the helper objects.

    :param strip_accents: one of ``'ascii'``, ``'unicode'`` or ``None``.
    :param stanford_ner_path: path to Stanford NER.
    :param use_lemmatizer: normalize tokens with the WordNet lemmatizer;
        takes precedence over ``use_stemmer``.
    :param use_stemmer: normalize tokens with the Snowball stemmer.

    The remaining flags are stored unchanged on the instance.
    """
    # Options.
    self.stanford_ner_path = stanford_ner_path
    self.strip_accents = strip_accents
    self.lowercase = lowercase
    self.remove_html = remove_html
    self.join_urls = join_urls
    self.use_bigrams = use_bigrams
    self.use_ner = use_ner
    self.use_lemmatizer = use_lemmatizer
    self.use_stemmer = use_stemmer

    # Callables and resources.
    self.sentence_splitter = PunktSentenceTokenizer().tokenize
    self.stemmer = SnowballStemmer("english").stem
    self.lemmatizer = WordNetLemmatizer().lemmatize
    self.base_tokenizer = CountVectorizer().build_tokenizer()
    self.stop_words = stopwords.words("english")

    # Patterns.  The sklearn default token pattern was r"(?u)\b\w\w+\b".
    self.token_pattern = re.compile(
        r"(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b")
    self.numeric_pattern = re.compile(r"^[0-9]+$")
    self.url_pattern = re.compile(r"((http://)?(www\..*?\.\w+).*?)\s")
    self.compound_pattern = re.compile(r"\w+(\-\w+)+")

    # Pick the token normalizer: lemmatizer first, then stemmer, else
    # leave tokens unchanged.
    if self.use_lemmatizer:
        normalizer = self.lemmatizer
    elif self.use_stemmer:
        normalizer = self.stemmer
    else:
        normalizer = lambda token: token
    self.tokenizer = CustomTokenizer(self.base_tokenizer, normalizer,
                                     self.token_pattern,
                                     self.numeric_pattern)
def create_sentences(text_file, min_sentence_len):
    """Train Punkt on *text_file* and append its longer sentences to the dataset.

    A ``PunktTrainer`` is trained on the whole file, the file is then split
    line by line with the resulting tokenizer, and every sentence strictly
    longer than ``min_sentence_len`` characters is appended to
    ``dataset/sentences.txt`` followed by a blank line.

    :param text_file: path of the text file used for training and splitting.
    :param min_sentence_len: sentences of this length or shorter are skipped.
    """
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()
    trainer.train(paragraphs)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Second pass over the file, one line at a time.  The file is only read,
    # so it is opened read-only, and inside ``with`` so the handle is closed.
    sentences = []
    with open(text_file, "r") as input_file:
        for line in input_file:
            sentences.extend(tokenizer.tokenize(line))

    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
def preprocess(doc):
    """Turn *doc* into a list of sentences, each a list of lower-cased words.

    Sentences come from Punkt and words from ``nltk.word_tokenize``.  Every
    word has its non-word characters and underscores removed; words that end
    up empty are dropped, and so are sentences left without any word.

    :param doc: raw document text.
    :returns: list of non-empty lists of cleaned, lower-cased tokens.
    """
    corpus = []
    for sentence in PunktSentenceTokenizer().tokenize(doc):
        stripped = (
            re.sub(r'_+', '', re.sub(r'\W+', '', token))
            for token in nltk.word_tokenize(sentence)
        )
        cleaned = [token.lower() for token in stripped if token != '']
        if cleaned:
            corpus.append(cleaned)
    return corpus
def tokenize_sentences(input):
    """Split *input* into sentences and wrap each one in start/end markers.

    The text is right-stripped and split with a Punkt tokenizer that knows a
    few English abbreviations.  The last character of every sentence is
    dropped unconditionally (normally its closing punctuation mark).
    Sentences shorter than 20 characters after that are discarded.  In the
    remaining ones newlines are removed and ``/`` becomes a space, then the
    sentence is wrapped as ``"!$! " + sentence + " !€!"``.

    :param input: raw text.
    :returns: list of wrapped sentences.
    """
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

    params = PunktParameters()
    params.abbrev_types = {'dr', 'vs', 'mr', 'mrs', 'prof', 'inc'}
    splitter = PunktSentenceTokenizer(params)

    wrapped = []
    for raw_sentence in splitter.tokenize(input.rstrip()):
        body = raw_sentence[:-1]  # drop the final character of the sentence
        if len(body) < 20:
            continue  # too short to be kept
        body = body.replace("\n", "").replace("/", " ")
        # Add the start and end symbols.
        wrapped.append("!$! " + body + " !€!")
    return wrapped
def textRank(document):
    """Return an extractive summary of *document* built from PageRank scores.

    The document is split into sentences, the sentences are vectorised with
    term counts followed by TF-IDF, and the product of that matrix with its
    transpose is used as a weighted sentence graph.  Sentences are ranked by
    their PageRank score; the best quarter (at least three) is kept.

    :param document: text to summarise.
    :returns: the kept sentences joined by single spaces, best-ranked first.
    """
    sentences = PunktSentenceTokenizer().tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T

    # networkx 3.0 removed ``from_scipy_sparse_matrix``; use its replacement
    # when it exists and fall back to the old name on older releases.
    build_graph = (getattr(nx, "from_scipy_sparse_array", None)
                   or nx.from_scipy_sparse_matrix)
    scores = nx.pagerank(build_graph(similarity_graph))

    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                    reverse=True)
    keep = max(3, int(0.25 * len(ranked)))
    return ' '.join(sentence for _, sentence in ranked[:keep])
def create_sentence_tokens(class_num):
    """Collect the Punkt sentences of every file in one newsgroup directory.

    The process working directory is changed to the ``comp.graphics`` folder
    when ``class_num`` is 1 and to the ``rec.motorcycles`` folder otherwise;
    every entry matched by ``*`` there is read and split into sentences.

    :param class_num: 1 selects ``comp.graphics``; any other value selects
        ``rec.motorcycles``.
    :returns: one flat list with the sentences of all files.
    """
    # Raw strings keep the backslashes literal.  Without the prefix, "\U"
    # starts a unicode escape and the literal is a syntax error on Python 3.
    if class_num == 1:
        corpus_dir = r"C:\Users\MyPC\Desktop\Ass3\Ass3/20_newsgroups/comp.graphics/"
    else:
        corpus_dir = r"C:\Users\MyPC\Desktop\Ass3\Ass3/20_newsgroups/rec.motorcycles/"
    os.chdir(corpus_dir)  # note: changes the working directory of the process

    tokenizer = PunktSentenceTokenizer()  # built once, reused for every file
    all_sentence_tokens = []
    for file_name in glob.glob("*"):
        # ``with`` closes the handle even when reading or tokenizing fails.
        with open(file_name, 'rb') as handle:
            # NOTE(review): the content is read as bytes and handed to the
            # tokenizer undecoded -- confirm this is intended on Python 3.
            all_sentence_tokens += tokenizer.tokenize(handle.read())
    return all_sentence_tokens
def IsItPlagiarized():
    """Estimate how much of the submitted text can be found through Bing.

    Reads ``text_to_check`` from the request form.  A blank submission
    re-renders the submit page.  Otherwise the text is split into sentences
    and each sentence is searched on Bing as an exact phrase; a sentence
    counts as a hit unless the result page contains one of the known
    "no results" messages.  The share of hits is rendered as a percentage.

    :returns: the rendered submit or results template.
    """
    text_to_filter = request.form['text_to_check']
    if not text_to_filter.strip():
        return render_template('plagiarizer-submit.html')

    sentences = PunktSentenceTokenizer(PunktParameters()).tokenize(text_to_filter)
    no_result_markers = (
        "No results found for",
        "No hay resultados para",
        "are no results for",
    )

    probability_of_plagiarism = 0
    for a_sentence in sentences:
        # add a timer so we don't upset bing!
        time.sleep(0.3)
        # Keep printable characters only.  ``filter`` returns an iterator on
        # Python 3, so wrapping it in ``str`` produced the iterator's repr
        # instead of the sentence; join the kept characters explicitly.
        content = ''.join(ch for ch in a_sentence if ch in string.printable)
        the_term = urllib.parse.quote('+' + '"' + content + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        if not any(marker in page.text for marker in no_result_markers):
            probability_of_plagiarism += 1

    # Guard the division in case the tokenizer returned no sentence at all.
    ratio = probability_of_plagiarism / len(sentences) if sentences else 0.0
    is_it_plagiarized = str(ratio * 100) + '%'
    return render_template('plagiarizer-results.html',
                           text_to_filter=text_to_filter,
                           is_it_plagiarized=is_it_plagiarized)
def __init__(self, ignore_headers=True, raise_invalid_tags=False):
    """
    Set up the HTML parser, the tree walker, the Punkt sentence tokenizer and
    file logging.

    :param ignore_headers: If true, ignores text inside of the tags included
        in HEADER_ELEMENTS. This defaults to true because the text inside of
        these "header elements" is typically not a sentence.
    :param raise_invalid_tags: If true, raises an InvalidTagError when
        parsing a tag not in INLINE_ELEMENTS, BLOCK_LEVEL_ELEMENTS (which
        includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS,
        EMPTY_ELEMENTS, or SENTENCE_VOID_ELEMENTS. If false, ignores this
        tag and all of its children. (Sentences descending from it will not
        be included in the value returned from feed)
    """
    # self.parser is an etree parser by default.
    self.parser = html5lib.HTMLParser()
    self.walker = html5lib.getTreeWalker("etree")
    self.sentences = []
    self.ignored_parent_count = 0
    self.current_string = ''
    self.ignore_header_text = ignore_headers
    self.raise_invalid_tags = raise_invalid_tags

    abbreviations = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        'Adj', 'Adm', 'Adv', 'Asst', 'Bart', 'Bldg', 'Brig', 'Bros', 'Capt',
        'Cmdr', 'Col', 'Comdr', 'Con', 'Corp', 'Cpl', 'DR', 'Dr', 'Drs',
        'Ens', 'Gen', 'Gov', 'Hon', 'Hr', 'Hosp', 'Insp', 'Lt', 'MM', 'MR',
        'MRS', 'MS', 'Maj', 'Messrs', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms',
        'Msgr', 'Op', 'Ord', 'Pfc', 'Ph', 'Prof', 'Pvt', 'Rep', 'Reps',
        'Res', 'Rev', 'Rt', 'Sen', 'Sens', 'Sfc', 'Sgt', 'Sr', 'St', 'Supt',
        'Surg', 'v', 'vs', 'i.e', 'inc', 'rev', 'e.g', 'etc', 'Nos', 'Nr',
        'pp', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
        'Nov', 'Dec'
    ]
    punkt_param = PunktParameters()
    # Punkt lower-cases a token before looking it up in ``abbrev_types``, so
    # the entries must be lower case too; capitalised entries never match.
    punkt_param.abbrev_types = set(abbr.lower() for abbr in abbreviations)
    self.tokenizer = PunktSentenceTokenizer(punkt_param)

    logging.basicConfig(filename='html-tokenizer.log',
                        level=logging.WARNING,
                        format='[%(asctime)s] [%(levelname)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
def summarize(self):
    """Append the ``self.quota`` highest-weighted sentences to ``self.summary``.

    ``self.text`` is split into sentences with Punkt.  Each sentence gets a
    weight: the sum of the values returned by
    ``self.calculate_relative_frequence`` for its tokens against the word
    frequencies of the whole preprocessed text.  The ``self.quota`` heaviest
    sentences are kept and appended to ``self.summary`` in their original
    document order.  Nothing is returned.
    """
    params = PunktParameters()
    params.abbrev_types = {'dr', 'vs', 'mr', 'mrs', 'prof', 'inc'}
    sentences = PunktSentenceTokenizer(params).tokenize(self.text)

    # Word frequencies over the whole preprocessed text.
    document_freq = FreqDist(nltk.word_tokenize(preprocess(self.text)))

    scored = []
    for position, sentence in enumerate(sentences):
        tokens = nltk.word_tokenize(preprocess(sentence))
        relative = self.calculate_relative_frequence(tokens, document_freq)
        scored.append({
            'index': position,
            'text': sentence,
            'total': sum(relative.values()),
        })

    # Heaviest first, keep the quota, then restore reading order.
    ranked = sorted(scored, key=lambda item: item['total'], reverse=True)
    kept = sorted(ranked[:self.quota], key=lambda item: item['index'])
    for item in kept:
        self.summary.append(item['text'])
def semafor_local(text):
    """Run the local SEMAFOR script on *text* and return its parsed output.

    The text is split into sentences that are written one per line to
    ``in.txt`` inside the configured SEMAFOR ``bin`` directory.  Any previous
    ``out.txt`` is removed, ``runSemafor.sh`` is executed with the input
    path, the output path and ``'1'``, and every line of the output file is
    parsed as one JSON object.

    :param text: raw text to analyse.
    :returns: tuple ``(sentences, sentences_semantics)``: the sentences sent
        to SEMAFOR and the list of objects parsed from its output.
    """
    base_dir = config.get('semafor', 'base_dir')
    here = dirname(__file__)
    semafor = join(here, '../{0}/bin/runSemafor.sh'.format(base_dir))
    input_file = join(here, '../{0}/bin/in.txt'.format(base_dir))
    output_file = join(here, '../{0}/bin/out.txt'.format(base_dir))

    sentences = PunktSentenceTokenizer().tokenize(text)
    with open(input_file, 'w') as f:
        f.write('\n'.join(sentences))

    # Remove stale output so an old result is never mistaken for a new one.
    if isfile(output_file):
        remove(output_file)

    # stderr must be piped for ``communicate`` to return it; without the pipe
    # ``err`` was always None and nothing was ever logged.  The script reads
    # its input from ``input_file``, so nothing is sent on stdin.
    process = subprocess.Popen([semafor, input_file, output_file, '1'],
                               shell=False, stderr=subprocess.PIPE)
    _, err = process.communicate()
    if err:
        log.debug(err)

    # semafor outputs an invalid JSON, with one dictionary per line
    with open(output_file) as f:
        sentences_semantics = [json.loads(line.rstrip()) for line in f]
    return sentences, sentences_semantics
def sentence_splitter(lang):
    """
    Build a Punkt sentence tokenizer for ``lang``.

    The abbreviation file is looked up at this module's directory followed
    by ``SUBFOLDER`` and ``lang``.  When it exists, its entries (as returned
    by ``abbreviation_loader``) become the tokenizer's abbreviation types;
    otherwise an info message is logged and no abbreviations are set.

    :type lang: str
    :rtype: nltk.tokenize.punkt.PunktSentenceTokenizer
    """
    params = PunktParameters()
    abbreviation_path = os.path.dirname(__file__) + SUBFOLDER + lang
    if not os.path.isfile(abbreviation_path):
        logging.info('Abbreviation file not found for language: %s', lang)
    else:
        params.abbrev_types = set(abbreviation_loader(abbreviation_path))
    return PunktSentenceTokenizer(params)
def newPred(self, text):
    """Return the sentences of *text* that the pickled classifier labels 1.

    The text is split into sentences with Punkt (their count is printed) and
    filtered through ``PreprocessingClass().filter_sentences``.  The
    classifier, vectorizer and TF-IDF model are loaded from
    ``classifier.pickle``, ``vectorize.pickle`` and ``tfidfmodel.pickle`` in
    the working directory and applied to the filtered sentences.

    :param text: raw text to classify sentence by sentence.
    :returns: list of the sentences whose predicted label equals 1.
    """
    sentences = PunktSentenceTokenizer().tokenize(text)
    print(len(sentences))
    predList = PreprocessingClass().filter_sentences(text)

    # NOTE: pickle files must come from a trusted source; unpickling runs
    # arbitrary code.
    with open('classifier.pickle', 'rb') as f:
        clf = pickle.load(f)
    with open('vectorize.pickle', 'rb') as f:
        vect = pickle.load(f)
    with open('tfidfmodel.pickle', 'rb') as f:
        tfidf = pickle.load(f)

    # Use ``transform``, not ``fit_transform``: refitting the vectorizer on
    # the new text would replace the vocabulary the classifier was trained
    # with and so change the feature columns it receives.
    predText = vect.transform(predList).toarray()
    predText = tfidf.transform(predText).toarray()
    new_pred = clf.predict(predText)

    # presumably ``predList`` has one entry per Punkt sentence, so that
    # prediction ``i`` maps to ``sentences[i]`` -- verify against
    # ``filter_sentences``.
    return [sentences[i] for i, label in enumerate(new_pred) if label == 1]
def __init__(self, path):
    """Build the sentence graph and the word graph of a document.

    ``path`` is treated as a file to read when it exists on disk (newlines
    become spaces); otherwise the argument itself is used as the document
    text.  Matches of ``QUOTES`` are removed, the text is split into
    sentences, and a TF-IDF matrix (English stop words removed) is built
    with one row per sentence.  The sentence graph comes from the matrix
    times its transpose, the word graph from the transpose times the matrix.

    :param path: path of a text file, or the document text itself.
    """
    document = path
    if os.path.exists(path):
        with open(path, "r") as handle:
            document = handle.read().replace('\n', ' ')
    document = QUOTES.sub('', document)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    self.sentences = PunktSentenceTokenizer().tokenize(document)
    bow_matrix = tfidf_vectorizer.fit_transform(self.sentences)

    # scikit-learn 1.2 removed ``get_feature_names``; prefer the replacement
    # and keep the result a list, as before.
    feature_names = (getattr(tfidf_vectorizer, "get_feature_names_out", None)
                     or tfidf_vectorizer.get_feature_names)
    self.tfidf_features = list(feature_names())

    sentence_similarity_matrix = bow_matrix * bow_matrix.T
    word_similarity_matrix = bow_matrix.T * bow_matrix

    # networkx 3.0 removed ``from_scipy_sparse_matrix``; use its replacement
    # when it exists and fall back to the old name on older releases.
    build_graph = (getattr(nx, "from_scipy_sparse_array", None)
                   or nx.from_scipy_sparse_matrix)
    self.sentence_nx_graph = build_graph(sentence_similarity_matrix)
    self.word_nx_graph = build_graph(word_similarity_matrix)

    # Both start unset.
    self.__sentence_pagerank = None
    self.__word_pagerank = None
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.

    A pickled tokenizer is loaded from ``sentence_tokenizer.pickle`` in the
    working directory when that file exists.  Otherwise a Punkt tokenizer is
    trained on the upper- and lower-cased statements of the corpus for
    ``language`` (the English corpus when the lookup raises ``LookupError``)
    and pickled to that file for later calls.

    :param language: language object exposing ``ENGLISH_NAME``.
    :returns: a ``PunktSentenceTokenizer``.
    """
    # NOTE(review): the cache file name does not include the language, so a
    # tokenizer pickled for one language is returned for every other one --
    # confirm whether that is intended.
    pickle_path = 'sentence_tokenizer.pickle'

    try:
        # ``with`` closes the handle even when unpickling fails.
        with open(pickle_path, 'rb') as input_file:
            sentence_tokenizer = load(input_file)
    except FileNotFoundError:
        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is
            # not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))
        data_file_paths = list(corpus_files)

        # Train on both casings of every statement.
        sentences = []
        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))
        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use
        with open(pickle_path, 'wb') as output_file:
            dump(sentence_tokenizer, output_file, -1)

    return sentence_tokenizer
def text_sentences(text):
    """Clean *text* line by line and split the result into sentences.

    ``text`` may be ``bytes`` (decoded as UTF-8), ``str`` (split into lines)
    or an iterable of lines, whose ``bytes`` items are decoded as UTF-8.
    Each line goes through ``fix_text`` and is stripped; lines of at most one
    character are dropped and matches of ``blanksre`` are replaced by a single
    space.  A Punkt tokenizer is trained on the cleaned lines and then used
    to split each of them.

    :param text: bytes, str or iterable of lines.
    :returns: flat list of sentences, in line order.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    raw_lines = text.splitlines(keepends=False) if isinstance(text, str) else text

    lines = []
    for raw in raw_lines:
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        cleaned = fix_text(raw).strip()
        if len(cleaned) > 1:
            lines.append(blanksre.sub(' ', cleaned))

    punkt = PunktSentenceTokenizer(PunktParameters())
    punkt.train('\n'.join(lines))

    sentences = []
    for line in lines:
        sentences.extend(punkt.tokenize(line))
    return sentences
def test(data_generator):
    """Tokenize queries and documents, yielding per-query-term sentence lists.

    For every ``(_id, query, docs)`` item the query is converted to a token
    sequence (minus ``queries_sw`` when that is set).  Every document whose
    ``"text"`` is not already a list is split into sentences, and:

    * ``doc["offset"]`` gets one entry per sentence,
      ``[section, (start, end), sentence_text, matched_query_positions]``.
      ``section`` is ``"title"`` when the sentence starts before
      ``len(doc["title"]) - 1``; otherwise it is ``"abstract"`` and the span
      is shifted back by the title length.
    * ``doc["text"]`` is replaced by one list per query token holding the
      tokenized sentences that contain that token.

    Documents are modified in place.

    :param data_generator: iterable of ``(_id, query, docs)`` tuples.
    :returns: generator of ``(_id, tokenized_query, docs)`` tuples.
    """
    sentence_splitter = PunktSentenceTokenizer()  # one instance for all docs

    for _id, query, docs in data_generator:
        # tokenization
        tokenized_query = tokenizer.texts_to_sequences([query])[0]
        if queries_sw is not None:
            tokenized_query = [token for token in tokenized_query
                               if token not in queries_sw]

        for doc in docs:
            if isinstance(doc["text"], list):
                continue  # cached tokenization

            # sentence splitting
            text = doc["text"]
            title_len = len(doc["title"])
            sentence_texts = []
            doc["offset"] = []
            for start, end in sentence_splitter.span_tokenize(text):
                sentence = text[start:end]
                sentence_texts.append(sentence)
                if start < (title_len - 1):
                    doc["offset"].append(["title", (start, end), sentence, []])
                else:
                    doc["offset"].append(["abstract",
                                          (start - title_len, end - title_len),
                                          sentence, []])

            tokenized_sentences = tokenizer.texts_to_sequences(sentence_texts)
            if docs_sw is not None:
                # Rebuild the list.  Assigning to the loop variable, as was
                # done before, left the sequences unfiltered.
                tokenized_sentences = [
                    [token for token in sentence_tokens if token not in docs_sw]
                    for sentence_tokens in tokenized_sentences
                ]

            # One bucket per query token: the sentences that contain it.
            new_docs = []
            for k, t_q in enumerate(tokenized_query):
                matches = []
                new_docs.append(matches)
                for l, sentence_tokens in enumerate(tokenized_sentences):
                    if t_q in sentence_tokens:
                        matches.append(sentence_tokens)
                        doc["offset"][l][-1].append(k)

            doc["text"] = new_docs

        yield _id, tokenized_query, docs
class SentenceSplitter(object):
    """Groups token spans into sentences using a Punkt tokenizer."""

    def __init__(self):
        super(SentenceSplitter, self).__init__()
        self.sent_tokeniser_ = PunktSentenceTokenizer()

    def process(self, text, tokens):
        """Split *tokens* into sentences.

        :param text: the text the token offsets refer to.
        :param tokens: sequence of items whose first two elements are the
            begin and end offsets of a token in ``text``.
        :returns: list of sentences, each a list of ``Span`` objects built
            from the offsets of the tokens of that sentence.
        """
        words = [text[tok[0]:tok[1]] for tok in tokens]
        grouped = []
        offset = 0
        # Punkt returns the words grouped per sentence; the sizes of those
        # groups are used to slice the original token list.
        for sentence in self.sent_tokeniser_.sentences_from_tokens(words):
            upto = offset + len(sentence)
            grouped.append([Span(begin=tok[0], end=tok[1])
                            for tok in tokens[offset:upto]])
            offset = upto
        return grouped
def build_made_tokenizer(keep_token_strings=False):
    """Build the index tokenizer used for the MADE clinical notes.

    A ``ClinicalSentenceTokenizer`` is configured with a list of
    pre-processing split patterns and a Punkt tokenizer whose sentence-ending
    characters are ``.`` and ``!``.  It is then combined with a Treebank word
    tokenizer into an ``IndexTokenizer``.

    :param keep_token_strings: forwarded unchanged to ``IndexTokenizer``.
    :returns: the configured ``IndexTokenizer``.
    """
    print('Building MADE tokenizer...')

    cs_preprocess_split_re_strings = [
        # double newlines
        r'[\r\n]{2,}',
        # newlines with only spaces
        r'[\r\n]+\s+[\r\n]+',
        # numbered lists (e.g. "1.", "2)")
        r'(^|\r|\n)+\s*\d+[.)-]',
        # bulleted lists (e.g."*", "-")
        r'(^|\r|\n)+\s*[*-]',
        # starting labels (e.g. "WEIGHT:")
        r'(^|\r|\n)+\s*\w+[:]',
        # break up other lines separated by dates
        r'(^|\r|\n)+\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
        # MIMIC has many lines that start with this [**YYYY-M-DD**]
        r'^\[\*+\d{4}-\d{1,2}-\d{1,2}\*+\]',
        # TIU notes have long bars like this : '***********' or '===========' or '------'
        r'[*=-]{3,}',
    ]
    # NOTE: the aggressive rule r'\s{3,}' (break on runs of spaces, meant for
    # tabular data) was disabled 2-13-18: the UMass MADE challenge data often
    # ends each line with two spaces and a newline, which made it fire over
    # and over again.

    custom_lang_vars = CustomSentenceBreakingLangVars()
    custom_lang_vars.sent_end_chars = ('.', '!')
    print(custom_lang_vars.sent_end_chars)

    # "pt.", "D.R." and "P.R." already appear to be handled, so no extra
    # abbreviation types are registered on the Punkt tokenizer.
    punkt_tokenizer2 = PunktSentenceTokenizer(lang_vars=custom_lang_vars)
    treebank_tokenizer = TreebankWordTokenizer()

    clinical = basic.nlp.tokenizers.clinical_tokenizers
    cs_tokenizer = clinical.ClinicalSentenceTokenizer(
        default_sentence_tokenizer=punkt_tokenizer2,
        preprocess_split_re_strs=cs_preprocess_split_re_strings)
    return clinical.IndexTokenizer(
        cs_tokenizer, treebank_tokenizer,
        keep_token_strings=keep_token_strings)
def generate_tweet(nltk_text):
    """Pick a random generated sentence of tweet-friendly length.

    The output of ``nltk_text.generate`` is captured through
    ``store_output`` and split into sentences; the first five are discarded.
    Random sentences are then drawn and passed through ``pre_process`` until
    one is longer than 80 and shorter than 140 characters, for at most
    ``TRY_LIMIT + 1`` draws.

    :param nltk_text: object exposing a ``generate`` method.
    :returns: the accepted sentence, or ``None`` when no draw qualified or
        no sentence was left to draw from.
    """
    generated = store_output(nltk_text.generate, 10000)
    candidates = PunktSentenceTokenizer().tokenize(generated)
    candidates = candidates[5:]  # get rid of initial jargon/repetition

    # ``choice`` raises IndexError on an empty sequence; report that case
    # like any other failure to find a tweet.
    if not candidates:
        return None

    num_tries = 0
    while num_tries <= TRY_LIMIT:
        tweet = pre_process(choice(candidates))
        if 80 < len(tweet) < 140:
            return tweet
        num_tries += 1
    return None
def get_spliter():
    """Build a Punkt tokenizer from the pickled parameters plus extra abbreviations.

    The parameters are unpickled from ``sent_tokenize_model_v1.0.pkl`` in the
    working directory, their sentence starters are cleared, and a fixed list
    of abbreviations together with every single ASCII letter (upper and lower
    case) is added to their abbreviation types.

    :returns: a ``PunktSentenceTokenizer`` using those parameters.
    """
    with open('sent_tokenize_model_v1.0.pkl', 'rb') as model_file:
        params = pickle.load(model_file)
    params.sent_starters = {}

    extra_abbreviations = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…',
        'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i',
        'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…',
        'ph', 'j.k', 'e.l', 'o.t', 's.a',
    ]
    params.abbrev_types.update(extra_abbreviations)
    params.abbrev_types.update(string.ascii_uppercase)
    params.abbrev_types.update(string.ascii_lowercase)
    return PunktSentenceTokenizer(params)
class SentenceTokenizer:
    """Splits a string into sentences reported as ``TextStandoff`` ranges."""

    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize(self, string):
        """Return one ``TextStandoff`` per sentence of *string*.

        Each sentence returned by Punkt is located in ``string``, searching
        from the end of the previous sentence, and wrapped as
        ``TextStandoff(string, (start, end))``.  As a sanity check, every
        pair of standoffs is asserted to be equal or non-overlapping.

        :param string: text to split.
        :returns: list of ``TextStandoff`` objects in sentence order.
        """
        standoffs = []
        cursor = 0
        for sentence in self.tokenizer.tokenize(string):
            begin = string.index(sentence, cursor)
            cursor = begin + len(sentence)
            standoffs.append(TextStandoff(string, (begin, cursor)))

        # Distinct sentences must never overlap.
        for first in standoffs:
            for second in standoffs:
                assert first == second or not first.overlaps(second)
        return standoffs
class Summarization:
    """Step-by-step PageRank summariser over the sentences of a text.

    Each step stores its result on the instance for the next one, so the
    methods are to be called in this order: ``bag_of_words``,
    ``normalization``, ``textrank``, ``summarized_text``.  The intermediate
    steps return their result only when the module-level ``debug`` flag is
    truthy, and ``None`` otherwise.
    """

    def __init__(self, text):
        """Store *text* with newlines flattened and split it into sentences.

        :param text: raw text to summarise.
        """
        self.text = ' '.join(text.strip().split('\n'))
        self.sentence_splitter = PunktSentenceTokenizer()
        # NOTE(review): the raw ``text`` is tokenized here, not the
        # newline-flattened ``self.text`` -- confirm that is intended.
        self.sentences = self.sentence_splitter.tokenize(text)

    def tokenization(self):
        """Return the sentence list when ``debug`` is set."""
        if debug:
            return self.sentences

    def bag_of_words(self):
        """Build the term-count matrix of the sentences."""
        self.bag_of_words_matrix = CountVectorizer().fit_transform(self.sentences)
        if debug:
            return self.bag_of_words_matrix

    def normalization(self):
        """Apply TF-IDF to the counts and build the sentence similarity matrix."""
        self.normalized_matrix = TfidfTransformer().fit_transform(
            self.bag_of_words_matrix)
        self.similarity_graph = self.normalized_matrix * self.normalized_matrix.T
        if debug:
            return self.normalized_matrix

    def similarity(self):
        """Return the similarity matrix when ``debug`` is set."""
        if debug:
            return self.similarity_graph

    def textrank(self):
        """Rank the sentences with PageRank.

        :returns: list of ``(score, sentence)`` tuples, best score first.
        """
        # ``from_scipy_sparse_matrix`` was removed in networkx 3.0; use its
        # replacement when it exists and fall back to the old name otherwise.
        build_graph = (getattr(nx, "from_scipy_sparse_array", None)
                       or nx.from_scipy_sparse_matrix)
        self.nx_graph = build_graph(self.similarity_graph)
        self.scores = nx.pagerank(self.nx_graph)
        self.sorted_text = sorted(
            ((self.scores[i], s) for i, s in enumerate(self.sentences)),
            reverse=True)
        if debug:
            # Single-argument call form: prints the same text on Python 2
            # and Python 3.
            print("\n\n")
            print("Scores.....\n")
            print(self.sorted_text)
        return self.sorted_text

    def summarized_text(self):
        """Join the ranked sentences into one whitespace-normalised string.

        :returns: the sentences of ``self.sorted_text``, best-ranked first,
            separated by single spaces.
        """
        # Join with a space: plain concatenation glued the last word of one
        # sentence to the first word of the next.
        self.summary = ' '.join(sentence for _, sentence in self.sorted_text)
        self.summary = ' '.join(self.summary.strip().split('\n'))
        self.summary = ' '.join(self.summary.split())
        return self.summary
def getrank(document):
    """Return the sentences of *document* whose scaled rank is well above average.

    Sentences are vectorised (term counts, then TF-IDF), and the product of
    that matrix with its transpose is used as a weighted graph scored with
    PageRank.  Scores are min-max scaled to ``[0, 1]`` (all zeros when every
    score is equal).  A sentence is selected when its scaled score is greater
    than the mean scaled score plus 0.25.

    :param document: text to analyse.
    :returns: the selected sentences, in their original document order.
    """
    sentences = PunktSentenceTokenizer().tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T

    # networkx 3.0 removed ``from_scipy_sparse_matrix``; use its replacement
    # when it exists and fall back to the old name on older releases.
    build_graph = (getattr(networkx, "from_scipy_sparse_array", None)
                   or networkx.from_scipy_sparse_matrix)
    values = networkx.pagerank(build_graph(similarity_graph))

    # Kept as a plain list of (score, sentence) tuples: converting it to a
    # numpy array turned the float scores into strings.
    ranked = sorted(((values[i], s) for i, s in enumerate(sentences)),
                    reverse=True)

    score_max = float(ranked[0][0])
    score_min = float(ranked[-1][0])
    spread = score_max - score_min
    if spread == 0:
        scaled = [0] * len(ranked)
    else:
        scaled = [(score - score_min) / spread for score, _ in ranked]

    threshold = (sum(scaled) / len(scaled)) + 0.25
    # A set makes the membership test below constant-time.
    selected = {sentence
                for (_, sentence), value in zip(ranked, scaled)
                if value > threshold}
    return [sentence for sentence in sentences if sentence in selected]