def texttiling():
    conn = db_conn("map")
    cur = conn.cursor()
    tt = TextTilingTokenizer()
    # select all unique observations
    sql = "SELECT DISTINCT(observation) FROM utterances"
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each observation
    for i, obsv in enumerate(unique_observs):
        sql = 'SELECT utterID, tagged FROM utterances WHERE observation = %s AND tagged <> ""'
        cur.execute(sql, [obsv])
        utter_id, tagged = zip(*cur.fetchall())
        text = "\n\n\n\t".join(tagged)
        try:
            segmented_text = tt.tokenize(text)
        except Exception as e:
            raise e
        else:
            uid_idx = 0
            for j, seg in enumerate(segmented_text):
                topic_id = j + 1
                sents = [s for s in seg.split("\n\n\n\t") if s != ""]
                for k, s in enumerate(sents):
                    in_topic_id = k + 1
                    sql = "UPDATE utterances SET topicID = %s, inTopicID = %s " \
                          "WHERE observation = %s AND utterID = %s"
                    cur.execute(sql, (topic_id, in_topic_id, obsv, utter_id[uid_idx]))
                    uid_idx += 1
        conn.commit()
        sys.stdout.write("\r{}/{}".format(i + 1, len(unique_observs)))
        sys.stdout.flush()
def texttiling_text(text, k=20, w=40, smoothing_width=10, smoothing_rounds=5):
    tt = TextTilingTokenizer(stopwords=raw_stopword_list,
                             k=k,
                             w=w,
                             smoothing_width=smoothing_width,
                             smoothing_rounds=smoothing_rounds)
    o = tt.tokenize(text)
    return o
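# A minimal, hedged usage sketch for texttiling_text() above. It assumes
# raw_stopword_list is defined in the surrounding module (e.g. an NLTK English
# stopword list) and uses a Brown-corpus excerpt, since TextTiling needs a
# fairly long text containing blank-line paragraph breaks.
from nltk.corpus import brown  # requires the 'brown' corpus to be downloaded

sample = brown.raw()[:10000]
sections = texttiling_text(sample, k=10, w=20)
print('found {} sections'.format(len(sections)))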
def split_pp_to_paragraphs(clean_pp, contractions_dict, pattern):
    """
    Uses TextTilingTokenizer to split a privacy policy into paragraphs; the
    document should be pre-processed (HTML cleaned) before reaching this function.
    :param clean_pp: clean pp before expansion of contractions and special cases
    :param contractions_dict: a dictionary that includes all varieties of contractions and their expansion
    :param pattern: pattern for the expansion of contractions
    :return: list of paragraphs
    """
    clean_pp = clean_pp_advanced(clean_pp, contractions_dict, pattern)
    ttt = TextTilingTokenizer()
    paragraphs = ttt.tokenize(clean_pp)
    return paragraphs
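# Hypothetical driver for split_pp_to_paragraphs(); contractions_dict, pattern,
# and clean_pp_advanced() normally come from the surrounding project's
# contraction-expansion setup, so the values below are illustrative placeholders only.
import re

contractions_dict = {"don't": "do not", "can't": "cannot", "it's": "it is"}
pattern = re.compile('({})'.format('|'.join(map(re.escape, contractions_dict))),
                     flags=re.IGNORECASE)
paragraphs = split_pp_to_paragraphs(policy_text,  # placeholder: HTML-cleaned policy string
                                    contractions_dict, pattern)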
def segments(txt):
    ttt = TextTilingTokenizer()
    tokens = ttt.tokenize(txt)
    start = 0
    end = 0
    tileSpan = []
    for token in tokens:
        end = start + len(token)
        tileSpan.append((start, end))
        start = end
    return tileSpan
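# Illustrative call to segments(): the input must contain blank-line paragraph
# breaks and be long enough for TextTiling's window, otherwise tokenize()
# raises an exception. 'long_document.txt' is a placeholder file name.
with open('long_document.txt') as f:
    for start, end in segments(f.read()):
        print('tile spans characters {}..{}'.format(start, end))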
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab

    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def vis_tokenize(context, question):
    glove = utils.load_glove(dim=200)
    ttt = TextTilingTokenizer()
    para_list = []
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)
    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
def run():
    for idx, filename in enumerate(os.listdir(os.getcwd() + '/papers')):
        paper_path = os.getcwd() + '/papers/' + filename
        content = (convert_pdf_to_txt(paper_path))\
            .lower()\
            .replace('. ', '. ')\
            .replace('. ', '. ')
        # .replace('\n\n', '\n')\
        # .replace('\f', '')
        # .replace('\n', ' ')\
        # .replace('-', ' ')\
        relevant_text = get_intro_conclusion(content)
        raw_docs[idx] = relevant_text
        relevant_text = relevant_text\
            .replace('. ', '. ')\
            .replace('. ', '. ')\
            .replace('- ', '')
        # Utilising NLTK Text Tiling with default params
        # seg_2 = TextTilingTokenizer().tokenize(relevant_text)
        # Utilising NLTK Text Tiling with custom params (pseudosentence size, block comparison size)
        tt = TextTilingTokenizer(w=10, k=4)
        paper_tiles = tt.tokenize(relevant_text)
        text_tiles[idx] = paper_tiles
    return raw_docs, text_tiles
def segment_transcript(doc):
    """doc is a document object with text lines in 'transcript';
    add a list of 'topics' to the document object and return it
    """
    tok = TextTilingTokenizer()
    lines = [turn['text'] for turn in doc['lines']]
    text = "\n\n".join(lines)
    doc['topics'] = []
    start = 0
    for topic in tok.tokenize(text):
        length = len(topic.strip().split('\n\n'))
        end = start + length
        doc['topics'].append({'start': start, 'end': end})
        start = end
    return doc
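# Sketch of the document shape segment_transcript() expects: a dict whose
# 'lines' entries each carry a 'text' field. transcript_turns is a placeholder
# list of utterance strings; the joined text must be long enough for TextTiling.
doc = {'lines': [{'text': turn} for turn in transcript_turns]}
doc = segment_transcript(doc)
print(doc['topics'])  # e.g. [{'start': 0, 'end': 12}, {'start': 12, 'end': 27}, ...]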
def texttiling_BNC():
    conn = db_conn('bnc')
    cur = conn.cursor()
    # select unique convId
    query = 'select distinct(convId) from entropy_DEM_full'
    cur.execute(query)
    conv_ids = [t[0] for t in cur.fetchall()]
    # for each convId, do texttiling, and update the episodeId and inEpisodeId columns
    tt = TextTilingTokenizer()
    for i, cid in enumerate(conv_ids):
        query = 'select strLower from entropy_DEM_full where convId = %s'
        cur.execute(query, [cid])
        text = '\n\n\n\t'.join([t[0] for t in cur.fetchall()])
        try:
            segmented = tt.tokenize(text)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            if str(exc_obj) == 'Input vector needs to be bigger than window size.' or \
                    str(exc_obj) == 'No paragraph breaks were found(text too short perhaps?)':
                # it means the conversation is too short
                pass
            else:
                raise
        else:
            global_id = 1
            for j, seg in enumerate(segmented):
                epi_id = j + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for k, s in enumerate(sents):
                    in_epi_id = k + 1
                    # update
                    query = 'update entropy_DEM_full set episodeId = %s, inEpisodeId = %s ' \
                            'where convId = %s and globalId = %s'
                    cur.execute(query, (epi_id, in_epi_id, cid, global_id))
                    global_id += 1
        # print progress
        sys.stdout.write('\r%s/%s updated' % (i + 1, len(conv_ids)))
        sys.stdout.flush()
    # commit
    conn.commit()
class TopicTokenizer:
    """
    Text tiling tokenizer
    """

    def __init__(self, cutoff_policy='HC', stop_words=stopwords.words('english'),
                 w=20, k=10):
        """ Constructor """
        self.__stop_words = stop_words
        self.__cutoff_policy = cutoff_policy
        self.__w = w
        self.__k = k
        self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                           cutoff_policy=cutoff_policy,
                                           w=w,
                                           k=k)

    def get_boundaries(self, text):
        """
        Get potential topic boundaries within the text.

        :param text: The text to tile
        :return: A list of potential topics
        """
        topics = self.__tiler.tokenize(text)
        return topics

    def reload_tiler(self):
        """
        Reload the text tiler. Use if memory is an issue.
        """
        del self.__tiler
        self.__tiler = TextTilingTokenizer(stopwords=self.__stop_words,
                                           cutoff_policy=self.__cutoff_policy,
                                           w=self.__w,
                                           k=self.__k)
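# Possible driver for the TopicTokenizer class above; long_text is a placeholder
# for any sufficiently long document with blank-line paragraph breaks.
tokenizer = TopicTokenizer(cutoff_policy='HC', w=20, k=10)
topics = tokenizer.get_boundaries(long_text)
print('{} candidate topic segments'.format(len(topics)))
tokenizer.reload_tiler()  # optional: rebuild the underlying tiler to release memory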
def __init__(self, w=200, k=5):
    # call super class constructor
    TextTilingTokenizer.__init__(self, w, k,
                                 stopwords=nltkstop.words(LANG),
                                 demo_mode=True)
class TexttileWrapper:

    def __init__(self):
        self._tt = TextTilingTokenizer()

    def sentence_array_texttile(self, sentences):
        text = " \n\n".join(x for x in sentences if len(x) > 0) + "\n\n"
        tok = self._tt.tokenize(text)

        assignments = [0] * len(sentences)
        if tok:
            for ii in range(len(sentences)):
                try:
                    assignments[ii] = min(x for x in range(len(tok))
                                          if sentences[ii] in tok[x]) + 1
                except ValueError:
                    print("ERROR %i!" % ii)
                    # print(text.encode("ascii", "ignore"))
                    # print(tok)
                    assignments[ii] = 0
        print("**************")
        print(assignments)

        # Make assignments monotonically increasing
        last_assignment = -1
        assignments_seen = -1
        for ii in range(len(assignments)):
            if assignments[ii] != last_assignment:
                assignments_seen += 1
                last_assignment = assignments[ii]
            assignments[ii] = assignments_seen
        print(assignments)
        return assignments

    def fallback_segmenter(self, text, max_sentence_length=500,
                           arbitrary_words_per_sent=30,
                           max_sentences_per_texttile=15,
                           arbitrary_sentences_per_tile=6):
        # First, try to segment into sentences with punkt
        sentences = punkt.tokenize(text)

        # If that doesn't work, use a really stupid regexp
        longest_sentence = max(len(x) for x in sentences)
        print("Longest sentence is %i" % longest_sentence)
        if longest_sentence > 500:
            print("Using regexp sentence breaker")
            sentences = punct_regexp.findall(text)

        # If that still doesn't work, use arbitrary breaks
        if max(len(x) for x in sentences) > 600:
            print("Using ad hoc sentence breaker")
            sentences = []
            words = text.split()
            num_words = len(words)
            for ii in range(num_words // arbitrary_words_per_sent + 1):
                sentences.append(" ".join(
                    words[ii * arbitrary_words_per_sent:
                          min((ii + 1) * arbitrary_words_per_sent, num_words)]))

        # Now feed that into texttile
        print(sentences)
        try:
            tile_assignments = self.sentence_array_texttile(sentences)
            tiles = set(tile_assignments)
        except ValueError:
            tile_assignments = None

        # If that doesn't work, split "sentences", however defined, into
        # reasonably sized chunks
        if tile_assignments is None or \
                max(sum(1 for y in tile_assignments if y == x)
                    for x in tiles) > max_sentences_per_texttile:
            tile_assignments = [x // arbitrary_sentences_per_tile
                                for x in range(len(sentences))]

        return sentences, tile_assignments

    def fallback_wrapper(self, text):
        sentences, assignments = self.fallback_segmenter(text)
        num_sents = len(sentences)
        tiles = []
        for ii in range(max(assignments) + 1):
            tiles.append(" ".join(sentences[x] for x in range(num_sents)
                                  if assignments[x] == ii))
        return tiles
def get_paragraphs_from_text(text):
    tiling_tokenizer = TextTilingTokenizer()
    paragraphs = tiling_tokenizer.tokenize(text)
    return paragraphs
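# Trivial usage sketch; article_text is a placeholder for a long text that
# already contains blank-line paragraph breaks.
for i, para in enumerate(get_paragraphs_from_text(article_text)):
    print(i, para[:80])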
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tag import pos_tag, pos_tag_sents
from nltk import word_tokenize
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')
doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [
    list(filter(lambda taggedword: taggedword[0] not in stopwords, p))
    for p in tagged_parags
]

for i, p in enumerate(parags):
    buffer_tiled += p
# db init: ssh [email protected] -i ~/.ssh/id_rsa -L 1234:localhost:3306
conn = MySQLdb.connect(host="127.0.0.1",
                       user="******",
                       port=3306,
                       passwd="05012014",
                       db="bnc")
cur = conn.cursor()

# all convIDs
sql = 'SELECT DISTINCT(convID) FROM entropy_DEM100'
cur.execute(sql)
convIDs = [tup[0] for tup in cur.fetchall()]
convIDs.sort()

# tokenizer
tt = TextTilingTokenizer()

# get the text of each convID, and do the TextTiling
failed_convIDs = []
for cid in convIDs:
    sql = 'SELECT rawWord FROM entropy_DEM100 WHERE convID = %s'
    cur.execute(sql, [cid])
    text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])
    try:
        segmented_text = tt.tokenize(text)
    except Exception as e:
        print('convID %d failed' % cid)
        failed_convIDs.append(cid)
    else:
        global_idx = 1
                       user="******",
                       port=1234,
                       passwd="05012014",
                       db="swbd")
cur = conn.cursor()

# create the table
sql = 'DROP TABLE IF EXISTS textTiling'
cur.execute(sql)
sql = 'CREATE TABLE textTiling (convID INT, globalID INT, tileID INT, inTileID INT, entropy FLOAT, \
    PRIMARY KEY (convID, globalID));'
cur.execute(sql)

# initialize
tt = TextTilingTokenizer()
# tt_demo = TextTilingTokenizer(demo_mode = True)

# get all conversation IDs
sql = 'SELECT DISTINCT(convID) FROM entropy'
cur.execute(sql)
convIDs = [tup[0] for tup in cur.fetchall()]

# get text for each cid and do text tiling
for cid in convIDs:
    sql = 'SELECT tagged FROM entropy WHERE convID = %d' % cid
    cur.execute(sql)
    text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])
    # tiling
    try:
        segmented_text = tt.tokenize(text)
class TextHandler(object):
    '''
    Parse out individual pieces of text. Tools available from this API class include:

        - Text Tiling
        - RBF Networks
        - Fuzzy Clustering with Cosine Distances (custom and open source implementation of mine)
        - MNB recognition
        - sent tokenizing
        - minimum distance matching
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__mnbClassifier = None
        self.__rbfClassifier = None
        self.__textTiler = None
        self.__vectorizer = None
        self.__tfidf = None
        self.__mnbTrained = False
        self.__clustMatrix = None
        self.__preds = []

    def delMNB(self):
        '''
        GC Object
        '''
        self.__mnbClassifier = None
        self.__mnbTrained = False
        gc.collect()
        del gc.garbage[:]

    def delRBF(self):
        '''
        GC Object
        '''
        self.__rbfClassifier = None
        gc.collect()
        del gc.garbage[:]

    def delTextTiler(self):
        '''
        GC Object
        '''
        self.__textTiler = None
        gc.collect()
        del gc.garbage[:]

    def delVectorizer(self):
        '''
        GC Object
        '''
        self.__vectorizer = None
        gc.collect()
        del gc.garbage[:]

    def delTfIdf(self):
        '''
        GC Object
        '''
        self.__tfidf = None
        gc.collect()
        del gc.garbage[:]

    def delClustMatrix(self):
        '''
        GC Object
        '''
        self.__clustMatrix = None
        gc.collect()
        del gc.garbage[:]

    def resetPreds(self):
        '''
        GC Object
        '''
        self.__preds = []
        gc.collect()
        del gc.garbage[:]

    def getPreds(self):
        '''
        Return a list of the current prediction names.
        '''
        return self.__preds

    def getClustMatrix(self):
        '''
        Returns the cluster matrix.
        '''
        return self.__clustMatrix

    def getNamedEntities(self):
        '''
        Parse out named entities.
        '''
        pass

    def buildVectorizer(self, vector):
        '''
        Instantiates the vectorizers such as count vectorizer or tfidf vectorizer.
        This is useful when calling the vectorizer multiple times.

        *Required Parameters*
        :param vector: which vectorizer to build ('count','tfidf')
        '''
        if vector == 'count':
            self.__vectorizer = CountVectorizer(stop_words='english')
        elif vector == 'tfidf':
            self.__tfidf = TfidfTransformer(norm='l2')

    def trainVectorizers(self, document):
        '''
        Train the vectorizers with a document that should be tokenized into sentences and words.

        **Warning: All listed items will be concatenated to a single matrix**

        *Required Parameters*
        :param document: the document (text) or list of documents (file paths) to build count and tfidf vectorizers with (be as representative as possible)
        '''
        self.buildVectorizer('count')
        self.buildVectorizer('tfidf')
        if type(document) is str:
            self.__tfidf.fit(self.__vectorizer.fit_transform(document))
        else:
            uvecs = None
            sentences = []
            for doc in document:
                if os.path.exists(doc) is True:
                    sentences = []
                    with open(doc, 'r') as fp:
                        sentences.extend([parse(x, tags=False, chunks=False).split(" ")
                                          for x in self.__sent_tokenizer.tokenize(fp.read())])
            if uvecs is not None:
                self.__tfidf.fit(self.__vectorizer.fit(sentences))

    def buildClassifier(self, classifier):
        '''
        Instantiates each of the Classifiers. Vectorizers should be built separately.

        *Required Parameters*
        :param classifier: specify which type of classifier to build (mnb,rbf,sent,textTile)
        '''
        if classifier == 'mnb':
            self.__mnbClassifier = MultinomialNB()
        elif classifier == 'rbf':
            self.__rbfClassifier = SVC()
        elif classifier == 'sent':
            self.__sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        elif classifier == 'textTile':
            self.__textTiler = TextTilingTokenizer()
        elif classifier == 'clust':
            self.__clustMatrix = None

    def trainMinDistanceMat(self, cats):
        '''
        Train the Minimum Distance Cluster Matrix from given files.

        **WARNING: This will turn a sparse vector to a dense vector**

        *Required Parameters*
        :param cats: categories to use
        '''
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences = []
                with open(cat, 'r') as fp:
                    sentences = [parse(x, tags=False, chunks=False).split(" ")
                                 for x in self.__sent_tokenizer.tokenize(fp.read())]
                if len(sentences) > 0:
                    vecs = self.__vectorizer.transform(sentences)
                    vecs = self.__tfidf.transform(vecs)
                    if self.__clustMatrix is None:
                        self.__clustMatrix = vecs.mean()
                    else:
                        self.__clustMatrix = scipy.sparse.vstack((self.__clustMatrix, vecs.mean()))

    def getMinDistanceCategory(self, document):
        '''
        Find the best strength document via the trained cats to the document.
        Requires building the clustMatrix and vectorizers.

        **WARNING: This will turn a sparse vector to a dense vector**

        *Required Parameters*
        :param document: document to test with
        '''
        vecs = self.__tfidf.transform(self.__)

    def trainMNB(self, cats, partial=False):
        '''
        Train Multinomial Bayes to use in obtaining the appropriate weights for data.
        Please instantiate the sent_tokenizer.

        *Required Parameters*
        :param cats: list of category directories

        *Optional Parameters*
        :param partial: whether to create a partial fit from the data (if using partial, please train the vectorizers first)
        '''
        self.__preds = []
        gc.collect()
        del gc.garbage[:]
        if self.__vectorizer is None:
            self.buildVectorizer('count')
        if self.__tfidf is None:
            self.buildVectorizer('tfidf')
        cl = []
        uvecs = None
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences = []
                with open(cat, 'r') as fp:
                    sentences = [parse(x, tags=False, chunks=False).split(" ")
                                 for x in self.__sent_tokenizer.tokenize(fp.read())]
                if len(sentences) > 0:
                    vecs = self.__vectorizer.transform(sentences)
                    vecs = self.__tfidf.transform(vecs)
                    if partial is True:
                        self.__preds.append(re.sub(r"\..*|\/", "", os.path.split(cat)[1]).strip())
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds) - 1)
                        self.__mnbClassifier.partial_fit(vecs, numpy.asarray(cl))
                        cl = []
                    else:
                        self.__preds.append(re.sub(r"\..*|\/", "", os.path.split(cat)[1]).strip())
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds) - 1)
                        if uvecs is None:
                            uvecs = vecs
                        else:
                            uvecs = scipy.sparse.vstack((uvecs, vecs))
        if partial is False and uvecs is not None:
            self.__mnbClassifier.fit(uvecs, cl)
        del uvecs
        del cl
        gc.collect()
        del gc.garbage[:]

    def classifyMNB(self, document):
        '''
        Multinomial Bayes algorithm for the fastest but least reliable results.
        Use only if the topics are clearly distinguishable.
        Requires building vectorizers and training MNB first.
        Returns the name and number of the category to work with.

        *Required Parameters*
        :param document: document to classify
        '''
        sentences = numpy.asarray([parse(x, tags=False, chunks=False).split(" ")
                                   for x in self.__sent_tokenizer.tokenize(document)])
        vecs = self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x], x) for x in self.__mnbClassifier.predict(vecs)]

    def trainRBF(self, cats):
        '''
        Trains an RBF classifier for use in categorization.
        There is no partial fit for a neural network; everything must fit in memory.

        *Required Parameters*
        :param cats: list of category files to train on
        '''
        cl = []
        uvecs = None
        self.__preds = []
        gc.collect()
        del gc.garbage[:]
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences = []
                with open(cat, 'r') as fp:
                    sentences = numpy.asarray([parse(x, tags=False, chunks=False).split(" ")
                                               for x in self.__sent_tokenizer.tokenize(fp.read())])
                if len(sentences) > 0:
                    vecs = self.__vectorizer.transform(sentences)
                    vecs = self.__tfidf.transform(vecs)
                    if uvecs is None:
                        uvecs = vecs
                    else:
                        uvecs = scipy.sparse.vstack((uvecs, vecs))
                    self.__preds.append(re.sub(r"\..*|\/", "", os.path.split(cat)[1]).strip())
                    for i in range(vecs.shape[0]):
                        cl.append(len(self.__preds) - 1)
        if uvecs is not None:
            self.__rbfClassifier.fit(uvecs, cl)
        del cl
        del uvecs
        gc.collect()
        del gc.garbage[:]

    def classifyRBF(self, document):
        '''
        Classify with RBF Neural Network from SK Learn.
        Requires training the count and tfidf vectorizers, the sentence tokenizer,
        and the RBF classifier first.

        *Required Parameter*
        :param document: text document to use
        '''
        sentences = numpy.asarray([parse(x, tags=False, chunks=False).split(" ")
                                   for x in self.__sent_tokenizer.tokenize(document)])
        vecs = self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x], x) for x in self.__rbfClassifier.predict(vecs)]

    def sentTokenize(self, document, parse=False, remPunc=True):
        '''
        Used to tile on sentences using the Brown corpus from nltk.
        No vectorizers are necessary, but the sent_tokenizer needs to be established.

        *Required Parameters*
        :param document: full text of document

        *Optional Parameters*
        :param parse: whether to use CLIPS pattern to stem and disambiguate the sentence
        :param remPunc: whether to remove punctuation (default is True) [certain algos. such as max ent for sentence detection may require False]
        '''
        print(self.__sent_tokenizer.tokenize(document))
        sentences = [(lambda x: parse(x, tags=False, chunks=False).split(" ") if parse is True else x)(x)
                     for x in self.__sent_tokenizer.tokenize(document)]
        return sentences

    def textTiler(self, document, parse=False):
        '''
        Tile text for further processing. Separation by topic is recommended before
        identifying what that topic is. Even better results can be obtained with
        SimplrTerms' feature folder, but a tool like that can take a while.

        *Required Parameters*
        :param document: The Document to Tile

        *Optional Parameters*
        :param parse: whether to stem and disambiguate sentences in the document using pattern clips
        '''
        if parse is True:
            document = "\n".join(self.sentTokenize(document, parse, remPunc=False))
        return self.__textTiler.tokenize(document)
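# A hedged usage sketch for the text-tiling path of TextHandler above: the tiler
# has to be instantiated via buildClassifier('textTile') before textTiler() is
# called, and buildClassifier('sent') is additionally needed if parse=True.
# document_text is a placeholder for a long document with blank-line paragraph breaks.
handler = TextHandler()
handler.buildClassifier('textTile')
tiles = handler.textTiler(document_text)
print('{} tiles'.format(len(tiles)))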