def sentence_tokenizer(self, untokenized_string, language):
    """Load the trained .pickle for the given language and tokenize a string into sentences."""
    if language == 'greek':
        pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', ';')
        language_punkt_vars.internal_punctuation = (',', '·')
    elif language == 'latin':
        pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', '?', ':')
        language_punkt_vars.internal_punctuation = (',', ';')
    else:
        # Fail early instead of falling through to an undefined pickle_path.
        raise ValueError("No sentence tokenizer available for language '%s'." % language)
    with open(pickle_path, 'rb') as open_pickle:
        tokenizer = pickle.load(open_pickle)
    tokenizer.INCLUDE_ALL_COLLOCS = True
    tokenizer.INCLUDE_ABBREV_COLLOCS = True
    params = tokenizer.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenized_sentences = []
    for sentence in sbd.sentences_from_text(untokenized_string, realign_boundaries=True):
        tokenized_sentences.append(sentence)
    return tokenized_sentences
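# A minimal sketch (not part of the original function) of how a language
# pickle like the ones loaded above could be trained with NLTK's PunktTrainer.
# The training text and output path are placeholder assumptions.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

def train_sentence_pickle(training_text, out_path):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(training_text, finalize=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    with open(out_path, 'wb') as f:
        pickle.dump(tokenizer, f)
    return tokenizer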
def featureize(F, observation_files):
    word_tokenizer = PunktSentenceTokenizer()  # unused: the word_tokenize call below is commented out
    sent_tokenizer = PunktSentenceTokenizer()
    m = len(observation_files)
    # X is m x 2
    X = np.zeros((m, 2), dtype=float)
    for (i, filename) in enumerate(observation_files, start=0):
        file_text = read_file(filename).decode('string_escape')
        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2
        # num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())
        # Two features per file:
        #   column 0 - number of sentences
        #   column 1 - number of tokens
        X[i][0] = num_sents
        X[i][1] = num_tokens
    return X
def tokenize(self):
    """Return a list of tokenized sentences."""
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(self.text)
    sentences = [sentence.split() for sentence in sentences]
    sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
    return sentences
def _iter_text_data(self):
    pst = PunktSentenceTokenizer()
    for fragment in self.fragments:
        text = (fragment.text or '').strip()
        if not text:
            continue
        label = fragment.label
        sentences = pst.sentences_from_text(fragment.text)
        for sentence in sentences:
            yield sentence.encode('utf-8'), label
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)
    tokens = []
    for sentence in sentences:
        # sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    # Strip trailing periods from the collected tokens
    for ii in xrange(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
class GCBlockExtractor(ExtractionMapper):

    def __init__(self):
        super(GCBlockExtractor, self).__init__(
            extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
                # maybe count tokens? or non-spaces?
        return blocks
def raw_records(crawl_collection, start):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    for rec in crawl_collection.query({'downloaded': True}, field=None, skip=start):
        _id = rec['_id']
        if rec['content'] is None:
            continue
        content = rec['content']['contents']
        # A wiki page may comprise multiple content blocks
        for c in content:
            # Explode a long topic into a list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                yield (_id, s)
def tokenize_sents_latin(sentences_string):
    """Tokenize a Latin string into sentences."""
    global tokenenized_sentences
    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    # print(tokenenized_sentences)
    return tokenenized_sentences
def add_sents(invid=None):
    if invid:
        findObj = {"_id": invid}
    else:
        findObj = {}
    for vd in vdigests.find(findObj):
        if not vd.get("nSentences") and vd.get('alignTrans') and vd.get('alignTrans').get('words'):
            twords = vd['alignTrans']['words']
            twords_len = len(twords)
            trans = " ".join([wrd["word"] for wrd in twords])
            STokenizer = PunktSentenceTokenizer()
            token_sents = STokenizer.sentences_from_text(trans)
            cwct = 0
            sentct = 0
            curword = twords[cwct]
            for tsent in token_sents:
                tswords = tsent.split(" ")
                for wnum, tsword in enumerate(tswords):
                    if tsword == curword["word"]:
                        curword["sentenceNumber"] = sentct
                        cwct += 1
                        if cwct < twords_len:
                            curword = twords[cwct]
                    else:
                        print "warning: not a one-to-one match: ", curword["word"], tsword
                        if wnum == 0:
                            curword["sentenceNumber"] = sentct - 1
                            cwct += 1
                            if cwct < twords_len:
                                curword = twords[cwct]
                        elif wnum == len(tswords) - 1:
                            curword["sentenceNumber"] = sentct
                        else:
                            ipdb.set_trace()
                sentct += 1
            vd['nSentences'] = len(token_sents)
            # write the separated sentences to file
            ssout_name = "ss-" + vd["_id"]
            outf = open("../ffdata/rawtrans/" + ssout_name, 'w')
            outf.write("\n".join(token_sents))
            outf.close()
            vd['sentSepTransName'] = ssout_name
            vdigests.save(vd)
def tokenize_greek_sentences(sentences_string):
    """Tokenize a Greek string into sentences."""
    global tokenenized_sentences
    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    '''
    with open(input_file) as f:
        to_be_tokenized = f.read()
    '''
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    # print(tokenenized_sentences)
    return tokenenized_sentences
def chunk_article(article):
    """
    Convert a long string holding the full text of an article into a list of
    sentences.

    :param article: A string representing the full text of an article
    :return: A list of strings representing the sentences of the article
    """
    # Register abbreviations such as "dr" or "mr" so the tokenizer does not
    # falsely split a sentence at a title.
    p_params = PunktParameters()
    p_params.abbrev_types = set(ABBREV_TYPES)
    p = PunktSentenceTokenizer(p_params)
    sen = p.sentences_from_text(article, realign_boundaries=False)
    # Strip extra spaces; str.strip() returns a new string, so reassign it.
    sen = [s.strip() for s in sen]
    return sen
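# A small self-contained illustration of the same idea as chunk_article above:
# seeding PunktParameters with abbreviations (the list below is an assumed
# stand-in for ABBREV_TYPES) so a title such as "Dr." does not end a sentence.
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

example_params = PunktParameters()
example_params.abbrev_types = set(['dr', 'mr', 'mrs', 'ms', 'prof', 'vs'])
example_tokenizer = PunktSentenceTokenizer(example_params)
print(example_tokenizer.sentences_from_text("Dr. Smith arrived. He sat down."))
# Expected: ['Dr. Smith arrived.', 'He sat down.']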
def export_crawl_to_text(mineDB):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    text_path = os.path.realpath('./mine.txt')
    with codecs.open(text_path, 'w', 'utf-8') as f:
        m = 0
        for wiki in mineDB.query({'downloaded': True}, field=None):
            # Skip empty content or records already added to the graph
            if wiki['content'] is None or 'added_to_graph' in wiki:
                continue
            content = wiki['content']
            # A wiki page may comprise multiple content blocks
            for c in content['contents']:
                # Explode content into sentences
                sentences = pst.sentences_from_text(c)
                print('... content #{} ==> {} sentences extracted.'.format(m, len(sentences)))
                for s in sentences:
                    # Cleanse the sentence
                    s_ = cleanse(s)
                    # Filter out noise by length
                    if len(s_) < 5 or len(s_.split(' ')) < 3:
                        continue
                    f.write(s_.lower() + '\n')
                m += 1
            if m >= args['limit']:
                print(colored('[Ending] Maximum number of topics reached.', 'yellow'))
                break
    return text_path
def iter_topic(crawl_collection, start):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    n = 0
    for wiki in crawl_collection.query({'downloaded': True}, field=None, skip=start):
        # Skip empty content or records already added to the graph
        if wiki['content'] is None or 'added_to_graph' in wiki:
            continue
        m = 0
        content = wiki['content']
        if args['verbose']:
            print(colored('[Extracting wiki] : ', 'cyan'), content['title'])
        # A wiki page may comprise multiple content blocks
        for c in content['contents']:
            # Explode a long topic into a list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                m += 1
                yield (content['title'], s.split(' '))
        # After all sentences are processed,
        # mark the current wiki record as 'processed'
        crit = {'_id': wiki['_id']}
        crawl_collection.update(crit, {'$set': {'added_to_graph': True}})
        n += 1
        if args['verbose']:
            print(content['title'] + " processed with {0} nodes.".format(m))
            print(colored("{0} wiki documents processed so far...".format(n), 'blue'))
class GaleChurchAlignmentDistance(DistanceScorer):

    def __init__(self):
        self.name = "Gale Church Alignment Scorer"
        self.tokenizer = PunktSentenceTokenizer()
        self.sblocks, self.tblocks = [], []

    def _blocks_from_text(self, text):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                text.replace('\n', '')):
            blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.sblocks.append(self._blocks_from_text(page.text))
        for url, page in target_corpus.iteritems():
            self.tblocks.append(self._blocks_from_text(page.text))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        # NOTE: self.gc (the Gale-Church aligner) is assumed to be provided
        # elsewhere; it is not set in this snippet.
        return self.gc.align_score(self.sblocks[s_idx], self.tblocks[t_idx])
class SimhashDistance(DistanceScorer):
    CHAR, TOKEN = range(2)

    def __init__(self, source_tokenizer, target_tokenizer, n=2, level=TOKEN):
        self.name = "Simhash Distance Scorer, n=%d" % n
        self.sentence_splitter = PunktSentenceTokenizer()
        self.s_hashes, self.t_hashes = [], []
        self.source_tokenizer = source_tokenizer
        if not source_tokenizer:
            self.source_tokenizer = SpaceTokenizer()
        self.target_tokenizer = target_tokenizer
        if not target_tokenizer:
            self.target_tokenizer = SpaceTokenizer()

        def ngrams(n, tokenizer, page):
            result = []
            text = page.text.replace('\n', '')
            for sentence in self.sentence_splitter.sentences_from_text(text):
                if not sentence.strip():
                    continue
                # if '\n' in sentence:
                #     print repr(sentence)
                assert '\n' not in sentence, sentence
                words = tokenizer.process(sentence).strip().split()
                result += [" ".join(words[i:i + n])
                           for i in range(max(len(words) - n + 1, 1))]
            return result

        def tokens(n, tokenizer, page):
            # 180/1grams
            # words = page.html.split()
            words = filter(None, re.split("[^0-9a-zA-Z]", page.text))
            return [" ".join(words[i:i + n])
                    for i in range(max(len(words) - n + 1, 1))]

        def chars(n, tokenizer, page):
            s = "".join(page.text.split())
            return [" ".join(s[i:i + n])
                    for i in range(max(len(s) - n + 1, 1))]

        def html_tokens(n, tokenizer, page):
            # 153/trigrams
            words = page.html.split()
            return [" ".join(words[i:i + n])
                    for i in range(max(len(words) - n + 1, 1))]

        if level == SimhashDistance.TOKEN:
            self.source_features = partial(tokens, n, self.source_tokenizer)
            self.target_features = partial(tokens, n, self.target_tokenizer)
        elif level == SimhashDistance.CHAR:  # fixed: was CHARS, which is not defined
            self.source_features = partial(chars, n, self.source_tokenizer)
            self.target_features = partial(chars, n, self.target_tokenizer)
        # self.source_features = partial(ngrams, n, self.source_tokenizer)
        # self.target_features = partial(ngrams, n, self.target_tokenizer)
        # print self.source_features("How are you?\nI am fine. Thanks.")

    def _words_from_text(self, text, tokenizer):
        words = set()
        # fixed: the tokenizer instance is not callable, so use sentences_from_text
        for line in self.sentence_splitter.sentences_from_text(text):
            for w in tokenizer.process(line).split("\n"):
                words.add(w)
        return words

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.s_hashes.append(Simhash(self.source_features(page)))
        for url, page in target_corpus.iteritems():
            self.t_hashes.append(Simhash(self.target_features(page)))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return -self.s_hashes[s_idx].distance(self.t_hashes[t_idx])

    def get_features(self, text):
        width = 3
        # fixed: self.tokenizer does not exist; the sentence splitter is used here
        text = self.sentence_splitter.sentences_from_text(text)
        return [text[i:i + width]
                for i in range(max(len(text) - width + 1, 1))]
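# Note on the Simhash usage above (an assumption consistent with the common
# "simhash" Python package): fingerprints are compared by Hamming distance,
# so _score_pair negates the distance to turn "smaller distance" into
# "higher score".
# from simhash import Simhash
# a = Simhash(["the quick", "quick brown", "brown fox"])
# b = Simhash(["the quick", "quick brown", "brown dog"])
# score = -a.distance(b)  # closer to 0 means more similar pages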
ref_list = " ".join(ref) line = sent_detector.tokenize(ref_list.strip()) author_name= [] year_of_pub= [] paper_name=[] journal_name=[] year_found = False req_idx = 1 for i in Reference: line = sent_detector.tokenize(i.strip()) line2 = sent_detector.sentences_from_text(i.strip() ) References.append(line) line3 = [x for x in line if x != "."] if len(line3)==4: j=0 author_name.append(line3[j]) year_of_pub.append(line3[j+1]) paper_name.append(line3[j+2]) journal_name.append(line3[j+3]) else: name_str = [] regex = re.compile("(\d{4})") idx=0 req_idx = 1 while(idx<len(line3)): result = re.findall(regex,line3[idx])
class kbTokenizer:
    '''Tokenizer used to pre-process the KB dataset for generating Word2Vec
    models from word2vecModels/*.w2v.
    '''

    def __init__(self, bLowerCase=True):
        self.bLowerCase = bLowerCase
        self.oPunktSentTokenizer = PunktSentenceTokenizer()
        self.sNonTokenChars = (u"[‘’“”…”’“–«»\,‘\]\[;:\-\"'\?!¡¢∞§¶•ª≠∑´®†¨^π"
                               "ƒ©˙∆˚¬≈√∫~⁄™‹›fifl‡°·±—‚„‰∏”`◊ˆ~¯˘¿÷\*\(\)<>="
                               "\+#^\\\/_]+")
        self.reNonTokenChars_start = \
            re.compile(u"(\A|\s)%s" % self.sNonTokenChars, re.U)
        self.reNonTokenChars_end = \
            re.compile(u"%s(\.?(\s|\Z))" % self.sNonTokenChars, re.U)
        self.reWhitespace = re.compile("\W+", re.U)

    def removeNonTokenChars(self, sString):
        sString = re.sub(self.reNonTokenChars_start, '\g<1>', sString)
        return re.sub(self.reNonTokenChars_end, '\g<1>', sString)

    def tokenizeSentence(self, sString):
        aTokens = None
        if self.bLowerCase:
            aTokens = self.reWhitespace.split(
                self.removeNonTokenChars(sString.lower()))
        else:
            aTokens = self.reWhitespace.split(
                self.removeNonTokenChars(sString))
        # split() gives empty first/last elements if there were separators at
        # the start/end of the string (so whitespace, in this case).
        # We correct for that.
        iStart = 1 if aTokens[0] == '' else 0
        if aTokens[-1] == '':
            return aTokens[iStart:-1]
        else:
            return aTokens[iStart:]

    def tokenizeText(self, sText):
        '''Input is a utf8 text.
        Output is a list of lists of tokens, one list of tokens per sentence.
        '''
        aTextTokens = []
        for sSentence in self.oPunktSentTokenizer.sentences_from_text(sText):
            aTokens = self.tokenizeSentence(sSentence)
            if len(aTokens) > 0:
                aTextTokens.append(aTokens)
        return aTextTokens

    def tokenizeFile(self, sFile):
        try:
            fhInput = codecs.open(sFile, mode='r', encoding='utf8')
        except IOError, oError:
            print >> sys.stderr, "[ERROR] Error while opening '%s'" % sFile
            print >> sys.stderr, "[ERROR] '%s'" % oError
            exit(1)
        sText = fhInput.read()
        fhInput.close()
        return self.tokenizeText(sText)
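# Hypothetical quick check of kbTokenizer (Python 2, matching the class above):
# sentence boundaries come from Punkt, tokens from the regex-based splits.
# oTok = kbTokenizer(bLowerCase=True)
# print oTok.tokenizeText(u"First sentence here. And a second one!")
# -> [[u'first', u'sentence', u'here'], [u'and', u'a', u'second', u'one']]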
def annotate_text(raw_data_folder, labels_data_folder, file_to_write,
                  max_sent_len=35, improved_sent_splitting=True,
                  training=True):
    """
    Creates a token-level input file for the span identification task
    and adds sentence IDs to the tokens.
    """
    # max_sent_len = -1 ==> no sentence splitting
    if max_sent_len == -1:
        # the corresponding if-block can handle this
        improved_sent_splitting = True
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    if improved_sent_splitting:
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
                                        'ms', 'rep', 'u.s', 'feb', 'sen'])
        splitter = PunktSentenceTokenizer(punkt_param)
        splitter.PUNCTUATION = tuple(';:,.!?"')
    output_table = []
    file_counter = 0
    sent_no_total = 0

    print("Total number of files - {}".format(len(os.listdir(raw_data_folder))))

    # Reading all the files from the raw text directory
    article_file_names = [file_name for file_name in os.listdir(raw_data_folder)
                          if file_name.endswith(".txt")]
    article_file_names.sort()

    for file_name in article_file_names:
        if training:
            label_file_name = file_name.replace(".txt", ".task2-TC.labels")
            print("raw_article: {}\tlabel_file: {}".format(file_name,
                                                           label_file_name))
            # Read the labels file with 4 columns of format
            # doc_id : label_of_span : idx_span_begin : idx_span_end
            with open(os.path.join(labels_data_folder, label_file_name),
                      encoding="utf-8") as file:
                rows = file.readlines()
            rows = [row.strip().split("\t") for row in rows
                    if len(row.split("\t")) == 4]

            # Saving mappings char_idx->labels into the dictionary
            char_idx2label = dict()
            for row in rows:
                label = row[1]
                idx_from = int(row[2])
                idx_to = int(row[3])
                for idx in range(idx_from, idx_to):
                    if idx not in char_idx2label.keys():
                        char_idx2label[idx] = []
                    char_idx2label[idx].append(label)
        else:
            print("raw_article: " + file_name)

        # Read the article and process the text
        with open(os.path.join(raw_data_folder, file_name),
                  encoding="utf-8") as file:
            file_text = file.readlines()
        # Keep linebreaks for better sentence splitting
        file_text = ''.join([line for line in file_text])

        # Normalizing punctuation marks to help the tokenizer.
        file_text = file_text.replace('“', '"').replace('”', '"')
        file_text = file_text.replace("’", "'").replace("‘", "'")

        sentences = []
        if improved_sent_splitting:
            # Line breaks -> helps with headlines
            paragraphs = file_text.split('\n')
            for para in paragraphs:
                para = para.strip()
                sentences_raw = splitter.sentences_from_text(para)
                for sent in sentences_raw:
                    sent = sent.strip()
                    tokens = tokenizer(sent)
                    if len(tokens) <= max_sent_len or max_sent_len == -1:
                        # No need to split the sentence!
                        if len(sent) == 0:
                            # Can happen when paragraphs are separated by
                            # several line breaks.
                            continue
                        sentences.append(sent)
                        continue

                    # Try splitting based on quotes.
                    quote_fragments, all_ok = punct_based_split_sent(
                        tokenizer, sent, max_sent_len, '"')
                    if all_ok:
                        sentences += quote_fragments
                        continue

                    # Other punctuation for splitting: ; :
                    for quote_frag in quote_fragments:
                        semicolon_fragments, all_ok = \
                            punct_based_split_sent(tokenizer, quote_frag,
                                                   max_sent_len, ';')
                        if all_ok:
                            sentences += semicolon_fragments
                            continue

                        for semicolon_frag in semicolon_fragments:
                            colon_fragments, all_ok = \
                                punct_based_split_sent(tokenizer, semicolon_frag,
                                                       max_sent_len, ':')
                            if all_ok:
                                sentences += colon_fragments
                                continue

                            # Commas:
                            for col_frag in colon_fragments:
                                comma_fragments, all_ok = \
                                    punct_based_split_sent(tokenizer, col_frag,
                                                           max_sent_len, ',')
                                if all_ok:
                                    sentences += comma_fragments
                                    continue

                                # Last resort:
                                # Split after max_sent_len tokens
                                for comma_frag in comma_fragments:
                                    sentences += forcefully_split_sent(
                                        tokenizer, comma_frag, max_sent_len)
        else:
            # Cut long sentences into fragments that are (up to)
            # max_sent_len characters long
            # (the last fragment in a sentence might be shorter)
            file_text = file_text.replace('\n', ' ')
            sentences_raw = sent_tokenize(file_text)
            for sent in sentences_raw:
                sentences += forcefully_split_sent(tokenizer, sent,
                                                   max_sent_len)

        i = 0
        for sent in sentences:
            sent = sent.strip()
            i = file_text.find(sent, i)
            max_idx = i + len(sent)
            if sent == '':
                continue
            if improved_sent_splitting:
                if len(sent.strip()) < 2:  # single char noise
                    continue
            sent_no_total += 1
            for token in tokenizer(sent):
                token = str(token)
                token_idx = file_text.find(token, i, max_idx)
                i = token_idx + len(token)
                output = [file_name.replace("article", "").replace(".txt", ""),
                          str(sent_no_total),
                          str(token_idx),
                          str(i),
                          token]
                if training:
                    # Check the label of the corresponding char_idx
                    label = char_idx2label.get(token_idx, ['None'])
                    output.append("|".join(label))
                output_table.append(output)

        file_counter += 1
        print("Finished {} files\n".format(file_counter))

    with open(file_to_write, 'w', encoding="utf-8") as f:
        f.write('# max_sent_len=' + str(max_sent_len) +
                ', improved_sent_splitting=' +
                str(improved_sent_splitting) + '\n')
        f.write('document_id\tsent_id\ttoken_start\ttoken_end\ttoken')
        if training:
            f.write('\tlabel')
        f.write('\n')
        for row in output_table:
            f.write('\t'.join(row) + "\n")
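# For reference, annotate_text writes a tab-separated table whose header comes
# directly from the code above; the data rows shown here are an invented
# illustration of the shape only.
#
# # max_sent_len=35, improved_sent_splitting=True
# document_id	sent_id	token_start	token_end	token	label
# 123456789	1	0	4	Some	None
# 123456789	1	5	13	headline	Slogans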