class CoreNLP:
    """Thin client for a remote Stanford CoreNLP server.

    Wraps a dependency parser pointed at the URL named by the
    ``CORENLP_SERVER`` environment variable, plus a Punkt sentence
    tokenizer used to split input text before parsing.
    """

    def __init__(self):
        # The parser talks to the server whose URL comes from the environment.
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()

    @staticmethod
    def corenlp_server():
        """Return the CoreNLP server URL from the environment (None if unset)."""
        return getenv('CORENLP_SERVER')

    def dep_parse(self, text: str, conll_version=10) -> str:
        """Get a CoreNLP depparse,lemma

        Splits *text* into sentences, parses each one, and joins the
        per-sentence CoNLL outputs with newlines.
        """

        def get_conll(sentence):
            # raw_parse yields exactly one graph per sentence; the
            # single-element unpack asserts that.
            graph, = self.parser.raw_parse(sentence)
            return graph.to_conll(conll_version)  # xrenner requires conll10

        sentences = self.sentence_tokenizer.sentences_from_text(text)
        return '\n'.join(get_conll(sentence) for sentence in sentences)
def sentence_split(input_text):
    """Split SGML-style document text into cleaned sentences.

    Wraps *input_text* in a ``<root>`` element, pulls paragraph text out of
    each ``DOC`` (headlines and ``<P>`` blocks for ``type="story"`` docs, the
    whole ``<TEXT>`` for ``type="multi"``), then sentence-splits every
    paragraph with Punkt.  Each sentence is passed through the module-level
    ``replace`` regex (presumably whitespace normalization — defined
    elsewhere in this file) and stripped.
    """
    soup = BeautifulSoup("<root>" + input_text + "</root>", "xml")

    # Collect raw paragraph strings first, sentence-split afterwards.
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        doc_type = doc['type']
        if doc_type == 'story':
            for headline in doc('HEADLINE'):
                paragraphs.append(headline.contents[0])
            for p_block in doc.find('TEXT').findAll('P'):
                paragraphs.append(p_block.contents[0])
        elif doc_type == 'multi':
            paragraphs.append(doc.find('TEXT').contents[0])

    punkt = PunktSentenceTokenizer()
    sentences = []
    for paragraph in paragraphs:
        split = punkt.sentences_from_text(paragraph, realign_boundaries=True)
        sentences.extend(replace.sub(' ', sentence).strip() for sentence in split)
    return sentences
def sent_tokenize(text):
    """Sentence-tokenize *text* using a pre-trained Punkt model.

    Loads the pickled Punkt parameters shipped next to this module,
    clears its learned sentence-starter set, augments its abbreviation
    set with a domain-specific list plus every single ASCII letter
    (both cases), and returns the resulting sentence list.
    """
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # this is acceptable only because the model ships with the package.
    # Never point model_path at untrusted data.
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    # Disable learned sentence starters; rely purely on abbreviations.
    punkt_param.sent_starters = {}

    # Deduplicated abbreviation set (the original list repeated 'dr',
    # 'vs', 'tp', 'ths', 'ts' — harmless in a set, but noisy to read).
    abbrev_types = {
        'g.m.t', 'e.g', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'k.l', 'a.w.a.k.e', 't', 'a.i', '</i',
        'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'd.c', 've…', 'f.t',
        'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f',
        'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…',
        'ph', 'j.k', 'e.l', 'o.t', 's.a',
    }
    # Single letters (e.g. initials "J. Smith") should not end a sentence.
    abbrev_types.update(string.ascii_uppercase)
    abbrev_types.update(string.ascii_lowercase)
    punkt_param.abbrev_types.update(abbrev_types)

    tokenizer = PunktSentenceTokenizer(punkt_param)
    return tokenizer.sentences_from_text(text)