def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work

    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set, json.load(verbs).values()),
                       set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
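# A minimal sketch of how `all_verbs` is built above, assuming the verbs file
# is a JSON object that maps each lexical unit (lemma) to its surface forms.
# The mapping below is hypothetical sample data.
import json
from itertools import imap
from StringIO import StringIO

verbs_file = StringIO(json.dumps({
    'play': ['play', 'plays', 'played', 'playing'],
    'win': ['win', 'wins', 'won', 'winning'],
    'be': ['be', 'is', 'was', 'were'],
}))

all_verbs = reduce(lambda x, y: x.union(y),
                   imap(set, json.load(verbs_file).values()),
                   set())
# the bare auxiliaries are too frequent and too ambiguous to be useful LUs
all_verbs.discard('be')
all_verbs.discard('have')
print sorted(all_verbs)
# ['is', 'play', 'played', 'playing', 'plays', 'was', 'were', 'win', 'winning', 'wins', 'won']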
class ManyToManyExtractor(SentenceExtractor):
    """ n2n extraction strategy: many sentences per many LUs
        N.B.: the same sentence is likely to appear multiple times
    """
    splitter = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)

    def extract_from_item(self, item):
        extracted = []
        text = item.get(self.document_key)
        url = item.get('url')
        if not text or not url:
            logger.debug('skipping item without url or bio')
            return
        elif isinstance(text, list):
            text = '\n'.join(text)

        sentences = self.splitter.split(text)
        for sentence in sentences:
            if not sentence.strip():
                continue

            tagged = self.tagger.tag_one(sentence, skip_unknown=False)
            sentence_verbs = {token.lower() for token, pos, lemma in tagged
                              if pos.startswith('V')}

            for lemma, match_tokens in self.lemma_to_token.iteritems():
                for match in match_tokens:
                    if match.lower() in sentence_verbs:
                        extracted.append({
                            'url': url,
                            'lu': lemma,
                            'text': sentence,
                            'tagged': tagged,
                        })

        if extracted:
            logger.debug("%d sentences extracted", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted")
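# Standalone sketch of the n2n matching loop above, using a hypothetical
# lemma_to_token mapping and a hand-tagged sentence (TreeTagger-style triples).
# A sentence containing two matching verbs is emitted once per LU.
lemma_to_token = {'play': {'played', 'plays'}, 'win': {'won', 'wins'}}

tagged = [('He', 'PP', 'he'), ('played', 'VVD', 'play'), ('and', 'CC', 'and'),
          ('won', 'VVD', 'win'), ('twice', 'RB', 'twice')]
sentence_verbs = {token.lower() for token, pos, lemma in tagged if pos.startswith('V')}

matched_lus = []
for lemma, match_tokens in lemma_to_token.iteritems():
    for match in match_tokens:
        if match.lower() in sentence_verbs:
            matched_lus.append(lemma)

print sorted(matched_lus)  # ['play', 'win']: the same sentence appears under both LUs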
class OneToOneExtractor(SentenceExtractor):
    """ 121 extraction strategy: 1 sentence per 1 LU
        N.B.: the same sentence will appear only once
        the sentence is assigned to a RANDOM LU
    """
    splitter = None
    all_verb_tokens = None
    token_to_lemma = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)

        self.all_verb_tokens = set()
        self.token_to_lemma = {}
        for lemma, match_tokens in self.lemma_to_token.iteritems():
            for match_token in match_tokens:
                self.all_verb_tokens.add(match_token.lower())
                self.token_to_lemma[match_token.lower()] = lemma
        logger.debug("All match tokens: %s" % self.all_verb_tokens)

    def extract_from_item(self, item):
        extracted = []
        url = item.get('url')
        if not url:
            logger.warn('skipping item without url')
            return

        document = item.get(self.document_key)
        if not document:
            logger.debug('skipping item without document')
            return
        elif isinstance(document, list):
            document = '\n'.join(document)

        sentences = self.splitter.split(document)
        for sentence in sentences:
            if not sentence.strip():
                continue

            tagged = self.tagger.tag_one(sentence, skip_unknown=False)
            sentence_verbs = [token for token, pos, lemma in tagged
                              if pos.startswith('V')]

            matched = []
            for token in self.all_verb_tokens:
                if token in sentence_verbs:
                    matched.append(token)

            if matched:
                assigned_token = choice(matched)
                assigned_lu = self.token_to_lemma[assigned_token]
                extracted.append({
                    'lu': assigned_lu,
                    'text': sentence,
                    'tagged': tagged,
                    'url': url,
                })

        if extracted:
            logger.debug("%d sentences extracted", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted")
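# Standalone sketch of the 1:1 assignment above, with the same hypothetical
# mapping as in the n2n sketch: the token index is inverted once, and a
# sentence with several matching verbs is assigned to a single, random LU.
from random import choice

lemma_to_token = {'play': {'played', 'plays'}, 'win': {'won', 'wins'}}

token_to_lemma = {}
for lemma, match_tokens in lemma_to_token.iteritems():
    for match_token in match_tokens:
        token_to_lemma[match_token.lower()] = lemma
all_verb_tokens = set(token_to_lemma)

sentence_verbs = ['played', 'won']  # verbs the POS tagger found in one sentence
matched = [t for t in all_verb_tokens if t in sentence_verbs]
print token_to_lemma[choice(matched)]  # 'play' or 'win', never both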
class GrammarExtractor(SentenceExtractor):
    """ Grammar-based extraction strategy: pick sentences that comply
        with a pre-defined grammar.
    """
    splitter = None
    parser = None
    # Grammars rely on POS labels, which are language-dependent
    grammars = {
        'en': r"""
            NOPH: {<PDT>?<DT|PP.*|>?<CD>?<JJ.*|VVN>*<N.+|FW>+<CC>?}
            CHUNK: {<NOPH>+<MD>?<V.+>+<IN|TO>?<NOPH>+}
        """,
        'it': r"""
            SN: {<PRO.*|DET.*|>?<ADJ>*<NUM>?<NOM|NPR>+<NUM>?<ADJ|VER:pper>*}
            CHUNK: {<SN><VER.*>+<SN>}
        """,
    }

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)

        grammar = self.grammars.get(self.language)
        if grammar:
            self.parser = RegexpParser(grammar)
        else:
            raise ValueError(
                "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s"
                % (self.language, self.grammars.keys()))

        for lemma, match_tokens in self.lemma_to_token.iteritems():
            self.lemma_to_token[lemma] = set(match.lower() for match in match_tokens)

    def extract_from_item(self, item):
        extracted = []
        url = item.get('url')
        if not url:
            logger.warn('skipping item without url')
            return

        document = item.get(self.document_key)
        if not document:
            return
        elif isinstance(document, list):
            document = '\n'.join(document)

        # Sentence splitting
        sentences = self.splitter.split(document)
        for sentence in sentences:
            if not sentence.strip():
                continue

            tagged = [(token, pos) for token, pos, lemma in self.tagger.tag_one(sentence)]
            # Parsing via grammar
            parsed = self.parser.parse(tagged)
            # Loop over sub-sentences that match the grammar
            for grammar_match in parsed.subtrees(lambda t: t.label() == 'CHUNK'):
                logger.debug("Grammar match: '%s'" % grammar_match)
                # Look up the LU
                for token, pos in grammar_match.leaves():
                    # Restrict match to sub-sentence verbs only
                    if pos.startswith('V'):
                        for lemma, match_tokens in self.lemma_to_token.iteritems():
                            if token.lower() in match_tokens:
                                # Return joined chunks only
                                # TODO test with full sentence as well
                                # TODO re-constitute original text (now join on space)
                                text = ' '.join(leaf[0] for leaf in grammar_match.leaves())
                                logger.debug("Sentence token '%s' is in matches %s" % (token, match_tokens))
                                logger.debug("Extracted sentence: '%s'" % text)
                                extracted.append({
                                    'lu': lemma,
                                    'text': text,
                                    'tagged': tagged,
                                    'url': url,
                                })

        if extracted:
            logger.debug("%d sentences extracted. Removing the full text from the item ...", len(extracted))
            item.pop(self.document_key)
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")
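# A minimal sketch of the English grammar above, applied with nltk's
# RegexpParser to a hand-tagged sentence (TreeTagger-style tags; tokens and
# tags below are hypothetical sample data).
from nltk import RegexpParser

grammar = r"""
    NOPH: {<PDT>?<DT|PP.*|>?<CD>?<JJ.*|VVN>*<N.+|FW>+<CC>?}
    CHUNK: {<NOPH>+<MD>?<V.+>+<IN|TO>?<NOPH>+}
"""
chunker = RegexpParser(grammar)

tagged = [('William', 'NP'), ('played', 'VVD'), ('football', 'NN'),
          ('in', 'IN'), ('1905', 'CD'), ('.', 'SENT')]
parsed = chunker.parse(tagged)
for chunk in parsed.subtrees(lambda t: t.label() == 'CHUNK'):
    print ' '.join(leaf[0] for leaf in chunk.leaves())  # William played football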
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
        contains only one LU
    """
    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(
            path_to_jar='dev/stanford-corenlp-3.6.0.jar',
            path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
            java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)
                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug('More than one matching verb found in sentence %s: %s',
                                 text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []

        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences, iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
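# Standalone sketch of the sub-sentence / terminal lookup above, applied to a
# hand-written constituency tree (the bracketing below is hypothetical).
from nltk import Tree

def find_sub_sentences(tree):
    # mirrors SyntacticExtractor.find_sub_sentences
    if not isinstance(tree, Tree):
        return []
    s = reduce(lambda x, y: x + y, map(find_sub_sentences, iter(tree)), [])
    return (s or [tree]) if tree.label() == 'S' else s

def find_terminals(tree, label=None):
    # mirrors SyntacticExtractor.find_terminals
    if len(tree) == 1 and not isinstance(tree[0], Tree):
        if label is None or tree.label().startswith(label):
            yield (tree.label(), tree[0])
    else:
        for child in tree:
            for each in find_terminals(child, label):
                yield each

root = Tree.fromstring(
    '(ROOT (S (S (NP (PRP he)) (VP (VBD played) (NP (NN football)))) '
    '(CC and) (S (NP (PRP he)) (VP (VBD won)))))')

for sub in find_sub_sentences(root):
    text = ' '.join(chunk for _, chunk in find_terminals(sub))
    verbs = set(chunk for _, chunk in find_terminals(sub, 'V'))
    print text, '->', sorted(verbs)
# he played football -> ['played']
# he won -> ['won']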