def classify(self, txt, topics):
    """Yield ``(label, topic, sentence)`` triples for classifiable sentences in *txt*.

    Each sentence (skipping those ``self._ignore`` rejects) is matched against
    candidate topics: the union of the caller-supplied *topics* and the noun
    phrases extracted from the sentence, keeping only candidates shorter than
    five words.  The first alphanumeric candidate that ``self._classify``
    labels produces a yield with that label; if no candidate classifies and
    the sentence looks like a non-definition, a ``'nodef'`` triple is yielded.

    Args:
        txt: raw text to split into sentences and classify.
        topics: iterable of caller-supplied topic strings.

    Yields:
        (label, topic, tokenized_sentence) tuples, where label is whatever
        self._classify returns (e.g. 'def'/'anafora') or 'nodef'.
    """
    for sentence in sent_tokenize(txt):
        if self._ignore(sentence):
            continue
        extracted = NPExtractor(sentence).extract()
        # Merge supplied topics with extracted noun phrases; drop phrases of
        # five or more words — too long to serve as a topic.
        candidates = [t for t in set(topics).union(set(extracted))
                      if len(t.split()) < 5]
        if not candidates:
            continue
        is_classified = False
        for topic in candidates:
            if not topic.isalnum():
                continue
            label = self._classify(sentence, topic)
            if not label:
                continue
            sentence = ' '.join(util.tokenize(sentence))
            yield label, topic, sentence
            is_classified = True
            break
        if not is_classified and self._is_no_def(sentence):
            # Prefer the first candidate; fall back to the second only when
            # the first does not literally occur in the sentence.  (The
            # original code re-assigned candidates[0] in a dead branch —
            # behavior is unchanged, just simplified.)
            if candidates[0] in sentence or len(candidates) == 1:
                topic = candidates[0]
            else:
                topic = candidates[1]
            yield 'nodef', topic, ' '.join(util.tokenize(sentence))
def pull(self, dest, download):
    """Fetch the w00 dataset and convert it to def/nodef TSV output files.

    When *download* is true, downloads ``self._LINK`` into ``dest/w00.zip``
    and extracts it.  Skips conversion entirely if the output TSVs already
    exist.  Otherwise reads the word/meta source files in lockstep and routes
    each non-blank line to the def or nodef output based on the meta flag.

    Args:
        dest: directory for the archive, extracted sources, and TSV outputs.
        download: whether to (re-)download and extract the archive first.
    """
    print('Pulling from w00 dataset...')
    if download:
        f_path = os.path.join(dest, 'w00.zip')
        # Close both the HTTP response and the output file deterministically
        # (the original leaked the urlopen response).
        with urllib.request.urlopen(self._LINK) as resp, \
                open(f_path, 'wb') as f_out:
            f_out.write(resp.read())
        with zipfile.ZipFile(f_path) as zipf:
            zipf.extractall(dest)
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    source_word = os.path.join(dest, self._SOURCE_WORD)
    source_meta = os.path.join(dest, self._SOURCE_META)
    assert os.path.exists(source_word)
    assert os.path.exists(source_meta)
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    # Fix: the original opened both files without ever closing them.
    with open(source_word) as f_word, open(source_meta) as f_meta:
        # zip advances both files together, so a skipped blank word line
        # still consumes its matching meta line.
        for line, meta in zip(f_word, f_meta):
            line = line.strip()
            if not line:
                continue
            line = ' '.join(util.tokenize(line))
            # Meta lines starting with '1' mark definitions.
            def_flag = 1 if meta.startswith('1') else 0
            out_path = f_out_def_path if def_flag == 1 else f_out_nodef_path
            util.save_output(out_path, line, def_flag, self.KEY)
    print('\tDONE\n')
    return
def pull(self, dest, download):
    """Fetch the msresearch dataset and convert it to def/nodef TSV files.

    When *download* is true, downloads ``self._LINK`` into
    ``dest/msresearch.txt``.  Skips conversion if the output TSVs already
    exist.  Each source line has the form ``DEF/phrase`` or ``NODEF/phrase``;
    definitions additionally get their topic and position extracted.

    Args:
        dest: directory for the source text file and the TSV outputs.
        download: whether to (re-)download the source file first.
    """
    print('Pulling from msresearch dataset...')
    f_path = os.path.join(dest, 'msresearch.txt')
    if download:
        # Close both the HTTP response and the output file deterministically
        # (the original leaked the urlopen response).
        with urllib.request.urlopen(self._LINK) as resp, \
                open(f_path, 'wb') as f_out:
            f_out.write(resp.read())
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    # Fix: the original opened the source file without ever closing it.
    with open(f_path) as source:
        for line in source:
            line = line.strip()
            if not line:
                continue
            # Split only on the first '/': the phrase itself may contain '/'.
            is_def, phrase = line.split('/', 1)
            _def = 1 if is_def == 'DEF' else 0
            topic, pos = self._extract_topic_pos(phrase)
            out_path = f_out_def_path if _def else f_out_nodef_path
            phrase = ' '.join(util.tokenize(phrase))
            util.save_output(out_path, phrase, _def, self.KEY, topic, pos)
    print('\tDONE\n')
    return
def _classify(self, sentence, topic):
    """Label *sentence* with respect to *topic*.

    Returns 'anafora' if the sentence is anaphoric, 'def' if it defines the
    topic, otherwise None.
    """
    normalized = ' '.join(util.tokenize(sentence.lower()))
    lowered_topic = topic.lower()
    if self._is_anafora(normalized):
        return 'anafora'
    return 'def' if self._is_def(normalized, lowered_topic) else None
def pull(self, dest, download):
    """Fetch the wcl dataset and convert its three sources to def/nodef TSVs.

    When *download* is true, downloads ``self._LINK`` into ``dest/wcl.tar.gz``
    and extracts it.  Skips conversion if the output TSVs already exist.
    Each source alternates template/annotation line pairs: even lines hold a
    phrase template containing the placeholder ``TARGET``, odd lines hold
    ``subject: ...`` used to fill it in.

    Args:
        dest: directory for the archive, extracted sources, and TSV outputs.
        download: whether to (re-)download and extract the archive first.
    """
    print('Pulling from wcl dataset...')
    if download:
        f_path = os.path.join(dest, 'wcl.tar.gz')
        # Close both the HTTP response and the output file deterministically
        # (the original leaked the urlopen response).
        with urllib.request.urlopen(self._LINK) as resp, \
                open(f_path, 'wb') as f_out:
            f_out.write(resp.read())
        with tarfile.open(f_path, 'r:gz') as targz:
            # NOTE(review): extractall on a downloaded archive is vulnerable
            # to path traversal ("tar slip") for untrusted sources — consider
            # the `filter='data'` argument (Python 3.12+) or member vetting.
            targz.extractall(dest)
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    source_uwak = os.path.join(dest, self._SOURCE_UKWAC)
    source_good = os.path.join(dest, self._SOURCE_WIKI_GOOD)
    source_bad = os.path.join(dest, self._SOURCE_WIKI_BAD)
    sources = [(source_uwak, True), (source_good, True), (source_bad, False)]
    for source, _ in sources:
        assert os.path.exists(source)
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    for source, _def in sources:
        prev_line = ''
        # Fix: the original opened each source without ever closing it.
        with open(source) as f_src:
            for i, line in enumerate(f_src):
                line = line.replace('\t', '')
                line = line.strip('! #\n')
                if not line:
                    continue
                if i % 2 == 0:
                    # Even lines carry the phrase template with TARGET.
                    prev_line = line
                    continue
                # Odd lines carry "subject: ..."; keep only the subject.
                subject, _ = line.split(':', maxsplit=1)
                phrase = prev_line.replace('TARGET', subject)
                phrase = ' '.join(util.tokenize(phrase))
                is_def = 1 if _def else 0
                if is_def == 1:
                    pos = util.topic_position(subject, phrase)
                    util.save_output(f_out_def_path, phrase, is_def,
                                     self.KEY, topic=subject, topic_pos=pos)
                else:
                    util.save_output(f_out_nodef_path, phrase, is_def,
                                     self.KEY, topic=subject)
    print('\tDONE\n')
    return
def _extract_topics_definitions_from(self, article, topics):
    """Return ``{(topic, tokenized_definition)}`` pairs found in *article*.

    For each topic, scans the definitions extracted by ``DifferenceBetween``
    and keeps the first one whose text (lower-cased) has the shape
    "[a|an] <topic> is/are ...".

    Args:
        article: source article passed to DifferenceBetween.
        topics: iterable of topic strings to look for.

    Returns:
        A set of (topic, definition) tuples, at most one per topic.
    """
    result = set()
    definitions = DifferenceBetween(article, topics).extractDefinitions()
    for topic in topics:
        # Fix: the original pattern `^((a)|(an) )?` put the space only after
        # "an", so "a dog is ..." never matched (while "adog is ..." did);
        # it also interpolated the topic unescaped, breaking on topics with
        # regex metacharacters.  Hoist the compile-once pattern out of the
        # inner loop.
        pattern = re.compile(r"^(an? )?{} (is|are).+".format(re.escape(topic)))
        for _def in definitions.values():
            if _def is None:
                continue
            if pattern.match(_def.lower()):
                result.add((topic, ' '.join(util.tokenize(_def))))
                break
    return result