def pull(self, dest, download):
    """Fetch the W00 dataset and convert it into def/nodef TSV files.

    Args:
        dest: destination directory for the archive, extracted sources,
            and the generated TSV output files.
        download: when True, download and extract the zip archive first.
    """
    print('Pulling from w00 dataset...')
    if download:
        f_path = os.path.join(dest, 'w00.zip')
        # Fix: close the HTTP response as well as the local file
        # (the original left the urlopen response dangling).
        with urllib.request.urlopen(self._LINK) as response, \
                open(f_path, 'wb') as f_out:
            f_out.write(response.read())
        with zipfile.ZipFile(f_path) as zipf:
            zipf.extractall(dest)
    # Skip conversion when the TSV outputs were produced by a prior run.
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    source_word = os.path.join(dest, self._SOURCE_WORD)
    source_meta = os.path.join(dest, self._SOURCE_META)
    assert os.path.exists(source_word)
    assert os.path.exists(source_meta)
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    # Fix: the original opened both source files without ever closing them;
    # context managers guarantee deterministic cleanup.
    with open(source_word) as words, open(source_meta) as metas:
        for line, meta in zip(words, metas):
            line = line.strip()
            if not line:
                continue
            line = ' '.join(util.tokenize(line))
            # The meta file flags definitional sentences with a leading '1'.
            def_flag = 1 if meta.startswith('1') else 0
            if def_flag == 1:
                util.save_output(f_out_def_path, line, def_flag, self.KEY)
            else:
                util.save_output(f_out_nodef_path, line, def_flag, self.KEY)
    print('\tDONE\n')
    return
def pull(self, dest, download):
    """Fetch the MS Research dataset and split it into def/nodef TSV files.

    Each source line has the form ``DEF/phrase`` or ``NODEF/phrase``;
    the prefix decides which output file the tokenized phrase goes to.

    Args:
        dest: destination directory for the raw text file and TSV outputs.
        download: when True, download the raw text file first.
    """
    print('Pulling from msresearch dataset...')
    f_path = os.path.join(dest, 'msresearch.txt')
    if download:
        # Fix: close the HTTP response as well as the local file
        # (the original left the urlopen response dangling).
        with urllib.request.urlopen(self._LINK) as response, \
                open(f_path, 'wb') as f_out:
            f_out.write(response.read())
    # Skip conversion when the TSV outputs were produced by a prior run.
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    # Fix: the original never closed the source file handle.
    with open(f_path) as source:
        for line in source:
            line = line.strip()
            if not line:
                continue
            # Split only on the first '/' — the phrase itself may contain one.
            is_def, phrase = line.split('/', 1)
            def_flag = is_def == 'DEF'
            _def = 1 if def_flag else 0
            f_out_path = f_out_nodef_path
            topic, pos = self._extract_topic_pos(phrase)
            if _def:
                f_out_path = f_out_def_path
            phrase = ' '.join(util.tokenize(phrase))
            util.save_output(f_out_path, phrase, _def, self.KEY, topic, pos)
    print('\tDONE\n')
    return
def pull(self, dest, download):
    """Fetch the WCL dataset and convert it into def/nodef TSV files.

    The WCL sources alternate a template line (containing the ``TARGET``
    placeholder) with a ``subject: ...`` line; pairs are recombined by
    substituting the subject into the preceding template.

    Args:
        dest: destination directory for the archive and TSV outputs.
        download: when True, download and extract the tar.gz archive first.
    """
    print('Pulling from wcl dataset...')
    if download:
        f_path = os.path.join(dest, 'wcl.tar.gz')
        # Fix: close the HTTP response as well as the local file
        # (the original left the urlopen response dangling).
        with urllib.request.urlopen(self._LINK) as response, \
                open(f_path, 'wb') as f_out:
            f_out.write(response.read())
        with tarfile.open(f_path, 'r:gz') as targz:
            targz.extractall(dest)
    # Skip conversion when the TSV outputs were produced by a prior run.
    if util.tsv_already_exist(dest, [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
        return
    source_uwak = os.path.join(dest, self._SOURCE_UKWAC)
    source_good = os.path.join(dest, self._SOURCE_WIKI_GOOD)
    source_bad = os.path.join(dest, self._SOURCE_WIKI_BAD)
    # Each source carries a flag: True = definitional, False = not.
    sources = [(source_uwak, True), (source_good, True), (source_bad, False)]
    for source, _ in sources:
        assert os.path.exists(source)
    f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
    f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
    for source, _def in sources:
        prevLine = ''
        # Fix: the original never closed the per-source file handle.
        with open(source) as source_file:
            for i, line in enumerate(source_file):
                line = line.replace('\t', '')
                line = line.strip('! #\n')
                if not line:
                    continue
                # Even lines hold the TARGET template; odd lines the subject.
                if i % 2 == 0:
                    prevLine = line
                    continue
                subject, _ = line.split(':', maxsplit=1)
                phrase = prevLine.replace('TARGET', subject)
                phrase = ' '.join(util.tokenize(phrase))
                is_def = 1 if _def else 0
                if is_def == 1:
                    pos = util.topic_position(subject, phrase)
                    util.save_output(f_out_def_path, phrase, is_def,
                                     self.KEY, topic=subject, topic_pos=pos)
                else:
                    util.save_output(f_out_nodef_path, phrase, is_def,
                                     self.KEY, topic=subject)
    print('\tDONE\n')
    return
def pull(self, dest, download):
    """Fetch dictionary pages and extract topic/definition pairs via WCL.

    Args:
        dest: destination directory for the downloaded pages and output.
        download: when True, save the pages locally before extraction.
    """
    print('Pulling for dictionary dataset...')
    self.wcl_process = util.start_wcl_process()
    self.dest = dest
    folder_path = os.path.join(dest, self._PAGE_FOLDER)
    # Fix: exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.makedirs() pair; os.path.join replaces manual
    # '/'-concatenation.
    os.makedirs(folder_path, exist_ok=True)
    if download:
        self._save_locally(folder_path)
    # Skip extraction when the output was produced by a prior run.
    if util.tsv_already_exist(dest, self._WCL_OUT_FILE):
        return
    self._extract_topics_definitions_from(folder_path)
    return
def pull(self, dest, download):
    """Fetch diffbetween pages and extract the dataset from them.

    Args:
        dest: destination directory for the downloaded pages and output.
        download: when True, download the pages locally before extraction.
    """
    print('Pulling from diffbetween dataset...')
    self.dest = dest
    self.f_out_path = os.path.join(dest, self._OUT_FILES[0])
    folder_path = os.path.join(dest, self._PAGE_FOLDER)
    # Fix: exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.makedirs() pair; os.path.join replaces manual
    # '/'-concatenation.
    os.makedirs(folder_path, exist_ok=True)
    if download:
        downloader = Downloader(folder_path)
        downloader.save_locally()
        print('\n')
    # Skip extraction when the outputs were produced by a prior run.
    if util.tsv_already_exist(dest, self._OUT_FILES):
        return
    self._extract_from(folder_path)
    print('\n\tDONE\n')
    return
def pull(self, dest, download):
    """Walk the extracted Wikipedia dump and parse every content file,
    classifying sentences into the def/nodef output files.

    Args:
        dest: destination directory holding the dump and the outputs.
        download: unused here; kept for interface parity with siblings.
    """
    print('Pulling from wikipedia dataset...\n')
    # Nothing to do when the TSV outputs already exist.
    if util.tsv_already_exist(dest, self._OUT_FILES):
        return
    self.dest = dest
    wiki_folder = os.path.join(dest, self._INPUT_FOLDER)
    classifier = TxtClassifier()
    content_folders = os.listdir(wiki_folder)
    total_folders = len(content_folders)
    for folder_idx, folder_name in enumerate(content_folders):
        folder_path = os.path.join(wiki_folder, folder_name)
        file_names = os.listdir(folder_path)
        file_count = len(file_names)
        for file_idx, file_name in enumerate(file_names):
            progress = '{} [{}/{}]'.format(folder_idx, file_idx, file_count)
            util.print_progress('Extracting def/nodef ', progress,
                                total_folders)
            self._parse(os.path.join(folder_path, file_name), classifier)
    return