import os
import subprocess

import arrow
import spacy

import utils  # project-local helpers: read_jsonl(_gz), write_jsonl, gzip_file, force_mkdir

# NOTE: read_articles, preprocess_article, load_article, write_input_articles
# and delete_input_articles are assumed to be defined elsewhere in the project.


def tokenize_dataset(root, spacy_model):
    nlp = spacy.load(spacy_model)
    for topic in sorted(os.listdir(root)):
        print('TOPIC:', topic)
        # Accept either a gzipped or a plain JSONL article file per topic.
        if os.path.exists(root / topic / 'articles.jsonl.gz'):
            articles = list(utils.read_jsonl_gz(root / topic / 'articles.jsonl.gz'))
        elif os.path.exists(root / topic / 'articles.jsonl'):
            articles = list(utils.read_jsonl(root / topic / 'articles.jsonl'))
        else:
            continue
        jsonl_out_path = root / topic / 'articles.tokenized.jsonl'
        out_batch = []
        for i, a in enumerate(articles):
            # Rewrite each article as one whitespace-joined, tokenized sentence per line.
            tokenized_doc = ''
            doc = nlp(a['text'])
            for sent in doc.sents:
                tokens = [tok.text for tok in sent if not tok.text.isspace()]
                tokenized_doc += ' '.join(tokens) + '\n'
            a['text'] = tokenized_doc.strip()
            out_batch.append(a)
            if i % 100 == 0:
                # Truncate any stale output file on the first write, then append.
                utils.write_jsonl(out_batch, jsonl_out_path, override=(i == 0))
                out_batch = []
                print(i, 'articles tokenized')
        utils.write_jsonl(out_batch, jsonl_out_path, override=False)
        gz_out_path = root / topic / 'articles.tokenized.jsonl.gz'
        utils.gzip_file(jsonl_out_path, gz_out_path, delete_old=True)
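# The `utils` helpers are used throughout but not defined here. Below is a
# minimal sketch of what the call sites imply (line-delimited JSON; `override`
# truncates, otherwise append). These `_sketch_` functions are assumptions for
# illustration, not the project's real implementations:

import gzip
import json
from pathlib import Path


def _sketch_read_jsonl(path):
    # Yield one parsed JSON object per line.
    with open(path) as f:
        for line in f:
            yield json.loads(line)


def _sketch_read_jsonl_gz(path):
    # Same as above, reading from a gzip-compressed file.
    with gzip.open(path, 'rt') as f:
        for line in f:
            yield json.loads(line)


def _sketch_write_jsonl(items, path, override=False):
    # override=True truncates the file; override=False appends to it.
    mode = 'w' if override else 'a'
    with open(path, mode) as f:
        for item in items:
            f.write(json.dumps(item) + '\n')


def _sketch_gzip_file(inpath, outpath, delete_old=False):
    # Compress inpath to outpath; optionally remove the uncompressed original.
    with open(inpath, 'rb') as fin, gzip.open(outpath, 'wb') as fout:
        fout.write(fin.read())
    if delete_old:
        Path(inpath).unlink()


def _sketch_force_mkdir(path):
    # Ensure the directory exists (create parents as needed).
    Path(path).mkdir(parents=True, exist_ok=True)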
def preprocess_dataset(root, nlp): ftmp = open("log.txt", "w") article_path = root / 'articles.tokenized.jsonl.gz' articles = utils.read_jsonl_gz(article_path) h_output_dir = root / 'demo_time_annotated' out_path = root / 'demo_articles.preprocessed.jsonl' out_batch = [] i = 0 for old_a, timeml_raw in read_articles(articles, h_output_dir): print("old_a: {}\ntimeml_raw: {}\n".format(old_a, timeml_raw), file=ftmp) a = preprocess_article(old_a, timeml_raw, nlp) if a: out_batch.append(a.to_dict()) else: date = arrow.get(old_a['time']).date() print('cannot process:', date, old_a['id']) if i % 100 == 0: print('writing batch,', i, 'articles done') if i == 0: utils.write_jsonl(out_batch, out_path, override=True) else: utils.write_jsonl(out_batch, out_path, override=False) out_batch = [] i += 1 utils.write_jsonl(out_batch, out_path, override=False) gz_path = str(out_path) + '.gz' utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=True)
def preprocess_dataset(root, nlp): for topic in sorted(os.listdir(root)): print("TOPIC:", topic) article_path = root / topic / "articles.tokenized.jsonl.gz" articles = utils.read_jsonl_gz(article_path) h_output_dir = root / topic / "time_annotated" out_path = root / topic / "articles.preprocessed.jsonl" out_batch = [] i = 0 for old_a, timeml_raw in read_articles(articles, h_output_dir): a = preprocess_article(old_a, timeml_raw, nlp) if a: out_batch.append(a.to_dict()) else: date = arrow.get(old_a["time"]).date() print("cannot process:", date, old_a["id"]) if i % 100 == 0: print("writing batch,", i, "articles done") if i == 0: utils.write_jsonl(out_batch, out_path, override=True) else: utils.write_jsonl(out_batch, out_path, override=False) out_batch = [] i += 1 utils.write_jsonl(out_batch, out_path, override=False) gz_path = str(out_path) + ".gz" utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=True)
def preprocess_dataset(root, nlp):
    for topic in sorted(os.listdir(root)):
        if topic == '.DS_Store':
            continue
        print('TOPIC:', topic)
        article_path = root / topic / 'articles.tokenized.jsonl.gz'
        articles = utils.read_jsonl_gz(article_path)
        h_output_dir = root / topic / 'time_annotated'
        out_path = root / topic / 'articles.preprocessed.jsonl'
        out_batch = []
        i = 0
        # Pair each tokenized article with its HeidelTime (TimeML) annotation.
        for old_a, timeml_raw in read_articles(articles, h_output_dir):
            a = preprocess_article(old_a, timeml_raw, nlp)
            if a:
                out_batch.append(a.to_dict())
            else:
                date = arrow.get(old_a['time']).date()
                print('cannot process:', date, old_a['id'])
            if i % 100 == 0:
                print('writing batch,', i, 'articles done')
                # Truncate any stale output file on the first write, then append.
                utils.write_jsonl(out_batch, out_path, override=(i == 0))
                out_batch = []
            i += 1
        utils.write_jsonl(out_batch, out_path, override=False)
        gz_path = str(out_path) + '.gz'
        # Keep only the compressed copy once gzipping succeeds.
        utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=True)
def articles(self):
    # Prefer the uncompressed file if present, otherwise fall back to the gzip.
    path1 = self.path / 'articles.preprocessed.jsonl'
    path2 = self.path / 'articles.preprocessed.jsonl.gz'
    if path1.exists():
        articles = utils.read_jsonl(path1)
    else:
        articles = utils.read_jsonl_gz(path2)
    for a_ in articles:
        a = load_article(a_)
        t = self.normalise_time(a.time)
        if self.start and t < self.start:
            continue
        if self.end and t > self.end:
            # Articles are assumed to be sorted by time, so we can stop early.
            break
        yield a
def time_batches(self):
    # Group consecutive articles that share the same normalised time.
    # Assumes the input is sorted chronologically.
    articles = utils.read_jsonl_gz(self.path / 'articles.preprocessed.jsonl.gz')
    time = None
    batch = []
    for a_ in articles:
        a = load_article(a_)
        a_time = self.normalise_time(a.time)
        if self.start and a_time < self.start:
            continue
        if self.end and a_time > self.end:
            break
        if time and a_time > time:
            # The time changed: emit the finished batch and start a new one.
            yield time, batch
            time = a_time
            batch = [a]
        else:
            batch.append(a)
            time = a_time
    if batch:
        # Emit the final batch; skip the (None, []) case when nothing matched.
        yield time, batch
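# Example usage (a sketch): `articles` and `time_batches` above are written as
# methods of a collection object exposing `path`, `start`, `end` and
# `normalise_time`. The class name `ArticleCollection` below is an assumption
# for illustration, not confirmed by this file:
#
#     collection = ArticleCollection(path=Path('dataset/topic1'),
#                                    start=arrow.get('2011-01-01'),
#                                    end=arrow.get('2011-12-31'))
#     for t, batch in collection.time_batches():
#         print(t, len(batch), 'articles')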
def heideltime_preprocess(dataset_dir, heideltime_path):
    apply_heideltime = heideltime_path / 'apply-heideltime.jar'
    heideltime_config = heideltime_path / 'config.props'
    for topic in sorted(os.listdir(dataset_dir)):
        print('TOPIC:', topic)
        # Materialise the generator: the articles are iterated twice below.
        articles = list(utils.read_jsonl_gz(
            dataset_dir / topic / 'articles.tokenized.jsonl.gz'))
        out_dir = dataset_dir / topic / 'time_annotated'
        utils.force_mkdir(out_dir)
        # Dump one plain-text file per article, run HeidelTime over the
        # directory, then remove the inputs and keep only the annotations.
        write_input_articles(articles, out_dir)
        subprocess.run([
            'java', '-jar', str(apply_heideltime),
            str(heideltime_config), str(out_dir), 'txt'
        ], check=True)  # check=True surfaces HeidelTime failures instead of ignoring them
        delete_input_articles(articles, out_dir)
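# A minimal end-to-end driver, assuming the three steps run in the order the
# file names imply (tokenize -> HeidelTime annotation -> preprocessing). The
# CLI flags and default values below are illustrative, not part of the
# original module:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True, help='dataset root directory')
    parser.add_argument('--heideltime', required=True, help='path to the HeidelTime wrapper')
    parser.add_argument('--spacy-model', default='en_core_web_sm')
    args = parser.parse_args()

    root = Path(args.dataset)
    tokenize_dataset(root, args.spacy_model)
    heideltime_preprocess(root, Path(args.heideltime))
    preprocess_dataset(root, spacy.load(args.spacy_model))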