def __load__data__(self, data):
    if isinstance(data, Category):
        if hasattr(data, "_id"):
            self._id = data._id
        self.name = data.name
        if hasattr(data, "domain"):
            self.domain = data.domain
        self.patterns = data.patterns
        self.sentences = data.sentences
        self.sentence_ids = data.sentence_ids
    else:
        self._id = data["_id"]
        self.name = data["name"]
        self.domain = data.get("domain")
        if self.domain is None:
            self.domain = "none"
        patterns = data.get("patterns")
        if patterns is None:
            patterns = []
        for pat in patterns:
            self.patterns.append(pat)
        self.sentence_ids = data.get("sentences")
        self.sentences = []
        if self.sentence_ids is None:
            self.sentence_ids = []
        for sentence_id in self.sentence_ids:
            sentence_data = Database.find_one(Database.COLLECTIONS.SENTENCE, {"_id": sentence_id})
            s = Sentence()
            s.load_from_dict(sentence_data)
            self.sentences.append(s)
def read_sentence(line, position):
    infos = line.split(";;")
    nb_clause = int(infos[0])
    text = infos[-1].strip()
    propositions = [prop.split("\t") for prop in infos[1:-1]]
    return Sentence(propositions, text, nb_clause, position)
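For reference, a minimal sketch of the `;;`-delimited line format this parser assumes, inferred from the splitting logic; the clause count, propositions, and text below are made up:

# Hypothetical input: clause count ;; tab-separated propositions ;; raw text
line = "2;;ARG0\tloves\tARG1;;ARG0\tsleeps;;Mary loves John and sleeps."
sentence = read_sentence(line, position=0)
# nb_clause == 2
# propositions == [['ARG0', 'loves', 'ARG1'], ['ARG0', 'sleeps']]
# text == "Mary loves John and sleeps."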
def main() -> None:
    in_dir = Path.home() / 'Research/hyperpartisan_news'
    out_dir = Path('../../data/interim/news/train')
    Path.mkdir(out_dir, parents=True, exist_ok=True)
    corpus = in_dir / 'articles-training-bypublisher-20181122.xml'
    metadata = in_dir / 'ground-truth-training-bypublisher-20181122.xml'
    # dev_corpus = in_dir / 'articles-validation-bypublisher-20181122.xml'
    # dev_metadata = in_dir / 'ground-truth-validation-bypublisher-20181122.xml'
    data = parse_xml(corpus, metadata)
    processor = stanza.Pipeline(
        lang='en', processors='tokenize', tokenize_batch_size=4096)
    # processor = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'})
    for part_index, some_docs in tqdm(
            enumerate(partition(data, 100)), total=100, desc='Total'):
        for doc in tqdm(some_docs, desc='Chunk'):
            processed = processor(doc.text)
            doc.sentences = [
                Sentence([token.text for token in stanza_sent.tokens])
                for stanza_sent in processed.sentences]
        with open(out_dir / f'tokenized_{part_index}.pickle', 'wb') as file:
            pickle.dump(some_docs, file, protocol=-1)
def _preprocess_sents(self, raw_sents):
    processed_sents = []
    for s in raw_sents:
        processed_sent = Sentence(text=s, words=word_tokenize(s), position=None)
        processed_sents.append(processed_sent)
    return processed_sents
def sentence_break(self):
    if len(self.texts) == 0:
        return
    if self.config.iobes:
        self.tags = iob_to_iobes(self.tags)
    tokens = [Token(t, g) for t, g in zip(self.texts, self.tags)]
    self.document.add_child(Sentence(tokens=tokens))
    self.texts = []
    self.tags = []
def make_document(token_texts, label):
    """Return Document object initialized with given token texts."""
    tokens = [Token(t) for t in token_texts]
    # We don't have sentence splitting, but the data structure expects
    # Documents to contain Sentences which in turn contain Tokens.
    # Create a dummy sentence containing all document tokens to work
    # around this constraint.
    sentences = [Sentence(tokens=tokens)]
    return Document(target_str=label, sentences=sentences)
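A minimal usage sketch, assuming the Token, Sentence, and Document classes above are importable; the token texts and label value are made up:

# Hypothetical call: one dummy Sentence wraps all document tokens
doc = make_document(['The', 'movie', 'was', 'great', '.'], label='positive')
# doc.sentences[0].tokens holds one Token per input string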
def draw_weighted_alignment_from_file(alignment_path, french_path, english_path,
                                      output_file, sure=False, sentence_id=1):
    """
    Draws an alignment that is weighted according to probs of alignment.
    We use the last column (4th) for the probability of alignment.
    """
    french = _read_sentences_from_file(french_path, sentence_id)
    english = _read_sentences_from_file(english_path, sentence_id)
    # alignments, _ = _read_alignment_from_file(naacl_path, sentence_id, sure)
    alignments, prediction_weights = _read_alignment_from_file(
        alignment_path, sentence_id, sure, weighted=True)
    draw_alignment(alignments, prediction_weights,
                   Sentence(sentence_id, english, french), output_file)
def _preprocess(self, articles):
    sent_splitter = SentenceSplitter()
    processed_articles = []
    for a in articles:
        body_sents = sent_splitter.split_sents(a['text'])
        processed_title = Sentence(text=a['title'], words=word_tokenize(a['title']),
                                   position=-1, is_title=True)
        processed_sents = []
        for position, s in enumerate(body_sents):
            processed_sent = Sentence(text=s, words=word_tokenize(s), position=position)
            processed_sents.append(processed_sent)
        processed_article = Article(processed_title, processed_sents)
        processed_articles.append(processed_article)
    return processed_articles
def draw_alignment_from_file(naacl_path, french_path, english_path, file_name: str,
                             sure=False, sentence_id=1):
    """
    input:
        naacl_path, file with gold alignments
        french_path, french sentences
        english_path, english sentences
        file_name, output figure path
        sure, print sure alignments
        sentence_id, position id of the sentence to print from the corpus
    """
    french = _read_sentences_from_file(french_path, sentence_id)
    english = _read_sentences_from_file(english_path, sentence_id)
    alignments, _ = _read_alignment_from_file(naacl_path, sentence_id, sure, weighted=False)
    draw_alignment(alignments, [], Sentence(sentence_id, english, french), file_name)
def loadSrc(self):
    corpus = self.corpus
    src = self.src
    tokens = []
    token_id = 0
    with open(src, 'r') as fin:
        for line in fin:
            if line == '\n':
                # Blank line: close the current sentence and link tokens.
                tmp_tokens = list(tokens)
                le = len(tmp_tokens)
                for i, token in enumerate(tmp_tokens):
                    h_id = token.h_id
                    h_rel = token.rel
                    if i < le - 1:
                        tmp_tokens[i].add_d_id_rel(i + 1, '@+1@')
                    if i > 0:
                        tmp_tokens[i].add_u_id_rel(i - 1, '@-1@')
                    if h_id != -1:
                        tmp_tokens[h_id].add_d_id_rel(i, h_rel)
                sent = Sentence(tmp_tokens)
                corpus.append(sent)
                tokens = []
                token_id = 0
            else:
                items = line.strip().split()
                t_str = items[0]
                h_id = int(items[1])
                rel = items[2]
                label = items[3]
                token = Token(token_id, t_str, h_id, rel, label)
                tokens.append(token)
                token_id += 1
    return corpus
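For context, a sketch of the whitespace-separated input format loadSrc appears to parse, inferred from the field accesses above; the words, relations, and labels are made up:

# Hypothetical source file contents: one token per line with fields
# <word> <head_index> <relation> <label>, where head indices are 0-based
# within the sentence (-1 for the root) and a blank line ends each sentence.
example_src = """The 1 det O
cat -1 root O

sleeps -1 root O
"""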
def test_sentences(self):
    print('test sentences')
    texts = ["a x d z w w a", "a b c d w x y z", "w w x x a a"]
    sentences = [Sentence(text, self.vocab) for text in texts]
    tensors = [sentence.to_tensor() for sentence in sentences]
    print('vocab')
    print(self.vocab.w2id)
    print('encode')
    tokens = [self.vocab.encode(text) for text in texts]
    print(tokens)
    print('decode')
    decoded = [self.vocab.decode(ids) for ids in tokens]
    print(decoded)
    print('sentences')
    print(sentences)
    print('tensors')
    print(tensors)
    return sentences
def train(**kwargs):
    print(kwargs)
    start = time.time()
    # Update the configuration from command-line arguments
    vis = Visualizer(opt.env)
    opt.parse(kwargs)

    # Load word vectors
    print("Loading word vectors...Please wait.")
    vector = KeyedVectors.load_word2vec_format(
        os.path.join(os.path.dirname(os.path.realpath(opt.train_data_root)), 'vector.txt'))
    print("Successfully loaded word vectors.")

    # Step 1: model
    model = getattr(models, opt.model)(input_size=vector.vector_size + 2,
                                       output_size=opt.class_num)
    if opt.load_model_path:
        model.load(opt.load_model_path)  # preload saved weights
    if opt.use_gpu and t.cuda.is_available():
        model = model.cuda()
    print(f"Structure of {model.model_name}:\n{model}\n")

    # Step 2: data
    train_data = Sentence(root=opt.train_data_root, relations=opt.relations,
                          max_length=opt.max_length, vector=vector, train=True)  # training set
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True)
    val_data = Sentence(opt.train_data_root, opt.relations, opt.max_length,
                        vector, train=False)  # validation set
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True)

    # Step 3: loss function and optimizer
    loss_fn = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(params=model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # Step 4: metrics -- smoothed loss and a confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(opt.class_num)
    previous_loss = 1e100

    # Training loop
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            # Train model parameters on one batch
            input = data
            target = label
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            prediction = model(input)
            loss = loss_fn(prediction, target)
            loss.backward()
            optimizer.step()

            # Update metrics and visualization
            loss_meter.add(loss.item())
            confusion_matrix.add(prediction.data, target.data)
            # if ii % opt.print_freq == opt.print_freq - 1:
            #     vis.plot('train loss', loss_meter.value()[0])
            #     # Enter debug mode if needed
            #     if os.path.exists(opt.debug_file):
            #         import ipdb
            #         ipdb.set_trace()

        cm_value = confusion_matrix.value()
        correct = 0
        for i in range(cm_value.shape[0]):
            correct += cm_value[i][i]
        accuracy = 100. * correct / cm_value.sum()
        vis.plot('train loss', loss_meter.value()[0])
        vis.plot('train accuracy', accuracy)
        if epoch % opt.save_epoch == opt.save_epoch - 1:
            model.save()

        # Compute and visualize metrics on the validation set
        val_lm, val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val loss', val_lm.value()[0])
        vis.plot('val accuracy', val_accuracy)
        print("epoch:{epoch}, lr:{lr}, loss:{loss}\ntrain_cm:\n{train_cm}\nval_cm:\n{val_cm}"
              .format(epoch=epoch, loss=loss_meter.value()[0],
                      val_cm=str(val_cm.value()),
                      train_cm=str(confusion_matrix.value()), lr=lr))

        # If the loss stops decreasing, decay the learning rate
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

    cost = int(time.time()) - int(start)
    print(f"Cost {int(cost / 60)}min{cost % 60}s.")
def main(sessions: Iterable[int],
         in_dir: Path,
         out_dir: Path,
         subsampling_implementation: Optional[str],
         subsampling_threshold: float,
         min_word_freq: int,
         min_sent_len: int,
         fixed_sent_len: int,
         eval_min_freq: int,
         eval_R_thresholds: Iterable[float],
         eval_num_random_samples: int,
         conserve_RAM: bool) -> None:
    Path.mkdir(out_dir, parents=True, exist_ok=True)
    preview = open(out_dir / f'preview.txt', 'w')
    print(f'Reading sessions {sessions}. Writing to {out_dir}')
    print(f'Reading sessions {sessions}. Writing to {out_dir}', file=preview)
    print(f'Min word frequency = {min_word_freq}', file=preview)
    print(f'Min sentence length = {min_sent_len}', file=preview)
    print(f'Faux sentence fixed length = {fixed_sent_len}', file=preview)
    print(f'SGNS subsample implementation = {subsampling_implementation}', file=preview)
    print(f'SGNS subsample threshold = {subsampling_threshold}', file=preview)

    corpus: List[LabeledDoc] = []
    norm_freq: Counter[str] = Counter()
    for session in tqdm(
            sessions, desc='Loading multi-word expression underscored pickles...'):
        for party in ('D', 'R'):
            in_path = in_dir / f'underscored_{party}{session}.txt'
            with open(in_path) as underscored_corpus:
                for line in underscored_corpus:
                    underscored_tokens = line.split()
                    norm_freq.update(underscored_tokens)
                    corpus.append(
                        LabeledDoc(uid=None, title=None, url=None, party=party,
                                   referent=None, text=underscored_tokens,
                                   date=None, sentences=[]))
    cumulative_freq = sum(freq for freq in norm_freq.values())
    print(f'Normalized vocabulary size = {len(norm_freq):,}', file=preview)
    print(f'Number of words = {cumulative_freq:,}', file=preview)

    # Filter counter with MIN_FREQ and count UNK
    UNK_filtered_freq: Counter[str] = Counter()
    for key, val in norm_freq.items():
        if val >= min_word_freq:
            UNK_filtered_freq[key] = val
        else:
            UNK_filtered_freq['[UNK]'] += val
    print(f'Filtered vocabulary size = {len(UNK_filtered_freq):,}', file=preview)
    assert sum(freq for freq in norm_freq.values()) == cumulative_freq

    # Subsampling & filter by min/max sentence length
    keep_prob = subsampling(UNK_filtered_freq, subsampling_implementation,
                            subsampling_threshold)
    ground: Dict[str, GroundedWord] = {}
    final_freq: Counter[str] = Counter()
    for doc in tqdm(corpus, desc='Subsampling frequent words'):
        subsampled_words = []
        for token in doc.text:
            if token in discard:
                continue
            if token not in UNK_filtered_freq:
                token = '[UNK]'
            if random.random() < keep_prob[token]:
                subsampled_words.append(token)
        for faux_sent in faux_sent_tokenize(subsampled_words, fixed_sent_len, min_sent_len):
            final_freq.update(faux_sent)
            doc.sentences.append(Sentence(subsampled_tokens=faux_sent))
            for word in faux_sent:
                if word not in ground:
                    ground[word] = GroundedWord(
                        text=word, deno=None, cono=Counter({doc.party: 1}))
                else:
                    ground[word].cono[doc.party] += 1
        if conserve_RAM:
            doc.text = None
    # End looping documents
    print(f'Final vocabulary size = {len(final_freq):,}', file=preview)
    print(f'Subsampled number of words = '
          f'{sum(freq for freq in final_freq.values()):,}', file=preview)

    # Filter out empty documents
    corpus = [doc for doc in corpus if len(doc.sentences) > 0]

    # Numericalize corpus by word_ids
    word_to_id, id_to_word = build_vocabulary(final_freq)
    for doc in tqdm(corpus, desc='Converting to word ids'):
        for sent in doc.sentences:
            sent.numerical_tokens = [
                word_to_id[token] for token in sent.subsampled_tokens]
            if conserve_RAM:
                sent.subsampled_tokens = None

    # Prepare grounding for intrinsic evaluation
    random_eval_words = set()
    for gw in ground.values():
        gw.majority_cono = gw.cono.most_common(1)[0][0]
        gw.freq = sum(gw.cono.values())
        gw.R_ratio = gw.cono['R'] / gw.freq
        if gw.freq >= eval_min_freq:
            random_eval_words.add(gw.text)
    random_eval_words = random.sample(random_eval_words, eval_num_random_samples)
    with open(out_dir / f'eval_words_random.txt', 'w') as file:
        file.write('\n'.join(random_eval_words))

    for R_threshold in eval_R_thresholds:
        D_threshold = 1 - R_threshold
        partisan_eval_words = []
        for gw in ground.values():
            if gw.freq >= eval_min_freq:
                if gw.R_ratio >= R_threshold or gw.R_ratio <= D_threshold:
                    partisan_eval_words.append(gw)
        print(f'{len(partisan_eval_words)} partisan eval words '
              f'with R_threshold = {R_threshold}', file=preview)

        out_path = out_dir / f'inspect_{R_threshold}_partisan.tsv'
        with open(out_path, 'w') as file:
            print('word\tfreq\tR_ratio', file=file)
            for gw in partisan_eval_words:
                print(gw.text, gw.freq, gw.R_ratio, sep='\t', file=file)

        if len(partisan_eval_words) > 2 * eval_num_random_samples:
            partisan_eval_words = random.sample(
                partisan_eval_words, 2 * eval_num_random_samples)
        else:
            random.shuffle(partisan_eval_words)
        mid = len(partisan_eval_words) // 2
        with open(out_dir / f'{R_threshold}partisan_dev_words.txt', 'w') as file:
            for gw in partisan_eval_words[:mid]:
                print(gw.text, file=file)
        with open(out_dir / f'{R_threshold}partisan_test_words.txt', 'w') as file:
            for gw in partisan_eval_words[mid:]:
                print(gw.text, file=file)

    # Helper for negative sampling
    cumulative_freq = sum(freq**0.75 for freq in final_freq.values())
    negative_sampling_probs: Dict[int, float] = {
        word_to_id[word]: (freq**0.75) / cumulative_freq
        for word, freq in final_freq.items()}
    vocab_size = len(word_to_id)
    negative_sampling_probs: List[float] = [
        # negative_sampling_probs[word_id]  # strict
        negative_sampling_probs.get(word_id, 0)  # prob = 0 if missing vocab
        for word_id in range(vocab_size)]

    random.shuffle(corpus)
    cucumbers = {
        'word_to_id': word_to_id,
        'id_to_word': id_to_word,
        'ground': ground,
        'negative_sampling_probs': negative_sampling_probs,
        'documents': corpus}
    print(f'Writing to {out_dir}')
    with open(out_dir / 'train.pickle', 'wb') as out_file:
        pickle.dump(cucumbers, out_file, protocol=-1)

    # Print out vocabulary & some random sentences for sanity check
    docs = random.sample(corpus, 100)
    preview.write('\n')
    for doc in docs:
        sent = doc.sentences[0]
        if not conserve_RAM:
            # print(sent.tokens, file=preview)
            # print(sent.normalized_tokens, file=preview)
            print(sent.subsampled_tokens, file=preview)
            print(sent.numerical_tokens, file=preview, end='\n\n')
        else:
            print(sent.numerical_tokens, file=preview)
            # print(vars(doc), end='\n\n', file=preview)
    preview.write('\n\nfinal_freq\tword\n')
    for key, val in final_freq.most_common():
        print(f'{val:,}\t{ground[key]}', file=preview)
    preview.close()
    print('All set!')
def inputFromSentence(sentence: Sentence, max_length: int) -> Sentence:
    if len(sentence) > max_length:
        sentence = sentence[:max_length]
    if len(sentence) < max_length:
        sentence.extend([0] * (max_length - len(sentence)))
    return sentence
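A quick sketch of the truncate-or-pad behavior. It uses a plain list of made-up token ids in place of a Sentence, which works here because the function only relies on len, slicing, and extend:

padded = inputFromSentence([5, 9, 2], max_length=5)             # -> [5, 9, 2, 0, 0]
trimmed = inputFromSentence([5, 9, 2, 7, 1, 4], max_length=5)   # -> [5, 9, 2, 7, 1]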