def create_tagging_schema(directory_source, source, attributes_chosed, raf=False, raf_attribute="", name=""):
    files = os.listdir(directory_source)
    source_sentences = []
    source_example_counts = dict()
    print("Tagging all the sentences of source: " + source + " ...")
    for filename in [file for file in files if file.endswith(".json")]:
        js = utils.open_json(filename, source)
        for attribute in js:
            if raf:
                if isinstance(js[attribute], str) and attribute != "<page title>" and attribute == raf_attribute:
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token, "O"))
                    sentence.append(("ENDNAME", "O"))
                    sentence = sentence + tag_sentence(attribute, js[attribute], attributes_chosed, source, raf, raf_attribute, name)
                    sentence.append(("ENDVALUE", "O"))
                    if useful(sentence) > 0:
                        source_sentences.append((raf_attribute, sentence))
                        source_example_counts.setdefault(raf_attribute, 0)
                        source_example_counts[raf_attribute] = source_example_counts[raf_attribute] + 1
            else:
                if isinstance(js[attribute], str) and attribute != "<page title>" and [t for t in d.get_predicate_name(attribute, source, True) if t in attributes_chosed]:
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token, "O"))
                    sentence.append(("ENDNAME", "O"))
                    sentence = sentence + tag_sentence(attribute, js[attribute], attributes_chosed, source)
                    sentence.append(("ENDVALUE", "O"))
                    if useful(sentence) > 0:
                        p_name = d.get_predicate_name(attribute, source, True)[0]
                        source_sentences.append((p_name, sentence))
                        source_example_counts.setdefault(p_name, 0)
                        source_example_counts[p_name] = source_example_counts[p_name] + 1
    return (source_example_counts, source_sentences)
def train():
    training_ls = dir_reader(TRAIN_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)
    train_ls, val_ls = training_ls[:len(training_ls) - 800], training_ls[len(training_ls) - 800:]
    # train_ls, val_ls = training_ls[:2], training_ls[:10]
    train_data = tokenizer((train_ls, TRAIN_DIR), relations, pretrain_type='elmo_repre')
    val_data = tokenizer((val_ls, TRAIN_DIR), relations, pretrain_type='elmo_repre')
    print('%d training data, %d validation data' % (len(train_data.data), len(val_data.data)), flush=True)
    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout, Bidirection).cuda()
    RE = RelationDetect_woemb(Hidden_size, Relation_type, Hidden_size, Dropout).cuda()
    print('network initialized', flush=True)
    # LSTM_layer.load_state_dict(torch.load(SAVE_DIR + 'LSTM_499'))
    # RE.load_state_dict(torch.load(SAVE_DIR + 'RE_499'))
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    end2end(train_data, val_data, LSTM_layer, RE, Learning_rate, Epoch)
def add_false_examples(set, target):
    sentences = []
    for s in [x[1] for x in os.walk(config["DIRECTORY_DATASET"])][0]:
        if s != "www.ebay.com" and s != "www.alibaba.com":
            directory_source = config["DIRECTORY_DATASET"] + s
            files = os.listdir(directory_source)
            print(directory_source + "...")
            for filename in [file for file in files if file.endswith(".json")]:
                js = utils.open_json(filename, s)
                sentence = []
                for attribute in js:
                    if d.get_predicate_name(attribute, s, False)[0] == target and isinstance(js[attribute], str) and js[attribute] != "Black":
                        for token in utils.tokenizer(attribute):
                            sentence.append((token, "O"))
                        sentence.append(("ENDNAME", "O"))
                        for token in utils.tokenizer(js[attribute]):
                            sentence.append((token, "O"))
                        sentence.append(("ENDVALUE", "O"))
                        sentences.append(sentence)
    with open("dataset/" + set + "_set.txt", "a") as f:
        for sentence in sentences:
            for (token, tag) in sentence:
                f.write(token + "\t" + tag + "\n")
            f.write("\n")
def main(args):
    device = flow.device("cpu") if args.no_cuda else flow.device("cuda")
    with open(args.config_path, "r") as f:
        config = json.load(f)
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)
    textcnn = textCNN(
        word_emb_dim=config["word_emb_dim"],
        vocab_size=len(vocab),
        dim_channel=config["dim_channel"],
        kernel_wins=config["kernel_wins"],
        dropout_rate=config["dropout_rate"],
        num_class=config["num_class"],
        max_seq_len=config["max_seq_len"],
    )
    textcnn.load_state_dict(flow.load(args.model_path))
    textcnn.eval()
    textcnn.to(device)
    text = utils.clean_str(args.text)
    text = [utils.tokenizer(text)]
    input = flow.tensor(
        np.array(utils.tensorize_data(text, vocab, max_len=200)), dtype=flow.long
    ).to(device)
    predictions = textcnn(input).softmax()
    predictions = predictions.numpy()
    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx))
def syntactic_test(path, syntactic_model, syntactic_device, vocab_mapping, batch_size: int = 72):
    """
    Args:
        path: file containing the test stimuli
        syntactic_model: model tested
        vocab_mapping: dictionary mapping words to unique integers
        syntactic_device: computing device
        batch_size: batch size to use while computing logprobs

    Returns:
        list of log probabilities assigned to each sentence
    """
    # load & tokenize stimuli
    test_sentences = load_sentences(path)
    tokenized_sentences = tokenizer(test_sentences)
    encoded_tokens = encode_words(tokenized_sentences, vocab_mapping)
    print("number of sentences after encoding tokens:", len(encoded_tokens), len(encoded_tokens[-1]))
    num_steps = math.ceil(len(encoded_tokens) / batch_size)
    all_probs = []
    for i in tqdm.trange(num_steps, desc="Computing logprobs"):
        sent_tok_ids, logprobs = get_words_logprobs(
            encoded_tokens[i * batch_size:(i + 1) * batch_size],
            syntactic_model, vocab_mapping, syntactic_device)
        all_probs.extend(get_sentences_probs(sent_tok_ids, logprobs))
    return all_probs
def __init__(self):
    self.token_en = utils.tokenizer('en')
    self.token_de = utils.tokenizer('de')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(self.token_en.get_tokenizer()("I LOVE U"))
    self.SRC = utils.create_filed(self.token_de.get_list)
    self.TRG = utils.create_filed(self.token_en.get_list)
    self.train_data, self.valid_data, self.test_data = Multi30k.splits(
        exts=('.de', '.en'), fields=(self.SRC, self.TRG))
    self.SRC.build_vocab(self.train_data, min_freq=MIN_FREQ)
    self.TRG.build_vocab(self.train_data, min_freq=MIN_FREQ)
    self.train_iterator, self.valid_iterator, self.test_iterator = BucketIterator.splits(
        (self.train_data, self.valid_data, self.test_data),
        batch_size=BATCH_SIZE, device=device)
def predict_batch(self, list_sentences, all_words, word2idx, idx2tag):
    sent_token, sent_matrix = utils.tokenizer(list_sentences, all_words, word2idx, self.MAX_LENGTH)
    predict = self.sess.run(tf.argmax(self.predict, 2), feed_dict={self.X: sent_matrix})
    # convert to tags
    tags = []
    for i in range(len(predict)):
        tag_predict = []
        for j in range(len(sent_token[i])):
            tag_predict.append(idx2tag[predict[i][j]])
        tags.append(tag_predict)
    return sent_token, tags
def _token_chunks(s: str, s2=None, add_special_tokens=False):
    """
    Helper function to tokenize without special tokens, returning only
    a numpy array for speed.
    """
    text = s if s2 is None else (s, s2)
    tokens = utils.tokenizer(
        [text],
        return_tensors="np",
        truncation="only_first",
        add_special_tokens=add_special_tokens,
    )
    return utils.tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
def transcribe(self, recognizer):
    """
    Use the Kaldi ASR model to transcribe pcm_data.
    For the model structure, see https://alphacephei.com/vosk/models.html

    input: pcm_data, instance of KaldiRecognizer, instance of nltk PorterStemmer
    output: [list of single-word Strings]
    """
    # ASR
    recognizer.AcceptWaveform(self.bytes)
    utterance = loads(recognizer.Result())['text']
    # tokenize into a list of 'words'
    self.transcript.extend(tokenizer(utterance))
def best_tagging(attribute, value, tokens_tag, attributes_chosed, source, raf, raf_attribute, name):
    tokens = utils.tokenizer(value)
    l = 0
    json_sentence = []
    single_value = single_value_tag(attribute, tokens, tokens_tag, attributes_chosed, source, raf, raf_attribute, name)
    while l < len(tokens) - 1:
        current = value.rsplit(' ', l)[0]
        predicate_names = [t_attr for (v, t_attr) in tokens_tag if v == current]
        if not raf:
            predicate_name = d.coeherent_attribute(attribute, source)
            if predicate_name in predicate_names:
                temp = 0
                last_tag = "O"
                for token in tokens:
                    if token in current and not re.match("[,:;()\\\/]", token):
                        if last_tag == "O":
                            json_sentence.append((token, "B-" + predicate_name))
                            last_tag = "B"
                        else:
                            json_sentence.append((token, "I-" + predicate_name))
                            last_tag = "I"
                    else:
                        json_sentence.append((token, "O"))
                        last_tag = "O"
                    temp += 1
        else:
            if name in predicate_names:
                temp = 0
                last_tag = "O"
                for token in tokens:
                    if token in current and not re.match("[,:;()\\\/]", token):
                        if last_tag == "O":
                            json_sentence.append((token, "B-" + name))
                            last_tag = "B"
                        else:
                            json_sentence.append((token, "I-" + name))
                            last_tag = "I"
                    else:
                        json_sentence.append((token, "O"))
                        last_tag = "O"
                    temp += 1
        if useful(json_sentence) > useful(single_value):
            return json_sentence
        l += 1
    return single_value
def predict_fn(input_data, model):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenized = tokenizer(input_data)
    numericalized = [model.txt_field.vocab.stoi[t] for t in tokenized]
    sentence_length = torch.LongTensor([len(numericalized)]).to(device)
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)
    model.to(device)
    model.eval()
    with torch.no_grad():
        forwardpass = model(tensor, sentence_length)
    _, indices = torch.topk(torch.sigmoid(forwardpass), k=3)
    tags = [model.label_field.vocab.itos[t] for t in indices.tolist()[0]]
    return tags
def __init__(self, topic, comma=False, num=False):
    self.origin = topic
    # convert full-width characters to half-width
    half_topic = utils.fullToHalf(topic)
    # replace spaces with commas
    self.rep_topic = half_topic
    if comma:
        self.rep_topic = utils.spacesToComma(half_topic)
    # split into tokens
    self.words = utils.tokenizer(self.rep_topic)
    # normalize case and digits
    self.rep_words = [w.lower() for w in self.words]
    if num:
        self.rep_words = utils.replaceCaseAndNums(self.words)
    self.len = len(self.rep_words)
    self.rep_string = " ".join(self.rep_words).strip()
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    # open text file
    file = open(self.file_path, 'r', encoding='utf-8')
    lines = file.read()
    lines = lines.split("\n")

    # group every 30 lines into one chunk
    datasets = []
    now = ""
    for i, line in enumerate(lines):
        if i % 30 == 0 and i != 0:
            datasets.append(now)
            now = ""
            continue
        now = now + "\n" + line
    print("tokenizer ending")

    # iterate over the chunks and encode them
    for line in datasets:
        if not line:
            break
        if len(line) < 3:
            continue
        tokenized_line = tokenizer(line[:-1])
        index_of_words = [vocab[vocab.bos_token], ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
        self.data.append(index_of_words)
    print(np.shape(self.data))
    file.close()
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)

    print('Loading data...')
    data = pd.read_csv(infile)

    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')

    # split sentences into words
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)

    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)

    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)  # word2id

    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents

    # convert labels to one-hot form
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label

    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
def process_data(self, comment):
    seg_comment = segmentData([comment], self.stopwords)[0]
    tokens = tokenizer(seg_comment, self.w2i)
    return tokens
def __init__(self, target_vocabulary):
    self.target_vocabulary = tokenizer(target_vocabulary)
    self.dictionary = {}
frequency_by_doc = []
for post in data:
    # get title
    if count_iter == len(random_idx):
        break
    if count == random_idx[count_iter]:
        count_iter += 1
        # processing for core | JANUARY TEST
        text = '\n'.join([post.get('title', ''), post.get('description', '')])
        if text == '':
            print("Wot, texto vacio!!")
        # Tokenize and assign filter tags
        tokens = tokenizer(text)
        # processing for core_tokenized | TEST OF 1000
        # tokens = post['tokens']
        filtered_tokens = filterTokens(tokens, word_dict_filtered)
        # Title map and doc_title mapping
        title = ' '.join(tokens[0])[:50]
        doc_title = 'doc%i' % count_iter
        title_map_file.write(doc_title + '\n')
        doc_title_map[doc_title] = title
        # content to display
        content = [' '.join(sent) for sent in filtered_tokens]
        content = '<br>\n'.join(content)
        open(os.path.join(docs_dir, doc_title), 'w').write(content)
def load_data():
    datasets = load_csv(DATA_PATH, filter_title=True, total=8000)
    X = [tokenizer(review) for label, review in datasets]
    y = [int(label) for label, review in datasets]
    return X, y
def test():
    test_ls = dir_reader(TEST_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)
    test_data = tokenizer((test_ls, TEST_DIR), relations, pretrain_type='elmo_repre')
    print('%d test data' % len(test_data.data), flush=True)

    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout, Bidirection).cuda()
    RE = RelationDetect_woemb(Hidden_size, Relation_type, Hidden_size, Dropout).cuda()
    LSTM_layer.load_state_dict(torch.load(SAVE_DIR + 'LSTM' + MODEL_NAME + '999'))
    RE.load_state_dict(torch.load(SAVE_DIR + 'RE' + MODEL_NAME + '999'))
    print('network initialized', flush=True)

    if os.path.exists(TEST_LOG_FILE):
        os.remove(TEST_LOG_FILE)

    test_data.reset_epoch()
    LSTM_layer.eval()
    RE.eval()

    TP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FN = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    F1 = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    Precision = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    Recall = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    total_F1 = [0.] * len(Relation_threshold)
    micro_F1 = [0.] * len(Relation_threshold)
    total_F1_9 = [0.] * len(Relation_threshold)
    micro_F1_9 = [0.] * len(Relation_threshold)
    macro_F1_9 = [0.] * len(Relation_threshold)
    precision_9 = [0.] * len(Relation_threshold)
    recall_9 = [0.] * len(Relation_threshold)

    while not test_data.epoch_finish:
        standard_emb, e_label, e_posi, r_label, seq_length, mask, seq_pos = test_data.get_batch(Batch_size)
        ctx = LSTM_layer(standard_emb, seq_length)

        # get relationship
        for i in range(Batch_size):
            '''# take NER into computation
            for s in range(1, seq_length[i] + 1):  # s is the count of word number
                if s - 1 in e_posi[i][0] and e_posi[i][0][0] > e_posi[i][1][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][1]]
                elif s - 1 in e_posi[i][1] and e_posi[i][1][0] > e_posi[i][0][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][0]]
                else:
                    gts = [((s - 1), 0)]
                u = RE(ctx[i:i + 1, :s, :])
                result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))
                for j, th in enumerate(Relation_threshold):
                    candidates = (result > th).nonzero()
                    for location, rtype in gts:
                        gt = location * Relation_type + rtype
                        if gt in candidates:
                            # correct entity, correct relation
                            TP[j][rtype] += 1
                            candidates = candidates[candidates != gt]
                        else:
                            # at least one is wrong
                            FN[j][rtype] += 1
                    for candidate in candidates:
                        gt_locations = [l for (l, rt) in gts]
                        if candidate // Relation_type in gt_locations:
                            # correct entity, wrong relation: omit
                            continue
                        else:
                            # wrong entity
                            FP[j][candidate % Relation_type] += 1'''

            # ignore NER
            if e_posi[i][0][0] > e_posi[i][1][0]:
                s = e_posi[i][0][0]
                gts = [posi * Relation_type + r_label[i] for posi in e_posi[i][1]]
                gtp = [posi for posi in e_posi[i][1]]
            else:
                s = e_posi[i][1][0]
                gts = [posi * Relation_type + r_label[i] for posi in e_posi[i][0]]
                gtp = [posi for posi in e_posi[i][0]]
            u = RE(ctx[i:i + 1, :s + 1, :])
            result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))
            for j, th in enumerate(Relation_threshold):
                candidates = (result > th).nonzero()
                for candidate in candidates:
                    if candidate in gts:
                        # correct entity, correct relation
                        TP[j][r_label[i]] += 1
                    else:
                        # at least one is wrong
                        FN[j][r_label[i]] += 1
                        FP[j][candidate % Relation_type] += 1

    for j, th in enumerate(Relation_threshold):
        for r in range(Relation_type):
            F1[j][r] = (2 * TP[j][r] + epsilon) / (2 * TP[j][r] + FP[j][r] + FN[j][r] + epsilon)
            Precision[j][r] = (TP[j][r] + epsilon) / (TP[j][r] + FP[j][r] + epsilon)
            Recall[j][r] = (TP[j][r] + epsilon) / (TP[j][r] + FN[j][r] + epsilon)
        total_F1[j] = np.average(np.array(F1[j]))
        micro_F1[j] = (2 * sum(TP[j]) + epsilon) / (2 * sum(TP[j]) + sum(FP[j]) + sum(FN[j]) + epsilon)
        total_F1_9[j] = np.average(np.array(F1[j][1:]))
        micro_F1_9[j] = (2 * sum(TP[j][1:]) + epsilon) / (2 * sum(TP[j][1:]) + sum(FP[j][1:]) + sum(FN[j][1:]) + epsilon)
        precision_9[j] = np.average(np.array(Precision[j][1:]))
        recall_9[j] = np.average(np.array(Recall[j][1:]))
        macro_F1_9[j] = (2 * recall_9[j] * precision_9[j] + epsilon) / (recall_9[j] + precision_9[j] + epsilon)
        print('(threshold %.2f)' % th, flush=True)
        print('with other: ave F1: %.4f, micro F1: %.4f' % (total_F1[j], micro_F1[j]), flush=True)
        print('without other: ave F1: %.4f, micro F1: %.4f, macro F1: %.4f, ave precision: %.4f, ave recall: %.4f'
              % (total_F1_9[j], micro_F1_9[j], macro_F1_9[j], precision_9[j], recall_9[j]), flush=True)

    with open(TEST_LOG_FILE, 'a+') as LogDump:
        LogWriter = csv.writer(LogDump)
        LogWriter.writerows(F1)
def predict(text, model):
    X = VECTORIZER.transform([tokenizer(text)])
    y = MODELS[model].predict(X)[0]
    return y
def test():
    test_ls = dir_reader(TEST_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)
    test_data = tokenizer((test_ls, TEST_DIR), relations, pretrain_type='elmo_repre')
    print('%d test data' % len(test_data.data), flush=True)

    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout, Bidirection).cuda()
    NER = EntityDetect(Label_embed, Hidden_size, 3, Dropout).cuda()
    RE = RelationDetect(Hidden_size, Label_embed, Relation_type, Hidden_size, Dropout).cuda()
    LSTM_layer.load_state_dict(torch.load(SAVE_DIR + 'LSTM_2_999'))
    NER.load_state_dict(torch.load(SAVE_DIR + 'NER_2_999'))
    RE.load_state_dict(torch.load(SAVE_DIR + 'RE_2_999'))
    print('network initialized', flush=True)

    if os.path.exists(TEST_LOG_FILE):
        os.remove(TEST_LOG_FILE)

    test_data.reset_epoch()
    LSTM_layer.eval()
    NER.eval()
    RE.eval()

    TP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FN = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    F1 = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    total_F1 = [0.] * len(Relation_threshold)
    micro_F1 = [0.] * len(Relation_threshold)
    total_F1_9 = [0.] * len(Relation_threshold)
    micro_F1_9 = [0.] * len(Relation_threshold)
    precision_9 = [0.] * len(Relation_threshold)
    recall_9 = [0.] * len(Relation_threshold)
    count_all = 0
    correct_raw = 0

    while not test_data.epoch_finish:
        standard_emb, e_label, e_posi, r_label, seq_length, mask, seq_pos = test_data.get_batch(Batch_size)
        ctx = LSTM_layer(standard_emb, seq_length)
        label_emb = torch.zeros((Batch_size, max(seq_length), Label_embed), requires_grad=False).cuda()
        y_out = torch.zeros(Batch_size, requires_grad=False).long().cuda()
        y_all = torch.zeros((Batch_size, max(seq_length)), requires_grad=False).long().cuda()

        for s in range(max(seq_length)):
            v_tp, logit, y_out = NER(ctx[:, s, :], y_out)
            for i in range(Batch_size):
                y_all[i, s] = y_out[i].detach() if s < seq_length[i] else -1
                if s > 0 and s <= seq_length[i]:
                    # record embedding of the label of the last time step
                    label_emb[i, s - 1, :] = v_tp[i, :].detach()

        # get label embedding of the last step
        v_tp, _, _ = NER(torch.zeros(Batch_size, Hidden_size).cuda(), y_out)
        for i in range(Batch_size):
            if seq_length[i] == max(seq_length):
                label_emb[i, -1, :] = v_tp[i, :].detach()

        # compute entity detection accuracy
        for i in range(Batch_size):
            count_all += 1
            e1 = y_all[i, :seq_length[i]].nonzero()
            e2 = e_label[i, :seq_length[i]].nonzero()
            correct_raw += int(torch.equal(e1, e2))

        # get relationship
        for i in range(Batch_size):
            for s in range(1, seq_length[i] + 1):  # s is the count of word number
                if s - 1 in e_posi[i][0] and e_posi[i][0][0] > e_posi[i][1][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][1]]
                elif s - 1 in e_posi[i][1] and e_posi[i][1][0] > e_posi[i][0][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][0]]
                else:
                    gts = [((s - 1), 0)]
                u = RE(ctx[i:i + 1, :s, :], label_emb[i:i + 1, :s, :])
                result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))
                for j, th in enumerate(Relation_threshold):
                    candidates = (result > th).nonzero()
                    for location, rtype in gts:
                        gt = location * Relation_type + rtype
                        if gt in candidates:
                            # correct entity, correct relation
                            TP[j][rtype] += 1
                            candidates = candidates[candidates != gt]
                        else:
                            # at least one is wrong
                            FN[j][rtype] += 1
                    for candidate in candidates:
                        gt_locations = [l for (l, rt) in gts]
                        if candidate // Relation_type in gt_locations:
                            # correct entity, wrong relation: omit
                            continue
                        else:
                            # wrong entity
                            FP[j][candidate % Relation_type] += 1

    print('NER raw accuracy: %.4f' % (correct_raw / count_all), flush=True)
    for j, th in enumerate(Relation_threshold):
        for r in range(Relation_type):
            F1[j][r] = (2 * TP[j][r] + epsilon) / (2 * TP[j][r] + FP[j][r] + FN[j][r] + epsilon)
        total_F1[j] = np.average(np.array(F1[j]))
        micro_F1[j] = (2 * sum(TP[j]) + epsilon) / (2 * sum(TP[j]) + sum(FP[j]) + sum(FN[j]) + epsilon)
        total_F1_9[j] = np.average(np.array(F1[j][1:]))
        micro_F1_9[j] = (2 * sum(TP[j][1:]) + epsilon) / (2 * sum(TP[j][1:]) + sum(FP[j][1:]) + sum(FN[j][1:]) + epsilon)
        precision_9[j] = (sum(TP[j][1:]) + epsilon) / (sum(TP[j][1:]) + sum(FP[j][1:]) + epsilon)
        recall_9[j] = (sum(TP[j][1:]) + epsilon) / (sum(TP[j][1:]) + sum(FN[j][1:]) + epsilon)
        print('(threshold %.2f)' % th, flush=True)
        print('with other: val ave F1: %.4f, val micro F1: %.4f' % (total_F1[j], micro_F1[j]), flush=True)
        print('without other: val ave F1: %.4f, val micro F1: %.4f, precision: %.4f, recall: %.4f'
              % (total_F1_9[j], micro_F1_9[j], precision_9[j], recall_9[j]), flush=True)

    with open(TEST_LOG_FILE, 'a+') as LogDump:
        LogWriter = csv.writer(LogDump)
        LogWriter.writerows(F1)
    predictions.append(predicted_code.split())
    bleu = corpus_bleu(actual, predictions)
    return bleu, actual, predictions


if __name__ == '__main__':
    argv = sys.argv[1:]
    if len(argv) != 1:
        print('Need to supply an argument specifying model path')
        exit(0)
    model_path = argv[0]
    test_dir = '../data/test/'
    # model_path = '../results/'
    vocab_path = '../data/code.vocab'
    tokenizer = tokenizer(vocab_path)
    bleu, actual, predictions = evaluate_model(test_dir, model_path, tokenizer, CONTEXT_LENGTH, display=False)

    # Calculate BLEU scores. By default, sentence_bleu() and corpus_bleu() compute the
    # cumulative 4-gram BLEU score (BLEU-4); it is common to also report the cumulative
    # BLEU-1 to BLEU-3 scores when describing a text generation system. BLEU-4 is the
    # strictest and corresponds best to human judgments.
    print('BLEU-1: %f' % corpus_bleu(actual, predictions, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predictions, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predictions, weights=(0.3, 0.3, 0.3, 0)))
import numpy.linalg as linalg
import re
import pdb

USE_ALLENNLP = False  # use flag, as some users reported issues with installation
if USE_ALLENNLP:
    import allennlp.data.tokenizers.word_tokenizer as tokenizer
    from allennlp.data.tokenizers.word_filter import StopwordFilter
    tk = tokenizer.WordTokenizer()
    stop_word_filter = StopwordFilter()
else:
    print('Note: using rudimentary tokenizer, for better results enable allennlp.')
    stop_word_filter = utils.stop_word_filter()
    tk = utils.tokenizer()


'''
Combines content and noise word embeddings.
'''
def doc_word_embed_content_noise(content_path, noise_path, whiten_path=None,
                                 content_lines=None, noise_lines=None, opt=None):
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(content_path, no_add_set, content_lines=content_lines)
    words_set = set(content_words_ar)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(noise_path, set(content_words_ar), content_lines=noise_lines)
    content_words_ar.extend(noise_words_ar)
    words_ar = content_words_ar
    word_embeds = torch.cat((content_word_embeds, noise_word_embeds), dim=0)
    whitening = opt.whiten if opt is not None else True
from solver import Solver
from data_loader import get_loader, get_vocab
from configs import get_config
from utils import tokenizer

if __name__ == '__main__':
    config = get_config(batch_size=1)
    print(config)
    data_loader = get_loader(batch_size=config.batch_size,
                             max_size=config.vocab_size,
                             is_train=False,
                             data_dir=config.data_dir)
    solver = Solver(config, data_loader)
    solver.build(is_train=False)
    solver.load(epoch=2)
    vocab = get_vocab()
    while True:
        text = input('Insert Sentence: ')
        text = tokenizer(text)
        text = [vocab.stoi[word] for word in text]
        prediction = solver.inference(text)
        if prediction == 0:
            print('Positive!')
        else:
            print('Negative')
def build_tokenizer(self, tokenize='default'):
    self.indices_token, self.token_indices = tokenizer(mode=tokenize)
    self.n_chars = len(self.indices_token.keys())
import csv
import utils

sentences = []
with open(r'..\data\raw\corpus_raw.csv', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        text = utils.normalizer(row['SOSPECHA_DIAGNOSTICA'])
        tokens = utils.tokenizer(text)
        tokens.append('<END>')
        tokens.insert(0, '<START>')
        sentence = '!#!'.join(tokens)
        if len(sentence) > 0:
            sentences.append(sentence)

with open(r"..\data\processed\corpus.csv", "w", encoding='utf-8') as output:
    for sentence in sentences:
        output.write(sentence + '\n')
total_fp += fp
total_tn += tn
total_fn += fn
# print(str(tp) + " " + str(fp) + " " + str(tn) + " " + str(fn))
# print("tp: " + str(total_tp) + " fp: " + str(total_fp) + " tn: " + str(total_tn) + " fn: " + str(total_fn))
# matched_num += sum([1 for tag in pred if tag in true and tag["type"] != 0])

precision = total_tp / (total_tp + total_fp + eps)
recall = total_tp / (total_tp + total_fn + eps)
# recall = (matched_num + eps) / (total_true + eps)
f1 = 2 * precision * recall / (precision + recall + eps)
print('P: %.4f R: %.4f F: %.4f' % (precision, recall, f1))

with open(root + "/" + config["TEST_SET"], "r") as f:
    tokens = []
    for line in [l for l in f if not l.startswith("-DOCSTART-")]:
        words = utils.tokenizer(line)
        if words:
            word = words[0]
            tokens.append(word)

with open("risultati_opentag/esperimento#" + str(config["ESPERIMENTO"]) + ".txt", "w+") as f:
    f.truncate(0)
    for token, tag in zip(tokens, lines):
        f.write(token + tag)

config["ESPERIMENTO"] += 1
with open("config.json", "w") as c:
    json.dump(config, c)