class SIFRank4VN():

    def __init__(self):
        # path = os.path.dirname(os.path.realpath('__file__'))
        self.vncorenlp = VnCoreNLP(
            "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
            annotators="wseg, pos",
            max_heap_size='-Xmx500m')
        self.phoBERT = word_emb_phoBert.WordEmbeddings()
        self.SIF = sent_emb_sif.SentEmbeddings(self.phoBERT, lamda=1.0,
                                               embeddings_type='bert')

    def sifrank_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank(text, self.SIF, self.vncorenlp, N=nphrase, ratio=ratio)
        return keyphrases

    def sifrank_plus_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank_plus(text, self.SIF, self.vncorenlp, N=nphrase, ratio=ratio)
        return keyphrases

    def close_vncorenlp(self):
        self.vncorenlp.close()
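# A minimal usage sketch for the SIFRank4VN wrapper above, assuming the class is in
# scope and the jar/model paths hard-coded in its __init__ exist; the sample text and
# keyphrase count are illustrative only.
extractor = SIFRank4VN()
try:
    print(extractor.sifrank_extract("Hà Nội là thủ đô của Việt Nam.", nphrase=5))
    print(extractor.sifrank_plus_extract("Hà Nội là thủ đô của Việt Nam.", nphrase=5))
finally:
    extractor.close_vncorenlp()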
    def tokenize(self, raw_sentence: str):
        if self.vncore:
            if self.annotator is None:
                self.annotator = VnCoreNLP(VNCORENLP_ADDRESS, port=VNCORENLP_PORT)
            word_tokenizes = ' '.join(sum(self.annotator.tokenize(raw_sentence), []))
        else:
            word_tokenizes = raw_sentence
        return self.bpe.encode(word_tokenizes)
def main(args):
    print("-" * 20, "START", "-" * 20)
    nlp = args.nlp

    print("Initialize annotator...")
    # Change this to the real path of the VnCoreNLP jar file
    annotator = VnCoreNLP(nlp, annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g')

    DATA_PATH, MODEL_PATH = args.i, args.o

    # Variables
    num_feature = args.nfeature if args.nfeature else 256
    min_word_count = args.mincount if args.mincount else 2
    window_size = args.window if args.window else 2
    num_epochs = args.nepoch if args.nepoch else 50
    num_worker = multiprocessing.cpu_count()

    # Read corpus
    print("Reading data file...")
    raw_data = ut.read(DATA_PATH).split('\n')

    sentences_tokenized = []
    print("Tokenizing...")
    for line in raw_data:
        line = line.lower()
        word_segmented_text = annotator.tokenize(line)
        for tokens in word_segmented_text:
            sentences_tokenized.append(tokens)

    print('Building model...')
    model = w2v.Word2Vec(size=num_feature, min_count=min_word_count,
                         workers=num_worker, window=window_size)
    model.build_vocab(sentences_tokenized)
    print("Vocabulary size: %d" % len(model.wv.vocab))

    print("Training word2vec...")
    model.train(sentences=sentences_tokenized,
                total_examples=model.corpus_count,
                epochs=num_epochs)
    print('Built model successfully')

    print('Saving model...')
    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)
    model.save(os.path.join(MODEL_PATH, 'word2vec.w2v'))
    print('Done')
    return None
def read_pages(start_page, end_page, doc_file):
    VNCORENLP_FILE_PATH = 'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    vncorenlp = VnCoreNLP(VNCORENLP_FILE_PATH)
    words = []
    doc = pdf2txt(doc_file, range(start_page - 1, end_page))
    for para in doc:
        words.extend(vncorenlp.tokenize(para))
    return words
    def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
                 do_lower_case: bool = False, vncorenlp_path=None):
        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.set_vocab(vocab)
        self.vncorenlp_path = vncorenlp_path
        self.rdrsegmenter = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx1g')
    def __init__(self, max_length=512):
        self.bpe = fastBPE(BPEConfig)
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            os.path.join(os.getcwd(), 'pretrained', 'PhoBERT_base_transformers', 'dict.txt'))
        self.rdr_segmenter = VnCoreNLP(
            os.path.join('vncorenlp', 'VnCoreNLP-1.1.1.jar'),
            annotators='wseg',
            max_heap_size='-Xmx500m'
        )
        self.max_length = max_length
    def __init__(self, stopwords, ngrams=1, window_size=3, candidate_pos=["N", "Np"],
                 num_keywords=5, use_vncorenlp=True):
        self.d = 0.85  # damping coefficient, usually 0.85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # saves keywords and their weights
        self.ngrams = ngrams
        self.window_size = window_size
        self.candidate_pos = candidate_pos
        self.num_keywords = num_keywords
        self.stopwords = stopwords
        self.use_vncorenlp = use_vncorenlp
        if self.use_vncorenlp:
            self.annotator = VnCoreNLP(VNCORENLP_JAR_PATH, annotators="wseg,pos",
                                       max_heap_size='-Xmx2g')
def analyse_data(corpus_path):
    '''Analyse the corpus and return the output data folder path.'''
    fi = fo_token = None
    try:
        vncorenlp_file = r'./VnCoreNLP/VnCoreNLP-1.1.1.jar'
        vncorenlp = VnCoreNLP(vncorenlp_file)
        print('Created VnCoreNLP object.')

        path = corpus_path.split('/')
        corpus_folder_path = '/'.join(path[:-1])
        corpus_filename = path[-1]
        print("corpus folder: %s" % corpus_folder_path)
        print("corpus filename: %s" % corpus_filename)

        output_data_folder_path = corpus_folder_path + '/output-data/'
        if not os.path.exists(output_data_folder_path):
            os.makedirs(output_data_folder_path)
            print("Created %s folder" % output_data_folder_path)

        fi = open(corpus_path, 'r')
        fo_token = open(output_data_folder_path + corpus_filename + '-token', 'w')
        print("Opened %s" % corpus_path)
        print("Opened %s" % fo_token.name)

        # First pass: count the number of lines in the corpus.
        line_number = 0
        for line in fi:
            line_number += 1
        fi.close()

        # Second pass: tokenize each line and write the result out.
        fi = open(corpus_path, 'r')
        print('We have %d lines in our corpus.' % line_number)
        for count in tqdm(range(line_number)):
            sentences = fi.readline()
            fo_token.write(str(vncorenlp.tokenize(sentences)) + '\n')
        print('Finished analysing data.')
    except Exception:
        raise
    finally:
        if fi is not None:
            fi.close()
            print("Closed %s" % corpus_path)
        if fo_token is not None:
            fo_token.close()
            print("Closed %s" % fo_token.name)
    return output_data_folder_path
    def annotate(self, lib, text_list, mode, output):
        with open(output, 'w') as f:
            if lib == 'underthesea':
                t = time.time()
                count = 0
                for text in text_list:
                    f.write(f'{text}\t{self.underthesea_annotate(text, mode)}\n')
                    count += 1
                    if time.time() - t > 1:
                        break
                print(count)
            elif lib == 'vncorenlp':
                vncorenlp_file = r'VnCoreNLP_lib/VnCoreNLP-1.1.1.jar'
                with VnCoreNLP(vncorenlp_file) as vncorenlp_class:
                    t = time.time()
                    count = 0
                    for text in text_list:
                        f.write(f'{text}\t{self.vncorenlp_annotate(vncorenlp_class, text, mode)}\n')
                        count += 1
                        if time.time() - t > 1:
                            break
                    print(count)
            else:
                raise Exception("Wrong request, please check your request")
class VNNewsDataset(Dataset):

    def __init__(self, data_dir, max_length=150, remove_negative_pair=True):
        super(VNNewsDataset, self).__init__()
        self.data_dir = data_dir
        self.max_length = max_length
        self.sentence_1 = open(os.path.join(self.data_dir, 'Sentences_1.txt'),
                               mode='r', encoding='utf-8-sig').read().split('\n')
        self.sentence_2 = open(os.path.join(self.data_dir, 'Sentences_2.txt'),
                               mode='r', encoding='utf-8-sig').read().split('\n')
        self.labels = open(os.path.join(self.data_dir, 'Labels.txt'),
                           mode='r', encoding='utf-8-sig').read().split('\n')
        self.bpe = fastBPE(BPEConfig)
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            os.path.join(os.getcwd(), '../pretrained', 'PhoBERT_base_transformers', 'dict.txt'))
        self.rdr_segmenter = VnCoreNLP(os.path.join('../vncorenlp', 'VnCoreNLP-1.1.1.jar'),
                                       annotators='wseg', max_heap_size='-Xmx500m')
        if remove_negative_pair is True:
            self.remove_negative_pair()

    def remove_negative_pair(self):
        # Keep only sentence pairs whose label is '1'.
        self.sentence_1 = [sent for idx, sent in enumerate(self.sentence_1)
                           if self.labels[idx] == '1']
        self.sentence_2 = [sent for idx, sent in enumerate(self.sentence_2)
                           if self.labels[idx] == '1']

    def encode(self, raw_text):
        line = self.rdr_segmenter.tokenize(raw_text)
        line = ' '.join([' '.join(sent) for sent in line])
        line = re.sub(r' _ ', '_', line)
        subwords = '<s> ' + self.bpe.encode(line) + ' </s>'
        input_ids = self.vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False)
        return padding(input_ids, self.max_length)

    def __len__(self):
        assert len(self.sentence_1) == len(self.sentence_2)
        return len(self.sentence_1)

    def __getitem__(self, item):
        sent_1 = self.encode(self.sentence_1[item])
        sent_2 = self.encode(self.sentence_2[item])
        lb = self.labels[item]
        return sent_1, sent_2, lb
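# A minimal usage sketch for VNNewsDataset, assuming a hypothetical data directory
# containing Sentences_1.txt, Sentences_2.txt and Labels.txt, plus the relative
# ../pretrained and ../vncorenlp paths used in __init__; batch_size is illustrative.
from torch.utils.data import DataLoader

dataset = VNNewsDataset(data_dir='data/vn_news', max_length=150)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
for sent_1, sent_2, label in loader:
    # each batch holds the two encoded sentence tensors and their label strings
    break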
    def get_instance(cls):
        if cls._instance is None:
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            cls._instance = VnCoreNLP(os.path.join(cur_dir, 'VnCoreNLP-1.1.1.jar'),
                                      annotators='wseg', max_heap_size='-Xmx500m')
        return cls._instance
def Main():
    vncorenlp_file = VNCORENLP_FILE_PATH
    vncorenlp = VnCoreNLP(vncorenlp_file)

    f = open(TEXT_FILE_PATH, 'r', encoding='utf-8')
    text = f.read()
    f.close()

    tokenize = vncorenlp.tokenize(text)
    # avoid shadowing the built-in len()
    words, total_len = total_words_and_len(tokenize, punc, stopwords)
    tf = TF(words, total_len)
    idf = IDF(words, tokenize)
    tfidf = TFIDF(tf, idf)

    N = 20
    print(get_top(tfidf, N))
class VnSegmentNLP:
    def __init__(self, jar_file='./tokenizer/VnCoreNLP-1.1.1.jar'):
        self.annotator = VnCoreNLP(jar_file, annotators="wseg", max_heap_size='-Xmx2g')

    def word_segment(self, inp: str):
        word_segmented_text = self.annotator.tokenize(inp)
        sentences = [' '.join(sent) for sent in word_segmented_text]
        return ' '.join(sentences)
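# A minimal usage sketch for VnSegmentNLP, assuming the default jar path
# './tokenizer/VnCoreNLP-1.1.1.jar' exists; the sample sentence is illustrative.
segmenter = VnSegmentNLP()
print(segmenter.word_segment("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội."))
# multi-word expressions come back joined by underscores, e.g. "Đại_học Quốc_gia Hà_Nội"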
    def get_instance(cls):
        if cls._instance is None:
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            cls._instance = VnCoreNLP(
                os.path.join(cur_dir, "VnCoreNLP-1.1.1.jar"),
                annotators="wseg",
                max_heap_size="-Xmx500m",
            )
        return cls._instance
    def vn_ner(self):
        annotator = VnCoreNLP(address=DEFAULT_LOCAL_ADDRESS, port=DEFAULT_VI_NER_PORT)
        for line in self.textMap.keys():
            taggedText = annotator.annotate(line)
            try:
                taggedText = taggedText['sentences'][0]
                for value in taggedText:
                    if value['nerLabel'] in ['B-PER', 'I-PER']:
                        self.textMap[line][self.PER_KEY] += 1
                    if value['nerLabel'] in ['B-LOC', 'I-LOC']:
                        self.textMap[line][self.LOC_KEY] += 1
                    if value['nerLabel'] in ['B-ORG', 'I-ORG']:
                        self.textMap[line][self.ORG_KEY] += 1
            except Exception as e:
                print("Unable to annotate " + str(line))
                print(e)
                return e
def vn_format_to_json(args):
    stories_dir = os.path.abspath(args.raw_path)
    tokenized_stories_dir = os.path.abspath(args.save_path)

    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = glob.glob(pjoin(args.raw_path, '*.txt'))
    annotator = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg",
                          max_heap_size='-Xmx500m')

    dataset = []
    for s in stories:
        tgt = []
        source = []
        flag = False
        f = open(pjoin(stories_dir, s), encoding='utf-8')
        for line in f:
            if line == '\n':
                continue
            if line == '@highlight\n':
                flag = True
                continue
            tokens = annotator.tokenize(line)
            if flag:
                tgt.extend(tokens)
            else:
                source.extend(tokens)  # accumulate source sentences (was `source = tokens`)
        f.close()
        dataset.append({"src": [clean(' '.join(sent)).split() for sent in source],
                        "tgt": [clean(' '.join(sent)).split() for sent in tgt]})

    print("Tokenizing %i files in %s" % (len(stories), stories_dir))
    print("VnCoreNLP tokenizer has finished.")

    # Split into train/valid/test shards (10% valid, 10% test).
    valid_test_ratio = 0.1
    all_size = len(dataset)
    test_sets = dataset[:int(all_size * valid_test_ratio)]
    valid_sets = dataset[int(all_size * valid_test_ratio):int(all_size * valid_test_ratio * 2)]
    train_sets = dataset[int(all_size * valid_test_ratio * 2):]

    corpora = {'train': train_sets, 'valid': valid_sets, 'test': test_sets}
    for corpus_type in ['train', 'valid', 'test']:
        p_ct = 0
        shards = [corpora[corpus_type][i * args.shard_size:(i + 1) * args.shard_size]
                  for i in range((len(corpora[corpus_type]) + args.shard_size - 1) // args.shard_size)]
        for split in shards:
            pt_file = pjoin(args.save_path, corpus_type + '.' + str(p_ct) + '.json')
            with codecs.open(pt_file, 'w', encoding='utf-8') as save:
                json.dump(split, save, ensure_ascii=False)
            p_ct += 1
class PhoBert(EmbeddingModel):

    def __init__(self, device: torch.device):
        self.__device = device
        self.__rdrsegmenter = VnCoreNLP(VnCoreNLP_JAR_PATH, annotators="wseg",
                                        max_heap_size='-Xmx500m')
        self.__tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
        self.__model = AutoModel.from_pretrained("vinai/phobert-base",
                                                 output_hidden_states=True).to(self.__device)

    def embed_text(self, text: str) -> torch.Tensor:
        """Tokenize and embed a sentence with PhoBERT."""
        line = self.__tokenize(text)
        # Map the word-segmented text to PhoBERT token ids.
        input_ids = torch.tensor([self.__tokenizer.encode(line)])
        input_ids = input_ids.to(self.__device)  # .to() is not in-place; keep the result
        with torch.no_grad():
            features = self.__model(input_ids)
        embeddings = self.__to_embedding(features)
        # cleanup
        del input_ids
        return embeddings

    def __tokenize(self, text: str):
        """Perform word segmentation with VnCoreNLP."""
        segments = self.__rdrsegmenter.tokenize(text)
        segmentation = None
        if len(segments) > 1:
            segmentation = " ".join([" ".join(segment) for segment in segments])
        elif len(segments) == 1:
            segmentation = " ".join(segments[0])
        return segmentation

    def __to_embedding(self, features):
        """Convert model features to a sentence embedding."""
        # Average the concatenation of the last four hidden layers over tokens.
        hidden_states = features[2]
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        return torch.mean(
            torch.cat(tuple(last_four_layers), dim=-1),
            dim=1
        ).squeeze()
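# A minimal usage sketch for the PhoBert embedder above, assuming VnCoreNLP_JAR_PATH
# points to a valid VnCoreNLP-1.1.1.jar and the vinai/phobert-base weights are
# downloadable; the cosine-similarity comparison is illustrative only.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedder = PhoBert(device)
vec_a = embedder.embed_text("Hà Nội là thủ đô của Việt Nam.")
vec_b = embedder.embed_text("Thủ đô của Việt Nam là Hà Nội.")
print(torch.nn.functional.cosine_similarity(vec_a, vec_b, dim=0).item())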
    def __init__(self, bpe_path: str, vncorenlp_path: str, do_lower_case: bool = False):
        bpe_codes_path = os.path.join(bpe_path, BPECODE_FILE)
        vocab_file_path = os.path.join(bpe_path, VOCAB_FILE)

        if not os.path.isfile(bpe_codes_path):
            raise EnvironmentError(f"{BPECODE_FILE} not found in {bpe_path}")
        if not os.path.isfile(vocab_file_path):
            raise EnvironmentError(f"{VOCAB_FILE} not found in {bpe_path}")

        self.do_lower_case = do_lower_case

        BPEConfig = namedtuple('BPEConfig', 'vncorenlp bpe_codes vocab')
        self.pho_config = BPEConfig(vncorenlp=vncorenlp_path,
                                    bpe_codes=bpe_codes_path,
                                    vocab=vocab_file_path)

        self.rdrsegmenter = VnCoreNLP(self.pho_config.vncorenlp,
                                      annotators="wseg", max_heap_size='-Xmx1g')
        self.bpe = fastBPE(self.pho_config)
        self.vocab = Dictionary()
        self.vocab.add_from_file(self.pho_config.vocab)
    def annotate(self, text, annotators="wseg", output_format=None,
                 properties=None, max_heap_size="-Xmx500m"):
        # Note: only word segmentation is returned, regardless of `annotators`.
        with VnCoreNLP(self.vncorenlp_file, annotators=annotators,
                       max_heap_size=max_heap_size) as vncorenlp:
            result = vncorenlp.tokenize(text)
        return result
def ppt2txt(filename):
    ppt = Presentation(filename)
    sentences = ""
    for slide in ppt.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                sentences += shape.text + ". "

    with VnCoreNLP(vncorenlp_file, annotators="wseg",
                   max_heap_size='-Xmx4g', quiet=False) as vncorenlp:
        split_sentence = vncorenlp.tokenize(sentences)
    return split_sentence
def vncorenlp_pos_tag(sentence):
    with VnCoreNLP(address='http://127.0.0.1', port=8888) as vn_core_nlp:
        tagged = vn_core_nlp.pos_tag(sentence)

    result = list()
    fs_tagged = tagged[0]
    for w in fs_tagged:
        parsed_w = {'txt': w[0], 'type': w[1]}
        result.append(parsed_w)
    return result
def nlp_tokenize(path):
    data = pd.read_excel(path)
    data = data[['ID', 'Content', 'ID người đăng']]
    data = data.dropna()

    data['Content'] = data['Content'].str.strip()
    data['Content'] = data['Content'].str.lower()
    data['status'] = data['Content']
    for i in range(len(data['status'])):
        # keep only word characters
        data['status'].iloc[i] = re.sub(r'\W+', ' ', data['Content'].iloc[i])
        data['Content'].iloc[i] = data['status'].iloc[i]

    vncorenlp_file = r'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    vncorenlp = VnCoreNLP(vncorenlp_file)
    for i in range(len(data['status'])):
        data['status'].iloc[i] = vncorenlp.tokenize(data['status'].iloc[i])

    key_word = []
    for i in data['status']:
        key_word = key_word + i
    vncorenlp.close()
    return key_word, data[['Content', 'ID', 'ID người đăng']]
class VnCoreTokenizer():
    def __init__(
        self, path="/home/thanh/DATN/FakeNewDetection/vncorenlp/VnCoreNLP-1.1.1.jar"
    ):
        self.rdrsegmenter = VnCoreNLP(path, annotators="wseg", max_heap_size='-Xmx500m')

    def tokenize(self, text: str) -> str:
        sentences = self.rdrsegmenter.tokenize(text)
        # join tokens within each sentence, and sentences with a space
        output = " ".join(" ".join(sentence) for sentence in sentences)
        return output
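# A minimal usage sketch for VnCoreTokenizer, assuming the default jar path in
# __init__ points to an existing VnCoreNLP-1.1.1.jar; the sample sentence is illustrative.
tokenizer = VnCoreTokenizer()
print(tokenizer.tokenize("Tôi là sinh viên trường Đại học Bách khoa Hà Nội."))
# expected form: word-segmented text such as "Tôi là sinh_viên trường Đại_học Bách_khoa Hà_Nội ."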
def vncorenlp_dep_parse(paragraph):
    with VnCoreNLP(address='http://127.0.0.1', port=8888) as vn_core_nlp:
        tagged = vn_core_nlp.pos_tag(paragraph)
        parsed = vn_core_nlp.dep_parse(paragraph)

    result = []
    tokens = tagged[0]
    parsed = parsed[0]
    for idx, token in enumerate(tokens):
        w = {
            'txt': token[0],
            'type': token[1],
            'kind': parsed[idx][0],
            'dependence': parsed[idx][1]
        }
        result.append(w)
    return result
def tokenized(infile, outfile):
    count = 0
    with VnCoreNLP(address='http://127.0.0.1', port=9000) as vncorenlp:
        with open(infile, encoding='utf-8') as file:
            with open(outfile, 'w', encoding='utf-8') as out:
                for line in file:
                    if line:
                        try:
                            word_seg = vncorenlp.tokenize(line)
                        except Exception:
                            # back off and retry once so word_seg is never left unbound
                            time.sleep(5)
                            word_seg = vncorenlp.tokenize(line)
                        for sent in word_seg:
                            seg = ' '.join(sent)
                            out.write(seg + '\n')
                        print('done line ' + str(count))
                        count += 1
    print(f'done {infile}')
def load_phobert_model():
    device = torch.device("cpu")

    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes', default=paths.bpe_codes_path,
                        required=False, type=str, help='path to fastBPE BPE')
    args = parser.parse_args()
    bpe = fastBPE(args)

    vn_tokenizer = VnCoreNLP(paths.vncore_jar_path, annotators="wseg",
                             max_heap_size='-Xmx500m')

    # config model
    config = RobertaConfig.from_pretrained(paths.config_path,
                                           output_hidden_states=True,
                                           num_labels=3)
    model_bert = RobertaForAIViVN.from_pretrained(paths.pretrained_path, config=config)
    # model_bert.cuda()

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file(paths.dict_path)

    # GPU variant:
    # if torch.cuda.device_count():
    #     print(f"Testing using {torch.cuda.device_count()} gpus")
    #     model_bert = nn.DataParallel(model_bert)
    #     tsfm = model_bert.module.roberta
    # else:
    #     tsfm = model_bert.roberta
    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta

    model_bert.load_state_dict(
        torch.load(paths.phobert_path, map_location=device))
    return bpe, vn_tokenizer, model_bert, vocab