def simple_usage():
    # Uncomment this line for debugging
    # logging.basicConfig(level=logging.DEBUG)

   
    vncorenlp_file = 'D:/study/PlagismDetector/PlagismDetector/VnCoreNLP/VnCoreNLP-1.1.1.jar'
    
    sentences = 'VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. ' \
                'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.'

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        print('Tokenizing:', vncorenlp.tokenize(sentences))
        print('POS Tagging:', vncorenlp.pos_tag(sentences))
        print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
        print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
        print('Annotating:', vncorenlp.annotate(sentences))
        print('Language:', vncorenlp.detect_language(sentences))

    # Without the context manager, you have to close the server manually by calling close()
    vncorenlp = VnCoreNLP(vncorenlp_file)

    print('Tokenizing:', vncorenlp.tokenize(sentences))
    print('POS Tagging:', vncorenlp.pos_tag(sentences))
    print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
    print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
    print('Annotating:', vncorenlp.annotate(sentences))
    print('Language:', vncorenlp.detect_language(sentences))

    # Do not forget to close the server
    vncorenlp.close()
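The snippet above omits its imports; a minimal preamble for running it (assuming the vncorenlp package is installed and the jar path is adjusted to your machine) could look like:

import logging
from vncorenlp import VnCoreNLP

if __name__ == '__main__':
    simple_usage()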
Example #2
    def __init__(self, data_dir, max_length=150, remove_negative_pair=True):
        super(VNNewsDataset, self).__init__()
        self.data_dir = data_dir
        self.max_length = max_length

        with open(os.path.join(self.data_dir, 'Sentences_1.txt'),
                  mode='r', encoding='utf-8-sig') as f:
            self.sentence_1 = f.read().split('\n')

        with open(os.path.join(self.data_dir, 'Sentences_2.txt'),
                  mode='r', encoding='utf-8-sig') as f:
            self.sentence_2 = f.read().split('\n')

        with open(os.path.join(self.data_dir, 'Labels.txt'),
                  mode='r', encoding='utf-8-sig') as f:
            self.labels = f.read().split('\n')

        self.bpe = fastBPE(BPEConfig)
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            os.path.join(os.getcwd(), '../pretrained',
                         'PhoBERT_base_transformers', 'dict.txt'))
        self.rdr_segmenter = VnCoreNLP(os.path.join('../vncorenlp',
                                                    'VnCoreNLP-1.1.1.jar'),
                                       annotators='wseg',
                                       max_heap_size='-Xmx500m')

        if remove_negative_pair:
            self.remove_negative_pair()
Example #3
    def annotate(self, lib, text_list, mode, output):
        f = open(output, 'w', encoding='utf-8')
        if lib == 'underthesea':
            t = time.time()
            count = 0
            for text in text_list:
                f.write(f'{text}\t{self.underthesea_annotate(text, mode)}\n')
                count += 1
                if time.time() - t > 1:
                    break
            print(count)

        elif lib == 'vncorenlp':
            vncorenlp_file = r'VnCoreNLP_lib/VnCoreNLP-1.1.1.jar'
            with VnCoreNLP(vncorenlp_file) as vncorenlp_class:
                t = time.time()
                count = 0
                for text in text_list:
                    f.write(
                        f'{text}\t{self.vncorenlp_annotate(vncorenlp_class, text, mode)}\n'
                    )
                    count += 1
                    if time.time() - t > 1:
                        break
            print(count)

        else:
            raise Exception("Wrong request, please check your request")
        f.close()
 def __init__(
     self,
     path="/home/thanh/DATN/FakeNewDetection/vncorenlp/VnCoreNLP-1.1.1.jar"
 ):
     self.rdrsegmenter = VnCoreNLP(path,
                                   annotators="wseg",
                                   max_heap_size='-Xmx500m')
Example #5
 def __init__(self, device: torch.device):
     # print(os.getcwd())
     self.__device = device
     # print(device)
     self.__rdrsegmenter = VnCoreNLP(VnCoreNLP_JAR_PATH, annotators="wseg", max_heap_size='-Xmx500m')
     self.__tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
     self.__model = AutoModel.from_pretrained("vinai/phobert-base", output_hidden_states=True).to(self.__device)
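Only the constructor is shown; a hedged sketch of a sentence-embedding method defined in the same class (the method name embed and the mean-pooling choice are assumptions, not part of the original):

 def embed(self, text: str):
     # word-segment with VnCoreNLP, then mean-pool PhoBERT's last hidden state
     segmented = ' '.join(sum(self.__rdrsegmenter.tokenize(text), []))
     inputs = self.__tokenizer(segmented, return_tensors='pt').to(self.__device)
     with torch.no_grad():
         output = self.__model(**inputs)
     return output.last_hidden_state.mean(dim=1)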
Example #6
 def get_instance(cls):
     if cls._instance is None:
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         cls._instance = VnCoreNLP(os.path.join(cur_dir,
                                                'VnCoreNLP-1.1.1.jar'),
                                   annotators='wseg',
                                   max_heap_size='-Xmx500m')
     return cls._instance
Example #7
 def tokenize(self, raw_sentence: str):
     if self.vncore:
         if self.annotator is None:
             self.annotator = VnCoreNLP(VNCORENLP_ADDRESS, port=VNCORENLP_PORT)
         word_tokenizes = ' '.join(sum(self.annotator.tokenize(raw_sentence), []))
     else:
         word_tokenizes = raw_sentence
     return self.bpe.encode(word_tokenizes)
Example #8
 def get_instance(cls):
     if cls._instance is None:
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         cls._instance = VnCoreNLP(
             os.path.join(cur_dir, "VnCoreNLP-1.1.1.jar"),
             annotators="wseg",
             max_heap_size="-Xmx500m",
         )
     return cls._instance
Example #9
def main(args):

    print(
        "-" * 20,
        "START",
        "-" * 20,
    )
    nlp = args.nlp
    print("Initialize annotator...")
    # Change this to the real path of the VnCoreNLP jar file
    annotator = VnCoreNLP(nlp,
                          annotators="wseg,pos,ner,parse",
                          max_heap_size='-Xmx2g')
    DATA_PATH, MODEL_PATH = args.i, args.o

    # Variables
    num_feature = args.nfeature if args.nfeature else 256
    min_word_count = args.mincount if args.mincount else 2
    window_size = args.window if args.window else 2
    num_epochs = args.nepoch if args.nepoch else 50
    num_worker = multiprocessing.cpu_count()

    # Read corpus
    print("Reading data file...")
    raw_data = ut.read(DATA_PATH).split('\n')
    sentences_tokenized = []

    print("Tokenazing...")
    for line in raw_data:
        line = line.lower()
        word_segmented_text = annotator.tokenize(line)
        # f.write("%s\n" % word_segmented_text)
        for tokens in word_segmented_text:
            sentences_tokenized.append(tokens)

    print('Building model...')
    model = w2v.Word2Vec(size=num_feature,
                         min_count=min_word_count,
                         workers=num_worker,
                         window=window_size)

    model.build_vocab(sentences_tokenized)
    print("Vocabularies count is: %d" % len(model.wv.vocab))

    print("Training word2vec...")
    model.train(sentences=sentences_tokenized,
                total_examples=model.corpus_count,
                epochs=num_epochs)

    print('Build model successfully')
    print('Saving model...')
    os.makedirs(MODEL_PATH, exist_ok=True)
    model.save(os.path.join(MODEL_PATH, 'word2vec.w2v'))

    annotator.close()
    print('Done')
    return None
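Once word2vec.w2v has been saved, it can be reloaded for similarity queries; a small sketch (the model directory and query word are only illustrative):

import os
import gensim.models.word2vec as w2v

model = w2v.Word2Vec.load(os.path.join('path/to/model_dir', 'word2vec.w2v'))
print(model.wv.most_similar('bóng_đá', topn=5))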
Example #10
 def __init__(self):
     # path = os.path.dirname(os.path.realpath('__file__'))
     self.vncorenlp = VnCoreNLP(
         "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
         annotators="wseg, pos",
         max_heap_size='-Xmx500m')
     self.phoBERT = word_emb_phoBert.WordEmbeddings()
     self.SIF = sent_emb_sif.SentEmbeddings(self.phoBERT,
                                            lamda=1.0,
                                            embeddings_type='bert')
Example #11
def read_pages(start_page, end_page, doc_file):
    VNCORENLP_FILE_PATH = 'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    words = []
    doc = pdf2txt(doc_file, range(start_page - 1, end_page))

    # Use the context manager so the Java server is shut down automatically
    with VnCoreNLP(VNCORENLP_FILE_PATH) as vncorenlp:
        for para in doc:
            words.extend(vncorenlp.tokenize(para))

    return words
Example #12
 def annotate(self,
              text,
              annotators="wseg",
              output_format=None,
              properties=None,
              max_heap_size="-Xmx500m"):
     with VnCoreNLP(self.vncorenlp_file,
                    annotators=annotators,
                    max_heap_size=max_heap_size) as vncorenlp:
         result = vncorenlp.tokenize(text)
     return result
Example #13
 def __init__(self,
              vocab: Iterable[str] = [],
              stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
              do_lower_case: bool = False,
              vncorenlp_path=None):
     self.stop_words = set(stop_words)
     self.do_lower_case = do_lower_case
     self.set_vocab(vocab)
     self.vncorenlp_path = vncorenlp_path
     self.rdrsegmenter = VnCoreNLP(vncorenlp_path,
                                   annotators="wseg",
                                   max_heap_size='-Xmx1g')
Example #14
 def __init__(self, max_length=512):
     self.bpe = fastBPE(BPEConfig)
     self.vocab = Dictionary()
     self.vocab.add_from_file(os.path.join(os.getcwd(),
                                           'pretrained',
                                           'PhoBERT_base_transformers',
                                           'dict.txt'))
     self.rdr_segmenter = VnCoreNLP(
         os.path.join('vncorenlp', 'VnCoreNLP-1.1.1.jar'),
         annotators='wseg',
         max_heap_size='-Xmx500m'
     )
     self.max_length = max_length
def ppt2txt(filename):
    ppt = Presentation(filename)
    sentences = ""
    for slide in ppt.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                sentences += shape.text + ". "
    with VnCoreNLP(vncorenlp_file,
                   annotators="wseg",
                   max_heap_size='-Xmx4g',
                   quiet=False) as vncorenlp:
        split_sentence = vncorenlp.tokenize(sentences)
    return split_sentence
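ppt2txt relies on names defined elsewhere in its module; presumably something along these lines (the jar path is a placeholder):

from pptx import Presentation    # python-pptx
from vncorenlp import VnCoreNLP

vncorenlp_file = 'VnCoreNLP/VnCoreNLP-1.1.1.jar'  # adjust to the actual jar location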
Example #16
 def __init__(self, stopwords, ngrams=1, window_size=3, candidate_pos=["N", "Np"], num_keywords=5, use_vncorenlp=True):
     self.d = 0.85 # damping coefficient, usually is .85
     self.min_diff = 1e-5 # convergence threshold
     self.steps = 10 # iteration steps
     self.node_weight = None # save keywords and its weight
     self.ngrams = ngrams
     self.window_size = window_size
     self.candidate_pos = candidate_pos
     self.num_keywords = num_keywords
     self.stopwords = stopwords
     self.use_vncorenlp = use_vncorenlp
     if self.use_vncorenlp:
         self.annotator = VnCoreNLP(VNCORENLP_JAR_PATH, annotators="wseg,pos", max_heap_size='-Xmx2g')
Example #17
def vncorenlp_pos_tag(sentence):
    with VnCoreNLP(address='http://127.0.0.1', port=8888) as vn_core_nlp:
        tagged = vn_core_nlp.pos_tag(sentence)

    result = list()
    fs_tagged = tagged[0]

    for w in fs_tagged:
        parsed_w = {'txt': w[0], 'type': w[1]}

        result.append(parsed_w)

    return result
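This helper assumes a VnCoreNLP server is already listening on 127.0.0.1:8888; its output is a flat list of word/tag dicts, roughly like the illustration below (tags follow VnCoreNLP's tagset):

pos = vncorenlp_pos_tag('Tôi là sinh viên .')
# e.g. [{'txt': 'Tôi', 'type': 'P'}, {'txt': 'là', 'type': 'V'},
#       {'txt': 'sinh_viên', 'type': 'N'}, {'txt': '.', 'type': 'CH'}]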
Example #18
def analyse_data(corpus_path):
    '''
    return output data folder path
    '''

    try:

        vncorenlp_file = r'./VnCoreNLP/VnCoreNLP-1.1.1.jar'
        vncorenlp = VnCoreNLP(vncorenlp_file)
        print('Created VnCoreNLP object.')

        path = corpus_path.split('/')
        corpus_folder_path = '/'.join(path[:-1])
        corpus_filename = path[-1]
        print("corpus folder: %s" % corpus_folder_path)
        print("corpus filename: %s" % corpus_filename)
        output_data_folder_path = corpus_folder_path + '/output-data/'
        if not os.path.exists(output_data_folder_path):
            os.makedirs(output_data_folder_path)
            print("Created %s folder" % output_data_folder_path)

        fi = open(corpus_path, 'r')
        fo_token = open(output_data_folder_path + corpus_filename + '-token',
                        'w')
        print("Open %s" % corpus_path)
        print("Open %s" % fo_token.name)

        line_number = 0
        for line in fi:
            line_number += 1
        fi.close()
        fi = open(corpus_path, 'r')
        print('We have %d lines in our corpus.' % line_number)

        for count in tqdm(range(line_number)):

            sentences = fi.readline()
            fo_token.write(str(vncorenlp.tokenize(sentences)) + '\n')

        print('Finished analysing the data.')

    except Exception as e:
        raise
    finally:
        fi.close()
        fo_token.close()

        print("Close %s" % corpus_path)
        print("Close %s" % fo_token.name)

    return output_data_folder_path
Example #19
def Main():
    vncorenlp_file = VNCORENLP_FILE_PATH
    vncorenlp = VnCoreNLP(vncorenlp_file)
    
    f = open(TEXT_FILE_PATH, 'r', encoding='utf-8')
    text = f.read()
    f.close()

    tokenize = vncorenlp.tokenize(text)
    words, length = total_words_and_len(tokenize, punc, stopwords)

    tf = TF(words, length)
    idf = IDF(words, tokenize)
    tfidf = TFIDF(tf, idf)

    N = 20
    print(get_top(tfidf, N))

    vncorenlp.close()
Example #20
def load_phobert_model():

    device = torch.device("cpu")

    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default=paths.bpe_codes_path,
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')

    args = parser.parse_args()
    bpe = fastBPE(args)

    vn_tokenizer = VnCoreNLP(paths.vncore_jar_path,
                             annotators="wseg",
                             max_heap_size='-Xmx500m')

    # config model
    config = RobertaConfig.from_pretrained(paths.config_path,
                                           output_hidden_states=True,
                                           num_labels=3)

    model_bert = RobertaForAIViVN.from_pretrained(paths.pretrained_path,
                                                  config=config)
    # model_bert.cuda()

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file(paths.dict_path)
    '''
    if torch.cuda.device_count():
        print(f"Testing using {torch.cuda.device_count()} gpus")
        model_bert = nn.DataParallel(model_bert)
        tsfm = model_bert.module.roberta
    else:
        tsfm = model_bert.roberta
    '''

    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta

    model_bert.load_state_dict(
        torch.load(paths.phobert_path, map_location=device))

    return bpe, vn_tokenizer, model_bert, vocab
Example #21
def vncorenlp_dep_parse(paragraph):
    with VnCoreNLP(address='http://127.0.0.1', port=8888) as vn_core_nlp:
        tagged = vn_core_nlp.pos_tag(paragraph)
        parsed = vn_core_nlp.dep_parse(paragraph)

    result = []
    tokens = tagged[0]
    parsed = parsed[0]
    for idx, token in enumerate(tokens):
        w = {
            'txt': token[0],
            'type': token[1],
            'kind': parsed[idx][0],
            'dependence': parsed[idx][1]
        }
        result.append(w)
    return result
Example #22
def tokenized(infile, outfile):
    count = 0
    with VnCoreNLP(address='http://127.0.0.1', port=9000) as vncorenlp:
        with open(infile, encoding='utf-8') as file:
            with open(outfile, 'w', encoding='utf-8') as out:
                for line in file:
                    if line:
                        try:
                            word_seg = vncorenlp.tokenize(line)
                        except Exception:
                            time.sleep(5)
                            continue  # skip lines that fail to tokenize
                        for sent in word_seg:
                            seg = ' '.join(sent)
                            out.writelines(seg + '\n')
                        print('done line ' + str(count))
                    count += 1
    print(f'done {infile}')
def tokenized(infile, outfile):
    count = 0

    with VnCoreNLP(address='http://127.0.0.1', port=9000) as vncorenlp:
        with jsonlines.open(infile + '.json') as file:
            with open(outfile, 'w', encoding='utf-8') as out:
                for obj in file:
                    # tags = obj['tags']
                    if len(obj['tags']) > 1:
                        for tag in obj['tags']:
                            tag = tag.strip()
                            for _ in range(5):
                                try:
                                    tag = vncorenlp.tokenize(tag)
                                    break
                                except Exception:
                                    print("retry")
                                    time.sleep(5)
                            if len(tag) > 1:
                                print("NOPE")
                                for t in tag:
                                    label = ' '.join(t)
                            else:
                                label = ' '.join(tag[0])
                                label = re.sub(r' ', '-', label)
                            # out.write('__label__%s ' % label)
                            out.write(f'__label__{label} ')
                        for _ in range(5):
                            try:
                                word_seg = vncorenlp.tokenize(obj['title'])
                                break
                            except Exception:
                                print('retry')
                                time.sleep(5)
                        if len(word_seg) > 1:
                            print("NOPE-2")
                            for sent in word_seg:
                                seg = ' '.join(sent)
                                out.write(seg + ' ')
                        else:
                            seg = ' '.join(word_seg[0])
                            out.write(f'{seg} ')
                        out.write('\n')
                    print('done line {0}'.format(count))
                    count += 1
    def loadModel(self):
        parser = argparse.ArgumentParser(description='Process some integers.')
        parser.add_argument('--bpe-codes',
                            type=str,
                            help='path to fastBPE BPE',
                            default=self.BPE_PATH)
        args = parser.parse_args("")

        phoBERT = RobertaModel.from_pretrained(self.MODEL_PATH,
                                               checkpoint_file='model.pt')
        phoBERT.eval()
        phoBERT.bpe = fastBPE(args)

        rdrsegmenter = VnCoreNLP(self.VNCORENLP_PATH,
                                 annotators="wseg",
                                 max_heap_size='-Xmx500m')

        return phoBERT, rdrsegmenter
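A hedged sketch of using the returned pair for feature extraction (encode and extract_features are fairseq RobertaModel methods; the instance name and sentence are illustrative):

phoBERT, rdrsegmenter = model_wrapper.loadModel()  # model_wrapper: an instance of the class above
segmented = ' '.join(sum(rdrsegmenter.tokenize('Tôi là sinh viên .'), []))
tokens = phoBERT.encode(segmented)
features = phoBERT.extract_features(tokens)        # tensor of shape (1, seq_len, hidden_size)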
Example #25
File: ner.py  Project: lstrgiang/NER
 def vn_ner(self):
     annotator = VnCoreNLP(address=DEFAULT_LOCAL_ADDRESS,
                           port=DEFAULT_VI_NER_PORT)
     for line in self.textMap.keys():
         taggedText = annotator.annotate(line)
         try:
             taggedText = taggedText['sentences'][0]
             for value in taggedText:
                 if value['nerLabel'] in ['B-PER', 'I-PER']:
                     self.textMap[line][self.PER_KEY] += 1
                 if value['nerLabel'] in ['B-LOC', 'I-LOC']:
                     self.textMap[line][self.LOC_KEY] += 1
                 if value['nerLabel'] in ['B-ORG', 'I-ORG']:
                     self.textMap[line][self.ORG_KEY] += 1
         except Exception as e:
             print("Unable to anotate " + str(line))
             print(e)
             return e
Example #26
def vn_format_to_json(args):
    stories_dir = os.path.abspath(args.raw_path)
    tokenized_stories_dir = os.path.abspath(args.save_path)

    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = glob.glob(pjoin(args.raw_path, '*.txt'))
    annotator = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

    dataset = []
    for s in stories:
        tgt = []
        source = []
        flag = False
        f = open(s, encoding='utf-8')  # glob already returns usable paths
        for line in f:
            if line == '\n':
                continue
            if line == '@highlight\n':
                flag = True
                continue
            tokens = annotator.tokenize(line)
            if flag:
                tgt.extend(tokens)
            else:
                source = tokens
        dataset.append({"src": [clean(' '.join(sent)).split() for sent in source],
                        "tgt": [clean(' '.join(sent)).split() for sent in tgt]})

    print("Tokenizing %i files in %s" % (len(stories), stories_dir))
    print("VNCoreNLP Tokenizer has finished.")

    valid_test_ratio = 0.1
    all_size = len(dataset)
    test_sets = dataset[:int(all_size * valid_test_ratio)]
    valid_sets = dataset[int(all_size * valid_test_ratio):int(all_size * valid_test_ratio * 2)]
    train_sets = dataset[int(all_size * valid_test_ratio * 2):]
    corpora = {'train': train_sets, 'valid': valid_sets, 'test': test_sets}
    for corpus_type in ['train', 'valid', 'test']:
        p_ct = 0
        for split in [corpora[corpus_type][i * args.shard_size:(i + 1) * args.shard_size] for i in range((len(corpora[corpus_type]) + args.shard_size - 1) // args.shard_size)]:
            pt_file = pjoin(args.save_path, corpus_type + '.' + str(p_ct) + '.json')
            with codecs.open(pt_file, 'w', encoding='utf-8') as save:
                json.dump(split, save, ensure_ascii=False)
            p_ct += 1
Example #27
    def __init__(self, bpe_path: str, vncorenlp_path: str, do_lower_case: bool = False):
        bpe_codes_path = os.path.join(bpe_path, BPECODE_FILE)
        vocab_file_path = os.path.join(bpe_path, VOCAB_FILE)
        
        if not os.path.isfile(bpe_codes_path):
            raise EnvironmentError(f"{BPECODE_FILE} not found in {bpe_path}")
            
        if not os.path.isfile(vocab_file_path):
            raise EnvironmentError(f"{VOCAB_FILE} not found in {bpe_path}")

        self.do_lower_case = do_lower_case
        
        BPEConfig = namedtuple('BPEConfig', 'vncorenlp bpe_codes vocab')

        self.pho_config = BPEConfig(vncorenlp=vncorenlp_path, bpe_codes=bpe_codes_path, vocab=vocab_file_path)
        self.rdrsegmenter = VnCoreNLP(self.pho_config.vncorenlp, annotators="wseg", max_heap_size='-Xmx1g')
        self.bpe = fastBPE(self.pho_config)
        self.vocab = Dictionary()
        self.vocab.add_from_file(self.pho_config.vocab)
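Only the constructor is shown; a hedged sketch of an encoding method built on these attributes (the method name encode and the <s>/</s> wrapping follow the usual PhoBERT-with-fairseq pattern and are assumptions, not part of the original):

    def encode(self, text: str):
        # word-segment, apply BPE, then map sub-words to vocabulary ids (fairseq API)
        if self.do_lower_case:
            text = text.lower()
        segmented = ' '.join(sum(self.rdrsegmenter.tokenize(text), []))
        subwords = '<s> ' + self.bpe.encode(segmented) + ' </s>'
        return self.vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long()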
Example #28
def tokenized_file(infile, outfile):
    count = 0
    with VnCoreNLP(address='http://127.0.0.1', port=9000) as vncorenlp:
        df = pd.read_csv(infile)
        for i, row in df.iterrows():
            try:
                word_seg = vncorenlp.tokenize(row['title'])
            except Exception:
                time.sleep(5)
                continue  # skip rows that fail to tokenize
            seg = list(itertools.chain.from_iterable(word_seg))
            sent = ' '.join(seg)
            df.at[i, 'title'] = sent
            print(f'done line {count}')
            count += 1
        df.to_csv(outfile, header=None, index=None)
        # df['title'] = df['title'].apply(lambda row: tokenizer, axis=1)
        # df.to_csv(outfile, mode='a', header=None)

    print(f'done {infile}')
Example #29
def nlp_tokenize(path):
    data = pd.read_excel(path)
    data = data[['ID', 'Content', 'ID người đăng']]
    data = data.dropna()
    data['Content'] = data['Content'].str.strip()
    data['Content'] = data['Content'].str.lower()
    data['status'] = data['Content']

    data['status'] = data['Content'].apply(lambda text: re.sub(r'\W+', ' ', text))
    data['Content'] = data['status']
    vncorenlp_file = r'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    vncorenlp = VnCoreNLP(vncorenlp_file)
    # content = vncorenlp.tokenize(content)
    data['status'] = data['status'].apply(vncorenlp.tokenize)
    key_word = []
    for i in data['status']:
        key_word = key_word + i

    vncorenlp.close()
    return key_word, data[['Content', 'ID', 'ID người đăng']]
def get_data_tokenized(file, out):
    count_line = 1
    with VnCoreNLP(address='http://127.0.0.1', port=9000) as vncorenlp:
        with jsonlines.open(file) as infile:
            with open(out, 'w') as outfile:
                for obj in infile:
                    category = obj['category']
                    category = category.strip()
                    for _ in range(5):
                        try:
                            category = vncorenlp.tokenize(category)
                            break
                        except Exception:
                            print("retry")
                            time.sleep(5)
                    label = ' '.join(category[0])
                    label = re.sub(r' ', '-', label)
                    outfile.write(f'__label__{label} ')
                    for _ in range(5):
                        try:
                            word_seg = vncorenlp.tokenize(obj['title'])
                            break
                        except Exception:
                            print('retry')
                            time.sleep(5)
                    if len(word_seg) > 1:
                        lis = []
                        for sent in word_seg:
                            lis.append(' '.join(sent))
                    else:
                        lis = [' '.join(word_seg[0])]
                    seg = ' '.join(lis)
                    outfile.write(f'{seg} ')
                    outfile.write('\n')
                    print('done line {0}'.format(count_line))
                    count_line += 1
                print(f'wrote to {out}')