def post():  # login home page
    f = request.files['file']
    input_text = request.form['input_txt'].encode("utf-8-sig")
    filename = f.filename
    uname = session['username']
    if request.form['click'] == '上传文件':
        # record user ip, date, user_name
        if filename:
            curr_time = datetime.datetime.now()
            if '.txt' in filename:
                content = f.read().decode(encoding="utf-8-sig")
                sentences = [sentence for sentence in SentenceSplitter.split(content) if sentence]
                # Id = uni_id()
                for i in sentences:
                    sentences_collection.insert({'username': uname, 'filename': filename.replace('.txt', ''),
                                                 'sentence': i, 'uploadTime': curr_time, 'casualtag': '0',
                                                 'eventtag': '0', 'cflag': '0', 'eflag': '0'})
                return render_template("upload_succeed.html", suc_message='upload successful')
            else:
                err_message = 'text format not right!'
                return render_template("upload_error.html", err_message=err_message)
    else:
        curr_time = datetime.datetime.now()
        content = input_text
        sentences = [sentence for sentence in SentenceSplitter.split(content) if sentence]
        for i in sentences:
            sentences_collection.insert({'username': uname, 'filename': '输入上传', 'sentence': i,
                                         'uploadTime': curr_time, 'casualtag': '0', 'eventtag': '0',
                                         'cflag': '0', 'eflag': '0'})
        return render_template("upload_succeed.html", suc_message='upload successful')

def __init__(self):
    """
    Initialize the segmenter, POS tagger, parser, and rule resources.
    """
    self.sen_split = SentenceSplitter()
    self.seg = Segmentor()
    self.seg.load_with_lexicon(CWS_MODEL, "resource/lexicon")
    self.pos = Postagger()
    self.pos.load_with_lexicon(POS_MODEL, "resource/lexicon")
    self.parser = Parser()
    self.parser.load(PARSER_MODEL)
    self.rule = IterDocument("resource/rule")

def __init__(self):
    """
    Initialize the LTP models.
    """
    self.seg_sent = SentenceSplitter()          # sentence splitting
    self.seg = Segmentor()                      # word segmentation
    self.seg.load(cws_model_path)
    self.pos = Postagger()                      # part-of-speech tagging
    self.pos.load(pos_model_path)
    self.ner = NamedEntityRecognizer()          # named entity recognition
    self.ner.load(ner_model_path)
    self.par = Parser()                         # dependency parsing
    self.par.load(par_model_path)
    self.similar_word = load_similar_word()     # load the similar-word list

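A minimal usage sketch, not from the source: assuming the __init__ above belongs to a wrapper class, a hypothetical method such as analyze below could chain the loaded models (seg_sent, seg, pos, ner, par), and a release helper could free the native resources.

def analyze(self, text):
    # hypothetical helper: run the full LTP pipeline loaded in __init__ on each sentence
    results = []
    for sent in self.seg_sent.split(text):                  # sentence splitting
        words = list(self.seg.segment(sent))                # word segmentation
        postags = list(self.pos.postag(words))              # POS tagging
        netags = list(self.ner.recognize(words, postags))   # named entities
        arcs = self.par.parse(words, postags)               # dependency parsing
        results.append((words, postags, netags,
                        [(arc.head, arc.relation) for arc in arcs]))
    return results

def release(self):
    # hypothetical cleanup: pyltp models hold native resources, so release them when done
    self.seg.release()
    self.pos.release()
    self.ner.release()
    self.par.release()
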
def __init__(self):
    """
    Fetch the remote lexicon and load the LTP models.
    """
    self.temp_lexicon = "temp_lexicon"
    self.fetch_lexicon()
    self.sentence_splitter = SentenceSplitter()
    self.segment = Segmentor()
    self.segment.load_with_lexicon(CWS_MODEL, self.temp_lexicon)
    self.pos = Postagger()
    self.pos.load_with_lexicon(POS_MODEL, self.temp_lexicon)
    self.tree_parser = Parser()
    self.tree_parser.load(PARSER_MODEL)

def collect_infos(word):
    infos = hudong.info_extract_hudong(word)
    for info in infos:
        intro_sents = [sent for sent in SentenceSplitter.split(info['intro']) if len(sent) > 0]
        desc_sents = [sent for sent in SentenceSplitter.split(info['desc']) if len(sent) > 0]
        print(intro_sents)
        print('****' * 5)
        print(desc_sents)

def tokenized_sub_sents(row):
    content = row.iloc[1]
    sub_sents = []
    sub_tags = []
    sents = SentenceSplitter.split(content)
    for sent in sents:
        subs = [x for x in re.split(punc_pattern, sent) if x]
        subss = [jieba.posseg.cut(x, HMM=False) for x in subs if not re.findall(no_chn, x)]
        tags = []
        subs = []
        for s in subss:
            tag = []
            sub = []
            for t0, t1 in s:
                tag.append(t1)
                sub.append(t0)
            tags.append(tag)
            subs.append(sub)
        assert len(tags) == len(subs)
        sub_sents.extend(subs)
        sub_tags.extend(tags)
    # print(sub_sents, sub_tags)
    row["sub_sents_tokenized"] = sub_sents
    row["sub_sents_postagged"] = sub_tags
    return row

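A usage sketch under stated assumptions, not from the source: tokenized_sub_sents is written as a row-wise function, so it would normally be driven by DataFrame.apply(axis=1). The DataFrame, the example text, and the definitions of punc_pattern and no_chn below are all assumptions made for illustration.

import re

import jieba.posseg
import pandas as pd
from pyltp import SentenceSplitter

punc_pattern = r"[,,;;]"          # assumed: delimiters for sub-sentence splitting
no_chn = r"^[^\u4e00-\u9fa5]+$"    # assumed: matches segments with no Chinese characters

# assumed toy DataFrame; the raw text is expected in the second column (row.iloc[1])
df = pd.DataFrame({"id": [1], "content": ["今天天气不错,我们出去走走。"]})
df = df.apply(tokenized_sub_sents, axis=1)
print(df["sub_sents_tokenized"].iloc[0])
print(df["sub_sents_postagged"].iloc[0])
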
def get_roles_emotions(self):
    '''
    Get the characters in self.book and the indicators of their emotion changes.
    :return:
    '''
    contents = self.get_chapterList()
    roles = self.get_roles()
    role_points = [[] for i in range(len(roles))]
    for content in contents:
        sents = SentenceSplitter.split(content)
        sent_words = []
        for sent in sents:
            words = list(self.segmentor.segment(sent))
            for role in roles:
                if role in words:
                    sent_words.append(words)
                    break
        role_emotionWords = self.get_role_emotionWord(roles, sent_words)
        points = self.count_point(role_emotionWords, roles)
        for i in range(len(role_points)):
            role_points[i].append(points[i])
    for term in range(len(role_points)):
        index = len(role_points[term]) - 2
        while index >= 1:
            if role_points[term][index] == 0 or role_points[term][index] == 0.0:
                role_points[term][index] = (role_points[term][index - 1] + role_points[term][index + 1]) / 2
            index -= 1
    for i in range(len(role_points)):
        print(roles[i], role_points[i])
    return roles, role_points

def getDAN(self, path="测试测试啊哈哈哈"):
    '''
    Read the text files and collect the DAN list.
    :return:
    '''
    # txts = []
    dan_list = []
    files = os.listdir(self.book_root_path)
    for file in files:
        fileposition = self.book_root_path + "\\" + file
        print("file name:", fileposition)
        with open(fileposition, "r", encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                # txts.append(line)
                if line != "":
                    sentences = SentenceSplitter.split(line)
                    # print("sentences:", sentences)
                    for sentence in sentences:
                        words = self.segmentor(sentence)
                        # print("sentences 11:", words)
                        dan_list_line = self.postagger(words)
                        dan_list += dan_list_line
        f.close()
    return list(set(dan_list))

# def getDAN(self):
'''

def mainName(self, dirName):
    txtlist = os.listdir(self.book_root_path + dirName)
    name_list_book = []
    for txt in txtlist:
        name_list_chapter = []
        print(txt)
        lines = self.readBookLines(self.book_root_path + dirName + "/" + txt)
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    postags_line, name_list_line = self.posttaggerNH(words_line)
                    name_list_chapter += name_list_line
        # per-chapter top 10
        top_itf_chapter, top_name_chapter = self.getTopTen(name_list_chapter)  # [(name, times, freq), ...]
        name_list_book += top_name_chapter
        self.writeTxt(self.mainrole_root_path + dirName + "/" + txt, top_itf_chapter)
        print(txt + " chapter top 10----------------------")
        for cname, ctimes, cfreq in top_itf_chapter:
            print(cname, ctimes, cfreq)
    # whole-book top 10
    top_itf_book, top_name_book = self.getTopTen(name_list_book)
    self.writeTxt(self.mainrole_root_path + dirName + "/AllChapter.txt", top_itf_book)
    print("whole book top 10----------------------")
    for bname, btimes, bfreq in top_itf_book:
        print(bname, btimes, bfreq)

def Relation_Extraction(content):
    statements = SentenceSplitter.split(str(content))
    relation = json.load(open(resource_path + 'all_relations.json'))
    all_triples = []
    for statement in statements:
        # print(statement)
        words, postags, arcs = ltp_parser.parse(statement)
        triples = rule_based_extraction(words, postags, arcs)
        tri = []
        for i in range(len(triples)):
            flag = 0
            for j in range(len(tri)):
                if triples[i][0] == tri[j][0] and triples[i][1] == tri[j][1] and triples[i][2] == tri[j][2]:
                    flag = 1
            if triples[i][0] == triples[i][2]:
                flag = 1
            if flag == 0:
                tri.append(triples[i])
        flag1 = 0
        if tri:
            # print('relation triples:')
            for triple in tri:
                for i in relation:
                    if triple[1] == i:
                        flag1 = 1
                        break
                if flag1 == 1:
                    all_triples.append(triple)
    return all_triples

def feature_about():
    # get the feature list
    feature_dict = NewsUtil.get_feature()
    # for every feature that appears in a news item, collect the following words and their attributes
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)  # load the model; the second argument is the external lexicon path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)

def sentence_splitter():
    """
    Sentence splitting demo.
    """
    sentence = '你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。'
    sents = SentenceSplitter.split(sentence)  # split into sentences
    print("\n".join(sents))

def get_sentences(self, news):
    """
    Split the news text into sentences.
    :param news: str, the news text
    :return: list, the sentences
    """
    return list(SentenceSplitter.split(news))

def __handel(self, news):
    news = TextHandle.cht_to_chs(news)
    sentences = []
    for line in news.strip().split('\n'):
        sentences += list(SentenceSplitter.split(line.strip()))
        # sentences += [line.strip()]
    return sentences

def split_sentences(text):
    '''
    Split a text that may contain several sentences into individual sentences.
    :param text: the input text
    '''
    return SentenceSplitter.split(text)

def sentence_split(read_file):
    """
    Split the sentences of each paragraph on punctuation.
    :param read_file: the opened txt file
    :return: the resulting sentences are stored in `sentences`, so only a status message is returned
    """
    for paragraph in read_file.readlines():
        # paragraphs that are too short are not worth splitting
        if paragraph == '' or len(paragraph) <= 4:
            continue
        sentence_splitter = SentenceSplitter.split(paragraph)
        for sequence in sentence_splitter:
            # skip empty lines
            if sequence == '':
                continue
            # second-level split on commas
            second_sentences = re.split('[,,]', sequence)
            for second_sentence in second_sentences:
                # filter the candidate sentence
                second_sentence = deal_data(second_sentence)
                if second_sentence == '' or len(second_sentence) <= 4:
                    continue
                sentences.add(second_sentence)
    msg = "sentence splitting finished"
    print("=" * 10, msg, "=" * 10)
    return msg

def split(content):
    global _segmentor, _sent_splitter
    if _segmentor is None:
        model_path = r''
        segmentor = Segmentor()
        segmentor.load(model_path)
        _segmentor = segmentor
        _sent_splitter = SentenceSplitter()
    sents = _sent_splitter.split(content)
    _sents = []
    for sent in sents:
        words = _segmentor.segment(sent)
        sent = ' '.join(words)
        _sents.append(sent)
    content = '. '.join(_sents)
    return content

def main():
    file_list = []
    path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        english_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_url = loss_url(text=english_text_connect)
        assetPath_loss_img = loss_img(text=assetPath_loss_url)
        assetPath_loss_video = loss_video(text=assetPath_loss_img)
        assetPath_loss_src = loss_src(text=assetPath_loss_video)
        assetPath_loss_div = loss_div(text=assetPath_loss_src)
        assetPath_loss_span = loss_span(text=assetPath_loss_div)
        assetPath_loss_p = loss_p(text=assetPath_loss_span)
        assetPath_loss_special = loss_special(text=assetPath_loss_p)
        assetPath_loss_continue = loss_continue(text=assetPath_loss_special)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_chino = loss_chino(text=assetPath_loss_word)
        assetPath_loss_greek = loss_greek(text=assetPath_loss_chino)
        assetPath_loss_pinyin = loss_pinyin(text=assetPath_loss_greek)
        assetPath_loss_fake = loss_fake(text=assetPath_loss_pinyin)
        assetPath_loss_tradition = loss_tradition(text=assetPath_loss_fake)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_tradition)

        # split into sentences
        english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;??!!]")
        f = open("english_text_sentence_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(english_text_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

class LtpSegment(object):
    """LTP helper for document sentence splitting and sentence segmentation."""
    __model_dir = os.path.join('source', 'ltp_data_v3.4.0')

    # sentence splitter
    splitter = SentenceSplitter()

    # word segmenter
    segmentor = Segmentor()
    segmentor.load(os.path.join(__model_dir, "cws.model"))

    def split(self, document):
        """
        Split a long document into sentences.
        :param document: str
        :return: list of sentences
        """
        sentences = self.splitter.split(document)
        return [sentence for sentence in sentences if len(sentence) > 0]

    def segment(self, sentence):
        """
        Segment a sentence into words.
        :param sentence: str
        :return: list of words
        """
        words = self.segmentor.segment(sentence)
        return list(words)

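A hypothetical usage example for the LtpSegment helper above, not from the source: it assumes cws.model is present under source/ltp_data_v3.4.0, and the sample document is made up.

ltp = LtpSegment()
document = "中国进出口银行与中国银行加强合作。合作内容涉及多个业务领域。"
for sent in ltp.split(document):
    # print the word list of each sentence
    print(ltp.segment(sent))
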
def ltpSentenceSplit(content):
    """
    Split the content into sentences with LTP.
    :param content:
    :return:
    """
    return list(SentenceSplitter.split(content))

def main():
    file_list = []
    path = r'C:/Users/JeremySun/Desktop/Internship/Project02_corpusProcessor/app'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        app = open(path, encoding='utf-8').readlines()
        assetPath_text = get_assetPath(text=app)
        assetPath_loss_html = loss_html(text=assetPath_text)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;::??!!]")  # added ::
        f = open("app_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(assetPath_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

def mainLocaltion(self, dirName="西游记白话文"):
    txtlist = os.listdir(self.book_root_path + dirName)
    lo_list_book = []
    for txt in txtlist:
        lo_list_chapter = []
        print(txt)
        lines = self.readBookLines(self.book_root_path + dirName + "/" + txt)
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    lo_list_line = self.posttagerNLNS(words_line)
                    lo_list_chapter += lo_list_line
        # per-chapter top 10
        top_itf_chapter, top_lo_chapter = self.getTopTen(lo_list_chapter)
        lo_list_book += top_lo_chapter
        self.writeTxt(self.mainlo_root_path + dirName + "/" + txt, top_itf_chapter)
        print(txt + " chapter top 10----------------------")
        for cloname, clotimes, clofreq in top_itf_chapter:
            print(cloname, clotimes, clofreq)
    # whole-book top 10
    top_loitf_book, top_lo_book = self.getTopTen(lo_list_book)
    self.writeTxt(self.mainlo_root_path + dirName + "/AllChapter.txt", top_loitf_book)
    print("whole book top 10----------------------")
    for bloname, blotimes, blofreq in top_loitf_book:
        print(bloname, blotimes, blofreq)

def get_sentence_list(self):
    """
    Split self.sentence into a list of sentences.
    :return:
    """
    sents = SentenceSplitter.split(self.sentence)
    return [s for s in sents if s]

def main():
    file_list = []
    path = r'D:\实习数据备份\备份\xinwenshuju'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        try:
            news_text_connect = open(path, encoding='utf-8').readlines()
            assetPath_loss_html = loss_html(text=news_text_connect)
            assetPath_loss_label = loss_label(text=assetPath_loss_html)
            assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
            assetPath_loss_other = loss_other(text=assetPath_loss_mail)
            assetPath_loss_url = loss_url(text=assetPath_loss_other)
            assetPath_clean_url = clean_url(text=assetPath_loss_url)
            assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
            assetPath_loss_word = loss_word(text=assetPath_loss_continue)
            assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

            # split into sentences
            assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

            # strip the remaining punctuation and write to file
            pattern_all = re.compile(r"[。.;;::??!!|]")  # added ::
            f = open("news_pre.txt", 'a', encoding='utf-8')
            for i in tqdm(assetPath_sentence):
                i = re.sub(pattern=pattern_all, repl='', string=i)
                f.write(i + '\n')
            f.close()
        except:
            print("utf-8 codec can not decode byte 0xc3 in position 0")

def pad_batch(batch_docs, TEXT):
    res_batch_docs = []
    max_words, max_sents = 0, 0
    res_batch_targets = []
    for doc in batch_docs:
        doc_text = doc[2]
        res_doc_text = []
        # use LTP to split the document into sentences
        sents = SentenceSplitter.split(doc_text)
        max_sents = max(max_sents, len(sents))
        for i, sent in enumerate(sents):
            sent = TEXT.preprocess(sent)
            sent = [TEXT.vocab.stoi[word] for word in sent]
            max_words = max(max_words, len(sent))
            res_doc_text.append(sent)
        res_batch_docs.append(res_doc_text)
        res_batch_targets.append(doc[1])
    for doc in res_batch_docs:
        sents = doc
        for sent in sents:
            while len(sent) < max_words:
                sent.append(0)
        while len(sents) < max_sents:
            sents.append([0 for _ in range(max_words)])
    return torch.LongTensor(res_batch_docs), torch.LongTensor(res_batch_targets)

def main():
    file_list = []
    path = r'D:\实习数据备份\备份\professional'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        profession_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_html = loss_html(text=profession_text_connect)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;::??!!/|]")  # added ::
        f = open("profession_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(assetPath_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

def article_preprocessing(article):
    """
    Preprocess a news article: split it into sentences and run word segmentation and POS tagging.
    The segmentation keeps stopwords and punctuation.
    Args:
        article: the raw news Article
    Return:
        the preprocessed PreprocessArticle
    """
    # print("article preprocessing")
    pre_article = PreprocessArticle()
    # pre_article.id = article.id
    pre_article.date = article.time
    pre_article.title = article.title
    pre_article.seg_pos_title = seg_pos.cut(article.title)
    sentences = SentenceSplitter.split(article.content)  # split sentences with LTP
    sentences = [x for x in sentences if x != '']
    for idx, x in enumerate(sentences):
        sentence = Sentence()
        sentence.text = x
        sentence.location = idx
        sentence.seg_pos = seg_pos.cut(x)
        pre_article.sentences.append(sentence)
    return pre_article

def predict(self, question, text):
    input_question = self.sentence2input(question)
    sentences = list(SentenceSplitter.split(text))
    input_answers_temp = [self.sentence2input(sentence) for sentence in sentences]
    input_answers_len = np.array([len(a) for a in input_answers_temp])
    input_answers = tl.prepro.pad_sequences(input_answers_temp)
    input_questions = tl.prepro.pad_sequences([input_question] * len(sentences))
    input_question_len = np.array([len(q) for q in input_questions])
    simi_list = self.predict_sess.run(
        self.simi_P,
        feed_dict={
            self.Ques: input_questions,
            self.Ques_seq_len: input_question_len,
            self.Pos_Ans: input_answers,
            self.Pos_Ans_len: input_answers_len,
            self.keep_prob: 1.0
        })
    # for sen, simi in zip(sentences, simi_list):
    #     print(simi, sen)
    simi_list = np.array(simi_list)
    return sentences[np.argmax(simi_list)], max(simi_list)

def main():
    file_list = []
    path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_folder'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        english_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_html = loss_html(text=english_text_connect)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;??!!::]")  # added ::
        pattern_last = re.compile(r'[a-zA-Z0-9]{13,}')
        f = open("english_text_sent_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(english_text_sentence):
            if len(i) <= 100:
                i = re.sub(pattern=pattern_all, repl=' ', string=i)
                i = re.sub(pattern=pattern_last, repl='', string=i)
                f.write(i.strip() + '\n')
        f.close()

def split2sent(self, text):
    '''
    Split the text into sentences.
    '''
    from pyltp import SentenceSplitter
    sents = SentenceSplitter.split(text)
    return sents

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

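A hedged continuation sketch, not part of the original script: it prints the dependency arcs, runs the NamedEntityRecognizer that is already imported above (assuming an ner.model file sits in the same model directory), and releases the models; the semantic role labeller is left out here.

# print head index and relation label for each dependency arc
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))  # assumes ner.model is shipped with the ltp_data models
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

# free the native resources held by the loaded models
segmentor.release()
postagger.release()
parser.release()
recognizer.release()
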