示例#1
0
def load_sentences(path, lower, zero):
    """Load training samples from a tagged file; one sentence is one sample.

    Each non-empty line holds a character and its tag separated by
    whitespace (for example ``长 B-Dur``). Sentences are separated by empty
    lines.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zero: when truthy, every line is passed through ``zero_digits``
        (a helper defined elsewhere; per the original comments it replaces
        each digit with 0)
    :return: list of sentences, each a list of ``[char, tag]`` pairs, e.g.
        ``[[['无', 'O'], ['长', 'B-Dur'], ['期', 'I-Dur'], ...]]``
    :raises AssertionError: if a line does not yield exactly two fields
    """
    sentences = []
    sentence = []

    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r', encoding='utf8') as handle:
        for line_no, raw in enumerate(handle, 1):
            line = raw.rstrip()
            if zero:
                line = zero_digits(line)

            if not line:
                # An empty line terminates the current sentence.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # The line starts with a space, so split() would drop the
                # first field; store a placeholder and take everything after
                # the second character as the tag.
                word_pair = ["<unk>", line[2:]]
            else:
                word_pair = line.split()
            assert len(word_pair) == 2, (
                "%s:%d: expected 2 fields, got %r" % (path, line_no, word_pair))
            sentence.append(word_pair)

    # The last sentence may not be followed by an empty line.
    if sentence:
        sentences.append(sentence)

    return sentences
示例#2
0
def load_sentences(path, lower, zeros):
    """Load sentences from a whitespace-separated tagged file.

    A line must contain at least a word and its tag. Sentences are separated
    by empty lines. A sentence whose first token contains ``DOCSTART`` is
    discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences; each sentence is a list of token lists
        produced by ``str.split`` (word first, tag(s) after)
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # ``with`` guarantees the handle is closed, including on assertion errors.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Blank line: flush the sentence collected so far.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # A leading space would make split() drop the first field;
                # substitute "$" so a placeholder word is kept.
                line = "$" + line[1:]
            word = line.split()
            assert len(word) >= 2, (
                "line %d of %s has fewer than 2 fields: %r" % (num, path, word))
            sentence.append(word)

    # The file may end without a trailing blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#3
0
def load_sentences(path, lower, zeros):
    """Load sentences. A line must contain at least a word and its tag.

    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are skipped.

    :param path: the data path (file is read as UTF-8)
    :param lower: whether to lower-case (not used here)
    :param zeros: whether to pass each line through ``zero_digits``
        (helper defined elsewhere)
    :return: nested list indexed as [sentence][token][field]; each token is
        the ``str.split`` result of one line
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []

    def flush():
        # Store the pending sentence unless it is a DOCSTART marker line.
        if sentence and 'DOCSTART' not in sentence[0][0]:
            sentences.append(list(sentence))
        del sentence[:]

    # Context manager so the file is closed even when validation fails.
    with codecs.open(path, 'r', 'utf8') as stream:
        for num, raw in enumerate(stream, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                flush()
                continue
            if line[0] == " ":
                # Keep a "$" placeholder; otherwise split() would swallow
                # the leading space and lose the first field.
                line = "$" + line[1:]
            word = line.split()
            assert len(word) >= 2, (
                "%s, line %d: expected a word and a tag, got %r"
                % (path, num, word))
            sentence.append(word)

    # Handle a final sentence that is not followed by an empty line.
    flush()
    return sentences
示例#4
0
def load_sentences(path, lower, zeros):
    """Load sentences made of ``[word, tag]`` pairs from a tagged file.

    Every non-empty line must split into exactly two whitespace-separated
    fields. Empty lines separate sentences.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere; per the original comment
        it turns digits into 0)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    :raises AssertionError: if a non-empty line does not have two fields
    """
    sentences = []
    sentence = []
    # Use a context manager so the file handle is always released.
    with open(path, 'r', encoding='utf8') as handle:
        for num, raw in enumerate(handle, 1):
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Empty line: close the current sentence, if any.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # A leading space would be swallowed by split(); replace it
                # with "$" so the line still yields two fields.
                line = "$" + line[1:]
            word = line.split()
            assert len(word) == 2, (
                "%s:%d: expected 2 fields, got %r" % (path, num, word))
            sentence.append(word)

    # NOTE(review): only the trailing sentence is checked for 'DOCSTART';
    # sentences closed by an empty line are kept unconditionally. Behavior
    # preserved as found -- confirm whether this asymmetry is intended.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#5
0
def load_sentences(path, lower, zeros):
    """Read a tagged corpus and group its lines into sentences.

    Each non-empty line is split on whitespace and must yield at least two
    fields (word and tag). Empty lines end a sentence. Sentences whose first
    token contains ``DOCSTART`` are dropped.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of per-line field lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # The ``with`` block closes the file on normal exit and on errors.
    with codecs.open(path, 'r', 'utf8') as corpus:
        for num, raw in enumerate(corpus, 1):
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if line:
                if line[0] == " ":
                    # Preserve a placeholder word for lines that begin with
                    # a space, which split() would otherwise discard.
                    line = "$" + line[1:]
                word = line.split()
                assert len(word) >= 2, (
                    "malformed line %d in %s: %r" % (num, path, word))
                sentence.append(word)
            elif sentence:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []

    # Final sentence without a terminating blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#6
0
def load_sentences(path, lower, zeros):
    """Load sentences from a whitespace-separated tagged file.

    Sentences are separated by empty lines. Lines are expected to contain a
    word and its tag, but this variant does not validate the field count:
    whatever ``str.split`` returns for a line is stored as-is. Sentences
    whose first token contains ``DOCSTART`` are dropped.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of per-line field lists
    """
    sentences = []
    sentence = []
    # Context manager ensures the file handle is closed.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Blank line marks the end of a sentence.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # Replace a leading space with "$" so split() keeps a
                # placeholder for the first field.
                line = "$" + line[1:]
            sentence.append(line.split())

    # Flush the last sentence when the file has no trailing blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#7
0
def load_sentences(path, lower, zero):
    """Load the data set; usable lines hold a word and its tag.

    Empty lines separate sentences. Lines that do not split into exactly two
    whitespace-separated fields are silently skipped. Sentences whose first
    token contains ``DOCSTART`` are dropped.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zero: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere; the original comment says
        digits are normalised to 0 to help generalisation)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    """
    sentences = []
    sentence = []
    # Read line by line; ``with`` closes the file when done.
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        for raw in reader:
            line = raw.rstrip()
            if zero:
                line = zero_digits(line)

            if not line:
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # Keep a "$" placeholder for a word that split() would drop.
                line = "$" + line[1:]
            word = line.split()
            # Only well-formed (word, tag) pairs are kept.
            if len(word) == 2:
                sentence.append(word)

    # Last sentence may lack a terminating empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#8
0
文件: loader.py 项目: HJYgotoPLAY/NER
def load_sentences(path, lower, zeros):
    """Load sentences from a file of space-separated ``word tag`` lines.

    A line that splits on a single space into exactly two fields is added to
    the current sentence; any other line (typically an empty one) ends the
    sentence.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, the first field of every line is passed
        through ``zero_digits`` (helper defined elsewhere)
    :return: list of non-empty sentences, each a list of ``[word, tag]``
    """
    sentences = []
    sentence = []
    # ``with`` closes the file, which was previously left open.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split(" ")
            if zeros:
                line_list[0] = zero_digits(line_list[0])
            if len(line_list) == 2:
                sentence.append(line_list)
            elif sentence:
                # Only store non-empty sentences so that consecutive
                # separator lines do not produce empty entries.
                sentences.append(sentence)
                sentence = []
    # Keep the final sentence when the file does not end with a separator.
    if sentence:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences. A line must contain at least a word and its tag.

    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are dropped.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere); otherwise the line is
        only right-stripped
    :return: list of sentences; each sentence is a list of word/tag lists,
        e.g. the line ``海 0`` becomes ``['海', '0']``
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even if a line fails validation.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            # rstrip() turns a bare newline into an empty string, which is
            # what marks the boundary between sentences below.
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Sentence boundary: store what has been collected.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # A line starting with a space gets "$" as its first
                # character so split() still yields a word field.
                line = "$" + line[1:]
            # split() with no arguments splits on any whitespace run.
            word = line.split()
            assert len(word) >= 2, (
                "line %d of %s has no tag: %r" % (num, path, word))
            sentence.append(word)

    # Store the last sentence if the file does not end with a blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#10
0
def load_sentences(path, lower, zeros, data_augment=True):
    """Read training data stored one sentence per line.

    Each line consists of whitespace-separated ``char/tag`` tokens, e.g.::

        如/O 何/O 演/O 好/O 自/O 己/O 的/O 角/O 色/O

    Tokens whose second character is not ``/`` (including one-character
    tokens) are ignored. Lines that yield no tokens are skipped.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :param data_augment: when truthy, the loaded sentences are passed
        through ``data_augmentation`` (helper defined elsewhere) and its
        result is returned
    :return: list of sentences, each a list of ``[char, tag]`` lists
    """
    sentences = []
    # Context manager so the file handle is released after reading.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)
            # The length guard prevents an IndexError on one-character
            # tokens, which cannot carry a "/tag" suffix.
            sentence = [[token[0], token[2:]] for token in line.split()
                        if len(token) > 1 and token[1] == '/']
            if sentence:
                sentences.append(sentence)
    if data_augment:
        sentences = data_augmentation(sentences)
    return sentences
示例#11
0
def load_sentences(path, lower, zeros):
    """Load the sentences of a data set as nested lists.

    Each line's characters and tag are stored as one list, and each
    sentence is a list of those lists. Empty lines separate sentences. A
    line with a single field gets the tag ``"O"`` appended. Sentences whose
    first token contains ``DOCSTART`` are dropped.

    :param path: data set path (read as UTF-8)
    :param lower: whether to lower-case English characters (not used here)
    :param zeros: whether to pass each line through ``zero_digits``
        (helper defined elsewhere)
    :return: list of sentences, each a list of per-line field lists
    """
    sentences = []  # all sentences
    sentence = []  # tokens of the sentence currently being read
    # ``with`` closes the file once iteration finishes or fails.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # End of a sentence: move it into the result list.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # Substitute "$" for a leading space so split() keeps a
                # placeholder for the word field.
                line = "$" + line[1:]
            word = line.split()
            # Untagged lines default to the outside tag instead of failing.
            if len(word) == 1:
                word.append("O")
            sentence.append(word)

    # Store the last sentence when no blank line follows it.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#12
0
def load_sentences(path, lower, zeros):
    """Load sentences. A line must contain exactly a word and its tag.

    Sentences are separated by empty lines. A typical line looks like
    ``长 B-Dur``.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    :raises AssertionError: if a non-empty line does not have two fields
    """
    sentences = []
    sentence = []
    # Context manager so the file is closed even on a validation error.
    with open(path, 'r', encoding='utf8') as handle:
        for num, raw in enumerate(handle, 1):
            # rstrip() must run in both modes: it removes the newline so
            # that separator lines become empty strings, and the space
            # between word and tag has to survive for split() below.
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Empty line: save the sentence processed so far.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # Lines such as "  O" start with a space; use "$" as the
                # word so split() still returns two fields.
                line = "$" + line[1:]
            word = line.split()
            assert len(word) == 2, (
                "%s:%d: expected 2 fields, got %r" % (path, num, word))
            sentence.append(word)

    # NOTE(review): only the trailing sentence is checked for 'DOCSTART';
    # sentences closed by an empty line are kept unconditionally. Behavior
    # preserved as found -- confirm whether this asymmetry is intended.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#13
0
def load_sentences(path, lower, zeros):
    """Load sentences. A line must contain at least a word and its tag.

    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are dropped.

    Example of one returned sentence::

        [['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'],
         ['鲁', 'I-SLOC'], ..., ['监', 'B-TYPE'], ['控', 'I-TYPE']]

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of per-line field lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # ``with`` makes sure the file handle does not leak.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if line:
                if line[0] == " ":
                    # Keep a "$" placeholder for the word that a leading
                    # space would otherwise make split() drop.
                    line = "$" + line[1:]
                word = line.split()  # e.g. ['监', 'B-TYPE']
                assert len(word) >= 2, (
                    "%s, line %d: missing tag in %r" % (path, num, word))
                sentence.append(word)
                continue

            # Empty line: the current sentence is complete.
            if sentence:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []

    # A final sentence may not be followed by an empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
示例#14
0
def load_sentences(path, lower=False, zeros=False):
    """Read a tagged file and return its sentences.

    Every non-empty line must provide at least a word and its tag; empty
    lines delimit sentences. A sentence whose first token contains
    ``DOCSTART`` is left out of the result. The path being read is printed
    to standard output.

    :param path: path of the UTF-8 encoded data file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of per-line field lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    collected = []
    current = []
    expected_idx = 0
    with codecs.open(path, 'r', 'utf8') as stream:
        print("Read from {:s}".format(path))
        for idx, raw in enumerate(stream):
            # Sanity check that the manual counter tracks enumerate().
            assert idx == expected_idx, 'ER'
            expected_idx = idx + 1

            text = raw.rstrip()
            if zeros:
                text = zero_digits(text)

            if text:
                if text[0] == " ":
                    # Swap the leading space for "$" so split() keeps a
                    # placeholder as the first field.
                    text = "$" + text[1:]
                fields = text.split()
                assert len(fields) >= 2, ([fields[0]])
                current.append(fields)
                continue

            # Empty line: close the sentence gathered so far.
            if current:
                if 'DOCSTART' not in current[0][0]:
                    collected.append(current)
                current = []

        # Trailing sentence without a closing empty line.
        if current and 'DOCSTART' not in current[0][0]:
            collected.append(current)

    return collected
示例#15
0
def load_sentences(path, lower, zeros):
    """Load sentences. A line must contain exactly a word and its tag.

    Sentences are separated by empty lines. No ``DOCSTART`` filtering is
    applied in this variant: every non-empty sentence is returned.

    :param path: path of the UTF-8 encoded training file
    :param lower: not used by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (helper defined elsewhere)
    :return: list of sentences, each a list of ``[word, tag]`` lists, e.g.
        ``[['x', 'O'], ['y', 'B-Dur'], ['z', 'I-Dur']]``
    :raises AssertionError: if a non-empty line does not have two fields
    """
    sentences = []  # every sentence of the data set
    sentence = []  # the sentence currently being assembled
    # Open through a context manager so the handle is always closed.
    with open(path, 'r', encoding='utf8') as handle:
        for num, raw in enumerate(handle, 1):
            # Trailing whitespace (including the newline) is removed first.
            line = raw.rstrip()
            if zeros:
                line = zero_digits(line)

            if not line:
                # Empty line: the sentence is finished; store and reset.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if line[0] == " ":
                # The line starts with a space: use "$" in its place so
                # split() still returns a word field.
                line = "$" + line[1:]
            word = line.split()
            assert len(word) == 2, (
                "%s:%d: expected 2 fields, got %r" % (path, num, word))
            sentence.append(word)

    # The last sentence may not be followed by an empty line.
    if sentence:
        sentences.append(sentence)
    return sentences