def extract_sentece():
    """Merge the sentences of the training and test sets into one file."""
    lines = read_lines('./Data/corpus/training.seg.csv')
    lines += read_lines('./Data/corpus/testing.seg.csv')
    with codecs.open('./Data/corpus/sentence.txt', 'w', encoding='utf-8') as file_w:
        for line in lines:
            index = line.index(',')
            word_tag = line[index + 1:]  # keep everything after the first comma
            file_w.write('%s\n' % get_sentence(word_tag))
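# read_lines and get_sentence are project helpers imported elsewhere. As a
# minimal sketch of the contract extract_sentece assumes (read a UTF-8 file
# and return its stripped, non-empty lines), a hypothetical stand-in could
# look like this; the real helper may differ in details such as empty-line
# handling:
import codecs


def _read_lines_sketch(path):
    """Hypothetical stand-in for the project's read_lines helper."""
    with codecs.open(path, 'r', encoding='utf-8') as file_r:
        return [line.strip() for line in file_r if line.strip()]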
def init_result():
    """Collect the second CSV column (the label) from each fold's result file."""
    labels = []
    for i in range(config.KFOLD):
        lines = read_lines('./Data/result/best_%d' % i)
        temp = [line.split(',')[1] for line in lines]
        labels.append(temp)
    return labels
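# A hedged note on the result-file format init_result assumes: each line of
# './Data/result/best_<fold>' is comma-separated with the label in the second
# column, e.g. (hypothetical values):
#
#   0,3
#   1,7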
def init_voc():
    """Initialize the vocabularies (word, POS tag, and label)."""
    lines = read_lines(config.TRAIN_PATH)
    lines += read_lines(config.TEST_PATH)
    words = []     # words of all sentences
    pos_tags = []  # POS tag types
    for line in lines:
        index = line.index(',')
        sentence = line[index + 1:]
        # words and tags
        words_tags = sentence.split(' ')
        words_temp, tag_temp = [], []
        for item in words_tags:
            r_index = item.rindex('/')
            word, tag = item[:r_index], item[r_index + 1:]
            words_temp.append(word)
            tag_temp.append(tag)
        pos_tags.extend(tag_temp)
        words.extend(words_temp)
    # word voc
    create_dictionary(words, config.WORD_VOC_PATH,
                      start=config.WORD_VOC_START,
                      min_count=5, sort=True, lower=True, overwrite=True)
    # tag voc
    create_dictionary(pos_tags, config.TAG_VOC_PATH,
                      start=config.TAG_VOC_START,
                      sort=True, lower=False, overwrite=True)
    # label voc
    label_types = [str(i) for i in range(1, 12)]
    create_dictionary(label_types, config.LABEL_VOC_PATH, start=0, overwrite=True)
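# create_dictionary is a project helper. A minimal sketch of the behavior
# init_voc relies on (count items, drop those under min_count, optionally
# lower-case and frequency-sort, assign indices from `start`, and pickle the
# mapping to `voc_path`) might look like the hypothetical version below; the
# real helper's mapping direction and tie-breaking may differ.
import os
import pickle
from collections import Counter


def _create_dictionary_sketch(items, voc_path, start=0, min_count=None,
                              sort=False, lower=False, overwrite=False):
    """Hypothetical stand-in for the project's create_dictionary helper."""
    if os.path.exists(voc_path) and not overwrite:
        return
    if lower:
        items = [item.lower() for item in items]
    counts = Counter(items)
    if min_count is not None:
        counts = Counter({k: v for k, v in counts.items() if v >= min_count})
    keys = [k for k, _ in counts.most_common()] if sort else list(counts)
    voc = {key: index for index, key in enumerate(keys, start=start)}
    with open(voc_path, 'wb') as file_w:
        pickle.dump(voc, file_w)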
def load_train_data(word_voc, tag_voc, label_voc):
    """Load the training data.

    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict

    Returns:
        The (sentences, tags, labels) triple produced by init_data.
    """
    return init_data(read_lines(config.TRAIN_PATH), word_voc, tag_voc, label_voc)
def load_test_data(word_voc, tag_voc, label_voc):
    """Load the test data.

    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict

    Returns:
        (sentences, tags); the third element returned by init_data is discarded.
    """
    sentences, tags, _ = init_data(read_lines(config.TEST_PATH),
                                   word_voc, tag_voc, label_voc)
    return sentences, tags
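# A hedged end-to-end usage sketch: the .pkl vocabularies written by init_voc
# are assumed to load with pickle, and _load_voc_sketch is hypothetical; only
# the load_train_data / load_test_data calls mirror this module's API.
import pickle


def _load_voc_sketch(path):
    """Hypothetical loader for the vocabularies pickled by init_voc."""
    with open(path, 'rb') as file_r:
        return pickle.load(file_r)


# word_voc = _load_voc_sketch(config.WORD_VOC_PATH)
# tag_voc = _load_voc_sketch(config.TAG_VOC_PATH)
# label_voc = _load_voc_sketch(config.LABEL_VOC_PATH)
# train_sentences, train_tags, train_labels = load_train_data(word_voc, tag_voc, label_voc)
# test_sentences, test_tags = load_test_data(word_voc, tag_voc, label_voc)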
def init_voc():
    """Initialize the vocabularies (variant adapted to the PubMed corpus).

    NOTE: this redefinition overrides the init_voc defined above.
    """
    # TRAIN_PATH = './Data/corpus/training.seg.csv'
    # TRAIN_PATH = 'F:\\PubMedSpyder\\new_together.txt'
    lines = read_lines(config.TRAIN_PATH)
    # lines += read_lines(config.TEST_PATH)
    words = []     # words of all sentences
    pos_tags = []  # POS tag types
    for line in lines:
        # index = line.index(',')
        # sentence = line[index + 1:]
        sentence = line
        # words and tags
        words_tags = sentence.split(' ')
        words_temp, tag_temp = [], []
        for item in words_tags:
            r_index = item.rindex('/')  # '/' separates the word from its POS tag
            # build the word list and the POS-tag list separately
            word, tag = item[:r_index], item[r_index + 1:]
            words_temp.append(word)
            tag_temp.append(tag)
        pos_tags.extend(tag_temp)
        words.extend(words_temp)
    # word voc: WORD_VOC_PATH is a .pkl file holding words (note: not word
    # vectors); the resulting dictionary maps between indices and words
    create_dictionary(words, config.WORD_VOC_PATH,
                      start=config.WORD_VOC_START,
                      min_count=1, sort=True, lower=True, overwrite=True)
    # tag voc: TAG_VOC_PATH is a .pkl file holding POS tags (note: not
    # vectors); TAG_VOC_START = 1 is the starting index
    create_dictionary(pos_tags, config.TAG_VOC_PATH,
                      start=config.TAG_VOC_START,
                      sort=True, lower=False, overwrite=True)
    # label voc: BioNLP has nine event types (think of them as trigger-word
    # types for now, since the type is obtained directly from the trigger word)
    label_types = [str(i) for i in range(1, 10)]
    create_dictionary(label_types, config.LABEL_VOC_PATH, start=0, overwrite=True)
def extract_sentece():
    """Extract raw sentences (variant for the PubMed corpus).

    NOTE: this redefinition overrides the extract_sentece defined above.
    """
    # Originally the training- and test-set sentences were merged:
    # lines = read_lines('./Data/corpus/training.seg.csv')
    # lines += read_lines('./Data/corpus/testing.seg.csv')
    lines = read_lines('F:\\PubMedSpider\\sample\\new_together.txt')
    # create a new text file
    with codecs.open('F:\\PubMedSpider\\sample\\only_sentence.txt', 'w',
                     encoding='utf-8') as file_w:
        for line in lines:
            # Note: the separator passed to line.index() depends on the actual
            # format of the POS-tagged text
            # index = line.index(' ')
            # word_tag = line[index + 1:]
            word_tag = line
            file_w.write('%s\n' % get_sentence(word_tag))
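# get_sentence is also a project helper. Given the word/tag format handled in
# init_voc (space-separated items, with the last '/' splitting word from POS
# tag), a hypothetical reconstruction of the assumed behavior is:
def _get_sentence_sketch(word_tag):
    """Hypothetical stand-in for get_sentence: strip the '/tag' suffixes."""
    words = [item[:item.rindex('/')]
             for item in word_tag.split(' ') if '/' in item]
    return ' '.join(words)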