Example #1
def split_train_test(new_databunch, split_value):
    """Split a databunch into a training part and a test part.

    The leading (1 - split_value) fraction of the records becomes the
    training set and the trailing split_value fraction the test set.
    Both .contents and .accu are sliced at the same cut point so the
    two sequences stay aligned.
    # NOTE(review): assumes .contents and .accu have equal length — confirm.
    """
    total = len(new_databunch.contents)
    cut = int((1 - split_value) * total)
    train_bunch, test_bunch = Bunch(), Bunch()
    train_bunch.contents = new_databunch.contents[:cut]
    train_bunch.accu = new_databunch.accu[:cut]
    test_bunch.contents = new_databunch.contents[cut:]
    test_bunch.accu = new_databunch.accu[cut:]
    return train_bunch, test_bunch
Example #2
def tfidfspace(bunch_file, tfidf_file, train_bunch_file=None):
    """Build a TF-IDF feature space from a pickled Bunch and persist it.

    Args:
        bunch_file: path of the pickled source Bunch (provides .label
            and .contents).
        tfidf_file: path where the resulting TF-IDF Bunch is pickled.
        train_bunch_file: when given, path of the pickled training-side
            TF-IDF Bunch whose vocabulary is reused so the test feature
            columns line up with the training feature columns.
    """
    # BUG FIX: the Bunch used to be created with a 'labels' field that was
    # never filled, while a separate 'label' attribute was assigned below.
    # Declare 'label' so the initialized fields match what is populated.
    tfidfbunch = Bunch(label=[], contents=[], tdm=[], vocabulary={})
    # Read the source bunch and carry its labels and contents over.
    with open(bunch_file, "rb") as f:
        bunch = pickle.load(f)
    tfidfbunch.label = bunch.label
    tfidfbunch.contents = bunch.contents
    if train_bunch_file is None:
        # Training data: fit a fresh vectorizer and keep its vocabulary.
        vectorizer = TfidfVectorizer(max_df=0.4, sublinear_tf=True)
        tfidfbunch.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfbunch.vocabulary = vectorizer.vocabulary_
    else:
        # Test data: reuse the training vocabulary so both spaces share
        # the same word -> column mapping.
        with open(train_bunch_file, "rb") as f:
            train_bunch = pickle.load(f)
        tfidfbunch.vocabulary = train_bunch.vocabulary
        vectorizer = TfidfVectorizer(max_df=0.4,
                                     sublinear_tf=True,
                                     vocabulary=train_bunch.vocabulary)
        tfidfbunch.tdm = vectorizer.fit_transform(bunch.contents)
    # Write the TF-IDF bunch out.
    with open(tfidf_file, "wb") as f:
        pickle.dump(tfidfbunch, f)
    # Persist the fitted vectorizer model as well.
    # NOTE(review): this writes to the module-level TFIDF_FILE constant,
    # not to the tfidf_file argument — confirm that is intentional.
    joblib.dump(vectorizer, TFIDF_FILE)
                # NOTE(review): fragment — the enclosing loops and the leading
                # branches of this elif chain start before the visible region.
                elif children.tag == 'contenttitle':
                    contenttitle = children.text
                elif children.tag == 'content':
                    # Prepend the most recently seen title to the body text.
                    content = str(contenttitle)+' '+str(children.text)
                    if (len(content) > 0):
                        # Segment the text with jieba (precise mode) and
                        # store it space-joined for the vectorizer.
                        seg = jieba.cut(content, cut_all=False)
                        bunch.contents.append(' '.join(seg))
                    else:
                        # NOTE(review): str(x)+' '+str(y) always has length
                        # >= 1 (even for None), so this branch looks
                        # unreachable — confirm intent.
                        bunch.contents.append('null')
        print('finish train file:',filePath)
# Persist the training bunch assembled above.
fileutils.saveBatchObj(trainRawPath, bunch)

# Parse all test data and collect it into a fresh bunch.
# NOTE(review): 'lable' is a typo — tfidfspace elsewhere in this file reads
# bunch.label; confirm which spelling downstream consumers actually use
# before fixing.
bunch.lable=[]
bunch.filenames=[]
bunch.contents=[]
contenttitle =''
# NOTE(review): this loop is truncated — its body continues past the
# visible region (presumably with the same elif chain as the train loop).
for file in os.listdir(testDataPath):
    filePath = testDataPath + os.sep + file
    if os.path.isdir(filePath):
        # Skip sub-directories; only plain files are parsed.
        print(file, ' is dir. continue')
        continue
    # NOTE(review): the 'with' target shadows the loop variable 'file';
    # harmless as written, but worth renaming.
    with open(filePath, 'r') as file:
        text = file.read()
        # Strip control characters plus '|' and '&' that would break
        # XML parsing of the raw corpus file.
        text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text)
        root = ET.fromstring(text)
        for child in root:
            # Second-level nodes: descend into the third level of the XML.
            for children in child:
                # Third-level node: record the source file for this entry.
                bunch.filenames.append(filePath)