Example #1
import os

import mytool  # the project's local helper module


def find_weburldata(weburl):
    """Look up the stored page data for weburl in both data directories;
    return the combined page content, or "" if no usable record exists."""
    base_paths = ["/home/jiangy2/webdata/",
                  "/home/jiangy2/dnswork/newwebdata/"]
    stripped_url = weburl.replace("www.", "")
    for base in base_paths:
        for subpath in os.listdir(base):
            filepath = os.path.join(base, subpath)
            if not os.path.isdir(filepath):
                continue
            for filename in os.listdir(filepath):
                fileurl = filename.replace(".txt", "").replace("www.", "")
                if fileurl != stripped_url:
                    continue
                webdata = mytool.read_webdata(
                    os.path.join(filepath, filename))
                # Accept only records with a title, description, keywords,
                # and a non-trivial amount of page text.
                if (webdata['title'] != "" and webdata['description'] != ""
                        and webdata['keywords'] != ""
                        and len(webdata['webtext']) >= 15):
                    return mytool.get_all_webdata(webdata)
    return ""
Example #2

# read the saved page files and classify each new site
result_dic = {}  # classification results, keyed by stripped URL

newpath = "/home/jiangy2/dnswork/newwebdata/"
newfs = os.listdir(newpath)
for subpath in newfs:
    filepath = os.path.join(newpath, subpath)
    if os.path.isdir(filepath):
        print(filepath)
        webdata_path = os.listdir(filepath)
        for filename in webdata_path:
            fileurl = filename.replace(".txt", "").replace("www.", "")
            webdata = mytool.read_webdata(os.path.join(filepath, filename))
            if fileurl not in result_dic:
                # Classify only records with a title, description, keywords,
                # and enough page text.
                if (webdata['title'] != "" and webdata['description'] != ""
                        and webdata['keywords'] != ""
                        and len(webdata['webtext']) >= 15):
                    result = predict_webclass(webdata, model)
                    if result != "":
                        print(result)
                        result_dic[fileurl] = result

newpath = "/home/jiangy2/webdata/"
newfs = os.listdir(newpath)

for subpath in newfs:
    filepath = os.path.join(newpath, subpath)
    if os.path.isdir(filepath):
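
By symmetry with the loop above, this second pass over /home/jiangy2/webdata/ would repeat the same classification logic. A sketch of that pattern factored into a reusable helper; `classify_dir` is a name introduced here for illustration, while `mytool`, `predict_webclass`, and `model` come from the snippet above:

def classify_dir(base, result_dic, model):
    # Scan one level of subdirectories and classify every usable,
    # not-yet-seen page record under them.
    for subpath in os.listdir(base):
        filepath = os.path.join(base, subpath)
        if not os.path.isdir(filepath):
            continue
        for filename in os.listdir(filepath):
            fileurl = filename.replace(".txt", "").replace("www.", "")
            if fileurl in result_dic:
                continue
            webdata = mytool.read_webdata(os.path.join(filepath, filename))
            if (webdata['title'] != "" and webdata['description'] != ""
                    and webdata['keywords'] != ""
                    and len(webdata['webtext']) >= 15):
                result = predict_webclass(webdata, model)
                if result != "":
                    result_dic[fileurl] = result

# Apply to both data roots in order.
for base in ["/home/jiangy2/dnswork/newwebdata/", "/home/jiangy2/webdata/"]:
    classify_dir(base, result_dic, model)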
Example #3
        # iterate over the files
        #  for f in files:
        #      print(os.path.join(root, f))
        # collect every subdirectory under the path
        for d in dirs:
            dirlist.append(os.path.join(root, d))
            # print(os.path.join(root, d))
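
For context, the fragment above sits inside a standard os.walk loop whose opening is not shown. A self-contained sketch of the whole traversal; `rootpath` is a placeholder name and value introduced here, not from the source:

import os

dirlist = []  # every subdirectory found under the scan root
rootpath = "/home/jiangy2/webdata/"  # placeholder scan root, assumed here
for root, dirs, files in os.walk(rootpath):
    # collect every subdirectory under the path
    for d in dirs:
        dirlist.append(os.path.join(root, d))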

## Read the suspected web-page data
webdata_list = []
for dirpath in dirlist:
    print(dirpath)
    for root, dirs, files in os.walk(dirpath):
        for f in files:
            if f.replace(".txt", "") in filenamelist:
                data = mytool.read_webdata(os.path.join(root, f))
                # print(os.path.join(root, f))
                # append the page data to a list
                target_data = mytool.get_all_webdata(data)
                # tokenize and filter out stopwords
                tmp_words = mytool.seg_sentence(target_data, stopwordslist)
                webdata_list.append(tmp_words)

print(webdata_list[0])
print("读取疑似网页内容共:" , len(webdata_list))


# Build the term-frequency matrix and train the LDA model
dictionary = corpora.Dictionary(webdata_list)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...]
# corpus is each document converted to IDs: every element is a word's
# dictionary ID and its frequency in that document
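
The snippet stops before `corpus` is actually built. A minimal sketch of the step the comments describe, using gensim's standard API; `num_topics=10` and `passes=5` are placeholder values, not from the source:

from gensim import corpora, models

# Convert each tokenized document to (dictionary ID, frequency) pairs.
corpus = [dictionary.doc2bow(words) for words in webdata_list]

# Train the LDA model on the bag-of-words corpus.
lda = models.LdaModel(corpus=corpus, id2word=dictionary,
                      num_topics=10, passes=5)

# Inspect the learned topics.
for topic_id, topic in lda.print_topics(num_topics=10, num_words=8):
    print(topic_id, topic)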