def find_weburldata(weburl):
    """Look up the cached page dump for *weburl* and return its combined text.

    Scans the two on-disk caches (old ``webdata`` first, then
    ``newwebdata``, matching the original search order).  Each cache root
    contains subdirectories of ``<url>.txt`` dumps; a dump matches when its
    filename (minus ``.txt`` and a leading ``www.``) equals *weburl* with
    ``www.`` stripped.

    A matching dump is only accepted when it is "complete": non-empty
    ``title``, ``description`` and ``keywords`` fields and at least 15
    entries in ``webtext``.  Incomplete matches are skipped and the search
    continues — this mirrors the original behavior.

    :param weburl: domain/URL to look up, with or without a ``www.`` prefix.
    :returns: ``mytool.get_all_webdata(...)`` for the first complete match,
        or ``""`` when nothing usable is found.
    """
    # Old cache is searched before the new one, as in the original code.
    search_roots = (
        "/home/jiangy2/webdata/",
        "/home/jiangy2/dnswork/newwebdata/",
    )
    # Hoisted out of the loops: the lookup key never changes per file.
    target = weburl.replace("www.", "")

    for root in search_roots:
        for subdir in os.listdir(root):
            dirpath = os.path.join(root, subdir)
            if not os.path.isdir(dirpath):
                continue
            for filename in os.listdir(dirpath):
                fileurl = filename.replace(".txt", "").replace("www.", "")
                if fileurl != target:
                    continue
                webdata = mytool.read_webdata(os.path.join(dirpath, filename))
                # Accept only a "complete" record; otherwise keep searching.
                if (webdata['title'] != ""
                        and webdata['description'] != ""
                        and webdata['keywords'] != ""
                        and len(webdata['webtext']) >= 15):
                    return mytool.get_all_webdata(webdata)
    return ""
# NOTE(review): whitespace-mangled script fragment (original line breaks lost).
# What it does, read from the tokens: builds result_dic (url -> predicted class)
# by walking /home/jiangy2/dnswork/newwebdata/; for each .txt dump whose record
# has non-empty title/description/keywords and >= 15 webtext entries, it calls
# predict_webclass(webdata, model) and stores a non-empty result keyed by the
# filename with ".txt" and "www." stripped.  It then starts a second pass over
# /home/jiangy2/webdata/ — but this chunk is TRUNCATED at the trailing
# "if (os.path.isdir(filepath)):" whose body lies outside this view, so the
# code is left byte-identical rather than reformatted.
# (The inline Chinese comments mean: "read files" and "save results".)
#读取文件 result_dic = {} #保存结果 newpath = "/home/jiangy2/dnswork/newwebdata/" newfs = os.listdir(newpath) for subpath in newfs: filepath = os.path.join(newpath, subpath) if (os.path.isdir(filepath)): print(filepath) webdata_path = os.listdir(filepath) for filename in webdata_path: fileurl = filename.replace(".txt", "") fileurl = fileurl.replace("www.", "") webdata = mytool.read_webdata(os.path.join(filepath, filename)) if fileurl not in result_dic.keys(): if webdata['title'] != "" and webdata[ 'description'] != "" and webdata['keywords'] != "": if len(webdata['webtext']) >= 15: result = predict_webclass(webdata, model) if result != "": print(result) result_dic[fileurl] = result newpath = "/home/jiangy2/webdata/" newfs = os.listdir(newpath) for subpath in newfs: filepath = os.path.join(newpath, subpath) if (os.path.isdir(filepath)):
# NOTE(review): whitespace-mangled script fragment, TRUNCATED at both ends —
# it begins inside an os.walk loop (root/dirs/dirlist, filenamelist and
# stopwordslist are all defined outside this view) and stops mid-setup, so the
# code is left byte-identical rather than reformatted.
# What it does, read from the tokens: collects subdirectory paths into dirlist,
# then walks each one; every file whose name (minus ".txt") appears in
# filenamelist is read via mytool.read_webdata, flattened with
# mytool.get_all_webdata, segmented with mytool.seg_sentence (using a stopword
# list), and appended to webdata_list.  Finally it starts building a gensim
# corpora.Dictionary over webdata_list to train an LDA model.
# (The inline Chinese comments mean: "iterate files", "iterate all folders
# under the path", "read suspicious web-page data", "store page data in a
# list", "word segmentation", "total suspicious pages read:", "build the
# term-frequency matrix and train the LDA model", and "corpus is each document
# ID-encoded: each element is a (dictionary-ID, frequency) pair per word".)
# 遍历文件 # for f in files: # print(os.path.join(root, f)) # 遍历路径下所有的文件夹 for d in dirs: dirlist.append(os.path.join(root, d)) # print(os.path.join(root, d)) ## 读取疑似的网页数据 webdata_list = [] for dirpath in dirlist: print(dirpath) for root, dirs, files in os.walk(dirpath): for f in files: if f.replace(".txt","") in filenamelist: ### data = mytool.read_webdata(os.path.join(root, f)) # print(os.path.join(root, f)) # 网页数据存入一个list target_data = mytool.get_all_webdata(data) #分词 tmp_words = mytool.seg_sentence(target_data, stopwordslist) webdata_list.append(tmp_words) print(webdata_list[0]) print("读取疑似网页内容共:" , len(webdata_list)) #构建词频矩阵,训练LDA模型 dictionary = corpora.Dictionary(webdata_list) # corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...] # corpus是把每条新闻ID化后的结果,每个元素是新闻中的每个词语,在字典中的ID和频率