def processdata(urllists, word_count_threshold, depth):
    # First pass: crawl every URL, remember how many pages each crawl produced
    # and collect all page texts so a shared vocabulary can be built.
    content = []
    nums = []
    nums.append(0)
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)

    # Build the vocabulary (words above the count threshold) from all crawled pages.
    instance = features(word_count_threshold)
    word_counts, wordtoix = instance.extractwords(content)
    N = len(word_counts)

    # Turn the per-URL page counts into cumulative row offsets into the output matrix.
    for i in range(1, len(nums)):
        nums[i] = nums[i - 1] + nums[i]

    # Second pass: re-crawl each URL and fill its rows with bag-of-words features;
    # the last column holds the class id (1-based index of the URL in urllists).
    # Re-crawling assumes the same pages come back as in the first pass.
    cid = 0
    output = np.zeros((nums[-1], N + 1))
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        currlen = len(crawler.data)
        feats = instance.bagofwords(crawler.data, word_counts, wordtoix)
        print feats.shape
        b = np.zeros((currlen, N + 1))
        print b[:, :-1].shape
        b[:, 0:N] = feats
        b[:, N] = cid + 1
        output[nums[cid]:nums[cid + 1], :] = b
        cid = cid + 1

    # Each row of the saved matrix is a feature vector followed by its class label.
    np.savetxt('test.out', output, delimiter=',')
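
# The training step itself is not shown in this file; the sketch below is one
# plausible way the matrix written by processdata() could be turned into the
# 'model.pkl' that the test script loads later.  The choice of LinearSVC, the
# function name train_from_output, and the default file names are assumptions,
# not part of the original code.
def train_from_output(path='test.out', model_path='model.pkl'):
    import numpy as np
    import joblib  # or: from sklearn.externals import joblib on older scikit-learn
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.svm import LinearSVC

    data = np.loadtxt(path, delimiter=',')
    X, y = data[:, :-1], data[:, -1]                    # bag-of-words counts / class ids
    X = TfidfTransformer().fit_transform(X).toarray()   # same tf-idf step used at test time
    clf = LinearSVC()
    clf.fit(X, y)
    joblib.dump(clf, model_path)
    return clf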
def getdata(urllists, depth):
    # Crawl every URL and return all collected page texts in a single list.
    # nums keeps the per-URL page counts but only content is returned.
    content = []
    nums = []
    nums.append(0)
    for url in urllists:
        #if url != "https://en.wikipedia.org/wiki/1990_RTHK_Top_10_Gold_Songs_Awards":
        #    continue
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)
    return content
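
# The vocabulary files read by the test script below ('dictionary.txt' and
# 'dict2idx.txt') are not created in this file.  This is a minimal sketch of
# how they could be produced from getdata(); the function name
# build_dictionary and its parameters are assumptions.
def build_dictionary(urllists, depth, word_count_threshold):
    import json
    content = getdata(urllists, depth)
    instance = features(word_count_threshold)
    word_counts, wordtoix = instance.extractwords(content)
    with open('dictionary.txt', 'w') as fwrite:
        json.dump(word_counts, fwrite)
    with open('dict2idx.txt', 'w') as fwrite:
        json.dump(wordtoix, fwrite)
    return word_counts, wordtoix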
# Test script: crawl a couple of pages, rebuild their bag-of-words features
# with the saved vocabulary, and classify them with the trained model.
urllists = []
urllists.append("https://en.wikipedia.org/wiki/Sandra_Bullock")
urllists.append("https://en.wikipedia.org/wiki/Far_East_scarlet-like_fever")

# Load the vocabulary (word counts and word-to-index map) saved as JSON next to this script.
filepath = os.path.dirname(os.path.realpath(__file__))
dictname = 'dictionary.txt'
dict2idx = 'dict2idx.txt'
with open(os.path.join(filepath, dictname), 'r') as fread:
    word_counts = json.load(fread)
with open(os.path.join(filepath, dict2idx), 'r') as fread:
    wordtoix = json.load(fread)

#np.random.shuffle
clf = joblib.load('model.pkl')

for url in urllists:
    crawler = webCrawler(url, 1)
    crawler.crawl()
    instance = features(word_count_threshold)  # word_count_threshold is assumed to be defined earlier in the file
    feats = instance.bagofwords(crawler.data, word_counts, wordtoix)
    X = feats
    #print X.shape
    print X.sum()  # fsum() fails on a 2-D array; sum all counts as a sanity check
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    X = tfidf.toarray()
    print X.sum()
    yhat = clf.predict(X)
    print yhat
print "finish page testing"
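
# Optional helper (an assumption, not part of the original script): map the
# predicted class ids back to the URLs they were derived from.  It relies on
# the labelling convention in processdata(), where pages crawled from
# urllists[i] get class id i + 1.
def label_to_url(yhat, train_urls):
    # class ids start at 1 for train_urls[0]
    return [train_urls[int(c) - 1] for c in yhat]

# Example usage after the loop above:
#   print label_to_url(yhat, urllists)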