def getPageData():
    """Fetch raw data for every page that lacks it, then dump it to disk.

    Queues the id of each page that has no document yet in the page-data
    collection, drains the queue with 200 daemon worker threads
    (fetchingPageData), waits for the queue to empty, then writes each
    stored document's data to data/<_id>.txt.

    Returns: None.  Side effects: network fetches via the worker threads,
    one file written per page-data document.
    """
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()
    queue = Queue()
    queued = 0
    for page in pageCollection.find():
        # Look the page up by id only.  The original passed the whole page
        # document as the filter, which only matches if the data document
        # carries every page field; the by-id form matches how the data
        # collection is queried elsewhere in this file.
        if not pageDataCollection.find_one({'_id': page['_id']}):
            queue.put(page['_id'])
            queued += 1
    # Original counter started at 1, so it reported one more than the
    # number of queued pages; report the true count.  print(x) with a
    # single argument prints identically under Python 2 and 3.
    print(queued)
    for _ in range(200):
        worker = fetchingPageData(queue)
        worker.setDaemon(True)  # don't block interpreter exit
        worker.start()
    queue.join()
    for doc in pageDataCollection.find():
        # with-block guarantees the file is closed even if write() raises.
        # NOTE(review): doc['data'] may be unicode; confirm the desired
        # encoding for these dump files.
        with open('data/' + doc['_id'] + '.txt', 'w') as out:
            out.write(doc['data'])
def getPageData():
    """Fetch raw data for every page that lacks it, then dump it to disk.

    NOTE(review): this is a byte-for-byte duplicate of an identical
    getPageData defined earlier in the file and shadows it — confirm and
    remove one of the two definitions.

    Queues the id of each page that has no document yet in the page-data
    collection, drains the queue with 200 daemon worker threads
    (fetchingPageData), waits for the queue to empty, then writes each
    stored document's data to data/<_id>.txt.
    """
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()
    queue = Queue()
    queued = 0
    for page in pageCollection.find():
        # Filter by id only; the original passed the whole page document,
        # which only matches if the data document carries every page field.
        # The by-id form matches how the data collection is queried
        # elsewhere in this file.
        if not pageDataCollection.find_one({'_id': page['_id']}):
            queue.put(page['_id'])
            queued += 1
    # Original counter started at 1 (off by one); report the true count.
    print(queued)
    for _ in range(200):
        worker = fetchingPageData(queue)
        worker.setDaemon(True)  # don't block interpreter exit
        worker.start()
    queue.join()
    for doc in pageDataCollection.find():
        # with-block guarantees the file is closed even if write() raises.
        with open('data/' + doc['_id'] + '.txt', 'w') as out:
            out.write(doc['data'])
def __init__(self, queue):
    """Worker thread that will consume page ids from *queue*.

    Each worker holds its own handle to the page-data collection so the
    threads do not share one lazily-created handle.
    """
    threading.Thread.__init__(self)
    self.queue = queue
    self.collection = getPageDataCollection()
def __init__(self, queue):
    """Worker thread that will consume page ids from *queue*.

    NOTE(review): duplicates the identical __init__ defined earlier in
    the file — confirm whether both class definitions are needed.
    """
    threading.Thread.__init__(self)
    # Per-worker handle to the page-data collection.
    self.collection = getPageDataCollection()
    self.queue = queue
# Build a per-cluster sample corpus: dump the data of the first tenth of
# each cluster's pages into temp/<id>.txt, then collect those file names
# for feature extraction.
try:
    # NOTE(review): a bare name lookup cannot raise ValueError — this was
    # presumably meant to be `k = int(KVAL)`; confirm against the module
    # that defines KVAL before changing.
    k = KVAL
except ValueError:
    # Bug fix: original read `except ValueError():`, which instantiates
    # the exception instead of naming the class to catch.
    usage()
vocab = {}
xs = []
args = []
fid = ''
fetchTestpage(PID)
clusterCollection = getClusterCollection()
clusters = clusterCollection.distinct("cluster")
fbdataCollection = getPageDataCollection()
for cluster in clusterCollection.find():
    # First tenth of the cluster's pages; `//` keeps the Python 2
    # integer-division behavior and avoids a float slice index on 3.
    pageIds = cluster['pages'][:len(cluster['pages']) // 10]
    for p in pageIds:
        data = fbdataCollection.find_one({'_id': p})
        _id = data['_id']
        string = data['data']
        # with-block closes the temp file even if write() raises.
        with open('temp/' + _id + '.txt', 'w') as f:
            f.write(string)
for name in glob.glob('./temp/*.txt'):
    args.append(name)
for a in args:
    # NOTE(review): `x` is overwritten each iteration and never used here
    # — the loop body looks truncated in this chunk; preserved as-is.
    x = defaultdict(float)
# Build a per-cluster sample corpus (NOTE(review): byte-for-byte duplicate
# of the identical block earlier in the file — confirm and remove one).
try:
    # NOTE(review): a bare name lookup cannot raise ValueError — this was
    # presumably meant to be `k = int(KVAL)`; confirm before changing.
    k = KVAL
except ValueError:
    # Bug fix: original read `except ValueError():`, which instantiates
    # the exception instead of naming the class to catch.
    usage()
vocab = {}
xs = []
args = []
fid = ''
fetchTestpage(PID)
clusterCollection = getClusterCollection()
clusters = clusterCollection.distinct("cluster")
fbdataCollection = getPageDataCollection()
for cluster in clusterCollection.find():
    # First tenth of the cluster's pages; `//` keeps the Python 2
    # integer-division behavior and avoids a float slice index on 3.
    pageIds = cluster['pages'][:len(cluster['pages']) // 10]
    for p in pageIds:
        data = fbdataCollection.find_one({'_id': p})
        _id = data['_id']
        string = data['data']
        # with-block closes the temp file even if write() raises.
        with open('temp/' + _id + '.txt', 'w') as f:
            f.write(string)
for name in glob.glob('./temp/*.txt'):
    args.append(name)
for a in args:
    # NOTE(review): `x` is overwritten each iteration and never used here
    # — the loop body looks truncated in this chunk; preserved as-is.
    x = defaultdict(float)