Exemplo n.º 1
0
	def save_labels(self):
		"""Label near-duplicate contents and append the label to each input line.

		Steps:
		1. Fetch the content list from ``self.fu.get_content_list()`` and turn
		   every item into its 2-grams with ``get_2_grams``.
		2. Compare every unordered pair with ``jaccard_distance``; when the
		   returned value is >= 0.9 both members of the pair get label 1.
		   Items never involved in such a pair keep label 0.
		3. Read ``self.old_file + '36'`` and write ``self.new_file + '37'``,
		   where each line is the input line (newlines stripped), a tab, the
		   label at the same position, and a newline.

		NOTE(review): the result is stored in ``sim`` and a *high* value marks
		a duplicate, so ``jaccard_distance`` presumably returns a similarity
		rather than a distance -- confirm against the helper.

		Raises:
			IndexError: if the input file has more lines than there are
				content items (labels are looked up by line position).
		"""
		# Local import keeps the block self-contained; available on Python 2.6+.
		from itertools import combinations

		content_list = self.fu.get_content_list()
		print('get content list')
		grams_list = [get_2_grams(content) for content in content_list]
		print('get grams list')
		label_list = [0] * len(content_list)
		print('start labeling')
		# combinations() yields (i, j) pairs with i < j in the same order as
		# the classic nested "for i / for j in i+1.." loops.
		for (i, grams_a), (j, grams_b) in combinations(enumerate(grams_list), 2):
			sim = jaccard_distance(grams_a, grams_b)
			if sim >= 0.9:
				# Single pre-formatted argument: prints the same text whether
				# print is the Python 2 statement or the Python 3 function.
				print('sim is :  %s' % (sim,))
				label_list[i] = 1
				label_list[j] = 1
		with open(self.old_file + '36') as fp:
			# Collect the pieces and join once instead of growing one string
			# with += for every line.
			labeled_lines = [
				line.replace('\n', '') + '\t' + str(label_list[index]) + '\n'
				for index, line in enumerate(fp)
			]

		with open(self.new_file + '37', 'w') as fp:
			fp.write(''.join(labeled_lines))
Exemplo n.º 2
0
def write_review_distance_to_file(q, l, name, dirname='jaccard_distance'):
    """Worker loop: read batches of gram pairs from a queue and save distances.

    Meant to be run by several worker processes at once. The worker polls
    ``q`` until it receives the string ``'STOP'``. For every other item it
    computes ``summary_plot.jaccard_distance(pair[0], pair[1])`` for each pair
    in the batch and writes ``str()`` of the resulting list to
    ``<dirname>/jd.<name>_<count>``, where ``count`` starts at 0 and grows by
    one per batch handled by this worker.

    Args:
        q: Queue-like object providing ``empty()`` and ``get()``. Items are
            either a sequence of indexable pairs or the string ``'STOP'``.
        l: Lock held while ``q`` is inspected and read; must support the
            ``with`` statement.
        name: Worker identifier used in log lines and in output file names.
        dirname: Output directory; created if it does not exist yet.

    Raises:
        OSError: if ``dirname`` cannot be created and is not an existing
            directory.
    """
    # Each print gets one pre-formatted string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('starting process %s' % name)
    # Sibling workers may create the directory at the same moment, so try to
    # create it and only fail when it really is not there afterwards.
    try:
        makedirs(dirname)
    except OSError:
        if not path.isdir(dirname):
            raise
    count = 0
    while True:
        # "with" guarantees the lock is released even if the queue raises.
        with l:
            has_batch = not q.empty()
            if has_batch:
                grams_pair_list = q.get()
        if not has_batch:
            # Nothing queued yet: back off briefly without holding the lock.
            time.sleep(0.01)
            continue
        if grams_pair_list == 'STOP':
            print('process %s  exit' % (name,))
            break
        print('process %s have got  %s reviews' % (name, len(grams_pair_list)))
        # Compute first so a failing distance call does not leave an empty
        # output file behind.
        dis_list = [
            summary_plot.jaccard_distance(grams[0], grams[1])
            for grams in grams_pair_list
        ]
        with open(dirname + '/jd.' + str(name) + '_' + str(count), 'w') as fp:
            fp.write(str(dis_list))
        count += 1
Exemplo n.º 3
0
def write_review_distance_to_file(q, l, name, dirname="jaccard_distance"):
    """Consume gram-pair batches from a shared queue and write their distances.

    Designed to be the target of several worker processes. The loop keeps
    polling ``q`` until the string ``"STOP"`` arrives. Any other item is
    treated as a batch: ``summary_plot.jaccard_distance(pair[0], pair[1])`` is
    evaluated for every pair and ``str()`` of the list of results is written
    to ``<dirname>/jd.<name>_<count>``. ``count`` begins at 0 and is
    incremented after each batch this worker writes.

    Args:
        q: Queue-like object with ``empty()`` and ``get()``; yields either a
            sequence of indexable pairs or the string ``"STOP"``.
        l: Lock taken around the ``empty()``/``get()`` sequence; must be
            usable as a context manager.
        name: Identifier of this worker, used in log output and file names.
        dirname: Directory receiving the output files; created when missing.

    Raises:
        OSError: if ``dirname`` cannot be created and does not already exist
            as a directory.
    """
    # One pre-formatted argument per print keeps the emitted text identical
    # for the Python 2 print statement and the Python 3 print function.
    print("starting process %s" % name)
    # Several workers can reach this point together; creating unconditionally
    # and tolerating "already exists" avoids the check-then-create race.
    try:
        makedirs(dirname)
    except OSError:
        if not path.isdir(dirname):
            raise
    count = 0
    while True:
        # The context manager releases the lock on every path, including an
        # exception raised by the queue.
        with l:
            has_batch = not q.empty()
            if has_batch:
                grams_pair_list = q.get()
        if not has_batch:
            # Queue is empty for now: sleep outside the lock, then poll again.
            time.sleep(0.01)
            continue
        if grams_pair_list == "STOP":
            print("process %s  exit" % (name,))
            break
        print("process %s have got  %s reviews" % (name, len(grams_pair_list)))
        # Distances are computed before the file is opened so an error in the
        # helper does not leave a truncated, empty output file.
        dis_list = [
            summary_plot.jaccard_distance(grams[0], grams[1])
            for grams in grams_pair_list
        ]
        with open(dirname + "/jd." + str(name) + "_" + str(count), "w") as fp:
            fp.write(str(dis_list))
        count += 1