def save_labels(self):
    """Flag near-duplicate reviews and write the labelled lines to a new file.

    Every content string returned by ``self.fu.get_content_list()`` is
    converted with ``get_2_grams``. Each pair whose ``jaccard_distance``
    score is at least 0.9 gets both members labelled 1; everything else
    stays 0. The lines of ``self.old_file + '36'`` are then written to
    ``self.new_file + '37'`` with a tab and the label appended, matching
    labels to lines by position.

    NOTE(review): the score is used as a similarity (higher means more
    alike) despite the helper's name -- confirm against its definition.

    Raises:
        IndexError: if the input file has more lines than there are
            content entries.
    """
    content_list = self.fu.get_content_list()
    print('get content list')

    grams_list = [get_2_grams(content) for content in content_list]
    print('get grams list')

    content_len = len(content_list)
    label_list = [0] * content_len

    print('start labeling')
    # Compare every unordered pair once; a single hit marks both members.
    for i in xrange(content_len):
        grams_a = grams_list[i]
        for j in xrange(i + 1, content_len):
            sim = jaccard_distance(grams_a, grams_list[j])
            if sim >= 0.9:
                # Two spaces after the colon keep the output identical to
                # the comma-separated print statement used previously.
                print("sim is :  %s" % (sim,))
                label_list[i] = 1
                label_list[j] = 1

    with open(self.old_file + '36') as fp:
        lines = fp.readlines()

    # Build all output rows first and join once instead of growing a string.
    labelled = [
        line.replace('\n', '') + '\t' + str(label_list[index]) + '\n'
        for index, line in enumerate(lines)
    ]

    with open(self.new_file + '37', 'w') as fp:
        fp.write(''.join(labelled))
def convert_list_to_grams(threadName, q):
    """Worker loop that converts queued batches of text into 2-grams.

    Until the module-level ``exitFlag`` becomes true, take one batch (an
    iterable of content strings) from ``q``, convert each string with
    ``summary_plot.get_2_grams`` and add the results to the global
    ``content_2_grams_list``.

    Args:
        threadName: label used in the progress message.
        q: queue of content batches to consume.
    """
    global content_2_grams_list
    while not exitFlag:
        # Take queueLock before touching the queue or the shared list; the
        # ``with`` block releases it on every path, including exceptions.
        with queueLock:
            # Test the same queue we read from so ``get`` cannot block
            # while the lock is held.
            if q.empty():
                continue
            content_list = q.get()
            content_2_grams_list = content_2_grams_list + [
                summary_plot.get_2_grams(content) for content in content_list
            ]
            print("%s processing %s" % (threadName, len(content_2_grams_list)))
def run(self):
    """Drain the global ``content_list`` into ``grams_list`` as 2-grams.

    Loops until ``file_end_flag`` is set and ``content_list`` is empty.
    Each pass pops at most one content item under ``file_lock``, converts
    it with ``summary_plot.get_2_grams`` and appends the result to
    ``grams_list`` under ``gram_lock``, then sleeps for one second.
    Sets the global ``gram_end_flag`` to True once the loop finishes.
    """
    # Only gram_end_flag is rebound here; the other module-level names are
    # just read or mutated in place, so they need no ``global`` statement.
    global gram_end_flag

    print('start GramConverter')
    while not (file_end_flag and not content_list):
        # ``with`` guarantees both locks are released even if the
        # conversion raises, so other threads cannot be left blocked.
        with file_lock:
            if content_list:
                content = content_list.pop()
                grams = summary_plot.get_2_grams(content)
                with gram_lock:
                    grams_list.append(grams)
                    print('\t\t append one grams into grams_list')
        # Sleep outside the lock so other threads can take file_lock.
        time.sleep(1)

    gram_end_flag = True
    print('end GramConverter')