Exemplo n.º 1
0
	def run(self):
		"""Build a vocabulary from the training column and write it to the output.

		Loads the SFrame stored at ``self.input()[0].fn``, takes its
		``self.train_field`` column, builds a ``Vocab`` from that column, trims
		it with ``self.no_below``, ``self.no_above`` and ``self.keep_n``, and
		saves the result to the file handle opened on ``self.output()``.
		"""
		train_column = sf.load_sframe(self.input()[0].fn)[self.train_field]
		with self.output().open('w') as out_fd:
			# A single parenthesised argument prints the same text on Python 2 and 3.
			print("start to make vocab")
			vocab = Vocab(train_column)
			vocab.trim(self.no_below, self.no_above, self.keep_n)
			# Indented with plain tabs like its siblings (was a tab/space mix).
			vocab.save(out_fd)
Exemplo n.º 2
0
	def run(self):
		"""Sample the input SFrame and save only the training column.

		Reads the SFrame at ``self.input()[0].fn``, samples it with
		``self.train_fraction``, reports the sampled row count, removes every
		column other than ``self.train_field`` and saves the result to
		``self.output().fn``.
		"""
		source = sf.load_sframe(self.input()[0].fn)
		subset = source.sample(self.train_fraction)
		print("sampled %d documents" % subset.num_rows())
		# Collect every column that is not the training field so it can be dropped.
		unwanted = []
		for name in subset.column_names():
			if name != self.train_field:
				unwanted.append(name)
		subset.remove_columns(unwanted)
		subset.save(self.output().fn)
Exemplo n.º 3
0
        def run(self):
                """Compute the 'ensemble' column per user and export non-empty rows as TSV.

                Loads the SFrame at ``self.input()[0]['user'].fn``, stores a
                ``WordSet`` built from ``self.input()[1].fn`` on ``self.wordset``
                (presumably read by ``self.ensemble`` -- confirm against that
                method), applies ``self.ensemble`` to every row, keeps the ``id``
                and ``ensemble`` columns, drops rows whose ``ensemble`` value is
                empty, and writes a header-less, unquoted, tab-separated file to
                ``self.output().fn``.
                """
                # Body is space-indented to match the ``def`` line; the original
                # mixed a space-indented ``def`` with a tab-indented body.
                df = sf.load_sframe(self.input()[0]['user'].fn)
                self.wordset = WordSet(self.input()[1].fn)
                df['ensemble'] = df.apply(self.ensemble)
                df = df.select_columns(['id', 'ensemble'])
                # 1/0 mask selecting the rows with a non-empty 'ensemble' value.
                has_content = df['ensemble'].apply(lambda x: 1 if len(x) > 0 else 0)
                df = df[has_content]
                df.export_csv(self.output().fn, quote_level=csv.QUOTE_NONE, delimiter="\t", header=False)
Exemplo n.º 4
0
        def run(self):
                """Filter the user field through a WordSet and export non-empty rows as TSV.

                Loads the SFrame at ``self.input()[0]['user'].fn``, keeps only the
                ``id`` and ``self.user_field`` columns, replaces each user-field
                value with ``wordset.filter_bows(value)``, drops rows where the
                result is the empty string, and writes a header-less, unquoted,
                tab-separated file to ``self.output().fn``.
                """
                # Body is space-indented to match the ``def`` line; the original
                # mixed a space-indented ``def`` with a tab-indented body.
                df = sf.load_sframe(self.input()[0]['user'].fn)
                keep = (self.user_field, "id")
                df.remove_columns([col for col in df.column_names() if col not in keep])
                wordset = WordSet(self.input()[1].fn)
                df[self.user_field] = df[self.user_field].apply(wordset.filter_bows)
                df = df[df[self.user_field] != ""]
                df.export_csv(self.output().fn, quote_level=csv.QUOTE_NONE, delimiter="\t", header=False)
Exemplo n.º 5
0
	def run(self):
		"""Export the recommendation table to HDFS and load it into HBase.

		Loads the SFrame at ``self.input()[0]['rec'].fn``, keeps the
		``history``, ``id`` and ``rlist`` columns, writes them as a header-less,
		unquoted, tab-separated file at ``self.local_csv``, uploads that file to
		``<self.hbase_input_path>/user.rec.csv`` on HDFS and calls
		``to_hbase(self.hbase_input_path, self.bin)``.

		The local CSV and the HDFS staging directory are temporary: both are
		removed when the method finishes, including when a step raises.
		"""
		hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
		df = sf.load_sframe(self.input()[0]['rec'].fn)
		keep = ("history", "id", "rlist")
		df.remove_columns([col for col in df.column_names() if col not in keep])
		hbase_input_csv = "%s/user.rec.csv" % self.hbase_input_path
		try:
			df.export_csv(self.local_csv, quote_level=csv.QUOTE_NONE, delimiter="\t", header=False)
			hdfs.mkdir(self.hbase_input_path)
			try:
				hdfs.put(self.local_csv, hbase_input_csv)
				to_hbase(self.hbase_input_path, self.bin)
			finally:
				# Drop the staging directory whether or not the load succeeded.
				hdfs.remove(self.hbase_input_path)
		finally:
			# The export may have failed before creating the file, so check first.
			if os.path.exists(self.local_csv):
				os.remove(self.local_csv)
Exemplo n.º 6
0
def merge_recommend(merged_fn, latest_rec_fn, latest_user_fn, latest_topic_fn):
	"""Merge the latest recommendation, history and topic data into ``merged_fn``.

	Args:
		merged_fn: path of the accumulated SFrame; created when it does not exist.
		latest_rec_fn: header-less TSV read as (str, list) and named ``id``/``rlist``.
		latest_user_fn: saved SFrame from which the ``id`` and ``history`` columns are kept.
		latest_topic_fn: header-less TSV read as (str, str) and named ``id``/``fea``.

	The three inputs are left-joined on ``id`` starting from the history frame.
	When ``merged_fn`` already exists, rows whose ``id`` appears in the latest
	data are dropped from it before the latest rows are appended.
	"""
	# read recommend df
	latest_rec_df = sf.SFrame.read_csv(latest_rec_fn, delimiter="\t", column_type_hints=[str, list], header=False)
	latest_rec_df.rename({"X1": "id", "X2": "rlist"})
	# read history df
	latest_history_df = sf.load_sframe(latest_user_fn)
	keep = ("history", "id")
	latest_history_df.remove_columns([col for col in latest_history_df.column_names() if col not in keep])
	# read topic df
	latest_topic_df = sf.SFrame.read_csv(latest_topic_fn, delimiter="\t", column_type_hints=[str, str], header=False)
	latest_topic_df.rename({"X1": "id", "X2": "fea"})
	# join all
	latest_df = latest_history_df.join(latest_rec_df, on='id', how='left').join(latest_topic_df, on='id', how='left')

	if not os.path.exists(merged_fn):
		# First run: nothing to merge with yet. The original referenced the
		# undefined name `lasted_df` here, which raised NameError.
		latest_df.save(merged_fn)
	else:
		merged_df = sf.load_sframe(merged_fn)
		latest_id = latest_df.select_column("id")
		# Drop stale rows for ids that are present in the latest data.
		merged_df = merged_df.filter_by(latest_id, 'id', exclude=True)
		merged_df = merged_df.append(latest_df)
		merged_df.save(merged_fn)
Exemplo n.º 7
0
	def run(self):
		"""Export history, recommendations and user topics, importing each one.

		Loads the SFrame at ``self.input()[0]['rec'].fn``. For each of the
		``history``, ``rlist`` and ``fea`` columns it writes ``id`` plus that
		column as a header-less, unquoted, tab-separated file, passes the file
		to the matching importer together with ``self.host``, ``self.db``,
		``self.user`` and ``self.passwd``, and then deletes the file.

		The temporary file is deleted even when the export or import raises.
		"""
		merged_rec_df = sf.load_sframe(self.input()[0]['rec'].fn)
		# (progress message, column exported next to 'id', temporary TSV path, importer)
		steps = [
			("import history", 'history', self.history_fn, import_history),
			("import recommendation", 'rlist', self.rlist_fn, import_rlist),
			("import user-topic-table", 'fea', self.fea_fn, import_user_topic),
		]
		for message, column, tmp_fn, importer in steps:
			print(message)
			try:
				subset = merged_rec_df.select_columns(['id', column])
				subset.export_csv(tmp_fn, quote_level=csv.QUOTE_NONE, delimiter="\t", header=False)
				importer(tmp_fn, self.host, self.db, self.user, self.passwd)
			finally:
				# Clean up the temporary export even when the step failed.
				if os.path.exists(tmp_fn):
					os.remove(tmp_fn)
Exemplo n.º 8
0
    'X.4': '#_of_words_in_intersection_no_stops',
    'X.5': '#_of_words_in_union_no_stops',
    'X.6': 'jaccard_similarity_no_stops',
    'X.7': 'average_word_length_no_stops1',
    'X.8': 'average_word_length_no_stops2',
    'X.9': 'lists_identical_no_stops'
}

# Full list of generated feature columns: both renamed feature sets plus the
# case-insensitive identity flag added below.
feature_names = (
    list(features.values())
    + list(features_no_stops.values())
    + ['lower_identical']
)

print('start')
print(datetime.datetime.now())

train = sframe.load_sframe('train_lists.sf')
test = sframe.load_sframe('test_lists.sf')

print('files loaded')
print(datetime.datetime.now())

# Train features from the word lists that still contain stop words.
result = train.apply(
    lambda r: generate_features_from_lists(r['word_list1'], r['word_list2']))
result = result.unpack()
train.add_columns(result.rename(features))

# Same features, computed on the word lists with stop words removed.
result = train.apply(
    lambda r: generate_features_from_lists(
        r['word_list_no_stops1'], r['word_list_no_stops2']))
result = result.unpack()
train.add_columns(result.rename(features_no_stops))

# Per-row result of lower_identical() on the two raw questions.
train['lower_identical'] = train.apply(
    lambda r: lower_identical(r['question1'], r['question2']))
Exemplo n.º 9
0
	def run(self):
		"""Filter the training field through a WordSet and export non-empty rows.

		Loads the SFrame at ``self.input()[0].fn``, replaces each
		``self.train_field`` value with ``wordset.filter_bows(value)`` using a
		``WordSet`` built from ``self.input()[1].fn``, drops rows where the
		result is the empty string, and writes a header-less, unquoted,
		tab-separated file to ``self.output().fn``.
		"""
		frame = sf.load_sframe(self.input()[0].fn)
		word_filter = WordSet(self.input()[1].fn)
		field = self.train_field
		frame[field] = frame[field].apply(word_filter.filter_bows)
		# Keep only the rows whose filtered value is not the empty string.
		non_empty = frame[field] != ""
		frame = frame[non_empty]
		frame.export_csv(self.output().fn, delimiter="\t", quote_level=csv.QUOTE_NONE, header=False)
Exemplo n.º 10
0
	def run(self):
		"""Save a sample of the input's 'union' column.

		Loads the SFrame at ``self.input()[0].fn``, keeps only the ``union``
		column, samples it with ``self.train_fraction``, reports the sampled row
		count and saves the sample to ``self.output().fn``.
		"""
		union_only = sf.load_sframe(self.input()[0].fn).select_columns(['union'])
		sample = union_only.sample(self.train_fraction)
		print("sampled %d documents" % sample.num_rows())
		sample.save(self.output().fn)