Example No. 1
def main():
    """
        main function
    """
    global data_mode, out_dir, per_day_path, w2v_path, valid_urls

    options, args = parser.parse_args()

    if (options.mode is None) or (options.output is None) or (options.input is None) or \
            (options.w2v_path is None) or (options.dataset is None):
        return

    data_mode = options.mode
    per_day_path = options.input
    out_dir = options.output
    w2v_path = options.w2v_path
    dataset = options.dataset

    if dataset not in ['adressa', 'glob']:
        print('Wrong dataset name : {}'.format(dataset))
        return

    os.system('mkdir -p {}'.format(out_dir))

    write_log('w2v Load : start')
    with open(w2v_path, 'r') as f_w2v:
        dict_w2v = json.load(f_w2v)
    write_log('w2v Load : end')

    # materialize the URL set so the w2v dict can actually be freed below
    # (a keys() view would keep the dict alive)
    valid_urls = set(dict_w2v.keys())
    dict_w2v = None

    merge_per_time(dataset)
    merge_per_user(dataset)
Example No. 2
    def load(self, epoch=None, model_path=None):
        if model_path is None or not os.path.exists(model_path):
            return 0

        states = torch.load(model_path)

        self._model.load_state_dict(states['model'])
        self._optimizer.load_state_dict(states['optimizer'])

        write_log('Model loaded!! - {}'.format(model_path))

        return states['epoch']
Example No. 3
    def save(self, epoch=0, model_path=None):
        if model_path is None:
            return

        states = {
            'epoch': epoch,
            'model': self._model.state_dict(),
            'optimizer': self._optimizer.state_dict(),
        }

        torch.save(states, model_path)
        write_log('Model saved! - {}'.format(model_path))
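A self-contained sketch of the checkpoint format used by the save()/load() pair above (keys 'epoch', 'model', 'optimizer'); the model, optimizer and file path are placeholders for illustration:

# Mirrors the states dict written by save() and read by load();
# the model, optimizer and checkpoint path are illustrative only.
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
ckpt_path = 'checkpoint.pt'

torch.save({
    'epoch': 5,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}, ckpt_path)

states = torch.load(ckpt_path)
model.load_state_dict(states['model'])
optimizer.load_state_dict(states['optimizer'])
start_epoch = states['epoch']  # resume training from this epoch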
Example No. 4
def main():
    """
        main function
    """
    global contentdata_path, dict_article_info

    options, args = parser.parse_args()
    if (options.output is None) or (options.url2id is None) or (options.input is None) \
            or (options.dataset is None) or (options.glob_meta is None):
        return

    contentdata_path = options.input
    out_path = options.output
    url2id_path = options.url2id
    dataset = options.dataset
    glob_meta_path = options.glob_meta

    if dataset not in ['adressa', 'glob']:
        print('Wrong dataset name : {}'.format(dataset))
        return

    dict_article_info = {}

    if dataset == 'adressa':
        with open(url2id_path, 'r') as f_dict:
            dict_url2id = json.load(f_dict)

        write_log('Starting threads')

        with ThreadPool(8) as pool:
            pool.map(extract_article_info, dict_url2id.items())
        write_log('Thread works done')

    elif dataset == 'glob':
        with open(glob_meta_path, 'r') as f_meta:
            lines = f_meta.readlines()

        dict_header_idx = None
        for line in lines:
            line = line.strip()

            # the first line is the CSV header; build a column-name -> index map
            if dict_header_idx is None:
                dict_header_idx = {}
                for i, k in enumerate(line.split(',')):
                    dict_header_idx[k] = i
                continue

            line_split = line.split(',')
            url = 'url_{}'.format(line_split[dict_header_idx['article_id']])
            category_id = 'cate_{}'.format(line_split[dict_header_idx['category_id']])

            dict_article_info[url] = {
                'category0': category_id,
            }

    write_log('Save to {}'.format(out_path))
    with open(out_path, 'w') as f_json:
        json.dump(dict_article_info, f_json)
    write_log('Done')
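The manual header-to-index bookkeeping in the 'glob' branch above could also be written with the standard library's csv module; a minimal sketch, assuming the same comma-separated layout with 'article_id' and 'category_id' columns (the file paths are placeholders):

# Alternative sketch using csv.DictReader instead of a hand-built header index.
import csv
import json

meta_path = 'glob_meta.csv'  # placeholder for options.glob_meta
dict_article_info = {}

with open(meta_path, 'r', newline='') as f_meta:
    for row in csv.DictReader(f_meta):
        url = 'url_{}'.format(row['article_id'])
        dict_article_info[url] = {'category0': 'cate_{}'.format(row['category_id'])}

with open('article_info.json', 'w') as f_json:  # placeholder output path
    json.dump(dict_article_info, f_json)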
Example No. 5
def preprocess_rnn_input(args=(-1, [])):
    """
        trigger the multi-process tasks to generate RNN inputs merging for all users
        :args: arguments of tasks
        :return: none
    """
	global dict_per_user, dict_url_idx, seperated_output_path

	max_seq_len = 20

	worker_id, user_ids = args

	write_log('worker({}) : start'.format(worker_id))
	dict_data = {}
	for user_id in user_ids:
		# remove duplication
		sequence = []

		# "cx:i68bn3gbf0ql786n:1hyr7mridb1el": [[1483570820, "http://adressa.no/100sport/ballsport/byasen-fiasko-mot-tabelljumboen-228288b.html"]]
		prev_url = None
		for seq_entry in dict_per_user[user_id]:
			timestamp, url = seq_entry
			if (prev_url == None) or (url != prev_url):
				prev_url = url
				sequence.append(seq_entry)

		seq_len = len(sequence)

		if seq_len < 2:
			continue

		if seq_len > max_seq_len:
			sequence = sequence[-max_seq_len:]

		start_time = sequence[0][0]
		end_time = sequence[-1][0]
		idx_sequence = list(map(lambda x:dict_url_idx[x[1]], sequence))

		dict_data[user_id] = {
			'start_time': start_time,
			'end_time': end_time,
			'sequence': idx_sequence,
		}

	with open(seperated_output_path + '/' + str(worker_id) + '_data.json', 'w') as f_out:
		json.dump(dict_data, f_out)
	write_log('worker({}) : end'.format(worker_id))
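A toy walk-through of the per-user cleanup above (drop consecutive duplicate URLs, skip sequences shorter than 2, keep only the last max_seq_len entries); the timestamps and URLs are made up:

# Stand-alone illustration of the sequence cleanup in preprocess_rnn_input().
raw = [(1, 'a'), (2, 'a'), (3, 'b'), (4, 'b'), (5, 'c')]

cleaned, prev_url = [], None
for ts, url in raw:
    if prev_url is None or url != prev_url:
        prev_url = url
        cleaned.append((ts, url))

# cleaned == [(1, 'a'), (3, 'b'), (5, 'c')]
max_seq_len = 20
if len(cleaned) >= 2:
    cleaned = cleaned[-max_seq_len:]
    print(cleaned[0][0], cleaned[-1][0])  # start_time == 1, end_time == 5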
Example No. 6
def main():
    """
        main function
    """
    global dict_per_user, dict_per_time, dict_url_idx, seperated_output_path

    options, args = parser.parse_args()
    if (options.data_path is None) or (options.output_file_path is None):
        return

    per_time_path = options.data_path + '/per_time.json'
    per_user_path = options.data_path + '/per_user.json'

    output_path = options.output_file_path
    seperated_output_path = output_path + '/seperated'

    if not os.path.exists(output_path):
        os.system('mkdir -p ' + output_path)

    if not os.path.exists(seperated_output_path):
        os.system('mkdir -p ' + seperated_output_path)

    write_log('Preprocessing ...')
    with open(per_user_path, 'r') as f_user:
        dict_per_user = json.load(f_user)

    with open(per_time_path, 'r') as f_time:
        dict_per_time = json.load(f_time)

    user_ids = list(dict_per_user.keys())
    dict_url_idx = generate_unique_url_idxs()

    write_log('Preprocessing End : total {} user_ids'.format(len(user_ids)))

    n_div = 100
    multi_worker = MultiWorker(worker_count=10)
    works = [(i, user_ids[i::n_div]) for i in range(n_div)]

    multi_worker.work(works=works, work_function=preprocess_rnn_input)
    multi_worker = None

    # generate_rnn_input
    generate_rnn_input(seperated_input_path=seperated_output_path,
            output_path=output_path + '/rnn_input.json')
Example No. 7
def raw_to_per_day(raw_path):
    """
        extract user-specific interaction data for each file in parallel
        :raw_path: path of data file
        :return: none
    """
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))

    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}
    list_per_time = []

    total_count = len(lines)
    count = 0

    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count,
                                                      total_count))
        count += 1

        line = line.strip()
        line_json = json.loads(line)

        user_id = line_json.get('userId', None)
        url = find_best_url(event_dict=line_json)
        time = line_json.get('time', -1)
        article_id = line_json.get('id', None)

        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if dict_per_user.get(user_id) is None:
            dict_per_user[user_id] = []

        dict_per_user[user_id].append((time, url))
        list_per_time.append((time, user_id, url))

        dict_url2id[url] = article_id

    lines = None

    per_user_path = out_dir + '/per_user/' + os.path.basename(raw_path)
    per_time_path = out_dir + '/per_time/' + os.path.basename(raw_path)

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)

    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))
Example No. 8
    def work(self, works, work_function):
        self._working_sema = Semaphore(1)
        self._child_count = 0

        total_work_count = len(works)
        cur_work_done = 0

        for work in works:
            # wait until a worker slot frees up
            while True:
                if self._child_count < self._worker_count:
                    break
                time.sleep(1)

            cur_work_done += 1
            if (cur_work_done % 1000) == 0:
                write_log('working : {}/{}'.format(cur_work_done,
                                                   total_work_count))

            if self._time_to_die:
                break

            def run_on_subproc(work):
                child_pid = os.fork()
                if child_pid == 0:
                    # child process: run the work, then exit without
                    # running the parent's cleanup handlers
                    work_function(work)
                    os._exit(0)
                os.waitpid(child_pid, 0)

                self._working_sema.acquire()
                self._child_count -= 1
                self._working_sema.release()

            self._working_sema.acquire()
            self._child_count += 1
            Thread(target=run_on_subproc, args=(work, )).start()
            self._working_sema.release()

        while self._child_count > 0:
            time.sleep(1)
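The same fan-out could be expressed with the standard library's multiprocessing.Pool; a minimal sketch of an equivalent call, noting that MultiWorker forks a fresh child per work item while Pool reuses its worker processes (the work function and items are made up):

# Alternative sketch using multiprocessing.Pool instead of MultiWorker.
from multiprocessing import Pool

def work_function(item):
    worker_id, user_ids = item
    # ... process this slice of user_ids (e.g. what preprocess_rnn_input does) ...
    return worker_id

if __name__ == '__main__':
    works = [(i, ['user_{}'.format(j) for j in range(i, i + 3)]) for i in range(10)]
    with Pool(processes=10) as pool:
        pool.map(work_function, works)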
Example No. 9
def raw_to_per_day_glob(raw_path):
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))

    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}
    list_per_time = []

    total_count = len(lines)
    count = 0

    dict_header_idx = None
    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count, total_count))
        count += 1

        line = line.strip()
        if dict_header_idx is None:
            dict_header_idx = {}
            for i, k in enumerate(line.split(',')):
                dict_header_idx[k] = i
            continue

        line_split = line.split(',')

        user_id = 'uid_{}'.format(line_split[dict_header_idx['user_id']])
        time = int(line_split[dict_header_idx['click_timestamp']]) // 1000
        url = 'url_{}'.format(line_split[dict_header_idx['click_article_id']])
        article_id = 'id_{}'.format(line_split[dict_header_idx['click_article_id']])

        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if dict_per_user.get(user_id) is None:
            dict_per_user[user_id] = []

        dict_per_user[user_id].append((time, url))
        list_per_time.append((time, user_id, url))

        dict_url2id[url] = article_id

    lines = None

    per_user_path = out_dir + '/per_user/' + os.path.splitext(os.path.basename(raw_path))[0]
    per_time_path = out_dir + '/per_time/' + os.path.splitext(os.path.basename(raw_path))[0]

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)

    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))
Example No. 10
def generate_w2v_map():
    """
        train a Doc2Vec model over the article texts and dump a url -> embedding map.
        :return: none
    """
    global article_info_path, output_path, embedding_dimension, model_path

    write_log('W2V Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('W2V Load article info : End')

    write_log('W2V Generate labeled_sentences : Start')
    labeled_sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)

        if (sentence_header is None) or (sentence_body is None):
            continue

        words = []
        for sentence in sentence_header + sentence_body:
            for word in sentence.split(' '):
                words.append(word)

        labeled_sentence = gensim.models.doc2vec.LabeledSentence(words=words,
                                                                 tags=[url])
        labeled_sentences.append(labeled_sentence)
    write_log('W2V Generate labeled_sentences : End')

    w2v_model = gensim.models.Doc2Vec(alpha=.025,
                                      min_alpha=.001,
                                      min_count=1,
                                      vector_size=embedding_dimension,
                                      window=10,
                                      dm=0,
                                      dbow_words=1,
                                      workers=16,
                                      epochs=10)

    w2v_model.build_vocab(labeled_sentences)

    for epoch in range(20):
        start_time = time.time()
        write_log('W2V epoch {} : Start'.format(epoch))

        random.shuffle(labeled_sentences)
        w2v_model.train(labeled_sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=w2v_model.epochs)

        w2v_model.alpha -= 0.001
        w2v_model.min_alpha = w2v_model.alpha
        write_log('W2V epoch {} ends : tooks {}'.format(
            epoch,
            time.time() - start_time))

    w2v_model.save(model_path)

    dict_w2v = {}
    for url in article_info.keys():
        dict_w2v[url] = w2v_model[url].tolist()
    dict_w2v['url_pad'] = [float(0)] * embedding_dimension

    write_log('W2V json dump : start')
    with open(output_path, 'w') as out_f:
        json.dump(dict_w2v, out_f)
    write_log('W2V json dump : end')
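A minimal sketch of consuming what generate_w2v_map() writes: the url -> vector JSON map and the saved Doc2Vec model. Depending on the gensim version, doc-tag vectors are read via model.docvecs[tag] (3.x) or model.dv[tag] (4.x); the file paths here are placeholders:

# Illustrative consumer of the outputs of generate_w2v_map(); paths are placeholders.
import json
import gensim

with open('w2v.json', 'r') as f_w2v:
    dict_w2v = json.load(f_w2v)

pad_vec = dict_w2v['url_pad']        # zero vector used for padding
print(len(pad_vec))                  # == embedding_dimension

model = gensim.models.Doc2Vec.load('doc2vec.model')
# gensim 3.x: model.docvecs[some_url]   gensim 4.x: model.dv[some_url]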
Example No. 11
def merge_per_time(dataset):
    """
        merge the dataset files that are separated by time
        :dataset: target dataset
        :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts
    write_log('Merging per_time Start')

    time_files = get_files_under_path(per_day_path + '/per_time')

    list_merged = []

    write_log('Merging per_time : Load Start')
    for time_path in time_files:
        with open(time_path, 'r') as f_data:
            list_per_time = json.load(f_data)

        list_merged += list_per_time
        list_per_time = None
    write_log('Merging per_time : Load End')

    write_log('Merging per_time : Sort Start')
    # (timestamp, user_id, url)
    list_merged = list(filter(lambda x: x[2] in valid_urls, list_merged))
    list_merged.sort(key=lambda x: x[0])

    # time interval compression
    # (note: 'glob_' never matches the accepted dataset names 'adressa'/'glob',
    #  so this branch is effectively disabled as written)
    new_timestamp = 1
    if dataset == 'glob_':
        dict_new_ts = {}
        prev_ts = -1
        for ts in [x[0] for x in list_merged]:
            if prev_ts < 0:
                dict_new_ts[str(ts)] = new_timestamp
                prev_ts = ts
                continue

            if prev_ts == ts:
                continue

            new_timestamp += min(ts - prev_ts, 60 * 60 * 3)
            dict_new_ts[str(ts)] = new_timestamp

            prev_ts = ts
        list_merged = [(dict_new_ts[str(x[0])], x[1], x[2])
                       for x in list_merged]

    write_log('Merging per_time : Sort End')

    with open(out_dir + '/per_time.json', 'w') as f_time:
        json.dump(list_merged, f_time)

    list_merged = None

    write_log('Merging per_time End')
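A toy walk-through of the time-interval compression above: consecutive timestamps are remapped so that each gap is capped at three hours while order and equal timestamps are preserved (the numbers are made up):

# Stand-alone illustration of the gap-capping remap in merge_per_time().
timestamps = [100, 100, 200, 50000, 50000, 50010]

dict_new_ts = {}
new_timestamp = 1
prev_ts = -1
for ts in timestamps:
    if prev_ts < 0:
        dict_new_ts[str(ts)] = new_timestamp
        prev_ts = ts
        continue
    if prev_ts == ts:
        continue
    new_timestamp += min(ts - prev_ts, 60 * 60 * 3)  # cap each gap at 3 hours
    dict_new_ts[str(ts)] = new_timestamp
    prev_ts = ts

print(dict_new_ts)  # {'100': 1, '200': 101, '50000': 10901, '50010': 10911}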
Example No. 12
def merge_per_user(dataset):
    """
        merge the dataset files that are separated by user
        :dataset: target dataset
        :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_user Start')
    user_files = get_files_under_path(per_day_path + '/per_user')

    dict_merged = {}

    total_count = len(user_files)
    count = 0
    for user_path in user_files:
        write_log('Merging per_user : {}/{}'.format(count, total_count))
        count += 1
        with open(user_path, 'r') as f_data:
            dict_per_user = json.load(f_data)
        write_log('Merging per_user Loaded: {}/{}'.format(count, total_count))

        for key in dict_per_user.keys():
            dict_merged[key] = dict_merged.get(key, []) + dict_per_user[key]

        write_log('Merging per_user Merged: {}/{}'.format(count, total_count))
        dict_per_user = None

    write_log('Merging per_user : sorting start')
    for user_id in dict_merged:
        # (timestamp, url)
        dict_merged[user_id] = list(
            filter(lambda x: x[1] in valid_urls, dict_merged[user_id]))
        # time interval compression
        if dataset == 'glob_':
            dict_merged[user_id] = [(dict_new_ts[str(x[0])], x[1])
                                    for x in dict_merged[user_id]]
        dict_merged[user_id].sort(key=lambda x: x[0])
    write_log('Merging per_user : sorting end')

    write_log('Merging per_user : writing start')
    with open(out_dir + '/per_user.json', 'w') as f_user:
        json.dump(dict_merged, f_user)
    write_log('Merging per_user End')

    dict_merged = None
Example No. 13
def generate_rnn_input(seperated_input_path=None, output_path=None):
    """
    generate an RNN input of each task
    :seperated_input_path: path of the input directory storing RNN input seperated by the user
    :output_path: path of output to save RNN input
    :return: none
    """
	global dict_url_idx, dict_per_time

	if (seperated_input_path == None) or (output_path == None):
		return

	merged_sequences = []

	write_log('Merging seperated infos ...')
	for seperated_path in get_files_under_path(seperated_input_path):
		with open(seperated_path, 'r') as f_dict:
			seperated_dict = json.load(f_dict)

#		seperated_dict[user_id] = {
#			'start_time': start_time,
#			'end_time': end_time,
#			'sequence': idx_sequence,
#		}

		# dict_url_idx
		for user_id, dict_data in seperated_dict.items():
			sequence_entry = (dict_data['start_time'], dict_data['end_time'],
					dict_data['sequence'])
			merged_sequences.append(sequence_entry)

	write_log('Merging seperated infos ...  Done !')
	write_log('Sort by time : start')
	merged_sequences.sort(key=lambda x:x[0])
	write_log('Sort by time : end')

	timestamp_tuple = list(map(lambda x:tuple((x[0], x[1])), merged_sequences))
	seq_len = list(map(lambda x:len(x[2]), merged_sequences))
	sequence = list(map(lambda x:x[2], merged_sequences))

	write_log('Generate idx2url : start')
	merged_sequences = None
	dict_idx2url = {idx:word for word, idx in dict_url_idx.items()}
	write_log('Generate idx2url : end')

	write_log('Generate candidate data structure : start')
	dict_time_idx = {}

	prev_timestamp = None
	for (timestamp, user_id, url) in dict_per_time:
		if prev_timestamp != timestamp:
			if prev_timestamp != None:
				dict_time_idx[prev_timestamp]['next_time'] = timestamp
			dict_time_idx[timestamp] = {
				'prev_time': prev_timestamp,
				'next_time': None,
				'indices': {},
			}

		idx_of_url = dict_url_idx[url]
		dict_time_idx[timestamp]['indices'][idx_of_url] = dict_time_idx[timestamp]['indices'].get(idx_of_url, 0) + 1

		prev_timestamp = timestamp

	write_log('Generate candidate data structure : end')

	write_log('Save rnn_inputs : start')
	dict_rnn_input = {
		'timestamp': timestamp_tuple,
		'seq_len': seq_len,
		'sequence': sequence,
		'idx2url': dict_idx2url,
		'time_idx': dict_time_idx,
	}

	with open(output_path, 'w') as f_input:
		json.dump(dict_rnn_input, f_input)
	write_log('Save rnn_inputs : end')
def extract_article_content(content_dir):

    target_files = []

    for file_name in os.listdir(content_dir):
        file_path = os.path.join(content_dir, file_name)

        if not os.path.isfile(file_path):
            continue

        target_files.append(file_path)

    output = {}
    for file_idx, file_path in enumerate(target_files):
        lines = []
        with open(file_path, 'r') as f_con:
            lines = [line.strip() for line in f_con.readlines() if len(line.strip()) > 0]

        for line in lines:
            try:
                dict_cont = json.loads(line)
            except json.JSONDecodeError:
                print('Error: {}'.format(line))
                continue

            dict_data = {}

            for field in dict_cont.get('fields', []):
                field_name = field.get('field', None)
                field_value = field.get('value', None)

                if not field_name or not field_value:
                    continue

                if field_name not in ['url', 'cannonicalUrl', 'referrerUrl', 
                        'title', 'body',
                        'category0', 'category1']:
                    continue

                dict_data[field_name] = field_value

            # find the best URL
            best_url = find_best_url(dict_data)
            if not best_url:
                continue

            for key in ['url', 'cannonicalUrl', 'referrerUrl']:
                dict_data.pop(key, None)

            # preprocess title & body
            if ('title' not in dict_data) or ('body' not in dict_data):
                continue

            def preprocess_sentence(sentences):
                new_sentences = []
                # note: inside a character class the '|' separators are literal,
                # so '|' itself is also stripped along with the punctuation
                regex_remove = re.compile('[\'|\"|,|\-|\\.| |\?|«|»|:|!|–|@|\\(|\\)|−]+')
                for sentence in sentences:
                    sentence = re.sub(regex_remove, ' ', sentence)
                    new_sentences.append(sentence.strip())
                return new_sentences

            dict_data['sentence_header'] = preprocess_sentence([dict_data['title']])
            dict_data['sentence_body'] = preprocess_sentence(dict_data['body'])

            for key in ['title', 'body']:
                dict_data.pop(key, None)

            output[best_url] = dict_data

    write_log('Save to Json : start')
    with open(out_dir, 'w') as f_json:
        json.dump(output, f_json)
    write_log('Save to Json : end')
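A quick illustration of the punctuation stripping done by preprocess_sentence(); the headline is made up, and the pattern below is functionally the same as the one above, written as a raw string:

# Stand-alone version of the punctuation-stripping step; the sample headline is made up.
import re

regex_remove = re.compile(r'[\'|\"|,|\-|\.| |\?|«|»|:|!|–|@|\(|\)|−]+')

headline = 'Byåsen-fiasko mot «tabelljumboen» - hva nå?'
print(re.sub(regex_remove, ' ', headline).strip())
# -> 'Byåsen fiasko mot tabelljumboen hva nå'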