print(time2 - time1)

# select the indices of the top-scoring articles (argsort is ascending, so the
# last `amount` entries hold the highest logits)
indices = final_logits.argsort(axis=0)[-args.amount:].reshape(args.amount)

# score every vocabulary word against the publication embedding
word_logits = np.dot(word_emb, publication_emb.reshape(args.emb_size, 1)) + word_bias

# gather the bag-of-words rows for the top articles, weight them by the per-word
# logits, then sort each row to find the most and least influential words
top_articles = word_articles[indices.tolist()]
broadcasted_words_per_article = top_articles.toarray() * word_logits.T
sorted_word_indices = broadcasted_words_per_article.argsort(axis=1)

return_articles = []
raw_data = Articles(args.real_data_path)
print(len(raw_data))

# invert the word dictionary so ids can be mapped back to readable tokens
id_to_word = {v: k for k, v in final_word_ids.items()}

i = 0
for idx in indices.tolist():
    current_article = raw_data[int(idx)]
    current_article["logit"] = float(final_logits[int(idx)])
    current_sorted_words = sorted_word_indices[i]
    top_words = []
    least_words = []
    for top_word in current_sorted_words[-20:]:
        word = id_to_word[top_word]
        # skip WordPiece fragments and unused vocabulary slots
        if "unused" not in word and "##" not in word and len(word) > 1:
            top_words.append(word)
    for least_word in current_sorted_words[:20]:
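# A minimal, hedged sketch (toy data, not the repo's variables) of the
# argsort-based top-k selection used above: argsort orders indices from the
# lowest to the highest score, so the last `k` entries point at the
# highest-scoring articles.
import numpy as np

toy_logits = np.array([0.1, 2.3, -0.4, 1.7, 0.9])  # one score per toy article
k = 3
top_indices = toy_logits.argsort()[-k:]   # indices of the 3 largest scores, ascending
print(top_indices)                        # -> [4 3 1]
print(toy_logits[top_indices])            # -> [0.9 1.7 2.3]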
print("Cannot use GPU. Using CPU instead.") print(f"Device: {device}") # set output directory path output_path = Path(args.output_dir) # tensboard log and graph output folder declaration log_tensorboard_dir = output_path / "runs" / args.word_embedding_type writer = SummaryWriter(log_tensorboard_dir) # load datasets train_path = Path(args.train_path) test_path = Path(args.test_path) eval_path = Path(args.eval_path) train_data = Articles(train_path) test_data = Articles(test_path) eval_data = Articles(eval_path) print("Data Loaded") # check if items need to be tokenized if args.map_items and args.tokenize: train_data.tokenize() test_data.tokenize() eval_data.tokenize() print("Items tokenized") # create and save or load dictionaries based on arguments if args.create_dicts: final_word_ids, final_url_ids, final_publication_ids = dictionary.create_merged_dictionaries( train_data.examples, "target")
help="This is required to load dictionaries") parser.add_argument('--dataset_path', type=expand_path, required=True, help='Path to data to be ranked.') args = parser.parse_args() dict_dir = Path(args.dict_dir) final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries( dict_dir) print("Dictionaries loaded.") data_path = Path(args.dataset_path) dataset = Articles(data_path) print("Data loaded.") dataset.tokenize() print("Data tokenized.") word_counter = collections.Counter() for example in dataset.examples: word_counter.update(example['text']) unique_words = [word for word in word_counter.keys()] len(set(unique_words)) abs_model_path = Path(args.model_path) kwargs = dict(n_publications=len(final_publication_ids), n_articles=len(final_url_ids), n_attributes=len(final_word_ids),
if torch.cuda.is_available() and args.use_gpu:
    device = "cuda"
elif not args.use_gpu:
    device = "cpu"
else:
    device = "cpu"
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")
print("-------------------")

# set output directory path
output_path = Path(args.output_dir)

# load in dataset
raw_data_path = Path(args.dataset_path)
raw_data = Articles(raw_data_path)
print("Data Loaded")
print("-------------------")

# load dictionaries from path
dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir)
print("Dictionaries Loaded")
print("-------------------")

# map items to their dictionary values
if args.map_items:
    # initialize tokenizer from BERT library
    tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)
    print("Tokenizer Initialized!")
                    help='Path to data to be ranked.')
parser.add_argument('--mapped_data_dir', type=expand_path, required=True,
                    help="The place to store the mapped data.")
args = parser.parse_args()

# load dictionaries from path
dict_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dict_dir)
print("Dictionaries loaded.")

# load in dataset
data_path = Path(args.dataset_path)
dataset = Articles(data_path)
print("Data loaded.")

# rebuild the model with the same hyperparameters it was trained with and
# restore its saved weights
abs_model_path = Path(args.model_path)
kwargs = dict(n_publications=len(final_publication_ids),
              n_articles=len(final_url_ids),
              n_attributes=len(final_word_ids),
              emb_size=100,
              sparse=False,
              use_article_emb=False,
              mode='mean')
model = InnerProduct(**kwargs)
model.load_state_dict(torch.load(abs_model_path))
print("Model Loaded.")

dataset.tokenize()
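# A hedged sketch of the generic PyTorch save/load-for-inference pattern used
# above, with a hypothetical stand-in module (the real model is the repo's
# InnerProduct class, whose internals are not shown here).
import torch
import torch.nn as nn

class ToyRanker(nn.Module):  # hypothetical stand-in, not the repo's model
    def __init__(self, n_items, emb_size):
        super().__init__()
        self.emb = nn.Embedding(n_items, emb_size)

    def forward(self, ids):
        # mean-pool item embeddings into one vector per example
        return self.emb(ids).mean(dim=1)

toy = ToyRanker(n_items=10, emb_size=100)
torch.save(toy.state_dict(), "toy_ranker.pt")      # what a training run would leave on disk
toy.load_state_dict(torch.load("toy_ranker.pt"))   # restore the saved weights
toy.eval()                                         # switch to inference behaviour
with torch.no_grad():                              # gradients are not needed for ranking
    pooled = toy(torch.tensor([[0, 1, 2]]))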
print("Cannot use GPU. Using CPU instead.") print(f"Device: {device}") # set output directory path output_path = Path(args.output_dir) # tensboard log and graph output folder declaration log_tensorboard_dir = output_path / "runs" / args.word_embedding_type writer = SummaryWriter(log_tensorboard_dir) # load datasets train_path = Path(args.train_path) test_path = Path(args.test_path) eval_path = Path(args.eval_path) train_data = Articles(train_path) test_data = Articles(test_path) eval_data = Articles(eval_path, index_file=args.index_file_path) print("Data Loaded") # initialize tokenizer from BERT library tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True) print("Tokenizer Initialized!") # create and save or load dictionaries based on arguments if args.create_dicts: ( final_word_ids, final_url_ids, final_publication_ids, ) = dictionary.create_merged_dictionaries(
if torch.cuda.is_available() and args.use_gpu:
    device = "cuda"
elif not args.use_gpu:
    device = "cpu"
else:
    device = "cpu"
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")
print("-------------------")

# set output directory path
output_path = Path(args.output_dir)

# load in dataset
raw_data_path = Path(args.dataset_path)
raw_data = Articles(raw_data_path)
print("Data Loaded")
print("-------------------")

# load dictionaries from path
dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir)
print("Dictionaries Loaded")
print("-------------------")

# map items to their dictionary values
if args.map_items:
    # tokenize data and split into words
    raw_data.tokenize()
    # map items to their ids in dictionaries and filter articles
    proper_data = raw_data.map_items(final_word_ids,
)
parser.add_argument(
    "--tokenizer_file",
    type=str,
    help="Designate tokenizer source file.",
)
args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)

dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir
)
print("Dictionaries loaded.")

if args.filter:
    raw_dataset = Articles(args.dataset_path)
    print("Initial: ", len(raw_dataset))
    if args.days is not None:
        filtered_data = raw_dataset.map_items(
            tokenizer,
            final_url_ids,
            final_publication_ids,
            filter=True,
            min_length=args.min_length,
            day_range=args.days,
        )
    else:
        filtered_data = raw_dataset.map_items(
            tokenizer,
            final_url_ids,
            final_publication_ids,
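# A hedged sketch of how the tokenizer initialized above is typically used
# (HuggingFace `tokenizers` library). "vocab.txt" is a hypothetical path to a
# BERT WordPiece vocabulary file, e.g. the one distributed with bert-base-uncased.
from tokenizers import BertWordPieceTokenizer

toy_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)
encoding = toy_tokenizer.encode("A short headline about local politics")
print(encoding.tokens)  # WordPiece tokens, e.g. ['a', 'short', 'headline', ...]
print(encoding.ids)     # matching vocabulary ids used by the dictionaries above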
print("Cannot use GPU. Using CPU instead.") print(f"Device: {device}") # set output directory path output_path = Path(args.output_dir) # load in dataset, add easily returnable link, then create PyTorch Dataset raw_data_path = Path(args.dataset_path) temp_df = pd.read_json(raw_data_path) if "link" not in temp_df.columns: temp_df['link'] = temp_df['url'] if "orig_title" not in temp_df.columns: temp_df['orig_title'] = temp_df['title'] temp_df.to_json(args.dataset_path, orient="records") raw_data = Articles(raw_data_path) print("Data Loaded") # load dictionaries from path dictionary_dir = Path(args.dict_dir) final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries( dictionary_dir) # map items to their dictionary values if args.map_items: raw_data.map_items(final_word_ids, final_url_ids, final_publication_ids) mapped_data_path = Path(args.data_dir) / "mapped-data" print("Mapped Data!") if not mapped_data_path.is_dir(): mapped_data_path.mkdir()