def main():
    # load arguments
    args = parse_args()

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG
    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=level)

    # load splits
    test_doc_ids = utils.read_split(config["split"])
    logging.info(f"Number of test documents: {len(test_doc_ids)}")

    # load dataset
    dataset = utils.read_jsonl(config["dataset"], dict_key="id")

    # create word embeddings for scene labels
    if os.path.basename(config["scene_labels"]) == "places365_en.txt":
        language = "en"
    else:  # places365_de.txt
        language = "de"

    logging.info("Generate word embedding for scene labels ...")
    scene_labels = read_scene_labels(config["scene_labels"])
    scene_word_embeddings = get_scene_word_embeddings(scene_labels,
                                                      fasttext_bin_folder=args.fasttext,
                                                      language=language)

    # generate results for each document
    testset_similarities = {}
    with multiprocessing.Pool(args.threads) as p:
        pool_args = [(doc, test_doc_ids, scene_word_embeddings, config) for doc in dataset.values()]

        cnt_docs = 0
        for document_result in p.imap(calculate_results, pool_args):
            if document_result is None:
                continue

            cnt_docs += 1
            if cnt_docs % 100 == 0:
                logging.info(f"{cnt_docs} / {len(test_doc_ids)} documents processed ...")

            for key, val in document_result.items():
                if key not in testset_similarities:
                    testset_similarities[key] = []
                testset_similarities[key].append(val)

    results = metrics.calculate_metrics(testset_similarities)
    metrics.print_results(results)

    return 0
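# A minimal sketch of what read_scene_labels() and get_scene_word_embeddings() above
# could look like, assuming the scene-label file holds one Places365 label per line
# and that the fastText .bin models are loaded with the official `fasttext` package.
# The "cc.{language}.300.bin" file name and the averaging of per-token vectors are
# assumptions for illustration, not necessarily the repository's actual implementation.
import os
import fasttext
import numpy as np


def read_scene_labels(label_file):
    # one label per line, e.g. "art_gallery"
    with open(label_file) as f:
        return [line.strip() for line in f if line.strip()]


def get_scene_word_embeddings(labels, fasttext_bin_folder, language="en"):
    model = fasttext.load_model(os.path.join(fasttext_bin_folder, f"cc.{language}.300.bin"))
    embeddings = {}
    for label in labels:
        # labels like "art_gallery" contain several words; average their vectors
        tokens = label.replace("_", " ").split()
        vectors = [model.get_word_vector(t) for t in tokens]
        embeddings[label] = np.mean(vectors, axis=0)
    return embeddings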
def process_reviews(product_name):
    print("Processing Reviews...")
    # split the sample reviews for this product by star rating
    ratings_documents = read_split('sample_reviews.json', product_name)
    documents_rating_1 = ratings_documents[1]
    documents_rating_5 = ratings_documents[5]

    # build a bag-of-words corpus and dictionary for each rating group
    bow_corpus_1, dictionary_1 = process_single_product_reviews(documents_rating_1)
    bow_corpus_5, dictionary_5 = process_single_product_reviews(documents_rating_5)

    return bow_corpus_1, dictionary_1, bow_corpus_5, dictionary_5, documents_rating_1, documents_rating_5
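# A possible shape for process_single_product_reviews() used above, assuming the
# reviews arrive as a pandas DataFrame with a "reviewText" column and that the
# bag-of-words corpus is built with gensim. The column name and preprocessing
# choices are assumptions for illustration.
from gensim import corpora
from gensim.utils import simple_preprocess


def process_single_product_reviews(documents):
    # tokenise and lowercase each review text
    tokenized = [simple_preprocess(text) for text in documents['reviewText']]
    # map tokens to integer ids and convert each review to (token_id, count) pairs
    dictionary = corpora.Dictionary(tokenized)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    return bow_corpus, dictionary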
def bert_model(product_name):
    log.info("Processing Reviews...")
    # split the sample reviews for this product by star rating
    ratings_documents, raw_data = read_split('sample_reviews.json', product_name)
    documents_rating_1 = ratings_documents[1]
    documents_rating_5 = ratings_documents[5]

    # collect the raw review texts for the 1-star and 5-star groups
    corpus_1 = documents_rating_1['reviewText'].to_list()
    corpus_5 = documents_rating_5['reviewText'].to_list()

    # encode each group of reviews into sentence embeddings
    corpus_embeddings_1 = bert.encode(corpus_1)
    corpus_embeddings_5 = bert.encode(corpus_5)

    return corpus_embeddings_1, corpus_embeddings_5, corpus_1, corpus_5
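# The `bert` object used in bert_model() is not defined in this excerpt. One way to
# obtain an encoder with a compatible .encode() interface is the sentence-transformers
# library; the model name below is an assumption, not necessarily the one used here.
from sentence_transformers import SentenceTransformer

bert = SentenceTransformer('bert-base-nli-mean-tokens')
# encode() accepts a list of strings and returns one embedding vector per review
embeddings = bert.encode(["Great product, works as advertised."])
print(embeddings.shape)  # (1, 768) for this model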
def main():
    # load arguments
    args = parse_args()

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG
    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=level)

    # load splits
    test_doc_ids = utils.read_split(config["split"])
    logging.info(f"Number of test documents: {len(test_doc_ids)}")

    # load dataset
    dataset = utils.read_jsonl(config["dataset"], dict_key="id")

    # generate results for each document
    testset_similarities = {}
    with multiprocessing.Pool(args.threads) as p:
        pool_args = [(doc, test_doc_ids, config) for doc in dataset.values()]

        cnt_docs = 0
        for document_result in p.imap(calculate_results, pool_args):
            if document_result is None:
                continue

            cnt_docs += 1
            if cnt_docs % 100 == 0:
                logging.info(f"{cnt_docs} / {len(test_doc_ids)} documents processed ...")

            for key, val in document_result.items():
                if key not in testset_similarities:
                    testset_similarities[key] = []
                testset_similarities[key].append(val)

    results = metrics.calculate_metrics(testset_similarities)
    metrics.print_results(results)

    return 0
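# utils.read_split() is called in both main() variants above but not defined in these
# excerpts. A minimal sketch, assuming the split file simply lists one test-document id
# per line; the actual file format in the repository may differ.
def read_split(split_file):
    # return the ids as a set for fast membership tests when filtering documents
    with open(split_file) as f:
        return {line.strip() for line in f if line.strip()}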
edge_types.append((type_num_dict[j], type_num_dict[i]))

# Load data
G = load_graph_data(graph_path)
adjs_orig = get_edge_adj_matrices(G, {et: None for et in edge_types_strings})

# get adjacency matrices for subgraphs and zero out the diagonal (remove self-loops)
adj_orig = nx.to_scipy_sparse_matrix(G)
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

p = dataset_path + "random_splits/" + edge_type + "/random" + str(random_seed) + "/"
G_train, test_positive_e, test_negative_e, val_positive_e, val_negative_e, train_edges = read_split(
    G, edge_type.split("_"), random_seed, p)

t0 = time.time()
adjs_train = get_edge_adj_matrices(G_train, {et: adjs_orig[et]["nodes"] for et in adjs_orig})
adj_train = nx.to_scipy_sparse_matrix(G_train)
# adj = adj_train

k = tuple([type_num_dict[t] for t in edge_type.split("_")])
print("k", k, edge_type)

nodes0 = adjs_orig[edge_type]["nodes"][0]
nodes1 = adjs_orig[edge_type]["nodes"][1]
length = int(sys.argv[6])  # 100

# read the tab-separated edge list and build the graph
G = nx.Graph()
with open(graph_path) as f:
    for line in f:
        a, b = line.strip().split("\t")
        G.add_edge(a, b)
G.remove_edges_from(G.selfloop_edges())

# take the largest connected component
GC = max(nx.connected_component_subgraphs(G), key=len)

p = path + "random_splits/" + edge_type[0] + "_" + edge_type[1] + "/random" + str(random_seed) + "/"
G_train, test_positive, test_negative, val_positive, val_negative, train_edges = read_split(
    GC, edge_type, random_seed, p)

t0 = time.time()
print("Meta path classifier")
if dataset == "bio":
    mpg = MetaPathGeneratorBio(random_seed)
elif dataset == "sicris":
    mpg = MetaPathGeneratorSicris(random_seed)
elif dataset == "imdb":
    mpg = MetaPathGeneratorImdb(random_seed)
elif dataset == "amazon":
    mpg = MetaPathGeneratorAmazon(random_seed)
elif dataset == "yelp":
    mpg = MetaPathGeneratorYelp(random_seed)
# read the tab-separated edge list and build the graph
G = nx.Graph()
with open(graph_path) as f:
    for line in f:
        a, b = line.strip().split("\t")
        G.add_edge(a, b)
G.remove_edges_from(G.selfloop_edges())
print(G.number_of_nodes())

# take the largest connected component
GC = max(nx.connected_component_subgraphs(G), key=len)
print(GC.number_of_nodes())

p = path + "random_splits/" + edge_type[0] + "_" + edge_type[1] + "/random" + str(num) + "/"
print(p)
G_train, test_positive, test_negative, val_positive, val_negative, train_edges = read_split(
    GC, edge_type, num, p)

p = path + "features/" + edge_type[0] + "_" + edge_type[1] + "/random" + str(num) + "/"

t0 = time.time()
simple_model = SimpleClassifier(G_train, train_edges, test_positive, test_negative,
                                val_positive, val_negative, p)
t1 = time.time()
print("Preparation:", t1 - t0)

simple_model.train(method, num)
print("Training:", time.time() - t1)

simple_model.predict()
print("Acc:", simple_model.evaluate())
simple_model.predict(prob=True)
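# The graph-based read_split() shared by the snippets above is not shown. A sketch
# under the assumption that the precomputed split directory p contains one
# whitespace-separated edge list per file (the file names below are hypothetical)
# and that the training graph is the input graph with the positive test/validation
# edges removed; the repository's actual format may differ.
import networkx as nx


def read_split(G, edge_type, seed, p):
    # edge_type and seed are kept for interface compatibility with the callers above;
    # in this sketch the split is assumed to be fully determined by the directory p.
    def load_edges(name):
        with open(p + name) as f:
            return [tuple(line.strip().split()) for line in f if line.strip()]

    test_positive = load_edges("test_positive.txt")  # hypothetical file name
    test_negative = load_edges("test_negative.txt")  # hypothetical file name
    val_positive = load_edges("val_positive.txt")    # hypothetical file name
    val_negative = load_edges("val_negative.txt")    # hypothetical file name

    # training graph: original graph without the held-out positive edges
    G_train = G.copy()
    G_train.remove_edges_from(test_positive + val_positive)
    train_edges = list(G_train.edges())

    return G_train, test_positive, test_negative, val_positive, val_negative, train_edges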