def gen_dataset(graphs):
    random.Random(123).shuffle(graphs)
    dirout_train = get_data_path() + '/IMDBMulti/train'
    dirout_test = get_data_path() + '/IMDBMulti/test'
    create_dir_if_not_exists(dirout_train)
    create_dir_if_not_exists(dirout_test)
    for g in graphs[0:1200]:
        nx.write_gexf(g, dirout_train + '/{}.gexf'.format(g.graph['gid']))
    for g in graphs[1200:]:
        nx.write_gexf(g, dirout_test + '/{}.gexf'.format(g.graph['gid']))
def _get_gs_and_metric_info(name, dir_name, natts, eatts, align_metric):
    train_gs = iterate_get_graphs(
        join(get_data_path(), dir_name, 'train'), natts=natts, eatts=eatts)
    test_gs = iterate_get_graphs(
        join(get_data_path(), dir_name, 'test'), natts=natts, eatts=eatts)
    if name == 'aids700nef_old_small':
        train_gs = train_gs[0:4]
        test_gs = test_gs[0:2]
    graphs = train_gs + test_gs
    dist_or_sim, true_algo = get_ds_metric_config(align_metric)
    return train_gs, test_gs, graphs, dist_or_sim, true_algo
def gen_dataset(graphs):
    random.Random(123).shuffle(graphs)
    dirout_train = get_data_path() + '/{}/train'.format(conf.outfolder)
    dirout_test = get_data_path() + '/{}/test'.format(conf.outfolder)
    create_dir_if_not_exists(dirout_train)
    create_dir_if_not_exists(dirout_test)
    sp = int(len(graphs) * conf.train_perc_)
    for g in graphs[0:sp]:
        nx.write_gexf(g, dirout_train + '/{}.gexf'.format(g.graph['gid']))
    for g in graphs[sp:]:
        nx.write_gexf(g, dirout_test + '/{}.gexf'.format(g.graph['gid']))
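# The two gen_dataset variants above write a shuffled train/test split of
# NetworkX graphs to GEXF files. A minimal, hypothetical driver is sketched
# below; the input directory, the glob pattern, and the 'gid' assignment are
# illustrative assumptions rather than part of the original pipeline.
from glob import glob
import networkx as nx

def _example_gen_dataset_driver(indir):
    graphs = []
    for i, path in enumerate(sorted(glob(indir + '/*.gexf'))):
        g = nx.read_gexf(path)
        g.graph['gid'] = i  # gen_dataset expects a 'gid' graph attribute
        graphs.append(g)
    gen_dataset(graphs)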
def load_fashion_mnist(with_y=False):
    datapath = get_data_path("fashion_mnist")
    paths = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    datasets = [os.path.join(datapath, fp) for fp in paths]
    if not os.path.isfile(datasets[0]):
        urls = [
            os.path.join(
                "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com",
                fp) for fp in paths
        ]
        for url, fn in zip(urls, paths):
            print("Downloading %s data..." % (fn))
            urlretrieve(url, os.path.join(datapath, fn))
    train_x, train_t = load_mnist(datapath, "train")
    test_x, test_t = load_mnist(datapath, "t10k")
    if with_y:
        return (train_x, train_t), (test_x, test_t)
    return train_x, test_x
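# `load_mnist` is referenced above but not shown in this collection. A minimal
# sketch of what it is assumed to do (read the gzipped IDX files for a split
# and return image and label arrays) follows; the 16- and 8-byte header
# offsets are standard for the IDX format, but the original helper may differ.
import gzip
import os
import numpy as np

def load_mnist(datapath, split):
    with gzip.open(os.path.join(datapath, "%s-images-idx3-ubyte.gz" % split), "rb") as f:
        images = np.frombuffer(f.read(), dtype=np.uint8, offset=16)
        images = images.reshape(-1, 28 * 28).astype("float32") / 255.
    with gzip.open(os.path.join(datapath, "%s-labels-idx1-ubyte.gz" % split), "rb") as f:
        labels = np.frombuffer(f.read(), dtype=np.uint8, offset=8)
    return images, labels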
def load_mnist_binarized():
    datapath = get_data_path("mnist")
    dataset = os.path.join(datapath, "mnist.gz")
    if not os.path.isfile(dataset):
        datafiles = {
            "train": "http://www.cs.toronto.edu/~larocheh/public/"
                     "datasets/binarized_mnist/binarized_mnist_train.amat",
            "valid": "http://www.cs.toronto.edu/~larocheh/public/datasets/"
                     "binarized_mnist/binarized_mnist_valid.amat",
            "test": "http://www.cs.toronto.edu/~larocheh/public/datasets/"
                    "binarized_mnist/binarized_mnist_test.amat"
        }
        datasplits = {}
        for split in datafiles.keys():
            print("Downloading %s data..." % (split))
            datasplits[split] = np.loadtxt(urlretrieve(datafiles[split])[0])
        pkl.dump(
            [datasplits['train'], datasplits['valid'], datasplits['test']],
            open(dataset, "wb"))
    x_train, x_valid, x_test = pkl.load(open(dataset, "rb"))
    return x_train, x_valid, x_test
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):
        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
def __init__(self, sset="train", transform=None): path = get_data_path('coco') self.root = os.path.join(path["COCO_ROOT"], "images/") self.transform = transform # dataset.json come from Karpathy neural talk repository and contain the restval split of coco with open(path["COCO_RESTVAL_SPLIT"], 'r') as f: datas = json.load(f) if sset == "train": self.content = [ x for x in datas["images"] if x["split"] == "train" ] elif sset == "trainrv": self.content = [ x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval" ] elif sset == "val": self.content = [x for x in datas["images"] if x["split"] == "val"] else: self.content = [x for x in datas["images"] if x["split"] == "test"] self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content] self.word_dict_path = path["WORD_DICT"] path_params = os.path.join(self.word_dict_path, 'utable.npy') self.params = np.load(path_params, encoding='latin1', allow_pickle=True) self.dico = _load_dictionary(self.word_dict_path)
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'extractor.coffee')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        # skip already extracted
        if os.path.exists(os.path.join(path, '%03d.json' % id)):
            continue

        print '[extractor] #%03d: %s' % (id, url)
        subprocess.call(
            'cd "%(path)s" && phantomjs "%(extractor)s" "%(url)s" "%(label)03d" > "%(label)03d.log" 2>&1' % {
                'path': path,
                'extractor': extractor,
                'url': url,
                'label': id,
            },
            shell=True)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    data = []
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        print '[diffbot] #%03d: %s' % (id, url)
        response = urllib2.urlopen(
            'http://www.diffbot.com/api/article?' + urllib.urlencode({
                'url': url,
                'token': '4bc6e407da88dd8723c70a5297cdf7fb',
                'timeout': '60000',
            }))
        data.append(json.loads(response.read()))

    with open(os.path.join(path, 'diffbot.json'), 'w') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf8'))
def test_model(args):
    models = os.listdir(args.save_path)

    # load dataset
    data_paths = get_data_path(args.mode, args.label_type)
    datasets = BertSumLoader().process(data_paths)
    print('Information of dataset is:')
    print(datasets)
    test_set = datasets.datasets['test']

    # only need 1 gpu for testing
    device = int(args.gpus)
    args.batch_size = 1

    for cur_model in models:
        print('Current model is {}'.format(cur_model))

        # load model
        model = torch.load(join(args.save_path, cur_model))

        # configure testing
        original_path, dec_path, ref_path = get_rouge_path(args.label_type)
        test_metric = RougeMetric(data_path=original_path, dec_path=dec_path,
                                  ref_path=ref_path, n_total=len(test_set))
        tester = Tester(data=test_set, model=model, metrics=[test_metric],
                        batch_size=args.batch_size, device=device)
        tester.test()
def extract_files(self):
    self.extract_error = None
    location = self.get_setting('download_dir').value
    version = self.selected_version()
    for setting_name, setting in self.settings['export_settings'].items():
        save_file_path = setting.save_file_path(version, location)
        try:
            if setting.value:
                extract_path = get_data_path('files/' + setting.name)
                setting.extract(extract_path, version)
                #if os.path.exists(save_file_path):
                #    setting_fbytes = setting.get_file_bytes(version)
                #    for dest_file, fbytes in setting_fbytes:
                #        path = utils.path_join(extract_path, dest_file)
                #        with open(path, 'wb+') as d:
                #            d.write(fbytes)
                #    self.progress_text += '.'
                self.progress_text += '.'
        except (tarfile.ReadError, zipfile.BadZipfile) as e:
            if os.path.exists(save_file_path):
                os.remove(save_file_path)
            self.extract_error = e
            self.logger.error(unicode(self.extract_error))
            # cannot use GUI in thread to notify user. Save it for later
    self.progress_text = '\nDone.\n'
    return True
def render_training(self):
    history = self.history.history
    loss = [history['loss'][0]] + history['loss']
    val_loss = [history['val_loss'][0]] + history['val_loss']
    lr = [-log10(r) for r in self.scheduler.history_lr]
    lr = lr + [lr[-1]]

    import matplotlib.pyplot as plt
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'k', label='Training loss')
    plt.plot(epochs, val_loss, 'y', label='Validation loss')
    plt.plot(epochs, lr, 'r', label='Learning rate (1e-X)')
    min_max = [
        r or 0 for r in [self.scheduler.min_rate, self.scheduler.max_rate]
    ]
    plt.title(
        f'Training and validation loss [{min_max[0]:0.0E} - {min_max[1]:0.0E}]'
    )
    plt.legend()
    fname = 'loss'
    plt.savefig(
        utils.get_data_path(settings.PLOT_PATH,
                            utils.get_exp_key(self) + '-' + fname + '.svg'))
    plt.close()
def __init__(self, x, y):
    Platform.__init__(self, x, y)
    boltAnim = []
    for anim in ANIMATION_FIRE:
        boltAnim.append((anim, ANIMATION_DELAY))
    self.boltAnim = pyganim.PygAnimation(boltAnim)
    self.boltAnim.play()
    self.image = pygame.image.load(get_data_path('Fogo_1.png', 'img'))
def init(self):
    self.graphs = []
    datadir = '{}/LINUX/{}'.format(get_data_path(), get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    self.graphs, self.glabels = add_glabel_to_each_graph(self.graphs, '', True)
    assert self.glabels is None  # fake graph labels
def download_path(path=None):
    # Ensure that the default download path exists
    path = path or utils.get_data_path('files/downloads')
    try:
        os.makedirs(path)
    except:
        pass
    return path
def gen_graphs():
    dirin = get_data_path()
    file = dirin + '/linux_Format-2'
    # train_dirout = dirin + '/train'
    # test_dirout = dirin + '/test'
    # dirin = get_data_path() + '/iGraph20/datasets'
    # file = dirin + '/nasa.igraph'
    train_dirout = dirin + '/train'
    test_dirout = dirin + '/test'
    graphs = {}
    gid = None
    types_count = defaultdict(int)
    total_num_nodes = 0
    disconnects = set()
    less_than_eq_10 = set()
    types_count_less_than_eq_10 = defaultdict(int)
    total_num_nodes_less_than_eq_10 = 0
    with open(file) as f:
        for line in f:
            ls = line.rstrip().split()
            if ls[0] == 't':
                assert (len(ls) == 3)
                assert (ls[1] == '#')
                if gid:
                    assert (gid not in graphs)
                    graphs[gid] = g
                    print(gid, g.number_of_nodes())
                    if g.number_of_nodes() <= 10 and nx.is_connected(g):
                        less_than_eq_10.add(gid)
                        total_num_nodes_less_than_eq_10 += g.number_of_nodes()
                        d = nx.get_node_attributes(g, 'type')
                        for _, type in d.items():
                            types_count_less_than_eq_10[type] += 1
                    if not nx.is_connected(g):
                        disconnects.add(g)
                g = nx.Graph()
                gid = int(ls[2])
            elif ls[0] == 'v':
                assert (len(ls) == 3)
                type = int(ls[2])
                types_count[type] += 1
                g.add_node(int(ls[1]), type=type)
                total_num_nodes += 1
            elif ls[0] == 'e':
                assert (len(ls) == 4)
                edge_type = int(ls[3])
                assert (edge_type == 0)
                g.add_edge(int(ls[1]), int(ls[2]))
    print(len(graphs), 'graphs in total')
    print(len(types_count), 'node types out of total', total_num_nodes, 'nodes')
    print(len(disconnects), 'disconnected graphs')
    for i in range(10):
        print(i, types_count[i])
    print(len(less_than_eq_10), 'small graphs (<= 10 nodes)')
    print(len(types_count_less_than_eq_10), 'node types out of total',
          total_num_nodes_less_than_eq_10, 'nodes')
    select_dump_graphs(graphs, sorted(list(less_than_eq_10)))
def __init__(self, datestr=None):
    if datestr is None:
        datestr = self.datestr
    self.path = get_data_path(__file__)
    assert os.path.exists(self.path)
    files = [self.file_fmt.format(date=datestr, name=f) for f in self.files]
    self.file_paths = [os.path.join(self.path, f) for f in files]
    assert all([os.path.exists(f) for f in self.file_paths])
def main(filename, user1, user2, user3, color1, color2, color3, color_lands,
         name_out, map_size, line_width):
    '''Main function: load the databases and render the travel map.

    Args:
        filename (str): name of the file to load
        user1 (str): name of traveler 1
        user2 (str): name of traveler 2
        user3 (str): name of traveler 3
        color1 (str): RGB color for traveler 1
        color2 (str): RGB color for traveler 2
        color3 (str): RGB color for traveler 3
        color_lands (str): RGB color for land masses
        map_size (str): map size

    Raises:
        TypeError: if filename is not a str
        TypeError: if user1 is not a str
        TypeError: if user2 is not a str
        TypeError: if user3 is not a str
        TypeError: if color1 is not a str
        TypeError: if color2 is not a str
        TypeError: if color3 is not a str
        TypeError: if color_lands is not a str
        TypeError: if map_size is not a str
    '''
    utils.info('Loading the databases')
    if type(filename) != str:
        raise TypeError('filename must be a str.')
    if type(user1) != str:
        raise TypeError('user1 must be a str.')
    if type(user2) != str:
        raise TypeError('user2 must be a str.')
    if type(user3) != str:
        raise TypeError('user3 must be a str.')
    if type(color1) != str:
        raise TypeError('color1 must be a str.')
    if type(color2) != str:
        raise TypeError('color2 must be a str.')
    if type(color3) != str:
        raise TypeError('color3 must be a str.')
    if type(map_size) != str:
        raise TypeError('map_size must be a str.')
    if type(color_lands) != str:
        raise TypeError('color_lands must be a str.')

    data_dir = utils.get_data_path()
    images_dir = utils.get_images_path()
    df_path = os.path.join(data_dir, filename)
    image_path = os.path.join(images_dir, name_out)

    df_journeys = pd.read_excel(df_path)
    df_journeys = prepare_table(df_journeys, user1, user2, user3,
                                color1, color2, color3)
    fig = make_map(df_journeys, color_lands, map_size, line_width)
    # fig.show()
    fig.write_image(image_path, width=12800, height=8400, scale=1)
def __init__(self):
    super(App, self).__init__()
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)
    self.data_path = get_data_path()
    self.tabs = []
    self.options_menu_widget = None
    self.tab_widget = None
    self.setups()
def __init__(self, kuruczPfPath: Optional[str]=None, metallicity: float=0.0,
             abundances: Dict=None, abundDex: bool=True):
    if set(AtomicWeights.keys()) != set(AtomicAbundances.keys()):
        raise ValueError('AtomicWeights and AtomicAbundances keys differ (Problem keys: %s)'
                         % repr(set(AtomicWeights.keys()) - set(AtomicAbundances.keys())))

    self.indices = OrderedDict(zip(AtomicWeights.keys(), range(len(AtomicWeights))))

    # Convert abundances and overwrite any provided secondary abundances
    self.abund = deepcopy(AtomicAbundances)
    if self.abund['H '] == 12.0:
        for k, v in self.abund.items():
            self.abund[k] = 10**(v - 12.0)

    if abundances is not None:
        if abundDex:
            for k, v in abundances.items():
                abundances[k] = 10**(v - 12.0)
        for k, v in abundances.items():
            self.abund[k] = v

    metallicity = 10**metallicity
    for k, v in self.abund.items():
        if k != 'H ':
            self.abund[k] = v * metallicity

    kuruczPfPath = get_data_path() + 'pf_Kurucz.input' if kuruczPfPath is None else kuruczPfPath
    with open(kuruczPfPath, 'rb') as f:
        s = f.read()
    u = Unpacker(s)

    self.Tpf = np.array(u.unpack_array(u.unpack_double))
    ptIndex = []  # Index in the periodic table (fortran based, so +1) -- could be used for validation
    stages = []
    pf = []
    ionpot = []
    for i in range(len(AtomicWeights)):
        ptIndex.append(u.unpack_int())
        stages.append(u.unpack_int())
        pf.append(np.array(u.unpack_farray(stages[-1] * self.Tpf.shape[0],
                                           u.unpack_double)).reshape(stages[-1], self.Tpf.shape[0]))
        ionpot.append(np.array(u.unpack_farray(stages[-1], u.unpack_double)))

    ionpot = [i * Const.HC / Const.CM_TO_M for i in ionpot]
    pf = [np.log(p) for p in pf]

    totalAbund = 0.0
    avgWeight = 0.0
    self.elements: List[Element] = []
    for k, v in AtomicWeights.items():
        i = self.indices[k]
        ele = Element(k, v, self.abund[k], ionpot[i], self.Tpf, pf[i])
        self.elements.append(ele)
        totalAbund += ele.abundance
        avgWeight += ele.abundance * ele.weight

    self.totalAbundance = totalAbund
    self.weightPerH = avgWeight
    self.avgMolWeight = avgWeight / totalAbund
def get_proc_graphs(datadir, train):
    if logging_enabled == True:
        print("- Entered data::get_proc_graphs Global Method")

    datadir = '{}\\{}\\{}'.format(get_data_path(), datadir, get_train_str(train))
    graphs = iterate_get_graphs(datadir)
    print('info: Loaded {} graphs from {}'.format(len(graphs), datadir))
    return graphs
def load_cifar(levels=256, with_y=False):
    dataset = 'cifar-10-python.tar.gz'
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(get_data_path("cifar10"), dataset)
        if os.path.isfile(new_path) or data_file == 'cifar-10-python.tar.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'cifar-10-python.tar.gz':
        origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
        print("Downloading data from {}...".format(origin))
        urlretrieve(origin, dataset)

    f = tarfile.open(dataset, 'r:gz')
    b1 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
    b2 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_2"), encoding="bytes")
    b3 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_3"), encoding="bytes")
    b4 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_4"), encoding="bytes")
    b5 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_5"), encoding="bytes")
    test = pkl.load(f.extractfile("cifar-10-batches-py/test_batch"), encoding="bytes")

    train_x = np.concatenate(
        [b1[b'data'], b2[b'data'], b3[b'data'], b4[b'data'], b5[b'data']],
        axis=0) / 255.
    train_x = np.asarray(train_x, dtype='float32')
    train_t = np.concatenate([
        np.array(b1[b'labels']),
        np.array(b2[b'labels']),
        np.array(b3[b'labels']),
        np.array(b4[b'labels']),
        np.array(b5[b'labels'])
    ], axis=0)

    test_x = test[b'data'] / 255.
    test_x = np.asarray(test_x, dtype='float32')
    test_t = np.array(test[b'labels'])
    f.close()

    train_x = train_x.reshape((train_x.shape[0], 3, 32, 32)).transpose(
        (0, 2, 3, 1)).reshape((train_x.shape[0], -1))
    test_x = test_x.reshape((test_x.shape[0], 3, 32, 32)).transpose(
        (0, 2, 3, 1)).reshape((test_x.shape[0], -1))

    train_x = quantisize(train_x, levels) / (levels - 1.)
    test_x = quantisize(test_x, levels) / (levels - 1.)

    if with_y:
        return (train_x, train_t), (test_x, test_t)
    return train_x, test_x
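# `quantisize` is referenced above but not defined in this collection. A common
# implementation bins pixel values in [0, 1] into `levels` integer bins; treat
# the sketch below as an assumed stand-in rather than the original helper.
import numpy as np

def quantisize(images, levels):
    # np.digitize maps each value to a bin index in 1..levels; shifting by one
    # yields integer codes in 0..levels-1
    return (np.digitize(images, np.arange(levels) / levels) - 1).astype('i')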
def init(self):
    self.graphs = []
    datadir = '{}/{}/{}'.format(get_data_path(), self.get_folder_name(),
                                get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    if 'nef' in self.get_folder_name():
        print('Removing edge features')
        for g in self.graphs:
            self._remove_valence(g)
def predict(tree, toy_id, true_category, target):
    path = get_data_path(dev=False)
    df = importData(path)
    X_test = df.loc[df['id'] == toy_id]
    X_test = X_test.drop('id', axis=1)
    print(X_test)
    y_pred = tree.predict(X_test)
    return y_pred
def read_y_true_for_all_classes():
    y_true = []
    for class_number in range(1, 12):
        y_true.append(read_y_true_for_class(
            utils.get_data_path() + "/Class{}.csv".format(class_number)))

    # Build an array of shape [n_samples, number of classes] in which each
    # position holds the labeled data for the given class
    y_true = np.array(y_true)
    y_true = np.transpose(y_true)
    return y_true
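# `read_y_true_for_class` is not shown here. A minimal sketch under the
# assumption that each ClassN.csv holds one label per sample is given below;
# the actual column layout of the original files may differ.
import numpy as np

def read_y_true_for_class(csv_path):
    # one label per row, no header assumed
    return np.loadtxt(csv_path, delimiter=",", dtype=int)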
def command_base():
    config.TESTING = True
    dpath = utils.get_data_path('')
    if os.path.exists(dpath):
        utils.rmtree(dpath)
    base = CommandBase()
    base._project_name = 'Test'
    return base
def clean_data(dataset):
    clean_text_path = join(get_data_path(), 'corpus', dataset + '_sentences_clean.txt')
    if not exists(clean_text_path):
        docs_list = []
        old_name = dataset
        if "no_hashtag" in dataset:
            dataset = '_'.join(dataset.split('_')[:-2])
        with open(join(get_data_path(), 'corpus', dataset + '_sentences.txt')) as f:
            for line in f.readlines():
                docs_list.append(line.strip())
        dataset = old_name

        word_counts = defaultdict(int)
        for doc in docs_list:
            temp = clean_doc(doc, dataset)
            words = temp.split()
            for word in words:
                word_counts[word] += 1

        clean_docs = clean_documents(docs_list, word_counts, dataset)
        corpus_str = '\n'.join(clean_docs)
        f = open(clean_text_path, 'w')
        f.write(corpus_str)
        f.close()

    f = open(clean_text_path, 'r')
    lines = f.readlines()
    min_len = 10000
    aver_len = 0
    max_len = 0
    for line in lines:
        line = line.strip()
        temp = line.split()
        aver_len = aver_len + len(temp)
        if len(temp) < min_len:
            min_len = len(temp)
        if len(temp) > max_len:
            max_len = len(temp)
    f.close()
    aver_len = 1.0 * aver_len / len(lines)
    print('min_len : ' + str(min_len))
    print('max_len : ' + str(max_len))
    print('average_len : ' + str(aver_len))
def main():
    dirin = get_data_path() + '/{}/graph'.format(conf.infolder)
    k = float('inf')
    lesseqk = []
    glabel_map = read_graph_labels()
    info_map = {}
    disconnected = []
    files = glob(dirin + '/*.gexf')
    if conf.need_sort_:
        files = sorted_nicely(files)
    for i, file in enumerate(files):
        g = nx.read_gexf(file)
        gid = get_file_base_id(file)
        print(i, gid, g.number_of_nodes())
        if g.number_of_nodes() <= k:
            if not nx.is_connected(g):
                print(gid, 'is not connected')
                gsize = g.number_of_nodes()
                g = max(nx.connected_component_subgraphs(g), key=len)
                grmd = gsize - g.number_of_nodes()
                assert (grmd > 0)
                g_info = 'rm_{}_nodes'.format(grmd)
                disconnected.append(g)
            else:
                g_info = ''
                lesseqk.append(g)
            info_map[gid] = g_info
            g.graph['gid'] = gid
            g.graph['label'] = glabel_map[gid]
            for node, d in g.nodes(data=True):
                type = d['node_class']
                if conf.has_node_type:
                    d.pop('node_class')
                    d['type'] = type
            for edge in g.edges_iter(data=True):
                del edge[2]['weight']
    print(len(lesseqk))
    gen_dataset(lesseqk)
    gen_dataset(disconnected)
    save_glabels_as_txt(get_data_path() + '/{}/glabels'.format(conf.outfolder), glabel_map)
    save_glabels_as_txt(get_data_path() + '/{}/info'.format(conf.outfolder), info_map)
def main(categories, data, dev: bool):
    target = categories[0]

    # import the data without classification
    path = get_data_path(dev)
    df = importData(path)
    if target in df.columns:
        target = "target"

    # preprocess the data by adding the target column with the given values
    df = preprocess(df, target, data)
    X_train, X_test, y_train, y_test = split(df)
    tree = train(X_train, y_train)
    return tree
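# `split` above is assumed to be a thin wrapper around scikit-learn's
# train_test_split that separates the "target" column from the features. A
# hypothetical sketch follows; the test size and random seed are illustrative
# assumptions, not the original configuration.
from sklearn.model_selection import train_test_split

def split(df, test_size=0.25, random_state=0):
    X = df.drop("target", axis=1)
    y = df["target"]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)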
def init(self):
    self.graphs = []
    datadir = '{}/{}/{}'.format(get_data_path(), self.get_folder_name(),
                                get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    if 'nef' in self.get_folder_name():
        print('Removing edge features')
        for g in self.graphs:
            self._remove_valence(g)
    self.graphs, self.glabels = add_glabel_to_each_graph(self.graphs, '', True)
    assert self.glabels is None  # fake graph labels
def classify_class(class_number, features, test_features):
    y_true = read_y_true_for_class(
        utils.get_data_path() + "/Class{}.csv".format(class_number))

    # splits the train data into train and validation, with validation being
    # 20% of the original train data set
    x_train, x_validation, y_train, y_validation = train_test_split(
        features, y_true, test_size=0.20, random_state=0)

    classifier = create_rf_classifier(240)
    classifier.fit(x_train, y_train)
    score = classifier.score(x_validation, y_validation)
    print("Training score for Class {}: {:0.2f}".format(class_number, score))
    return classifier.predict(test_features).tolist()
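# `create_rf_classifier` is referenced above but not included in this
# collection. A minimal sketch of the assumed helper, wrapping scikit-learn's
# RandomForestClassifier with the requested number of trees; the extra keyword
# arguments are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier

def create_rf_classifier(n_estimators):
    return RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=0)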
# we are running in a normal Python environment
CWD = os.getcwd()


def get_file(path):
    parts = path.split('/')
    independent_path = utils.path_join(CWD, *parts)
    return independent_path


__version__ = "v0.0.0"

with open(get_file('files/version.txt')) as f:
    __version__ = f.read().strip()

TEMP_DIR = get_temp_dir()
DEFAULT_DOWNLOAD_PATH = get_data_path('files/downloads')

logger = logging.getLogger('W2E logger')
LOG_FILENAME = get_data_file_path('files/error.log')
if __name__ != '__main__':
    logging.basicConfig(
        filename=LOG_FILENAME,
        format=("%(levelname) -10s %(asctime)s %(module)s.py: "
                "%(lineno)s %(funcName)s - %(message)s"),
        level=logging.DEBUG
    )
    logger = logging.getLogger('W2E logger')
    handler = lh.RotatingFileHandler(LOG_FILENAME, maxBytes=100000, backupCount=2)
    logger.addHandler(handler)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)
    res = [texts1, labels]
    # print len(texts1)

    # encode the unicode strings into plain str values
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))
    for l in urls:
        urlss.append(l.encode('ascii', 'ignore'))

    with open("rohit.csv", "w") as fp:
        writer = csv.writer(fp)
        for row in zip(urls, lab, labels, classes):
            writer.writerow(row)

    input("enter data ")

    # Label the dataset: read back the class each row belongs to.
    # Classes are in the 4th column.
    classes = []
    with open("rohit.csv", "r") as fp:
        reader = csv.reader(fp)
        for row in reader:
            classes += [row[3]]
    for i in xrange(1, len(classes)):
        if classes[i] == 0:
            classes[i] = cluster_labels[i]

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    #print len(discrete_features[2])
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    #print features

    # scale (preprocess) the features
    features = preprocessing.scale(features)

    # build a random forest model and fit it on the features and classes
    rf = RandomForestClassifier(n_estimators=300)
    rf.fit(features, classes)

    # dump the model file into the given directory
    filename = '/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl'
    _ = joblib.dump(rf, filename, compress=9)
    rf = joblib.load(filename)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    return
def send_image(filename):
    path = get_data_path(app.config.get('AVATAR_FILE_CONF')['path'])
    return send_from_directory(path, filename)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    # Retrieve the clustered dataset and store it in the lists below.
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)

    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))  # encode the unicode into text

    # Feature extraction: vectorize the discrete features, then resize the
    # array to a fixed width so the model and the test data share the same
    # feature length.
    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # Normalization: scale the features.
    features = preprocessing.scale(features)

    im1 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/dazedandconfused/000.png")
    im2 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/fruitsofotherhands/000.png")
    im3 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/rohitanurag/000.png")
    im4 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/thegirlwhoreadtoomuch/000.png")
    im5 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/timcotson/000.png")
    imtest = Image.open(path + "/000.png")
    result1 = equal(imtest, im1)
    result2 = equal(imtest, im2)
    result3 = equal(imtest, im3)
    result4 = equal(imtest, im4)
    result5 = equal(imtest, im5)

    choose = 0
    testresult = result1
    if result1 <= testresult:
        choose = 1
        testresult = result1
    if result2 <= testresult:
        choose = 2
        testresult = result2
    if result3 <= testresult:
        choose = 3
        testresult = result3
    if result4 <= testresult:
        choose = 4
        testresult = result4
    if result5 <= testresult:
        choose = 5
        testresult = result5

    # Pick the pre-trained model for the closest-looking site; it is used to
    # predict classes such as title, date and paragraphs of blogs.
    if choose == 1:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfdazedandconfused.joblib.pkl"
    if choose == 2:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rffruitsofother.joblib.pkl"
    if choose == 3:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfrohitanurag.joblib.pkl"
    if choose == 4:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfthegirlwhoused.joblib.pkl"
    if choose == 5:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rftimscoton.joblib.pkl"

    usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl"
    rf = joblib.load(usemodel)  # load the model and use it for prediction
    predicted = rf.predict(features)
    print usemodel
    for i in xrange(1, len(predicted)):
        print lab[i]
        print "*********"
        print predicted[i]
        print "**********"
    return
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    # above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url
    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous features
            continuous_feature = []  # text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # scale features (after stacking, so that `features` is defined)
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return
def main(args): path = utils.get_data_path(args.site[0]) urls = utils.load_urls(path) # load data data = [utils.load_data(path, id) for id, url in enumerate(urls)] random.shuffle(data) for page in data: random.shuffle(page["texts"]) # process data processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer) features = processor.extract() # clustering clusterer = clusterers.DBSCAN() labels = clusterer.cluster(features).labels_ # prepare features continuous_features, discrete_features, labels = processor.prepare(labels) vectorizer = DictVectorizer() discrete_features = vectorizer.fit_transform(discrete_features).toarray() continuous_features = np.array(continuous_features) labels = np.array(labels).astype(np.float32) features = np.hstack([continuous_features, discrete_features]).astype(np.float32) # scale features features = preprocessing.scale(features) print features.shape precisions = [] recalls = [] f1scores = [] supports = [] rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0) for train_index, test_index in rs: print "training size = %d, testing size = %d" % (len(train_index), len(test_index)) clf = svm.SVC( verbose=False, kernel="linear", probability=False, random_state=0, cache_size=2000, class_weight="auto" ) clf.fit(features[train_index], labels[train_index]) print clf.n_support_ print "training:" predicted = clf.predict(features[train_index]) print classification_report(labels[train_index], predicted) print "testing:" predicted = clf.predict(features[test_index]) print classification_report(labels[test_index], predicted) precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted) precisions.append(precision) recalls.append(recall) f1scores.append(f1score) supports.append(support) precisions = np.mean(np.array(precisions), axis=0) recalls = np.mean(np.array(recalls), axis=0) f1scores = np.mean(np.array(f1scores), axis=0) supports = np.mean(np.array(supports), axis=0) for label in range(2): print "%f\t%f\t%f\t%f" % (precisions[label], recalls[label], f1scores[label], supports[label]) return negatives = [] positives = [] for i in range(len(processor.texts)): if labels[i]: positives.append(processor.texts[i]) else: negatives.append(processor.texts[i]) stats(negatives, positives) return """
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print 'clustering %s ...' % site
        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return
    """
def make_output_dirs(self):
    self.output_err = ''
    try:
        self.progress_text = 'Removing old output directory...\n'

        output_dir = utils.path_join(self.output_dir(), self.project_name())
        if os.path.exists(output_dir):
            utils.rmtree(output_dir, ignore_errors=True)

        temp_dir = utils.path_join(TEMP_DIR, 'webexectemp')
        if os.path.exists(temp_dir):
            utils.rmtree(temp_dir, ignore_errors=True)

        self.progress_text = 'Making new directories...\n'

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        os.makedirs(temp_dir)

        self.copy_files_to_project_folder()

        json_file = utils.path_join(self.project_dir(), 'package.json')
        global_json = utils.get_data_file_path('files/global.json')

        if self.output_package_json:
            with codecs.open(json_file, 'w+', encoding='utf-8') as f:
                f.write(self.generate_json())

        with codecs.open(global_json, 'w+', encoding='utf-8') as f:
            f.write(self.generate_json(global_json=True))

        zip_file = utils.path_join(temp_dir, self.project_name() + '.nw')
        app_nw_folder = utils.path_join(temp_dir, self.project_name() + '.nwf')

        utils.copytree(self.project_dir(), app_nw_folder,
                       ignore=shutil.ignore_patterns(output_dir))

        zip_files(zip_file, self.project_dir(), exclude_paths=[output_dir])

        for ex_setting in self.settings['export_settings'].values():
            if ex_setting.value:
                self.progress_text = '\n'
                name = ex_setting.display_name
                self.progress_text = u'Making files for {}...'.format(name)

                export_dest = utils.path_join(output_dir, ex_setting.name)
                versions = re.findall('(\d+)\.(\d+)\.(\d+)', self.selected_version())[0]
                minor = int(versions[1])
                if minor >= 12:
                    export_dest = export_dest.replace('node-webkit', 'nwjs')

                if os.path.exists(export_dest):
                    utils.rmtree(export_dest, ignore_errors=True)

                # shutil will make the directory for us
                utils.copytree(get_data_path('files/' + ex_setting.name),
                               export_dest,
                               ignore=shutil.ignore_patterns('place_holder.txt'))
                utils.rmtree(get_data_path('files/' + ex_setting.name), ignore_errors=True)
                self.progress_text += '.'

                if 'mac' in ex_setting.name:
                    uncomp_setting = self.get_setting('uncompressed_folder')
                    uncompressed = uncomp_setting.value
                    app_path = utils.path_join(export_dest, self.project_name() + '.app')
                    try:
                        utils.move(utils.path_join(export_dest, 'nwjs.app'), app_path)
                    except IOError:
                        utils.move(utils.path_join(export_dest, 'node-webkit.app'), app_path)

                    plist_path = utils.path_join(app_path, 'Contents', 'Info.plist')
                    plist_dict = plistlib.readPlist(plist_path)

                    plist_dict['CFBundleDisplayName'] = self.project_name()
                    plist_dict['CFBundleName'] = self.project_name()
                    version_setting = self.get_setting('version')
                    plist_dict['CFBundleShortVersionString'] = version_setting.value
                    plist_dict['CFBundleVersion'] = version_setting.value

                    plistlib.writePlist(plist_dict, plist_path)
                    self.progress_text += '.'

                    app_nw_res = utils.path_join(app_path, 'Contents', 'Resources', 'app.nw')
                    if uncompressed:
                        utils.copytree(app_nw_folder, app_nw_res)
                    else:
                        utils.copy(zip_file, app_nw_res)
                    self.create_icns_for_app(utils.path_join(app_path, 'Contents', 'Resources', 'nw.icns'))
                    self.progress_text += '.'
                else:
                    ext = ''
                    windows = False
                    if 'windows' in ex_setting.name:
                        ext = '.exe'
                        windows = True

                    nw_path = utils.path_join(export_dest, ex_setting.dest_files[0])

                    if windows:
                        self.replace_icon_in_exe(nw_path)

                    self.compress_nw(nw_path)

                    dest_binary_path = utils.path_join(export_dest, self.project_name() + ext)
                    if 'linux' in ex_setting.name:
                        self.make_desktop_file(dest_binary_path, export_dest)

                    join_files(dest_binary_path, nw_path, zip_file)

                    sevenfivefive = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                                     stat.S_IROTH | stat.S_IXOTH)
                    os.chmod(dest_binary_path, sevenfivefive)

                    self.progress_text += '.'

                    if os.path.exists(nw_path):
                        os.remove(nw_path)
    except Exception:
        error = u''.join([unicode(x) for x in
                          traceback.format_exception(sys.exc_info()[0],
                                                     sys.exc_info()[1],
                                                     sys.exc_info()[2])])
        self.logger.error(error)
        self.output_err += error
    finally:
        utils.rmtree(temp_dir, ignore_errors=True)
def send_image(filename):
    path = get_data_path(conf.image_file['path'])
    return send_from_directory(path, filename)