def gen_dataset(graphs):
    random.Random(123).shuffle(graphs)
    dirout_train = get_data_path() + '/IMDBMulti/train'
    dirout_test = get_data_path() + '/IMDBMulti/test'
    create_dir_if_not_exists(dirout_train)
    create_dir_if_not_exists(dirout_test)
    for g in graphs[0:1200]:
        nx.write_gexf(g, dirout_train + '/{}.gexf'.format(g.graph['gid']))
    for g in graphs[1200:]:
        nx.write_gexf(g, dirout_test + '/{}.gexf'.format(g.graph['gid']))
def _get_gs_and_metric_info(name, dir_name, natts, eatts, align_metric):
    train_gs = iterate_get_graphs(
        join(get_data_path(), dir_name, 'train'), natts=natts, eatts=eatts)
    test_gs = iterate_get_graphs(
        join(get_data_path(), dir_name, 'test'), natts=natts, eatts=eatts)
    if name == 'aids700nef_old_small':
        train_gs = train_gs[0:4]
        test_gs = test_gs[0:2]
    graphs = train_gs + test_gs
    dist_or_sim, true_algo = get_ds_metric_config(align_metric)
    return train_gs, test_gs, graphs, dist_or_sim, true_algo
def gen_dataset(graphs):
    random.Random(123).shuffle(graphs)
    dirout_train = get_data_path() + '/{}/train'.format(conf.outfolder)
    dirout_test = get_data_path() + '/{}/test'.format(conf.outfolder)
    create_dir_if_not_exists(dirout_train)
    create_dir_if_not_exists(dirout_test)
    sp = int(len(graphs) * conf.train_perc_)
    for g in graphs[0:sp]:
        nx.write_gexf(g, dirout_train + '/{}.gexf'.format(g.graph['gid']))
    for g in graphs[sp:]:
        nx.write_gexf(g, dirout_test + '/{}.gexf'.format(g.graph['gid']))
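# The two gen_dataset variants above write a shuffled train/test split of
# NetworkX graphs to GEXF files. A minimal, hypothetical driver is sketched
# below; the input directory, the glob pattern, and the 'gid' assignment are
# illustrative assumptions rather than part of the original pipeline.
from glob import glob
import networkx as nx

def _example_gen_dataset_driver(indir):
    graphs = []
    for i, path in enumerate(sorted(glob(indir + '/*.gexf'))):
        g = nx.read_gexf(path)
        g.graph['gid'] = i  # gen_dataset expects a 'gid' graph attribute
        graphs.append(g)
    gen_dataset(graphs)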
def load_fashion_mnist(with_y=False):
    datapath = get_data_path("fashion_mnist")
    paths = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    datasets = [os.path.join(datapath, fp) for fp in paths]
    if not os.path.isfile(datasets[0]):
        urls = [
            os.path.join(
                "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com",
                fp) for fp in paths
        ]
        for url, fn in zip(urls, paths):
            print("Downloading %s data..." % (fn))
            urlretrieve(url, os.path.join(datapath, fn))
    train_x, train_t = load_mnist(datapath, "train")
    test_x, test_t = load_mnist(datapath, "t10k")
    if with_y:
        return (train_x, train_t), (test_x, test_t)
    return train_x, test_x
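# `load_mnist` is referenced above but not shown in this collection. A minimal
# sketch of what it is assumed to do (read the gzipped IDX files for a split
# and return image and label arrays) follows; the 16- and 8-byte header
# offsets are standard for the IDX format, but the original helper may differ.
import gzip
import os
import numpy as np

def load_mnist(datapath, split):
    with gzip.open(os.path.join(datapath, "%s-images-idx3-ubyte.gz" % split), "rb") as f:
        images = np.frombuffer(f.read(), dtype=np.uint8, offset=16)
        images = images.reshape(-1, 28 * 28).astype("float32") / 255.
    with gzip.open(os.path.join(datapath, "%s-labels-idx1-ubyte.gz" % split), "rb") as f:
        labels = np.frombuffer(f.read(), dtype=np.uint8, offset=8)
    return images, labels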
def load_mnist_binarized():
    datapath = get_data_path("mnist")
    dataset = os.path.join(datapath, "mnist.gz")
    if not os.path.isfile(dataset):
        datafiles = {
            "train": "http://www.cs.toronto.edu/~larocheh/public/"
                     "datasets/binarized_mnist/binarized_mnist_train.amat",
            "valid": "http://www.cs.toronto.edu/~larocheh/public/datasets/"
                     "binarized_mnist/binarized_mnist_valid.amat",
            "test": "http://www.cs.toronto.edu/~larocheh/public/datasets/"
                    "binarized_mnist/binarized_mnist_test.amat"
        }
        datasplits = {}
        for split in datafiles.keys():
            print("Downloading %s data..." % (split))
            datasplits[split] = np.loadtxt(urlretrieve(datafiles[split])[0])
        pkl.dump(
            [datasplits['train'], datasplits['valid'], datasplits['test']],
            open(dataset, "wb"))
    x_train, x_valid, x_test = pkl.load(open(dataset, "rb"))
    return x_train, x_valid, x_test
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):
        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
def __init__(self, sset="train", transform=None): path = get_data_path('coco') self.root = os.path.join(path["COCO_ROOT"], "images/") self.transform = transform # dataset.json come from Karpathy neural talk repository and contain the restval split of coco with open(path["COCO_RESTVAL_SPLIT"], 'r') as f: datas = json.load(f) if sset == "train": self.content = [ x for x in datas["images"] if x["split"] == "train" ] elif sset == "trainrv": self.content = [ x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval" ] elif sset == "val": self.content = [x for x in datas["images"] if x["split"] == "val"] else: self.content = [x for x in datas["images"] if x["split"] == "test"] self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content] self.word_dict_path = path["WORD_DICT"] path_params = os.path.join(self.word_dict_path, 'utable.npy') self.params = np.load(path_params, encoding='latin1', allow_pickle=True) self.dico = _load_dictionary(self.word_dict_path)
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'extractor.coffee')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        # skip already extracted
        if os.path.exists(os.path.join(path, '%03d.json' % id)):
            continue

        print '[extractor] #%03d: %s' % (id, url)
        subprocess.call(
            'cd "%(path)s" && phantomjs "%(extractor)s" "%(url)s" "%(label)03d" > "%(label)03d.log" 2>&1' % {
                'path': path,
                'extractor': extractor,
                'url': url,
                'label': id,
            },
            shell=True)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    data = []
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        print '[diffbot] #%03d: %s' % (id, url)
        response = urllib2.urlopen(
            'http://www.diffbot.com/api/article?' + urllib.urlencode({
                'url': url,
                'token': '4bc6e407da88dd8723c70a5297cdf7fb',
                'timeout': '60000',
            }))
        data.append(json.loads(response.read()))

    with open(os.path.join(path, 'diffbot.json'), 'w') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf8'))
def test_model(args):
    models = os.listdir(args.save_path)

    # load dataset
    data_paths = get_data_path(args.mode, args.label_type)
    datasets = BertSumLoader().process(data_paths)
    print('Information of dataset is:')
    print(datasets)
    test_set = datasets.datasets['test']

    # only need 1 gpu for testing
    device = int(args.gpus)
    args.batch_size = 1

    for cur_model in models:
        print('Current model is {}'.format(cur_model))

        # load model
        model = torch.load(join(args.save_path, cur_model))

        # configure testing
        original_path, dec_path, ref_path = get_rouge_path(args.label_type)
        test_metric = RougeMetric(data_path=original_path, dec_path=dec_path,
                                  ref_path=ref_path, n_total=len(test_set))
        tester = Tester(data=test_set, model=model, metrics=[test_metric],
                        batch_size=args.batch_size, device=device)
        tester.test()
def extract_files(self):
    self.extract_error = None
    location = self.get_setting('download_dir').value
    version = self.selected_version()
    for setting_name, setting in self.settings['export_settings'].items():
        save_file_path = setting.save_file_path(version, location)
        try:
            if setting.value:
                extract_path = get_data_path('files/' + setting.name)
                setting.extract(extract_path, version)
                #if os.path.exists(save_file_path):
                #    setting_fbytes = setting.get_file_bytes(version)
                #    for dest_file, fbytes in setting_fbytes:
                #        path = utils.path_join(extract_path, dest_file)
                #        with open(path, 'wb+') as d:
                #            d.write(fbytes)
                #    self.progress_text += '.'
                self.progress_text += '.'
        except (tarfile.ReadError, zipfile.BadZipfile) as e:
            if os.path.exists(save_file_path):
                os.remove(save_file_path)
            self.extract_error = e
            self.logger.error(unicode(self.extract_error))
            # cannot use GUI in thread to notify user. Save it for later
    self.progress_text = '\nDone.\n'
    return True
def render_training(self):
    history = self.history.history
    loss = [history['loss'][0]] + history['loss']
    val_loss = [history['val_loss'][0]] + history['val_loss']
    lr = [-log10(r) for r in self.scheduler.history_lr]
    lr = lr + [lr[-1]]

    import matplotlib.pyplot as plt
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'k', label='Training loss')
    plt.plot(epochs, val_loss, 'y', label='Validation loss')
    plt.plot(epochs, lr, 'r', label='Learning rate (1e-X)')
    min_max = [
        r or 0 for r in [self.scheduler.min_rate, self.scheduler.max_rate]
    ]
    plt.title(
        f'Training and validation loss [{min_max[0]:0.0E} - {min_max[1]:0.0E}]'
    )
    plt.legend()
    fname = 'loss'
    plt.savefig(
        utils.get_data_path(settings.PLOT_PATH,
                            utils.get_exp_key(self) + '-' + fname + '.svg'))
    plt.close()
def __init__(self, x, y):
    Platform.__init__(self, x, y)
    boltAnim = []
    for anim in ANIMATION_FIRE:
        boltAnim.append((anim, ANIMATION_DELAY))
    self.boltAnim = pyganim.PygAnimation(boltAnim)
    self.boltAnim.play()
    self.image = pygame.image.load(get_data_path('Fogo_1.png', 'img'))
def init(self):
    self.graphs = []
    datadir = '{}/LINUX/{}'.format(get_data_path(), get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    self.graphs, self.glabels = add_glabel_to_each_graph(self.graphs, '', True)
    assert self.glabels is None  # fake graph labels
def download_path(path=None):
    # Ensure that the default download path exists
    path = path or utils.get_data_path('files/downloads')
    try:
        os.makedirs(path)
    except:
        pass
    return path
def gen_graphs():
    dirin = get_data_path()
    file = dirin + '/linux_Format-2'
    # train_dirout = dirin + '/train'
    # test_dirout = dirin + '/test'
    # dirin = get_data_path() + '/iGraph20/datasets'
    # file = dirin + '/nasa.igraph'
    train_dirout = dirin + '/train'
    test_dirout = dirin + '/test'
    graphs = {}
    gid = None
    types_count = defaultdict(int)
    total_num_nodes = 0
    disconnects = set()
    less_than_eq_10 = set()
    types_count_less_than_eq_10 = defaultdict(int)
    total_num_nodes_less_than_eq_10 = 0
    with open(file) as f:
        for line in f:
            ls = line.rstrip().split()
            if ls[0] == 't':
                assert (len(ls) == 3)
                assert (ls[1] == '#')
                if gid:
                    assert (gid not in graphs)
                    graphs[gid] = g
                    print(gid, g.number_of_nodes())
                    if g.number_of_nodes() <= 10 and nx.is_connected(g):
                        less_than_eq_10.add(gid)
                        total_num_nodes_less_than_eq_10 += g.number_of_nodes()
                        d = nx.get_node_attributes(g, 'type')
                        for _, type in d.items():
                            types_count_less_than_eq_10[type] += 1
                    if not nx.is_connected(g):
                        disconnects.add(g)
                g = nx.Graph()
                gid = int(ls[2])
            elif ls[0] == 'v':
                assert (len(ls) == 3)
                type = int(ls[2])
                types_count[type] += 1
                g.add_node(int(ls[1]), type=type)
                total_num_nodes += 1
            elif ls[0] == 'e':
                assert (len(ls) == 4)
                edge_type = int(ls[3])
                assert (edge_type == 0)
                g.add_edge(int(ls[1]), int(ls[2]))
    print(len(graphs), 'graphs in total')
    print(len(types_count), 'node types out of total', total_num_nodes, 'nodes')
    print(len(disconnects), 'disconnected graphs')
    for i in range(10):
        print(i, types_count[i])
    print(len(less_than_eq_10), 'small graphs (<= 10 nodes)')
    print(len(types_count_less_than_eq_10), 'node types out of total',
          total_num_nodes_less_than_eq_10, 'nodes')
    select_dump_graphs(graphs, sorted(list(less_than_eq_10)))
def __init__(self, datestr=None):
    if datestr is None:
        datestr = self.datestr
    self.path = get_data_path(__file__)
    assert os.path.exists(self.path)
    files = [self.file_fmt.format(date=datestr, name=f) for f in self.files]
    self.file_paths = [os.path.join(self.path, f) for f in files]
    assert all([os.path.exists(f) for f in self.file_paths])
def main(filename, user1, user2, user3, color1, color2, color3, color_lands,
         name_out, map_size, line_width):
    '''Main function: load the databases and render the travel map.

    Args:
        filename (str): name of the file to load
        user1 (str): name of traveler 1
        user2 (str): name of traveler 2
        user3 (str): name of traveler 3
        color1 (str): RGB color for traveler 1
        color2 (str): RGB color for traveler 2
        color3 (str): RGB color for traveler 3
        color_lands (str): RGB color for land masses
        map_size (str): map size

    Raises:
        TypeError: if filename is not a str
        TypeError: if user1 is not a str
        TypeError: if user2 is not a str
        TypeError: if user3 is not a str
        TypeError: if color1 is not a str
        TypeError: if color2 is not a str
        TypeError: if color3 is not a str
        TypeError: if color_lands is not a str
        TypeError: if map_size is not a str
    '''
    utils.info('Loading the databases')
    if type(filename) != str:
        raise TypeError('filename must be a str.')
    if type(user1) != str:
        raise TypeError('user1 must be a str.')
    if type(user2) != str:
        raise TypeError('user2 must be a str.')
    if type(user3) != str:
        raise TypeError('user3 must be a str.')
    if type(color1) != str:
        raise TypeError('color1 must be a str.')
    if type(color2) != str:
        raise TypeError('color2 must be a str.')
    if type(color3) != str:
        raise TypeError('color3 must be a str.')
    if type(map_size) != str:
        raise TypeError('map_size must be a str.')
    if type(color_lands) != str:
        raise TypeError('color_lands must be a str.')

    data_dir = utils.get_data_path()
    images_dir = utils.get_images_path()
    df_path = os.path.join(data_dir, filename)
    image_path = os.path.join(images_dir, name_out)

    df_journeys = pd.read_excel(df_path)
    df_journeys = prepare_table(df_journeys, user1, user2, user3,
                                color1, color2, color3)
    fig = make_map(df_journeys, color_lands, map_size, line_width)
    # fig.show()
    fig.write_image(image_path, width=12800, height=8400, scale=1)
def __init__(self):
    super(App, self).__init__()
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)
    self.data_path = get_data_path()
    self.tabs = []
    self.options_menu_widget = None
    self.tab_widget = None
    self.setups()
def __init__(self, kuruczPfPath: Optional[str]=None, metallicity: float=0.0,
             abundances: Dict=None, abundDex: bool=True):
    if set(AtomicWeights.keys()) != set(AtomicAbundances.keys()):
        raise ValueError('AtomicWeights and AtomicAbundances keys differ (Problem keys: %s)'
                         % repr(set(AtomicWeights.keys()) - set(AtomicAbundances.keys())))

    self.indices = OrderedDict(zip(AtomicWeights.keys(), range(len(AtomicWeights))))

    # Convert abundances and overwrite any provided secondary abundances
    self.abund = deepcopy(AtomicAbundances)
    if self.abund['H '] == 12.0:
        for k, v in self.abund.items():
            self.abund[k] = 10**(v - 12.0)

    if abundances is not None:
        if abundDex:
            for k, v in abundances.items():
                abundances[k] = 10**(v - 12.0)
        for k, v in abundances.items():
            self.abund[k] = v

    metallicity = 10**metallicity
    for k, v in self.abund.items():
        if k != 'H ':
            self.abund[k] = v * metallicity

    kuruczPfPath = get_data_path() + 'pf_Kurucz.input' if kuruczPfPath is None else kuruczPfPath
    with open(kuruczPfPath, 'rb') as f:
        s = f.read()
    u = Unpacker(s)

    self.Tpf = np.array(u.unpack_array(u.unpack_double))
    ptIndex = []  # Index in the periodic table (fortran based, so +1) -- could be used for validation
    stages = []
    pf = []
    ionpot = []
    for i in range(len(AtomicWeights)):
        ptIndex.append(u.unpack_int())
        stages.append(u.unpack_int())
        pf.append(np.array(u.unpack_farray(stages[-1] * self.Tpf.shape[0],
                                           u.unpack_double)).reshape(stages[-1], self.Tpf.shape[0]))
        ionpot.append(np.array(u.unpack_farray(stages[-1], u.unpack_double)))

    ionpot = [i * Const.HC / Const.CM_TO_M for i in ionpot]
    pf = [np.log(p) for p in pf]

    totalAbund = 0.0
    avgWeight = 0.0
    self.elements: List[Element] = []
    for k, v in AtomicWeights.items():
        i = self.indices[k]
        ele = Element(k, v, self.abund[k], ionpot[i], self.Tpf, pf[i])
        self.elements.append(ele)
        totalAbund += ele.abundance
        avgWeight += ele.abundance * ele.weight

    self.totalAbundance = totalAbund
    self.weightPerH = avgWeight
    self.avgMolWeight = avgWeight / totalAbund
def get_proc_graphs(datadir, train):
    if logging_enabled == True:
        print("- Entered data::get_proc_graphs Global Method")

    datadir = '{}\\{}\\{}'.format(get_data_path(), datadir, get_train_str(train))
    graphs = iterate_get_graphs(datadir)
    print('info: Loaded {} graphs from {}'.format(len(graphs), datadir))
    return graphs
def load_cifar(levels=256, with_y=False):
    dataset = 'cifar-10-python.tar.gz'
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(get_data_path("cifar10"), dataset)
        if os.path.isfile(new_path) or data_file == 'cifar-10-python.tar.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'cifar-10-python.tar.gz':
        origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
        print("Downloading data from {}...".format(origin))
        urlretrieve(origin, dataset)

    f = tarfile.open(dataset, 'r:gz')
    b1 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
    b2 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_2"), encoding="bytes")
    b3 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_3"), encoding="bytes")
    b4 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_4"), encoding="bytes")
    b5 = pkl.load(f.extractfile("cifar-10-batches-py/data_batch_5"), encoding="bytes")
    test = pkl.load(f.extractfile("cifar-10-batches-py/test_batch"), encoding="bytes")

    train_x = np.concatenate(
        [b1[b'data'], b2[b'data'], b3[b'data'], b4[b'data'], b5[b'data']],
        axis=0) / 255.
    train_x = np.asarray(train_x, dtype='float32')
    train_t = np.concatenate([
        np.array(b1[b'labels']),
        np.array(b2[b'labels']),
        np.array(b3[b'labels']),
        np.array(b4[b'labels']),
        np.array(b5[b'labels'])
    ], axis=0)

    test_x = test[b'data'] / 255.
    test_x = np.asarray(test_x, dtype='float32')
    test_t = np.array(test[b'labels'])
    f.close()

    train_x = train_x.reshape((train_x.shape[0], 3, 32, 32)).transpose(
        (0, 2, 3, 1)).reshape((train_x.shape[0], -1))
    test_x = test_x.reshape((test_x.shape[0], 3, 32, 32)).transpose(
        (0, 2, 3, 1)).reshape((test_x.shape[0], -1))

    train_x = quantisize(train_x, levels) / (levels - 1.)
    test_x = quantisize(test_x, levels) / (levels - 1.)

    if with_y:
        return (train_x, train_t), (test_x, test_t)
    return train_x, test_x
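# `quantisize` is referenced above but not defined in this collection. A common
# implementation bins pixel values in [0, 1] into `levels` integer bins; treat
# the sketch below as an assumed stand-in rather than the original helper.
import numpy as np

def quantisize(images, levels):
    # np.digitize maps each value to a bin index in 1..levels; shifting by one
    # yields integer codes in 0..levels-1
    return (np.digitize(images, np.arange(levels) / levels) - 1).astype('i')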
def init(self):
    self.graphs = []
    datadir = '{}/{}/{}'.format(get_data_path(), self.get_folder_name(),
                                get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    if 'nef' in self.get_folder_name():
        print('Removing edge features')
        for g in self.graphs:
            self._remove_valence(g)
def predict(tree, toy_id, true_category, target):
    path = get_data_path(dev=False)
    df = importData(path)
    X_test = df.loc[df['id'] == toy_id]
    X_test = X_test.drop('id', axis=1)
    print(X_test)
    y_pred = tree.predict(X_test)
    return y_pred
def read_y_true_for_all_classes():
    y_true = []
    for class_number in range(1, 12):
        y_true.append(read_y_true_for_class(
            utils.get_data_path() + "/Class{}.csv".format(class_number)))

    # Build an array of shape [n_samples, number of classes] in which each
    # position holds the labeled data for the given class
    y_true = np.array(y_true)
    y_true = np.transpose(y_true)
    return y_true
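# `read_y_true_for_class` is not shown here. A minimal sketch under the
# assumption that each ClassN.csv holds one label per sample is given below;
# the actual column layout of the original files may differ.
import numpy as np

def read_y_true_for_class(csv_path):
    # one label per row, no header assumed
    return np.loadtxt(csv_path, delimiter=",", dtype=int)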
def command_base():
    config.TESTING = True
    dpath = utils.get_data_path('')
    if os.path.exists(dpath):
        utils.rmtree(dpath)
    base = CommandBase()
    base._project_name = 'Test'
    return base
def clean_data(dataset):
    clean_text_path = join(get_data_path(), 'corpus', dataset + '_sentences_clean.txt')
    if not exists(clean_text_path):
        docs_list = []
        old_name = dataset
        if "no_hashtag" in dataset:
            dataset = '_'.join(dataset.split('_')[:-2])
        with open(join(get_data_path(), 'corpus', dataset + '_sentences.txt')) as f:
            for line in f.readlines():
                docs_list.append(line.strip())
        dataset = old_name

        word_counts = defaultdict(int)
        for doc in docs_list:
            temp = clean_doc(doc, dataset)
            words = temp.split()
            for word in words:
                word_counts[word] += 1

        clean_docs = clean_documents(docs_list, word_counts, dataset)
        corpus_str = '\n'.join(clean_docs)
        f = open(clean_text_path, 'w')
        f.write(corpus_str)
        f.close()

    f = open(clean_text_path, 'r')
    lines = f.readlines()
    min_len = 10000
    aver_len = 0
    max_len = 0
    for line in lines:
        line = line.strip()
        temp = line.split()
        aver_len = aver_len + len(temp)
        if len(temp) < min_len:
            min_len = len(temp)
        if len(temp) > max_len:
            max_len = len(temp)
    f.close()
    aver_len = 1.0 * aver_len / len(lines)
    print('min_len : ' + str(min_len))
    print('max_len : ' + str(max_len))
    print('average_len : ' + str(aver_len))
def main():
    dirin = get_data_path() + '/{}/graph'.format(conf.infolder)
    k = float('inf')
    lesseqk = []
    glabel_map = read_graph_labels()
    info_map = {}
    disconnected = []
    files = glob(dirin + '/*.gexf')
    if conf.need_sort_:
        files = sorted_nicely(files)
    for i, file in enumerate(files):
        g = nx.read_gexf(file)
        gid = get_file_base_id(file)
        print(i, gid, g.number_of_nodes())
        if g.number_of_nodes() <= k:
            if not nx.is_connected(g):
                print(gid, 'is not connected')
                gsize = g.number_of_nodes()
                g = max(nx.connected_component_subgraphs(g), key=len)
                grmd = gsize - g.number_of_nodes()
                assert (grmd > 0)
                g_info = 'rm_{}_nodes'.format(grmd)
                disconnected.append(g)
            else:
                g_info = ''
                lesseqk.append(g)
            info_map[gid] = g_info
            g.graph['gid'] = gid
            g.graph['label'] = glabel_map[gid]
            for node, d in g.nodes(data=True):
                type = d['node_class']
                if conf.has_node_type:
                    d.pop('node_class')
                    d['type'] = type
            for edge in g.edges_iter(data=True):
                del edge[2]['weight']
    print(len(lesseqk))
    gen_dataset(lesseqk)
    gen_dataset(disconnected)
    save_glabels_as_txt(get_data_path() + '/{}/glabels'.format(conf.outfolder), glabel_map)
    save_glabels_as_txt(get_data_path() + '/{}/info'.format(conf.outfolder), info_map)
def main(categories, data, dev: bool):
    target = categories[0]

    # import the data without classification
    path = get_data_path(dev)
    df = importData(path)
    if target in df.columns:
        target = "target"

    # preprocess the data by adding the target column with the given values
    df = preprocess(df, target, data)
    X_train, X_test, y_train, y_test = split(df)
    tree = train(X_train, y_train)
    return tree
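# `split` above is assumed to be a thin wrapper around scikit-learn's
# train_test_split that separates the "target" column from the features. A
# hypothetical sketch follows; the test size and random seed are illustrative
# assumptions, not the original configuration.
from sklearn.model_selection import train_test_split

def split(df, test_size=0.25, random_state=0):
    X = df.drop("target", axis=1)
    y = df["target"]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)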
def init(self):
    self.graphs = []
    datadir = '{}/{}/{}'.format(get_data_path(), self.get_folder_name(),
                                get_train_str(self.train))
    self.graphs = iterate_get_graphs(datadir)
    print('Loaded {} graphs from {}'.format(len(self.graphs), datadir))
    if 'nef' in self.get_folder_name():
        print('Removing edge features')
        for g in self.graphs:
            self._remove_valence(g)
    self.graphs, self.glabels = add_glabel_to_each_graph(self.graphs, '', True)
    assert self.glabels is None  # fake graph labels
def classify_class(class_number, features, test_features):
    y_true = read_y_true_for_class(
        utils.get_data_path() + "/Class{}.csv".format(class_number))

    # splits the train data into train and validation, with validation being
    # 20% of the original train data set
    x_train, x_validation, y_train, y_validation = train_test_split(
        features, y_true, test_size=0.20, random_state=0)

    classifier = create_rf_classifier(240)
    classifier.fit(x_train, y_train)
    score = classifier.score(x_validation, y_validation)
    print("Training score for Class {}: {:0.2f}".format(class_number, score))
    return classifier.predict(test_features).tolist()
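# `create_rf_classifier` is referenced above but not included in this
# collection. A minimal sketch of the assumed helper, wrapping scikit-learn's
# RandomForestClassifier with the requested number of trees; the extra keyword
# arguments are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier

def create_rf_classifier(n_estimators):
    return RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=0)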
# we are running in a normal Python environment
CWD = os.getcwd()


def get_file(path):
    parts = path.split('/')
    independent_path = utils.path_join(CWD, *parts)
    return independent_path


__version__ = "v0.0.0"

with open(get_file('files/version.txt')) as f:
    __version__ = f.read().strip()

TEMP_DIR = get_temp_dir()
DEFAULT_DOWNLOAD_PATH = get_data_path('files/downloads')

logger = logging.getLogger('W2E logger')
LOG_FILENAME = get_data_file_path('files/error.log')
if __name__ != '__main__':
    logging.basicConfig(
        filename=LOG_FILENAME,
        format=("%(levelname) -10s %(asctime)s %(module)s.py: "
                "%(lineno)s %(funcName)s - %(message)s"),
        level=logging.DEBUG
    )
    logger = logging.getLogger('W2E logger')
    handler = lh.RotatingFileHandler(LOG_FILENAME, maxBytes=100000, backupCount=2)
    logger.addHandler(handler)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)
    res = [texts1, labels]
    # print len(texts1)

    # encode the unicode strings into plain str values
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))
    for l in urls:
        urlss.append(l.encode('ascii', 'ignore'))

    with open("rohit.csv", "w") as fp:
        writer = csv.writer(fp)
        for row in zip(urls, lab, labels, classes):
            writer.writerow(row)

    input("enter data ")

    # Label the dataset: read back the class each row belongs to.
    # Classes are in the 4th column.
    classes = []
    with open("rohit.csv", "r") as fp:
        reader = csv.reader(fp)
        for row in reader:
            classes += [row[3]]
    for i in xrange(1, len(classes)):
        if classes[i] == 0:
            classes[i] = cluster_labels[i]

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    #print len(discrete_features[2])
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    #print features

    # scale (preprocess) the features
    features = preprocessing.scale(features)

    # build a random forest model and fit it on the features and classes
    rf = RandomForestClassifier(n_estimators=300)
    rf.fit(features, classes)

    # dump the model file into the given directory
    filename = '/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl'
    _ = joblib.dump(rf, filename, compress=9)
    rf = joblib.load(filename)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    return
def send_image(filename):
    path = get_data_path(app.config.get('AVATAR_FILE_CONF')['path'])
    return send_from_directory(path, filename)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    # Retrieve the clustered dataset and store it in the lists below.
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)

    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))  # encode the unicode into text

    # Feature extraction: vectorize the discrete features, then resize the
    # array to a fixed width so the model and the test data share the same
    # feature length.
    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # Normalization: scale the features.
    features = preprocessing.scale(features)

    im1 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/dazedandconfused/000.png")
    im2 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/fruitsofotherhands/000.png")
    im3 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/rohitanurag/000.png")
    im4 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/thegirlwhoreadtoomuch/000.png")
    im5 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/timcotson/000.png")
    imtest = Image.open(path + "/000.png")
    result1 = equal(imtest, im1)
    result2 = equal(imtest, im2)
    result3 = equal(imtest, im3)
    result4 = equal(imtest, im4)
    result5 = equal(imtest, im5)

    choose = 0
    testresult = result1
    if result1 <= testresult:
        choose = 1
        testresult = result1
    if result2 <= testresult:
        choose = 2
        testresult = result2
    if result3 <= testresult:
        choose = 3
        testresult = result3
    if result4 <= testresult:
        choose = 4
        testresult = result4
    if result5 <= testresult:
        choose = 5
        testresult = result5

    # Pick the pre-trained model for the closest-looking site; it is used to
    # predict classes such as title, date and paragraphs of blogs.
    if choose == 1:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfdazedandconfused.joblib.pkl"
    if choose == 2:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rffruitsofother.joblib.pkl"
    if choose == 3:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfrohitanurag.joblib.pkl"
    if choose == 4:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfthegirlwhoused.joblib.pkl"
    if choose == 5:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rftimscoton.joblib.pkl"

    usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl"
    rf = joblib.load(usemodel)  # load the model and use it for prediction
    predicted = rf.predict(features)
    print usemodel
    for i in xrange(1, len(predicted)):
        print lab[i]
        print "*********"
        print predicted[i]
        print "**********"
    return
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    # above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url
    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous features
            continuous_feature = []  # text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # scale features (after stacking, so that `features` is defined)
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return
def main(args): path = utils.get_data_path(args.site[0]) urls = utils.load_urls(path) # load data data = [utils.load_data(path, id) for id, url in enumerate(urls)] random.shuffle(data) for page in data: random.shuffle(page["texts"]) # process data processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer) features = processor.extract() # clustering clusterer = clusterers.DBSCAN() labels = clusterer.cluster(features).labels_ # prepare features continuous_features, discrete_features, labels = processor.prepare(labels) vectorizer = DictVectorizer() discrete_features = vectorizer.fit_transform(discrete_features).toarray() continuous_features = np.array(continuous_features) labels = np.array(labels).astype(np.float32) features = np.hstack([continuous_features, discrete_features]).astype(np.float32) # scale features features = preprocessing.scale(features) print features.shape precisions = [] recalls = [] f1scores = [] supports = [] rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0) for train_index, test_index in rs: print "training size = %d, testing size = %d" % (len(train_index), len(test_index)) clf = svm.SVC( verbose=False, kernel="linear", probability=False, random_state=0, cache_size=2000, class_weight="auto" ) clf.fit(features[train_index], labels[train_index]) print clf.n_support_ print "training:" predicted = clf.predict(features[train_index]) print classification_report(labels[train_index], predicted) print "testing:" predicted = clf.predict(features[test_index]) print classification_report(labels[test_index], predicted) precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted) precisions.append(precision) recalls.append(recall) f1scores.append(f1score) supports.append(support) precisions = np.mean(np.array(precisions), axis=0) recalls = np.mean(np.array(recalls), axis=0) f1scores = np.mean(np.array(f1scores), axis=0) supports = np.mean(np.array(supports), axis=0) for label in range(2): print "%f\t%f\t%f\t%f" % (precisions[label], recalls[label], f1scores[label], supports[label]) return negatives = [] positives = [] for i in range(len(processor.texts)): if labels[i]: positives.append(processor.texts[i]) else: negatives.append(processor.texts[i]) stats(negatives, positives) return """
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print 'clustering %s ...' % site
        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return
    """
def make_output_dirs(self):
    self.output_err = ''
    try:
        self.progress_text = 'Removing old output directory...\n'

        output_dir = utils.path_join(self.output_dir(), self.project_name())
        if os.path.exists(output_dir):
            utils.rmtree(output_dir, ignore_errors=True)

        temp_dir = utils.path_join(TEMP_DIR, 'webexectemp')
        if os.path.exists(temp_dir):
            utils.rmtree(temp_dir, ignore_errors=True)

        self.progress_text = 'Making new directories...\n'

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        os.makedirs(temp_dir)

        self.copy_files_to_project_folder()

        json_file = utils.path_join(self.project_dir(), 'package.json')
        global_json = utils.get_data_file_path('files/global.json')

        if self.output_package_json:
            with codecs.open(json_file, 'w+', encoding='utf-8') as f:
                f.write(self.generate_json())

        with codecs.open(global_json, 'w+', encoding='utf-8') as f:
            f.write(self.generate_json(global_json=True))

        zip_file = utils.path_join(temp_dir, self.project_name() + '.nw')
        app_nw_folder = utils.path_join(temp_dir, self.project_name() + '.nwf')

        utils.copytree(self.project_dir(), app_nw_folder,
                       ignore=shutil.ignore_patterns(output_dir))

        zip_files(zip_file, self.project_dir(), exclude_paths=[output_dir])

        for ex_setting in self.settings['export_settings'].values():
            if ex_setting.value:
                self.progress_text = '\n'
                name = ex_setting.display_name
                self.progress_text = u'Making files for {}...'.format(name)

                export_dest = utils.path_join(output_dir, ex_setting.name)
                versions = re.findall('(\d+)\.(\d+)\.(\d+)', self.selected_version())[0]
                minor = int(versions[1])
                if minor >= 12:
                    export_dest = export_dest.replace('node-webkit', 'nwjs')

                if os.path.exists(export_dest):
                    utils.rmtree(export_dest, ignore_errors=True)

                # shutil will make the directory for us
                utils.copytree(get_data_path('files/' + ex_setting.name),
                               export_dest,
                               ignore=shutil.ignore_patterns('place_holder.txt'))
                utils.rmtree(get_data_path('files/' + ex_setting.name), ignore_errors=True)
                self.progress_text += '.'

                if 'mac' in ex_setting.name:
                    uncomp_setting = self.get_setting('uncompressed_folder')
                    uncompressed = uncomp_setting.value
                    app_path = utils.path_join(export_dest, self.project_name() + '.app')
                    try:
                        utils.move(utils.path_join(export_dest, 'nwjs.app'), app_path)
                    except IOError:
                        utils.move(utils.path_join(export_dest, 'node-webkit.app'), app_path)

                    plist_path = utils.path_join(app_path, 'Contents', 'Info.plist')
                    plist_dict = plistlib.readPlist(plist_path)

                    plist_dict['CFBundleDisplayName'] = self.project_name()
                    plist_dict['CFBundleName'] = self.project_name()
                    version_setting = self.get_setting('version')
                    plist_dict['CFBundleShortVersionString'] = version_setting.value
                    plist_dict['CFBundleVersion'] = version_setting.value

                    plistlib.writePlist(plist_dict, plist_path)
                    self.progress_text += '.'

                    app_nw_res = utils.path_join(app_path, 'Contents', 'Resources', 'app.nw')
                    if uncompressed:
                        utils.copytree(app_nw_folder, app_nw_res)
                    else:
                        utils.copy(zip_file, app_nw_res)
                    self.create_icns_for_app(utils.path_join(app_path, 'Contents', 'Resources', 'nw.icns'))
                    self.progress_text += '.'
                else:
                    ext = ''
                    windows = False
                    if 'windows' in ex_setting.name:
                        ext = '.exe'
                        windows = True

                    nw_path = utils.path_join(export_dest, ex_setting.dest_files[0])

                    if windows:
                        self.replace_icon_in_exe(nw_path)

                    self.compress_nw(nw_path)

                    dest_binary_path = utils.path_join(export_dest, self.project_name() + ext)
                    if 'linux' in ex_setting.name:
                        self.make_desktop_file(dest_binary_path, export_dest)

                    join_files(dest_binary_path, nw_path, zip_file)

                    sevenfivefive = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                                     stat.S_IROTH | stat.S_IXOTH)
                    os.chmod(dest_binary_path, sevenfivefive)

                    self.progress_text += '.'

                    if os.path.exists(nw_path):
                        os.remove(nw_path)
    except Exception:
        error = u''.join([unicode(x) for x in
                          traceback.format_exception(sys.exc_info()[0],
                                                     sys.exc_info()[1],
                                                     sys.exc_info()[2])])
        self.logger.error(error)
        self.output_err += error
    finally:
        utils.rmtree(temp_dir, ignore_errors=True)
def send_image(filename):
    path = get_data_path(conf.image_file['path'])
    return send_from_directory(path, filename)