def get_clusterer(trainer, args, output_size, model):
    assert len(
        trainer.layer_list_all
    ) == 1, 'Active learning is only implemented for single-layer ablations'
    assert args.clustering, 'Active learning samples are associated with a specific clustering. The clustering flag ' \
                            'is necessary'
    active_paths = torch.load(
        os.path.join(args.active_learning_name, 'a_paths.pth'))
    active_units = torch.load(
        os.path.join(args.active_learning_name, 'units.pth'))
    active_binary_masks = torch.load(
        os.path.join(args.active_learning_name, 'a_hmaps.pth'))

    trainer.active_dict = {}
    for i, path in enumerate(active_paths):
        trainer.active_dict[path] = {
            'mask': active_binary_masks[i],
            'units': active_units[i],
            'index': i
        }

    cluster_path = os.path.join(args.active_learning_name, 'cluster')
    trainer.clusterer = Clusterer(trainer.loaders['train'],
                                  model,
                                  path_store=cluster_path,
                                  model_dim=args.embedding_dim,
                                  load_datapoints=True,
                                  load_histogram=True,
                                  load_clustering=True,
                                  load_name_final=True,
                                  save_results=True,
                                  output_size=output_size,
                                  args=args)
    return trainer.clusterer
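
For context, Example #23 further down shows the call site of this helper inside the trainer constructor; a minimal excerpt of that wiring (the active_learning module name and flag names are as they appear there):

# Inside the trainer __init__ (see Example #23); output_size depends on the audio model.
output_size = 32 if 'large' in args.audio_model else 256
if args.active_learning:
    active_learning.get_clusterer(self, args, output_size, model)
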
Example #2
    def cluster(self, kmeans, hyper):
        clus1 = Clusterer(
            self.get_rel_docs(),
            APIAdapter.get_data_foldername(self.get_search_term()), kmeans,
            hyper)
        clus1.cluster()
        self.clusterer = clus1

    def cdr3_length_precluster(self, waterer, preclusters=None):
        cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
        with opener('w')(cdr3lengthfname) as outfile:
            writer = csv.DictWriter(
                outfile, ('unique_id', 'second_unique_id', 'cdr3_length',
                          'second_cdr3_length', 'score'))
            writer.writeheader()
            for query_name, second_query_name in self.get_pairs(preclusters):
                cdr3_length = waterer.info[query_name]['cdr3_length']
                second_cdr3_length = waterer.info[second_query_name][
                    'cdr3_length']
                same_length = cdr3_length == second_cdr3_length
                if not self.args.is_data:
                    assert cdr3_length == int(
                        self.reco_info[query_name]['cdr3_length'])
                    if second_cdr3_length != int(
                            self.reco_info[second_query_name]['cdr3_length']):
                        print 'WARNING did not infer correct cdr3 length'
                        assert False
                writer.writerow({
                    'unique_id': query_name,
                    'second_unique_id': second_query_name,
                    'cdr3_length': cdr3_length,
                    'second_cdr3_length': second_cdr3_length,
                    'score': int(same_length)
                })

        clust = Clusterer(
            0.5,
            greater_than=True)  # i.e. cluster together if same_length == True
        clust.cluster(cdr3lengthfname, debug=False)
        os.remove(cdr3lengthfname)
        return clust
Example #4
def map_segments_to_clusters(x):
    # print('mapper: %s working on %s' % (os.getpid(), x))
    ((filename, start, end, size), config) = x
    clusterer = Clusterer(**config)
    lines = FileSegmentReader.read(filename, start, end, size)
    clusters = clusterer.find(lines)
    return [(FIXED_MAP_JOB_KEY, clusters)]
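
A hypothetical driver for the mapper above, sketching how per-segment jobs might be fanned out with multiprocessing; FileSegmentReader and the Clusterer config are assumed to come from the same project, and the byte-range segmentation is left to the caller.

from multiprocessing import Pool


def cluster_segments_in_parallel(filename, segments, config, processes=4):
    # segments: iterable of (start, end, size) byte ranges covering the file
    jobs = [((filename, start, end, size), config) for start, end, size in segments]
    with Pool(processes=processes) as pool:
        mapped = pool.map(map_segments_to_clusters, jobs)
    # each mapper returns [(FIXED_MAP_JOB_KEY, clusters)]; flatten for a reduce step
    return [pair for result in mapped for pair in result]
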
    def hamming_precluster(self, preclusters=None):
        assert self.args.truncate_pairs
        start = time.time()
        print 'hamming clustering'
        chopped_off_left_sides = False
        hamming_info = []
        all_pairs = self.get_pairs(preclusters)
        # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
        # all_pairs = itertools.combinations(self.input_info.keys(), 2)
        if self.args.n_fewer_procs > 1:
            pool = Pool(processes=self.args.n_fewer_procs)
            subqueries = self.split_input(
                self.args.n_fewer_procs,
                info=list(all_pairs),
                prefix='hamming'
            )  # NOTE 'casting' to a list here makes me nervous!
            sublists = []
            for queries in subqueries:
                sublists.append([])
                for id_a, id_b in queries:
                    sublists[-1].append({
                        'id_a': id_a,
                        'id_b': id_b,
                        'seq_a': self.input_info[id_a]['seq'],
                        'seq_b': self.input_info[id_b]['seq']
                    })

            # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
            subinfos = pool.map(utils.get_hamming_distances, sublists)
            # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
            pool.close()
            pool.join()
            # print '    starting pools: %.3f' % (time.time()-start); start = time.time()

            for isub in range(len(subinfos)):
                hamming_info += subinfos[isub]
            # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
        else:
            hamming_info = self.get_hamming_distances(all_pairs)

        if self.outfile is not None:
            self.outfile.write('hamming clusters\n')

        clust = Clusterer(
            self.args.hamming_cluster_cutoff, greater_than=False
        )  # NOTE this 0.5 is reasonable but totally arbitrary
        clust.cluster(input_scores=hamming_info,
                      debug=self.args.debug,
                      outfile=self.outfile,
                      reco_info=self.reco_info)
        # print '    clustering: %.3f' % (time.time()-start); start = time.time()

        if chopped_off_left_sides:
            print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
        print '    hamming time: %.3f' % (time.time() - start)

        return clust
Example #6
    def process_single_core(self, filenames):
        """
        Process multiple files sequentially using a single processor
        """
        clusterer = Clusterer(**self.cluster_config)
        for filename in filenames:
            with open(filename, 'r') as f:
                for line in f:
                    clusterer.process_line(line)
        return clusterer.result()
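
The same single-core flow as a standalone sketch; the Clusterer keyword arguments mirror the ones used in the tests of Examples #8 and #9 below, and the log file names are placeholders.

from clusterer import Clusterer


def cluster_files(filenames, **cluster_config):
    clusterer = Clusterer(**cluster_config)
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                clusterer.process_line(line)
    return clusterer.result()


# clusters = cluster_files(['app.log', 'app.log.1'],
#                          k1=1, k2=1, max_dist=0.5, variables=[])
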
Example #7
    def process_pipe(self):
        """
        Process continuously from stdin input stream
        """
        clusterer = Clusterer(**self.cluster_config)
        try:
            for line in sys.stdin:
                clusterer.process_line(line)
        except KeyboardInterrupt:
            pass
        finally:
            return clusterer.result()
Example #8
    def test(self):
        clusterer = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[])
        clusters = clusterer.find([
            'hello 1 y 3',
            'hello 1 x 3',
            'abc m n q',
        ])
        self.assertEqual(
            clusters,
            [
                [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
                [['abc', 'm', 'n', 'q'], 1, ['abc', 'm', 'n', 'q']]
            ]
        )
Example #9
    def test_min_members(self):
        clusterer = Clusterer(
            k1=1, k2=1, max_dist=0.5, variables=[], min_members=2)
        clusters = clusterer.find([
            'hello 1 y 3',
            'hello 1 x 3',
            'abc m n q',
        ])
        self.assertEqual(
            clusters,
            [
                [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
            ]
        )
Example #10
    def run(self):
        path = self.mw.sourcePathField.text()
        if not path:
            print "[Error] File path is empty"
            return
        try:
            img = Clusterer.readImage(path)
            imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
            self.mw.refreshSource(imageBGRA)
            features = self.mw.selectedFeatures
            if not features:
                return
            self.mw.clusterer = Clusterer()
            backgroundColor = self.mw.backgroundColor
            backgroundColor = backgroundColor.blue(), backgroundColor.green(
            ), backgroundColor.red()
            if self.mw.transparentBg.isChecked():
                backgroundColor = None
            mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
            mode = Clusterer.getModeByName(mode)
            modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
            modeK = Clusterer.getKModeByName(modeK)
            k = self.mw.clusterCount.value()
            self.mw.runButton.setEnabled(False)
            self.mw.clusters = self.mw.clusterer.getClusters(
                path,
                mode=mode,
                kmode=modeK,
                clusterCount=k,
                features=features,
                backgroundColor=backgroundColor,
                slider=self.mw.clusterSlider.value())
            self.mw.currentCluster = 0
            self.mw.refreshCluster()
            self.mw.saveButton.setEnabled(True)

            self.mw.clusterer.graph(self.mw.figure)
            self.mw.canvas.setMinimumSize(self.mw.canvas.size())
            self.mw.canvas.draw()

        except (OSError, cv2.error, urllib2.HTTPError) as err:
            print err
        self.mw.runButton.setEnabled(True)
Example #11
	def search_click(self):
		_textval = self.searchbox.text()
		self._search_term = _textval
		if self.gene_button.isChecked() and self.fileselected:
			if self.fileName:
				goldencorpus = GoldenCorpus(_textval,self.fileName)
				goldencorpus.fetchData()
				self.rel_docs = goldencorpus.get_rel_docs_pmid()
				self.mesh_terms = goldencorpus.get_mesh_terms()
				mesh_explosion = DataForEachMeshTerm(self.mesh_terms,_textval)
				path = mesh_explosion.get_data_foldername(_textval)
				clus = Clusterer(self.rel_docs,path,True,5)
				self.representative_id,self.representative,self.best_mesh_terms_id, self.best_mesh_terms = clus.cluster()
				if self.representative:
					self.updateRepresentativeInformation()
			else:
				print("Error! getting file name")
		elif self.pmid_button.isChecked():
			print("Golden corpus exists..")
		else:
			print("Please select related file..")
Example #12
def gen_window_model(window_event, proc_events, clusterer=Clusterer()):
    global default_window_event
    if window_event == default_window_event.wm_name:
        return None

    assignments = {}
    clustered_events = {}

    for et in proc_events[window_event]:
        clusterer.clear_data()

        if et is EventType.NONE:
            continue

        try:
            for e in proc_events[window_event][et]:
                f = e.get_features()
                if len(f) == 0:
                    break
                clusterer.append_data(f)
            if clusterer.shape[1] == 0:
                continue
            centroids, assigns = clusterer.cluster(clusterer.recommend_clusters(), 10)

            clustered_events[str(et)] = centroids

            for i in range(len(proc_events[window_event][et])):
                assignments[proc_events[window_event][et][i]] = assigns[i]

        except NotImplementedError as e:
            print(e)
            pass

    ngram = Ngram("")

    clustered_windowed_events = windowed_events[window_event][:]

    for i in range(len(clustered_windowed_events)):
        we = clustered_windowed_events[i]
        name = str(we.event_type)

        id = we.get_identifier()
        if not id is None:
            name += "[" + id + "]"

        if we in assignments:
            assignment = "{" + str(assignments[we]) + "}"
            if "{cluster}" in name:
                name = name.replace("{cluster}", assignment)
            else:
                name += "[" + assignment + "]"

        clustered_windowed_events[i] = name

    sequence = " ".join(clustered_windowed_events).replace("EventType.NONE", ngram.delimiter)
    ngram.construct(sequence, 5)

    ngram.calculate_probabilities()

    window_model = WindowModel(ngram, clustered_events)

    return window_model
Example #13
    def __init__(self, appraisal, cluster_identity, marker, appraisal_colours):
        '''
        appraisal: Appraisal
        cluster_identity: float, as in Clusterer
        marker: str
            the marker being plotted
        '''
        self.appraisal_colours = appraisal_colours
        logging.debug("Generating plot info for %s" % marker)
        # Collect all OTUs from all samples so that they can be processed
        # together.
        all_binned_otus = []
        all_assembled_not_binned_otus = []
        all_not_found_otus = []
        max_count = 0

        # yuck. Sloppy scope in Python, but not in lambdas when I need it..
        def add_to_totality(otus, totality, max_count):
            count = 0
            for otu in otus:
                if otu.marker == marker:
                    totality.append(otu)
                    count += otu.count
            if count > max_count:
                return count
            else:
                return max_count

        for sample_appraisal in appraisal.appraisal_results:
            max_count = add_to_totality(sample_appraisal.binned_otus,
                                        all_binned_otus, max_count)
            max_count = add_to_totality(
                sample_appraisal.assembled_not_binned_otus(),
                all_assembled_not_binned_otus, max_count)
            max_count = add_to_totality(sample_appraisal.not_found_otus,
                                        all_not_found_otus, max_count)
        logging.debug("Found maximal count of seqs as %i" % max_count)

        sequence_to_cluster = {}
        cluster_rep_and_count = []
        collection = OtuTableCollection()
        collection.otu_table_objects = [
            all_not_found_otus, all_assembled_not_binned_otus, all_binned_otus
        ]
        for cotu in Clusterer().cluster(collection, cluster_identity):
            cluster_rep_and_count.append([cotu.sequence, cotu.count])
            for otu in cotu.otus:
                sequence_to_cluster[otu.sequence] = cotu

        # Sort the OTUs by descending order of counts, so that more abundant
        # OTUs get colour.
        sorted_cluster_rep_and_count = sorted(cluster_rep_and_count,
                                              key=lambda x: x[1],
                                              reverse=True)
        cluster_sequence_to_order = {}
        i = 0
        for pair in sorted_cluster_rep_and_count:
            cluster_sequence_to_order[pair[0]] = i
            i += 1

        self._sequence_to_cluster = sequence_to_cluster
        self._sorted_cluster_rep_and_count = sorted_cluster_rep_and_count
        self._cluster_sequence_to_order = cluster_sequence_to_order
        self.max_count = max_count
Example #14
    not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
    picked = clean[clean['oz'] == 1]

    nonfeatures = drop_columns(picked, drop_cols)
    features = picked.columns

    ## standardize
    standardize = StandardScaler()
    X, features = picked.values, picked.columns.values
    X = standardize.fit_transform(X)

    ## build model
    cluster_labels = pd.DataFrame()
    for k in range(6, 7):
        pax = Clusterer(model, n_clusters=k, linkage=linkage, random_state=24)
        centers = pax.fit(X)
        pax.store_features(features)
        print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))

        ## update labels and scores for column k
        filepath = "{}/{}/labels.pkl".format(data, model)
        with open(filepath, "rb") as f:
            k = pax.attributes['n_clusters']
            model_labels_df = pickle.load(f)
            model_labels_df["k={}".format(k)] = pax.attributes['labels_']
            model_labels_df["k{}silho_score".format(
                k)] = pax.get_silhouette_samples()
        model_labels_df.to_pickle(filepath)
        print("Updated labels @ {}".format(filepath))
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist

        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname,
                             sw_info,
                             preclusters=preclusters,
                             hmm_type=hmm_type,
                             stripped=stripped,
                             parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            self.split_input(self.args.n_procs,
                             infname=csv_infname,
                             prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm,
                                               csv_infname,
                                               csv_outfname,
                                               parameter_dir=parameter_in_dir,
                                               iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    os.remove(
                        csv_infname.replace(
                            self.args.workdir,
                            self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            cmd_str = self.get_hmm_cmd_str(algorithm,
                                           csv_infname,
                                           csv_outfname,
                                           parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '      hmm run time: %.3f' % (time.time() - start)

        hmminfo = self.read_hmm_output(algorithm,
                                       csv_outfname,
                                       make_clusters=make_clusters,
                                       count_parameters=count_parameters,
                                       parameter_out_dir=parameter_out_dir,
                                       plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff,
                                 greater_than=True,
                                 singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo,
                             debug=self.args.debug,
                             reco_info=self.reco_info,
                             outfile=self.outfile,
                             plotdir=self.args.plotdir + '/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[
                    0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(
                    csv_infname
            ):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
Example #16
'''
Find the household with the lowest carbon emissions from a single group
'''


def find_greenest(cluster):
    # cluster is assumed to be a sequence of per-household emission totals
    min_emissions = float('inf')
    min_index = -1
    for i in range(len(cluster)):
        if cluster[i] < min_emissions:
            min_emissions = cluster[i]
            min_index = i
    return min_emissions, min_index
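
A quick sanity check for the helper above; the emission values are made up.

emissions = [12.4, 9.7, 15.2, 8.3]  # hypothetical per-household emission totals
lowest, index = find_greenest(emissions)
print(lowest, index)  # -> 8.3 3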


# Preprocessing
preprocessor = Preprocessor()
preprocessor.run_preprocessor()

# Elbow method
elbow = Elbow()
elbow.run_elbow()

# Clustering
clusterer = Clusterer()
clusterer.run_clusterer()

# Regression
regressor = Regressor()
regressor.run_regressor()
Example #17
from clusterer import Clusterer
import webbrowser

# Get the user input track
track_name = input(
    "Enter the name (artist optional) of a song: ") or 'Give it up Knife Party'

# Run the clustering on the track
c = Clusterer(track_name=track_name, alg_type='affprop')
results = c.get_target_cluster()
c.plot_clusters()
print('Graph saved to ./Database/clusters.png')

# convert the track ids returned from clustering back into track data
print('Loading 20 of', len(results), 'track recommendations, please wait...')
print()
shift_tracks = []
for i, item in enumerate(results):
    shift_tracks += [c.ret.sp.track(item[1])]


# output and save the recommended tracks to a file
def output_recommendations(source, filename, tracks):
    print(source + ' Recommendations:')
    fout = open(filename, 'w')
    for track in tracks[:20]:
        print('track:', track['name'], '-',
              track['album']['artists'][0]['name'])
        print('track:',
              track['name'],
              '-',
              track['album']['artists'][0]['name'],
              file=fout)
Example #18
    def __init__(self, config):
        self.clusterer = Clusterer(**config)
        self.pattern_generator = self.clusterer.pattern_generator
Example #19
    not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
    picked = clean[clean['oz'] == 1]

    nonfeatures = drop_columns(picked, drop_cols)
    features = picked.columns

    ## standardize
    standardize = StandardScaler()
    X, features = picked.values, picked.columns.values
    X = standardize.fit_transform(X)

    ## build model
    cluster_labels = pd.DataFrame()
    for k in range(6, 7):
        pax = Clusterer(model, n_clusters=k, random_state=24)
        centers = pax.fit(X)
        pax.store_features(features)
        print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))

        ## update labels and scores for column k
        filepath = "{}/{}/labels.pkl".format(data, model)
        with open(filepath, "rb") as f:
            k = pax.attributes['n_clusters']
            model_labels_df = pickle.load(f)
            model_labels_df["k={}".format(k)] = pax.attributes['labels_']
            model_labels_df["k{}silhouette_score".format(
                k)] = pax.get_silhouette_samples()
        model_labels_df.to_pickle(filepath)
        print("Updated labels @ {}".format(filepath))
# Grid of 100x100
# 3 circles of 15x15, each with 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

points = testgenerator.create_circle_points(1000, 50, 50, 20, point_mass=10)
clusterer = Clusterer(5, 10, 2)
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()

Example #21
from fastapi import FastAPI
from vector_space import VectorSpace
from org_dataset import OrgDataset
from org_recommender import OrgRecommender
from clusterer import Clusterer
from keyword_finder import KeywordFinder
from keyword_matcher import KeywordMatcher
from gcd_utils import get_account_liked_tags

app = FastAPI()
dataset = OrgDataset.load_instance('./orgs.pkl')
vs = VectorSpace.load_instance('./test_vs.pkl')
recommender = OrgRecommender(dataset, vs)

c = Clusterer(dataset, vs, 20)
kw_finder = KeywordFinder(dataset, vs)
matcher = KeywordMatcher(c, kw_finder, vs.data_centroid)

@app.get('/get_init_recs/')
async def get_init_recs(userId: str, numOrgs: int):
    keywords = get_account_liked_tags(userId)
    centroid = matcher.get_kw_centroid(keywords)
    orgids = recommender.centroid_recommend(centroid, numOrgs)
    return_arr = []
    for id in orgids:
        entry = {'orgId': id}
        return_arr.append(entry)
    return return_arr
"""Example get request for api on local host:

http://127.0.0.1:8000/get_recommendations/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2
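
A client-side sketch against the route registered above (/get_init_recs/), assuming the app is served locally (e.g. with uvicorn) and that the requests library is available; the userId value is copied from the example URL.

import requests

resp = requests.get(
    'http://127.0.0.1:8000/get_init_recs/',
    params={'userId': '334614c0-7f55-11ea-b1bc-2f9730f51173', 'numOrgs': 2},
)
print(resp.json())  # -> [{'orgId': ...}, {'orgId': ...}]
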
Example #22
# Grid of 100x100
# 3 circles of 15x15, each with 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

points = testgenerator.create_circle_points(200, 8, 15, 10)
clusterer = Clusterer(1, 2, 2)
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()

Example #23
    def __init__(self, model, optimizer, all_loaders, args, resume_epoch):

        self.resume_epoch = resume_epoch
        self.args = args

        self.optimizer = torch.optim.SGD((model.parameters()),
                                         args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        self.layer_list_all = args.layers
        self.layers_dict = {
            'layer2': {
                'name': 'layer2',
                'depth': 512,
                'size': 4
            },
            'layer3': {
                'name': 'layer3',
                'depth': 512,
                'size': 8
            },
            'layer4': {
                'name': 'layer4',
                'depth': 512,
                'size': 8
            },
            'layer5': {
                'name': 'layer5',
                'depth': 256,
                'size': 16
            },
            'layer6': {
                'name': 'layer6',
                'depth': 256,
                'size': 16
            },
        }

        self.generator = gantest.GanTester(args.path_model_gan,
                                           self.layer_list_all,
                                           device=torch.device('cuda'))
        self.z = self.generator.standard_z_sample(200000)

        self.model = model
        self.optimizer = optimizer
        self.loaders = all_loaders
        self.loss_type = args.loss_type

        # Other parameters
        self.margin = args.margin
        self.clustering = args.clustering

        self.epoch = 0
        self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406),
                                       std=(0.229, 0.224, 0.225))

        output_size = 32 if 'large' in args.audio_model else 256

        if args.active_learning:
            active_learning.get_clusterer(self, args, output_size, model)
        else:
            if args.clustering:
                print('Creating cluster from scratch')
                cluster_path = os.path.join(
                    self.args.results, 'clusters',
                    args.name_checkpoint + '_' + str(time.time()))
                self.clusterer = Clusterer(
                    self.loaders['train'],
                    model,
                    path_store=cluster_path,
                    model_dim=args.embedding_dim,
                    save_results=True,
                    output_size=output_size,
                    args=self.args,
                    path_cluster_load=args.path_cluster_load)

        self.epochs_clustering = self.args.epochs_clustering
        self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None
Example #24
from database import Database
from youtube import YouTube
from clusterer import Clusterer

env = 'desktop'
db_name = 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()

videoId = 'kQibkV_V8-c'
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)

cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
Example #25
def train(cfg,
          model,
          dataset,
          optimizer,
          scheduler=None,
          logger=None,
          is_continue=False,
          use_pretrained=False,
          cluster_vis_path=None):

    save_to = cfg.TRAIN.CHECKPOINT_PATH
    epochs = cfg.TRAIN.EPOCHS
    batch_size = cfg.TRAIN.BATCHSIZE

    if logger is None:
        print('>>> No tensorboard logger used in training.')
    else:
        print('>>> Logger is used in training.')
        counter = 0

    if len(save_to) == 0:
        print('>>> No checkpoints will be saved.')

    start_ep = 0  # initiate start epoch number

    # Continue training until the planned number of epochs is completed
    if is_continue:
        print('>>> Continue training from the latest checkpoint.')
        if save_to is None:
            print('>>> Without checkpoint folder, cannot continue training!')
            exit(0)
        ckpts = glob.glob(os.path.join(save_to, '*.pth'))
        if len(ckpts) == 0:
            print('>>> No earlier checkpoints, train from the beginning.')
        else:
            start_ckpt = find_latest_checkpoint(ckpts)
            print('>>> Found earlier checkpoints, continue training with {}.'.
                  format(start_ckpt))

            # load latest model
            start_ep = torch.load(os.path.join(save_to, start_ckpt))['epoch']
            model_state = torch.load(os.path.join(
                save_to,
                start_ckpt))['model_state_dict']  # load model weights, optimizer, scheduler state, etc.
            opt_state = torch.load(os.path.join(
                save_to, start_ckpt))['optimizer_state_dict']
            model.load_state_dict(model_state)
            optimizer.load_state_dict(opt_state)
            optimizer = opt_to_gpu(optimizer, torch.cuda.is_available())
            if scheduler is not None:
                scheduler_state = torch.load(os.path.join(
                    save_to, start_ckpt))['scheduler_state_dict']
                scheduler.load_state_dict(scheduler_state)
            if logger is not None:
                counter = torch.load(os.path.join(
                    save_to, start_ckpt))['logger_counter']

    # Start a new training run from scratch using only the pretrained weights
    if use_pretrained:
        print('>>> Use pretrained model weights to start a new training.')
        model_state = torch.load(
            cfg.TRAIN.PRETRAINED_PATH)['model_state_dict']  # load the model weights only
        model.load_state_dict(model_state)

    if torch.cuda.is_available():
        model = model.cuda()

    # training loop
    for epoch in range(start_ep, epochs):
        # extract global features
        print('>>> Extracting global features ...')
        features, v_labels, cam_labels = extract_global_features(
            img_shape=(256, 256),
            batch_size=batch_size,
            workers=8,
            model=model,
            dataset=dataset,
            mode='train',
            is_cuda=torch.cuda.is_available())

        # clustering
        print('>>> Start clustering ...')
        features = merge_features_from_dict(features)
        pseudo_labels, num_ids, centroids = Clusterer(
            features, eps=0.5, is_cuda=torch.cuda.is_available()).cluster(
                visualize_path=cluster_vis_path, epoch=epoch + 1)

        # create non-outlier refined dataset
        print('>>> Refining dataset ...')
        good_dataset = refine_dataset((256, 256), dataset, pseudo_labels)
        sampler = ClusterSampler(good_dataset)
        sampler = torch.utils.data.BatchSampler(sampler,
                                                batch_size=cfg.TRAIN.BATCHSIZE,
                                                drop_last=False)
        # good_dataloader = DataLoader(good_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
        good_dataloader = DataLoader(good_dataset,
                                     shuffle=False,
                                     batch_sampler=sampler,
                                     num_workers=8)

        # memory bank initialization
        memory = MemoryBank(num_feature_dims=2048,
                            num_samples=num_ids,
                            temp=0.07,
                            momentum=0.02)
        memory = init_memory_bank(memory, centroids)

        # training step
        for i, (imgs, pids, fnames, vids,
                camids) in enumerate(good_dataloader):
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                memory = memory.cuda()
            optimizer.zero_grad()
            features = model(imgs)
            loss = memory(features,
                          pids)  # update memory bank and compute loss
            loss.backward()
            optimizer.step()

            if (i + 1) % 50 == 0:  # print loss each 50 iters
                print('[epoch: {}/{}][iter: {}/{}] loss: {}'.format(
                    epoch + 1, epochs, i + 1, len(good_dataloader), loss))

            # update logger
            if logger is not None:
                logger.add_scalar('loss', loss.item(), global_step=counter)
                logger.add_scalar('cluster_centroids',
                                  memory.num_samples,
                                  global_step=counter)
                logger.add_scalar(
                    'lr',
                    optimizer.state_dict()['param_groups'][0]['lr'],
                    global_step=counter)
                counter += 1

        # update scheduler
        if scheduler is not None:
            scheduler.step()

        # save checkpoint
        if len(save_to) != 0 and (epoch + 1) % cfg.TRAIN.SAVE_INTERVAL == 0:
            save_name = os.path.join(save_to,
                                     'backbone-epoch-{}.pth'.format(epoch + 1))
            state_dict = {
                'epoch':
                epoch + 1,
                'model_state_dict':
                model.state_dict(),
                'optimizer_state_dict':
                optimizer.state_dict(),
                'scheduler_state_dict':
                scheduler.state_dict() if scheduler is not None else None,
                'logger_counter':
                counter if logger is not None else None
            }
            torch.save(state_dict, save_name)
            print('>>> Checkpoint is saved as {}.'.format(save_name))
Example #26
def main():

    # Read the command-line arguments
    try:
        opts, args = getopt.getopt(sys.argv[1:], "et:rcn:s", ["enc=", "chemin=", "nc=", "mots="])
    except getopt.GetoptError as err:
        print(err)
        sys.exit()

    task = None
    clusterWordList = []

    # Store the parsed command-line arguments
    for opt, arg in opts:
        if opt == '-e':
            task = 'training'
        elif opt == '-t':
            if arg.isnumeric():
                windowSize = arg
            else:
                print("Erreur! Taille de la fenêtre!")
                sys.exit(1)
        elif opt == '-r':
            task = 'search'
        elif opt == '-c':
            task = 'clustering'
        elif opt == '-n':
            if arg.isnumeric():
                wordQty = arg
            else:
                print("Erreur! Nombre de mots à afficher par centroïdes!")
                sys.exit(1)
        elif opt == '-s':
            task = 'table'
        elif opt == '--enc':
            fileEncoding = arg
        elif opt == '--chemin':
            filePath = arg
        elif opt == '--nc':
            if arg.isnumeric():
                clusteringType = 'random'
                clusterQty = arg
            else:
                print("Erreur! Nombre de centroïdes à afficher!")
                sys.exit(1)
        elif opt == '--mots':
            clusteringType = 'words'
            clusterWordList = arg.split(" ")
            clusterQty = len(clusterWordList)

    if task == 'training':
        trainer = Trainer(filePath, fileEncoding, windowSize)
        trainer.execute()

    elif task == 'search':
        try:
            searcher = Searcher(windowSize)
            if searcher.isWindowValid:
                searcher.execute()
            else:
                print("Aucune donnée pour la taille de fenêtre {}".format(windowSize))
        except:
            print("Erreur! Base de données inexistante!")
            
    elif task == 'clustering':
        clusterer = Clusterer(windowSize, wordQty, clusterWordList, clusterQty, clusteringType)

        if clusterer.isWindowValid:
            clusterer.execute()
        else:
            print("Aucune donnée pour la taille de fenêtre {}".format(windowSize))

    elif task == 'table':
        db = DBManager()
        print("Création de Dictionnaire_Commun")
        db.createTableDict()
        print("Création de Cooccurrences")
        db.createTableCooc()
        print("Fermeture de la connexion")
        db.closeConnection()

    else:
        print("Erreur! Arguments -e, -r ou -c introuvables")
        sys.exit()

    print("\nFin du programme")

    return 0
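
Typical invocations implied by the getopt specification above, written as hedged placeholders (the script name and file paths are assumptions):

# Hypothetical command lines for the option spec "et:rcn:s" /
# ["enc=", "chemin=", "nc=", "mots="]:
#
#   python main.py -e -t 5 --chemin corpus.txt --enc utf-8     # training
#   python main.py -c -t 5 -n 10 --nc 8                        # clustering, random centroids
#   python main.py -c -t 5 -n 10 --mots "chat chien maison"    # clustering around given words
#   python main.py -s                                          # (re)create the database tables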