Example #1
    def load_data(self):
        self.features, self.labels, self.idx_train, self.idx_val, self.idx_test \
            = feature_reader(dataset=self.dataset, scale=self.args.scale,
                            train_ratio=self.args.train_ratio, feature_size=self.args.feature_size)

        # print('feature_size', self.features.shape)
        self.n_nodes = len(self.labels)
        self.n_features = self.features.shape[1]
        self.n_classes = self.labels.max().item() + 1

        self.edges = graph_reader(dataset=self.dataset)

        self.labeler = Labeler(self.features, self.labels, self.n_classes,
                               self.idx_train, self.idx_val, self.idx_test)

        if self.mode in ('clusteradj', 'clusteradj-clean'):
            self.generate_fake_labels()
            if self.args.break_down:
                self.break_down()
            self.adj = self.build_cluster_adj()
            self.prj = self.build_cluster_prj()
        else:
            self.adj = self.build_adj_mat(mode=self.mode)

        # self.calculate_connectivity()

        if torch.cuda.is_available():
            self.features = self.features.cuda()
            self.adj = self.adj.cuda()
            self.labels = self.labels.cuda()
            if hasattr(self, 'prj'):
                self.prj = self.prj.cuda()
Example #2
from sklearn.naive_bayes import BernoulliNB


class Predicter(BernoulliNB):
    
    # Accepts a training data set given as a 2-column DataFrame (sentences, scores).
    
    def __init__(self):
        super().__init__()
    
    # initialize the labeler with the given dictionary
    def init_labeler(self, dictionary):
        self.labeler = Labeler(dictionary)
    
    # Once the labeler can vectorize our sentences, we are ready to train the model.
    # df[0] -> sentences
    # df[1] -> scores
    def train(self, sentence_list, labels):
        
        feature_vector = self.labeler.label_sentence_list(sentence_list)
        super().fit(feature_vector, labels)
    
    def test(self, sentences, real_values):
        # sentences -> pandas Series of sentences
        test_vector = self.labeler.label_sentence_list(sentences)
        test_results = super().predict(test_vector)
        # analyze the results
        correct = sum(1 for real, pred in zip(real_values, test_results) if real == pred)
        return correct / len(real_values)
    
    def predict_sentence(self, sentence):
        s = [sentence]
        vector = self.labeler.label_sentence_list(s)
        return super().predict(vector)[0]
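
For context, a minimal usage sketch of the Predicter above (the word-index dictionary passed to init_labeler and the exact behaviour of Labeler.label_sentence_list are assumptions, not shown in the example):

# Hypothetical usage of the Predicter class above; the dictionary format is an assumption.
sentences = ["great movie", "terrible plot", "really great acting"]
scores = [1, 0, 1]

predicter = Predicter()
predicter.init_labeler({"great": 0, "terrible": 1, "movie": 2, "plot": 3, "acting": 4})
predicter.train(sentences, scores)

print(predicter.test(["great plot"], [1]))             # accuracy on a tiny held-out set
print(predicter.predict_sentence("terrible acting"))   # single-sentence prediction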
Example #3
    def __init__(self, mode="rb"):
        if mode == "wb":
            self.__labeler = Labeler(mode)
        self.__model = None
        global graph

        if os.path.isfile(MODEL_LOCATION):
            self.__model = load_model(MODEL_LOCATION)
            graph = tf.get_default_graph()
        else:
            print("Could not init TLClassifier!")
Example #4
def load_and_label_training():
    for directory in os.listdir(train_dir):
        current_dir = os.path.join(train_dir, directory)
        if os.path.isdir(current_dir):
            for filename in os.listdir(current_dir):
                f = os.path.join(current_dir, filename)
                if os.path.isfile(f):
                    for label in pa.answers:
                        if label in directory:
                            training_images.append(
                                Labeler(filename, load_image(f), label))
Example #5
    def __init__(self, cp, snaptype):
        self.cp = cp.getSection('snapshot.' + snaptype)
        self.labeler = Labeler(cp)
        self.cmprs = []
        for col, cmpn in json.loads(self.cp("comparators")):
            cmpsn = "comparator.%s" % cmpn
            self.cmprs.append(Comparator(col - 1, cp.getSection(cmpsn), self.labeler))

        self.pubDate = time.strftime(self.cp('dateFormat'))
        self.typ = self.cp('type')
        self.stype = self.cp('stype')
        self.sformat = self.cp('sformat', True)
Example #6
class Snapshot:
    def __init__(self, cp, snaptype):
        self.cp = cp.getSection('snapshot.' + snaptype)
        self.labeler = Labeler(cp)
        self.cmprs = []
        for col, cmpn in json.loads(self.cp("comparators")):
            cmpsn = "comparator.%s" % cmpn
            self.cmprs.append(Comparator(col - 1, cp.getSection(cmpsn), self.labeler))

        self.pubDate = time.strftime(self.cp('dateFormat'))
        self.typ = self.cp('type')
        self.stype = self.cp('stype')
        self.sformat = self.cp('sformat', True)

    def convertZeros(self, r):
        for i in range(len(r)):
            if r[i] == '0':
                r[i] = ''
        return r

    # Compares two files, f1 and f2, and reports any differences.
    # Written as an iterator that yields a series of dicts;
    # each one is a difference record.
    def diffs(self, f1, f2):
        # for each pair of records
        for idVal, r1, r2 in mergeIter(f1, f2, all=True):
            r1 = self.convertZeros(r1) if r1 else None
            r2 = self.convertZeros(r2) if r2 else None
            # try each comparison
            for cmpr in self.cmprs:
                try:
                    # report any diffs
                    for d in cmpr.diffs(r1, r2):
                        d['id'] = idVal
                        d['type'] = self.typ
                        if d.get('subject', None) is None:
                            d['subject'] = self.labeler.get(self.stype, idVal, self.sformat)
                            if d['subject'] is None:
                                logging.warning("No label found for: " + idVal)
                                d['subject'] = '???'
                        d['label'] = xmlEscape(d['subject']) + ' [' + idVal + ']'
                        d['updateMessage'] = xmlEscape(d['updateMessage'] % d)
                        d['pubDate'] = self.pubDate
                        yield d
                except Exception:
                    print("ERROR!")
                    print("comparator =", str(cmpr))
                    print("r1 =", r1)
                    print("r2 =", r2)
                    raise
Example #7
import logging
import subprocess

import pandas as pd
from statistics import mean, variance
from tqdm import tqdm
from PIL import Image

from shared import directory_contents
from labeler import Labeler

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)

INPUTS = f"{PROJECT_ROOT}/outputs"
OUTPUTS = f"{PROJECT_ROOT}/outputs/patches"

LABELS = pd.read_csv(f"{INPUTS}/labels.csv")
LABELER = Labeler(PROJECT_ROOT)

KEEP_TOP_N = 10


def delete_patch(fname):
    subprocess.Popen(["rm", fname])


def delete_patches_using_labels():
    for subdir in tqdm(directory_contents(INPUTS)):
        for fname in tqdm(directory_contents(subdir)):
            label = LABELER.labels(fname, top_n_labels=1)[0]
            if label in LABELS.label.unique():
                if (LABELS.loc[LABELS.label == label, "action"] == "delete").any():
                    delete_patch(fname)
Example #8
    def init_labeler(self, dictionary):
        self.labeler = Labeler(dictionary)
Example #9
if __name__ == "__main__":
    posts_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\anon.contributions.csv"
    path_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus.pkl"
    path_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus_embeddings.pkl"
    label_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\Labeler.pkl"

    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("", index=True)
    as2, followup_as2 = data_loader.questions_in_folder("assignment2", index=True)

    bert_s_s = BertSemanticSearch().from_files(path_corpus, path_corpus_embeddings)

    # label dataset
    labeler = Labeler(label_path)

    for i in range(len(as2)):
        idx, text = as2[i]
        choices_idx = bert_s_s.single_semantic_search(text, 10)

        labeler.label(
            text=text,
            text_idx=idx,
            choices=[qs[int(choice_idx)][1] for choice_idx in choices_idx],
            choices_idx=[qs[int(choice_idx)][0] for choice_idx in choices_idx]
        )
        print(labeler.labels)

    labeler.save()
Example #10
class TLClassifier(object):
    def __init__(self, mode="rb"):
        if mode == "wb":
            self.__labeler = Labeler(mode)
        self.__model = None
        global graph

        if os.path.isfile(MODEL_LOCATION):
            self.__model = load_model(MODEL_LOCATION)
            graph = tf.get_default_graph()
        else:
            print("Could not init TLClassifier!")

    @property
    def model(self):
        assert self.__model is not None
        return self.__model

    def save_image(self, image, label):
        cv2.imwrite("test.png", image)
        return self.__labeler.label_image(image, label)

    def get_classification(self, image):
        """Determines the color of the traffic light in the image

        Args:
            image (cv::Mat): image containing the traffic light

        Returns:
            int: ID of traffic light color (specified in styx_msgs/TrafficLight)
            uint8 UNKNOWN=4
            uint8 GREEN=2
            uint8 YELLOW=1
            uint8 RED=0
        """
        # TODO implement light color prediction
        if self.__model is None:
            return TrafficLight.UNKNOWN

        image_array = np.array(resize_image(image))
        global graph
        with graph.as_default():
            traffic_light = int(
                self.model.predict(image_array[None, :, :, :], batch_size=1))
            prob = self.model.predict_proba(image_array[None, :, :, :],
                                            batch_size=1)

            if prob < 0.5:
                print("Using hough due to low probability: " + str(prob))
                return self.__hough_stop_light_detector(image_array)

        if traffic_light == 0:
            return TrafficLight.RED
        elif traffic_light == 1:
            return TrafficLight.YELLOW
        elif traffic_light == 2:
            return TrafficLight.GREEN
        return TrafficLight.UNKNOWN

    def __hough_stop_light_detector(self, img):
        gray = np.array(img)[:, :, 2]
        cv2.medianBlur(gray, 7)
        circles = cv2.HoughCircles(
            gray,
            cv2.HOUGH_GRADIENT,
            dp=1.0,
            minDist=5,
            param1=100,
            param2=15,
            minRadius=3,
            maxRadius=10,
        )
        if circles is not None:
            circles = np.uint16(np.around(circles))
            center_dots = []
            for i in circles[0, :]:
                # draw the outer circle
                cv2.circle(img, (i[0], i[1]), i[2], (0, 255, 0), 2)
                # draw the center of the circle
                cv2.circle(img, (i[0], i[1]), 2, (0, 0, 255), 3)
                center_dots.append(img[i[1], i[0]])

            median = np.median(center_dots, axis=0)
            is_red = median[0] < 10 and median[1] < 10 and median[2] > 200
            is_green = median[0] < 10 and median[1] > 200 and median[2] < 10
            if is_red:
                return TrafficLight.RED
            elif is_green:
                return TrafficLight.GREEN
            # TODO: orange case

        return TrafficLight.UNKNOWN
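
A minimal usage sketch for the classifier above (the sample image path is an assumption; MODEL_LOCATION, resize_image, and TrafficLight come from the surrounding project and are not shown here):

# Hypothetical usage of the TLClassifier defined above.
import cv2

classifier = TLClassifier()                      # loads the Keras model if MODEL_LOCATION exists
image = cv2.imread("sample_traffic_light.png")   # sample path is an assumption
state = classifier.get_classification(image)     # RED=0, YELLOW=1, GREEN=2, UNKNOWN=4
print("Predicted traffic light state:", state)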
Example #11
cols = 2
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.07,
                    top=0.95,
                    left=0.12,
                    right=0.98,
                    hspace=0.2,
                    wspace=0.5)

# Turn axes off on upper left corner plots
for i in range(2):
    for j in range(2):
        plt.setp(axes[i, j], frame_on=False, xticks=[], yticks=[])

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.0, fontsize=10)

# Label upper left corner
#ax = axes[0,0]
ax = plt.subplot(451)
labeler.label_subplot(ax, 'A')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])

ax = plt.subplot(453)
labeler.label_subplot(ax, 'B')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])

ax = plt.subplot(455)
labeler.label_subplot(ax, 'C')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])
Example #12
def generator(samples, batch_sz=32):
    num_samples = len(samples)
    while True:  # Loop forever so the generator never terminates
        shuffle(samples)
        for offset in range(0, num_samples, batch_sz):
            batch_samples = samples[offset : offset + batch_sz]

            tmp_features, tmp_labels = load_data(batch_samples)
            yield shuffle(tmp_features, tmp_labels)


# ======== MAIN ========
feature_shape = [150, 200, 3]

labeler = Labeler("rb")
data = labeler.load()
data["features"] = data["features"].reshape([-1] + feature_shape)

print("Labels: " + str(data["labels"].size))
print("Features: " + str(data["features"].size))
print("Shape: " + str(data["features"].shape))

model = None
if os.path.isfile("model.h5"):
    model = load_model("model.h5")
    print("+++ TRANSFER LEARNING +++")
elif feature_shape is not None:
    model = Sequential()
    model.add(Lambda(lambda x: x / 255.0 - 0.5, input_shape=feature_shape))
    #    tmp_model.add(Cropping2D(cropping=((70, 25), (0, 0))))
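
For context, a generator like the one above is normally handed to Keras' fit_generator; a minimal sketch, assuming train_samples / valid_samples lists of the sample records expected by load_data and a compiled model (neither is shown in the original example):

# Sketch only: train_samples / valid_samples and the compile settings are assumptions.
batch_sz = 32
train_gen = generator(train_samples, batch_sz)
valid_gen = generator(valid_samples, batch_sz)

model.compile(loss="mse", optimizer="adam")
model.fit_generator(train_gen,
                    steps_per_epoch=len(train_samples) // batch_sz,
                    validation_data=valid_gen,
                    validation_steps=len(valid_samples) // batch_sz,
                    epochs=5)
model.save("model.h5")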
Example #13
#fig, axes = plt.subplots(rows,cols,figsize=figsize)

bottom = 0.15
top = 0.95
width = 0.7
height = top - bottom
pad = 0.1
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.1,
                    top=0.95,
                    left=0.1,
                    right=0.9,
                    wspace=0.6,
                    hspace=0.4)

labeler = Labeler(xpad=.08, ypad=.01, fontsize=10)

ax = axes[0, 0]
labeler.label_subplot(ax, 'A')

lims = [-6, -2]
plot_fraction_compare(rep1, rep3, ax)
ax.plot(lims, lims, '--', c='k', zorder=10)
ax.set_ylabel('fraction of population\nreplicate 1', labelpad=2)
ax.set_xlabel('fraction of population\nreplicate 3', labelpad=2)
ticks = range(lims[0], lims[1] + 1)
tick_labels = [r'$10^{' + str(t) + '}$' for t in ticks]
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
Example #14
plt.subplots_adjust(
    #top=.55,
    #bottom=.05,
    top=0.98,
    bottom=0.38,
    left=.12,
    right=.95,
    hspace=0,
    wspace=.5)

# Tite-Seq vs Flow

# Panel C

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=-.01, fontsize=10)

# Position panel
#bottom=0.62
#top=0.98
bottom = 0.05
top = 0.30
left = 0.30
right = 0.75
height = top - bottom
width = right - left
ax = fig.add_axes([left, bottom, width, height])
labeler.label_subplot(ax, 'C', xpad_adjust=.05, ypad_adjust=0)

log_bounds = [-10, -4.5]
lims = log_bounds
Example #15
    posts_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\anon.contributions.csv"
    dupe_check_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\dupe_check.pkl"
    label_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\dupe_check_labels.pkl"

    data_loader = DataLoader()
    data_loader.load(posts_path)

    # map question indices to their text
    qs, followup_qs = data_loader.questions_in_folder("", index=True)
    qs = {q[0]: q[1] for q in qs}

    # load piazza's pred
    dupe_check = load_pickle(dupe_check_path)

    # label dataset
    labeler = Labeler(label_path)

    # # randomly select 100
    # indices = random.sample([i for i in range(len(dupe_check))], 100)
    # dupe_check = [dupe_check[i] for i in indices]

    for curr in dupe_check:
        idx = curr[0]
        text = qs[idx]

        labeler.label(
            text=text,
            text_idx=idx,
            choices=[qs[qidx] for qidx in curr[1:]],
            choices_idx=curr[1:]
        )
Example #16
plt.close('all')

# Create figure with subplots and specified spacing
figsize = (6, 7)
rows = 10
cols = 4
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(top=.98,
                    bottom=.05,
                    left=.05,
                    right=.95,
                    hspace=0,
                    wspace=.5)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.03, ypad=-.01, fontsize=10)

# fluorescein grid

summary = get_clone_data()
clones = summary.keys()
inds = np.argsort(
    [np.nanmean(np.log10(np.array(summary[k]['KD']))) for k in clones])

# Panel B
labeler.label_subplot(axes[0, 0], 'A')
fl = np.array([
    0, 10**-9.5, 10**-9, 10**-8.5, 10**-8, 10**-7.5, 10**-7, 10**-6.5, 10**-6,
    10**-5.5, 10**-5
])
Example #17
from downloader import Downloader
from labeler import Labeler

if __name__ == "__main__":
    downloader = Downloader("apple", "data", img_count=5)
    downloader.download()
    labeler = Labeler("./data", ["apple", "not apple"],
                      dataset_dir="def_not_data")
    labeler.label()
Example #18
import logging

from flask import Flask

from flask_ask import Ask, question, session, context, version

from labeler import Labeler

app = Flask(__name__)

ask = Ask(app, "/")

labeler = Labeler()

# log = logging.getLogger("flask_ask").setLevel(logging.DEBUG)
logging.basicConfig(level=logging.INFO)


@ask.launch
def new_game():
    logging.info("Session New?: {}".format(session.new))
    logging.info("User ID: {}".format(session.user.userId))
    logging.info("Alexa Version: {}".format(version))
    logging.info("Device ID: {}".format(context.System.device.deviceId))
    #     logging.info("Device: {}".format(context.System.device.keys()))
    logging.info("System: {}".format(context.System))
    print("User: {}".format(context.System.user))
    return labeler.get_intro_statement()


@ask.intent("GlobalIntent", convert={"item": str})
Example #19
n_trials = len(crp_ratios_nbr_mat)
nst_pval = stats.binom_test(n_success, n_trials)
print('Nonparametric sign test for nbr > mat: P = %f' % nst_pval)

#
# Make figure
#

width = 160
height = 175
bottom = 5
fig = plt.figure(figsize=(mm2inch(width), mm2inch(height + bottom)))
sns.set(font_scale=0.8)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.02, fontsize=10)

left = width_mm2fig(15, fig)
stat_left = left
middle = width_mm2fig(70, fig)
right = width_mm2fig(125, fig)

level1 = height_mm2fig(140 + bottom, fig)
level2 = height_mm2fig(105 + bottom, fig)
level3 = height_mm2fig(60 + bottom, fig)
level4 = height_mm2fig(10 + bottom, fig)

hm_width = width_mm2fig(160, fig)
hm_height = height_mm2fig(20, fig)
stat_width = width_mm2fig(30, fig)
stat_height = height_mm2fig(30, fig)
Example #20
    # Create figure with subplots and specified spacing
    figsize = (3.5, 5.6)
    rows = 14
    cols = 1
    col = 1
    fig, axes = plt.subplots(figsize=figsize)
    gs = gridspec.GridSpec(28, 2)
    plt.subplots_adjust(bottom=0.06,
                        top=0.95,
                        left=0.17,
                        right=0.96,
                        wspace=0.6,
                        hspace=0.0)

    # Make a labeler to add labels to subplots
    labeler = Labeler(xpad=.13, ypad=.01, fontsize=10)

    # For CDR1H and CDR3H
    conc_labels = ['$0$'] + \
                  ['$10^{%1.1f}$'%x for x in np.arange(-9.5,-4.5,0.5)]

    file_labels = ['0M'] + \
                  ['10^%1.1fM'%x for x in np.arange(-9.5,-4.5,0.5)]

    #csv_name = 'out.csv'
    filenames = get_filenames(directory)
    names = [re.search(r'Sort (\d+)', ii) for ii in filenames]
    condition = [n.group(1) for n in names]

    # Make plots
    for [filename, well] in zip(filenames, condition):
Example #21
labelsize = 8
panelsize = 12

param_lims = [-1,1]
param_ticks = [-1,-.5,0,.5,1]

# Set colormaps
cmap = sns.cubehelix_palette(8, start=0.0, rot=0.0, reverse=True, as_cmap=True) 
vmax = 100
vmin = 75

sns.set_style('white')

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.02, fontsize=10)

## RNAP heatmap

# Plot results for real RNAP data
ax = fig.add_axes([left, level1, hm_width, hm_height])
labeler.label_subplot(ax,'A',xpad_adjust=0.03,ypad_adjust=0.04)
sns.heatmap(
    df_rnap_comparison.transpose(), annot=True, fmt="d", vmin=vmin, vmax=vmax, 
    annot_kws={"size": 7}, cmap=cmap, cbar_kws={"pad":.03})
gelx(ax,df_rnap_xannotation,annotation_spacing=0.8,fontsize=labelsize)
gely(ax,df_rnap_yannotation,annotation_spacing=0.8,fontsize=labelsize,rotation=0)

# Draw white lines 
(num_cols,num_rows) = df_rnap_comparison.shape
for y in range(num_rows):
Example #22
        shutil.copy(f, dst)


if __name__ == "__main__":
    wandb_logger = WandbLogger(project="nnsplit")

    parser = Network.get_parser()
    parser.set_defaults(logger=wandb_logger)
    hparams = parser.parse_args()

    if hparams.logger:
        store_code(wandb_logger.experiment)

    labeler = Labeler([
        SpacySentenceTokenizer("de_core_news_sm",
                               lower_start_prob=0.7,
                               remove_end_punct_prob=0.7),
        SpacyWordTokenizer("de_core_news_sm"),
    ])

    model = Network(
        MemoryMapDataset("../train_data/texts.txt",
                         "../train_data/slices.pkl"),
        labeler,
        hparams,
    )
    n_params = np.sum([np.prod(x.shape) for x in model.parameters()])

    trainer = Trainer.from_argparse_args(hparams)
    print(f"Training model with {n_params} parameters.")
    trainer.fit(model)
Example #23
# Commented out in the original source:
# n_trials = len(crp_ratios_nbr_mat)
# nst_pval = stats.binom_test(n_success, n_trials)
# print('Nonparametric sign test for nbr > mat: P = %f' % nst_pval)
#
# Make figure
#

width=160
height=175
bottom=5
fig = plt.figure(figsize=(mm2inch(width),mm2inch(height+bottom)))
sns.set(font_scale=0.8)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.02, fontsize=10)

left = width_mm2fig(15,fig)
stat_left = left
middle = width_mm2fig(70,fig)
right = width_mm2fig(125,fig)

level1 = height_mm2fig(140+bottom,fig)
level2 = height_mm2fig(105+bottom,fig)
level3 = height_mm2fig(60+bottom,fig)
level4 = height_mm2fig(10+bottom,fig)

hm_width = width_mm2fig(160,fig)
hm_height = height_mm2fig(20,fig)
stat_width = width_mm2fig(30,fig)
stat_height = height_mm2fig(30,fig)
Example #24
class ClusterWorker():
    def __init__(self, args, dataset='', mode=''):
        self.args = args
        self.dataset = dataset
        self.mode = mode
        self.load_data()

    def build_cluster_adj(self, clean=False):
        """
        Build an adjacency matrix that only records which fake-label clusters each node links to (edge counts per cluster).
        """
        adj = np.zeros((self.n_nodes, self.n_clusters), dtype=np.float64)

        for dst, src in self.edges.tolist():
            adj[src, self.fake_labels[dst]] += 1
            adj[dst, self.fake_labels[src]] += 1

        if self.mode == 'clusteradj' and not clean:
            adj += get_noise(self.args.noise_type,
                             self.n_nodes,
                             self.n_clusters,
                             self.args.noise_seed,
                             eps=self.args.epsilon,
                             delta=self.args.delta)

            adj = np.clip(adj, a_min=0, a_max=None)
            adj = normalize(adj)
            return torch.FloatTensor(adj)

        adj = sp.coo_matrix(adj)
        adj = normalize(adj)
        return sparse_mx_to_torch_sparse_tensor(adj)

    def build_cluster_prj(self):
        """
        :return: a projection matrix; each column has one non-zero element, equal to the inverse
        of the size of the cluster that node belongs to.
        """
        unique, count = np.unique(self.fake_labels, return_counts=True)

        prj = np.zeros((self.n_clusters, self.n_nodes))

        for i, label in enumerate(self.fake_labels):
            prj[label, i] = 1 / count[label]
        return torch.FloatTensor(prj)

    def break_down(self):
        """
        Generate broken-down fake labels by splitting each cluster into equal-sized sub-clusters.
        """
        indice = [[] for i in range(self.n_classes)]
        for i, label in enumerate(self.fake_labels):
            indice[label].append(i)

        unique, count = torch.unique(self.fake_labels, return_counts=True)

        # print('unique', unique)
        # print('count', count)
        min_size = int(torch.min(count).item() * self.args.break_ratio + 0.5)
        if min_size == 0: min_size = 1
        # print('min_size', min_size)
        split = [self.labeler.get_equal_size(val, min_size) for val in count]

        # print('split', [elem[0] for elem in split], sum([elem[0] for elem in split]))

        t0 = time.time()
        start = 0
        # hierarchical clustering
        for i in range(self.n_classes):
            idx = indice[i]  # all indices in fake_labels whose label is i
            if not idx: continue

            n_clusters, quota = split[(unique == i).nonzero().item()]

            self.fake_labels[idx] = self.labeler.get_cluster_labels(
                self.features[idx],
                n_clusters,
                quota=quota,
                start=start,
                same_size=True)
            start += n_clusters

        self.n_clusters = start  # the number of class after clustering

        print('generating broken down fake labels done using {} secs!'.format(
            time.time() - t0))
        # torch.save(self.fake_labels, 'flabels_{}.pt'.format(self.n_clusters))

    def build_adj_vanilla(self):
        adj = np.zeros((self.n_nodes, self.n_nodes), dtype=np.float64)
        for dst, src in self.edges:
            adj[src][dst] = adj[dst][src] = 1

        t0 = time.time()
        adj += get_noise(self.args.noise_type,
                         self.n_nodes,
                         self.n_nodes,
                         self.args.noise_seed,
                         eps=self.args.epsilon,
                         delta=self.args.delta)
        adj = np.clip(adj, a_min=0, a_max=None)
        print('adding noise done using {} secs!'.format(time.time() - t0))
        return adj

    def build_adj_original(self):
        adj = sp.coo_matrix((np.ones(self.edges.shape[0]),
                             (self.edges[:, 0], self.edges[:, 1])),
                            shape=(self.n_nodes, self.n_nodes),
                            dtype=np.float32)

        # build symmetric adjacency matrix
        adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
        return adj

    def build_adj_mat(self, mode='vanilla-clean'):
        if mode == 'vanilla-clean':
            adj = self.build_adj_original()

        elif mode == 'vanilla':
            adj = self.build_adj_vanilla()

        else:
            raise NotImplementedError(
                'mode = {} not implemented!'.format(mode))

        adj = normalize(adj + sp.eye(adj.shape[0]))
        adj = sparse_mx_to_torch_sparse_tensor(
            adj) if mode == 'vanilla-clean' else torch.FloatTensor(adj)
        return adj

    def load_data(self):
        self.features, self.labels, self.idx_train, self.idx_val, self.idx_test \
            = feature_reader(dataset=self.dataset, scale=self.args.scale,
                            train_ratio=self.args.train_ratio, feature_size=self.args.feature_size)

        # print('feature_size', self.features.shape)
        self.n_nodes = len(self.labels)
        self.n_features = self.features.shape[1]
        self.n_classes = self.labels.max().item() + 1

        self.edges = graph_reader(dataset=self.dataset)

        self.labeler = Labeler(self.features, self.labels, self.n_classes,
                               self.idx_train, self.idx_val, self.idx_test)

        if self.mode in ('clusteradj', 'clusteradj-clean'):
            self.generate_fake_labels()
            if self.args.break_down:
                self.break_down()
            self.adj = self.build_cluster_adj()
            self.prj = self.build_cluster_prj()
        else:
            self.adj = self.build_adj_mat(mode=self.mode)

        # self.calculate_connectivity()

        if torch.cuda.is_available():
            self.features = self.features.cuda()
            self.adj = self.adj.cuda()
            self.labels = self.labels.cuda()
            if hasattr(self, 'prj'):
                self.prj = self.prj.cuda()

    def generate_fake_labels(self):
        cluster_method = self.args.cluster_method
        t0 = time.time()

        if cluster_method == 'random':
            self.n_clusters = self.args.n_clusters
            self.fake_labels = self.labeler.get_random_labels(
                self.n_clusters, self.args.cluster_seed)

        elif cluster_method == 'hierarchical':
            init_method = self.args.init_method
            self.n_clusters = self.n_classes

            if init_method == 'naive':
                self.fake_labels = self.labeler.get_naive_labels(
                    self.args.assign_seed)

            elif init_method == 'voting':
                self.fake_labels = self.labeler.get_majority_labels(
                    self.edges, self.args.assign_seed)

            elif init_method == 'knn':
                self.fake_labels = self.labeler.get_knn_labels(self.args.knn)

            elif init_method == 'gt':
                self.fake_labels = self.labels.clone()

            else:
                raise NotImplementedError(
                    'init_method={} in cluster_method=label not implemented!'.
                    format(init_method))

        elif cluster_method in ('kmeans', 'sskmeans'):
            self.n_clusters = self.args.n_clusters
            self.fake_labels = self.labeler.get_kmeans_labels(
                self.n_clusters,
                self.args.knn,
                cluster_method,
                same_size=self.args.same_size)

        else:
            raise NotImplementedError(
                'cluster_method={} not implemented!'.format(cluster_method))

        print('generating fake labels done using {} secs!'.format(time.time() -
                                                                  t0))
        # torch.save(self.fake_labels, 'flabels_{}.pt'.format(self.n_clusters))

    def calculate_connectivity(self):
        n_edges = len(self.edges)
        kappa = n_edges / (0.5 * self.n_nodes * (self.n_nodes - 1))
        labels = self.fake_labels

        edge_adj = np.zeros((self.n_clusters, self.n_clusters))
        for edge in self.edges:
            u, v = labels[edge[0]], labels[edge[1]]
            edge_adj[u][v] += 1
            edge_adj[v][u] += 1

        unique, count = np.unique(labels, return_counts=True)

        kappa_intra = 0
        for i in range(self.n_clusters):
            kappa_intra += edge_adj[i][i] / (0.5 * count[i] * (count[i] - 1))
        kappa_intra /= self.n_clusters

        kappa_inter = 0
        for i in range(self.n_clusters):
            for j in range(i + 1, self.n_clusters):
                kappa_inter += edge_adj[i][j] / (count[i] * count[j])
        kappa_inter /= (0.5 * self.n_clusters * (self.n_clusters - 1))

        print('k_inter = {:4f}, k = {:4f}, k_intra = {:4f}'.format(
            kappa_inter, kappa, kappa_intra))
        logging.info('k_inter = {:4f}, k = {:4f}, k_intra = {:4f}'.format(
            kappa_inter, kappa, kappa_intra))

    def calculate_degree(self):
        degrees = np.zeros(self.n_nodes)
        for edge in self.edges:
            u, v = edge
            degrees[u] += 1
            degrees[v] += 1
        return degrees

    def update_adj(self):
        if self.mode == 'clusteradj':
            self.adj = self.build_cluster_adj(clean=True)
        elif self.mode == 'vanilla':
            self.adj = self.build_adj_mat(mode='vanilla-clean')

        if torch.cuda.is_available():
            self.adj = self.adj.cuda()
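
Example #24 relies on normalize and sparse_mx_to_torch_sparse_tensor helpers that are not shown; a common formulation of those utilities (a sketch based on the widely used pyGCN-style code, which may differ from what this repository actually uses) is:

import numpy as np
import scipy.sparse as sp
import torch

def normalize(mx):
    # Row-normalize a (sparse or dense) matrix.
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1.0).flatten()
    r_inv[np.isinf(r_inv)] = 0.0          # rows that sum to zero stay zero
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    # Convert a scipy sparse matrix to a torch sparse FloatTensor.
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)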
Example #25
black = [0., 0., 0.]

# Create figure with subplots and specified spacing
figsize = (3.42, 4.5)
rows = 2
cols = 2
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.07,
                    top=0.95,
                    left=0.07,
                    right=0.88,
                    hspace=0.4,
                    wspace=0.6)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.035, ypad=.015, fontsize=10)

wtseq1 = 'TFSDYWMNWV'
seq1pos = np.arange(28, 38)
optseq1_dict = {30: 'G', 31: 'H'}
wtseq2 = 'GSYYGMDYWG'
seq2pos = np.arange(100, 110)
optseq2_dict = {101: 'A', 102: 'S', 106: 'E', 108: 'L'}

# Get affinity zero
A_heatmaps = []
A_wts = []
for rep in all_reps:
    temp_hm, wt_temp = c_matrix(rep, aff_fun)
    A_heatmaps.append(temp_hm)
    A_wts.append(wt_temp)
Example #26
    )
    parser.add_argument(
        "--model_path",
        help="Directory to store the model at.",
    )

    hparams = parser.parse_args()

    if hparams.logger:
        store_code(wandb_logger.experiment)

    labeler = Labeler([
        SpacySentenceTokenizer(hparams.spacy_model,
                               lower_start_prob=0.7,
                               remove_end_punct_prob=0.7,
                               punctuation=".?!"),
        SpacyWordTokenizer(hparams.spacy_model),
        WhitespaceTokenizer(),
        # SECOSCompoundTokenizer("../../../Experiments/SECOS/"), # used for german
    ])

    model = Network(
        MemoryMapDataset(hparams.text_path, hparams.slice_path),
        labeler,
        hparams,
    )
    n_params = np.sum([np.prod(x.shape) for x in model.parameters()])

    trainer = Trainer.from_argparse_args(hparams)
    print(f"Training model with {n_params} parameters.")
    trainer.fit(model)