def check_all(repo_kernel_file, threshold_list, top_k):
    total = 0
    tp_list = [0] * len(threshold_list)
    fp_list = [0] * len(threshold_list)
    tn_list = [0] * len(threshold_list)
    fn_list = [0] * len(threshold_list)
    acc_list = [0] * len(threshold_list)
    # read kernel only once
    sim = Similarity()
    sim.read_graph_kernels(repo_kernel_file)
    with open(repo_kernel_file, 'r') as fi:
        for line in fi:
            line = line.rstrip()
            parts = line.split('\t')
            dot_file = parts[0]
            result_program_list_with_score = sim.find_top_k_similar_graphs(
                dot_file, 'g', top_k, 3)  # num_iter = 3
            path_parts = dot_file.split(os.sep)
            true_prob = path_parts[-4]
            total += 1
            for (i, threshold) in enumerate(threshold_list):
                cr = check_result(true_prob, result_program_list_with_score,
                                  threshold)
                if cr == 'tp':
                    tp_list[i] += 1
                elif cr == 'fp':
                    fp_list[i] += 1
                elif cr == 'fn':
                    fn_list[i] += 1
                else:
                    tn_list[i] += 1
                acc = check_top_k_result(true_prob,
                                         result_program_list_with_score,
                                         threshold, top_k)
                acc_list[i] += acc
    return total, tp_list, fp_list, tn_list, fn_list, acc_list
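A minimal invocation sketch for check_all; the kernel-file path, threshold values, and k below are hypothetical placeholders:

# Hypothetical kernel file: tab-separated, first column a .dot graph path.
total, tp, fp, tn, fn, acc = check_all('repo_kernels.txt', [0.5, 0.7, 0.9], top_k=5)
print('precision at threshold 0.5:', tp[0] / max(tp[0] + fp[0], 1))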
Example #2
 def __init__(self):
     self.faces = {}
     self.similarity = Similarity()
     self.bling = Image.open('images/bling.png').resize((50, 50))
     for i in ikon_categories:
         self.faces[i] = Image.open('images/' + ikon_categories[i] +
                                    '.png').resize((60, 60))
Example #3
    def test_generate(self):

        df_features = self.spark.read.csv(
            'tests/fixtures/similarity/features.csv', header=True)

        columns_to_convert = [
            col for col in df_features.columns if 'id' not in col
        ]
        df_features_int = df_features
        for col in columns_to_convert:
            df_features_int = df_features_int.withColumn(
                col,
                f.col(col).cast(IntegerType()))

        similarity_cos = Similarity(df_features=df_features_int,
                                    similarity_type='cosine')

        pd_df_similarity_cos, _ = similarity_cos.generate()

        self.assertEqual(pd_df_similarity_cos.shape[0], df_features.count())
        self.assertEqual(pd_df_similarity_cos.shape[1], df_features.count())

        similarity_euc = Similarity(df_features=df_features_int,
                                    similarity_type='euclidean')

        pd_df_similarity_euc, _ = similarity_euc.generate()

        self.assertEqual(pd_df_similarity_euc.shape[0], df_features.count())
        self.assertEqual(pd_df_similarity_euc.shape[1], df_features.count())

        similarity_fail = Similarity(df_features=df_features_int,
                                     similarity_type='test')
        with self.assertRaises(ValueError):
            similarity_fail.generate()
Example #5
    def contentBasedFiltering(self, key, n=3):
        '''Return list of n top match scores along with other keys'''

        dataset = self.dataset
        scores = []
        sim = Similarity()

        for other_key in dataset:
            if other_key == key:
                continue

            # Fetching common inner keys to calculate similarity score
            common_inner_keys = self.fetchCommonInnerKeys(key, other_key)

            # If there are no common inner keys, skip this other key
            if len(common_inner_keys) == 0:
                continue

            x = [dataset[key][inner_key] for inner_key in common_inner_keys]
            y = [
                dataset[other_key][inner_key]
                for inner_key in common_inner_keys
            ]

            # Append the similarity score to the list
            scores.append((sim.pearson(x, y), other_key))

        # Sort the list so the highest scores appear at the top
        scores.sort(reverse=True)

        return scores[0:n]
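A sketch of the nested-dict dataset shape this method expects; the Recommender class name and all data below are hypothetical:

# Hypothetical dataset: outer keys map to dicts of inner-key ratings.
dataset = {
    'alice': {'item_a': 4.0, 'item_b': 2.5, 'item_c': 3.0},
    'bob': {'item_a': 3.5, 'item_b': 2.0, 'item_d': 5.0},
}
# rec = Recommender(dataset)               # hypothetical owner class
# rec.contentBasedFiltering('alice', n=2)  # -> [(pearson, 'bob'), ...]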
Example #6
def find_top_k_similar_program(repo_kernel_file, user_prog_graph_dot_file,
                               graph_name, k, num_iter, cluster_json):
    sim = Similarity()
    sim.read_graph_kernels(repo_kernel_file)
    result_program_list_with_score = sim.find_top_k_similar_graphs(
        user_prog_graph_dot_file, graph_name, k, num_iter, cluster_json)
    return result_program_list_with_score
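A hedged usage sketch; every path below is a placeholder, and num_iter=3 mirrors the value used in check_all above:

# Hypothetical inputs; yields (program, score) pairs, as consumed elsewhere here.
results = find_top_k_similar_program('repo_kernels.txt', 'user_prog.dot', 'g',
                                     k=5, num_iter=3,
                                     cluster_json='clusters.json')
for prog, score in results:
    print(prog, score)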
    def test__check_nulls_in_feature_columns(self):

        df_nulls_features = self.spark.read.csv(
            'tests/fixtures/similarity/nulls_features.csv', header=True)
        df_no_nulls_features = self.spark.read.csv(
            'tests/fixtures/similarity/no_nulls_features.csv', header=True)

        columns_to_convert_nulls = [
            col for col in df_nulls_features.columns if 'id' not in col
        ]
        for col in columns_to_convert_nulls:
            df_nulls_features = df_nulls_features.withColumn(
                col,
                f.col(col).cast(IntegerType()))

        columns_to_convert_no_nulls = [
            col for col in df_no_nulls_features.columns if 'id' not in col
        ]
        for col in columns_to_convert_no_nulls:
            df_no_nulls_features = df_no_nulls_features.withColumn(
                col,
                f.col(col).cast(IntegerType()))

        Similarity(df_features=df_no_nulls_features)

        with self.assertRaises(AssertionError):
            Similarity(df_features=df_nulls_features)
Example #9
def extractPhrasePaireFeature(phrasedir):
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))

            phrases = fio.LoadList(phrasefile)

            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0, {
                            'p1': p1,
                            'p2': p2
                        }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
Example #11
    def build_sim_matrix(self, sentence_list, logger):
        sim = Similarity()
        self.sentences = sentence_list
        sim_matrix = np.empty([len(self.sentences), len(self.sentences)])

        for i in range(0, len(self.sentences)):
            logger.info('Processing sentence # {} => {}'.format(
                i, self.sentences[i]))
            for j in range(i + 1, len(self.sentences)):
                s1 = self.sentences[i]
                s2 = self.sentences[j]
                try:
                    score = sim.calculate_similarity_score(s1, s2)
                except ZeroDivisionError:
                    # Default to zero so 'score' is always defined below
                    score = 0.0

                # print('{} | {} | {},{} | {}'.format(s1, s2, i, j, score))

                sim_matrix[i][j] = round(score, 2)
                sim_matrix[j][i] = sim_matrix[i][j]

            sim_matrix[i][i] = 1.00

        return sim_matrix
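A brief usage sketch; only the logger setup is concrete, while the owner class and the sentences are hypothetical:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ranker = SentenceRanker()  # hypothetical class owning build_sim_matrix
# matrix = ranker.build_sim_matrix(['First sentence.', 'Second one.'], logger)
# The result is symmetric, with 1.00 on the diagonal.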
Example #12
 def similarity_action(self):
     dialog = Similarity(parent=self, df=self.table)
     if dialog.exec_():
         res = dialog.execute()
         QMessageBox.information(
             self, f'Similarity: {dialog.method}',
             f'Columns {dialog.first_column} and {dialog.second_column} have a similarity value of {res}',
             QMessageBox.Ok)
Example #13
 def __init__(self):
     self.m_preprocessor = Preprocessor()
     self.m_similarity = Similarity()
     self.m_plt = Plot()
     self.m_evaluator = Evaluator()
     self.m_file = "sts-train"
     self.m_metric = "path"
     self.m_ic = "brown"
     self.m_metric_w2v = "cosine"
     self.m_metric_t = "path"
     self.m_thr = 20
     self.m_mode = "ontology"
Example #14
    def collaborativeRecommendation(self, key, n=3):
        '''Return list of n top match scores along with inner keys'''

        dataset = self.dataset
        weighted_inner_values = {}
        total_scores = {}
        sim = Similarity()

        for other_key in dataset:
            if other_key == key:
                continue

            # Fetching common inner keys to calculate similarity score
            common_inner_keys = self.fetchCommonInnerKeys(key, other_key)

            # If there are no common inner keys, skip this other key
            if len(common_inner_keys) == 0:
                continue

            x = [dataset[key][inner_key] for inner_key in common_inner_keys]
            y = [
                dataset[other_key][inner_key]
                for inner_key in common_inner_keys
            ]

            # Find the similarity score
            score = sim.pearson(x, y)

            # Ignoring scores of zero or below
            if score <= 0:
                continue

            for inner_key in dataset[other_key]:
                if inner_key not in dataset[key] or dataset[key][inner_key] == 0:
                    # Weighted sum of value times similarity score
                    weighted_inner_values.setdefault(inner_key, 0)
                    weighted_inner_values[inner_key] += score * dataset[other_key][inner_key]

                    # Sum of similarity score
                    total_scores.setdefault(inner_key, 0)
                    total_scores[inner_key] += score

        scores = [(weighted_inner_values[inner_key] / total_scores[inner_key],
                   inner_key) for inner_key in weighted_inner_values]

        # Sort the list so that the highest scores appear at the top
        scores.sort(reverse=True)

        return scores[0:n]
Example #15
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):
        print(doc)

        # load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #positive examples
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1, p2),
                                 score, {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
Example #16
def main(args, config):
    wDir = os.getcwd()
    # Instance of the Preprocessing class
    window = Preprocessing(args.fasta_file, config['win_length'], config['win_step'])
    window.output_window()
    print("Creating windows_sequence.fasta", file=sys.stderr)

    # Instances of the Similarity and Composition classes
    sim = Similarity(args.fasta_file, config['score_adj'], wDir)
    sim_matrix = sim.mcl_perform()
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()
    # Join similarity and composition matrices for PCA
    join = pd.concat([comp_matrix, sim_matrix], axis=1, join='inner')
    print("Calculating similarity and composition matrix", file=sys.stderr)

    # Instance of the Reduction class
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print("Performing PCA", file=sys.stderr)

    # Instance of the Clustering class
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print("Performing clustering plot", file=sys.stderr)

    # Instance of the ClusterReport class
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print("Doing report of clusters", file=sys.stderr)

    # Instance of the Validate class
    valid = Validate(file_name, args.fasta_file, wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print("Validation of results", file=sys.stderr)

    # Instance of the ParseJplace class
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print("Doing profiles", file=sys.stderr)

    # Instance of the Profiles class
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print("Doing permutations", file=sys.stderr)

    # Instance of the StatsBinom class
    finalResult = StatsBinom(args.fasta_file, config['win_length'], bestWin)
    finalResult.binomial()

    cleaning(file_name)
Example #18
def main(experiment_name, phenotypes, data_directory, anchor_genes,
         num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3,
         max_dangle_size=10, test_ratio=0.5):
    assert isinstance(phenotypes, list)
    alphas = random.choices(range(min_dangle_size, max_dangle_size),
                            k=int(num_anchors * test_ratio))
    assert len(alphas) < len(anchor_genes)
    anchor_train_groups = []
    anchor_test_groups = []
    backbones = []
    # Create all backbones
    for rep_id in range(num_replicates):
        random.shuffle(anchor_genes)
        candidates = anchor_genes[:int(num_anchors)]
        genes_of_interest_train, genes_of_interest_test = train_test_split(
            candidates,
            shuffle=True,
            test_size=test_ratio)

        anchor_train_groups.append(genes_of_interest_train)
        anchor_test_groups.append(genes_of_interest_test)
        backbones.append(
            build_backbone(anchors=anchor_train_groups[rep_id], alphas=alphas,
                           weight=1, edge_percentage=percent))
    # Write train anchors to file
    with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout:
        for gene_group in anchor_train_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Write test anchors to file
    with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout:
        for gene_group in anchor_test_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Adding the backbones and create the similarity object
    for pheno in phenotypes:
        file_name = os.path.join(data_directory, "{}.csv".format(pheno))
        for rep_id in range(num_replicates):
            sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id))
            out_address = os.path.join(experiment_name, sim_file_name)
            similarity = Similarity(file_name,
                                    anchors=anchor_train_groups[rep_id],
                                    alphas=alphas, string_id=True)
            similarity.transform()
            similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8,
                                       value=0)
            similarity.augment(backbones[rep_id])

            similarity.to_csv(out_address)
Example #19
class Prediction:

	def __init__(self, messages, model, questions: set, answers: set,
				 pc_questions: dict, pc_answers: dict, tokenizer):

		self.questions = questions
		self.answers = answers
		self.pc_questions = pc_questions
		self.pc_answers = pc_answers	
		self.tokenizer = tokenizer	
		self.model = model
		self.messages = messages
		self.pp = PreProcessing()		
		self.s = Similarity(questions=self.questions,
							answers=self.answers
							)


	def predict(self, msg):
		if msg == '' or msg is None:
			return emergency_message()
			
		try:
			msg = self.pp.pre_processing_text_for_similarity(msg)
			msg_nn = self.pp.pre_processing_text_for_neural_network(msg)
		except Exception as e:
			save_content_to_log(e)
			return BOT_PREFIX + emergency_message() + '\n' + str(e)

		if msg == '' or msg is None:
			return emergency_message()

		p = self.tokenizer.texts_to_matrix([msg_nn])

		res = self.model.predict(p)

		if res >= 0.5:
			pc = self.pc_questions
		else:
			pc = self.pc_answers

		conversations = self.s.return_conversation_by_cossine(msg, res)
		
		conversations = self.s.return_conversation_by_page_rank(msg, conversations,
																page_compute=pc,
																reverse=True)		
		
		return self.s.get_the_next_conversation(conversations, self.messages)
Example #20
def train_IE256_svm(traincourse, model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    features = allfeatures

    name = '_'.join(features)

    lectures = annotation.Lectures

    if traincourse == 'IE256':
        train = [x for x in range(14, 26) if x != 22]
    else:
        train = [x for x in range(3, 27)]

    model_file = os.path.join(model_dir, '%s_%s.model' % (traincourse, name))

    if fio.IsExist(model_file):
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)
    else:
        train_X, train_Y = combine_files_course(traincourse, train, features)
        clf = svm.SVC()
        clf.fit(train_X, train_Y)

        with open(model_file, 'wb') as handle:
            pickle.dump(clf, handle)
Example #21
def combine_files(lectures, features=None, prompts=['q1', 'q2']):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    X = []
    Y = []

    if features is None:
        sim_extractor = Similarity()
        features = sorted(sim_extractor.features.keys())

    for i, lec in enumerate(lectures):
        for q in prompts:

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)

                    X.append(row)
                    Y.append(score)

    return X, Y
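A hedged call sketch; the lecture ids are placeholders, and course and sim_exe are module-level globals the function assumes:

# Hypothetical lecture ids; with features=None the function derives the
# feature list from Similarity().features.
X, Y = combine_files([14, 15, 16])
print(len(X), 'rows,', len(X[0]) if X else 0, 'features')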
Example #22
def gather_performance(output):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    allbody = []
    for k in range(len(allfeatures) + 1):
        #features = allfeatures#['WordEmbedding']

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]
            #features = allfeatures[0:k] + allfeatures[k+1:]

        name = '_'.join(features)

        resultfile = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)

        head, body = fio.ReadMatrix(resultfile, hasHead=True)

        #get the average
        allhead = ['name'] + head[2:]
        average = [name]
        for i in range(2, len(head)):  #start from the third one
            values = [float(row[i]) for row in body]
            average.append(np.mean(values))

        allbody.append(average)

    fio.WriteMatrix(output, allbody, allhead)
    def test__convert_to_long_format(self):

        pd_df_similarities_wide = pd.read_csv(
            'tests/fixtures/similarity/similarities_wide.csv', index_col=0)

        df_simple_table = self.spark.read.csv(
            'tests/fixtures/similarity/simple_table.csv', header=True)
        columns_to_convert = [
            col for col in df_simple_table.columns if 'id' not in col
        ]
        for col in columns_to_convert:
            df_simple_table = df_simple_table.withColumn(
                col,
                f.col(col).cast(IntegerType()))
        similarity = Similarity(df_features=df_simple_table)

        pd_df_similarities_long = similarity._convert_to_long_format(
            pd_df_similarities_wide)

        self.assertEqual(
            pd_df_similarities_long.shape[0],
            pd_df_similarities_wide.shape[0] *
            pd_df_similarities_wide.shape[1])

        check_1_3 = pd_df_similarities_long.loc[
            (pd_df_similarities_long['recipe_id_1'] == 1)
            & (pd_df_similarities_long['recipe_id_2'] == '3'
               )]['similarity'].values[0]
        self.assertEqual(check_1_3, 9)

        check_3_1 = pd_df_similarities_long.loc[
            (pd_df_similarities_long['recipe_id_1'] == 3)
            & (pd_df_similarities_long['recipe_id_2'] == '1'
               )]['similarity'].values[0]
        self.assertEqual(check_3_1, 6)

        check_2_3 = pd_df_similarities_long.loc[
            (pd_df_similarities_long['recipe_id_1'] == 2)
            & (pd_df_similarities_long['recipe_id_2'] == '3'
               )]['similarity'].values[0]
        self.assertEqual(check_2_3, 1)

        check_1_2 = pd_df_similarities_long.loc[
            (pd_df_similarities_long['recipe_id_1'] == 1)
            & (pd_df_similarities_long['recipe_id_2'] == '2'
               )]['similarity'].values[0]
        self.assertEqual(check_1_2, 6)
Example #25
def train_leave_one_lecture_out(model_dir, name='simlearn_cv'):
    #     model_dir = '../data/IE256/%s/model/%s/'%(system, name)
    #     fio.NewPath(model_dir)
    #
    #     outputdir = '../data/IE256/%s/extraction/%s_output/'%(system, name)
    #     fio.NewPath(outputdir)

    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    if True:
        k = len(allfeatures)
        #for k in range(len(allfeatures)+1):
        #features = allfeatures#['WordEmbedding']

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]

        name = '_'.join(features)

        lectures = annotation.Lectures

        MSE = []
        for i, lec in enumerate(lectures):
            train = [x for x in lectures if x != lec]
            test = [lec]

            print(train)
            print(test)

            model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

            if fio.IsExist(model_file):
                with open(model_file, 'rb') as handle:
                    clf = pickle.load(handle)
            else:
                train_X, train_Y = combine_files(train, features)
                clf = svm.SVR()
                clf.fit(train_X, train_Y)

                with open(model_file, 'wb') as handle:
                    pickle.dump(clf, handle)

            for q in ['q1', 'q2']:
                test_X, test_Y = combine_files(test, features, prompts=[q])
                predict_Y = clf.predict(test_X)

                mse = mean_squared_error(test_Y, predict_Y)

                MSE.append([lec, q, mse])

        output = '../data/%s/simlearning.cv.%s.txt' % (course, name)

        fio.WriteMatrix(output, MSE, header=['lec', 'prompt', 'MSE'])
Example #27
def correlation_analysis(course):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []
    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:

            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for fname in features:
                        x = fdict[fname]

                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)

                    row.append(predict_score[0])

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')

    print(out_correlation)
    fio.WriteMatrix(out_correlation, body, head)
    def test__check_is_spark_data_frame(self):

        df_simple_table = self.spark.read.csv(
            'tests/fixtures/similarity/simple_table.csv', header=True)
        pd_df_simple_table = pd.read_csv(
            'tests/fixtures/similarity/simple_table.csv')

        columns_to_convert = [
            col for col in df_simple_table.columns if 'id' not in col
        ]
        for col in columns_to_convert:
            df_simple_table = df_simple_table.withColumn(
                col,
                f.col(col).cast(IntegerType()))

        Similarity(df_features=df_simple_table)

        with self.assertRaises(AssertionError):
            Similarity(df_features=pd_df_simple_table)
Example #30
    def calculate(self):
        self.allPredicts = np.zeros((4, self.testSize))

        bias = Bias(self.trainData, self.testData)
        bias.calculateBias()
        answers, predicts = bias.predict()
        self.biasClass = bias
        self.allPredicts[0, :] = predicts
        #print("Bias: %f" % evaluationRMSE(answers, predicts))

        similarity = Similarity(self.trainData, self.testData)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        self.similarityClass = similarity
        self.allPredicts[1, :] = predicts
        #print("Similarity: %f" % evaluationRMSE(answers, predicts))

        svd = SVD(self.trainData, self.testData)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        self.svdClass = svd
        self.allPredicts[2, :] = predicts
        #print("SVD: %f" % evaluationRMSE(answers, predicts))

        matFactory = MatFactory(self.trainData, self.testData)
        matFactory.train(10, 11)
        answers, predicts = matFactory.predict()
        self.matFactoryClass = matFactory
        self.allPredicts[3, :] = predicts
        #print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        with open(predictsFile, 'wb') as pickleFile:
            pickle.dump(self.allPredicts, pickleFile)
Example #31
class QueryEngine:
    def __init__(self, path_to_tfcsv):
        self.database = Database(path_to_tfcsv)
        self.dictionary = self.database.get_dictionary()
        self.similarity = Similarity(self.database.documents)
        self.rank_limit = 6
        self.num_leaders = 5
        self.similarity.k_means_cluster(self.num_leaders)

    def handle_query(self, query):
        """ process user search query """
        query = query.lower()
        query_terms = query.split()

        if query_terms[0] == "stop":
            exit(0)
        # only include terms that are in the data set
        query_terms = [x for x in query_terms if x in self.dictionary]
        if not query_terms:
            print("No results for the given query")
            return

        # generate ranked results and print them to console
        ranked_results = sorted(self.similarity.cosine_scores(query_terms),
                                key=lambda x: x[1],
                                reverse=True)
        print("\nQUERY RESULTS: ")
        print("--------------------------------------")

        # include top k results that have a score > 0
        for i, res in enumerate(filter(lambda x: x[1] > 0, ranked_results)):
            if i < self.rank_limit:
                doc_title = res[0]
                cos_score = res[1]
                doc_url = self.database.get_url(doc_title)
                print("[{}] {}\n{}\n(cosine_score={})".format(
                    i + 1, doc_title, doc_url, cos_score))
                print("--------------------------------------")
        print()
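A hedged usage sketch for QueryEngine; the CSV path and query string are placeholders:

# Hypothetical term-frequency CSV; the expected layout is whatever Database parses.
engine = QueryEngine('data/term_frequencies.csv')
engine.handle_query('information retrieval')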
    def test__check_is_numerical_data(self):

        df_numerical = self.spark.read.csv(
            'tests/fixtures/similarity/numerical_data.csv', header=True)

        columns_to_convert = [
            col for col in df_numerical.columns if 'id' not in col
        ]
        df_numerical_int = df_numerical
        df_numerical_float = df_numerical

        for col in columns_to_convert:
            df_numerical_int = df_numerical_int.withColumn(
                col,
                f.col(col).cast(IntegerType()))
            df_numerical_float = df_numerical_float.withColumn(
                col,
                f.col(col).cast(DoubleType()))

        Similarity(df_features=df_numerical_int)
        Similarity(df_features=df_numerical_float)

        with self.assertRaises(AssertionError):
            Similarity(df_features=df_numerical)
Example #33
def generate_walks(edge_list_address, walk_per_node, walk_length, workers=4):
    similarity = Similarity(correlation_file_path=edge_list_address, anchors=[],
                            alphas=[], sep=',', prefix='pseudo')
    genes = list(similarity.idx.keys())
    start_time = time.time()
    gen_walk = WalkGenerator(similarity.matrix, genes, walk_length, walk_per_node)
    print("takes {} seconds to create walk object.".format(
        time.time() - start_time))

    num_cpus = workers
    arguments = list(range(len(gen_walk)))
    chunk_size = len(gen_walk) // num_cpus
    # Context manager ensures the worker pool is shut down cleanly
    with mp.Pool(num_cpus) as pool:
        walks = pool.map(gen_walk, arguments, chunksize=chunk_size)
    return walks
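A minimal call sketch; the edge-list path and walk parameters are placeholders:

# Hypothetical comma-separated correlation edge list.
walks = generate_walks('gene_edges.csv', walk_per_node=10,
                       walk_length=80, workers=4)
print(len(walks), 'walks generated')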
Example #35
def main():

    cats = prepare_categories()

    test_token_freqs = prepare_test_data()

    similarity = Similarity(cats=cats, data=test_token_freqs)

    # Compute each score dict once instead of twice per print
    jaccard_scores = similarity.jaccard()
    cosine_scores = similarity.cosine()
    print(max(jaccard_scores, key=jaccard_scores.get))
    print(max(cosine_scores, key=cosine_scores.get))
Example #36
def main():
    similarity = Similarity()

    pg.init()
    clock = pg.time.Clock()
    clock.tick(30)
    # create screen
    display_width = 960
    display_height = 650
    pos_x = 0
    pos_y = 30
    os.environ['SDL_VIDEO_WINDOW_POS'] = '%i,%i' % (pos_x, pos_y)
    gameDisplay = pg.display.set_mode((display_width, display_height))
    pg.display.set_caption('Pose Dance')

    game = PoseDance(gameDisplay, similarity)
    game.run()
Example #37
    def run(self, corpus_path, test_path, minfreq):
        self._database = self._construct_database(corpus_path)
        before_unique, before_total = self._stas(self._database)
        self._database.apply_minfreq(minfreq)
        after_unique, after_total = self._stas(self._database)
        sim = Similarity(self._database)
        test_phrases = self.IO.read_phrases(test_path)

        with open('trace.txt', 'w', encoding='utf8') as f:
            # Write the head line.
            args = [before_unique, after_unique, before_total, after_total]
            f.write('\n')
            self._write_head(f, args)

            for phrase in test_phrases:
                most_similar = self._find_k_similar(phrase, sim, 5)
                self._write_result(f, phrase, most_similar)
Example #38
class TestSimilarityFunkcions(unittest.TestCase):
    def setUp(self):
        self.sim = Similarity(1,1)

    def test_getIndex(self):
        self.sim.indexes = {}
        self.assertEqual(0, self.sim.getIndex('aa'))
        self.sim.indexes = {'aa':0}
        self.assertEqual(0, self.sim.getIndex('aa'))
        self.sim.indexes = {'aa':0}
        self.assertEqual(1, self.sim.getIndex('bb'))
        self.sim.indexes = {'bb':0,'aa':1}
        self.assertEqual(0, self.sim.getIndex('bb'))
        self.sim.indexes = {'bb':0,'aa':1}
        self.assertEqual(1, self.sim.getIndex('aa'))
        self.sim.indexes = {1:0,'aa':1}
        self.assertEqual(0, self.sim.getIndex(1))
        self.sim.indexes = {'bb':0,'aa':1}
        self.assertEqual(1, self.sim.getIndex('aa'))
Example #39
    def calAll(self):
        self.errs = [0] * 5
        bias = Bias(self.data, self.test)
        bias.calculateBias()
        answers, predicts = bias.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[0] = err
        print("Bias: %f" % err)

        similarity = Similarity(self.data, self.test)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[1] = err
        print("Similarity: %f" % err)

        svd = SVD(self.data, self.test)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[2] = err
        print("SVD: %f" % err)

        matFactory = MatFactory(self.data, self.test)
        matFactory.train(20, 35)
        answers, predicts = matFactory.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[3] = err
        print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        combination = Combination(self.data)
        combination.separateData()
        combination.calculate()
        combination.train(alpha=0.01, iter=10000)
        answers, predicts = combination.predict(self.test)
        err = evaluationRMSE(answers, predicts)
        self.errs[4] = err
        print("Combination: %f" % err)
        return self.errs
Example #40
def find_top_k_similar_program(repo_kernel_file, user_prog_graph_dot_file, graph_name, k, num_iter):
	sim = Similarity()
	sim.read_graph_kernels(repo_kernel_file)
	result_program_list_with_score = sim.find_top_k_similar_graphs(user_prog_graph_dot_file, graph_name, k, num_iter)
	return result_program_list_with_score
Example #41
# data = vstack((rand(5, 2) + array([.5, .5]), rand(5, 2)))
# print(data)


# computing K-Means with K = 3 (3 clusters)
centroids, _ = kmeans(data, 3)
print('centroids')
print(centroids)

# assign each sample to a cluster
idx, cc = vq(data, centroids)
print('index of each species')
print(cc)


# some plotting using numpy's logical indexing
plot(data[idx == 0, 0], data[idx == 0, 1], 'ob',
     data[idx == 1, 0], data[idx == 1, 1], 'or')
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
# show()


for c in c1:
    for c_ in c2:
        coef = Similarity.intersection(c, c_)
#       print('%d -> %d : %f' % (c.code, c_.code, coef))

# Characteristics of a species
# for cs in s2.characters():
#     print(cs[0].name + " -> " + cs[1].label)
Example #43
def main(corpus, annotations):
  """ SUMMARY: use case of the user-driven functionality of PASCALI.
  Scenario: User provides the concept of Sequence and the equivalent Java
  types, and the concept of sorted sequence and the relevant type invariant.
  Goal: learn how to get from Sequence -> Sorted Sequence.
  """

  """
  INPUT: annotations, dictionary mapping string -> list of strings
  OUTPUT: recompiles generic-inference-solver with new annotations"""

  run_pa2checker(annotations)

  """ Look for new mapping from 'ontology concepts'->'java type' and run
  checker framework. Should be implemented in type_inference
  Mapping example:
    Sequence -> java.lang.Array, java.util.List, LinkedHashSet, etc.

  INPUT: corpus, file containing set of concept->java_type mapping
  OUTPUT: Set of jaif files that are merged into the classes. The jaif files are
          stored as default.jaif in each project's directory.
  BODY: This also triggers back-end labeled graph generation.
  """

  for project in corpus:
    run_inference(project)

  """ Missing step: interact with PA to add a definition of Sorted Sequence
  which is a specialization of Sequence that has a sortedness invariants.
  The sortedness invariant gets turned into a Daikon template
  INPUT: user interaction
  OUTPUT: type_annotation and type_invariant (for sorted sequence)

  """

  ordering_operator = "<="

  ontology_invariant_file = "TODO_from_Howie.txt"
  with open(ontology_invariant_file, 'w') as f:
    f.write(ordering_operator)

  invariant_name = "TODO_sorted_sequence"

  daikon_pattern_java_file = ontology_to_daikon.create_daikon_invariant(ontology_invariant_file, invariant_name)


  """ Find all methods that have one input parameter annotated as Sequence and return a variable also
  annotated as Sequence.
  INPUT: The corpus and the desired annotations on the method signature
  OUTPUT: List of methods that have the desired signature.
  NOTE: This is a stub and will be implemented as LB query in the future.
  """
  sig_methods = find_methods_with_signature(corpus, "@ontology.qual.Sequence", ["@ontology.qual.Sequence"])
  print ("\n   ************")
  print ("The following corpus methods have the signature Sequence->Sequence {}:")
  for (project, package, clazz, method) in sig_methods:
    print("{}:\t{}.{}.{}".format(project, package, clazz, method))
  print ("\n   ************")


  """ Search for methods that have a return type annotated with Sequence
  and for which we can establish a sortedness invariant (may done by LB).

  INPUT: dtrace file of project
         daikon_pattern_java_file that we want to check on the dtrace file.

  OUTPUT: list of ppt names that establish the invariant. Here a ppt
  is a Daikon program point, s.a. test01.TestClass01.sort(int[]):::EXIT

  Note: this step translate the type_invariant into a Daikon
  template (which is a Java file).
  """

  pattern_class_name = invariant_name
  pattern_class_dir = os.path.join(common.WORKING_DIR, "invClass")
  if os.path.isdir(pattern_class_dir):
    shutil.rmtree(pattern_class_dir)
  os.mkdir(pattern_class_dir)

  cmd = ["javac", "-g", "-classpath", common.get_jar('daikon.jar'),
         daikon_pattern_java_file, "-d", pattern_class_dir]
  common.run_cmd(cmd)

  list_of_methods = []
  for project in corpus:
    dtrace_file = backend.get_dtrace_file_for_project(project)
    if not dtrace_file:
      print ("Ignoring folder {} because it does not contain dtrace file".format(project))
      continue
    ppt_names = inv_check.find_ppts_that_establish_inv(dtrace_file, pattern_class_dir, pattern_class_name)
    methods = set()
    for ppt in ppt_names:
      method_name = ppt[:ppt.find(':::EXIT')]
      methods.add(method_name)
    list_of_methods += [(project, methods)]

  print ("\n   ************")
  print ("The following corpus methods return a sequence sorted by {}:".format(ordering_operator))
  for project, methods in list_of_methods:
    if len(methods) > 0:
      print(project)
      for m in methods:
        print("\t{}".format(m))
  print ("\n   ************")

  shutil.rmtree(pattern_class_dir)

  """ Expansion of dynamic analysis results ....
  Find a list of similar methods that are similar to the ones found above (list_of_methods).
  INPUT: list_of_methods, corpus with labeled graphs generated, threshold value for similarity,
  OUTPUT: superset_list_of_methods
  """

  # WENCHAO
  print("Expanding the dynamic analysis results using graph-based similarity:")
  union_set = set()
  for project, methods in list_of_methods:
    # map Daikon output on sort method to method signature in methods.txt in generated graphs
    for m in methods:
      method_name = common.get_method_from_daikon_out(m)
      #kernel_file = common.get_kernel_path(project)
      method_file = common.get_method_path(project)
      dot_name = common.find_dot_name(method_name, method_file)
      if dot_name:
        # find the right dot file for each method
        dot_file = common.get_dot_path(project, dot_name)
        # find all graphs that are similar to it using WL based on some threshold
        sys.path.insert(0, 'simprog')
        from similarity import Similarity
        sim = Similarity()
        sim.read_graph_kernels("corpus_kernel.txt")
        top_k = 3
        iter_num = 3
        result_program_list_with_score = sim.find_top_k_similar_graphs(dot_file, 'g', top_k, iter_num)
        print(project+":")
        print(result_program_list_with_score)
        result_set = set([x[0] for x in result_program_list_with_score])
        # take the union of all these graphs
        union_set = union_set | result_set
  print("Expanded set:")
  print([x.split('/')[-4] for x in union_set])

  # return this set as a list of (project, method)
  fo = open("methods.txt", "w")
  expanded_list = []
  for dot_path in union_set:
    method_summary = common.get_method_summary_from_dot_path(dot_path)
    fo.write(method_summary)
    fo.write("\n")
  fo.close()

  """ Update the type annotations for the expanded dynamic analysis results.
  INPUT: superset_list_of_methods, annotation to be added
  OUTPUT: nothing
  EFFECT: updates the type annotations of the methods in superset_list_of_methods.
  This requires some additional checks to make sure that the methods actually
  perform some kind of sorting. Note that we do it on the superset because the original
  list_of_methods might miss many implementations because fuzz testing could not
  reach them.
  """
  for class_file in []: # MARTIN
    generated_jaif_file = "TODO"
    insert_jaif.merge_jaif_into_class(class_file, generated_jaif_file)


  """ Ordering of expanded dynamic analysis results ....
  Find the k 'best' implementations in superset of list_of_methods
  INPUT: superset_list_of_methods, corpus, k
  OUTPUT: k_list_of_methods
  Note: similarity score is used. may consider using other scores; e.g., TODO:???
  """

  #TODO: create input file for huascar where each line is formatted like:
  # ../corpus/Sort05/src/Sort05.java::sort(int[]):int[]

  ordering_dir = os.path.join(common.WORKING_DIR, "ordering_results/")

  methods_file = os.path.join(common.WORKING_DIR, 'methods.txt')
  with common.cd(ordering_dir):
    #TODO generate a proper relevant methods file.
    cmd = ["./run.sh",
           "-k", "3",
           "-t", "typicality",
           "-f", methods_file]
    common.run_cmd(cmd, print_output=True)

  """
  Close the loop and add the best implementation found in the previous
  step back to the ontology.
  INPUT: k_list_of_methods
  OUTPUT: patch file for the ontology. Worst case: just add the 'best' implementation
  found in the corpus as a blob to the ontology. Best case: generate an equivalent
  flow-graph in the ontology.
  """
  print "TODO" # ALL