def test__check_nulls_in_feature_columns(self):
    """Similarity must accept null-free features and reject features with nulls."""
    df_with_nulls = self.spark.read.csv(
        'tests/fixtures/similarity/nulls_features.csv', header=True)
    df_without_nulls = self.spark.read.csv(
        'tests/fixtures/similarity/no_nulls_features.csv', header=True)

    # Cast every non-id column to integers on both frames.
    for column in [c for c in df_with_nulls.columns if 'id' not in c]:
        df_with_nulls = df_with_nulls.withColumn(
            column, f.col(column).cast(IntegerType()))
    for column in [c for c in df_without_nulls.columns if 'id' not in c]:
        df_without_nulls = df_without_nulls.withColumn(
            column, f.col(column).cast(IntegerType()))

    # Clean feature data must construct without raising ...
    Similarity(df_features=df_without_nulls)

    # ... while nulls in feature columns must trigger an assertion.
    with self.assertRaises(AssertionError):
        Similarity(df_features=df_with_nulls)
    def test_generate(self):
        """generate() yields a square matrix per metric; unknown metrics raise."""
        df_features = self.spark.read.csv(
            'tests/fixtures/similarity/features.csv', header=True)

        # Cast all non-id columns to integers before feeding Similarity.
        df_features_int = df_features
        for column in [c for c in df_features.columns if 'id' not in c]:
            df_features_int = df_features_int.withColumn(
                column, f.col(column).cast(IntegerType()))

        n_rows = df_features.count()

        # Both supported metrics must produce an n_rows x n_rows matrix.
        for metric in ('cosine', 'euclidean'):
            pd_df_similarity, _ = Similarity(
                df_features=df_features_int,
                similarity_type=metric).generate()
            self.assertEqual(pd_df_similarity.shape[0], n_rows)
            self.assertEqual(pd_df_similarity.shape[1], n_rows)

        # An unsupported similarity type must fail at generate() time.
        similarity_fail = Similarity(df_features=df_features_int,
                                     similarity_type='test')
        with self.assertRaises(ValueError):
            similarity_fail.generate()
예제 #3
0
 def __init__(self, path_to_tfcsv):
     """Load the term-frequency database and pre-cluster its documents.

     :param path_to_tfcsv: path to the term-frequency CSV backing Database.
     """
     self.database = Database(path_to_tfcsv)
     self.dictionary = self.database.get_dictionary()
     self.similarity = Similarity(self.database.documents)
     # presumably the number of ranked results returned per query — confirm
     self.rank_limit = 6
     # number of cluster leaders for k-means (used immediately below)
     self.num_leaders = 5
     self.similarity.k_means_cluster(self.num_leaders)
예제 #4
0
 def __init__(self):
     """Pre-load and resize face images per ikon category, plus a bling icon."""
     self.faces = {}
     self.similarity = Similarity()
     self.bling = Image.open('images/' + 'bling.png').resize((50, 50))
     # ikon_categories maps category key -> image basename (without .png)
     for i in ikon_categories.keys():
         self.faces[i] = Image.open('images/' + ikon_categories[i] +
                                    '.png').resize((60, 60))
예제 #5
0
    def build_sim_matrix(self, sentence_list, logger):
        """Build a symmetric pairwise similarity matrix for the sentences.

        Entry (i, j) is the similarity score between sentences i and j,
        rounded to two decimals; the diagonal is fixed at 1.0.

        :param sentence_list: sentences to compare pairwise.
        :param logger: logger used to report per-sentence progress.
        :return: len(sentence_list) x len(sentence_list) numpy array.
        """
        sim = Similarity()
        self.sentences = sentence_list
        n = len(self.sentences)
        sim_matrix = np.empty([n, n])

        for i in range(0, n):
            logger.info('Processing sentence # {} => {}'.format(
                i, self.sentences[i]))
            for j in range(i + 1, n):
                s1 = self.sentences[i]
                s2 = self.sentences[j]
                try:
                    score = sim.calculate_similarity_score(s1, s2)
                except ZeroDivisionError:
                    # Fix: treat a degenerate pair as zero similarity.
                    # Previously the exception left `score` unbound on the
                    # first pair (NameError) or silently reused the previous
                    # pair's value.
                    score = 0.0

                sim_matrix[i][j] = round(score, 2)
                # Similarity is symmetric.
                sim_matrix[j][i] = sim_matrix[i][j]

            # A sentence is maximally similar to itself.
            sim_matrix[i][i] = 1.00

        return sim_matrix
    def calculate(self):
        """Run the four recommenders on the train/test split and persist results.

        Stacks the predictions of Bias, Similarity, SVD and MatFactory into
        self.allPredicts (one row per model, self.testSize columns), keeps
        each fitted model on self, and pickles the matrix to predictsFile.
        """
        self.allPredicts = np.zeros((4, self.testSize))

        # Row 0: bias-based baseline predictions.
        bias = Bias(self.trainData, self.testData)
        bias.calculateBias()
        answers, predicts = bias.predict()
        self.biasClass = bias
        self.allPredicts[0, :] = predicts

        # Row 1: similarity-based predictions (bias plus a similarity matrix).
        similarity = Similarity(self.trainData, self.testData)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        self.similarityClass = similarity
        self.allPredicts[1, :] = predicts

        # Row 2: SVD-based predictions.
        svd = SVD(self.trainData, self.testData)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        self.svdClass = svd
        self.allPredicts[2, :] = predicts

        # Row 3: matrix factorization; train(10, 11) — confirm the argument
        # meaning (factors/epochs?) against MatFactory.train.
        matFactory = MatFactory(self.trainData, self.testData)
        matFactory.train(10, 11)
        answers, predicts = matFactory.predict()
        self.matFactoryClass = matFactory
        self.allPredicts[3, :] = predicts

        # Fix: context manager flushes and closes the pickle file; the
        # original left the handle open.
        with open(predictsFile, 'wb') as pickle_file:
            pickle.dump(self.allPredicts, pickle_file)
예제 #7
0
def combine_files(lectures, features=None, prompts=None):
    """Collect feature rows (X) and gold scores (Y) from both annotators.

    :param lectures: iterable of lecture ids to load.
    :param features: ordered feature names to extract; defaults to all
        features known to Similarity, sorted.
    :param prompts: prompt ids to read per lecture; defaults to ['q1', 'q2'].
        (Was a mutable default argument; replaced with a None sentinel.)
    :return: (X, Y) — list of feature rows and the parallel list of scores.
    """
    if prompts is None:
        prompts = ['q1', 'q2']

    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    X = []
    Y = []

    # Fix: identity comparison with None (`== None` is non-idiomatic and
    # can be hijacked by __eq__).
    if features is None:
        sim_extractor = Similarity()
        features = sorted(sim_extractor.features.keys())

    for lec in lectures:
        for q in prompts:
            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []
                    for name in features:
                        x = fdict[name]
                        # NaN feature values are serialized as 'nan'; zero them.
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)

                    X.append(row)
                    Y.append(score)

    return X, Y
예제 #8
0
def find_top_k_similar_program(repo_kernel_file, user_prog_graph_dot_file,
                               graph_name, k, num_iter, cluster_json):
    """Return the top-k repository programs most similar to the user's graph.

    Loads precomputed graph kernels from `repo_kernel_file`, then ranks them
    against the graph named `graph_name` in the given dot file.
    """
    similarity = Similarity()
    similarity.read_graph_kernels(repo_kernel_file)
    return similarity.find_top_k_similar_graphs(
        user_prog_graph_dot_file, graph_name, k, num_iter, cluster_json)
예제 #9
0
def gather_performance(output):
    """Aggregate per-feature cross-validation results into one summary matrix.

    Reads one result file per single feature plus one for all features
    combined, averages every metric column, and writes the summary to
    `output` via fio.WriteMatrix.
    """
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    allbody = []
    # One pass per individual feature, plus a final all-features pass.
    for k in range(len(allfeatures) + 1):
        use_all = k == len(allfeatures)
        features = allfeatures if use_all else [allfeatures[k]]
        name = '_'.join(features)

        resultfile = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)
        head, body = fio.ReadMatrix(resultfile, hasHead=True)

        # Average each metric column; the first two columns are identifiers.
        allhead = ['name'] + head[2:]
        average = [name]
        for col in range(2, len(head)):
            average.append(np.mean([float(row[col]) for row in body]))

        allbody.append(average)

    fio.WriteMatrix(output, allbody, allhead)
예제 #10
0
    def contentBasedFiltering(self, key, n=3):
        '''Return list of n top match scores along with other keys.

        Compares `key` against every other entry in self.dataset using
        Pearson similarity over their common inner keys.

        :param key: dataset entry to find matches for.
        :param n: number of top (score, other_key) pairs to return.
        :return: list of (similarity, other_key) tuples, best first.
        '''
        dataset = self.dataset
        scores = []

        # Fix: one Similarity instance serves every comparison; the original
        # constructed a fresh one on each loop iteration.
        sim = Similarity()

        for other_key in dataset:
            if other_key == key:
                continue

            # Common inner keys drive the similarity score.
            common_inner_keys = self.fetchCommonInnerKeys(key, other_key)

            # Nothing in common -> nothing to compare; skip this candidate.
            if len(common_inner_keys) == 0:
                continue

            x = [dataset[key][inner_key] for inner_key in common_inner_keys]
            y = [
                dataset[other_key][inner_key]
                for inner_key in common_inner_keys
            ]

            scores.append((sim.pearson(x, y), other_key))

        # Highest similarity first (replaces sort() followed by reverse()).
        scores.sort(reverse=True)

        return scores[0:n]
예제 #11
0
def train_IE256_svm(traincourse, model_dir, name='simlearn_cv'):
    """Train (or load a cached) SVM classifier over all similarity features.

    :param traincourse: course id; 'IE256' selects lectures 14-25 minus 22,
        anything else selects lectures 3-26.
    :param model_dir: directory where the pickled model is cached.
    :param name: seed name; overwritten by the joined feature names.
    :return: the fitted (or unpickled) classifier. (The original returned
        None; returning clf is backward-compatible.)
    """
    sim_extractor = Similarity()
    features = sorted(sim_extractor.features.keys())
    name = '_'.join(features)

    # (Removed unused locals: `lectures` and `dict`, which shadowed the
    # builtin and were never read.)
    if traincourse == 'IE256':
        train = [x for x in range(14, 26) if x != 22]
    else:
        train = [x for x in range(3, 27)]

    model_file = os.path.join(model_dir, '%s_%s.model' % (traincourse, name))

    if fio.IsExist(model_file):
        # Reuse the cached model instead of retraining.
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)
    else:
        train_X, train_Y = combine_files_course(traincourse, train, features)
        clf = svm.SVC()
        clf.fit(train_X, train_Y)

        with open(model_file, 'wb') as handle:
            pickle.dump(clf, handle)

    return clf
예제 #12
0
def extractPhrasePaireFeature(phrasedir):
    """Extract Similarity features for every ordered phrase pair per prompt.

    For each lecture and prompt, loads the extracted phrase list and saves a
    JSON feature set of (feature dict, placeholder score 0.0, {'p1', 'p2'})
    for every ordered pair of phrases. Python 2 module (print statement).

    :param phrasedir: root directory containing per-lecture phrase folders.
    """
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            # Output naming for this prompt's feature dump.
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))

            phrases = fio.LoadList(phrasefile)

            # All ordered pairs, including p1 == p2; score is a 0.0 placeholder.
            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0, {
                            'p1': p1,
                            'p2': p2
                        }))

            fio.SaveDict2Json(featureset, filename)

            # presumably persists cached extractor state — confirm in Similarity.save
            feature_extractor.save()
예제 #13
0
 def similarity_action(self):
     """Open the Similarity dialog and report the computed column similarity."""
     dialog = Similarity(parent=self, df=self.table)
     # Bail out if the user cancelled the dialog.
     if not dialog.exec_():
         return
     res = dialog.execute()
     QMessageBox.information(
         self, f'Similarity: {dialog.method}',
         f'Columns {dialog.first_column} and {dialog.second_column} have a similarity value of {res}',
         QMessageBox.Ok)
예제 #14
0
def train_leave_one_lecture_out(model_dir, name='simlearn_cv'):
    """Leave-one-lecture-out cross-validation of an SVR similarity model.

    For each lecture: trains (or loads a cached) svm.SVR on all other
    lectures' phrase-pair features, predicts the held-out lecture's q1/q2
    scores, and records per-(lecture, prompt) mean squared error to a
    result file. Python 2 module (print statements).

    :param model_dir: directory where per-lecture models are cached.
    :param name: seed name; overwritten by the joined feature names.
    """
    #     model_dir = '../data/IE256/%s/model/%s/'%(system, name)
    #     fio.NewPath(model_dir)
    #
    #     outputdir = '../data/IE256/%s/extraction/%s_output/'%(system, name)
    #     fio.NewPath(outputdir)

    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    # `if True:` is a leftover from the commented-out per-feature loop below;
    # with k == len(allfeatures) the all-features branch always runs.
    if True:
        k = len(allfeatures)
        #for k in range(len(allfeatures)+1):
        #features = allfeatures#['WordEmbedding']

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]

        name = '_'.join(features)

        lectures = annotation.Lectures

        # NOTE(review): `dict` shadows the builtin and is never used here.
        dict = defaultdict(int)

        MSE = []
        for i, lec in enumerate(lectures):
            # Hold out `lec`; train on everything else.
            train = [x for x in lectures if x != lec]
            test = [lec]

            print train
            print test

            model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

            if fio.IsExist(model_file):
                # Reuse the cached per-lecture model.
                with open(model_file, 'rb') as handle:
                    clf = pickle.load(handle)
            else:
                train_X, train_Y = combine_files(train, features)
                clf = svm.SVR()
                clf.fit(train_X, train_Y)

                with open(model_file, 'wb') as handle:
                    pickle.dump(clf, handle)

            # Evaluate each prompt of the held-out lecture separately.
            for q in ['q1', 'q2']:
                test_X, test_Y = combine_files(test, features, prompts=[q])
                predict_Y = clf.predict(test_X)

                mse = mean_squared_error(test_Y, predict_Y)

                MSE.append([lec, q, mse])

        output = '../data/%s/simlearning.cv.%s.txt' % (course, name)

        fio.WriteMatrix(output, MSE, header=['lec', 'prompt', 'MSE'])
예제 #15
0
def correlation_analysis(course):
    """Dump per-phrase-pair feature rows with gold and predicted scores.

    For each lecture and prompt, loads both annotators' phrase-pair feature
    files, loads the lecture's cached regression model, and appends a row of
    [feature values..., gold score, predicted score]. The combined matrix is
    written to ../data/<course>/simlearning/data.txt. Python 2 module
    (print statement).

    :param course: course id used to locate the annotators' phrase dirs.
    """
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []
    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):

        # NOTE(review): model_dir is not defined in this function —
        # presumably a module-level global; confirm.
        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:

            # NOTE(review): outfile is computed but never used below.
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for fname in features:
                        x = fdict[fname]

                        # NaN feature values are serialized as 'nan'; zero them.
                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)

                    row.append(predict_score[0])

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')

    print out_correlation
    fio.WriteMatrix(out_correlation, body, head)
    def test__check_is_spark_data_frame(self):
        """Similarity accepts a Spark DataFrame but rejects a pandas one."""
        df_simple_table = self.spark.read.csv(
            'tests/fixtures/similarity/simple_table.csv', header=True)
        pd_df_simple_table = pd.read_csv(
            'tests/fixtures/similarity/simple_table.csv')

        # Cast every non-id column of the Spark frame to integers.
        for column in [c for c in df_simple_table.columns if 'id' not in c]:
            df_simple_table = df_simple_table.withColumn(
                column, f.col(column).cast(IntegerType()))

        # Spark input: constructs without raising.
        Similarity(df_features=df_simple_table)

        # Pandas input: must be rejected.
        with self.assertRaises(AssertionError):
            Similarity(df_features=pd_df_simple_table)
예제 #17
0
def main():
    """Classify the test data against prepared categories.

    Prints the best-matching category under the Jaccard measure, then under
    the cosine measure.
    """
    cats = prepare_categories()

    test_token_freqs = prepare_test_data()

    similarity = Similarity(cats=cats, data=test_token_freqs)

    # Fix: each similarity mapping was computed twice (once for max(), once
    # for .get); compute each once.
    jaccard_scores = similarity.jaccard()
    print(max(jaccard_scores, key=jaccard_scores.get))

    cosine_scores = similarity.cosine()
    print(max(cosine_scores, key=cosine_scores.get))
예제 #18
0
 def __init__(self):
     """Wire up the sentence-similarity pipeline components and defaults."""
     self.m_preprocessor = Preprocessor()
     self.m_similarity = Similarity()
     self.m_plt = Plot()
     self.m_evaluator = Evaluator()
     # Dataset file stem to process (STS benchmark train split, presumably).
     self.m_file = "sts-train"
     # Similarity metric name and information-content corpus — presumably
     # WordNet-style settings; confirm against Similarity's usage.
     self.m_metric = "path"
     self.m_ic = "brown"
     # Metric used for word2vec vectors.
     self.m_metric_w2v = "cosine"
     self.m_metric_t = "path"
     # Threshold value; units unclear from here — confirm against callers.
     self.m_thr = 20
     self.m_mode = "ontology"
예제 #19
0
    def collaborativeRecommendation(self, key, n=3):
        '''Return list of n top match scores along with inner keys.

        Produces similarity-weighted estimates for inner keys that `key`
        lacks (or has at 0), based on every other entry's values.

        :param key: dataset entry to produce recommendations for.
        :param n: number of top (estimate, inner_key) pairs to return.
        :return: list of (estimated value, inner_key) tuples, best first.
        '''
        dataset = self.dataset
        weighted_inner_values = {}
        total_scores = {}

        # Fix: one Similarity instance serves every comparison; the original
        # constructed a fresh one on each loop iteration.
        sim = Similarity()

        for other_key in dataset:
            if other_key == key:
                continue

            # Common inner keys drive the similarity score.
            common_inner_keys = self.fetchCommonInnerKeys(key, other_key)

            # If there is no common inner key, skip this other key.
            if len(common_inner_keys) == 0:
                continue

            x = [dataset[key][inner_key] for inner_key in common_inner_keys]
            y = [
                dataset[other_key][inner_key]
                for inner_key in common_inner_keys
            ]

            score = sim.pearson(x, y)

            # Ignore scores of zero or below.
            if score <= 0:
                continue

            # Accumulate only inner keys `key` lacks or has at 0.
            for inner_key in dataset[other_key]:
                if inner_key not in dataset[key] or dataset[key][
                        inner_key] == 0:
                    # Weighted sum of value times similarity score.
                    weighted_inner_values.setdefault(inner_key, 0)
                    weighted_inner_values[
                        inner_key] += score * dataset[other_key][inner_key]

                    # Sum of similarity score (the normalizer below).
                    total_scores.setdefault(inner_key, 0)
                    total_scores[inner_key] += score

        scores = [(weighted_inner_values[inner_key] / total_scores[inner_key],
                   inner_key) for inner_key in weighted_inner_values]

        # Highest estimate first (replaces sort() followed by reverse()).
        scores.sort(reverse=True)

        return scores[0:n]
예제 #20
0
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    """Build labeled phrase-pair feature sets from human annotations.

    Phrase pairs drawn from the same annotation rank get score 1.0; pairs
    from different ranks get 0.0. Each pair's Similarity features are saved
    to JSON per lecture/prompt. Python 2 module (print statements).
    NOTE(review): parameter `id` shadows the builtin and is unused here —
    kept for interface compatibility.

    :param phrasedir: root directory for per-lecture output folders.
    :param annotators: annotator set passed to annotation.generate_all_files.
    :param id: unused.
    """
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            # Output naming for this prompt's feature dump.
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #positive examples
            # Same rank -> positive pair (1.0); different rank -> negative (0.0).
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1,
                                                                p2), score, {
                                                                    'p1': p1,
                                                                    'p2': p2
                                                                }))

            fio.SaveDict2Json(featureset, filename)

            # presumably persists cached extractor state — confirm in Similarity.save
            feature_extractor.save()
예제 #21
0
	def __init__(self, messages, model, questions: set, answers: set,
				 pc_questions: dict, pc_answers: dict, tokenizer):
		"""Store chatbot state and build the question/answer Similarity index.

		:param messages: conversation messages handled by this object.
		:param model: model used together with `tokenizer` — presumably for
			scoring candidate replies; confirm against callers.
		:param questions: known question strings.
		:param answers: known answer strings.
		:param pc_questions: mapping of preprocessed questions — confirm format.
		:param pc_answers: mapping of preprocessed answers — confirm format.
		:param tokenizer: tokenizer paired with `model`.
		"""
		self.questions = questions
		self.answers = answers
		self.pc_questions = pc_questions
		self.pc_answers = pc_answers
		self.tokenizer = tokenizer
		self.model = model
		self.messages = messages
		self.pp = PreProcessing()
		self.s = Similarity(questions=self.questions,
							answers=self.answers
							)
예제 #22
0
파일: dangle.py 프로젝트: klovens/juxtapose
def main(experiment_name, phenotypes, data_directory, anchor_genes,
         num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3,
         max_dangle_size=10, test_ratio=0.5):
    """Build anchored similarity networks with synthetic "dangle" backbones.

    For each replicate: shuffles the anchor genes, splits the first
    `num_anchors` of them into train/test groups, and builds a backbone
    from the train anchors. Train/test anchor groups are written to CSV
    under `experiment_name`. Then, for every phenotype x replicate, a
    Similarity network is built from the phenotype's expression CSV,
    thresholded, augmented with the replicate's backbone, and saved.

    :param experiment_name: output directory for anchors and networks.
    :param phenotypes: list of phenotype names; '<pheno>.csv' must exist
        under `data_directory`.
    :param data_directory: directory holding the per-phenotype CSVs.
    :param anchor_genes: candidate anchor gene ids (shuffled in place!).
    :param num_replicates: number of independent replicates to generate.
    :param percent: edge percentage passed to build_backbone.
    :param num_anchors: anchors drawn per replicate before the split.
    :param min_dangle_size: inclusive lower bound on dangle sizes.
    :param max_dangle_size: exclusive upper bound on dangle sizes.
    :param test_ratio: fraction of anchors held out as the test group.
    """
    assert isinstance(phenotypes, list)
    # One random dangle size per test anchor slot.
    alphas = random.choices(range(min_dangle_size, max_dangle_size),
                            k=int(num_anchors * test_ratio))
    assert len(alphas) < len(anchor_genes)
    anchor_train_groups = []
    anchor_test_groups = []
    backbones = []
    # Create all backbones
    for rep_id in range(num_replicates):
        # NOTE(review): shuffles the caller's list in place.
        random.shuffle(anchor_genes)
        candidates = anchor_genes[:int(num_anchors)]
        genes_of_interest_train, genes_of_interest_test = train_test_split(
            candidates,
            shuffle=True,
            test_size=test_ratio)

        anchor_train_groups.append(genes_of_interest_train)
        anchor_test_groups.append(genes_of_interest_test)
        backbones.append(
            build_backbone(anchors=anchor_train_groups[rep_id], alphas=alphas,
                           weight=1, edge_percentage=percent))
    # Write train anchors to file
    with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout:
        for gene_group in anchor_train_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Write test anchors to file
    with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout:
        for gene_group in anchor_test_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Adding the backbones and create the similarity object
    for pheno in phenotypes:
        file_name = os.path.join(data_directory, "{}.csv".format(pheno))
        for rep_id in range(num_replicates):
            sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id))
            out_address = os.path.join(experiment_name, sim_file_name)
            similarity = Similarity(file_name,
                                    anchors=anchor_train_groups[rep_id],
                                    alphas=alphas, string_id=True)
            similarity.transform()
            # Zero out correlations in the (0.2, 0.8) band.
            similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8,
                                       value=0)
            similarity.augment(backbones[rep_id])

            similarity.to_csv(out_address)
예제 #23
0
def generate_walks(edge_list_address, walk_per_node, walk_length, workers = 4):
    """Generate random walks over the similarity graph, in parallel.

    :param edge_list_address: path to the correlation/edge-list CSV.
    :param walk_per_node: number of walks started from each gene.
    :param walk_length: length of every walk.
    :param workers: number of worker processes (default 4).
    :return: list of walks produced by the WalkGenerator tasks.
    """
    similarity = Similarity(correlation_file_path=edge_list_address, anchors=[],
                            alphas=[], sep=',', prefix='pseudo')
    genes = list(similarity.idx.keys())
    start_time = time.time()
    gen_walk = WalkGenerator(similarity.matrix, genes, walk_length, walk_per_node)
    print("takes {} seconds to create walk object.".format(
        time.time() - start_time))

    arguments = list(range(len(gen_walk)))
    # Fix: chunksize must be >= 1 — integer division yields 0 when there are
    # fewer walk tasks than workers, which makes pool.map raise.
    chunk_size = max(1, len(gen_walk) // workers)
    # Fix: context manager tears the worker processes down; the original
    # leaked the pool.
    with mp.Pool(workers) as pool:
        walks = pool.map(gen_walk, arguments, chunksize=chunk_size)
    return walks
    def test__check_is_numerical_data(self):
        """Similarity accepts int or float feature columns but not raw strings."""
        df_numerical = self.spark.read.csv(
            'tests/fixtures/similarity/numerical_data.csv', header=True)

        non_id_columns = [c for c in df_numerical.columns if 'id' not in c]

        # Build one integer-typed and one double-typed copy of the frame.
        df_numerical_int = df_numerical
        df_numerical_float = df_numerical
        for column in non_id_columns:
            df_numerical_int = df_numerical_int.withColumn(
                column, f.col(column).cast(IntegerType()))
            df_numerical_float = df_numerical_float.withColumn(
                column, f.col(column).cast(DoubleType()))

        # Both numeric variants construct fine.
        Similarity(df_features=df_numerical_int)
        Similarity(df_features=df_numerical_float)

        # The raw (string-typed) frame must be rejected.
        with self.assertRaises(AssertionError):
            Similarity(df_features=df_numerical)
    def test__convert_to_long_format(self):
        """Wide-to-long conversion keeps every cell, addressable by id pair."""
        pd_df_similarities_wide = pd.read_csv(
            'tests/fixtures/similarity/similarities_wide.csv', index_col=0)

        df_simple_table = self.spark.read.csv(
            'tests/fixtures/similarity/simple_table.csv', header=True)
        for column in [c for c in df_simple_table.columns if 'id' not in c]:
            df_simple_table = df_simple_table.withColumn(
                column, f.col(column).cast(IntegerType()))
        similarity = Similarity(df_features=df_simple_table)

        pd_df_similarities_long = similarity._convert_to_long_format(
            pd_df_similarities_wide)

        # Every wide cell becomes exactly one long row.
        self.assertEqual(
            pd_df_similarities_long.shape[0],
            pd_df_similarities_wide.shape[0] *
            pd_df_similarities_wide.shape[1])

        # Spot-check individual cells. Note the asymmetry after conversion:
        # recipe_id_1 is numeric while recipe_id_2 remains a string.
        expected_cells = [(1, '3', 9), (3, '1', 6), (2, '3', 1), (1, '2', 6)]
        for id_1, id_2, expected in expected_cells:
            mask = ((pd_df_similarities_long['recipe_id_1'] == id_1)
                    & (pd_df_similarities_long['recipe_id_2'] == id_2))
            actual = pd_df_similarities_long.loc[mask]['similarity'].values[0]
            self.assertEqual(actual, expected)
예제 #26
0
def correlation_analysis_noduplicate():
    """Dump phrase-pair feature rows plus gold scores, skipping self-pairs.

    Like correlation analysis, but pairs whose two phrases are identical
    (p1 == p2) are excluded, and no model predictions are added — each row
    is [feature values..., gold score]. Output goes to
    ../data/<course>/simlearning/data.txt. Python 2 module (print statement).
    """
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score']
    body = []
    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        for q in ['q1', 'q2']:

            # NOTE(review): outfile is computed but never used below.
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                # NOTE(review): loop variable `pd` shadows the usual pandas
                # alias used elsewhere in this module.
                for fdict, score, pd in data:
                    # Skip duplicate pairs (a phrase compared with itself).
                    if pd['p1'] == pd['p2']:
                        print pd['p1']
                        continue

                    row = []

                    for name in features:
                        x = fdict[name]

                        # NaN feature values are serialized as 'nan'; zero them.
                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)
                    row.append(score)

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    fio.WriteMatrix(out_correlation, body, head)
예제 #27
0
def main():
    """Boot pygame, open the game window, and run the Pose Dance game."""
    similarity = Similarity()

    pg.init()
    clock = pg.time.Clock()
    clock.tick(30)

    # Pin the window position before the display surface is created.
    window_width, window_height = 960, 650
    window_x, window_y = 0, 30
    os.environ['SDL_VIDEO_WINDOW_POS'] = '%i,%i' % (window_x, window_y)
    gameDisplay = pg.display.set_mode((window_width, window_height))
    pg.display.set_caption('Pose Dance')

    game = PoseDance(gameDisplay, similarity)
    game.run()
예제 #28
0
def predict_IE256(train_course, model_dir, phrasedir, modelname='svm'):
    """Predict phrase-pair similarity for every lecture with a cached model.

    Loads the classifier trained on `train_course`, predicts a score for
    every ordered phrase pair of each prompt, and writes the predictions as
    a phrase-by-phrase matrix next to the phrase files. Python 2 module
    (print statement).

    :param train_course: course whose cached model is loaded.
    :param model_dir: directory containing the pickled model.
    :param phrasedir: root directory with per-lecture phrase folders.
    :param modelname: suffix used in the output file name (default 'svm').
    """
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    features = allfeatures

    name = '_'.join(features)

    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        test = [lec]

        print test
        model_file = os.path.join(model_dir,
                                  '%s_%s.model' % (train_course, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            test_X, test_Y = combine_files_test(phrasedir,
                                                test,
                                                features,
                                                prompts=[q])
            predict_Y = clf.predict(test_X)

            #write the output
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            # Predictions are flat, one per ordered phrase pair.
            assert (len(predict_Y) == len(phrases) * len(phrases))

            # Reshape the flat prediction vector into a phrase x phrase matrix.
            k = 0
            body = []
            for p1 in phrases:
                row = []
                for p2 in phrases:
                    row.append(predict_Y[k])
                    k += 1
                body.append(row)

            output = os.path.join(path, "%s.%s.%s" % (q, method, modelname))
            fio.WriteMatrix(output, body, phrases)
예제 #29
0
    def run(self, corpus_path, test_path, minfreq):
        """Build the database, prune rare terms, and trace the 5 most similar
        phrases for every test phrase into trace.txt."""
        self._database = self._construct_database(corpus_path)
        before_unique, before_total = self._stas(self._database)
        self._database.apply_minfreq(minfreq)
        after_unique, after_total = self._stas(self._database)

        sim = Similarity(self._database)
        test_phrases = self.IO.read_phrases(test_path)

        with open('trace.txt', 'w', encoding='utf8') as trace:
            # Head line: vocabulary stats before/after the min-freq filter.
            args = [before_unique, after_unique, before_total, after_total]
            trace.write('\n')
            self._write_head(trace, args)

            for phrase in test_phrases:
                most_similar = self._find_k_similar(phrase, sim, 5)
                self._write_result(trace, phrase, most_similar)
예제 #30
0
def get_best_indices(list, sin_val):
    '''Slide a window of len(sin_val) over `list` and find the best window
    per distance measure.

    The measures are Euclidean, Manhattan, Minkowski (p=3), Cosine and
    Jaccard distances, all compared against the reference window `sin_val`;
    lower is better for every measure.

    :param list: sequence of values to scan (parameter name kept for
        backward compatibility although it shadows the builtin).
    :param sin_val: reference sine-wave window to compare against.
    :return: dict mapping measure name to
        [best_value, window_start_index, window_end_index].
    '''
    measures = Similarity()
    size = len(sin_val)
    series = list  # local alias so the shadowed builtin name isn't reused

    # One scorer per measure replaces five copy-pasted comparison blocks.
    scorers = {
        "Euclidean": lambda w: measures.euclidean_distance(w, sin_val),
        "Manhattan": lambda w: measures.manhattan_distance(w, sin_val),
        "Minkowski": lambda w: measures.minkowski_distance(w, sin_val, 3),
        "Cosine": lambda w: measures.cosine_similarity(w, sin_val),
        "Jaccard": lambda w: measures.jaccard_similarity(w, sin_val),
    }

    # Fix: float('inf') replaces the 9999999999 sentinels, which silently
    # missed windows whose distance exceeded the magic constant. The index
    # placeholders are kept for compatibility when no window is scanned.
    local_optima = {name: [float("inf"), 9999999, 99999999]
                    for name in scorers}

    for i in range(len(series) - size):
        window = series[i:i + size]
        for name, scorer in scorers.items():
            val = scorer(window)
            if val <= local_optima[name][0]:
                local_optima[name] = [val, i, i + size]

    return local_optima