Example #1
    def test_distributions(self):

        # checking bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.185241936534
        self.assertAlmostEqual(expected, result)

        # checking ndarray, csr_matrix as inputs
        vec_1 = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.160618030536
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = numpy.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.309742984153
        self.assertAlmostEqual(expected, result)

        # testing LDA distribution vectors
        numpy.random.seed(0)
        model = self.class_(self.corpus,
                            id2word=dictionary,
                            num_topics=2,
                            passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.hellinger(lda_vec1, lda_vec2)
        expected = 1.0406845281146034e-06
        self.assertAlmostEqual(expected, result)
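
For reference, the dense-vector cases in the test above correspond to the standard Hellinger formula H(p, q) = sqrt(0.5 * sum_i (sqrt(p_i) - sqrt(q_i))^2). A minimal NumPy sketch of just that formula (not gensim's implementation, which also accepts bag-of-words and sparse inputs) reproduces the ndarray/list expectation:

import numpy as np

def hellinger_dense(p, q):
    """Hellinger distance between two dense probability vectors."""
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

print(hellinger_dense([0.6, 0.1, 0.1, 0.2], [0.2, 0.2, 0.1, 0.5]))  # ~0.3097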
Example #2
def calculate_hellinger_predictions(users_profiles,
                                    papers_topics,
                                    fold,
                                    splits,
                                    lag=500):
    print("Calculating predictions based on hellinger distance...")
    s_time = time.time()
    predictions = np.zeros((users_profiles.shape[0], papers_topics.shape[0]))
    step = 0
    if splits is not None:
        print("Calculating for test items only...")
        for (i, u) in enumerate(users_profiles):
            # Get the test items, calculate the predictions for the test items only
            test_items = np.array(splits[i, fold])
            for j in test_items:
                predictions[i, j] = 1 - hellinger(u, papers_topics[j])
            step += 1
            if step % lag == 0:
                print(
                    "{} users done, time since prediction calculation: {:5.2f} minutes"
                    .format(step, (time.time() - s_time) / 60))
    else:
        print("Calculating for all items ...")
        for (i, u) in enumerate(users_profiles):
            for (j, p) in enumerate(papers_topics):
                predictions[i, j] = 1 - hellinger(u, p)
            step += 1
            if step % lag == 0:
                print(
                    "{} users done, time since prediction calculation: {:5.2f} minutes"
                    .format(step, (time.time() - s_time) / 60))

    return predictions
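
# A hedged usage sketch with toy data (not part of the original script); it
# assumes `np`, `time` and `hellinger` are imported at module level, as the
# function above does, and passes splits=None so every user/paper pair is scored.
_demo_users = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.3, 0.6]])
_demo_papers = np.array([[0.6, 0.3, 0.1],
                         [0.2, 0.2, 0.6]])
_demo_preds = calculate_hellinger_predictions(_demo_users, _demo_papers,
                                              fold=0, splits=None)
# _demo_preds[i, j] == 1 - hellinger(user i, paper j); higher means more similar.
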
def main():
    args = parser.parse_args()
    # dialect = ['pa','sy']
    dialect = [args.dialect_one, args.dialect_two]

    folder = args.corpus_folder + '/'
    # clean_data/comparable/msa/ , clean_data/comparable/egypt/
    corpus_files = [folder + dialect[0] + '.txt', folder + dialect[1] + '.txt']

    dictionary, corpus = models.build_comparable_ldamodel_training(folder, dialect)
    # sys.exit()
    # print('dict',len(dictionary))
    # print(dictionary.token2id)
    # print('corpus', len(corpus))
    lda_model = models.build_ldamodel(corpus, dictionary)

    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']
    # for sub_folder in folders:
    Hellinger_summation = 0
    Jaaccard_summation = 0
    for file in os.listdir(folders[0]):
        try:

            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                first_filepath = os.path.join(folders[0], file)
                second_filepath = os.path.join(folders[1], file)

                with open(first_filepath, encoding='utf-8') as f:  # we can define file_name
                    first_documents = f.read()
                first_dialect = [word for word in first_documents.split()]

                # print(first_dialect)
                with open(second_filepath, encoding='utf-8') as f:  # we can define file_name
                    second_documents = f.read()
                second_dialect = [word for word in second_documents.split()]

                # print(second_dialect)
                bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
                bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
                # print(bow_first_dialect)
                # we can now get the LDA topic distributions for these
                lda_bow_first_dialect = lda_model[bow_first_dialect]
                lda_bow_second_dialect = lda_model[bow_second_dialect]

                # print(lda_bow_first_dialect)

                print('Hellinger distance between 1 and 2 ')
                print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
                Hellinger_summation = Hellinger_summation + hellinger(lda_bow_first_dialect, lda_bow_second_dialect)
                print('Jaccard distance')
                print(jaccard(bow_first_dialect, bow_second_dialect))
                Jaaccard_summation = Jaaccard_summation + jaccard(bow_first_dialect, bow_second_dialect)
                # sys.exit()

        except Exception:
            # skip files that are missing in the second folder or fail to read
            pass

    print('total hellinger = ', Hellinger_summation / 10197)
    print('Total JC = ', Jaaccard_summation / 10197)
Example #4
    def test_distributions(self):

        # checking bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.185241936534
        self.assertAlmostEqual(expected, result)


        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.160618030536
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.309742984153
        self.assertAlmostEqual(expected, result)

        # testing LDA distribution vectors
        np.random.seed(0)
        model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.hellinger(lda_vec1, lda_vec2)
        expected = 1.0406845281146034e-06
        self.assertAlmostEqual(expected, result)
def comparable_corpus_distance(folder, dialect):
    dictionary, corpus = models.build_comparable_ldamodel_training(
        folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']

    Hellinger_summation = 0
    Jaaccard_summation = 0
    for file in os.listdir(folders[0]):
        try:

            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                first_filepath = os.path.join(folders[0], file)
                second_filepath = os.path.join(folders[1], file)

                with open(first_filepath,
                          encoding='utf-8') as f:  # we can define file_name
                    first_documents = f.read()
                first_dialect = [word for word in first_documents.split()]

                # print(first_dialect)
                with open(second_filepath,
                          encoding='utf-8') as f:  # we can define file_name
                    second_documents = f.read()
                second_dialect = [word for word in second_documents.split()]

                # print(second_dialect)
                bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
                bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
                # print(bow_first_dialect)
                # we can now get the LDA topic distributions for these
                lda_bow_first_dialect = lda_model[bow_first_dialect]
                lda_bow_second_dialect = lda_model[bow_second_dialect]

                # print(lda_bow_first_dialect)

                print('Hellinger distance between 1 and 2 ')
                print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
                Hellinger_summation = Hellinger_summation + hellinger(
                    lda_bow_first_dialect, lda_bow_second_dialect)
                print('Jaccard distance')
                print(jaccard(bow_first_dialect, bow_second_dialect))
                Jaaccard_summation = Jaaccard_summation + jaccard(
                    bow_first_dialect, bow_second_dialect)
                # sys.exit()

        except Exception:
            # skip files that are missing in the second folder or fail to read
            pass

    print('total hellinger = ', Hellinger_summation / 10197)
    print('Total JC = ', Jaaccard_summation / 10197)
Example #6
    def get_vector_similarity_hellinger(self, vec1, vec2, model):
        '''Get similarity between two vectors'''

        dist = matutils.hellinger(matutils.sparse2full(vec1, model.num_topics), \
                                  matutils.sparse2full(vec2, model.num_topics))
        sim = 1.0 / (1.0 + dist)
        return sim
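
The 1.0 / (1.0 + dist) mapping turns the Hellinger distance (0 for identical topic mixtures, 1 for maximally different ones) into a similarity in (0.5, 1.0]. A self-contained sketch of the same idea, using a tiny throwaway LDA model over a toy corpus (all names here are illustrative, not from the original class):

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim import matutils

texts = [['bank', 'river', 'water'], ['bank', 'loan', 'finance'], ['water', 'river', 'flow']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
model = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=0)

vec1, vec2 = model[corpus[0]], model[corpus[1]]
dist = matutils.hellinger(matutils.sparse2full(vec1, model.num_topics),
                          matutils.sparse2full(vec2, model.num_topics))
print(1.0 / (1.0 + dist))  # similarity; closer to 1.0 means more similar topic mixtures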
Example #7
def compute_hellinger(dist01, dist02):
    unique_words = set([x[1] for x in dist01] + [x[1] for x in dist02])
    dict_dist01 = {x[1]: x[0] for x in dist01}
    dict_dist02 = {x[1]: x[0] for x in dist02}
    vec01 = [dict_dist01.get(x, 0) for x in unique_words]
    vec02 = [dict_dist02.get(x, 0) for x in unique_words]
    return hellinger(vec01, vec02)
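
# A hedged usage sketch of compute_hellinger above: it expects each input as a
# list of (probability, word) pairs (e.g. parsed from an LDA topic), aligns the
# two distributions over their combined vocabulary, and relies on
# `from gensim.matutils import hellinger` being in scope.
dist01 = [(0.5, 'bank'), (0.3, 'loan'), (0.2, 'finance')]
dist02 = [(0.6, 'bank'), (0.3, 'river'), (0.1, 'water')]
print(compute_hellinger(dist01, dist02))
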
    def test_inputs(self):
        # checking empty inputs
        vec_1 = []
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking np array and list input
        vec_1 = np.array([])
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking scipy csr matrix and list input
        vec_1 = csr_matrix([])
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)
Example #10
def distance(text, doc_topic_probs):
    # Infer the topic distribution of the input text, then score its Hellinger
    # distance against each document's topic distribution.
    topic, x = predict_topic(text)
    dists = [hellinger(x[0], doc_topics) for doc_topics in doc_topic_probs]
    return dists
Example #11
 def CheckOverlaps(self, dist_tolerance, parsed_dict, topic_dict):
     overlaps_graph = {}
     overlaps_print = {}
     for doc1 in parsed_dict:
         #print(doc1)
         for index_doc1, topics_doc1 in enumerate(parsed_dict[doc1]):
             #print("    ", topics_doc1)
             #print('------------------')
             for doc2 in parsed_dict:
                 if doc1 == doc2:
                     break
                 for index_doc2, topics_doc2 in enumerate(
                         parsed_dict[doc2]):
                     dist = hellinger(topics_doc1, topics_doc2)
                     if (dist <= dist_tolerance):
                         doc1_topic_graph = doc1 + ': Topic ' + str(
                             index_doc1 + 1)
                         doc2_topic_graph = doc2 + ': Topic ' + str(
                             index_doc2 + 1)
                         doc1_topic_print = self.GetNestedElement(
                             topic_dict, doc1, index_doc1)
                         doc2_topic_print = self.GetNestedElement(
                             topic_dict, doc2, index_doc2)
                         try:
                             overlaps_graph[(doc1_topic_graph)] += [
                                 (doc2_topic_graph, dist)
                             ]
                             overlaps_print[(doc1 + ': Topic ' +
                                             str(index_doc1 + 1),
                                             doc1_topic_print)] += [
                                                 (doc2 + ': Topic ' +
                                                  str(index_doc2 + 1),
                                                  doc2_topic_print)
                                             ]
                         except KeyError:
                             overlaps_graph[(doc1_topic_graph)] = [
                                 (doc2_topic_graph, dist)
                             ]
                             overlaps_print[(doc1 + ': Topic ' +
                                             str(index_doc1 + 1),
                                             doc1_topic_print)] = [
                                                 (doc2 + ': Topic ' +
                                                  str(index_doc2 + 1),
                                                  doc2_topic_print)
                                             ]
     return overlaps_graph, overlaps_print
Example #12
 def Hellinger_similiarity(self,
                           corpus,
                           corpus_model_user_description,
                           num_best=5):
     'implements Hellinger similarity using gensim modules'
     length = len(corpus_model_user_description)
     queryXhotel = np.zeros((length, len(corpus)))
     print('It takes some time')
     for i in range(length):
         for j in range(len(corpus)):
             queryXhotel[i][j] = hellinger(corpus_model_user_description[i],
                                           corpus[j])
         print(i)
     #np.save('hellinger_similiarity', queryXhotel)
     accuracy_array = self.make_accuracy_array(queryXhotel,
                                               num_best,
                                               bol=False)  #true?
     return accuracy_array
def corpus_distance(folder, dialect, corpus_files):
    dictionary, corpus = models.build_ldamodel_training(folder, dialect)

    # dictionary, corpus = premodel.upload_data(dialect)

    # print('here', len(corpus))
    lda_model = models.build_ldamodel(corpus, dictionary)

    # now we add the two dialects to test the distance between them
    with open(corpus_files[0],
              encoding='utf-8') as f:  # we can define file_name
        first_documents = f.read()
    first_dialect = [word for word in first_documents.split()]

    with open(corpus_files[1],
              encoding='utf-8') as f:  # we can define file_name
        second_documents = f.read()
    second_dialect = [word for word in second_documents.split()]
    # now let's make these into a bag of words format

    bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
    bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)

    # we can now get the LDA topic distributions for these
    lda_bow_first_dialect = lda_model[bow_first_dialect]
    lda_bow_second_dialect = lda_model[bow_second_dialect]

    print('Hellinger distance between 1 and 2 ')
    print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))

    print('Jaccard distance')
    print(jaccard(bow_first_dialect, bow_second_dialect))

    print('kullback_leibler from 1 to 2')
    # print(kullback_leibler(lda_bow_first_dialect, lda_bow_second_dialect))

    print('kullback_leibler from 2 to 1')
    def test_distributions(self):

        # checking different length bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.484060507634
        self.assertAlmostEqual(expected, result)

        # checking symmetrical bag of words inputs return same distance
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1), (8, 0.1), (10, 0.8), (9, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        result_symmetric = matutils.hellinger(vec_2, vec_1)
        expected = 0.856921568786
        self.assertAlmostEqual(expected, result)
        self.assertAlmostEqual(expected, result_symmetric)

        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.160618030536
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.309742984153
        self.assertAlmostEqual(expected, result)

        # testing LDA distribution vectors
        np.random.seed(0)
        model = self.class_(self.corpus,
                            id2word=common_dictionary,
                            num_topics=2,
                            passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.hellinger(lda_vec1, lda_vec2)
        expected = 1.0406845281146034e-06
        self.assertAlmostEqual(expected, result)
Example #15
    def test_distributions(self):

        # checking different length bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.484060507634
        self.assertAlmostEqual(expected, result)

        # checking symmetrical bag of words inputs return same distance
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1), (8, 0.1), (10, 0.8), (9, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        result_symmetric = matutils.hellinger(vec_2, vec_1)
        expected = 0.856921568786
        self.assertAlmostEqual(expected, result)
        self.assertAlmostEqual(expected, result_symmetric)

        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.160618030536
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.309742984153
        self.assertAlmostEqual(expected, result)

        # testing LDA distribution vectors
        np.random.seed(0)
        model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.hellinger(lda_vec1, lda_vec2)
        expected = 1.0406845281146034e-06
        self.assertAlmostEqual(expected, result)
Example #16
bow_water = model.id2word.doc2bow(doc_water)   
bow_finance = model.id2word.doc2bow(doc_finance)   
bow_bank = model.id2word.doc2bow(doc_bank)   

lda_bow_water = model[bow_water]
lda_bow_finance = model[bow_finance]
lda_bow_bank = model[bow_bank]

tfidf_bow_water = tfidf[bow_water]
tfidf_bow_finance = tfidf[bow_finance]
tfidf_bow_bank = tfidf[bow_bank]

from gensim.matutils import kullback_leibler, jaccard, hellinger

hellinger(lda_bow_water, lda_bow_finance)
hellinger(lda_bow_finance, lda_bow_bank)
hellinger(lda_bow_bank, lda_bow_water)

hellinger(lda_bow_finance, lda_bow_water)
kullback_leibler(lda_bow_water, lda_bow_bank)
kullback_leibler(lda_bow_bank, lda_bow_water)


jaccard(bow_water, bow_bank)
jaccard(doc_water, doc_bank)
jaccard(['word'], ['word'])

def make_topics_bow(topic):
    # takes one topic string as returned by model.show_topics()
    # split the string to get the words and their probabilities
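    # The body below is a hedged reconstruction (not in the original snippet),
    # based on the comments above and the usual show_topics() string format,
    # e.g. '0.089*"water" + 0.083*"bank"'.
    topic = topic.split('+')
    topic_bow = []
    for term in topic:
        # separate the probability from the word, e.g. '0.089*"water"'
        prob, word = term.split('*')
        word = word.replace('"', '').strip()
        topic_bow.append((word, float(prob)))
    return topic_bow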
Example #17
# In[36]:

#testing the model on this research paper
test_doc = []
f = open('/Users/Moukthika/Desktop/ultimate_test.txt', 'r', encoding='utf8')
test_doc.append(f.read())
#print(test_doc)
for d in test_doc:
    doc_words = d.split(" ")
    #print(doc_words)

doc_words = dictionary.doc2bow(doc_words)
doc_words = ldaseq[doc_words]
print(doc_words)

#testing the model on another document not in the corpus
test_doc2 = []
p = open('/Users/Moukthika/Desktop/pdf_extract/99.txt', 'r', encoding='utf8')
test_doc2.append(p.read())
for d1 in test_doc2:
    doc2_words = d1.split(" ")
doc2_words = dictionary.doc2bow(doc2_words)
doc2_words = ldaseq[doc2_words]
print(doc2_words)

# In[37]:

#comparing the above two documents
hellinger(doc_words, doc2_words)
Example #18
        """
        dis1 = self.get_topic_distrb(doc1_tk)
        dis2 = self.get_topic_distrb(doc2_tk)
        # return 1 - matutils.hellinger(dis1, dis2)
        return matutils.cossim(dis1, dis2)

    def get_model_name(self):
        return "LDA"


if __name__ == "__main__":
    docs = [
        'this is a test',
        'test assure quality',
        'test is important',

    ]
    lda = LDA(fo_lang_code="en")
    new_doc1 = ["software", 'quality', 'rely', 'test']
    new_doc2 = ["quality", "is", "important"]
    new_doc3 = ["i", "have", "a", "pretty", "dog"]
    lda.train(docs)
    dis1 = lda.get_topic_distrb(new_doc1)
    dis2 = lda.get_topic_distrb(new_doc2)
    dis3 = lda.get_topic_distrb(new_doc3)
    print(dis1)
    print(dis2)
    print(dis3)
    print(matutils.hellinger(dis1, dis2))
    print(matutils.hellinger(dis1, dis3))
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Hellinger distance
H_distance = []
for i in range(1747, len(data_reg)):
    temp_distance = []
    for j in range(len(data_news)):
        lda_doc1 = lda_model[corpus_news[j]]
        lda_reg1 = lda_model[corpus_reg[i]]
        temp_distance.append(hellinger(lda_doc1[0], lda_reg1[0]))
    H_distance.append(temp_distance)

H_dist = DataFrame(H_distance)
H_dist.to_csv('Hillinger_distance.csv')


# DTM
def BasicCleanText(raw_text):
    cleantextprep = str(raw_text)

    expression = "[^a-zA-Z0-9 ]"  # keep only letters, numbers and whitespace
    cleantextCAP = re.sub(expression, '', cleantextprep)  # apply regex
    cleantext = cleantextCAP.lower()  # lower case

    # Tokenization
Example #20
lda_bow_water = model[bow_water]
lda_bow_finance = model[bow_finance]
lda_bow_bank = model[bow_bank]

###############################################################################
# Hellinger
# ---------
# 
# We're now ready to apply our distance metrics.  These metrics return a value between 0 and 1, where values closer to 0 indicate a smaller 'distance' and therefore a larger similarity.
# 
# Let's start with the popular Hellinger distance. 
# 
# The Hellinger distance metric gives an output in the range [0,1] for two probability distributions, with values closer to 0 meaning they are more similar.
#
from gensim.matutils import hellinger
print(hellinger(lda_bow_water, lda_bow_finance))
print(hellinger(lda_bow_finance, lda_bow_bank))
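
###############################################################################
# A quick sanity check on the bounds: identical distributions sit at distance
# 0, while maximally different (disjoint) ones sit at distance 1.
#
print(hellinger([0.5, 0.5], [0.5, 0.5]))  # 0.0
print(hellinger([1.0, 0.0], [0.0, 1.0]))  # 1.0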

###############################################################################
# Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5.
# 
# In the second case, the documents are a lot more similar semantically, so the trained model assigns them a much smaller distance.
# 

###############################################################################
# Kullback–Leibler
# ----------------
# 
# Let's run through similar examples with Kullback-Leibler.
# 
from gensim.matutils import kullback_leibler
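
# The Kullback-Leibler divergence is not symmetric, so (unlike Hellinger) the
# two directions can give different values; a short continuation in the same
# spirit as the Hellinger examples above:
print(kullback_leibler(lda_bow_water, lda_bow_bank))
print(kullback_leibler(lda_bow_bank, lda_bow_water))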
Example #21
 def hellinger_distance(self, doc_bow, bow_corpus):
     scores = [(i, hellinger(doc_bow, document))
               for i, document in enumerate(bow_corpus)]
     return heapq.nsmallest(100, scores, lambda x: x[1])
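
A standalone sketch of the same ranking idea with hypothetical toy topic distributions (not the original corpus): score every candidate document by its Hellinger distance to the query distribution and keep the smallest distances.

import heapq
from gensim.matutils import hellinger

query_topics = [0.7, 0.2, 0.1]
corpus_topics = [[0.6, 0.3, 0.1], [0.1, 0.1, 0.8], [0.68, 0.22, 0.10]]
scores = [(i, hellinger(query_topics, doc)) for i, doc in enumerate(corpus_topics)]
print(heapq.nsmallest(2, scores, key=lambda x: x[1]))  # the two closest documents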
Example #22
def phish_extraction():
    # Feature slots keyed '1'..'177', all initialised to 0.
    result = {str(i): 0 for i in range(1, 178)}
    interhref = []
    exterhref = []
    interlog = []
    exterlog = []
    chain = []
    title = []
    text = []
    chainurl(chain)
    starturl = chain[0]
    landurl = chain[-1]
    interandextern(landurl, interhref, exterhref, "file/href.txt")
    interandextern(landurl, interlog, exterlog, "file/logged.txt")
    loaddata(title, 'file/title.txt')
    loaddata(text, 'file/text.txt')
    feature_1 = []
    f1_8feature(feature_1, starturl)
    f1_8feature(feature_1, landurl)

    f1_3_8feature(feature_1, interhref)
    f1_3_8feature(feature_1, interlog)
    f1_3_8feature(feature_1, exterhref)
    f1_3_8feature(feature_1, exterlog)

    ##
    #   Feature 2 calculating
    ##
    start = list(getfreeurl(starturl))
    land = list(getfreeurl(landurl))
    startrdn = list(getrdn(starturl))
    landrdn = list(getrdn(landurl))
    intlog = []
    intlink = []
    intrdn = []
    extrdn = []
    extlog = []
    extlink = []

    for var in interhref:
        intlink.append(getfreeurl(var))
        intrdn.append(getrdn(var))

    for var in interlog:
        intlog.append(getfreeurl(var))
        intrdn.append(getrdn(var))

    for var in exterhref:
        extlink.append(getfreeurl(var))

    for var in exterlog:
        extlog.append(getfreeurl(var))
        extrdn.append(getrdn(var))

    # you can use any corpus, this is just illustrative
    texts = [
        text, title, start, land, startrdn, landrdn, intlog, intlink, intrdn,
        extrdn, extlog, extlink
    ]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    import numpy
    numpy.random.seed(
        1)  # setting random seed to get the same results each time.

    from gensim.models import ldamodel
    model = ldamodel.LdaModel(corpus, id2word=dictionary,
                              num_topics=2)  #, minimum_probability=1e-8)
    model.show_topics()
    #print_wo("\n")

    from gensim.matutils import hellinger
    feature_2 = []
    for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc
        ## we can now get the LDA topic distributions for these
        bow0 = model.id2word.doc2bow(combo[0])
        bow1 = model.id2word.doc2bow(combo[1])

        lda_bow0 = model[bow0]
        lda_bow1 = model[bow1]

        #print_wo("Distance #",count,":",hellinger(lda_bow0,lda_bow1))
        feature_2.append(hellinger(lda_bow0, lda_bow1))
        #print_wo(hellinger(lda_bow0,lda_bow1),",")
    #for i in range(16):
    #    print_wo(i,":",dictionary.get(i))
    # now let's make these into a bag of words format
    #

    feature_2.append(binaryfeatures(intrdn, title))
    feature_2.append(binaryfeatures(extrdn, title))

    #print_wo(binaryfeatures(intrdn,title),",")
    #print_wo(binaryfeatures(extrdn,title),",")
    ##
    # f3 features calculating
    ##
    feature_3n4 = []
    startmld = getmld(starturl)
    landmld = getmld(landurl)

    mlds = [startmld, landmld]

    startrdn = getrdn(starturl)
    landrdn = getrdn(landurl)

    rdns = [startrdn, landrdn]

    compare = [text, title, intlog, extlog, intlink, extlink]

    for i in range(2):
        for j in range(6):
            if mlds[i] in compare[j]:
                feature_3n4.append(1)
                #print_wo("1",",")
            else:
                feature_3n4.append(0)
                #print_wo("0",",")
    compare = [title, intlog, extlog, intlink, extlink]
    compare = " ".join(str(x) for x in compare)
    for i in range(2):
        for j in range(5):
            if compare[j] in mlds[i]:
                feature_3n4.append(1)
                #print_wo("1",",")
            else:
                feature_3n4.append(0)
                #print_wo("0",",")
    for m in range(2):
        for n in range(5):
            if compare[n] in rdns[m] and compare[n] not in mlds[m]:
                feature_3n4.append(1)
                #print_wo("1",",")
            else:
                feature_3n4.append(0)
                #print_wo("0",",")

    ##
    # f3 features calculated
    ##

    ##
    # f4 features calculating
    ##

    if getrdn(starturl) in getrdn(landurl):
        feature_3n4.append(1)
        #print_wo(1,",")
    else:
        feature_3n4.append(0)
        #print_wo(0,",")
    if len(chain) > 2:
        feature_3n4.append(len(chain) - 2)
        #print_wo(len(chain)-2,",")
    else:
        feature_3n4.append(0)
        #print_wo(0,",")
    feature_3n4.append(len(interlog))
    feature_3n4.append(len(interhref))
    ##print_wo(len(interlog),",")
    ##print_wo(len(interhref),",")

    feature_3n4.append(len(exterlog))
    feature_3n4.append(len(exterhref))
    #print_wo(len(exterlog),",")
    #print_wo(len(exterhref),",")

    count = 0
    for comp in interlog:
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    count = 0
    for comp in interhref:
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    count = 0
    if len(chain) > 2:
        for comp in chain[1:len(chain) - 1]:  #check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    count = 0
    if len(chain) > 2:
        for comp in chain[1:len(chain) - 1]:  #check later
            if getrdn(landurl) in getrdn(comp):
                count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    count = 0
    for comp in exterlog:  #check later
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    count = 0
    for comp in exterlog:  #check later
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)
    #print_wo(count,",")

    ##
    # f4 features calculated
    ##
    ##
    # f5 features calculation
    ##
    feature_5 = []
    file = open('file/input.txt', "r")
    data = file.read()
    word = data.split()

    feature_5.append(len(word))
    #print_wo(len(word),",")

    file = open('file/img.txt', "r")
    data = file.read()
    word = data.split()
    feature_5.append(len(word))

    #print_wo(len(word),",")

    file = open('file/iframe.txt', "r")
    data = file.read()
    word = data.split()
    feature_5.append(len(word))

    #print_wo(len(word),",")

    file = open('file/text.txt', "r")
    data = file.read()
    word = data.split()
    feature_5.append(len(word))

    #print_wo(len(word),",")

    file = open('file/title.txt', "r")
    data = file.read()
    word = data.split()
    feature_5.append(len(word))

    #print(len(word))

    #sys.stdout = original_stdout # Reset the standard output to its original value
    res = feature_1 + feature_2 + feature_3n4 + feature_5

    for i in range(len(res)):
        result[str(i)] = res[i]

    return result
Example #23
    from gensim.models import ldamodel
    model = ldamodel.LdaModel(corpus, id2word=dictionary,
                              num_topics=2)  #, minimum_probability=1e-8)
    model.show_topics()
    #print_wo("\n")
    from gensim.matutils import hellinger
    for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc
        ## we can now get the LDA topic distributions for these
        bow0 = model.id2word.doc2bow(combo[0])
        bow1 = model.id2word.doc2bow(combo[1])

        lda_bow0 = model[bow0]
        lda_bow1 = model[bow1]

        #print_wo("Distance #",count,":",hellinger(lda_bow0,lda_bow1))
        print_wo(hellinger(lda_bow0, lda_bow1), ",")
    #for i in range(16):
    #    print_wo(i,":",dictionary.get(i))
    # now let's make these into a bag of words format
    #
    print_wo(binaryfeatures(intrdn, title), ",")
    print_wo(binaryfeatures(extrdn, title), ",")
    ##
    # f3 features calculating
    ##

    startmld = getmld(starturl)
    landmld = getmld(landurl)

    mlds = [startmld, landmld]
Example #24
    def hellinger_distance(self, x, y):
        """ return hellinger between two lists """

        return hellinger(x, y)
Example #25
 def similarity(vec1, vec2):
     '''Similarity between two vectors'''
     dist = matutils.hellinger(matutils.sparse2full(vec1, atmodel.num_topics), \
                               matutils.sparse2full(vec2, atmodel.num_topics))
     sim = 1.0 / (1.0 + dist)
     return sim
Example #26
             'worth', 'hoke','happy','foot','tv','weed',
             'hard paint','good luck','olga','hair','gas',
             'sex','especially','pretty','hope','basically',
             'dream','hit','bit','ben krenke','weird','saying',
             'okay','doesnt','understand','f**k','job','hard',
             'night','weekend','f****d','sorry','school','cheap',
             'literally','crazy','mom','year_old','home','year',
             'old','bitch','song']
# bt_2 topics
bt_2_topics = ['black pipe','bike','hard','suck','dick','fart noise',
              'life','buy','ride','sorry','nah','working','easy',
              'worst','comment','win','pissed','interview','bad',
              'high','game','rib','drink','fast','dog','smoke weed',
              'kit','happy birthday','apparently','lol','cleveland','sweet',
              'hang','summer','get paper','good','jesus christ','idea',
              'gay','dumb','jesus','sound','god damn','house','health insurance',
              'stock','set nickname']


#  Transforms user topics to bag of words 
bt_1_bow = bt_1_ldamodel.id2word.doc2bow(bt_1_topics)
bt_1 = bt_1_ldamodel[bt_1_bow]

bt_2_bow = bt_2_ldamodel.id2word.doc2bow(bt_2_topics)
bt_2 = bt_2_ldamodel[bt_2_bow]


# Computes the Hellinger distance between bt_1 & bt_2 (0 = identical, 1 = maximally dissimilar)
from gensim.matutils import hellinger
print('Hellinger distance between bt_1 & bt_2:', hellinger(bt_1, bt_2))
Example #27
def phish_extraction(url, driver):

    try:
        result = {}
        interhref=[]
        exterhref=[]
        interlog=[]
        exterlog=[]
        #chain=[]
        title=[]
        text=[]
        chain, logged, href, img, iframe, _input, title, text, flag = web_scrapping(url, driver)
        #print(chain, logged, href, img, iframe, _input, title, text, flag)
        if type(chain) is list and len(chain) > 0:
            starturl=chain[0]
            landurl=chain[-1]
        else:
            starturl=chain
            landurl=chain



        if flag == 0:
            return False
        interandextern(landurl,interhref,exterhref,href)
        interandextern(landurl,interlog,exterlog,logged)

        feature_1 = []
        f1_8feature(feature_1, starturl)
        f1_8feature(feature_1, landurl)
        f1_3_8feature(feature_1, interhref)
        f1_3_8feature(feature_1, interlog)
        f1_3_8feature(feature_1, exterlog)
        f1_3_8feature(feature_1, exterhref)
     #
        #Feature 2 calculating
     #
        start=list(getfreeurl(starturl))
        land=list(getfreeurl(landurl))
        startrdn=list(getrdn(starturl))
        landrdn=list(getrdn(landurl))
        intlog=[]
        intlink=[]
        intrdn=[]
        extrdn=[]
        extlog=[]
        extlink=[]

        for var in interhref:
           intlink.append(getfreeurl(var))
           intrdn.append(getrdn(var))

        for var in interlog:
           intlog.append(getfreeurl(var))
           intrdn.append(getrdn(var))

        for var in exterhref:
           extlink.append(getfreeurl(var))

        for var in exterlog:
           extlog.append(getfreeurl(var))
           extrdn.append(getrdn(var))

        # you can use any corpus, this is just illustrative
        texts = [
            text,title,start,land,startrdn,landrdn,intlog,intlink,intrdn,extrdn,extlog,extlink
        ]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        import numpy
        numpy.random.seed(1) # setting random seed to get the same results each time.

        from gensim.models import ldamodel
        model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)#, minimum_probability=1e-8)
        model.show_topics()
        #print_wo("\n")

        from gensim.matutils import hellinger
        feature_2 = []
        for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc
        ## we can now get the LDA topic distributions for these
            bow0 = model.id2word.doc2bow(combo[0])
            bow1 = model.id2word.doc2bow(combo[1])

            lda_bow0 = model[bow0]
            lda_bow1 = model[bow1]

            #print_wo("Distance #",count,":",hellinger(lda_bow0,lda_bow1))
            feature_2.append(hellinger(lda_bow0,lda_bow1))
            #print_wo(hellinger(lda_bow0,lda_bow1),",")
        #for i in range(16):
        #    print_wo(i,":",dictionary.get(i))
        # now let's make these into a bag of words format
        #

        feature_2.append(binaryfeatures(intrdn,title))
        feature_2.append(binaryfeatures(extrdn,title))

               ##
        # f3 features calculating
        ##
        feature_3n4 = []
        startmld = getmld(starturl)
        landmld = getmld(landurl)

        mlds = [startmld,landmld]

        startrdn = getrdn(starturl)
        landrdn = getrdn(landurl)

        rdns = [startrdn, landrdn]

        compare = [text,title,intlog,extlog,intlink,extlink]

        for i in range(2):
            for j in range(6):
                if mlds[i] in compare[j]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)
        compare = [title,intlog,extlog,intlink,extlink]
        compare = " ".join(str(x) for x in compare)
        for i in range(2):
            for j in range(5):
                if compare[j] in mlds[i]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)
        for m in range(2):
            for n in range(5):
                if compare[n] in rdns[m] and compare[n] not in mlds[m]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)


        ##
        # f3 features calculated
        ##

        ##
        # f4 features calculating
        ##

        if getrdn(starturl) in getrdn(landurl):
            feature_3n4.append(1)
        else:
            feature_3n4.append(0)
        if len(chain) > 2:
            feature_3n4.append(len(chain)-2)
        else:
            feature_3n4.append(0)
        feature_3n4.append(len(interlog))
        feature_3n4.append(len(interhref))

        feature_3n4.append(len(exterlog))
        feature_3n4.append(len(exterhref))

        count = 0
        for comp in interlog:
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)


        count = 0
        for comp in interhref:
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        count = 0
        if len(chain) > 2 :
            for comp in chain[1:len(chain)-1] :#check later
                if getrdn(starturl) in getrdn(comp):
                    count += 1
        feature_3n4.append(count)


        count = 0
        if len(chain) > 2 :
            for comp in chain[1:len(chain)-1] :#check later
                if getrdn(landurl) in getrdn(comp):
                    count += 1
        feature_3n4.append(count)


        count = 0
        for comp in exterlog :#check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        count = 0
        for comp in exterlog : #check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        ##
        # f4 features calculated
        ##
        ##
        # f5 features calculation
        ##
        feature_5 = []
        data = _input#file.read()

        feature_5.append(len(data))

        data = img# file.read()
        feature_5.append(len(data))


        data = iframe# file.read()
        feature_5.append(len(data))


        data = text#file.read()
        feature_5.append(len(data))


        data = title# file.read()
        feature_5.append(len(data))
        res = feature_1 + feature_2 + feature_3n4 + feature_5

        if flag == 1:
            for i in range(len(res)) :
                if res[i] is not None:
                    result[str(i)] = res[i]
                else:
                    result[str(i)] = 0
        else:
            return False
    except Exception as e:
        #print("extraction error",e)
        trace_back = sys.exc_info()[2]
        line = trace_back.tb_lineno
        print(format(line),e)

        return False
    return result
Example #28
    filecontent = ''
    for word in file:
        filecontent = filecontent + word + ' '
        documents.append(filecontent)
    stoplist = set(stopwords.words('english'))
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    basetext = []
    for list in texts:
        for item in list:
            basetext.append(item)
    bow_1 = lda.id2word.doc2bow(basetext)
    lda_1 = lda[bow_1]
    print("******************", filename)
    print("hellinger", hellinger(lda_1, lda_2))
    print("kullback_leibler", kullback_leibler(lda_1, lda_2))
    print("jaccard", jaccard(lda_1, lda_2))
    file.close()
    #dictionary = corpora.Dictionary(texts)
    #corpus = [dictionary.doc2bow(text) for text in texts]
    #lda1 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, update_every=1, chunksize=10000, passes=5)
#print(lda1)
#print(texts)
"""
basetext=[]
for list in texts:
    for item in list:
        basetext.append(item)
#print(len(basetext))
#print(basetext)
    cluster_model = func(corpus, dictionary, words)
    print(clist)
    print(model.show_topics(cluster_model))
    model_list2.append(cluster_model)
    model_list3.append(model.show_topics(cluster_model))

print('\n========')
print('Comparison between Cluster topics and Version topics')
print('========\n')

distances = []


def print_cluster_version(i, j, mylist):
    print('\n========')
    print('version', version[i])
    print('cluster\t', mylist[j])


for i in model_list2:
    for j in model_list:
        if hellinger(i, j) > 0.5:
            print "\ntopic comparison distance between version and reviews"
            print hellinger(i, j)
            print_cluster_version(model_list.index(j), model_list2.index(i),
                                  model_list3)
            distances.append(hellinger(i, j))

        else:
            continue