Example #1
def unigram_smoothing(data, model):
    td, idf, vocab = data
    # drop the last column of the term-document matrix
    td = td[:, :-1]
    lsa = LSA(model)
    res = lsa.unigram_smoothing()
    print(res.shape)
    # mean absolute difference between the original and smoothed matrices
    print(np.abs(td - res).sum() / float(res.shape[0] * res.shape[1]))
Example #2
def unigram_smoothing(data, model):
    td, idf, vocab = data
    td = td[:, :-1]
    lsa = LSA(model)
    res = lsa.unigram_smoothing()
    print(res.shape)
    print(np.abs(td - res).sum() / float(res.shape[0] * res.shape[1]))
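Neither Example #1 nor #2 shows the LSA class itself; as a point of reference only, the unigram smoothing being tested here is typically just a truncated-SVD (low-rank) reconstruction of the term-document matrix. A minimal numpy sketch of that operation, with the rank k chosen arbitrarily, might look like:

import numpy as np

def low_rank_reconstruction(td, k=10):
    # keep only the top-k singular triplets of the term-document matrix
    U, s, Vt = np.linalg.svd(td, full_matrices=False)
    # the reconstruction is the "smoothed" matrix the tests above compare against td
    return U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]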
Example #3
    def summarize(self, text, n_sents=3):
        """ Summarize a given text and get top sentences """
        try:
            prediction = dict()

            if text:
                if self.lang_code in self.valid_langs:
                    if Utility.get_doc_length(text) > self.n_words:
                        # generate sentences, normalized sentences from text
                        sents, norm_sents = self.p.text_preprocessing(text)
                        # generate doc-term-matrix, term-doc-matrix
                        dt_matrix = self.generate_doc_term_matrix(norm_sents)
                        td_matrix = self.generate_term_doc_matrix(dt_matrix)

                        if self.method == "LSA":
                            lsa = LSA(self.k, td_matrix)
                            term_topic_matrix, singular_values, topic_doc_matrix = lsa.u, lsa.s, lsa.vt
                            # remove singular values below the given threshold
                            singular_values = lsa.filter_singular_values(
                                singular_values, self.sv_threshold)
                            # get salience scores from top singular values & topic document matrix
                            salience_scores = lsa.get_salience_scores(
                                singular_values, topic_doc_matrix)
                            # get the top sentence indices for summarization
                            top_sentence_indices = lsa.get_top_sent_indices(
                                salience_scores, n_sents)
                            summary = self.generate_summary(
                                sents, top_sentence_indices)
                        elif self.method == "TEXT_RANK":
                            tr = TextRank(dt_matrix, td_matrix)
                            # build similarity graph
                            similarity_matrix = tr.similiarity_matrix
                            similarity_graph = tr.get_similarity_graph(
                                similarity_matrix)
                            # compute pagerank scores for all sentences
                            ranked_sents = tr.rank_sentences(similarity_graph)
                            # get the top sentence indices for summarization
                            top_sentence_indices = tr.get_top_sentence_indices(
                                ranked_sents, n_sents)
                            summary = self.generate_summary(
                                sents, top_sentence_indices)
                        else:
                            return "no method found"

                        # apply cleaning for readability
                        summary = Utility.remove_multiple_whitespaces(summary)
                        summary = Utility.remove_trailing_whitespaces(summary)
                        prediction["summary"] = summary
                        prediction["message"] = "successful"
                    else:
                        return "required at least {} words".format(
                            self.n_words)
                else:
                    return "language not supported".format()
            else:
                return "required textual content"
            return prediction
        except Exception:
            logging.error("exception occurred", exc_info=True)
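The LSA and TextRank helpers referenced above are not part of this excerpt, so their internals are assumptions. For the LSA branch, a common way to compute the pieces used here (filtered singular values, sentence salience scores, top sentence indices) is the Steinberger and Ježek formulation; a minimal numpy sketch, with the function names copied from the calls above but the bodies assumed, is:

import numpy as np

def filter_singular_values(s, sv_threshold=0.5):
    # zero out singular values below a fraction of the largest one
    return np.where(s >= sv_threshold * s.max(), s, 0.0)

def get_salience_scores(s, vt):
    # salience of sentence j: sqrt of the sum over topics k of (s_k * v_kj)^2
    return np.sqrt(np.square(np.diag(s) @ vt).sum(axis=0))

def get_top_sent_indices(salience_scores, n_sents=3):
    # indices of the n_sents highest-scoring sentences, kept in document order
    return np.sort(np.argsort(salience_scores)[::-1][:n_sents])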
Example #4
def extract_file_features(filename, output_file):
    csvfile = open(output_file, 'w')
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL, lineterminator='\n')

    header = ["id", "input_sim_article", "query_sim_article", "input_qcount", "input_qpercentage", "input_qonesen", "query_qcount", "query_qpercentage", "query_qonesen", "input_sim_word2vec", "query_sim_word2vec"]
    wr.writerow(header)

    data = pd.read_csv(filename, sep=';', encoding="utf-8", error_bad_lines=False)
    query_list = data[COL_QUERY_TEXT].unique()

    for index, row in data.iterrows():
        features = []
        try:
            query_text = row[COL_QUERY_TEXT]
            query_search = row[COL_QUERY_SEARCH]
            article = row[COL_ARTICLE_CONTENT]
            col_article = data[data[COL_QUERY_TEXT] == query_text][COL_ARTICLE_CONTENT]
            documents = []
            for idx, art in enumerate(col_article):
                documents.append(art)
                if art == article:
                    cur_idx = idx

            # id
            # features.append(hashlib.sha1(article.rstrip().encode()).hexdigest())
            features.append(index)
            # similarity (input - article)
            similar = LSA(query_text, documents)
            features.append(similar.rank[cur_idx][1])
            # similarity (query - article)
            similar = LSA(query_search, documents)
            features.append(similar.rank[cur_idx][1])
            # word count
            query_count, query_percentage, query_onesen = word_count_features(query_text, article)
            features.append(query_count)
            features.append(query_percentage)
            features.append(query_onesen)
            query_count, query_percentage, query_onesen = word_count_features(query_search, article)
            features.append(query_count)
            features.append(query_percentage)
            features.append(query_onesen)
            # # word2vec sim (input - article)
            # features.append(old_calculate_similarity(query_text, article))
            # # word2vec sim (query - article)
            # features.append(old_calculate_similarity(query_search, article))
            # label
            features.append(row[COL_LABEL])
        except Exception:
            while len(features) < len(header):
                features.append(-1)
            # keep the label in the last position even when feature extraction fails
            features[-1] = row[COL_LABEL]
        print(features)
        wr.writerow(features)
    print("=============== \n")
    csvfile.close()
Example #5
def word_topics(model):
    lsa = LSA(model)
    print(lsa.word_topics().shape)
Example #6
def document_topics(model):
    lsa = LSA(model)
    print(lsa.document_topics().shape)
Example #7
def folding_in(data, model):
    td, idf, vocab = data
    d = td[:, -1]
    lsa = LSA(model)
    print(lsa.folding_in(d).shape)
Example #8
def train(data):
    td, idf, vocab = data
    td = td[:, :-1]
    lsa = LSA()
    return lsa.train(td, Z=10)
Example #9
def train(data):
    td, idf, vocab = data
    td = td[:,:-1]
    lsa = LSA()
    return lsa.train(td, Z=10)
Example #10
    def initialize(self, article):

        similarity_array = []
        # similarity_array.append(article)
        test = self.articleSummerization(article, 1)  # in one line

        # for i in summerizedSentence:
        # 	test=str(i)
        print('-------Summarized Title-------')
        print(test)

        sitesContainingArticle, scrapId = self.googleSearch(article)

        print('sites_length_after_google search', len(sitesContainingArticle))

        for index, url in enumerate(sitesContainingArticle):
            print('URL ', url, scrapId[index], '\n')

            raw_html = self.simple_get(url)  #full page site content
            try:
                soup = BeautifulSoup(
                    raw_html, 'html.parser')  # parse the raw HTML properly
                # print('reached here')
                # print(soup)

            except Exception as e:
                print(e)
                return 0, []

            _ = [s.extract() for s in soup('script')]

            soup_article = soup.find_all('div', {"class": scrapId[index]})

            # print(soup_article)

            article_string = ''
            for data in soup_article:
                # print(data)
                article_string += data.text
                # article_string += data.text
            # print(article_string)
            if article_string:
                similarity_array.append(
                    self.articleSummerization(article_string, 5))
            else:
                print('no article content found\n')

        # for c in similarity_array:
        # 	print('\n\n\n',c)

        mylsa = LSA()
        wmdinit = WordMoverDistance()

        length = len(similarity_array)
        # print(length)

        if length == 0:
            return 0, sitesContainingArticle
        else:
            count = 0
            score_array = []

            while (count < length):
                print('\n\n', similarity_array[count])
                lsa_similarity = mylsa.start([article + ' ' + article] +
                                             similarity_array, count + 1)
                wmdinit.data_accept(similarity_array[count], article)
                wmddistance = wmdinit.model()

                print('wordmover distance is', wmddistance)

                fuzzy = Fuzzy(lsa_similarity, wmddistance)
                score = fuzzy.get_score_data()
                # score = score/10
                print('final score ', score)

                score_array.append(score)
                count = count + 1

            score_array = sorted(score_array, reverse=True)

            return min(100,
                       np.around(sum(score_array[:2]), decimals=2) *
                       100), sitesContainingArticle


# wmdinit=wordmover.WordMoverDistance(titles[count],titles[0])
# wmddistance=wmdinit.model()
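The WordMoverDistance helper used above is not shown in this excerpt. As a rough sketch only, a comparable distance can be computed with gensim's wmdistance; the embedding file and the whitespace tokenization below are assumptions, not part of the original code:

from gensim.models import KeyedVectors

# any word2vec-format embedding file would do here; this path is an assumption
vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def word_mover_distance(doc_a, doc_b):
    # lower distance means the two texts are semantically closer
    return vectors.wmdistance(doc_a.lower().split(), doc_b.lower().split())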
Example #11
def topic_labels(data, model, N=15):
    td, idf, vocab = data
    lsa = LSA(model)
    inv_vocab = inverse_vocab(vocab)
    print(lsa.topic_labels(inv_vocab, N))
Example #12
def word_topics(model):
    lsa = LSA(model)
    print(lsa.word_topics().shape)
Example #13
def document_topics(model):
    lsa = LSA(model)
    print(lsa.document_topics().shape)
Example #14
def folding_in(data, model):
    td, idf, vocab = data
    d = td[:,-1]
    lsa = LSA(model)
    print(lsa.folding_in(d).shape)
Example #15
def topic_labels(data, model, N=15):
    td, idf, vocab = data
    lsa = LSA(model)
    inv_vocab = inverse_vocab(vocab)
    print(lsa.topic_labels(inv_vocab, N))
Example #16
def fetch_regular_features(
        corpus_dir_path: str,
        reset=False) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
	FR : Récupère les mesures standards tel que les probabilités, distances, patron syntaxique
	pour chaque candidat du corpus\n
	EN : Fetch regular features such as probabilities, distances, syntaxical pattern for
	each candidat of the given corpus \n
	Params
	------
	corpus_dir_path : str\n
		FR : Emplacement du corpus\n
		EN : The corpus path\n
	reset : bool\n
		#TODOC
	Returns
	-------
	features : DataFrame\n
		FR : Tableau des candidats et de leur mesures.\n
		EN : Table of the candidats and their features.\n
	lsa_noun : DataFrame\n
		FR : Tableau des candidats et du vecteur de leur nom\n
		EN : Table of the candidats and their noun vector\n
	lsa_verb : DataFrame\n
		FR : Tableau des candidats et du vecteur de leur verbe\n
		EN : Table of the candidats and their verb vector\n
	exps_lsa : DataFrae\n
		#TODO
	"""
    corpus_id = encode(str.encode(corpus_dir_path), 'hex').decode() + '.pkl'

    get_features = utilities.drive_cached_func(measurer.get_features,
                                               'features' + corpus_id, reset)
    features = get_features(corpus_dir_path)

    get_pattern_frequency = utilities.drive_cached_func(
        measurer.get_candidats_pattern_frequency, 'patterns' + corpus_id,
        reset)
    patterns = get_pattern_frequency(corpus_dir_path)
    features = pd.merge(features,
                        patterns,
                        how='left',
                        left_index=True,
                        right_index=True).fillna(0)

    tmp = LSA(corpus_dir_path)
    lsa = pd.DataFrame(tmp.lsa, index=tmp.word_id)
    lsa.columns.name = 'WORD'

    exps_lsa = utilities.drive_cached_func(tmp, 'exps' + corpus_id,
                                           reset)(features)

    lsa_noun = pd.merge(features,
                        lsa,
                        how='left',
                        left_on='NOUN',
                        right_index=True).iloc[:, -100:].fillna(0)
    lsa_verb = pd.merge(features,
                        lsa,
                        how='left',
                        left_on='VERB',
                        right_index=True).iloc[:, -100:].fillna(0)
    len_v = pd.DataFrame(lsa_verb.abs().sum(axis=1))
    len_v.columns = ['len_v']
    len_n = pd.DataFrame(lsa_noun.abs().sum(axis=1))
    len_n.columns = ['len_n']

    features = pd.merge(features, len_v, left_index=True, right_index=True)
    features = pd.merge(features, len_n, left_index=True, right_index=True)
    dist_noun = pd.DataFrame(utilities.cos_similarities(
        exps_lsa.loc(axis=0)[lsa_noun.index].fillna(0).sort_index().values,
        lsa_noun.sort_index().values),
                             index=lsa_noun.sort_index().index)

    dist_noun.columns = ['dist_noun']
    features = pd.merge(features, dist_noun, left_index=True, right_index=True)
    # TODO replace loc with reindex
    dist_verb = pd.DataFrame(utilities.cos_similarities(
        exps_lsa.loc(axis=0)[lsa_verb.index].fillna(0).sort_index().values,
        lsa_verb.sort_index().values),
                             index=lsa_verb.sort_index().index)

    dist_verb.columns = ['dist_verb']
    features = pd.merge(features, dist_verb, left_index=True, right_index=True)

    features = features.assign(
        dist_relative=features['dist_noun'] /
        (features['dist_noun'] + features['dist_verb'])).fillna(0.5)
    return features, lsa_noun, lsa_verb, exps_lsa
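A minimal usage sketch for the function above; the corpus path is an assumption, and the measurer/utilities modules it depends on must be importable:

features, lsa_noun, lsa_verb, exps_lsa = fetch_regular_features('data/corpus/')
print(features[['dist_noun', 'dist_verb', 'dist_relative']].head())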
Example #17
from lsa import LSA


if __name__ == '__main__':
    documents = [line.rstrip() for line in open('temp/all_book_titles.txt')]

    stopwords = set(line.rstrip() for line in open('temp/stopwords.txt')).union({
        'introduction', 'edition', 'series', 'application',
        'approach', 'card', 'access', 'package', 'plus', 'etext',
        'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
        'third', 'second', 'fourth'
    })

    model = LSA(stopwords)
    model.fit(documents)
    model.transform_plot()
Example #18
File: lsa_test.py  Project: zxsted/finch
from lsa import LSA

if __name__ == '__main__':
    documents = [line.rstrip() for line in open('temp/all_book_titles.txt')]

    stopwords = set(line.rstrip()
                    for line in open('temp/stopwords.txt')).union({
                        'introduction', 'edition', 'series', 'application',
                        'approach', 'card', 'access', 'package', 'plus',
                        'etext', 'brief', 'vol', 'fundamental', 'guide',
                        'essential', 'printed', 'third', 'second', 'fourth'
                    })

    model = LSA(stopwords)
    model.fit(documents)
    model.plot()
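The LSA class imported from lsa in Examples #17 and #18 is not shown. As an assumption about what fit and plot roughly do, rather than their actual implementation, an equivalent pipeline can be sketched with scikit-learn and matplotlib:

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def lsa_plot(documents, stopwords, n_components=2):
    # TF-IDF weighting of the titles, then a rank-2 SVD (classic LSA)
    vectorizer = TfidfVectorizer(stop_words=list(stopwords))
    X = vectorizer.fit_transform(documents)
    svd = TruncatedSVD(n_components=n_components)
    # project terms (rows of X.T) into the 2-D latent space for plotting
    term_coords = svd.fit_transform(X.T)
    terms = vectorizer.get_feature_names_out()
    plt.scatter(term_coords[:, 0], term_coords[:, 1], s=5)
    for i, term in enumerate(terms):
        plt.annotate(term, (term_coords[i, 0], term_coords[i, 1]), fontsize=6)
    plt.show()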