Example #1
def _preprocess_string(news):
    _id, con, label = news

    preprocessed_con = []
    if isinstance(con, tuple):
        for _con in con:
            preprocessed_con.append(preprocess_string(str(_con)))

        preprocessed_con = tuple(preprocessed_con)
    else:
        preprocessed_con = preprocess_string(con)

    return _id, preprocessed_con, label
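A minimal usage sketch (the sample record and the import are assumptions; the helper only needs gensim's preprocess_string in scope):

from gensim.parsing.preprocessing import preprocess_string

# Hypothetical record shaped as (id, content, label); content may be a tuple of fields
news = (42, ("Stocks rally on earnings", "Shares rose sharply after the report."), "business")
_id, tokens, label = _preprocess_string(news)
# tokens is a tuple of token lists, one per field, because the content was a tuple
print(_id, tokens, label)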
Example #2
def _get_vocabulary(texts):
    sentences = [preprocess_string(text) for text in texts]
    # Count raw word frequencies; wrapping the generator in set() (as the original
    # snippet did) makes every count 1 and defeats most_common()
    count_words = Counter(word for lst in sentences for word in lst)
    total_words = len(count_words)
    sorted_words = count_words.most_common(total_words)
    # Reserve index 0 for padding/unknown tokens
    vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
    return vocab_to_int
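The function assumes Counter and preprocess_string are already imported; a short hedged usage sketch (the sample texts are made up):

from collections import Counter
from gensim.parsing.preprocessing import preprocess_string

texts = ["Cats chase mice.", "Dogs chase cats and mice."]
vocab_to_int = _get_vocabulary(texts)
# Stemmed tokens map to integer ids starting at 1; 0 stays free for padding/unknown words
print(vocab_to_int)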
Example #3
def cleaning_pipe(document):

    transform_to_lower = lambda s: s.lower()
    # Replace with a single space (not ''), so dropping a one-character token
    # does not glue the neighbouring words together
    remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', ' ', s)

    # Filters to be executed in the pipeline
    CLEAN_FILTERS = [strip_tags,
                     strip_numeric,
                     strip_punctuation,
                     strip_multiple_whitespaces,
                     transform_to_lower,
                     remove_stopwords,
                     remove_single_char]

    # Invoke gensim's preprocess_string with the custom list of filters
    processed_words = preprocess_string(document, CLEAN_FILTERS)
    return processed_words
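cleaning_pipe relies on names it does not import itself; a sketch of the required imports plus a sample call (the input string is illustrative):

import re
from gensim.parsing.preprocessing import (preprocess_string, strip_tags, strip_numeric,
                                           strip_punctuation, strip_multiple_whitespaces,
                                           remove_stopwords)

tokens = cleaning_pipe("<p>The 3 quick brown foxes jumped over a lazy dog!</p>")
# Lower-cased tokens with tags, digits, punctuation, stopwords and single characters removed
print(tokens)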
Example #4
    def __iter__(self):
        with self.db_object as conn:
            cursor = conn.cursor()
            cursor.execute(self.__dbSQL.sql)
            record = cursor.fetchone()
            while record:
                document_text = record[self.__dbSQL.doc_location]
                if self.preprocessor:
                    document_text = self.preprocessor.clean_text(document_text)

                yield parsing.preprocess_string(document_text)
                record = cursor.fetchone()
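The surrounding class is not shown, so here is a self-contained sketch of the same streaming pattern using an in-memory SQLite table instead of the class's db_object (all names and data below are illustrative, not part of the original project):

import sqlite3

from gensim import corpora
from gensim.parsing import preprocess_string

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE docs (id INTEGER, body TEXT)")
conn.executemany("INSERT INTO docs VALUES (?, ?)",
                 [(1, "Streaming corpora keep memory usage flat."),
                  (2, "Each row is tokenized lazily with preprocess_string.")])

def stream_tokens():
    # Same fetchone() loop as above: rows are tokenized one at a time
    cursor = conn.cursor()
    cursor.execute("SELECT id, body FROM docs")
    record = cursor.fetchone()
    while record:
        yield preprocess_string(record[1])
        record = cursor.fetchone()

dictionary = corpora.Dictionary(stream_tokens())
print(dictionary.token2id)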
Example #6
File: lsa.py Project: xkuang/summarizer
def save_word_dict(text):
    proc_text = []

    sentences = text
    sentences = tokenize.sent_tokenize(sentences)

    for sentence in sentences:
        proc_sentence = preprocess_string(sentence)

        if (len(proc_sentence) == 0):
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
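A hedged usage sketch (the input text is made up; the imports mirror what the function needs, and sent_tokenize requires the NLTK 'punkt' data):

from gensim import corpora
from gensim.parsing import preprocess_string
from nltk import tokenize

text = "Summarization starts with sentence splitting. Each sentence is then preprocessed."
dictionary, proc_text, sentences = save_word_dict(text)
# One bag-of-words vector per preprocessed sentence
bow = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in proc_text]
print(bow)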
Example #7
def save_word_dict(text):
  proc_text = []
  
  sentences = text 
  sentences = tokenize.sent_tokenize(sentences)
  
  for sentence in sentences:
    proc_sentence = preprocess_string(sentence)

    if(len(proc_sentence) == 0):
      continue
    proc_text.append(proc_sentence)

  dictionary = corpora.Dictionary(proc_text)
  return [dictionary, proc_text, sentences]
Example #8
    def print_cluster(self, cluster_id_list):
        """Prints the clusters in the given cluster list and performs
        coherence calculation.

        Args:
            cluster_id_list (list): A list of clusters that make up a
                                    coherent chain.
        """
        cluster_content = []
        print("----- Cluster -----")
        for cid in sorted(cluster_id_list):
            cur_cluster = self.clusters[cid - self.cluster_const]
            print(cur_cluster[7])
            print(cur_cluster[8])
            print(utils.load_nyt_by_article_id(str(cur_cluster[9])))
            print("Distance: ", cur_cluster[10])
            article_list = cur_cluster[4].strip("[]").split(", ")
            tmp = ""
            for article_id in article_list:
                res = utils.load_nyt_by_article_id(article_id)
                print(res[0][0] + " # " + res[0][4] + " # " + res[0][2])
                tmp += res[0][0] + " " + res[0][4] + " "
            cluster_content.append(parsing.preprocess_string(
                str(tmp).lower(),
                filters=[parsing.strip_tags,
                         parsing.strip_punctuation,
                         parsing.strip_multiple_whitespaces,
                         parsing.strip_numeric,
                         parsing.remove_stopwords]
            ))
            print()

        # Coherence: the smallest word overlap between consecutive clusters
        cluster_content = [list(set(x)) for x in cluster_content]

        coherence = sys.maxsize
        for i in range(0, len(cluster_content) - 1):
            cnt = 0
            tmp = []
            for word in cluster_content[i]:
                if word in cluster_content[i + 1]:
                    tmp.append(word)
                    cnt += 1
            print(tmp)
            coherence = min(coherence, cnt)

        print("Coherence: " + str(coherence))
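The coherence printed at the end is simply the smallest word overlap between consecutive clusters; a standalone sketch of that calculation on made-up token lists:

cluster_content = [["court", "ruling", "appeal"],
                   ["court", "appeal", "verdict"],
                   ["verdict", "sentence"]]

coherence = min(
    len(set(cur) & set(nxt))
    for cur, nxt in zip(cluster_content, cluster_content[1:])
)
print("Coherence:", coherence)  # 1: the weakest consecutive link shares one word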
Example #9
def save_word_dict(text):
  proc_text = []
  
  sentences = text 
  sentences = tokenize.sent_tokenize(sentences)
  
  for sentence in sentences:
    proc_sentence = preprocess_string(sentence) # ' '.join(preprocess_string(sentence))

    if(len(proc_sentence) == 0):
      continue
    proc_text.append(proc_sentence)

  dictionary = corpora.Dictionary(proc_text)
  # dictionary.save(os.pardir + '/data/text.dict')
  return [dictionary, proc_text, sentences]
Example #10
def save_word_dict(text):
    proc_text = []

    sentences = text
    sentences = tokenize.sent_tokenize(sentences)

    for sentence in sentences:
        proc_sentence = preprocess_string(
            sentence)  # ' '.join(preprocess_string(sentence))

        if (len(proc_sentence) == 0):
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    # dictionary.save(os.pardir + '/data/text.dict')
    return [dictionary, proc_text, sentences]
Example #11
    def __iter__(self):
        db = self.dict_source.db_object
        dbsql = self.dict_source.db_sql
        with db as conn:
            cursor = conn.cursor()
            cursor.execute(dbsql.sql)
            record = cursor.fetchone()
            while record:
                self._length += 1
                document_text = record[dbsql.doc_location]
                pk, title = record[dbsql.unique_key_location], record[dbsql.title_location]

                if self.preprocessor:
                    title = self.preprocessor.clean_text(title)
                    document_text = self.preprocessor.clean_text(document_text)

                self.record_identifiers.append((pk, title))
                tokens = parsing.preprocess_string(document_text)
                yield self._dictionary.doc2bow(tokens)
                record = cursor.fetchone()
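The doc2bow vectors yielded here can be streamed straight into gensim models; a minimal sketch with an in-memory stand-in for the database corpus (the documents are made up):

from gensim import corpora, models
from gensim.parsing import preprocess_string

docs = ["Latent semantic indexing of database rows.",
        "Each record becomes a bag-of-words vector."]
tokens = [preprocess_string(d) for d in docs]
dictionary = corpora.Dictionary(tokens)
bow_corpus = [dictionary.doc2bow(t) for t in tokens]  # same shape as what __iter__ yields

tfidf = models.TfidfModel(bow_corpus)
lsi = models.LsiModel(tfidf[bow_corpus], id2word=dictionary, num_topics=2)
print(lsi[bow_corpus[0]])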
Example #13
    def tokenize_dictionary_content(self, licenses_dict):
        for name, content in licenses_dict.items():
            licenses_dict[name] = parsing.preprocess_string(content)
Example #14
def searchIndbFacebookSaved(search_value):
    for x in "and or it is the a".split():
        # str.replace returns a new string; assign it back or the call has no effect
        search_value = search_value.replace(" " + x + " ", " ")
    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") +
                                    "%"))  #("%" + search_value + "%"))#
    idList = [
        result.order_by(dbFacebookSaved.date)[count - 1].id
        for count in range(result.count(), 0, -1)
    ]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print(".ilike")
    print(idDict)

    stemmer = PorterStemmer()
    search_value = search_value.split()
    search_valueRaw = list(search_value)
    if len(search_value) > 1:
        sumVector = model3['car'] * 0
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print("similarList (sumVector)")
        print(similarList)
        """
        for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.7 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(sum of vec)"
        """
        print("New search value after sumVec:")
        search_value += [
            similarList[i][0] for i in range(min(5, len(similarList))) if
            similarList[i][1] >= 0.72 and similarList[i][0] not in search_value
        ]
        print(search_value)

    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                listLengh = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = " from gensim_word2vec for relating to " if i == 0 else " from fasttext(CBOW) for relating to "
                for i in range(min(listLengh, len(similarList))):
                    if similarList[i][1] >= scoreThreshold and similarList[i][
                            0] not in search_value:
                        search_value.append(similarList[i][0])
                        search_valueR.append(similarList[i][0])
                        print("append " + similarList[i][0] + tempText + searchTerm)
        """
        if searchTerm.lower() in model.vocab:
            similarList = model.most_similar(searchTerm.lower())
            for i in range(min(3,len(similarList))):
                if similarList[i][1] >= 0.5 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from gensim_word2vec for relating to " + searchTerm
        if searchTerm.lower() in model2.vocab:
            similarList = model2.most_similar(searchTerm.lower())
            for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.55 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(CBOW) for relating to " + searchTerm
        """
    """
    print "search_value before stemming:"
    print search_value
    stemmer = PorterStemmer()
    search_value = [stemmer.stem(word) for word in search_value]
    search_value = list(set(search_value))
    search_valueR = [stemmer.stem(word) for word in search_valueR]
    search_valueR = list(set(search_valueR))
    print "search_value bafter stemming:"
    """
    print(search_value)

    for word in search_value:
        if word == stemmer.stem(
                word) or not stemmer.stem(word) in search_value:
            result = dbFacebookSaved.query.filter(
                dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(
                dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(
                dbFacebookSaved.summary.contains(word))
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5

            idList = [
                read_db_data_to_article(
                    result.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(result.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print(".title.contains(" + word + ")")
            print(idDict)

            idList = [
                read_db_data_to_article(
                    resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(resultKwd.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print(".keywords.contains(" + word + ")")
            print(idDict)

            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if not resultSummary.order_by(
                        dbFacebookSaved.date)[count - 1].id in idList and len(
                            preprocess_string(word)) > 0:
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count -
                                                                     1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim helper that normalizes a string, e.g. people -> peopl, Oranges -> orang
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                        if cumsum <= 0.6 and word in w:
                            idDict[article['id']] = idDict.get(
                                article['id'], 0) + 0.2 * weight
                            cumsum = cumsum + 0.2 * weight
            print(".summary.contains(" + word + ")")
            #idDict = adding_weight_to_dict(idDict, idList, 0.2)
            print(idDict)
        else:
            print("ignore " + word + " for " + stemmer.stem(word))
    return idDict
Example #15
# -*- coding: utf-8 -*-
from numpy import *
from sklearn.datasets import fetch_20newsgroups
from gensim.parsing import preprocess_string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from time import time

random.seed(0)

df = fetch_20newsgroups()
N = len(df.data)  # number of training documents

# Preprocessing (Porter stemming, stopword removal, etc.)
corpus = [" ".join(preprocess_string(s)) for s in df.data]

# Use 90% of the documents for training and check whether the classes of the
# remaining 10% are predicted correctly
indices = arange(N)
random.shuffle(indices)
train_indices = indices[:9 * N // 10]
test_indices = indices[9 * N // 10:]

# Fit the feature-vector transformer on the training corpus
vec = CountVectorizer()
vec.fit([corpus[i] for i in train_indices])

# Transform all documents into feature vectors
X = vec.transform(corpus)

# Naive Bayes (Bernoulli)
start = time()
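The scraped script stops here, just before the classifiers are trained; a plausible continuation (an assumption, not the original code) that only uses the imports already present:

y = df.target

clf = BernoulliNB()
clf.fit(X[train_indices], y[train_indices])
print("BernoulliNB accuracy: %.3f (%.1f s)" % (clf.score(X[test_indices], y[test_indices]), time() - start))

# Naive Bayes (multinomial)
start = time()
clf = MultinomialNB()
clf.fit(X[train_indices], y[train_indices])
print("MultinomialNB accuracy: %.3f (%.1f s)" % (clf.score(X[test_indices], y[test_indices]), time() - start))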
Example #16
    def create_query(self, query_content_filepath, corpora_dict, lsi_model):
        query_content = self.get_file_content(query_content_filepath)
        tokenized_content = parsing.preprocess_string(query_content)
        vec_bag_of_words = corpora_dict.doc2bow(tokenized_content)
        vec_lsi = lsi_model[vec_bag_of_words]
        return vec_lsi
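The returned vec_lsi is normally matched against an LSI-transformed corpus; a self-contained sketch of that lookup with a tiny in-memory corpus standing in for the class's own dictionary and model (all data is illustrative):

from gensim import corpora, models, similarities
from gensim.parsing import preprocess_string

docs = ["open source license text", "proprietary license agreement", "cooking recipes"]
tokens = [preprocess_string(d) for d in docs]
dictionary = corpora.Dictionary(tokens)
bow = [dictionary.doc2bow(t) for t in tokens]
lsi = models.LsiModel(bow, id2word=dictionary, num_topics=2)

# Equivalent of create_query(), inlined for a raw string instead of a file path
query_vec = lsi[dictionary.doc2bow(preprocess_string("permissive open source license"))]
index = similarities.MatrixSimilarity(lsi[bow])
print(sorted(enumerate(index[query_vec]), key=lambda item: -item[1]))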
Example #17
    def process_articles(self):
        final_articles = []
        for article in self.articles:
            final_articles.append(parsing.preprocess_string(article))
        return final_articles
Example #18
def normalize_with_gensim(text):
    custom_filters = [strip_multiple_whitespaces,
                      strip_numeric, strip_punctuation, strip_short]
    text = preprocess_string(to_unicode(text).lower(), custom_filters)
    text = [word for word in text if word not in stopwords.words('indonesian')]
    return text, len(text)
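normalize_with_gensim depends on several names it does not import; a sketch of those imports and a sample call (the Indonesian sentence is illustrative, and the NLTK stopwords corpus must be downloaded first via nltk.download('stopwords')):

from gensim.parsing.preprocessing import (preprocess_string, strip_multiple_whitespaces,
                                          strip_numeric, strip_punctuation, strip_short)
from gensim.utils import to_unicode
from nltk.corpus import stopwords

tokens, n_tokens = normalize_with_gensim("Saya sedang membaca 3 buku di perpustakaan.")
print(tokens, n_tokens)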
Example #19
def preprocess_new_document(doc):
    return preprocess_string(doc)
Example #22
def find_similar(doc, model, *args, **kwargs):
    cleaned_doc = preprocess_string(doc)
    inferred_vector = model.infer_vector(cleaned_doc)
    sims = model.docvecs.most_similar([inferred_vector], **kwargs)
    return sims
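find_similar assumes an already trained Doc2Vec model; a minimal end-to-end sketch (the tiny corpus is made up, and model.docvecs is the gensim 3.x name that 4.x aliases to model.dv):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing import preprocess_string

corpus = [TaggedDocument(preprocess_string(text), [i]) for i, text in enumerate([
    "Gensim infers vectors for unseen documents.",
    "Document similarity works on inferred vectors.",
    "A completely unrelated sentence about cooking."])]
model = Doc2Vec(corpus, vector_size=20, min_count=1, epochs=40)

# Documents most similar to a new, unseen piece of text
print(find_similar("Inferring a vector for a new document.", model, topn=2))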
Example #23
    def encode(self, texts, seq_length):
        processed_texts = [preprocess_string(text) for text in texts]
        encoded_texts = []
        for text in processed_texts:
            encoded_texts.append([self.vocab.get(w, 0) for w in text])
        return self._pad_features(encoded_texts, seq_length)
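encode belongs to a class whose vocab and _pad_features are not shown; a hedged sketch of the intended flow, reusing _get_vocabulary from Example #2 and assuming a simple left-padding/truncation scheme in place of _pad_features:

from gensim.parsing.preprocessing import preprocess_string

texts = ["Dogs chase cats.", "Cats sleep most of the day in the sun."]
vocab = _get_vocabulary(texts)  # word -> int, with 0 reserved for unknown/padding

encoded = [[vocab.get(w, 0) for w in preprocess_string(t)] for t in texts]
seq_length = 5
# Assumed padding behaviour: left-pad with zeros and keep the last seq_length ids
padded = [([0] * seq_length + ids)[-seq_length:] for ids in encoded]
print(padded)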
Example #25
import os

import numpy as np
import pandas as pd
from gensim.parsing import preprocess_string
from sklearn.preprocessing import LabelEncoder

data_path = '.'
rating = pd.read_feather(os.path.join(data_path, 'ratings.feather'))
print(rating.iidx.min())
user_num, item_num = rating.uidx.max() + 1, rating.iidx.max() + 1

word_set = set()
genre_set = set()
data = []
with open(os.path.join(data_path, 'movies.dat')) as f:
    for line in f:
        iidx_raw, title_raw, genre_raw = line.strip().split('::')
        iidx = int(iidx_raw)
        # Tokenize the movie title (the original snippet passed genre_raw here,
        # which would make bow_title duplicate bow_genre)
        title_feat = preprocess_string(title_raw)
        word_set.update(title_feat)
        genre_list = genre_raw.strip().split('|')
        genre_set.update(genre_list)

        data.append((iidx, title_feat, genre_list))

word_encoder = LabelEncoder().fit(list(word_set))
genre_encoder = LabelEncoder().fit(list(genre_set))

bow_title = np.zeros((item_num, len(word_set)))
bow_genre = np.zeros((item_num, len(genre_set)))
for iidx, word_list, genre_list in data:
    word_idx_list = word_encoder.transform(word_list)
    genre_idx_list = genre_encoder.transform(genre_list)
    bow_title[iidx, word_idx_list] += 1
Example #26
    def __iter__(self):
        for url, doc in scrape(self.testing):
            yield doc2vec.TaggedDocument(preprocess_string(doc), [url])
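This iterator is designed to feed Doc2Vec training directly; a hedged sketch of that use (scrape() and the surrounding class are the project's own, so a small in-memory list of TaggedDocuments stands in for them):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing import preprocess_string

documents = [
    TaggedDocument(preprocess_string("First scraped page about streaming corpora."), ["http://example.com/a"]),
    TaggedDocument(preprocess_string("Second scraped page about document vectors."), ["http://example.com/b"]),
]
model = Doc2Vec(documents, vector_size=20, min_count=1, epochs=40)
# Each document vector is keyed by its URL tag (model.dv in gensim 4.x, model.docvecs in 3.x)
print(model.dv["http://example.com/a"][:5])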