def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    if not opts.corpus_file or not opts.model_file:
        parser.error("--corpus-file and --model-file are required")

    corpus_fn = opts.corpus_file
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        tfidf_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=tfidf_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
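For reference, a minimal sketch of reusing the artifacts this script writes; it assumes gensim is installed and uses hypothetical file names ('vocab.dict', 'model.tfidf', 'model.lsi') standing in for whatever was passed to -v, -t and -m:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel

vocab = Dictionary.load('vocab.dict')        # hypothetical path passed via -v
tfidf = TfidfModel.load('model.tfidf')       # hypothetical path passed via -t
lsi = LsiModel.load('model.lsi')             # hypothetical path passed via -m

tokens = ['the', 'he_', 'e_q']               # already sublexicalized character n-grams
bow = vocab.doc2bow(tokens)                  # sparse (token_id, count) pairs
print(lsi[tfidf[bow]])                       # list of (topic_id, weight) pairs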
Example #2
def save_dictionary(
    dic: corpora.Dictionary,
    filename: str
) -> None:
    dic.save(filename)
    print("saved dictionary: {} items to {}".format(
        len(dic.values()), filename
    ))
Example #3
def bag_of_words(lemma):
    "Takes in lemmatised words and returns a bow."
    # Create bag of words from dictionnary
    dictionary = Dictionary(lemma)
    dictionary.save('text.dict')
    # Term frequency–inverse document frequency (TF-IDF)
    bow = [dictionary.doc2bow(l) for l in lemma] # Calculates inverse document counts for all terms
    return (bow, dictionary)
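The comments above mention TF-IDF, but the function only returns raw counts; a hedged follow-up sketch of the weighting step using gensim's TfidfModel:

from gensim.models import TfidfModel

bow, dictionary = bag_of_words(lemma)        # 'lemma' is the caller's list of token lists
tfidf = TfidfModel(bow)                      # fit IDF statistics on the bow corpus
weighted = [tfidf[doc] for doc in bow]       # (token_id, tf-idf weight) pairs per document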
		def fetch_dict():
			global dictionary
			dictionary=Dictionary([i for i in my_dictionary])
			once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
			dictionary.filter_tokens(once_ids)
			dictionary.compactify()
			dictionary.save("Topic/dic.loc")
			return dictionary
Example #5
def create_dictionary(doc_iterator, dict_file, as_text=False):
    """
    Creates a gensim.corpora.Dictionary object from given document iterator 
    and serializes it to given dict_file (filename) in a memory efficient way.
    @Params:
      as_text   - flag: dictionary saved as text (default: binary)
    """    
    d = Dictionary(doc.strip().lower().split() for doc in doc_iterator)
    if as_text:
        d.save_as_text(dict_file)
    else:
        d.save(dict_file)
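A loading counterpart, assuming hypothetical file names; the loader has to match the format chosen above (gensim provides both):

from gensim.corpora import Dictionary

d_binary = Dictionary.load('my.dict')                # matches d.save(...)
d_text = Dictionary.load_from_text('my.dict.txt')    # matches d.save_as_text(...)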
Example #6
def build_corpora(db):
    dictionary = Dictionary()
    corpus = []
    # First pass: build the vocabulary (the doc2bow return value is discarded here)
    for article in db.articles.find():
        text = article['clean_text']
        dictionary.doc2bow(text, allow_update=True)
    dictionary.filter_extremes()
    # Second pass: convert each article to its bag-of-words representation
    for article in db.articles.find():
        text = article['clean_text']
        corpus.append(dictionary.doc2bow(text))
    gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus)
    dictionary.save('data/cnn.dict')
    return corpus, dictionary
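A short sketch of reading those artifacts back (standard gensim API; the paths mirror the ones used above):

import gensim
from gensim.corpora import Dictionary

corpus = gensim.corpora.MmCorpus('data/corpus.mm')   # streamed from disk, memory-friendly
dictionary = Dictionary.load('data/cnn.dict')
print(len(dictionary), 'tokens,', len(corpus), 'documents')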
	def fetch_dict():
		print "Fetching Dictionary...",
		try:
			dictionary=Dictionary().load("Topic/dic.tm")
			print "Dictionary loaded!"
		except IOError:
			print "Dictionary not found, building Dictionary..."
			dictionary=Dictionary(i for i in MyDictionary())
			once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
			dictionary.filter_tokens(once_ids)
			dictionary.compactify() 
			print "\rDictionary Built!"
			print dictionary
			dictionary.save("Topic/dic.tm")
		return dictionary
Example #8
def run():
  try:
    print "starting to build LSI Model"

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print "number_of_documents:", number_of_documents

    stopwords = []
    stopwords += [month.lower() for month in month_to_number.keys()]
    stopwords += nltk_stopwords.words('english')
    print "stopwords:", len(stopwords)
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    texts = [
        [
            word
            for word in document.lower().replace("#", " ").replace("_", " ")
            .replace("(", " ").replace(")", " ").replace("/", " ")
            .replace(":", " ").replace(".", " ").split()
            if word not in stopwords and len(word) > 3
        ]
        for document in documents
    ]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    dictionary = Dictionary(texts)
    print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print "corpus:", type(corpus)

    print "generating lsi model"
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print "saving LSI model"
    lsi.save(path_to_directory_of_this_file + "/model")

    Topic.objects.all().delete()
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)

  except Exception as e:
    print e
Example #9
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')

    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')

    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn + '.tfidf')
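To reuse the compressed vocabulary later, one option is to decompress it and load it with the normal gensim loader; a hedged sketch with a hypothetical output name 'vocab':

import bz2
from shutil import copyfileobj
from gensim.corpora import Dictionary

with bz2.BZ2File('vocab.bz2', 'rb') as src, open('vocab', 'wb') as dst:
    copyfileobj(src, dst)
vocab = Dictionary.load('vocab')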
Example #10
def prepare_data():
    # returns the corpus object required by learn
    # skips datasets/dspace/2481.json
    base = 'datasets/dspace'
    documents = []
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path) as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                documents.append(words)

    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=5, no_above=0.3)
    dictionary.save('lda.dict')
    corpus = map(dictionary.doc2bow, documents)
    return corpus
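A hedged sketch of the 'learn' step the comment alludes to: train an LDA model on the returned corpus with the dictionary saved above (num_topics and passes are illustrative):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

corpus = list(prepare_data())            # materialize; in Python 3, map() is lazy
dictionary = Dictionary.load('lda.dict')
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=5)
lda.save('lda.model')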
Example #11
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonal mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only. So for a particular topic modeling task,
    it is a lighter option to install and run. Also it can be run distributed and updated over an existing model

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
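As an aside, the string parsing above is brittle; in recent gensim versions, show_topic() returns (word, probability) pairs directly, so a hedged alternative helper looks like this:

def get_topic_words(lda, n_topics, topn=10):
    # no splitting on '*' and '+' needed: show_topic() already returns pairs
    return [lda.show_topic(topic_id, topn=topn) for topic_id in range(n_topics)]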
Example #12
    def _build_model(self, all_documents, remove_once=False):
        '''
        Builds the LSA model

        Returns:
            dictionary, corpus, lsi model
        '''
        doc_hash = hash_obj(all_documents)
        corp_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_corp_' + str(int(remove_once))
        dic_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_dic_' + str(int(remove_once))
        lsi_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_lsi_' + str(int(remove_once))
        if os.path.exists(corp_cache_path) \
                and os.path.exists(dic_cache_path)\
                and os.path.exists(lsi_cache_path):
            lsi = models.LsiModel.load(lsi_cache_path)
            corp = MmCorpus(corp_cache_path)
            dic = Dictionary.load(dic_cache_path)
        else:
            texts = [self.tokenize(doc) for doc in all_documents]
            all_tokens = sum(texts, [])
            if remove_once:
                tokens_once = set(word for word in set(all_tokens)
                                  if all_tokens.count(word) == 1)
                texts = [[word for word in text if word not in tokens_once]
                         for text in texts]
            dic = Dictionary(texts)
            corp = [dic.doc2bow(text) for text in texts]

            MmCorpus.serialize(corp_cache_path, corp)
            dic.save(dic_cache_path)
            lsi = models.LsiModel(
                corp, id2word=dic, num_topics=20)
            lsi.save(lsi_cache_path)
        return dic, corp, lsi
Example #13
class TfidfVectorizer():
    """
    Transform text to tf-idf representation
    """

    def __init__(self):

        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """

        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return (self.tf_idf_model[bag_of_words])

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document
        """

        self.load_data()

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            len(self.tf_idf_model.idfs)
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """

        self.load_data()

        input_matrix_sparse = [
            self.doc2vector(x)
            for x in documents
        ]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
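A hedged usage sketch of the class above, assuming `documents` is a list of raw Nepali strings and that NepStemmer is importable:

vectorizer = TfidfVectorizer()
vectorizer.construct_model(documents)            # builds and saves dictionary + TF-IDF model
matrix = vectorizer.obtain_feature_matrix(documents)
print(matrix.shape)                              # (n_documents, n_features)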
Example #14
class Builder(object):
    def __init__(self,
                 ndocs,
                 phrase_min_count=5,
                 vocabulary_size=10000,
                 bigram_min_count=5,
                 bigram_threshold=10,
                 trigram_min_count=5,
                 trigram_threshold=10,
                 substitutions=dict(),
                 data_directory='./data',
                 model_directory='./model'):
        self.ndocs = ndocs
        self.phrase_min_count = phrase_min_count
        self.vocabulary_size = vocabulary_size
        self.bigram_min_count = bigram_min_count
        self.bigram_threshold = bigram_threshold
        self.trigram_min_count = trigram_min_count
        self.trigram_threshold = trigram_threshold
        self.substitutions = substitutions
        self.data_directory = data_directory
        self.model_directory = model_directory
        self.load_bad_phrases()

    def tokenize(self, text):
        return [token.lower() for token in word_tokenize(text)]

    def stream_sentences(self, texts, description="Streaming sentences ..."):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description(description)
            for text in pbar:
                for sentence in sent_tokenize(text):
                    yield self.tokenize(sentence)

    def load_bad_phrases(self):
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='r',
                  encoding='UTF-8') as fp:
            self.bad_phrases = set(
                [phrase.strip() for phrase in fp.readlines()])

    def add_bad_phrase(self, phrase):
        self.bad_phrases.add(phrase)

    def save_bad_phrases(self):
        bad_phrases = list(self.bad_phrases)
        bad_phrases.sort()
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='w',
                  encoding='UTF-8') as fp:
            for phrase in bad_phrases:
                fp.write("%s\n" % phrase)

    def train_phrasers(self, texts):
        bigrams = Phrases(self.stream_sentences(
            texts, description="Streaming text for bigram phraser  ..."),
                          min_count=self.bigram_min_count,
                          threshold=self.bigram_threshold)
        #print("Training bigram phraser ...")
        self.bigram_phraser = Phraser(bigrams)

        #print("Collecting trigrams ...")
        trigrams = Phrases(self.bigram_phraser[self.stream_sentences(
            texts, description="Streaming text for trigram phraser ...")],
                           min_count=self.trigram_min_count,
                           threshold=self.trigram_threshold)
        #print("Training trigram phraser ...")
        self.trigram_phraser = Phraser(trigrams)

    def save_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser.save(path)

        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser.save(path)

    def load_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser = Phraser.load(path)

        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser = Phraser.load(path)

    def prepare_text(self, text):
        for key, value in self.substitutions.items():
            text = text.replace(key, value)
        tokens = self.tokenize(text)
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        return [token for token in tokens if not token in self.bad_phrases]

    def prepare_texts(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Preparing texts ...")
            prepared_texts = [self.prepare_text(text) for text in pbar]
        return prepared_texts

    def keep_phrase(self, phrase, cnt):
        if "'" in phrase: return False
        for c in PUNCTUATION:
            if c in phrase: return False
        if phrase in self.bad_phrases: return False
        phrase_set = set(phrase)
        if SYMBOLS & phrase_set: return False
        if (LETTERS & set(phrase)) and cnt > self.phrase_min_count: return True
        return False

    def build_vocabulary(self, texts, save=False):
        self.ndocs = len(texts)
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building vocabulary over %d documents." %
                                 self.ndocs)
            phrase_map = {}
            for document in pbar:
                for phrase in document:
                    if not phrase in phrase_map: phrase_map[phrase] = 0
                    phrase_map[phrase] += 1
        phrases = list(phrase_map.keys())
        phrases = sorted(phrases, key=lambda phrase: -phrase_map[phrase])

        vocabulary = [
            phrase for phrase in phrases
            if self.keep_phrase(phrase, phrase_map[phrase])
        ]

        hyphenated = {
            phrase.replace('-', '_')
            for phrase in vocabulary if "-" in phrase
        }
        vocabulary = [
            phrase for phrase in vocabulary if not phrase in hyphenated
        ][:self.vocabulary_size]
        if save:
            path = os.path.join(
                self.data_directory, "vocabulary-%d-%d-%d.tsv" %
                (len(texts), self.phrase_min_count, self.vocabulary_size))
            fp = open(path, mode='w', encoding='UTF-8')
            for phrase in vocabulary:
                fp.write("%s\t%d\n" % (phrase, phrase_map[phrase]))
            fp.close()
        self.vocabulary = set(vocabulary)

    def load_vocabulary(self):
        path = os.path.join(
            self.data_directory, "vocabulary-%d-%d-%d.tsv" %
            (self.ndocs, self.phrase_min_count, self.vocabulary_size))
        fp = open(path, mode='r', encoding='UTF-8')
        self.vocabulary = set([])
        for line in fp:
            line = line.strip()
            if line:
                phrase, cnt = line.split('\t')
                self.vocabulary.add(phrase)
        fp.close()

    def build_document(self, text):
        return [phrase for phrase in text if phrase in self.vocabulary]

    def build_corpus(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building corpus ...")
            corpus = [self.build_document(text) for text in pbar]
        return corpus

    def build_dictionary(self, corpus, save=False):
        self.dictionary = Dictionary(corpus)
        self.dictionary.filter_extremes(no_below=self.phrase_min_count,
                                        no_above=0.6,
                                        keep_n=self.vocabulary_size)
        if save: self.save_dictionary()

    def save_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary.save(path)

    def load_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary = Dictionary.load(path)

    def encode_corpus(self, corpus):
        return [self.dictionary.doc2bow(document) for document in corpus]
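A hedged end-to-end sketch of the Builder workflow above, assuming `texts` is a list of raw document strings and that ./data/bad-phrases.txt and the model directory already exist:

builder = Builder(ndocs=len(texts))
builder.train_phrasers(texts)                    # fit bigram and trigram phrasers
builder.save_phrasers()
prepared = builder.prepare_texts(texts)          # tokenized, phrased, bad phrases removed
builder.build_vocabulary(prepared, save=True)
corpus = builder.build_corpus(prepared)          # keep only in-vocabulary phrases
builder.build_dictionary(corpus, save=True)
bow = builder.encode_corpus(corpus)              # list of doc2bow vectors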
Example #15
#             docs[idx].append(token)

# stem the token
ps = PorterStemmer()
docs = [[ps.stem(token) for token in doc] for doc in docs]

# Remove rare and common tokens.
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 1000 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=1000, no_above=0.5)

# path for saved dictionary: /Users/rachelzheng/opt/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/dict-www-cnndm
# saved /home/rachelzheng/www/venv/lib/python3.6/site-packages/gensim/test/test_data/dict-www-cnndm
dictionary.save(datapath('dict-www-cnndm-unigram'))
# dictionary = Dictionary.load(datapath('dict-www-cnndm'))

# Bag-of-words representation of the documents.
# corpus = [dictionary.doc2bow(doc) for doc in docs]

# Number of unique tokens: 88978 - plus bigram
# Number of unique tokens: 44984 - unigram - 20 documents
# Number of unique tokens: 21185 - unigram - 100 documents
# Number of unique tokens: 6439 - unigram - 1000 documents
# Number of documents: 287113
print('Number of unique tokens: %d' % len(dictionary))
# print('Number of documents: %d' % len(corpus))

# Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
Example #16
        except KeyError as e:
            print(e)
            continue

        # replace encoded characters
        for code, value in replace_dict.items():
            text = text.replace(code, value).lower().strip()
            keywords = [
                kw.replace(code, value).lower().strip() for kw in keywords
            ]
            abstract = abstract.replace(code, value).lower().strip()
            title = title.replace(code, value).lower().strip()
        text = title + abstract + text
        doc = nlp(text)

        # clean:
        doc = [
            token.lemma_ for token in doc
            if not token.is_stop and len(token.text) > 2 and not token.is_currency
            and not token.is_punct and not token.is_digit
        ]
        documents.append(doc)

print("create dict")
dct = Dictionary(documents)
dct.save("../data/models/tfidf/dictionary.model")

print("create tfidf")
model = TfidfModel(dictionary=dct)
model.save("../data/models/tfidf/tfidf.model")
Example #17
def build_lda_model(tokens_tags,
                    pos_tags,
                    use_nouns=True,
                    use_verbs=True,
                    use_all=False,
                    num_of_topics=8,
                    passes=25,
                    verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(
                data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs,
                                                   use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        Dictionary.save(dictionary, dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus,
                             num_topics=num_of_topics,
                             passes=passes,
                             id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min" %
              ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic]
                               for topic_id, topic in lda_topics]
            print([t for t in lda_topics_list])

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    if verbose:
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([
                dictionary[word_id]
                for (word_id,
                     prob) in lda_model.get_topic_terms(topic_id, topn=10)
            ])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(
                mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1
    return dictionary, mm_corpus, lda_model
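A hedged usage sketch of build_lda_model(), assuming `tokens_tags` and `pos_tags` are parallel lists of whitespace-joined token and POS-tag strings, one entry per tweet:

dictionary, corpus, lda = build_lda_model(tokens_tags, pos_tags,
                                          use_nouns=True, use_verbs=True,
                                          num_of_topics=8, passes=25,
                                          verbose=False)
print(lda.print_topics(num_topics=8, num_words=10))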
Example #18
def run_tm(topics, below, above, chunksize, passes, iterations):

    m, valid = arevalid(topics, below, above, chunksize, passes, iterations)
    if not valid:

        fehlerfenster = Toplevel()
        fehlerfenster.title('Fehler')
        fehlerfenster.geometry('300x300')
        # Label mit der Fehlermeldung
        labelfehler = Label(master=fehlerfenster, text=m)
        labelfehler.place(x=10, y=10, width=300, height=300)

    else:

        with open('../data/docs', 'rb') as f:
            docs = pickle.load(f)

        tweet_dictionary = Dictionary(docs)
        tweet_dictionary.filter_extremes(no_below=int(below),
                                         no_above=float(above))
        tweet_dictionary.save('../data/tweet_dictionary')

        ngram_docs = ngrams(input_docs=docs)
        corpus = make_bow_corpus(tweet_dictionary, ngram_docs)
        with open('../data/bow_corpus', 'wb') as f:
            pickle.dump(corpus, f)
        print('Number of unique tokens: %d' % len(tweet_dictionary))
        print('Number of documents: %d' % len(corpus))
        """Training parameters."""
        num_topics = int(
            topics
        )  # Number of topics, here relatively low so we can interpret them more easily -> can be set higher
        chunk_size = int(
            chunksize
        )  # Numbers of documents fed into the training algorithm (we have 7)
        passes = int(passes)  # Number of times trained on the entire corpus
        iterations = int(iterations)  # Number of loops over each document
        eval_every = None  # Don't evaluate model perplexity, takes too much time.
        """ Make a index to word dictionary."""
        temp = tweet_dictionary[0]  # This is only to "load" the dictionary.
        id2word = tweet_dictionary.id2token
        """Create model
        We set alpha = 'auto' and eta = 'auto'. Again this is somewhat technical, but essentially we are automatically learning
        two parameters in the model that we usually would have to specify explicitly."""
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunk_size,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)
        model_file = '../data/model/LDA_model_v1'
        model.save(model_file)
        """ Tests """
        # Top topics; num_words defaults to 20, input is our corpus in BoW format
        top_topics = model.top_topics(corpus)

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        """Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring 
        words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and 
        topics that are artifacts of statistical inference """
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)

        pprint(top_topics)
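The average coherence printed above is derived from top_topics(); as a cross-check, gensim's CoherenceModel can score the trained model directly. A hedged sketch, assuming the variables from run_tm (model, ngram_docs, tweet_dictionary) are in scope:

from gensim.models import CoherenceModel

cm = CoherenceModel(model=model, texts=ngram_docs, dictionary=tweet_dictionary,
                    coherence='c_v')
print('c_v coherence: %.4f' % cm.get_coherence())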
Example #19
mecab = MeCab.Tagger("-Owakati")

# Words to exclude from the dictionary
words_blacklist = [
    ">>",  # chat annotation marker
    "some_agent",
    "\u3000",  # full-width space
    "。",
    "、",
]

dct = Dictionary()
# Read the CSV file
df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"])
# Tokenize each sentence with MeCab -> split on half-width spaces -> drop the last element (the trailing newline)
wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1])
# Add the tokenized sentences to the dictionary
dct.add_documents(wakati_df)

# Get the ids of the blacklisted words within the dictionary
words_blacklist_id = dct.doc2idx(words_blacklist)
# Remove them from the dictionary
dct.filter_tokens(bad_ids=words_blacklist_id)
#dct.filter_n_most_frequent(600)

# Save the dictionary
dct.save(os.path.join(filedir, ".".join([filename, "dict"])))

# Print the dictionary contents and the vocabulary size
print(dct.token2id)
print(len(dct.token2id))
Example #20
            print("Number of distinct users:", len(users))
            print("Number of tweets total:", db_collection.count())

            print(
                "Concatenate all tweets of one person in one month to one single document..."
            )
            tweet_docs, tweet_docs_prep = concatenateTweetsOfMonthToDoc(
                users, db_collection, format="mongoDB")
            print("Number of documents (concatenated tweets): ",
                  len(tweet_docs))

        print("Create dictionary...")
        dictionary = Dictionary(tweet_docs_prep)
        print("Save dictionary (nTokens={}) to file {}...".format(
            len(dictionary.values()), path_save_dict))
        dictionary.save(path_save_dict)

        print("Create bag of words...")
        corpus = [dictionary.doc2bow(text) for text in tweet_docs_prep]

        list_num_topics = args.numberTopics.replace(" ", "").split(
            ",")  #[16, 20, 22, 24, 26, 28, 30]
        print("Calculate LDA...")
        lmlist, c_v, logPerplex = calculateLDA(dictionary=dictionary,
                                               corpus=corpus,
                                               texts=tweet_docs_prep,
                                               list_num_topics=list_num_topics,
                                               saveModelPath=path_save_LDA)

#        print("Plot...")
#        plt.figure()
Example #21
class VAEEmbeddingsSearchEngine(SmartSearchEngine):
    #
    # Uses a Keras model as the base to compute document similarity

    def __init__(self):
        super(VAEEmbeddingsSearchEngine, self).__init__()
        self._service_array = []
        self._index = None
        self._corpus = None
        self._train_model = False
        self._load_wmd = False
        self._preprocessor = StringPreprocessor('english.long')

    def load_configuration(self, configuration_file):
        super(VAEEmbeddingsSearchEngine,
              self).load_configuration(configuration_file)
        config = configparser.ConfigParser()
        config.read(configuration_file)
        latent_dim = config.getint('RegistryConfigurations', 'latent_dim')
        intermediate_dim = config.getint('RegistryConfigurations',
                                         'intermediate_dim')
        batch_size = config.getint('RegistryConfigurations', 'batch_size')
        epochs = config.getint('RegistryConfigurations', 'epochs')
        learning_rate = config.getfloat('RegistryConfigurations',
                                        'learning_rate')
        epsilon_std = config.getfloat('RegistryConfigurations', 'epsilon_std')
        self._precomputed_vectors_path = config.get(
            'RegistryConfigurations', 'precomputed_vectors_path')
        if config.get('RegistryConfigurations',
                      'load_wmd_model').lower() == 'true':
            self._load_wmd = True
        if config.get('RegistryConfigurations',
                      'train_model').lower() == 'true':
            self._train_model = True
            if config.get('RegistryConfigurations',
                          'reproducible').lower() == 'true':
                self._model = VAEWasserstein(latent_dim,
                                             intermediate_dim,
                                             epsilon_std,
                                             batch_size,
                                             epochs,
                                             learning_rate,
                                             reproducible=True)
            else:
                self._model = VAEWasserstein(latent_dim, intermediate_dim,
                                             epsilon_std, batch_size, epochs,
                                             learning_rate)
        else:
            self._model = VAEWasserstein()
            self._model.load('models/vae.h5')
            self._vectorizer = Dictionary.load('models/vectorizer.npy')

    def _doc_to_nbow(self, document):
        vocab_len = len(self._vectorizer)
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = self._vectorizer.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    def _corpus_to_nbow(self, documents):
        corpus = np.zeros((len(documents), len(self._vectorizer)))
        for i in range(len(documents)):
            corpus[i, :] = self._doc_to_nbow(documents[i])
        return corpus

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        words = bag_of_words.get_words_list()
        return self._preprocessor(words)

    def _save_obj(self, obj, name):
        with open('models/' + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def _load_obj(self, name):
        with open('models/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    def _create_filter_vocab(self, documents, vocab):
        filter_set = set()
        for document in documents:
            for word in document:
                if word not in vocab:
                    filter_set.add(word)
        return filter_set

    def _after_publish(self, documents):
        if self._train_model:
            filter_set = None
            if self._load_wmd:
                filter_set = self._load_obj('word_filter')
                documents = [[x for x in document if x not in filter_set]
                             for document in documents]
                self._vectorizer = Dictionary(documents)
                distance = WMDDistance.load('models/distances.npy',
                                            self._vectorizer)
            else:
                self._word2vec_model = models.KeyedVectors.load_word2vec_format(
                    self._precomputed_vectors_path, binary=False)
                self._word2vec_model.init_sims(replace=True)
                filter_set = self._create_filter_vocab(
                    documents, self._word2vec_model.vocab)
                self._save_obj(filter_set, 'word_filter')
                documents = [[x for x in document if x not in filter_set]
                             for document in documents]
                self._vectorizer = Dictionary(documents)
                distance = WMDDistance(self._vectorizer, self._word2vec_model)
                distance.save('models/distances')
            X = self._corpus_to_nbow(documents)
            self._vectorizer.save(open('models/vectorizer.npy', 'wb'))
            X_train, X_test, _, _ = train_test_split(X,
                                                     np.zeros(X.shape),
                                                     test_size=0.33,
                                                     random_state=23)
            print(X_train)
            print(X_test)
            self._model.train(X_train, X_test, distance.get_distances())
            self._model.save('models/vae.h5')
        else:
            filter_set = self._load_obj('word_filter')
            documents = [[x for x in document if x not in filter_set]
                         for document in documents]
            X = self._corpus_to_nbow(documents)
        self._index = self._model.transform(X)

    def publish(self, service):
        pass

    def find(self, query):
        query = StringTransformer().transform(query)
        query_vector = self._doc_to_nbow(
            self._query_transformer.transform(query).get_words_list())
        query_vector = np.expand_dims(query_vector, axis=0)
        query_vae = self._model.transform(query_vector)
        results = cosine_similarity(query_vae, self._index)
        results = sorted(enumerate(results[0]), key=lambda item: -item[1])
        result_list = []
        for tuple_result in results:
            result_list.append(self._service_array[tuple_result[0]])
        return result_list

    def number_of_services(self):
        pass
Example #22
class Preprocessor(object):
    def __init__(self):
        self.text_dict = Dictionary()
        self.label_dict = Dictionary()

    def SentencesToVectors(self, splited_sentences):
        vec_sentences = []
        for splited_sentence in tqdm(splited_sentences):
            vec_sentences.append([
                self.text_dict.token2id.get(word, 0)
                for word in splited_sentence
            ])
        return vec_sentences

    def LabelsToVectors(self, splited_labels):
        vec_labels = [
            self.label_dict.token2id[label] for label in splited_labels
        ]
        return vec_labels

    def SaveDicts(self):
        self.text_dict.save(CURRENT_MAIN_PATH + "/dicts/text_dict.dict")
        self.label_dict.save(CURRENT_MAIN_PATH + "/dicts/label_dict.dict")

    def LoadDicts(self):
        self.text_dict = self.text_dict.load(CURRENT_MAIN_PATH +
                                             '/dicts/text_dict.dict')
        self.label_dict = self.label_dict.load(CURRENT_MAIN_PATH +
                                               '/dicts/label_dict.dict')

    def SaveTrainingData(self, vec_sentences, vec_labels):
        np.savez(CURRENT_MAIN_PATH + "/npz_data/train.npz",
                 x_data=vec_sentences,
                 y_data=vec_labels)

    def SaveTestingData(self, vec_sentences):
        np.savez(CURRENT_MAIN_PATH + "/npz_data/test.npz",
                 x_data=vec_sentences)

    def PreprocessTSV(self, mode='train'):
        if mode == 'train':
            filepath = CURRENT_MAIN_PATH + '/raw_data/train.tsv'
            raw_data = pandas.read_csv(filepath, sep='\t', engine='c')
            raw_sentences = raw_data.iloc[:, 0]
            raw_triples = raw_data.iloc[:, 1]
            # Split by jieba
            print("Spliting sentences and labels")
            splited_sentences = [
                Util.GetPreProcessedSentence(raw_sentence)
                for raw_sentence in tqdm(raw_sentences)
            ]
            splited_labels = [
                Util.GetPreProcessedLabels(raw_triple)
                for raw_triple in tqdm(raw_triples)
            ]
            # Add words to text dictionary and triple-labels dictionary
            self.text_dict.add_documents(splited_sentences)
            self.label_dict.add_documents([[label]
                                           for label in splited_labels])
            # Transform sentences to vectors
            vec_sentences = self.SentencesToVectors(splited_sentences)
            # Transform labels to vectors
            vec_labels = self.LabelsToVectors(splited_labels)
            # Save Dictionaries and training data
            self.SaveDicts()
            self.SaveTrainingData(vec_sentences, vec_labels)
        elif mode == 'test':
            filepath = CURRENT_MAIN_PATH + '/raw_data/test.tsv'
            raw_data = pandas.read_csv(filepath, sep='\t', engine='c')
            raw_sentences = raw_data.iloc[:, 0]
            splited_sentences = [
                Util.GetPreProcessedSentence(raw_sentence)
                for raw_sentence in tqdm(raw_sentences)
            ]
            # Load Dictionaries
            self.LoadDicts()
            # Transform sentences to vectors
            vec_sentences = self.SentencesToVectors(splited_sentences)
            # Save testing data
            self.SaveTestingData(vec_sentences)
Example #23
data_file = sys.argv[1]
dest_file_prefix = sys.argv[2]
epoch = int(sys.argv[3])
batch = int(sys.argv[4])
n_hidden = int(sys.argv[5])

BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

a_maxlen = df['answer'].map(len).max() + 2

ans = df['answer'].map(lambda a: f'{BOS}{a}{EOS}')

dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(list(d))], np.zeros(
        (size - len(d), len(dic)))))

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]
sum_one_hot = lambda s: np.add.reduce(one_hot(s))

x1 = np.array([sum_one_hot(q) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(shape=(len(dic), ))
enc_outputs = Dense(n_hidden, activation='relu')(enc_inputs)
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    limit = None
    if opts.limit:
        limit = int(opts.limit)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please specify the Elasticsearch index to fetch from using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #25
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #26
def train_dictionary(corpus, fname):
    make_folder_tree(fname)
    # Corpus will be examined by unique symbols
    genes = Dictionary(no_syns[by_token[corpus]], prune_at=None)
    genes.save(fname)
Example #27
class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self,
                 model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock

        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.iteritems():
            doc_topics_map[document_id] = self.topics_for_document(
                document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        topics = self._lda_model[bow_tokens]
        return topics

    def build_topics(self, tokens_list):
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus,
                                   id2word=self._dictionary,
                                   num_topics=self.num_topics,
                                   passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        self.save_model_lock.acquire()
        self.model_folder_lock.acquire()
        if mock_datastruct: mock_datastruct.acquire()
        if sleep_for_test:
            import time
            time.sleep(1)
        print("Acquired Lock")
        try:
            self._lda_model.save(self._lda_model_path)
            self._dictionary.save(self._dictionary_path)
        finally:
            print("Released Lock")
            if mock_datastruct: mock_datastruct.release()
            self.model_folder_lock.release()
            self.save_model_lock.release()

    def check_and_load_model(self):
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        if os.path.exists(self._lda_model_path) and os.path.exists(
                self._dictionary_path):
            return True
        return False

    def get_model(self):
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        self._lda_model = cPickle.loads(
            model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(
            model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        topics_tokens_map = defaultdict(list)
        if not self.does_model_exist():
            return []
        else:
            model = self._lda_model
            topics_to_tokens = model.show_topics(
                topics=self.DEFAULT_NUM_TOPICS,
                topn=25,
                log=False,
                formatted=False)

            for topic_id, tokens in enumerate(topics_to_tokens):
                topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(
                    tokens)

            return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in stop_words]

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens


wiki_stream = (tokens for _, tokens in iter_wiki('enwiki-latest-pages-articles.xml.bz2'))

print "making of dictionary started"
wiki_dictionary = Dictionary(wiki_stream)
print "wikipedia dictionary made"

wiki_dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=200000)

print "...... saving the dictionary"
wiki_dictionary.save('WikiDictionary200k.dict')
print "dictionary saved ........"

# wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # make a corpus from wiki dump

# MmCorpus.save_corpus('WikiCorpus.mm', wiki) # Saving the corpus


Exemplo n.º 29
0
import json
from glob import glob

import jieba

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

data = []
for filename in glob(
        r'C:\Users\75043\PycharmProjects\NLP\TF-IDF\corups\THUCNews\体育\*.txt'
)[:1100]:
    with open(filename, encoding='utf-8') as f:
        # print(filename)
        text = ' '.join(jieba.cut(f.read().replace('\n', '')))
        data.append(text)

with open('finance_news_train.json', 'w', encoding='utf-8') as f:
    json.dump(data[:1000], f, indent=2, ensure_ascii=False)

with open('finance_news_test.json', 'w', encoding='utf-8') as f:
    json.dump(data[1000:], f, indent=2, ensure_ascii=False)

with open('finance_news_train.json', encoding='utf-8') as f:
    data = json.load(f)
    data = [doc.split() for doc in data]  # the parameter of Dictionary is an iterable of iterables of str

dct = Dictionary(data)
corpus = [dct.doc2bow(doc) for doc in data]  # convert corpus to BoW format
# print(corpus[0])
model = TfidfModel(corpus)  # fit model
dct.save('news.dict')
# print(dct[0],dct[1],len(dct),dct)
model.save('news_tfidf.model')
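
# --- Hedged follow-up (not part of the original example) ---
# A minimal sketch of how the dictionary and TF-IDF model saved above could be
# reloaded and applied to a new, already-tokenised document; the test sentence
# is made up.
dct = Dictionary.load('news.dict')
tfidf_model = TfidfModel.load('news_tfidf.model')

new_doc = '北京 举行 体育 比赛'.split()   # hypothetical tokenised document
new_bow = dct.doc2bow(new_doc)           # [(token_id, raw count), ...]
print(tfidf_model[new_bow])              # [(token_id, tf-idf weight), ...]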
Exemplo n.º 30
0
class DictionaryPipeline(Pipeline):
    """Pipeline for creating and updating a gensim dictionary and converting
    documents to a bag of words representation.
    """
    def __init__(self, *args, **kwargs):
        """Loads a dictionary for updating
        """
        super().__init__(*args, **kwargs)

        # This is only for lazy loading. Use get_dict() unless you are sure you
        # need this.
        self._dictionary = None

    @staticmethod
    def load_dictionary():
        """This function is used to load a gensim dictionary from the models
        folder.

        Returns:
            :obj:`gensim.corpora.dictionary.Dictionary`: The dictionary found
            in ucla_topic_analysis/model/dictionary.gensim or None if there was
            no dictionary.
        """
        file_name = "dictionary.gensim"
        file_path = get_training_file_path(file_name)
        if os.path.isfile(file_path):
            return Dictionary.load(file_path)
        return None

    @staticmethod
    def get_input_stream(schema=None):
        """This function is used to get a pipeline to feed into a dictionary for
        training an LDA model.

        Args:
            schema(:obj:`dict`): The schema for the file pipeline

        Returns:
            An iterable containing lists of words to train a dictionary with.
        """
        # Build the pipeline
        files = ReadFilePipeline.get_input_stream()
        file_stream = ReadFilePipeline(input_stream=files,
                                       schema=schema).output_stream()
        sent_stream = SentencePipeline(
            input_stream=file_stream).output_stream()
        word_stream = WordPipeline(input_stream=sent_stream).output_stream()
        return LemmaPipeline(input_stream=word_stream).output_stream()

    async def train_dictionary(self):
        """This function trains a new gensim dictionary from the corpus.
        """
        input_stream = self.get_input_stream()
        # Train the dictionary
        count = 1
        total = len(get_file_list())
        async for data in input_stream:
            await self.run(data)
            print_progress(count, total)
            count += 1
        print("")
        self.save_dict()

    async def get_dictionary(self):
        """This function is used to get an instance of a gensim dictionary. It
        will load a dictionary from file if one has not already been loaded. If
        no previous dictionary has been loaded and no dictionary has been saved
        to file it will train a new one.

        Returns:
            :obj:`gensim.corpora.dictionary.Dictionary`: The dictionary found
            in ucla_topic_analysis/model/dictionary.gensim or None if there was
            no dictionary.
        """
        if self._dictionary is None:
            self._dictionary = self.load_dictionary()
        if self._dictionary is None:
            print("Did not find a saved dictionary. Training one now.")
            self._dictionary = Dictionary()
            await self.train_dictionary()
        return self._dictionary

    def save_dict(self):
        """Saves the updated dictionary to file
        """
        file_name = "dictionary.gensim"
        file_path = get_training_file_path(file_name)

        self._dictionary.save(file_path)

    async def coroutine(self, data):
        """Converts the documents in the data to bags of words

        Args:
            data (:obj:`dict`): A dict with the key "text" containing a list of
                lists with tokenised words that need to be changed to a bag of
                words format.
        Returns:
            :obj:`dict`: The data dict with the value associated with "text"
            replaced with a list containing a bag of words representation for
            each document.
        """
        dictionary = await self.get_dictionary()
        data["text"] = [
            dictionary.doc2bow(document, allow_update=True)
            for document in data["text"]
        ]
        self.save_dict()
        return data
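
# --- Hedged illustration (not part of the pipeline above) ---
# coroutine() relies on Dictionary.doc2bow(..., allow_update=True), which adds
# unseen tokens to the dictionary while converting documents; a standalone sketch:
from gensim.corpora import Dictionary

_dictionary = Dictionary()
_docs = [["alpha", "beta"], ["beta", "gamma"]]
_bows = [_dictionary.doc2bow(doc, allow_update=True) for doc in _docs]
print(len(_dictionary))   # 3 distinct tokens: alpha, beta, gamma
print(_bows)              # bag-of-words representation of each document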
Exemplo n.º 31
0
import codecs

text_data = []
with codecs.open("output.txt", "r", "utf-8") as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        print(tokens, '\n')
        text_data.append(tokens)

from gensim.corpora import Dictionary

dictionary = Dictionary(text_data)

corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')
import gensim
NUM_TOPICS = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
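
# --- Hedged continuation (the original example is truncated here) ---
# A typical next step is to render the reloaded model with pyLDAvis, mirroring
# the prepare()/save_html() calls used elsewhere in this collection; the output
# file name is an assumption.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(lda_display, 'lda_visualisation.html')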
Exemplo n.º 32
0
# LDA
trigram_dictionary_filepath = os.path.join('.','trigram_dict_all_diags.dict')

if rerun:
    trigram_reviews = LineSentence(trigram_records_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)

    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)

# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

trigram_bow_filepath = os.path.join('.', 'trigram_bow_corpus_all_diags.mm')

def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """

    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)
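
# --- Hedged continuation (not part of the original snippet) ---
# The generator above would typically be streamed straight into a serialized
# Matrix Market corpus and then reloaded lazily.
from gensim.corpora import MmCorpus

if rerun:
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_records_filepath))

trigram_bow_corpus = MmCorpus(trigram_bow_filepath)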
Exemplo n.º 33
0
print((
    'LDA Model based on {3} dataset.\n\tSample Size: {0},\n\tTop {1} Words,\n\tNo of Topics {2}'
    .format(sample_size, len(dictionary.values()), num_topics,
            data_scope_name)))

LDAmodel_scope = LdaMulticore(
    corpus=corpus,  #mm,
    id2word=dictionary,
    num_topics=num_topics,
    workers=4,
    chunksize=5000,
    passes=50,
    alpha='asymmetric',
    random_state=random_state)

dictionary.save(
    'data/model/{0}_dictionary.pkl'.format(research_scope))  #data_scope_name))
LDAmodel_scope.save(
    'data/model/{0}'.format(research_scope))  #data_scope_name))
# pickle the model here and insert in SQL
LDAmodel_scope = LdaMulticore.load(
    'data/model/{0}'.format(research_scope))  #data_scope_name))

# Feature vector
df_lda_features(LDAmodel_scope, scope_lda_sample)

# Topic distribution
RequestTopicDistribution = scope_lda_sample['lda_features'].mean()
fig, ax1 = plt.subplots(1, 1, figsize=(20, 6))
nr_top_bars = 5
title_dist = '{}_Request Topic distributions showing top {} bars of {} topics'.format(
    research_scope, nr_top_bars, num_topics)
Exemplo n.º 34
0
                r.append(word[0])
        rl = (" ".join(r)).strip()
    return rl


with open(data_path, 'r') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        results.append(clean_posts(row))


with open(parsed_path, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

# making of the lda phrases analysis

# making of the dictionary for lda topic analysis
dict_made = False

dict_path = 'dictionary.dict'

if dict_made:
    dictionary = Dictionary.load(dict_path)
else:
    reviews_for_lda = word2vec.LineSentence(reviews_for_lda_filepath)
    dictionary = Dictionary(reviews_for_lda)
    dictionary.filter_extremes(no_below=10, no_above=0.4)
    dictionary.compactify()

    dictionary.save(dict_path)
Exemplo n.º 35
0
    tokens = list(filter(None, tokens))
    return tokens


class Corpus(object):
    def __iter__(self):
        for file in glob.glob("*.txt"):
            print(file)
            paper = Path(file).read_text(encoding='utf8')
            yield paper


corpus_memory_friendly = Corpus()
papers = list(corpus_memory_friendly)

texts = [list(preprocess(t)) for t in papers]

# define the dictionary:
dictionary = Dictionary(texts)
dictionary.save('reasoning_corpura.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('reasoning_bow.mm', corpus)


hash_dictionary = HashDictionary(texts)
hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('reasoning_wordids.txt.bz2')
hash_dictionary.save('reasoning_corpura_hash.dict')

Exemplo n.º 36
0
import json
import numpy as np
import boto3
s3 = boto3.resource("s3")
myBucket = s3.Bucket('workspace.scitodate.com')
homedir = os.environ['HOME']
from gensim.corpora import Dictionary
from gensim.models import *
f = open(homedir + "/results/ontology/c2n.json", 'r')
c2n = json.load(f)
f.close()
prefix = 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#'
ncit_dict = [k.split('#')[1] for k in c2n.keys()]
dictionary = Dictionary([ncit_dict])
dictionary[0]  # access one item so that dictionary.id2token gets populated (it is used below)
dictionary.save(homedir + "/results/models/lda_dict")
tfidf = TfidfModel.load(homedir + "/results/models/tfidf_model")

for i in [2, 3, 4]:
    lda = AuthorTopicModel(id2word=dictionary.id2token,
                           num_topics=i,
                           eval_every=False)
    f = open(homedir + "/thesiswork/source/corpus/lda_doc_2k.json", 'r')
    _corpus = json.load(f)
    f.close()
    bow_corpus = [dictionary.doc2bow(doc) for doc in _corpus]
    tfidf_corpus = [tfidf[doc] for doc in bow_corpus]
    f = open(homedir + "/thesiswork/source/corpus/lda_d2a_2k.json", 'r')
    _d2a = json.load(f)
    f.close()
    d2a = {}
Exemplo n.º 37
0
def listify_commenters(commenters_str):
    """Transform string of commenters into list"""
    return commenters_str.split(", ")


def get_commenters_set():
    """Get list of all unique commenters"""
    all_commenters = list(
        set(x for l in commenters_df["commenters"].values for x in l))
    all_commenters.remove("")
    return all_commenters


def get_boc(tokens):
    """Convert list of commenters for each story to "bag of commenters" (boc)"""
    return dct.doc2bow(tokens)


commenters_df = fetch_commentors()
dct = Dictionary(commenters_df["commenters"].values)
dct.filter_extremes()
dct.save('commenters_dct.dict')
commenters_df["boc"] = commenters_df["commenters"].apply(get_boc)
commenters_ary = commenters_df["boc"].values
# Dimensionality reduction of "bag of commenters" with LSI
commenters_dimrec_model = LsiModel(corpus=commenters_ary,
                                   num_topics=300,
                                   id2word=dct)
commenters_dimrec_ary = commenters_dimrec_model[commenters_ary]
MmCorpus.serialize('commenters_corpus.mm', commenters_dimrec_ary)
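
# --- Hedged follow-up (not in the original) ---
# Project a new story's "bag of commenters" into the LSI space learned above;
# the commenter names are made up.
new_story_commenters = ['alice', 'bob']
new_boc = dct.doc2bow(new_story_commenters)
print(commenters_dimrec_model[new_boc])   # [(lsi_dimension, weight), ...]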
Exemplo n.º 38
0
class Initialize():
    def __init__(self, w2v_dim=300, w2v_window=10, alpha=0.7, sigma=0.3):
        self.w2v_dim = w2v_dim
        self.w2v_window = w2v_window
        self.nlp = spacy.load('nl', disable=['tagger', 'parser', 'ner'])

        self.category2parent = json.load(open(CATEGORY2PARENT_PATH))

        with open(STOPWORDS_PATH) as f:
            self.stopwords = [word.lower().strip() for word in f.read().split()]

        print('Parsing questions...')
        self.init_questions()
        print('Parsing answers...')
        self.init_answers()
        print('Filtering seeds...')
        self.init_seeds()
        print('Parsing labeled data...')
        self.init_labeled_data()
        print('Parsing corpus...')
        self.init_corpus()
        print('Parsing dictionary...')
        self.init_dictionary()

        # word2vec
        print('Initializing Word2Vec...')
        self.init_word2vec()
        # translation
        print('Initializing Translation...')
        self.init_translation(alpha, sigma)
        # softcosine
        print('Initializing Softcosine...')
        self.init_sofcos()


    def init_questions(self):
        # QUESTIONS
        # if not os.path.exists(NEW_QUESTIONS):
        self.questions = {}
        questions = json.load(open(QUESTIONS))
        for i, question in enumerate(questions):
            if i % 1000 == 0:
                percentage = round(float(i+1) / len(questions), 2)
                # print('Question Progress: ', percentage, end='\r')
            text = question['questiontext']
            text = list(map(lambda token: str(token), self.nlp(text)))

            tokens_proc = [w.lower() for w in text]
            tokens_proc = [w for w in tokens_proc if w not in self.stopwords and w not in punctuation]

            self.questions[question['id']] = {
                'id': question['id'],
                'tokens_proc': tokens_proc,
                'starcount': question['starcount'],
                'answercount': question['answercount'],
                'text': question['questiontext'],
                'cid': question['cid']
            }
        json.dump(self.questions, open(NEW_QUESTIONS, 'w'))
        # else:
        #     self.questions = json.load(open(NEW_QUESTIONS))


    def init_answers(self):
        self.answers = {}
        answers = json.load(open(ANSWERS))
        for i, answer in enumerate(answers):
            if i % 1000 == 0:
                percentage = round(float(i+1) / len(answers), 2)
                # print('Answer Progress: ', percentage, end='\r')
            text = answer['answertext']
            text = list(map(lambda token: str(token), self.nlp(text)))

            tokens_proc = [w.lower() for w in text]
            tokens_proc = [w for w in tokens_proc if w not in self.stopwords and w not in punctuation]

            self.answers[answer['id']] = { 'tokens_proc': tokens_proc }

        json.dump(self.answers, open(NEW_ANSWERS, 'w'))


    def init_seeds(self):
        seeds_ = [question for question in self.questions.values() if int(question['answercount']) >= 1]
        self.seeds = []
        for question in seeds_:
            if int(question['starcount']) >= 1:
                category = self.category2parent[question['cid']] if question['cid'] in self.category2parent else question['cid']
                self.seeds.append({'id': question['id'], 'tokens':question['tokens_proc'], 'text':question['text'], 'category':category})
        json.dump(self.seeds, open(SEEDS_PATH, 'w'))


    def init_labeled_data(self):
        procdata = json.load(open(TRAINDATA))
        self.procdata = {}
        for i, row in enumerate(procdata):
            if i % 1000 == 0:
                percentage = round(float(i+1) / len(procdata), 2)
                # print('Answer Progress: ', percentage, end='\r')
            q1id = row['id']
            q1_tokens_proc = self.questions[q1id]['tokens_proc']

            self.procdata[q1id] = {}
            for row2 in row['bm25']:
                score = float(row2['BM25-score'])
                label = 1 if row2['Lax'] == 'Similar' else 0
                q2id = row2['id']
                q2_tokens_proc = self.questions[q2id]['tokens_proc']

                self.procdata[q1id][q2id] = {
                    'q1': q1_tokens_proc,
                    'q2': q2_tokens_proc,
                    'score': score,
                    'label': label
                }
        qids = list(self.procdata.keys())
        shuffle(qids)
        trainsize = int(0.8 * len(qids))

        trainids = qids[:trainsize]
        self.traindata = {}
        for qid in trainids:
            self.traindata[qid] = self.procdata[qid]

        testids = qids[trainsize:]
        self.testdata = {}
        for qid in testids:
            self.testdata[qid] = self.procdata[qid]
        json.dump({'procdata': self.procdata, 'train': self.traindata, 'test': self.testdata}, open(NEW_TRAINDATA, 'w'))


    def init_corpus(self):
        self.corpus = []
        for qid in self.questions:
            if qid not in self.testdata:
                question = self.questions[qid]
                self.corpus.append(question['tokens_proc'])
        for answer in self.answers.values():
            self.corpus.append(answer['tokens_proc'])
        json.dump({'corpus': self.corpus}, open(CORPUS_PATH, 'w'))


    def init_dictionary(self):
        self.dict = Dictionary(self.corpus)  # fit dictionary
        self.dict.save(DICT_PATH)


    # WORD2VEC
    def init_word2vec(self):
        w2v.run(documents=self.corpus, write_path=DATA_PATH, w_dim=self.w2v_dim, window=self.w2v_window)


    # Softcosine
    def init_sofcos(self):
        corpus = [self.dict.doc2bow(line) for line in self.corpus]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.tfidf.save(TFIDF_PATH)


    # Translation
    def init_translation(self, alpha, sigma):
        tokens = []
        for question in list(self.corpus):
            for token in question:
                tokens.append(token)

        Q_len = float(len(tokens))
        aux_w_Q = self.dict.doc2bow(tokens)
        aux_w_Q = dict([(self.dict[w[0]], (w[1]+1.0)/(Q_len+len(self.dict))) for w in aux_w_Q])

        w_Q = {}
        for w in aux_w_Q:
            if w[0] not in w_Q:
                w_Q[w[0]] = {}
            w_Q[w[0]][w] = aux_w_Q[w]
        translation = { 'w_Q': w_Q, 'alpha': alpha, 'sigma': sigma }
        json.dump(translation, open(TRANSLATION_PATH, 'w'))
Exemplo n.º 39
0
    dict_token2id = dictionary.token2id
    tokens = list(dict_token2id.keys())
    ne_tokens = [token for token in tokens if token.startswith('ne_')]
    # find the ids of the ne
    ne_token_ids = [dict_token2id[token] for token in ne_tokens]
    ne_token_ids = set(ne_token_ids)
    
    # ne term weighting
    # add max token frequency tuple in documents
    bow_news = [news + [(-1, max([t[1] for t in news]))] for news in bow_news]
    # add max token frequency to ne
    bow_news = [[(t[0], t[1]+news[-1][1]) if t[0] in ne_token_ids else (t[0], t[1]) for t in news] for news in bow_news]
    # remove last tuple
    bow_news = [news[:-1] for news in bow_news]
    
    dictionary.save(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
    save_model(bow_news, os.path.join(data_dir, 'ne_nedf_weighting.bow'))
    
    endtime = datetime.datetime.now()
    print('Total running time:', (endtime - starttime).seconds, 'seconds.')


"""
加载需要的Dictionary和bag-of-words文件,调用Gensim中的LDA库训练LDA,每种主题数设置做5词实验
"""
dataset = ['20news']
# Set training parameters.
# num_topics = 100
num_topics_list = [20,50,100]
# num_topics_list = [100]
passes_list = [100]
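
# --- Hedged sketch (the training loop itself is not included in this excerpt) ---
# Roughly what the comment above describes: train one LDA model per
# (num_topics, passes) combination, repeated a few times per setting.
# The repeat count and save paths are assumptions.
from gensim.models import LdaModel


def train_lda_grid(dictionary, bow_corpus, num_topics_list, passes_list, repeats=5):
    """Train and save one LdaModel per (num_topics, passes, repeat) combination."""
    for num_topics in num_topics_list:
        for passes in passes_list:
            for run in range(repeats):
                lda = LdaModel(corpus=bow_corpus, id2word=dictionary,
                               num_topics=num_topics, passes=passes)
                lda.save('lda_%dtopics_%dpasses_run%d.model' % (num_topics, passes, run))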
Exemplo n.º 40
0
class LDAModelMaker():
    def __init__(self,
                 create,
                 texts_filepath,
                 corpus_filepath,
                 dictionary_filepath,
                 lda_filepath,
                 pyldavis_filepath,
                 database=None,
                 **run_parameters):
        """
        Pull up information required for generating the LDA Model. 
        
        :param {boolean} create:
            Whether or not we must create texts. 
        
        :param {str} texts_filepath: 
            Either the location of where to load texts from or where we must save texts to. 
            Explained above. 
            
        :param {str} corpus_filepath: 
            Location of where we must save or load the corpus from. 
        
        :param {str} dictionary_filepath: 
            Location of where we must save or load the dictionary from. 
        
        :param {str} lda_filepath: 
            Location of where we must save the lda model to. 
            
        :param {str} pyldavis_filepath: 
            Location of where we must save the pyldavis to.
        
        :param {str} database: 
            Which kind of database we will be using. 
        """

        self.next_steps = {'mongo': self.mongo}

        self.texts_filepath = texts_filepath
        self.corpus_filepath = corpus_filepath
        self.dictionary_filepath = dictionary_filepath
        self.lda_filepath = lda_filepath
        self.pyldavis_filepath = pyldavis_filepath

        self.dictionary = Dictionary()
        self.run_parameters = run_parameters

        if create:
            self.apply = self.create_corpus_dict
        else:
            self.apply = self.load_corpus_dict

        if database:
            self.database = self.next_steps[database]

    def create_corpus_dict(self, texts):
        """
        Save texts, dictionary, and corpus. Generate and save lda & pyldavis model. 
        
        :param {list[list[str]]} texts:
            Tokenized words
        
        """

        self.texts = texts
        self.save_texts()
        self.set_dict_corp()

        self.database()
        self.fit_LDA()

    def mongo(self):
        import os
        from pymongo import MongoClient

        emailClient = MongoClient()
        self.db = emailClient[os.getenv('EMAIL_DATABASE_NAME')]
        self.col = self.db[os.getenv('EMAIL_COLLECTION_NAME')]
        self.email_database_content()

    def load_corpus_dict(self, useless):
        self.texts = self.load_texts()
        self.dictionary = self.load_dictionary()
        self.corpus = self.load_corpus()
        self.fit_LDA()

    def save_texts(self):
        """
        Save self.texts to a file so we don't have to keep re-computing

        """
        with open(self.texts_filepath, 'wb') as save:
            pickle.dump(self.texts, save)

    def load_texts(self):
        """
        Load self.texts from a file it was saved to earlier.

        """
        with open(self.texts_filepath, 'rb') as save:
            self.texts = pickle.load(save)
        # print(len(self.texts))
        return self.texts

    def load_dictionary(self):
        """
        Load self.dictionary from a file it was saved to earlier.

        """
        self.dictionary = self.dictionary.load(self.dictionary_filepath)
        return self.dictionary

    def load_corpus(self):
        """
        Load self.corpus from a file it was saved to earlier.

        """
        self.corpus = corpora.MmCorpus(self.corpus_filepath)
        return self.corpus

    """
        TO-DO:
            For code meant to work with updating the corpus: analyze trade-off. 
                figure out if passing texts as parameter is better than just re-running 
                with complete self.texts
            If not: don't worry about it
    """

    def set_dict_corp(self):
        """
            AT THE END OF THIS METHOD:
                self.dictionary will be updated with the new values
                self.corpus will have a new value that will be an updated version
                    of the previous one. 
                
            NOTE: if you make new dictionaries and corpuses (?) each time you run through this
                then the previous data will be lost. you want to keep all the data. 
                
            ISSUE: when updating the dictionary, new words may be introduced. that means in previous
                additions to the corpus those words that initially weren't there will only show up 
                for the latter ones. Dictionary is fine, corpus is not. 
        """
        self.dictionary.add_documents(self.texts)
        # self.dictionary.save('../../../Enron/LDAVar/dictionary.dict')
        self.dictionary.save(self.dictionary_filepath)
        self.corpus = self.make_corpus()
        #        print(len(self.corpus))
        corpora.MmCorpus.serialize(self.corpus_filepath, self.corpus)

    """ Maybe make an object out of this? Separate class maybe? """

    def make_corpus(self):
        """ 
        Make corpus 
        
        """
        return [self.dictionary.doc2bow(text) for text in self.texts]

    def fit_LDA(self):
        """ 
        Fit data in LDA. currently assuming that number of cores remains constant at 1. 
        
        :param {str} lda_filepath: 
            Where to save lda model to. 
            
        :param {str} pyldavis_filepath: 
            Where to save pyldavis model to. 
        
        :param {int} num_topics: 
            Number of topics the LDA model should look for. 
            
        """

        self.lda = ldamodel.LdaModel(corpus=self.corpus,
                                     alpha='auto',
                                     id2word=self.dictionary,
                                     **self.run_parameters)
        lda_vis_serialized = pyLDAvis.gensim.prepare(self.lda,
                                                     self.corpus,
                                                     self.dictionary,
                                                     sort_topics=False)
        pyLDAvis.save_html(lda_vis_serialized, self.pyldavis_filepath)
        self.lda.save(self.lda_filepath)

    def get_domain(self, list_filtered_emails, email):
        """ 
        Find domain of the email. 
        
        :param {str} list_filtered_emails:
            List of filtered emails. 
        
        :param {line in database} email:
            One line in the database
        """

        if list_filtered_emails is None:
            return None
        try:
            domains = [
                re.search(r'@[\w.]+', e).group() for e in list_filtered_emails
            ]
        except Exception as e:
            print(e)
            return None
        return domains

    def email_database_content(self):
        """
        Join each list in self.texts to form strings which can be later manipulated
        to be stored in the email database. 
        
        Need to do this if calling setEmailDatabase
        
        """
        self.texts = [' '.join(text) for text in self.texts]
        self.set_email_database()

    def set_email_database(self):
        """
        Add the filtered content of each email and document in corpus to the email database.
        Now, don't need to serialize self.corpus: it's always in the email database
        
        NOTE: It doesn't matter that we're not iterating through the database in the correct numerical order!
        It's pulling the value from the corresponding index in self.texts
        
        """

        for email in self.col.find():
            try:
                self.col.update_one(
                    {'_id': email['_id']},
                    {
                        '$set': {
                            'filtered_content':
                            self.texts[email['email_counter']],
                            'email_corpus_value':
                            self.corpus[email['email_counter']]
                        }
                    },
                    #                'sender_domain': self.getDomain(email['sender_email'], email),
                    #                'recipient_domain': self.getDomain(email['recipient_email'], email)}},
                    upsert=False)
            except IndexError:
                print(email['email_counter'])
                print(email['_id'])
class MyCorpus(object):
    '''
    Corpus class for streaming review documents
    '''
    def __init__(self, file_list, file_dir, dictionary = None, mindf = MINDF, maxdf = MAXDF, \
                 maxwords = MAXWORDS, cluster_words = CLUSTER_WORDS, cluster_ul = CLUSTER_UL):
        self.file_list = file_list           # list of cuisine text files
        self.file_dir = file_dir             # directory of cuisine text files
        self.maxwords = maxwords             # maximum number of words to keep after building dictionary from clusters
        self.cluster_words = cluster_words   # maximum number of words to keep from each cluster
        self.cluster_ul = cluster_ul         # upper proportion of reviews to limit for cluster processing
        self.mindf = mindf                   # minimum number of documents to keep word
        self.maxdf = maxdf                   # max proportion of documents to keep word
        self.agglomerate = True              # return clusters as single documents (True) or return single reviews (False)
        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            self._build_dict()
            
    def __str__(self):
        return "<MyCorpus at " + str(hex(id(self))) + ">"
        
    def __repr__(self):
        return self.__str__()
    
    def _build_dict(self):
        for filename in self.file_list:
            dictionary = dict()
            num_reviews = 0
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                for line in f:
                    num_reviews += 1
                    words = line[REVIEW_INDEX:].split()
                    for word in set(words):
                        if word not in dictionary:
                            dictionary[word] = 1
                        else:
                            dictionary[word] += 1
                doc = [item for item in dictionary.items() if dictionary[item[0]] > 2 and dictionary[item[0]] / num_reviews < self.cluster_ul]
                doc.sort(key = lambda x: -x[1])
                doc = [word for word, f in doc]
                self.dictionary.add_documents([doc[:self.cluster_words]])
                print("%s added to corpus dictionary!" % (filename,))
        self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords)
        self.dictionary.save("cuisine_dictionary.gensimDict")
        
    def __iter__(self):
        '''
        Iterates through cuisines by combining all reviews for each cuisine into a single
        processed document.  Also stores the length of each processed document
        '''
        if self.agglomerate:
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    doc = " ".join([line[REVIEW_INDEX:].rstrip() for line in f])
                    yield self.dictionary.doc2bow(doc.split())
        else:
            reviewIDs = set()
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    for line in f:
                        id = line[:RATING_INDEX - 1]
                        if id not in reviewIDs:
                            reviewIDs.update([id])
                            doc = line[REVIEW_INDEX:].rstrip()
                            yield self.dictionary.doc2bow(doc.split())
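
# --- Hedged usage sketch (not in the original; file names and parameters are made up) ---
# Build the cuisine corpus defined above and fit an LDA model over its
# agglomerated cuisine documents.
from gensim.models import LdaModel

cuisine_files = ['italian.txt', 'mexican.txt', 'thai.txt']   # hypothetical cluster files
cuisine_corpus = MyCorpus(cuisine_files, file_dir='cuisine_reviews')
cuisine_lda = LdaModel(corpus=cuisine_corpus,
                       id2word=cuisine_corpus.dictionary,
                       num_topics=25, passes=5)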
Exemplo n.º 42
0
        if w.text == doc[-1].text:
            skl_texts.append(' '.join(article))
            texts.append(article)
            article = []
    count += 1
    if count % 100000 == 0:
        print(count, end=' ')

with open("texts.txt", "wb") as fp:  # Pickling
    pickle.dump(texts, fp)
print('texts.txt created')

bigram = gensim.models.Phrases(texts)

dictionary = Dictionary(texts)
dictionary.save("hdp_dictionary.dict")
print("Dictionary saved as hdp_dictionary.dict")
corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('hdp_corpus.mm', corpus)
print('Corpus saved as hdp_corpus.mm')

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

hdpmodel.save('hdp_model_spacy.gensim')
print('hdp model created')

hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]

hdp_coherence = CoherenceModel(topics=hdptopics[:10],
                               texts=texts,
Exemplo n.º 44
0
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim

sent = LineSentence('articles.txt')

# learn the dictionary
article_dict = Dictionary(sent)

# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
article_dict.filter_extremes(no_below=5, no_above=0.2)
article_dict.compactify()

article_dict.save('articles.dict')

# load the finished dictionary from disk
article_dict = Dictionary.load('articles.dict')


def bow(filepath, d):  # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


# generate bag-of-words representations for all reviews and save them as a matrix
MmCorpus.serialize('articles.mm', bow('articles.txt', article_dict))

# load the finished bag-of-words corpus from disk
corpus = MmCorpus('articles.mm')
Exemplo n.º 45
0

# %% get corpus & dictionary to use for further nlp analysis

'''
I suggest preparing the dictionary and the corpus "once and for all" -- that is,
dumping them to files that will eventually be loaded for further analysis.
'''

# get dictionary and write it to a file
'''
a dictionary is a mapping between words and their integer ids. See Gensim 
documentation here: https://radimrehurek.com/gensim/corpora/dictionary.html
'''
pr_dictionary = Dictionary(docs_phrased)
pr_dictionary.save('/tmp/pr_dictionary.dict')

# get corpus and write it to a file
'''
as per the Gensim documentation, it is possible to convert a document into the
bag-of-words format (a list of (token_id, token_count) tuples) via doc2bow
'''
pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased]
'''
Gensim offers several utilities to write a corpus of text to a file. 
Personally, I prefer the Matrix Market format [1]

[1]: https://math.nist.gov/MatrixMarket/formats.html
'''
corpora.MmCorpus.serialize('/tmp/pr_corpus.mm', pr_corpus)
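
# --- Hedged follow-up (not part of the original) ---
# Reloading the dumped artefacts in a later analysis session; the paths follow
# the snippet above.
from gensim.corpora import Dictionary, MmCorpus

pr_dictionary = Dictionary.load('/tmp/pr_dictionary.dict')
pr_corpus = MmCorpus('/tmp/pr_corpus.mm')   # streams documents lazily from disk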
Exemplo n.º 46
0
class DocumentTermMatrix(CompactIOMachine):
    """ Document-term matrix for corpus.

    This is a class that handles the document-term matrix (DTM). With a given corpus, users can
    retrieve term frequency, document frequency, and total term frequency. Weighing using tf-idf
    can be applied.
    """
    def __init__(self, corpus, docids=None, tfidf=False):
        """ Initialize the document-term matrix (DTM) class with a given corpus.

        If document IDs (docids) are given, they will be stored and output as appropriate.
        If not, the documents are indexed by numbers.

        Users can choose to weigh by tf-idf. The default is not to weigh.

        The corpus has to be a list of lists, with each of the inside list contains all the tokens
        in each document.

        :param corpus: corpus.
        :param docids: list of designated document IDs. (Default: None)
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :type corpus: list
        :type docids: list
        :type tfidf: bool
        """
        CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm',
                                  dtm_suffices)
        if docids == None:
            self.docid_dict = {i: i for i in range(len(corpus))}
            self.docids = range(len(corpus))
        else:
            if len(docids) == len(corpus):
                self.docid_dict = {docid: i for i, docid in enumerate(docids)}
                self.docids = docids
            elif len(docids) > len(corpus):
                self.docid_dict = {
                    docid: i
                    for i, docid in zip(range(len(corpus)),
                                        docids[:len(corpus)])
                }
                self.docids = docids[:len(corpus)]
            else:
                self.docid_dict = {docid: i for i, docid in enumerate(docids)}
                self.docid_dict.update(
                    {i: i for i in range(len(docids), len(corpus))})
                self.docids = docids + list(range(len(docids), len(corpus)))
        # generate DTM
        self.generate_dtm(corpus, tfidf=tfidf)

    def generate_dtm(self, corpus, tfidf=False):
        """ Generate the inside document-term matrix and other peripherical information
        objects. This is run when the class is instantiated.

        :param corpus: corpus.
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :return: None
        :type corpus: list
        :type tfidf: bool
        """
        self.dictionary = Dictionary(corpus)
        self.dtm = dok_matrix((len(corpus), len(self.dictionary)),
                              dtype=np.float)
        bow_corpus = [
            self.dictionary.doc2bow(doctokens) for doctokens in corpus
        ]
        if tfidf:
            weighted_model = TfidfModel(bow_corpus)
            bow_corpus = weighted_model[bow_corpus]
        for docid in self.docids:
            for tokenid, count in bow_corpus[self.docid_dict[docid]]:
                self.dtm[self.docid_dict[docid], tokenid] = count

    def get_termfreq(self, docid, token):
        """ Retrieve the term frequency of a given token in a particular document.

        Given a token and a particular document ID, compute the term frequency for this
        token. If `tfidf` is set to `True` while instantiating the class, it returns the weighted
        term frequency.

        :param docid: document ID
        :param token: term or token
        :return: term frequency or weighted term frequency of the given token in this document (designated by docid)
        :type docid: any
        :type token: str
        :rtype: numpy.float
        """
        return self.dtm[self.docid_dict[docid],
                        self.dictionary.token2id[token]]

    def get_total_termfreq(self, token):
        """ Retrieve the total occurrences of the given token.

        Compute the total occurrences of the term in all documents. If `tfidf` is set to `True`
        while instantiating the class, it returns the sum of weighted term frequency.

        :param token: term or token
        :return: total occurrences of the given token
        :type token: str
        :rtype: numpy.float
        """
        return sum(self.dtm[:, self.dictionary.token2id[token]].values())

    def get_doc_frequency(self, token):
        """ Retrieve the document frequency of the given token.

        Compute the document frequency of the given token, i.e., the number of documents
        that this token can be found.

        :param token: term or token
        :return: document frequency of the given token
        :type token: str
        :rtype: int
        """
        return len(self.dtm[:, self.dictionary.token2id[token]].values())

    def get_token_occurences(self, token):
        """ Retrieve the term frequencies of a given token in all documents.

        Compute the term frequencies of the given token for all the documents. If `tfidf` is
        set to be `True` while instantiating the class, it returns the weighted term frequencies.

        This method returns a dictionary of term frequencies with the corresponding document IDs
        as the keys.

        :param token: term or token
        :return: a dictionary of term frequencies with the corresponding document IDs as the keys
        :type token: str
        :rtype: dict
        """
        return {
            self.docids[docidx]: count
            for (
                docidx,
                _), count in self.dtm[:,
                                      self.dictionary.token2id[token]].items()
        }

    def get_doc_tokens(self, docid):
        """ Retrieve the term frequencies of all tokens in the given document.

        Compute the term frequencies of all tokens for the given document. If `tfidf` is
        set to be `True` while instantiating the class, it returns the weighted term frequencies.

        This method returns a dictionary of term frequencies with the tokens as the keys.

        :param docid: document ID
        :return: a dictionary of term frequencies with the tokens as the keys
        :type docid: any
        :rtype: dict
        """
        return {
            self.dictionary[tokenid]: count
            for (
                _,
                tokenid), count in self.dtm[self.docid_dict[docid], :].items()
        }

    def generate_dtm_dataframe(self):
        """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3)

        Now it raises exception.

        :return: data frame of the document-term matrix
        :rtype: pandas.DataFrame
        :raise: NotImplementedException
        """
        raise NotImplementedException()

    def savemodel(self, prefix):
        """ Save the model.

        :param prefix: prefix of the files
        :return: None
        :type prefix: str
        """
        pickle.dump(self.docids, open(prefix + '_docids.pkl', 'wb'))
        self.dictionary.save(prefix + '_dictionary.dict')
        pickle.dump(self.dtm, open(prefix + '_dtm.pkl', 'wb'))

    def loadmodel(self, prefix):
        """ Load the model.

        :param prefix: prefix of the files
        :return: None
        :type prefix: str
        """
        self.docids = pickle.load(open(prefix + '_docids.pkl', 'rb'))
        self.docid_dict = {docid: i for i, docid in enumerate(self.docids)}
        self.dictionary = Dictionary.load(prefix + '_dictionary.dict')
        self.dtm = pickle.load(open(prefix + '_dtm.pkl', 'rb'))
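
# --- Hedged usage sketch (assumes the DocumentTermMatrix class above, together
# with its shorttext dependencies, is importable) ---
toy_corpus = [['apple', 'banana', 'apple'], ['banana', 'cherry']]
dtm = DocumentTermMatrix(toy_corpus, docids=['doc1', 'doc2'])
print(dtm.get_termfreq('doc1', 'apple'))     # raw count of "apple" in doc1
print(dtm.get_total_termfreq('banana'))      # occurrences of "banana" across all documents
print(dtm.get_doc_tokens('doc2'))            # {token: count} mapping for doc2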
Exemplo n.º 47
0

if __name__ == '__main__':
    textdirectory = sys.argv[1]

    files = [f for f in os.listdir(textdirectory) if f.endswith('.conll')]

    order = json.dumps(files, indent=4, sort_keys=True)
    orderfile = open('docorder.json', 'w')
    orderfile.write(order)
    orderfile.close()

    texts = []

    for doc in files:
        print(doc, file=sys.stderr)
        data = open(os.path.join(textdirectory, doc),
                    errors='replace').readlines()
        text = extract_lemmas(data)
        texts.append(text)

    dictionary = Dictionary(texts)
    dictionary.save('tfidf.dic')

    corpus = [dictionary.doc2bow(line) for line in texts]

    model = TfidfModel(corpus, id2word=dictionary)
    model.save('tfidf.model')
    sim_index = similarities.MatrixSimilarity(model[corpus])
    sim_index.save('tfidf.index')
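
# --- Hedged follow-up (not in the original) ---
# Querying the saved similarity index with a new document would look roughly
# like this; the query lemmas are made up.
query_bow = dictionary.doc2bow(['lemma_one', 'lemma_two'])
sims = sim_index[model[query_bow]]   # cosine similarities against every document
print(list(enumerate(sims)))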
def main():
    parser = ArgumentParser(
        description='Wrapper script for churning wiki or elasticsearch datasets '
                    'through gensim to produce topic models. See the gensim '
                    'documentation for more information.')
    parser.add_argument('-ds',
                        '--dataset',
                        default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d',
                        '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l',
                        '--limit',
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id',
                        default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type',
        default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics',
                        default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes',
                        default=1,
                        help='Number of passes for LDA  model.')
    parser.add_argument('--w2v-size',
                        default=100,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q',
                        '--query',
                        default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type',
                        default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    limit = None
    if opts.limit:
        limit = int(opts.limit)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" %
                     (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type,
                                       query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn,
                                   num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn,
                              num_articles=limit,
                              normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         passes=n_passes,
                         id2word=vocab)

    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Exemplo n.º 49
0
Arquivo: dmp.py Projeto: npiaq/dmp
class DMP(object):

    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert raw text lines into documents.
        Each line is split into a list of words via split().

        Args:
            sep: separator character

        Returns:
            The list of tokenized documents.
        '''
        docs = []
        for line in iterator:
            text = line.strip().split(sep)
            docs.append(text)
        return docs

    def __load_corpus(self):
        '''Load the corpus, converting the text into lists of words via __text2doc.

        Returns:
            The list of processed documents.
        '''
        docs = None
        with codecs.open(self.corpus_file, 'r', 'utf-8') as iterator:
            docs = self.__text2doc(iterator)
        return docs

    def train(self):
        '''Train the model, producing two objects: the dictionary (dic) and the model (lda).

        dic: stores the vocabulary; each word has an integer id and can be retrieved via dic[id]
        lda: the model, holding the list of topics; each topic has an id, and its word list
             can be retrieved via lda.print_topic(id)
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer which topics a new document belongs to.

        Args:
            doc: the new document, given as a list of words

        Returns:
            An iterator over the topic list; topics are given by id, so call
            lda.print_topic to make them human readable.
        '''
        bow = self.dic.doc2bow(doc)
        topics = self.lda[bow]
        return topics

    def dump(self):
        '''Dump the lda model and the dic dictionary to disk.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda.save(lda_file)
        self.dic.save(dic_file)

    def load(self):
        '''Load the lda model and the dic dictionary from disk.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_file)
        self.dic = Dictionary.load(dic_file)
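
# --- Hedged usage sketch (not in the original; assumes the same `config` module used above) ---
dmp = DMP()
dmp.train()                                     # fit the dictionary and LDA model on corpus_file
dmp.dump()                                      # persist them to lda_file / dic_file
topics = dmp.infer('一个 新 的 文档'.split())   # topics for a made-up tokenised document
for topic_id, prob in topics:
    print(topic_id, prob, dmp.lda.print_topic(topic_id))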