Example #1
    def _inference_one_chain(self, doc_tokens, rand):
        """Infer topics with one Markov chain.

        Returns the sparse topic distribution p(z|d).
        """
        document = Document(self.model.num_topics)
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        if document.num_words() == 0:
            return dict()

        accumulated_topic_hist = {}
        for i in range(self.total_iterations):
            # One Gibbs sampling iteration: resample the topic of every word.
            for word in document.words:
                # Remove the word's current assignment from the topic counts.
                document.decrease_topic(word.topic, 1)

                new_topic = self._sample_word_topic(document, word.id, rand)
                assert new_topic is not None
                word.topic = new_topic
                # Add the newly sampled assignment back to the topic counts.
                document.increase_topic(new_topic, 1)

            # Only accumulate statistics after the burn-in period.
            if i >= self.burn_in_iterations:
                for non_zero in document.doc_topic_hist.non_zeros:
                    if non_zero.topic in accumulated_topic_hist:
                        accumulated_topic_hist[
                            non_zero.topic] += non_zero.count
                    else:
                        accumulated_topic_hist[non_zero.topic] = non_zero.count

        topic_dist = self._l1normalize_distribution(accumulated_topic_hist)
        return topic_dist
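The accumulated histogram is turned into a probability distribution by L1 normalization. That helper is not part of this example; a minimal sketch of what `_l1normalize_distribution` could look like, assuming it takes a `{topic_id: count}` dict:

    def _l1normalize_distribution(self, topic_hist):
        """Sketch only: divide each count by the total so the values of
        the returned {topic_id: probability} dict sum to 1."""
        total = sum(topic_hist.values())
        if total == 0:
            return dict()
        return {topic: count / float(total)
                for topic, count in topic_hist.items()}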
Example #2
    def load_corpus(self, corpus_dir):
        """Load corpus from a given directory, then initialize the documents
        and model.
        Line format: token1 \t token2 \t token3 \t ... ...
        """
        self.documents = []
        rand = random.Random()

        logging.info('Load corpus from %s.' % corpus_dir)
        for root, dirs, files in os.walk(corpus_dir):
            for f in files:
                filename = os.path.join(root, f)
                logging.info('Load filename %s.' % filename)
                fp = open(filename, 'r')
                for doc_str in fp.readlines():
                    doc_str = doc_str.decode('gbk')
                    doc_tokens = doc_str.strip().split('\t')
                    if len(doc_tokens) < 2:
                        continue
                    document = Document(self.model.num_topics)
                    document.parse_from_tokens(doc_tokens, rand,
                                               self.vocabulary)
                    if document.num_words() < 2:
                        continue
                    self.documents.append(document)
                fp.close()

        logging.info('The document number is %d.' % len(self.documents))
        self._initialize_model()

        self._compute_smoothing_only_bucket()
        self._initialize_topic_word_coefficient()
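For reference, a corpus file this loader accepts holds one document per line, tokens separated by tabs, in GBK encoding. A small hypothetical helper that writes such a file (the tokens are made up for illustration):

def write_sample_corpus(path):
    # Hypothetical helper: writes a two-document corpus in the expected
    # format (GBK encoding, tab-separated tokens, one document per line).
    with open(path, 'w', encoding='gbk') as fp:
        fp.write('macbook\tipad\tchrome\n')
        fp.write('nokia\tandroid\tios\tchrome\n')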
Example #3
    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',  # in both vocabulary and model
                      'mac os x', 'chrome',  # only in vocabulary
                      'nokia', 'null']  # in neither
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)  # a fixed seed keeps the test deterministic
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        self.assertEqual(-14.113955684239654,
                         self.model_evaluator.compute_loglikelihood(documents))
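The exact value asserted above comes from the fixture model and the fixed seed. Conceptually, the quantity under test is usually the sum of log p(w|d) over all words, with p(w|d) = sum over topics z of p(w|z) * p(z|d); a minimal sketch under that assumption (all names here are hypothetical, not the evaluator's real API):

import math

def loglikelihood_sketch(doc_topic_dist, topic_word_probs, word_ids):
    # doc_topic_dist: {topic_id: p(z|d)}
    # topic_word_probs: {(topic_id, word_id): p(w|z)}
    loglikelihood = 0.0
    for word_id in word_ids:
        # p(w|d) = sum over topics of p(w|z) * p(z|d).
        word_prob = sum(prob * topic_word_probs.get((topic, word_id), 0.0)
                        for topic, prob in doc_topic_dist.items())
        if word_prob > 0.0:
            loglikelihood += math.log(word_prob)
    return loglikelihood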
Example #4
from os import listdir
from os.path import isfile, join


def load_corpus(instance_folder_path: str,
                non_instance_folder_path: str) -> Corpus:
    files = [(f, DocumentClass.INSTANCE)
             for f in listdir(instance_folder_path)
             if isfile(join(instance_folder_path, f))] \
        + [(f, DocumentClass.NON_INSTANCE)
           for f in listdir(non_instance_folder_path)
           if isfile(join(non_instance_folder_path, f))]

    documents = []
    for filename, doc_class in files:
        # Resolve the file back to the folder its class came from.
        folder_path = (instance_folder_path
                       if doc_class == DocumentClass.INSTANCE
                       else non_instance_folder_path)
        with open(join(folder_path, filename), 'r') as document_file:
            documents.append(Document(document_file.read(), doc_class))

    return Corpus(documents)
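A hypothetical call, assuming the two labeled directories exist (the paths are made up for illustration):

corpus = load_corpus('data/instances', 'data/non_instances')  # hypothetical paths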
Example #5
from os import listdir
from os.path import isfile, join
from typing import List


def load_documents(folder_path: str) -> List[Document]:
    filenames = [
        f for f in listdir(folder_path)
        if isfile(join(folder_path, f))
    ]

    documents = []
    for filename in filenames:
        # Documents loaded here have no label yet.
        with open(join(folder_path, filename), 'r') as document_file:
            documents.append(Document(document_file.read(),
                                      DocumentClass.UNKNOWN))

    return documents
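Examples 4 and 5 assume `Document` and `DocumentClass` types roughly like the following; this is a minimal sketch for readability, not the project's actual definitions:

from enum import Enum


class DocumentClass(Enum):
    INSTANCE = 1
    NON_INSTANCE = 2
    UNKNOWN = 3


class Document:
    # Sketch only: the real class likely carries more state and behavior.
    def __init__(self, text: str, doc_class: DocumentClass):
        self.text = text
        self.doc_class = doc_class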
Example #6
    def get_document(self):
        """Assemble and return a Document carrying everything extracted
        from the raw text: emails, links, hash tags, tokens and sentences.
        """
        self.__doc = Document(text=self.__text)

        self.__doc.set_emails(self.__emails)
        self.__doc.set_links(self.__links)
        self.__doc.set_hash_tags(self.__hash_tags)

        self.__doc.set_tokens(self.__tokens)

        self.__doc.set_sentences(list(self.__blob.sentences))

        return self.__doc
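Hypothetical usage, assuming the enclosing class parses its input up front (`TextProcessor` is a made-up name for illustration):

processor = TextProcessor('Ping me at someone@example.com #hello')  # hypothetical class
doc = processor.get_document()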
Example #7
    def _load_corpus(self, corpus_dir):
        self.documents = []
        if not os.path.exists(corpus_dir):
            logging.error('The corpus directory %s does not exist.',
                          corpus_dir)
            return False

        for root, dirs, files in os.walk(corpus_dir):
            for f in files:
                filename = os.path.join(root, f)
                # Each file holds a sequence of serialized document records.
                with open(filename, 'rb') as fp:
                    record_reader = RecordReader(fp)
                    while True:
                        blob = record_reader.read()
                        if blob is None:
                            break
                        document = Document(self.model.num_topics)
                        document.parse_from_string(blob)
                        self.documents.append(document)

        return True
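`RecordReader`'s wire format is not shown in this example. A minimal hypothetical sketch, assuming length-prefixed records (the project's real format may differ):

import struct


class RecordReader:
    # Sketch only: assumes each record is a 4-byte big-endian length
    # followed by that many payload bytes.
    def __init__(self, fp):
        self._fp = fp

    def read(self):
        header = self._fp.read(4)
        if len(header) < 4:
            return None  # clean end of file
        (length,) = struct.unpack('>I', header)
        blob = self._fp.read(length)
        if len(blob) < length:
            return None  # truncated record
        return blob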