def test_word_tokenizing(self):
    text = "This is a test sentence."
    with open("../process/tmp_test_file.txt", "w") as test_file:
        test_file.write(text)
    d = Document("tmp_test_file.txt", "testuser")
    d.preprocess_text()
    self.assertEqual(d.preprocessed['tokens'], 6,
                     "word tokenizing failed, incorrect number of tokens")
def Probability(self, doc, dclass = ""): """Calculates the probability for a class dclass given a document doc""" if dclass: sum_dclass = self.sum_words_in_class(dclass) prob = 0 d = Document(self.__vocabulary) d.read_document(doc) for j in self.__document_classes: sum_j = self.sum_words_in_class(j) prod = 1 for i in d.Words(): wf_dclass = 1 + self.__document_classes[dclass].WordFreq(i) wf = 1 + self.__document_classes[j].WordFreq(i) r = wf * sum_dclass / (wf_dclass * sum_j) prod *= r prob += prod * self.__document_classes[j].NumberOfDocuments() / self.__document_classes[dclass].NumberOfDocuments() if prob != 0: return 1 / prob else: return -1 else: prob_list = [] for dclass in self.__document_classes: prob = self.Probability(doc, dclass) prob_list.append([dclass,prob]) prob_list.sort(key = lambda x: x[1], reverse = True) return prob_list
class TestDocument(unittest.TestCase):
    def setUp(self):
        self.d = Document()
        self.d.insert("a")

    def test_cursor(self):
        self.assertEqual(self.d.cursor.position, 1)
        self.d.save("tst")
        try:
            remove("tst")
        except OSError:
            pass
        self.d.cursor.back()
        self.d.delete()
        self.assertEqual(self.d.cursor.position, 0)

    def test_multiple_chars_and_escape(self):
        self.d.cursor.home()
        self.d.delete()
        string = ["h", "e", "l", "l", "o", "\n", "w", "o", "r", "l", "d", "!"]
        for i in string:
            self.d.insert(i)
        self.assertEqual(self.d.string, "hello\nworld!")

    def test_string_property(self):
        self.assertEqual(self.d.string, "a")
def test_array_delete(self):
    doc0 = Document()
    doc0.snapshot = []
    doc1 = self.doc1
    doc2 = self.doc2
    # can technically delete nothing from empty list. why not
    op1 = Op('ad', [], offset=0, val=0)
    doc0.apply_op(op1)
    self.assertEqual(doc0.snapshot, [])
    # remove one from list
    op2 = Op('ad', [], offset=1, val=1)
    doc2.apply_op(op2)
    self.assertEqual(doc2.get_value([1]), 'normal, ol string')
    # from nested lists
    op3 = Op('ad', [2], offset=1, val=1)
    doc2.apply_op(op3)
    self.assertEqual(doc2.get_value([2]), [['multi'], ['array']])
    # delete multiple elements
    op4 = Op('ad', [], offset=0, val=4)
    doc2.apply_op(op4)
    self.assertEqual(doc2.snapshot, [None, 42])
    # delete last in list
    op5 = Op('ad', [], offset=1, val=1)
    doc2.apply_op(op5)
    self.assertEqual(doc2.snapshot, [None])
    # in dicts
    op6 = Op('ad', ['fifth'], offset=2, val=2)
    doc1.apply_op(op6)
    self.assertEqual(doc1.get_value(['fifth']), [55, 66])
def test_call_pod_renderer_with_document_file_and_context_and_result_file_in_render_then_call_run(self, renderer):
    doc = Document(context="context")
    doc.document_file = "document"
    doc.result_file = "result"
    doc.render()
    renderer.assert_called_once_with("document", "context", "result")
    renderer.return_value.run.assert_called_once_with()
def categorize_document(unknown_document, k):
    nearest_neighbors = dict()
    for football_document in footballDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, football_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = football_document.category
        else:
            update_neighbors(nearest_neighbors, football_document.category, distance)
    print("\n")
    for python_document in pythonDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, python_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = python_document.category
        else:
            update_neighbors(nearest_neighbors, python_document.category, distance)

    football_documents_count = 0
    python_documents_count = 0
    for value in nearest_neighbors.values():
        if value == Category.Football:
            football_documents_count += 1
        elif value == Category.Python:
            python_documents_count += 1

    # Ties go to Football. The original assigned to an undefined name
    # `document`; classify the document that was passed in.
    if football_documents_count >= python_documents_count:
        unknown_document.category = Category.Football
    else:
        unknown_document.category = Category.Python
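# The source does not show calculate_tanimoto_distance. A common definition
# treats each document as a set of words and takes one minus the Tanimoto
# (Jaccard) coefficient; this sketch follows that standard definition and is
# NOT the source's implementation. get_words() is assumed from the other
# Document snippets in this collection.
def tanimoto_distance(doc_a, doc_b):
    a = set(doc_a.get_words())
    b = set(doc_b.get_words())
    if not a and not b:
        return 0.0  # two empty documents are treated as identical
    # Tanimoto similarity = |A ∩ B| / |A ∪ B|; distance is its complement.
    return 1.0 - len(a & b) / len(a | b)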
def __init__(self, json_str):
    Document.__init__(self)
    self.json_object = json.loads(json_str)
    if "document_width" in self.json_object:
        self.document_width = self.json_object["document_width"]
    for field in self.json_object["fields"]:
        self.add_field(Field(unicode(field["text"]),
                             field["x"], field["y"], field["length"]))
def run(self, index_file):
    """ Generate the features using Top N algorithm """
    with open(index_file) as f:
        lines = f.readlines()
    for line in lines:
        name = line[:-1]
        with open("../data/scoped/%s" % name, 'r') as d:
            document = Document(d.read())
        self.table.add_document(name, document.content_lower)
    new_data_set = self.table.top_n_words(10)
    for document_name, words in new_data_set.iteritems():
        with open("../data/scoped/%s" % document_name, 'r') as d:
            document = Document(d.read())
        path_name = "../data/features/%s" % document_name
        with open("%s" % path_name, 'w') as f:
            for word in words:
                for _ in xrange(document.count(word)):
                    f.write(word)
                    f.write("\n")
def generate_document_data(chapter_paths, word_count):
    """
    Generate visualization data for a set of chapters.

    Given input chapters we want to find both the unique words being used
    inside of each chapter and how frequent they are within the text as a
    whole.

    chapter_paths - A list of paths to chapters
    word_count    - The number of most frequent words to grab for each chapter

    Returns a list looking like this:
    [
        [
            {
                "word": "wart",
                "freq": .7,
                "uniqueness": .5,
                "pos": .1
            },
        ],
    ]

    This is a list of chapters, where each chapter is a list of word
    dictionaries and each word dictionary has the word itself, the frequency
    of the word in that chapter, the uniqueness of the word overall, and the
    first position the word is observed. All of the latter three values are
    scaled from 0-1 with respect to the chapter (the most frequent word
    receives a 1, for instance).
    """
    document = Document(chapter_paths)
    return [generate_chapter_data(word_list, word_count, document)
            for word_list in document.get_chapters()]
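# A hypothetical consumer of the structure documented above, showing the
# shape of the returned data; the chapter paths and word_count value are
# placeholders, not from the source.
chapters = generate_document_data(["ch01.txt", "ch02.txt"], word_count=50)
for chapter_index, words in enumerate(chapters):
    for entry in words[:3]:  # first few word dictionaries per chapter
        # each entry carries the word plus three 0-1 scaled scores
        print(chapter_index, entry["word"],
              entry["freq"], entry["uniqueness"], entry["pos"])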
def test_array_insert(self):
    doc0 = Document()
    doc0.snapshot = []
    doc1 = self.doc1
    doc2 = self.doc2
    # whole doc is just an empty array. alter it
    op1 = Op('ai', [], val='c', offset=0)
    doc0.apply_op(op1)
    self.assertEqual(doc0.snapshot, ['c'])
    # insert at start
    op2 = Op('ai', [], val='a', offset=0)
    doc0.apply_op(op2)
    self.assertEqual(doc0.snapshot, ['a', 'c'])
    # insert at end
    op3 = Op('ai', [], val='d', offset=2)
    doc0.apply_op(op3)
    self.assertEqual(doc0.snapshot, ['a', 'c', 'd'])
    # insert in middle
    op4 = Op('ai', [], val='b', offset=1)
    doc0.apply_op(op4)
    self.assertEqual(doc0.snapshot, ['a', 'b', 'c', 'd'])
    # insert into some array deep in doc
    op5 = Op('ai', [3, 1], val='a', offset=1)
    doc2.apply_op(op5)
    self.assertEqual(doc2.get_value([3, 1]), ['dimen', 'a'])
    # again
    op6 = Op('ai', ['fifth'], val='a', offset=1)
    doc1.apply_op(op6)
    result6 = [55, 'a', 66, {'sixth': 'deep string'}, 'rw']
    self.assertEqual(doc1.get_value(['fifth']), result6)
def test_textWithWeirdFormatting(self):
    sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx',
                                  '11_weird_formatting.docx')
    docxProcessor = DocxProcessor(sampleDocxFile)
    document = docxProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_11'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_textWithLineBlocks(self):
    sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx',
                                  '08_line_blocks.docx')
    docxProcessor = DocxProcessor(sampleDocxFile)
    document = docxProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_08'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_parseSimplePdf(self):
    samplePdfFile = os.path.join(os.getcwd(), 'samples', 'pdf',
                                 '01_simple_text.pdf')
    pdfProcessor = PdfProcessor(samplePdfFile)
    document = pdfProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_01'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def testSelectsWhenMouseClicked(self):
    document = Document()
    document.new_shape()
    document.current_shape.append_point((0, 0))
    tool = SelectTool(document)
    self.assertIsNone(document.selected_point_index)
    _perform_click(tool, 0, 0)
    self.assertEqual(document.selected_point_index, 0)
def main():
    tokenizer = Tokenizer()
    doc = Document(1)
    # doc.add_text("hello i am paradox")
    doc.load_from_file("documents/test.txt")
    doc.extract_terms(tokenizer)
    doc.generate_frequency_map()
    doc.display()
def test_sampleDocumentProcessing(self):
    sampleSubstanceFile = os.path.join(os.getcwd(), 'samples', 'substance',
                                       '01_sample.json')
    substanceProcessor = SubstanceProcessor().initWithFile(sampleSubstanceFile)
    document = substanceProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome',
                     'substance_01'))
    print("DOCUMENT METADATA: " + str(document.metadata()))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.metadata(), document.metadata())
def test_serialize_and_parse(self):
    self.document.parse_from_tokens(self.doc_tokens, random, self.vocabulary)
    test_doc = Document(20)
    test_doc.parse_from_string(self.document.serialize_to_string())
    self.assertEqual(self.document.num_words(), test_doc.num_words())
    self.assertEqual(str(self.document), str(test_doc))
def testDeselectsWhenClickingElsewhere(self):
    document = Document()
    document.new_shape()
    document.current_shape.append_point((0, 0))
    tool = SelectTool(document)
    _perform_click(tool, 0, 0)
    _perform_click(tool, 1000, 0)  # Click far away
    self.assertIsNone(document.selected_point_index)
def test_document_class_2():
    file = 'empty.txt'
    test = Document(file)
    # Tests Document object representing empty file
    assert_equals([], test.get_words())
    assert_equals(0, test.term_frequency('dogs'))
    assert_equals(file, test._file_name)
    assert_equals({}, test._term_frequency)
def DocumentIntersectionWithClasses(self, doc_name):
    res = [doc_name]
    for dc in self.__document_classes:
        d = Document(self.__vocabulary)
        d.read_document(doc_name, learn=False)
        o = self.__document_classes[dc] & d
        # fraction of the document's words that also occur in the class
        intersection_ratio = len(o) / len(d.Words())
        res += (dc, intersection_ratio)
    return res
def assemble_orders(rein, job_ids):
    """
    Take a list of job_ids and build their entire orders. The idea here is
    that one Job ID should allow us to query each available server for each
    document type that is associated with it, then filter out cruft by
    focusing on which documents are signed correctly.

    TODO: look for attempted changes in foundational info like participants'
    public keys and redeem scripts.
    """
    urls = Bucket.get_urls(rein)
    documents = []
    arg_job_ids = ','.join(job_ids)
    for url in urls:
        # query the remote server for all docs associated with the job_ids
        res = Document.get_documents_by_job_id(rein, url, arg_job_ids)
        if res:
            documents += res
    order_ids = {}
    order_id = None
    for job_id in job_ids:
        order_id = Order.get_order_id(rein, job_id)
        if not order_id:
            o = Order(job_id, testnet=rein.testnet)
            rein.session.add(o)
            rein.session.commit()
            order_id = Order.get_order_id(rein, job_id)
        order_ids[job_id] = order_id
    if not order_id:
        return 0
    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        job_id = Document.get_job_id(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_ids[job_id])
            rein.session.add(d)
        else:
            # use the order id for this document's job; the original passed
            # the stale order_id left over from the loop above
            new_document = Document(rein, doc_type, document,
                                    order_ids[job_id], 'remote',
                                    source_key=None, sig_verified=True,
                                    testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()
    return len(documents)
def __init__(self):
    super(type(self), self).__init__()
    self._state = TAB_STATE_NORMAL
    self._not_editable = False
    self._save_flags = 0
    self._ask_if_externally_modified = True

    # Create the scrolled window
    sw = gtk.ScrolledWindow()
    self._view_scrolled_window = sw
    sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)

    self._auto_save_timeout = 0

    # Not Implemented
    # TODO:
    # /* Manage auto save data */
    # lockdown = gedit_app_get_lockdown (gedit_app_get_default ());
    # tab->priv->auto_save = gedit_prefs_manager_get_auto_save () &&
    #                        !(lockdown & GEDIT_LOCKDOWN_SAVE_TO_DISK);
    # tab->priv->auto_save = (tab->priv->auto_save != FALSE);
    # tab->priv->auto_save_interval = gedit_prefs_manager_get_auto_save_interval ();
    # if (tab->priv->auto_save_interval <= 0)
    #     tab->priv->auto_save_interval = GPM_DEFAULT_AUTO_SAVE_INTERVAL;

    doc = Document()
    doc.set_data(TAB_KEY, self)
    self._document = doc
    # _gedit_document_set_mount_operation_factory (doc,
    #                                              tab_mount_operation_factory,
    #                                              tab);

    self._view = View(doc)
    self._view.show()
    self._view.set_data(TAB_KEY, self)

    self.pack_end(sw, True, True, 0)
    # gtk_box_pack_end (GTK_BOX (tab), sw, TRUE, TRUE, 0);
    sw.add(self._view)
    # gtk_container_add (GTK_CONTAINER (sw), tab->priv->view);
    sw.set_shadow_type(gtk.SHADOW_IN)
    # gtk_scrolled_window_set_shadow_type (GTK_SCROLLED_WINDOW (sw),
    #                                      GTK_SHADOW_IN);
    sw.show()
    self._scrolledwindow = sw
def main():
    test1 = Document('test_docs/test1.txt')
    test2 = Document('test_docs/test2.txt')
    test3 = Document('test_docs/test3.txt')
    test4 = Document('test_docs/test4.txt')
    test_search1 = SearchEngine('test_docs')
    test_document(test1, test2, test3, test4)
    test_single(test_search1)
    test_mulit(test_search1)
def step(context):
    archivedVersion = Document().initWithFile(context.archivedVersion)
    # archivedVersion.writeTo(os.path.join(context.tmpFolder, '_archived'))
    newVersion = Document().initWithFile(context.processedFile)
    # newVersion.writeTo(os.path.join(context.tmpFolder, '_new'))
    mergedVersion = archivedVersion.mergeWithDocument(newVersion)
    mergedVersionFile = os.path.join(context.tmpFolder, 'mergedVersion')
    mergedVersion.writeTo(mergedVersionFile)
    # mergedVersion.writeTo(os.path.join(context.tmpFolder, '_merged'))
    context.processedFile = mergedVersionFile
def handle_invalid_url(self, url, message):
    document = Document(url)
    document.valid = False
    if document not in self.invalid_documents:
        self.invalid_documents.append(document)
    print(colored(message, 'red'))
    print('URL: %s' % url)
    print('\n')
def test_023(self):
    """ Document text setter """
    document = Document("tests/4page.pdf", "tests")
    document.text[0] = "goo"
    # TODO
    # self.assertEqual(document.text[0], "goo")
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".json")
def generate_prn(out, paths, title):
    doc = Document(title)
    dpi = doc.getResolution()
    for path in paths:
        cut = Cut(4, 100, 50)
        # Convert to doc's resolution.
        cut.points = [Vector2(p.x * dpi / DPI, p.y * dpi / DPI) for p in path]
        doc.addCut(cut)
    epilog.generate_prn(out, doc)
def learn(self, directory, dclass_name):
    """
    directory is a path, where the files of the class with the name
    dclass_name can be found
    """
    x = DocumentClass(self.__vocabulary)
    files = os.listdir(directory)  # renamed from `dir` to avoid shadowing the builtin
    for file in files:
        d = Document(self.__vocabulary)
        print(directory + "/" + file)
        d.read_document(directory + "/" + file, learn=True)
        x = x + d
    self.__document_classes[dclass_name] = x
    x.SetNumberOfDocs(len(files))
def update_orders(self, rein, Document):
    from market import assemble_orders
    documents = Document.get_user_documents(rein)
    job_ids = []
    for document in documents:
        job_id = Document.get_job_id(document.contents)
        if job_id not in job_ids:
            if document.source_url == 'local' and document.doc_type != 'enrollment':
                job_ids.append(job_id)
    assemble_orders(rein, job_ids)
def test_copy(self):
    """
    Test that when copying a document, the text, dimensions and attributes
    are identical.
    """
    document = Document('this is a pipe', {'pipe': 1},
                        attributes={'timestamp': time.time()})
    copy = document.copy()
    self.assertEqual(document.text, copy.text)
    self.assertEqual(document.dimensions, copy.dimensions)
    self.assertEqual(document.attributes, copy.attributes)
def test_export_attributes(self):
    """
    Test that exporting and importing documents include their attributes.
    """
    text = 'this is not a pipe'
    d = Document(text, text.split(), attributes={'timestamp': 10})
    e = d.to_array()
    self.assertEqual(d.attributes, Document.from_array(e).attributes)
    self.assertEqual(d.attributes['timestamp'],
                     Document.from_array(e).attributes['timestamp'])
def _createDocument(self):
    document = Document()
    document.setPreferences(self._preferences)
    document.aboutToClose.connect(self._onDocumentAboutToClose)
    subWindow = self._documentArea.addSubWindow(document)
    subWindow.setWindowIcon(QIcon())
    subWindow.showMaximized()
    return document
def BuildIndex(doclist):
    index = Index()
    for docId, doc in enumerate(doclist):
        # The original created a throwaway Document() and immediately
        # rebound the name to `doc`; use the enumerated document directly.
        # print doc.GetTermVec()
        for word in doc.GetTermVec():
            index.AddOne(word, docId)  # doc.GetId()
    return index
def doc_get(self, docname):
    # Retrieve a document from cache, creating from CPS if necessary.
    if docname in self.cache:
        return self.cache[docname]
    doc = Document({})
    if docname in self:
        json_data = json.loads(self[docname])
        doc.load(json_data)
    self.doc_set(docname, doc)
    self.doc_save(docname)
    return doc
def __init__(self, canvas_size, layers):
    default_brush_style = BrushInfo(brushdata)
    Document.__init__(self, default_brush_style)
    self.canvas_size = canvas_size
    self.layer_count = layers
    # Round the canvas up to the 64-pixel tile grid (note: an
    # already-aligned dimension still gains a full extra tile).
    tile_size = (canvas_size[0] + 64 - (canvas_size[0] % 64),
                 canvas_size[1] + 64 - (canvas_size[1] % 64))
    self.set_frame(0, 0, *tile_size)
    for layer in range(layers):
        self.add_layer(layer)
def dropEvent(self, e):
    mime = e.mimeData()
    if mime.hasUrls():
        urls = mime.urls()
        for url in urls:
            d = Document()
            d.load_file(url.toLocalFile())
            sub_widget = QtGui.QMdiSubWindow()
            sub_widget.setWidget(d)
            self.mdiArea.addSubWindow(sub_widget)
        e.accept()
def _set_vertical(r1, r2, olabels, length, posmethod="beyondten",
                  docmethod="insert", vertrel="nonrel", blocksize=3,
                  independentplacement=True):
    if independentplacement:
        pos1 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
        pos2 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
    else:
        pos1 = pos2 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
    if docmethod == "assign":
        r1 = [d.set_type(pos1 <= i < (pos1 + blocksize)) for i, d in enumerate(r1)]
        r2 = [d.set_type(pos2 <= i < (pos2 + blocksize)) for i, d in enumerate(r2)]
    elif docmethod == "insert":
        # highest existing document id; the original took max() over the
        # Document objects themselves, which cannot be added to an int
        maxid = max(d.get_id() for d in r1 + r2)
        r1 = [d.set_type(False) for d in r1]
        r2 = [d.set_type(False) for d in r2]
        for i in range(blocksize):
            r1.insert(pos1, Document(maxid + i + 1, True))
            r2.insert(pos2, Document(maxid + i + 1, True))
    labels = olabels[:]
    for doc in set(r1 + r2):
        if not doc.get_type():  # original lacked the call parentheses,
            continue            # making this test always truthy
        vdoc = doc.get_id()
        if vdoc >= len(labels):
            labels += [0] * (vdoc - len(labels) + 1)
        if vertrel == "nonrel":
            labels[vdoc] = 0
        elif vertrel == "rel":
            labels[vdoc] = 1
        elif vertrel == "ratio":
            ratio = float(sum(olabels)) / length
            labels[vdoc] = numpy.random.binomial(1, ratio)
    return r1, r2, labels
def test_064(self):
    """ config stem is valid """
    document = Document("tests/4page.pdf", "tests", config=['stem=internal'])
    document = Document("tests/4page.pdf", "tests", config=['stem=porter'])
    document = Document("tests/4page.pdf", "tests", config=['stem=snowball'])
    document = Document("tests/4page.pdf", "tests", config=['stem=lancaster'])
    document = Document("tests/4page.pdf", "tests", config=['stem=lemma'])
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".json")
def main():
    # inputs = ['ip1.txt', 'ip2.txt']
    # inputs = ['ip3.txt', 'ip4.txt']
    # inputs = ['sachin1.txt']
    # inputs = ['mal1.txt']
    inputs = ['ip5.txt', 'ip6.txt', 'ip7.txt']
    no_of_clusters = int(sys.argv[1])
    doc = Document(inputs, no_of_clusters)
    count = 0
    print "Number of Sentences :"
    print len(doc.sentences)
    # print doc.sent_no_swords
    # print len(doc.sent_no_swords)
    '''
    print "Initial cluster sentences:"
    for i in range(len(doc.clusters)):
        print doc.clusters[i][0],
    '''
    print "Selecting sentence from each cluster..."
    doc.cluster_vector()
    doc.find_clust_similar_sent()
    # print ""
    # print "Cluster sentences:\n"
    # print doc.clust_sentences
    # print "Assigning weights to cluster sentences:"
    # doc.select_cluster_sentences()
    # doc.printclust_sentences()
    # doc.print_rogue_clust_sentences()
    print "Ordering...."
    for input_file in inputs:
        count = count + 1
        if count == 1:
            doc.print_sent_ordered()
    # ordering
    first = ordering.precedence_ordering(doc, doc.clust_sentences)
    # exchange the first sentence in the cluster with `first`
    tempv = doc.clust_sentences[0]
    doc.clust_sentences[0] = doc.clust_sentences[first]
    doc.clust_sentences[first] = tempv
    ordered_sentences = ordering.similarity_ordering(doc, doc.clust_sentences)
    # print doc.clust_sentences, ordered_sentences
    for i in ordered_sentences:
        print doc.sentences[i].lstrip().capitalize(), ". ",
def doc_from_legacy_dict(obj):
    """takes a loaded legacy dictionary, returns a loaded Document"""
    doc = Document()
    part = DNAHoneycombPart()  # TODO must generalize
    doc.addPart(part)
    part.setName(obj["name"])
    # self.addVirtualHelixAt(coord, vh, requestSpecificIdnum=num, noUndo=True)
    numBases = len(obj['vstrands'][0]['scaf'])
    part.setDimensions((30, 32, numBases))
    for helix in obj['vstrands']:
        row = helix['row']
        col = helix['col']
        scaf = helix['scaf']
        vh = VirtualHelix(numBases=len(scaf), idnum=helix['num'])
        part.addVirtualHelixAt((row, col), vh,
                               requestSpecificIdnum=helix['num'], noUndo=True)
    helixNo, numHelixes = -1, len(obj['vstrands'])
    for helix in obj['vstrands']:
        helixNo += 1
        # print "helix %i/%i (%i%%)" % (helixNo, numHelixes, helixNo*100/numHelixes)
        vh = part.getVirtualHelix(helix['num'])
        scaf = helix['scaf']
        stap = helix['stap']
        loops = helix['loop']
        skips = helix['skip']
        assert(len(scaf) == len(stap) and len(stap) == vh.numBases() and
               len(scaf) == len(loops) and len(loops) == len(skips))
        for i in range(len(scaf)):
            fiveVH, fiveIdx, threeVH, threeIdx = scaf[i]
            threeVH = part.getVirtualHelix(threeVH)
            # Installing an Xover works on the same strand as well (there is
            # nothing inherently different between an Xover and a same-strand
            # linkage in our current model)
            if threeVH == -1 or threeIdx == -1:
                continue
            vh.installXoverFrom3To5(StrandType.Scaffold, i, threeVH, threeIdx,
                                    undoable=False, speedy=True)
        for i in range(len(stap)):
            fiveVH, fiveIdx, threeVH, threeIdx = stap[i]
            threeVH = part.getVirtualHelix(threeVH)
            if threeVH == -1 or threeIdx == -1:
                continue
            vh.installXoverFrom3To5(StrandType.Staple, i, threeVH, threeIdx,
                                    undoable=False, speedy=True)
        for baseIdx, colorNumber in helix['stap_colors']:
            color = QColor((colorNumber >> 16) & 0xFF,
                           (colorNumber >> 8) & 0xFF,
                           colorNumber & 0xFF)
            vh.applyColorAt(color, StrandType.Staple, baseIdx, undoable=False)
        for i in range(len(stap)):
            combinedLoopSkipAmount = loops[i] + skips[i]
            if combinedLoopSkipAmount != 0:
                vh.installLoop(StrandType.Scaffold, i, combinedLoopSkipAmount,
                               undoable=False)
    return doc
def document(handler_lua=None, handler_lua_template=None):
    from document import Document
    from resource import Resource
    doc = Document()
    if handler_lua_template:
        import deje.handlers.lua as handlers
        handler_lua = getattr(handlers, handler_lua_template)()
    if handler_lua:
        handler = Resource('/handler.lua', handler_lua,
                           'The primary handler', 'text/lua')
        doc.add_resource(handler)
    return doc
def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, topK):
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:
        # no saved classifier: build one from the positive-example URLs
        docs = []
        f = open(posFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d and d.text:
                docs.append(d)
        f.close()
        '''
        docsTF = []
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            docsTF.append(wordsFreq)
        self.classifier = VSMClassifier(docsTF, th)
        '''
        docsTF = []
        vocabTFDic = {}
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            # docsTF.append(wordsFreq)
            for w in wordsFreq:
                if w in vocabTFDic:
                    vocabTFDic[w] += wordsFreq[w]
                else:
                    vocabTFDic[w] = wordsFreq[w]
        # keep only the topK most frequent terms as the vocabulary
        vocabSorted = getSorted(vocabTFDic.items(), 1)
        topVocabDic = dict(vocabSorted[:topK])
        # topVocabDic = vocabTFDic
        ndocsTF = []
        '''
        for d in docsTF:
            ndocTF = {}
            for k in topVocabDic:
                if k in d:
                    ndocTF[k] = d[k]
                else:
                    ndocTF[k] = 1 / math.e
            ndocsTF.append(ndocTF)
        '''
        self.classifier = VSMClassifier(topVocabDic, ndocsTF, th)
        classifierFile = open(vsmClassifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
def _build_index(dirname: str) -> dict:
    """Helper method to initialize inverted index"""
    inv_index = {}
    for fname in os.listdir(dirname):
        if fname.startswith('.'):
            continue
        doc = Document(f'{dirname}/{fname}')
        for word in doc.get_words():
            if word not in inv_index:
                inv_index[word] = []
            inv_index[word].append(doc)
    return inv_index
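# A hypothetical lookup against the inverted index the helper above returns;
# the directory name and query word are placeholders, and term_frequency() /
# _file_name are assumed from the other Document snippets in this collection.
index = _build_index("corpus")
for doc in index.get("pipeline", []):
    # each posting is a Document that contains the query word at least once
    print(doc._file_name, doc.term_frequency("pipeline"))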
class QueryDocument(Document):
    def __init__(self, field):
        Document.__init__(self)
        self.insidedoc = Document()
        self.field = field

    def addeq(self, value):
        self.add(self.field, value)

    def addgt(self, value):
        self.addnormaloperator("$gt", value)

    def addgte(self, value):
        self.addnormaloperator("$gte", value)

    def addlt(self, value):
        self.addnormaloperator("$lt", value)

    def addlte(self, value):
        self.addnormaloperator("$lte", value)

    def addne(self, value):
        self.addnormaloperator("$ne", value)

    def addin(self, value):
        self.addnormaloperator("$in", value)

    def addnin(self, value):
        self.addnormaloperator("$nin", value)

    def negate(self):
        self.insidedoc = {"$not": self.insidedoc.getdoc()}

    def addexists(self):
        self.addnormaloperator("$exists", True)

    def addnotexists(self):
        self.addnormaloperator("$exists", False)

    def addnormaloperator(self, operator, value):
        self.insidedoc.add(operator, value)

    def getdoc(self):
        try:
            if self.insidedoc:
                self.add(self.field, self.insidedoc)
            return self.doc
        except AttributeError:
            return self.insidedoc.getdoc()

    def __str__(self):
        return str(self.getdoc())
def document(handler_lua=None, handler_lua_template=None): from document import Document from resource import Resource doc = Document() if handler_lua_template: import deje.handlers.lua as handlers handler_lua = getattr(handlers, handler_lua_template)() if handler_lua: handler = Resource("/handler.lua", handler_lua, "The primary handler", "text/lua") doc.add_resource(handler) return doc
def learn(self, directory, classname):
    x = DocumentClass()
    files = os.listdir(directory)
    for file in files:
        d = Document()
        print(directory + "/" + file)
        d.read_document(directory + "/" + file)
        x = x + d
    self.__document_classes[classname] = x
    # print(len(files))
    x.setNumberOfDocs(len(files))
def _build_dict(self, dir_name):
    """ Helper function that builds up the inverse index """
    index = dict()
    # Builds dictionary as word -> list of Document objects
    for file_name in os.listdir(dir_name):
        doc = Document(dir_name + '/' + file_name)
        for word in doc.get_words():
            if word not in index:
                index[word] = list()
            index[word].append(doc)
    return index
def assemble_order(rein, document):
    """
    Take one document and build the entire order based on it. The idea here
    is that one Job ID should allow us to query each available server for
    each document type that is associated with it, then filter out bogus
    documents by focusing on which ones are signed correctly. This kind of
    command can also look for attempted changes in foundational info like
    participants' public keys and redeem scripts.

    If this works well, we can reduce how much data is required at each
    stage. Finally, we should be able to serialize a job from end to end so
    it can be easily reviewed by a mediator.
    """
    parsed = parse_document(document.contents)
    if 'Job ID' not in parsed:
        return 0
    job_id = parsed['Job ID']
    urls = Bucket.get_urls(rein)
    documents = []
    if job_id:
        for url in urls:
            # query the remote server for all docs associated with the job_id
            res = Document.get_documents_by_job_id(rein, url, job_id)
            if res:
                documents += res
    order_id = Order.get_order_id(rein, job_id)
    if not order_id:
        o = Order(job_id, testnet=rein.testnet)
        rein.session.add(o)
        rein.session.commit()
        # refresh the id for the newly created order (the original left
        # order_id unset here; the companion assemble_orders re-queries)
        order_id = Order.get_order_id(rein, job_id)
    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_id)
            rein.session.add(d)
        else:
            new_document = Document(rein, doc_type, document, order_id,
                                    'remote', source_key=None,
                                    sig_verified=True, testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()
    return len(documents)
def set_clinical_notes(self, bundle, prefix=None):
    """Generates and appends a ClinicalNote entry to the transaction"""
    if GENERATION_MAP["ClinicalNotes"] and self.pid in ClinicalNote.clinicalNotes:
        for d in ClinicalNote.clinicalNotes[self.pid]:
            if d.mime_type == 'text/plain':
                data = fetch_document(self.pid, d.file_name)
                # d.content = data['base64_content']
                # b = d
                # id = uid("Binary", "%s-note" % d.id, prefix)
                # d.binary_id = id
                binary_id = uid(None, "%s-note" % d.id, prefix)
                note = Binary({
                    "mime_type": d.mime_type,
                    "content": data['base64_content'],
                    "id": binary_id
                })
                bundle["entry"].append(note)
                # if GENERATION_MAP["Documents"]:
                docRef = Document({
                    'ID': uid(None, "%s-note-ref" % d.id, prefix),
                    'PID': self.pid,
                    'DATE': datetime.now().strftime("%Y-%m-%dT%H:%M:%S+05:00"),  # .isoformat()
                    'TITLE': "Note",
                    'MIME_TYPE': d.mime_type,
                    'FILE_NAME': d.file_name,
                    'TYPE': "Note",
                    'mime_type': d.mime_type
                })
                bundle["entry"].append(docRef.toJSON(data, binary_id, prefix))
                # id = uid("DocumentReference", "%s-note" % d.id, prefix)
                # d.system = "http://loinc.org"
                # d.code = '34109-9'
                # d.display = 'Note'
                # template = template_env.get_template('document.xml')
                # print >>pfile, template.render(dict(globals(), **locals()))
    return bundle
def generate_single_image(fn_args):
    """
    Generate and save a single image.

    Parameters
    ----------
    fn_args : dict
        Holds the parsed command-line arguments (under 'args') and the
        iteration number (under 'iter') used to generate a single image
        and save it to the given output directory.
    """
    dprint("Generating image #{}".format(fn_args['iter'] + 1))
    try:
        document = Document(fn_args['args'].stain_level,
                            fn_args['args'].text_noise_level,
                            output_loc=fn_args['args'].output_dir)
        document.create(bypass=fn_args['args'].bypass_divadid)
        document.save()
        document.save_ground_truth()
    except cv2.error as exception:
        dprint(document.random_seed)
        dprint(type(exception))
        dprint(exception.args)
        with open("errors.txt", "a+") as errors:
            errors.write("{}\n".format(document.random_seed))
def __init__(self):
    for dir_ in glob(self.master_dir + "/*"):
        print "\nProcessing", dir_
        for essay in glob(dir_ + "/*"):  # essays nested in subdirs
            if essay not in self.essay_vectors:
                print "\nDoubleChecking", essay
                doc = Document(essay, "Wil")
                # should probably truncate the first "essay" argument
                # to just the filename
                doc.document_to_text(essay, essay)
                doc.preprocess_text()
                doc.statistics()
                errors = doc.proofread()
                err_stats = {'grammar': 0, 'suggestion': 0, 'spelling': 0}
                try:
                    for err in errors:
                        err_stats[err["type"]] += 1
                except TypeError:
                    print "No errors!"
                token_sentence_ratio = doc.stats['tokens'] / doc.stats['sentences']
                self.essay_vectors[essay] = [
                    err_stats['grammar'],
                    err_stats['suggestion'],
                    err_stats['spelling'],
                    token_sentence_ratio,
                ]
                print "Completed " + essay + ". Sleeping..."
                sleep(10)
class PubmedArticleSet(handler.ContentHandler):
    def __init__(self):
        handler.feature_external_ges = "false"
        self.docs = {}
        self.doc = None
        self.chars = ""

    def startElement(self, name, attr):
        if name == 'PubmedArticle' or name == 'PubmedBookArticle':
            self.doc = Document()
        self.chars = ""

    def endElement(self, name):
        if name == 'PubmedArticle':
            self.docs[self.doc.pmid] = self.doc
        if name == 'PMID' and self.doc.pmid is None:
            self.doc.pmid = self.text()
        if name == 'ArticleTitle':
            self.doc.title = self.text()
        if name == 'AbstractText':
            if self.doc.abstract is None:
                self.doc.abstract = self.text()
            else:
                self.doc.abstract += self.text()
        if name == 'DescriptorName':
            self.doc.addMeSH(self.text())

    def characters(self, data):
        self.chars += data

    def text(self):
        return self.chars.strip().encode('ascii', 'ignore')

    ## Method to parse a PubmedArticleSet XML file.
    # @param location The location of the xml file to parse
    # @return A PubmedArticleSet object
    @classmethod
    def parse(cls, location):
        parser = make_parser()
        parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", False)
        handler = PubmedArticleSet()
        parser.setContentHandler(handler)
        try:
            f = open(location, 'r')
            parser.parse(f)
            f.close()
        except Exception, e:
            raise RuntimeError, "Could not parse PubmedArticleSet XML file at %s" % location
        return handler
def vector_test():
    doc = Document("Untitled-1")

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200, 1300),
        Vector2(1400, 1500),
    ]
    doc.addCut(cut)

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200 + 600 * 10, 1300 + 200),
        Vector2(1400 + 600 * 10, 1500 + 200),
    ]
    doc.addCut(cut)

    cut = Cut(50, 100, 50)
    cut.points = [
        Vector2(1200 + 200, 1300 + 200),
        Vector2(1400 + 200, 1500 + 200),
    ]
    doc.addCut(cut)

    return doc
def __init__(self, K: int, docs: Document, num_MH=2) -> None:
    self.K = K
    self._documents = docs.get_documents()
    self._V = docs.get_num_vocab()
    self._D = docs.get_num_docs()
    self._beta = 0.1
    self._Vbeta = self._V * self._beta
    self._alpha = 0.01
    # the original hard-coded 0.1 * K here, which disagrees with
    # self._alpha = 0.01; derive the sum from alpha instead
    self._sum_alpha = self._alpha * K
    self._nkv = np.zeros((self.K, self._V)).astype(np.int32)  # topic-word counts
    self._ndk = np.zeros((self._D, self.K)).astype(np.int32)  # document-topic counts
    self._nk = np.zeros(self.K).astype(np.int32)              # tokens per topic
    self._z = []                                              # per-token topic assignments
    self.num_MH = num_MH                                      # Metropolis-Hastings steps per token
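# The counters above follow standard collapsed-Gibbs LDA bookkeeping. A
# minimal initialization sketch under that assumption (not taken from the
# source): assign each token a random starting topic and increment the three
# counters accordingly. Assumes each entry of self._documents is a list of
# word ids, as suggested by get_documents()/get_num_vocab().
import numpy as np

def init_topics(self):
    rng = np.random.default_rng()
    for d, doc in enumerate(self._documents):
        z_d = []
        for w in doc:
            k = int(rng.integers(self.K))  # random starting topic for token w
            z_d.append(k)
            self._nkv[k, w] += 1           # topic-word count
            self._ndk[d, k] += 1           # document-topic count
            self._nk[k] += 1               # tokens assigned to topic k
        self._z.append(z_d)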