Example #1
 def test_word_tokenizing(self):
     text = "This is a test sentence."
     with open("../process/tmp_test_file.txt", "w") as test_file:
         test_file.write(text)
     d = Document("tmp_test_file.txt", "testuser")
     d.preprocess_text()
     self.assertEqual(d.preprocessed['tokens'], 6, "word tokenizing failed, incorrect number of tokens")
Example #2
File: pool.py Project: TechBK/NLP
    def Probability(self, doc, dclass = ""):
        """Calculates the probability for a class dclass given a document doc"""
        if dclass:
            sum_dclass = self.sum_words_in_class(dclass)
            prob = 0

            d = Document(self.__vocabulary)
            d.read_document(doc)

            for j in self.__document_classes:
                sum_j = self.sum_words_in_class(j)
                prod = 1
                for i in d.Words():
                    wf_dclass = 1 + self.__document_classes[dclass].WordFreq(i)
                    wf = 1 + self.__document_classes[j].WordFreq(i)
                    r = wf * sum_dclass / (wf_dclass * sum_j)
                    prod *= r
                prob += prod * self.__document_classes[j].NumberOfDocuments() / self.__document_classes[dclass].NumberOfDocuments()
            if prob != 0:
                return 1 / prob
            else:
                return -1
        else:
            prob_list = []
            for dclass in self.__document_classes:
                prob = self.Probability(doc, dclass)
                prob_list.append([dclass,prob])
            prob_list.sort(key = lambda x: x[1], reverse = True)
            return prob_list
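A minimal usage sketch for the Pool class behind this Probability() method, combined with the learn() method shown in Example #30. The Pool constructor call, the directory names, and the file name below are assumptions for illustration, not part of the TechBK/NLP project.

# Hypothetical usage sketch; constructor signature, paths and class labels are assumed.
pool = Pool()
pool.learn("corpus/spam", "spam")   # learn() as defined in Example #30
pool.learn("corpus/ham", "ham")

# With an explicit class: probability that the document belongs to that class.
p_spam = pool.Probability("mail/unknown_001.txt", "spam")

# Without a class: [class, probability] pairs sorted by descending probability.
ranking = pool.Probability("mail/unknown_001.txt")
print(ranking[0])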
Example #3
class TestDocument(unittest.TestCase):
    def setUp(self):
        self.d = Document()
        self.d.insert("a")

    def test_cursor(self):
        self.assertEqual(self.d.cursor.position, 1)
        self.d.save("tst")
        try:
            remove("tst")
        except OSError:
            pass
        self.d.cursor.back()
        self.d.delete()
        self.assertEqual(self.d.cursor.position, 0)

    def test_multiple_chars_and_escape(self):
        self.d.cursor.home()
        self.d.delete()
        string = ["h", "e", "l", "l", "o", "\n", "w", "o", "r", "l", "d", "!"]
        for i in string:
            self.d.insert(i)
        self.assertEqual(self.d.string, "hello\nworld!")

    def test_string_property(self):
        self.assertEqual(self.d.string, "a")
Example #4
    def test_array_delete(self):
        doc0 =  Document()
        doc0.snapshot = []
        doc1 = self.doc1
        doc2 = self.doc2

        # can technically delete nothing from empty list. why not
        op1 = Op('ad', [], offset=0, val=0)
        doc0.apply_op(op1)
        self.assertEqual(doc0.snapshot, [])

        # remove one from list
        op2 = Op('ad', [], offset=1, val=1)
        doc2.apply_op(op2)
        self.assertEqual(doc2.get_value([1]), 'normal, ol string')

        # from nested lists
        op3 = Op('ad', [2], offset=1, val=1)
        doc2.apply_op(op3)
        self.assertEqual(doc2.get_value([2]), [['multi'],['array']])

        # delete multiple elements
        op4 = Op('ad', [], offset=0, val=4)
        doc2.apply_op(op4)
        self.assertEqual(doc2.snapshot, [None, 42])

        # delete last in list:
        op5 = Op('ad', [], offset=1, val=1)
        doc2.apply_op(op5)
        self.assertEqual(doc2.snapshot, [None])

        # in dicts
        op6 = Op('ad', ['fifth'], offset=2, val=2)
        doc1.apply_op(op6)
        self.assertEqual(doc1.get_value(['fifth']), [55,66])
Example #5
 def test_call_pod_renderer_with_document_file_and_context_and_result_file_in_render_then_call_run(self, renderer):
     doc = Document(context="context")
     doc.document_file = "document"
     doc.result_file = "result"
     doc.render()
     renderer.assert_called_once_with("document", "context", "result")
     renderer.return_value.run.assert_called_once_with()
Example #6
def categorize_document(unknown_document, k):
    nearest_neighbors = dict()

    for football_document in footballDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, football_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = football_document.category
        else:
            update_neighbors(nearest_neighbors, football_document.category, distance)

    print("\n")

    for python_document in pythonDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, python_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = python_document.category
        else:
            update_neighbors(nearest_neighbors, python_document.category, distance)

    football_documents_count = 0
    python_documents_count = 0

    for value in nearest_neighbors.values():
        if value == Category.Football:
            football_documents_count += 1
        elif value == Category.Python:
            python_documents_count += 1

    if football_documents_count >= python_documents_count:
        unknown_document.category = Category.Football
    else:
        unknown_document.category = Category.Python
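Document.calculate_tanimoto_distance itself is not shown in this example; a minimal sketch of what such a method typically computes (the Tanimoto, i.e. Jaccard, distance between the two documents' word sets) might look like the following. The get_words() accessor is an assumption.

# Minimal sketch of a Tanimoto (Jaccard) distance between two documents.
# get_words() is assumed; it only needs to return an iterable of words.
def calculate_tanimoto_distance(doc_a, doc_b):
    a, b = set(doc_a.get_words()), set(doc_b.get_words())
    common = len(a & b)
    union = len(a) + len(b) - common
    if union == 0:
        return 0.0
    return 1.0 - float(common) / union  # 0.0 = identical word sets, 1.0 = no overlap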
Example #7
 def __init__(self, json_str):
     Document.__init__(self)
     self.json_object = json.loads(json_str)
     if "document_width" in self.json_object:
         self.document_width = self.json_object["document_width"]
     for field in self.json_object["fields"]:
         self.add_field(Field(unicode(field["text"]), field["x"], field["y"], field["length"]))
Example #8
    def run(self, index_file):
        """
        Generate the features using Top N algorithm
        """
        with open(index_file) as f:
            lines = f.readlines()
            for line in lines:
                name = line[:-1]
                with open("../data/scoped/%s" % name, 'r') as d:
                    document = Document(d.read())
                    self.table.add_document(name, document.content_lower)

        new_data_set = self.table.top_n_words(10)
        for document_name, words in new_data_set.iteritems():

            with open("../data/scoped/%s" % document_name, 'r') as d:
                document = Document(d.read())

            path_name = "../data/features/%s" % document_name

            with open("%s" % path_name, 'w') as f:
                for word in words:
                    for _ in xrange(document.count(word)):
                        f.write(word)
                        f.write("\n")
Example #9
def generate_document_data(chapter_paths, word_count):
    """
    Generate visualization data for a set of chapters.

    Given input chapters we want to find both the unique words being used inside
    of each chapter and how frequent they are within the text as a whole.

    chapter_paths - A list of paths to chapters
    word_count - The number of most frequent words to grab for each chapter

    Returns a list looking like this:
    [
        [
            {
                "word": wart
                "freq": .7
                "uniqueness": .5
                "pos": .1
            }
        ],
    ]

    This is a list of chapters, where each chapter is a list of word
    dictionaries and each word dictionary has the word itself, the frequency of
    the word in that chapter, the uniqueness of the word overall, and the first
    position the word is observed. All of the latter three values are scaled
    from 0-1 with respect to the chapter (the most frequent word receives a 1,
    for instance).
    """
    document = Document(chapter_paths)
    return [generate_chapter_data(word_list, word_count, document) for word_list
            in document.get_chapters()]
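generate_chapter_data is not shown here; a rough sketch of the 0-1 scaling the docstring describes, assuming raw per-word counts for one chapter are already available, could be:

# Rough sketch of the 0-1 scaling described above (assumed helper, not from the source).
# raw_freqs maps word -> raw count within one chapter.
def scale_frequencies(raw_freqs):
    max_count = float(max(raw_freqs.values()))
    return {word: count / max_count for word, count in raw_freqs.items()}

# {'wart': 14, 'sword': 7} -> {'wart': 1.0, 'sword': 0.5}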
Example #10
    def test_array_insert(self):
        doc0 =  Document()
        doc0.snapshot = []
        doc1 = self.doc1
        doc2 = self.doc2

        # whole doc is just an empty array. alter it
        op1 = Op('ai', [], val='c', offset=0)
        doc0.apply_op(op1)
        self.assertEqual(doc0.snapshot, ['c'])
        # insert at start
        op2 = Op('ai', [], val='a', offset=0)
        doc0.apply_op(op2)
        self.assertEqual(doc0.snapshot, ['a', 'c'])
        # insert at end
        op3 = Op('ai', [], val='d', offset=2)
        doc0.apply_op(op3)
        self.assertEqual(doc0.snapshot, ['a','c','d'])
        # insert in middle
        op4 = Op('ai', [], val='b', offset=1)
        doc0.apply_op(op4)
        self.assertEqual(doc0.snapshot, ['a','b','c','d'])

        # insert into some array deep in doc
        op5 = Op('ai', [3,1], val='a', offset=1)
        doc2.apply_op(op5)
        self.assertEqual(doc2.get_value([3,1]), ['dimen', 'a'])

        # again
        op6 = Op('ai', ['fifth'], val='a', offset=1)
        doc1.apply_op(op6)
        result6 = [55,'a',66,{'sixth': 'deep string'}, 'rw']
        self.assertEqual(doc1.get_value(['fifth']), result6)
Example #11
	def test_textWithWeirdFormatting (self):
		sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx', '11_weird_formatting.docx')
		docxProcessor = DocxProcessor(sampleDocxFile)
		document = docxProcessor.document()
		expectedDocument = Document().initWithFile(os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx', 'test_11'))

		self.assertEquals(expectedDocument.content(), document.content())
		self.assertEquals(expectedDocument.formatting(), document.formatting())
Example #12
	def test_textWithLineBlocks (self):
		sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx', '08_line_blocks.docx')
		docxProcessor = DocxProcessor(sampleDocxFile)
		document = docxProcessor.document()
		expectedDocument = Document().initWithFile(os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx', 'test_08'))

		self.assertEquals(expectedDocument.content(),	 document.content())
		self.assertEquals(expectedDocument.formatting(), document.formatting())
Example #13
	def test_parseSimplePdf (self):
		samplePdfFile = os.path.join(os.getcwd(), 'samples', 'pdf', '01_simple_text.pdf')
		pdfProcessor = PdfProcessor(samplePdfFile)
		document = pdfProcessor.document()
		expectedDocument = Document().initWithFile(os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx', 'test_01'))

		self.assertEquals(expectedDocument.content(), document.content())
		self.assertEquals(expectedDocument.formatting(), document.formatting())
Example #14
    def testSelectsWhenMouseClicked(self):
        document = Document()
        document.new_shape()
        document.current_shape.append_point((0, 0))
        tool = SelectTool(document)

        self.assertTrue(document.selected_point_index is None)
        _perform_click(tool, 0, 0)
        self.assertTrue(document.selected_point_index == 0)
Example #15
def main():
    tokenizer = Tokenizer()

    doc = Document(1)
    #doc.add_text("hello i am paradox")
    doc.load_from_file("documents/test.txt")
    doc.extract_terms(tokenizer)
    doc.generate_frequency_map()
    doc.display()
Example #16
	def test_sampleDocumentProcessing (self):
		sampleSubstanceFile = os.path.join(os.getcwd(), 'samples', 'substance', '01_sample.json')
		substanceProcessor = SubstanceProcessor().initWithFile(sampleSubstanceFile)
		document = substanceProcessor.document()
		expectedDocument = Document().initWithFile(os.path.join(os.getcwd(), 'samples', 'expected outcome', 'substance_01'))

		print("DOCUMENT METADATA: " + str(document.metadata()))
		self.assertEqual(expectedDocument.content(),	document.content())
		self.assertEqual(expectedDocument.metadata(),	document.metadata())
Example #17
    def test_serialize_and_parse(self):
        self.document.parse_from_tokens(
                self.doc_tokens, random, self.vocabulary)

        test_doc = Document(20)
        test_doc.parse_from_string(self.document.serialize_to_string())

        self.assertEqual(self.document.num_words(), test_doc.num_words())
        self.assertEqual(str(self.document), str(test_doc))
Example #18
    def testDeselectsWhenClickingElsewhere(self):
        document = Document()
        document.new_shape()
        document.current_shape.append_point((0, 0))
        tool = SelectTool(document)

        _perform_click(tool, 0, 0)
        _perform_click(tool, 1000, 0) # Click far away
        self.assertTrue(document.selected_point_index is None)
Example #19
def test_document_class_2():
    file = 'empty.txt'
    test = Document(file)

    # Tests Document object representing empty file
    assert_equals([], test.get_words())
    assert_equals(0, test.term_frequency('dogs'))
    assert_equals(file, test._file_name)
    assert_equals({}, test._term_frequency)
Example #20
File: pool.py Project: TechBK/NLP
 def DocumentIntersectionWithClasses(self, doc_name):
     res = [doc_name]
     for dc in self.__document_classes:
         d = Document(self.__vocabulary)
         d.read_document(doc_name, learn=False)
         o = self.__document_classes[dc] &  d
         intersection_ratio = len(o) / len(d.Words())
         res += (dc, intersection_ratio)
     return res
Example #21
def assemble_orders(rein, job_ids):
    """
    Take a list of job_ids and build their entire orders. The idea here is that one Job ID should
    allow us to query each available server for each document type that is associated with it, then
    filter out cruft by focusing on who's signed correctly.

    TODO: look for attempted changes in foundational info like participants public keys and redeem scripts.
    """
    urls = Bucket.get_urls(rein)
    documents = []
    arg_job_ids = ','.join(job_ids)
    for url in urls:
        # queries remote server for all docs associated with a job_id
        res = Document.get_documents_by_job_id(rein, url, arg_job_ids)
        if res:
            documents += res

    order_ids = {}
    order_id = None
    for job_id in job_ids:
        order_id = Order.get_order_id(rein, job_id)
        if not order_id:
            o = Order(job_id, testnet=rein.testnet)
            rein.session.add(o)
            rein.session.commit()
        order_id = Order.get_order_id(rein, job_id)
        order_ids[job_id] = order_id

    if not order_id:
        return 0

    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        job_id = Document.get_job_id(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_ids[job_id])
            rein.session.add(d)
        else:
            new_document = Document(rein,
                                    doc_type,
                                    document,
                                    order_id,
                                    'remote',
                                    source_key=None,
                                    sig_verified=True,
                                    testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()

    return len(documents)
Example #22
	def __init__(self):
		super(type(self), self).__init__()

		self._state = TAB_STATE_NORMAL

		self._not_editable = False

		self._save_flags = 0
		
		self._ask_if_externally_modified = True;
		
		# Create the scrolled window
		sw = gtk.ScrolledWindow()
		self._view_scrolled_window = sw

		sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)


		self._auto_save_timeout = 0 # Not Implemented
#TODO:
#	/* Manage auto save data */
#	lockdown = gedit_app_get_lockdown (gedit_app_get_default ());
#	tab->priv->auto_save = gedit_prefs_manager_get_auto_save () &&
#			       !(lockdown & GEDIT_LOCKDOWN_SAVE_TO_DISK);
#	tab->priv->auto_save = (tab->priv->auto_save != FALSE);

#	tab->priv->auto_save_interval = gedit_prefs_manager_get_auto_save_interval ();
#	if (tab->priv->auto_save_interval <= 0)
#		tab->priv->auto_save_interval = GPM_DEFAULT_AUTO_SAVE_INTERVAL;

		doc = Document()
		doc.set_data(TAB_KEY, self)
		
		self._document = doc

#	_gedit_document_set_mount_operation_factory (doc,
#						     tab_mount_operation_factory,
#						     tab);

		self._view = View(doc)
		self._view.show()
		self._view.set_data(TAB_KEY, self)
		
		self.pack_end(sw, True, True, 0)
#	gtk_box_pack_end (GTK_BOX (tab), sw, TRUE, TRUE, 0);

		sw.add(self._view)
#	gtk_container_add (GTK_CONTAINER (sw), tab->priv->view);

		sw.set_shadow_type(gtk.SHADOW_IN)
#	gtk_scrolled_window_set_shadow_type (GTK_SCROLLED_WINDOW (sw),
#					     GTK_SHADOW_IN);

		sw.show()
		
		self._scrolledwindow = sw
Example #23
def main():
    test1 = Document('test_docs/test1.txt')
    test2 = Document('test_docs/test2.txt')
    test3 = Document('test_docs/test3.txt')
    test4 = Document('test_docs/test4.txt')
    test_search1 = SearchEngine('test_docs')

    test_document(test1, test2, test3, test4)
    test_single(test_search1)
    test_mulit(test_search1)
Example #24
def step(context):
    archivedVersion = Document().initWithFile(context.archivedVersion)
    #	archivedVersion.writeTo(os.path.join(context.tmpFolder, '_archived'))
    newVersion = Document().initWithFile(context.processedFile)
    #	newVersion.writeTo(os.path.join(context.tmpFolder, '_new'))
    mergedVersion = archivedVersion.mergeWithDocument(newVersion)
    mergedVersionFile = os.path.join(context.tmpFolder, 'mergedVersion')
    mergedVersion.writeTo(mergedVersionFile)
    #	mergedVersion.writeTo(os.path.join(context.tmpFolder, '_merged'))
    context.processedFile = mergedVersionFile
Example #25
def step(context):
	archivedVersion = Document().initWithFile(context.archivedVersion)
#	archivedVersion.writeTo(os.path.join(context.tmpFolder, '_archived'))
	newVersion = Document().initWithFile(context.processedFile)
#	newVersion.writeTo(os.path.join(context.tmpFolder, '_new'))
	mergedVersion = archivedVersion.mergeWithDocument(newVersion)
	mergedVersionFile = os.path.join(context.tmpFolder, 'mergedVersion')
	mergedVersion.writeTo(mergedVersionFile)
#	mergedVersion.writeTo(os.path.join(context.tmpFolder, '_merged'))
	context.processedFile = mergedVersionFile
Example #26
    def handle_invalid_url(self, url, message):
        document = Document(url)
        document.valid = False

        if document not in self.invalid_documents:
            self.invalid_documents.append(document)

        print(colored(message, 'red'))
        print('URL: %s' % (url))
        print('\n')
Example #27
 def test_023(self):
     """ Document text setter """
     document = Document("tests/4page.pdf", "tests")
     document.text[0] = "goo"
     # TODO
     #self.assertEqual(document.text[0], "goo")
     for i in range(1,5):
         os.remove("tests/4page" + str(i) + ".pdf")
         os.remove("tests/4page" + str(i) + ".txt")
         os.remove("tests/4page" + str(i) + ".json")
Example #28
def generate_prn(out, paths, title):
    doc = Document(title)
    dpi = doc.getResolution()
    for path in paths:
        cut = Cut(4, 100, 50)
        # Convert to doc's resolution.
        cut.points = [Vector2(p.x*dpi/DPI, p.y*dpi/DPI) for p in path]
        doc.addCut(cut)

    epilog.generate_prn(out, doc)
Example #29
    def __init__(self):
        super(type(self), self).__init__()

        self._state = TAB_STATE_NORMAL

        self._not_editable = False

        self._save_flags = 0

        self._ask_if_externally_modified = True

        # Create the scrolled window
        sw = gtk.ScrolledWindow()
        self._view_scrolled_window = sw

        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)

        self._auto_save_timeout = 0  # Not Implemented
        #TODO:
        #	/* Manage auto save data */
        #	lockdown = gedit_app_get_lockdown (gedit_app_get_default ());
        #	tab->priv->auto_save = gedit_prefs_manager_get_auto_save () &&
        #			       !(lockdown & GEDIT_LOCKDOWN_SAVE_TO_DISK);
        #	tab->priv->auto_save = (tab->priv->auto_save != FALSE);

        #	tab->priv->auto_save_interval = gedit_prefs_manager_get_auto_save_interval ();
        #	if (tab->priv->auto_save_interval <= 0)
        #		tab->priv->auto_save_interval = GPM_DEFAULT_AUTO_SAVE_INTERVAL;

        doc = Document()
        doc.set_data(TAB_KEY, self)

        self._document = doc

        #	_gedit_document_set_mount_operation_factory (doc,
        #						     tab_mount_operation_factory,
        #						     tab);

        self._view = View(doc)
        self._view.show()
        self._view.set_data(TAB_KEY, self)

        self.pack_end(sw, True, True, 0)
        #	gtk_box_pack_end (GTK_BOX (tab), sw, TRUE, TRUE, 0);

        sw.add(self._view)
        #	gtk_container_add (GTK_CONTAINER (sw), tab->priv->view);

        sw.set_shadow_type(gtk.SHADOW_IN)
        #	gtk_scrolled_window_set_shadow_type (GTK_SCROLLED_WINDOW (sw),
        #					     GTK_SHADOW_IN);

        sw.show()

        self._scrolledwindow = sw
Example #30
File: pool.py Project: TechBK/NLP
 def learn(self, directory, dclass_name):
     """ directory is a path, where the files of the class with the name dclass_name can be found """
     x = DocumentClass(self.__vocabulary)
     dir = os.listdir(directory)
     for file in dir:
         d = Document(self.__vocabulary)
         print(directory + "/" + file)
         d.read_document(directory + "/" +  file, learn = True)
         x = x + d
     self.__document_classes[dclass_name] = x
     x.SetNumberOfDocs(len(dir))
Example #31
    def update_orders(self, rein, Document):
        from market import assemble_orders
        documents = Document.get_user_documents(rein)
        job_ids = []
        for document in documents:
            job_id = Document.get_job_id(document.contents)
            if job_id not in job_ids:
                if document.source_url == 'local' and document.doc_type != 'enrollment':
                    job_ids.append(job_id)

        assemble_orders(rein, job_ids)
Example #32
    def test_copy(self):
        """
        Test that when copying a document, the text, dimensions and attributes are identical.
        """

        document = Document('this is a pipe', {'pipe': 1},
                            attributes={'timestamp': time.time()})
        copy = document.copy()
        self.assertEqual(document.text, copy.text)
        self.assertEqual(document.dimensions, copy.dimensions)
        self.assertEqual(document.attributes, copy.attributes)
Example #33
    def test_export_attributes(self):
        """
        Test that exporting and importing documents include their attributes.
        """

        text = 'this is not a pipe'
        d = Document(text, text.split(), attributes={'timestamp': 10})
        e = d.to_array()
        self.assertEqual(d.attributes, Document.from_array(e).attributes)
        self.assertEqual(d.attributes['timestamp'],
                         Document.from_array(e).attributes['timestamp'])
Example #34
    def _createDocument(self):

        document = Document()
        document.setPreferences(self._preferences)
        document.aboutToClose.connect(self._onDocumentAboutToClose)

        subWindow = self._documentArea.addSubWindow(document)
        subWindow.setWindowIcon(QIcon())
        subWindow.showMaximized()

        return document
Example #35
    def BuildIndex(doclist):
        index = Index()

        for docId, doc in enumerate(doclist):
            doc1 = Document()
            doc1 = doc
            # print doc.GetTermVec()
            for word in doc1.GetTermVec():
                index.AddOne(word, docId)  #doc1.GetId())

        return index
Example #36
 def doc_get(self, docname):
     # Retrieve a document from cache, creating from CPS if necessary.
     if docname in self.cache:
         return self.cache[docname]
     doc = Document({})
     if docname in self:
         json_data = json.loads(self[docname])
         doc.load(json_data)
     self.doc_set(docname, doc)
     self.doc_save(docname)
     return doc
Example #37
	def __init__(self, canvas_size, layers):
		default_brush_style = BrushInfo(brushdata)
		Document.__init__(self, default_brush_style)
		self.canvas_size = canvas_size
		self.layer_count = layers
		
		tile_size = (canvas_size[0] + 64 - (canvas_size[0] % 64), canvas_size[1] + 64 - (canvas_size[1] % 64))
		self.set_frame(0, 0, *tile_size)
		
		for layer in range(layers):
			self.add_layer(layer)
Example #38
 def doc_get(self, docname):
     # Retrieve a document from cache, creating from CPS if necessary.
     if docname in self.cache:
         return self.cache[docname]
     doc = Document({})
     if docname in self:
         json_data = json.loads(self[docname])
         doc.load(json_data)
     self.doc_set(docname, doc)
     self.doc_save(docname)
     return doc
Example #39
 def dropEvent(self, e):
     mime = e.mimeData()
     if mime.hasUrls():
         urls = mime.urls()
         for url in urls:
             d = Document()
             d.load_file(url.toLocalFile())
             sub_widget = QtGui.QMdiSubWindow()
             sub_widget.setWidget(d)
             self.mdiArea.addSubWindow(sub_widget)
     e.accept()
Example #40
    def _set_vertical(r1, r2, olabels,
                      length,
                      posmethod="beyondten",
                      docmethod="insert",
                      vertrel="nonrel",
                      blocksize=3,
                      independentplacement=True):

        if independentplacement:
            pos1 = VASyntheticComparisonExperiment._vertpos(r1,
                                                            r2,
                                                            posmethod,
                                                            blocksize)
            pos2 = VASyntheticComparisonExperiment._vertpos(r1,
                                                            r2,
                                                            posmethod,
                                                            blocksize)
        else:
            pos1 = pos2 = VASyntheticComparisonExperiment._vertpos(r1,
                                                               r2,
                                                               posmethod,
                                                               blocksize)

        if docmethod == "assign":
            r1 = [d.set_type(pos1 <= i < (pos1 + blocksize))
                  for i, d in enumerate(r1)]
            r2 = [d.set_type(pos2 <= i < (pos2 + blocksize))
                  for i, d in enumerate(r2)]
        elif docmethod == "insert":
            maxid = max(d.get_id() for d in r1 + r2)
            r1 = [d.set_type(False) for d in r1]
            r2 = [d.set_type(False) for d in r2]
            for i in range(blocksize):
                r1.insert(pos1, Document(maxid + i + 1, True))
                r2.insert(pos2, Document(maxid + i + 1, True))

        labels = olabels[:]
        for doc in set(r1 + r2):
            if not doc.get_type():
                continue

            vdoc = doc.get_id()
            if vdoc >= len(labels):
                labels += [0] * (vdoc - len(labels) + 1)

            if vertrel == "nonrel":
                labels[vdoc] = 0
            elif vertrel == "rel":
                labels[vdoc] = 1
            elif vertrel == "ratio":
                ratio = float(sum(olabels)) / length
                labels[vdoc] = numpy.random.binomial(1, ratio)

        return r1, r2, labels
Example #41
 def test_064(self):
     """ config stem is valid """
     document = Document("tests/4page.pdf", "tests", config=['stem=internal'])
     document = Document("tests/4page.pdf", "tests", config=['stem=porter'])
     document = Document("tests/4page.pdf", "tests", config=['stem=snowball'])
     document = Document("tests/4page.pdf", "tests", config=['stem=lancaster'])
     document = Document("tests/4page.pdf", "tests", config=['stem=lemma'])
     for i in range(1,5):
         os.remove("tests/4page" + str(i) + ".txt")
         os.remove("tests/4page" + str(i) + ".pdf")
         os.remove("tests/4page" + str(i) + ".json")
Example #42
    def test_parseSimplePdf(self):
        samplePdfFile = os.path.join(os.getcwd(), 'samples', 'pdf',
                                     '01_simple_text.pdf')
        pdfProcessor = PdfProcessor(samplePdfFile)
        document = pdfProcessor.document()
        expectedDocument = Document().initWithFile(
            os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                         'test_01'))

        self.assertEquals(expectedDocument.content(), document.content())
        self.assertEquals(expectedDocument.formatting(), document.formatting())
Example #43
def main():
	#inputs = ['ip1.txt','ip2.txt']
	#inputs = ['ip3.txt','ip4.txt']
	#inputs = ['sachin1.txt']
	#inputs = ['mal1.txt']
	inputs = ['ip5.txt','ip6.txt','ip7.txt']
	no_of_clusters = int(sys.argv[1])
	doc = Document(inputs,no_of_clusters)
	count = 0
	print "Number of Sentences :"
	print len(doc.sentences)
	#print doc.sent_no_swords
	#print len(doc.sent_no_swords)
	'''

	print "Initial cluster sentences:"

	for i in range(len(doc.clusters)):
		print doc.clusters[i][0],
	'''	

	print "Selecting sentence from each cluster..."
	doc.cluster_vector()
	doc.find_clust_similar_sent()
	#print ""
	#print "Cluster sentences:\n"
	#print doc.clust_sentences

	#print "Assigning weights to cluster sentences:"
	#doc.select_cluster_sentences()

	

	#doc.printclust_sentences()
	#doc.print_rogue_clust_sentences()
	print "Ordering...."
	for input_file in inputs:
		count = count +1
	if count == 1:
		doc.print_sent_ordered()

	#ordering
	

	first = ordering.precedence_ordering(doc,doc.clust_sentences)

	tempv = doc.clust_sentences[0]
	doc.clust_sentences[0] = doc.clust_sentences[first]
	doc.clust_sentences[first] = tempv

	ordered_sentences=ordering.similarity_ordering(doc,doc.clust_sentences)
	#print doc.clust_sentences,ordered_sentences

	#****exchange 1st sentence in the cluster with first


	for i in ordered_sentences:
		print doc.sentences[i].lstrip().capitalize(),". ",
Example #44
def doc_from_legacy_dict(obj):
    """
    take a loaded legacy dictionary, returns a loaded Document
    """
    doc = Document()
    part = DNAHoneycombPart()   # TODO must generalize
    doc.addPart(part)
    part.setName(obj["name"])
    #self.addVirtualHelixAt(coord, vh, requestSpecificIdnum=num, noUndo=True)
    numBases = len(obj['vstrands'][0]['scaf'])
    part.setDimensions((30, 32, numBases))
    for helix in obj['vstrands']:
        row = helix['row']
        col = helix['col']
        scaf= helix['scaf']
        vh = VirtualHelix(numBases=len(scaf), idnum=helix['num'])
        part.addVirtualHelixAt((row,col), vh, requestSpecificIdnum=helix['num'], noUndo=True)
    helixNo, numHelixes = -1, len(obj['vstrands'])
    for helix in obj['vstrands']:
        helixNo += 1
        # print "helix %i/%i (%i%%)"%(helixNo, numHelixes, helixNo*100/numHelixes)
        vh = part.getVirtualHelix(helix['num'])
        scaf = helix['scaf']
        stap = helix['stap']
        loops = helix['loop']
        skips = helix['skip']
        assert(len(scaf)==len(stap) and len(stap)==vh.numBases() and\
               len(scaf)==len(loops) and len(loops)==len(skips))
        for i in range(len(scaf)):
            fiveVH, fiveIdx, threeVH, threeIdx = scaf[i]
            threeVH = part.getVirtualHelix(threeVH)
            # Installing an Xover works on the same strand
            # as well (there is nothing inherently different
            # between an Xover and a same-strand linkage
            # in our current model)
            if threeVH==-1 or threeIdx==-1:
                continue
            
            vh.installXoverFrom3To5(StrandType.Scaffold, i, threeVH, threeIdx, undoable=False, speedy=True)
        for i in range(len(stap)):
            fiveVH, fiveIdx, threeVH, threeIdx = stap[i]
            threeVH = part.getVirtualHelix(threeVH)
            if threeVH==-1 or threeIdx==-1:
                continue
            vh.installXoverFrom3To5(StrandType.Staple, i, threeVH, threeIdx, undoable=False, speedy=True)
        for baseIdx, colorNumber in helix['stap_colors']:
            color = QColor((colorNumber>>16)&0xFF, (colorNumber>>8)&0xFF, colorNumber&0xFF)
            vh.applyColorAt(color, StrandType.Staple, baseIdx, undoable=False)
        for i in range(len(stap)):
            combinedLoopSkipAmount = loops[i] + skips[i]
            if combinedLoopSkipAmount != 0:
                vh.installLoop(StrandType.Scaffold, i, combinedLoopSkipAmount, undoable=False)
    return doc
Example #45
    def __init__(self, canvas_size, layers):
        default_brush_style = BrushInfo(brushdata)
        Document.__init__(self, default_brush_style)
        self.canvas_size = canvas_size
        self.layer_count = layers

        tile_size = (canvas_size[0] + 64 - (canvas_size[0] % 64),
                     canvas_size[1] + 64 - (canvas_size[1] % 64))
        self.set_frame(0, 0, *tile_size)

        for layer in range(layers):
            self.add_layer(layer)
Example #46
def document(handler_lua=None, handler_lua_template=None):
    from document import Document
    from resource import Resource
    doc = Document()
    if handler_lua_template:
        import deje.handlers.lua as handlers
        handler_lua = getattr(handlers, handler_lua_template)()
    if handler_lua:
        handler = Resource('/handler.lua', handler_lua, 'The primary handler',
                           'text/lua')
        doc.add_resource(handler)
    return doc
Example #47
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,topK):
     
     try:
         classifierFile = open(vsmClassifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
     except:
         docs = []
         f = open(posFile,'r')
         for url in f:
             url = url.strip()
             d = Document(url)
             if d and d.text:
                 docs.append(d)
         f.close()
         '''
         docsTF = []
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             docsTF.append(wordsFreq)
         self.classifier = VSMClassifier(docsTF,th)
         '''
         docsTF = []
         vocabTFDic = {}
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             #docsTF.append(wordsFreq)
             for w in wordsFreq:
                 if w in vocabTFDic:
                     vocabTFDic[w] += wordsFreq[w]
                 else:
                     vocabTFDic[w] = wordsFreq[w]
         
         vocabSorted = getSorted(vocabTFDic.items(), 1)
         topVocabDic = dict(vocabSorted[:topK])
         #topVocabDic = vocabTFDic
         
         ndocsTF = []
         '''
         for d in docsTF:
             ndocTF = {}
             for k in topVocabDic:
                 if k in d:
                     ndocTF[k] = d[k]
                 else: 
                     ndocTF[k] = 1/math.e
             ndocsTF.append(ndocTF)
          '''   
         
         self.classifier = VSMClassifier(topVocabDic,ndocsTF,th)
         classifierFile = open(vsmClassifierFileName,"wb")
         pickle.dump(self.classifier,classifierFile)
         classifierFile.close()
Example #48
 def _build_index(dirname: str) -> dict:
     """Helper method to initialize inverted index"""
     inv_index = {}
     for fname in os.listdir(dirname):
         if fname.startswith('.'):
             continue
         doc = Document(f'{dirname}/{fname}')
         for word in doc.get_words():
             if word not in inv_index:
                 inv_index[word] = []
             inv_index[word].append(doc)
     return inv_index
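Once built, the inverted index maps each word to the list of Document objects that contain it, so a lookup is a plain dictionary access. A hypothetical use, assuming _build_index is reachable as shown and the directory name exists:

# Hypothetical lookup against the inverted index (directory name assumed).
inv_index = _build_index('documents')
for doc in inv_index.get('python', []):   # every Document containing the word 'python'
    print(doc)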
Example #49
class QueryDocument(Document):
    def __init__(self, field):
        Document.__init__(self)
        self.insidedoc = Document()
        self.field = field

    def addeq(self, value):
        self.add(self.field, value)

    def addgt(self, value):
        self.addnormaloperator("$gt", value)

    def addgte(self, value):
        self.addnormaloperator("$gte", value)

    def addlt(self, value):
        self.addnormaloperator("$lt", value)

    def addlte(self, value):
        self.addnormaloperator("$lte", value)

    def addne(self, value):
        self.addnormaloperator("$ne", value)

    def addin(self, value):
        self.addnormaloperator("$in", value)

    def addnin(self, value):
        self.addnormaloperator("$nin", value)

    def negate(self):
        self.insidedoc = {"$not": self.insidedoc.getdoc()}

    def addexists(self):
        self.addnormaloperator("$exists", True)

    def addnotexists(self):
        self.addnormaloperator("$exists", False)

    def addnormaloperator(self, operator, value):
        self.insidedoc.add(operator, value)

    def getdoc(self):
        try:
            if self.insidedoc:
                self.add(self.field, self.insidedoc)
            return self.doc
        except AttributeError:
            return self.insidedoc.getdoc()

    def __str__(self):
        return str(self.getdoc())
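A hypothetical usage sketch for QueryDocument, building a MongoDB-style range query; the behaviour of add() and getdoc() is assumed to be exactly what the code above implies.

# Hypothetical usage: build a range query roughly equivalent to
# {"age": {"$gte": 18, "$lt": 65}}.
q = QueryDocument("age")
q.addgte(18)
q.addlt(65)
print(q)   # __str__ delegates to getdoc()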
Example #50
def document(handler_lua=None, handler_lua_template=None):
    from document import Document
    from resource import Resource

    doc = Document()
    if handler_lua_template:
        import deje.handlers.lua as handlers

        handler_lua = getattr(handlers, handler_lua_template)()
    if handler_lua:
        handler = Resource("/handler.lua", handler_lua, "The primary handler", "text/lua")
        doc.add_resource(handler)
    return doc
Example #51
    def learn(self,directory,classname):
        x = DocumentClass()
        dir = os.listdir(directory)

        for file in dir:
            d = Document()
            print(directory + "/" + file)
            d.read_document(directory + "/" +  file)

            x = x + d
        self.__document_classes[classname] = x
        #print(len(dir))
        x.setNumberOfDocs(len(dir))
Example #52
    def test_sampleDocumentProcessing(self):
        sampleSubstanceFile = os.path.join(os.getcwd(), 'samples', 'substance',
                                           '01_sample.json')
        substanceProcessor = SubstanceProcessor().initWithFile(
            sampleSubstanceFile)
        document = substanceProcessor.document()
        expectedDocument = Document().initWithFile(
            os.path.join(os.getcwd(), 'samples', 'expected outcome',
                         'substance_01'))

        print("DOCUMENT METADATA: " + str(document.metadata()))
        self.assertEqual(expectedDocument.content(), document.content())
        self.assertEqual(expectedDocument.metadata(), document.metadata())
Example #53
 def _build_dict(self, dir_name):
     """
     Helper function that builds up the inverse index
     """
     index = dict()
     # Builds dictionary as word -> Document object
     for file_name in os.listdir(dir_name):
         doc = Document(dir_name + '/' + file_name)
         for word in doc.get_words():
             if word not in index:
                 index[word] = list()
             index[word].append(doc)
     return index
Example #54
def assemble_order(rein, document):
    """
    Take one document and build the entire order based on it. The idea here is that one Job ID should
    allow us to query each available server for each document type that is associated with it, then
    filter out bogus shit by focusing on who's signed correct stuff. This kind of command can also
    look for attempted changes in foundational info like participants public keys and redeem scripts.
    If this works well, we can reduce how much data is required at each stage. Finally, we should
    be able to serialize a job from end to end so it can be easily reviewed by a mediator.
    """
    parsed = parse_document(document.contents)
    if 'Job ID' not in parsed:
        return 0
    job_id = parsed['Job ID']
    urls = Bucket.get_urls(rein)
    documents = []
    if job_id:
        for url in urls:
            # queries remote server for all docs associated with a job_id
            res = Document.get_documents_by_job_id(rein, url, job_id)
            if res:
                documents += res
        order_id = Order.get_order_id(rein, job_id)
        if not order_id:
            o = Order(job_id, testnet=rein.testnet)
            rein.session.add(o)
            rein.session.commit()

    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_id)
            rein.session.add(d)
        else:
            new_document = Document(rein,
                                    doc_type,
                                    document,
                                    order_id,
                                    'remote',
                                    source_key=None,
                                    sig_verified=True,
                                    testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()

    return len(documents)
Example #55
    def set_clinical_notes(self, bundle, prefix=None):
        """Generates and appends a ClinicalNote entry to the transaction"""
        if GENERATION_MAP[
                "ClinicalNotes"] and self.pid in ClinicalNote.clinicalNotes:
            for d in ClinicalNote.clinicalNotes[self.pid]:
                if d.mime_type == 'text/plain':
                    data = fetch_document(self.pid, d.file_name)
                    # d.content = data['base64_content']
                    # b = d
                    # id = uid("Binary", "%s-note" % d.id, prefix)
                    # d.binary_id = id

                    binary_id = uid(None, "%s-note" % d.id, prefix)

                    note = Binary({
                        "mime_type": d.mime_type,
                        "content": data['base64_content'],
                        "id": binary_id
                    })

                    bundle["entry"].append(note)

                    # if GENERATION_MAP["Documents"]:
                    docRef = Document({
                        'ID':
                        uid(None, "%s-note-ref" % d.id, prefix),
                        'PID':
                        self.pid,
                        'DATE':
                        datetime.now().strftime("%Y-%m-%dT%H:%M:%S+" +
                                                "05:00"),  #.isoformat(),
                        'TITLE':
                        "Note",
                        'MIME_TYPE':
                        d.mime_type,
                        'FILE_NAME':
                        d.file_name,
                        'TYPE':
                        "Note",
                        'mime_type':
                        d.mime_type
                    })
                    bundle["entry"].append(
                        docRef.toJSON(data, binary_id, prefix))
            #         id = uid("DocumentReference", "%s-note" % d.id, prefix)
            #         d.system = "http://loinc.org"
            #         d.code = '34109-9'
            #         d.display = 'Note'
            #         template = template_env.get_template('document.xml')
            #         print >>pfile, template.render(dict(globals(), **locals()))
        return bundle
Example #56
def generate_single_image(fn_args):
    """
    Generate and save a single image

    Parameters
    ----------
    fn_args : dict
        Expected keys: 'iter' (zero-based image index) and 'args' (the parsed
        command-line arguments).

    Using arguments passed in the command line, generate a single image and
    save it to the given output directory.
    """
    dprint("Generating image #{}".format(fn_args['iter'] + 1))

    try:
        document = Document(fn_args['args'].stain_level,
                            fn_args['args'].text_noise_level,
                            output_loc=fn_args['args'].output_dir)

        document.create(bypass=fn_args['args'].bypass_divadid)
        document.save()
        document.save_ground_truth()

    except cv2.error as exception:
        dprint(document.random_seed)
        dprint(type(exception))
        dprint(exception.args)

        with open("errors.txt", "a+") as errors:
            errors.write("{}\n".format(document.random_seed))
Example #57
    def __init__(self):

        for dir_ in glob(self.master_dir + "/*"):
            print "\nProcessing", dir_
            for essay in glob(dir_ + "/*"): # essays nested in subdirs
                if essay not in self.essay_vectors.keys():
                    print "\nDoubleChecking", essay 
                    doc = Document(essay, "Wil")
                    doc.document_to_text(essay, essay) # should probably truncate the first "essay" argument to just the filename
                    doc.preprocess_text()
                    doc.statistics()
                    errors = doc.proofread()
                    err_stats = {'grammar': 0,
                                 'suggestion': 0,
                                 'spelling': 0
                                 }
                    try:
                        for err in errors:
                            err_stats[err["type"]] += 1
                    except TypeError:
                        print "No errors!"
                    token_sentence_ratio = doc.stats['tokens'] / doc.stats['sentences']
                    self.essay_vectors[essay] = [
                                                    err_stats['grammar'], 
                                                    err_stats['suggestion'], 
                                                    err_stats['spelling'], 
                                                    token_sentence_ratio
                                                ]
                    print "Completed " + essay + ". Sleeping..."
                    sleep(10)
Example #58
class PubmedArticleSet(handler.ContentHandler):
    def __init__(self):
        handler.feature_external_ges = "false"
        self.docs = {}
        self.doc = None
        self.chars = ""
                      
    def startElement(self, name, attr):
        if name == 'PubmedArticle' or name == 'PubmedBookArticle':
            self.doc = Document()            
        self.chars = ""
          
    def endElement(self, name):
        if name == 'PubmedArticle':
            self.docs[self.doc.pmid] = self.doc
        if name == 'PMID' and self.doc.pmid == None:
            self.doc.pmid = self.text()
        if name == 'ArticleTitle':
            self.doc.title = self.text()
        if name == 'AbstractText':
            if self.doc.abstract == None:
                self.doc.abstract = self.text()
            else:
                self.doc.abstract += self.text()
        if name == 'DescriptorName':
            self.doc.addMeSH(self.text())        
    
    def characters(self, data):
        self.chars += data

    def text(self):
        return self.chars.strip().encode('ascii', 'ignore')
    
    ## Method to parse a PubmedArticleSet XML file.
    # @param location The location of the xml file to parse
    # return A PubmedArticleSet object 
    @classmethod
    def parse(self, location):
        parser = make_parser()
        parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", False)
        handler = PubmedArticleSet()
        parser.setContentHandler(handler)
        try:
            f = open(location, 'r')
            parser.parse(f)
            f.close()
        except Exception, e:
            raise RuntimeError, "Could not parse PubmedArticleSet XML file at %s" % location
        return handler
Example #59
def vector_test():
    doc = Document("Untitled-1")

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200, 1300),
        Vector2(1400, 1500),
    ]
    doc.addCut(cut)

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200 + 600 * 10, 1300 + 200),
        Vector2(1400 + 600 * 10, 1500 + 200),
    ]
    doc.addCut(cut)

    cut = Cut(50, 100, 50)
    cut.points = [
        Vector2(1200 + 200, 1300 + 200),
        Vector2(1400 + 200, 1500 + 200),
    ]
    doc.addCut(cut)

    return doc
Example #60
 def __init__(self, K: int, docs: Document, num_MH=2) -> None:
     self.K = K
     self._documents = docs.get_documents()
     self._V = docs.get_num_vocab()
     self._D = docs.get_num_docs()
     self._beta = 0.1
     self._Vbeta = self._V * self._beta
     self._alpha = 0.01
     self._sum_alpha = 0.1 * K
     self._nkv = np.zeros((self.K, self._V)).astype(np.int32)
     self._ndk = np.zeros((self._D, self.K)).astype(np.int32)
     self._nk = np.zeros(self.K).astype(np.int32)
     self._z = []
     self.num_MH = num_MH