def test_word_tokenizing(self):
    text = "This is a test sentence."
    with open("../process/tmp_test_file.txt", "w") as test_file:
        test_file.write(text)
    d = Document("tmp_test_file.txt", "testuser")
    d.preprocess_text()
    self.assertEqual(d.preprocessed['tokens'], 6,
                     "word tokenizing failed, incorrect number of tokens")
def Probability(self, doc, dclass = ""): """Calculates the probability for a class dclass given a document doc""" if dclass: sum_dclass = self.sum_words_in_class(dclass) prob = 0 d = Document(self.__vocabulary) d.read_document(doc) for j in self.__document_classes: sum_j = self.sum_words_in_class(j) prod = 1 for i in d.Words(): wf_dclass = 1 + self.__document_classes[dclass].WordFreq(i) wf = 1 + self.__document_classes[j].WordFreq(i) r = wf * sum_dclass / (wf_dclass * sum_j) prod *= r prob += prod * self.__document_classes[j].NumberOfDocuments() / self.__document_classes[dclass].NumberOfDocuments() if prob != 0: return 1 / prob else: return -1 else: prob_list = [] for dclass in self.__document_classes: prob = self.Probability(doc, dclass) prob_list.append([dclass,prob]) prob_list.sort(key = lambda x: x[1], reverse = True) return prob_list
class TestDocument(unittest.TestCase):
    def setUp(self):
        self.d = Document()
        self.d.insert("a")

    def test_cursor(self):
        self.assertEqual(self.d.cursor.position, 1)
        self.d.save("tst")
        try:
            remove("tst")
        except OSError:
            pass
        self.d.cursor.back()
        self.d.delete()
        self.assertEqual(self.d.cursor.position, 0)

    def test_multiple_chars_and_escape(self):
        self.d.cursor.home()
        self.d.delete()
        string = ["h", "e", "l", "l", "o", "\n", "w", "o", "r", "l", "d", "!"]
        for i in string:
            self.d.insert(i)
        self.assertEqual(self.d.string, "hello\nworld!")

    def test_string_property(self):
        self.assertEqual(self.d.string, "a")
def test_array_delete(self):
    doc0 = Document()
    doc0.snapshot = []
    doc1 = self.doc1
    doc2 = self.doc2
    # can technically delete nothing from empty list. why not
    op1 = Op('ad', [], offset=0, val=0)
    doc0.apply_op(op1)
    self.assertEqual(doc0.snapshot, [])
    # remove one from list
    op2 = Op('ad', [], offset=1, val=1)
    doc2.apply_op(op2)
    self.assertEqual(doc2.get_value([1]), 'normal, ol string')
    # from nested lists
    op3 = Op('ad', [2], offset=1, val=1)
    doc2.apply_op(op3)
    self.assertEqual(doc2.get_value([2]), [['multi'], ['array']])
    # delete multiple elements
    op4 = Op('ad', [], offset=0, val=4)
    doc2.apply_op(op4)
    self.assertEqual(doc2.snapshot, [None, 42])
    # delete last in list
    op5 = Op('ad', [], offset=1, val=1)
    doc2.apply_op(op5)
    self.assertEqual(doc2.snapshot, [None])
    # in dicts
    op6 = Op('ad', ['fifth'], offset=2, val=2)
    doc1.apply_op(op6)
    self.assertEqual(doc1.get_value(['fifth']), [55, 66])
def test_call_pod_renderer_with_document_file_and_context_and_result_file_in_render_then_call_run(self, renderer):
    doc = Document(context="context")
    doc.document_file = "document"
    doc.result_file = "result"
    doc.render()
    renderer.assert_called_once_with("document", "context", "result")
    renderer.return_value.run.assert_called_once_with()
def categorize_document(unknown_document, k):
    nearest_neighbors = dict()
    for football_document in footballDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, football_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = football_document.category
        else:
            update_neighbors(nearest_neighbors, football_document.category, distance)
    print("\n")
    for python_document in pythonDocuments:
        distance = Document.calculate_tanimoto_distance(unknown_document, python_document)
        print(distance)
        if len(nearest_neighbors) < k:
            nearest_neighbors[distance] = python_document.category
        else:
            update_neighbors(nearest_neighbors, python_document.category, distance)

    football_documents_count = 0
    python_documents_count = 0
    for value in nearest_neighbors.values():
        if value == Category.Football:
            football_documents_count += 1
        elif value == Category.Python:
            python_documents_count += 1

    # Ties go to Football. The original assigned to an undefined name
    # `document`; classify the document that was passed in.
    if football_documents_count >= python_documents_count:
        unknown_document.category = Category.Football
    else:
        unknown_document.category = Category.Python
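# The source does not show calculate_tanimoto_distance. A common definition
# treats each document as a set of words and takes one minus the Tanimoto
# (Jaccard) coefficient; this sketch follows that standard definition and is
# NOT the source's implementation. get_words() is assumed from the other
# Document snippets in this collection.
def tanimoto_distance(doc_a, doc_b):
    a = set(doc_a.get_words())
    b = set(doc_b.get_words())
    if not a and not b:
        return 0.0  # two empty documents are treated as identical
    # Tanimoto similarity = |A ∩ B| / |A ∪ B|; distance is its complement.
    return 1.0 - len(a & b) / len(a | b)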
def __init__(self, json_str):
    Document.__init__(self)
    self.json_object = json.loads(json_str)
    if "document_width" in self.json_object:
        self.document_width = self.json_object["document_width"]
    for field in self.json_object["fields"]:
        self.add_field(Field(unicode(field["text"]),
                             field["x"], field["y"], field["length"]))
def run(self, index_file):
    """ Generate the features using Top N algorithm """
    with open(index_file) as f:
        lines = f.readlines()
    for line in lines:
        name = line[:-1]
        with open("../data/scoped/%s" % name, 'r') as d:
            document = Document(d.read())
        self.table.add_document(name, document.content_lower)
    new_data_set = self.table.top_n_words(10)
    for document_name, words in new_data_set.iteritems():
        with open("../data/scoped/%s" % document_name, 'r') as d:
            document = Document(d.read())
        path_name = "../data/features/%s" % document_name
        with open("%s" % path_name, 'w') as f:
            for word in words:
                for _ in xrange(document.count(word)):
                    f.write(word)
                    f.write("\n")
def generate_document_data(chapter_paths, word_count):
    """
    Generate visualization data for a set of chapters.

    Given input chapters we want to find both the unique words being used
    inside of each chapter and how frequent they are within the text as a
    whole.

    chapter_paths - A list of paths to chapters
    word_count    - The number of most frequent words to grab for each chapter

    Returns a list looking like this:
    [
        [
            {
                "word": "wart",
                "freq": .7,
                "uniqueness": .5,
                "pos": .1
            },
        ],
    ]

    This is a list of chapters, where each chapter is a list of word
    dictionaries and each word dictionary has the word itself, the frequency
    of the word in that chapter, the uniqueness of the word overall, and the
    first position the word is observed. All of the latter three values are
    scaled from 0-1 with respect to the chapter (the most frequent word
    receives a 1, for instance).
    """
    document = Document(chapter_paths)
    return [generate_chapter_data(word_list, word_count, document)
            for word_list in document.get_chapters()]
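# A hypothetical consumer of the structure documented above, showing the
# shape of the returned data; the chapter paths and word_count value are
# placeholders, not from the source.
chapters = generate_document_data(["ch01.txt", "ch02.txt"], word_count=50)
for chapter_index, words in enumerate(chapters):
    for entry in words[:3]:  # first few word dictionaries per chapter
        # each entry carries the word plus three 0-1 scaled scores
        print(chapter_index, entry["word"],
              entry["freq"], entry["uniqueness"], entry["pos"])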
def test_array_insert(self):
    doc0 = Document()
    doc0.snapshot = []
    doc1 = self.doc1
    doc2 = self.doc2
    # whole doc is just an empty array. alter it
    op1 = Op('ai', [], val='c', offset=0)
    doc0.apply_op(op1)
    self.assertEqual(doc0.snapshot, ['c'])
    # insert at start
    op2 = Op('ai', [], val='a', offset=0)
    doc0.apply_op(op2)
    self.assertEqual(doc0.snapshot, ['a', 'c'])
    # insert at end
    op3 = Op('ai', [], val='d', offset=2)
    doc0.apply_op(op3)
    self.assertEqual(doc0.snapshot, ['a', 'c', 'd'])
    # insert in middle
    op4 = Op('ai', [], val='b', offset=1)
    doc0.apply_op(op4)
    self.assertEqual(doc0.snapshot, ['a', 'b', 'c', 'd'])
    # insert into some array deep in doc
    op5 = Op('ai', [3, 1], val='a', offset=1)
    doc2.apply_op(op5)
    self.assertEqual(doc2.get_value([3, 1]), ['dimen', 'a'])
    # again
    op6 = Op('ai', ['fifth'], val='a', offset=1)
    doc1.apply_op(op6)
    result6 = [55, 'a', 66, {'sixth': 'deep string'}, 'rw']
    self.assertEqual(doc1.get_value(['fifth']), result6)
def test_textWithWeirdFormatting(self):
    sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx',
                                  '11_weird_formatting.docx')
    docxProcessor = DocxProcessor(sampleDocxFile)
    document = docxProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_11'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_textWithLineBlocks(self):
    sampleDocxFile = os.path.join(os.getcwd(), 'samples', 'docx',
                                  '08_line_blocks.docx')
    docxProcessor = DocxProcessor(sampleDocxFile)
    document = docxProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_08'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_parseSimplePdf(self):
    samplePdfFile = os.path.join(os.getcwd(), 'samples', 'pdf',
                                 '01_simple_text.pdf')
    pdfProcessor = PdfProcessor(samplePdfFile)
    document = pdfProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome', 'docx',
                     'test_01'))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def testSelectsWhenMouseClicked(self):
    document = Document()
    document.new_shape()
    document.current_shape.append_point((0, 0))
    tool = SelectTool(document)
    self.assertIsNone(document.selected_point_index)
    _perform_click(tool, 0, 0)
    self.assertEqual(document.selected_point_index, 0)
def main():
    tokenizer = Tokenizer()
    doc = Document(1)
    # doc.add_text("hello i am paradox")
    doc.load_from_file("documents/test.txt")
    doc.extract_terms(tokenizer)
    doc.generate_frequency_map()
    doc.display()
def test_sampleDocumentProcessing(self):
    sampleSubstanceFile = os.path.join(os.getcwd(), 'samples', 'substance',
                                       '01_sample.json')
    substanceProcessor = SubstanceProcessor().initWithFile(sampleSubstanceFile)
    document = substanceProcessor.document()
    expectedDocument = Document().initWithFile(
        os.path.join(os.getcwd(), 'samples', 'expected outcome',
                     'substance_01'))
    print("DOCUMENT METADATA: " + str(document.metadata()))
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.metadata(), document.metadata())
def test_serialize_and_parse(self):
    self.document.parse_from_tokens(self.doc_tokens, random, self.vocabulary)
    test_doc = Document(20)
    test_doc.parse_from_string(self.document.serialize_to_string())
    self.assertEqual(self.document.num_words(), test_doc.num_words())
    self.assertEqual(str(self.document), str(test_doc))
def testDeselectsWhenClickingElsewhere(self):
    document = Document()
    document.new_shape()
    document.current_shape.append_point((0, 0))
    tool = SelectTool(document)
    _perform_click(tool, 0, 0)
    _perform_click(tool, 1000, 0)  # Click far away
    self.assertIsNone(document.selected_point_index)
def test_document_class_2():
    file = 'empty.txt'
    test = Document(file)
    # Tests Document object representing empty file
    assert_equals([], test.get_words())
    assert_equals(0, test.term_frequency('dogs'))
    assert_equals(file, test._file_name)
    assert_equals({}, test._term_frequency)
def DocumentIntersectionWithClasses(self, doc_name):
    res = [doc_name]
    for dc in self.__document_classes:
        d = Document(self.__vocabulary)
        d.read_document(doc_name, learn=False)
        o = self.__document_classes[dc] & d
        # fraction of the document's words that also occur in the class
        intersection_ratio = len(o) / len(d.Words())
        res += (dc, intersection_ratio)
    return res
def assemble_orders(rein, job_ids):
    """
    Take a list of job_ids and build their entire orders. The idea here is
    that one Job ID should allow us to query each available server for each
    document type that is associated with it, then filter out cruft by
    focusing on which documents are signed correctly.

    TODO: look for attempted changes in foundational info like participants'
    public keys and redeem scripts.
    """
    urls = Bucket.get_urls(rein)
    documents = []
    arg_job_ids = ','.join(job_ids)
    for url in urls:
        # query the remote server for all docs associated with the job_ids
        res = Document.get_documents_by_job_id(rein, url, arg_job_ids)
        if res:
            documents += res
    order_ids = {}
    order_id = None
    for job_id in job_ids:
        order_id = Order.get_order_id(rein, job_id)
        if not order_id:
            o = Order(job_id, testnet=rein.testnet)
            rein.session.add(o)
            rein.session.commit()
            order_id = Order.get_order_id(rein, job_id)
        order_ids[job_id] = order_id
    if not order_id:
        return 0
    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        job_id = Document.get_job_id(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_ids[job_id])
            rein.session.add(d)
        else:
            # use the order id for this document's job; the original passed
            # the stale order_id left over from the loop above
            new_document = Document(rein, doc_type, document,
                                    order_ids[job_id], 'remote',
                                    source_key=None, sig_verified=True,
                                    testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()
    return len(documents)
def __init__(self):
    super(type(self), self).__init__()
    self._state = TAB_STATE_NORMAL
    self._not_editable = False
    self._save_flags = 0
    self._ask_if_externally_modified = True

    # Create the scrolled window
    sw = gtk.ScrolledWindow()
    self._view_scrolled_window = sw
    sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)

    self._auto_save_timeout = 0

    # Not Implemented
    # TODO:
    # /* Manage auto save data */
    # lockdown = gedit_app_get_lockdown (gedit_app_get_default ());
    # tab->priv->auto_save = gedit_prefs_manager_get_auto_save () &&
    #                        !(lockdown & GEDIT_LOCKDOWN_SAVE_TO_DISK);
    # tab->priv->auto_save = (tab->priv->auto_save != FALSE);
    # tab->priv->auto_save_interval = gedit_prefs_manager_get_auto_save_interval ();
    # if (tab->priv->auto_save_interval <= 0)
    #     tab->priv->auto_save_interval = GPM_DEFAULT_AUTO_SAVE_INTERVAL;

    doc = Document()
    doc.set_data(TAB_KEY, self)
    self._document = doc
    # _gedit_document_set_mount_operation_factory (doc,
    #                                              tab_mount_operation_factory,
    #                                              tab);

    self._view = View(doc)
    self._view.show()
    self._view.set_data(TAB_KEY, self)

    self.pack_end(sw, True, True, 0)
    # gtk_box_pack_end (GTK_BOX (tab), sw, TRUE, TRUE, 0);
    sw.add(self._view)
    # gtk_container_add (GTK_CONTAINER (sw), tab->priv->view);
    sw.set_shadow_type(gtk.SHADOW_IN)
    # gtk_scrolled_window_set_shadow_type (GTK_SCROLLED_WINDOW (sw),
    #                                      GTK_SHADOW_IN);
    sw.show()
    self._scrolledwindow = sw
def main():
    test1 = Document('test_docs/test1.txt')
    test2 = Document('test_docs/test2.txt')
    test3 = Document('test_docs/test3.txt')
    test4 = Document('test_docs/test4.txt')
    test_search1 = SearchEngine('test_docs')
    test_document(test1, test2, test3, test4)
    test_single(test_search1)
    test_mulit(test_search1)
def step(context):
    archivedVersion = Document().initWithFile(context.archivedVersion)
    # archivedVersion.writeTo(os.path.join(context.tmpFolder, '_archived'))
    newVersion = Document().initWithFile(context.processedFile)
    # newVersion.writeTo(os.path.join(context.tmpFolder, '_new'))
    mergedVersion = archivedVersion.mergeWithDocument(newVersion)
    mergedVersionFile = os.path.join(context.tmpFolder, 'mergedVersion')
    mergedVersion.writeTo(mergedVersionFile)
    # mergedVersion.writeTo(os.path.join(context.tmpFolder, '_merged'))
    context.processedFile = mergedVersionFile
def handle_invalid_url(self, url, message):
    document = Document(url)
    document.valid = False
    if document not in self.invalid_documents:
        self.invalid_documents.append(document)
    print(colored(message, 'red'))
    print('URL: %s' % url)
    print('\n')
def test_023(self):
    """ Document text setter """
    document = Document("tests/4page.pdf", "tests")
    document.text[0] = "goo"
    # TODO
    # self.assertEqual(document.text[0], "goo")
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".json")
def generate_prn(out, paths, title):
    doc = Document(title)
    dpi = doc.getResolution()
    for path in paths:
        cut = Cut(4, 100, 50)
        # Convert to doc's resolution.
        cut.points = [Vector2(p.x * dpi / DPI, p.y * dpi / DPI) for p in path]
        doc.addCut(cut)
    epilog.generate_prn(out, doc)
def learn(self, directory, dclass_name):
    """
    directory is a path, where the files of the class with the name
    dclass_name can be found
    """
    x = DocumentClass(self.__vocabulary)
    files = os.listdir(directory)  # renamed from `dir` to avoid shadowing the builtin
    for file in files:
        d = Document(self.__vocabulary)
        print(directory + "/" + file)
        d.read_document(directory + "/" + file, learn=True)
        x = x + d
    self.__document_classes[dclass_name] = x
    x.SetNumberOfDocs(len(files))
def update_orders(self, rein, Document):
    from market import assemble_orders
    documents = Document.get_user_documents(rein)
    job_ids = []
    for document in documents:
        job_id = Document.get_job_id(document.contents)
        if job_id not in job_ids:
            if document.source_url == 'local' and document.doc_type != 'enrollment':
                job_ids.append(job_id)
    assemble_orders(rein, job_ids)
def test_copy(self):
    """
    Test that when copying a document, the text, dimensions and attributes
    are identical.
    """
    document = Document('this is a pipe', {'pipe': 1},
                        attributes={'timestamp': time.time()})
    copy = document.copy()
    self.assertEqual(document.text, copy.text)
    self.assertEqual(document.dimensions, copy.dimensions)
    self.assertEqual(document.attributes, copy.attributes)
def test_export_attributes(self):
    """
    Test that exporting and importing documents include their attributes.
    """
    text = 'this is not a pipe'
    d = Document(text, text.split(), attributes={'timestamp': 10})
    e = d.to_array()
    self.assertEqual(d.attributes, Document.from_array(e).attributes)
    self.assertEqual(d.attributes['timestamp'],
                     Document.from_array(e).attributes['timestamp'])
def _createDocument(self):
    document = Document()
    document.setPreferences(self._preferences)
    document.aboutToClose.connect(self._onDocumentAboutToClose)
    subWindow = self._documentArea.addSubWindow(document)
    subWindow.setWindowIcon(QIcon())
    subWindow.showMaximized()
    return document
def BuildIndex(doclist):
    index = Index()
    for docId, doc in enumerate(doclist):
        # The original created a throwaway Document() and immediately
        # rebound the name to `doc`; use the enumerated document directly.
        # print doc.GetTermVec()
        for word in doc.GetTermVec():
            index.AddOne(word, docId)  # doc.GetId()
    return index
def doc_get(self, docname):
    # Retrieve a document from cache, creating from CPS if necessary.
    if docname in self.cache:
        return self.cache[docname]
    doc = Document({})
    if docname in self:
        json_data = json.loads(self[docname])
        doc.load(json_data)
    self.doc_set(docname, doc)
    self.doc_save(docname)
    return doc
def __init__(self, canvas_size, layers):
    default_brush_style = BrushInfo(brushdata)
    Document.__init__(self, default_brush_style)
    self.canvas_size = canvas_size
    self.layer_count = layers
    # Round the canvas up to the 64-pixel tile grid (note: an
    # already-aligned dimension still gains a full extra tile).
    tile_size = (canvas_size[0] + 64 - (canvas_size[0] % 64),
                 canvas_size[1] + 64 - (canvas_size[1] % 64))
    self.set_frame(0, 0, *tile_size)
    for layer in range(layers):
        self.add_layer(layer)
def dropEvent(self, e):
    mime = e.mimeData()
    if mime.hasUrls():
        urls = mime.urls()
        for url in urls:
            d = Document()
            d.load_file(url.toLocalFile())
            sub_widget = QtGui.QMdiSubWindow()
            sub_widget.setWidget(d)
            self.mdiArea.addSubWindow(sub_widget)
        e.accept()
def _set_vertical(r1, r2, olabels, length, posmethod="beyondten",
                  docmethod="insert", vertrel="nonrel", blocksize=3,
                  independentplacement=True):
    if independentplacement:
        pos1 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
        pos2 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
    else:
        pos1 = pos2 = VASyntheticComparisonExperiment._vertpos(r1, r2, posmethod, blocksize)
    if docmethod == "assign":
        r1 = [d.set_type(pos1 <= i < (pos1 + blocksize)) for i, d in enumerate(r1)]
        r2 = [d.set_type(pos2 <= i < (pos2 + blocksize)) for i, d in enumerate(r2)]
    elif docmethod == "insert":
        # highest existing document id; the original took max() over the
        # Document objects themselves, which cannot be added to an int
        maxid = max(d.get_id() for d in r1 + r2)
        r1 = [d.set_type(False) for d in r1]
        r2 = [d.set_type(False) for d in r2]
        for i in range(blocksize):
            r1.insert(pos1, Document(maxid + i + 1, True))
            r2.insert(pos2, Document(maxid + i + 1, True))
    labels = olabels[:]
    for doc in set(r1 + r2):
        if not doc.get_type():  # original lacked the call parentheses,
            continue            # making this test always truthy
        vdoc = doc.get_id()
        if vdoc >= len(labels):
            labels += [0] * (vdoc - len(labels) + 1)
        if vertrel == "nonrel":
            labels[vdoc] = 0
        elif vertrel == "rel":
            labels[vdoc] = 1
        elif vertrel == "ratio":
            ratio = float(sum(olabels)) / length
            labels[vdoc] = numpy.random.binomial(1, ratio)
    return r1, r2, labels
def test_064(self):
    """ config stem is valid """
    document = Document("tests/4page.pdf", "tests", config=['stem=internal'])
    document = Document("tests/4page.pdf", "tests", config=['stem=porter'])
    document = Document("tests/4page.pdf", "tests", config=['stem=snowball'])
    document = Document("tests/4page.pdf", "tests", config=['stem=lancaster'])
    document = Document("tests/4page.pdf", "tests", config=['stem=lemma'])
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".json")
def main():
    # inputs = ['ip1.txt', 'ip2.txt']
    # inputs = ['ip3.txt', 'ip4.txt']
    # inputs = ['sachin1.txt']
    # inputs = ['mal1.txt']
    inputs = ['ip5.txt', 'ip6.txt', 'ip7.txt']
    no_of_clusters = int(sys.argv[1])
    doc = Document(inputs, no_of_clusters)
    count = 0
    print "Number of Sentences :"
    print len(doc.sentences)
    # print doc.sent_no_swords
    # print len(doc.sent_no_swords)
    '''
    print "Initial cluster sentences:"
    for i in range(len(doc.clusters)):
        print doc.clusters[i][0],
    '''
    print "Selecting sentence from each cluster..."
    doc.cluster_vector()
    doc.find_clust_similar_sent()
    # print ""
    # print "Cluster sentences:\n"
    # print doc.clust_sentences
    # print "Assigning weights to cluster sentences:"
    # doc.select_cluster_sentences()
    # doc.printclust_sentences()
    # doc.print_rogue_clust_sentences()
    print "Ordering...."
    for input_file in inputs:
        count = count + 1
        if count == 1:
            doc.print_sent_ordered()
    # ordering
    first = ordering.precedence_ordering(doc, doc.clust_sentences)
    # exchange the first sentence in the cluster with `first`
    tempv = doc.clust_sentences[0]
    doc.clust_sentences[0] = doc.clust_sentences[first]
    doc.clust_sentences[first] = tempv
    ordered_sentences = ordering.similarity_ordering(doc, doc.clust_sentences)
    # print doc.clust_sentences, ordered_sentences
    for i in ordered_sentences:
        print doc.sentences[i].lstrip().capitalize(), ". ",
def doc_from_legacy_dict(obj):
    """takes a loaded legacy dictionary, returns a loaded Document"""
    doc = Document()
    part = DNAHoneycombPart()  # TODO must generalize
    doc.addPart(part)
    part.setName(obj["name"])
    # self.addVirtualHelixAt(coord, vh, requestSpecificIdnum=num, noUndo=True)
    numBases = len(obj['vstrands'][0]['scaf'])
    part.setDimensions((30, 32, numBases))
    for helix in obj['vstrands']:
        row = helix['row']
        col = helix['col']
        scaf = helix['scaf']
        vh = VirtualHelix(numBases=len(scaf), idnum=helix['num'])
        part.addVirtualHelixAt((row, col), vh,
                               requestSpecificIdnum=helix['num'], noUndo=True)
    helixNo, numHelixes = -1, len(obj['vstrands'])
    for helix in obj['vstrands']:
        helixNo += 1
        # print "helix %i/%i (%i%%)" % (helixNo, numHelixes, helixNo*100/numHelixes)
        vh = part.getVirtualHelix(helix['num'])
        scaf = helix['scaf']
        stap = helix['stap']
        loops = helix['loop']
        skips = helix['skip']
        assert(len(scaf) == len(stap) and len(stap) == vh.numBases() and
               len(scaf) == len(loops) and len(loops) == len(skips))
        for i in range(len(scaf)):
            fiveVH, fiveIdx, threeVH, threeIdx = scaf[i]
            threeVH = part.getVirtualHelix(threeVH)
            # Installing an Xover works on the same strand as well (there is
            # nothing inherently different between an Xover and a same-strand
            # linkage in our current model)
            if threeVH == -1 or threeIdx == -1:
                continue
            vh.installXoverFrom3To5(StrandType.Scaffold, i, threeVH, threeIdx,
                                    undoable=False, speedy=True)
        for i in range(len(stap)):
            fiveVH, fiveIdx, threeVH, threeIdx = stap[i]
            threeVH = part.getVirtualHelix(threeVH)
            if threeVH == -1 or threeIdx == -1:
                continue
            vh.installXoverFrom3To5(StrandType.Staple, i, threeVH, threeIdx,
                                    undoable=False, speedy=True)
        for baseIdx, colorNumber in helix['stap_colors']:
            color = QColor((colorNumber >> 16) & 0xFF,
                           (colorNumber >> 8) & 0xFF,
                           colorNumber & 0xFF)
            vh.applyColorAt(color, StrandType.Staple, baseIdx, undoable=False)
        for i in range(len(stap)):
            combinedLoopSkipAmount = loops[i] + skips[i]
            if combinedLoopSkipAmount != 0:
                vh.installLoop(StrandType.Scaffold, i, combinedLoopSkipAmount,
                               undoable=False)
    return doc
def document(handler_lua=None, handler_lua_template=None):
    from document import Document
    from resource import Resource
    doc = Document()
    if handler_lua_template:
        import deje.handlers.lua as handlers
        handler_lua = getattr(handlers, handler_lua_template)()
    if handler_lua:
        handler = Resource('/handler.lua', handler_lua,
                           'The primary handler', 'text/lua')
        doc.add_resource(handler)
    return doc
def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, topK):
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:
        # no saved classifier: build one from the positive-example URLs
        docs = []
        f = open(posFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d and d.text:
                docs.append(d)
        f.close()
        '''
        docsTF = []
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            docsTF.append(wordsFreq)
        self.classifier = VSMClassifier(docsTF, th)
        '''
        docsTF = []
        vocabTFDic = {}
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            # docsTF.append(wordsFreq)
            for w in wordsFreq:
                if w in vocabTFDic:
                    vocabTFDic[w] += wordsFreq[w]
                else:
                    vocabTFDic[w] = wordsFreq[w]
        # keep only the topK most frequent terms as the vocabulary
        vocabSorted = getSorted(vocabTFDic.items(), 1)
        topVocabDic = dict(vocabSorted[:topK])
        # topVocabDic = vocabTFDic
        ndocsTF = []
        '''
        for d in docsTF:
            ndocTF = {}
            for k in topVocabDic:
                if k in d:
                    ndocTF[k] = d[k]
                else:
                    ndocTF[k] = 1 / math.e
            ndocsTF.append(ndocTF)
        '''
        self.classifier = VSMClassifier(topVocabDic, ndocsTF, th)
        classifierFile = open(vsmClassifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
def _build_index(dirname: str) -> dict:
    """Helper method to initialize inverted index"""
    inv_index = {}
    for fname in os.listdir(dirname):
        if fname.startswith('.'):
            continue
        doc = Document(f'{dirname}/{fname}')
        for word in doc.get_words():
            if word not in inv_index:
                inv_index[word] = []
            inv_index[word].append(doc)
    return inv_index
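# A hypothetical lookup against the inverted index the helper above returns;
# the directory name and query word are placeholders, and term_frequency() /
# _file_name are assumed from the other Document snippets in this collection.
index = _build_index("corpus")
for doc in index.get("pipeline", []):
    # each posting is a Document that contains the query word at least once
    print(doc._file_name, doc.term_frequency("pipeline"))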
class QueryDocument(Document):
    def __init__(self, field):
        Document.__init__(self)
        self.insidedoc = Document()
        self.field = field

    def addeq(self, value):
        self.add(self.field, value)

    def addgt(self, value):
        self.addnormaloperator("$gt", value)

    def addgte(self, value):
        self.addnormaloperator("$gte", value)

    def addlt(self, value):
        self.addnormaloperator("$lt", value)

    def addlte(self, value):
        self.addnormaloperator("$lte", value)

    def addne(self, value):
        self.addnormaloperator("$ne", value)

    def addin(self, value):
        self.addnormaloperator("$in", value)

    def addnin(self, value):
        self.addnormaloperator("$nin", value)

    def negate(self):
        self.insidedoc = {"$not": self.insidedoc.getdoc()}

    def addexists(self):
        self.addnormaloperator("$exists", True)

    def addnotexists(self):
        self.addnormaloperator("$exists", False)

    def addnormaloperator(self, operator, value):
        self.insidedoc.add(operator, value)

    def getdoc(self):
        try:
            if self.insidedoc:
                self.add(self.field, self.insidedoc)
            return self.doc
        except AttributeError:
            return self.insidedoc.getdoc()

    def __str__(self):
        return str(self.getdoc())
def document(handler_lua=None, handler_lua_template=None): from document import Document from resource import Resource doc = Document() if handler_lua_template: import deje.handlers.lua as handlers handler_lua = getattr(handlers, handler_lua_template)() if handler_lua: handler = Resource("/handler.lua", handler_lua, "The primary handler", "text/lua") doc.add_resource(handler) return doc
def learn(self, directory, classname):
    x = DocumentClass()
    files = os.listdir(directory)
    for file in files:
        d = Document()
        print(directory + "/" + file)
        d.read_document(directory + "/" + file)
        x = x + d
    self.__document_classes[classname] = x
    # print(len(files))
    x.setNumberOfDocs(len(files))
def _build_dict(self, dir_name):
    """ Helper function that builds up the inverse index """
    index = dict()
    # Builds dictionary as word -> list of Document objects
    for file_name in os.listdir(dir_name):
        doc = Document(dir_name + '/' + file_name)
        for word in doc.get_words():
            if word not in index:
                index[word] = list()
            index[word].append(doc)
    return index
def assemble_order(rein, document):
    """
    Take one document and build the entire order based on it. The idea here
    is that one Job ID should allow us to query each available server for
    each document type that is associated with it, then filter out bogus
    documents by focusing on which ones are signed correctly. This kind of
    command can also look for attempted changes in foundational info like
    participants' public keys and redeem scripts.

    If this works well, we can reduce how much data is required at each
    stage. Finally, we should be able to serialize a job from end to end so
    it can be easily reviewed by a mediator.
    """
    parsed = parse_document(document.contents)
    if 'Job ID' not in parsed:
        return 0
    job_id = parsed['Job ID']
    urls = Bucket.get_urls(rein)
    documents = []
    if job_id:
        for url in urls:
            # query the remote server for all docs associated with the job_id
            res = Document.get_documents_by_job_id(rein, url, job_id)
            if res:
                documents += res
    order_id = Order.get_order_id(rein, job_id)
    if not order_id:
        o = Order(job_id, testnet=rein.testnet)
        rein.session.add(o)
        rein.session.commit()
        # refresh the id for the newly created order (the original left
        # order_id unset here; the companion assemble_orders re-queries)
        order_id = Order.get_order_id(rein, job_id)
    for document in documents:
        doc_type = Document.get_document_type(document)
        if not doc_type:
            rein.log.info('doc_type not detected')
            continue
        doc_hash = Document.calc_hash(document)
        d = rein.session.query(Document).filter(
            Document.doc_hash == doc_hash).first()
        if d:
            d.set_order_id(order_id)
            rein.session.add(d)
        else:
            new_document = Document(rein, doc_type, document, order_id,
                                    'remote', source_key=None,
                                    sig_verified=True, testnet=rein.testnet)
            rein.session.add(new_document)
        rein.session.commit()
    return len(documents)
def set_clinical_notes(self, bundle, prefix=None):
    """Generates and appends a ClinicalNote entry to the transaction"""
    if GENERATION_MAP["ClinicalNotes"] and self.pid in ClinicalNote.clinicalNotes:
        for d in ClinicalNote.clinicalNotes[self.pid]:
            if d.mime_type == 'text/plain':
                data = fetch_document(self.pid, d.file_name)
                # d.content = data['base64_content']
                # b = d
                # id = uid("Binary", "%s-note" % d.id, prefix)
                # d.binary_id = id
                binary_id = uid(None, "%s-note" % d.id, prefix)
                note = Binary({
                    "mime_type": d.mime_type,
                    "content": data['base64_content'],
                    "id": binary_id
                })
                bundle["entry"].append(note)
                # if GENERATION_MAP["Documents"]:
                docRef = Document({
                    'ID': uid(None, "%s-note-ref" % d.id, prefix),
                    'PID': self.pid,
                    'DATE': datetime.now().strftime("%Y-%m-%dT%H:%M:%S+05:00"),  # .isoformat()
                    'TITLE': "Note",
                    'MIME_TYPE': d.mime_type,
                    'FILE_NAME': d.file_name,
                    'TYPE': "Note",
                    'mime_type': d.mime_type
                })
                bundle["entry"].append(docRef.toJSON(data, binary_id, prefix))
                # id = uid("DocumentReference", "%s-note" % d.id, prefix)
                # d.system = "http://loinc.org"
                # d.code = '34109-9'
                # d.display = 'Note'
                # template = template_env.get_template('document.xml')
                # print >>pfile, template.render(dict(globals(), **locals()))
    return bundle
def generate_single_image(fn_args):
    """
    Generate and save a single image.

    Parameters
    ----------
    fn_args : dict
        Holds the parsed command-line arguments (under 'args') and the
        iteration number (under 'iter') used to generate a single image
        and save it to the given output directory.
    """
    dprint("Generating image #{}".format(fn_args['iter'] + 1))
    try:
        document = Document(fn_args['args'].stain_level,
                            fn_args['args'].text_noise_level,
                            output_loc=fn_args['args'].output_dir)
        document.create(bypass=fn_args['args'].bypass_divadid)
        document.save()
        document.save_ground_truth()
    except cv2.error as exception:
        dprint(document.random_seed)
        dprint(type(exception))
        dprint(exception.args)
        with open("errors.txt", "a+") as errors:
            errors.write("{}\n".format(document.random_seed))
def __init__(self):
    for dir_ in glob(self.master_dir + "/*"):
        print "\nProcessing", dir_
        for essay in glob(dir_ + "/*"):  # essays nested in subdirs
            if essay not in self.essay_vectors:
                print "\nDoubleChecking", essay
                doc = Document(essay, "Wil")
                # should probably truncate the first "essay" argument
                # to just the filename
                doc.document_to_text(essay, essay)
                doc.preprocess_text()
                doc.statistics()
                errors = doc.proofread()
                err_stats = {'grammar': 0, 'suggestion': 0, 'spelling': 0}
                try:
                    for err in errors:
                        err_stats[err["type"]] += 1
                except TypeError:
                    print "No errors!"
                token_sentence_ratio = doc.stats['tokens'] / doc.stats['sentences']
                self.essay_vectors[essay] = [
                    err_stats['grammar'],
                    err_stats['suggestion'],
                    err_stats['spelling'],
                    token_sentence_ratio,
                ]
                print "Completed " + essay + ". Sleeping..."
                sleep(10)
class PubmedArticleSet(handler.ContentHandler):
    def __init__(self):
        handler.feature_external_ges = "false"
        self.docs = {}
        self.doc = None
        self.chars = ""

    def startElement(self, name, attr):
        if name == 'PubmedArticle' or name == 'PubmedBookArticle':
            self.doc = Document()
        self.chars = ""

    def endElement(self, name):
        if name == 'PubmedArticle':
            self.docs[self.doc.pmid] = self.doc
        if name == 'PMID' and self.doc.pmid is None:
            self.doc.pmid = self.text()
        if name == 'ArticleTitle':
            self.doc.title = self.text()
        if name == 'AbstractText':
            if self.doc.abstract is None:
                self.doc.abstract = self.text()
            else:
                self.doc.abstract += self.text()
        if name == 'DescriptorName':
            self.doc.addMeSH(self.text())

    def characters(self, data):
        self.chars += data

    def text(self):
        return self.chars.strip().encode('ascii', 'ignore')

    ## Method to parse a PubmedArticleSet XML file.
    # @param location The location of the xml file to parse
    # @return A PubmedArticleSet object
    @classmethod
    def parse(cls, location):
        parser = make_parser()
        parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", False)
        handler = PubmedArticleSet()
        parser.setContentHandler(handler)
        try:
            f = open(location, 'r')
            parser.parse(f)
            f.close()
        except Exception, e:
            raise RuntimeError, "Could not parse PubmedArticleSet XML file at %s" % location
        return handler
def vector_test():
    doc = Document("Untitled-1")

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200, 1300),
        Vector2(1400, 1500),
    ]
    doc.addCut(cut)

    cut = Cut(4, 100, 50)
    cut.points = [
        Vector2(1200 + 600 * 10, 1300 + 200),
        Vector2(1400 + 600 * 10, 1500 + 200),
    ]
    doc.addCut(cut)

    cut = Cut(50, 100, 50)
    cut.points = [
        Vector2(1200 + 200, 1300 + 200),
        Vector2(1400 + 200, 1500 + 200),
    ]
    doc.addCut(cut)

    return doc
def __init__(self, K: int, docs: Document, num_MH=2) -> None:
    self.K = K
    self._documents = docs.get_documents()
    self._V = docs.get_num_vocab()
    self._D = docs.get_num_docs()
    self._beta = 0.1
    self._Vbeta = self._V * self._beta
    self._alpha = 0.01
    # the original hard-coded 0.1 * K here, which disagrees with
    # self._alpha = 0.01; derive the sum from alpha instead
    self._sum_alpha = self._alpha * K
    self._nkv = np.zeros((self.K, self._V)).astype(np.int32)  # topic-word counts
    self._ndk = np.zeros((self._D, self.K)).astype(np.int32)  # document-topic counts
    self._nk = np.zeros(self.K).astype(np.int32)              # tokens per topic
    self._z = []                                              # per-token topic assignments
    self.num_MH = num_MH                                      # Metropolis-Hastings steps per token
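# The counters above follow standard collapsed-Gibbs LDA bookkeeping. A
# minimal initialization sketch under that assumption (not taken from the
# source): assign each token a random starting topic and increment the three
# counters accordingly. Assumes each entry of self._documents is a list of
# word ids, as suggested by get_documents()/get_num_vocab().
import numpy as np

def init_topics(self):
    rng = np.random.default_rng()
    for d, doc in enumerate(self._documents):
        z_d = []
        for w in doc:
            k = int(rng.integers(self.K))  # random starting topic for token w
            z_d.append(k)
            self._nkv[k, w] += 1           # topic-word count
            self._ndk[d, k] += 1           # document-topic count
            self._nk[k] += 1               # tokens assigned to topic k
        self._z.append(z_d)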