def test_rep(self):
    """Check that the similarity of the 'rep' sample pair lies within [0, 1].

    Two documents are built from the ``sim_0.8`` sample files, compared with
    ``document.caculate_similarity`` and the score is printed. Only the range
    of the score is asserted, not an exact value.
    """
    print('test rep')
    original = document.Document(r"sim_0.8/orig.txt")
    variant = document.Document(r"sim_0.8/orig_0.8_rep.txt")
    similarity = document.caculate_similarity(original, variant)
    print(similarity)
    # Lower bound first, then upper bound.
    self.assertGreaterEqual(similarity, 0)
    self.assertLessEqual(similarity, 1)
def update_index(self):
    """Let the user pick a text file, add it to the collection and re-index.

    A file dialog asks for a ``.txt`` file. Its content is read as UTF-8 and
    printed. If no file with the same base name exists in the (hard-coded)
    collection directory, the content is copied there. Afterwards every file
    in the collection is wrapped in a ``document.Document`` and passed to
    ``inverted_index.construct_index``.

    Nothing happens when the dialog yields an empty file name.
    """
    import document
    import inverted_index
    from os import listdir
    from os.path import basename, join

    options = QtWidgets.QFileDialog.Options()
    options |= QtWidgets.QFileDialog.DontUseNativeDialog
    fileName, _ = QtWidgets.QFileDialog.getOpenFileName(
        QtWidgets.QFileDialog(),
        "QtWidgets.QFileDialog.getOpenFileName()",
        "",
        "Text Files (*.txt);;",
        options=options)
    if not fileName:
        # No file was chosen; open('') would raise FileNotFoundError.
        return

    with open(fileName, "r", encoding="utf-8") as fp:
        data = fp.read()
    print(data)

    fileName = basename(fileName)
    collection_path = "/media/djaballah/54523AA71752BD7A/SII/S3/RI/Tp/RiProject/collection"
    if fileName not in listdir(collection_path):
        # Write with the same encoding used for reading so that non-ASCII
        # text survives on platforms whose default encoding is not UTF-8.
        with open(join(collection_path, fileName), "w", encoding="utf-8") as fp:
            fp.write(data)

    # List the directory again so a freshly copied file is indexed as well.
    for document_name in listdir(collection_path):
        print(document_name)
        doc = document.Document(join(collection_path, document_name))
        inverted_index.construct_index(doc)
def load(self, uuid=None, raw_uuid=None):
    """Fetch one document and its tags from the database.

    :param uuid: UUID object identifying the document.
    :param raw_uuid: byte form of the UUID; when given, ``uuid`` is rebuilt
        from it and any ``uuid`` argument is ignored.
    :return: ``None`` when no ``document`` row matches, otherwise a
        ``document.Document`` created with ``in_database=True``.

    One of the two arguments has to be supplied; the lookup key is derived
    from ``uuid.bytes``.
    """
    cursor = self.conn.cursor()
    if raw_uuid is not None:
        uuid = uuidlib.UUID(bytes=raw_uuid)
    # The same bound parameter serves both queries below.
    params = {"uuid": sqlite3.Binary(uuid.bytes)}

    cursor.execute(
        "SELECT name, creation_date, document_date, extra FROM document WHERE uuid = :uuid",
        params)
    row = cursor.fetchone()
    if row is None:
        return None
    name, creation_text, document_text, extra = row

    cursor.execute("SELECT tag FROM tag WHERE uuid=:uuid", params)
    tags = {tag_row[0] for tag_row in cursor.fetchall()}

    # Both dates are parsed from "%Y-%m-%d" text into date objects.
    date_format = "%Y-%m-%d"
    return document.Document(
        uuid=uuid,
        name=name,
        creation_date=datetime.datetime.strptime(creation_text, date_format).date(),
        document_date=datetime.datetime.strptime(document_text, date_format).date(),
        extra=extra,
        tags=tags,
        in_database=True,
    )
def bufferSaveAs(self):
    """Save buffer to a new filename.

    Returns a true value when the operation was cancelled (either by the
    save suggestion or because no path was chosen) and ``False`` once the
    buffer has been saved under the new name.
    """
    if self.bufferHasChanged() and self.buffer.doc.filepath:
        cancel = self.bufferSuggestSave()
        if cancel:
            return cancel

    # Start the dialog in the directory of the current document, if any.
    if self.buffer and self.buffer.doc.filedir:
        filedir = self.buffer.doc.filedir
    else:
        filedir = ''

    result = editor.saveSingle(
        title='Save PySlices File',
        directory=filedir,
        wildcard='PySlices Files (*.pyslices)|*.pyslices')
    if result.path in ['', None]:
        return True

    # Make sure the chosen name carries the .pyslices extension.
    if not result.path.endswith(".pyslices"):
        result.path += ".pyslices"

    self.buffer.doc = document.Document(result.path)
    self.buffer.name = self.buffer.doc.filename
    self.buffer.modulename = self.buffer.doc.filebase
    self.simpleSave(confirmed=True)  # allow overwrite
    return False
def load_snapshot(args):
    """Revert the document code to the state described by the selected snapshot.

    The document is built from the code of ``args.file``, the snapshot at
    ``args.index`` (for ``args.edition`` / ``args.region``) is loaded into
    it, the result is written back with ``set_code`` and a confirmation
    line is printed.
    """
    html_doc = document.Document(get_code(args.file))
    snapshot = html_doc.load(args.index, date=args.edition, region=args.region)
    set_code(args.file, html_doc)

    # Item 1 is shown via repr(); item 0 is rendered with date/time directives.
    template = 'Loaded snapshot {0!r:} - {1:%B} {1.day:2}, {1:%Y %l:%M:%S.%f %p}'
    print(template.format(snapshot[1], snapshot[0]))
def create_train(train_file_name, classes_file_name):
    """Build the list of training documents.

    Vectors are read from ``train_file_name`` and classes from
    ``classes_file_name``; the i-th vector is paired with the i-th class in
    a ``doc.Document``. The documents are returned in vector order.
    """
    vectors = rd.read_vectors(train_file_name)
    classes = rd.read_classes(classes_file_name)
    return [
        doc.Document(vector, classes[position])
        for position, vector in enumerate(vectors)
    ]
def save_snapshot(args):
    """Save a snapshot of the current document state.

    Prints a notice when ``save`` reports nothing (``None``), otherwise a
    summary built from the four items of the returned info.
    """
    html_doc = document.Document(get_code(args.file))
    info = html_doc.save(args.message, date=args.edition, region=args.region)
    if info is None:
        print('Duplicate snapshot. No snapshot saved.')
        return

    # First part: items 2 and 3; second part: items 0 and 1.
    heading = 'Snapshot saved for {:s} {:%B %d, %Y}. '.format(
        info[2].capitalize(), info[3])
    details = '{0!r:} - {1:%B} {1.day:2}, {1:%Y %l:%M:%S.%f %p}'.format(
        info[0], info[1])
    print(heading + details)
def list_snapshots(args):
    """Print a list of all available snapshots along with their indexes.

    A heading with the region and edition is printed first, followed by one
    line per snapshot showing its position in the returned list.
    """
    html_doc = document.Document(get_code(args.file))
    edition, region, snapshots = html_doc.list(date=args.edition,
                                               region=args.region)
    print('Snapshots for {:s} {:%B %d, %Y}'.format(region.capitalize(), edition))

    for position, snapshot in enumerate(snapshots):
        # Item 1 is shown via repr(); item 0 with date/time directives.
        prefix = '({:2d}) {!r:} -'.format(position, snapshot[1])
        suffix = ' {0:%B} {0.day:2}, {0:%Y %l:%M:%S.%f %p}'.format(snapshot[0])
        print(prefix + suffix)
def __init__(self):
    """Initialize the editor.

    Creates an empty document and an empty drawing, an empty client
    registry, the list of available colors and the index of the current
    color (starting at 0).
    """
    # Initialize an empty document and an empty drawing
    self.document = document.Document()
    self.drawing = drawing.Drawing()

    # Color bookkeeping
    self._color_index = 0
    self._colors = [
        '#AAFF00',
        '#FFAA00',
        '#FF00AA',
        '#AA00FF',
        '#00AAFF',
    ]

    self._clients = {}
def max_freq(doc_name):
    """
    Return the frequency of the most frequent term in a collection document.

    :param doc_name: the document name, resolved relative to
        ``collection_path``
    :type doc_name: str
    :return: the value reported by the document's ``get_max_freq()``
    """
    assert isinstance(doc_name, str)
    return document.Document(join(collection_path, doc_name)).get_max_freq()
def repair(args):
    """Perform a repair operation specified by the given arguments.

    Repairs the document built from ``args.file``, prints a three-line
    summary (typos, style tags, background fix) and writes the document
    back with ``set_code``.
    """
    html_doc = document.Document(get_code(args.file))
    summary = html_doc.repair()

    report = (
        '{:d} typographical errors in ismailinsight.org corrected.'.format(
            summary['typos']),
        '{:d} style tags removed.'.format(summary['styles']),
        # A background count of 0 is reported as "not applied".
        'Background fix {:s}applied.'.format(
            'not ' if summary['background'] == 0 else ''),
    )
    print(*report, sep='\n')

    set_code(args.file, html_doc)
def get(self, *args, **kwargs):
    """Return a single object, using the index directly for ``pk`` lookups.

    When a ``pk`` keyword is given, the document is fetched by uid through a
    ``pylucene.Searcher`` opened on the index model's storage path and
    wrapped in ``document.Document``. Every other lookup is delegated to
    the parent ``get``.

    :raises ObjectDoesNotExist: when no document matches the given ``pk``.
    """
    # ``in`` replaces the Python 2-only dict.has_key().
    if "pk" not in kwargs:
        return super(QuerySet, self).get(*args, **kwargs)

    searcher = pylucene.Searcher(
        storage_path=self.index_model._meta.storage_path,
    )
    doc = searcher.get_document_by_uid(
        utils.get_uid(self.model, kwargs.get("pk")))
    if doc is None:
        # Call form of raise: valid on Python 2 and 3, unlike ``raise X, ""``.
        raise ObjectDoesNotExist("")
    return document.Document(doc)
def setUp(self):
    """Prepare the environment before executing each test.

    Loads ``files/test.html`` (next to this module) into a document, then
    applies the transform described in ``files/transform.yml`` while
    ``document.requests.get`` is patched with ``remocks.get``. The value
    returned by ``apply`` is kept in ``self._remaining``.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))

    html_path = os.path.join(current_dir, 'files/test.html')
    with open(html_path, 'r', encoding='UTF-8') as file:
        self._document = document.Document(file.read())

    transform_path = os.path.join(current_dir, 'files/transform.yml')
    with open(transform_path, 'r', encoding='UTF-8') as file:
        with mock.patch('document.requests.get', remocks.get):
            # safe_load: yaml.load() without an explicit Loader is unsafe
            # and is rejected by current PyYAML releases.
            self._remaining = self._document.apply(yaml.safe_load(file))
def apply(args):
    """Apply a transform to an HTML template.

    The transform is read as YAML from ``args.transform_file`` and applied
    to the document built from ``args.file``. Transforms that could not be
    applied are printed as YAML; the document is then written back with
    ``set_code``.
    """
    html_doc = document.Document(get_code(args.file))
    with open(args.transform_file, 'r', encoding='UTF-8') as tfr_file:
        # The transform file is user-supplied: safe_load refuses arbitrary
        # Python object construction, and yaml.load() without a Loader is
        # rejected by current PyYAML releases.
        tfr_json = yaml.safe_load(tfr_file)

    not_applied = html_doc.apply(tfr_json)
    if not not_applied:
        print('All transforms applied.')
    else:
        print('The following transforms could not be applied:')
        print(yaml.dump(not_applied))
    set_code(args.file, html_doc)
def getRankingScoreForDocument(query, documentID):
    """Score a document by how close its known queries are to ``query``.

    The score is the negated mean of ``sims.levenshtein`` between ``query``
    and every entry of the document's ``queries``. A document without
    queries gets the fixed score -9999.
    """
    known_queries = document.Document(documentID).queries
    if len(known_queries) == 0:
        return -9999

    total_distance = sum(
        sims.levenshtein(known, query) for known in known_queries)
    mean_distance = total_distance / float(len(known_queries))
    # Negate so that a smaller distance gives a larger score.
    return mean_distance * -1
def document_from_query(query):
    """Run a Bing web search for ``query`` and wrap the hits in a document.

    The request goes to the Azure Datamarket Bing endpoint using HTTP Basic
    authentication built from ``secrets.BING_API_KEY`` (used as both user
    and password). The JSON results are rendered as a heading followed by
    one link per hit, and returned as ``document.Document(html=...)``.
    """
    query_dict = {"$format": "json", "Query": "'{0}'".format(query)}
    url = "https://api.datamarket.azure.com/Bing/Search/Web?" + urlencode(
        query_dict)

    # b64encode works on bytes and returns bytes on Python 3: encode the
    # credentials first and decode the result before building the header.
    credentials = "{0}:{0}".format(secrets.BING_API_KEY)
    auth_string = b64encode(credentials.encode("utf-8")).decode("ascii")
    headers = {"Authorization": "Basic " + auth_string}

    _response, content = Http().request(url, "GET", headers=headers)
    results = json.loads(content)['d']['results']

    # NOTE(review): the query and the result fields are interpolated into
    # the markup without HTML escaping - confirm whether escaping is wanted.
    html = u"<h1>Web search for '{0}'</h1>".format(query) + u"<br/>".join([
        u"<a href='{0}'>{1} ({2})</a>".format(r['Url'], r['Title'],
                                               r['DisplayUrl'])
        for r in results
    ])
    return document.Document(html=html)
def bufferNew(self):
    """Create new buffer.

    Returns a true value when the save suggestion was cancelled, otherwise
    ``False`` after the shell has been cleared and the buffer reset to a
    fresh, unnamed document.
    """
    cancel = self.bufferSuggestSave()
    if cancel:
        return cancel

    # Reset the shell widget and window title.
    self.sliceshell.clear()
    self.SetTitle('PySlices')
    self.sliceshell.NeedsCheckForSave = False
    self.sliceshell.SetSavePoint()

    # Point the buffer at a new empty document.
    self.buffer.doc = document.Document()
    self.buffer.name = 'This shell'
    self.buffer.modulename = self.buffer.doc.filebase
    #self.bufferCreate()
    return False
def __init__(self, title, body, width=400, **params):
    """Build a dialog showing ``body`` with a single "Okay" button.

    :param title: text for the dialog's title label.
    :param body: text shown in a label of the given ``width``.
    :param width: label width; also used to position the button.
    :param params: accepted but not forwarded to ``Dialog.__init__``.
    """
    heading = basic.Label(title)
    message = basic.WidthLabel(value=body, width=width)

    # The button closes the dialog when clicked.
    okay = button.Button("Okay", style={'width': 80, 'height': 40})
    okay.connect(CLICK, self.close)

    # Layout: message, two breaks, a spacer, then the button.
    panel = document.Document()
    panel.add(message)
    panel.br(10)
    panel.br(4)
    panel.space((width // 2 - 40, 4))
    panel.add(okay)

    Dialog.__init__(self, heading, panel)
def run():
    """Command-line entry point: compare two files and write their similarity.

    Reads three positional arguments from ``sys.argv``: the two input file
    paths and the output path. The similarity returned by
    ``document.caculate_similarity`` is formatted with two decimals, written
    to the output file and printed, followed by ``OK``.

    Every failure ends the run early without raising. Missing arguments and
    output-file errors print a message on stdout; failures while loading the
    documents or computing the similarity are reported on stderr.
    """
    jieba.setLogLevel(logging.INFO)

    # get arguments from system terminal
    try:
        file_path_1 = sys.argv[1]
        file_path_2 = sys.argv[2]
        output_path = sys.argv[3]
    except IndexError:
        print("缺少参数!")
        return

    # build Document objects
    # ``except Exception`` (not a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating.
    try:
        doc_1 = document.Document(file_path_1)
        doc_2 = document.Document(file_path_2)
    except Exception as error:
        sys.stderr.write("failed to load input documents: %s\n" % error)
        return

    # calculate similarity
    try:
        cosine_similiarity = document.caculate_similarity(doc_1, doc_2)
        formatted = "%.2f" % cosine_similiarity
    except Exception as error:
        sys.stderr.write("failed to compute similarity: %s\n" % error)
        return

    # output; the context manager closes the file even if the write fails
    try:
        with open(output_path, "w") as output_file:
            output_file.write(formatted)
    except OSError:
        print("%s打开失败 " % output_path)
        return

    print(formatted)
    print("OK")
def open(self, filename):
    """Open file into buffer.

    Binds the buffer to a ``document.Document`` for ``filename``, derives
    the buffer and module names from it, puts the document's directory at
    the front of ``self.syspath`` when it is not listed yet, marks the
    buffer confirmed when the file exists, and loads the text into the
    attached editor (if any).
    """
    self.doc = document.Document(filename)

    name = self.doc.filename
    if not name:
        name = 'Untitled:' + str(self.id)
    self.name = name
    self.modulename = self.doc.filebase

    # XXX This should really make sure filedir is first item in syspath.
    # XXX Or maybe this should be moved to the update namespace method.
    if self.doc.filedir:
        if self.doc.filedir not in self.syspath:
            # To create the proper context for updateNamespace.
            self.syspath.insert(0, self.doc.filedir)

    if self.doc.filepath:
        if os.path.exists(self.doc.filepath):
            self.confirmed = True

    if self.editor:
        self.editor._setBuffer(buffer=self, text=self.doc.read())
def test_website_typo(self):
    """Confirm that the code corrects the typographical error in ismailinsight.org."""
    markup = (
        ' <html> <body> '
        '<a href="https://www.ismailinsight.org">FIX THE TYPO</a>'
        ' </body> </html> '
    )
    repaired = document.Document(markup)
    repaired.repair()
    # The link target is expected to carry the corrected domain spelling.
    self.assertEqual(
        "https://www.ismailiinsight.org",
        repaired._data.a['href'],
        'The typographical error in ismailinsight.org should be fixed.')
def iterator(self):
    """Yield one result per row of the underlying query.

    Each raw row is wrapped in ``document.Document`` together with the raw
    query. What is yielded depends on the queryset settings:

    * no ``_fields``: the document itself;
    * ``_fields`` and not ``flat``: a dict mapping field name to value;
    * ``_fields``, ``flat`` and ``_kind``: ``row.filter`` of the first field;
    * ``_fields``, ``flat`` and no ``_kind``: a tuple of the field values.
    """
    for row in self.query.results_iter():
        row = document.Document(row, query=self.get_raw_query())
        if not self._fields:
            yield row
        elif not self.flat:
            yield {name: getattr(row, name) for name in self._fields}
        elif self._kind:
            # A list is built (instead of indexing the result of map())
            # because map() returns a non-subscriptable iterator on
            # Python 3. Every field is still filtered, as before.
            yield [
                row.filter(name, kind=self._kind) for name in self._fields
            ][0]
        else:
            yield tuple(getattr(row, name) for name in self._fields)
def study_from(self, start_point):
    """Starts the main study loop from a starting wikipedia page.

    Resets ``docs``, ``register`` and ``how_many``, studies ``start_point``
    and then keeps studying pages taken from the front of ``self.pages``
    until that list is empty. ``analyze`` is called at the end.
    """
    self.docs = []
    self.register = document.Document('')
    self.how_many = 1
    self.study(start_point)

    # IDEA Make this loop run on parallel
    # downloading and extracting information
    # A while loop (not a for loop) because the list is consumed from the
    # front and re-checked after every study() call.
    while len(self.pages) > 0:
        self.study(self.pages.pop(0))

    # generating final files
    self.analyze()
def restoreSession(key):
    """Restore a session specified by key, previously saved by the session manager.

    Restores, in this order: the current named session, the saved
    documents (opening at least one), and the saved main windows (showing
    a single fresh window when none were saved).
    """
    settings = sessionSettings(key)

    ## restore current named session name
    session_name = settings.value('session_name', "", type(""))
    if session_name:
        import sessions
        sessions.setCurrentSession(session_name)

    ## restore documents
    numdocuments = settings.value('numdocuments', 0, int)
    doc = None
    for doc_index in range(numdocuments):
        settings.beginGroup("document{0}".format(doc_index))
        url = settings.value("url", QUrl(), QUrl)
        if not url.isEmpty():
            try:
                doc = app.openUrl(url)
            except IOError:
                # An unreadable document is skipped.
                pass
        else:
            import document
            doc = document.Document()
        settings.endGroup()

    # open at least one
    if doc is None:
        doc = app.openUrl(QUrl())

    ## restore windows
    numwindows = settings.value('numwindows', 0, int)
    if not numwindows > 0:
        win = mainwindow.MainWindow()
        win.show()
        return

    for win_index in range(numwindows):
        settings.beginGroup("mainwindow{0}".format(win_index))
        win = mainwindow.MainWindow()
        win.readSessionSettings(settings)
        win.show()
        active_url = settings.value("active_document", QUrl(), QUrl)
        # we don't use app.findDocument because it does not allow empty url
        active = next(
            (d for d in app.documents if active_url == d.url()), None)
        if active is not None:
            win.setCurrentDocument(active)
        else:
            # No document has the stored url: fall back to the first one.
            win.setCurrentDocument(app.documents[0])
        settings.endGroup()
def test_remove_styles(self):
    """Confirm that the code is stripped of all style tags."""
    markup = (
        ' <html> <head> '
        '<style>THIS IS VALID CSS</style> '
        '<style>THIS IS ALSO VALID CSS</style> '
        '<style>THIS IS NOT VALID CSS</style>'
        ' </head> </html> '
    )
    repaired = document.Document(markup)
    repaired.repair()
    remaining_styles = repaired._data.find_all('style')
    self.assertEqual(0, len(remaining_styles),
                     'All style tags should be removed.')
def newDocument(self):
    """ Creates a new, empty document.

    The document becomes the current one. Depending on the "new_document"
    setting it is then filled from the configured snippet template or with
    a ``\\version`` line for the preferred LilyPond version; in both cases
    the document is marked as unmodified afterwards.
    """
    new_doc = document.Document()
    self.setCurrentDocument(new_doc)

    settings = QSettings()
    mode = settings.value("new_document", "empty", str)
    template = settings.value("new_document_template", "", str)

    if mode == "template" and template:
        from snippet import snippets, insert
        if snippets.text(template):
            insert.insert(template, self.currentView())
            # Toggling undo/redo off and on again stands in for
            # d.clearUndoRedoStacks(), which is only in Qt >= 4.7.
            new_doc.setUndoRedoEnabled(False)
            new_doc.setUndoRedoEnabled(True)
            new_doc.setModified(False)
    elif mode == "version":
        import lilypondinfo
        version_line = '\\version "{0}"\n\n'.format(
            lilypondinfo.preferred().versionString())
        new_doc.setPlainText(version_line)
        new_doc.setModified(False)
def main(argv):
    """Import documents from a JSON file into the database.

    ``argv`` is parsed with docopt against the module docstring. Each JSON
    entry is turned into a ``document.Document`` and saved; entries whose
    save raises ``database.ExistsError`` are reported and skipped.
    """
    args = docopt(__doc__, argv = argv)
    filename = args['<file>']
    db, _ = get_dbfs()

    with open(filename) as handle:
        records = json.load(handle)

    for record in records:
        doc = document.Document(
            uuid=uuidlib.UUID(record['uuid']),
            name=record['name'],
            creation_date=dateutil.parser.parse(
                record['creation_date']).date(),
            document_date=dateutil.parser.parse(
                record['document_date']).date(),
            tags=record['tags'],
            extra=record['extra'])
        try:
            db.save(doc)
        except database.ExistsError:
            print(str(doc.uuid) + " already exists")
def review(args):
    """Perform a review operation specified by the given arguments.

    Reviews the document built from ``args.file``, prints one summary line
    per counter (links, anchors, emails) and writes the document back with
    ``set_code``.
    """
    html_doc = document.Document(get_code(args.file))
    summary = html_doc.review()

    # Lines are collected in display order and printed in one call.
    links = summary['links']
    lines = [
        '{:d} blank links removed.'.format(links['removed']),
        '{:d} misdirected links set to open in new window.'.format(
            links['retargetted']),
        '{:d} double-tracked links decoded.'.format(links['decoded']),
        '{:d} broken links marked.'.format(links['broken']),
        '{:d} unchecked links marked.'.format(links['unchecked']),
    ]
    lines.append('{:d} links referencing missing anchors marked.'.format(
        summary['anchors']['marked']))
    emails = summary['emails']
    lines.append('{:d} emails cleaned.'.format(emails['cleaned']))
    lines.append('{:d} invalid emails marked.'.format(emails['invalid']))
    lines.append('{:d} unchecked emails marked.'.format(emails['unchecked']))
    print(*lines, sep='\n')

    set_code(args.file, html_doc)
def test_do_nothing(self):
    """Confirm whether the code is not modified unnecessarily."""
    markup = (
        ' <html> <head> </head> <body> '
        '<div style="background-color: #595959;"> '
        '<a href="https://www.ismailiinsight.com">NOTHING SHOULD BE CHANGED</a>'
        ' </div> </body> </html> '
    )
    repaired = document.Document(markup)
    repaired.repair()
    # Reference: the same markup parsed directly, without any repair.
    untouched = bs4.BeautifulSoup(markup, 'html5lib')
    self.assertEqual(
        untouched, repaired._data,
        'The code should not be changed if it is already correct.')
def main():
    """Main process of the PyPt program.

    Reads the arguments and the configuration, writes the configuration
    back, loads the input file named by ``config.Config.inName`` into a
    document and runs the (resizable) window's main loop.
    """
    # Initialize configurations
    settings = config.Config()
    settings.readArgu()
    settings.readConf()
    settings.writeConf()
    theme = config.Config.theme

    doc = document.Document()
    doc.readFile(config.Config.inName)

    # Create window
    main_window = window.Window(doc, theme)
    # Allow resize
    main_window.master.resizable(width=True, height=True)

    # Begin main events loop
    log.info('Begin main loop')
    main_window.mainloop()