Пример #1
0
 def test_rep(self):
     """Check that the similarity of orig.txt and orig_0.8_rep.txt lies in [0, 1]."""
     print('test rep')
     original = document.Document(r"sim_0.8/orig.txt")
     variant = document.Document(r"sim_0.8/orig_0.8_rep.txt")
     similarity = document.caculate_similarity(original, variant)
     print(similarity)
     # Only the valid range is verified here, not a specific expected score.
     self.assertGreaterEqual(similarity, 0)
     self.assertLessEqual(similarity, 1)
Пример #2
0
    def update_index(self):
        """Let the user pick a text file, add it to the collection and re-index.

        A file dialog asks for a ``.txt`` file. Its content is copied into the
        collection directory (unless a file with the same name is already
        there), and every file of the collection is then passed to
        ``inverted_index.construct_index``. Nothing happens when the dialog is
        cancelled.
        """
        import document
        import inverted_index
        from os import listdir
        from os.path import basename, join

        options = QtWidgets.QFileDialog.Options()
        options |= QtWidgets.QFileDialog.DontUseNativeDialog
        fileName, _ = QtWidgets.QFileDialog.getOpenFileName(
            QtWidgets.QFileDialog(),
            "QtWidgets.QFileDialog.getOpenFileName()",
            "",
            "Text Files (*.txt);;",
            options=options)
        if not fileName:
            # The dialog returns an empty name when the user cancels; opening
            # '' would raise FileNotFoundError.
            return

        with open(fileName, "r", encoding="utf-8") as fp:
            data = fp.read()
        print(data)

        fileName = basename(fileName)
        collection_path = "/media/djaballah/54523AA71752BD7A/SII/S3/RI/Tp/RiProject/collection"

        if fileName not in listdir(collection_path):
            # Write with the same encoding the file was read with, so the copy
            # does not depend on the platform's default encoding.
            with open(join(collection_path, fileName), "w",
                      encoding="utf-8") as fp:
                fp.write(data)

        # List the directory again so a newly copied file is indexed as well.
        for document_name in listdir(collection_path):
            print(document_name)
            doc = document.Document(join(collection_path, document_name))
            inverted_index.construct_index(doc)
Пример #3
0
    def load(self, uuid=None, raw_uuid=None):
        """Fetch a document and its tags from the database.

        :param uuid: identifier of the document; its ``bytes`` attribute is
            used as the database key.
        :param raw_uuid: the identifier as raw bytes; when given it takes
            precedence over ``uuid``.
        :return: a ``document.Document`` flagged ``in_database=True``, or
            ``None`` when no row matches.
        :raises ValueError: when neither ``uuid`` nor ``raw_uuid`` is given.
        """
        if raw_uuid is not None:
            uuid = uuidlib.UUID(bytes=raw_uuid)
        if uuid is None:
            # Fail with a clear message instead of AttributeError on None.bytes.
            raise ValueError("either uuid or raw_uuid must be provided")
        sqlite_uuid = sqlite3.Binary(uuid.bytes)

        cursor = self.conn.cursor()
        stmt = "SELECT name, creation_date, document_date, extra FROM document WHERE uuid = :uuid"
        cursor.execute(stmt, {"uuid": sqlite_uuid})
        result = cursor.fetchone()
        if result is None:
            return None

        stmt = "SELECT tag FROM tag WHERE uuid=:uuid"
        cursor.execute(stmt, {"uuid": sqlite_uuid})
        tags = {row[0] for row in cursor.fetchall()}

        # Both dates are stored as "YYYY-MM-DD" text and converted to dates.
        date_format = "%Y-%m-%d"
        return document.Document(
            uuid=uuid,
            name=result[0],
            creation_date=datetime.datetime.strptime(result[1],
                                                     date_format).date(),
            document_date=datetime.datetime.strptime(result[2],
                                                     date_format).date(),
            extra=result[3],
            tags=tags,
            in_database=True,
        )
Пример #4
0
    def bufferSaveAs(self):
        """Ask for a new .pyslices filename and save the buffer under it.

        Returns a true value when the operation was cancelled (either by the
        save suggestion or by an empty file selection) and False once the
        buffer has been saved.
        """
        if self.bufferHasChanged() and self.buffer.doc.filepath:
            aborted = self.bufferSuggestSave()
            if aborted:
                return aborted

        # Start the dialog in the directory of the current file, if known.
        start_dir = ''
        if self.buffer and self.buffer.doc.filedir:
            start_dir = self.buffer.doc.filedir

        chosen = editor.saveSingle(
            title='Save PySlices File',
            directory=start_dir,
            wildcard='PySlices Files (*.pyslices)|*.pyslices')
        if chosen.path in ['', None]:
            return True

        # Make sure the file carries the .pyslices extension.
        if not chosen.path.endswith(".pyslices"):
            chosen.path += ".pyslices"

        self.buffer.doc = document.Document(chosen.path)
        self.buffer.name = self.buffer.doc.filename
        self.buffer.modulename = self.buffer.doc.filebase
        self.simpleSave(confirmed=True)  # allow overwrite
        return False
Пример #5
0
def load_snapshot(args):
    """Load the snapshot selected by ``args.index`` into the document and write it back.

    The document is built from the code of ``args.file``; after loading, the
    code is stored again with ``set_code`` and a one-line summary is printed.
    """
    html_doc = document.Document(get_code(args.file))
    snapshot = html_doc.load(args.index, date=args.edition, region=args.region)
    set_code(args.file, html_doc)

    # snapshot[0] is formatted with date/time directives, snapshot[1] via repr.
    stamp, label = snapshot[0], snapshot[1]
    summary = 'Loaded snapshot {0!r:} - {1:%B} {1.day:2}, {1:%Y %l:%M:%S.%f %p}'
    print(summary.format(label, stamp))
Пример #6
0
def create_train(train_file_name, classes_file_name):
    """Build one ``doc.Document`` per training vector, paired with its class.

    Vectors are read from ``train_file_name`` and classes from
    ``classes_file_name``; the i-th vector is combined with the i-th class.
    """
    vectors = rd.read_vectors(train_file_name)
    labels = rd.read_classes(classes_file_name)

    return [
        doc.Document(vector, labels[position])
        for position, vector in enumerate(vectors)
    ]
Пример #7
0
def save_snapshot(args):
    """Store a snapshot of the document in ``args.file`` and report the outcome."""
    html_doc = document.Document(get_code(args.file))
    info = html_doc.save(args.message, date=args.edition, region=args.region)

    # save() returning None is reported as a duplicate snapshot.
    if info is None:
        print('Duplicate snapshot. No snapshot saved.')
        return

    headline = 'Snapshot saved for {:s} {:%B %d, %Y}. '.format(
        info[2].capitalize(), info[3])
    details = '{0!r:} - {1:%B} {1.day:2}, {1:%Y %l:%M:%S.%f %p}'.format(
        info[0], info[1])
    print(headline + details)
Пример #8
0
def list_snapshots(args):
    """Print a header followed by one numbered line per available snapshot."""
    html_doc = document.Document(get_code(args.file))
    edition, region, snapshots = html_doc.list(date=args.edition,
                                               region=args.region)

    header = 'Snapshots for {:s} {:%B %d, %Y}'
    print(header.format(region.capitalize(), edition))

    # The printed number is the snapshot's position in the returned sequence.
    for position, entry in enumerate(snapshots):
        numbered = '({:2d}) {!r:} -'.format(position, entry[1])
        moment = ' {0:%B} {0.day:2}, {0:%Y %l:%M:%S.%f %p}'.format(entry[0])
        print(numbered + moment)
Пример #9
0
    def __init__(self):
        """Create an editor holding a new document, a new drawing and no clients."""
        # Start from a freshly constructed document and drawing.
        self.document = document.Document()
        self.drawing = drawing.Drawing()

        # Client registry plus the colour list and the current index into it.
        self._clients = dict()
        self._color_index = 0
        self._colors = [
            '#AAFF00',
            '#FFAA00',
            '#FF00AA',
            '#AA00FF',
            '#00AAFF',
        ]
Пример #10
0
def max_freq(doc_name):
    """
    Return the frequency of the most frequent term in a collection document.

    :param doc_name: name of the document file inside ``collection_path``
    :type doc_name: str
    :return: the value reported by the document's ``get_max_freq()``
    """
    assert isinstance(doc_name, str)
    doc_path = join(collection_path, doc_name)
    return document.Document(doc_path).get_max_freq()
Пример #11
0
def repair(args):
    """Repair the document in ``args.file``, print a summary and store the result."""
    html_doc = document.Document(get_code(args.file))
    summary = html_doc.repair()

    # One report line per counter in the summary returned by repair().
    report = [
        '{:d} typographical errors in ismailinsight.org corrected.'.format(
            summary['typos']),
        '{:d} style tags removed.'.format(summary['styles']),
        'Background fix {:s}applied.'.format(
            'not ' if summary['background'] == 0 else ''),
    ]
    print('\n'.join(report))
    set_code(args.file, html_doc)
Пример #12
0
    def get(self, *args, **kwargs):
        """Return a single object, looking it up in the index when ``pk`` is given.

        With a ``pk`` keyword the document is fetched from the pylucene index
        by its uid and wrapped in ``document.Document``. Without it the call
        is delegated to the parent class.

        :raises ObjectDoesNotExist: when the index has no document for ``pk``.
        """
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if "pk" not in kwargs:
            return super(QuerySet, self).get(*args, **kwargs)

        searcher = pylucene.Searcher(
            storage_path=self.index_model._meta.storage_path, )
        doc = searcher.get_document_by_uid(
            utils.get_uid(self.model, kwargs.get("pk")))
        if doc is None:
            # Call syntax is valid on both Python 2 and 3, unlike ``raise X, ""``.
            raise ObjectDoesNotExist("")

        return document.Document(doc)
Пример #13
0
 def setUp(self):
     """Prepare the environment before executing each test.

     Loads ``files/test.html`` into a document and applies the transforms
     from ``files/transform.yml`` while ``document.requests.get`` is replaced
     by ``remocks.get``. The return value of ``apply`` is kept in
     ``self._remaining``.
     """
     current_dir = os.path.dirname(os.path.abspath(__file__))
     with open(os.path.join(current_dir, 'files/test.html'),
               'r',
               encoding='UTF-8') as file:
         self._document = document.Document(file.read())
     with open(os.path.join(current_dir, 'files/transform.yml'),
               'r',
               encoding='UTF-8') as file:
         with mock.patch('document.requests.get', remocks.get):
             # safe_load: yaml.load() without an explicit Loader is unsafe
             # and is rejected by current PyYAML releases.
             self._remaining = self._document.apply(yaml.safe_load(file))
Пример #14
0
def apply(args):
    """Apply the transforms from ``args.transform_file`` to the document in ``args.file``.

    Transforms that could not be applied are printed as YAML. The document is
    written back with ``set_code`` in either case.
    """
    html_doc = document.Document(get_code(args.file))
    with open(args.transform_file, 'r', encoding='UTF-8') as tfr_file:
        # safe_load: yaml.load() without an explicit Loader can construct
        # arbitrary objects and is rejected by current PyYAML releases.
        tfr_json = yaml.safe_load(tfr_file)
    not_applied = html_doc.apply(tfr_json)

    if not not_applied:
        print('All transforms applied.')
    else:
        print('The following transforms could not be applied:')
        print(yaml.dump(not_applied))
    set_code(args.file, html_doc)
Пример #15
0
def getRankingScoreForDocument(query, documentID):
    """Score a document by how close ``query`` is to the queries stored on it.

    The score is the negated average Levenshtein distance between ``query``
    and every entry of the document's ``queries``, so a smaller distance
    gives a higher score. A document without queries gets -9999.
    """
    doc = document.Document(documentID)
    queriesLeadingToDoc = doc.queries

    if len(queriesLeadingToDoc) == 0:
        # Fixed low score for documents with no recorded queries.
        return -9999

    totalDistance = sum(
        sims.levenshtein(q, query) for q in queriesLeadingToDoc)
    # float() keeps the division a true division on Python 2 as well.
    return -(totalDistance / float(len(queriesLeadingToDoc)))
Пример #16
0
def document_from_query(query):
    """Run a Bing web search for ``query`` and wrap the results in a document.

    The results are rendered as a heading followed by one link per result,
    separated by ``<br/>``, and passed to ``document.Document`` as ``html``.
    """
    query_dict = {"$format": "json", "Query": "'{0}'".format(query)}
    url = "https://api.datamarket.azure.com/Bing/Search/Web?" + urlencode(
        query_dict)

    # b64encode works on bytes and returns bytes: encode the credentials
    # first and decode the result so it can be joined to the "Basic " prefix.
    credentials = "{0}:{0}".format(secrets.BING_API_KEY).encode("utf-8")
    auth_string = b64encode(credentials).decode("ascii")
    headers = {"Authorization": "Basic " + auth_string}

    _, content = Http().request(url, "GET", headers=headers)
    results = json.loads(content)['d']['results']

    # NOTE(review): query and result fields are inserted into the markup
    # without HTML escaping - confirm whether the consumer expects that.
    links = [
        u"<a href='{0}'>{1} ({2})</a>".format(r['Url'], r['Title'],
                                              r['DisplayUrl'])
        for r in results
    ]
    html = u"<h1>Web search for '{0}'</h1>".format(query) + u"<br/>".join(
        links)
    return document.Document(html=html)
Пример #17
0
 def bufferNew(self):
     """Reset the shell to a fresh, unnamed buffer.

     Returns the (true) result of the save suggestion when that cancels the
     operation, otherwise False.
     """
     aborted = self.bufferSuggestSave()
     if aborted:
         return aborted

     # Clear the shell and mark its content as saved.
     shell = self.sliceshell
     shell.clear()
     self.SetTitle('PySlices')
     shell.NeedsCheckForSave = False
     shell.SetSavePoint()

     # Attach a new document to the existing buffer.
     # NOTE: a call to self.bufferCreate() was commented out in the original.
     self.buffer.doc = document.Document()
     self.buffer.name = 'This shell'
     self.buffer.modulename = self.buffer.doc.filebase
     return False
Пример #18
0
 def __init__(self, title, body, width=400, **params):
     """Build a dialog showing ``body`` under ``title`` with an "Okay" button.

     ``width`` is the width of the text label and is also used to position
     the button. NOTE(review): ``params`` is accepted but not forwarded to
     ``Dialog.__init__`` - confirm whether that is intended.
     """
     heading = basic.Label(title)
     text = basic.WidthLabel(value=body, width=width)

     # The button closes the dialog when clicked.
     okay = button.Button("Okay", style={'width': 80, 'height': 40})
     okay.connect(CLICK, self.close)

     # Layout: text, two breaks, a spacer, then the button.
     panel = document.Document()
     panel.add(text)
     panel.br(10)
     panel.br(4)
     panel.space((width // 2 - 40, 4))
     panel.add(okay)

     Dialog.__init__(self, heading, panel)
Пример #19
0
def run():
    """Compare two text files and write their similarity to an output file.

    Reads three command-line arguments: the two document paths and the
    output path. The similarity is written to the output file and printed,
    both with two decimals, followed by "OK". Every failure ends the run
    early with a message instead of raising.
    """
    jieba.setLogLevel(logging.INFO)

    # get arguments from system terminal
    try:
        file_path_1, file_path_2, output_path = sys.argv[1:4]
    except ValueError:
        # Fewer than three arguments were supplied.
        print("缺少参数!")
        return

    # build Document object
    try:
        doc_1 = document.Document(file_path_1)
        doc_2 = document.Document(file_path_2)
    except Exception as exc:
        # Report the reason instead of returning silently. KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        sys.stderr.write("failed to load documents: %s\n" % exc)
        return

    # calculate similarity
    try:
        cosine_similiarity = document.caculate_similarity(doc_1, doc_2)
        formatted = "%.2f" % (cosine_similiarity)
    except Exception as exc:
        sys.stderr.write("failed to calculate similarity: %s\n" % exc)
        return

    # output
    try:
        # The context manager closes the file even when the write fails.
        with open(output_path, "w") as output_file:
            output_file.write(formatted)
    except (IOError, OSError):
        print("%s打开失败 "%(output_path))
        return

    print(formatted)
    print("OK")
Пример #20
0
 def open(self, filename):
     """Load ``filename`` into this buffer and, if an editor is set, show its text."""
     doc = document.Document(filename)
     self.doc = doc
     self.name = doc.filename or 'Untitled:' + str(self.id)
     self.modulename = doc.filebase

     # XXX This should really make sure filedir is first item in syspath.
     # XXX Or maybe this should be moved to the update namespace method.
     filedir = doc.filedir
     if filedir and filedir not in self.syspath:
         # To create the proper context for updateNamespace.
         self.syspath.insert(0, filedir)

     # A file that already exists on disk counts as confirmed.
     if doc.filepath and os.path.exists(doc.filepath):
         self.confirmed = True

     if self.editor:
         self.editor._setBuffer(buffer=self, text=doc.read())
Пример #21
0
    def test_website_typo(self):
        """repair() rewrites the misspelled ismailinsight.org link target."""
        markup = """
            <html>
                <body>
                    <a href="https://www.ismailinsight.org">FIX THE TYPO</a>
                </body>
            </html>
        """

        repaired = document.Document(markup)
        repaired.repair()

        # The href of the first anchor must carry the corrected domain.
        actual_href = repaired._data.a['href']
        self.assertEqual(
            "https://www.ismailiinsight.org", actual_href,
            'The typographical error in ismailinsight.org should be fixed.')
Пример #22
0
 def iterator(self):
     """Yield one item per query result, shaped by the field settings.

     Each raw row is wrapped in ``document.Document``. Without ``_fields``
     the document itself is yielded. With ``_fields``:

     * not ``flat``: a dict mapping each field name to its attribute value;
     * ``flat`` with ``_kind``: the filtered value of the first field;
     * ``flat`` without ``_kind``: a tuple of the attribute values.
     """
     for row in self.query.results_iter():
         row = document.Document(row, query=self.get_raw_query())
         if not self._fields:
             yield row
         elif not self.flat:
             yield {x: getattr(row, x) for x in self._fields}
         elif self._kind:
             # Build a real list before indexing: map() is not subscriptable
             # on Python 3. All fields are still filtered, as before.
             yield [row.filter(x, kind=self._kind)
                    for x in self._fields][0]
         else:
             yield tuple(getattr(row, x) for x in self._fields)
Пример #23
0
    def study_from(self, start_point):
        """Starts the main study loop from a starting wikipedia page.

        Resets ``docs``, ``register`` and ``how_many``, studies
        ``start_point``, then studies pages taken from the front of
        ``self.pages`` until it is empty, and finally calls ``analyze()``.
        """
        self.docs = []
        self.register = document.Document('')
        self.how_many = 1
        self.study(start_point)

        # IDEA Make this loop run on parallel
        # downloading and extracting information
        # NOTE(review): pop(0) assumes self.pages is a list - confirm.
        while self.pages:
            self.study(self.pages.pop(0))

        # generating final files
        self.analyze()
Пример #24
0
def restoreSession(key):
    """Recreate the documents and main windows stored under session ``key``."""
    settings = sessionSettings(key)

    # Re-activate the named session recorded in the settings, if any.
    session_name = settings.value('session_name', "", type(""))
    if session_name:
        import sessions
        sessions.setCurrentSession(session_name)

    # Re-open every stored document; an empty url gets a new document.
    doc = None
    for doc_index in range(settings.value('numdocuments', 0, int)):
        settings.beginGroup("document{0}".format(doc_index))
        url = settings.value("url", QUrl(), QUrl)
        if not url.isEmpty():
            try:
                doc = app.openUrl(url)
            except IOError:
                # A document that cannot be opened is skipped.
                pass
        else:
            import document
            doc = document.Document()
        settings.endGroup()

    # open at least one
    if doc is None:
        doc = app.openUrl(QUrl())

    # Without stored windows, show a single default main window.
    numwindows = settings.value('numwindows', 0, int)
    if numwindows <= 0:
        win = mainwindow.MainWindow()
        win.show()
        return

    for win_index in range(numwindows):
        settings.beginGroup("mainwindow{0}".format(win_index))
        win = mainwindow.MainWindow()
        win.readSessionSettings(settings)
        win.show()
        active_url = settings.value("active_document", QUrl(), QUrl)
        # we don't use app.findDocument because it does not allow empty url
        match = next(
            (d for d in app.documents if active_url == d.url()), None)
        if match is None:
            # No document has the stored url: fall back to the first one.
            match = app.documents[0]
        win.setCurrentDocument(match)
        settings.endGroup()
Пример #25
0
    def test_remove_styles(self):
        """repair() leaves no <style> tags in the document."""
        markup = """
            <html>
                <head>
                    <style>THIS IS VALID CSS</style>
                    <style>THIS IS ALSO VALID CSS</style>
                    <style>THIS IS NOT VALID CSS</style>
                </head>
            </html>
        """

        repaired = document.Document(markup)
        repaired.repair()

        # All three style tags from the markup must be gone.
        leftover_styles = repaired._data.find_all('style')
        self.assertEqual(0, len(leftover_styles),
                         'All style tags should be removed.')
Пример #26
0
 def newDocument(self):
     """ Creates a new document and fills it according to the user settings.

     The "new_document" setting selects the initial content: "template"
     inserts the snippet named by "new_document_template", "version" writes
     a \\version line, and any other value leaves the document empty.
     """
     d = document.Document()
     self.setCurrentDocument(d)

     settings = QSettings()
     mode = settings.value("new_document", "empty", str)
     template = settings.value("new_document_template", "", str)

     if mode == "version":
         import lilypondinfo
         version = lilypondinfo.preferred().versionString()
         d.setPlainText('\\version "{0}"\n\n'.format(version))
         d.setModified(False)
     elif mode == "template" and template:
         from snippet import snippets, insert
         if snippets.text(template):
             insert.insert(template, self.currentView())
             # Toggle undo/redo off and on again
             # (d.clearUndoRedoStacks() only in Qt >= 4.7).
             d.setUndoRedoEnabled(False)
             d.setUndoRedoEnabled(True)
             d.setModified(False)
Пример #27
0
def main(argv):
    """Import the documents listed in a JSON file into the database.

    The file named by the ``<file>`` argument holds a list of records. Each
    is turned into a ``document.Document`` and saved; records that already
    exist are reported and skipped.
    """
    args = docopt(__doc__, argv = argv)
    filename = args['<file>']
    db, _ = get_dbfs()

    with open(filename) as source:
        records = json.load(source)

    for record in records:
        # Dates are parsed from text and reduced to their date part.
        created = dateutil.parser.parse(record['creation_date']).date()
        dated = dateutil.parser.parse(record['document_date']).date()
        doc = document.Document(
            uuid=uuidlib.UUID(record['uuid']),
            name=record['name'],
            creation_date=created,
            document_date=dated,
            tags=record['tags'],
            extra=record['extra'],
        )
        try:
            db.save(doc)
        except database.ExistsError:
            print(str(doc.uuid) + " already exists")
Пример #28
0
def review(args):
    """Review the document in ``args.file``, print the counters and store the result."""
    html_doc = document.Document(get_code(args.file))
    summary = html_doc.review()

    # One report line per counter, in the order link / anchor / email.
    links = summary['links']
    report = [
        '{:d} blank links removed.'.format(links['removed']),
        '{:d} misdirected links set to open in new window.'.format(
            links['retargetted']),
        '{:d} double-tracked links decoded.'.format(links['decoded']),
        '{:d} broken links marked.'.format(links['broken']),
        '{:d} unchecked links marked.'.format(links['unchecked']),
        '{:d} links referencing missing anchors marked.'.format(
            summary['anchors']['marked']),
        '{:d} emails cleaned.'.format(summary['emails']['cleaned']),
        '{:d} invalid emails marked.'.format(summary['emails']['invalid']),
        '{:d} unchecked emails marked.'.format(
            summary['emails']['unchecked']),
    ]
    print('\n'.join(report))
    set_code(args.file, html_doc)
Пример #29
0
    def test_do_nothing(self):
        """repair() leaves markup that is already correct unchanged."""
        markup = """
            <html>
                <head>
                </head>
                <body>
                    <div style="background-color: #595959;">
                        <a href="https://www.ismailiinsight.com">NOTHING SHOULD BE CHANGED</a>
                    </div>
                </body>
            </html>
        """

        repaired = document.Document(markup)
        repaired.repair()

        # Parsing the same markup directly gives the expected, untouched tree.
        expected = bs4.BeautifulSoup(markup, 'html5lib')
        self.assertEqual(
            expected, repaired._data,
            'The code should not be changed if it is already correct.')
Пример #30
0
def main():
    '''
    Entry point of the PyPt program: load the configuration, read the input
    document, then open the window and run its event loop.
    '''
    # Configuration: arguments first, then the config file, then write back
    settings = config.Config()
    settings.readArgu()
    settings.readConf()
    settings.writeConf()
    theme = config.Config.theme

    # Read the input file named by the configuration
    doc = document.Document()
    doc.readFile(config.Config.inName)

    # Build the resizable main window
    win = window.Window(doc, theme)
    win.master.resizable(width=True, height=True)

    # Hand control to the event loop
    log.info('Begin main loop')
    win.mainloop()