예제 #1
0
 def load_collection(self):
     """Build one ``Document`` per entry yielded by ``la('docs/')``.

     Every entry is opened as ``'docs/' + name`` and read line by line;
     whatever ``self.lineToarray`` returns for each line is appended to a
     single list, which is handed to ``Document`` together with the entry
     name.

     Returns:
         list: the created ``Document`` objects, in the order ``la``
         produced the entries.
     """
     documents = []
     for doc in la('docs/'):
         bow = []
         # A context manager guarantees the handle is closed; the original
         # ``open(...).readlines()`` left that to the garbage collector.
         with open('docs/' + doc, 'r') as handle:
             for line in handle:
                 bow += self.lineToarray(line)
         documents.append(Document(doc, bow))
     return documents
예제 #2
0
 def __init__(self, fileName):
     """Load *fileName* into a new ``Document`` and run the model passes.

     The file content is passed to ``decode`` together with the freshly
     created document; afterwards ``getModelInfo``, ``printModelInfo``,
     ``vhLoop`` and ``numberScaffold`` are called in that order.

     Args:
         fileName: path of the file to read.
     """
     self._fN = fileName
     self._doc = Document()
     self._numStaples = 0
     self._scaffoldLength = 0
     # ``file()`` only exists in Python 2 and the handle was never closed;
     # ``open`` inside ``with`` works on both major versions and releases
     # the handle as soon as the content has been read.
     with open(self._fN) as handle:
         contents = handle.read()
     decode(self._doc, contents)
     self.getModelInfo()
     self.printModelInfo()
     self.vhLoop()
     self.numberScaffold()
예제 #3
0
    def _create_item(self, metadata, parent):
        """Instantiate the item described by *metadata* and attach it to *parent*.

        ``metadata["Type"]`` selects the class: ``"CollectionType"`` creates a
        ``Collection`` and ``"DocumentType"`` creates a ``Document``. Any other
        value raises ``Exception``.

        Returns:
            The new object, after it was passed to ``parent.add_child``.
        """
        kind = metadata["Type"]

        if kind == "CollectionType":
            factory = Collection
        elif kind == "DocumentType":
            factory = Document
        else:
            raise Exception("Unknown type %s" % kind)

        item = factory(metadata, parent)
        parent.add_child(item)
        return item
예제 #4
0
 def searchDocuments(self):
     """Fill the application's document list from the current search text.

     The search text is stored on the application as ``currentWord`` and
     previous results are cleared. With an empty search text every path in
     ``self.indexer.data['FILES']`` is listed. Otherwise the paths returned
     by ``parseResult`` are wrapped in ``Document`` objects, sorted by
     ``rank`` in descending order and listed. A ``ValueError`` raised while
     doing so is ignored.
     """
     app = QApplication.instance()
     app.doclist.show()
     query = self.search.text()
     app.currentWord = query
     self.clearResults()

     if query == "":
         for filepath in self.indexer.data['FILES']:
             entry = Document(filepath, os.path.basename(filepath))
             app.doclist.addItem(entry)
             # NOTE(review): the dock is shown once per listed file (and not
             # at all when there are no files) - confirm this is intended.
             self.parent.leftDock.show()
         return

     try:
         ranked = [
             Document(filepath, os.path.basename(filepath), query)
             for filepath in parseResult(query)
         ]
         ranked.sort(key=lambda entry: entry.rank, reverse=True)
         for entry in ranked:
             app.doclist.addItem(entry)
         self.parent.leftDock.show()
     except ValueError:
         pass
예제 #5
0
def read_articles(path):
    """Load every file in *path* as a processed ``Document``.

    Each file gets a running index (starting at 0) as its first ``Document``
    argument and its raw content as the second. The document is then given
    ``words`` and ``sentences`` from ``process_document``, passed through
    ``count_tfidf`` and given a ``graph`` from ``create_graph``.

    Returns:
        list: the documents, in ``os.listdir`` order.
    """
    articles = []

    for index, filename in enumerate(os.listdir(path)):
        with open(path + '/' + filename) as handle:
            article = Document(index, handle.read())
            article.words, article.sentences = process_document(
                article.raw_text)
            count_tfidf(article.sentences, article.words)
            article.graph = create_graph(article.sentences, article.words)
            articles.append(article)

    return articles
예제 #6
0
def upload_file():
    """Import documents from an uploaded CSV file into a project.

    Reads ``projectName`` from the request form and the ``inputFile`` upload
    from the request files. The upload is saved under ``uploads_dir``, its
    first row is skipped as a header, and every following non-empty row
    becomes a ``Document(ObjectId(row[0]), [], row[1])`` that is added to
    the project. The saved file is always removed afterwards.

    Returns:
        The result of ``make_response`` on a dict carrying ``status_code``
        and ``message``. As in the original code the status is only part of
        the body, not set separately on the response.
    """
    if request.method != 'POST':
        # This path used to fall off the end of the function and return None.
        return make_response({
            'status_code': 405,
            'message': 'Method not allowed'
        })

    if 'projectName' not in request.form:
        return make_response({
            'status_code': 400,
            'message': 'No project id provided'
        })
    project_name = str(request.form['projectName'])

    if 'inputFile' not in request.files:
        return make_response({
            'status_code': 400,
            'message': 'No file selected'
        })

    upload = request.files['inputFile']
    # A falsy upload object used to fall through and return None as well.
    if upload.filename == '' or not upload:
        return make_response({
            'status_code': 400,
            'message': 'No file selected'
        })

    filename = secure_filename(upload.filename)
    if not filename:
        # Sanitising can strip the name down to nothing; joining an empty
        # name would point at the uploads directory itself.
        return make_response({
            'status_code': 400,
            'message': 'No file selected'
        })

    filelocation = os.path.join(uploads_dir, filename)
    upload.save(filelocation)

    try:
        with open(filelocation) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",")
            next(csv_reader, None)  # skip the header row

            for row in csv_reader:
                if not row:
                    continue  # blank line in the CSV
                document = Document(ObjectId(row[0]), [], row[1])
                # Find project database and populate document collection.
                # NOTE(review): a Project is built per row, as before;
                # confirm whether one instance could be reused.
                project = Project(project_name, [], [])
                project.add_document(document)
    finally:
        # Delete the temporary file even when the import fails part-way.
        os.remove(filelocation)

    return make_response({
        'status_code': 200,
        'message': 'Documents imported successfully'
    })
def create_document():
    """Create a document from the JSON request body and upload it.

    The body must contain ``project`` and ``content``. A missing key yields
    a 400 response with an explanatory message. On success the document is
    uploaded to the given project and an empty 204 response is returned.
    """
    payload = request.json

    if 'project' not in payload:
        return make_response({'message': "Missing project"}), 400
    project = payload['project']

    if 'content' not in payload:
        return make_response({'message': "Missing content"}), 400
    content = payload['content']

    new_doc = Document(content, [], [])
    new_doc.data = content
    new_doc.upload(project)
    return '', 204
예제 #8
0
def to_documents(lines):
    """Parse sparse bag-of-words lines into ``Document`` objects.

    Each line is expected to look like
    ``<num_terms> <term>:<count> <term>:<count> ...`` where every field is
    an integer. Fields may be separated by any run of whitespace, and blank
    lines are skipped. (Previously a blank line or a doubled space raised
    ``ValueError``.)

    Args:
        lines: iterable of text lines.

    Returns:
        list: one ``Document(num_terms, num_words, terms, counts)`` per
        non-blank line, where ``num_words`` is the sum of the counts.
    """
    documents = []
    for line in lines:
        fields = line.split()
        if not fields:
            continue  # blank line, e.g. a trailing newline at end of file

        num_terms = int(fields[0])  # number of unique terms
        terms = []
        counts = []
        for field in fields[1:]:
            parts = field.split(':')
            terms.append(int(parts[0]))   # term id
            counts.append(int(parts[1]))  # number of occurrences
        num_words = sum(counts)

        documents.append(Document(num_terms, num_words, terms, counts))
    return documents
예제 #9
0
def create_document(project_name):
    """Create and upload a document for *project_name* on behalf of a contributor.

    The ``id_token`` query argument is resolved to an e-mail address through
    ``get_email``. That address must belong to a user in the project's
    ``users`` collection with ``isContributor`` set. The JSON body must
    contain ``content``.

    Returns:
        ``('', 204)`` on success; otherwise a ``(response, status)`` pair
        with status 400 (missing or invalid token, missing content) or 403
        (requestor is not a contributor).
    """
    def reject(message, status):
        # Build the (response, status) pair used by every error path.
        return make_response({'message': message}), status

    id_token = request.args.get('id_token')
    if id_token is None or id_token == "":
        return reject(
            "ID Token is not included with the request uri in args", 400)

    requestor_email = get_email(id_token)
    if requestor_email is None:
        return reject("ID Token has expired or is invalid", 400)

    contributor_query = {'email': requestor_email, 'isContributor': True}
    requestor = get_col(project_name, "users").find_one(contributor_query)
    if requestor is None:
        return reject("You are not authorised to perform this action", 403)

    payload = request.json
    if 'content' not in payload:
        return reject("Missing content", 400)
    content = payload['content']

    new_doc = Document(content, [], [])
    new_doc.data = content
    new_doc.upload(project_name)
    return '', 204
예제 #10
0
파일: app.py 프로젝트: etvincen/filRouge
def upload():
    """Extract content and metadata from an uploaded file and return them as JSON.

    On POST the ``file`` upload is wrapped in a ``Document`` and
    ``refersTo()`` is called on it. If the result has an ``error`` key, a
    400 response carrying that message is returned. Otherwise the
    ``content`` entry is placed at the top level of the result, every other
    entry plus the upload's MIME type goes under ``metadata``, and a copy is
    written to ``<cwd>/app/output_dir/<name>.json`` on a best-effort basis.

    Any other HTTP method gets a 405 response.
    """
    if request.method != 'POST':
        resp = jsonify(
            {'message': 'Cette méthode ne peut être exécuté que par un POST'})
        resp.status_code = 405
        return resp

    output_dir = os.path.join(os.path.join(os.getcwd(), 'app'), 'output_dir/')
    file = request.files['file']
    f_name = file.filename
    doc = Document(file, f_name)
    _data = doc.refersTo()

    if 'error' in _data:
        resp = jsonify({'message': _data['error']})
        resp.status_code = 400
        return resp

    dico = {'metadata': {}}
    for key, value in _data.items():
        if key != "content":
            dico['metadata'][key] = value
        else:
            dico[key] = value
    dico['metadata']['mime_type'] = file.content_type

    # The filename is client-controlled: keep only its last path component
    # (for either separator style) so the dump cannot land outside
    # output_dir.
    safe_name = os.path.basename((f_name or '').replace('\\', '/'))
    target = os.path.join(output_dir, safe_name.split('.')[0]) + '.json'
    try:
        with open(target, 'w') as outfile:
            json.dump(dico, outfile)
    except (IOError, OSError, TypeError, ValueError):
        # Writing the copy is best-effort; the response is still returned.
        print("Can't write json")

    return jsonify(dico)
    def __init__(self):
        """Create the controller's document, load settings and build its window.

        A new ``Document`` is created and told that this object is its
        controller. All view, dialog and file-related attributes start out
        as ``None``. After the settings are read, the window is initialised,
        Maya-specific setup runs when ``app().isInMaya()`` is true, and the
        controller adds itself to ``app().documentControllers``.
        """
        # initialize variables
        self._document = Document()
        self._document.setController(self)
        self._activePart = None
        self._filename = None
        self._fileOpenPath = None  # will be set in _readSettings
        self._hasNoAssociatedFile = True
        self._pathViewInstance = None
        self._sliceViewInstance = None
        self._undoStack = None
        self.win = None
        self.fileopendialog = None
        self.filesavedialog = None

        # Settings must exist before _readSettings() is called.
        self.settings = QSettings()
        self._readSettings()

        # call other init methods
        self._initWindow()
        if app().isInMaya():
            self._initMaya()
        # Register this controller with the application.
        app().documentControllers.add(self)
def save_document(document):
    """Persist a new ``Document`` for the current user and return it.

    The document is built from ``document['name']`` and
    ``jwt.current_user.id``, added to ``db.session`` and committed.
    """
    record = Document(document['name'], jwt.current_user.id)
    session = db.session
    session.add(record)
    session.commit()
    return record
예제 #13
0
def upload_file():
    """Import documents from an uploaded CSV file on behalf of a contributor.

    The ``id_token`` query argument is resolved to an e-mail address through
    ``get_email``. That address must belong to a user in the project's
    ``users`` collection with ``isContributor`` set. The ``inputFile``
    upload is saved under ``uploads_dir``, its first row is skipped as a
    header, and the second column of every following non-empty row becomes
    a ``Document`` that is added to the project. The saved file is always
    removed afterwards.

    Returns:
        A ``(response, status)`` pair: 200 on success, 400 for a missing or
        invalid token, project name or file, 403 when the requestor is not a
        contributor, 405 for non-POST requests.
    """
    def reject(message, status):
        # Build the (response, status) pair used by every error path.
        return make_response({'message': message}), status

    id_token = request.args.get('id_token')
    if id_token is None or id_token == "":
        return reject(
            "ID Token is not included with the request uri in args", 400)

    requestor_email = get_email(id_token)
    if requestor_email is None:
        return reject("ID Token has expired or is invalid", 400)

    if request.method != 'POST':
        # This path used to fall off the end of the function and return None.
        return reject('Method not allowed', 405)

    if 'projectName' not in request.form:
        return reject('No project id provided', 400)
    project_name = str(request.form['projectName'])

    users_col = get_col(project_name, "users")
    requestor = users_col.find_one({
        'email': requestor_email,
        'isContributor': True
    })
    if requestor is None:
        return reject("You are not authorised to perform this action", 403)

    if 'inputFile' not in request.files:
        return reject('No file selected', 400)

    upload = request.files['inputFile']
    # A falsy upload object used to fall through and return None as well.
    if upload.filename == '' or not upload:
        return reject('No file selected', 400)

    filename = secure_filename(upload.filename)
    if not filename:
        # Sanitising can strip the name down to nothing; joining an empty
        # name would point at the uploads directory itself.
        return reject('No file selected', 400)

    filelocation = os.path.join(uploads_dir, filename)
    upload.save(filelocation)

    try:
        with open(filelocation) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",")
            next(csv_reader, None)  # skip the header row

            for row in csv_reader:
                if not row:
                    continue  # blank line in the CSV
                document = Document(row[1], [], [])
                # Find project database and populate document collection.
                # NOTE(review): a Project is built per row, as before;
                # confirm whether one instance could be reused.
                project = Project(project_name, [], [])
                project.add_document(document)
    finally:
        # Delete the temporary file even when the import fails part-way.
        os.remove(filelocation)

    return make_response({'message': 'Documents imported successfully'}), 200
def test_setup_document():
    """A Document built with "data" as first argument exposes it as ``data``."""
    expected = "data"
    subject = Document(expected, [], [])
    assert subject.data == "data"
예제 #15
0
from model.document import Document
from utils.DBUtils import get_db_session, array_to_bytes
from utils.InitializeUtils import initialize

if __name__ == '__main__':
    # Script entry point: build the documents frame via initialize() and
    # insert one Document row per frame row, committing after each insert.
    session = get_db_session()

    print("Initializing...")
    docs_df = initialize()
    print("Initialization completed!\n")

    print("Inserting documents into db...")
    for _, record in docs_df.iterrows():
        fields = dict(
            id=record['id'],
            path=record['path'],
            filename=record['filename'],
            text=array_to_bytes(record['text']),
        )
        entry = Document(**fields)
        print("Inserting obj:", entry)
        session.add(entry)
        session.commit()
    print("Done!")
예제 #16
0
    def parse(self, file_path):
        """Parse a TempEval-3 annotated file into a Document.

        The text of the first ``TEXT`` element is extracted and parsed with
        ``CORENLP``. The resulting sentences and words, the values derived
        from ``self.get_dct`` and the gold annotations are packed into a
        Document object, which is our internal representation.

        Args:
            file_path: path of the annotated XML file.

        Returns:
            The populated Document.

        Raises:
            AssertionError: if *file_path* is not an existing file.
        """
        assert os.path.isfile(file_path), 'File path does not exist!'
        logging.info('Document {}: parsing...'.format(
            os.path.relpath(file_path)))
        xml = etree.parse(file_path)
        text_node = xml.findall(".//TEXT")[0]
        text_string = etree.tostring(text_node, method='text', encoding='utf8')
        text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
        text_string = unicode(text_string, 'UTF-8')
        text_xml = unicode(text_xml, 'UTF-8')
        # Whatever was serialized after the closing tag is assumed to sit at
        # the end of the text serialization too; cut that many characters.
        right_chars = len(text_xml.split('</TEXT>')[1])
        # Guard the zero case: ``text_string[:-0]`` is the empty string, so
        # the whole text would be thrown away when nothing follows the tag.
        if right_chars:
            text_string = text_string[:-right_chars]

        # StanfordParser strips internally the text :(
        # so remember how much leading whitespace there is in order to shift
        # the character offsets it reports.
        left_chars = len(text_string) - len(text_string.lstrip())
        with Mute_stderr():
            stanford_tree = CORENLP.parse(text_string)

        document = Document(file_path)
        document.text_offset = left_chars
        document.file_path = os.path.abspath(file_path)
        document.doc_id = os.path.basename(file_path)
        document.sec_times = self.get_dct(file_path)
        document.dct = document.sec_times.admission_date
        document.dct_text = document.dct.replace('-', '')
        document.title = os.path.basename(file_path)
        document.text = text_string
        document._coref = stanford_tree.get('coref', [])

        for num_sen, stanford_sentence in\
                enumerate(stanford_tree['sentences']):
            collp_deps = stanford_sentence.get('collapsed_dependencies', None)
            basic_deps = stanford_sentence.get('basic_dependencies', None)
            parsetree = stanford_sentence.get('parsetree', u'')

            sentence_text = stanford_sentence.get('text', u'')

            sentence = Sentence(id_sentence=num_sen,
                                basic_dependencies=basic_deps,
                                collapsed_dependencies=collp_deps,
                                parsetree=parsetree,
                                text=sentence_text)
            for num_word, (word_form, attr) in\
                    enumerate(stanford_sentence['words']):
                # Offsets are shifted by the leading whitespace measured above.
                offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
                offset_end = int(attr['CharacterOffsetEnd']) - left_chars
                word = Word(word_form=word_form,
                            char_offset_begin=offset_begin,
                            char_offset_end=offset_end,
                            lemma=attr['Lemma'],
                            named_entity_tag=attr['NamedEntityTag'],
                            part_of_speech=attr['PartOfSpeech'],
                            id_token=num_word,
                            id_sentence=num_sen)
                sentence.words.append(word)
            document.sentences.append(sentence)

        document.gold_annotations = self._get_annotations(xml, document)
        document.store_gold_annotations()
        document.complete_structure()

        logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
        return document
예제 #17
0
 def init(cls):
     """Create the current document and send VTK messages to a log file."""
     cls._current_document = Document()
     # File output window used to store the VTK messages
     vtk_log = vtk.vtkFileOutputWindow()
     # File where the messages are written
     vtk_log.SetFileName("logs/vtk.log")
     # Use this file from now on
     vtk_log.SetInstance(vtk_log)
예제 #18
0
from operator import attrgetter

from model.document import Document
from model.inverted_index import InvertedIndex

if __name__ == "__main__":
    # Demo script: build three sample documents, print them, then print the
    # inverted index built from them.
    document = [
        Document(
            "1",
            "Candi Prambanan merupakan salah satu candi yang ada di Indonesia"),
        Document(
            "2", "Indonesia Merupakan Negara Yang Mempunyai Banyak Candi"),
        Document("3", "Liburan di Negara Berkembang"),
    ]

    # Display the content of every document. The throw-away ``Document()``
    # that used to be created before this loop was removed: the loop
    # variable overwrote it immediately.
    for doc in document:
        print(doc.docId + " -> " + doc.content)

    print("=======")
    print("Terms : ")
    print()

    iIndex = InvertedIndex.toInvertedIndex(document)
    print(iIndex)
예제 #19
0
 def __init__(self, term="", posting=None):
     """Store a term together with its posting.

     Args:
         term: the term text; defaults to the empty string.
         posting: the posting for the term. When omitted, a fresh
             ``Posting(Document())`` is created for this instance.
     """
     # The previous default ``Posting(Document())`` was evaluated once, when
     # the function was defined, so every instance created without an
     # explicit posting shared the same Posting object.
     if posting is None:
         posting = Posting(Document())
     self.posting = posting
     self.term = term
예제 #20
0
def test_setup_document():
    """A Document built from (1, "data") exposes both values as attributes."""
    subject = Document(1, "data")
    identifier_matches = subject.identifier == 1
    assert identifier_matches and subject.data == "data"