def load_collection(self):
    """Load every document under docs/ into Document objects.

    Returns:
        list[Document]: one Document per name yielded by ``la('docs/')``,
        each holding the bag-of-words built by ``self.lineToarray``.
    """
    documents = []
    for doc in la('docs/'):
        # Context manager closes the handle promptly; the original called
        # open() without ever closing, leaking one handle per document.
        with open('docs/' + doc, 'r') as fh:
            lines = fh.readlines()
        bow = []
        for line in lines:
            bow += self.lineToarray(line)
        documents.append(Document(doc, bow))
    return documents
def __init__(self, fileName):
    """Load a design file and run the model/scaffold analysis pipeline.

    Args:
        fileName: path to the serialized design file passed to decode().
    """
    self._fN = fileName
    self._doc = Document()
    self._numStaples = 0
    self._scaffoldLength = 0
    # open() replaces the Python-2-only file() builtin, and the context
    # manager guarantees the handle is closed once the data is read.
    with open(self._fN) as fh:
        decode(self._doc, fh.read())
    self.getModelInfo()
    self.printModelInfo()
    self.vhLoop()
    self.numberScaffold()
def _create_item(self, metadata, parent):
    """Build the child object described by metadata["Type"], attach it to
    *parent* via add_child, and return it.

    Raises:
        Exception: when metadata["Type"] is neither CollectionType nor
            DocumentType.
    """
    item_type = metadata["Type"]
    if item_type == "CollectionType":
        child = Collection(metadata, parent)
    elif item_type == "DocumentType":
        child = Document(metadata, parent)
    else:
        raise Exception("Unknown type %s" % item_type)
    parent.add_child(child)
    return child
def searchDocuments(self):
    """Run the current query and fill the document-list dock with results."""
    app = QApplication.instance()
    app.doclist.show()
    query = self.search.text()
    app.currentWord = query
    self.clearResults()
    if query == "":
        # Empty query: list every indexed file, unranked.
        for filepath in self.indexer.data['FILES']:
            app.doclist.addItem(Document(filepath, os.path.basename(filepath)))
        self.parent.leftDock.show()
    else:
        try:
            ranked = [Document(fp, os.path.basename(fp), query)
                      for fp in parseResult(query)]
            ranked.sort(key=lambda d: d.rank, reverse=True)
            for doc in ranked:
                app.doclist.addItem(doc)
            self.parent.leftDock.show()
        except ValueError:
            # Malformed query expressions are silently ignored.
            pass
def read_articles(path):
    """Read every file in *path* and build one processed Document per article.

    Args:
        path: directory containing one plain-text article per file.

    Returns:
        list[Document]: documents with words/sentences split, tf-idf
        counted and the sentence graph attached.
    """
    articles = []
    # enumerate() replaces the manual cnt counter; os.path.join replaces
    # fragile string concatenation of path components.
    for doc_id, filename in enumerate(os.listdir(path)):
        with open(os.path.join(path, filename)) as fh:
            doc = Document(doc_id, fh.read())
        doc.words, doc.sentences = process_document(doc.raw_text)
        count_tfidf(doc.sentences, doc.words)
        doc.graph = create_graph(doc.sentences, doc.words)
        articles.append(doc)
    return articles
def upload_file():
    # Flask view: import documents into a project from an uploaded CSV.
    # Expects a POST with form field 'projectName' and file field 'inputFile'.
    if request.method == 'POST':
        if 'projectName' in request.form:
            project_name = str(request.form['projectName'])
            if 'inputFile' not in request.files:
                # NOTE(review): the status code lives only in the JSON body;
                # make_response(dict) still replies HTTP 200 — confirm intent.
                response = {'status_code': 400, 'message': 'No file selected'}
                response = make_response(response)
                return response
            file = request.files['inputFile']
            if file.filename == '':
                response = {'status_code': 400, 'message': 'No file selected'}
                response = make_response(response)
                return response
            if file:
                # secure_filename strips path components from the client name.
                filename = secure_filename(file.filename)
                filelocation = os.path.join(uploads_dir, filename)
                file.save(filelocation)
                with open(filelocation) as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter=",")
                    is_first_line = True
                    for row in csv_reader:
                        if is_first_line:
                            # Skip the CSV header row.
                            is_first_line = False
                        else:
                            # Presumably row[0] is the document's ObjectId and
                            # row[1] its data — TODO confirm the CSV schema.
                            document = Document(ObjectId(row[0]), [], row[1])
                            # Find project database and populate document collection
                            project = Project(project_name, [], [])
                            project.add_document(document)
                # Delete file when done
                os.remove(filelocation)
                response = {
                    'status_code': 200,
                    'message': 'Documents imported successfully'
                }
                return make_response(response)
        else:
            response = {
                'status_code': 400,
                'message': 'No project id provided'
            }
            response = make_response(response)
            return response
def create_document():
    """Create a document from the JSON payload and upload it to its project.

    Returns 400 when 'project' or 'content' is missing, 204 on success.
    """
    payload = request.json
    if 'project' not in payload:
        return make_response({'message': "Missing project"}), 400
    if 'content' not in payload:
        return make_response({'message': "Missing content"}), 400
    project = payload['project']
    content = payload['content']
    doc = Document(content, [], [])
    doc.data = content
    doc.upload(project)
    return '', 204
def to_documents(lines):
    """Parse bag-of-words lines of the form "N term:count term:count ..."
    into Document objects (terms, per-term counts, and totals)."""
    documents = []
    for line in lines:
        fields = line.strip().split(' ')
        unique_terms = int(fields[0])  # number of unique terms
        terms, counts = [], []
        total_words = 0
        for pair in fields[1:]:
            term_id, occurrences = pair.split(':')
            terms.append(int(term_id))
            counts.append(int(occurrences))
            total_words += int(occurrences)
        documents.append(Document(unique_terms, total_words, terms, counts))
    return documents
def create_document(project_name):
    """Create a document in *project_name* for an authorised contributor.

    Auth: requires a valid 'id_token' query arg belonging to a user whose
    record in the project's users collection has isContributor=True.
    Body: JSON with a 'content' field. Returns 204 on success.
    """
    id_token = request.args.get('id_token')
    if id_token is None or id_token == "":
        message = "ID Token is not included with the request uri in args"
        return make_response({'message': message}), 400
    requestor_email = get_email(id_token)
    if requestor_email is None:
        return make_response({'message': "ID Token has expired or is invalid"}), 400
    users_col = get_col(project_name, "users")
    requestor = users_col.find_one({
        'email': requestor_email,
        'isContributor': True
    })
    if requestor is None:
        message = "You are not authorised to perform this action"
        return make_response({'message': message}), 403
    if 'content' not in request.json:
        return make_response({'message': "Missing content"}), 400
    content = request.json['content']
    doc = Document(content, [], [])
    doc.data = content
    doc.upload(project_name)
    return '', 204
def upload():
    """Flask view: extract metadata from an uploaded file via Document.refersTo().

    On POST, returns the metadata as JSON and best-effort writes it to
    output_dir as <basename>.json; other methods get a 405.
    """
    dico = {}
    dico['metadata'] = {}
    output_dir = os.path.join(os.path.join(os.getcwd(), 'app'), 'output_dir/')
    if request.method == 'POST':
        file = request.files['file']
        f_name = request.files["file"].filename
        doc = Document(file, f_name)
        _data = doc.refersTo()
        if 'error' not in list(_data.keys()):
            # The extracted 'content' goes at the top level; everything
            # else is grouped under 'metadata'.
            for key, value in _data.items():
                if key != "content":
                    dico['metadata'][key] = value
                else:
                    dico[key] = value
            dico['metadata']['mime_type'] = request.files["file"].content_type
        else:
            resp = jsonify({'message': _data['error']})
            resp.status_code = 400
            return resp
        try:
            with open(os.path.join(output_dir, f_name.split('.')[0]) + '.json',
                      'w+') as outfile:
                json.dump(dico, outfile)
        except (OSError, TypeError):
            # Narrowed from a bare except: filesystem failures and
            # non-serializable values are best-effort; other bugs surface.
            print("Can't write json")
        return jsonify(dico)
    else:
        resp = jsonify(
            {'message': 'Cette méthode ne peut être exécuté que par un POST'})
        resp.status_code = 405
        return resp
def __init__(self): """docstring for __init__""" # initialize variables self._document = Document() self._document.setController(self) self._activePart = None self._filename = None self._fileOpenPath = None # will be set in _readSettings self._hasNoAssociatedFile = True self._pathViewInstance = None self._sliceViewInstance = None self._undoStack = None self.win = None self.fileopendialog = None self.filesavedialog = None self.settings = QSettings() self._readSettings() # call other init methods self._initWindow() if app().isInMaya(): self._initMaya() app().documentControllers.add(self)
def save_document(document):
    """Persist a new Document named document['name'], owned by the current
    JWT user, and return the committed instance."""
    new_doc = Document(document['name'], jwt.current_user.id)
    db.session.add(new_doc)
    db.session.commit()
    return new_doc
def upload_file():
    """Import documents into *projectName* from an uploaded CSV (contributors only).

    Auth: requires a valid 'id_token' query arg for a contributor of the
    project. Form: 'projectName' plus an 'inputFile' CSV whose data rows
    carry the document content in column 1; the header row is skipped.
    """
    id_token = request.args.get('id_token')
    if id_token is None or id_token == "":
        response = {
            'message': "ID Token is not included with the request uri in args"
        }
        return make_response(response), 400
    requestor_email = get_email(id_token)
    if requestor_email is None:
        response = {'message': "ID Token has expired or is invalid"}
        return make_response(response), 400
    if request.method == 'POST':
        if 'projectName' in request.form:
            project_name = str(request.form['projectName'])
        else:
            response = {'message': 'No project id provided'}
            return make_response(response), 400
        users_col = get_col(project_name, "users")
        requestor = users_col.find_one({
            'email': requestor_email,
            'isContributor': True
        })
        if requestor is None:
            response = {
                'message': "You are not authorised to perform this action"
            }
            return make_response(response), 403
        if 'inputFile' not in request.files:
            response = {'message': 'No file selected'}
            return make_response(response), 400
        file = request.files['inputFile']
        if file.filename == '':
            response = {'message': 'No file selected'}
            return make_response(response), 400
        filename = secure_filename(file.filename)
        filelocation = os.path.join(uploads_dir, filename)
        file.save(filelocation)
        # newline='' is the documented way to open files for the csv module.
        with open(filelocation, newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",")
            next(csv_reader, None)  # skip the header row (replaces flag loop)
            for row in csv_reader:
                document = Document(row[1], [], [])
                # Find project database and populate document collection
                project = Project(project_name, [], [])
                project.add_document(document)
        # Delete file when done
        os.remove(filelocation)
        response = {'message': 'Documents imported successfully'}
        return make_response(response), 200
def test_setup_document():
    """A freshly built Document should expose its data verbatim."""
    doc = Document("data", [], [])
    assert doc.data == "data"
from model.document import Document
from utils.DBUtils import get_db_session, array_to_bytes
from utils.InitializeUtils import initialize

# Script entry point: build the initial document corpus and persist it to the db.
if __name__ == '__main__':
    session = get_db_session()
    print("Initializing...")
    docs_df = initialize()
    print("Initialization completed!\n")
    print("Inserting documents into db...")
    # One Document row per dataframe row; the text column is serialized to bytes.
    for index, row in docs_df.iterrows():
        doc = Document(id=row['id'], path=row['path'], filename=row['filename'],
                       text=array_to_bytes(row['text']))
        print("Inserting obj:", doc)
        session.add(doc)
        # NOTE(review): commit placement reconstructed from a flattened
        # source — it may belong after the loop; confirm against history.
        session.commit()
    print("Done!")
def parse(self, file_path):
    """It parses the content of file_path and extracts relevant information from a TempEval-3 annotated file. Those information are packed in a Document object, which is our internal representation.
    """
    assert os.path.isfile(file_path), 'File path does not exist!'
    logging.info('Document {}: parsing...'.format(
        os.path.relpath(file_path)))
    xml = etree.parse(file_path)
    text_node = xml.findall(".//TEXT")[0]
    # Serialize the <TEXT> node twice: once as plain text, once as XML,
    # so the tail after </TEXT> can be measured and trimmed below.
    text_string = etree.tostring(text_node, method='text', encoding='utf8')
    text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
    # Python 2: promote the byte strings to unicode.
    text_string = unicode(text_string, 'UTF-8')
    text_xml = unicode(text_xml, 'UTF-8')
    # Drop from the plain text the characters that follow </TEXT>.
    right_chars = len(text_xml.split('</TEXT>')[1])
    text_string = text_string[:-right_chars]
    text_xml = etree.tostring(text_node)
    # StanfordParser strips internally the text :(
    # left_chars compensates character offsets for that stripping.
    left_chars = len(text_string) - len(text_string.lstrip())
    with Mute_stderr():
        stanford_tree = CORENLP.parse(text_string)
    document = Document(file_path)
    document.text_offset = left_chars
    document.file_path = os.path.abspath(file_path)
    document.doc_id = os.path.basename(file_path)
    document.sec_times = self.get_dct(file_path)
    # Document creation time comes from the admission date record.
    document.dct = document.sec_times.admission_date
    document.dct_text = document.dct.replace('-', '')
    document.title = os.path.basename(file_path)
    document.text = text_string
    document._coref = stanford_tree.get('coref', [])
    # Rebuild one Sentence (with Word tokens) per CoreNLP sentence.
    for num_sen, stanford_sentence in\
            enumerate(stanford_tree['sentences']):
        collp_deps = stanford_sentence.get('collapsed_dependencies', None)
        basic_deps = stanford_sentence.get('basic_dependencies', None)
        parsetree = stanford_sentence.get('parsetree', u'')
        sentence_text = stanford_sentence.get('text', u'')
        sentence = Sentence(id_sentence=num_sen,
                            basic_dependencies=basic_deps,
                            collapsed_dependencies=collp_deps,
                            parsetree=parsetree,
                            text=sentence_text)
        for num_word, (word_form, attr) in\
                enumerate(stanford_sentence['words']):
            # Shift offsets left to undo CoreNLP's whitespace stripping.
            offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
            offset_end = int(attr['CharacterOffsetEnd']) - left_chars
            word = Word(word_form=word_form,
                        char_offset_begin=offset_begin,
                        char_offset_end=offset_end,
                        lemma=attr['Lemma'],
                        named_entity_tag=attr['NamedEntityTag'],
                        part_of_speech=attr['PartOfSpeech'],
                        id_token=num_word,
                        id_sentence=num_sen)
            sentence.words.append(word)
        document.sentences.append(sentence)
    document.gold_annotations = self._get_annotations(xml, document)
    document.store_gold_annotations()
    document.complete_structure()
    logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
    return document
def init(cls):
    """Create the initial application Document and route vtk messages to a log file."""
    cls._current_document = Document()
    log = vtk.vtkFileOutputWindow()  # to capture vtk messages
    log.SetFileName("logs/vtk.log")  # file where they are saved
    log.SetInstance(log)  # use this file (install as the vtk output window)
from operator import attrgetter
from model.document import Document
from model.inverted_index import InvertedIndex

# Demo script: build a tiny three-document corpus and print its inverted index.
if __name__ == "__main__":
    document = []
    document.append(Document(
        "1", "Candi Prambanan merupakan salah satu candi yang ada di Indonesia"))
    document.append(
        Document("2", "Indonesia Merupakan Negara Yang Mempunyai Banyak Candi"))
    document.append(Document("3", "Liburan di Negara Berkembang"))
    # iterate doc
    # (removed the original's throwaway `doc = Document()` created right
    # before the loop: it was immediately rebound and never used, and it
    # called the constructor with no arguments unlike every other use)
    for doc in document:
        print(doc.docId + " -> " + doc.content)  # display the document content
    # end iterate
    print("=======")
    print("Terms : ")
    print()
    iIndex = InvertedIndex.toInvertedIndex(document)
    print(iIndex)
def __init__(self, term="", posting=None):
    """Build an index entry for *term*.

    Args:
        term: the indexed term (defaults to the empty string).
        posting: an existing Posting to attach; when omitted, a fresh
            Posting(Document()) is created for this instance.
    """
    # The original default `posting=Posting(Document())` was evaluated once
    # at function-definition time, so every instance constructed with the
    # default shared (and mutated) the same Posting object. Creating it
    # lazily gives each instance its own.
    self.posting = Posting(Document()) if posting is None else posting
    self.term = term
def test_setup_document():
    """A new Document keeps the identifier and data it was built with."""
    doc = Document(1, "data")
    assert doc.identifier == 1
    assert doc.data == "data"