def __parse_documents(datapath):
    """
    function: __parse_documents
    ---------------------------
    extract list of Document objects from token list

    :returns: list of document entities generated by generate_document()
    """
    documents = []

    # generate well-formatted document set for each file
    for file in os.listdir(datapath):
        # open 'reut2-XXX.sgm' file from /data directory
        path = os.path.join(datapath, file)
        with open(path, 'r') as data:
            text = data.read()
        tree = __generate_tree(text.lower())

        # separate segments & generate documents
        # (pairs is reset per file so earlier files are not re-appended)
        pairs = {}
        for reuter in tree.find_all("reuters"):
            document = Document(reuter)
            pairs[document] = reuter

        # generate tokenized word list for each document
        for document, reuter in pairs.items():
            document.populate_word_list(reuter)
            documents.append(document)

        print("Finished extracting information from file:", file)

    return documents
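For context, a minimal sketch of the input this parser assumes: the Reuters-21578 distribution ships files named reut2-000.sgm through reut2-021.sgm, each holding many <REUTERS> elements (lowercased above before find_all("reuters")). The fragment below is illustrative only:

# Hypothetical fragment of a reut2-XXX.sgm file (structure as in the
# Reuters-21578 distribution; the article content here is made up):
sample_sgm = """
<REUTERS TOPICS="YES" NEWID="1">
  <DATE>26-FEB-1987 15:01:01.79</DATE>
  <TOPICS><D>cocoa</D></TOPICS>
  <TEXT>
    <TITLE>SAMPLE COCOA REVIEW</TITLE>
    <BODY>Showers continued throughout the week. Reuter</BODY>
  </TEXT>
</REUTERS>
"""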
def main():
    parser = argparse.ArgumentParser(
        description="Create a document's folder from a bibtex file")
    parser.add_argument('bib', help='the bibtex file of the document')
    # note: the original short option was "-", which argparse rejects;
    # "-o" matches the companion script below
    parser.add_argument('-o', '--path', type=str,
                        help='path of the directory in which to create the document folder')
    args = parser.parse_args()
    bibfile = args.bib
    path = args.path
    doc = Document()
    if doc.readFields(bibfile):
        doc.createDocumentFolder(bibfile, path)
def main():
    parser = argparse.ArgumentParser(
        description='Create a LaTeX notes template from a bibtex file')
    parser.add_argument('bib', help='the bibtex file of the document')
    parser.add_argument('-o', '--path', type=str,
                        help='path of the directory in which to create the document folder')
    args = parser.parse_args()
    bibfile = args.bib
    path = args.path
    doc = Document()
    if doc.readFields(bibfile):
        doc.writeLatexNotes(path)
def parse(self, path):
    """Parse a .txt file and return a corpus.

    Parameters
    ----------
    path : str
        Path of the file to parse

    Raises
    ------
    FileNotFoundError
        If the path to the file does not exist
    PermissionError
        If the file permissions do not allow opening it

    Returns
    -------
    Corpus
        The corpus extracted from the file
    """
    corpusRes = Corpus()
    with open(path, "r", encoding="utf-8") as file:
        txt = file.read()
    regex = r'<article title=\".*?\">\n(.*?)</article>'
    contenus = re.findall(regex, txt, re.DOTALL)
    for content in contenus:
        corpusRes.addDocument(Document(content))
    return corpusRes
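A small, self-contained sketch of the input shape this parse method expects; the sample text is hypothetical:

import re

# Hypothetical input: each <article title="..."> block becomes one Document.
sample = ('<article title="First">\n'
          'Body of the first article.\n'
          '</article>\n'
          '<article title="Second">\n'
          'Body of the second article.\n'
          '</article>\n')
regex = r'<article title=\".*?\">\n(.*?)</article>'
print(re.findall(regex, sample, re.DOTALL))
# ['Body of the first article.\n', 'Body of the second article.\n']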
def parse(self, path):
    """Build a corpus from a .txt file.

    Here the "##END##" marker must appear as a line of its own between two
    documents in the .txt. If no "##END##" marker is present, the corpus
    will consist of a single document whose content is the whole .txt.

    Parameters
    ----------
    path : str
        Path of the file to parse

    Raises
    ------
    FileNotFoundError
        If the path to the file does not exist
    PermissionError
        If the file permissions do not allow opening it

    Returns
    -------
    Corpus
        The corpus extracted from the file passed as argument
    """
    corpusRes = Corpus()
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()
    listedoc = re.split('^##END##$', txt, flags=re.MULTILINE)
    for contenu in listedoc:
        corpusRes.addDocument(Document(contenu))
    return corpusRes
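The same split in isolation, showing how the "##END##" line delimits documents; the sample string is hypothetical:

import re

sample = "First document.\n##END##\nSecond document.\n"
# "##END##" must occupy a whole line, hence the ^...$ anchors with MULTILINE.
print(re.split('^##END##$', sample, flags=re.MULTILINE))
# ['First document.\n', '\nSecond document.\n']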
def get_candidate_fragments_from_web(fragments: List[Fragment],
                                     **params) -> List[FragmentReport]:
    fragments_url_snippet_pairs = get_url_snippet_pairs(fragments, params)
    # create reports
    reports = []
    url_doc = {}
    user_login, _ = get_current_user()
    for i in range(len(fragments)):
        similar_fragments = []
        for url, snippet in fragments_url_snippet_pairs[i]:
            if url not in url_doc:
                doc = Document(id=None,
                               uri=url,
                               snippet=snippet,
                               date_added=datetime.datetime.now(),
                               user_login=user_login)
                url_doc[url] = doc
            web_doc_fragments = url_doc[url].get_fragments()
            similar_fragments.extend([(wb_fragment, 1.0)
                                      for page in web_doc_fragments
                                      for wb_fragment in page])
        reports.append(
            FragmentReport(checked_fragment=fragments[i],
                           most_similar=similar_fragments))
    return reports
def test_uploading_content_to_index(self):
    # Search for term, add similar content and check that hits increase
    doc = Document("Test project", "Test customer", "test_file")
    limit = 999999999999999999  # effectively "no limit" for the search
    hits = len(docfuncs.search("test", limit))
    docfuncs.upload_to_index(
        doc, "src/tests/test_upload_files/one_page_test_file.pdf")
    self.assertEqual(len(docfuncs.search("test", limit)), hits + 1)
def get_all_documents_from_db():
    """
    Returns: All document objects from the database
    """
    conn = db_conn.get_database_connection()
    cur = conn.cursor()
    cur.execute("SELECT * FROM documents")
    doc_records = cur.fetchall()
    doc_list = []
    for doc in doc_records:
        # Construct Document object with params: project, customer, file
        document = Document(doc[1], doc[2], doc[3])
        document.set_doc_id(doc[0])
        doc_list.append(document)
    cur.close()
    return doc_list
def get_document_from_db(doc_id):
    """
    Args:
        doc_id: Document id to be retrieved from the database

    Returns: Document object given the id
    """
    conn = db_conn.get_database_connection()
    cur = conn.cursor()
    cur.execute("SELECT * FROM documents WHERE id=?", (doc_id, ))
    doc_query_object = cur.fetchall()[0]
    # Construct Document object with params: project, customer, file
    doc = Document(doc_query_object[1], doc_query_object[2],
                   doc_query_object[3])
    doc.set_doc_id(doc_query_object[0])
    cur.close()
    return doc
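The positional indices doc[0]..doc[3] in the two helpers above imply a four-column table; below is a minimal sqlite3 sketch of that assumed schema (the column names are guesses based on the Document constructor):

import sqlite3

conn = sqlite3.connect(":memory:")
# Assumed layout: id, project, customer, file (matching indices 0..3 above).
conn.execute("CREATE TABLE documents ("
             "id INTEGER PRIMARY KEY, project TEXT, customer TEXT, file TEXT)")
conn.execute("INSERT INTO documents (project, customer, file) "
             "VALUES (?, ?, ?)",
             ("Test project", "Test customer", "test_file.pdf"))
row = conn.execute("SELECT * FROM documents WHERE id=?", (1,)).fetchone()
print(row)  # (1, 'Test project', 'Test customer', 'test_file.pdf')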
def test_upload_metadata_to_db(self):
    # Test should return id 3 when a new document is inserted
    # Afterwards the inserted row is removed
    doc = Document("Test project", "Test customer", "test_file.xyz")
    self.assertEqual(docfuncs.upload_document_to_db(doc), 3)
    conn = database_connect.get_database_connection()
    cur = conn.cursor()
    sql = '''DELETE FROM documents WHERE id = 3 '''
    cur.execute(sql)
    conn.commit()
    cur.close()
def upload_similar_docs_from_web(fragments: List[Fragment], **params):
    fragments_url_snippet_pairs = get_url_snippet_pairs(fragments, params)
    for url, snippet in set(
            itertools.chain.from_iterable(fragments_url_snippet_pairs)):
        doc = Document(id=None,
                       uri=url,
                       snippet=snippet,
                       date_added=datetime.datetime.now(),
                       user_login=params["user_login"])
        Documents.add_document(doc)
def function_to_debug():
    doc_a = Document("DocA", [
        Feature("f0", 10),
        Feature("f1", 5),
        Feature("f2", 10),
        Feature("f3", 20),
        Feature("f4", 10),
        Feature("f5", 10),
        Feature("f6", 10)
    ])
    doc_b = Document("DocB", [
        Feature("f0", 10),
        Feature("f3", 5),
        Feature("f5", 8),
        Feature("f6", 1)
    ])
    m = Match(doc_a, doc_b)
    m.elaborated_match_v2()
    sys.exit(0)
def _pass1(self, thread_no, queue):
    while True:
        logging.debug("INDEXER-P1-THREAD-%d: Waiting for next document" %
                      thread_no)
        document_path = queue.get()  # blocks until a document is available

        document = Document.from_file(document_path)
        if document is None:
            logging.warning("INDEXER-P1-THREAD-%d: Document %s contains "
                            "invalid format" % (thread_no, document_path))
            queue.task_done()
            continue

        logging.debug("INDEXER-P1-THREAD-%d: Processing '%s'" %
                      (thread_no, document.title))

        # Assign an ID to the document
        with self._ifile_lock:
            document.document_id = int(document.document_number)
            self._corpus[document.document_id] = document

            # For similar context index
            if self._full_text is not None:
                cleaned_text = []
                for word in document.text.split(" "):
                    if word.lower() not in constants.DO_NOT_INDEX:
                        cleaned_text.append(word)
                self._full_text = self._full_text + " " + " ".join(
                    cleaned_text)

        # Tokenize
        tokens = re.compile(constants.DELIMITERS).split(document.text)
        tokens_title = re.compile(constants.DELIMITERS).split(document.title)
        tokens_author = re.compile(constants.DELIMITERS).split(
            document.author)
        tokens_biblio = re.compile(constants.DELIMITERS).split(
            document.biblio)

        # Insert tokens in inverted files
        for token in tokens:
            self._pass1_process_token(document.document_id, token)
        for token in tokens_title:
            self._pass1_process_token(document.document_id, token)
        for token in tokens_author:
            self._pass1_process_token(document.document_id, token)
        for token in tokens_biblio:
            self._pass1_process_token(document.document_id, token)

        queue.task_done()
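The delimiter pattern constants.DELIMITERS is not shown in the snippet; a plausible stand-in and the resulting split, for illustration only:

import re

# Hypothetical delimiter pattern standing in for constants.DELIMITERS,
# which is defined elsewhere in the indexer's codebase.
DELIMITERS = r"[\s.,;:!?()\[\]\"']+"
print(re.compile(DELIMITERS).split("A sample title, with punctuation."))
# ['A', 'sample', 'title', 'with', 'punctuation', '']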
def save_file(project, customer, file, long_file_name):
    """
    Processes the server functions of the file upload request.
    Args:
        project: project name for the new Document
        customer: customer name for the new Document
        file: file name for the new Document
        long_file_name: OS path to the file to be stored on the server and
            extracted into the search index.
    """
    document = Document(project, customer, file)
    document.document_id = upload_document_to_db(document)
    save_path = DOCUMENT_FILEPATH
    filename = os.path.join(save_path, str(document.document_id))
    with open(long_file_name, "r+b") as source, \
            open(filename, "w+b") as newfile:
        newfile.write(source.read())
    upload_to_index(document, long_file_name)
docs = set()
dictStatDoc[q] = {}
dictStatFeat[q] = {}
results = collection.find({'query': q}, {'_id': 0, 'docs': 1})
count = 0
for i in results:
    for d in i['docs']:
        count += 1
        name = d['doc_name']
        list_feat = {}
        for f in d['features']:
            list_feat[f] = Feature(f, d['features'][f])
        list_doc[name] = Document(name, list_feat)
for p in dictQRels[q]['pertinent']:
    if p in list_doc:
        rankedFeat_p = sorted(list_doc[p].features.values(),
                              key=attrgetter('value'), reverse=True)
        for np in dictQRels[q]['nonpertinent']:
            if np in list_doc:
                rankedFeat_np = sorted(list_doc[np].features.values(),
                                       key=attrgetter('value'), reverse=True)
                val = compareFeatures(rankedFeat_p, rankedFeat_np, k)
                if val == 1:
                    nb_p_kdomine += 1
                nb_match_total += 1
model_path="data/models/sentencepiece/", train_text_files=dmgr.builder.list_source_files( dmgrs=['TEXT_BOOK', 'TEXT_BOOK_LOW', 'TEXT_WEB']), sample_count= 30000000, # Total sentence count in my corpus is about 80M(80000000) config=config, ) # Dataset build (using modules) print("Building dataset") dmgr.builder.build_all([ "TEXT_BOOK", "TEXT_BOOK_LOW", "TEXT_WEB", "TEXT_WIKI", "TEXT_NAMUWIKI", "TEXT_NEWS_COMMENT" ], config) dmgr.builder.build_all(["PARALLEL_KO_KO"], config) dmgr.builder.build_all(["TEXT_BERT"], config) dmgr.builder.build_all(["NER"], config) # Module train / load config = embedder.bert.initialize(model_path="data/models/bert", train_dataset="TEXT_BERT", config=config) config = ner.bert_ner.initialize(model_path="data/models/bert_ner", train_dataset="NER", config=config) # depends on bert module # Generate report doc = Document("data/datasets/TEXT_BOOK.json", config=config) reporter.report_to_file(doc)
def createFolderFromBibfile(self):
    doc = Document()
    doc.readFields("article.bib")
    doc.createDocumentFolder()
import directoryparser as dirparser  # renamed to avoid shadowing builtin dir()
import drinks_parser as dp
from document.document import Document
from document.constants import *
from csv import CSV_Writer

drink_directory = "../data/drinks/"
pub_directory = "../pub/"
print_dict = dict()
docs = []

if __name__ == '__main__':
    for drink_file in dirparser.get_drink_files(drink_directory):
        docs.append(dp.get_drink_dict(drink_file))

    # To make headlines country->brewery->drinks->review where
    # country, brewery, drink is unique:
    # {"country": {"brewery": [{drink_name: doc}]}}
    print_dict = dp.order_drinks(docs)

    main_file = Document(pub_directory, js_header)
    main_file.create_content(print_dict)
    main_file.save_basic_file("provningar_main.org")

    one_file = Document(pub_directory, header)
    one_file.create_content(print_dict)
    one_file.save_basic_file("provningar_en_fil.org")

    csv_writer = CSV_Writer(pub_directory, docs)
    csv_writer.write_lines("csv_file.csv")
def main():
    # Handle user options
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "dht:i:l:s:n:r:f:vc:b:g:am:o:p:x:w:y:z:", [
                "debug", "help", "type=", "impact=", "life=", "strategy=",
                "nbFeats=", "rounds=", "featureList=", "verbose",
                "collection=", "group=", "accepted", "model=", "optim=",
                "process=", "cross=", "best=", "boost=", "alpha=", "topX="
            ])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    # Set the default values
    optim = "order"
    debug = False
    nb_rounds = 10
    nb_groups = 10
    nb_documents = 16
    nbFeats = 0
    group = 1
    type_tournament = "robin"
    collection_name = 'trec_adhoc_lee'
    output_directory = "/osirim/sig/PROJET/PRINCESS/results/princess/"
    impact = 0
    boost = "undifferentiated"  # or "upper" or "seed"
    alpha = 3  # valid if boost is "upper" or "seed"
    topx = 20  # valid if boost is "seed"
    features_to_remove = []
    process = 100
    life = 0
    best = 0.1
    accepted = False
    verbose = False
    model = "f45"
    fold = -1
    step = "training"  # or "test"
    queriesToProcess = []
    # Default strategy: an ordered feature list
    strategy = [
        'f48', 'f17', 'f19', 'f16', 'f46', 'f9', 'f21', 'f3', 'f39', 'f7',
        'f40', 'f6', 'f37', 'f42', 'f2', 'f15', 'f25', 'f33', 'f36', 'f10',
        'f30', 'f51', 'f28', 'f43', 'f45', 'f34', 'f24', 'f13', 'f50', 'f27',
        'f31', 'f1', 'f35', 'f14', 'f47', 'f41', 'f4', 'f22', 'f12', 'f8',
        'f26', 'f44'
    ]

    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-a", "--accepted"):
            accepted = True
        elif o in ("-o", "--optim"):
            optim = a
        elif o in ("-d", "--debug"):
            debug = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-t", "--type"):
            type_tournament = a
        elif o in ("-g", "--group"):
            group = int(a)
        elif o in ("-f", "--featureList"):
            features_to_remove = str(a).split(",")
        elif o in ("-r", "--rounds"):
            nb_rounds = int(a)
        elif o in ("-m", "--model"):
            model = str(a)
        elif o in ("-p", "--process"):
            process = int(a)
        elif o in ("-b", "--best"):
            best = float(a)
        elif o in ("-c", "--collection"):
            collection_name = a
        elif o in ("-i", "--impact"):
            impact = int(a)
        elif o in ("-l", "--life"):
            life = int(a)
        elif o in ("-n", "--nbFeats"):
            nbFeats = int(a)
        elif o in ("-s", "--strategy"):
            strategy = int(a)
        elif o in ("-w", "--boost"):
            boost = str(a)
        elif o in ("-y", "--alpha"):
            alpha = int(a)
        elif o in ("-z", "--topX"):
            topx = int(a)
        elif o in ("-x", "--cross"):
            fold = a
            if '-' in fold:
                step = "training"
                fold = -int(a)
            else:
                step = "test"
        else:
            assert False, "unhandled option"

    # load appropriate queries for this run
    if "web" in collection_name:
        output_directory += "web2014/" + str(fold) + "/"
        with open("/osirim/sig/PROJET/PRINCESS/queries/web2014/folds/" +
                  str(fold) + ".txt", "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())
    elif "robust" in collection_name:
        output_directory += "robust2004/" + str(fold) + "/"
        with open("/osirim/sig/PROJET/PRINCESS/queries/robust2004/folds/" +
                  str(fold) + ".txt", "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())
    else:
        output_directory += collection_name + "/" + str(fold) + "/"
        with open("/osirim/sig/PROJET/PRINCESS/queries/" +
                  collection_name.lower() + "/folds/" + str(fold) + ".txt",
                  "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())

    # One tournament per query
    connection = MongoClient(host='co2-ni01.irit.fr', port=28018)
    db = connection.princess
    collection = db[collection_name.lower()]
    queries = collection.distinct('query')

    if debug:
        evaluateQRels(collection_name)

    outputFolderName = ''
    if step == "training":
        output_directory += "training/"
    else:
        output_directory += "test/"

    if len(features_to_remove) > 0:
        outputFolderName = ('t:' + type_tournament + '-o:' + optim + '-r:' +
                            str(nb_rounds) + '-b:' + str(best) + '-c:' +
                            collection_name + '-i:' + str(impact) + '-l:' +
                            str(life) + '-n:' + str(nbFeats) + '-s:' +
                            str(strategy) + '-g:' + str(group) + '-f:' +
                            ','.join(features_to_remove))
        if accepted:
            outputFolderName += '-a'
    else:
        outputFolderName = ('t:' + type_tournament + '-o:' + optim + '-r:' +
                            str(nb_rounds) + '-b:' + str(best) + '-c:' +
                            collection_name + '-i:' + str(impact) + '-l:' +
                            str(life) + '-n:' + str(nbFeats) + '-s:' +
                            str(strategy) + '-g:' + str(group))
    outputFolderName += '/'
    output_directory += outputFolderName

    if os.path.exists(output_directory):
        os.system("rm -r " + output_directory)
    secure_mkdir(output_directory)
    os.system("rm " + output_directory + "*")

    print("Nb process", process)
    begin = time.time()
    for q in queries:
        # In training, process every query that is NOT in the fold file;
        # in test, process only the queries that ARE in the fold file.
        if step == "training":
            processQuery = q not in queriesToProcess
        else:
            processQuery = q in queriesToProcess

        if processQuery:
            print("Query " + q)
            deb = time.time()
            docsToCompete = []
            if "indri" not in collection_name:
                docsToCompete = loadDocsToCompete(collection_name, q)
            dictQRels.setdefault(q, {})
            qstr = str(q)
            results = collection.find({'query': qstr}, {'_id': 0, 'docs': 1})
            count = 0
            list_doc = []
            for i in results:
                for d in i['docs']:
                    count += 1
                    name = d['doc_name']
                    if len(docsToCompete) == 0 or name in docsToCompete:
                        list_feat = {}
                        for f in d['features']:
                            list_feat[f] = Feature(f, d['features'][f])
                        if model not in list_feat:
                            list_feat[model] = Feature(model, 0.0)
                        list_doc.append(Document(name, list_feat))

            colName = collection_name.lower() + "_std"
            collection_std = db[colName]
            res = collection_std.find({'query': str(q)}, {'_id': 0})
            listStd = res[0]['stds']

            if type_tournament == "robin":
                to = RoundRobin(query=q, impact=impact, health=life,
                                nbFeat=nbFeats, strategy=strategy,
                                nbRound=nb_rounds,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q], accepted=accepted,
                                optim=optim, listStd=listStd, process=process)
            elif type_tournament == "return":
                to = RoundRobinReturnMatch(query=q, impact=0, health=life,
                                           nbFeat=nbFeats, strategy=strategy,
                                           nbRound=nb_rounds,
                                           featsToRemove=features_to_remove,
                                           accepted=accepted, optim=optim,
                                           listStd=listStd)
            elif type_tournament == "swiss":
                to = SwissSystem(query=q, impact=impact, health=life,
                                 nbFeat=nbFeats, strategy=strategy,
                                 nbRound=nb_rounds,
                                 featsToRemove=features_to_remove,
                                 accepted=accepted, optim=optim,
                                 listStd=listStd, process=process)
            elif type_tournament == "random":
                to = RandomTournament(query=q, impact=impact, health=life,
                                      nbFeat=nbFeats, strategy=strategy,
                                      nbRound=nb_rounds,
                                      featsToRemove=features_to_remove,
                                      qrel=dictQRels[q], accepted=accepted,
                                      optim=optim, listStd=listStd)
            elif type_tournament == "grouprobin":
                to = GroupStage(query=q, impact=impact, health=life,
                                nbFeat=nbFeats, strategy=strategy,
                                nbGroups=group,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q], best=best,
                                accepted=accepted, optim=optim,
                                listStd=listStd)
            elif type_tournament == "grouprobinoptim":
                to = GroupStageOptim(query=q, impact=impact, health=life,
                                     nbFeat=nbFeats, strategy=strategy,
                                     nbGroups=group,
                                     featsToRemove=features_to_remove,
                                     qrel=dictQRels[q], best=best,
                                     accepted=accepted, model=model,
                                     optim=optim, listStd=listStd,
                                     process=process)
            elif type_tournament == "groupswiss":
                to = GroupSwiss(query=q, impact=impact, health=life,
                                nbFeat=nbFeats, strategy=strategy,
                                nbGroups=group, nbRound=nb_rounds,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q], best=best,
                                accepted=accepted, optim=optim,
                                listStd=listStd, process=process)
            elif type_tournament == "groupswissoptim":
                to = GroupSwissOptim(query=q, impact=impact, health=life,
                                     nbFeat=nbFeats, strategy=strategy,
                                     nbGroups=group, nbRound=nb_rounds,
                                     featsToRemove=features_to_remove,
                                     qrel=dictQRels[q], best=best,
                                     accepted=accepted, model=model,
                                     optim=optim, listStd=listStd,
                                     process=process)
            elif type_tournament == "seed":
                to = Seed(query=q, impact=impact, health=life, nbFeat=nbFeats,
                          strategy=strategy, nbRound=nb_rounds,
                          featsToRemove=features_to_remove, qrel=dictQRels[q],
                          accepted=accepted, model=model, optim=optim,
                          listStd=listStd)
            elif type_tournament == "upper":
                to = Upper(query=q, impact=impact, health=life,
                           nbFeat=nbFeats, strategy=strategy,
                           nbRound=nb_rounds,
                           featsToRemove=features_to_remove,
                           qrel=dictQRels[q], accepted=accepted, model=model,
                           optim=optim, listStd=listStd)

            print("setCompetitors")
            to.setCompetitors(list_doc)
            print("runCompetition")
            to.runCompetition()
            print("printResults")
            to.printResults(output_directory)
            print("Query processing time:", (time.time() - deb), "sec")

    print("[ n=", process, type_tournament, "] total time:",
          (time.time() - begin), "sec")
    with open(output_directory + "completed.txt", "w") as f:
        f.write("completed!!")
def test_create_new_document(self):
    doc = Document("Test", "Customer", "testfile.PDF")
    self.assertEqual(doc.document_id, None)
    self.assertEqual(doc.project, "Test")
    self.assertEqual(doc.customer, "Customer")
    self.assertEqual(doc.file, "testfile.PDF")
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), "../"))

import pandas as pd

from document.document import Document
from formatter import simple as formatter
from tokenizer import simple as tokenizer

test = {}

doc = Document(formatter=formatter, tokenizer=tokenizer,
               file="./data/test.txt")
df = pd.DataFrame([line for line in doc])
df.to_pickle("./data/test_document_simple.gz", compression="gzip")
test["simple"] = df.equals(
    pd.read_pickle("./data/test_document_simple.gz", compression="gzip"))

print(test)
def test_set_document_id(self):
    doc = Document("Test", "Customer", "testfile.PDF")
    doc.set_doc_id(1)
    self.assertEqual(doc.document_id, 1)
    doc.set_doc_id(2)
    self.assertEqual(doc.document_id, 2)