import json
import time
import traceback

# DBHandler and FileProcessor are project-local classes, imported elsewhere.


def process(threadName):
    """Background worker: poll the database for files to process, then sleep."""
    while True:
        # Reload the configuration on every cycle so changes apply without a restart.
        with open('configurations.json') as f:
            data = json.load(f)
        DATABASEIP = data["DATABASEIP"]
        DB_USER = data["DB_USER"]
        DB_PASSWORD = data["DB_PASSWORD"]
        DATABASE = data["DATABASE"]
        THREAD_SLEEP_TIME = data["THREAD_SLEEP_TIME"]
        DBHandler_ = DBHandler(DATABASEIP, DB_USER, DB_PASSWORD, DATABASE)
        fileList = []
        try:
            fileList = DBHandler_.getFilesToProcess()
            print("Going to process " + str(len(fileList)) + " files")
            try:
                for file_ in fileList:
                    FileProcessor_ = FileProcessor()
                    FileProcessor_.process(file_)
            except Exception as e:
                print("Error in File Processing Thread: " + str(e))
                print(traceback.format_exc())
                # Flag the file that failed so it is not picked up again.
                DBHandler_.updateFileStatus(file_.FileName, "F")
        except Exception as e:
            print("Error in File Processing Thread: " + str(e))
            print(traceback.format_exc())
        print(threadName + " going to sleep for " + str(THREAD_SLEEP_TIME))
        time.sleep(THREAD_SLEEP_TIME)
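# A minimal sketch of launching the poller above on a daemon thread; the
# thread name "FileWorker" is illustrative, not taken from the original code.
import threading

worker = threading.Thread(target=process, args=("FileWorker",), daemon=True)
worker.start()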
def process_file(self):
    def progress():
        # Show an indeterminate progress bar for a few seconds, then
        # restore the layout and re-enable the search button.
        self.progress.grid(row=1, column=1, sticky='e', padx=50)
        self.progress.start()
        time.sleep(5)
        self.progress.stop()
        self.progress.grid_forget()
        self.btn_search['state'] = 'normal'

    file = tkinter.filedialog.askopenfilename(filetypes=[("MS Excel", "*.xlsx")])
    if file:
        # Note: the file is processed synchronously here; only the progress
        # animation runs on the worker thread.
        p = FileProcessor(file)
        p.process()
        self.btn_search['state'] = 'disabled'
        threading.Thread(target=progress).start()
def run(self):
    # Ignore SIGINT and let the parent process take care of this signal.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    while True:
        to_process = self.input.get()
        # A None item is the poison pill that shuts the worker down.
        if to_process is None:
            self.log.info("Got poison pill - shutting down")
            break
        # Make a unique output file name from a hash of the input path.
        output_file_name = os.path.join(
            self.output_dir,
            hashlib.md5(to_process.encode()).hexdigest() + '.root')
        self.log.info("Processing file %s => %s", to_process, output_file_name)
        try:
            processor = FileProcessor(to_process, self.tree, self.selector,
                                      output_file_name, **self.options)
            result = processor.process()
            self.output.put(result)
        except Exception:
            # If we fail, put a poison pill to stop the merge job as well.
            self.log.error("Caught exception in worker, killing merger")
            self.output.put(None)
            raise
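# A minimal sketch of feeding the run() loop above through multiprocessing
# queues; the Worker wiring is assumed here, not taken from the original code.
import multiprocessing

input_queue = multiprocessing.Queue()
output_queue = multiprocessing.Queue()
# worker = Worker(input_queue, output_queue, ...)  # hypothetical constructor
# worker.start()
input_queue.put("events.dat")  # a file for the worker to process
input_queue.put(None)          # poison pill: run() breaks out of its loop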
import codecs

# FileProcessor is a project-local class, imported elsewhere.


def process_file(filepath):
    # Read the file as ISO-8859-2 (Latin-2) text.
    with codecs.open(filepath, 'r', 'iso-8859-2') as fp:
        content = fp.read()
    file_processor = FileProcessor(content)
    file_processor.process()
    print("file name:", filepath)
    print("author:", file_processor.author)
    print("section:", file_processor.section)
    print("keywords:", ", ".join(file_processor.keywords))
    print("number of sentences:", len(file_processor.sentences))
    print("number of abbreviations:", len(file_processor.shortcuts))
    print("number of integers within int range:", len(file_processor.integers))
    print("number of floating-point numbers:", len(file_processor.floats))
    print("number of dates:", len(file_processor.dates))
    print("number of email addresses:", len(file_processor.emails))
    print("\n")
import json

import spacy

from Parser import Parser
from FileProcessor import FileProcessor
from TextPreProcessor import TextPreProcessor

fp = FileProcessor()
p = Parser()
tpp = TextPreProcessor()

# Read the PDF bytes, closing the file handle promptly.
with open("rapport.pdf", "rb") as pdf_file:
    file = pdf_file.read()

fp.process(("/pfe/sd/sd:sd/sd:sd/", file))

nlp_spacy = spacy.load("fr")
content, content_type = p.parse_file(file)
# words, content_type, content = tpp.preprocess_text("t- - pp les obstacles9 5 0ç à école éllève l'élève qu'affaire FAVEUR. \n\n\t Qu’il a rencontrés", nlp_spacy)
words, content_type, content = tpp.preprocess_text(content, nlp_spacy)

data = {}
data["content"] = content
data["content_type"] = content_type
data["words"] = words
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
"event": NotificationConstants.PROCESSING_STARTED } notification_producer.publish(json.dumps(notification_payload)) notification_producer.close_connection() start = time. time() file_processor = FileProcessor() spark_processor = SparkProcessor() file_index_repository = FileIndexRepository() lda_topics_description_repository = LdaTopicsDescriptionRepository() spark_utils = SparkUtils("local[2]", "indexing-script-app") files_rdd = spark_utils.read_files("hdfs://localhost/pfe/data/save/*/*/*") files_rdd = files_rdd.map(lambda file: file_processor.process(file)).cache() # try: # Machine Learning files_df = spark_utils.rdd_to_df(files_rdd, ["url", "file_name", "timestamp", "uuid", "words", "pre_processed_text", "summary", "most_common", "thumbnail", "content_type", "content"]).cache() files_df_ready_for_ml = files_df.select("url", "words") kmeans_df, bisecting_kmeans_df, (lda_with_count_vectorizer_df, topics_descriptions) = \ spark_processor.process(files_df_ready_for_ml) result_df = spark_utils \ .join_df(files_df, bisecting_kmeans_df, "url", ["url", "file_name", "timestamp", "uuid", "pre_processed_text", "summary", "most_common", "thumbnail", "content_type", "content"], ["bisecting_kmeans_prediction"]) result_df = spark_utils \
        # (fragment: inside a loop plotting a histogram for each feature)
        print('feature:', i)
        plt.hist(data[:, i], bins=100, color='blue', alpha=0.5)
        plt.show()


if __name__ == '__main__':
    path = '../../Data/'

    # Test the file processor
    dirs = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola']
    dirs = [path + d for d in dirs]
    print(dirs)
    fprocessor = FileProcessor(dirs)
    leaves = [3, 2, 2]
    fprocessor.process(leaves, overwrite=False, parallel=False)

    # Initialise the processor
    files = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola.hdf5']
    files = [path + f for f in files]
    print(files)
    dprocessor = DatasetProcessor(files)

    # Perform cuts in the dataset
    sumcut = 0
    ptcut = 0
    mrcut = 0
    r2cut = 0.09
def upload_file_s():
    try:
        print("Here in uploader new")
        if request.method == 'POST':
            file = request.files['file']
            pname = request.form['pname']
            lang = request.form['lang']
            if file.filename == '':
                print("file name is empty")
                return render_template('addfiles.html',
                                       email=session['user'],
                                       projectList=session['projectList'],
                                       message='No selected file')
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.filename = filename
                print("file", file.filename)
                file.save(os.path.join(app.config['UPLOAD_PATH_PDF'], file.filename))
                processed = "N"
                datetime_now = datetime.datetime.now()
                formatted_date = datetime_now.strftime('%Y-%m-%d')
                projectFile_ = ProjectFile(file.filename, pname, session['user'],
                                           processed, formatted_date,
                                           app.config['UPLOAD_PATH_PDF'], lang)
                dbHandler_ = DBHandler(app.config["DATABASEIP"],
                                       app.config["DB_USER"],
                                       app.config["DB_PASSWORD"],
                                       app.config["DATABASE"])
                dbHandler_.insertFiles(projectFile_)
                # English files are processed immediately; others are left
                # to the background worker.
                if projectFile_.lang == "eng":
                    try:
                        FileProcessor_ = FileProcessor()
                        FileProcessor_.process(projectFile_)
                        dbHandler_.updateFileStatus(projectFile_.FileName, "Y")
                    except Exception as e:
                        dbHandler_.updateFileStatus(projectFile_.FileName, "F")
                        return render_template(
                            'addfiles.html',
                            email=session['user'],
                            projectList=session['projectList'],
                            message="File processing failed due to error: " + str(e))
                return render_template(
                    'addfiles.html',
                    email=session['user'],
                    projectList=session['projectList'],
                    message="File uploaded successfully. It will be processed shortly.")
    except DBError as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
    except Exception as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
""" print("Usage: MipLogTool.py logFileName [-F|-S]") print("-S generates statistic of the log file") print("-F \"pattern\" get the records were the pattern matches") def evaluateCommand(args): """ Evaluates the command who should be executed. """ if len(args) < 2: usage() exit() elif len(args) == 2: return BaseCommand() elif args[2] == CmdFilter: if len(args) < 4: usage() exit() return FilterCommad(args[3]) elif args[2] == CmdPrint: return BaseCommand() elif args[2] == CmdStatistic: return StatisticCommand() else: usage() exit() fileProcessor = FileProcessor(sys.argv[1], [evaluateCommand(sys.argv)]) fileProcessor.process()