def process(threadName):
    # Poll the database for unprocessed files and hand each one to a FileProcessor.
    while True:
        with open('configurations.json') as f:
            data = json.load(f)
        DATABASEIP = data["DATABASEIP"]
        DB_USER = data["DB_USER"]
        DB_PASSWORD = data["DB_PASSWORD"]
        DATABASE = data["DATABASE"]
        THREAD_SLEEP_TIME = data["THREAD_SLEEP_TIME"]
        DBHandler_ = DBHandler(DATABASEIP, DB_USER, DB_PASSWORD, DATABASE)
        fileList = []
        try:
            fileList = DBHandler_.getFilesToProcess()
            print("Going to process " + str(len(fileList)) + " files")
            try:
                for file_ in fileList:
                    FileProcessor_ = FileProcessor()
                    FileProcessor_.process(file_)
            except Exception as e:
                print("Error in File Processing Thread: " + str(e))
                print(traceback.format_exc())
                DBHandler_.updateFileStatus(file_.FileName, "F")
        except Exception as e:
            print("Error in File Processing Thread: " + str(e))
            print(traceback.format_exc())
        print(threadName + " going to sleep for " + str(THREAD_SLEEP_TIME))
        time.sleep(THREAD_SLEEP_TIME)
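# A minimal sketch of how this polling loop might be started on a background
# thread, using the standard-library threading module; the thread name passed
# in is purely illustrative, not taken from the original project.
import threading

worker = threading.Thread(target=process, args=("FileProcessorThread",), daemon=True)
worker.start()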
def test_FileProcessorCopyFilesTest_Run_EmptyFolder(self):
    inputFolder = r"e:"
    outputFolder = r"d:\Temp\FileProcessorTestOuput"
    processor = FileProcessor(inputFolder, outputFolder)
    processor.Run()
def run(self):
    # Ignore SIGINT and let the parent process take care of this signal
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    while True:
        to_process = self.input.get()
        # Poison pill
        if to_process is None:
            self.log.info("Got poison pill - shutting down")
            break
        # Make a unique output file name
        output_file_name = os.path.join(
            self.output_dir,
            hashlib.md5(to_process).hexdigest() + '.root')
        self.log.info("Processing file %s => %s", to_process, output_file_name)
        try:
            processor = FileProcessor(to_process, self.tree, self.selector,
                                      output_file_name, **self.options)
            result = processor.process()
            self.output.put(result)
        except:
            # If we fail, put a poison pill to stop the merge job.
            self.log.error("Caught exception in worker, killing merger")
            self.output.put(None)
            raise
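# The `None` sentinel above is the classic poison-pill shutdown protocol: the
# producer enqueues one pill per worker after the real work items. A minimal
# sketch of driving such a worker, assuming it is a multiprocessing.Process
# subclass named Worker whose constructor takes the input and output queues
# (both the class name and the constructor signature are assumptions here):
import multiprocessing

def run_workers(paths, num_workers=4):
    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()
    workers = [Worker(input_queue, output_queue) for _ in range(num_workers)]
    for w in workers:
        w.start()
    for path in paths:
        input_queue.put(path)
    # One poison pill per worker so each of them shuts down cleanly.
    for _ in workers:
        input_queue.put(None)
    results = [output_queue.get() for _ in paths]
    for w in workers:
        w.join()
    return results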
def test_naive_bayes(hypothises):
    fp = FileProcessor(test_filepath, ' ')
    parsed_lines = fp.get_lines_as_array()
    results = []
    for row in parsed_lines:
        exclude_label = row[1:]
        max_sum = -float('Inf')
        max_label = -1
        for label in label_mappings:
            label_instance = hypothises[label]
            log_prior = log(label_instance.get_prior(), 2)
            densities = label_instance.get_densities()
            log_sum = 0
            for word in exclude_label:
                log_sum += log(densities[word], 2)
            cur_sum = log_sum + log_prior
            if cur_sum > max_sum:
                max_sum = cur_sum
                max_label = label
        results.append(label_mappings[max_label])
    fp.generate_output(output_filepath, results)
def create_vocabulary():
    fp = FileProcessor(vocabulary_filepath, delimiter)
    parsed_data = fp.get_lines_as_array()
    for row in parsed_data:
        word = row[0]
        frequency = row[1]
        vocabulary[word] = frequency
def generate_labels():
    fp = FileProcessor(testing_data_filepath, ' ')
    rows = fp.parse_input_file()
    expected = []
    for row in rows:
        expected.append(row[0])
    if fp.generate_output(labels_output_filepath, expected):
        return True
def init_corpus_sizes():
    fp = FileProcessor(training_metadata_filepath, '/')
    parsed_lines = fp.get_lines_as_array()
    for row in parsed_lines:
        label = row[0]
        if label in corpus_sizes:
            corpus_sizes[label] += 1
        else:
            corpus_sizes[label] = 1
def init_corpus():
    for label in label_mappings:
        training_filepath = get_training_filepath(label)
        fp = FileProcessor(training_filepath, ' ')
        parsed_lines = fp.get_lines_as_array()
        label_map = {}
        for row in parsed_lines:
            word = row[0]
            frequency = row[1]
            label_map[word] = frequency
        corpus[label] = label_map
def __checkDirectoryTreeCopied(self, destinationFolder, expectedReportForDestination):
    fileLister = FileProcessor(destinationFolder)
    fileLister.Run()
    print(fileLister.Report)
    actualReport = fileLister.Report.splitlines()
    actualReport.sort()
    expectedReportForDestination.sort()
    self.assertEqual(len(expectedReportForDestination), len(actualReport),
                     "fileListReport does not contain the expected number of entries.")
    self.assertListEqual(expectedReportForDestination, actualReport,
                         "fileListReport does not contain the expected values.")
def __testFileProcessorListing(self, inputFolder, expectedReport):
    fileLister = FileProcessor(inputFolder)
    fileLister.Run()
    print(fileLister.Report)
    actualReport = fileLister.Report.splitlines()
    self.assertEqual(
        len(expectedReport), len(actualReport),
        "The Report does not contain the expected number of entries.")
    self.assertListEqual(
        expectedReport, actualReport,
        "The Report does not contain the expected values.")
def process_file(self):
    def progress():
        self.progress.grid(row=1, column=1, sticky='e', padx=50)
        self.progress.start()
        time.sleep(5)
        self.progress.stop()
        self.progress.grid_forget()
        self.btn_search['state'] = 'normal'

    file = tkinter.filedialog.askopenfilename(filetypes=[("MS Excel", "*.xlsx")])
    if file:
        p = FileProcessor(file)
        p.process()
        self.btn_search['state'] = 'disabled'
        threading.Thread(target=progress).start()
def process_data(training_file, testing_file, training_output, testing_output,
                 attribute_descriptors):
    delimiter = ','
    training_file_processor = FileProcessor(training_file, delimiter)
    testing_file_processor = FileProcessor(testing_file, delimiter)
    training_lines = training_file_processor.get_lines_as_array()
    testing_lines = testing_file_processor.get_lines_as_array()
    all_lines = training_lines + testing_lines
    knn_processor = KNNProcess(all_lines, attribute_descriptors)
    imputed_lines = map(lambda line_no: knn_processor.replace_missing_line(line_no),
                        range(0, len(all_lines)))
    normalized_lines = map(lambda line: knn_processor.normalize_line(line), imputed_lines)
    for line_no, line in enumerate(normalized_lines[:len(training_lines)]):
        training_file_processor.set_line(line_no, line)
    for line_no, line in enumerate(normalized_lines[len(training_lines):]):
        testing_file_processor.set_line(line_no, line)
    if training_file_processor.generate_output(training_output) and \
            testing_file_processor.generate_output(testing_output):
        print 'Success!'
def main():
    titlePattern = "(.)*<div\sid=\"title\">(\s)*game(.+)of(.+)thrones(.*)"
    categoryPattern = "<dt>type:</dt>(\s*)<dd><a href=(.+)Video(.*)TV(.*)shows</a></dd>"
    seedCountPattern = "<dt>Seeders:</dt>(\s*)<dd>[1-9][0-9]*</dd>"
    patternList = [titlePattern, categoryPattern, seedCountPattern]
    count = 0
    files = getAllFiles()
    for file in files:
        fileProcessor = FileProcessor(FILE_PATH, file)
        if fileProcessor.checkForContent(patternList, startpos=7000, endpos=11000):
            moveToTarget(file, FILTER_PATH)
            count += 1
        else:
            moveToTarget(file, FAILED_PATH)
    print(count / len(files))
class ContextBrowserUpdater(ControllableThread, Observer):
    logger = logging.getLogger("ContextBrowserUpdater")

    def __init__(self, proxyObject, configObject):
        ControllableThread.__init__(self)
        self.config = configObject
        self.amarok = proxyObject
        self.observe(self.amarok)
        self.observe(self.amarok.player)
        self.observe(self.amarok.contextBrowser)
        self.observe(self.config)
        self.amarok_available_yet = False
        self.update = False
        self.contentUpdaterCode = file(os.path.join(os.path.dirname(__file__),
                                                    "ContextBrowserUpdater.js")).read()
        self.has_injected = 0
        self.fp = FileProcessor(self.amarok, self.config)
        self.manualsearchsite = ""
        self.searchnow = ""
        if self.config["autoSearchSites"] == "":
            self.config["autoSearchSites"] = self.fp.getInitialSites()
        # # remove playlouder (conTEXT 2m)
        # self.config["autoSearchSites"] = re.sub('playlouder.com:(False|True) ', '', self.config["autoSearchSites"])
        self.browser = "kioclient exec"
        self.font = self.tryToReadFontFromKde() or "10pt sans-serif"
        self.fontcolor = self.tryToReadFontColorFromKde() or "0,0,0"
        self.logger.debug("using font \"%s\" and color \"%s\"", self.font, self.fontcolor)
        # self.searchMenuCode = self.searchMenuCode % (imgPath, self.font)
        self.artist = ""
        self.album = ""
        self.url = ""

    def tryToReadFontFromKde(self):
        try:
            if '.kde4/' in __file__:
                kdeconfig = file(re.sub('\.kde4/share.*', '.kde4/share/config/kdeglobals', __file__)).read()
            else:
                kdeconfig = file(re.sub('\.kde/share.*', '.kde/share/config/kdeglobals', __file__)).read()
            s = re.search("\[General\].*?font=(.*?)\n", kdeconfig, re.S)
            if s:
                font = s.group(1).split(',')
                fontName = re.sub('^Sans Serif', 'Sans-Serif', font[0], 0, re.IGNORECASE)
                # don't know why, but it seems we have to add 1pt for these two
                fontSize = (int(font[1]) + 1) if fontName == 'Serif' or fontName == 'Sans-Serif' else font[1]
                font = "%spt %s, sans-serif" % (fontSize, fontName)
                self.logger.debug("...successfully read FONT from kde config")
                return font
            else:
                raise Exception("pattern not found")
        except Exception, e:
            self.logger.debug("Failed to read FONT from kde config: %s", e)
            return False
def process_file(filepath):
    fp = codecs.open(filepath, 'rU', 'iso-8859-2')
    content = fp.read()
    file_processor = FileProcessor(content)
    file_processor.process()
    fp.close()
    print("file name:", filepath)
    print("author:", file_processor.author)
    print("section:", file_processor.section)
    print("keywords:", ", ".join(file_processor.keywords))
    print("number of sentences:", len(file_processor.sentences))
    print("number of abbreviations:", len(file_processor.shortcuts))
    print("number of integers within int range:", len(file_processor.integers))
    print("number of floating-point numbers:", len(file_processor.floats))
    print("number of dates:", len(file_processor.dates))
    print("number of email addresses:", len(file_processor.emails))
    print("\n")
def editing_images(self):
    self.judge_user_input_or_not()
    file_handler = FileProcessor(self.text_input_path.get())
    if not file_handler.data_exist():
        tkMessageBox.showerror(u"Error", u"The sheet is empty!\n")
        return
    name_id_dict = file_handler.parse_sheet()
    for k, v in name_id_dict.iteritems():
        print k, 'maps to', v
    image_handler = ImageProcessor(self.text_output_path.get())
    self.text_status_prompt.set(u"Processing......")
    message = image_handler.add_name_and_id_on_img(name_id_dict)
    self.text_status_prompt.set(u'')
    tkMessageBox.showerror(u"Well done", message)
def modify_files(self, i):
    worker_id = i[0]
    path_folder = self.path + "coded_" + str(worker_id) + '/'
    # print("worker", worker_id, "starts")
    fileprocess = FileProcessor(path_folder + "Map/", self.users,
                                self.num_of_links, self.path)
    fileprocess.create_pair_files('', worker_id)
    fileprocess.collect_file_data()
    return
def __testFileProcessorCopying(self, inputFolder, outputFolder, expectedReport):
    self.__cleanTestOuput()
    fileCopier = FileProcessor(inputFolder, outputFolder)
    fileCopier.Run()
    print(fileCopier.Report)
    actualReport = fileCopier.Report.splitlines()
    self.assertEqual(len(expectedReport), len(actualReport),
                     "fileListReport does not contain the expected number of entries.")
    self.assertListEqual(expectedReport, actualReport,
                         "fileListReport does not contain the expected values.")
    expectedReportForDestination = [os.getcwd() + r'\Tests\TestOuput', ]
    expectedReportForDestination.extend(expectedReport)
    for i, s in enumerate(expectedReportForDestination):
        outputFolderInReport = outputFolder + "\\"
        outputFolderInReport = outputFolderInReport + os.path.split(inputFolder)[1]
        expectedReportForDestination[i] = s.replace(inputFolder, outputFolderInReport)
    expectedReportForDestination.append(os.getcwd() + r'\Tests\TestOuput\Report.txt')
    for i, s in enumerate(expectedReportForDestination):
        expectedReportForDestination[i] = s.replace('OK\t', '')
    self.__checkDirectoryTreeCopied(outputFolder, expectedReportForDestination)
def test_logistic_regression(w):
    if not generate_labels():
        return
    fp = FileProcessor(testing_data_filepath, ' ')
    rows = fp.parse_input_file()
    output = []
    expected = []
    labels = get_labels()
    for row in rows:
        expected.append(row[0])
        row = row[1:]
        sum_val = w[0]
        for feature in row:
            feature_id = int(feature.split(':')[0])
            sum_val += w[feature_id]
        if sigmoid(sum_val) >= 0.5:
            output.append(labels[0])
        else:
            output.append(labels[1])
    if fp.generate_output(output_filepath, output):
        print 'Successfully generated predictions.lr'
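# The test above relies on a sigmoid helper that is not shown in this snippet.
# A minimal sketch of what such a helper usually looks like (the project's own
# definition may differ):
import math

def sigmoid(x):
    # Standard logistic function: maps any real-valued score into (0, 1).
    return 1.0 / (1.0 + math.exp(-x))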
def get_labels():
    fp = FileProcessor(labels_filepath, ' ')
    lines = fp.parse_input_file()
    return [float(lines[0][1]), float(lines[1][1])]
    for value in range(num_of_links)
}
reverse_linkdict = {
    links[value].strip(): value + 1
    for value in range(num_of_links)
}
# print(reverse_linkdict)
os.chdir(folder_path)  # change the working directory to the folder path

# Create an instance of Crawler and pass the user number and links file into it
crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)
# Call crawl_and_createfile to fetch all target links and create a file for each source link
crawler.crawl_and_createfile()

fileprocess = FileProcessor(folder_path, user, num_of_links)
fileprocess.file_filling()
fileprocess.index_value()
# fileprocess.index2pair()
fileprocess.rename()
# rename()
fileprocess.create_pair_files('pair_dir')
# if needed for the shuffle and reduce files
fileprocess.max_len = fileprocess.find_largest()
fileprocess.write_bin_files()

if remapping:
    file_transfer = FileTransfer(users, folder_path, path)
    result, number_result = file_transfer.Mapping()
    links[value].strip(): value + 1
    for value in range(num_of_links)
}
# print(reverse_linkdict)
os.chdir(folder_path)  # change the working directory to the folder path

if re_crawl:
    # Create an instance of Crawler and pass the user number and links file into it
    # Call crawl_and_createfile to fetch all target links and create a file for each source link
    crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)
    crawler.crawl_and_createfile()

if reprocess:
    fileprocess = FileProcessor(folder_path, user, num_of_links, path)
    fileprocess.file_filling()
    fileprocess.index_value()
    # fileprocess.index2pair()
    fileprocess.rename()
    results, number_result, user_list = fileprocess.file_mapping()
    fileprocess.file_changes('pair_dir', user_list)
    # fileprocess.create_pair_files('pair_dir')
    # # if needed for the shuffle and reduce files
    # # fileprocess.max_len = fileprocess.find_largest()
    # fileprocess.write_bin_files()

if remapping:
    file_transfer = FileTransfer(users, folder_path, path)
def learn_logistic_regression():
    fp = FileProcessor(training_data_filepath, ' ')
    training_corpus = fp.parse_input_file()
    return learn_lr_classifier(training_corpus)
import os
import time
import json

notification_producer = NotificationProducer()
notification_payload = {
    "event": NotificationConstants.PROCESSING_STARTED
}
notification_producer.publish(json.dumps(notification_payload))
notification_producer.close_connection()

start = time.time()

file_processor = FileProcessor()
spark_processor = SparkProcessor()
file_index_repository = FileIndexRepository()
lda_topics_description_repository = LdaTopicsDescriptionRepository()

spark_utils = SparkUtils("local[2]", "indexing-script-app")
files_rdd = spark_utils.read_files("hdfs://localhost/pfe/data/save/*/*/*")
files_rdd = files_rdd.map(lambda file: file_processor.process(file)).cache()

# try:
# Machine Learning
files_df = spark_utils.rdd_to_df(files_rdd,
                                 ["url", "file_name", "timestamp", "uuid", "words",
                                  "pre_processed_text", "summary", "most_common",
                                  "thumbnail", "content_type", "content"]).cache()
files_df_ready_for_ml = files_df.select("url", "words")
""" print("Usage: MipLogTool.py logFileName [-F|-S]") print("-S generates statistic of the log file") print("-F \"pattern\" get the records were the pattern matches") def evaluateCommand(args): """ Evaluates the command who should be executed. """ if len(args) < 2: usage() exit() elif len(args) == 2: return BaseCommand() elif args[2] == CmdFilter: if len(args) < 4: usage() exit() return FilterCommad(args[3]) elif args[2] == CmdPrint: return BaseCommand() elif args[2] == CmdStatistic: return StatisticCommand() else: usage() exit() fileProcessor = FileProcessor(sys.argv[1], [evaluateCommand(sys.argv)]) fileProcessor.process()
features = data.shape[1]
for i in range(6, features):
    print 'feature:', i
    plt.hist(data[:, i], bins=100, color='blue', alpha=0.5)
    plt.show()


if __name__ == '__main__':
    path = '../../Data/'

    # Test the file processor
    dirs = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola']
    dirs = [path + d for d in dirs]
    print dirs
    fprocessor = FileProcessor(dirs)
    leaves = [3, 2, 2]
    fprocessor.process(leaves, overwrite=False, parallel=False)

    # Initialise the processor
    path = '../../Data/'
    files = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola.hdf5']
    files = [path + f for f in files]
    print files
    dprocessor = DatasetProcessor(files)

    # Perform cuts in the dataset
    sumcut = 0
def upload_file_s():
    try:
        print("Here in uploader new")
        if request.method == 'POST':
            file = request.files['file']
            pname = request.form['pname']
            lang = request.form['lang']
            if file.filename == '':
                print("file name is empty")
                return redirect(
                    url_for('addfiles.html', message='No selected file'))
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.filename = filename
                print("file", file.filename)
                file.save(
                    os.path.join(app.config['UPLOAD_PATH_PDF'], file.filename))
                processed = "N"
                datetime_now = datetime.datetime.now()
                formatted_date = datetime_now.strftime('%Y-%m-%d')
                projectFile_ = ProjectFile(file.filename, pname, session['user'],
                                           processed, formatted_date,
                                           app.config['UPLOAD_PATH_PDF'], lang)
                dbHandler_ = DBHandler(app.config["DATABASEIP"],
                                       app.config["DB_USER"],
                                       app.config["DB_PASSWORD"],
                                       app.config["DATABASE"])
                dbHandler_.insertFiles(projectFile_)
                if projectFile_.lang == "eng":
                    try:
                        FileProcessor_ = FileProcessor()
                        FileProcessor_.process(projectFile_)
                        dbHandler_.updateFileStatus(projectFile_.FileName, "Y")
                    except Exception as e:
                        dbHandler_.updateFileStatus(projectFile_.FileName, "F")
                        return render_template(
                            'addfiles.html',
                            email=session['user'],
                            projectList=session['projectList'],
                            message="File processing failed due to error: " + str(e))
                return render_template(
                    'addfiles.html',
                    email=session['user'],
                    projectList=session['projectList'],
                    message="File was uploaded successfully and will be processed shortly.")
    except DBError as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
    except Exception as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
def get_vocabulary_size():
    fp = FileProcessor(vocabulary_filepath, ' ')
    lines = fp.parse_input_file()
    return len(lines)
import nltk
import unicodedata
import re
import spacy
import json
from Parser import Parser
from FileProcessor import FileProcessor
from TextPreProcessor import TextPreProcessor

fp = FileProcessor()
p = Parser()
tpp = TextPreProcessor()

file = open("rapport.pdf", "rb").read()
fp.process(("/pfe/sd/sd:sd/sd:sd/", file))

nlp_spacy = spacy.load("fr")
content, content_type = p.parse_file(file)
# words, content_type, content = tpp.preprocess_text("t- - pp les obstacles9 5 0ç à école éllève l'élève qu'affaire FAVEUR. \n\n\t Qu’il a rencontrés", nlp_spacy)
words, content_type, content = tpp.preprocess_text(content, nlp_spacy)

data = {}
data["content"] = content
data["content_type"] = content_type
data["words"] = words

with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
if recrawl:
    # print(len(reverse_linkdict.keys()))
    os.chdir(folder_path)  # change the working directory to the folder path
    print("Start crawling")
    # Create an instance of Crawler and pass the user number and links file into it
    crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)
    # Call crawl_and_createfile to fetch all target links and create a file for each source link
    crawler.crawl_and_createfile(False, False)

if reprocess:
    if not reinit:
        with open(dir + "/res.txt", "r") as f:
            num_of_links = len(f.readlines())
    fileprocess = FileProcessor(folder_path, user, num_of_links, path)
    fileprocess.file_filling()
    fileprocess.index_value()
    fileprocess.rename()

if remapping:
    if mode == 1:
        file_transfer = FileTransfer(user, folder_path, path, num_of_links)
        file_coded_transfer = FileCodedTransfer(user, folder_path, path, num_of_links)
        result, number_result = file_transfer.Mapping()
    else:
        # file_transfer = (user, folder_path, path, num_of_links)
        file_coded_transfer = FileCodedTransfer(user, folder_path, path, num_of_links)
        result, number_result = file_coded_transfer.Mapping()