import json
import time
import traceback

# DBHandler and FileProcessor are provided elsewhere in this project.


def process(threadName):
    while True:
        with open('configurations.json') as f:
            data = json.load(f)

        DATABASEIP = data["DATABASEIP"]
        DB_USER = data["DB_USER"]
        DB_PASSWORD = data["DB_PASSWORD"]
        DATABASE = data["DATABASE"]
        THREAD_SLEEP_TIME = data["THREAD_SLEEP_TIME"]
        DBHandler_ = DBHandler(DATABASEIP, DB_USER, DB_PASSWORD, DATABASE)
        fileList = []
        try:
            fileList = DBHandler_.getFilesToProcess()
            print("Going to process" + str(len(fileList)) + "files")
            try:
                for file_ in fileList:
                    FileProcessor_ = FileProcessor()
                    FileProcessor_.process(file_)
            except Exception as e:
                print("Error in File Processing Thread" + str(e))
                print(traceback.format_exc())
                # Flag the file that failed with status "F" (failed).
                DBHandler_.updateFileStatus(file_.FileName, "F")

        except Exception as e:
            print("Error in File Processing Thread" + str(e))
            print(traceback.format_exc())

        print(threadName + " going to sleep for " + str(THREAD_SLEEP_TIME))
        time.sleep(THREAD_SLEEP_TIME)
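
A minimal sketch of how this polling loop might be launched; the thread name is illustrative:

import threading

# Run the poller on a daemon thread so it exits together with the main program.
threading.Thread(target=process, args=("FileWorker",), daemon=True).start()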
Example #2
    def process_file(self):
        def progress():
            self.progress.grid(row=1, column=1, sticky='e', padx=50)
            self.progress.start()
            time.sleep(5)
            self.progress.stop()
            self.progress.grid_forget()
            self.btn_search['state'] = 'normal'

        file = tkinter.filedialog.askopenfilename(filetypes=[("MS Excel",
                                                              "*.xlsx")])
        if file:
            p = FileProcessor(file)
            p.process()
            self.btn_search['state'] = 'disabled'
            threading.Thread(target=progress).start()
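
A hedged sketch of the wiring this method assumes; the enclosing frame class and layout are assumptions, only `btn_search`, `progress`, and `process_file` come from the snippet:

import tkinter
import tkinter.filedialog
from tkinter import ttk

class SearchFrame(ttk.Frame):  # hypothetical enclosing class
    def __init__(self, master):
        super().__init__(master)
        self.btn_search = ttk.Button(self, text="Open...",
                                     command=self.process_file)
        self.btn_search.grid(row=0, column=0)
        self.progress = ttk.Progressbar(self, mode="indeterminate")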
Example #3
    def run(self):
        # Ignore SIGINT and let the parent process handle it.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        while True:
            to_process = self.input.get()
            # Poison pill
            if to_process is None:
                self.log.info("Got poison pill - shutting down")
                break

            # Make a unique output file name
            output_file_name = os.path.join(
                self.output_dir,
                # hashlib.md5 needs bytes in Python 3, so encode the path
                hashlib.md5(to_process.encode()).hexdigest() + '.root')

            self.log.info("Processing file %s => %s",
                          to_process, output_file_name)

            try:
                processor = FileProcessor(to_process, self.tree, self.selector,
                                          output_file_name, **self.options)
                result = processor.process()
                self.output.put(result)
            except:
                # If we fail, put a poison pill to stop the merge job.
                self.log.error("Caught exception in worker, killing merger")
                self.output.put(None)
                raise
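
A minimal sketch of how a parent process might feed this worker, assuming `input` is a multiprocessing.Queue and the worker is already started; the file names are illustrative:

import multiprocessing

input_queue = multiprocessing.Queue()
for path in ["events_a.root", "events_b.root"]:
    input_queue.put(path)
input_queue.put(None)  # the poison pill that makes run() break out of its loop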
Example #4
def process_file(filepath):
    # The articles are encoded in ISO-8859-2 (Latin-2).
    with open(filepath, encoding='iso-8859-2') as fp:
        content = fp.read()

    file_processor = FileProcessor(content)
    file_processor.process()

    print("file name:", filepath)
    print("author:", file_processor.author)
    print("section:", file_processor.section)
    print("keywords:", ", ".join(file_processor.keywords))
    print("number of sentences:", len(file_processor.sentences))
    print("number of abbreviations:", len(file_processor.shortcuts))
    print("number of integers within int range:",
          len(file_processor.integers))
    print("number of floating-point numbers:", len(file_processor.floats))
    print("number of dates:", len(file_processor.dates))
    print("number of email addresses:", len(file_processor.emails))
    print("\n")
Example #5
import json

import nltk
import re
import spacy
import unicodedata

from FileProcessor import FileProcessor
from Parser import Parser
from TextPreProcessor import TextPreProcessor

fp = FileProcessor()
p = Parser()
tpp = TextPreProcessor()

# Read the PDF bytes, closing the handle promptly.
with open("rapport.pdf", "rb") as pdf:
    file = pdf.read()

fp.process(("/pfe/sd/sd:sd/sd:sd/", file))

nlp_spacy = spacy.load("fr")
content, content_type = p.parse_file(file)
# words, content_type, content = tpp.preprocess_text("t- - pp les obstacles9 5 0ç  à école éllève l'élève qu'affaire FAVEUR. \n\n\t Qu’il a rencontrés", nlp_spacy)
words, content_type, content = tpp.preprocess_text(content, nlp_spacy)
data = {}
data["content"] = content
data["content_type"] = content_type
data["words"] = words

with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
Example #6
    "event": NotificationConstants.PROCESSING_STARTED
}
notification_producer.publish(json.dumps(notification_payload))
notification_producer.close_connection()


start = time.time()

file_processor = FileProcessor()
spark_processor = SparkProcessor()
file_index_repository = FileIndexRepository()
lda_topics_description_repository = LdaTopicsDescriptionRepository()
spark_utils = SparkUtils("local[2]", "indexing-script-app")

files_rdd = spark_utils.read_files("hdfs://localhost/pfe/data/save/*/*/*")
files_rdd = files_rdd.map(lambda file: file_processor.process(file)).cache()

# try:
# Machine Learning
files_df = spark_utils.rdd_to_df(files_rdd,
                                 ["url", "file_name", "timestamp", "uuid", "words", "pre_processed_text", "summary", "most_common", "thumbnail", "content_type", "content"]).cache()

files_df_ready_for_ml = files_df.select("url", "words")

kmeans_df, bisecting_kmeans_df, (lda_with_count_vectorizer_df, topics_descriptions) = \
    spark_processor.process(files_df_ready_for_ml)

result_df = spark_utils \
    .join_df(files_df, bisecting_kmeans_df, "url",
             ["url", "file_name", "timestamp", "uuid", "pre_processed_text", "summary", "most_common", "thumbnail", "content_type", "content"], ["bisecting_kmeans_prediction"])
result_df = spark_utils \
Example #8
def upload_file_s():
    try:
        print("Here in uploader new")
        if request.method == 'POST':
            file = request.files['file']
            pname = request.form['pname']
            lang = request.form['lang']

            if file.filename == '':
                print("file name is empty")
                return render_template('addfiles.html',
                                       message='No selected file')
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.filename = filename
                print("file", file.filename)

                file.save(
                    os.path.join(app.config['UPLOAD_PATH_PDF'], file.filename))
                processed = "N"
                datetime_now = datetime.datetime.now()
                formatted_date = datetime_now.strftime('%Y-%m-%d')

                projectFile_ = ProjectFile(file.filename, pname,
                                           session['user'], processed,
                                           formatted_date,
                                           app.config['UPLOAD_PATH_PDF'], lang)

                dbHandler_ = DBHandler(app.config["DATABASEIP"],
                                       app.config["DB_USER"],
                                       app.config["DB_PASSWORD"],
                                       app.config["DATABASE"])
                dbHandler_.insertFiles(projectFile_)
                if (projectFile_.lang == "eng"):
                    try:
                        FileProcessor_ = FileProcessor()
                        FileProcessor_.process(projectFile_)
                        dbHandler_.updateFileStatus(projectFile_.FileName, "Y")
                    except Exception as e:
                        dbHandler_.updateFileStatus(projectFile_.FileName, "F")
                        return render_template(
                            'addfiles.html',
                            email=session['user'],
                            projectList=session['projectList'],
                            message="File processing is failed due to error:" +
                            str(e))

                return render_template(
                    'addfiles.html',
                    email=session['user'],
                    projectList=session['projectList'],
                    message="File was uploaded successfully. "
                    "It will be processed shortly.")

    except DBError as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))

    except Exception as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
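
This view presumably sits behind a Flask route; a minimal sketch of the registration, with the URL rule and methods as assumptions:

# Hypothetical wiring; the real URL rule is not shown in the snippet.
app.add_url_rule('/upload', view_func=upload_file_s, methods=['GET', 'POST'])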
Example #9
  """
  print("Usage: MipLogTool.py logFileName [-F|-S]")
  print("-S generates statistic of the log file")
  print("-F \"pattern\" get the records were the pattern matches")

def evaluateCommand(args):
  """
  Evaluates the command who should be executed.
  """
  if len(args) < 2:
    usage()
    exit()
  elif len(args) == 2:
    return BaseCommand()
  elif args[2] == CmdFilter:
    if len(args) < 4:
      usage()
      exit()
    return FilterCommad(args[3])
  elif args[2] == CmdPrint:
    return BaseCommand()
  elif args[2] == CmdStatistic:
    return StatisticCommand()
  else:
    usage()
    exit()

fileProcessor = FileProcessor(sys.argv[1], [evaluateCommand(sys.argv)])
fileProcessor.process()

Example #10
        plt.hist(data[:, i], bins=100, color='blue', alpha=0.5)
        plt.show()


if __name__ == '__main__':
    path = '../../Data/'

    # Test the file processor
    dirs = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola']
    dirs = [path + d for d in dirs]
    print(dirs)

    fprocessor = FileProcessor(dirs)
    leaves = [3, 2, 2]

    fprocessor.process(leaves, overwrite=False, parallel=False)

    # Initialise the processor
    path = '../../Data/'
    files = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola.hdf5']

    files = [path + f for f in files]
    print(files)

    dprocessor = DatasetProcessor(files)

    # Perform cuts in the dataset
    sumcut = 0
    ptcut = 0
    mrcut = 0
    r2cut = 0.09