def process(threadName):
    while True:
        with open('configurations.json') as f:
            data = json.load(f)

        DATABASEIP = data["DATABASEIP"]
        DB_USER = data["DB_USER"]
        DB_PASSWORD = data["DB_PASSWORD"]
        DATABASE = data["DATABASE"]
        THREAD_SLEEP_TIME = data["THREAD_SLEEP_TIME"]
        DBHandler_ = DBHandler(DATABASEIP, DB_USER, DB_PASSWORD, DATABASE)
        fileList = []
        try:
            fileList = DBHandler_.getFilesToProcess()
            print("Going to process" + str(len(fileList)) + "files")
            try:
                for file_ in fileList:
                    FileProcessor_ = FileProcessor()
                    FileProcessor_.process(file_)
            except Exception as e:
                print("Error in File Processing Thread" + str(e))
                print(traceback.format_exc())
                DBHandler_.updateFileStatus(file_.FileName, "F")

        except Exception as e:
            print("Error in File Processing Thread" + str(e))
            print(traceback.format_exc())

        print(threadName + "going to sleep for " + str(THREAD_SLEEP_TIME))
        time.sleep(THREAD_SLEEP_TIME)
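
A minimal sketch of how this polling worker might be launched; the thread name and the daemon flag are assumptions, not part of the original:

import threading

# Hypothetical launcher for the worker loop above (name and daemon flag are assumed).
worker = threading.Thread(target=process, args=("FileProcessingThread",), daemon=True)
worker.start()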
Example #2
    def test_FileProcessorCopyFilesTest_Run_EmptyFolder(self):

        inputFolder = r"e:"
        outputFolder = r"d:\Temp\FileProcessorTestOuput"
                          
        processor = FileProcessor(inputFolder, outputFolder)
        processor.Run()
Example #3
    def run(self):
        # ignore the interrupt signal (SIGINT) and let the parent handle shutdown
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        while True:
            to_process = self.input.get()
            # Poison pill
            if to_process is None:
                self.log.info("Got poison pill - shutting down")
                break

            # Make a unique output file name
            output_file_name = os.path.join(
                self.output_dir,
                hashlib.md5(to_process).hexdigest() + '.root')

            self.log.info("Processing file %s => %s",
                          to_process, output_file_name)

            try:
                processor = FileProcessor(to_process, self.tree, self.selector,
                                          output_file_name, **self.options)
                result = processor.process()
                self.output.put(result)
            except:
                # If we fail, put a poison pill to stop the merge job.
                self.log.error("Caught exception in worker, killing merger")
                self.output.put(None)
                raise
Example #4
def test_naive_bayes(hypothises):
    fp = FileProcessor(test_filepath, ' ')
    parsed_lines = fp.get_lines_as_array()
    results = []
    for row in parsed_lines:
        exclude_label = row[1:]
        max_sum = -float('Inf')
        max_label = -1

        for label in label_mappings:
            label_instance = hypothises[label]
            log_prior = log(label_instance.get_prior(), 2)
            densities = label_instance.get_densities()
            log_sum = 0

            for word in exclude_label:
                log_sum += log(densities[word], 2)
                    
            cur_sum = log_sum + log_prior
            if cur_sum > max_sum:
                max_sum = cur_sum
                max_label = label

        results.append(label_mappings[max_label])

    fp.generate_output(output_filepath, results)
Example #5
def create_vocabulary():
    fp = FileProcessor(vocabulary_filepath, delimiter)
    parsed_data = fp.get_lines_as_array()
    for row in parsed_data:
        word = row[0]
        frequency = row[1]
        vocabulary[word] = frequency
Example #6
def generate_labels():
        fp = FileProcessor(testing_data_filepath, ' ')
        rows = fp.parse_input_file()
        expected = []
        for row in rows:
                expected.append(row[0])
        
        if fp.generate_output(labels_output_filepath, expected):
                return True
Example #7
def init_corpus_sizes():
    fp = FileProcessor(training_metadata_filepath, '/')
    parsed_lines = fp.get_lines_as_array()
    for row in parsed_lines:
        label = row[0]
        if label in corpus_sizes:
            corpus_sizes[label] += 1
        else:
            corpus_sizes[label] = 1
Example #8
def init_corpus():
    for label in label_mappings:
        training_filepath = get_training_filepath(label)
        fp = FileProcessor(training_filepath, ' ')
        parsed_lines = fp.get_lines_as_array()
        label_map = {}
        for row in parsed_lines:
            word = row[0]
            frequency = row[1]
            label_map[word] = frequency
        corpus[label] = label_map
Example #9
    def __checkDirectoryTreeCopied(self, destinationFolder, expectedReportForDestination):
        fileLister = FileProcessor(destinationFolder)
        fileLister.Run()
        print(fileLister.Report)

        actualReport = fileLister.Report.splitlines()
        actualReport.sort()
        expectedReportForDestination.sort()

        self.assertEqual(len(expectedReportForDestination), len(actualReport), "fileListReport does not contain the expected number of entries.")
        self.assertListEqual(expectedReportForDestination, actualReport, "fileListReport does not contain the expected values.")
Example #10
    def __testFileProcessorListing(self, inputFolder, expectedReport):
        fileLister = FileProcessor(inputFolder)
        fileLister.Run()
        print(fileLister.Report)

        actualReport = fileLister.Report.splitlines()
        self.assertEqual(
            len(expectedReport), len(actualReport),
            "The Report does not contain the expected number of entries.")
        self.assertListEqual(
            expectedReport, actualReport,
            "The Report does not contain the expected values.")
Example #11
    def process_file(self):
        def progress():
            self.progress.grid(row=1, column=1, sticky='e', padx=50)
            self.progress.start()
            time.sleep(5)
            self.progress.stop()
            self.progress.grid_forget()
            self.btn_search['state'] = 'normal'

        file = tkinter.filedialog.askopenfilename(filetypes=[("MS Excel",
                                                              "*.xlsx")])
        if file:
            p = FileProcessor(file)
            p.process()
            self.btn_search['state'] = 'disabled'
            threading.Thread(target=progress).start()
Example #12
    def __init__(self,proxyObject,configObject):
        ControllableThread.__init__(self)
        self.config = configObject
        self.amarok = proxyObject
        self.observe(self.amarok)
        self.observe(self.amarok.player)
        self.observe(self.amarok.contextBrowser)
        self.observe(self.config)
        self.amarok_available_yet = False
        self.update = False
        self.contentUpdaterCode = file(os.path.join(os.path.dirname(__file__), "ContextBrowserUpdater.js")).read()
        self.has_injected = 0
        self.fp = FileProcessor(self.amarok, self.config)
        self.manualsearchsite = ""
        self.searchnow = ""
        if self.config["autoSearchSites"] == "":
            self.config["autoSearchSites"] = self.fp.getInitialSites()
            
        # # remove playlouder (conTEXT 2m)
        # self.config["autoSearchSites"] = re.sub('playlouder.com:(False|True) ', '', self.config["autoSearchSites"])

        self.browser = "kioclient exec"
        self.font = self.tryToReadFontFromKde() or "10pt sans-serif"
        self.fontcolor = self.tryToReadFontColorFromKde() or "0,0,0"
        self.logger.debug("using font \"%s\" and color \"%s\"", self.font, self.fontcolor)

        # self.searchMenuCode = self.searchMenuCode % (imgPath, self.font)
        self.artist = ""
        self.album = ""
        self.url = ""
Example #13
def process_data(training_file, testing_file, training_output, testing_output, attribute_descriptors):
    delimiter = ','
    training_file_processor = FileProcessor(training_file, delimiter)
    testing_file_processor = FileProcessor(testing_file, delimiter)
    training_lines = training_file_processor.get_lines_as_array()
    testing_lines = testing_file_processor.get_lines_as_array()
    all_lines = training_lines + testing_lines
    knn_processor = KNNProcess(all_lines, attribute_descriptors)
    imputed_lines = map(lambda line_no: knn_processor.replace_missing_line(line_no), range(0, len(all_lines)))
    normalized_lines = map(lambda line: knn_processor.normalize_line(line), imputed_lines)
    for line_no, line in enumerate(normalized_lines[:len(training_lines)]):
        training_file_processor.set_line(line_no, line)
    for line_no, line in enumerate(normalized_lines[len(training_lines):]):
        testing_file_processor.set_line(line_no, line)
    if training_file_processor.generate_output(training_output) and testing_file_processor.generate_output(testing_output):
        print 'Success!'
Example #14
def main():

    titlePattern    = "(.)*<div\sid=\"title\">(\s)*game(.+)of(.+)thrones(.*)"
    categoryPattern = "<dt>type:</dt>(\s*)<dd><a href=(.+)Video(.*)TV(.*)shows</a></dd>"
    seedCountPattern = "<dt>Seeders:</dt>(\s*)<dd>[1-9][0-9]*</dd>"
    patternList  = [titlePattern, categoryPattern, seedCountPattern]

    count = 0
    files = getAllFiles()
    for file in files:
        fileProcessor = FileProcessor(FILE_PATH, file)

        if fileProcessor.checkForContent(patternList, startpos=7000, endpos=11000):
            moveToTarget(file, FILTER_PATH)
            count += 1
        else:
            moveToTarget(file, FAILED_PATH)

    print(count/len(files))
Example #15
class ContextBrowserUpdater(ControllableThread, Observer):

    logger = logging.getLogger("ContextBrowserUpdater")

    def __init__(self,proxyObject,configObject):
        ControllableThread.__init__(self)
        self.config = configObject
        self.amarok = proxyObject
        self.observe(self.amarok)
        self.observe(self.amarok.player)
        self.observe(self.amarok.contextBrowser)
        self.observe(self.config)
        self.amarok_available_yet = False
        self.update = False
        self.contentUpdaterCode = file(os.path.join(os.path.dirname(__file__), "ContextBrowserUpdater.js")).read()
        self.has_injected = 0
        self.fp = FileProcessor(self.amarok, self.config)
        self.manualsearchsite = ""
        self.searchnow = ""
        if self.config["autoSearchSites"] == "":
            self.config["autoSearchSites"] = self.fp.getInitialSites()
            
        # # remove playlouder (conTEXT 2m)
        # self.config["autoSearchSites"] = re.sub('playlouder.com:(False|True) ', '', self.config["autoSearchSites"])

        self.browser = "kioclient exec"
        self.font = self.tryToReadFontFromKde() or "10pt sans-serif"
        self.fontcolor = self.tryToReadFontColorFromKde() or "0,0,0"
        self.logger.debug("using font \"%s\" and color \"%s\"", self.font, self.fontcolor)

        # self.searchMenuCode = self.searchMenuCode % (imgPath, self.font)
        self.artist = ""
        self.album = ""
        self.url = ""

    def tryToReadFontFromKde(self):
        try:
            if '.kde4/' in __file__:
                kdeconfig = file(re.sub('\.kde4/share.*', '.kde4/share/config/kdeglobals', __file__)).read()
            else:
                kdeconfig = file(re.sub('\.kde/share.*', '.kde/share/config/kdeglobals', __file__)).read()
            s = re.search("\[General\].*?font=(.*?)\n", kdeconfig, re.S)
            if s:
                font = s.group(1).split(',')
                fontName = re.sub('^Sans Serif', 'Sans-Serif', font[0], 0, re.IGNORECASE)
                fontSize = (int(font[1]) + 1) if fontName == 'Serif' or fontName == 'Sans-Serif' else font[1]
                         # don't know why but it seems we have to add 1pt for these two
                font = "%spt %s, sans-serif" % (fontSize, fontName)
                self.logger.debug("...successfully read FONT from kde config")
                return font
            else:
                raise Exception("pattern not found")
        except Exception, e:
            self.logger.debug("Failed to read FONT from kde config: %s", e)
            return False
Example #16
def process_file(filepath):
    fp = codecs.open(filepath, 'rU', 'iso-8859-2')

    content = fp.read()

    file_processor = FileProcessor(content)
    file_processor.process()

    fp.close()
    print("nazwa pliku:", filepath)
    print("autor:", file_processor.author)
    print("dzial:", file_processor.section)
    print("slowa kluczowe:", ", ".join(file_processor.keywords))
    print("liczba zdan:", len(file_processor.sentences))
    print("liczba skrotow:", len(file_processor.shortcuts))
    print("liczba liczb calkowitych z zakresu int:",
          len(file_processor.integers))
    print("liczba liczb zmiennoprzecinkowych:", len(file_processor.floats))
    print("liczba dat:", len(file_processor.dates))
    print("liczba adresow email:", len(file_processor.emails))
    print("\n")
Example #17
def main():

    titlePattern = "(.)*<div\sid=\"title\">(\s)*game(.+)of(.+)thrones(.*)"
    categoryPattern = "<dt>type:</dt>(\s*)<dd><a href=(.+)Video(.*)TV(.*)shows</a></dd>"
    seedCountPattern = "<dt>Seeders:</dt>(\s*)<dd>[1-9][0-9]*</dd>"
    patternList = [titlePattern, categoryPattern, seedCountPattern]

    count = 0
    files = getAllFiles()
    for file in files:
        fileProcessor = FileProcessor(FILE_PATH, file)

        if fileProcessor.checkForContent(patternList,
                                         startpos=7000,
                                         endpos=11000):
            moveToTarget(file, FILTER_PATH)
            count += 1
        else:
            moveToTarget(file, FAILED_PATH)

    print(count / len(files))
Example #18
    def editing_images(self):

        self.judge_user_input_or_not()

        file_handler = FileProcessor(self.text_input_path.get())

        if not file_handler.data_exist():
            tkMessageBox.showerror(u"Error", u"表格为空!\n")
            return

        name_id_dict = dict()
        name_id_dict = file_handler.parse_sheet()

        for k, v in name_id_dict.iteritems():
            print k, 'maps to', v

        image_handler = ImageProcessor(self.text_output_path.get())
        self.text_status_prompt.set(u"正在处理......")

        message = image_handler.add_name_and_id_on_img(name_id_dict)
        self.text_status_prompt.set(u'')
        tkMessageBox.showerror(u"Well done", message)
Example #19
    def modify_files(self, i):
        worker_id = i[0]
        path_folder = self.path + "coded_" + str(worker_id) + '/'
        # print("worker", worker_id, "starts")
        fileprocess = FileProcessor(path_folder + "Map/", self.users, self.num_of_links, self.path)
        fileprocess.create_pair_files('', worker_id)
        fileprocess.collect_file_data()
        return
Example #20
    def __testFileProcessorCopying(self, inputFolder, outputFolder, expectedReport):
        self.__cleanTestOuput()

        fileCopier = FileProcessor(inputFolder, outputFolder)
        fileCopier.Run()
        print(fileCopier.Report)

        actualReport = fileCopier.Report.splitlines()
        self.assertEqual(len(expectedReport), len(actualReport), "fileListReport does not contain the expected number of entries.")
        self.assertListEqual(expectedReport, actualReport, "fileListReport does not contain the expected values.")

        expectedReportForDestination = [os.getcwd() + r'\Tests\TestOuput']
        expectedReportForDestination.extend(expectedReport)
        for i, s in enumerate(expectedReportForDestination):
            outputFolderInReport = outputFolder + "\\"
            outputFolderInReport = outputFolderInReport + os.path.split(inputFolder)[1]
            expectedReportForDestination[i] = s.replace(inputFolder, outputFolderInReport)
        expectedReportForDestination.append(os.getcwd() + r'\Tests\TestOuput\Report.txt')

        for i, s in enumerate(expectedReportForDestination):
            expectedReportForDestination[i] = s.replace('OK\t', '')

        self.__checkDirectoryTreeCopied(outputFolder, expectedReportForDestination)
Example #21
def test_logistic_regression(w):
    if not generate_labels():
        return
    fp = FileProcessor(testing_data_filepath, ' ')
    rows = fp.parse_input_file()
    output = []
    expected = []
    labels = get_labels()

    for row in rows:
        expected.append(row[0])
        row = row[1:]
        sum_val = w[0]
        for feature in row:
            feature_id = int(feature.split(':')[0])
            sum_val += w[feature_id]

        if sigmoid(sum_val) >= 0.5:
            output.append(labels[0])
        else:
            output.append(labels[1])

    if fp.generate_output(output_filepath, output):
        print 'Successfully generated predictions.lr'
Example #22
def get_labels():
        fp = FileProcessor(labels_filepath, ' ')
        lines = fp.parse_input_file()
        return [float(lines[0][1]), float(lines[1][1])]
Example #23
                for value in range(num_of_links)
            }
            reverse_linkdict = {
                links[value].strip(): value + 1
                for value in range(num_of_links)
            }

        # print(reverse_linkdict)
        os.chdir(folder_path)  # change the director for the folder path

        # Created an instance of crawler and pass user number and links file into
        crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)
        # #Call crawl_and_createfile method to get all target links and create file for each source link
        crawler.crawl_and_createfile()

        fileprocess = FileProcessor(folder_path, user, num_of_links)
        fileprocess.file_filling()
        fileprocess.index_value()
        #    fileprocess.index2pair()
        fileprocess.rename()
        # rename()
        fileprocess.create_pair_files('pair_dir')
        # if need for shuffle and reduce file

        fileprocess.max_len = fileprocess.find_largest()
        fileprocess.write_bin_files()

    if remapping:
        file_transfer = FileTransfer(users, folder_path, path)
        result, number_result = file_transfer.Mapping()
Example #24
                links[value].strip(): value + 1
                for value in range(num_of_links)
            }

        # print(reverse_linkdict)
        os.chdir(folder_path)  # change the director for the folder path

    if re_crawl:
        # Created an instance of crawler and pass user number and links file into
        # #Call crawl_and_createfile method to get all target links and create file for each source link
        crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)

        crawler.crawl_and_createfile()

    if reprocess:
        fileprocess = FileProcessor(folder_path, user, num_of_links, path)
        fileprocess.file_filling()
        fileprocess.index_value()
        #    fileprocess.index2pair()
        fileprocess.rename()
        results, number_result, user_list = fileprocess.file_mapping()
        fileprocess.file_changes('pair_dir', user_list)

#        fileprocess.create_pair_files('pair_dir')
#        # if need for shuffle and reduce file
#
#        fileprocess.max_len = fileprocess.find_largest()
#        fileprocess.write_bin_files()

    if remapping:
        file_transfer = FileTransfer(users, folder_path, path)
Example #25
def learn_logistic_regression():
        fp = FileProcessor(training_data_filepath, ' ')
        training_corpus = fp.parse_input_file()
        return learn_lr_classifier(training_corpus)
Example #26
    for i in range(6, features):
        print 'feature:', i
        plt.hist(data[:, i], bins=100, color='blue', alpha=0.5)
        plt.show()


if __name__ == '__main__':
    path = '../../Data/'

    # Test the file processor
    dirs = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola']
    dirs = [path + d for d in dirs]
    print dirs

    fprocessor = FileProcessor(dirs)
    leaves = [3, 2, 2]

    fprocessor.process(leaves, overwrite=False, parallel=False)

    # Initialise the processor
    path = '../../Data/'
    files = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola.hdf5']

    files = [path + f for f in files]
    print files

    dprocessor = DatasetProcessor(files)

    # Perform cuts in the dataset
    sumcut = 0
Example #27
import os
import time
import json


notification_producer = NotificationProducer()
notification_payload = {
    "event": NotificationConstants.PROCESSING_STARTED
}
notification_producer.publish(json.dumps(notification_payload))
notification_producer.close_connection()


start = time.time()

file_processor = FileProcessor()
spark_processor = SparkProcessor()
file_index_repository = FileIndexRepository()
lda_topics_description_repository = LdaTopicsDescriptionRepository()
spark_utils = SparkUtils("local[2]", "indexing-script-app")

files_rdd = spark_utils.read_files("hdfs://localhost/pfe/data/save/*/*/*")
files_rdd = files_rdd.map(lambda file: file_processor.process(file)).cache()

# try:
# Machine Learning
files_df = spark_utils.rdd_to_df(files_rdd,
                                 ["url", "file_name", "timestamp", "uuid", "words", "pre_processed_text", "summary", "most_common", "thumbnail", "content_type", "content"]).cache()

files_df_ready_for_ml = files_df.select("url", "words")
Example #28
  """
  print("Usage: MipLogTool.py logFileName [-F|-S]")
  print("-S generates statistic of the log file")
  print("-F \"pattern\" get the records were the pattern matches")

def evaluateCommand(args):
  """
  Evaluates the command who should be executed.
  """
  if len(args) < 2:
    usage()
    exit()
  elif len(args) == 2:
    return BaseCommand()
  elif args[2] == CmdFilter:
    if len(args) < 4:
      usage()
      exit()
    return FilterCommad(args[3])
  elif args[2] == CmdPrint:
    return BaseCommand()
  elif args[2] == CmdStatistic:
    return StatisticCommand()
  else:
    usage()
    exit()

fileProcessor = FileProcessor(sys.argv[1], [evaluateCommand(sys.argv)])
fileProcessor.process()

Example #29
	features = data.shape[1] 

	for i in range(6, features):
		print 'feature:', i
		plt.hist(data[:, i], bins = 100, color = 'blue', alpha = 0.5)
		plt.show()

if __name__ == '__main__':
	path = '../../Data/'

	# Test the file processor 
	dirs = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola']
	dirs = [path + d for d in dirs]
	print dirs 

	fprocessor = FileProcessor(dirs)
	leaves = [3, 2, 2]

	fprocessor.process(leaves, overwrite = False, parallel = False)

	# Initialise the processor 
	path = '../../Data/'
	files = ['old/ZJetsToNuNu_HT-200to400_Tune4C_13TeV-madgraph-tauola.hdf5']

	files = [path + f for f in files] 
	print files

	dprocessor = DatasetProcessor(files)

	# Perform cuts in the dataset
	sumcut = 0 
Example #30
def upload_file_s():
    try:
        print("Here in uploader new")
        if request.method == 'POST':
            file = request.files['file']
            pname = request.form['pname']
            lang = request.form['lang']

            if file.filename == '':
                print("file name is empty")
                return render_template('addfiles.html',
                                        email=session['user'],
                                        projectList=session['projectList'],
                                        message='No selected file')
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.filename = filename
                print("file", file.filename)

                file.save(
                    os.path.join(app.config['UPLOAD_PATH_PDF'], file.filename))
                processed = "N"
                datetime_now = datetime.datetime.now()
                formatted_date = datetime_now.strftime('%Y-%m-%d')

                projectFile_ = ProjectFile(file.filename, pname,
                                           session['user'], processed,
                                           formatted_date,
                                           app.config['UPLOAD_PATH_PDF'], lang)

                dbHandler_ = DBHandler(app.config["DATABASEIP"],
                                       app.config["DB_USER"],
                                       app.config["DB_PASSWORD"],
                                       app.config["DATABASE"])
                dbHandler_.insertFiles(projectFile_)
                if (projectFile_.lang == "eng"):
                    try:
                        FileProcessor_ = FileProcessor()
                        FileProcessor_.process(projectFile_)
                        dbHandler_.updateFileStatus(projectFile_.FileName, "Y")
                    except Exception as e:
                        dbHandler_.updateFileStatus(projectFile_.FileName, "F")
                        return render_template(
                            'addfiles.html',
                            email=session['user'],
                            projectList=session['projectList'],
                            message="File processing is failed due to error:" +
                            str(e))

                return render_template(
                    'addfiles.html',
                    email=session['user'],
                    projectList=session['projectList'],
                    message=
                    "File uploaded successfully. It will be processed shortly."
                )

    except DBError as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))

    except Exception as e:
        return render_template('addfiles.html',
                               projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing: ' + str(e))
Example #31
def get_vocabulary_size():
        fp = FileProcessor(vocabulary_filepath, ' ')
        lines = fp.parse_input_file()
        return len(lines)
Example #32
import nltk
import unicodedata
import re
import spacy
from Parser import Parser
from FileProcessor import FileProcessor
from TextPreProcessor import TextPreProcessor

import json

fp = FileProcessor()
p = Parser()
tpp = TextPreProcessor()
file = open("rapport.pdf", "rb").read()

fp.process(("/pfe/sd/sd:sd/sd:sd/", file))

nlp_spacy = spacy.load("fr")
content, content_type = p.parse_file(file)
# words, content_type, content = tpp.preprocess_text("t- - pp les obstacles9 5 0ç  à école éllève l'élève qu'affaire FAVEUR. \n\n\t Qu’il a rencontrés", nlp_spacy)
words, content_type, content = tpp.preprocess_text(content, nlp_spacy)
data = {}
data["content"] = content
data["content_type"] = content_type
data["words"] = words

with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
Example #33
    if recrawl:
        # print(len(reverse_linkdict.keys()))
        os.chdir(folder_path)  # change the director for the folder path
        print("Start crawling")
        # Created an instance of crawler and pass user number and links file into
        crawler = Crawler(user, "res.txt", reverse_linkdict, linkdict)
        # #Call crawl_and_createfile method to get all target links and create file for each source link
        crawler.crawl_and_createfile(False, False)

    if reprocess:
        if not reinit:
            with open(dir + "/res.txt", "r") as f:
                num_of_links = len(f.readlines())

        fileprocess = FileProcessor(folder_path, user, num_of_links, path)
        fileprocess.file_filling()
        fileprocess.index_value()
        fileprocess.rename()

    if remapping:
        if mode == 1:
            file_transfer = FileTransfer(user, folder_path, path, num_of_links)
            file_coded_transfer = FileCodedTransfer(user, folder_path, path,
                                                    num_of_links)
            result, number_result = file_transfer.Mapping()
        else:
            # file_transfer = (user, folder_path, path, num_of_links)
            file_coded_transfer = FileCodedTransfer(user, folder_path, path,
                                                    num_of_links)
            result, number_result = file_coded_transfer.Mapping()