예제 #1
0
def readExcel(filePath, url, ontologyData):
    """Fetch an Excel workbook and run entity extraction on its text cells.

    ``urr(url, filePath)`` is called first (presumably a urlretrieve-style
    download -- confirm against the file's imports), then ``filePath`` is
    opened with ``xlrd``. For every text cell of every worksheet the cell
    value is split with ``comm.replaceToPunkts`` and each resulting item is
    passed to ``getEntities.getEntities`` together with ``url`` and
    ``ontologyData``.

    Any ``Exception`` raised while fetching or parsing is reported through
    ``comm.printException`` and then suppressed. ``KeyboardInterrupt`` and
    ``SystemExit`` are intentionally not caught, so the process can still be
    interrupted while a workbook is being processed.

    Args:
        filePath: Local path the workbook is written to and then read from.
        url: Source URL of the workbook; also included in the error report.
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.

    Returns:
        None.
    """
    # xlrd cell type codes: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean,
    # 5=Error, 6=Blank. Only text cells are analysed.
    text_cell_type = 1
    try:
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            # range() visits exactly rows 0..nrows-1 and columns 0..ncols-1,
            # and naturally does nothing for an empty sheet.
            for row_idx in range(worksheet.nrows):
                for col_idx in range(worksheet.ncols):
                    if worksheet.cell_type(row_idx, col_idx) != text_cell_type:
                        continue
                    cell_value = worksheet.cell_value(row_idx, col_idx)
                    for sentence in comm.replaceToPunkts(cell_value):
                        getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # Best effort: one broken workbook must not stop the caller, but the
        # failure is recorded together with the offending URL.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
예제 #2
0
        for fname in rdfFnames:
            # rdf URL in worker machine
            # /var/www/html/outputf/<ip>/rdf_files/<date_dir>/<fname>.rdf
            # /var/www/html/outputf/<ip>/rdf_files/<date_dir>/LOC.rdf
            # /var/www/html/outputf/<ip>/rdf_files/<date_dir>/ORG.rdf
            # /var/www/html/outputf/<ip>/rdf_files/<date_dir>/PER.rdf
            inputf = remote_date_dir_path + "/" + fname + ".rdf"
            # create
            # rdf dirs' paths in master machine:
            # /var/www/html/rdf_files/<fname>/
            # e.g.
            # /var/www/html/rdf_files/LOC/
            # /var/www/html/rdf_files/ORG/
            # /var/www/html/rdf_files/PER/
            outputDir = comm.pathToRDFdir + fname + "/"
            if not os.path.isdir(outputDir):  # make dirs ORG, PER, LOC in master's dir 'rdf_files'
                os.makedirs(outputDir)

            # rdf file path in master machine:
            # /var/www/html/rdf_files/<fname>/<remoteName>_<date_dir>_.rdf
            # e.g.
            # /var/www/html/rdf_files/ORG/worker1_08_2015_09_10_.rdf
            # old was: output = outputDir + remoteName + ".rdf"
            output = outputDir + remoteName + "_" + date_dir + "_" + ".rdf"
            try:
                urr(inputf, output)
            except:
                pass

    mergeRDFfiles()
예제 #3
0
def dowloadFromJsons(ajadir):
    """Download the files described by the JSON metadata for one time directory.

    Every entry of ``comm.jsonsDir`` except ``errors.txt`` is parsed as JSON;
    entries that cannot be opened or parsed are skipped silently. In each
    parsed object, ``'base_url'`` names the target sub-folder and every other
    top-level key (the sha of a file URL, according to the original comments)
    maps content shas to metadata dicts.

    A file is fetched with ``urr`` when its metadata
      * contains ``'Content-Type'``,
      * has a ``'timeDir'`` equal to ``ajadir`` (only that day's changes), and
      * has a ``'Content-Type'`` that does not contain ``"excel"`` (the
        original comment states excel files are already downloaded elsewhere).

    The file is stored as ``comm.downloadsDir + base_url + "/" + <top-level key>``.
    Failures while creating the folder or fetching are reported through
    ``comm.printException`` and do not stop the loop.

    Args:
        ajadir: Time-directory value; only metadata whose ``'timeDir'`` equals
            it is downloaded.

    Returns:
        The number of ``urr`` calls that completed without raising.

    Raises:
        KeyError: If a parsed JSON object has no ``'base_url'``, if metadata
            that has ``'Content-Type'`` lacks ``'timeDir'``, or if metadata
            selected for download lacks ``'file_url'``.
    """
    nrOfDownloads = 0
    jsons = comm.jsonsDir
    for filePath in listdir(jsons):
        if filePath == "errors.txt":
            continue
        try:
            # 'with' guarantees the handle is closed even if parsing fails.
            with open(os.path.join(jsons, filePath)) as jsonFile:
                jsonToDict = json.load(jsonFile)
        except Exception:
            # Unreadable or non-JSON entries are skipped without logging.
            continue

        # 'base_url' is the host name; it becomes the download folder name.
        base_url = jsonToDict['base_url']
        for fname_key in jsonToDict:
            if fname_key == 'base_url':
                continue
            # Each value is the metadata of one content version of the file.
            for meta in jsonToDict[fname_key].values():
                # Malformed metadata can never be downloaded, so skip it
                # before touching any other key.
                if not isinstance(meta, dict) or 'Content-Type' not in meta:
                    continue
                # Download only the changes that belong to the requested day.
                if meta['timeDir'] != ajadir:
                    continue
                if "excel" in meta['Content-Type']:
                    continue

                file_url = meta['file_url']
                dirPath = comm.downloadsDir + base_url + "/"
                try:
                    if not os.path.exists(dirPath):
                        os.makedirs(dirPath)
                    # The local file name is the sha of the file URL.
                    urr(file_url, dirPath + fname_key)
                    nrOfDownloads += 1
                except Exception:
                    # Best effort: log the failing JSON file and carry on.
                    comm.printException(comm.pathToSaveDownloadErrors, filePath)
    return nrOfDownloads
예제 #4
0
            #/var/www/html/outputf/<ip>/rdf_files/<date_dir>/<fname>.rdf
            #/var/www/html/outputf/<ip>/rdf_files/<date_dir>/LOC.rdf
            #/var/www/html/outputf/<ip>/rdf_files/<date_dir>/ORG.rdf
            #/var/www/html/outputf/<ip>/rdf_files/<date_dir>/PER.rdf
            inputf = remote_date_dir_path + "/" + fname + ".rdf"
            #create
            #rdf dirs' paths in master machine:
            #/var/www/html/rdf_files/<fname>/
            #e.g.
            #/var/www/html/rdf_files/LOC/
            #/var/www/html/rdf_files/ORG/
            #/var/www/html/rdf_files/PER/
            outputDir = comm.pathToRDFdir + fname + "/"
            if not os.path.isdir(
                    outputDir
            ):  #make dirs ORG, PER, LOC in master's dir 'rdf_files'
                os.makedirs(outputDir)

            #rdf file path in master machine:
            #/var/www/html/rdf_files/<fname>/<remoteName>_<date_dir>_.rdf
            #e.g.
            #/var/www/html/rdf_files/ORG/worker1_08_2015_09_10_.rdf
            #old was: output = outputDir + remoteName + ".rdf"
            output = outputDir + remoteName + "_" + date_dir + "_" + ".rdf"
            try:
                urr(inputf, output)
            except:
                pass

    mergeRDFfiles()