Пример #1
0
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    """Hand a fetched document to the reader that matches its content type.

    The content type is lower-cased once and matched by substring. The first
    matching branch wins, checked in this order: excel, xml, html, json, pdf,
    plain/text. When nothing matches, one line is appended to the file at
    ``comm.pathToSaveParsingErrors``.

    Args:
        url: Address of the document; forwarded to the reader and written to
            the error log for unsupported types.
        httpResponse: Response payload. It is passed through
            ``detectEncoding`` for xml/html/json/text and handed to the PDF
            reader as-is.
        tyyp: Content-type string (must be a ``str``).
        od: Object forwarded unchanged to every reader (presumably the
            ontology/RDF data holder -- TODO confirm against caller).
        _encoding: Encoding hint forwarded to ``detectEncoding``.
        filePath: Path handed to the Excel reader; unused by other branches.
    """
    # Normalise once instead of calling .lower() in every comparison.
    kind = tyyp.lower()

    if "excel" in kind:
        # The Excel reader receives filePath rather than the response body.
        read_eksel.readExcel(filePath, url, od)
    elif "xml" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_xml.readXml(url, doctext, od)
    elif "html" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_html.readHtmlPage(url, doctext, od)
    elif "json" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_json.readJson(url, doctext, od)
    elif "pdf" in kind:
        # The PDF reader gets the response untouched (no detectEncoding call).
        read_pdf.readPdf(url, httpResponse, od)
    elif "plain" in kind or "text" in kind:
        # Fallback for textual payloads: parsed as plain text lines.
        doctext = detectEncoding(_encoding, httpResponse)
        read_plaintext.readPlainText(url, doctext, od)
    else:
        # Unsupported type: record it. The original-case tyyp is logged.
        stamp = time.strftime("%d/%m/%Y_%H:%M:%S")
        message = (stamp + " " + url + " "
                   + "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
        # The with-block closes the log even if write() raises.
        with open(comm.pathToSaveParsingErrors, 'a', encoding='utf-8') as jf:
            jf.write(message)
Пример #2
0
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    """Hand a fetched document to the reader that matches its content type.

    ``tyyp`` is matched by substring exactly as given (no case folding happens
    here; presumably the caller lower-cases it -- TODO confirm). The first
    matching branch wins, checked in this order: excel, pdf, xml, html, json,
    plain/text.

    Reader failures are best-effort: any ``Exception`` raised while decoding
    or parsing is reported through ``comm.printException`` and not re-raised.
    ``KeyboardInterrupt``, ``SystemExit`` and other non-``Exception``
    ``BaseException`` subclasses are deliberately allowed to propagate so the
    process can still be interrupted or shut down.

    Args:
        url: Address of the document; forwarded to the reader.
        httpResponse: Response payload, passed through ``detectEncoding`` for
            xml/html/json/text. Not used by the excel and pdf branches.
        tyyp: Content-type string.
        od: Object forwarded unchanged to every reader.
        _encoding: Encoding hint forwarded to ``detectEncoding`` and to the
            html/json/plain-text readers.
        filePath: Path handed to the Excel and PDF readers.
    """
    if "excel" in tyyp:
        try:
            read_eksel.readExcel(filePath, url, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_excel")

    elif "pdf" in tyyp:
        try:
            read_pdf.readPdf(filePath, url, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_pdf")

    elif "xml" in tyyp:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_xml.readXml(url, doctext, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_xml")

    elif "html" in tyyp:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_html.readHtmlPage(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_html")

    elif "json" in tyyp:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_json.readJson(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_json")

    elif "plain" in tyyp or "text" in tyyp:
        # Fallback for textual payloads: parsed as plain text lines.
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_plaintext.readPlainText(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_plainText")

    else:
        # NOTE(review): printException is called here with no active
        # exception; verify it copes with that when reporting an unsupported
        # content type.
        comm.printException(comm.pathToSaveParsingErrors, "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
Пример #3
0
def spreadURLsByContentType(url,
                            httpResponse,
                            tyyp,
                            od,
                            _encoding,
                            filePath=None):
    """Dispatch a downloaded document to a reader chosen by content type.

    Matching is a case-insensitive substring test on ``tyyp``; branches are
    tried in the order excel, xml, html, json, pdf, plain/text and only the
    first hit runs. An unmatched type is reported by appending one line to
    the file named by ``comm.pathToSaveParsingErrors``.

    Args:
        url: Document address; passed to the reader and included in the
            error-log line for unsupported types.
        httpResponse: Response payload. Run through ``detectEncoding`` before
            the xml, html, json and plain-text readers; given unchanged to
            the PDF reader.
        tyyp: Content-type string (must be a ``str``).
        od: Object passed through unchanged to every reader.
        _encoding: Encoding hint passed to ``detectEncoding``.
        filePath: Path given to the Excel reader; other branches ignore it.
    """
    # Lower-case once; every branch below compares against this value.
    contentType = tyyp.lower()

    if "excel" in contentType:
        # Excel is read via filePath, not via the response body.
        read_eksel.readExcel(filePath, url, od)
    elif "xml" in contentType:
        read_xml.readXml(url, detectEncoding(_encoding, httpResponse), od)
    elif "html" in contentType:
        read_html.readHtmlPage(url, detectEncoding(_encoding, httpResponse),
                               od)
    elif "json" in contentType:
        read_json.readJson(url, detectEncoding(_encoding, httpResponse), od)
    elif "pdf" in contentType:
        # The PDF reader receives the response without a detectEncoding pass.
        read_pdf.readPdf(url, httpResponse, od)
    elif "plain" in contentType or "text" in contentType:
        # Anything else textual is parsed as plain text lines.
        read_plaintext.readPlainText(
            url, detectEncoding(_encoding, httpResponse), od)
    else:
        # Build the line first so the file is only opened once it is ready;
        # the original-case tyyp is what gets logged.
        line = (time.strftime("%d/%m/%Y_%H:%M:%S") + " " + url + " " +
                "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
        # Context manager guarantees the handle is closed if write() fails.
        with open(comm.pathToSaveParsingErrors, 'a', encoding='utf-8') as jf:
            jf.write(line)