Пример #1
0
def fetch_site():
    """Download a web page, run it through ``convert`` and return rewritten HTML.

    Query-string arguments read from ``request.args``:

    * ``url`` -- address of the page to download.
    * ``source``, ``target`` -- passed to ``convert``; ``target`` is also
      passed to the ``PostProcess`` helpers.
    * ``nativize``, ``preOptions``, ``postOptions`` -- JSON text, decoded
      with ``json.loads`` and passed to ``convert``.

    After conversion the markup is post-processed so that it keeps working
    when served from this application: root-relative and ``../`` references
    are made absolute, and anchor ``href`` values are prefixed with the
    translation endpoint (carrying the same options) so followed links are
    converted as well.

    Returns:
        The processed HTML document as a string.

    Raises:
        json.JSONDecodeError: If one of the JSON arguments is not valid JSON.
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        propagate unchanged. A missing argument raises whatever
        ``request.args`` raises on a failed lookup (presumably a 400-style
        error under Flask -- confirm).
    """
    from urllib.parse import urlencode

    url = request.args['url']

    # NOTE(security): ``url`` is caller-supplied and is fetched server-side
    # without validation; consider restricting schemes/hosts if this endpoint
    # is reachable by untrusted clients.
    # A timeout keeps an unresponsive remote host from blocking this request
    # indefinitely.
    r = requests.get(url, timeout=30)

    # ``r.encoding`` may be None when the response declares no charset, and
    # charset names are case-insensitive, so normalise before testing.
    declared_encoding = r.encoding or ""
    if "utf-8" not in declared_encoding.lower():
        r.encoding = r.apparent_encoding

    htmlcontent = r.text

    # Reduce the URL to "scheme://host" for resolving root-relative paths.
    baseurl = re.sub(r'(https*://)([^/]+)/*.*', r'\1\2', url, flags=re.IGNORECASE)
    # The literal below holds a single invisible character (it appears to be
    # a zero-width joiner -- confirm) that is stripped from the base URL.
    baseurl = baseurl.replace('‍', '')

    htmlcontent = convert(
        request.args['source'],
        request.args['target'],
        htmlcontent,
        json.loads(request.args['nativize']),
        json.loads(request.args['preOptions']),
        json.loads(request.args['postOptions']),
    )

    # Replace root-relative paths with absolute ones. A callable replacement
    # is used so that any backslash in ``baseurl`` is inserted literally
    # instead of being parsed as a regex template escape.
    def _absolutize(match):
        return match.group(1) + baseurl + "/"

    htmlcontent = re.sub(r'(")/', _absolutize, htmlcontent)
    # NOTE(review): after the substitution above no `"/` sequence should be
    # left, so this one looks redundant; kept to mirror the original steps.
    htmlcontent = re.sub(r'(\.")/', _absolutize, htmlcontent)
    htmlcontent = re.sub(r'(url\()/', _absolutize, htmlcontent)

    # Query string that makes followed links pass through this endpoint with
    # the same conversion options. ``urlencode`` percent-encodes the values so
    # characters such as '&', '#' or spaces cannot corrupt the link.
    params = urlencode([
        ('source', request.args['source']),
        ('target', request.args['target']),
        ('preOptions', request.args['preOptions']),
        ('postOptions', request.args['postOptions']),
        ('nativize', request.args['nativize']),
    ])
    transurl = html.escape("http://aksharamukha.appspot.com/api/website?" + params + '&url=')

    # Resolve "../" against the page URL with its last two path segments
    # dropped (the file name and its directory).
    urlparts = url.split("/")
    doubledot = "".join(part + "/" for part in urlparts[:-2])
    htmlcontent = htmlcontent.replace("../", doubledot)

    # Prefix anchor targets with the translation endpoint. ``transurl``
    # contains caller-supplied text, so it is inserted through a callable
    # rather than used as a regex replacement template.
    def _prefix_link(match):
        return match.group(1) + transurl

    link_patterns = (
        r'(<a href="?)',
        # ``[^>]`` keeps the match inside a single tag; ``.*?`` could run on
        # into a following <a href=...> and prefix that link a second time.
        r'(<a class=[^>]*? href="?)',
        r'(<a target="_blank" href=")',
        r'(<a target="_self" href=")',
    )
    for pattern in link_patterns:
        htmlcontent = re.sub(pattern, _prefix_link, htmlcontent)

    # Post-processing keyed on the target script: numerals first, then dandas.
    htmlcontent = PostProcess.RetainIndicNumerals(htmlcontent, request.args['target'], True)
    htmlcontent = PostProcess.RetainDandasIndic(htmlcontent, request.args['target'], True)

    return htmlcontent