示例#1
0
def save_path(count, xpathString, found):
    """
    Update the found / not-found counters stored for one xpath.

    Records in rss_config.PATH_FILENAME_STAT are written by this function in
    the form "<xpath>;<found>;<notfound>;".  The current counters of
    xpathString are read, one of them is incremented and the record is
    written back through replace_line_in_file().

    count: statistics are only touched when this is exactly True.
    xpathString: xpath whose record is updated.
    found: truthy increments the "found" counter, falsy the "not found" one.

    Returns None.
    """
    if count is not True:
        return

    countFound = 0
    countNotfound = 0
    recordPrefix = xpathString + ";"

    try:
        with open(rss_config.PATH_FILENAME_STAT, "r") as file:
            for line in file:
                # Match the key together with its delimiter, so that an xpath
                # which is merely a prefix of another one ("//a" vs "//a/b")
                # does not pick up the other record's counters.
                if line.startswith(recordPrefix):
                    # Split only what follows the key: the xpath itself may
                    # contain ";" characters.
                    counters = line[len(recordPrefix):].split(";")
                    countFound = int(counters[0])
                    countNotfound = int(counters[1])
    except Exception:
        # Best effort: an unreadable (typically missing) statistics file is
        # created here so the rewrite below has something to work on.
        with open(rss_config.PATH_FILENAME_STAT, 'a') as file:
            file.write("")

    if found:
        countFound += 1
    else:
        countNotfound += 1

    xpathStringWithStats = recordPrefix + str(countFound) + ";" + str(
        countNotfound) + ";"

    replace_line_in_file(rss_config.PATH_FILENAME_STAT, recordPrefix,
                         xpathStringWithStats)

    rss_print.print_debug(__file__, xpathStringWithStats, 4)
示例#2
0
def read_file_string_from_disk(osCacheFolderDomainArticle):
    """
    Read a cached file from disk and return its content as text.

    The file is first read as gzip; if that fails it is read as a plain
    file.  The bytes are then decoded with rss_config.CACHE_FILE_ENCODING.

    osCacheFolderDomainArticle: path of the cache file.

    Returns the decoded content, or "" when the file does not exist, cannot
    be read either way, or cannot be decoded.
    """
    cachePath = osCacheFolderDomainArticle

    if not os.path.isfile(cachePath):
        rss_print.print_debug(
            __file__, "kettal pole lugemiseks faili: " + cachePath, 2)
        return ""

    # Try the compressed reader first, then fall back to a plain read.
    for opener in (gzip.open, open):
        try:
            with opener(cachePath, 'rb') as cacheReadFile:
                pageBytes = cacheReadFile.read()
            break
        except Exception as e:
            rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
    else:
        # neither reader succeeded
        return ""

    try:
        return pageBytes.decode(rss_config.CACHE_FILE_ENCODING)
    except Exception as e:
        rss_print.print_debug(
            __file__,
            "kettalt loetud faili dekodeerimine utf-8 vorminguga EBAõnnestus",
            0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        return ""
示例#3
0
def html_tree_from_document_string(htmlString, caller):
    """
    Build a rooted HTML document tree from a string.

    htmlString: markup to parse; surrounding whitespace is stripped first.
    caller: name of the caller, used only in log messages.

    Returns the result of html.document_fromstring().  Empty input is
    replaced by an empty placeholder document, and the same placeholder is
    parsed when the input cannot be parsed, so a tree is always returned.
    """
    emptyDocument = "<html><head></head></html>"

    if caller:
        rss_print.print_debug(__file__,
                              "asume looma html objekti kutsujale: " + caller,
                              4)

    htmlString = htmlString.strip()
    if not htmlString:
        # str(caller): the name may be None/empty here and must not break
        # the error report itself.
        rss_print.print_debug(
            __file__,
            "puudub html stringi sisu kutsujal: '" + str(caller) + "'", 0)
        htmlString = emptyDocument

    if htmlString.startswith('<?xml version="1.0" encoding="utf-8"?>'):
        # The parser rejects str input carrying an encoding declaration
        # ("Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration."),
        # so such input is handed over as utf-8 bytes.
        return html.document_fromstring(htmlString.encode('utf-8'))

    try:
        return html.document_fromstring(htmlString)
    except Exception as e:
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" +
            str(caller) + "'", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti stringist: '" +
            htmlString + "'", 3)

    # Parsing failed: previously this fell through to an unbound local and
    # raised UnboundLocalError.  Return the empty placeholder tree instead,
    # the same substitute already used for empty input.
    return html.document_fromstring(emptyDocument)
示例#4
0
def html_string_children(htmlString):
    """
    Return the inner content of the outermost tag of an html string.

    The text between the first ">" and the last "</" is returned, stripped
    of surrounding whitespace.

    htmlString: markup such as "<div><p>x</p></div>".

    Returns "" for non-string input.  The input is returned unchanged when
    it is empty, does not start with "<", does not end with ">", contains no
    closing tag, or is at most 7 characters long.
    """
    if not isinstance(htmlString, str):
        rss_print.print_debug(__file__,
                              "sisend pole string, tagastame tühjuse", 0)
        return ""

    if not htmlString:
        # Nothing to unwrap; indexing an empty string below would raise
        # IndexError.
        return htmlString

    if htmlString[0] != "<":
        rss_print.print_debug(
            __file__, "katkestame, algus pole tag: '" + htmlString + "'", 4)
        return htmlString

    if htmlString[-1] != ">":
        rss_print.print_debug(
            __file__, "katkestame, lõpp pole tag: '" + htmlString + "'", 4)
        return htmlString

    if "</" not in htmlString:
        rss_print.print_debug(
            __file__, "sisendis pole child elementi, tagastame sisendi", 0)
        return htmlString

    if len(htmlString) <= 7:  # <b></b>
        rss_print.print_debug(
            __file__, "liiga lühike, tagastame sisendi: '" + htmlString + "'",
            0)
        return htmlString

    tagOpening = htmlString.find(">") + 1
    tagClosing = htmlString.rfind("</")

    # cut out the part between the opening and the closing tag
    return htmlString[tagOpening:tagClosing].strip()
示例#5
0
def html_tree_from_string(htmlString, caller):
    """
    Build an HTML tree from a string without forcing a document root.

    htmlString: markup to parse; surrounding whitespace is stripped first.
    caller: name of the caller, used only in log messages.

    Returns the result of html.fromstring().

    Raises whatever html.fromstring() raises when the input cannot be
    parsed (this includes empty input); the failure is logged first.
    """
    if caller:
        rss_print.print_debug(__file__,
                              "asume looma html objekti kutsujale: " + caller,
                              4)

    htmlString = htmlString.strip()
    if not htmlString:
        # str(caller): the name may be None/empty here and must not break
        # the error report itself.
        rss_print.print_debug(
            __file__,
            "puudub html stringi sisu kutsujal: '" + str(caller) + "'", 0)

    try:
        return html.fromstring(htmlString)
    except Exception as e:
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" +
            str(caller) + "'", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti stringist: '" +
            htmlString + "'", 3)
        # There is no tree to return.  Previously execution fell through to
        # an unbound local and raised UnboundLocalError, hiding the real
        # cause; propagate the parser's own exception instead.
        raise
示例#6
0
def raw_to_float(rawDateTimeText, rawDateTimeSyntax):
    """
    Convert a date/time text into a float timestamp.

    rawDateTimeText: the time as text, e.g. "23. 11 2007 /"
    rawDateTimeSyntax: the strptime format of that text, e.g. "%d. %m %Y /"
    Format reference: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior

    When the parsed year is 1900 the year is replaced: the previous year if
    the parsed month is later than the current month, otherwise the current
    year.

    Returns the value of time.mktime(), or 0 when the text is empty or
    cannot be converted.
    """
    dateText = rawDateTimeText.strip()
    if not dateText:
        rss_print.print_debug(__file__, "curDateTimeText = '" + dateText + "' tühi, tagastame nulli", 0)
        return 0

    try:
        fields = list(time.strptime(dateText, rawDateTimeSyntax))

        if fields[0] == 1900:
            # index 0 is the year, index 1 the month
            if fields[1] > int(time.strftime('%m')):
                yearsBack = 1
                note = "', muudame puuduva aasta eelmiseks aastaks"
            else:
                yearsBack = 0
                note = "', muudame puuduva aasta praeguseks aastaks"
            rss_print.print_debug(__file__, "curDateTimeText = '" + dateText + note, 0)
            fields[0] = int(time.strftime('%Y')) - yearsBack

        return time.mktime(tuple(fields))
    except Exception as e:
        rss_print.print_debug(__file__, "curDateTimeText = '" + dateText + "' dekodeerimine rawDateTimeSyntax = '" + rawDateTimeSyntax + "' EBAõnnestus, tagastame nulli", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        return 0
示例#7
0
def dict_add_dict(articleDataDictMain, articleDataDictNew):
    """
    Append the values of articleDataDictNew to articleDataDictMain, key by key.

    For every key of articleDataDictMain the value becomes
    main[key] + new[key].  Keys present only in articleDataDictNew are
    ignored; a key missing from articleDataDictNew raises KeyError.

    Returns articleDataDictMain (updated in place).
    """
    rss_print.print_debug(__file__, "ühendame dictCur ja dictNew", 4)

    # Only existing keys are re-assigned, so iterating items() is safe here.
    for key, mainValue in articleDataDictMain.items():
        articleDataDictMain[key] = mainValue + articleDataDictNew[key]

    return articleDataDictMain
示例#8
0
def str_domain_url(domain, articleUrl):
    """
    Join a domain with a (relative) URL using exactly one slash.

    Leading "/", "./" and "../" path prefixes of articleUrl are dropped, as
    before.  Unlike the previous lstrip('./'), which stripped any run of "."
    and "/" characters, a name that merely starts with a dot (for example
    ".well-known/x") keeps its dot.

    domain: e.g. "http://example.com/"; trailing slashes are removed.
    articleUrl: e.g. "./news/1", "/news/1" or "news/1".

    Returns the combined URL.
    """
    relativeUrl = articleUrl

    # Remove whole prefixes repeatedly until none of them matches any more.
    stripped = True
    while stripped:
        stripped = False
        for prefix in ("../", "./", "/"):
            while relativeUrl.startswith(prefix):
                relativeUrl = relativeUrl[len(prefix):]
                stripped = True

    articleUrl = domain.rstrip('/') + '/' + relativeUrl
    rss_print.print_debug(
        __file__, "pärast domeeni lisamist lingile: " + str(articleUrl), 4)

    return articleUrl
示例#9
0
def float_to_datetime_rfc2822(floatDateTime):
    """
    Format a float timestamp as an RFC 2822 date string.

    floatDateTime: timestamp passed on to formatdate().

    Returns the string produced by formatdate() with localtime and usegmt
    both enabled.
    """
    rss_print.print_debug(__file__, "floatDateTime = '" + str(floatDateTime) + "'", 5)

    formatted = formatdate(floatDateTime, localtime=True, usegmt=True)

    rss_print.print_debug(__file__, "datetimeRFC2822 = '" + str(formatted) + "'", 4)
    return formatted
示例#10
0
def html_page_cleanup(htmlString):
    """
    Strip an html page of styles, comments, scripts, tracking parameters
    and superfluous whitespace.

    The substitutions are applied strictly in the order listed below; later
    steps rely on the output of earlier ones.

    htmlString: page source; a falsy value is returned unchanged.

    Returns the cleaned string with all whitespace runs collapsed to single
    spaces.
    """
    if not htmlString:
        rss_print.print_debug(__file__,
                              "katkestame, tühi sisend: '" + htmlString + "'",
                              0)
        return htmlString

    rss_print.print_debug(__file__,
                          "puhastame html stringi üleliigsest jamast", 3)

    cleaned = htmlString

    markupPatterns = (
        r"<style[\s\S]*?<\/style>",  # styles
        r"<!--[\s\S]*?-->",  # comments
        r' onclick=(\")[\s\S]*?(\")',  # scripts in links, double quotes
        r" onclick=(')[\s\S]*?(')",  # scripts in links, single quotes
        r"<script[\s\S]*?<\/script>",  # scripts
    )
    for pattern in markupPatterns:
        cleaned = re.sub(pattern, "", cleaned)

    # remove trackers from links
    cleaned = cleaned.replace("&amp;", "&")
    trackerPatterns = (
        r'(&|\?)_[0-9A-Za-z_-]*',  # delfi
        r'=2\.[0-9.-]*',
        r'_ga=[0-9.-]*',  # _ga=2.22935807.513285745.1595741966-250801514.1594127878
        r'fbclid=[0-9A-Za-z-_]*',
        r'gclid=[0-9A-Za-z-_]*',
        r'refid=[0-9A-Za-z=.%_-]*',
        r'utm_source=[0-9A-Za-z-_&=.]*',
    )
    for pattern in trackerPatterns:
        cleaned = re.sub(pattern, "", cleaned)

    # repair links whose first query parameter was removed
    cleaned = cleaned.replace("?&", "?")

    # drop whitespace runs that directly precede a tag
    cleaned = re.sub(r"\s\s+(?=<)", "", cleaned)

    # these are literal backslash sequences (two characters each), not
    # control characters
    for escapedSequence in ('\\n', '\\r', '\\t'):
        cleaned = cleaned.replace(escapedSequence, " ")

    # br - required, otherwise xpath cannot resolve its paths
    brFixes = (
        ("<br/>", "<br>"),
        (" <br>", "<br>"),
        ("<br> ", "<br>"),
        ("<br><br>", "<br>"),
    )
    for old, new in brFixes:
        cleaned = cleaned.replace(old, new)

    return " ".join(cleaned.split())
示例#11
0
def get_service_log_path(articleUrl):
    """
    Pick the log path for the webdriver service.

    When rss_config.PRINT_MESSAGE_LEVEL is above 0, a per-URL file under
    /tmp is used (slashes in the URL replaced by "|").  Otherwise the
    MOZ_HEADLESS environment variable is set to '1' and os.devnull is used.

    Returns the chosen path.
    """
    if not (rss_config.PRINT_MESSAGE_LEVEL > 0):
        os.environ['MOZ_HEADLESS'] = '1'
        return os.devnull

    logFilePath = "/tmp/webdriver_" + articleUrl.replace("/", "|") + ".log"
    rss_print.print_debug(__file__, "logime asukohta: " + logFilePath, 0)
    return logFilePath
示例#12
0
def article_posts_range(articlePosts, maxArticlePosts):
    """
    Index range covering the last maxArticlePosts entries of articlePosts.

    Counts back from the final entry until the limit is reached; when there
    are fewer entries than the limit, the whole list is covered.

    Returns a range object.
    """
    postCount = len(articlePosts)
    rss_print.print_debug(
        __file__,
        "xpath parsimisel leitud artikli poste: " + str(postCount), 2)

    firstIndex = postCount - maxArticlePosts
    if firstIndex < 0:
        firstIndex = 0

    return range(firstIndex, postCount)
示例#13
0
def get_url_string_from_disk(articleUrl):
    """
    Load the cached copy of articleUrl from the article_cache folder.

    The cache file lives at
    <this module's folder>/article_cache/<domain>/<url with "/" as "|">,
    where <domain> is the third "/"-separated part of the URL.

    Returns whatever read_file_string_from_disk() returns for that path.
    """
    rss_print.print_debug(__file__, "kettalt proovitav leht: " + articleUrl, 3)

    moduleFolder = os.path.dirname(os.path.abspath(__file__))
    fileName = articleUrl.replace('/', '|')
    domainName = articleUrl.split('/')[2]
    cacheFilePath = '/'.join(
        (moduleFolder, 'article_cache', domainName, fileName))

    return read_file_string_from_disk(cacheFilePath)
示例#14
0
def get_url_from_internet(curDomainLong,
                          stamp,
                          seleniumPath="",
                          seleniumProfile=False):
    """
    Fetch a page from the network, clean it and store it in the cache.

    curDomainLong: URL to fetch.
    stamp: optional suffix; when truthy the page is cached under
        curDomainLong + "#" + stamp, otherwise under curDomainLong.
    seleniumPath: xpath handed to rss_selenium; overridden for the sites
        listed below.  When it ends up empty, rss_requests is used instead.
    seleniumProfile: passed through to rss_selenium.

    Returns the cleaned page source.
    """
    rss_print.print_debug(__file__,
                          "algatame internetipäringu: " + curDomainLong, 2)

    # (url fragment, xpath for selenium, xpaths to click first); the first
    # fragment contained in the URL wins.
    seleniumRules = (
        ("auto24.ee", '//div[@class="section messages"]', []),
        ("err.ee/uudised", '//div[@class="ng-scope"]', []),
        ("kultuuriaken.tartu.ee/et/syndmused",
         '//div[@class="col-12"]/h1[@class="py-3"]',
         [
             '//input[@name="starting_time" and @value="2"]',
             '//a[@data-view="list-view"]'
         ]),
        ("levila.ee", '//a[@class="post-item-meta__link"]', []),
        ("mixcloud.com", '//main/div[@class="content"]/div/div/div', []),
        ("sky.ee", '//div[@class="box-news-block-title "]', []),
        ("treraadio.ee", '//a[@id="scrollBtn"]', []),
        ("tv3.ee", '//a[@class="sc-1kym84g-0 dxESGf c950ig-0 eUNpOJ"]', []),
        ("twitter.com", '//article[@role="article"]', []),
    )

    seleniumClicks = []
    for urlFragment, rulePath, ruleClicks in seleniumRules:
        if urlFragment in curDomainLong:
            seleniumPath = rulePath
            seleniumClicks = ruleClicks
            break

    # perform the request
    if seleniumPath:
        pageSource = rss_selenium.get_article_string(
            curDomainLong, seleniumClicks, seleniumPath, seleniumProfile)
    else:
        pageSource = rss_requests.get_article_string(
            curDomainLong, rss_config.HEADERS)

    # strip the page of superfluous content
    pageSource = parsers_html.html_page_cleanup(pageSource)

    # results of all network requests are always saved to disk
    cacheKey = curDomainLong + "#" + stamp if stamp else curDomainLong
    rss_disk.write_file_string_to_cache(cacheKey, pageSource)

    return pageSource
示例#15
0
def add_value_to_time_string(curArtPubDate, curDateFormat, offsetDays=0):
    """
    Prefix a time string with a formatted date.

    @curArtPubDate = text to be prefixed, e.g.: 03.01
    @curDateFormat = strftime format of the part put in front, e.g.: 2019.
    @offsetDays = passed to datetime_offset_from_format(); 0 today, -1 yesterday

    Returns the formatted date followed by curArtPubDate.
    """
    dayShift = datetime_offset_from_format(offsetDays)
    datePrefix = (datetime.now() + dayShift).strftime(curDateFormat)

    curArtPubDate = datePrefix + curArtPubDate
    rss_print.print_debug(__file__, "lisasime tänasele kellaajale kuupäeva: " + curArtPubDate, 3)

    return curArtPubDate
示例#16
0
def raw_to_datetime(rawDateTimeText, rawDateTimeSyntax):
    """
    Convert a date/time text and its format into an RFC 2822 date string.

    rawDateTimeText = the time as text, e.g.: "23. 11 2007 /"
    rawDateTimeSyntax = the format of that text, e.g. "%d. %m %Y /"
    Format reference: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior

    The text is stripped, then literal backslash-t prefixes and literal
    backslash-r-backslash-n suffixes are chopped off before conversion.

    Returns float_to_datetime_rfc2822(raw_to_float(...)).
    """
    dateText = rawDateTimeText.strip()
    dateText = parsers_common.str_rchop(
        parsers_common.str_lchop(dateText, "\\t"), "\\r\\n")

    # Report both inputs: level 0 when empty, level 5 otherwise.
    for label, value in (("curDateTimeText", dateText),
                         ("rawDateTimeSyntax", rawDateTimeSyntax)):
        if value:
            rss_print.print_debug(__file__, label + " = '" + value + "'", 5)
        else:
            rss_print.print_debug(__file__, "tühi ajasisend: " + label + " = '" + value + "'", 0)

    return float_to_datetime_rfc2822(raw_to_float(dateText, rawDateTimeSyntax))
示例#17
0
def list_del_elem_if_set(inpList, inpIndex):
    """
    Delete inpList[inpIndex] if that position exists.

    inpList: list to modify in place.
    inpIndex: zero-based index; negative indices count from the end.

    An index outside the list (in either direction) leaves the list
    untouched instead of raising IndexError.

    Returns inpList.
    """
    inpListLen = len(inpList)
    indexHumanreadable = inpIndex + 1

    # Bounds are checked on both sides: the previous length-only check let
    # negative indices through, e.g. index -1 on an empty list.
    if -inpListLen <= inpIndex < inpListLen:
        rss_print.print_debug(
            __file__, "listi pikkus on: " + str(inpListLen) +
            ", eemaldasime listi elemendi nr: " + str(indexHumanreadable), 4)
        del inpList[inpIndex]
    else:
        rss_print.print_debug(
            __file__, "listi pikkus on: " + str(inpListLen) +
            ", ei eemaldand listi elementi nr: " + str(indexHumanreadable), 4)

    return inpList
示例#18
0
def replace_line_in_file(inpfile, searchExp, replaceExp):
    """
    Replace every line of inpfile that starts with searchExp.

    inpfile: path of the file, rewritten in place.
    searchExp: prefix identifying the line(s) to replace.
    replaceExp: new line content, written with a trailing newline.

    When no line matches, replaceExp is appended to the end of the file.
    """
    found = False

    # The context manager closes the fileinput state (and with it the
    # in-place stdout redirection) even if an exception interrupts the loop.
    with fileinput.input(inpfile, inplace=True) as lines:
        for line in lines:
            if line.startswith(searchExp):
                found = True
                line = replaceExp + "\n"
            # in in-place mode stdout is written back into the file
            sys.stdout.write(line)

    if not found:
        rss_print.print_debug(__file__, "lisame lõppu: " + replaceExp, 1)
        with open(inpfile, 'a') as file:
            file.write(replaceExp + "\n")
示例#19
0
def replace_string_with_timeformat(inpString, stringToReplace, dateTimeformat, offsetDays=0):
    """
    Replace a given word in the input with a formatted date.

    Inputs:
        inpString="eile, 23:34"
        stringToReplace="eile",
        dateTimeformat="%d %m %Y",
        offsetDays=-1
    Output: 24 05 2020, 23:34

    The input is returned unchanged when stringToReplace does not occur
    in it.
    """
    if stringToReplace not in inpString:
        return inpString

    dayShift = datetime_offset_from_format(offsetDays)
    formattedDate = str((datetime.now() + dayShift).strftime(dateTimeformat))

    inpString = inpString.replace(stringToReplace, formattedDate)
    rss_print.print_debug(__file__, "asendasime stringis sõna ajaga: '" + stringToReplace + "' -> " + inpString, 3)

    return inpString
示例#20
0
def xpath_to_single(elementStrings, elementsLen, xpathString, parent):
    """
    Merge the values found for one xpath into a single string.

    elementStrings: values found for the xpath (strings or html objects).
    elementsLen: how many of them to process, from the start.
    xpathString: the xpath, used only in log messages.
    parent: when exactly True, values without exactly one parent node are
        reported at level 0.

    Returns the non-empty, stripped values joined with "<br>".
    """
    def valueLabel(index):
        return "'" + xpathString + "' väärtus[" + str(index) + "]"

    # Values are collected one by one so that an unexpected multiple match
    # ends up separated by <br> instead of being glued together.
    collected = []
    for i in range(elementsLen):
        elem = elementStrings[i]

        if not isinstance(elem, str):
            elem = parsers_html.html_to_string(elem, prettyPrint=False)
            rss_print.print_debug(
                __file__,
                valueLabel(i) + " polnud string, stringimise järel: " + elem,
                4)

        countParentNodes = parsers_html.html_string_count_parent_nodes(
            elem, "xpath_to_single")

        if countParentNodes == 1:
            elem = parsers_html.html_remove_single_parents(elem)
        elif not countParentNodes:
            if parent is True:
                rss_print.print_debug(
                    __file__,
                    valueLabel(i) + " on valestimääratud parent?: " + elem, 0)
        elif countParentNodes > 1:
            if parent is True:
                rss_print.print_debug(
                    __file__,
                    valueLabel(i) + " on valestimääratud parent? " + elem, 0)
            else:
                rss_print.print_debug(
                    __file__, valueLabel(i) + " on hoiatatud parent: " + elem,
                    3)

        elem = elem.strip()
        if elem:
            collected.append(elem)

    element = "<br>".join(collected)

    rss_print.print_debug(__file__,
                          "'" + xpathString + "' väljund: '" + element + "'",
                          4)

    return element
示例#21
0
def html_change_short_urls(htmlPageString, curDomainShort):
    """
    Expand shortened src/href URLs.

    For both attributes, in this order: a value starting with "//" gets the
    "http:" scheme, and values starting with "./" or "/" are prefixed with
    curDomainShort.

    Returns the rewritten page string.
    """
    for attribute in ("src", "href"):
        opening = attribute + '="'
        withDomain = opening + curDomainShort + '/'
        # "//" must be handled before "/", which would otherwise match it too
        for shortForm, fullForm in (
                (opening + '//', opening + 'http://'),
                (opening + './', withDomain),
                (opening + '/', withDomain),
        ):
            htmlPageString = htmlPageString.replace(shortForm, fullForm)

    rss_print.print_debug(__file__, "html string: " + htmlPageString, 5)

    return htmlPageString
示例#22
0
def article_urls_range(articleUrls):
    """
    Index range from the first entry of articleUrls to the last.

    The number of entries is logged at level 1 when there are none and at
    level 3 otherwise.

    Returns a range object.
    """
    foundCount = len(articleUrls)
    debugLevel = 1 if foundCount == 0 else 3

    rss_print.print_debug(
        __file__,
        "xpath parsimisel leitud artikleid: " + str(foundCount), debugLevel)

    return range(foundCount)
示例#23
0
def str_lchop(curString, stripString):
    """
    Remove every leading repetition of stripString from curString.

    Unlike str.lstrip, stripString is treated as a whole prefix, not as a
    set of characters.  If either argument is empty, curString is returned
    unchanged.
    """
    if not curString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: curString = '" + str(curString) + "'", 3)
        return curString
    if not stripString:
        rss_print.print_debug(
            __file__, "sisend tühi, katkestame: stripString = '" +
            str(stripString) + "'", 0)
        return curString

    step = len(stripString)

    # Advance past each repetition, then slice once.
    offset = 0
    while curString.startswith(stripString, offset):
        offset += step

    return curString[offset:]
示例#24
0
def raw_to_datetime_guess_missing(inpArtPubDate, lastArtPubDate, dateStringPrefix, dateStringMain, daysToOffset):
    """
    Convert a time that lacks its date part, guessing the missing date.

    The date prefix is first generated with a day offset of 0.  If
    lastArtPubDate is set and increasing_datetime_rfc2822() reports that the
    result does not fit after it, the conversion is repeated with a day
    offset of daysToOffset.

    inpArtPubDate: the incomplete time text.
    lastArtPubDate: previously resolved date, or a falsy value.
    dateStringPrefix: strftime format of the generated prefix.
    dateStringMain: format of inpArtPubDate itself.
    daysToOffset: day offset used for the second attempt.

    Returns the value produced by raw_to_datetime().
    """
    fullSyntax = dateStringPrefix + dateStringMain

    def resolve(offsetDays):
        prefixed = add_value_to_time_string(inpArtPubDate, dateStringPrefix, offsetDays)
        return raw_to_datetime(prefixed, fullSyntax)

    curOffsetDays = 0
    curArtPubDate = resolve(curOffsetDays)

    timeJump = lastArtPubDate and not increasing_datetime_rfc2822(curArtPubDate, lastArtPubDate)
    if not timeJump:
        rss_print.print_debug(__file__, "uudise päev: täna " + str(curArtPubDate) + " ja eile " + str(lastArtPubDate), 4)
        return curArtPubDate

    rss_print.print_debug(__file__, "uudise päev: täna " + str(curArtPubDate) + " ja eile " + str(lastArtPubDate), 3)
    rss_print.print_debug(__file__, "esineb ajahüpe, peame muutma tambovi lisamise offsetti", 3)

    # second attempt with the shifted day offset
    curArtPubDate = resolve(curOffsetDays + daysToOffset)

    rss_print.print_debug(__file__, "uudise eelmine päev: " + str(lastArtPubDate), 3)
    rss_print.print_debug(__file__, "uudise praegune päev muutus: " + inpArtPubDate + " -> " + str(curArtPubDate), 2)

    return curArtPubDate
示例#25
0
def xpath_to_list(elementStrings, elementsLen, xpathString, parent):
    """
    Normalise the values found for one xpath, keeping them as a list.

    elementStrings: values found for the xpath (strings or html objects);
        every entry is replaced in place by its cleaned, stripped string.
    elementsLen: not used by this function.
    xpathString: the xpath, used only in log messages.
    parent: when exactly True, values without exactly one parent node are
        reported at level 0.

    Returns elementStrings.
    """
    def valueLabel(index):
        return "'" + xpathString + "' väärtus[" + str(index) + "]"

    for i, elem in enumerate(elementStrings):
        if not isinstance(elem, str):
            elem = parsers_html.html_to_string(elem, prettyPrint=False)
            rss_print.print_debug(
                __file__,
                valueLabel(i) + " polnud string, stringimise järel: " + elem,
                4)

        countParentNodes = parsers_html.html_string_count_parent_nodes(
            elem, "xpath_to_list")

        if countParentNodes == 1:
            elem = parsers_html.html_remove_single_parents(elem)
        elif not countParentNodes:
            if parent is True:
                rss_print.print_debug(
                    __file__,
                    valueLabel(i) + " on valestimääratud parent?: " + elem, 0)
        elif countParentNodes > 1:
            if parent is True:
                rss_print.print_debug(
                    __file__,
                    valueLabel(i) + " on valestimääratud parent? " + elem, 0)
            else:
                rss_print.print_debug(
                    __file__, valueLabel(i) + " on hoiatatud parent: " + elem,
                    3)

        # finally the common strip, written back into the list
        elementStrings[i] = elem.strip()

    rss_print.print_debug(
        __file__, "'" + xpathString + "' väärtused: elementStrings = " +
        str(elementStrings), 4)

    return elementStrings
示例#26
0
def html_first_node(htmlString):
    """
    Cut the first node out of an html string.

    The start tag is taken as the text before the first space or ">", the
    end tag is derived from it, and the string is split on that end tag.
    The number of start tags inside the first piece decides how many pieces
    are joined back together (nested tags of the same name).

    Returns the joined pieces followed by the end tag.
    """
    startTag = htmlString.split(" ")[0].split(">")[0]
    endTag = startTag.replace("<", "</") + ">"

    pieces = htmlString.split(endTag)
    startTagCount = pieces[0].count(startTag)

    summary = ("esimesest splitist leiti " + str(startTagCount) +
               " esimest tagi '" + startTag + "'")
    if startTagCount == 1:
        rss_print.print_debug(__file__, summary, 2)
    else:
        rss_print.print_debug(__file__, summary + ": " + str(pieces[0]), 1)

    # With a single start tag this is just the first piece plus the end tag.
    return endTag.join(pieces[0:startTagCount]) + endTag
示例#27
0
def dict_reverse_order(articleDataDict):
    """
    Reverse every list of the dict in place (newest events last for feedly
    ordering).

    Nothing is reversed when the non-empty lists do not all have the same
    length.

    Returns articleDataDict.
    """
    referenceLen = None
    for values in articleDataDict.values():
        size = len(values)
        if not size > 0:
            # empty lists do not take part in the consistency check
            continue
        if referenceLen is None:
            referenceLen = size
        elif size != referenceLen:
            rss_print.print_debug(
                __file__, "mittekonsistentse pikkusega dict, katkestame",
                0)
            return articleDataDict

    rss_print.print_debug(__file__, "pöörame suuna", 2)

    for values in articleDataDict.values():
        values.reverse()

    return articleDataDict
示例#28
0
def str_rchop(curString, stripString):
    """
    Remove every trailing repetition of stripString from curString.

    Unlike str.rstrip, stripString is treated as a whole suffix, not as a
    set of characters.  If either argument is empty, curString is returned
    unchanged.
    """
    if not curString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: curString = '" + str(curString) + "'", 3)
        return curString
    if not stripString:
        rss_print.print_debug(
            __file__, "sisend tühi, katkestame: stripString = '" +
            str(stripString) + "'", 0)
        return curString

    step = len(stripString)

    # Move the end marker back over each repetition, then slice once.
    end = len(curString)
    while curString.endswith(stripString, 0, end):
        end -= step

    return curString[:end]
示例#29
0
def write_file_string_to_cache(articleUrl, htmlPageString):
    """
    Store a page in the article_cache folder next to this module.

    The file goes to article_cache/<domain>/<url with "/" as "|">, where
    <domain> is the third "/"-separated part of articleUrl.  Missing folders
    are created and handed to set_user_as_file_owner(); the content is
    written through write_file() with fileType="gzip".
    """
    moduleFolder = os.path.dirname(os.path.abspath(__file__))
    cacheRoot = moduleFolder + '/' + 'article_cache'
    fileName = articleUrl.replace('/', '|')
    domainFolder = cacheRoot + '/' + articleUrl.split('/')[2]

    # cache root first, then the per-domain folder inside it
    for folder in (cacheRoot, domainFolder):
        if os.path.exists(folder):
            continue
        rss_print.print_debug(__file__, "loome puuduva kausta: " + folder, 0)
        os.makedirs(folder)
        set_user_as_file_owner(folder)

    write_file(domainFolder, fileName, htmlPageString, fileType="gzip")
示例#30
0
def article_data_dict_clean(articleDataDict, dictList, dictCond, dictField):
    """
    Remove the entries that satisfy a condition.

    articleDataDict: article data; articleDataDict[dictField] is the list
        that is checked.
    dictList: strings to compare against (case-insensitively).
    dictCond: "in", "not in" or "=="; an entry is removed as soon as one
        string of dictList satisfies "<string> <dictCond> <entry>".
    dictField: key of the list to check.

    Matching entries are removed through dict_del_article_index().

    Returns articleDataDict.
    """
    if not articleDataDict[dictField]:
        rss_print.print_debug(__file__, "tühi sisend: articleDataDict", 0)
        return articleDataDict

    if not dictList:
        rss_print.print_debug(__file__, "tühi sisend: dictList", 0)
        return articleDataDict

    i = 0
    while i < len(articleDataDict[dictField]):
        entryText = articleDataDict[dictField][i].casefold()
        entryLabel = ("kande(" + str(i + 1) + "/" +
                      str(len(articleDataDict[dictField])) + ")")
        rss_print.print_debug(
            __file__, entryLabel + " kontrollime: " + entryText[0:800], 3)

        matchedNeedle = None
        for dictListElem in dictList:
            needle = dictListElem.casefold()
            if dictCond == "not in":
                satisfied = needle not in entryText
            elif dictCond == "in":
                satisfied = needle in entryText
            elif dictCond == "==":
                satisfied = needle == entryText
            else:
                satisfied = False

            if satisfied:
                matchedNeedle = needle
                break

        if matchedNeedle is None:
            i += 1
            continue

        # Removal condition met; the index is not advanced because the next
        # entry is expected to move into position i.
        rss_print.print_debug(
            __file__, entryLabel + " tingimus täidetud: '" + matchedNeedle +
            "' " + dictCond + " '" + entryText + "'", 2)
        articleDataDict = dict_del_article_index(articleDataDict, i)

    return articleDataDict