Example #1
def getURL(self, url, uid):
    # Download the page once and cache it as "<uid>.html"; later calls reparse the cached copy.
    if not os.path.isfile(str(uid) + ".html"):
        with open(str(uid) + ".html", "w") as fptr:
            try:
                response = self.opener.open(url)
            except:
                return None
            data = response.read()
            # Try to gunzip the response; fall back to the raw bytes if it is not compressed.
            try:
                decoded = UnicodeDammit(
                    gzip.GzipFile(fileobj=io.BytesIO(data)).read(),
                    ["windows-1252"],
                    smart_quotes_to="html").unicode_markup
            except:
                decoded = UnicodeDammit(
                    data, ["windows-1252"],
                    smart_quotes_to="html").unicode_markup
            # Patch encoded spaces and common Windows-1252/UTF-8 mojibake by hand.
            decoded = (decoded.replace(u"%20", u" ")
                       .replace(u"\u00c2", u" ")
                       .replace(u"\xe2€™", u"\'")
                       .replace(u"\xe2€œ", u"\"")
                       .replace(u"\xe2€", "\"")
                       .replace(u"\"“", "-"))
            #.replace(u"\xe2\x80\x9c", u"\"").replace(u"\xe2\x80\x9d", u"\"").replace(u"\xc3\xb3", u"\u00f3").replace(u"\xc3\xad", u"\u00ed").replace(u"\xe2\x20\xac\x21\x22", u"\'").replace(u"\xe2\x20\xac\x01\x53", u"\"").replace(u"\xe2\x20\xac", u"\"").replace(u"\xe2\x20\xac\x20\x1c", u" - ").replace(u"\xc3", u"\u00e9").replace(u"\x00\xc2", u" ")
            print >> fptr, decoded.encode('utf8')  # Python 2 print-to-file
    parser = etree.HTMLParser(target=Parser())
    with open(str(uid) + ".html", 'r') as fptr:
        data = fptr.read()
        parser.feed(data.decode('utf8'))
    return parser.close()
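
The "â€™"-style sequences patched above are UTF-8 bytes that were decoded as Windows-1252. A hedged alternative sketch, not part of the original code and assuming data holds the decompressed response bytes as in getURL: listing UTF-8 ahead of Windows-1252 lets UnicodeDammit decode UTF-8 pages correctly, which makes most of those hand patches unnecessary (for byte strings that genuinely mix the two encodings, bs4 also offers UnicodeDammit.detwingle).

from bs4 import UnicodeDammit

# Sketch only: try UTF-8 first, then fall back to Windows-1252.
decoded = UnicodeDammit(data, ["utf-8", "windows-1252"],
                        smart_quotes_to="html").unicode_markup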
Example #2
def clean_tabs(body):
    # Decode with UnicodeDammit, then flatten newlines, carriage returns and a few
    # HTML remnants into single spaces.
    converted = UnicodeDammit(body).unicode_markup
    converted = converted.replace("\n\n", " ")
    converted = converted.replace("\\n", " ")
    converted = converted.replace("\n", " ")
    converted = converted.replace("\\r", " ")
    converted = converted.replace("\r\r", " ")
    converted = converted.replace("<br>", " ")
    converted = converted.replace("</a>", " ")
    converted = converted.replace("<a href=", " ")
    converted_cleaned = converted.replace("\r", " ")
    return converted_cleaned
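
A hedged call-site sketch; the requests dependency and the URL are assumptions, not part of the original example. clean_tabs accepts either bytes or text, since UnicodeDammit handles both:

import requests  # assumed dependency for this sketch only

raw = requests.get("https://example.com").content  # response bytes, encoding unknown
flat = clean_tabs(raw)  # decoded and flattened into one space-separated line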
Example #3
def getContent(soup, source=''):
    newContent = []
    # Cleaning phase
    genericCleaning(soup)
    sourceSpecificcleaning(soup, source)

    # f = open("content.html", 'w'); f.write(soup.prettify().encode('utf-8')); f.close();
    # Finding content in the tree
    bestElem = None
    bestText = ''
    for el in soup.findAll(True):
        score = 0.0
        hasTitle = False
        if el.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'] and el.parent.name == '[document]':
            score += 3
        for c in el:
            if c.name == 'br':  # business insider style
                score += 0.5
            if c.name == 'p':
                score += 1.0
            if not hasTitle and c.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']:
                score += 1.0
                hasTitle = True
        if score >= 3.0:  # at least 3 paragraphs
            textOutput = getText(el)
            if float(len(textOutput)) / score > 20.0:  # we need at least 20 characters per container
                newContent.append(textOutput)
        elif score >= 1.0:
            if bestElem is None:
                bestElem = el
                bestText = getText(el, False)
            else:
                a = getText(el, False)
                if bestElem is None or len(a) > len(bestText):
                    bestElem = el
                    bestText = a
    if len(newContent) == 0 and bestElem is not None:  # nothing scored 3+, but something scored 1 or more
        newContent.append(bestText)

    finalText = UnicodeDammit(u'\n'.join(newContent),
                              smart_quotes_to='ascii').unicode_markup
    return finalText.replace('\n\n', '\n')
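
A hedged call-site sketch; genericCleaning, sourceSpecificcleaning and getText are project helpers not shown in this example, and the html variable and source label are assumptions:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser")  # html: page markup fetched earlier (assumed)
article = getContent(soup, source='businessinsider')  # hypothetical source label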
Example #4
def remove_evernote_link(link, html):
    html = UnicodeDammit(html, ['utf-8'], is_html=True).unicode_markup
    link_converted = UnicodeDammit(link.WholeRegexMatch, ['utf-8'], is_html=True).unicode_markup
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r'[^<]*'
    regex_replace = r'<{0}[^>]*>[^<]*{1}[^<]*</{0}>'
    # html = re.sub(regex_replace.format('li', link.WholeRegexMatch), "", html)
    # Remove link (use the UnicodeDammit-converted form so it matches the converted html)
    html = html.replace(link_converted, "")
    # Remove empty li
    html = re.sub(regex_replace.format('li', no_start_tag_regex), "", html)
    # Remove dangling separator
    regex_span = regex_replace.format('span', no_start_tag_regex) + no_start_tag_regex + sep_regex
    html = re.sub(regex_span, "", html)
    # Remove double separator
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, sep_regex, html)
    return html
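
escape_regex is not defined in this example; a minimal stand-in, assuming it only needs to escape regex metacharacters so the separator markup is matched literally:

import re

def escape_regex(text):
    # Assumed helper: escape every regex metacharacter in the separator markup.
    return re.escape(text)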
Example #5
# Read the configuration file path from the command line (or default to 'config.ini')
parser = argparse.ArgumentParser(description='Create players for past '
                                             'DF Game Club sessions')
parser.add_argument('config', nargs='?', default='config.ini',
                    help='configuration file with past session data')
namespace = parser.parse_args()

# Read configuration file
conf = configparser.ConfigParser(interpolation=None)
conf.read(namespace.config)

# For each configured session, fetch the raw chat log from Pastebin, let
# UnicodeDammit pick an encoding, and split it into lines.
for section in conf.sections():
    pastebin_url = PASTEBIN_RAW_PREFIX + \
                    urlparse(conf[section]["pastebin_url"]).path.strip('/')
    log = UnicodeDammit(urlopen(pastebin_url).read()).unicode_markup
    log = log.replace('\r', '').split('\n')

    regex = re.compile(conf[section]['regex'])
    timestamp_format = conf[section]['timestamp_format']
    video_timestamp = conf[section]['video_timestamp']

    twitch_url = urlparse(conf[section]['twitch_url'])\
        .path.strip('/').split('/')
    channel, archive_id = twitch_url[0], twitch_url[2]

    ignore_lines = [
        int(num) for num in [
            item for item in
                conf[section]['ignore_lines'].replace(' ','').split(',')
            if item != ''
        ]