def html_adhoc_fetcher(url):
    html = None
    for attempt in range(5):
        opener = urllib2.build_opener()
        TIME_OUT = 5
        try:
            html = opener.open(str(url), timeout=TIME_OUT).read()
            break
        except Exception as e:
            print('[WARN] Cannot access url, try number is...', e, attempt, url, mp.current_process())
            continue
    #print "b"
    if html is None:
        return None
    line = html.replace('\n', '^A^B^C')
    line = regex.sub('<!--.*?-->', '',  line)
    line = regex.sub('<style.*?/style>', '',  line)
    html = regex.sub('<script.*?/script>', '', line ).replace('^A^B^C', ' ')
 
    #print "c"
    soup = bs4.BeautifulSoup(html, "html.parser")
    title = (lambda x:unicode(x.string) if x != None else 'Untitled')( soup.title )
    contents0_text = (lambda x:x.text.encode('utf-8') if x != None else "" )( soup.find('div', {'class': 'ui-section-body'}) )
    #contents0_text = "dummy"
    links = set([a['href'] for a in soup.find_all('a', href=True)])
    return title, contents0_text, links
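
# A minimal usage sketch (not part of the original source). The fetcher needs
# urllib2, regex, bs4 and multiprocessing (as mp) imported, and returns None
# when all five attempts fail.
result = html_adhoc_fetcher('http://example.com/')
if result is not None:
    title, body_text, links = result
    print(title)
    print(len(links), 'links found')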
Example #2
    def remove_article(self, text):
        for art in self.articles:
            text = re.sub(r'\s*\b%s\b\s*' % art, ' ', text)

        text = re.sub(r'\bdel\b', 'de', text)
        text = re.sub(r'^\s*es\b\s*', '', text)
        return text.strip()
Example #3
def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they cannot be mistaken for a quotation
    marker. Splits a line in two if the splitter pattern is preceded by some
    text on the same line (done only for the 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body
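
# A hedged usage sketch. RE_LINK and RE_ON_DATE_SMB_WROTE are module-level
# patterns in the original library; the stand-ins below are hypothetical
# simplifications, only there to make the example self-contained.
import re

RE_LINK = re.compile(r'<(https?://[^>]+)>')
RE_ON_DATE_SMB_WROTE = re.compile(r'On \w+ \d+, \d{4},? .*? wrote:')

body = "See <http://example.com> for details. Thanks On Jan 1, 2020, Bob wrote:\n> earlier message"
print(preprocess(body, '\n'))
# The link becomes @@http://example.com@@ so its '>' cannot be mistaken for a
# quotation marker, and the 'On ... wrote:' splitter is pushed onto its own line.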
Example #4
def clean_tweet_text(tweet_text):
    tweet_text = tweet_text.lower()
    # \p{P} (Unicode punctuation) needs the regex module, not the stdlib re
    tweet_text = regex.sub(r"\p{P}+", "", tweet_text)
    tweet_text = re.sub(r"[^a-zA-Z\s]", "", tweet_text)
    # keep only printable ASCII characters
    tweet_text = ''.join(c for c in tweet_text if c in string.printable)
    return tweet_text
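
# A quick illustration on a hypothetical tweet (assumes regex, re and string
# are imported):
print(clean_tweet_text("I love Python!!! 🙂"))
# -> 'i love python '  (punctuation and the non-ASCII emoji are stripped)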
Example #5
def expand_parens(string, parens="()", include_spaces=False, substitute_string=''):
    output = []
    open_paren = re.escape(parens[0])
    close_paren = re.escape(parens[1])
    substitute_string = re.escape(substitute_string)
    in_string = re.sub(open_paren + substitute_string, parens[0], string)
    in_string = re.sub(substitute_string + close_paren, parens[1], in_string)

    if include_spaces:
        regex1 = regex2 = re.compile(r'(^.*)' + open_paren + r'(.+)' + close_paren + r'(.*$)')
    else:
        regex1 = re.compile(r'(^.*\S)' + open_paren + r'(\S+)' + close_paren + r'(.*$)')
        regex2 = re.compile(r'(^.*)' + open_paren + r'(\S+)' + close_paren + r'(\S.*$)')

    re_match1 = regex1.search(in_string)
    re_match2 = regex2.search(in_string)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [string]

    output = [clean_str(without), clean_str(within)]

    return output
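
# A worked sketch of the two modes. clean_str is an external helper in the
# original code; the stand-in below is hypothetical and only collapses whitespace.
import re

def clean_str(s):
    return re.sub(r'\s{2,}', ' ', s).strip()

print(expand_parens("dog(s)"))
# ['dog', 'dogs']
print(expand_parens("he went (quickly) home", include_spaces=True))
# ['he went home', 'he went quickly home']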
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
    return links
Example #7
def clean_text(text):
    clear_text_regexp = re.compile(r'(?u)\w+|[,.!?]')
    text_ = " ".join(clear_text_regexp.findall(text)).replace(" .", ".").replace(" ,", ",")
    text_ = re.sub("[,]+", ",", text_)
    text_ = re.sub("[.]+", ".", text_)
    text_ = re.sub(r"\s+", " ", text_)
    return text_
Example #8
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
def main():

	transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()

	# Save lemma to translations found
	found_translist = {}

	try:
		while (True):

			scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)

			input_phrase = input("Enter Search Phrase>  ")

			if re.sub(" ", "", re.sub("q", "", input_phrase)) == "" or re.sub(" ", "", re.sub("quit", "", input_phrase)) == "":
				exit(0)

			if (valid_search(input_phrase)):
				
				search = search_phrase(input_phrase, "Latin")

				# Find all the translations of the given words
				for i in range(search.search_len):
					search.has_translation[i] = trn.get_translation(search.text[i], transDict, found_translist)
		
				xls.try_all_search_combos(search, scoreKeeper, Greek_word_num, Greek_search_dict, Greek_text)

				print(scoreKeeper)

			else:
				print('Please enter a valid string\n')

	except KeyboardInterrupt:
		print('\nProgram Terminated\n')
		sys.exit(0)
Example #10
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be untouched and
    correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub('[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub('[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                  alg.edit_distance(key, sugg))])
    return facsimile
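
# A hedged usage sketch, assuming a nidaba.tei.TEIFacsimile object has already
# been built elsewhere; both dictionary paths below are placeholders.
corrected = tei_spellcheck(facsimile,
                           dictionary='/path/to/dictionary.txt',
                           deletion_dictionary='/path/to/deletions.txt',
                           filter_punctuation=True)
# Corrections are inserted into the document as TEI choice tags,
# sorted by edit distance from the original segment text.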
def parse_text(element):
    n = (element.attrib["_note"])
    n = re.sub(r'[/]', '<br>', n)
    n = re.sub(r'[(]', '<em><small>', n)
    n = re.sub(r'[)]', '</small></em>', n)
    prayer = n.strip().splitlines()
    return prayer
Example #12
def normalize_newlines(string):
    out = string.strip()
    out = re.sub(r'\r\n', '\n', out)
    out = re.sub(r'\n{3,}', '\n\n', out)
    out = re.sub(r'\n\s*\n', '\n\n', out)
    out = re.sub(r'"$', '" ', out)
    return out
Example #13
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCClassNumbers, the start and
    end of the range.
    """
    string = string.encode("ascii","replace")
    string = string.replace("(","")
    string = string.replace(")","")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string.replace("A-Z","")

    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))

    parts = string.split("-")
    if re.search(r"^\d",parts[1]):
        header = re.sub("^([A-Z]+).*",r"\1",parts[0])
    elif re.search(r"^\.",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*",r"\1",parts[0])
    elif re.search(r"^[A-Z]",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*",r"\1.",parts[0])            
    else:
        header = " "

    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
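
# A worked sketch of the range expansion. LCCallNumber comes from the
# surrounding module; the comments show the call numbers it is built with.
start, end = lcc_range("QA76.5-76.95")
# parts[1] starts with a digit, so the 'QA' header is copied over:
# start == LCCallNumber("QA76.5"), end == LCCallNumber("QA76.95")

start, end = lcc_range("QA76.5")
# no '-' in the string: "a range of self length", both ends are "QA76.5"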
Example #14
def fix_broken_paragraphs(in_bytes):   
    out = in_bytes
    out = regex.sub(rb'''(?<=\p{lower}\s*)</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'''(?<=\p{lower}\s*)
                        <p[^>]*>(?=\s*\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    # Deal with a wrong paragraph break on a hyphenated word
    # (v.ugly)
    out = regex.sub(rb'''(?<=\p{lower})-</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b'',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'(?<=\p{lower})-<p[^>]*>(?=\s*\p{lower})',
                        b'',
                        out, flags=regex.I)
    return out
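
# A small illustration (hypothetical HTML) of the first rule, which re-joins a
# paragraph that was split in the middle of a sentence:
broken = b'<p>The quick brown fox jumps over the lazy</p>\n<p>dog and runs away.</p>'
print(fix_broken_paragraphs(broken))
# b'<p>The quick brown fox jumps over the lazy dog and runs away.</p>'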
def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix /4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix /4):]) + '.ip6.arpa.'

    return revip
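
# A hedged usage sketch. expand_ip, is_ip4 and is_ip6 are helpers defined
# elsewhere in the original module; the definitions below are simplified,
# hypothetical stand-ins that are only good enough for this IPv4 demo.
import regex

is_ip4 = regex.compile(r'^\d+\.\d+\.\d+\.\d+$')
is_ip6 = regex.compile(r'^[0-9a-fA-F:]+$')

def expand_ip(ip):
    return ip  # the real helper expands abbreviated IPv6 addresses

print(rev_ip('192.0.2.1'))      # 1.2.0.192.in-addr.arpa.
print(rev_ip('192.0.2.0/24'))   # 2.0.192.in-addr.arpa.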
Example #16
def cleanTweet(tweet, query_term):
    """
    """
    new_string = ''
    for i in tweet.split(): # remove urls, hashtag characters, and full @username mentions
        s, n, p, pa, q, f = urlparse.urlparse(i)
        if s and n:
            pass
        elif i[:1] == '@':
            pass
        elif i[:1] == '#':
            new_string = new_string.strip() + ' ' + i[1:]
        else:
            new_string = new_string.strip() + ' ' + i

    new_string = re.sub("[^A-Za-z']+", ' ', new_string) # aggressive: removes everything non-alphabetic (works only for Latin-based scripts, maybe only English)
    new_string = new_string.replace(" amp ", " ") # remove html code for ampersands (&amp;)
    new_string = new_string.lower() # lowercase entire tweet
    new_string = re.sub(r'(.)\1+', r'\1\1', new_string) # reduce any character repetition of > 2 to 2
    new_string = new_string.replace(query_term, " ") # take the original value used to collect tweets as a system argument, and remove it from tweets
    new_string = re.sub(r'(?<!\S)\S{1}(?!\S)', '', new_string)
    new_string = ' '.join(new_string.split()) # remove additional spaces

    return new_string
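
# A worked example on a hypothetical tweet. It needs Python 2's urlparse
# module; the second argument is the lowercase query term used to collect
# the tweets.
print(cleanTweet("Check this http://t.co/abc #wow @bob I loooove it!!", "wow"))
# -> 'check this loove it'
# The URL and @mention are dropped, the hashtag keeps its word (later removed
# as the query term), repeated characters collapse to two, and single-letter
# words disappear.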
Example #17
    def writeout(self, igraph, out):
        
        char = chr(int(igraph['code'], 16))
        if char not in self.existing or char in self.seen:
            return

        definition = igraph.get('kDefinition', '')
        definition = regex.sub(r' U\+\w+', '', definition)

        phon = set()
        mn = igraph.get('kMandarin', None)
        hu = igraph.get('kHanyuPinlu', None)
        hn = igraph.get('kHanyuPinyin', None)
        if hn:
            hn = regex.sub(r'\d+\.\d+:', '', hn)
        if hu:
            hu = regex.sub(r'\(\d+\)', '', hu)
        for p in [mn, hu, hn]:
            if p:
                phon.update(regex.split(r'[, ]+', p))
        phon = ",".join(sorted(phon))

        if not phon:
            return
        
        if not self.first:
            out.write(',\n')
        else:
            self.first = False
        out.write('\'{}\': {}'.format(char, [phon, definition]))
Example #18
def normalize_number(number, country_code):
    """
    Normalizes the passed-in number to digits only; some backends prepend '+' and
    users sometimes type dashes or parentheses in the console.
    :param number: the number, e.g. "0783835665"
    :param country_code: the 2-letter country code, e.g. "RW"
    :return: a tuple of the normalized number and whether it looks like a possible full international number
    """
    # if the number ends with e11, then that is Excel corrupting it, remove it
    if number.lower().endswith("e+11") or number.lower().endswith("e+12"):
        number = number[0:-4].replace('.', '')

    # remove other characters
    number = regex.sub(r'[^0-9a-z\+]', '', number.lower(), flags=regex.V0)

    # add on a plus if it looks like it could be a fully qualified number
    if len(number) >= 11 and number[0] != '+':
        number = '+' + number

    try:
        normalized = phonenumbers.parse(number, str(country_code) if country_code else None)

        # now does it look plausible?
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164), True
    except Exception:
        pass

    # this must be a local number of some kind, just lowercase and save
    return regex.sub('[^0-9a-z]', '', number.lower(), flags=regex.V0), False
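
# A hedged usage sketch (requires the regex and phonenumbers packages; the
# expected outputs assume phonenumbers' metadata for Rwanda):
print(normalize_number("0788383383", "RW"))       # ('+250788383383', True)
print(normalize_number("(078) 83-83-383", "RW"))  # same result - punctuation is stripped before parsing
print(normalize_number("12345", None))            # ('12345', False) - kept as a local number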
Example #19
    def _clean_word(self, word):
        """
        Preprocess words after tokenizing words from sentences.

        - Remove apostrophes ['s, s'].
        - Bring to lowercase.
        - Remove punctuation.
        - Remove English words from non-English corpus data.
        """
        if self.language == "english":
            pattern = r"((\p{P}+)|(\p{S}+)|([0-9]+))"
        else:
            pattern = r"((\p{P}+)|(\p{S}+)|([0-9]+)|([A-Za-z]))"
        # Handle apostrophes correctly: you'll => you
        selected_word = re.match(pattern=u"(.*)['’].*?", string=word)
        # If the word contains an apostrophe, keep only the part before it
        if selected_word is not None:
            word = selected_word.groups()[0]
        # Handle pair words: ice-cream => ice cream
        word = re.sub(pattern="-", repl=' ', string=word)
        # \p{P}/\p{S} character classes require the regex module, not re
        return regex.sub(pattern, '', word.lower()).strip().split()
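
# A worked sketch of the expected behaviour for an English-language instance
# (the `cleaner` object is hypothetical):
print(cleaner._clean_word("You'll"))      # ['you']
print(cleaner._clean_word("ice-cream!"))  # ['ice', 'cream']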
Example #20
    def transform(self, text):
        for pattern, replace in self.pattern_replace_pair_list:
            try:
                text = regex.sub(pattern, replace, text)
            except:
                pass
        return regex.sub(r"\s+", " ", text).strip()
Example #21
 def test_post(title, body, user_name, site, is_answer, body_is_summary):
     result = []
     for rule in FindSpam.rules:
         body_to_check = body
         if rule['stripcodeblocks']:
             body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
             body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
         if rule['all'] != (site in rule['sites']):
             matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
             matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
             matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
             if matched_title and rule['title']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "title"))
             if matched_username and rule['username']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "username"))
             if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                 type_of_post = "answer" if is_answer else "body"
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                         result.append(rule['reason'].replace("{}", type_of_post))
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", type_of_post))
     return result
Example #22
    def wptexturize(self, text):
        # Transform into regexp sub-expression used in _wptexturize_pushpop_element
        # Must do this every time in case plugins use these filters in a context sensitive manner
        no_texturize_tags = '(' + '|'.join(self.default_no_texturize_tags) + ')'
        no_texturize_shortcodes = '(' + '|'.join(self.default_no_texturize_shortcodes) + ')'

        no_texturize_tags_stack = []
        no_texturize_shortcodes_stack = []

        # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's greediness), I modified the regular expression accordingly
        textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

        result = []
        for curl in textarr:
            if len(curl) == 0:
                continue

            # Only call _wptexturize_pushpop_element if first char is correct tag opening
            first = curl[0]
            if '<' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_tags_stack, no_texturize_tags, '<', '>')
            elif '[' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_shortcodes_stack, no_texturize_shortcodes, '[', ']')
            elif len(no_texturize_shortcodes_stack) == 0 and len(no_texturize_tags_stack) == 0:
                # This is not a tag, nor is the texturization disabled static strings
                for search, replacement in self.static:
                    curl = curl.replace(search, replacement)
                # regular expressions
                for search, replacement in self.dynamic:
                    curl = regex.sub(search, replacement, curl)
            curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
            result.append(curl)
        return ''.join(result)
    def parse_implied_depth(self, element):
        ja_depth_pattern = ur"\[(\d)\]$"
        ja_sections_pattern = ur"\[(.*)\]$"
        title_str = element.get('text').strip()

        depth_match = re.search(ja_depth_pattern, title_str)
        if depth_match:
            depth = int(depth_match.group(1))
            placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
            element.set('text', re.sub(ja_depth_pattern, "", title_str))
            return {'section_names': placeholder_sections[(-1 * depth):], 'address_types' : ['Integer'] * depth}

        sections_match = re.search(ja_sections_pattern, title_str)
        if sections_match:
            sections = [s.strip() for s in sections_match.group(1).split(",")]
            element.set('text', re.sub(ja_sections_pattern, "", title_str))
            section_names = []
            address_types = []
            for s in sections:
                tpl = s.split(":")
                section_names.append(tpl[0])
                address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')

            return {'section_names': section_names, 'address_types' : address_types}
        else:
            return None
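
# A worked sketch (hypothetical title and parser instance; the ur'' literals
# above make this Python 2 code) showing how the [d] depth marker is parsed
# out of a schema node built with lxml:
from lxml import etree

el = etree.Element('schema')
el.set('text', 'Some Title [2]')
print(parser.parse_implied_depth(el))
# {'section_names': ['Section', 'Paragraph'], 'address_types': ['Integer', 'Integer']}
print(el.get('text'))  # 'Some Title ' - the depth marker has been stripped off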
Example #24
    def after(self):
        order = [u'Roman', u'Wrong?', u'Okay?', u'Other']
        for key, data in sorted(self.content_mixed_cyrl_latn_extra.items(), key=lambda x: order.index(x[0])):
            content = u"""== Описание ==
Здесь представлены статьи, в которых присутствует смесь кириллицы и латиницы в содержимом.

Обсудить можно '''[[Обсуждение Викисловаря:Отчёты|здесь]]'''.

== Список результатов ==
"""
            items = sorted(data.items(), key=lambda x: x[0])
            for title, sub_items in items:
                content += u"# [[{0}]]\n".format(title)
                for value in sub_items:
                    value = \
                        regex.sub(u'(\p{IsLatin}+)',
                               u'<span style="background-color: #FFD0D0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    value = \
                        regex.sub(u'(\p{IsCyrillic}+)',
                               u'<span style="background-color: #D0FFD0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    content += u'#* <code>{}</code>\n'.format(value.replace('\n', ' ').strip())
            title = u'Ошибки/Содержимое/Ошибки/Смесь кириллицы и латиницы/Однобуквенные случаи/{}'.format(key)
            count = len(data)
            self.process_report(title, content, count)
        super(ContentMixedCyrlLatnExtra, self).after()
Example #25
    def remove_article(self, text):
        for art in self.articles:
            text = re.sub("^\s*\m%s\M\s*" % art, " ", text)

        text = re.sub("\s*\mο\M", "", text)
        text = re.sub("\s*\mείναι\M", "", text)
        return text.strip()
Example #26
def clean_name(name):
    """
    Cleans a show/movie name for searching.

    :param name: release name
    :return: cleaned name
    """

    name = unicodedata.normalize('NFKD', name)

    name = regex.sub('[._\-]', ' ', name)
    name = regex.sub('[\':!"#*’,()?]', '', name)
    name = regex.sub('\s{2,}', ' ', name)
    name = regex.sub('\[.*?\]', '', name)

    replace_chars = {
        '$': 's',
        '&': 'and',
        'ß': 'ss'
    }

    for k, v in replace_chars.items():
        name = name.replace(k, v)

    name = CLEANING_REGEX.sub('', name)

    return name.lower()
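
# A hedged usage sketch. CLEANING_REGEX is a module-level pattern in the
# original source; the stand-in below is hypothetical and just strips some
# common release tags.
import regex

CLEANING_REGEX = regex.compile(r'\b(720p|1080p|x264|bluray|web[- ]?dl)\b', regex.I)

print(clean_name("Mr._Robot_(2015)_720p"))
# 'mr robot 2015 '  - separators, punctuation and the release tag are stripped, result lowercased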
Example #27
def main():

    args = parser.parse_args()

    tt = TinyTokenizer()

    for line in open(args.infile):
        line=line.strip()

        out = tt.tokenize(line)
        outline = " ".join(out)
        try:
            assert(str(regex.sub(r"\s","",line))==str(regex.sub("\s","",outline)))
            if args.conll:
                for w in out:
                    print(w)
                print()
            else:
                print(outline)
            
        except:
            print("==== CHECK FILE! ====",  args.infile, file=sys.stderr)
            print("+"*20, file=sys.stderr)
            print("in:  >>{}<<".format(line), file=sys.stderr)
            print("out: >>{}<<".format(outline), file=sys.stderr)     
            print(str(regex.sub(r"\s","",line)), file=sys.stderr)
            print(str(regex.sub(r"\s","",outline)), file=sys.stderr)
Example #28
def fix_hyphens(word):
    for i in range(0, 2):
        word = regex.sub(r'-({})({})'.format(cons, cons), r'\1-\2', word, flags=regex.I)
        word = regex.sub(r'([kgcjḍṭdtpb])-(h{})'.format(vowel_pattern), r'\1\2-', word, flags=regex.I)
    word = regex.sub(r'^(\p{alpha}{0,3})-', r'\1', word)
    word = regex.sub(r'-(\p{alpha}{0,3})$', r'\1', word)
    return word
def parse_text(element):
    n = element.attrib["_note"]
    n = re.sub(r"[/]", "<br>", n)
    n = re.sub(r"[(]", "<em><small>", n)
    n = re.sub(r"[)]", "</small></em>", n)
    prayer = n.strip().splitlines()
    return prayer
Example #30
    def normalize(self, s):
        s = re.sub(":", "", s)       # remove subtitle colon
        s = re.sub("-", "", s)       # remove subtitle dash
        s = re.sub("  ", " ", s)     # collapse double spaces
        s = re.sub("^The ", "", s)   # remove prefix "The "
        s = re.sub(", The$", "", s)  # remove suffix ", The"
        return s
Example #31
def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
Example #32
def on_msg(msg, client):
    global _room_roles

    if not isinstance(msg, events.MessagePosted) and not isinstance(
            msg, events.MessageEdited):
        return

    message = msg.message
    room_ident = (client.host, message.room.id)
    room_data = _rooms[room_ident]

    if message.owner.id == client._br.user_id:
        if 'direct' in _room_roles and room_ident in _room_roles['direct']:
            SocketScience.receive(
                message.content_source.replace("\u200B",
                                               "").replace("\u200C", ""))

        return

    if message.content.startswith("<div class='partial'>"):
        message.content = message.content[21:]
        if message.content.endswith("</div>"):
            message.content = message.content[:-6]

    if message.parent:
        try:
            if message.parent.owner.id == client._br.user_id:
                strip_mention = regex.sub(
                    "^(<span class=(\"|')mention(\"|')>)?@.*?(</span>)? ", "",
                    message.content)
                cmd = GlobalVars.parser.unescape(strip_mention)

                result = dispatch_reply_command(message.parent, message, cmd)

                if result:
                    s = ":{}\n{}" if "\n" not in result and len(
                        result) >= 488 else ":{} {}"
                    _msg_queue.put((room_data, s.format(message.id,
                                                        result), None))
        except ValueError:
            pass
    elif message.content.lower().startswith("sd "):
        result = dispatch_shorthand_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif message.content.startswith("!!/"):
        result = dispatch_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif classes.feedback.FEEDBACK_REGEX.search(message.content) \
            and is_privileged(message.owner, message.room) and datahandling.last_feedbacked:
        ids, expires_in = datahandling.last_feedbacked

        if time.time() < expires_in:
            Tasks.do(metasmoke.Metasmoke.post_auto_comment,
                     message.content_source,
                     message.owner,
                     ids=ids)
    elif 'direct' in _room_roles and room_ident in _room_roles['direct']:
        SocketScience.receive(
            message.content_source.replace("\u200B", "").replace("\u200C", ""))
!unzip amazon-reviews-unlocked-mobile-phones.zip

#saving the data
with open("Amazon_Unlocked_Mobile.csv") as csv_file:
  csv_reader = csv.reader(csv_file)
  colnames = next(csv_reader)
  data = list(csv_reader)

#printing one sample point to see how the data is stored
print(random.sample(data,1))
#each entry consists of product name, brand, price (string), rating (in string), review, review votes.

"""# **Data Cleaning and Preprocessing - Overall Sentiment**"""

x=re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,.&:-]"," ","matu6738,at&t,,3"))
print(x)

x.split()

#extracting initial reviews and ratings from the original data 
initial_reviews = []
ratings = []
review_vote = []  #could be useful later 

for x in data:
  ratings.append(int(x[3]))
  initial_reviews.append(re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,&.:-]"," ",x[4].lower())))
  review_vote.append(x[5])

clean_vote = []
Example #34
def number_to_substring(text, latex=False):
    return regex.sub("(\d*\.?\d+)", r'_\1', text) if latex else regex.sub(
        "(\d*\.?\d+)", r'<sub>\1</sub>', text)