def html_adhoc_fetcher(url):
    html = None
    for attempt in range(5):
        opener = urllib2.build_opener()
        TIME_OUT = 5
        try:
            html = opener.open(str(url), timeout=TIME_OUT).read()
            break
        except Exception as e:
            print('[WARN] Cannot access url, try number is...', e, attempt, url, mp.current_process())
            continue
    #print "b"
    if html is None:
        return None
    line = html.replace('\n', '^A^B^C')
    line = regex.sub('<!--.*?-->', '',  line)
    line = regex.sub('<style.*?/style>', '',  line)
    html = regex.sub('<script.*?/script>', '', line ).replace('^A^B^C', ' ')
 
    #print "c"
    soup = bs4.BeautifulSoup(html, "html.parser")
    title = (lambda x:unicode(x.string) if x != None else 'Untitled')( soup.title )
    contents0_text = (lambda x:x.text.encode('utf-8') if x != None else "" )( soup.find('div', {'class': 'ui-section-body'}) )
    #contents0_text = "dummy"
    links = set([a['href'] for a in soup.find_all('a', href=True)])
    return title, contents0_text, links
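
# A minimal usage sketch (not part of the original source). The fetcher needs
# urllib2, regex, bs4 and multiprocessing (as mp) imported, and returns None
# when all five attempts fail.
result = html_adhoc_fetcher('http://example.com/')
if result is not None:
    title, body_text, links = result
    print(title)
    print(len(links), 'links found')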
Example #2
    def remove_article(self, text):
        for art in self.articles:
            text = re.sub(r'\s*\b%s\b\s*' % art, ' ', text)

        text = re.sub(r'\bdel\b', 'de', text)
        text = re.sub(r'^\s*es\b\s*', '', text)
        return text.strip()
Example #3
def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they cannot be mistaken for a quotation
    marker. Splits a line in two if the splitter pattern is preceded by some
    text on the same line (done only for the 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body
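
# A hedged usage sketch. RE_LINK and RE_ON_DATE_SMB_WROTE are module-level
# patterns in the original library; the stand-ins below are hypothetical
# simplifications, only there to make the example self-contained.
import re

RE_LINK = re.compile(r'<(https?://[^>]+)>')
RE_ON_DATE_SMB_WROTE = re.compile(r'On \w+ \d+, \d{4},? .*? wrote:')

body = "See <http://example.com> for details. Thanks On Jan 1, 2020, Bob wrote:\n> earlier message"
print(preprocess(body, '\n'))
# The link becomes @@http://example.com@@ so its '>' cannot be mistaken for a
# quotation marker, and the 'On ... wrote:' splitter is pushed onto its own line.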
Example #4
def clean_tweet_text(tweet_text):
    tweet_text = tweet_text.lower()
    # \p{P} (Unicode punctuation) needs the regex module, not the stdlib re
    tweet_text = regex.sub(r"\p{P}+", "", tweet_text)
    tweet_text = re.sub(r"[^a-zA-Z\s]", "", tweet_text)
    # keep only printable ASCII characters
    tweet_text = ''.join(c for c in tweet_text if c in string.printable)
    return tweet_text
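
# A quick illustration on a hypothetical tweet (assumes regex, re and string
# are imported):
print(clean_tweet_text("I love Python!!! 🙂"))
# -> 'i love python '  (punctuation and the non-ASCII emoji are stripped)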
Example #5
def expand_parens(string, parens="()", include_spaces=False, substitute_string=''):
    output = []
    open_paren = re.escape(parens[0])
    close_paren = re.escape(parens[1])
    substitute_string = re.escape(substitute_string)
    in_string = re.sub(open_paren + substitute_string, parens[0], string)
    in_string = re.sub(substitute_string + close_paren, parens[1], in_string)

    if include_spaces:
        regex1 = regex2 = re.compile(r'(^.*)' + open_paren + r'(.+)' + close_paren + r'(.*$)')
    else:
        regex1 = re.compile(r'(^.*\S)' + open_paren + r'(\S+)' + close_paren + r'(.*$)')
        regex2 = re.compile(r'(^.*)' + open_paren + r'(\S+)' + close_paren + r'(\S.*$)')

    re_match1 = regex1.search(in_string)
    re_match2 = regex2.search(in_string)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [string]

    output = [clean_str(without), clean_str(within)]

    return output
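
# A worked sketch of the two modes. clean_str is an external helper in the
# original code; the stand-in below is hypothetical and only collapses whitespace.
import re

def clean_str(s):
    return re.sub(r'\s{2,}', ' ', s).strip()

print(expand_parens("dog(s)"))
# ['dog', 'dogs']
print(expand_parens("he went (quickly) home", include_spaces=True))
# ['he went home', 'he went quickly home']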
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
    return links
Example #7
def clean_text(text):
    clear_text_regexp = re.compile(r'(?u)\w+|[,.!?]')
    text_ = " ".join(clear_text_regexp.findall(text)).replace(" .", ".").replace(" ,", ",")
    text_ = re.sub("[,]+", ",", text_)
    text_ = re.sub("[.]+", ".", text_)
    text_ = re.sub(r"\s+", " ", text_)
    return text_
Example #8
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
def main():

	transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()

	# Save lemma to translations found
	found_translist = {}

	try:
		while (True):

			scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)

			input_phrase = input("Enter Search Phrase>  ")

			if re.sub(" ", "", re.sub("q", "", input_phrase)) == "" or re.sub(" ", "", re.sub("quit", "", input_phrase)) == "":
				exit(0)

			if (valid_search(input_phrase)):
				
				search = search_phrase(input_phrase, "Latin")

				# Find all the translations of the given words
				for i in range(search.search_len):
					search.has_translation[i] = trn.get_translation(search.text[i], transDict, found_translist)
		
				xls.try_all_search_combos(search, scoreKeeper, Greek_word_num, Greek_search_dict, Greek_text)

				print(scoreKeeper)

			else:
				print('Please enter a valid string\n')

	except KeyboardInterrupt:
		print('\nProgram Terminated\n')
		sys.exit(0)
Example #10
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be untouched and
    correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub('[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub('[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                  alg.edit_distance(key, sugg))])
    return facsimile
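
# A hedged usage sketch, assuming a nidaba.tei.TEIFacsimile object has already
# been built elsewhere; both dictionary paths below are placeholders.
corrected = tei_spellcheck(facsimile,
                           dictionary='/path/to/dictionary.txt',
                           deletion_dictionary='/path/to/deletions.txt',
                           filter_punctuation=True)
# Corrections are inserted into the document as TEI choice tags,
# sorted by edit distance from the original segment text.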
def parse_text(element):
    n = (element.attrib["_note"])
    n = re.sub(r'[/]', '<br>', n)
    n = re.sub(r'[(]', '<em><small>', n)
    n = re.sub(r'[)]', '</small></em>', n)
    prayer = n.strip().splitlines()
    return prayer
Example #12
def normalize_newlines(string):
    out = string.strip()
    out = re.sub(r'\r\n', '\n', out)
    out = re.sub(r'\n{3,}', '\n\n', out)
    out = re.sub(r'\n\s*\n', '\n\n', out)
    out = re.sub(r'"$', '" ', out)
    return out
Example #13
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCClassNumbers, the start and
    end of the range.
    """
    string = string.encode("ascii","replace")
    string = string.replace("(","")
    string = string.replace(")","")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string.replace("A-Z","")

    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))

    parts = string.split("-")
    if re.search(r"^\d",parts[1]):
        header = re.sub("^([A-Z]+).*",r"\1",parts[0])
    elif re.search(r"^\.",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*",r"\1",parts[0])
    elif re.search(r"^[A-Z]",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*",r"\1.",parts[0])            
    else:
        header = " "

    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
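
# A worked sketch of the range expansion. LCCallNumber comes from the
# surrounding module; the comments show the call numbers it is built with.
start, end = lcc_range("QA76.5-76.95")
# parts[1] starts with a digit, so the 'QA' header is copied over:
# start == LCCallNumber("QA76.5"), end == LCCallNumber("QA76.95")

start, end = lcc_range("QA76.5")
# no '-' in the string: "a range of self length", both ends are "QA76.5"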
Example #14
def fix_broken_paragraphs(in_bytes):   
    out = in_bytes
    out = regex.sub(rb'''(?<=\p{lower}\s*)</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'''(?<=\p{lower}\s*)
                        <p[^>]*>(?=\s*\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    # Deal with a wrong paragraph break on a hyphenated word
    # (v.ugly)
    out = regex.sub(rb'''(?<=\p{lower})-</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b'',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'(?<=\p{lower})-<p[^>]*>(?=\s*\p{lower})',
                        b'',
                        out, flags=regex.I)
    return out
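
# A small illustration (hypothetical HTML) of the first rule, which re-joins a
# paragraph that was split in the middle of a sentence:
broken = b'<p>The quick brown fox jumps over the lazy</p>\n<p>dog and runs away.</p>'
print(fix_broken_paragraphs(broken))
# b'<p>The quick brown fox jumps over the lazy dog and runs away.</p>'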
def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix /4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix /4):]) + '.ip6.arpa.'

    return revip
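
# A hedged usage sketch. expand_ip, is_ip4 and is_ip6 are helpers defined
# elsewhere in the original module; the definitions below are simplified,
# hypothetical stand-ins that are only good enough for this IPv4 demo.
import regex

is_ip4 = regex.compile(r'^\d+\.\d+\.\d+\.\d+$')
is_ip6 = regex.compile(r'^[0-9a-fA-F:]+$')

def expand_ip(ip):
    return ip  # the real helper expands abbreviated IPv6 addresses

print(rev_ip('192.0.2.1'))      # 1.2.0.192.in-addr.arpa.
print(rev_ip('192.0.2.0/24'))   # 2.0.192.in-addr.arpa.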
Example #16
def cleanTweet(tweet, query_term):
    """
    """
    new_string = ''
    for i in tweet.split(): # remove urls, hashtag characters, and full @username mentions
        s, n, p, pa, q, f = urlparse.urlparse(i)
        if s and n:
            pass
        elif i[:1] == '@':
            pass
        elif i[:1] == '#':
            new_string = new_string.strip() + ' ' + i[1:]
        else:
            new_string = new_string.strip() + ' ' + i

    new_string = re.sub("[^A-Za-z']+", ' ', new_string) # aggressive: removes everything non-alphabetic (works only for Latin-based scripts, maybe only English)
    new_string = new_string.replace(" amp ", " ") # remove html code for ampersands (&amp;)
    new_string = new_string.lower() # lowercase entire tweet
    new_string = re.sub(r'(.)\1+', r'\1\1', new_string) # reduce any character repetition of > 2 to 2
    new_string = new_string.replace(query_term, " ") # take the original value used to collect tweets as a system argument, and remove it from tweets
    new_string = re.sub(r'(?<!\S)\S{1}(?!\S)', '', new_string)
    new_string = ' '.join(new_string.split()) # remove additional spaces

    return new_string
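
# A worked example on a hypothetical tweet. It needs Python 2's urlparse
# module; the second argument is the lowercase query term used to collect
# the tweets.
print(cleanTweet("Check this http://t.co/abc #wow @bob I loooove it!!", "wow"))
# -> 'check this loove it'
# The URL and @mention are dropped, the hashtag keeps its word (later removed
# as the query term), repeated characters collapse to two, and single-letter
# words disappear.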
Example #17
    def writeout(self, igraph, out):
        
        char = chr(int(igraph['code'], 16))
        if char not in self.existing or char in self.seen:
            return

        definition = igraph.get('kDefinition', '')
        definition = regex.sub(r' U\+\w+', '', definition)

        phon = set()
        mn = igraph.get('kMandarin', None)
        hu = igraph.get('kHanyuPinlu', None)
        hn = igraph.get('kHanyuPinyin', None)
        if hn:
            hn = regex.sub(r'\d+\.\d+:', '', hn)
        if hu:
            hu = regex.sub(r'\(\d+\)', '', hu)
        for p in [mn, hu, hn]:
            if p:
                phon.update(regex.split(r'[, ]+', p))
        phon = ",".join(sorted(phon))

        if not phon:
            return
        
        if not self.first:
            out.write(',\n')
        else:
            self.first = False
        out.write('\'{}\': {}'.format(char, [phon, definition]))
Example #18
def normalize_number(number, country_code):
    """
    Normalizes the passed-in number to digits only; some backends prepend '+' and
    users sometimes type dashes or parentheses in the console.
    :param number: the number, e.g. "0783835665"
    :param country_code: the 2-letter country code, e.g. "RW"
    :return: a tuple of the normalized number and whether it looks like a possible full international number
    """
    # if the number ends with e11, then that is Excel corrupting it, remove it
    if number.lower().endswith("e+11") or number.lower().endswith("e+12"):
        number = number[0:-4].replace('.', '')

    # remove other characters
    number = regex.sub(r'[^0-9a-z\+]', '', number.lower(), flags=regex.V0)

    # add on a plus if it looks like it could be a fully qualified number
    if len(number) >= 11 and number[0] != '+':
        number = '+' + number

    try:
        normalized = phonenumbers.parse(number, str(country_code) if country_code else None)

        # now does it look plausible?
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164), True
    except Exception:
        pass

    # this must be a local number of some kind, just lowercase and save
    return regex.sub('[^0-9a-z]', '', number.lower(), flags=regex.V0), False
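
# A hedged usage sketch (requires the regex and phonenumbers packages; the
# expected outputs assume phonenumbers' metadata for Rwanda):
print(normalize_number("0788383383", "RW"))       # ('+250788383383', True)
print(normalize_number("(078) 83-83-383", "RW"))  # same result - punctuation is stripped before parsing
print(normalize_number("12345", None))            # ('12345', False) - kept as a local number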
Example #19
    def _clean_word(self, word):
        """
        Preprocess words after tokenizing words from sentences.

        - Remove apostrophes ['s, s'].
        - Bring to lowercase.
        - Remove punctuation.
        - Remove English words from non-English corpus data.
        """
        if self.language == "english":
            pattern = r"((\p{P}+)|(\p{S}+)|([0-9]+))"
        else:
            pattern = r"((\p{P}+)|(\p{S}+)|([0-9]+)|([A-Za-z]))"
        # Handle apostrophes correctly: you'll => you
        selected_word = re.match(pattern=u"(.*)['’].*?", string=word)
        # If the word contains an apostrophe, keep only the part before it
        if selected_word is not None:
            word = selected_word.groups()[0]
        # Handle pair words: ice-cream => ice cream
        word = re.sub(pattern="-", repl=' ', string=word)
        # \p{P}/\p{S} character classes require the regex module, not re
        return regex.sub(pattern, '', word.lower()).strip().split()
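
# A worked sketch of the expected behaviour for an English-language instance
# (the `cleaner` object is hypothetical):
print(cleaner._clean_word("You'll"))      # ['you']
print(cleaner._clean_word("ice-cream!"))  # ['ice', 'cream']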
Example #20
    def transform(self, text):
        for pattern, replace in self.pattern_replace_pair_list:
            try:
                text = regex.sub(pattern, replace, text)
            except:
                pass
        return regex.sub(r"\s+", " ", text).strip()
Example #21
 def test_post(title, body, user_name, site, is_answer, body_is_summary):
     result = []
     for rule in FindSpam.rules:
         body_to_check = body
         if rule['stripcodeblocks']:
             body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
             body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
         if rule['all'] != (site in rule['sites']):
             matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
             matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
             matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
             if matched_title and rule['title']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "title"))
             if matched_username and rule['username']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "username"))
             if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                 type_of_post = "answer" if is_answer else "body"
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                         result.append(rule['reason'].replace("{}", type_of_post))
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", type_of_post))
     return result
Example #22
    def wptexturize(self, text):
        # Transform into regexp sub-expression used in _wptexturize_pushpop_element
        # Must do this every time in case plugins use these filters in a context sensitive manner
        no_texturize_tags = '(' + '|'.join(self.default_no_texturize_tags) + ')'
        no_texturize_shortcodes = '(' + '|'.join(self.default_no_texturize_shortcodes) + ')'

        no_texturize_tags_stack = []
        no_texturize_shortcodes_stack = []

        # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's greediness), I modified the regular expression accordingly
        textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

        result = []
        for curl in textarr:
            if len(curl) == 0:
                continue

            # Only call _wptexturize_pushpop_element if first char is correct tag opening
            first = curl[0]
            if '<' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_tags_stack, no_texturize_tags, '<', '>')
            elif '[' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_shortcodes_stack, no_texturize_shortcodes, '[', ']')
            elif len(no_texturize_shortcodes_stack) == 0 and len(no_texturize_tags_stack) == 0:
                # This is not a tag, nor is the texturization disabled static strings
                for search, replacement in self.static:
                    curl = curl.replace(search, replacement)
                # regular expressions
                for search, replacement in self.dynamic:
                    curl = regex.sub(search, replacement, curl)
            curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
            result.append(curl)
        return ''.join(result)
    def parse_implied_depth(self, element):
        ja_depth_pattern = ur"\[(\d)\]$"
        ja_sections_pattern = ur"\[(.*)\]$"
        title_str = element.get('text').strip()

        depth_match = re.search(ja_depth_pattern, title_str)
        if depth_match:
            depth = int(depth_match.group(1))
            placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
            element.set('text', re.sub(ja_depth_pattern, "", title_str))
            return {'section_names': placeholder_sections[(-1 * depth):], 'address_types' : ['Integer'] * depth}

        sections_match = re.search(ja_sections_pattern, title_str)
        if sections_match:
            sections = [s.strip() for s in sections_match.group(1).split(",")]
            element.set('text', re.sub(ja_sections_pattern, "", title_str))
            section_names = []
            address_types = []
            for s in sections:
                tpl = s.split(":")
                section_names.append(tpl[0])
                address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')

            return {'section_names': section_names, 'address_types' : address_types}
        else:
            return None
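
# A worked sketch (hypothetical title and parser instance; the ur'' literals
# above make this Python 2 code) showing how the [d] depth marker is parsed
# out of a schema node built with lxml:
from lxml import etree

el = etree.Element('schema')
el.set('text', 'Some Title [2]')
print(parser.parse_implied_depth(el))
# {'section_names': ['Section', 'Paragraph'], 'address_types': ['Integer', 'Integer']}
print(el.get('text'))  # 'Some Title ' - the depth marker has been stripped off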
Example #24
    def after(self):
        order = [u'Roman', u'Wrong?', u'Okay?', u'Other']
        for key, data in sorted(self.content_mixed_cyrl_latn_extra.items(), key=lambda x: order.index(x[0])):
            content = u"""== Описание ==
Здесь представлены статьи, в которых присутствует смесь кириллицы и латиницы в содержимом.

Обсудить можно '''[[Обсуждение Викисловаря:Отчёты|здесь]]'''.

== Список результатов ==
"""
            items = sorted(data.items(), key=lambda x: x[0])
            for title, sub_items in items:
                content += u"# [[{0}]]\n".format(title)
                for value in sub_items:
                    value = \
                        regex.sub(u'(\p{IsLatin}+)',
                               u'<span style="background-color: #FFD0D0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    value = \
                        regex.sub(u'(\p{IsCyrillic}+)',
                               u'<span style="background-color: #D0FFD0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    content += u'#* <code>{}</code>\n'.format(value.replace('\n', ' ').strip())
            title = u'Ошибки/Содержимое/Ошибки/Смесь кириллицы и латиницы/Однобуквенные случаи/{}'.format(key)
            count = len(data)
            self.process_report(title, content, count)
        super(ContentMixedCyrlLatnExtra, self).after()
Example #25
    def remove_article(self, text):
        for art in self.articles:
            text = re.sub("^\s*\m%s\M\s*" % art, " ", text)

        text = re.sub("\s*\mο\M", "", text)
        text = re.sub("\s*\mείναι\M", "", text)
        return text.strip()
Example #26
def clean_name(name):
    """
    Cleans a show/movie name for searching.

    :param name: release name
    :return: cleaned name
    """

    name = unicodedata.normalize('NFKD', name)

    name = regex.sub('[._\-]', ' ', name)
    name = regex.sub('[\':!"#*’,()?]', '', name)
    name = regex.sub('\s{2,}', ' ', name)
    name = regex.sub('\[.*?\]', '', name)

    replace_chars = {
        '$': 's',
        '&': 'and',
        'ß': 'ss'
    }

    for k, v in replace_chars.items():
        name = name.replace(k, v)

    name = CLEANING_REGEX.sub('', name)

    return name.lower()
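
# A hedged usage sketch. CLEANING_REGEX is a module-level pattern in the
# original source; the stand-in below is hypothetical and just strips some
# common release tags.
import regex

CLEANING_REGEX = regex.compile(r'\b(720p|1080p|x264|bluray|web[- ]?dl)\b', regex.I)

print(clean_name("Mr._Robot_(2015)_720p"))
# 'mr robot 2015 '  - separators, punctuation and the release tag are stripped, result lowercased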
Example #27
def main():

    args = parser.parse_args()

    tt = TinyTokenizer()

    for line in open(args.infile):
        line=line.strip()

        out = tt.tokenize(line)
        outline = " ".join(out)
        try:
            assert(str(regex.sub(r"\s","",line))==str(regex.sub("\s","",outline)))
            if args.conll:
                for w in out:
                    print(w)
                print()
            else:
                print(outline)
            
        except:
            print("==== CHECK FILE! ====",  args.infile, file=sys.stderr)
            print("+"*20, file=sys.stderr)
            print("in:  >>{}<<".format(line), file=sys.stderr)
            print("out: >>{}<<".format(outline), file=sys.stderr)     
            print(str(regex.sub(r"\s","",line)), file=sys.stderr)
            print(str(regex.sub(r"\s","",outline)), file=sys.stderr)
Example #28
def fix_hyphens(word):
    for i in range(0, 2):
        word = regex.sub(r'-({})({})'.format(cons, cons), r'\1-\2', word, flags=regex.I)
        word = regex.sub(r'([kgcjḍṭdtpb])-(h{})'.format(vowel_pattern), r'\1\2-', word, flags=regex.I)
    word = regex.sub(r'^(\p{alpha}{0,3})-', r'\1', word)
    word = regex.sub(r'-(\p{alpha}{0,3})$', r'\1', word)
    return word
def parse_text(element):
    n = element.attrib["_note"]
    n = re.sub(r"[/]", "<br>", n)
    n = re.sub(r"[(]", "<em><small>", n)
    n = re.sub(r"[)]", "</small></em>", n)
    prayer = n.strip().splitlines()
    return prayer
Example #30
    def normalize(self, s):
        s = re.sub(":", "", s)       # remove subtitle colon
        s = re.sub("-", "", s)       # remove subtitle dash
        s = re.sub("  ", " ", s)     # collapse double spaces
        s = re.sub("^The ", "", s)   # remove prefix "The "
        s = re.sub(", The$", "", s)  # remove suffix ", The"
        return s
Example #31
def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
Example #32
def on_msg(msg, client):
    global _room_roles

    if not isinstance(msg, events.MessagePosted) and not isinstance(
            msg, events.MessageEdited):
        return

    message = msg.message
    room_ident = (client.host, message.room.id)
    room_data = _rooms[room_ident]

    if message.owner.id == client._br.user_id:
        if 'direct' in _room_roles and room_ident in _room_roles['direct']:
            SocketScience.receive(
                message.content_source.replace("\u200B",
                                               "").replace("\u200C", ""))

        return

    if message.content.startswith("<div class='partial'>"):
        message.content = message.content[21:]
        if message.content.endswith("</div>"):
            message.content = message.content[:-6]

    if message.parent:
        try:
            if message.parent.owner.id == client._br.user_id:
                strip_mention = regex.sub(
                    "^(<span class=(\"|')mention(\"|')>)?@.*?(</span>)? ", "",
                    message.content)
                cmd = GlobalVars.parser.unescape(strip_mention)

                result = dispatch_reply_command(message.parent, message, cmd)

                if result:
                    s = ":{}\n{}" if "\n" not in result and len(
                        result) >= 488 else ":{} {}"
                    _msg_queue.put((room_data, s.format(message.id,
                                                        result), None))
        except ValueError:
            pass
    elif message.content.lower().startswith("sd "):
        result = dispatch_shorthand_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif message.content.startswith("!!/"):
        result = dispatch_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif classes.feedback.FEEDBACK_REGEX.search(message.content) \
            and is_privileged(message.owner, message.room) and datahandling.last_feedbacked:
        ids, expires_in = datahandling.last_feedbacked

        if time.time() < expires_in:
            Tasks.do(metasmoke.Metasmoke.post_auto_comment,
                     message.content_source,
                     message.owner,
                     ids=ids)
    elif 'direct' in _room_roles and room_ident in _room_roles['direct']:
        SocketScience.receive(
            message.content_source.replace("\u200B", "").replace("\u200C", ""))
!unzip amazon-reviews-unlocked-mobile-phones.zip

#saving the data
with open("Amazon_Unlocked_Mobile.csv") as csv_file:
  csv_reader = csv.reader(csv_file)
  colnames = next(csv_reader)
  data = list(csv_reader)

#printing one sample point to see how the data is stored
print(random.sample(data,1))
#each entry consists of product name, brand, price (string), rating (in string), review, review votes.

"""# **Data Cleaning and Preprocessing - Overall Sentiment**"""

x=re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,.&:-]"," ","matu6738,at&t,,3"))
print(x)

x.split()

#extracting initial reviews and ratings from the original data 
initial_reviews = []
ratings = []
review_vote = []  #could be useful later 

for x in data:
  ratings.append(int(x[3]))
  initial_reviews.append(re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,&.:-]"," ",x[4].lower())))
  review_vote.append(x[5])

clean_vote = []
Example #34
def number_to_substring(text, latex=False):
    return regex.sub("(\d*\.?\d+)", r'_\1', text) if latex else regex.sub(
        "(\d*\.?\d+)", r'<sub>\1</sub>', text)