def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix / 4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix / 4):]) + '.ip6.arpa.'

    return revip
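
Illustrative calls (a sketch only; it assumes the expand_ip helper and the is_ip4/is_ip6 patterns defined elsewhere in this module behave as the code above expects):

# rev_ip('192.0.2.1')          -> '1.2.0.192.in-addr.arpa.'
# rev_ip('192.0.2.0/24')       -> '2.0.192.in-addr.arpa.'
# rev_ip('192.0.2.0/26', '-')  -> '0-26.2.0.192.in-addr.arpa.'
# An IPv4 prefix that is not a multiple of 8 (or an IPv6 prefix that is not a
# multiple of 4) and no delimiter leaves revip as False.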
Example #2
def getNames(evalLines, locs1, locs2, x):
	"""
	Calls appropriate helper functions to accurately parse name information
	"""		
	locs = locs1 + locs2
	locs = sorted(locs)
	entries = []	
	keys = ['loc','name', 'nameFlag', 'rank','email','tel','fax', 'title']
	for i in locs:
		D= {key: None for key in keys}
		m = None
		D['loc'] = i
		if i in locs1:
			m = regex.split(r'•\s*(.*?):\s*',x[i]) #split line with rank
			D['rank'] = m[1].strip()
		elif i in locs2:
			m = regex.split(r'•\s*(.*?):\s*', ' '.join([x[i-1].strip(), x[i].strip()]), flags=regex.UNICODE) # split the joined previous+current line on the rank marker
			D['rank'] = m[1].strip()
		if len(m[2].strip()) == 0: #if there is only a rank
		 	#assume name is on next line potentially with some other info
			evalLines.append(i+1)
			emtefa(x[i+1], D, x[i+2])
		else:  #name is on the same line
			emtefa(m[2].strip(), D, x[i+1])
		entries.append(D)
	return(evalLines,entries)
Example #3
def main():
	lang="english"
	datapkg = "corpus"
	book = "eng/myBigErrorsList.txt"
	
	data = WP.readBook(WP, datapkg, book)
	#words = regex.split("(\n+)", data.lower())
	words = regex.split("(\n+)", data)
	ng = NG(lang)
	cletter, n = "", 0
	for word in words:
		if "\n" in word:
			cletter += str('\n')
		else:
			for w in regex.split("\W+", word):
				if len(w):
					n +=1
					print("correct(%r) => %r" % (w, ng.correct(w.lower())))
					cletter += str(ng.correct(w) + str(" "))
    
	print("######## Original Txt ########")
	print(data)
	print("######## Txt After Correction ########")
	print(cletter)
	print("################")
Example #4
 def parse_semag(self, str, mass):
     # split = re.split('\s', str.strip())
     reg_book = re.compile(u'ו?(עשין|שם|לאוין)')
     split = re.split(reg_book, str.strip())
     # str_list = filter(None, split)
     str_list = filter(None, [item.strip() for item in split])
     resolveds = []
     # it = iter(str_list)
     derabanan_flag = False
     book = None
     for i, word in enumerate(str_list):
         if derabanan_flag:
             derabanan_flag = False
             resolved = self._tracker.resolve(book, [1])
             resolveds.append(resolved)
             continue
         elif re.search(reg_book, word):
             # book = word
             # if book == u'שם':
             #     book = None
             # elif book == u'לאוין':
             #     book = u'Sefer Mitzvot Gadol, Volume One'
             try:
                 if word != u'שם':
                     derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())])
             except IndexError:
                 # mass.ErrorFile.write('error smg, no place in book notation')
                 mass.error_flag = 'error smg, no place in book notation'
                 print 'error smg, no place in book notation'
                 return
             if word == u'עשין' and len(derabanan) > 1 and (derabanan[0] != u"סימן"):
                 book = re.search(u'[א-ה]',derabanan[1])
                 # print book.group(0)
                 book = self._table[book.group(0)]
                 derabanan_flag = True
             elif re.match(reg_book, word):
                 book = self._table[word]
             else:
                 mass.ErrorFile.write("error smg, don't recognize book name")
                 print "error smg, don't recognize book name", book
                 return
         else:
             mitzva = re.split('\s', word)
             for m in mitzva:
                 # if m == u'סימן':
                 #     continue
                 if m == u'שם':
                     m = None
                 elif getGematriaVav(m):
                     m = getGematriaVav(m)
                 else:
                     m = None
                 resolved = self._tracker.resolve(book, [m])
                 resolveds.append(resolved)
     if not resolveds:
         resolved = self._tracker.resolve(book, [None])
         resolveds.append(resolved)
     # print resolveds
     return resolveds
Example #5
    def parse_semag(self, str, mass):
        reg_book = re.compile(u'ו?ב?(עשין|שם|לאוין|לאין)')
        split = re.split(reg_book, str.strip())
        str_list = filter(None, [item.strip() for item in split])
        resolveds = []
        derabanan_flag = False
        book = None
        reg_siman = u"סי'?|סימן"
        reg_vav = u'ו{}'.format(reg_siman)
        for i, word in enumerate(str_list):
            if derabanan_flag:
                derabanan_flag = False
                # resolved = self._tracker.resolve(book, [1])
                resolved = resolveExceptin(self._tracker, book, [1])
                resolveds.append(resolved)
                continue
            elif re.search(reg_book, word):
                try:
                    if word != u'שם':
                        derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())])
                except IndexError:
                    mass.write_shgia('error smg, no place in book notation')
                    return
                if word == u'עשין' and len(derabanan) > 1:
                    book = re.search(u'[א-ה]',derabanan[1])
                    book = self._table[book.group(0)]
                    derabanan_flag = True
                elif re.match(reg_book, word):
                    book = self._table[word]
                else:
                    mass.write_shgia("error smg, don't recognize book name")
                    return
            else:
                mitzva = re.split('\s', word)
                for m in mitzva:
                    if re.search(reg_vav, m) and not book:
                        # resolved = self._tracker.resolve(book, [None])
                        resolved = resolveExceptin(self._tracker, book, [None])
                        resolveds.append(resolved)

                    if m == u'ו?שם':
                        m = None
                    elif re.search(reg_siman, m):
                        continue
                    elif getGematriaVav(m, mass):
                        m = getGematriaVav(m, mass)
                    else:
                        m = None
                    # resolved = self._tracker.resolve(book, [m])
                    resolved = resolveExceptin(self._tracker, book, [m])
                    resolveds.append(resolved)
        if not resolveds:
            # resolved = self._tracker.resolve(book, [None])
            resolved = resolveExceptin(self._tracker, book, [None])

            resolveds.append(resolved)
        if len([item for item in resolveds if not isinstance(item, Ref)]) > 0:
            mass.write_shgia(u'error from ibid in Ref or table none problem')
        return resolveds
Example #6
    def wptexturize(self, text):
        # Transform into regexp sub-expression used in _wptexturize_pushpop_element
        # Must do this every time in case plugins use these filters in a context sensitive manner
        no_texturize_tags = '(' + '|'.join(self.default_no_texturize_tags) + ')'
        no_texturize_shortcodes = '(' + '|'.join(self.default_no_texturize_shortcodes) + ')'

        no_texturize_tags_stack = []
        no_texturize_shortcodes_stack = []

        # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's greediness), I modified the regular expression accordingly
        textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

        result = []
        for curl in textarr:
            if len(curl) == 0:
                continue

            # Only call _wptexturize_pushpop_element if first char is correct tag opening
            first = curl[0]
            if '<' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_tags_stack, no_texturize_tags, '<', '>')
            elif '[' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_shortcodes_stack, no_texturize_shortcodes, '[', ']')
            elif len(no_texturize_shortcodes_stack) == 0 and len(no_texturize_tags_stack) == 0:
                # This is not a tag, nor is the texturization disabled static strings
                for search, replacement in self.static:
                    curl = curl.replace(search, replacement)
                # regular expressions
                for search, replacement in self.dynamic:
                    curl = regex.sub(search, replacement, curl)
            curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
            result.append(curl)
        return ''.join(result)
Example #7
def siman_smk_exctractor(smk_text):

    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end+1):
                simanim.append(siman)
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
Example #8
File: Completion.py Project: jleahred/miow
 def get_text_completion_list(self):
     # words = self.toPlainText().split(QRegExp("[^a-zA-Z0-9_]"),
     #                           QString.SkipEmptyParts)
     words = regex.split("\W+", unicode(self.toPlainText()), flags=regex.UNICODE)
     word_till_cursor = unicode(self.word_till_cursor())
     word_under_cursor = unicode(self.word_under_cursor())
     # words.removeDuplicates()
     # words.sort()
     words = sorted(set(words))
     completion_list = []
     completion_list_not_start_with = []
     for word in words:
         if (
             word != word_till_cursor
             and word != word_under_cursor
             and word.upper().find(word_till_cursor.upper()) == 0
         ):
             completion_list.append(word)
         elif word != word_till_cursor and len(word) > len(word_till_cursor):
             words_till_cursor = [x for x in word_till_cursor.split("_") if x != "" and len(x) >= 2]
             matches = 0
             for word_tc in words_till_cursor:
                 if word.upper().find(word_tc.upper()) >= 0:
                     matches += 1
             if matches == len(words_till_cursor):
                 completion_list.append(word)
             elif matches * 1.20 >= len(words_till_cursor):
                 completion_list_not_start_with.append(word)
     return (
         super(WithWordCompletionMulty_, self).get_text_completion_list()
         + completion_list
         + completion_list_not_start_with
     )
Example #9
    def split(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs):
        """Wrapper for split."""

        return regex.split(
            _apply_search_backrefs(pattern, flags), string,
            maxsplit, flags, concurrent, **kwargs
        )
Example #10
 def node_volumes(node, volume):
     book = re.split('\s', node.full_title())
     book = book[1]
     if volume == u'א':
         title = u'I ' + book
         return library.get_schema_node(title)
     return library.get_schema_node(u'II ' + book)
Example #11
def read_dst(
        filename
        ):
    """
    Function reads files in Phylip dst-format.

    Parameters
    ----------
    filename : string
        Name of the file which should have the extension ``dst``.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.

    """

    try:
        f = open(filename)
    except:
        print("[!] Could not find the file {0}!".format(filename))

    taxa,matrix = [],[]
    
    
    for i,line in enumerate(f):
        if i > 0:
            taxa.append(line[0:10].strip())
            matrix.append([float(i) for i in
                re.split('\s+',line[11:].strip())])

    return taxa,matrix
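
A minimal usage sketch (hypothetical file contents; the Phylip dst layout pads taxon names to 10 characters, with the distances starting at column 11):

# matrix.dst:
#     3
# Tax_A      0.00 0.30 0.50
# Tax_B      0.30 0.00 0.40
# Tax_C      0.50 0.40 0.00
#
# taxa, matrix = read_dst('matrix.dst')
# taxa   -> ['Tax_A', 'Tax_B', 'Tax_C']
# matrix -> [[0.0, 0.3, 0.5], [0.3, 0.0, 0.4], [0.5, 0.4, 0.0]]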
Example #12
def tokenize_words_old(str):
    str = str.replace(u"־"," ")
    str = re.sub(r"</?[^>]+>","",str) #get rid of html tags
    str = re.sub(r"\([^\(\)]+\)","",str) #get rid of refs
    str = str.replace('"',"'")
    word_list = filter(bool,re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]",str))
    return word_list
Example #13
    def __get_codon_usage(self):
        '''Gets the codon usage table for a given taxonomy id.'''
        aa_to_codon_prob = {aa_code: {} for aa_code in AA_CODES.values()}

        url = 'http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=' \
            + self.__taxonomy_id + '&aa=1&style=GCG'

        in_codons = False

        for line in urllib2.urlopen(url):
            if line == '<PRE>\n':
                in_codons = True
            elif line == '</PRE>\n':
                break
            elif in_codons:
                values = re.split('\\s+', line)

                if values[0] in AA_CODES:
                    codon_prob = aa_to_codon_prob[AA_CODES[values[0]]]
                    codon_prob[values[1]] = float(values[3])

        aa_to_codon_prob.update((x, _scale(y))
                                for x, y in aa_to_codon_prob.items())

        return aa_to_codon_prob
Example #14
File: util.py Project: kunbud1989/spaCy
def get_lang_class(name):
    if name in LANGUAGES:
        return LANGUAGES[name]
    lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
    if lang not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[lang]
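
Illustrative behaviour (LANGUAGES is the module-level name-to-class mapping; the entries used below are hypothetical):

# Assuming LANGUAGES contains an 'en' entry:
# get_lang_class('en')             -> LANGUAGES['en']   (exact hit)
# get_lang_class('en_core_web_sm') -> LANGUAGES['en']   (re.split keeps the part before the first non-alphanumeric character)
# get_lang_class('xx_unknown')     -> RuntimeError('Language not supported: xx_unknown')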
Example #15
def fileiter(src, ext=None, rex=None):
    """Iterate over files starting at src.

    ext can be a string of space-separated extensions. Or it can be an
    empty string. The empty string only matches files with no extension.

    rex should be a regular expression object or pattern. Only files which
    produce a match will be returned.
    """
    if ext is not None and ext != '':
        ext = regex.split(r'[, ]+', ext)
    if rex is not None and type(rex) is str:
        rex = regex.compile(rex)
    extrex = regex.compile(r'.*\.(.*)')
    for dirpath, dirnames, filenames in os.walk(src):
        for infile in (os.path.join(dirpath, a) for a in filenames):
            if ext is not None:
                m = extrex.search(infile)
                if m is None:
                    if ext != '':
                        continue
                else:
                    if m[1] not in ext:
                        continue
            if rex is not None:
                if rex.search(infile) is None:
                    continue
            yield infile
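
A hypothetical call (the directory name and patterns are made up):

# Yield every .py or .txt file under src/ whose full path matches 'test':
# for path in fileiter('src', ext='py txt', rex=r'test'):
#     print(path)
#
# fileiter('src', ext='') would instead yield only files without an extension.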
Example #16
 def split(self, pattern, maxsplit=0, flags=0, out_lvs=[''], dupe=False):
     split_list = re.split(pattern, str(self), maxsplit, flags)
     out_lvs = [out_lv if out_lv else self.lv for out_lv in out_lvs]
     out_lvs += [out_lvs[-1]] * (len(split_list) - len(out_lvs))
     if dupe:
         split_list += [split_list[-1]] * (len(out_lvs) - len(split_list))
     return [Ex(split_string, lv) for split_string, lv in zip(split_list, out_lvs)]
Example #17
    def writeout(self, igraph, out):
        
        char = chr(int(igraph['code'], 16))
        if char not in self.existing or char in self.seen:
            return

        definition = igraph.get('kDefinition', '')
        definition = regex.sub(r' U\+\w+', '', definition)

        phon = set()
        mn = igraph.get('kMandarin', None)
        hu = igraph.get('kHanyuPinlu', None)
        hn = igraph.get('kHanyuPinyin', None)
        if hn:
            hn = regex.sub(r'\d+\.\d+:', '', hn)
        if hu:
            hu = regex.sub(r'\(\d+\)', '', hu)
        for p in [mn, hu, hn]:
            if p:
                phon.update(regex.split(r'[, ]+', p))
        phon = ",".join(sorted(phon))

        if not phon:
            return
        
        if not self.first:
            out.write(',\n')
        else:
            self.first = False
        out.write('\'{}\': {}'.format(char, [phon, definition]))
Example #18
    def Populate(self, filename):
        with open(filename) as f:
            for num, line in enumerate(f, 1):
                if num == 1:
                    self.Title = line
                    continue

                match = regex_acte.match(line)
                if match:
                    cur_acte = Acte(match.group("NB_Acte").strip())
                    self.Acts.append(cur_acte)

                else:
                    match = regex_scene.match(line)
                    if match:
                        cur_scene = Scene(match.group("NB_Scene").strip())
                        cur_acte.Scenes.append(cur_scene)

                    else:
                        match = regex_character.match(line)
                        if match:
                            info = [i.replace('\n', '')
                                    for i in re.split(", |\. ", line)]
                            personnage = [x for x in info if x.isupper()]
                            didascalie = [x for x in info if not x.isupper()]
                            self.Characters.update(personnage)
                            cur_replique = Replique(personnage, didascalie)
                            cur_scene.Repliques.append(cur_replique)
                        else:
                            if line and 'cur_replique' in locals():
                                cur_replique.text += line
Example #19
    def __init__(self, text, window, lang_en, def_counter):
        """
        :param text:
           Complete tale/story
        :param window:
            Story_UI window
        """
        QtCore.QObject.__init__(self)
        self.word_list = re.split('\s', text)
        self.window = window
        self.sentence_list = regex.split("(?V1)(?<=\.|:|;|-|,|\!)", text)
        self.sentence_list = self.join_short_sentences()
        self.keyword_list = []
        self.timing_list = []

        self.pool = ThreadPool(4)
        self.keyword_list = self.pool.map(derive_keyword, self.sentence_list)
        self.pool.close()
        self.pool.join()

        self.audio_service = AudioService(window)
        self.audio_thread = threading.Thread(target=self.audio_service.prepare_voice,
                                             args=(self.sentence_list, def_counter))
        self.audio_thread.setDaemon(True)
        self.audio_thread.start()
        self.image_thread = threading.Thread(target=image_from_keyword_list, args=(self.keyword_list, window, lang_en))
        self.image_thread.setDaemon(True)
        self.image_thread.start()
Example #20
File: util.py Project: aisbaa/reparse
def separate_string(string):
    """
    >>> separate_string("test <2>")
    (['test ', ''], ['2'])
    """
    string_list = regex.split(r'<(?![!=])', regex.sub(r'>', '<', string))
    return string_list[::2], string_list[1::2]  # Returns even and odd elements
Example #21
def read_list(filenames, listname, domlst, ip4lst, ip6lst, rxlst):
    for filename in filenames:
        lines = get_lines(filename, listname)

        if lines:
            count = 0
            for line in lines:
                count += 1
                entry = regex.split('\s*#\s*', line.replace('\r', '').replace('\n', ''))[0].strip() # Strip comments and line-feeds
                if entry:
                    if is_ip4.search(entry):
                        ip4lst[entry] = entry

                    elif is_ip6.search(entry):
                        ip6lst[entry] = entry.lower()

                    elif is_dom.search(entry):
                        domlst[entry.strip('.').lower() + '.'] = entry.lower()
                        #if tldextract.extract(entry)[2]:
                        #    domlst[entry.strip('.').lower() + '.'] = entry
                        #else:
                        #    log_err('LIST [#{0}]: Invalid TLD: \"{1}\"'.format(count, line))

                    else:
                        try:
                            rx = regex.compile(entry, regex.I) # To test/validate
                            #rxlst[rx] = entry
                            rxlst.add(entry)
                        except BaseException as err:
                            log_err('LIST [#{0}]: Invalid Syntax: \"{1}\"'.format(count, line))
        else:
            log_err('LIST: Empty file \"{0}\"'.format(filename))

    log_info('LIST-TOTALS [{0}]: {1} Domains, {2} IPv4-Addresses, {3} IPv6-Addresses and {4} Regexes'.format(listname, len(domlst), len(ip4lst), len(ip6lst), len(rxlst)))
    return domlst, ip4lst, ip6lst, rxlst
Example #22
def count_regex_in_all_db(pattern=u'(?:\(|\([^)]*? )שם(?:\)| [^(]*?\))', lang='he', text='all', example_num = 7):
    '''
    This method is for counting testing perepesis,
    :param lang:
    :param text:
    :return:
    '''

    found = []
    category_dict = defaultdict(int)
    shams_dict = defaultdict(list)

    vtitle = None
    ind_done = 0
    if text == 'all':
        indecies = library.all_index_records()
        inds_len = len(indecies)
    else:
        indecies = [library.get_index(text)]
    for iindex, index in enumerate(indecies):
        print "{}/{}".format(iindex, len(indecies))
        # if index == Index().load({'title': 'Divrei Negidim'}):
        #     continue
        if text == 'all':
            ind_done += 1
            print ind_done*1.0/inds_len
        try:
            unit_list_temp = index.nodes.traverse_to_list(lambda n, _: TextChunk(n.ref(), lang,
                                                                                 vtitle=vtitle).ja().flatten_to_array() if not n.children else [])
            st = ' '.join(unit_list_temp)
            shams = re.finditer(pattern, st)
            cat_key = u'/'.join(index.categories)
            num_shams = 0
            if shams:
                for s in shams:
                    num_shams += 1
                    curr_sham = s.group()
                    if len(re.split(ur'\s+', curr_sham)) > 6:
                        continue
                    shams_dict[cat_key] += [s.group()]

                # print '{} : {}'.format(index, len(shams))
            found.append((index, num_shams))

            category_dict[cat_key] += num_shams

        except:  # sefaria.system.exceptions.NoVersionFoundError:
            pass
Example #23
 def _add_namespace_to_xpath(self, xpath, ns="{http://spfeopentoolkit.org/ns/spfe-ot/config}"):
     result = []
     for x in re.split(r"([/\[]+)", xpath):
         try:
             result.append(ns + x if re.match("[_\w]", x[0]) else x)
         except IndexError:
             pass
     return "".join(result)
Example #24
def main(all_articles, freq_out):
    tokens = []

    for l in all_articles:
        # Skip <doc> tags
        if not regex.match(ur'</?doc', l):
            l_tokens = regex.split(ur'[^\p{L}]+', l.lower())
            tokens += [token for token in l_tokens if token and token not in STOPWORDS]
Example #25
File: tokenizer.py Project: pnugues/ilppp
def tokenize(text):
    """uses the nonletters to break the text into words
    returns a list of words"""
    # words = re.split('[\s\-,;:!?.’\'«»()–...&‘’“”*—]+', text)
    # words = re.split('[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.split('\W+', text)
    words = re.split('\P{L}+', text)
    words.remove('')
    return words
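
A quick sketch of the behaviour (\P{L} requires the third-party regex module; with the standard library re the pattern is an error, so re here is presumably an alias for regex):

# tokenize("Hello, world!")      -> ['Hello', 'world']
# tokenize("L'élève a 2 chats.") -> ['L', 'élève', 'a', 'chats']
# Note: words.remove('') drops exactly one empty string, so a text starting with
# a non-letter leaves a stray '' at the end, and a text that begins and ends
# with letters would raise ValueError.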
Example #26
File: transcrib.py Project: QuLogic/ocropy
    def add_page(self, im, segmentation=None, records=None):
        """
        Adds an image to the transcription interface, optionally filling in
        information from a list of ocr_record objects.

        Args:
            im (PIL.Image): Input image
            records (list): A list of ocr_record objects.
        """
        page = {}
        fd = BytesIO()
        im.save(fd, format='png', optimize=True)
        page['index'] = self.page_idx
        self.page_idx += 1
        page['img'] = 'data:image/png;base64,' + base64.b64encode(fd.getvalue()).decode('ascii')
        page['lines'] = []
        if records:
            for record in records:
                splits = regex.split(u'(\s+)', record.prediction)
                bbox = max_bbox(record.cuts)
                line_offset = 0
                segments = []
                for segment, whitespace in zip_longest(splits[0::2], splits[1::2]):
                    if len(segment):
                        seg_bbox = max_bbox(record.cuts[line_offset:line_offset + len(segment)])
                        segments.append({'bbox': '{}, {}, {}, {}'.format(*seg_bbox), 'text': segment, 'index': self.seg_idx})
                        self.seg_idx += 1
                        line_offset += len(segment)
                    if whitespace:
                        line_offset += len(whitespace)
                page['lines'].append({'index': self.line_idx, 'recognition': segments,
                                      'left': 100*int(bbox[0]) / im.size[0],
                                      'top': 100*int(bbox[1]) / im.size[1],
                                      'width': 100*(bbox[2] - bbox[0])/im.size[0],
                                      'height': 100*(int(bbox[3]) - int(bbox[1]))/im.size[1],
                                      'bbox': '{}, {}, {}, {}'.format(int(bbox[0]),
                                                                      int(bbox[1]),
                                                                      int(bbox[2]),
                                                                      int(bbox[3]))})

                self.line_idx += 1
        elif segmentation:
            for bbox in segmentation:
                page['lines'].append({'index': self.line_idx, 
                                      'left': 100*int(bbox[0]) / im.size[0],
                                      'top': 100*int(bbox[1]) / im.size[1],
                                      'width': 100*(bbox[2] - bbox[0])/im.size[0],
                                      'height': 100*(int(bbox[3]) - int(bbox[1]))/im.size[1],
                                      'bbox': '{}, {}, {}, {}'.format(int(bbox[0]),
                                                                      int(bbox[1]),
                                                                      int(bbox[2]),
                                                                      int(bbox[3]))})
                self.line_idx += 1
        else:
            raise KrakenInputException('Neither segmentations nor records given')
        self.pages.append(page)
Example #27
def separate_components(src_string, data_element_separator='+', component_data_element_separator=':', segment_terminator='\'', release_character='?'):
    """Separate the components in an EDIFACT segment string."""
    output = []

    if src_string[-1] == '\'':
        src_string = src_string[0:-1]

    simple_separator_pattern = r'(?<!\{rc})\{des}'.format(des=data_element_separator, rc=release_character)
    simple_data_elements = regex.split(simple_separator_pattern, src_string)

    component_separator_pattern = r'(?<!\{rc})\{cdes}'.format(cdes=component_data_element_separator, rc=release_character)
    for simple_data_element in simple_data_elements:
        components = regex.split(component_separator_pattern, simple_data_element)
        if len(components) == 1:
            output.append(simple_data_element)
        else:
            output.append(components)

    return output
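
Example segments (hypothetical EDIFACT data):

# separate_components("DTM+137:202101011200:203'")
#   -> ['DTM', ['137', '202101011200', '203']]
# separate_components("NAD+BY+5412345000176::9'")
#   -> ['NAD', 'BY', ['5412345000176', '', '9']]
# A '+' or ':' preceded by the release character '?' is not treated as a separator.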
Example #28
    def _are_all_words_units(self, date_string):
        skip = [_UNITS,
                r'ago|\d+',
                r':|[ap]m']

        date_string = re.sub(r'\s+', ' ', date_string.strip())

        words = filter(lambda x: x if x else False, re.split(r'\W', date_string))
        words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
        return not list(words)
Example #29
def mb_base_tokenizer(str):
    punc_pat = re.compile(ur"(\.|,|:|;)$")

    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(ur"''", ur'"', str)  # looks like double apostrophe in shulchan arukh is meant to be a quote
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [re.sub(punc_pat, u"", w).strip() for w in word_list if len(
        re.sub(punc_pat, u"", w).strip()) > 0]  # remove empty strings and punctuation at the end of a word
    return word_list
Example #30
def get_words(text, by_spaces):
    """
    Helper function which splits the given text string into words. If by_spaces is false, then text like
    '01-02-2014' will be split into 3 separate words. For backwards compatibility, this is the default for all
    expression functions.
    :param text: the text to split
    :param by_spaces: whether words should be split only by spaces or by punctuation like '-', '.' etc
    """
    rexp = r'\s+' if by_spaces else r'\W+'
    splits = regex.split(rexp, text, flags=regex.MULTILINE | regex.UNICODE | regex.V0)
    return [split for split in splits if split]   # return only non-empty
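
The behaviour described in the docstring, spelled out:

# get_words('01-02-2014 report', by_spaces=False) -> ['01', '02', '2014', 'report']
# get_words('01-02-2014 report', by_spaces=True)  -> ['01-02-2014', 'report']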
Example #31
    def _process_text_line(self, text):            

        split_text = [token for token in new_regex.split(self.tokenisation_pattern, text) \
                            if token != '']
        if self.replace_whitespace:
            new_text = []
            for token in split_text:
                if token.isspace():
                    new_text.append(self.replace_whitespace)                        
                else:
                    new_text.append(token)  
            split_text = new_text
        
        split_text = [token.strip(u' ') for token in split_text]  ## prevent multiple spaces
        split_text = [token for token in split_text if token != u'']  ## prevent multiple spaces
        split_text = [token.lower() for token in split_text]     ## lowercase
        text = ' '.join(split_text) 
        return text
        
        
Example #32
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_texts, q_tokens, answer = inputs
    examples = []
    for i, doc_text in enumerate(doc_texts):
        for j, paragraph in enumerate(re.split(r'\n+', doc_text)):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples]
Example #33
def base_tokenizer(base_str):
    base_str = base_str.strip()
    base_str = bleach.clean(base_str, tags=[], strip=True)
    for match in re.finditer(r'\(.*?\)', base_str):
        if library.get_titles_in_string(
                match.group()) and len(match.group().split()) <= 5:
            base_str = base_str.replace(match.group(), "")
            # base_str = re.sub(ur"(?:\(.*?\)|<.*?>)", u"", base_str)
    base_str = re.sub(r'־', ' ', base_str)
    base_str = re.sub(r'[A-Za-z]', '', base_str)
    for phrase in stop_phrases:
        base_str = base_str.replace(phrase, '')
    word_list = re.split(r"\s+", base_str)
    word_list = [
        re.sub(r'\P{L}', '', re.sub(r'((?<!^)\u05D9)', '',
                                    re.sub(r'ו', '', w))) for w in word_list
        if w not in stop_words
    ]  #remove non-leading yuds and all vuvs
    word_list = [w for w in word_list if len(w.strip()) > 0]
    return word_list
Example #34
def read_twix_hdr(fid):
    # function to read raw data header information from siemens MRI scanners
    # (currently VB and VD software versions are supported and tested)
    nbuffers = np.fromfile(fid, dtype=np.uint32, count=1)[0]

    rstraj = []

    prot = {}
    for b in range(nbuffers):
        # now read string up to null termination
        bufname = fid.read(10).decode(errors='ignore')
        bufname = re.findall('^\w*', bufname)
        bufname = bufname[0]
        fid.seek(fid.tell() + len(bufname) - 9, 0)
        buflen = np.fromfile(fid, dtype=np.uint32, count=1)[0]
        buffer = fid.read(buflen).decode(errors='ignore')
        buffer = ''.join(re.split('\n\s*\n', buffer))
        prot[bufname] = parse_buffer(buffer)

    return prot, rstraj
Example #35
    def identify_desc_lead_comment(self, comment_block):

        comment_block = comment_block.replace("*", "").strip()
        comment_block = comment_block.replace("/", "")

        # check if comment block can be discarded
        first_word = comment_block.split(' ', 1)[0]
        if first_word.isupper() and len(first_word) > 1:
            return False
        if first_word.lower() in self.personal_pronouns:
            return False
        if first_word.lower() == '(non-javadoc)':
            return False

        # divide block into phrases
        phrases = regex.split('[;:.]', comment_block)

        # filter phrases that are nondescriptive
        all_phrases = []
        for phrase in phrases:
            phrase = phrase.lower().strip()
            phrase = self.remove_punctuations(phrase)

            if phrase.split(' ', 1)[0] == "if":
                continue

            if any(x in phrase for x in self.context_verbs):
                continue

            if any(x in phrase for x in self.java_keyverbs):
                continue

            phrase = phrase.split()

            if len(phrase) < 3:
                continue

            all_phrases.append(phrase)

        if len(all_phrases) == 0: return False
        return all_phrases
Example #36
async def _(session: NLPSession):
    stripped_arg = session.msg_text.strip()

    # split the message into two parts: (time | event)
    time, target = re.split(r"(?:提醒)|(?:通知)|(?:叫)|(?:告诉)",
                            stripped_arg,
                            maxsplit=1)

    # parse the time expression
    tn = TimeNormalizer()
    time_json = tn.parse(time)

    if time_json["type"] == "error":
        return
    # convert the time delta into an absolute time
    elif time_json["type"] == "timedelta":
        time_diff = time_json["timedelta"]
        time_diff = timedelta(
            days=time_diff["day"],
            hours=time_diff["hour"],
            minutes=time_diff["minute"],
            seconds=time_diff["second"],
        )
        time_target = datetime.now() + time_diff
    elif time_json["type"] == "timestamp":
        time_target = datetime.strptime(time_json["timestamp"],
                                        "%Y-%m-%d %H:%M:%S")
        # default the time of day to 12:00 noon
        if (not re.search(r"[\d+一二两三四五六七八九十]+点", time)
                and time_target.hour == 0 and time_target.minute == 0
                and time_target.second == 0):
            time_target = time_target.replace(hour=12)

    return IntentCommand(
        90.0,
        "_alarm",
        args={
            "time": time_target,  # type: ignore
            "target": target.lstrip("我"),
        },
    )
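
The core of this handler is the keyword split; in isolation (example message made up):

# re.split(r"(?:提醒)|(?:通知)|(?:叫)|(?:告诉)", "明天八点提醒我开会", maxsplit=1)
#   -> ['明天八点', '我开会']   # time part, target part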
Example #37
def segment_column(segmentfile, reffile, massekhet, wikitext=False):
    final_list = []
    i = 0
    with open(segmentfile, 'r') as csvfile:
        seg_reader = csv.DictReader(csvfile)
        with open(reffile, 'r') as csvfile:
            ref_reader = csv.DictReader(csvfile)
            for segrow, refrow in zip(seg_reader, ref_reader):
                i += 1
                if not wikitext:
                    daf, daf_line = segrow[u'Daf'], segrow[u'Line']
                else:
                    split = re.split(u'[\s:]', segrow[u'full line'])
                    daf, daf_line = split[1], split[2]
                smg = convert_smg(refrow[u'Semag'])
                letter_dict = {u'Segment': u'{}.{}.{}'.format(massekhet, daf, daf_line),
                          u'Rambam': refrow[u'Rambam'],
                          u'Semag': smg,
                          u'Tur Shulchan Arukh': refrow[u'Tur Shulchan Arukh']}
                final_list.append(letter_dict)
    return final_list
Example #38
 def _record(self, context):
     source, match = context
     try:
         line = source.next_line
     except EOFError:
         return "END", context
     indent = len(line) - len(line.lstrip())
     if self.patterns['blank-line'].match(line):
         return "SAM", context
     elif indent < self.doc.current_block.indent:
         source.return_line()
         return "SAM", context
     else:
         field_values = [x.strip() for x in re.split(r'(?<!\\),', line)]
         if len(field_values) != len(self.doc.fields):
             raise SAMParserError(
                 "Record length does not match record set header. At:\n\n "
                 + line)
         record = list(zip(self.doc.fields, field_values))
         self.doc.new_record(record)
         return "RECORD", context
Example #39
    def write_txt(self, *val): # write the current settings back to the text file

        if len(val) != len(self.current):
            return

        with open(user_settings_path, "r") as f:
            lines = f.read()

        new_lines = []
        for line in lines.splitlines(False):
            separated = re.split(':=', line)

            for idx in range(len(val)):

                if separated[0] == self.current[idx][0]:
                    separated[1] = val[idx]
                    break
            new_lines.append(':='.join(map(str, separated)))

        with open(user_settings_path, "w") as f:
            f.write('\n'.join(new_lines))
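
The settings file is expected to hold one "key:=value" pair per line; a hypothetical example, assuming self.current is [('volume', ...), ('theme', ...)] in that order:

# user_settings (before):       after self.write_txt(50, 'dark'):
#   volume:=80                    volume:=50
#   theme:=light                  theme:=dark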
Example #40
def snake_case(s):
    """
    convert token(s) to snake case
    >>> snake_case('fooBar')
    'foo_bar'
    >>> snake_case('foo_bar')
    'foo_bar'
    >>> snake_case('foo-bar')
    'foo_bar'
    >>> snake_case('FooBar')
    'foo_bar'
    >>> snake_case('Foo-Bar')
    'foo_bar'
    >>> snake_case('foo bar')
    'foo_bar'
    """
    s = ensure_str(s)
    # insert a separator before each uppercase letter (words are lowercased below)
    s = re.sub(r"[A-Z]", r"-\g<0>", s, flags=re.UNICODE)
    words = compact(re.split(r"\W+", s, flags=re.UNICODE))
    return "_".join([word.lower() for word in words])
Example #41
 def searchQuery(self):
     for prev_res in self.quartoContainer.winfo_children():
         if isinstance(prev_res, Button):
             prev_res.destroy()
     query = self.query_string.get()
     query = re.sub(r'\p{punct}', r'', query)
     query = list(filter(lambda x: len(x) > 0,
                         re.split(r'\s+',
                                  query)))  #remove empty query terms
     if len(query) == 0:
         self.msg_search["text"] = "Bad query for the search. Try again!"
     else:
         self.results = rsspider.procRequest(query)
         self.msg_search["text"] = "Search done. Look for your results!"
         for result in self.results:
             resfield = Button(self.quartoContainer,
                               text=result,
                               font=("Calibri", "12"),
                               command=lambda result=result: self.
                               openDocument(rsspider.directory + result))
             resfield.pack()
Example #42
    def dou_parse(self, source: ExtractResult) -> ParseResult:
        result = ParseResult(source)

        source_text = self.replace_unit(source.text)

        if (regex.search(self.config.double_and_round_regex, source.text)) is not None:
            power = self.config.round_number_map_char[source_text[-1:]]
            result.value = self.get_digit_value(source_text[:-1], power)
        else:
            split_result = regex.split(self.config.point_regex, source_text)
            if split_result[0] == '':
                split_result[0] = self.config.zero_char
            if regex.search(self.config.negative_number_sign_regex, split_result[0]) is not None:
                result.value = self.get_int_value(
                    split_result[0]) - self.get_point_value(split_result[1])
            else:
                result.value = self.get_int_value(
                    split_result[0]) + self.get_point_value(split_result[1])

        result.resolution_str = self.__format(result.value)
        return result
Example #43
def scan_yt_description(description):
    tracklist = find_multi_line_tracklist(description)
    if tracklist:
        return tracklist

    try:
        _, tracklist_text = regex.split(contains_tracklist_regex,
                                        description,
                                        flags=regex.IGNORECASE)
        print("contains track list")
        tracklist = match(
            tracklist_text,
            comment_regexps["single_track_per_line_description_only"],
            regex.MULTILINE)
    except ValueError:
        print("No Tracklist in description")
        pass
    if tracklist:
        return tracklist

    return None
Example #44
File: my_df.py Project: CraigDawson/my_df
    def main():
        """ print the 'df' command the way I like it """
        if sys.platform == 'darwin':
            ln = 'gdf --total -BG -t apfs -t ntfs -t msdos -l --output=size,used,avail,pcent,fstype,target'
        else:
            ln = 'df --total -BG -t ext4 -t xfs -t ntfs -t msdos -l --output=size,used,avail,pcent,fstype,target'

        titles = ['Blocks', 'Used', 'Avail', 'Use%', 'Type', 'Mounted']
        for title in titles:
            print(title.ljust(10), end='')
        print()
        print('-' * 60)
        # print('123456789|'*7)

        for index, line in enumerate(spawn(ln)):
            if index == 0:
                continue
            split = regex.split(r' *', line.lstrip())
            for item in split:
                print('{}'.format(item.rstrip().ljust(10)), end='')
            print()
Example #45
 def cut(line):
     dest_words = []
     line = line.lower()
     line = unicodedata.normalize("NFKC", line)
     src_clauses = re.split("[\p{P}\s+\-~─⋯]+", line)
     segments = flatten([
         filter(lambda w: w,
                [match.group() for match in hanzi_others.finditer(clause)])
         for clause in src_clauses
     ])
     for segment in segments:
         #src_clause = "".join(Sentence.from_line(src_clause, remove_punct=True, form=args.form))
         if not hanzis.match(
                 segment
         ) and segment not in lexicon:  # don't split non-hanzi oov
             maybe_words = [[segment]]
         else:
             maybe_words = dict_seg(segment, dict_lexicon)
         dest_words += maybe_words[0]
     dest_sent = " ".join(dest_words)
     return dest_sent
Example #46
def text_stirr(input_string: str) -> str:
    """Stirrs every word in a given text.

    Parameters
    ----------
    input_string : str
        Text for processing (may contain line breaks and unicode symbols)

    Returns
    -------
    str
        Same text, but with stirred words.

    """
    text_list: List = []
    while input_string:
        split = regex.split(r"([^\p{L}]+)", input_string, maxsplit=1)
        text_list.append(_process_word(split[0]))
        text_list.append(split[1])
        input_string = split[2]
    return "".join(text_list)
Example #47
def add_default_prefix(prefix, selector_text):
    """
    Adds prefix to all unprefixed type selector tokens (tag names)
    in selector_text. Returns prefixed selector.
    """
    # Note: regex here are valid thanks to cssutils's normalization
    # of selectors text (e.g. spaces around combinators are always added,
    # sequences of whitespace characters are always reduced to one U+0020).
    selector_ns = ''
    # https://www.w3.org/TR/css-syntax-3/#input-preprocessing
    # states that \r, \f and \r\n  must be replaced by \n
    # before tokenization.
    for token in re.split(
            r'(?<!\\(?:[a-fA-F0-9]{1,6})?)([ \n\t]:not\(|[ \n\t])',
            selector_text):
        if (re.match(r'-?(?:[A-Za-z_]|\\[^\n]|[^\u0000-\u007F])', token)
                and not re.search(r'(?<!\\)\|', token)):
            selector_ns += '{}|{}'.format(prefix, token)
        else:
            selector_ns += token
    return selector_ns
Example #48
def answer_subsentence_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings[
            'answer_subsentence_similarity_modifier_value'] is None or len(
                answer
            ) < score_settings[
                'answer_subsentence_similarity_sentence_len'] or valid_emoticon:
        return 0

    # Split response into subsentences
    answer = list(
        filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for subsentence2 in answer[num + 1:]:
            max_ratio = max(max_ratio,
                            Levenshtein.ratio(subsentence, subsentence2))

    # Not similar
    if max_ratio < score_settings['answer_subsentence_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['answer_subsentence_similarity_modifier'] == 'value':
        return score_settings['answer_subsentence_similarity_modifier_value']

    # Apply multiplier
    if score_settings[
            'answer_subsentence_similarity_modifier'] == 'multiplier':
        return (
            max_ratio -
            score_settings['answer_subsentence_similarity_threshold']) / (
                1 - score_settings['answer_subsentence_similarity_threshold']
            ) * score_settings['answer_subsentence_similarity_modifier_value']

    return 0
Example #49
async def sed(c: Client, m: Message):
    exp = regex.split(r"(?<![^\\]\\)/", m.text)
    pattern = exp[1]
    replace_with = exp[2].replace(r"\/", "/")
    flags = exp[3] if len(exp) > 3 else ""

    count = 1
    rflags = 0

    if "g" in flags:
        count = 0
    if "i" in flags and "s" in flags:
        rflags = regex.I | regex.S
    elif "i" in flags:
        rflags = regex.I
    elif "s" in flags:
        rflags = regex.S

    text = m.reply_to_message.text or m.reply_to_message.caption

    if not text:
        return

    try:
        res = regex.sub(pattern,
                        replace_with,
                        text,
                        count=count,
                        flags=rflags,
                        timeout=1)
    except TimeoutError:
        await m.reply_text(await tld(m.chat.id, "regex_timeout"))
    except regex.error as e:
        await m.reply_text(str(e))
    else:
        await c.send_message(
            m.chat.id,
            f"{html.escape(res)}",
            reply_to_message_id=m.reply_to_message.message_id,
        )
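
The expression parsing in isolation:

# regex.split(r"(?<![^\\]\\)/", "s/colour/color/gi")
#   -> ['s', 'colour', 'color', 'gi']
# so exp[1] is the pattern, exp[2] the replacement and exp[3] the optional flags;
# a slash escaped as \/ is not treated as a delimiter.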
Example #50
def request_nobey(url='https://raw.githubusercontent.com/NoBey/Shadowsocks-free/master/README.md'):
    def strip_dot(x):
        return
    print('req nobey...')
    servers = list()
    try:
        data = re.split('##+|---+', requests.get(url).text)[2:5:2]
        info = {'message': '', 'name': 'NoBey', 'url': 'https://github.com/NoBey/Shadowsocks-free'}

        for i, server in enumerate(data):
            server = server.split('\n')

            name = server[0].strip()
            (
                ips,
                ports,
                _,
                method,
                password) = list(map(
                    lambda server: list(map(
                        lambda x: x.strip().strip('`').strip(),
                        server.strip('-').strip().split()[1:])),
                    server[1:6]))
            method = method[0]
            password = password[0]

            for j, ip in enumerate(ips):
                for k, port in enumerate(ports):
                    servers.append(dict())
                    servers[-1]['remarks'] = 'NoBey {}-{}-{}'.format(name, j, k)
                    (
                        servers[-1]['server'],
                        servers[-1]['password'],
                        servers[-1]['server_port'],
                        servers[-1]['method']) = (ip, password, port, method)

    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
Example #51
    def wptexturize(self, text):
        # Transform into regexp sub-expression used in _wptexturize_pushpop_element
        # Must do this every time in case plugins use these filters in a context sensitive manner
        no_texturize_tags = '(' + '|'.join(
            self.default_no_texturize_tags) + ')'
        no_texturize_shortcodes = '(' + '|'.join(
            self.default_no_texturize_shortcodes) + ')'

        no_texturize_tags_stack = []
        no_texturize_shortcodes_stack = []

        # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's greediness), I modified the regular expression accordingly
        textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

        result = []
        for curl in textarr:
            if len(curl) == 0:
                continue

            # Only call _wptexturize_pushpop_element if first char is correct tag opening
            first = curl[0]
            if '<' == first:
                self.__wptexturize_pushpop_element(curl,
                                                   no_texturize_tags_stack,
                                                   no_texturize_tags, '<', '>')
            elif '[' == first:
                self.__wptexturize_pushpop_element(
                    curl, no_texturize_shortcodes_stack,
                    no_texturize_shortcodes, '[', ']')
            elif len(no_texturize_shortcodes_stack) == 0 and len(
                    no_texturize_tags_stack) == 0:
                # This is not a tag, nor is the texturization disabled static strings
                for search, replacement in self.static:
                    curl = curl.replace(search, replacement)
                # regular expressions
                for search, replacement in self.dynamic:
                    curl = regex.sub(search, replacement, curl)
            curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
            result.append(curl)
        return ''.join(result)
Example #52
def titlecase(text: Union[str, pd._libs.missing.NAType],
              articles: Set[str] = {},
              abbrev: Set[str] = {}) -> Optional[str]:
    """
    Returns a title-cased version of the given *text*. Words contained in
    *articles* are kept lowercase (unless they start the string), and words
    contained in *abbrev* are kept fully uppercase.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbrev = {'USA', 'DC'}

    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'

    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'

    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'

    >>> titlecase("washington DC, usa", articles, abbrev)
    'Washington DC, USA'
    """
    if not isinstance(text, str):
        return

    words = enumerate(regex.split(r'\b', text, flags=regex.V1))

    def changecase(index, word):
        casefold = word.casefold()
        upper = word.upper()

        if upper in abbrev:
            return upper
        elif casefold in articles and index != 1:
            return word.lower()
        else:
            return word.title()

    return ''.join(changecase(i, w) for i, w in words)
Example #53
def reconstruct_with_max_seq(doc, max_seq, tokenizer):
    ret = []
    to_add = []
    len_to_add = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue

        len_split = len(tokenizer.tokenize(split))
        if len(to_add) > 0 and len_to_add + len_split > max_seq:
            ret.append(' '.join(to_add))
            to_add = []
            len_to_add = 0

        to_add.append(split)
        len_to_add += len_split

    if len(to_add) > 0:
        ret.append(' '.join(to_add))

    return ret
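
A toy check with a whitespace "tokenizer" stand-in (hypothetical; the real caller presumably passes a subword tokenizer):

class _WhitespaceTok:
    def tokenize(self, text):
        return text.split()

doc = "one two three\n\nfour five\n\nsix seven eight nine"
print(reconstruct_with_max_seq(doc, 5, _WhitespaceTok()))
# -> ['one two three four five', 'six seven eight nine']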
Example #54
def ngroupChunkerCount(essaySet):
    """
    Count the ngroups in the given essay set (to avoid counting a single NN as an ngroup, we keep only groups with len(ngroup) >= 3).
    :param essaySet: a list of essays.
    :return: a list of numbers, representing the number of long ngroup used in each essay.
    """
    grammar = r"""
              NP:  {<DT>*(<NN.*>|<JJ.*>)*<NN.*>}    # Chunk sequences of DT, JJ, NN
              PP: {<IN><NP>} # Chunk prepositions followed by NP
              VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
              """
    cp = nltk.RegexpParser(grammar)

    ngroupCount = []

    for essay in essaySet:
        try:
            essay = essay.lower()
            sentences = filter(
                None,
                regex.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
                            essay))

            count = 0
            for s in sentences:
                s = s.decode('utf-8', 'ignore')
                s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
                tree = cp.parse(filter(None, nltk.pos_tag(s.split())))
                for subtree in tree.subtrees(filter=filt):
                    if len(subtree) >= 3:
                        count += 1

            ngroupCount.append(count)

        except Exception:
            print "Cannot write word_list into file due to the exception:", sys.exc_info(
            )[0]

    return ngroupCount
Example #55
def find_change_flow(vba_func_dict, DG):
    """Finds alternative macros call flow that is utilized by malicious macros:
    A _Change event is created for an object, and then the object text is changed using code.
    This creates a dummy call flow without explicitly calling a function.

    Args:
        vba_func_dict (dict[func_name]=func_code): Functions dictionary
        DG (networkx.DiGraph): Generated directed graph

    Returns:
        networkx.DiGraph: Directed Graph with highlighted Change triggers
    """
    # Find all the objects that have a _Change event
    # like TextBox1_Change
    changed_objects = []
    for func_name in vba_func_dict:
        if "_Change" in func_name:
            changed_object = func_name.replace("_Change", "")
            changed_objects.append(changed_object)

    # Find pieces of code that assign to an object, which would
    # cause a _Change event Trigger
    for func_name in vba_func_dict:
        func_code = vba_func_dict[func_name]
        # split function code into lines
        func_code_lines = [_f for _f in re.split("\n", func_code) if _f]
        for func_line in func_code_lines:
            for changed_object in changed_objects:
                # look for .[changed_object] pattern, followed by "="
                found_loc = func_line.find("." + changed_object)
                if found_loc > -1:
                    if func_line.find("=", found_loc) > -1:
                        # we found object with Change event that was assigned a value

                        # show this connection as a function call
                        DG.add_edge(
                            func_name, changed_object + "_Change", label="Triggers", fontcolor=color_scheme["COLOR_TRIGGERED_CALL_EDGE"]
                        )
    return DG
Example #56
 def _split_doc(self, doc):
     """Given a doc, split it into chunks (by paragraph)."""
     GROUP_LENGTH = 0
     docs = []
     curr = []
     curr_len = 0
     for split in regex.split(r'\n+', doc):
         split = split.strip()
         if len(split) == 0:
             continue
         # Maybe group paragraphs together until we hit a length limit
         if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
             # yield ' '.join(curr)
             docs.append(' '.join(curr))
             curr = []
             curr_len = 0
         curr.append(split)
         curr_len += len(split)
     if len(curr) > 0:
         # yield ' '.join(curr)
         docs.append(' '.join(curr))
     return docs
Example #57
    def _larkToSynExc(self, e):
        '''
        Convert lark exception to synapse BadSyntax exception
        '''
        mesg = regex.split('[\n!]', str(e))[0]
        at = len(self.text)
        if isinstance(e, lark.exceptions.UnexpectedCharacters):
            expected = sorted(terminalEnglishMap[t] for t in e.allowed)
            mesg += f'.  Expecting one of: {", ".join(expected)}'
            at = e.pos_in_stream
        elif isinstance(e, lark.exceptions.UnexpectedEOF):
            expected = sorted(terminalEnglishMap[t] for t in set(e.expected))
            mesg += ' ' + ', '.join(expected)
        elif isinstance(e, lark.exceptions.VisitError):
            # Lark unhelpfully wraps an exception raised from AstConverter in a VisitError.  Unwrap it.
            origexc = e.orig_exc
            if not isinstance(origexc, s_exc.SynErr):
                raise  # pragma: no cover
            origexc.errinfo['text'] = self.text
            return s_exc.BadSyntax(**origexc.errinfo)

        return s_exc.BadSyntax(at=at, text=self.text, mesg=mesg)
Example #58
def run():
    totsum = 0
    for t in lines[1:]:
        vals = re.split(r'\s', t)
        
        if len(vals) > 1:
            edges[tuple(vals[:2])] = vals[2]
            totsum += int(vals[2])
    
    print(totsum)
    print(totsum/2)
    
    cut = dict()
    for i in range(int(v_count)):
        if flip(0.5):
            cut[str(i)] = i

    maxcut = 0
    for t in edges:
        if (t[0] in cut and t[1] not in cut) or (t[0] not in cut and t[1] in cut):
            maxcut += int(edges[(t[0], t[1])])
    return maxcut
Example #59
def get_stats(code=None):
	'''
		Retrieves some standard statistics for
		a given piece of Solidity code.
	'''	
	is_request = False
	if not code:
		code = request.form.get('data')
		is_request = True

	clean = re.split(r'\n|//.*|/\*[\s\S]*?\*/',code)
	lines = [x for x in clean if x and x.strip() != ""]

	line_count   = len(lines)
	dependencies = len([x for x in lines if "import" in x])
	complexity   = len([x for x in lines if re.search(r'\(',x)])

	output = {"LOC":line_count, "Dependencies":dependencies, "Cyclomatic_Complexity":complexity}

	if is_request:
		return jsonify(output)
	return output
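
A direct call (passing code bypasses the Flask request branch; the sample contract is made up):

# sample = "import './SafeMath.sol';\ncontract A {\n  function f() public {}\n}"
# get_stats(sample)
#   -> {'LOC': 4, 'Dependencies': 1, 'Cyclomatic_Complexity': 1}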
Example #60
    def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        found_parenth = None
        origpost_used = origpost_red

        # log all location elements
        only_add_if_value = True
        split_post = regex.split('u\.|und|,', origpost_used)
        for entry in split_post:
            entry_stripped = entry.strip("., ")

            # find additional info in  each line and subtract it
            # find last parenthesis and filter
            #match_parenth = regex.findall(r"(\(.*?\))", entry_stripped)
            #combined_ps = []
            #for res in match_parenth:
                #combined_ps.append(res.strip())
                #origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
                # log additional info in last parenthesis

            #self.ef.add_to_my_obj("additional_info", combined_ps, object_number=element_counter,
            #                          only_filled = only_add_if_value)

            #if entry_stripped is None or entry_stripped == "":
                #if match_parenth:
                #    element_counter += 1
            entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
            if entry_stripped == None or entry_stripped == "":
                continue
            self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter, only_filled= only_add_if_value)
            element_counter += 1

        return True