def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64,
                          68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix / 4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix / 4):]) + '.ip6.arpa.'

    return revip
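# Usage sketch (assumed, not part of the original source; relies on the project's
# expand_ip/is_ip4/is_ip6 helpers behaving as their names suggest):
#
#   rev_ip('192.168.1.0/24')   # -> '1.168.192.in-addr.arpa.'
#   rev_ip('10.0.0.1')         # -> '1.0.0.10.in-addr.arpa.' (prefix defaults to /32)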
def getNames(evalLines, locs1, locs2, x):
    """
    Calls appropriate helper functions to accurately parse name information
    """
    locs = locs1 + locs2
    locs = sorted(locs)
    entries = []
    keys = ['loc', 'name', 'nameFlag', 'rank', 'email', 'tel', 'fax', 'title']
    for i in locs:
        D = {key: None for key in keys}
        m = None
        D['loc'] = i
        if i in locs1:
            m = regex.split(r'•\s*(.*?):\s*', x[i])  # split line with rank
            D['rank'] = m[1].strip()
        elif i in locs2:
            m = regex.split(r'•\s*(.*?):\s*', ' '.join([x[i-1].strip(), x[i].strip()]),
                            flags=regex.UNICODE)
            D['rank'] = m[1].strip()
        if len(m[2].strip()) == 0:
            # if there is only a rank,
            # assume name is on next line potentially with some other info
            evalLines.append(i + 1)
            emtefa(x[i+1], D, x[i+2])
        else:
            # name is on the same line
            emtefa(m[2].strip(), D, x[i+1])
        entries.append(D)
    return (evalLines, entries)
def main():
    lang = "english"
    datapkg = "corpus"
    book = "eng/myBigErrorsList.txt"
    data = WP.readBook(WP, datapkg, book)
    # words = regex.split("(\n+)", data.lower())
    words = regex.split("(\n+)", data)
    ng = NG(lang)
    cletter, n = "", 0
    for word in words:
        if "\n" in word:
            cletter += str('\n')
        else:
            for w in regex.split("\W+", word):
                if len(w):
                    n += 1
                    print("correct(%r) => %r" % (w, ng.correct(w.lower())))
                    cletter += str(ng.correct(w) + str(" "))
    print("######## Original Txt ########")
    print(data)
    print("######## Txt After Correction ########")
    print(cletter)
    print("################")
def parse_semag(self, str, mass):
    # split = re.split('\s', str.strip())
    reg_book = re.compile(u'ו?(עשין|שם|לאוין)')
    split = re.split(reg_book, str.strip())
    # str_list = filter(None, split)
    str_list = filter(None, [item.strip() for item in split])
    resolveds = []
    # it = iter(str_list)
    derabanan_flag = False
    book = None
    for i, word in enumerate(str_list):
        if derabanan_flag:
            derabanan_flag = False
            resolved = self._tracker.resolve(book, [1])
            resolveds.append(resolved)
            continue
        elif re.search(reg_book, word):
            # book = word
            # if book == u'שם':
            #     book = None
            # elif book == u'לאוין':
            #     book = u'Sefer Mitzvot Gadol, Volume One'
            try:
                if word != u'שם':
                    derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)', str_list[i+1].strip())])
            except IndexError:
                # mass.ErrorFile.write('error smg, no place in book notation')
                mass.error_flag = 'error smg, no place in book notation'
                print 'error smg, no place in book notation'
                return
            if word == u'עשין' and len(derabanan) > 1 and (derabanan[0] != u"סימן"):
                book = re.search(u'[א-ה]', derabanan[1])
                # print book.group(0)
                book = self._table[book.group(0)]
                derabanan_flag = True
            elif re.match(reg_book, word):
                book = self._table[word]
            else:
                mass.ErrorFile.write("error smg, don't recognize book name")
                print "error smg, don't recognize book name", book
                return
        else:
            mitzva = re.split('\s', word)
            for m in mitzva:
                # if m == u'סימן':
                #     continue
                if m == u'שם':
                    m = None
                elif getGematriaVav(m):
                    m = getGematriaVav(m)
                else:
                    m = None
                resolved = self._tracker.resolve(book, [m])
                resolveds.append(resolved)
    if not resolveds:
        resolved = self._tracker.resolve(book, [None])
        resolveds.append(resolved)
    # print resolveds
    return resolveds
def parse_semag(self, str, mass):
    reg_book = re.compile(u'ו?ב?(עשין|שם|לאוין|לאין)')
    split = re.split(reg_book, str.strip())
    str_list = filter(None, [item.strip() for item in split])
    resolveds = []
    derabanan_flag = False
    book = None
    reg_siman = u"סי'?|סימן"
    reg_vav = u'ו{}'.format(reg_siman)
    for i, word in enumerate(str_list):
        if derabanan_flag:
            derabanan_flag = False
            # resolved = self._tracker.resolve(book, [1])
            resolved = resolveExceptin(self._tracker, book, [1])
            resolveds.append(resolved)
            continue
        elif re.search(reg_book, word):
            try:
                if word != u'שם':
                    derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)', str_list[i+1].strip())])
            except IndexError:
                mass.write_shgia('error smg, no place in book notation')
                return
            if word == u'עשין' and len(derabanan) > 1:
                book = re.search(u'[א-ה]', derabanan[1])
                book = self._table[book.group(0)]
                derabanan_flag = True
            elif re.match(reg_book, word):
                book = self._table[word]
            else:
                mass.write_shgia("error smg, don't recognize book name")
                return
        else:
            mitzva = re.split('\s', word)
            for m in mitzva:
                if re.search(reg_vav, m) and not book:
                    # resolved = self._tracker.resolve(book, [None])
                    resolved = resolveExceptin(self._tracker, book, [None])
                    resolveds.append(resolved)
                if m == u'ו?שם':
                    m = None
                elif re.search(reg_siman, m):
                    continue
                elif getGematriaVav(m, mass):
                    m = getGematriaVav(m, mass)
                else:
                    m = None
                # resolved = self._tracker.resolve(book, [m])
                resolved = resolveExceptin(self._tracker, book, [m])
                resolveds.append(resolved)
    if not resolveds:
        # resolved = self._tracker.resolve(book, [None])
        resolved = resolveExceptin(self._tracker, book, [None])
        resolveds.append(resolved)
    if len([item for item in resolveds if not isinstance(item, Ref)]) > 0:
        mass.write_shgia(u'error from ibid in Ref or table none problem')
    return resolveds
def wptexturize(self, text):
    # Transform into regexp sub-expression used in _wptexturize_pushpop_element
    # Must do this every time in case plugins use these filters in a context sensitive manner
    no_texturize_tags = '(' + '|'.join(self.default_no_texturize_tags) + ')'
    no_texturize_shortcodes = '(' + '|'.join(self.default_no_texturize_shortcodes) + ')'

    no_texturize_tags_stack = []
    no_texturize_shortcodes_stack = []

    # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's
    # greediness), I modified the regular expression accordingly
    textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

    result = []
    for curl in textarr:
        if len(curl) == 0:
            continue

        # Only call _wptexturize_pushpop_element if first char is correct tag opening
        first = curl[0]
        if '<' == first:
            self.__wptexturize_pushpop_element(curl, no_texturize_tags_stack,
                                               no_texturize_tags, '<', '>')
        elif '[' == first:
            self.__wptexturize_pushpop_element(curl, no_texturize_shortcodes_stack,
                                               no_texturize_shortcodes, '[', ']')
        elif len(no_texturize_shortcodes_stack) == 0 and len(no_texturize_tags_stack) == 0:
            # This is not a tag, nor is the texturization disabled
            # static strings
            for search, replacement in self.static:
                curl = curl.replace(search, replacement)
            # regular expressions
            for search, replacement in self.dynamic:
                curl = regex.sub(search, replacement, curl)
        curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
        result.append(curl)
    return ''.join(result)
def siman_smk_exctractor(smk_text):
    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end + 1):
                simanim.append(siman)
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
def get_text_completion_list(self):
    # words = self.toPlainText().split(QRegExp("[^a-zA-Z0-9_]"),
    #                                  QString.SkipEmptyParts)
    words = regex.split("\W+", unicode(self.toPlainText()), flags=regex.UNICODE)
    word_till_cursor = unicode(self.word_till_cursor())
    word_under_cursor = unicode(self.word_under_cursor())
    # words.removeDuplicates()
    # words.sort()
    words = sorted(set(words))
    completion_list = []
    completion_list_not_start_with = []
    for word in words:
        if (
            word != word_till_cursor
            and word != word_under_cursor
            and word.upper().find(word_till_cursor.upper()) == 0
        ):
            completion_list.append(word)
        elif word != word_till_cursor and len(word) > len(word_till_cursor):
            words_till_cursor = [x for x in word_till_cursor.split("_")
                                 if x != "" and len(x) >= 2]
            matches = 0
            for word_tc in words_till_cursor:
                if word.upper().find(word_tc.upper()) >= 0:
                    matches += 1
            if matches == len(words_till_cursor):
                completion_list.append(word)
            elif matches * 1.20 >= len(words_till_cursor):
                completion_list_not_start_with.append(word)
    return (
        super(WithWordCompletionMulty_, self).get_text_completion_list()
        + completion_list
        + completion_list_not_start_with
    )
def split(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs):
    """Wrapper for split."""

    return regex.split(
        _apply_search_backrefs(pattern, flags), string,
        maxsplit, flags, concurrent, **kwargs
    )
def node_volumes(node, volume):
    book = re.split('\s', node.full_title())
    book = book[1]
    if volume == u'א':
        title = u'I ' + book
        return library.get_schema_node(title)
    return library.get_schema_node(u'II ' + book)
def read_dst(filename):
    """
    Function reads files in Phylip dst-format.

    Parameters
    ----------
    filename : string
        Name of the file which should have the extension ``dst``.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.
    """
    try:
        f = open(filename)
    except:
        print("[!] Could not find the file {0}!".format(filename))

    taxa, matrix = [], []

    for i, line in enumerate(f):
        if i > 0:
            taxa.append(line[0:10].strip())
            matrix.append([float(i) for i in re.split('\s+', line[11:].strip())])

    return taxa, matrix
def tokenize_words_old(str):
    str = str.replace(u"־", " ")
    str = re.sub(r"</?[^>]+>", "", str)  # get rid of html tags
    str = re.sub(r"\([^\(\)]+\)", "", str)  # get rid of refs
    str = str.replace('"', "'")
    word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", str))
    return word_list
def __get_codon_usage(self):
    '''Gets the codon usage table for a given taxonomy id.'''
    aa_to_codon_prob = {aa_code: {} for aa_code in AA_CODES.values()}

    url = 'http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=' \
        + self.__taxonomy_id + '&aa=1&style=GCG'

    in_codons = False

    for line in urllib2.urlopen(url):
        if line == '<PRE>\n':
            in_codons = True
        elif line == '</PRE>\n':
            break
        elif in_codons:
            values = re.split('\\s+', line)

            if values[0] in AA_CODES:
                codon_prob = aa_to_codon_prob[AA_CODES[values[0]]]
                codon_prob[values[1]] = float(values[3])

    aa_to_codon_prob.update((x, _scale(y)) for x, y in aa_to_codon_prob.items())

    return aa_to_codon_prob
def get_lang_class(name):
    if name in LANGUAGES:
        return LANGUAGES[name]
    lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
    if lang not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[lang]
def fileiter(src, ext=None, rex=None):
    """Iterate over files starting at src.

    ext can be a string of space-separated extensions. Or it can be an
    empty string. The empty string only matches files with no extension.

    rex should be a regular expression object or pattern. Only files
    which produce a match will be returned.
    """
    if ext is not None and ext != '':
        ext = regex.split(r'[, ]+', ext)

    if rex is not None and type(rex) is str:
        rex = regex.compile(rex)

    extrex = regex.compile(r'.*\.(.*)')

    for dirpath, dirnames, filenames in os.walk(src):
        for infile in (os.path.join(dirpath, a) for a in filenames):
            if ext is not None:
                m = extrex.search(infile)
                if m is None:
                    if ext != '':
                        continue
                else:
                    if m[1] not in ext:
                        continue
            if rex is not None:
                if rex.search(infile) is None:
                    continue
            yield infile
def split(self, pattern, maxsplit=0, flags=0, out_lvs=[''], dupe=False):
    split_list = re.split(pattern, str(self), maxsplit, flags)
    out_lvs = [out_lv if out_lv else self.lv for out_lv in out_lvs]
    out_lvs += [out_lvs[-1]] * (len(split_list) - len(out_lvs))
    if dupe:
        split_list += [split_list[-1]] * (len(out_lvs) - len(split_list))
    return [Ex(split_string, lv) for split_string, lv in zip(split_list, out_lvs)]
def writeout(self, igraph, out):
    char = chr(int(igraph['code'], 16))
    if char not in self.existing or char in self.seen:
        return
    definition = igraph.get('kDefinition', '')
    definition = regex.sub(r' U\+\w+', '', definition)
    phon = set()
    mn = igraph.get('kMandarin', None)
    hu = igraph.get('kHanyuPinlu', None)
    hn = igraph.get('kHanyuPinyin', None)
    if hn:
        hn = regex.sub(r'\d+\.\d+:', '', hn)
    if hu:
        hu = regex.sub(r'\(\d+\)', '', hu)
    for p in [mn, hu, hn]:
        if p:
            phon.update(regex.split(r'[, ]+', p))
    phon = ",".join(sorted(phon))
    if not phon:
        return
    if not self.first:
        out.write(',\n')
    else:
        self.first = False
    out.write('\'{}\': {}'.format(char, [phon, definition]))
def Populate(self, filename):
    with open(filename) as f:
        for num, line in enumerate(f, 1):
            if num == 1:
                self.Title = line
                continue
            match = regex_acte.match(line)
            if match:
                cur_acte = Acte(match.group("NB_Acte").strip())
                self.Acts.append(cur_acte)
            else:
                match = regex_scene.match(line)
                if match:
                    cur_scene = Scene(match.group("NB_Scene").strip())
                    cur_acte.Scenes.append(cur_scene)
                else:
                    match = regex_character.match(line)
                    if match:
                        info = [i.replace('\n', '') for i in re.split(", |\. ", line)]
                        personnage = [x for x in info if x.isupper()]
                        didascalie = [x for x in info if not x.isupper()]
                        self.Characters.update(personnage)
                        cur_replique = Replique(personnage, didascalie)
                        cur_scene.Repliques.append(cur_replique)
                    else:
                        if line and 'cur_replique' in locals():
                            cur_replique.text += line
def __init__(self, text, window, lang_en, def_counter):
    """
    :param text: Complete tale/story
    :param window: Story_UI window
    """
    QtCore.QObject.__init__(self)
    self.word_list = re.split('\s', text)
    self.window = window
    self.sentence_list = regex.split("(?V1)(?<=\.|:|;|-|,|\!)", text)
    self.sentence_list = self.join_short_sentences()
    self.keyword_list = []
    self.timing_list = []

    self.pool = ThreadPool(4)
    self.keyword_list = self.pool.map(derive_keyword, self.sentence_list)
    self.pool.close()
    self.pool.join()

    self.audio_service = AudioService(window)
    self.audio_thread = threading.Thread(target=self.audio_service.prepare_voice,
                                         args=(self.sentence_list, def_counter))
    self.audio_thread.setDaemon(True)
    self.audio_thread.start()

    self.image_thread = threading.Thread(target=image_from_keyword_list,
                                         args=(self.keyword_list, window, lang_en))
    self.image_thread.setDaemon(True)
    self.image_thread.start()
def separate_string(string):
    """
    >>> separate_string("test <2>")
    (['test ', ''], ['2'])
    """
    string_list = regex.split(r'<(?![!=])', regex.sub(r'>', '<', string))
    return string_list[::2], string_list[1::2]  # Returns even and odd elements
def read_list(filenames, listname, domlst, ip4lst, ip6lst, rxlst):
    for filename in filenames:
        lines = get_lines(filename, listname)

        if lines:
            count = 0
            for line in lines:
                count += 1
                entry = regex.split('\s*#\s*', line.replace('\r', '').replace('\n', ''))[0].strip()  # Strip comments and line-feeds
                if entry:
                    if is_ip4.search(entry):
                        ip4lst[entry] = entry
                    elif is_ip6.search(entry):
                        ip6lst[entry] = entry.lower()
                    elif is_dom.search(entry):
                        domlst[entry.strip('.').lower() + '.'] = entry.lower()
                        #if tldextract.extract(entry)[2]:
                        #    domlst[entry.strip('.').lower() + '.'] = entry
                        #else:
                        #    log_err('LIST [#{0}]: Invalid TLD: \"{1}\"'.format(count, line))
                    else:
                        try:
                            rx = regex.compile(entry, regex.I)  # To test/validate
                            #rxlst[rx] = entry
                            rxlst.add(entry)
                        except BaseException as err:
                            log_err('LIST [#{0}]: Invalid Syntax: \"{1}\"'.format(count, line))
        else:
            log_err('LIST: Empty file \"{0}\"'.format(filename))

    log_info('LIST-TOTALS [{0}]: {1} Domains, {2} IPv4-Addresses, {3} IPv6-Addresses and {4} Regexes'.format(listname, len(domlst), len(ip4lst), len(ip6lst), len(rxlst)))

    return domlst, ip4lst, ip6lst, rxlst
def count_regex_in_all_db(pattern=u'(?:\(|\([^)]*? )שם(?:\)| [^(]*?\))', lang='he', text='all', example_num=7):
    '''
    This method is for counting testing perepesis,
    :param lang:
    :param text:
    :return:
    '''
    found = []
    category_dict = defaultdict(int)
    shams_dict = defaultdict(list)
    vtitle = None
    ind_done = 0
    if text == 'all':
        indecies = library.all_index_records()
        inds_len = len(indecies)
    else:
        indecies = [library.get_index(text)]
    for iindex, index in enumerate(indecies):
        print "{}/{}".format(iindex, len(indecies))
        # if index == Index().load({'title': 'Divrei Negidim'}):
        #     continue
        if text == 'all':
            ind_done += 1
            print ind_done * 1.0 / inds_len
        try:
            unit_list_temp = index.nodes.traverse_to_list(
                lambda n, _: TextChunk(n.ref(), lang, vtitle=vtitle).ja().flatten_to_array() if not n.children else [])
            st = ' '.join(unit_list_temp)
            shams = re.finditer(pattern, st)
            cat_key = u'/'.join(index.categories)
            num_shams = 0
            if shams:
                for s in shams:
                    num_shams += 1
                    curr_sham = s.group()
                    if len(re.split(ur'\s+', curr_sham)) > 6:
                        continue
                    shams_dict[cat_key] += [s.group()]
                # print '{} : {}'.format(index, len(shams))
                found.append((index, num_shams))
                category_dict[cat_key] += num_shams
        except:  # sefaria.system.exceptions.NoVersionFoundError:
            pass
def _add_namespace_to_xpath(self, xpath, ns="{http://spfeopentoolkit.org/ns/spfe-ot/config}"):
    result = []
    for x in re.split(r"([/\[]+)", xpath):
        try:
            result.append(ns + x if re.match("[_\w]", x[0]) else x)
        except IndexError:
            pass
    return "".join(result)
def main(all_articles, freq_out):
    tokens = []
    for l in all_articles:
        # Skip <doc> tags
        if not regex.match(ur'</?doc', l):
            l_tokens = regex.split(ur'[^\p{L}]+', l.lower())
            tokens += [token for token in l_tokens if token and token not in STOPWORDS]
def tokenize(text):
    """uses the nonletters to break the text into words
    returns a list of words"""
    # words = re.split('[\s\-,;:!?.’\'«»()–...&‘’“”*—]+', text)
    # words = re.split('[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.split('\W+', text)
    words = re.split('\P{L}+', text)
    words.remove('')
    return words
def add_page(self, im, segmentation=None, records=None):
    """
    Adds an image to the transcription interface, optionally filling in
    information from a list of ocr_record objects.

    Args:
        im (PIL.Image): Input image
        records (list): A list of ocr_record objects.
    """
    page = {}
    fd = BytesIO()
    im.save(fd, format='png', optimize=True)
    page['index'] = self.page_idx
    self.page_idx += 1
    page['img'] = 'data:image/png;base64,' + base64.b64encode(fd.getvalue()).decode('ascii')
    page['lines'] = []
    if records:
        for record in records:
            splits = regex.split(u'(\s+)', record.prediction)
            bbox = max_bbox(record.cuts)
            line_offset = 0
            segments = []
            for segment, whitespace in zip_longest(splits[0::2], splits[1::2]):
                if len(segment):
                    seg_bbox = max_bbox(record.cuts[line_offset:line_offset + len(segment)])
                    segments.append({'bbox': '{}, {}, {}, {}'.format(*seg_bbox),
                                     'text': segment,
                                     'index': self.seg_idx})
                    self.seg_idx += 1
                    line_offset += len(segment)
                if whitespace:
                    line_offset += len(whitespace)
            page['lines'].append({'index': self.line_idx,
                                  'recognition': segments,
                                  'left': 100 * int(bbox[0]) / im.size[0],
                                  'top': 100 * int(bbox[1]) / im.size[1],
                                  'width': 100 * (bbox[2] - bbox[0]) / im.size[0],
                                  'height': 100 * (int(bbox[3]) - int(bbox[1])) / im.size[1],
                                  'bbox': '{}, {}, {}, {}'.format(int(bbox[0]), int(bbox[1]),
                                                                  int(bbox[2]), int(bbox[3]))})
            self.line_idx += 1
    elif segmentation:
        for bbox in segmentation:
            page['lines'].append({'index': self.line_idx,
                                  'left': 100 * int(bbox[0]) / im.size[0],
                                  'top': 100 * int(bbox[1]) / im.size[1],
                                  'width': 100 * (bbox[2] - bbox[0]) / im.size[0],
                                  'height': 100 * (int(bbox[3]) - int(bbox[1])) / im.size[1],
                                  'bbox': '{}, {}, {}, {}'.format(int(bbox[0]), int(bbox[1]),
                                                                  int(bbox[2]), int(bbox[3]))})
            self.line_idx += 1
    else:
        raise KrakenInputException('Neither segmentations nor records given')
    self.pages.append(page)
def separate_components(src_string, data_element_separator='+',
                        component_data_element_separator=':',
                        segment_terminator='\'', release_character='?'):
    """Separate the components in an EDIFACT segment string."""
    output = []
    if src_string[-1] == '\'':
        src_string = src_string[0:-1]
    simple_separator_pattern = r'(?<!\{rc})\{des}'.format(des=data_element_separator,
                                                          rc=release_character)
    simple_data_elements = regex.split(simple_separator_pattern, src_string)
    component_separator_pattern = r'(?<!\{rc})\{cdes}'.format(cdes=component_data_element_separator,
                                                              rc=release_character)
    for simple_data_element in simple_data_elements:
        components = regex.split(component_separator_pattern, simple_data_element)
        if len(components) == 1:
            output.append(simple_data_element)
        else:
            output.append(components)
    return output
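# Usage sketch (assumed, not from the original source): a raw EDIFACT segment is
# split into simple elements and component lists, honoring the '?' release character.
#
#   separate_components("DTM+137:202101011200:203'")
#   # -> ['DTM', ['137', '202101011200', '203']]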
def _are_all_words_units(self, date_string):
    skip = [_UNITS, r'ago|\d+', r':|[ap]m']

    date_string = re.sub(r'\s+', ' ', date_string.strip())

    words = filter(lambda x: x if x else False, re.split(r'\W', date_string))
    words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
    return not list(words)
def mb_base_tokenizer(str):
    punc_pat = re.compile(ur"(\.|,|:|;)$")

    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(ur"''", ur'"', str)  # looks like double apostrophe in shulchan arukh is meant to be a quote
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [re.sub(punc_pat, u"", w).strip() for w in word_list
                 if len(re.sub(punc_pat, u"", w).strip()) > 0]  # remove empty strings and punctuation at the end of a word
    return word_list
def get_words(text, by_spaces):
    """
    Helper function which splits the given text string into words. If by_spaces is false, then text like
    '01-02-2014' will be split into 3 separate words. For backwards compatibility, this is the default for all
    expression functions.
    :param text: the text to split
    :param by_spaces: whether words should be split only by spaces or by punctuation like '-', '.' etc
    """
    rexp = r'\s+' if by_spaces else r'\W+'
    splits = regex.split(rexp, text, flags=regex.MULTILINE | regex.UNICODE | regex.V0)
    return [split for split in splits if split]  # return only non-empty
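# Usage sketch (assumed, not part of the original source): by_spaces controls whether
# punctuation such as '-' also separates words.
#
#   get_words("01-02-2014 hello", by_spaces=False)  # -> ['01', '02', '2014', 'hello']
#   get_words("01-02-2014 hello", by_spaces=True)   # -> ['01-02-2014', 'hello']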
def _process_text_line(self, text):
    split_text = [token for token in new_regex.split(self.tokenisation_pattern, text)
                  if token != '']
    if self.replace_whitespace:
        new_text = []
        for token in split_text:
            if token.isspace():
                new_text.append(self.replace_whitespace)
            else:
                new_text.append(token)
        split_text = new_text
    split_text = [token.strip(u' ') for token in split_text]  ## prevent multiple spaces
    split_text = [token for token in split_text if token != u'']  ## prevent multiple spaces
    split_text = [token.lower() for token in split_text]  ## lowercase
    text = ' '.join(split_text)
    return text
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_texts, q_tokens, answer = inputs
    examples = []
    for i, doc_text in enumerate(doc_texts):
        for j, paragraph in enumerate(re.split(r'\n+', doc_text)):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples]
def base_tokenizer(base_str):
    base_str = base_str.strip()
    base_str = bleach.clean(base_str, tags=[], strip=True)
    for match in re.finditer(r'\(.*?\)', base_str):
        if library.get_titles_in_string(match.group()) and len(match.group().split()) <= 5:
            base_str = base_str.replace(match.group(), "")
    # base_str = re.sub(ur"(?:\(.*?\)|<.*?>)", u"", base_str)
    base_str = re.sub(r'־', ' ', base_str)
    base_str = re.sub(r'[A-Za-z]', '', base_str)
    for phrase in stop_phrases:
        base_str = base_str.replace(phrase, '')
    word_list = re.split(r"\s+", base_str)
    word_list = [
        re.sub(r'\P{L}', '', re.sub(r'((?<!^)\u05D9)', '', re.sub(r'ו', '', w)))
        for w in word_list if w not in stop_words
    ]  # remove non-leading yuds and all vuvs
    word_list = [w for w in word_list if len(w.strip()) > 0]
    return word_list
def read_twix_hdr(fid):
    # function to read raw data header information from siemens MRI scanners
    # (currently VB and VD software versions are supported and tested)
    nbuffers = np.fromfile(fid, dtype=np.uint32, count=1)[0]

    rstraj = []
    prot = {}
    for b in range(nbuffers):
        # now read string up to null termination
        bufname = fid.read(10).decode(errors='ignore')
        bufname = re.findall('^\w*', bufname)
        bufname = bufname[0]
        fid.seek(fid.tell() + len(bufname) - 9, 0)
        buflen = np.fromfile(fid, dtype=np.uint32, count=1)[0]
        buffer = fid.read(buflen).decode(errors='ignore')
        buffer = ''.join(re.split('\n\s*\n', buffer))
        prot[bufname] = parse_buffer(buffer)

    return prot, rstraj
def identify_desc_lead_comment(self, comment_block):
    comment_block = comment_block.replace("*", "").strip()
    comment_block = comment_block.replace("/", "")

    # check if comment block can be discarded
    first_word = comment_block.split(' ', 1)[0]
    if first_word.isupper() and len(first_word) > 1:
        return False
    if first_word.lower() in self.personal_pronouns:
        return False
    if first_word.lower() == '(non-javadoc)':
        return False

    # divide block into phrases
    phrases = regex.split('[;:.]', comment_block)

    # filter phrases that are nondescriptive
    all_phrases = []
    for phrase in phrases:
        phrase = phrase.lower().strip()
        phrase = self.remove_punctuations(phrase)
        if phrase.split(' ', 1)[0] == "if":
            continue
        if any(x in phrase for x in self.context_verbs):
            continue
        if any(x in phrase for x in self.java_keyverbs):
            continue
        phrase = phrase.split()
        if len(phrase) < 3:
            continue
        all_phrases.append(phrase)

    if len(all_phrases) == 0:
        return False
    return all_phrases
async def _(session: NLPSession):
    stripped_arg = session.msg_text.strip()

    # Split the message into two parts (time | event)
    time, target = re.split(r"(?:提醒)|(?:通知)|(?:叫)|(?:告诉)", stripped_arg, maxsplit=1)

    # Parse the time expression
    tn = TimeNormalizer()
    time_json = tn.parse(time)
    if time_json["type"] == "error":
        return
    # Convert a time delta into an absolute point in time
    elif time_json["type"] == "timedelta":
        time_diff = time_json["timedelta"]
        time_diff = timedelta(
            days=time_diff["day"],
            hours=time_diff["hour"],
            minutes=time_diff["minute"],
            seconds=time_diff["second"],
        )
        time_target = datetime.now() + time_diff
    elif time_json["type"] == "timestamp":
        time_target = datetime.strptime(time_json["timestamp"], "%Y-%m-%d %H:%M:%S")

    # Default to 12:00 noon when no hour was given
    if (not re.search(r"[\d+一二两三四五六七八九十]+点", time)
            and time_target.hour == 0
            and time_target.minute == 0
            and time_target.second == 0):
        time_target = time_target.replace(hour=12)

    return IntentCommand(
        90.0,
        "_alarm",
        args={
            "time": time_target,  # type: ignore
            "target": target.lstrip("我"),
        },
    )
def segment_column(segmentfile, reffile, massekhet, wikitext=False):
    final_list = []
    i = 0
    with open(segmentfile, 'r') as csvfile:
        seg_reader = csv.DictReader(csvfile)
        with open(reffile, 'r') as csvfile:
            ref_reader = csv.DictReader(csvfile)
            for segrow, refrow in zip(seg_reader, ref_reader):
                i += 1
                if not wikitext:
                    daf, daf_line = segrow[u'Daf'], segrow[u'Line']
                else:
                    split = re.split(u'[\s:]', segrow[u'full line'])
                    daf, daf_line = split[1], split[2]
                smg = convert_smg(refrow[u'Semag'])
                letter_dict = {u'Segment': u'{}.{}.{}'.format(massekhet, daf, daf_line),
                               u'Rambam': refrow[u'Rambam'],
                               u'Semag': smg,
                               u'Tur Shulchan Arukh': refrow[u'Tur Shulchan Arukh']}
                final_list.append(letter_dict)
    return final_list
def _record(self, context):
    source, match = context
    try:
        line = source.next_line
    except EOFError:
        return "END", context
    indent = len(line) - len(line.lstrip())
    if self.patterns['blank-line'].match(line):
        return "SAM", context
    elif indent < self.doc.current_block.indent:
        source.return_line()
        return "SAM", context
    else:
        field_values = [x.strip() for x in re.split(r'(?<!\\),', line)]
        if len(field_values) != len(self.doc.fields):
            raise SAMParserError(
                "Record length does not match record set header. At:\n\n " + line)
        record = list(zip(self.doc.fields, field_values))
        self.doc.new_record(record)
        return "RECORD", context
def write_txt(self, *val):
    # Write the current settings back to the text file
    if len(val) != len(self.current):
        return
    with open(user_settings_path, "r") as f:
        lines = f.read()
    new_lines = []
    for line in lines.splitlines(False):
        separated = re.split(':=', line)
        for idx in range(len(val)):
            if separated[0] == self.current[idx][0]:
                separated[1] = val[idx]
                break
        new_lines.append(':='.join(map(str, separated)))
    with open(user_settings_path, "w") as f:
        f.write('\n'.join(new_lines))
def snake_case(s):
    """
    convert token(s) to snake case
    >>> snake_case('fooBar')
    'foo_bar'
    >>> snake_case('foo_bar')
    'foo_bar'
    >>> snake_case('foo-bar')
    'foo_bar'
    >>> snake_case('FooBar')
    'foo_bar'
    >>> snake_case('Foo-Bar')
    'foo_bar'
    >>> snake_case('foo bar')
    'foo_bar'
    """
    s = ensure_str(s)
    # turn uppercase letters into a separator plus lowercase
    s = re.sub(r"[A-Z]", r"-\g<0>", s, flags=re.UNICODE)
    words = compact(re.split(r"\W+", s, flags=re.UNICODE))
    return "_".join([word.lower() for word in words])
def searchQuery(self):
    for prev_res in self.quartoContainer.winfo_children():
        if isinstance(prev_res, Button):
            prev_res.destroy()

    query = self.query_string.get()
    query = re.sub(r'\p{punct}', r'', query)
    query = list(filter(lambda x: len(x) > 0, re.split(r'\s+', query)))  # remove empty query terms

    if len(query) == 0:
        self.msg_search["text"] = "Bad query for the search. Try again!"
    else:
        self.results = rsspider.procRequest(query)
        self.msg_search["text"] = "Search done. Look for your results!"
        for result in self.results:
            resfield = Button(self.quartoContainer, text=result, font=("Calibri", "12"),
                              command=lambda result=result: self.openDocument(rsspider.directory + result))
            resfield.pack()
def dou_parse(self, source: ExtractResult) -> ParseResult:
    result = ParseResult(source)
    source_text = self.replace_unit(source.text)

    if (regex.search(self.config.double_and_round_regex, source.text)) is not None:
        power = self.config.round_number_map_char[source_text[-1:]]
        result.value = self.get_digit_value(source_text[:-1], power)
    else:
        split_result = regex.split(self.config.point_regex, source_text)
        if split_result[0] == '':
            split_result[0] = self.config.zero_char
        if regex.search(self.config.negative_number_sign_regex, split_result[0]) is not None:
            result.value = self.get_int_value(split_result[0]) - self.get_point_value(split_result[1])
        else:
            result.value = self.get_int_value(split_result[0]) + self.get_point_value(split_result[1])

    result.resolution_str = self.__format(result.value)
    return result
def scan_yt_description(description):
    tracklist = find_multi_line_tracklist(description)
    if tracklist:
        return tracklist
    try:
        _, tracklist_text = regex.split(contains_tracklist_regex, description, flags=regex.IGNORECASE)
        print("contains track list")
        tracklist = match(tracklist_text,
                          comment_regexps["single_track_per_line_description_only"],
                          regex.MULTILINE)
    except ValueError:
        print("No Tracklist in description")
        pass
    if tracklist:
        return tracklist
    return None
def main():
    """ print the 'df' command the way I like it """
    if sys.platform == 'darwin':
        ln = 'gdf --total -BG -t apfs -t ntfs -t msdos -l --output=size,used,avail,pcent,fstype,target'
    else:
        ln = 'df --total -BG -t ext4 -t xfs -t ntfs -t msdos -l --output=size,used,avail,pcent,fstype,target'
    titles = ['Blocks', 'Used', 'Avail', 'Use%', 'Type', 'Mounted']
    for title in titles:
        print(title.ljust(10), end='')
    print()
    print('-' * 60)
    # print('123456789|'*7)
    for index, line in enumerate(spawn(ln)):
        if index == 0:
            continue
        split = regex.split(r' *', line.lstrip())
        for item in split:
            print('{}'.format(item.rstrip().ljust(10)), end='')
        print()
def cut(line):
    dest_words = []
    line = line.lower()
    line = unicodedata.normalize("NFKC", line)
    src_clauses = re.split("[\p{P}\s+\-~─⋯]+", line)
    segments = flatten([
        filter(lambda w: w, [match.group() for match in hanzi_others.finditer(clause)])
        for clause in src_clauses
    ])
    for segment in segments:
        #src_clause = "".join(Sentence.from_line(src_clause, remove_punct=True, form=args.form))
        if not hanzis.match(segment) and segment not in lexicon:  # don't split non-hanzi oov
            maybe_words = [[segment]]
        else:
            maybe_words = dict_seg(segment, dict_lexicon)
        dest_words += maybe_words[0]
    dest_sent = " ".join(dest_words)
    return dest_sent
def text_stirr(input_string: str) -> str:
    """Stirs every word in a given text.

    Parameters
    ----------
    input_string : str
        Text for processing (may contain line breaks and unicode symbols)

    Returns
    -------
    str
        Same text, but with stirred words.
    """
    text_list: List = []
    while input_string:
        split = regex.split(r"([^\p{L}]+)", input_string, maxsplit=1)
        text_list.append(_process_word(split[0]))
        if len(split) == 1:
            # no non-letter separator left: the final word has been processed
            break
        text_list.append(split[1])
        input_string = split[2]
    return "".join(text_list)
def add_default_prefix(prefix, selector_text):
    """
    Adds prefix to all unprefixed type selector tokens (tag names)
    in selector_text. Returns prefixed selector.
    """
    # Note: regex here are valid thanks to cssutils's normalization
    # of selectors text (e.g. spaces around combinators are always added,
    # sequences of whitespace characters are always reduced to one U+0020).
    selector_ns = ''
    # https://www.w3.org/TR/css-syntax-3/#input-preprocessing
    # states that \r, \f and \r\n must be replaced by \n
    # before tokenization.
    for token in re.split(
            r'(?<!\\(?:[a-fA-F0-9]{1,6})?)([ \n\t]:not\(|[ \n\t])',
            selector_text):
        if (re.match(r'-?(?:[A-Za-z_]|\\[^\n]|[^\u0000-\u007F])', token)
                and not re.search(r'(?<!\\)\|', token)):
            selector_ns += '{}|{}'.format(prefix, token)
        else:
            selector_ns += token
    return selector_ns
def answer_subsentence_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings['answer_subsentence_similarity_modifier_value'] is None or \
            len(answer) < score_settings['answer_subsentence_similarity_sentence_len'] or valid_emoticon:
        return 0

    # Split response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for sunsentence2 in answer[num + 1:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(subsentence, sunsentence2))

    # Not similar
    if max_ratio < score_settings['answer_subsentence_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['answer_subsentence_similarity_modifier'] == 'value':
        return score_settings['answer_subsentence_similarity_modifier_value']

    # Apply multiplier
    if score_settings['answer_subsentence_similarity_modifier'] == 'multiplier':
        return (max_ratio - score_settings['answer_subsentence_similarity_threshold']) / \
               (1 - score_settings['answer_subsentence_similarity_threshold']) * \
               score_settings['answer_subsentence_similarity_modifier_value']

    return 0
async def sed(c: Client, m: Message):
    exp = regex.split(r"(?<![^\\]\\)/", m.text)
    pattern = exp[1]
    replace_with = exp[2].replace(r"\/", "/")
    flags = exp[3] if len(exp) > 3 else ""

    count = 1
    rflags = 0

    if "g" in flags:
        count = 0

    if "i" in flags and "s" in flags:
        rflags = regex.I | regex.S
    elif "i" in flags:
        rflags = regex.I
    elif "s" in flags:
        rflags = regex.S

    text = m.reply_to_message.text or m.reply_to_message.caption

    if not text:
        return

    try:
        res = regex.sub(pattern, replace_with, text, count=count, flags=rflags, timeout=1)
    except TimeoutError:
        await m.reply_text(await tld(m.chat.id, "regex_timeout"))
    except regex.error as e:
        await m.reply_text(str(e))
    else:
        await c.send_message(
            m.chat.id,
            f"{html.escape(res)}",
            reply_to_message_id=m.reply_to_message.message_id,
        )
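# Usage sketch (assumed, not from the original source): replying to a message with
#   s/colour/color/gi
# splits on the unescaped "/" pattern into ['s', 'colour', 'color', 'gi'], so the
# handler runs regex.sub('colour', 'color', ...) case-insensitively with count=0
# (replace every occurrence).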
def request_nobey(url='https://raw.githubusercontent.com/NoBey/Shadowsocks-free/master/README.md'):
    def strip_dot(x):
        return

    print('req nobey...')
    servers = list()
    try:
        data = re.split('##+|---+', requests.get(url).text)[2:5:2]
        info = {'message': '', 'name': 'NoBey',
                'url': 'https://github.com/NoBey/Shadowsocks-free'}
        for i, server in enumerate(data):
            server = server.split('\n')
            name = server[0].strip()
            (ips, ports, _, method, password) = list(map(
                lambda server: list(map(
                    lambda x: x.strip().strip('`').strip(),
                    server.strip('-').strip().split()[1:])),
                server[1:6]))
            method = method[0]
            password = password[0]
            for j, ip in enumerate(ips):
                for k, port in enumerate(ports):
                    servers.append(dict())
                    servers[-1]['remarks'] = 'NoBey {}-{}-{}'.format(name, j, k)
                    (servers[-1]['server'], servers[-1]['password'],
                     servers[-1]['server_port'], servers[-1]['method']) = (ip, password, port, method)
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
def titlecase(text: Union[str, pd._libs.missing.NAType],
              articles: Set[str] = {},
              abbrev: Set[str] = {}) -> Optional[str]:
    """
    Returns a title cased location name from the given location name
    *tokens*. Ensures that no tokens contained in the *whitelist_tokens*
    are converted to title case.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbrev = {'USA', 'DC'}
    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'
    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'
    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'
    >>> titlecase("washington DC, usa", articles, abbrev)
    'Washington DC, USA'
    """
    if not isinstance(text, str):
        return

    words = enumerate(regex.split(r'\b', text, flags=regex.V1))

    def changecase(index, word):
        casefold = word.casefold()
        upper = word.upper()

        if upper in abbrev:
            return upper
        elif casefold in articles and index != 1:
            return word.lower()
        else:
            return word.title()

    return ''.join(changecase(i, w) for i, w in words)
def reconstruct_with_max_seq(doc, max_seq, tokenizer):
    ret = []
    to_add = []
    len_to_add = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        len_split = len(tokenizer.tokenize(split))
        if len(to_add) > 0 and len_to_add + len_split > max_seq:
            ret.append(' '.join(to_add))
            to_add = []
            len_to_add = 0
        to_add.append(split)
        len_to_add += len_split
    if len(to_add) > 0:
        ret.append(' '.join(to_add))
    return ret
def ngroupChunkerCount(essaySet):
    """
    Count the ngroup (to distinguish it from single NN as a ngroup, we filter by len(ngroup) >= 3)
    in given essay set.
    :param essaySet: a list of essays.
    :return: a list of numbers, representing the number of long ngroup used in each essay.
    """
    grammar = r"""
        NP: {<DT>*(<NN.*>|<JJ.*>)*<NN.*>}   # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}                      # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$}        # Chunk verbs and their arguments
    """
    cp = nltk.RegexpParser(grammar)
    ngroupCount = []
    for essay in essaySet:
        try:
            essay = essay.lower()
            sentences = filter(None, regex.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', essay))
            count = 0
            for s in sentences:
                s = s.decode('utf-8', 'ignore')
                s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
                tree = cp.parse(filter(None, nltk.pos_tag(s.split())))
                for subtree in tree.subtrees(filter=filt):
                    if len(subtree) >= 3:
                        count += 1
            ngroupCount.append(count)
        except Exception:
            print "Cannot write word_list into file due to the exception:", sys.exc_info()[0]
    return ngroupCount
def find_change_flow(vba_func_dict, DG):
    """Finds alternative macros call flow that is utilized by malicious macros:
    A _Change event is created for an object, and then the object text is
    changed using code. This creates a dummy call flow without explicitly
    calling a function.

    Args:
        vba_func_dict (dict[func_name]=func_code): Functions dictionary
        DG (networkx.DiGraph): Generated directed graph

    Returns:
        networkx.DiGraph: Directed Graph with highlighted Change triggers
    """
    # Find all the objects that have a _Change event, like TextBox1_Change
    changed_objects = []
    for func_name in vba_func_dict:
        if "_Change" in func_name:
            changed_object = func_name.replace("_Change", "")
            changed_objects.append(changed_object)

    # Find pieces of code that assign to an object, which would
    # cause a _Change event Trigger
    for func_name in vba_func_dict:
        func_code = vba_func_dict[func_name]
        # split function code into lines
        func_code_lines = [_f for _f in re.split("\n", func_code) if _f]
        for func_line in func_code_lines:
            for changed_object in changed_objects:
                # look for .[changed_object] pattern, followed by "="
                found_loc = func_line.find("." + changed_object)
                if found_loc > -1:
                    if func_line.find("=", found_loc) > -1:
                        # we found an object with a Change event that was assigned a value;
                        # show this connection as a function call
                        DG.add_edge(func_name, changed_object + "_Change",
                                    label="Triggers",
                                    fontcolor=color_scheme["COLOR_TRIGGERED_CALL_EDGE"])
    return DG
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    GROUP_LENGTH = 0
    docs = []
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        # yield ' '.join(curr)
        docs.append(' '.join(curr))
    return docs
def _larkToSynExc(self, e):
    '''
    Convert lark exception to synapse BadSyntax exception
    '''
    mesg = regex.split('[\n!]', str(e))[0]
    at = len(self.text)
    if isinstance(e, lark.exceptions.UnexpectedCharacters):
        expected = sorted(terminalEnglishMap[t] for t in e.allowed)
        mesg += f'. Expecting one of: {", ".join(expected)}'
        at = e.pos_in_stream
    elif isinstance(e, lark.exceptions.UnexpectedEOF):
        expected = sorted(terminalEnglishMap[t] for t in set(e.expected))
        mesg += ' ' + ', '.join(expected)
    elif isinstance(e, lark.exceptions.VisitError):
        # Lark unhelpfully wraps an exception raised from AstConverter in a VisitError.  Unwrap it.
        origexc = e.orig_exc
        if not isinstance(origexc, s_exc.SynErr):
            raise  # pragma: no cover
        origexc.errinfo['text'] = self.text
        return s_exc.BadSyntax(**origexc.errinfo)

    return s_exc.BadSyntax(at=at, text=self.text, mesg=mesg)
def run():
    totsum = 0
    for t in lines[1:]:
        vals = re.split(r'\s', t)
        if len(vals) > 1:
            edges[tuple(vals[:2])] = vals[2]
            totsum += int(vals[2])
    print(totsum)
    print(totsum / 2)
    cut = dict()
    for i in range(int(v_count)):
        if flip(0.5):
            cut[str(i)] = i
    maxcut = 0
    for t in edges:
        if (t[0] in cut and t[1] not in cut) or (t[0] not in cut and t[1] in cut):
            maxcut += int(edges[(t[0], t[1])])
    return maxcut
def get_stats(code=None):
    '''
    Retrieves some standard statistics of a given piece of solidity code.
    '''
    is_request = False
    if not code:
        code = request.form.get('data')
        is_request = True
    clean = re.split(r'\n|//.*|/\*[\s\S]*?\*/', code)
    lines = [x for x in clean if x and x.strip() != ""]
    line_count = len(lines)
    dependencies = len([x for x in lines if "import" in x])
    complexity = len([x for x in lines if re.search(r'\(', x)])
    output = {"LOC": line_count, "Dependencies": dependencies, "Cyclomatic_Complexity": complexity}
    if is_request:
        return jsonify(output)
    return output
def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
    # get basic data
    element_counter = 0
    origpost, origpost_red, element_counter, content_texts = \
        cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

    # logme
    self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

    found_parenth = None
    origpost_used = origpost_red

    # log all location elements
    only_add_if_value = True
    split_post = regex.split('u\.|und|,', origpost_used)
    for entry in split_post:
        entry_stripped = entry.strip("., ")

        # find additional info in each line and subtract it
        # find last parenthesis and filter
        #match_parenth = regex.findall(r"(\(.*?\))", entry_stripped)
        #combined_ps = []
        #for res in match_parenth:
        #    combined_ps.append(res.strip())
        #origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
        # log additional info in last parenthesis
        #self.ef.add_to_my_obj("additional_info", combined_ps, object_number=element_counter,
        #                      only_filled=only_add_if_value)
        #if entry_stripped is None or entry_stripped == "":
        #if match_parenth:
        #    element_counter += 1

        entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
        if entry_stripped == None or entry_stripped == "":
            continue

        self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter,
                              only_filled=only_add_if_value)
        element_counter += 1

    return True