def misc_fixes(filepath):
    if filepath.stem == 'info':
        return
    po = pofile(filepath.open('r'))
    changed = False
    for i, unit in enumerate(po.units):
        # Fix immersion
        for j, msgid in enumerate(unit.msgid):
            if 'immersion' in msgid:
                unit.msgid[j] = msgid.replace('immersion', 'samādhi')
                changed = True
        # Fix evam
        evam_found = False
        if not evam_found and any(msgid.startswith('"Evaṃ me sutaṃ') for msgid in unit.msgid):
            evam_found = True
            if 'evam' not in ''.join(unit.automaticcomments):
                # append() rather than +=: adding a str to a list with +=
                # would splice it in character by character.
                unit.automaticcomments.append('#. <span class="evam">\n')
                po.units[i + 1].automaticcomments.insert(0, '#. </span>\n')
                changed = True
        for j, comment in enumerate(unit.automaticcomments):
            new_comment, n = regex.subn(r'-pi([^a-z])', r'-pli\1', comment)
            if n:
                unit.automaticcomments[j] = new_comment
                changed = True
            new_comment, n = regex.subn(r'\bpi-', 'pli-', new_comment)
            if n:
                unit.automaticcomments[j] = new_comment
                changed = True
    if changed:
        po.save()
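# A quick check of the Pali comment substitution above, on a hypothetical
# automatic comment line (the real input comes from the PO files):
import regex

print(regex.subn(r'-pi([^a-z])', r'-pli\1', '#. sn1.1-pi.html\n'))
# -> ('#. sn1.1-pli.html\n', 1)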
def link_manuals():
    index_path = Path("docs/index.html")
    index_text = index_path.read_text()
    (index_text, n) = regex.subn(
        r'(<li><a href="#about">About</a><ul>)',
        (
            '<li><a href="user_manual/index.html">User manual</a></li>'
            "<ul>"
            '<li><a href="user_manual/index.html#how-to-read-this-manual">How to read this manual</a></li>'
            '<li><a href="user_manual/index.html#preparing-your-program-collection">Preparing your program collection</a></li>'
            '<li><a href="user_manual/index.html#taxonomy">Taxonomy</a></li>'
            '<li><a href="user_manual/index.html#pipeline-tutorial">Pipeline tutorial</a></li>'
            '<li><a href="user_manual/index.html#pipeline-documentation">Pipeline documentation</a></li>'
            '<li><a href="user_manual/index.html#glossary">Glossary</a></li>'
            "</ul>"
            '<li><a href="developer_manual/index.html">Developer manual</a></li>'
            "<ul>"
            '<li><a href="developer_manual/index.html#bird-view">Bird view</a></li>'
            '<li><a href="developer_manual/index.html#helper-programs">Helper programs</a></li>'
            '<li><a href="developer_manual/index.html#tag-databases">Tag databases</a></li>'
            '<li><a href="developer_manual/index.html#implementation-notes">Implementation notes</a></li>'
            "</ul>"
            r"\1"
        ),
        index_text,
    )
    assert n == 1
    (index_text, n) = regex.subn(r'\b(src|href)="docs/', r'\1="', index_text)
    assert n == 2
    index_path.write_text(index_text)
def remove_citations(text):
    n_subs_made = 1
    while n_subs_made > 0:
        text, n_subs_made = regex.subn(citation_re, ';', text)
        text, n_subs_made2 = regex.subn(citation_re, ';', text)
        n_subs_made += n_subs_made2
    return text
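# A minimal usage sketch for remove_citations(). `citation_re` is not defined
# in this snippet, so the pattern below is a hypothetical stand-in that
# matches parenthesised citations such as "(Smith 2020)".
import regex

citation_re = regex.compile(r'\s*\(\p{Lu}\p{L}+ \d{4}\)')

print(remove_citations("As shown (Smith 2020), results hold (Doe 2019)."))
# -> 'As shown;, results hold;.'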
async def doit(chat, match):
    fr = match.group(1)
    to = match.group(2)
    to = to.replace('\\/', '/')
    try:
        fl = match.group(3)
        if fl is None:
            fl = ''
        fl = fl[1:]
    except IndexError:
        fl = ''
    # Build Python regex flags
    count = 1
    flags = 0
    for f in fl:
        if f == 'i':
            flags |= re.IGNORECASE
        elif f == 'g':
            count = 0
        else:
            await chat.reply('unknown flag: {}'.format(f))
            return
    # Handle replies
    if 'reply_to_message' in chat.message:
        # Try to find the original message text
        message = chat.message['reply_to_message']
        original = find_original(message)
        if not original:
            return
        # Substitute the text (apply the parsed count/flags here as well)
        try:
            s, i = re.subn(fr, to, original, count=count, flags=flags)
            if i > 0:
                return (await Chat.from_message(bot, message).reply(s))['result']
        except Exception as e:
            await chat.reply('u dun goofed m8: ' + str(e))
        return
    # Try matching the last few messages
    global last_msgs
    if chat.id not in last_msgs:
        return
    for msg in reversed(last_msgs[chat.id]):
        try:
            original = find_original(msg)
            if not original:
                continue
            s, i = re.subn(fr, to, original, count=count, flags=flags)
            if i > 0:
                return (await Chat.from_message(bot, msg).reply(s))['result']
        except Exception as e:
            await chat.reply('u dun goofed m8: ' + str(e))
            return
def beta_code(self, text):
    """Replace method.

    Note: regex.subn() returns a tuple (new_string, number_of_subs_made).
    """
    text = text.replace('-', '')
    for (pattern, repl) in self.pattern1:
        text = regex.subn(pattern, repl, text)[0]
    for (pattern, repl) in self.pattern2:
        text = regex.subn(pattern, repl, text)[0]
    # remove third run, if punct list not used
    for (pattern, repl) in self.pattern3:
        text = regex.subn(pattern, repl, text)[0]
    return text
def update_readme_example():
    source = Path("docs/resources/fibonacci.py").read_text().strip()
    readme_path = Path("README.md")
    readme_text = readme_path.read_text()
    (readme_text, n) = regex.subn(
        r"(?sm)^\| Taxon \| Lines \|.+?(?=\n\n)",
        tag_program(f"# {source}"),
        readme_text,
        count=1,
    )
    assert n == 1
    (readme_text, n) = regex.subn(r"(?<=paroxython )\S+(?= loaded)", VERSION, readme_text)
    assert n == 1
    readme_path.write_text(readme_text)
def name(name):
    name, number = re.subn(r'\s', '_', name.strip())
    if number:
        return redirect(url_for('.name', name=name))
    board = Board.q.filter(Board.name == name).one_or_none()
    return render_template('boards_view.html', board=board)
def substitute(m):
    if not m.raw_text:
        return None
    s, i = re.subn(fr, to, m.raw_text, count=count, flags=flags)
    if i > 0:
        return s
async def substitute(original, msg):
    try:
        s, i = re.subn(fr, to, original, count=count, flags=flags)
        if i > 0:
            return (await Chat.from_message(bot, msg).reply(s))['result']
    except Exception as e:
        await chat.reply('u dun goofed m8: ' + str(e))
def FixError_GetProcAddress(line):
    # GetProcAddress\((.+,[ ])*_T\((.+?)\)\)  ->  GetProcAddress($1$2)
    rslt = re.subn(r'GetProcAddress\((.+,[ ])*_T\((.+?)\)\)', r'GetProcAddress(\1\2)', line)
    if rslt[1] > 0:
        return rslt[0]
    return line
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
         concurrent=None, **kwargs):
    """Wrapper for subn."""
    pattern = compile_search(pattern, flags)
    return regex.subn(
        pattern, compile_replace(pattern, repl), string,
        count, flags, pos, endpos, concurrent, **kwargs
    )
def combiningStrip(text):
    """From a string, remove combining diacritics and modifiers.

    Parameters:
        text : string

    Requires regex module as re.

    Return string with combining characters removed.
    """
    assert type(text) is str
    unicodeBlockList = [
        r'\p{InCombining_Diacritical_Marks_for_Symbols}',
        r'\p{InSuperscripts_and_Subscripts}',
        r'\p{InCombining_Diacritical_Marks}',
        r'\p{InSpacing_Modifier_Letters}',
        # Note the commas: leaving them out would implicitly concatenate the
        # two patterns below into one (broken) alternative.
        r'\p{InCombining_Diacritical_Marks_Extended}',
        r'\p{InCombining_Diacritical_Marks_Supplement}',
    ]
    additionalChars = [r'ᴸ', r'ᵇ', r':', r'<', r'←', r'=', r"'", r"‚"]
    pattern = r'(' + r'|'.join(unicodeBlockList + additionalChars) + r')'
    pattern = re.compile(pattern)
    result = re.subn(pattern, '', text)
    return result[0]
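# A minimal usage sketch for combiningStrip(), assuming `regex` is imported
# as `re` (as the docstring requires): combining acute and grave accents
# fall in the Combining Diacritical Marks block and are stripped.
import regex as re

print(combiningStrip('a\u0301e\u0300'))  # -> 'ae'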
def FixError_Overlap_T(line):
    # _T\([ ]*(_T\([ ]*".*?"[ ]*\))[ ]*\)  ->  $1
    rslt = re.subn(r'_T\([ ]*(_T\([ ]*".*?"[ ]*\))[ ]*\)', r'\1', line)
    if rslt[1] > 0:
        return rslt[0]
    return line
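# A quick illustration of the two FixError helpers above, on hypothetical
# sample lines (not taken from any real code base):
print(FixError_GetProcAddress('GetProcAddress(hDll, _T("MyFunc"))'))
# -> 'GetProcAddress(hDll, "MyFunc")'
print(FixError_Overlap_T('_T( _T("text") )'))
# -> '_T("text")'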
def _normalize(cleanTxt, dirtyTxt):
    cleanCorpus = unicodedata.normalize('NFKD', cleanTxt.lower()).encode('ascii', 'ignore')
    cleanCorpus = cleanCorpus.decode('ascii')
    dirtyCorpus = dirtyTxt.lower()
    cleanCorpus = (cleanCorpus.replace('_', ' ').replace('|', '').replace('&c.', ' ')
                   .replace('`', ' ').replace('@', ' ')
                   .replace('(', '').replace(')', '').replace('"', '').replace("''", ""))
    dirtyCorpus = (dirtyCorpus.replace('&c.', ' ')
                   .replace('&', ' ').replace('<', ' ').replace('>', ' ')
                   .replace(r"\\", " ").replace('/', ' ')
                   .replace('(', '').replace(')', '').replace('"', ''))
    # Each task is (pattern, replacement, thunk yielding the input text, corpus kind).
    # The first task of each kind starts from cleanCorpus/dirtyCorpus; the
    # `lambda: corpus` thunks refer to the loop variable below, so each later
    # task receives the output of the previous substitution (the tasks chain).
    regexTasks = [
        (r'(\w)\^(\w)', r'\1\2', lambda: cleanCorpus, "clean"),
        (r'([^-+])--([^-+])', r'\1 \2', lambda: corpus, "clean"),
        (r'(\w)- (\w)', r'\1-\2', lambda: corpus, "clean"),
        (r'\+-+?|-+\+|(-\s?){2,}', r' ', lambda: corpus, "clean"),
        (r'\[[=)\']?(\w)[.]?\]', r'\1', lambda: corpus, "clean"),
        (r'[\]\[\}\{]', r' ', lambda: corpus, "clean"),
        (r'(\s?\.\s*){2,}', r' ', lambda: corpus, "clean"),
        (r'\s+', r' ', lambda: corpus, "clean"),
        (r'\n{4}.+\n{4}(?:.+\n{4})?', r' ', lambda: dirtyCorpus, "dirty"),
        (r'(\w)-\s{2,}(\w)', r'\1\2', lambda: corpus, "dirty"),
        (r'(\s?\.\s*){2,}', r' ', lambda: corpus, "dirty"),
        (r'-{3,}', r' ', lambda: corpus, "dirty"),
        (r'\s+', r' ', lambda: corpus, "dirty"),
    ]
    updated_corpora = dict()
    taskNum = 0
    for ((corpus, n), Type) in map(lambda argLst: (regex.subn(*argLst[0]), argLst[1]),
                                   map(lambda T: [(T[0], T[1], T[2]()), T[3]], regexTasks)):
        updated_corpora[Type] = corpus
        if _DEBUG:
            print("substituted {0} of {1} in {2} corpus".format(n, regexTasks[taskNum][0], Type))
        taskNum += 1
        sys.stdout.flush()
    return updated_corpora
def in_memory(string: str) -> str:
    """Determine how string would appear in memory by removing
    opening/closing quotes and resolving escaped characters."""
    in_mem = string[1:-1].replace("\\\\", "x")
    in_mem = in_mem.replace('\\"', "x")
    in_mem, _ = re.subn("\\\\x..", "x", in_mem)
    return in_mem
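# A minimal check of in_memory() on a hypothetical quoted literal: each
# escape sequence collapses to a single placeholder character 'x'.
print(in_memory(r'"ab\\cd\"e\x41f"'))  # -> 'abxcdxexf'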
def inject_flow_diagram_in_nav():
    path = Path("docs/developer_manual/index.html")
    text = path.read_text()
    (text, n) = regex.subn(r"(</nav>)", r'<p><img alt="" src="../resources/flow.png"></p>\1', text)
    assert n == 1
    path.write_text(text)
def update_version_number():
    for path in ["paroxython/cli_tag.py", "paroxython/cli_collect.py"]:
        path = Path(path)
        source = path.read_text()
        (source, n) = regex.subn(
            r"(?<=https://github\.com/laowantong/paroxython/blob/)[^/]+",
            VERSION,
            source,
        )
        assert n == 1, path
        path.write_text(source)
def actually_doit(original):
    try:
        s = original.message
        if s.startswith(HEADER):
            s = s[len(HEADER):]
        s, i = regex.subn(fr, to, s, count=count, flags=flags)
        if i > 0:
            return original, s
    except Exception as e:
        return None, f"u dun goofed m8: {str(e)}"
    return None, None
def inject_taxonomy():
    index_path = Path("docs/user_manual/index.html")
    text = index_path.read_text()
    tree = Path("docs/resources/tree.js").read_text()
    head = f"""
        <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
        <script type="text/javascript">{tree}</script>
    """
    (text, n) = regex.subn("</head>", fr"{head}</head>", text)
    assert n == 1
    index_path.write_text(text)
def update_github_links():
    count = 2
    source = Path("tests/test_recommend_programs.py").read_text()
    path = Path("docs/md/pipeline_documentation.md")
    text = path.read_text()
    (text, n) = regex.subn(
        r"test_recommend_programs.py#L\d+-L\d+",
        "test_recommend_programs.py#L-L",
        text,
    )
    assert n == count
    for i in range(1, count + 1):
        start = source.partition(f"# extract_{i} (start)")[0].count("\n") + 2
        stop = source.partition(f"# extract_{i} (stop)")[0].count("\n")
        assert start < stop
        (text, n) = regex.subn(
            r"test_recommend_programs.py#L-L",
            f"test_recommend_programs.py#L{start}-L{stop}",
            text,
            count=1,
        )
        assert n == 1
    path.write_text(text)
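# The partition/count idiom above computes 1-based line numbers of marker
# comments. A standalone sketch with a hypothetical source string:
demo_source = "a = 1\n# extract_1 (start)\nb = 2\nc = 3\n# extract_1 (stop)\n"
demo_start = demo_source.partition("# extract_1 (start)")[0].count("\n") + 2
demo_stop = demo_source.partition("# extract_1 (stop)")[0].count("\n")
print(demo_start, demo_stop)  # -> 3 4 (the lines between the two markers)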
def actually_doit(original):
    try:
        s, i = regex.subn(fr, to, original.message, count=count, flags=flags)
        if i > 0:
            return original, s
    except Exception as e:
        return None, f"u dun goofed m8: {str(e)}"
    return None, None
def compute_stats():
    readme_path = Path("README.md")
    readme_text = readme_path.read_text()
    cleanup = Cleanup("full")
    directories = ["paroxython", "tests", "helpers"]
    for directory in directories:
        total = 0
        for program_path in Path(directory).glob("**/*.py"):
            source = program_path.read_text()
            # Work around a weird error:
            # tokenize.TokenError: ('EOF in multi-line string', (12, 10))
            source = source.replace('if __name__ == "__main__":\n    bar = foo', "pass\npass")
            source = cleanup.run(source)
            total += source.count("\n")
        print(f"{directory}: {total} SLOC")
        total = 50 * round(total / 50)
        (readme_text, n) = regex.subn(
            fr"(?m)(!\[{directory} SLOC\].+?)~\d+(%20SLOC)",
            fr"\1~{total}\2",
            readme_text,
        )
        assert n > 0, f"Unable to create badge for '{directory}' SLOC."
    total = 50 * round(Path("paroxython/resources/spec.md").read_text().count("\n") / 50)
    (readme_text, n) = regex.subn(
        r"(?m)(!\[spec lines\].+?)~\d+(%20lines)",
        fr"\1~{total}\2",
        readme_text,
    )
    assert n == 1
    total = Path("paroxython/resources/taxonomy.tsv").read_text().partition("-- EOF")[0].count("\n")
    (readme_text, n) = regex.subn(
        r"(?m)(!\[taxonomy mappings\].+)-\d+(%20mappings)",
        fr"\1-{total}\2",
        readme_text,
    )
    assert n == 1
    readme_path.write_text(readme_text)
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
         concurrent=None, **kwargs):
    """Wrapper for `subn`."""
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (_util.string_type, _util.binary_type))
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")
    pattern = compile_search(pattern, flags)
    return _regex.subn(
        pattern,
        (compile_replace(pattern, repl) if is_replace or is_string else repl),
        string, count, flags, pos, endpos, concurrent, **kwargs
    )
def subfn(pattern, format, string, *args, **kwargs):  # noqa A002
    """Wrapper for `subfn`."""
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(format)
    is_string = isinstance(format, (str, bytes))
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")
    pattern = compile_search(pattern, flags)
    rflags = FORMAT if is_string else 0
    return _regex.subn(
        pattern,
        (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format),
        string, *args, **kwargs
    )
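# The "format object" distinction above mirrors the regex module's own
# subf/subfn, where the replacement is a str.format-style template: groups
# are positional arguments, with {0} standing for the whole match. A sketch
# using regex.subfn directly:
import regex

swapped, n = regex.subfn(r"(\w+) (\w+)", "{2} {1}", "foo bar")
print(swapped, n)  # -> 'bar foo' 1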
def subn(pattern, repl, string, *args, **kwargs):
    """Wrapper for `subn`."""
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (str, bytes))
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")
    pattern = compile_search(pattern, flags)
    return _regex.subn(
        pattern,
        (compile_replace(pattern, repl) if is_replace or is_string else repl),
        string, *args, **kwargs
    )
def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
          concurrent=None, **kwargs):  # noqa A002
    """Wrapper for `subfn`."""
    is_replace = _is_replace(format)
    is_string = isinstance(format, (_util.string_type, _util.binary_type))
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")
    pattern = compile_search(pattern, flags)
    rflags = FORMAT if is_string else 0
    return _regex.subn(
        pattern,
        (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format),
        string, count, flags, pos, endpos, concurrent, **kwargs
    )
def __init__(self, in_data, timezone=None):
    self.time = datetime.now(pytz.timezone(timezone))
    self.time_provided = False
    self.interval_provided = False
    self.targets = []
    self.content = 'Reminder'
    self.content_provided = False
    mentions = re.search(self.find_mentions, in_data)
    # subn() returns (new_string, count); keep only the stripped string.
    in_data, _ = re.subn(self.find_mentions, '', in_data, 1)
    self.process_mentions(mentions)
    self.try_and_match(in_data)
def register_submit():
    v = Validate(request)
    email = v.require('email')
    user = v.require('user')
    if not v.ok:
        return render_template("register.html", valid=v)
    user, number = re.subn(r'[^a-zA-Z0-9_-]', '', user)
    v.expect(
        number == 0,
        "Invalid symbol in username: usernames can only contain "
        "ASCII letters, numbers, dashes, and underscores.",
        'user')
    v.expect(len(user) > 3, "Username too short", 'user')
    v.expect(
        len(user) and user[0] != '_',
        "Username cannot start with an underscore.",
        'user')
    if not v.ok:
        return render_template("register.html", valid=v)
    usr = User.query.filter(User.username.ilike(user)).one_or_none()
    v.expect(usr is None, "Username taken", 'user')
    if not v.ok:
        return render_template("register.html", valid=v)
    usr = User(username=user)
    usr.email = email
    usr.nickname = v.optional('nick')
    usr.discord = v.optional('discord')
    usr.postal = v.optional('postal')
    msg = ("Unable to find this domain to send an email to. "
           "If the email address is valid, please open an issue.")
    try:
        name, host = email.split('@', 1)
        addr = socket.gethostbyname(host)
        v.expect(addr is not None, msg, 'email')
    except socket.gaierror:
        v.expect(False, msg, 'email')
    if not v.ok:
        return render_template("register.html", valid=v)
    db.session.add(usr)
    db.session.commit()
    login_user(usr)
    return redirect("/")
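# The subn-based validation above works because a nonzero substitution count
# signals that forbidden characters were stripped. A quick check:
import re

user, number = re.subn(r'[^a-zA-Z0-9_-]', '', 'bad name!')
print(user, number)  # -> 'badname' 2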
def add_field(current_class, filename, field_name, field_type, config, lang, classes, depends):
    if filename in config.EnumMap and field_type in config.EnumMap[filename]:
        field_type = config.EnumMap[filename][field_type]
    field_flag = {}
    field_text = field_type
    field_local = True
    if "super_class" in classes[current_class]:
        # Avoid shadowing the built-in super().
        super_class = classes[classes[current_class].super_class]
        if "fields" in super_class:
            if len(list(filter(lambda x: x.field_name == field_name, super_class.fields))) != 0:
                field_local = False
    for key, value in lang.items():
        if isinstance(value, str):
            field_text = regex.sub(key, value, field_text)
        else:
            field_text, n = regex.subn(key, value["replace"], field_text)
            if n > 0 and value["flag"]:
                field_flag.update(value["flag"])
    classes[current_class].fields.append(
        util.attrdict(field_type=field_type, field_name=field_name, field_text=field_text))
    classes[current_class].fields[-1].update(field_flag)
    classes[current_class].fields[-1]._N = len(classes[current_class].fields)
    if current_class == field_type:
        classes[current_class].fields[-1].optional = True
    classes[current_class].fields[-1].field_local = field_local
    if (field_type in classes and "abstract" in classes[field_type]
            and classes[field_type].abstract):
        classes[current_class].fields[-1].optional = True
    types = set(regex.sub("[^a-zA-Z0-9_]+", " ", field_type).split())
    types.discard(current_class)
    depends[current_class].update(types)
def test_update_docstring():
    indent = "\n "
    result = []
    for (title, original, expected) in examples:
        original = original.replace("\n", indent)
        expected = expected.replace("\n", indent)
        result.append(f"- {title}")
        result.append(fr"```python{indent}{original}{indent}```")
        result.append(fr"```python{indent}{expected}{indent}```")
    result = regex.sub(f"(?m){indent}$", "\n", indent.join(result))
    path = Path("paroxython/preprocess_source.py")
    source = path.read_text()
    (source, n) = regex.subn(
        r"(?sm)^(\s*def full_cleaning.+?Examples:\n).+?^\n(?= +All examples)",
        fr"\1 {result}\n\n",
        source,
    )
    assert n == 1
    path.write_text(source)
def patch_prose():
    index_path = Path("docs/index.html")
    index_text = index_path.read_text()
    index_text = index_text.replace("<h1>Index</h1>\n", "")
    for title in ("User manual", "Developer manual"):
        slug = title.lower().replace(" ", "_")
        path = Path("docs") / slug / "index.html"
        text = path.read_text()
        (text, n) = regex.subn(
            f"""<h1 class="title">Module <code>paroxython.{slug}</code></h1>""",
            f"""<h1 class="title">{title}</h1>""",
            text,
        )
        assert n == 1, f"Unable to change the title of {slug}!"
        (text, n) = regex.subn(
            "<h1>Index</h1>",
            f"<h1>{title}</h1>",
            text,
        )
        assert n == 1, f"Unable to change the title of {slug} in nav!"
        (text, n) = regex.subn(r"""(?s)</div>\n<ul id="index">.+</ul>\n""", "", text)
        assert n == 1, f"Unable to suppress the index section in prose {slug}'s nav!"
        (index_text, n) = regex.subn(fr"""<li><code><a title="paroxython.{slug}".+\n""", "", index_text)
        assert n == 1, f"Unable to remove nav url for {slug}!"
        (index_text, n) = regex.subn(
            fr"""(?s)<dt><code class="name"><a title="paroxython\.{slug}".+?</dd>\n""",
            "",
            index_text,
        )
        assert n == 1, f"Unable to remove module section for {slug}!"
        (text, n) = regex.subn(r"""(?s)<details class="source">.+</details>\n""", "", text)
        assert n == 1, f"Unable to suppress the source code in prose {slug}!"
        (text, n) = regex.subn(
            """href="index.html">""",
            """href="../index.html">""",
            text,
        )
        assert n == 1, f"Unable to patch the Home url in {slug}!"
        path.write_text(text)
    index_path.write_text(index_text)
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
         concurrent=None, **kwargs):
    """Wrapper for subn."""
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (compat.string_type, compat.binary_type))
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")
    pattern = compile_search(pattern, flags)
    return regex.subn(
        pattern,
        (compile_replace(pattern, repl) if is_replace or is_string else repl),
        string, count, flags, pos, endpos, concurrent, **kwargs
    )
def _fix_hijri_gregorian_feb_mismatch(self, date_formats, languages):
    # Now, search for the 29th or 30th day of the 2nd month.
    # If found, reduce it by 10 days and use the regular parse
    # function again; if it succeeds this time, then add 10
    # days back to the parsed Hijri form.
    for lang_shortname in languages:
        language = default_language_loader.get_language(lang_shortname)
        translated = language.translate(self.source, settings=settings)

        def _sub_fn(m):
            digit = int(m.group(0))
            return '{:02d}'.format(digit - 10)

        fixed_date_string, nreplaced = re.subn(r'(?<!\d)(29|30)', _sub_fn, translated, 1)
        if not nreplaced:
            continue
        date_data = self._parser_get_date(fixed_date_string, date_formats, languages)
        date_obj = date_data.get('date_obj')
        if date_obj:
            # Remember that we have subtracted 10 days.
            date_data['date_obj'] = self._hijri_to_gregorian(
                date_obj.year, date_obj.month, date_obj.day + 10, date_obj)
            return date_data
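# A standalone sketch of the 29th/30th adjustment above, on a hypothetical
# translated date string: only the first 29 or 30 not preceded by a digit
# is shifted back ten days (count=1).
import re

def _demo_sub_fn(m):
    return '{:02d}'.format(int(m.group(0)) - 10)

print(re.subn(r'(?<!\d)(29|30)', _demo_sub_fn, '30 safar 1437', 1))
# -> ('20 safar 1437', 1)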
def normalize_text(text, lcase=True):
    text = str(text).strip()
    if lcase:
        text = text.lower()
    text = unicodedata.normalize('NFKD', text)
    text = regex.subn(r'\p{P}+', '', text)[0]
    return text.encode('ascii', 'ignore').decode()
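# Example: normalize_text() lower-cases, strips Unicode punctuation (\p{P})
# via regex.subn, and folds accented characters to ASCII.
print(normalize_text("Héllo, Wörld!"))  # -> 'hello world'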
def adjustOutputsModel(textFolder):
    with open('PickledData/HMM_data/outputs_FIXED1_stage1.pickle', 'rb') as file:
        emissions = pickle.load(file)
    directory = "/home/jcavalie/NLPtools/wiki_dump/" + textFolder + '/'
    print('Directory:', directory)
    wikiFiles = os.listdir(directory)
    print('Files:', wikiFiles)
    count = 0
    for fileName in wikiFiles:
        print("file count: ", count)
        count += 1
        with open(directory + fileName, 'r', encoding="ISO-8859-15") as file:
            text_ = file.read()
        text_ = text_.replace("-", " ")
        text_ = parallelCorpora._normalize(text_, "")['clean']
        text_ = text_.strip()
        text_ = text_.replace("''", " ")
        pattern = r'([~`!@#$%&|*)(_+=\\^\]\[}{;:"><.,/?]+)'
        text_, num = regex.subn(pattern, ' ', text_)
        print("removed unwanted chars: ", num)
        text_ = regex.sub(r"(\d+)", " ", text_)
        text_ = regex.sub(r'(\s+)', ' ', text_)
        text_ = ' ' + text_ + ' '
        gc.collect()
        print("building Ngrams")
        corporaLength = len(text_)
        print("CORPUS LENGTH: ", corporaLength)
        counter = 0
        print("starting loop")
        for one_grams, two_grams, three_grams in zip_longest(
                ngrams(text_, 1), ngrams(text_, 2), ngrams(text_, 3)):
            counter += 1
            if not counter % 1000:
                print("1000 more complete", counter)
            if counter == corporaLength // 4:
                print("~1/4 complete")
            elif counter == corporaLength // 2:
                print("~1/2 complete")
            elif counter == int(corporaLength * (3 / 4)):
                print("~3/4 complete")
            # For each n-gram never seen as an emission of itself, seed its
            # count from the corpus, clamped at zero.
            if one_grams is not None:
                gram = ''.join(one_grams)
                if emissions[gram].get(gram, None) is None:
                    N1 = emissions[gram].N()
                    emissions[gram][gram] += text_.count(gram) - N1
                    if emissions[gram][gram] < 0:
                        emissions[gram][gram] = 0
            if two_grams is not None:
                gram = ''.join(two_grams)
                if emissions[gram].get(gram, None) is None:
                    N2 = emissions[gram].N()
                    emissions[gram][gram] += text_.count(gram) - N2
                    if emissions[gram][gram] < 0:
                        emissions[gram][gram] = 0
            if three_grams is not None:
                gram = ''.join(three_grams)
                if emissions[gram].get(gram, None) is None:
                    N3 = emissions[gram].N()
                    emissions[gram][gram] += text_.count(gram) - N3
                    if emissions[gram][gram] < 0:
                        emissions[gram][gram] = 0
    with open('PickledData/HMM_data/outputs_FIXED1_final.pickle', 'wb') as file:
        pickle.dump(emissions, file, pickle.HIGHEST_PROTOCOL)
def buildNgrams(textFolder):
    bigram1_1 = ConditionalFreqDist()
    bigram2_2 = ConditionalFreqDist()
    bigram3_3 = ConditionalFreqDist()
    bigram1_2 = ConditionalFreqDist()
    bigram1_3 = ConditionalFreqDist()
    bigram2_1 = ConditionalFreqDist()
    bigram2_3 = ConditionalFreqDist()
    bigram3_1 = ConditionalFreqDist()
    bigram3_2 = ConditionalFreqDist()
    directory = "/home/jcavalie/NLPtools/wiki_dump/" + textFolder + '/'
    print('Directory:', directory)
    wikiFiles = os.listdir(directory)
    print('Files:', wikiFiles)
    count = 0
    for fileName in wikiFiles:
        print("file count: ", count)
        count += 1
        with open(directory + fileName, 'r', encoding="ISO-8859-15") as file:
            if 'wiki' in textFolder:
                text_ = cleanhtml(file.read())
            else:
                text_ = file.read()
        text_ = text_.replace("-", " ")
        text_ = parallelCorpora._normalize(text_, "")['clean']
        text_ = text_.strip()
        text_ = text_.replace("''", " ")
        pattern = r'([~`!@#$%&|*)(_+=\\^\]\[}{;:"><.,/?]+)'
        text_, num = regex.subn(pattern, ' ', text_)
        print("removed unwanted chars: ", num)
        text_ = regex.sub(r"(\d+)", " ", text_)
        text_ = regex.sub(r'(\s+)', ' ', text_)
        text_ = ' ' + text_ + ' '
        gc.collect()
        print("building Ngrams")
        corporaLength = len(text_)
        print("CORPUS LENGTH: ", corporaLength)
        counter = 0
        print("starting loop")
        # Count character n-gram transitions: bigramX_Y maps an X-character
        # prefix to the frequencies of the Y-character strings that follow it.
        for one_grams, two_grams, three_grams, four_grams, five_grams, six_grams in \
                zip_longest(ngrams(text_, 1), ngrams(text_, 2), ngrams(text_, 3),
                            ngrams(text_, 4), ngrams(text_, 5), ngrams(text_, 6)):
            counter += 1
            if not counter % 1000000:
                print("1000000 more complete", counter)
            if counter == corporaLength // 4:
                print("~1/4 complete")
            elif counter == corporaLength // 2:
                print("~1/2 complete")
            elif counter == int(corporaLength * (3 / 4)):
                print("~3/4 complete")
            if two_grams is not None:
                bigram1_1[''.join(two_grams[:1])][''.join(two_grams[1:])] += 1
            if three_grams is not None:
                bigram1_2[''.join(three_grams[:1])][''.join(three_grams[1:])] += 1
                bigram2_1[''.join(three_grams[:2])][''.join(three_grams[2:])] += 1
            if four_grams is not None:
                bigram2_2[''.join(four_grams[:2])][''.join(four_grams[2:])] += 1
                bigram3_1[''.join(four_grams[:3])][''.join(four_grams[3:])] += 1
                bigram1_3[''.join(four_grams[:1])][''.join(four_grams[1:])] += 1
            if five_grams is not None:
                bigram3_2[''.join(five_grams[:3])][''.join(five_grams[3:])] += 1
                bigram2_3[''.join(five_grams[:2])][''.join(five_grams[2:])] += 1
            if six_grams is not None:
                bigram3_3[''.join(six_grams[:3])][''.join(six_grams[3:])] += 1
    print("finished building, begin pickling")
    CORPUS = textFolder
    with open('./PickledData/langModels/bigrams1_1' + CORPUS + '.pickle', 'wb') as file1:
        pickle.dump(bigram1_1, file1, pickle.HIGHEST_PROTOCOL)
    del bigram1_1
    print("finished 1-1")
    with open('./PickledData/langModels/bigrams2_2' + CORPUS + '.pickle', 'wb') as file2:
        pickle.dump(bigram2_2, file2, pickle.HIGHEST_PROTOCOL)
    del bigram2_2
    print("finished 2-2")
    with open('./PickledData/langModels/bigrams3_3' + CORPUS + '.pickle', 'wb') as file3:
        pickle.dump(bigram3_3, file3, pickle.HIGHEST_PROTOCOL)
    del bigram3_3
    gc.collect()
    print("finished 3-3")
    with open('./PickledData/langModels/bigrams1_2' + CORPUS + '.pickle', 'wb') as file4:
        pickle.dump(bigram1_2, file4, pickle.HIGHEST_PROTOCOL)
    del bigram1_2
    print("finished 1-2")
    with open('./PickledData/langModels/bigrams1_3' + CORPUS + '.pickle', 'wb') as file5:
        pickle.dump(bigram1_3, file5, pickle.HIGHEST_PROTOCOL)
    del bigram1_3
    gc.collect()
    print("finished 1-3")
    with open('./PickledData/langModels/bigrams2_1' + CORPUS + '.pickle', 'wb') as file6:
        pickle.dump(bigram2_1, file6, pickle.HIGHEST_PROTOCOL)
    del bigram2_1
    print("finished 2-1")
    with open('./PickledData/langModels/bigrams2_3' + CORPUS + '.pickle', 'wb') as file7:
        pickle.dump(bigram2_3, file7, pickle.HIGHEST_PROTOCOL)
    del bigram2_3
    print("finished 2-3")
    with open('./PickledData/langModels/bigrams3_1' + CORPUS + '.pickle', 'wb') as file8:
        pickle.dump(bigram3_1, file8, pickle.HIGHEST_PROTOCOL)
    del bigram3_1
    print("finished 3-1")
    with open('./PickledData/langModels/bigrams3_2' + CORPUS + '.pickle', 'wb') as file9:
        pickle.dump(bigram3_2, file9, pickle.HIGHEST_PROTOCOL)
    del bigram3_2
    gc.collect()
    print("finished all")
    return