def recoverUnknown(f_unknown_p, f_unknown, f_align_p, f_clean):
    """ Prints the foreign sentences that produced no formality marker
    although the English sentence contained a "you". """
    print "Recovering unknown sentences"
    unknown = loadStruct(f_unknown_p)
    align = loadStruct(f_align_p)
    with copen(f_unknown, "w", encoding="utf-8") as unknown_f:
        for doc, proj in unknown.iteritems():
            if len(proj) > 0:
                de = []
                links = align[doc]
                for p in proj:
                    for link in links:
                        if p in link[1].split(" "):
                            de.extend(link[0].split(" "))
                with copen(f_clean + doc[0].replace(".gz", "")) as doc_f:
                    dom = parse(doc_f)
                    nodes = dom.getElementsByTagName("s")
                    for node in nodes:
                        if node.getAttribute("id") in de:
                            unknown_f.write("%s\n" % node.firstChild.nodeValue)
def init(self):
    """ Internal, make the dirs and touch the files """
    pexists = os.path.exists
    pjoin = os.path.join
    #if not pexists(".htaccess"):
    #    with copen(".htaccess", "w") as f:
    #        f.write("DirectoryIndex %s" % argv[0])
    for dir in [d for o, d in self.config.items("blog") if o.endswith("directory")]:
        if not pexists(dir):
            os.makedirs(dir, mode=(0770 if not "captcha" in dir else 0771))
            with copen(pjoin(dir, ".htaccess"), "w") as f:
                # Apache's Order directive allows no whitespace around the comma
                f.write("Order Deny,Allow\nDeny from All")
    for index in [getattr(self, a) for a in self.__dict__ if a.endswith("_index")]:
        if not pexists(index):
            with copen(index, "w"):
                pass
            os.chmod(index, 0640)
    if pexists(self.init_file):
        with copen(self.init_file) as f:
            lines = f.readlines()
        result = []  # avoid a NameError below if add_user raises before assignment
        try:
            result = [self.add_user(*l.strip("\n").split(":")) for l in lines]
        except TypeError:
            pass
        if result:
            with copen("%s.log" % self.init_file, "w") as f:
                f.writelines(result)
        os.remove(self.init_file)
def processGutenberg(f_gutenberg, f_gproj):
    """ Processing the Project Gutenberg corpus. """
    for f_g in ["test/", "train/"]:
        createPath(f_gproj + f_g)
        for f_novel in listdir(f_gutenberg + f_g):
            if f_novel.endswith("_en.txt"):
                with copen(f_gproj + f_g + f_novel, "w", encoding="utf-8") as gproj_f:
                    gproj_f.write("<d src=\"%s\">\n" % f_novel)
                    with copen(f_gutenberg + f_g + f_novel, encoding="utf-8") as novel_f:
                        j = 2
                        for i, line in enumerate(novel_f.readlines()):
                            if i in xrange(j - 2, j):
                                line = line.strip()
                                if line.startswith("<S"):
                                    m = match(".*sentNum:([0-9]+).*F:([0|1]) I:([0|1])", line)
                                    gproj_f.write("<s id=\"%s\" f=\"%s\" i=\"%s\">"
                                                  % (m.group(1), m.group(2), m.group(3)))
                                else:
                                    gproj_f.write("%s</s>\n" % line)
                            elif i == j:
                                j += 4
                    gproj_f.write("</d>\n")
def line_iter(path1, path2):
    # Context managers guarantee both handles are closed even if the
    # generator is abandoned before exhaustion.
    with copen(path1, encoding='utf-8') as file1, \
            copen(path2, encoding='utf-8') as file2:
        for p_line in izip(file1, file2):
            yield p_line
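# A minimal usage sketch (not from the original source): count the sentence
# pairs in two hypothetical parallel corpus files without loading them into
# memory.
n_pairs = sum(1 for _ in line_iter('corpus.en', 'corpus.de'))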
def append(self, ical, filename=None):
    """Append a Remind command generated from the iCalendar to the file"""
    if not filename:
        filename = self._filename
    elif filename not in self._icals:
        return
    with self._lock:
        outdat = self.to_reminders(readOne(ical))
        with copen(filename, 'a', encoding='utf-8') as f:
            f.write(outdat.decode('utf-8'))
def convert_file(file_path):
    with copen(file_path, "r", "utf8") as foriginal:
        content = foriginal.read()
    ccontent = fix_encoding(content, ENCODING, NORMALIZE, True)
    with copen(file_path, "w", "utf8") as fconverted:
        fconverted.write(ccontent)
    print("[*]", file_path, "fixed!")  # report only after the rewrite succeeded
def replace(source_file_path, pattern, substring, is_regexp):
    from os import close
    # mkstemp returns an open OS-level descriptor; close it so the handle is
    # not leaked when the path is reopened through codecs below.
    fd, target_file_path = mkstemp()
    close(fd)
    with copen(target_file_path, 'w', 'utf-8') as target_file:
        with copen(source_file_path, 'r', 'utf-8') as source_file:
            for line in source_file:
                if is_regexp:
                    target_file.write(sub(pattern, substring, line))
                else:
                    target_file.write(line.replace(pattern, substring))
    remove(source_file_path)
    move(target_file_path, source_file_path)
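# A usage sketch (the file is hypothetical): rewrite notes.txt in place,
# replacing a literal substring rather than a regular expression.
replace('notes.txt', 'colour', 'color', is_regexp=False)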
def convert_file(file_path):
    with copen(file_path, "r", "utf8") as foriginal:
        content = foriginal.read()
    for codec in codecs:
        print("[*] codec:", codec)
        with copen(file_path.replace(".", "_%s." % codec), "w", "utf8") as fconverted:
            fconverted.write(content.encode(codec, "ignore").decode("utf8", "ignore"))
def convert_file(srcFile, dstFile, delim=DELIM, src_codec=SOURCE_CODEC, dst_codec=DEST_CODEC):
    '''Convert a CSV file to standard format'''
    # From http://stackoverflow.com/a/191403
    # Use the codec parameters instead of always reading the module-level
    # constants, which silently ignored the caller's choice.
    with copen(srcFile, "r", src_codec) as sourceFile:
        with copen(dstFile, "w", dst_codec) as targetFile:
            while True:
                line = sourceFile.readline()
                if not line:
                    break
                targetFile.write(",".join(line.split(delim)))
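# A usage sketch under assumptions: normalise a hypothetical semicolon-
# delimited export to a comma-separated file, overriding only the delimiter
# and keeping the module-level codec defaults.
convert_file('export_raw.csv', 'export_clean.csv', delim=';')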
def gettimes(fname, nframes=None):
    if nframes is None:
        try:
            with pytiff.Tiff(fname) as handle:
                tags = handle.read_tags()
            nframes = int(tags['image_description'].split()[2][7:])
        except:
            print("I could not get the number of frames, please provide it")
            return
    with copen(fname, "r", "windows-1252") as f:
        j = 0
        times = zeros(nframes)
        while True:
            try:
                line = f.readline()
                # print(line)
                linesp = line.replace('\x00', '').strip().split()
                if len(linesp) == 5:
                    if linesp[2] == 'Time_From_Last':
                        k, t = linesp[1], linesp[-1]
                        # print(int(k), float(t))
                        times[int(k) - 1] = float(t)
                        j = j + 1
            except:
                break
            if j >= nframes:
                break
    return times
def prepare_articles(names):
    '''saves tagged articles about given entities in a cache'''
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                continue
            found = True
            article = '\n'.join(article.split('\n')[:article_sentence_limit])
            out = copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8')
            print >> out, article
            out.close()  # flush before run_nlptools reads these files
    if found:
        articles = lt.run_nlptools(link_dictionaries)
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        #save processed articles
        for i, article in articles.iteritems():
            Pickler.store(article, articles_cache_path % names[i])
def remove(self, uid, filename=None):
    """Remove the Remind command with the uid from the file"""
    if not filename:
        filename = self._filename
    elif filename not in self._icals:
        return
    uid = uid.split('@')[0]
    with self._lock:
        with copen(filename, encoding='utf-8') as f:
            rem = f.readlines()
        for (index, line) in enumerate(rem):
            if uid == md5(line[:-1].encode('utf-8')).hexdigest():
                del rem[index]
                with copen(filename, 'w', encoding='utf-8') as f:
                    f.writelines(rem)
                break
def _init_data_list(self):
    self.data_list = []
    with copen(self.jsons_file, mode='r', encoding='latin1') as f:
        for line in f:
            new_relation = Relation.from_json(
                json.loads(line, object_pairs_hook=OrderedDict))
            self.data_list.append(new_relation)
def replace(self, uid, ical, filename=None):
    """Update the Remind command with the uid in the file with the new iCalendar"""
    if not filename:
        filename = self._filename
    elif filename not in self._icals:
        return
    uid = uid.split('@')[0]
    with self._lock:
        with copen(filename, encoding='utf-8') as f:
            rem = f.readlines()
        for (index, line) in enumerate(rem):
            if uid == md5(line[:-1].encode('utf-8')).hexdigest():
                rem[index] = self.to_reminders(readOne(ical))
                with copen(filename, 'w', encoding='utf-8') as f:
                    f.writelines(rem)
                break
def _get_comment_content(self, comm_id):
    try:
        with copen(self._path_to_comment(comm_id), "r", "utf-8") as f:
            return f.read()
    except IOError:
        return None
def _get_post_content(self, post_id):
    try:
        with copen(self._path_to_post(post_id), "r", "utf-8") as f:
            return f.read()
    except IOError:
        return None
def draw_title(path, title="", x=400, y=410, font_size=20, colour="black"):
    """Draws a title into the svg.

    :param path: path to svg
    :param title: text to put into the picture
    :param x: starting x position of the title
    :param y: starting y position of the title
    :param font_size: font size of the title -- default is 20
    :param colour: title colour
    """
    with copen(path, "r+", "utf-8") as svg:
        # overwrite the closing "</svg>" tag (6 bytes from the end of the file)
        svg.seek(-6, 2)
        svg.write('\n<text x="' + str(x) + '" y="' + str(y) +
                  '" stroke="none" font-size="' + str(font_size) +
                  '" fill="' + colour + '" font-family="sans-serif">' +
                  title + "</text>\n</svg>")
def load_file(self, file=-1, changing=False, label="", border=""):
    if file == -1:
        file = self.file
    try:
        with copen(file, "r", encoding="utf-8") as f:
            content = loads(f.read())
    except Exception as e:
        if changing:
            label.config(fg=wc.WRONG)
            sound("source\\wrong.wav")
            if border != "":
                border.config(bg=wc.WRONG)
                self.root.update()
                sleep(0.2)
                border.config(bg="black")
            label["text"] = "The file {} wasn't found.".format(file)
        #print("Works")
        return -1
    else:
        load_colors(content["Colors"])
        self.content = content
        if type(self.content) != type(1):
            self.wordlist = list(self.content["Language"])
        if changing:
            if border != "":
                border.config(bg=wc.GOOD)
                self.root.update()
                sound("source\\correct.wav")
                sleep(0.2)
                border.config(bg="black")
            label.config(fg="black")
            label["text"] = "File {} loaded correctly.".format(file)
        self.newWord()
def save_corpora(corpora_iter, path1, path2):
    """Write a parallel corpus to two files, one sentence per line.

    :corpora_iter: iterable of (sent1, sent2) pairs
    :path1: output path for the first side
    :path2: output path for the second side
    :returns: None
    """
    with copen(path1, 'w', encoding='utf-8') as f1, \
            copen(path2, 'w', encoding='utf-8') as f2:
        for sent1, sent2 in corpora_iter:
            f1.write(sent1.replace('\n', ' ').strip() + '\n')
            f2.write(sent2.replace('\n', ' ').strip() + '\n')
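# A usage sketch (the sentence pairs and paths are hypothetical): persist a
# small in-memory parallel corpus, one sentence per line on each side.
pairs = [(u'Hello world.', u'Hallo Welt.'), (u'Good night.', u'Gute Nacht.')]
save_corpora(iter(pairs), 'corpus.en', 'corpus.de')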
def analyze(self, data, _data, filename):
    listheaders = []
    listpayloads = []
    for _ in data:
        listheaders.append(str(_["fields"]))
        listpayloads.append(str(_["payload"]))
    headers = "".join(listheaders)
    content = "".join(listpayloads)
    with copen(self.intell + filename, "r", encoding='utf8') as f:
        for _ in loads(f.read()):
            try:
                x = None  # reset so a match never leaks into the next rule
                if "Type" in _ and "WQREGEX" in _["Type"]:
                    if _["Options"]["Word"] == "Normal" and "Header_Detection" in _:
                        x = search(compile(r"{}".format(_["Header_Detection"]), _["Options"]["Flag"]), headers)
                    elif _["Options"]["Word"] == "Normal" and "Content_Detection" in _:
                        x = search(compile(r"{}".format(_["Content_Detection"]), _["Options"]["Flag"]), content)
                    if x is not None:
                        _data.append({"Matched": "1",
                                      "Required": _["Options"]["Required"],
                                      "WAF": _["Name"],
                                      "Detected": x.group()})
            except:
                pass
    self.check_proxy_bypass(data, _data)
def find_nearest_category_text():
    """Finds closest article to process for each IAB sub category."""
    #grab IAB subcats
    # - must be lower case
    # - must have spaces replaced by underscores
    iab_sub_cats = []
    cats = create_category_dictionary(iab)
    for k, v in cats.iteritems():
        for x in v:
            iab_sub_cats.append(x.lower().replace(" ", "_"))
    iab_sub_cats = set(iab_sub_cats)
    #process titles in file
    nearest_titles = defaultdict(list)
    with copen("id_to_page.tsv", encoding='utf8') as f:
        for n, line in enumerate(f):
            if line != "":
                try:
                    title = line[:-1].split('\t')[1]
                    comparison_title = title.lower()
                    if comparison_title in iab_sub_cats:
                        nearest_titles[comparison_title].append(title)
                except Exception, e:
                    print Exception, e
            if n % 1000000 == 0:
                print "Done {0} found {1} of {2}".format(
                    n, len(nearest_titles), len(iab_sub_cats))
def rcompile_and_find(self, data, filename):
    '''
    parse the detections and check them against wordsstripped
    '''
    with copen(filename, "r", encoding='utf8') as file:
        for _ in loads(file.read()):
            with ignore_excpetion(Exception):
                if "Type" in _ and "QREGEX" in _["Type"]:
                    _list = []
                    tempmatches = 0
                    for item in _["Detection"]:
                        if _["Options"]["Word"] == "Normal":
                            temp_value = rsearch(rcompile(r"{}".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        elif _["Options"]["Word"] != "Normal":
                            temp_value = rsearch(rcompile(r"\b{}\b".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        if temp_value is not None:
                            _list.append(temp_value.group())
                            tempmatches += 1
                    if _list and tempmatches >= _["Options"]["Required"]:
                        data.append({
                            "Matched": tempmatches,
                            "Required": _["Options"]["Required"],
                            "Behavior": _["Name"],
                            "Detected": ', '.join(_list)
                        })
def test_windows1252(self):
    vtt_string = copen(self.windows_path, encoding='windows-1252').read()
    vtt_file = from_string(vtt_string, encoding='windows-1252', eol='\r\n')
    self.assertEqual(len(vtt_file), 1332)
    self.assertEqual(vtt_file.eol, '\r\n')
    self.assertRaises(UnicodeDecodeError, vttopen, self.utf8_path, encoding='ascii')
def extract_rows(auto_parsed_file_name):
    wsj_id_re = re.compile(r'wsj_(\d\d\d\d)')
    wsj_id = wsj_id_re.search(auto_parsed_file_name).group(1)
    corenlp_result = json.loads(copen(auto_parsed_file_name, mode='r').read())
    sentences = corenlp_result['sentences']
    rows = []
    for i, sentence in enumerate(sentences):
        split_clauses_from_sentence(sentence)
        sentence_start = sentence['words'][0][1]['CharacterOffsetBegin']
        sentence_end = sentence['words'][-1][1]['CharacterOffsetEnd']
        parse_json_string = json.dumps(sentence, cls=TJsonEncoder)
        coref_json_string = json.dumps([])
        if 'coref' in corenlp_result:
            coref_json_string = json.dumps(corenlp_result['coref'])
        row = {
            'wsj_section': int(wsj_id[0:2]),
            'wsj_id': wsj_id,
            'sentence_id': i,
            'parse_json': parse_json_string,
            'sentence_start': sentence_start,
            'sentence_end': sentence_end,
            'sentence_text': sentence['text'],
            'coreference_json': coref_json_string,
        }
        rows.append(row)
    return rows
def write_training_file(file_name, new_file_name, counter, cutoff):
    """Write the file such that the features are pruned based on the cutoff

    It slows down a bit because we have to re-read the file instead of
    using whatever is already in the memory.
    """
    with copen(file_name, encoding='utf8') as f:
        lines = f.readlines()
    with copen(new_file_name, 'w', encoding='utf8') as new_training_file:
        for line in lines:
            name, label, features = line.strip().split('\t')
            features = [x for x in features.split(' ') if counter[x] > cutoff]
            if len(features) == 0:
                features = ['NO_FEATURE']
            new_training_file.write('%s\t%s\t%s\n' % (name, label, ' '.join(features)))
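# A usage sketch under assumptions: build the feature counter from the same
# three-column file (name, label, space-separated features), then prune
# features seen twice or fewer. 'train.tsv' is a hypothetical path.
from collections import Counter
feature_counter = Counter()
with copen('train.tsv', encoding='utf8') as f:
    for line in f:
        feature_counter.update(line.strip().split('\t')[2].split(' '))
write_training_file('train.tsv', 'train.pruned.tsv', feature_counter, cutoff=2)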
def get_synonims():
    """ Returns a list of pairs (word, list_of_synonims). """
    print('Started reading synonims file...')
    SYN_NAME = 'engine/synonims.txt'
    with copen(SYN_NAME, 'r', 'windows-1251') as data_reader:
        raw_data = data_reader.read()
    data = raw_data.split('\r\n')
    data = [tuple(x.split('|')) for x in data]
    # interested in synonims to one word precisely:
    data = [x for x in data if '?' not in x[0] and ' ' not in x[0]]
    ret = []
    for line in data:
        if len(line) < 2:
            continue
        word = InverseIndex._clean(line[0])
        syno = line[1].split(',')
        # don't bother with multiple choices, like 'you (shall, will)'
        syno = [x for x in syno if ')' not in x and '(' not in x]
        syno = [InverseIndex._clean(x) for x in syno]
        ret.append((word, syno))
    print('Finished reading synonims file!')
    return ret
def clean_answer(answer, prologue, code):
    ret = []
    with copen('outp', code, encoding='utf-8') as outp:
        outp.write(prologue + '\n')
        for feature in answer:
            outp.write(feature[0] + ' ' + feature[1] + '\n')
    return ret
def _add_to_file(self, filename, line, join=False):
    varname = make_varname(filename)
    if hasattr(self, varname):
        setattr(self, varname,
                getattr(self, varname) + [self.fields_separator.join(line)])
    with copen(filename, "a", "utf-8") as f:
        f.write((self.fields_separator.join(line) if join else line) + "\n")
def gettimes(fname, nframes=None):
    if nframes is None:
        try:
            tags = readtifInfo(fname, verbose=False)
            # the variable is `tags`; the original referenced an undefined `info`
            nframes = int([d for d in tags[270].split('\n')
                           if d.find('frames') >= 0][0].split('=')[-1])
            # ~ with pytiff.Tiff(fname) as handle:
            # ~     tags = handle.read_tags()
            # ~ nframes = int(tags['image_description'].split()[2][7:])
        except:
            print("I could not get the number of frames, please provide it")
            return
    with copen(fname, "r", "windows-1252") as f:
        j = 0
        times = zeros(nframes)
        while True:
            try:
                # ~ print('linea ', j)
                line = f.readline()
                # ~ print(line)
                linesp = line.replace('\x00', '').strip().split()
                if len(linesp) == 5:
                    if linesp[2] == 'Time_From_Last':
                        k, t = linesp[1], linesp[-1]
                        # print(int(k), float(t))
                        times[int(k) - 1] = float(t)
                        j = j + 1
            except Exception as e:
                print(e)
                break
            if j >= nframes:
                break
    return times
def crawl_ted(langs, output, ignore_urls=()):
    crawl_id = str(int(time.time()))
    ignore = [u.split('/')[-1] for u in ignore_urls]
    for lang in langs:
        log.info('Language %s', lang)
        path = os.path.join(output, crawl_id + '_' + lang)
        log.info('Saving into file %s', path)
        with copen(path, 'w', encoding='utf-8') as f:
            for url in get_pages():
                log.info('Page %s', url)
                html = get_html_retry(url)
                if html is None:
                    break
                try:
                    for talk_url in get_processed(html, get_talks):
                        if talk_url.split('/')[-1] in ignore:
                            log.info('Ignoring %s', talk_url)
                            continue
                        talk_url += '/transcript?language=' + lang
                        log.info('Talk %s', talk_url)
                        talk_html = get_html_retry(talk_url)
                        if talk_html is None:
                            continue
                        try:
                            lines = get_processed(talk_html, get_transcript)
                        except ParseError:
                            pass
                        else:
                            f.write('\n'.join(lines))
                except ParseError:
                    break
    log.info('Done')
def main():
    for file in listdir():
        if file[-5:].lower() == ".json":
            with copen(file, "r", encoding="utf-8") as f:
                content = loads(f.read())
            print("Configuring {}...".format(file))
            if "Colors" in content:
                content["Colors"]["WIDTH"] = 700
                content["Colors"]["HEIGHT"] = 400
                content["Colors"]["MAIN_COLOR"] = "#00FF80"
                content["Colors"]["SECONDARY_COLOR"] = "#ECF0F1"
                content["Colors"]["WRONG"] = "#DF013A"
                content["Colors"]["GOOD"] = "#00FF80"
                # leading "#" added for consistency with the other colour values
                content["Colors"]["ADD_WORD"] = "#F7FE2E"
            with copen(file, "w", encoding="utf-8") as f:
                f.write(dumps(content, indent=4))
def get_long_description():
    """ Retrieve the long description from README.rst """
    here = os.path.abspath(os.path.dirname(__file__))
    with copen(os.path.join(here, 'README.rst'), encoding='utf-8') as description:
        return description.read()
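# A usage sketch (hypothetical package metadata): feed the README text to
# setuptools as the long description, as a setup.py would.
from setuptools import setup
setup(name='example-package',
      version='0.1.0',
      long_description=get_long_description())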
def runScript(command=None, tempfile=None):
    timingfname = None
    scriptfname = None
    CMD = ['script']
    if tempfile:
        timingfname = "%s.timing" % str(tempfile)
        scriptfname = "%s.log" % str(tempfile)
        # create/truncate both files
        with open(timingfname, 'w'):
            with open(scriptfname, 'w'):
                pass
    else:
        with NamedTemporaryFile(delete=False) as timingf:
            with NamedTemporaryFile(delete=False) as scriptf:
                timingfname = timingf.name
                scriptfname = scriptf.name
    CMD.append('-t')
    if command:
        CMD.append('-c')
        CMD.append(command)
    CMD.append(scriptfname)
    with open(timingfname, 'w') as timingf:
        proc = Popen(CMD, stderr=timingf)
        proc.wait()
    return copen(scriptfname, encoding='utf-8', errors='replace'), \
        open(timingfname, 'r')
def compile_and_find(self, data, filename):
    '''
    parse the detections and check them against wordsstripped
    '''
    with copen(filename, "r", encoding='utf8') as f:
        for _ in loads(f.read()):
            try:
                if "Type" in _ and "QREGEX" in _["Type"]:
                    _list = []
                    tempmatches = 0
                    for item in _["Detection"]:
                        if _["Options"]["Word"] == "Normal":
                            x = search(compile(r"{}".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        elif _["Options"]["Word"] != "Normal":
                            #Functions end with A,W do not match using "Word" option
                            x = search(compile(r"\b{}\b".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        if x is not None:
                            _list.append(x.group())
                            tempmatches += 1
                    if _list and tempmatches >= _["Options"]["Required"]:
                        data.append({
                            "Matched": tempmatches,
                            "Required": _["Options"]["Required"],
                            "Behavior": _["Name"],
                            "Detected": ','.join(_list)
                        })
            except:
                pass
def get_file_content(self, path):
    # aa = open(path, "rb")
    with copen(path, "rb", encoding="U8") as aa:
        cont = aa.read()
    # cont = cont.replace(" ", " ")
    cont = cont.replace("&", "DHTN__")
    return cont
def process_chapters(self, db_book, book_id, book_link):
    """
    Extract the chapters, and do some initial processing of the verses

    :param db_book: An OpenLP bible database book object
    :param book_id: The number of the book
    :param book_link: Relative path to the book's html file
    :return: None
    """
    log.debug(book_link)
    book_file = os.path.join(self.base_dir, os.path.normpath(book_link))
    with copen(book_file, encoding='utf-8', errors='ignore') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'lxml')
    header_div = soup.find('div', 'textHeader')
    chapters_p = header_div.find('p')
    if not chapters_p:
        chapters_p = soup.p
    log.debug(chapters_p)
    for item in chapters_p.contents:
        if self.stop_import_flag:
            break
        if isinstance(item, Tag) and item.name in ['a', 'span']:
            chapter_number = int(item.string.strip())
            self.set_current_chapter(db_book.name, chapter_number)
            self.process_verses(db_book, book_id, chapter_number)
def process_books(self):
    """
    Extract and create the bible books from the parsed html

    :return: None
    """
    with copen(os.path.join(self.base_dir, 'index.htm'), encoding='utf-8', errors='ignore') as index_file:
        page = index_file.read()
    soup = BeautifulSoup(page, 'lxml')
    bible_books = soup.find('div', 'textOptions').find_all('li')
    book_count = len(bible_books)
    for li_book in bible_books:
        log.debug(li_book)
        if self.stop_import_flag:
            break
        # Sometimes the structure is "[1] <a>Genesis</a>", and sometimes it's "<a>[1] Genesis</a>"
        if isinstance(li_book.contents[0], NavigableString) and str(li_book.contents[0]).strip():
            book_string = str(li_book.contents[0])
            book_name = str(li_book.a.contents[0])
        elif li_book.a:
            book_string, book_name = str(li_book.a.contents[0]).split(' ', 1)
        book_link = li_book.a['href']
        book_id = int(BOOK_NUMBER_PATTERN.search(book_string).group(1))
        book_name = book_name.strip()
        db_book = self.find_and_create_book(book_name, book_count, self.language_id, book_id)
        self.process_chapters(db_book, book_id, book_link)
        self.session.commit()
def process_verses(self, db_book, book_number, chapter_number):
    """
    Get the verses for a particular book
    """
    chapter_file_name = os.path.join(self.base_dir, '{:02d}'.format(book_number),
                                     '{}.htm'.format(chapter_number))
    with copen(chapter_file_name, encoding='utf-8', errors='ignore') as chapter_file:
        page = chapter_file.read()
    soup = BeautifulSoup(page, 'lxml')
    text_body = soup.find('div', 'textBody')
    if text_body:
        verses_p = text_body.find('p')
    else:
        verses_p = soup.find_all('p')[2]
    verse_number = 0
    verse_text = ''
    for item in verses_p.contents:
        if self.stop_import_flag:
            break
        if isinstance(item, Tag) and 'verse' in item.get('class', []):
            if verse_number > 0:
                self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
            verse_number = int(item.string.strip())
            verse_text = ''
        elif isinstance(item, NavigableString):
            verse_text += str(item)
        elif isinstance(item, Tag) and item.name in ['span', 'a']:
            verse_text += str(item.string)
        else:
            log.warning('Can\'t store %s', item)
    self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
def main(args):
    dict_w = []
    stop_w = []
    if args.dictionary:
        dict_w = load_dictionary(args.dictionary)
    if args.stop_words:
        stop_w = load_dictionary(args.stop_words)
    file1 = copen(gen_name(args.paths[0]), 'w', encoding='utf-8')
    file2 = copen(gen_name(args.paths[1]), 'w', encoding='utf-8')
    for i, pair in enumerate(line_iter(*args.paths)):
        if is_ok(dict_w, stop_w, pair, abs_t=args.abs_diff, rel_t=args.rel_diff):
            file1.write(pair[0])
            file2.write(pair[1])
        if i % 1000 == 0:
            log.info('Lines processed %i', i + 1)
    file1.close()
    file2.close()
def write_history(text, author):
    """\
    Write the quote to the history file ~/.qhistory in a format that
    you can use with fortune (strfile).
    """
    with copen(expanduser('~/.qhistory'), 'a', 'utf8') as fh:
        fh.write(text + "\n")
        fh.write(' ' * 50 + '- ' + author + "\n")
        fh.write("%\n")
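# A usage sketch: append one quote in the fortune-compatible layout the
# docstring describes (indented author line, '%' as the record separator).
write_history(u'Simplicity is the soul of efficiency.', u'Austin Freeman')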
def removeLabels(f_gold, f_test, slda=True):
    """ Remove labels from the gold for inference.
    Preprocessing:
        tail -n $(25%) corpus > gold
        sed -i "$(total - 25%),$(total)d"
    """
    print "Removing labels"
    with copen(f_gold, "r", encoding="utf-8") as gold_f:
        with copen(f_test, "w", encoding="utf-8") as test_f:
            for line in gold_f:
                if not slda:
                    test_f.write(sub(r"\[[0-9]\] ", "", line))
                else:
                    line = sub(r"\]", "", line)
                    test_f.write(sub(r"\[[0-9]\|", "", line))
def splitText(f_lda, a, f_train, f_gold):
    """ Split the text at a given percentage. """
    with copen(f_lda, encoding="utf-8") as corpus_f:
        corpus = corpus_f.readlines()
    x = len(corpus) / 100 * a
    train = corpus[0:x]
    gold = corpus[x:]
    with copen(f_train, "w", encoding="utf-8") as train_f:
        for line in train:
            train_f.write(line)
    with copen(f_gold, "w", encoding="utf-8") as gold_f:
        for line in gold:
            gold_f.write(line)
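# A usage sketch (hypothetical paths): keep the first 75% of the corpus as
# training data and the remaining 25% as gold data.
splitText('corpus.lda', 75, 'corpus.train', 'corpus.gold')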
def __init__(self):
    """Sets up the classifier"""
    #import the main payload with keywords for matching/blocking
    with copen("payload_lica.json", encoding='utf8') as f:
        self.payload = load(f)
    #Build a mapping in memory of keyword to category
    #The payload is kept in the reverse format to make it easier to edit
    self.positive_keywords = {}
    for top_level, sub_level in self.payload['positive_words'].iteritems():
        for category, keywords in sub_level.iteritems():
            for keyword in keywords:
                self.positive_keywords[keyword] = [top_level, category]
    #create a simple ignored words checker
    self.ignored_words = set(self.payload["ignore_words"])
    #import the domain rules
    with copen("payload_domain_rules.json", encoding='utf8') as f:
        self.rules = load(f)
    #convert the host rules into an easily searchable format
    # e.g. "au.movies.yahoo.com": "television",
    # should be: "yahoo.com": { 'movies': { 'au': ['arts & entertainment', 'television'] } }
    self.host_rules = defaultdict(dict)
    for host_rule, category in self.rules['host_rules'].iteritems():
        domain = extract(host_rule)  # ExtractResult(subdomain='au.movies', domain='yahoo', suffix='com')
        tld = domain.domain + "." + domain.suffix  # yahoo.com
        host = domain.subdomain.split('.')  # ['au', 'movies']
        tree = make_tree(host[::-1], category)  # {'movies': {'au': ['arts & entertainment', 'television']}}
        merge(self.host_rules, {tld: tree})  # merge the host rules with this new data
    #convert the path rules into an easily searchable format
    self.path_rules = defaultdict(dict)
    for path_rule, category in self.rules['path_rules'].iteritems():
        domain = extract(path_rule)
        tld = domain.domain + "." + domain.suffix
        #sort of ignoring host+path rules, those can be covered by full DFR later
        path = path_rule.split('/')[1]
        self.path_rules[tld][path] = category
def parse_pdtb_file(pdtb_file_name):
    relation_jsons = []
    with copen(pdtb_file_name, mode='r', encoding='latin1') as f:
        lines = deque(f.readlines())
        while len(lines) > 0:
            relation_json = relation(lines)
            relation_jsons.append(relation_json)
    return relation_jsons
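# A usage sketch (hypothetical path): parse one PDTB annotation file into a
# list of relation objects; the internal `relation` helper consumes lines
# from the shared deque until it is empty.
relations = parse_pdtb_file('wsj_0001.pdtb')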
def make_ontology_file():
    wikipedia_page_keywords = {}  # e.g. "Down_to_Earth_%28Justin_Bieber_song%29": ['one', 'girl']
    with copen("topic_signatures_en.tsv", encoding='utf8') as raw:
        for n, line in enumerate(raw):
            line = line[:-1].split('\t')  # remove the newline character and separate title from rest
            wiki_article_title = line[0]  # useful
            rest = line[1].split('"')
            page_text_salient_keywords = [x for x in rest[-1].split() if x not in STOPWORDS]  # useful
            wikipedia_page_keywords[wiki_article_title] = page_text_salient_keywords
            if n % 100000 == 0:
                print "Processed {0}% of the pages".format((n / 3500000.0) * 100)
    print "Total: {0} articles".format(len(wikipedia_page_keywords))
    with copen("article_category_matrix.tsv", encoding='utf8') as f:
        # has 144k categories, 97k without numbers
        article_phrase_matrix = defaultdict(lambda: defaultdict(int))
        for n, line in enumerate(f):
            line = line.split("\t")
            category = line[0]
            if not re.match('.*[0-9].*', category):
                # as long as the category doesn't have a number in it
                articles = line[1:]
                for article in articles:
                    if article in wikipedia_page_keywords:
                        for phrase in wikipedia_page_keywords[article]:
                            article_phrase_matrix[category][phrase] += 1
            if n % 10000 == 0:
                print "Processed {0}".format(n)
    #now export in the form:
    #category \t phrase \t count \t phrase \t count
    with copen('payload.lwca', 'w', encoding='utf8') as f:
        for category, words in article_phrase_matrix.iteritems():
            phrases = []
            for phrase, count in sorted(words.items(), key=lambda x: x[1], reverse=True):
                phrases.append(u"{0}\t{1}".format(phrase, count))
            f.write(u"{0}\t{1}\n".format(category, '\t'.join(phrases)))
def output_file(self):
    if not hasattr(self, '_output_file'):
        if self.output_file_path:
            self._output_file = copen(self.output_file_path, 'w+',
                                      encoding=self.output_encoding)
        else:
            self._output_file = stdout
    return self._output_file
def __init__(self, file, *args, **kwargs):
    self.file = file if file[1] == ':' else '%s/%s' % (
        '/'.join(sys.argv[0].split('/')[:-1]), file)
    with copen(file, 'r', encoding='utf-8') as f:
        self.update(dict([(x.split(self.kvSep, 1)[0], x.split(self.kvSep, 1)[1])
                          for x in f.read().split(self.separator)
                          if x != '' and x != '\n']))
    self.update(dict(*args, **kwargs))
def parseFile(filename):
    result = []
    try:
        with copen(filename, "r", 'utf-8') as f:
            result = json.load(f)
    except:
        print("Failed to parse: " + filename)
        quit(1)
    return result
def evaluateSLDA(f_gold, f_results):
    """ Evaluation Metrics for SLDA. """
    print "Starting evaluation"
    counts = {
        "0": [0., 0., 0., 0.],  # [TP, FP, FN, TN]
        "1": [0., 0., 0., 0.],
        "2": [0., 0., 0., 0.]
    }
    conf = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
    with copen(f_gold, encoding="utf-8") as gold_f:
        with open(f_results) as results_f:
            for g, r in zip(gold_f, results_f):
                g_topics = findall(r"\[([0-9])\|", g)
                r_topics = r.split()
                for gold, pred in zip(g_topics, r_topics):
                    for k in counts:
                        if k == pred:
                            conf[int(k)][int(gold)] += 1.
                            if k == gold:    # TP
                                counts[k][0] += 1.
                            else:            # FP
                                counts[k][1] += 1.
                        else:
                            if k == gold:    # FN
                                counts[k][2] += 1.
                            else:            # TN
                                counts[k][3] += 1.
    for k in counts:
        precision = counts[k][0] / (counts[k][0] + counts[k][1])
        recall = counts[k][0] / (counts[k][0] + counts[k][2])
        print "Precision of %s:\t\t\t%.2f" % (k, round(precision * 100, 2))
        print "Recall of %s:\t\t\t%.2f" % (k, round(recall * 100, 2))
        print "Specificity of %s:\t\t\t%.2f" % (
            k, round((counts[k][3] / (counts[k][3] + counts[k][1])) * 100, 2))
        print "Accuracy of %s:\t\t\t%.2f" % (
            k, round(((counts[k][0] + counts[k][3]) /
                      (counts[k][0] + counts[k][1] + counts[k][2] + counts[k][3])) * 100, 2))
        print "F1-Score of %s:\t\t\t%.2f" % (
            k, round(2 * ((precision * recall) / (precision + recall)), 2))
        print
def fetch_kle_json(gist_id):
    """Returns the JSON for a keyboard-layout-editor URL.
    """
    cache_file = '/'.join((cache_dir, gist_id))
    headers = {}
    if exists(cache_file):
        # We have a cached copy
        file_stat = stat(cache_file)
        file_age = time() - file_stat.st_mtime
        if file_stat.st_size == 0:
            logging.warning('Removing zero-length cache file %s', cache_file)
            remove(cache_file)
        elif file_age < 30:
            logging.info('Using cache file %s (%s < 30)', cache_file, file_age)
            return copen(cache_file, encoding='UTF-8').read()
        else:
            headers['If-Modified-Since'] = strftime(
                '%a, %d %b %Y %H:%M:%S %Z', localtime(file_stat.st_mtime))
            logging.warning('Adding If-Modified-Since: %s to headers.',
                            headers['If-Modified-Since'])
    keyboard = requests.get(gist_url % gist_id, headers=headers)
    if keyboard.status_code == 304:
        logging.debug("Source for %s hasn't changed, loading from disk.", cache_file)
        return copen(cache_file, encoding='UTF-8').read()
    keyboard = keyboard.json()
    for file in keyboard['files']:
        keyboard_text = keyboard['files'][file]['content']
        break  # First file wins, hope there's only one...
    if not exists(cache_dir):
        makedirs(cache_dir)
    with copen(cache_file, 'w', encoding='UTF-8') as fd:
        fd.write(keyboard_text)  # Write this to a cache file
    return keyboard_text
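# A usage sketch (the gist id is a placeholder, not a real gist): fetch the
# raw keyboard-layout-editor text, served from the 30-second cache when fresh.
kle_text = fetch_kle_json('0123456789abcdef0123456789abcdef')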