def __load(self, ifile, encd):
    '''Load regular expressions from ifile.

    @param ifile - name of the input file containing the regexps
    @param encd - encoding of the input file

    @return result of self.compile() applied to the gathered list of
            RegExpStruct instances
    '''
    # start with one (empty) chunk of regular expressions
    re_list = [RegExpStruct()]
    match = None
    cnt = 0
    finput = AltFileInput(ifile, encoding = encd)
    for line in finput:
        match = RE_OPTIONS.match(line)
        # different regexp options will separate different
        # chunks of regular expressions
        if match:
            # increment counter only if we have already seen any
            # regexps before (i.e. the very first chunk is re-used
            # when it is still empty)
            if cnt != 0 or re_list[0][0]:
                re_list.append(RegExpStruct())
                cnt += 1
            # securely interpret options passed as strings as valid
            # python code
            re_list[cnt][1] = self.__parse_options(match)
        else:
            # strip off comments
            line = skip_comments(line)
            # and remember the line if it is not empty
            if line:
                re_list[cnt][0].extend(self.istring_hook(line))
    return self.compile(re_list)
def __init__(self, fname, encoding = 'utf-8'):
    """Read P2P rules from file and populate instance.

    @param fname - name of the file containing the rules
    @param encoding - encoding of the input file (defaults to 'utf-8')
    """
    self.rules = []
    self.flags = ''
    # BUG FIX: honor the caller-supplied `encoding` argument instead of
    # the previously hard-coded 'utf-8'
    ifile = AltFileInput(fname, encoding = encoding)
    for line in ifile:
        self.__parse(line)
def load_polar_dicts(dfnames):
    """Load polar words into polarity dictionary.

    @param dfnames - names of files containing the polarity dictionaries

    @return None (the module-level `polar_dict` is populated as a side
            effect)
    """
    global polar_dict
    # iterate over lines of all polarity dictionaries
    finput = AltFileInput(*dfnames)
    for iline in finput:
        # skip empty lines (previously these crashed the 3-way split
        # below) and comment lines
        if not iline or COMMENT_RE.match(iline):
            continue
        word, tag, score = iline.split('\t')
        word = word.lower()
        # polarity scores are stored as absolute values
        polarity = abs(float(score))
        if tag == ANY_TAG:
            # an entry valid for any part of speech is keyed by the
            # lowercased word form alone
            polar_dict[word] = polarity
        else:
            # otherwise the entry is keyed by the (word, tag) pair
            polar_dict[(word, tag)] = polarity
def __init__(self, ifile=DEFAULT_NR_FILE): """Create an instance of NoiseRestorer. @param ifile - name of file containing list of elements which should be restored """ # set of words which are replecements that should be restored self.rwords = set([]) # list of regexps, which are checked against replacements and once they # match these replecements, those replacements should be restored to # original form self.rre = [] # container for storing replacement information self.restoreList = [] self.tokenOffsets = Offsets() finput = AltFileInput(ifile) mobj = None for line in finput: line = skip_comments(line) if not line: continue mobj = __RWORD_RE__.match(line) if mobj: self.rwords.add(mobj.group(1)) continue mobj = __RREX_RE__.match(line) if mobj: self.rre.append("(?:" + mobj.group(1) + ")") continue raise RuleFormatError( "Unrecognized line format for NoiseRestorer.") if self.rre: self.rre = re.compile("(?:" + '|'.join(self.rre) + ")") else: self.rre = re.compile("(?!)") self.t_offset = -1 self.r_offset = -1 self.t_length = -1 self.r_length = -1
def __load(self, ifile): """Load map entries from file ifile.""" # load map entries from file output = {} optmatch = None finput = AltFileInput(ifile, encoding=self.encd) for line in finput: if line: optmatch = RE_OPTIONS.match(line) if optmatch: if self.flags: raise RuleFormatError( \ msg = "Multiple flag lines are not supported", \ efile = finput) else: self.flags = optmatch.group(1) self.ignorecase = RegExp(self.flags, "").re.flags & re.IGNORECASE continue # find map entries line = skip_comments(line) m = MAP_DELIMITER.search(line) if m: src, trg = self.__normalize_quotes(line[0:m.start()], \ line[m.end():]) if not (src and trg): print src.encode('utf-8') print trg.encode('utf-8') raise RuleFormatError(efile=finput) src = re.escape(src) if self.ignorecase: output[src.lower()] = trg else: output[src] = trg elif line: raise RuleFormatError(efile=finput) return output
help="switch verbose statistics mode on", action="store_true") args = argparser.parse_args() ################################################################## # Main unigram_prob = pickle.load(args.unigram_prob_file) args.unigram_prob_file.close() bigram_prob = pickle.load(args.bigram_prob_file) args.bigram_prob_file.close() esc_char = args.esc_char skip_line = args.skip_line foutput = AltFileOutput(encoding=args.encoding, flush=args.flush) finput = AltFileInput(*args.files, print_func=foutput.fprint, errors='replace') memory = Memory() # unfortunately, rules for restoration of misspellings are currently hard-coded # in the `misspellings.py` file misspelling_restorer = MisspellingRestorer(unigram_prob, bigram_prob) # iterate over input lines, skip empty and skip lines, pre-cache information # about replacements for line in finput: # print empty and skip lines unchanged if line == skip_line or not line: # check if memory is empty and print it otherwise print_mem() foutput.fprint(line) # check if current line contains meta information
action = 'store_true', default=True) argparser.add_argument("files", help = "input files in which equal and odd strings should be aligned", \ nargs = '*', type = argparse.FileType('r'), \ default = [sys.stdin]) args = argparser.parse_args() # input/output encoding enc = args.encoding # determine which type of alignment is requested if args.needleman_wunsch: alignfunc = nw_align else: alignfunc = hb_align # establish Input/Output foutput = AltFileOutput(encoding=args.encoding) finput = AltFileInput(*args.files, \ print_func = foutput.fprint, \ errors = "replace") # auxiliary variables line1 = line2 = '' oline1 = [] oline2 = [] alignment = [] c_list = [] c_i = c_len = 0 fnr = 0 # iterate over input lines for line in finput: if finput.fnr % 2 == 0: line2 = line
precede lines with meta-information""", nargs = 1, type = str, \ default = os.environ.get("SOCMEDIA_ESC_CHAR", "")) argparser.add_argument("-e", "--encoding", help="input/output encoding", \ default = DEFAULT_LANG) argparser.add_argument("conll_file", help="file with DG trees in CONLL format") argparser.add_argument("token_file", help="file with original tokenization") argparser.add_argument("word_file", help="file with MMAX words") argparser.add_argument("annotation_files", help="files with MMAX markables", nargs='*') args = argparser.parse_args() # variables esc_char = args.esc_char foutput = AltFileOutput(encoding=args.encoding) finput = AltFileInput(args.conll_file, print_func=foutput.fprint) # skip files with no annotation if not args.annotation_files: sys.exit(0) # read and parse CONLL file conlldic = read_conll(finput) # read and parse tokenization file tkndoc = ET.parse(args.token_file) # read and parse MMAX word file wrddoc = ET.parse(args.word_file) # merge annotation with CONLL data merge_conll_mmax_doc(conlldic, tkndoc, wrddoc, args.annotation_files)