def readdoc(rpath, fdocdict):
    """ Read every *.merge file in a folder and dump them into one file

    :type rpath: string
    :param rpath: folder that is scanned (non-recursively) for *.merge files

    :type fdocdict: string
    :param fdocdict: path of the gzip file that receives the dumped dict,
        which maps each *.merge path to the object returned by
        ``DocReader.read``
    """
    files = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith(".merge")]
    # One reader instance is shared by all files
    dr = DocReader()
    docdict = {}
    for fmerge in files:
        docdict[fmerge] = dr.read(fmerge)
    # Parenthesised single-argument form prints the same text on
    # Python 2 and Python 3 (the bare print statement is 2-only)
    print('Write doc dict into {}'.format(fdocdict))
    # gzip.open treats 'w' as binary write
    with gzip.open(fdocdict, 'w') as fout:
        dump(docdict, fout)
def readdoc(rpath, fdocdict):
    """ Read every *.merge file in a folder and dump them into one file

    :type rpath: string
    :param rpath: folder that is scanned (non-recursively) for *.merge files

    :type fdocdict: string
    :param fdocdict: path of the gzip file that receives the dumped dict,
        which maps each *.merge path to the object returned by
        ``DocReader.read``
    """
    # NOTE(review): this view also contains an earlier, equivalent
    # definition of readdoc; the later definition is the one that takes
    # effect at import time -- consider keeping only one.
    files = [
        join(rpath, fname)
        for fname in listdir(rpath)
        if fname.endswith(".merge")
    ]
    # One reader instance is shared by all files
    dr = DocReader()
    docdict = {}
    for fmerge in files:
        doc = dr.read(fmerge)
        docdict[fmerge] = doc
    # Parenthesised single-argument form prints the same text on
    # Python 2 and Python 3 (the bare print statement is 2-only)
    print('Write doc dict into {}'.format(fdocdict))
    # gzip.open treats 'w' as binary write
    with gzip.open(fdocdict, 'w') as fout:
        dump(docdict, fout)
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    For every *.merge file found directly under ``path`` the document is
    parsed, the predicted brackets are written next to it as *.brackets
    and, optionally, the tree is drawn to *.ps and scored against the
    gold *.dis file.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score

    :param bcvocab: passed through unchanged to ``pm.sr_parse``

    :type draw: boolean
    :param draw: whether to draw each predicted tree into a *.ps file

    :param withdp: forwarded to ``ParsingModel``
    :param fdpvocab: forwarded to ``ParsingModel``
    :param fprojmat: forwarded to ``ParsingModel``
    """
    suffix = '.merge'
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith(suffix)]
    for fmerge in doclist:
        # Strip only the trailing suffix: str.replace would also rewrite
        # a '.merge' that happens to appear in a directory name.
        stem = fmerge[:-len(suffix)]
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, stem + '.ps')
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        # Write brackets into file
        writebrackets(stem + '.brackets', pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            gold_rst = RSTTree(stem + '.dis', fmerge)
            gold_rst.build()
            # NOTE(review): the gold brackets were computed but never
            # used; the call is kept in case bracketing() has side
            # effects -- confirm and drop it if it does not.
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def build(self):
    """ Build BINARY RST tree

    Reads the annotation text from ``self.fdis``, builds and binarizes
    the tree, reads the document from ``self.fmerge`` and back-propagates
    the document information onto the tree. The results are stored in
    ``self.tree`` and ``self.doc``.

    :raises IOError: if ``self.fdis`` cannot be opened, or if
        ``self.fmerge`` is not an existing file
    """
    # Use a context manager so the file handle is closed right after
    # reading instead of being left to the garbage collector
    with open(self.fdis) as fin:
        text = fin.read()
    # Build RST as annotation
    self.tree = buildtree(text)
    # Binarize it
    self.tree = binarizetree(self.tree)
    # Read doc file
    if not isfile(self.fmerge):
        raise IOError("File doesn't exist: {}".format(self.fmerge))
    dr = DocReader()
    self.doc = dr.read(self.fmerge)
    # Prop information from doc on the binarized RST tree
    self.tree = backprop(self.tree, self.doc)
def eval_parser_unit(fmerge, bcvocab=None, pm=None, draw=False):
    """ Parse one *.merge file and write the predicted brackets

    :type fmerge: string
    :param fmerge: path to the *.merge file; the outputs are written to
        the same path with '.merge' replaced by '.brackets' (and '.ps'
        when drawing)

    :param bcvocab: passed to ``pm.sr_parse``; when None the module-level
        ``global_bv`` is used

    :param pm: object whose ``sr_parse`` method does the parsing; when
        None the module-level ``global_pm`` is used

    :type draw: boolean
    :param draw: whether to draw the predicted tree into a *.ps file
    """
    # Fall back to the module-level objects only when the caller did not
    # supply its own; previously explicit arguments were overwritten.
    if bcvocab is None:
        bcvocab = global_bv
    if pm is None:
        pm = global_pm
    assert bcvocab is not None
    assert pm is not None
    dr = DocReader()
    doc = dr.read(fmerge)
    # ----------------------------------------
    # Parsing
    pred_rst = pm.sr_parse(doc, bcvocab)
    if draw:
        strtree = pred_rst.parse()
        drawrst(strtree, fmerge.replace(".merge", ".ps"))
    # Get brackets from parsing results
    pred_brackets = pred_rst.bracketing()
    fbrackets = fmerge.replace('.merge', '.brackets')
    # Write brackets into file
    writebrackets(fbrackets, pred_brackets)
def diff_compare_files():
    """ Compare two uploaded files and return the result of diff_compare

    On a POST request, collects the uploads whose form key starts with
    'file'. Exactly two named uploads are required. Two .docx uploads are
    read through ``DocReader.process``; two ``text/*`` uploads are decoded
    as UTF-8 and split into lines. Every rejected request flashes a
    message and redirects to '/diff'.

    Requests with any other method return None, as before.
    NOTE(review): presumably the route is registered for POST only --
    confirm against the route declaration.
    """
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    def is_docx(upload):
        # Both the extension and the declared content type must match
        return upload.filename.endswith(
            '.docx') and upload.content_type == docx_mime

    def is_text(upload):
        return upload.content_type.startswith('text/')

    if request.method == 'POST':
        files = [v for k, v in request.files.items() if k.startswith('file')]
        if len(files) != 2:
            flash('Not enough files')
            return redirect('/diff')
        for f in files:
            if not f.filename:
                flash('No selected file')
                return redirect('/diff')
        file1, file2 = files
        print(file1)
        print(file2)
        if is_docx(file1) and is_docx(file2):
            dr = DocReader()
            body1 = list(dr.process(file1.stream))
            body2 = list(dr.process(file2.stream))
        elif is_text(file1) and is_text(file2):
            body1 = file1.stream.read().decode("utf-8").splitlines()
            body2 = file2.stream.read().decode("utf-8").splitlines()
        else:
            flash('unsupported.')
            # Stop here like the other error branches; falling through
            # would call diff_compare with both bodies set to None.
            return redirect('/diff')
        return diff_compare(file1.filename, body1, file2.filename, body2)