def compare(): """Handle requests for /compare via POST""" # Read files if not request.files["file1"] or not request.files["file2"]: abort(400, "missing file") try: file1 = request.files["file1"].read().decode("utf-8") file2 = request.files["file2"].read().decode("utf-8") except Exception: abort(400, "invalid file") # Compare files if not request.form.get("algorithm"): abort(400, "missing algorithm") elif request.form.get("algorithm") == "lines": regexes = [f"^{re.escape(match)}$" for match in lines(file1, file2)] elif request.form.get("algorithm") == "sentences": regexes = [re.escape(match) for match in sentences(file1, file2)] elif request.form.get("algorithm") == "substrings": if not request.form.get("length"): abort(400, "missing length") elif not int(request.form.get("length")) > 0: abort(400, "invalid length") regexes = [re.escape(match) for match in substrings( file1, file2, int(request.form.get("length")))] else: abort(400, "invalid algorithm") # Highlight files highlights1 = highlight(file1, regexes) highlights2 = highlight(file2, regexes) # Output comparison return render_template("compare.html", file1=highlights1, file2=highlights2)
def compare(): """Handle requests for /compare via POST""" # Read files if not request.files["file1"] or not request.files["file2"]: abort(400, "missing file") try: file1 = request.files["file1"].read().decode("utf-8") file2 = request.files["file2"].read().decode("utf-8") except Exception: abort(400, "invalid file") # Compare files if not request.form.get("algorithm"): abort(400, "missing algorithm") elif request.form.get("algorithm") == "lines": regexes = [f"^{re.escape(match)}$" for match in lines(file1, file2)] elif request.form.get("algorithm") == "sentences": regexes = [re.escape(match) for match in sentences(file1, file2)] elif request.form.get("algorithm") == "substrings": if not request.form.get("length"): abort(400, "missing length") elif not int(request.form.get("length")) > 0: abort(400, "invalid length") regexes = [re.escape(match) for match in substrings( file1, file2, int(request.form.get("length")))] else: abort(400, "invalid algorithm") # Highlight files highlights1 = highlight(file1, regexes) highlights2 = highlight(file2, regexes) # Output comparison return render_template("compare.html", file1=highlights1, file2=highlights2)
def rank_sentences(doc, tfidf_dict, include_words=False):
    """Given a document and its tf-idf dictionary {word: word_tfidf_score},
    return a list of sentences with their scores: [(sent1, score1), (sent2, score2), ...]"""
    ranked_sentences = []
    for sentence in helpers.sentences(doc):
        word_list = helpers.clean(sentence)
        if len(word_list) < 5:
            continue  # skip sentences with fewer than 5 words
        score = sum([tfidf_dict[word[0]] for word in word_list])
        ranked_sentences.append((sentence, score))
    return ranked_sentences
def main():

    # Parse command-line arguments
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--lines", action="store_true", help="compare lines")
    group.add_argument("--sentences", action="store_true", help="compare sentences")
    group.add_argument("--substrings", metavar="N", type=positive,
                       help="compare substrings of length N")
    parser.add_argument("FILE1", help="file to compare")
    parser.add_argument("FILE2", help="file to compare")
    args = vars(parser.parse_args())

    # Read files
    try:
        with open(args["FILE1"], "r") as file:
            file1 = file.read()
    except IOError:
        sys.exit(f"Could not read {args['FILE1']}")
    try:
        with open(args["FILE2"], "r") as file:
            file2 = file.read()
    except IOError:
        sys.exit(f"Could not read {args['FILE2']}")

    # Compare files
    if args["lines"]:
        matches = lines(file1, file2)
    elif args["sentences"]:
        matches = sentences(file1, file2)
    elif args["substrings"]:
        matches = substrings(file1, file2, args["substrings"])

    # Output matches, sorted from longest to shortest, with line endings escaped
    for match in sorted(matches, key=len, reverse=True):
        print(match.replace("\n", "\\n").replace("\r", "\\r"))
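# Note: `positive` (used as the argparse type for --substrings above) is not
# defined in this excerpt. A minimal sketch, assuming it only needs to validate
# that N is a positive integer:
def positive(value):
    """Convert an argparse string argument to a positive int, or raise an error."""
    n = int(value)  # a non-numeric string raises ValueError, which argparse reports as invalid
    if n <= 0:
        raise argparse.ArgumentTypeError("length must be a positive integer")
    return n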
#!/usr/bin/env python3
import sys, re, json, fileinput, glob

from helpers import sentences

REVIEWSDIR = 'UD_English/not-to-release/sources/reviews'
CONLLULEX = sys.argv[1]

# load UD data
ud = {}
udDocs = glob.glob(f'{REVIEWSDIR}/*.xml.conllu')
for udDoc in udDocs:
    for sent in sentences(udDoc):
        ud[sent.meta_dict['sent_id']] = (udDoc, sent)

nSentsChanged = nToksChanged = nTagsChanged = nLemmasChanged = nDepsChanged = 0

for sent in sentences(CONLLULEX):
    # metadata shouldn't change (assume tokenization hasn't changed)
    print(*sent.meta, sep='\n')
    newudDoc, newudsent = ud[sent.meta_dict['sent_id']]
    assert len(sent.tokens) == len(newudsent.tokens)
    sentChanged = False
    for tok, newudtok in zip(sent.tokens, newudsent.tokens):
        oldud = '\t'.join(tok.orig.split('\t')[:10])
        newud = '\t'.join(newudtok.orig.split('\t')[:10])
        if oldud != newud:
            nToksChanged += 1
            sentChanged = True