def find_similar_pairs(data, output_path='similar_0817.json'):
    """
    Find the most similar document for each document in the collection and
    write the resulting pairs to a JSON file.

    Parameters:
        data: raw input, handed unchanged to ``TextProcessor.map_json_data``.
        output_path: file the JSON mapping is written to. Defaults to
            'similar_0817.json', the name that used to be hard-coded, so
            existing callers keep their behaviour.

    The file contains a single JSON object mapping the ``link`` of a document
    to the ``link`` of the document it was paired with by
    ``similarity_analysis``. Nothing is returned.
    """
    processor = TextProcessor()
    processor.map_json_data(data)
    similar_pairs = processor.similarity_analysis()

    # Each pair holds two subscripts into doc_collection: (document, match).
    docs = processor.doc_collection
    data_output = {
        docs[first].link: docs[second].link
        for first, second in similar_pairs
    }

    with open(output_path, 'w') as file_output:
        json.dump(data_output, file_output)
class ProcessorTest(unittest.TestCase):
    """Unit tests for TextProcessor: document processing, tf-idf matrix
    generation, cosine similarity and top-index selection."""

    @classmethod
    def setUpClass(cls):
        # Small shared corpus; the tests only read from it.
        cls.documents = ('The sky is very blue', 'The sun is bright',
                         'The sun in the sky is bright',
                         'We can see the shining sun, the bright SUN')

    def setUp(self):
        # Every test gets its own processor so no state leaks between tests.
        self.processor = TextProcessor()

    def test_process_doc(self):
        self.assertEqual(self.processor.process_doc(self.documents[0]),
                         ['sky', 'blue'])
        self.assertEqual(self.processor.process_doc(self.documents[3]),
                         ['see', 'shine', 'sun', 'bright', 'sun'])
        # Two documents were processed above.
        self.assertEqual(self.processor.doc_count, 2)

    def test_gen_matrix(self):
        for doc in self.documents:
            self.processor.doc_collection.append(
                self.processor.process_doc(doc))
        self.processor.gen_matrix()

        # verify the generated inverse list (term -> number of documents
        # containing the term)
        self.assertEqual(self.processor.inverse_list, {
            'blue': 1,
            'shine': 1,
            'sun': 3,
            'sky': 2,
            'see': 1,
            'bright': 3
        })

        # verify the tf-idf calculation
        # Columns follow the order blue, shine, sun, sky, see, bright.
        # 4.0 / 3 keeps the ratio fractional even where `/` is integer
        # division (Python 2 without `from __future__ import division`),
        # in which case log(4 / 3) would silently collapse to log(1) == 0.
        idf_3_of_4 = math.log(4.0 / 3)
        expected = [
            [math.log(4), 0, 0, math.log(2), 0, 0],
            [0, 0, idf_3_of_4, 0, 0, idf_3_of_4],
            [0, 0, idf_3_of_4, math.log(2), 0, idf_3_of_4],
            [0, math.log(4), 2 * idf_3_of_4, 0, math.log(4), idf_3_of_4],
        ]
        # Floating-point results: compare with a tolerance, not bit-for-bit.
        np.testing.assert_array_almost_equal(self.processor.doc_mat, expected)

    def test_consine_similarity(self):
        # the formula is the dot product of d1 and d2 over the product of
        # their euclidean lengths
        d1, d2 = [[1, 0, 2, 4], [0, 3, 2, 1]]
        self.assertAlmostEqual(
            self.processor.consine_similarity(d1, d2),
            8 / (math.sqrt(21) * math.sqrt(14)))

    def test_get_top_items(self):
        arr = np.array([2, 6, 8, 4, 5, 3])
        # Indices of the three largest values (8, 6, 5), largest first.
        np.testing.assert_array_equal(self.processor.get_top_ind(arr, 3),
                                      [2, 1, 4])
def __init__(self, parent):
    """Set up the frame, remember ``parent``, create the processor and
    build the widgets through ``initUI``."""
    # The Frame base class is initialised without an explicit master;
    # ``parent`` is only stored on the instance.
    Frame.__init__(self)
    self.processor = TextProcessor()
    self.parent = parent
    self.initUI()
class Example(Frame):
    """Tk frame that steps through the documents of a corpus.

    The left text pane shows the raw text of the current document, the right
    pane shows what ``TextProcessor.process`` returns for it under the
    category chosen in the option menu.
    """

    def __init__(self, parent):
        Frame.__init__(self)
        self.parent = parent
        self.processor = TextProcessor()
        # No corpus is loaded yet. The callbacks test for None so that using
        # the buttons before "Open corpus" does not fail on a missing
        # attribute.
        self.data = None
        self.curdoc = 0
        self.initUI()

    def _categoryIndex(self):
        """Return the position of the selected category in the corpus
        attributes, or None when no usable category is selected.

        "No usable category" covers: no corpus loaded, the 'Categories'
        placeholder, and any value that is not one of the corpus attributes
        (the StringVar starts out as an empty string).
        """
        if self.data is None:
            return None
        name = self.selCategory.get()
        if name == 'Categories' or name not in self.data.attributes:
            return None
        return self.data.attributes.index(name)

    def readCorpus(self, path):
        """Load the corpus at ``path``, rebuild the category menu from its
        attributes and show the first document."""
        self.data = Corpus(path)
        self.processor.calculateConditionalFrequency(self.data, self.selCategory.get())

        menu = self.categoryOption['menu']
        menu.delete(0, 'end')
        for attr in self.data.attributes:
            # v=attr binds the current attribute; a plain closure would see
            # only the last value of the loop variable.
            menu.add_command(label=attr, command=lambda v=attr: self.changeCategory(v))

        self.curdoc = 0
        self.txt1.delete('1.0', END)
        self.txt1.insert('1.0', self.data.docs[self.curdoc].text)

    def refreshTextInfo(self):
        """Redisplay the current document and its processed form."""
        if self.data is None:
            return
        if self._categoryIndex() is not None:
            self.entry1.delete(0, END)
            self.entry1.insert(0, self.data.getAttributeVal(self.curdoc, self.selCategory.get()))
        self.txt1.delete('1.0', END)
        self.txt1.insert('1.0', self.data.docs[self.curdoc].text)
        self.applyProcessing()

    def changeCategory(self, value):
        """Select category ``value`` and recompute the processor statistics
        for it."""
        self.selCategory.set(value)
        if self.data is None:
            # Nothing to look up or recompute until a corpus is loaded.
            return
        self.entry1.delete(0, END)
        self.entry1.insert(0, self.data.getAttributeVal(self.curdoc, self.selCategory.get()))
        self.processor.calculateConditionalFrequency(self.data, self.selCategory.get())

    def prevDocument(self):
        """Step back one document, stopping at the first one."""
        if self.data is not None and self.curdoc > 0:
            self.curdoc -= 1
            self.refreshTextInfo()

    def nextDocument(self):
        """Step forward one document, stopping at the last one."""
        if self.data is not None and self.curdoc < self.data.ndocs - 1:
            self.curdoc += 1
            self.refreshTextInfo()

    def popup(self, event):
        """Show the context menu at the pointer position (right click on the
        processed-text pane)."""
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print("hello " + str(event.widget))
        self.popupmenu.tk_popup(event.x_root, event.y_root, 0)
        print(event.widget.index("@%s,%s" % (event.x, event.y)))

    def applyProcessing(self):
        """Fill the right pane with the processed current document, or clear
        it when no usable category is selected."""
        indxCat = self._categoryIndex()
        if indxCat is not None:
            textResult = self.processor.process(self.data.docs[self.curdoc], indxCat)
        else:
            textResult = ""
        self.txt2.delete('1.0', END)
        self.txt2.insert('1.0', textResult)

    def loadCorpus(self):
        """Ask for a corpus directory and load it."""
        path = tkFileDialog.askdirectory()
        if not path:
            # The dialog was cancelled; keep whatever is currently loaded.
            return
        self.readCorpus(path)
        self.refreshTextInfo()

    def hello(self):
        """Placeholder command used by the context-menu entries."""
        print("Hello")

    def initUI(self):
        """Create and lay out all widgets, the menu bar and the context
        menu."""
        self.parent.title("Simple")
        self.pack(fill=BOTH, expand=True)
        self.centerWindow()

        sw = self.parent.winfo_screenwidth()

        # Top bar: navigation, category selection, attribute value.
        frame1 = Frame(self, relief=RAISED, borderwidth=1)
        frame1.pack(fill=X)

        button1 = Button(frame1, text=u"<", command=self.prevDocument)
        button1.pack(side=LEFT, padx=5, pady=5)

        button2 = Button(frame1, text=u">", command=self.nextDocument)
        button2.pack(side=LEFT, padx=5, pady=5)

        self.selCategory = StringVar(self)
        self.categoryOption = OptionMenu(frame1, self.selCategory, *["Categories"], command=self.changeCategory)
        self.categoryOption.pack(side=LEFT, padx=5, pady=5)

        self.entry1 = Entry(frame1)
        self.entry1.pack(side=LEFT, padx=5, pady=5)

        self.ignoreActualDocVar = IntVar(self)
        checkButton1 = Checkbutton(frame1, text="Ignored", variable=self.ignoreActualDocVar)
        checkButton1.pack(side=LEFT, padx=5, pady=5)

        # NOTE(review): "Save document" is bound to prevDocument, which looks
        # like a copy-paste leftover. No save handler exists in this class,
        # so the binding is left as it was.
        button3 = Button(frame1, text=u"Save document", command=self.prevDocument)
        button3.pack(side=LEFT, padx=5, pady=5)

        # Middle: raw text on the left, processed text on the right.
        frame2 = PanedWindow(self, orient=HORIZONTAL)
        frame2.pack(fill=BOTH, expand=1)

        # Floor division keeps the width an integer on Python 3 as well.
        self.txt1 = Text(frame2, width=sw // 22)
        frame2.add(self.txt1)

        self.txt2 = Text(frame2)
        self.txt2.bind("<Button-3>", self.popup)
        frame2.add(self.txt2)

        # Bottom bar: processing options.
        frame3 = Frame(self, relief=RAISED, borderwidth=1)
        frame3.pack(fill=X)

        self.swVar = IntVar(self)
        checkButton1 = Checkbutton(frame3, text="Remove stop words", variable=self.swVar)
        checkButton1.pack(side=LEFT, padx=5, pady=5)

        self.lowerVar = IntVar(self)
        checkButton1 = Checkbutton(frame3, text="Convert to lower case", variable=self.lowerVar)
        checkButton1.pack(side=LEFT, padx=5, pady=5)

        button3 = Button(frame3, text=u"Apply", command=self.applyProcessing)
        button3.pack(side=LEFT, padx=5, pady=5)

        # create a toplevel menu
        menubar = Menu(self)
        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(label="Quit", command=self.parent.quit)
        filemenu.add_command(label="Open corpus", command=self.loadCorpus)
        menubar.add_cascade(label="Project", menu=filemenu)

        # display the menu
        self.parent.config(menu=menubar)

        self.popupmenu = Menu(self.parent, tearoff=0)
        self.popupmenu.add_command(label="Undo", command=self.hello)
        self.popupmenu.add_command(label="Redo", command=self.hello)

    def centerWindow(self):
        """Size the parent window to two thirds of the screen and centre
        it."""
        sw = self.parent.winfo_screenwidth()
        sh = self.parent.winfo_screenheight()
        w = sw/1.5
        h = sh/1.5
        x = (sw - w) / 2
        y = (sh - h) / 2
        # %d truncates the float values to whole pixels.
        self.parent.geometry('%dx%d+%d+%d' % (w, h, x, y))
"DATA_PATH": str, "session_num": 13, } if os.environ["HOME"] == "/root": args["DATA_PATH"] = "/content/gdrive/MyDrive/bert-for-hmltc/data" else: args["DATA_PATH"] = "data" random.seed(args["seed"]) np.random.seed(args["seed"]) torch.manual_seed(args["seed"]) logger.info("Initializing…") tokenizer = load_tokenizer(args) processor = TextProcessor(args, tokenizer, logger, "topic_list.json") if args["use_parents"]: model = create_experimental(args, len(processor.labels)) else: model = create_baseline(args, len(processor.labels)) model_state_dict = torch.load( join(args["DATA_PATH"], "model_files/13_finetuned_pytorch_model.bin"), map_location="cpu", ) model.load_state_dict(model_state_dict) if args["do_train"]: trainer = ModelTrainer(args, model, logger) logger.info("Loading data…")
from flask import Flask, render_template, request from processor import TextProcessor, Document import re from bs4 import BeautifulSoup import urllib3 import numpy as np from flask.ext.pymongo import PyMongo import os import sys app = Flask(__name__) app.config['MONGO_URI'] = os.getenv('MONGOHQ_URL') mongo = PyMongo(app) processor = TextProcessor() with app.app_context(): processor.map_data(mongo.db.postings.find()) processor.build_doc_matrix() @app.route('/', methods=['GET', 'POST']) def main(): if request.method == 'GET': return render_template('index.html') url = request.form['url'].strip() domain = 'newyork.craigslist.org/' if not domain in url: return render_template('index.html', error='Please enter a valid URL')
def setUp(self):
    """Attach a newly constructed TextProcessor to the instance as
    ``self.processor``."""
    self.processor = TextProcessor()