def find_similar_pairs(data, output_path='similar_0817.json'):
    """Find the most similar document for each document and dump the pairs.

    A ``TextProcessor`` is built, ``data`` is loaded into it through
    ``map_json_data``, and ``similarity_analysis`` is asked for pairs of
    keys into ``processor.doc_collection``. The pairs are written out as a
    single JSON object mapping the first document's ``link`` to the
    second document's ``link``.

    Args:
        data: Input passed unchanged to ``TextProcessor.map_json_data``.
        output_path: File to write the JSON result to. Defaults to
            ``similar_0817.json``, the name that used to be hard-coded,
            so existing callers keep their behavior.

    Returns:
        None. The result is only written to ``output_path``.
    """
    processor = TextProcessor()
    processor.map_json_data(data)
    similar_pairs = processor.similarity_analysis()

    docs = processor.doc_collection
    # The mapping is keyed by link: if two pairs start from documents that
    # share the same link, the later pair overwrites the earlier one.
    data_output = {docs[f].link: docs[s].link for f, s in similar_pairs}

    with open(output_path, 'w') as file_output:
        json.dump(data_output, file_output)
"DATA_PATH": str, "session_num": 13, } if os.environ["HOME"] == "/root": args["DATA_PATH"] = "/content/gdrive/MyDrive/bert-for-hmltc/data" else: args["DATA_PATH"] = "data" random.seed(args["seed"]) np.random.seed(args["seed"]) torch.manual_seed(args["seed"]) logger.info("Initializing…") tokenizer = load_tokenizer(args) processor = TextProcessor(args, tokenizer, logger, "topic_list.json") if args["use_parents"]: model = create_experimental(args, len(processor.labels)) else: model = create_baseline(args, len(processor.labels)) model_state_dict = torch.load( join(args["DATA_PATH"], "model_files/13_finetuned_pytorch_model.bin"), map_location="cpu", ) model.load_state_dict(model_state_dict) if args["do_train"]: trainer = ModelTrainer(args, model, logger) logger.info("Loading data…")
from flask import Flask, render_template, request from processor import TextProcessor, Document import re from bs4 import BeautifulSoup import urllib3 import numpy as np from flask.ext.pymongo import PyMongo import os import sys app = Flask(__name__) app.config['MONGO_URI'] = os.getenv('MONGOHQ_URL') mongo = PyMongo(app) processor = TextProcessor() with app.app_context(): processor.map_data(mongo.db.postings.find()) processor.build_doc_matrix() @app.route('/', methods=['GET', 'POST']) def main(): if request.method == 'GET': return render_template('index.html') url = request.form['url'].strip() domain = 'newyork.craigslist.org/' if not domain in url: return render_template('index.html', error='Please enter a valid URL')
def setUp(self):
    """Construct a new ``TextProcessor`` and store it on ``self.processor``.

    NOTE(review): presumably a ``unittest.TestCase`` fixture hook that runs
    before each test method -- the class header is not visible here; confirm.
    """
    fresh_processor = TextProcessor()
    self.processor = fresh_processor