示例#1
0
def find_similar_pairs(data, output_path='similar_0817.json'):
    """Write each document's most similar counterpart to a JSON file.

    The input is loaded into a fresh ``TextProcessor``, its similarity
    analysis is run, and the resulting pairs are saved as a single JSON
    object that maps the first document's link to the second document's link.

    Args:
        data: Raw input handed unchanged to ``TextProcessor.map_json_data``.
        output_path: Destination of the JSON file. Defaults to
            ``'similar_0817.json'``, the name this function previously
            hard-coded, so existing callers are unaffected.

    Note:
        Links are used as the JSON keys, so if several pairs start from
        documents that share the same link, only the last such pair is kept.
    """
    processor = TextProcessor()
    processor.map_json_data(data)
    similar_pairs = processor.similarity_analysis()

    # Each pair holds two keys into the processor's document collection.
    documents = processor.doc_collection
    links_by_source = {
        documents[first].link: documents[second].link
        for first, second in similar_pairs
    }

    # json.dump emits ASCII-only text by default; the explicit encoding just
    # keeps the file independent of the platform's locale settings.
    with open(output_path, 'w', encoding='utf-8') as file_output:
        json.dump(links_by_source, file_output)
示例#2
0
        "DATA_PATH": str,
        "session_num": 13,
    }

    if os.environ["HOME"] == "/root":
        args["DATA_PATH"] = "/content/gdrive/MyDrive/bert-for-hmltc/data"
    else:
        args["DATA_PATH"] = "data"

    random.seed(args["seed"])
    np.random.seed(args["seed"])
    torch.manual_seed(args["seed"])

    logger.info("Initializing…")
    tokenizer = load_tokenizer(args)
    processor = TextProcessor(args, tokenizer, logger, "topic_list.json")

    if args["use_parents"]:
        model = create_experimental(args, len(processor.labels))
    else:
        model = create_baseline(args, len(processor.labels))

    model_state_dict = torch.load(
        join(args["DATA_PATH"], "model_files/13_finetuned_pytorch_model.bin"),
        map_location="cpu",
    )
    model.load_state_dict(model_state_dict)
    if args["do_train"]:
        trainer = ModelTrainer(args, model, logger)

        logger.info("Loading data…")
示例#3
0
from flask import Flask, render_template, request
from processor import TextProcessor, Document
import re
from bs4 import BeautifulSoup
import urllib3
import numpy as np
from flask.ext.pymongo import PyMongo
import os
import sys

# Application-wide objects, created once when this module is imported.
# NOTE(review): PyMongo is imported above via `flask.ext.pymongo`; the
# `flask.ext` namespace was removed in Flask 1.0, so on a current Flask this
# import likely needs to become `from flask_pymongo import PyMongo` - confirm
# against the pinned Flask version.
app = Flask(__name__)
# The MongoDB connection URI comes from the environment; os.getenv returns
# None when MONGOHQ_URL is unset.
app.config['MONGO_URI'] = os.getenv('MONGOHQ_URL')
mongo = PyMongo(app)
processor = TextProcessor()

# Still at import time: hand the result of querying the `postings` collection
# to the processor and call build_doc_matrix(), so this work happens once at
# startup rather than inside a request handler.
with app.app_context():
    processor.map_data(mongo.db.postings.find())
    processor.build_doc_matrix()


@app.route('/', methods=['GET', 'POST'])
def main():
    """Serve the index page and validate a submitted URL.

    A GET request renders ``index.html`` unchanged. Otherwise (the route only
    accepts GET and POST) the ``url`` form field is read, stripped of
    surrounding whitespace, and rejected with an error message on the index
    page unless it contains ``newyork.craigslist.org/``.

    NOTE(review): the visible part of this view stops after the domain check,
    so what happens to a URL that passes validation is not shown here.
    """
    # Plain page load: nothing has been submitted yet.
    if request.method == 'GET':
        return render_template('index.html')

    url = request.form['url'].strip()
    domain = 'newyork.craigslist.org/'

    # Substring check only: the domain may appear anywhere in the submitted text.
    if not domain in url:
        return render_template('index.html', error='Please enter a valid URL')
示例#4
0
 def setUp(self) -> None:
     """Assign a new ``TextProcessor`` instance to ``self.processor``.

     NOTE(review): named like the unittest per-test fixture hook; the
     enclosing class is not visible here, so confirm it is a ``TestCase``.
     """
     self.processor = TextProcessor()