def start(self):
    """
    start the learn job

    the job includes the following things:
    * fetch the training set for each supported language
    * train a classifier on it
    * pickle the trained classifier to a file
    """
    t = TicToc()
    t.tic()
    sql_user_name = read_sql_user_name()
    wikishield_conn = WS(sql_user_name)
    wikishield_conn.start()
    lm = LangsManager()
    for lang_name in lm.langs_names:
        lang = lm.get_lang(lang_name)
        clf = WikiClassifier(lang, wikishield_conn.ctx)
        clf.learn(limit=None)
        file_path = WikiClassifier.PICKLE_FOLDER_PATH + '/' + lang_name + '.pickle'
        clf.pickle_to_file(file_path)
    wikishield_conn.close()
    t.toc()
    print("learn job summary: elapsed time = ", t.elapsed, "seconds")  # TODO: remove this
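# usage sketch (assumption, not an existing WikiShield API): loading a
# classifier pickled by the job above at scoring time. This assumes
# pickle_to_file() writes the object with the standard pickle module.
import pickle

def load_classifier(lang_name: str):
    """load a previously pickled WikiClassifier for the given language"""
    file_path = WikiClassifier.PICKLE_FOLDER_PATH + '/' + lang_name + '.pickle'
    with open(file_path, 'rb') as f:
        return pickle.load(f)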
def learn(self, limit: int = None):
    """
    train the classifier:
    * partition the training set into train and test sets
    * convert revisions to token counts using HashingVectorizer
    * fit the classifier on the train set

    param limit: limit on the number of revisions; if None the function
                 fetches all revisions from the dataset
    """
    sql_user_name = read_sql_user_name()
    wikishield_db = WikishieldDB(self.ctx, self.lang, sql_user_name)
    revs = wikishield_db.fetch_training_set(limit)
    X_text = np.asarray([rev[0] for rev in revs], dtype=object)
    y = np.asarray([rev[1] for rev in revs])
    self.vectorizer = HashingVectorizer(token_pattern=self.lang.extract_words_regex,
                                        decode_error='ignore',
                                        n_features=2 ** 18,
                                        alternate_sign=False)
    # the held-out 20% is currently discarded
    X_train_text, _, y_train, _ = train_test_split(X_text, y, test_size=0.2,
                                                   random_state=0, shuffle=True)
    X_train = self.vectorizer.fit_transform(X_train_text)
    self.classifier = SVC(probability=True)
    self.classifier.fit(X_train, y_train)
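# usage sketch (assumption, not an existing WikiShield API): scoring unseen
# revision texts with a classifier trained by learn(). HashingVectorizer is
# stateless, so transform() on new texts is consistent with training; picking
# the column for class label 1 assumes that label marks a bad revision.
def score_texts(clf, texts):
    """return the estimated probability that each text is a bad revision"""
    X = clf.vectorizer.transform(texts)
    probs = clf.classifier.predict_proba(X)  # shape: (len(texts), n_classes)
    return probs[:, list(clf.classifier.classes_).index(1)]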
def _get_wikishield_db(lang: Lang):
    """
    get Wikishield database

    param lang: language
    return: Wikishield database
    """
    sql_user_name = read_sql_user_name()
    wikishield_conn = WS(sql_user_name)
    wikishield_conn.start()
    return WikishieldDB(wikishield_conn.ctx, lang, sql_user_name)
def open_connections(lang: Lang):
    """
    open database connections

    param lang: language
    return: tuple of (Wikishield connection, Wikimedia connection)
    """
    sql_user_name = read_sql_user_name()
    wd = WD(sql_user_name, lang)
    wd._connect()
    ws = WS(sql_user_name)
    ws._connect()
    return ws, wd
def init_sources(source_ctx: Connection, dest_ctx: Connection, lang: Lang):
    """
    initialize sources wrappers

    param source_ctx: source context
    param dest_ctx: destination context
    param lang: language
    return: tuple of (wikimedia_db, wikishield_db, wikimedia_api)
    """
    sql_user_name = read_sql_user_name()
    wikimedia_db = WikimediaDB(source_ctx, lang)
    wikishield_db = WikishieldDB(dest_ctx, lang, sql_user_name)
    wikimedia_api = WikimediaApi(lang)
    return wikimedia_db, wikishield_db, wikimedia_api
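# usage sketch (assumption, not code from this repo): wiring the two helpers
# above together. `ws.ctx` mirrors the learn job's use of wikishield_conn.ctx;
# `wd.ctx` and wd.close() are assumed to exist symmetrically on the Wikimedia
# connection.
from lang.langs import LangsManager

lang = LangsManager().get_lang('en')
ws, wd = open_connections(lang)
try:
    wikimedia_db, wikishield_db, wikimedia_api = init_sources(wd.ctx, ws.ctx, lang)
    # ... read new revisions from wikimedia_db, score them, insert into wikishield_db ...
finally:
    ws.close()
    wd.close()  # assumed close() method, mirroring ws.close()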
# scratch snippets for manual testing

# run the add-revisions job for English
from lang.langs import LangsManager, Lang
from jobs.add_revs_job import AddRevsJob

lm = LangsManager()
lang = lm.get_lang('en')
arj = AddRevsJob(lang)
arj.start()

# run the web app (blocking)
import app
app.app.run(debug=False, use_reloader=False)

# open connections and fetch a revision diff from the Wikimedia API
from db.wikimedia_connection import DBConnection as WD
from db.connection_info import read_sql_user_name
import db.connections_manager as cm
from wiki_api.wikimedia_api import WikimediaApi

sql_user_name = read_sql_user_name()
lm = LangsManager()
lang = lm.get_lang('en')
ws, wd = cm.open_connections(lang)
wapi = WikimediaApi(lang)
diff_text, page_title = wapi.fetch_rev_diff(324651969, 324548952)
print(diff_text)