Example #1
    def start(self):
        """
        start the job

        the job includes the following things:

            * fetch new unverified revisions
            * score this revisions
            * filter all suspected bad revisions
            * insert revisions to table
        """

        t = TicToc()
        t.tic()

        sql_user_name = read_sql_user_name()
        wikishield_conn = WS(sql_user_name)
        wikishield_conn.start()
        lm = LangsManager()
        for lang_name in lm.langs_names:
            lang = lm.get_lang(lang_name)
            clf = WikiClassifier(lang, wikishield_conn.ctx)
            clf.learn(limit=None)
            file_path = WikiClassifier.PICKLE_FOLDER_PATH + '/' + lang_name + '.pickle'
            clf.pickle_to_file(file_path)

        wikishield_conn.close()

        t.toc()
        print("learn job summary: elapsed time = ", t.elapsed,
              "seconds")  #TODO: remove this

    def learn(self, limit: int = None):
        """
        partition the revisions into train and test sets

        hash the revision texts into token-count features using HashingVectorizer

        fit the classifier on the train set

        param limit: maximum number of revisions to fetch; if None, fetch all revisions from the dataset
        """

        sql_user_name = read_sql_user_name()
        wikishield_db = WikishieldDB(self.ctx, self.lang, sql_user_name)
        revs = wikishield_db.fetch_training_set(limit)

        X_text = np.asarray([rev[0] for rev in revs], dtype=object)
        y = np.asarray([rev[1] for rev in revs])

        self.vectorizer = HashingVectorizer(token_pattern=self.lang.extract_words_regex, decode_error='ignore',
                                            n_features=2 ** 18, alternate_sign=False)

        X_train_text, _, y_train, _ = train_test_split(X_text, y, test_size=0.2, random_state=0, shuffle=True)
        X_train = self.vectorizer.fit_transform(X_train_text)

        self.classifier = SVC(probability=True)

        self.classifier.fit(X_train, y_train)
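
# A minimal scoring sketch (not part of the example above): it assumes the
# pickled object is the WikiClassifier itself, that `vectorizer` and
# `classifier` are the attributes set in learn(), and that labels are 0/1
# with 1 marking a bad revision.
import pickle

def score_revisions(lang_name, texts):
    """Hypothetical helper: return bad-revision probabilities for raw texts."""
    file_path = WikiClassifier.PICKLE_FOLDER_PATH + '/' + lang_name + '.pickle'
    with open(file_path, 'rb') as f:
        clf = pickle.load(f)
    # HashingVectorizer is stateless, so transform() works without refitting
    X = clf.vectorizer.transform(texts)
    return clf.classifier.predict_proba(X)[:, 1]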
Example #3
def _get_wikishield_db(lang: Lang):
    """
    get Wikishield database

    param lang: language

    return Wikishield database
    """

    sql_user_name = read_sql_user_name()
    wikishield_conn = WS(sql_user_name)
    wikishield_conn.start()
    return WikishieldDB(wikishield_conn.ctx, lang, sql_user_name)
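
# Usage sketch for the helper above, assuming LangsManager.get_lang works as
# in Example #6 and that fetch_training_set is the same method learn() calls
# (returning (text, label) pairs).
lm = LangsManager()
db = _get_wikishield_db(lm.get_lang('en'))
revs = db.fetch_training_set(100)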
Example #4
def open_connections(lang: Lang):
    """
    open databases connections

    param lang: language

    return tuple of (Wikishield connection, Wikimedia connection)
    """

    sql_user_name = read_sql_user_name()
    
    wd = WD(sql_user_name, lang)
    wd._connect()
    ws = WS(sql_user_name)
    ws._connect()
    return ws, wd
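
# Usage sketch pairing open_connections with cleanup. close() mirrors
# wikishield_conn.close() in Example #1; that the Wikimedia connection closes
# the same way is an assumption.
lm = LangsManager()
lang = lm.get_lang('en')
ws, wd = open_connections(lang)
try:
    pass  # query through ws.ctx / wd.ctx here
finally:
    ws.close()
    wd.close()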
Example #5
def init_sources(source_ctx: Connection, dest_ctx: Connection, lang: Lang):
    """
    initialize sources wrappers

    param source_ctx: source context

    param dest_ctx: destination context

    param lang: language

    return: tuple of (wikimedia_db, wikishield_db, wikimedia_api)
    """

    sql_user_name = read_sql_user_name()
    wikimedia_db = WikimediaDB(source_ctx, lang)
    wikishield_db = WikishieldDB(dest_ctx, lang, sql_user_name)
    wikimedia_api = WikimediaApi(lang)
    return wikimedia_db, wikishield_db, wikimedia_api
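
# Sketch tying Examples #4 and #5 together: the Wikimedia connection supplies
# the source context and the Wikishield connection the destination one, per
# the parameter names; that both connections expose a `ctx` attribute is
# assumed from wikishield_conn.ctx in Example #1.
ws, wd = open_connections(lang)
wikimedia_db, wikishield_db, wikimedia_api = init_sources(wd.ctx, ws.ctx, lang)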
Example #6
from lang.langs import LangsManager
from jobs.add_revs_job import AddRevsJob
import db.connections_manager as cm
from wiki_api.wikimedia_api import WikimediaApi
import app

lm = LangsManager()
lang = lm.get_lang('en')

# run the add-revisions job for English
arj = AddRevsJob(lang)
arj.start()

# open the Wikishield and Wikimedia database connections
ws, wd = cm.open_connections(lang)

# fetch and print the diff between two revisions
wapi = WikimediaApi(lang)
diff_text, page_title = wapi.fetch_rev_diff(324651969, 324548952)
print(diff_text)

# start the web app last: run() blocks until the server stops
app.app.run(debug=False, use_reloader=False)