def train_doc2vec_curia(min_count, epoch_num, embedding_dim, learning_rate):
    """Trains a gensim Doc2Vec model on the 'Judgment' documents.

    The trained model is saved under ``trained_models`` using the file name
    stored in ``helpers.setup_json['doc2vec_path']``; afterwards the document
    embeddings are written through ``save_doc_embeddings_doc2vec``.

    Args:
        min_count: forwarded to Doc2Vec as ``min_count``.
        epoch_num: forwarded to Doc2Vec as ``iter``.
        embedding_dim: forwarded to Doc2Vec as ``size``.
        learning_rate: forwarded to Doc2Vec as ``alpha``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    model_path = os.path.join(
        'trained_models', helpers.setup_json['doc2vec_path'])

    # Flatten every document's content by one level and tag it with its
    # position in the iteration order.
    tagged_docs = []
    for tag, content in enumerate(ContentGenerator(judgments)):
        flat_content = list(chain.from_iterable(content))
        tagged_docs.append(
            gensim.models.doc2vec.TaggedDocument(flat_content, [tag]))

    print('Initializing and training model...')
    # NOTE(review): `iter` and `size` are the pre-4.0 gensim keyword names
    # (later renamed `epochs` / `vector_size`) -- confirm the pinned version.
    model = gensim.models.Doc2Vec(
        documents=tagged_docs,
        iter=epoch_num,
        size=embedding_dim,
        window=3,
        dm=1,
        min_count=min_count,
        negative=5,
        workers=4,
        alpha=learning_rate,
    )

    # Persist the final model before deriving the embeddings from it.
    model.save(model_path)

    print('Saving document embeddings...')
    save_doc_embeddings_doc2vec('doc2vec.pickle', model)
def train_word2vec_curia(min_count, epoch_num, embedding_dim, learning_rate):
    """Trains a gensim Word2Vec model on the 'Judgment' documents.

    The word vectors are saved in binary word2vec format under
    ``trained_models`` using the file name stored in
    ``helpers.setup_json['word2vec_path']``; afterwards the document
    embeddings are written through ``save_doc_embeddings_word2vec``.

    Args:
        min_count: forwarded to Word2Vec as ``min_count``.
        epoch_num: forwarded to Word2Vec as ``iter``.
        embedding_dim: forwarded to Word2Vec as ``size``.
        learning_rate: forwarded to Word2Vec as ``alpha``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    model_path = os.path.join(
        'trained_models', helpers.setup_json['word2vec_path'])

    # Materialize everything at once: the items of every document's content
    # are concatenated into one flat list, which is what Word2Vec receives
    # as its `sentences`.
    sentences = [
        sentence
        for content in ContentGenerator(judgments)
        for sentence in content
    ]

    print('Initializing and training model...')
    # NOTE(review): `iter` and `size` are the pre-4.0 gensim keyword names
    # (later renamed `epochs` / `vector_size`) -- confirm the pinned version.
    model = gensim.models.Word2Vec(
        sentences=sentences,
        iter=epoch_num,
        size=embedding_dim,
        window=3,
        sg=1,
        min_count=min_count,
        negative=5,
        workers=4,
        alpha=learning_rate,
    )

    # Only the keyed vectors are persisted, not the full trainable model.
    model.wv.save_word2vec_format(model_path, binary=True)

    print('Saving document embeddings...')
    save_doc_embeddings_word2vec('word2vec.pickle', model)
def train_lsi_curia(embedding_dim):
    """Trains a gensim LSI model (on TF-IDF weights) for 'Judgment' documents.

    Three artifacts are saved next to each other: the dictionary
    (``<stem>_dict.bin``), the TF-IDF model (``<stem>_tfidf.bin``) and the
    LSI model itself, where the model path is ``trained_models`` joined with
    ``helpers.setup_json['lsi_path']`` and ``<stem>`` is that path without
    its extension. Afterwards the document embeddings are written through
    ``save_doc_embeddings_lsi``.

    Args:
        embedding_dim: forwarded to LsiModel as ``num_topics``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    model_path = os.path.join(
        'trained_models', helpers.setup_json['lsi_path'])

    # One flat list per document (content flattened by one level).
    flat_docs = []
    for content in ContentGenerator(judgments):
        flat_docs.append(list(chain.from_iterable(content)))

    print('Initializing and training model...')
    dictionary = gensim.corpora.Dictionary(flat_docs)
    bow_corpus = list(map(dictionary.doc2bow, flat_docs))
    tfidf = gensim.models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]
    model = gensim.models.LsiModel(
        tfidf_corpus, id2word=dictionary, num_topics=embedding_dim)

    # Save the final versions; the helper models share the LSI path's stem.
    path_stem = os.path.splitext(model_path)[0]
    dictionary.save(path_stem + '_dict.bin')
    tfidf.save(path_stem + '_tfidf.bin')
    model.save(model_path)

    print('Saving document embeddings...')
    save_doc_embeddings_lsi('lsi.pickle', model, dictionary, tfidf)
def save_doc_embeddings_word2vec(file_name, model):
    """Saves document embeddings in a file using the provided word2vec or
    fasttext model.

    One ``{'doc_id': ..., 'emb': ...}`` record is produced per 'Judgment'
    document and the resulting list is pickled to
    ``saved_embeddings/<file_name>``.
    """
    judgments = table_docs.get_docs_with_names(['Judgment'])
    content_gen = ContentGenerator(judgments)

    helpers.create_folder_if_not_exists('saved_embeddings')
    target_path = os.path.join('saved_embeddings', file_name)

    # Documents and generated contents are paired positionally.
    records = [
        {
            'doc_id': doc['id'],
            'emb': get_embedding_doc_word2vec(
                content, model, stopword_removal=True),
        }
        for doc, content in zip(judgments, content_gen)
    ]

    with open(target_path, 'wb') as out_file:
        pickle.dump(records, out_file)
def save_doc_embeddings_lsi(file_name, model, dictionary, tfidf):
    """Saves document embeddings in a file using the provided lsi model.

    One ``{'doc_id': ..., 'emb': ...}`` record is produced per 'Judgment'
    document and the resulting list is pickled to
    ``saved_embeddings/<file_name>``.
    """
    judgments = table_docs.get_docs_with_names(['Judgment'])

    # Flatten each document's content by one level before embedding it.
    flat_docs = list(
        map(list, map(chain.from_iterable, ContentGenerator(judgments))))

    helpers.create_folder_if_not_exists('saved_embeddings')
    target_path = os.path.join('saved_embeddings', file_name)

    # Documents and flattened contents are paired positionally.
    records = [
        {
            'doc_id': doc['id'],
            'emb': get_embedding_doc_lsi(flat, model, dictionary, tfidf),
        }
        for doc, flat in zip(judgments, flat_docs)
    ]

    with open(target_path, 'wb') as out_file:
        pickle.dump(records, out_file)
import argparse
from itertools import chain
from lazylawyer.content_generator import ContentGenerator
from lazylawyer.database import table_cases, table_docs, table_doc_contents
from lazylawyer import helpers
import numpy as np
import os
import pickle
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer

# set general model path
# NOTE: both statements run at import time -- the 'trained_models' folder is
# created (if missing) and the classifier path is resolved from
# helpers.setup_json as soon as this module is loaded.
helpers.create_folder_if_not_exists('trained_models')
model_path = os.path.join('trained_models',
                          helpers.setup_json['subject_classifier_path'])


def _identity_func(x):
    """Returns ``x`` unchanged."""
    return x


def _multiply_func(x):
    """Returns ``x`` multiplied by 1000."""
    return x * 1000


def generate_class_labels():
    # Builds the path of the class-labels file inside 'trained_models' from
    # helpers.setup_json['subject_classifier_labels_path'].
    # NOTE(review): only this first statement of the function is visible in
    # this view -- TODO confirm how labels_path is used afterwards.
    labels_path = os.path.join(
        'trained_models',
        helpers.setup_json['subject_classifier_labels_path'])
from lazylawyer import helpers
import os
from pathlib import Path
import requests


def download_doc_for_case(case, doc):
    """Downloads a document belonging to a specific case from the web.

    The target folder is ``doc_dir/<folder>``, where ``<folder>`` is
    ``helpers.case_name_to_folder(case['name'])``; it is created if missing
    even when there is nothing to download. The file is stored there as
    ``<doc id>.<doc format>``. Documents whose ``'link'`` is ``None`` are
    skipped.
    """
    case_folder = helpers.case_name_to_folder(case['name'])
    folder_path = Path('doc_dir/' + case_folder)
    helpers.create_folder_if_not_exists(folder_path)

    doc_filename = '.'.join((str(doc['id']), doc['format']))

    link = doc['link']
    if link is None:
        # Nothing to fetch for this document.
        return
    helpers.download_file(link, folder_path / doc_filename)