# NOTE(review): this is an entire experiment script collapsed onto ONE physical
# line (newlines lost in extraction), so it is not runnable as-is: the inline
# "# set fig params" comment now swallows everything after it, and `logging`,
# `sb` (seaborn) and `gridspec` are used without visible imports — presumably
# imported in lines this chunk has lost.  `print corpus` is Python 2 syntax.
# Kept byte-identical; restore the original line breaks before fixing anything.
# Purpose (as far as visible): load the "caesar_dev" corpus, split it into
# dev/test sets controlled by author, then loop over vector-space models
# ('std', 'tf', 'tfidf') x distance metrics ('minmax', 'euclidean',
# 'cityblock') to fill a seaborn GridSpec figure; the loop body is cut off
# after the progress print.
import pandas as pd import numpy as np from scipy.stats import ks_2samp from verification.verification import Verification from verification.evaluation import evaluate from verification.preprocessing import prepare_corpus, split_corpus random_state = 1000 data_path = "../data/" corpus = "caesar_dev" n_dev_pairs = 500 n_features = 10000 logging.info("preparing corpus") X_dev, X_test = split_corpus(prepare_corpus(data_path + corpus), controlled="authors", random_state=random_state) print corpus vsms = ('std', 'tf', 'tfidf') dms = ('minmax', 'euclidean', 'cityblock') # set fig params fig = sb.plt.figure(figsize=(len(vsms), len(dms))) cnt = 0 outer_grid = gridspec.GridSpec(len(vsms), len(dms), wspace=0.1, hspace=0.1) c1, c2 = sb.color_palette("Set1")[:2] for vsm_cnt, vsm in enumerate(vsms): print("\t+ " + vsm)
# NOTE(review): variant of the script above for the "gr_articles" corpus
# (the commented-out alternative was "du_essays"), likewise collapsed onto one
# physical line and truncated — it ends on a dangling
# `for vsm_cnt, vsm in enumerate(vsms):` with no body.  Same latent issues:
# the inline "# set fig params" comment swallows the tail, `logging`/`sb`/
# `gridspec` have no visible imports, and `print corpus` is Python 2.
# Purpose (as far as visible): split the corpus by author, then build a
# first-baseline results DataFrame (one "vector space model" column plus one
# column per distance metric) over vsms ('std', 'tf', 'tfidf', 'bin', 'plm')
# x dms ('minmax', 'euclidean', 'cityblock') alongside a seaborn grid figure.
import pandas as pd import numpy as np from scipy.stats import ks_2samp from verification.verification import Verification from verification.evaluation import evaluate from verification.preprocessing import prepare_corpus, split_corpus random_state = 1000 data_path = "../data/" corpus = "gr_articles" #"du_essays" n_dev_pairs = 500 n_features = 10000 logging.info("preparing corpus") X_dev, X_test = split_corpus(prepare_corpus(data_path+corpus), controlled="authors", random_state=random_state) print corpus vsms = ('std', 'tf', 'tfidf', 'bin', 'plm') dms = ('minmax', 'euclidean', 'cityblock') # set fig params fig = sb.plt.figure(figsize=(len(vsms), len(dms))) cnt = 0 outer_grid = gridspec.GridSpec(len(vsms), len(dms), wspace=0.1, hspace=0.1) c1, c2 = sb.color_palette("Set1")[:2] # first baseline: df = pd.DataFrame(columns=["vector space model"]+list(dms)) for vsm_cnt, vsm in enumerate(vsms):
# NOTE(review): another script collapsed onto one physical line AND truncated
# mid-expression — it ends inside the `Verification(...)` constructor call
# (dangling `n_dev_pairs=1000000000000,`), so the remainder of the call and
# the script are missing from this view.  `logging`, `prepare_corpus` and
# `Verification` have no visible imports here; presumably imported in lost
# lines.  Kept byte-identical.
# Purpose (as far as visible): load the 'soldier_letters' corpus and, when
# the `fit` flag is truthy (it is hard-coded to 0 here, i.e. skipped), fit a
# Verification model with the best dev-phase parametrization: minmax metric,
# character 4-grams, 10000 features, no author/feature sampling.
# The bokeh imports suggest a plotting stage beyond the visible cut-off.
from sklearn.manifold import TSNE from sklearn.cluster import AgglomerativeClustering from bokeh.models import HoverTool, ColumnDataSource from bokeh.plotting import figure, show, output_file, save from bokeh.charts import Bar from bokeh.io import output_file, show, vplot, save from bokeh.plotting import figure from bokeh.models import Axis data_path = '../data/' corpus = 'soldier_letters' logging.info('preparing corpus') verif_dataset = prepare_corpus(data_path+corpus) fit = 0 if fit: """ We fit a vectorizer with the best parametrization we obtained during the development phase. """ verifier = Verification(random_state=1066, metric='minmax', feature_type='chars', ngram_range=4, sample_authors=False, sample_features=False, n_features=10000, n_dev_pairs=1000000000000,
# NOTE(review): script collapsed onto one physical line and cut off inside the
# per-letter loop (last visible statement is `random.shuffle(text)`; the
# halving step present in the near-duplicate script below is already gone).
# `Counter`, `random` and `prepare_corpus` are used without visible imports —
# presumably imported in lines this chunk has lost.  Kept byte-identical.
# Purpose (as far as visible): load the 'soldier_letters' corpus, keep only
# scribes ("authors") that appear more than once, then — seeded with
# random.seed(1072015) for reproducibility — jumble the words of each retained
# letter so it can be divided into two halves for development purposes; a
# `lookup` list / `lookup_idx` counter are set up for later convenience.
from sklearn.decomposition import PCA from sklearn.manifold import TSNE from sklearn.cluster import MiniBatchKMeans from bokeh.models import HoverTool, ColumnDataSource from bokeh.plotting import figure, show, output_file, save from bokeh.charts import Bar from bokeh.io import output_file, show, vplot, save from bokeh.plotting import figure from bokeh.models import Axis data_path = '../data/' corpus = 'soldier_letters' # we first prepare the corpus in the normal way: verif_dataset = prepare_corpus(data_path + corpus) # we check which scribes appear more than once: cnt = Counter(verif_dataset.authors) included_authors = set(a for a in cnt if cnt[a] > 1) # now, we jumble the words in each letter and # divide them into two halves (for development purposes): lookup = [] # for later convenience lookup_idx = 0 random.seed(1072015) texts, titles, authors = [], [], [] for text, title, author in zip(verif_dataset.texts, verif_dataset.titles, verif_dataset.authors): if author in included_authors: random.shuffle(text)
# NOTE(review): near-duplicate of the previous soldier_letters script (same
# one-physical-line mangling, same missing imports for `Counter`, `random`,
# `prepare_corpus`), truncated one statement later: after shuffling each
# retained letter's words it splits the word list into two equal halves
# `text_a` / `text_b` (int(len/2.0) floor split — odd-length letters put the
# extra word in `text_b`).  Whatever is done with the halves is beyond the
# visible cut-off.  Kept byte-identical; consider deduplicating these two
# scripts once the originals are recovered.
from sklearn.decomposition import PCA from sklearn.manifold import TSNE from sklearn.cluster import MiniBatchKMeans from bokeh.models import HoverTool, ColumnDataSource from bokeh.plotting import figure, show, output_file, save from bokeh.charts import Bar from bokeh.io import output_file, show, vplot, save from bokeh.plotting import figure from bokeh.models import Axis data_path = '../data/' corpus = 'soldier_letters' # we first prepare the corpus in the normal way: verif_dataset = prepare_corpus(data_path+corpus) # we check which scribes appear more than once: cnt = Counter(verif_dataset.authors) included_authors = set(a for a in cnt if cnt[a] > 1) # now, we jumble the words in each letter and # divide them into two halves (for development purposes): lookup = [] # for later convenience lookup_idx = 0 random.seed(1072015) texts, titles, authors = [], [], [] for text, title, author in zip(verif_dataset.texts, verif_dataset.titles, verif_dataset.authors): if author in included_authors: random.shuffle(text) text_a, text_b = text[:int(len(text)/2.0)], text[int(len(text)/2.0):]
# NOTE(review): single-configuration Caesar experiment, collapsed onto one
# physical line and truncated inside the `Verification(...)` call (dangling
# `balanced_pairs=False,`).  `logging` has no visible import; `print dm` /
# `print vsm` are Python 2 syntax.  Kept byte-identical.
# NOTE(review): `dev = "../data/caesar_dev"` is assigned but never used —
# BOTH `X_dev` and `X_test` are built from `prepare_corpus(test)`.  This
# looks like a copy-paste bug (`X_dev = prepare_corpus(dev)` was probably
# intended) — confirm against the original script before relying on results.
# Purpose (as far as visible): run one verifier with metric 'minmax', vector
# space model 'tf', 10000 features, no dev pairs and (effectively) all test
# pairs (n_test_pairs=99999999).
from verification.verification import Verification from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score from verification.evaluation import rank_predict from verification.plotting import draw_tree from verification.preprocessing import prepare_corpus, Dataset from sklearn.cross_validation import train_test_split import numpy as np import pandas as pd # select a data set dev = "../data/caesar_dev" test = "../data/caesar_test" # we prepare the corpus logging.info("preparing corpus") X_dev = prepare_corpus(test) X_test = prepare_corpus(test) dm = 'minmax' vsm = 'tf' print dm print vsm verifier = Verification(random_state=1000, metric=dm, n_features=10000, n_dev_pairs=0, n_test_pairs=99999999, vector_space_model=vsm, balanced_pairs=False,
# NOTE(review): grid-search variant over the caesar_test corpus, collapsed
# onto one physical line and truncated inside the innermost
# `Verification(...)` call (dangling `metric=dm,`).  `logging`, `plt`
# (matplotlib.pyplot) and `gridspec` are used without visible imports —
# presumably imported in lost lines; `print dm` / `print vsm` are Python 2.
# Kept byte-identical.
# Purpose (as far as visible): for every distance metric in ('minmax',
# 'euclidean', 'cityblock') x vector space model in ('std', 'plm', 'tf'),
# build a Verification instance (random_state=1000, sample_features=False)
# and fill one cell of a len(dms) x len(vsms) GridSpec figure.
from verification.verification import Verification from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score from verification.evaluation import rank_predict from verification.plotting import draw_tree from verification.preprocessing import prepare_corpus, Dataset from sklearn.cross_validation import train_test_split import numpy as np import pandas as pd # select a data set dev = "../data/caesar_test" # we prepare the corpus logging.info("preparing corpus") X_dev = prepare_corpus(dev) dms = ('minmax', 'euclidean', 'cityblock') vsms = ('std', 'plm', 'tf') fig = plt.figure() cnt = 0 outer_grid = gridspec.GridSpec(len(dms), len(vsms)) for dm_cnt, dm in enumerate(dms): print dm for vsm_cnt, vsm in enumerate(vsms): print vsm verifier = Verification(random_state=1000, sample_features=False, metric=dm,
# NOTE(review): byte-for-byte duplicate of the single-configuration Caesar
# script two lines above, truncated one keyword argument earlier (it ends at
# `vector_space_model=vsm,` and lacks the `balanced_pairs=False,` the other
# copy shows).  The same probable bug applies: `dev` is assigned but unused
# and `X_dev` is built from `prepare_corpus(test)` — confirm whether
# `prepare_corpus(dev)` was intended.  `logging` has no visible import;
# `print dm` / `print vsm` are Python 2.  Kept byte-identical; these
# duplicated chunks should be reconciled once the original files are
# recovered.
from verification.verification import Verification from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score from verification.evaluation import rank_predict from verification.plotting import draw_tree from verification.preprocessing import prepare_corpus, Dataset from sklearn.cross_validation import train_test_split import numpy as np import pandas as pd # select a data set dev = "../data/caesar_dev" test = "../data/caesar_test" # we prepare the corpus logging.info("preparing corpus") X_dev = prepare_corpus(test) X_test = prepare_corpus(test) dm = 'minmax' vsm = 'tf' print dm print vsm verifier = Verification(random_state=1000, metric=dm, n_features=10000, n_dev_pairs=0, n_test_pairs=99999999, vector_space_model=vsm,
# NOTE(review): byte-for-byte duplicate of the caesar_test grid script two
# lines above, truncated one keyword argument earlier (it ends at
# `sample_features=False,` and lacks the `metric=dm,` the other copy shows).
# Same visible issues: one-physical-line mangling, `logging`/`plt`/`gridspec`
# used without visible imports, Python 2 `print` statements.  Kept
# byte-identical; reconcile with the other copy once the originals are
# recovered.
from verification.verification import Verification from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score from verification.evaluation import rank_predict from verification.plotting import draw_tree from verification.preprocessing import prepare_corpus, Dataset from sklearn.cross_validation import train_test_split import numpy as np import pandas as pd # select a data set dev = "../data/caesar_test" # we prepare the corpus logging.info("preparing corpus") X_dev = prepare_corpus(dev) dms = ('minmax', 'euclidean', 'cityblock') vsms = ('std', 'plm', 'tf') fig = plt.figure() cnt = 0 outer_grid = gridspec.GridSpec(len(dms), len(vsms)) for dm_cnt, dm in enumerate(dms): print dm for vsm_cnt, vsm in enumerate(vsms): print vsm verifier = Verification(random_state=1000, sample_features=False,