from typing import Any, Iterable, Union

from authordetect import Author


def writer2vec(
    data: Union[str, Iterable[str]],
    labels: Iterable[Any],
    outfiles: Union[str, Iterable[str]] = None,  # filenames for embedding matrices
    *,
    embedding: 'EmbeddingModel' = None,
    **kwargs,
):
    # Allow a single corpus/label pair to be passed directly.
    if isinstance(data, str):
        data = [data]
        labels = [labels]

    # Run the writer2vec pipeline once per author corpus.
    authors = []
    for corpus, label in zip(data, labels):
        author = Author(corpus, label)
        author.writer2vec(embedding=embedding, **kwargs)
        authors.append(author)

    # Collect document vectors and per-document labels.
    vectors = []
    out_labels = []
    for author in authors:
        vectors.append(author.dv)
        out_labels.append([author.label] * len(author.dv))

    # Optionally save each author's embedding matrix.
    if outfiles:
        if isinstance(outfiles, str):
            outfiles = [outfiles]
        for author, outfile in zip(authors, outfiles):
            if outfile:
                author.embedding.save(outfile)

    return vectors, out_labels
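# A minimal usage sketch for the wrapper above; the corpus files and the
# forwarded keyword arguments are illustrative, not prescribed values.
if __name__ == '__main__':
    vectors, labels = writer2vec(
        ['doyle.txt', 'rinehart.txt'],           # one corpus (file or text) per author
        [1, 0],                                  # one label per corpus
        outfiles=['doyle.bin', 'rinehart.bin'],  # optional saved embeddings
        part_size=350,                           # forwarded to Author.writer2vec
        seed=0,
    )
    # vectors[i]: document-embedding matrix for author i
    # labels[i]: author i's label repeated once per document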
from authordetect import Author


def describe_corpus(corpus: dict):
    """Print sentence, word, and character counts per story and in total."""
    total_sents = 0
    total_words = 0
    total_chars = 0
    for idx, (story, text) in enumerate(corpus.items(), start=1):
        author = Author(text)
        author.preprocess()
        print(f'{idx}.', story)
        print('\tSentences:', len(author.sentences))
        print('\tWords:', len(author.words))
        print('\tCharacters:', len(author.text))
        total_sents += len(author.sentences)
        total_words += len(author.words)
        total_chars += len(author.text)
    print('Total sentences:', total_sents)
    print('Total words:', total_words)
    print('Total characters:', total_chars)
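# A small usage sketch; the story titles and texts are made up for illustration.
if __name__ == '__main__':
    describe_corpus({
        'Story A': 'Holmes took his bottle from the corner of the mantelpiece.',
        'Story B': 'The game is afoot. We have not a moment to lose.',
    })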
import functools
import os
import sys
from typing import Iterable

# SmartTimer is assumed to come from the same project as Author.
from authordetect import Author, SmartTimer


def writer2vec(
    data: Iterable[str],
    labels: Iterable[int],
    *,
    verbose: bool = True,
    **kwargs,
):
    # Verbosity: route progress output to stdout, or discard it via os.devnull.
    fd = sys.stdout if verbose else open(os.devnull, 'w')
    vprint = functools.partial(print, file=fd)

    t = SmartTimer('Document Embedding Pipeline')

    authors = []
    for i, (corpus, label) in enumerate(zip(data, labels), start=1):
        t.tic(f'{i}: Load corpus')
        author = Author(corpus, label)
        authors.append(author)
        t.toc()
        vprint(f'Author {i}:')
        vprint('\tCharacter count:', len(author.corpus))

        t.tic(f'{i}: writer2vec')
        author.writer2vec(**kwargs)
        t.toc()
        vprint('\tCorpus sentences:', len(author.sentences))
        vprint('\tCorpus tokens:', len(author.words))
        vprint('\tCorpus vocabulary:', len(author.parsed.vocabulary))
        vprint('\tDocuments:', len(author.docs))
        vprint('\tDocument tokens:', author.docs[0].size)
        vprint('\tEmbedding vocabulary:', len(author.embedding.vocabulary))
        vprint('\tEmbedding matrix:', author.embedding.vectors.shape)
        vprint('\tDocuments embedding matrix:', author.docs_vectors.shape)

    vprint('Splitting train/test documents...')
    t.tic('Train/test documents split')
    vectors = []
    labels = []
    for author in authors:
        vectors.append(author.docs_vectors)
        labels.append([author.label] * len(author.docs_vectors))
    t.toc()

    vprint('writer2vec pipeline walltime (s):', t.walltime)
    vprint(t)
    return vectors, labels
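# Usage mirrors the wrapper shown earlier; pass verbose=False to silence the
# progress report (sketch; corpora and keyword arguments are illustrative):
# vectors, labels = writer2vec(['doyle.txt', 'christie.txt'], [1, 0],
#                              verbose=False, part_size=350, seed=0)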
from authordetect import Author, TextSpan
# EmbeddingModel and perturb_document_extended are assumed to be defined
# elsewhere in the project.


def perturb_author(corpus, embedding_file=None, tag=False, **kwargs):
    # Load a pre-trained embedding, if one was given, to guide word replacement.
    if embedding_file is None:
        embedding = None
    else:
        embedding_model = EmbeddingModel()
        embedding_model.load(embedding_file)
        embedding = embedding_model.model

    author = Author(corpus)
    author.preprocess()

    repl_words = perturb_document_extended(
        author.words_str, embedding=embedding, **kwargs
    )

    # Substitute perturbed words back into their original spans, optionally
    # tagging each replacement as <original|replacement>.
    words = author.words
    count = 0
    for i, (w, rw) in enumerate(zip(words, repl_words)):
        if str(w) != rw:
            word = f'<{w}|{rw}>' if tag else rw
            words[i] = TextSpan(word, w.span)
            count += 1

    return Author.substitute(author.corpus, words), count
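# Usage sketch; the corpus file and embedding file are illustrative paths:
# perturbed, n_replaced = perturb_author(
#     '../data/Doyle_10.txt',
#     embedding_file='../training/doyle_50dim_350part.bin',
#     tag=True,  # mark each substitution as <original|replacement>
# )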
# Tokenizer is assumed to come from the same project as Author.
from authordetect import Author, Tokenizer


def get_documents(corpus_and_labels, part_size: int = None):
    # Accept a bare corpus string as a single unlabeled corpus.
    if isinstance(corpus_and_labels, str):
        corpus_and_labels = [(corpus_and_labels, None)]

    docs = []
    for corpus, label in corpus_and_labels:
        author = Author(corpus, label)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        author.partition_into_documents(part_size)
        for doc in author.parsed_documents:
            words = doc.get_tokens()
            docs.append({
                'label': author.label,
                'text': words.substitute(author.text),
            })
    return docs
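# Usage sketch (hypothetical corpus files and labels):
# docs = get_documents([('doyle.txt', 1), ('rinehart.txt', 0)], part_size=350)
# docs[0]  # -> {'label': 1, 'text': '...'}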
}

mlp_params = {
    'solver': 'lbfgs',
    'alpha': 1e-5,
    'hidden_layer_sizes': (50,),
    'random_state': seed,
}

##############
# Processing #
##############

t = SmartTimer('Pipeline')

t.tic('Load corpus')
doyle = Author(doyle_infile)
rinehart = Author(rinehart_infile)
christie = Author(christie_infile)
t.toc()
print('Doyle corpus characters:', len(doyle.corpus))
print('Rinehart corpus characters:', len(rinehart.corpus))
print('Christie corpus characters:', len(christie.corpus))

# Names and object handles to enable looping through same operations
names = ['Doyle', 'Rinehart', 'Christie']
authors = [doyle, rinehart, christie]

for name, author in zip(names, authors):
    t.tic(f'{name}: writer2vec')
    author.writer2vec(
        tokenizer=Tokenizer(),
part_size = 3500
# Factor for capturing as much as possible from trailing text.
# Default is 1.0, but set to 0.1 because 350/3500 = 10% remains.
remain_factor = 0.1
test_size = 0.1
train_outfile = 'Doyle_90.txt'
test_outfile = 'Doyle_10.txt'

##############
# Processing #
##############

t = SmartTimer('10/90 Split')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('Preprocessing: Tokenizer')
a.preprocess(Tokenizer(lemmatizer=None))
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))

t.tic('Document partitioning')
a.partition_into_docs(part_size, remain_factor)
t.toc()
print('Documents:', len(a.docs))
print('Document tokens:', a.docs[0].size)
if len(sys.argv) < 4:
    print(f'Usage: {sys.argv[0]} lang infile outfile')
    print('lang (str): uk, us')
    print('infile (str): JSON file')
    print('outfile (str): JSON file')
    sys.exit()

lang, infile, outfile = sys.argv[1:]
print('Input file:', infile)
print('Output file:', outfile)

# Generate list of documents
docs = load_json(infile)
print('Total documents:', len(docs))

total_word_count = 0
total_repl_count = 0
perturb_freq_map = {}
for i, doc in enumerate(docs):
    perturbed_text, repl_count = translate(doc['text'], lang)
    author = Author(perturbed_text)
    author.preprocess(Tokenizer(lemmatizer='wordnet'))
    perturb_freq_map[i] = repl_count / len(author.words)
    total_repl_count += repl_count
    total_word_count += len(author.words)

print('Perturbation ratio:', total_repl_count / total_word_count)
print('Total replacement count:', total_repl_count)
print('Total word count:', total_word_count)

save_json(perturb_freq_map, outfile)
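# Example invocation (the script name is illustrative):
#   python translate_perturb.py uk documents.json perturb_freq.json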
import sys

from authordetect import Author, TextSpan, textutils, save_json

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} part_size infile [outfile]')
        print('part_size (int): number of words per partition')
        print('infile (str): Text file')
        print('outfile (str): JSON file')
        sys.exit()

    part_size = int(sys.argv[1])
    infile = sys.argv[2]
    outfile = sys.argv[3] if len(sys.argv) > 3 else None
    print('Input file:', infile)
    print('Output file:', outfile)

    author = Author(infile)
    author.preprocess()
    author.partition_into_documents(part_size)
    print('Documents:', len(author.documents))
    print('Document tokens:',
          author.parsed_documents[0].size,
          author.parsed_documents[-1].size)

    if outfile:
        text = [
            textutils.get_text_from_span(author.text, doc.span)
            for doc in author.parsed_documents
        ]
        save_json(text, outfile)
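# Example invocation (the script name is illustrative; 350-word partitions):
#   python partition_documents.py 350 ../data/Doyle_10.txt partitions.json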
######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
author_embedfile = 'model.bin'
part_size = 3500
workers = 1
seed = 0

##############
# Processing #
##############

t = SmartTimer('Pipeline')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('writer2vec')
a.writer2vec(
    tokenizer=Tokenizer(),
    part_size=part_size,
    workers=workers,
    seed=seed,
)
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))
print('Documents:', len(a.docs))
seed = 0
word = 'watson'
top = 14
n_components = 20

#-----------------------Embedding------------------------
# Method 1: Load existing embedding
embedding_file = '../training/doyle_50dim_350part.bin'
embedding = EmbeddingModel(embedding_file)

# Method 2: Compute embedding model
# embedding = None

#-----------------------Processing------------------------
# Load corpus
author = Author(infile)
print('Corpus characters:', len(author.text))

# Sentence segmentation and tokenization
author.preprocess(Tokenizer(lemmatizer='wordnet'))
print('Corpus sentences:', len(author.sentences))
print('Corpus tokens:', len(author.words))
print('Corpus vocabulary:', len(author.parsed_text.vocabulary))

# Create an author's word2vec embedding model
author.embed(embedding=embedding, seed=seed)
print('Embedding vocabulary:', len(author.embedding.vocabulary))
print('Embedding matrix:', author.embedding.vectors.shape)

# Read the size from the author's embedding so Method 2 (embedding = None)
# also works.
embedding_size = author.embedding.vectors.shape[1]
w2v_model = author.embedding.model  # access Gensim's Word2Vec directly
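# A short sketch of what the direct Gensim handle enables; most_similar is
# standard Gensim Word2Vec API, assuming 'watson' survived tokenization into
# the vocabulary:
print(w2v_model.wv.most_similar(word, topn=top))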
# Fix Python's string hash seed so that hashing is reproducible
# across Python interpreter processes.
import os
os.environ['PYTHONHASHSEED'] = str(0)

######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
workers = 1
seed = 0

##############
# Processing #
##############

# Load corpus
a = Author(infile)
print('Corpus characters:', len(a.corpus))

# Sentence segmentation and tokenization
a.preprocess(Tokenizer())
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))

# Create an author's word2vec embedding model
a.embed(workers=workers, seed=seed)
print('Embedding vocabulary:', len(a.model.vocabulary))
print('Embedding matrix:', a.model.vectors.shape)

# Access the embedding matrix
a.model.vectors
######################
# infile = 'https://www.gutenberg.org/files/244/244-0.txt'
# label = 1
infile = 'https://www.gutenberg.org/files/863/863-0.txt'
label = 0
part_size = 350
workers = 4
seed = None  # 0

##############
# Processing #
##############

t = SmartTimer('Pipeline')

t.tic('Load corpus')
author = Author(infile)
t.toc()
print('Corpus characters:', len(author.corpus))

t.tic('writer2vec')
author.writer2vec(
    tokenizer=Tokenizer(),
    stopwords=Tokenizer.STOPWORDS,
    part_size=part_size,
    workers=workers,
    seed=seed,
    use_norm=True,
)
t.toc()
print('Corpus sentences:', len(author.sentences))
    sys.exit()

part_size = int(sys.argv[1])
infile = sys.argv[2]
trainfile = sys.argv[3] if len(sys.argv) > 3 else None
testfile = sys.argv[4] if len(sys.argv) > 4 else None
test_size = float(sys.argv[5]) if len(sys.argv) > 5 else 0.1
# Factor for capturing as much as possible from trailing text.
# Default is 1.0, but 350/3500 = 10% remains when part_size is 3500.
remain_factor = float(sys.argv[6]) if len(sys.argv) > 6 else (350 / part_size)
seed = int(sys.argv[7]) if len(sys.argv) > 7 else 0
print('Input file:', infile)
print('Train file:', trainfile)
print('Test file:', testfile)

author = Author(infile)
author.preprocess()
author.partition_into_documents(part_size, remain_factor)
print('Documents:', len(author.documents))
print('Document tokens:',
      author.parsed_documents[0].size,
      author.parsed_documents[-1].size)

# Train/test splits
train_docs, test_docs = trainutils.split_data_into_train_test(
    author.parsed_documents,
    test_size=test_size,
    random_state=seed,
)
train_docs = TextSpan(train_docs)
test_docs = TextSpan(test_docs)
print('Training documents:', len(train_docs))
print('Training tokens:', train_docs[0].size, train_docs[-1].size)
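# Example invocation (script name and input path are illustrative; mirrors the
# 90/10 Doyle split used elsewhere in the project):
#   python train_test_split.py 3500 ../data/Doyle.txt Doyle_90.txt Doyle_10.txt 0.1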