Example #1
def writer2vec(
    data: Union[str, Iterable[str]],
    labels: Iterable[Any],
    outfiles: Union[str, Iterable[str]] = None,  # filenames for embedding matrices
    *,
    embedding: 'EmbeddingModel' = None,
    **kwargs,
):
    # Allow passing a single corpus string and its label directly
    if isinstance(data, str):
        data = [data]
        labels = [labels]

    authors = []
    for i, (corpus, label) in enumerate(zip(data, labels), start=1):
        author = Author(corpus, label)
        author.writer2vec(embedding=embedding, **kwargs)
        authors.append(author)

    # Collect each author's document vectors and build matching label lists
    vectors = []
    labels = []
    for author in authors:
        vectors.append(author.dv)
        labels.append([author.label] * len(author.dv))

    if outfiles:
        if isinstance(outfiles, str):
            outfiles = [outfiles]
        for author, outfile in zip(authors, outfiles):
            if outfile:
                author.embedding.save(outfile)

    return vectors, labels
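A minimal usage sketch of this helper, assuming the file-based corpora and the writer2vec keyword arguments (part_size, workers, seed) seen in the other examples on this page; the file names are placeholders.

# Hypothetical corpora and settings, mirroring the later examples on this page.
vectors, labels = writer2vec(
    ['doyle.txt', 'rinehart.txt'],   # one corpus (file path or text) per author
    [1, 0],                          # one label per corpus
    outfiles=['doyle_w2v.bin', 'rinehart_w2v.bin'],
    part_size=350,
    workers=1,
    seed=0,
)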
Example #2
def describe_corpus(corpus: dict):
    total_sents = 0
    total_words = 0
    total_chars = 0
    for idx, (story, text) in enumerate(corpus.items(), start=1):
        author = Author(text)
        author.preprocess()
        print(f'{idx}.', story)
        print('\tSentences:', len(author.sentences))
        print('\tWords:', len(author.words))
        print('\tCharacters:', len(author.text))
        total_sents += len(author.sentences)
        total_words += len(author.words)
        total_chars += len(author.text)
    print('Total sentences:', total_sents)
    print('Total words:', total_words)
    print('Total characters:', total_chars)
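For reference, a sketch of the expected input shape: the corpus argument maps a story title to its raw text (titles and file names below are placeholders).

# Hypothetical corpus: story title -> raw text
describe_corpus({
    'Story A': open('story_a.txt').read(),
    'Story B': open('story_b.txt').read(),
})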
def writer2vec(
    data: Iterable[str],
    labels: Iterable[int],
    *,
    verbose: bool = True,
    **kwargs,
):
    # Verbosity
    fd = sys.stdout if verbose else open(os.devnull, 'w')
    vprint = functools.partial(print, file=fd)

    t = SmartTimer('Document Embedding Pipeline')
    authors = []
    for i, (corpus, label) in enumerate(zip(data, labels), start=1):
        t.tic(f'{i}: Load corpus')
        author = Author(corpus, label)
        authors.append(author)
        t.toc()
        vprint(f'Author {i}:')
        vprint('\tCharacter count:', len(author.corpus))

        t.tic(f'{i}: writer2vec')
        author.writer2vec(**kwargs)
        t.toc()
        vprint('\tCorpus sentences:', len(author.sentences))
        vprint('\tCorpus tokens:', len(author.words))
        vprint('\tCorpus vocabulary:', len(author.parsed.vocabulary))
        vprint('\tDocuments:', len(author.docs))
        vprint('\tDocument tokens:', author.docs[0].size)
        vprint('\tEmbedding vocabulary:', len(author.embedding.vocabulary))
        vprint('\tEmbedding matrix:', author.embedding.vectors.shape)
        vprint('\tDocuments embedding matrix:', author.docs_vectors.shape)

    vprint('Splitting train/test documents...')
    t.tic('Train/test documents split')
    vectors = []
    labels = []
    for author in authors:
        vectors.append(author.docs_vectors)
        labels.append([author.label] * len(author.docs_vectors))
    t.toc()

    vprint('writer2vec pipeline walltime (s):', t.walltime)
    vprint(t)

    return vectors, labels
Example #4
def perturb_author(corpus, embedding_file=None, tag=False, **kwargs):
    if embedding_file is None:
        embedding = None
    else:
        embedding_model = EmbeddingModel()
        embedding_model.load(embedding_file)
        embedding = embedding_model.model

    author = Author(corpus)
    author.preprocess()
    repl_words = perturb_document_extended(author.words_str, embedding=embedding, **kwargs)
    words = author.words
    count = 0
    for i, (w, rw) in enumerate(zip(words, repl_words)):
        if str(w) != rw:
            word = f'<{w}|{rw}>' if tag else rw
            words[i] = TextSpan(word, w.span)
            count += 1
    return Author.substitute(author.corpus, words), count
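A hedged call sketch; the corpus and embedding file names are placeholders, and any extra keyword arguments are forwarded to perturb_document_extended.

# Hypothetical inputs; extra kwargs pass through to perturb_document_extended.
perturbed_text, n_replaced = perturb_author(
    'doyle.txt',
    embedding_file='doyle_50dim_350part.bin',
    tag=True,  # mark substitutions as <original|replacement>
)
print('Replacements:', n_replaced)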
def get_documents(corpus_and_labels, part_size: int = None):
    if isinstance(corpus_and_labels, str):
        corpus_and_labels = [(corpus_and_labels, None)]
    docs = []
    for corpus, label in corpus_and_labels:
        author = Author(corpus, label)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        author.partition_into_documents(part_size)
        for doc in author.parsed_documents:
            words = doc.get_tokens()
            docs.append({
                'label': author.label,
                'text': words.substitute(author.text),
            })
    return docs
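A usage sketch with placeholder file names, pairing each corpus with its author label.

# Hypothetical corpora; each entry is a (corpus, label) pair.
docs = get_documents(
    [('doyle.txt', 1), ('rinehart.txt', 0)],
    part_size=350,
)
print('Total documents:', len(docs))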

mlp_params = {
    'solver': 'lbfgs',
    'alpha': 1e-5,
    'hidden_layer_sizes': (50, ),
    'random_state': seed,
}
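These settings match the scikit-learn MLPClassifier constructor, so presumably they are consumed as sketched below; the train/test arrays are placeholders that are not defined in this excerpt.

# Presumed use of mlp_params with scikit-learn (training arrays are placeholders).
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(**mlp_params)
# mlp.fit(train_vectors, train_labels)
# print('Accuracy:', mlp.score(test_vectors, test_labels))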

##############
# Processing #
##############
t = SmartTimer('Pipeline')

t.tic('Load corpus')
doyle = Author(doyle_infile)
rinehart = Author(rinehart_infile)
christie = Author(christie_infile)
t.toc()
print('Doyle corpus characters:', len(doyle.corpus))
print('Rinehart corpus characters:', len(rinehart.corpus))
print('Christie corpus characters:', len(christie.corpus))

# Names and object handles so the same operations can be applied in a loop
names = ['Doyle', 'Rinehart', 'Christie']
authors = [doyle, rinehart, christie]

for name, author in zip(names, authors):
    t.tic(f'{name}: writer2vec')
    author.writer2vec(
        tokenizer=Tokenizer(),
part_size = 3500
# Factor for capturing as much as possible from trailing text
# Default is 1.0, but set to 0.1 here because 350/3500 = 10%
remain_factor = 0.1

test_size = 0.1
train_outfile = 'Doyle_90.txt'
test_outfile = 'Doyle_10.txt'

##############
# Processing #
##############
t = SmartTimer('10/90 Split')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('Preprocessing: Tokenizer')
a.preprocess(Tokenizer(lemmatizer=None))
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))

t.tic('Document partitioning')
a.partition_into_docs(part_size, remain_factor)
t.toc()
print('Documents:', len(a.docs))
print('Document tokens:', a.docs[0].size)
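The train_outfile and test_outfile settings above are not used in this excerpt; below is a hedged sketch of the remaining split-and-save step, reusing the trainutils.split_data_into_train_test and textutils.get_text_from_span calls shown in the later examples on this page. It assumes those modules are imported, that each partition exposes a .span attribute, and a random_state of 0.

# Hedged continuation: split the partitions and write each side back to text.
train_docs, test_docs = trainutils.split_data_into_train_test(
    a.docs, test_size=test_size, random_state=0,
)
with open(train_outfile, 'w') as fd:
    fd.write('\n'.join(textutils.get_text_from_span(a.text, d.span) for d in train_docs))
with open(test_outfile, 'w') as fd:
    fd.write('\n'.join(textutils.get_text_from_span(a.text, d.span) for d in test_docs))
print('Training documents:', len(train_docs))
print('Test documents:', len(test_docs))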
Example #8
        print(f'Usage: {sys.argv[0]} lang infile outfile')
        print('lang (str): uk, us')
        print('infile (str): JSON file')
        print('outfile (str): JSON file')
        sys.exit()

    lang, infile, outfile = sys.argv[1:]
    print('Input file:', infile)
    print('Output file:', outfile)

    # Generate list of documents
    docs = load_json(infile)
    print('Total documents:', len(docs))

    total_word_count = 0
    total_repl_count = 0
    perturb_freq_map = {}
    for i, doc in enumerate(docs):
        perturbed_text, repl_count = translate(doc['text'], lang)
        author = Author(perturbed_text)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        perturb_freq_map[i] = repl_count / len(author.words)

        total_repl_count += repl_count
        total_word_count += len(author.words)

    print('Perturbation ratio:', total_repl_count / total_word_count)
    print('Total replacement count:', total_repl_count)
    print('Total word count:', total_word_count)
    save_json(perturb_freq_map, outfile)
import sys

from authordetect import Author, TextSpan, textutils, save_json

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} part_size infile [outfile]')
        print('part_size (int): number of words per partition')
        print('infile (str): Text file')
        print('outfile (str): JSON file')
        print('remain_factor (float): Partition remainder fraction')
        sys.exit()

    part_size = int(sys.argv[1])
    infile = sys.argv[2]
    outfile = sys.argv[3] if len(sys.argv) > 3 else None
    print('Input file:', infile)
    print('Output file:', outfile)

    author = Author(infile)
    author.preprocess()
    author.partition_into_documents(part_size)
    print('Documents:', len(author.documents))
    print('Document tokens:', author.parsed_documents[0].size,
          author.parsed_documents[-1].size)

    if outfile:
        text = [
            textutils.get_text_from_span(author.text, doc.span)
            for doc in author.parsed_documents
        ]
        save_json(text, outfile)
Example #10
######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
author_embedfile = 'model.bin'
part_size = 3500
workers = 1
seed = 0

##############
# Processing #
##############
t = SmartTimer('Pipeline')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('writer2vec')
a.writer2vec(
    tokenizer=Tokenizer(),
    part_size=part_size,
    workers=workers,
    seed=seed,
)
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))
print('Documents:', len(a.docs))
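A hedged follow-up that inspects and saves the result; the attribute names mirror those used in the pipeline example near the top of this page and may differ between library versions.

# Inspect the word and document embeddings produced by writer2vec.
print('Embedding vocabulary:', len(a.embedding.vocabulary))
print('Embedding matrix:', a.embedding.vectors.shape)
print('Document embedding matrix:', a.docs_vectors.shape)

# Persist the word embedding for reuse (author_embedfile is defined above).
a.embedding.save(author_embedfile)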
Example #11
seed = 0
word = 'watson'
top = 14
n_components = 20

#-----------------------Embedding------------------------
# Method 1: Load existing embedding
embedding_file = '../training/doyle_50dim_350part.bin'
embedding = EmbeddingModel(embedding_file)

# Method 2: Compute embedding model
# embedding = None

#-----------------------Processing------------------------
# Load corpus
author = Author(infile)
print('Corpus characters:', len(author.text))

# Sentence segmentation and tokenization
author.preprocess(Tokenizer(lemmatizer='wordnet'))
print('Corpus sentences:', len(author.sentences))
print('Corpus tokens:', len(author.words))
print('Corpus vocabulary:', len(author.parsed_text.vocabulary))

# Create an author's word2vec embedding model
author.embed(embedding=embedding, seed=seed)
print('Embedding vocabulary:', len(author.embedding.vocabulary))
print('Embedding matrix:', author.embedding.vectors.shape)

embedding_size = author.embedding.vectors.shape[1]  # works whether the embedding was loaded (Method 1) or computed (Method 2)
w2v_model = author.embedding.model  # access Gensim's Word2Vec directly
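The word, top, and n_components settings above are unused in this excerpt; presumably they drive a similarity query and a projection of the embedding matrix. A minimal sketch assuming Gensim's Word2Vec API and scikit-learn's PCA.

from sklearn.decomposition import PCA

# Nearest neighbors of `word` in the author's embedding (Gensim KeyedVectors API).
for neighbor, similarity in w2v_model.wv.most_similar(word, topn=top):
    print(f'{neighbor}\t{similarity:.3f}')

# Project the embedding matrix down to n_components dimensions with PCA.
reduced = PCA(n_components=n_components, random_state=seed).fit_transform(
    author.embedding.vectors)
print('Reduced embedding matrix:', reduced.shape)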
Example #12
# Fix PYTHONHASHSEED so that string hashing is reproducible
# across Python interpreter processes. The value is read at interpreter
# startup, so setting it here mainly affects worker processes spawned later.
import os
os.environ['PYTHONHASHSEED'] = str(0)

######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
workers = 1
seed = 0

##############
# Processing #
##############
# Load corpus
a = Author(infile)
print('Corpus characters:', len(a.corpus))

# Sentence segmentation and tokenization
a.preprocess(Tokenizer())
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))

# Create an author's word2vec embedding model
a.embed(workers=workers, seed=seed)
print('Embedding vocabulary:', len(a.model.vocabulary))
print('Embedding matrix:', a.model.vectors.shape)

# Access the embedding matrix
a.model.vectors
Example #13
######################
# User Configuration #
######################
# infile = 'https://www.gutenberg.org/files/244/244-0.txt'
# label = 1
infile = 'https://www.gutenberg.org/files/863/863-0.txt'
label = 0
part_size = 350
workers = 4
seed = None  # 0

##############
# Processing #
##############
t = SmartTimer('Pipeline')

t.tic('Load corpus')
author = Author(infile)
t.toc()
print('Corpus characters:', len(author.corpus))

t.tic('writer2vec')
author.writer2vec(
    tokenizer=Tokenizer(),
    stopwords=Tokenizer.STOPWORDS,
    part_size=part_size,
    workers=workers,
    seed=seed,
    use_norm=True,
)
t.toc()

print('Corpus sentences:', len(author.sentences))
        sys.exit()

    part_size = int(sys.argv[1])
    infile = sys.argv[2]
    trainfile = sys.argv[3] if len(sys.argv) > 3 else None
    testfile = sys.argv[4] if len(sys.argv) > 4 else None
    test_size = float(sys.argv[5]) if len(sys.argv) > 5 else .1
    # Factor for capturing as much as possible from trailing text
    # Default is 1.0; fall back to 350/part_size (= 10% when part_size is 3500)
    remain_factor = float(sys.argv[6]) if len(sys.argv) > 6 else (350 / part_size)
    seed = int(sys.argv[7]) if len(sys.argv) > 7 else 0
    print('Input file:', infile)
    print('Train file:', trainfile)
    print('Test file:', testfile)

    author = Author(infile)
    author.preprocess()
    author.partition_into_documents(part_size, remain_factor)
    print('Documents:', len(author.documents))
    print('Document tokens:', author.parsed_documents[0].size, author.parsed_documents[-1].size)

    # Train/test splits
    train_docs, test_docs = trainutils.split_data_into_train_test(
        author.parsed_documents,
        test_size=test_size,
        random_state=seed,
    )
    train_docs = TextSpan(train_docs)
    test_docs = TextSpan(test_docs)
    print('Training documents:', len(train_docs))
    print('Training tokens:', train_docs[0].size, train_docs[-1].size)