def gen_vocab(corpus_dict, vocab=1000, stem=False, **kwargs):
    '''
    Generates a dictionary of words to be used as the vocabulary in features
    that utilize bag of words.

    Args:
        corpus_dict (OrderedDict): an ordered dictionary of the most frequently
            occurring tokens in the corpus
        vocab (int): the number of words to be used in the vocabulary
        stem (bool): whether to stem tokens before adding them to the vocabulary

    Returns:
        dict: a dictionary of size vocab that maps the most frequent normalized,
            non-stop words in the corpus to integer indices
    '''
    index = 0
    vocabdict = dict()
    for word in corpus_dict:
        if len(vocabdict) < vocab:
            cleantext = normalize_and_remove_stop_words(word, stem)
            if cleantext != '':
                if cleantext not in vocabdict:
                    vocabdict[cleantext] = index
                    index += 1
        else:
            break
    return vocabdict
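# A minimal usage sketch for gen_vocab (hypothetical data). It assumes
# normalize_and_remove_stop_words is importable in this module, as gen_vocab
# itself requires; the tokens and vocabulary size below are made up.
def _example_gen_vocab():
    from collections import Counter, OrderedDict

    # Order tokens by frequency before handing them to gen_vocab
    token_counts = Counter("the cat sat on the mat and the cat slept".split())
    corpus_dict = OrderedDict(token_counts.most_common())

    # Keep at most 100 normalized, non-stop tokens
    vocabdict = gen_vocab(corpus_dict, vocab=100)
    return vocabdict  # e.g. {'cat': 0, 'sat': 1, 'mat': 2, ...}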
def build_lda(trainingdata, vocabdict, topics=40, random_state=0, **kwargs):
    '''
    Fits an LDA topic model based on the corpus vocabulary.

    Args:
        trainingdata (list): a list containing the corpus as parsed JSON text
        vocabdict (dict): a dictionary containing the vocabulary to be used in the LDA model
        topics (int): the number of topics to be used in the LDA model
        random_state (int or np.random.RandomState): seed value or random number generator state

    Returns:
        LatentDirichletAllocation: an LDA model fit to the training data and corpus vocabulary
    '''
    vectorizer = CountVectorizer(analyzer="word", vocabulary=vocabdict)
    trainingdocs = []
    for entry in trainingdata:
        trainingdocs.append(normalize_and_remove_stop_words(entry['body_text']))
    trainingvectors = vectorizer.transform(trainingdocs)
    lda_model = LatentDirichletAllocation(n_topics=topics, random_state=random_state)
    lda_model.fit(trainingvectors)
    return lda_model
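# A minimal usage sketch for build_lda (hypothetical documents). It assumes
# CountVectorizer and LatentDirichletAllocation are imported from
# sklearn.feature_extraction.text and sklearn.decomposition respectively.
# Note that newer scikit-learn releases use n_components in place of the
# n_topics argument used above, so build_lda as written targets older releases.
def _example_build_lda():
    from collections import Counter, OrderedDict

    trainingdata = [
        {'body_text': 'Cats pounce on the toy mouse.'},
        {'body_text': 'The stock market fell sharply today.'},
    ]
    tokens = ' '.join(d['body_text'] for d in trainingdata).split()
    vocabdict = gen_vocab(OrderedDict(Counter(tokens).most_common()), vocab=100)
    lda_model = build_lda(trainingdata, vocabdict, topics=5, random_state=0)
    return lda_model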
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text)
                                for body_text in case_docs_raw]

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized,
                                      vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = [get_first_and_last_sentence(doc) for doc in bkgd_docs_raw]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    if 'mem_net' in features:
        return mem_net_features, labels
    else:
        return data, labels
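# A hedged sketch of the `features` and `parameters` arguments the
# gen_observations variant above inspects, assembled from the keys the
# function actually reads. The values are illustrative placeholders, not
# project defaults; each feature entry is passed straight through to the
# corresponding featurizer.
_example_parameters = {
    'seed': 41,           # seeds np.random.RandomState used for resampling
    'resampling': True,   # presence of this key enables label resampling
    'over': True,         # this variant reads 'over'/'replacement' at the top level
}
_example_features = {
    'bow': {},            # bag-of-words features
    'st': {},             # skip-thought vectors
    'lda': {},            # LDA topic distribution
    'w2v': {},            # word2vec features
    'wordonehot': {},     # one-hot word encodings
    # 'cnn' and 'mem_net' are also recognized; 'mem_net' switches the return
    # value to (mem_net_features, labels).
}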
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model,
                     hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model
        hdf5_path (str): optional path for saving features and labels to HDF5; if given,
            the path is returned in place of the in-memory lists
        dtype (np.dtype): dtype used when concatenating feature vectors

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
        ids (list): a "C<cluster_id>_P<post_id>" identifier for each observation
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'

    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.xml_normalize(body_text)
                                for body_text in case_docs_raw]
        case_docs_no_stop_words = [normalize.normalize_and_remove_stop_words(body_text)
                                   for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab, features['bow'],
                                      feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw),
                                          normalize.xml_normalize(bkgd_text_raw), tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # Save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
            labels = list()
            data = list()

    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid)
           for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
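# add_to_hdf5 is called above but not defined in this module. A minimal sketch
# of what such a helper could look like: it appends a batch of rows to a
# resizable HDF5 dataset, creating the dataset on first use. This is an
# assumption about the helper's behavior, not its actual implementation.
def _example_add_to_hdf5(h5, arr, key, dtype=None):
    if key not in h5:
        # First batch: create a chunked dataset that can grow along the first axis
        maxshape = (None,) + arr.shape[1:]
        h5.create_dataset(key, data=arr, maxshape=maxshape, chunks=True,
                          dtype=dtype if dtype is not None else arr.dtype)
    else:
        dataset = h5[key]
        start = dataset.shape[0]
        dataset.resize(start + arr.shape[0], axis=0)
        dataset[start:] = arr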
def gen_observations(all_clusters, lookup_order, documentData, features, vocab,
                     encoder_decoder, lda_topics):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        documentData (array): parsed JSON documents
        features (dict): the specified features to be calculated
        vocab (dict): the vocabulary of the data set
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_topics: trained LDA topic model

    Returns:
        data (list): for each observation, the concatenated feature vector for the
            document vs. the rolling corpus
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Iterate through clusters found in JSON file, do feature assessments,
    # build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus
        # with new document vocabulary
        corpus = normalize.normalize_and_remove_stop_words(first_doc)

        # Create a document array for TFIDF
        corpus_array = [corpus]

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += get_first_and_last_sentence(first_doc)

        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            # Normalize and remove stop words from doc
            doc = normalize.normalize_and_remove_stop_words(raw_doc)
            corpus_array.append(doc)

            feature = list()
            if 'bow' in features:
                feature = bow(doc, corpus, corpus_array, vocab, features['bow'], feature)
            if 'st' in features:
                feature = st(raw_doc, sentences, encoder_decoder, features['st'], feature)
            if 'lda' in features:
                feature = lda(doc, corpus, vocab, lda_topics, features['lda'], feature)

            # Save feature and label
            feature = np.concatenate(feature, axis=0)
            data.append(feature)
            if documentData[index]["novelty"]:
                labels.append(1)
            else:
                labels.append(0)

            # Update corpus and add newest sentence to sentences vector
            corpus += doc
            sentences += get_first_and_last_sentence(doc)

    return data, labels
def test_empty_string():
    assert normalize.normalize_and_remove_stop_words("") == ""
def test_letters():
    assert normalize.normalize_and_remove_stop_words("19 cats&dogs don't eat?") == "cats dogs don t eat"
def test_lower_case():
    assert normalize.normalize_and_remove_stop_words("Hi BillyBob JOE") == "hi billybob joe"
def test_HTML():
    assert normalize.normalize_and_remove_stop_words("<p> <title=cats> cats pounce </p>") == "cats pounce"
def test_combo():
    assert normalize.normalize_and_remove_stop_words(
        "<p> <title=cats> <body> Cats pounce all the time! <http://catlink.com> is a video of cats JUMPING 10 times!! cool, right? </body></p>"
    ) == "cats pounce time video cats jumping times cool right"
def test_stop_words_text():
    assert normalize.normalize_and_remove_stop_words("the cat has name") == "cat"
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
        ids (list): a "C<cluster_id>_P<post_id>" identifier for each observation
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text)
                                for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized,
                                      vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid)
           for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    else:
        return data, labels, ids
def gen_observations(all_clusters, lookup_order, documentData, filename, features, vocab,
                     encoder_decoder, lda):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        documentData (array): parsed JSON documents
        filename (str): the name of the corpus file
        features (namedtuple): the specified features to be calculated
        vocab (dict): the vocabulary of the data set
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda: trained LDA topic model

    Returns:
        list: for each observation, a namedtuple with the corpus name, cluster_id, post_id,
            novelty, cosine similarity, tfidf sum, bag of words vectors, skip thoughts and
            LDA vector (scores are None if the feature is unwanted)
    '''
    # Prepare to store results of feature assessments
    postScores = []
    postTuple = namedtuple('postScore',
                           'corpus,cluster_id,post_id,novelty,bagwordsScore,tfidfScore,bog,skipthoughts,ldavector')

    # Iterate through clusters found in JSON file, do feature assessments,
    # build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus
        # with new document vocabulary
        corpus = normalize.normalize_and_remove_stop_words(first_doc)

        # Create a document array for TFIDF
        corpus_array = [corpus]

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += get_first_and_last_sentence(first_doc)

        # Use filename as corpus name if corpus name was not defined in JSON
        try:
            corpusName = documentData[sortedEntries[0]]["corpus"]
        except KeyError:
            corpusName = basename(filename)

        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            # Normalize and remove stop words from doc
            doc = normalize.normalize_and_remove_stop_words(raw_doc)
            corpus_array.append(doc)

            similarityScore = None
            tfidfScore = None
            bog = None
            skipthoughts = None
            ldavector = None

            if features.tfidf_sum:
                tfidfScore = tfidf_sum(doc, corpus_array, vocab)

            if features.cos_similarity:
                bagwordsVectors = bag_of_words_vectors(doc, corpus, vocab)
                similarityScore = 1 - spatial.distance.cosine(bagwordsVectors[0], bagwordsVectors[1])

            if features.bag_of_words:
                bagwordsVectors = bag_of_words_vectors(doc, corpus, vocab)
                bog = np.concatenate(bagwordsVectors, axis=0)

            if features.skipthoughts:
                skipthoughts = skipthoughts_vectors(raw_doc, sentences, encoder_decoder)

            # Add newest sentence to sentences vector
            sentences += get_first_and_last_sentence(doc)

            if features.lda:
                doclda = run_lda(lda, doc, vocab)
                corpuslda = run_lda(lda, corpus, vocab)
                ldavector = np.concatenate([doclda, corpuslda], axis=0)

            # Save results in namedtuple and add to array
            postScore = postTuple(corpusName, cluster, documentData[index]["post_id"],
                                  documentData[index]["novelty"], similarityScore, tfidfScore,
                                  bog, skipthoughts, ldavector)
            postScores.append(postScore)

            # Update corpus
            corpus += doc

    return postScores
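# A minimal sketch of consuming the namedtuples returned by the variant above,
# e.g. to build a feature matrix and label vector for a downstream classifier.
# Field names come from the postScore namedtuple; the filtering shown here is
# illustrative, and np is assumed to be the module's numpy import.
def _example_collect_scores(postScores):
    rows = []
    labels = []
    for score in postScores:
        # Skip observations where the bag-of-words feature was not computed
        if score.bog is None:
            continue
        rows.append(score.bog)
        labels.append(1 if score.novelty else 0)
    return np.array(rows), np.array(labels)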