def main(data_dir, out_dir):

    docid2path = dict()

    # iterable of (doctext, docpath) tuples
    reader = TextsStreamReader(data_dir, as_lines=False)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_docs_as_lines.txt'), 'w',
        'utf-8', 'ignore')

    docid = 0
    opts = dict(sents=False,
                lower=True,
                stem=False,
                min_token_len=3,
                min_sent_len=4,
                remove_stops=True,
                filters=[
                    'strip_multiple_whitespaces', 'strip_tags',
                    'strip_punctuation', 'split_alphanum', 'strip_numeric'
                ])

    for doctext, docpath in reader:
        doctext = preprocess_text(doctext, **opts)
        # generator to list
        doctext = list(doctext)
        if doctext:
            # when sents=False, the whole document is returned as a single "sentence":
            # the first element holds the document's full list of tokens
            doctext = doctext[0]
            if doctext:
                docid2path[docid] = docpath
                outfile.write(" ".join(doctext) + '\n')
                docid += 1

    outfile.close()
    utils.pickle(docid2path, os.path.join(out_dir, 'docid2path.pkl'))

    # create another file to hold sentences (useful for word2vec)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_sents_as_lines.txt'), 'w',
        'utf-8', 'ignore')
    opts['sents'] = True

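    # NOTE: this second pass iterates over `reader` again; it assumes
    # TextsStreamReader re-reads the corpus on each new iteration.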
    for doctext, _ in reader:
        docsents = preprocess_text(doctext, **opts)
        docsents = list(docsents)
        if docsents:
            for sent in docsents:
                if sent:
                    outfile.write(" ".join(sent) + '\n')

    outfile.close()
def _cooccurrence_preprocessing(doc, context, already_preprocessed):
    """Preprocess document as needed for co-occurrence network creation"""
    if context == 'window':
        if already_preprocessed:
            doc = doc.split(' ')
        else:
            doc = preprocess.preprocess_text(doc)
    elif context == 'sentence':
        doc = preprocess.tokenize_sentences(doc)
        for i, sentence in enumerate(doc):
            sentence = preprocess.preprocess_text(sentence)
            doc[i] = sentence
    return doc
Example No. 4
def create_model(d, f):
    model = Counter()
    for file in f:
        content = preprocess_text(d+file)
        c = ngrams(content, 2)
        model.update(c)
    return model
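
# A self-contained sketch of the same bigram-counting idea using only the
# standard library; simple_ngrams and simple_bigram_model are illustrative
# stand-ins for the ngrams/preprocess_text helpers assumed above.
from collections import Counter

def simple_ngrams(tokens, n=2):
    """Yield consecutive n-token tuples."""
    return zip(*(tokens[i:] for i in range(n)))

def simple_bigram_model(tokenized_texts):
    """Accumulate bigram counts over already-tokenized texts."""
    model = Counter()
    for tokens in tokenized_texts:
        model.update(simple_ngrams(tokens, 2))
    return model

# simple_bigram_model([["the", "cat", "sat"], ["the", "cat", "ran"]])
# -> Counter({('the', 'cat'): 2, ('cat', 'sat'): 1, ('cat', 'ran'): 1})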
Example No. 5
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:,i] = v

    return matrix
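
# The same TF / TF-IDF weighting can be reproduced without NLTK. A minimal
# sketch using collections.Counter, assuming the documents are already
# tokenized; it mirrors fd.freq(word) * log(num_docs / doc_freq) above.
import math
from collections import Counter

def tfidf_dicts(tokenized_docs):
    """Return one {word: tf-idf weight} dict per document."""
    num_docs = len(tokenized_docs)
    doc_freqs = Counter()          # number of documents containing each word
    tf_dists = []
    for tokens in tokenized_docs:
        counts = Counter(tokens)
        doc_freqs.update(counts.keys())
        tf_dists.append(counts)

    weighted = []
    for counts in tf_dists:
        total = sum(counts.values())
        weighted.append({
            word: (count / total) * math.log(num_docs / doc_freqs[word])
            for word, count in counts.items()
        })
    return weighted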
Example No. 6
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i%100==0: print '    dict',str(i)+'/'+str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
Example No. 7
def get_models_predictions(text):
    preprocessed_text = preprocess_text(text)
    labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    with graph.as_default():
        raw_probabilities = [
            *map(
                lambda model: numpy.squeeze(model.predict(preprocessed_text),
                                            axis=0).tolist(), models)
        ]
        probabilities_with_labels = [
            *map(
                lambda probability: [
                    *map(
                        lambda i: {
                            'label': labels[i],
                            'probability': probability[i]
                        }, range(0, 6))
                ], raw_probabilities)
        ]

    averaged_probabilities = numpy.average(raw_probabilities, axis=0).tolist()

    return {
        'probabilities_of_models': raw_probabilities,
        'probabilities_of_models_with_labels': probabilities_with_labels,
        'models_averaged_probabilities': averaged_probabilities,
        'most_probable_category': {
            'label': labels[numpy.argmax(averaged_probabilities)],
            'probability': numpy.max(averaged_probabilities)
        }
    }
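
# The nested map/lambda calls above boil down to: pair each model's probability
# vector with the label list, then average across models. A small equivalent
# sketch with made-up probabilities standing in for model.predict() output.
import numpy

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

raw_probabilities = [          # pretend output of two models, 6 probabilities each
    [0.9, 0.1, 0.4, 0.0, 0.3, 0.1],
    [0.8, 0.2, 0.5, 0.1, 0.2, 0.0],
]

probabilities_with_labels = [
    [{'label': label, 'probability': p} for label, p in zip(labels, probs)]
    for probs in raw_probabilities
]

averaged = numpy.average(raw_probabilities, axis=0)
most_probable_category = {
    'label': labels[int(numpy.argmax(averaged))],
    'probability': float(numpy.max(averaged)),
}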
Example No. 8
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print '    dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(
                    float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
Example No. 9
def index_markdown(markdown_filepath, ix_writer):
    file = path.basename(markdown_filepath)
    with open(markdown_filepath) as f:
        content = preprocess_text(f.read())
    # Do any preprocessing here, but the QA model may also read from the filepath.
    ix_writer.add_document(title=file,
                           content=content,
                           filepath=markdown_filepath)
Example No. 10
    def generate_perplexity(self, n, sentences, r=[0.4, 0.6, 1]):
        for z in xrange(n):
            x = z + 1
            self.nprob_dic[x] = self.nprob_dic[
                x] if x in self.nprob_dic else self.generate_ngram(x)
        tokens = preprocess.preprocess_text(sentences).split()

        # Prepare sentences for each ngram
        # token_list = [[],[],[]]
        # token_list[0] = tokens.replace('<s>', '').split()
        # token_list[1] = tokens.split()
        # token_list[2] = tokens.replace('<s>', '<s1> <s2>').split()
        # tokens = tokens.split()

        # use <unk_1> to replace words not in ncounter_dic[1]
        self.ncounter_dic[1] = self.ncounter_dic[
            1] if 1 in self.ncounter_dic else self.ntoken_count(1)
        for i, token in enumerate(tokens):
            key = tuple([token])
            if key not in self.ncounter_dic[1]:
                tokens[i] = '<unk_1>'

        # calculate perplexity
        perp = 0

        _len = len(tokens)

        # iters = [0, 0, 0]

        for i in xrange(_len):
            prob_tup = []
            for j in xrange(n):
                key = tuple(tokens[i - j:i + 1])

                if j > 0:
                    unk = '<unk_{}>'.format(j)
                    if key != ():
                        if key not in self.nprob_dic[j + 1]:
                            key = tuple([unk, tokens[-1]])
                if key == () or (('<s>' in key) and key[-1] != '<s>'
                                 and j > 0):
                    prob_element = 0
                else:
                    prob_element = self.nprob_dic[j + 1][key]
                prob_tup.append(prob_element)
            ntemp = n - 1
            while (prob_tup[ntemp] == 0 or prob_tup[ntemp] == 1):
                ntemp -= 1

            prob = prob_tup[ntemp] * r[ntemp]

            perp -= log(prob)
        perp = exp(1.0 * perp / len(tokens))
        return perp
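
# As a reference point for the formula used above (perplexity =
# exp(-(1/N) * sum(log p(w_i)))), a minimal self-contained sketch for a
# unigram model with add-one smoothing; the interpolation over n-gram
# orders is specific to the class above and is not reproduced here.
from math import log, exp
from collections import Counter

def unigram_perplexity(train_tokens, test_tokens):
    counts = Counter(train_tokens)
    vocab = set(counts) | set(test_tokens)
    total = sum(counts.values())
    log_prob = 0.0
    for token in test_tokens:
        # add-one smoothing keeps unseen tokens from zeroing the product
        p = (counts[token] + 1) / (total + len(vocab))
        log_prob -= log(p)
    return exp(log_prob / len(test_tokens))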
Example No. 11
def word_prob_solver(text):
    orig_text = text
    nlp, spacy_parser, dep_parser, tree_parser, verb_cats_json = init_parsers()
    text = preprocess_text(text)
    # print(text)
    document = nlp(text)
    non_lem_sents = [str(sent) for sent in document]
    h = get_num_dep_nouns(document, dep_parser)
    h_lem = set([])
    is_h_lemmatized = False
    for h_noun in h:
        dh = nlp(h_noun)
        h_noun_lem = dh[0][0].lemma
        if h_noun_lem != h_noun:
            text = text.replace(h_noun, h_noun_lem)
            is_h_lemmatized = True
        h_lem.update([h_noun_lem])
    all_h = deepcopy(h)
    h = list(h_lem)
    if is_h_lemmatized:
        document = nlp(text)
        # print(text)
    sentences, numbers = get_numbers(document)
    NPs, et = get_noun_phrases_entities(h, sentences, tree_parser)
    et = filter_et(et, sentences, numbers)
    document2 = spacy_parser(text)
    ex = get_ex(document2, sentences, h)
    numt = get_numt(et, numbers)
    process_bare_num(numbers, sentences, numt, et, h)
    vt = get_verbs(et + [ex], non_lem_sents, dep_parser, nlp)
    vx = vt[-1]
    del vt[-1]
    at, ax = get_attributes(et, ex, dep_parser)
    fragments = get_fragments(et, numt, vt, at, ex, vx, ax, sentences,
                              non_lem_sents)
    assert len(fragments) == len(sentences)
    ct = get_containers(fragments, all_h, dep_parser, nlp)
    print("final fragments :", "\n")
    for fragment in fragments:
        print(fragment, "\n")
    fragx = fragments[-1]
    del fragments[-1]
    verb_cats = []
    for fragment in fragments:
        verb_cats.append(verb_category(fragment[4], nlp, verb_cats_json))
    states = get_states(fragments, verb_cats, ex, ax)
    equations = build_equations(states)
    solutions = solve_equations(equations)
    answer = get_answer(solutions, states, fragments, fragx, orig_text, nlp,
                        verb_cats_json)
    return answer
Example No. 12
def create_occurrences_dict(string):
    """Given a string, count occurences of preprocessed tokens."""

    # Keeps track of preprocessed tokens count
    occurrences = {}
    tokens = preprocess_text(string)

    # Count occurrences of tokens in the string
    for token in tokens:
        if token not in occurrences:
            occurrences[token] = 0
        occurrences[token] += 1

    return occurrences
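
# Equivalent one-liner: collections.Counter performs the same counting,
# assuming preprocess_text returns an iterable of tokens as above.
from collections import Counter

def create_occurrences_dict_counter(string):
    return dict(Counter(preprocess_text(string)))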
Example No. 13
def get_test_loader(batch_size):
    #df = pd.read_csv(os.path.join(settings.DATA_DIR, 'test_clean.csv'))
    df = pd.read_csv(os.path.join(settings.DATA_DIR, 'test.csv'))
    #print(df.head())
    df.comment_text = preprocess_text(df.comment_text)
    #print(df.head())
    ds_test = ToxicDataset(df, train_mode=False, labeled=False)
    loader = data.DataLoader(ds_test,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=4,
                             collate_fn=ds_test.collate_fn,
                             drop_last=False)
    loader.num = len(df)

    return loader
Example No. 14
def collect_data(root_dir, do_lemmatize=True, from_file='', encoding='cp1251'):
    data = OrderedDict()
    if from_file != '' and do_lemmatize:
        logging.info("loading data from file")
        with open(from_file, mode='rb') as art_pkl:
            data = pickle.load(art_pkl)
    else:
        for cur_root, dirs, files in os.walk(root_dir):
            for name in files:
                with open(os.path.join(cur_root, name), encoding=encoding) as tf:
                    text = get_title(tf.name) if conf.only_title else tf.read()
                    data[tf.name] = preprocess_text(text, do_lemmatize)
        logging.info("saving collected data")
        with open('./%s/articles.%spkl' % (SAVED_DIR, 'lemmatized.' if do_lemmatize else ''),
                  mode='wb') as art_pkl:
            pickle.dump(data, art_pkl)
    return data
Example No. 15
    def predict_old(self, X_test):
        predicted = []
        print('Testing..')

        for test_case in tqdm(X_test):
            target_sums = [self.target_data[t]['prior'] for t in self.targets]
            test_words = preprocess_text(test_case)

            for t_idx, t in enumerate(self.targets):
                t_data = self.target_data[t]
                for word in test_words:
                    if word in t_data['likelihood']:
                        target_sums[t_idx] += t_data['likelihood'][word]

            # Get biggest result
            predicted.append(self.targets[np.argmax(target_sums)])

        return predicted
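
# The scoring loop above is the standard log-space Naive Bayes decision rule:
# start from each class's log-prior and add the log-likelihoods of the observed
# words. A self-contained sketch with toy numbers (priors and likelihoods are
# illustrative only).
import math
import numpy as np

targets = ['spam', 'ham']
target_data = {
    'spam': {'prior': math.log(0.4),
             'likelihood': {'free': math.log(0.30), 'meeting': math.log(0.05)}},
    'ham':  {'prior': math.log(0.6),
             'likelihood': {'free': math.log(0.05), 'meeting': math.log(0.30)}},
}

def classify(tokens):
    scores = [target_data[t]['prior'] for t in targets]
    for idx, t in enumerate(targets):
        likelihood = target_data[t]['likelihood']
        for word in tokens:
            if word in likelihood:
                scores[idx] += likelihood[word]
    return targets[int(np.argmax(scores))]

# classify(['free'])    -> 'spam'   (log 0.4 + log 0.30 > log 0.6 + log 0.05)
# classify(['meeting']) -> 'ham'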
Example No. 16
def get_train_val_loaders(batch_size=64,
                          val_batch_size=256,
                          val_percent=0.95,
                          val_num=10000):
    #df = shuffle(pd.read_csv(os.path.join(settings.DATA_DIR, 'train_clean.csv')), random_state=1234)
    df = shuffle(pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv')),
                 random_state=1234)
    #print(df.head())
    df.comment_text = preprocess_text(df.comment_text)
    add_loss_weight(df)

    print(df.shape)

    split_index = int(len(df) * val_percent)

    df_train = df[:split_index]
    df_val = df[split_index:]

    if val_num is not None:
        df_val = df_val[:val_num]

    print(df_train.head())
    print(df_val.head())

    ds_train = ToxicDataset(df_train)
    train_loader = data.DataLoader(ds_train,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=ds_train.collate_fn,
                                   drop_last=True)
    train_loader.num = len(df_train)

    ds_val = ToxicDataset(df_val)
    val_loader = data.DataLoader(ds_val,
                                 batch_size=val_batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=ds_val.collate_fn,
                                 drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    return train_loader, val_loader
Example No. 17
def tokenize_cases(document):
    tokenized_document = []
    for line in document:
        output = line.strip(' ')
        things = re.findall(r'\([\w]+\)', output)
        for thing in things:  # makes the job of the section tokenizer easier
            while thing in output:
                output = output.replace(thing, '')
        while '  ' in output:
            output = output.replace('  ', ' ')
        output = output.lower().replace(
            'section 7703',
            'sec_7703')  # somehow the tokenizer sometimes fails for this
        output = output.lower().replace(
            'section 68',
            'sec_68')  # somehow the tokenizer sometimes fails for this
        output = preprocess_text(output.lower())
        tokenized_document.append(output.strip('\n'))
    return tokenized_document
Example No. 18
def detect_fake(text, device, model, tokenizer):
    text_parts = preprocess_text(text, device, tokenizer)
    overall_output = torch.zeros((1, 2)).to(device)
    try:
        for part in text_parts:
            if len(part) > 0:
                overall_output += model(part.reshape(1, -1))[0]
    except RuntimeError:
        print("GPU out of memory, skipping this entry.")

    overall_output = F.softmax(overall_output[0], dim=-1)

    value, result = overall_output.max(0)

    term = False
    if result.item() == 0:
        term = True

    print("Is real - {} at {}%".format(term, value.item() * 100))
    return term, value.item() * 100
Example No. 19
    def test_preprocess_flow(self):
        words_dict, text = genertate_text()
        result_words_dict = preprocess.preprocess_text(text)
        for k, v in words_dict.items():
            self.assertEqual(result_words_dict[k], v)

        cr = Crypto()
        encrypted = preprocess.words_dict_encrypt_and_hashed(
            result_words_dict, cr)
        decrypted = preprocess.words_dict_decrypt(encrypted, cr)
        for item in decrypted:
            word = item['word']
            self.assertIn(word, words_dict.keys())
            self.assertEqual(item['count'], words_dict[word])

        with open(TEST_HTML) as f:
            text = f.read()

        # Next we expect to find more than 5 unique words in the text
        self.assertGreater(len(preprocess.main_preprocess(text)), 5)
Example No. 20
def predict(config, text, code, model=None, embedding_input=None):
    if model is None:
        model = load_model(config, code)

    preprocessed = preprocess_text(text)

    if embedding_input is None:
        embedding = []
        word_model = load_word2vec(config.embeddings_model)
        for word in preprocessed.split(' '):
            if word in word_model.wv.index2word:
                vec = word_model.wv[word]
                embedding.append(vec)

        embedding_input = Variable(
            torch.Tensor(np_sentence_to_list(embedding)))

    pred = model(embedding_input)
    pred_label = pred.data.max(1)[1].numpy()[0]
    pred_char = get_char_for_binary(code, pred_label)
    return pred_char
Example No. 21
def construct_random_network(doc, p=0.2):
    """Construct random network for use as baseline.

    Create a random network based on *doc*, with words used for nodes.
    Edges are created between any given pair of nodes (a,b)  with probability *p*.

    All edges will have weight = 1.0
    """
    doc = preprocess.preprocess_text(doc)
    words = list(set(doc))  # list of unique words

    # create graph
    graph = nx.DiGraph()
    graph.add_nodes_from(words)

    # add edges
    for word_a in graph.nodes():
        for word_b in graph.nodes():
            if word_a != word_b and rand() < p:
                _update_edge_weight(graph, word_a, word_b)

    return graph
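
# networkx can build the same G(n, p) baseline directly; a sketch that keeps
# the word-per-node labelling, assuming `tokens` is an already-preprocessed
# token list.
import networkx as nx

def construct_random_network_gnp(tokens, p=0.2):
    """Directed Erdos-Renyi baseline over the document's unique words."""
    words = sorted(set(tokens))
    graph = nx.gnp_random_graph(len(words), p, directed=True)
    graph = nx.relabel_nodes(graph, dict(enumerate(words)))
    nx.set_edge_attributes(graph, 1.0, 'weight')
    return graph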
Example No. 23
    def predict(self, X_test):
        print('Testing {}...'.format(self.name))
        predicted = []
        lap_predicted = []

        smooth_probs = [
            math.log(1 / (t_data['doc_count'] + len(self.X_train)))
            for t, t_data in self.target_data.items()
        ]

        # TODO: multiprocessing
        for test_case in tqdm(X_test, unit='test'):
            target_sums = [self.target_data[t]['prior'] for t in self.targets]
            lap_target_sums = [
                self.target_data[t]['prior'] for t in self.targets
            ]
            test_words = preprocess_text(test_case)

            for t_idx, t in enumerate(self.target_data):
                t_data = self.target_data[t]
                for word in test_words:
                    if word in t_data['likelihood']:
                        target_sums[t_idx] += t_data['likelihood'][word]
                        lap_target_sums[t_idx] += t_data['lap_likelihood'][
                            word]
                    else:
                        for t2_idx, t2 in enumerate(self.targets):
                            t2_data = self.target_data[t2]
                            if t2 != t and word in t2_data['likelihood']:
                                target_sums[t_idx] += smooth_probs[t_idx]
                                lap_target_sums[t_idx] += smooth_probs[t_idx]
                                break

            # Get biggest result
            predicted.append(self.targets[np.argmax(target_sums)])
            lap_predicted.append(self.targets[np.argmax(lap_target_sums)])

        return predicted, lap_predicted
Example No. 24
def tokenize_statutes(document):
    current_section = ''
    tokenized_document = []
    for line in document:
        # 1. normalize names of sections
        output = line
        if line.startswith('Section '):  # remove leading 'Section XYZ'
            section_name = output.split('.')[0]
            rest = '.'.join(output.split('.')[1:])
            new_section_name = 'sec_' + section_name.split(' ')[1]
            current_section = new_section_name
            output = rest
        things = re.findall(r'\([\w]+\)', output)
        for thing in things:
            while thing in output:
                output = output.replace(thing, '')
        while '  ' in output:
            output = output.replace('  ', ' ')
        output = current_section + ' ' + output
        # 2. tokenize
        output = preprocess_text(output.strip(' '))
        tokenized_document.append(output.strip('\n'))
    return tokenized_document
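
# A quick trace of the normalisation steps above on one illustrative line
# (sample text made up; re.sub stands in for the findall/replace loop, and the
# final preprocess_text call is omitted).
import re

line = 'Section 7703. Determination of marital status. (a) General rule.'
section_name = line.split('.')[0]                     # 'Section 7703'
rest = '.'.join(line.split('.')[1:])
current_section = 'sec_' + section_name.split(' ')[1] # 'sec_7703'
output = re.sub(r'\([\w]+\)', '', rest)               # drop '(a)', '(b)', ...
while '  ' in output:
    output = output.replace('  ', ' ')
output = current_section + ' ' + output
# output: 'sec_7703  Determination of marital status. General rule.'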
Example No. 25
 def run(self):
     self.timerThread.start(time.time())
     lines = preprocess_text(self.text)
     output = []
     for count, line in enumerate(lines):
         _mutex1.lock()
         if _running1 == False:
             _mutex1.unlock()
             self.interruptSignal.emit()
             return
         else:
             _mutex1.unlock()
         self.iterSignal.emit((count, len(lines)))
         sequence = np.array(text_to_sequence(
             line, ['english_cleaners']))[None, :]
         device = torch.device('cuda' if self.use_cuda else 'cpu')
         sequence = torch.autograd.Variable(
             torch.from_numpy(sequence)).to(device).long()
         # Decode text input
         mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
             sequence)
         with torch.no_grad():
             audio = self.waveglow.infer(
                 mel_outputs_postnet,
                 sigma=0.666,
                 progress_callback=self.progress,
                 elapsed_callback=self.elapsed,
                 get_interruptflag=self.get_interruptflag)
             if type(audio) != torch.Tensor:
                 # Catches when waveglow is interrupted and returns none
                 self.interruptSignal.emit()
                 return
             self.iterSignal.emit((count + 1, len(lines)))
             wav = audio[0].data.cpu().numpy()
         output.append(wav)
     outwav = np.concatenate(output)
     self.audioSignal.emit(outwav)
Example No. 26
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [
                fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                for word in all_tokens
            ]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v

    return matrix
Example No. 27
def tokenize_sentences(sentences):
    """Tokenizes sentences using Preprocessor"""
    for sen in sentences:
        sen.tokens = preprocess.preprocess_text(sen.original)
Example No. 28
import pickle
from preprocess import preprocess_text

text = open('./data/alice/alice-in-wonderland.txt', 'r').read()
preprocess_text(text, './data/alice/alice-processed.pickle')
Example No. 29
from preprocess import preprocess_text

text = open('./data/shakespeare/sonnets.txt', 'r').read()

preprocess_text(text, './data/shakespeare/processed.pickle')
Example No. 30
def main(csvfile):
    # Read data
    reviews_df = pd.read_csv(csvfile)
    # Removing NA's
    reviews_df = reviews_df.dropna()
    reviews_df = reviews_df.reset_index(drop=True)

    # Create labels
    # Divide Reviewer_Score into four classes: 3 if score > 7.5, 2 if score > 5, 1 if score > 2.5, else 0
    reviews_df["Label"] = reviews_df["Reviewer_Score"].apply(
        lambda x: 3 if x > 7.5 else (2 if x > 5 else (1 if x > 2.5 else 0)))
    reviews_df = reviews_df[[
        "Additional_Number_of_Scoring", "Average_Score",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given", "Negative_Review",
        "Positive_Review", "Label"
    ]]
    # The whole dataset is too large and here only take 30% of the dataset
    reviews_df = reviews_df.sample(frac=0.3, replace=False, random_state=42)

    # PART 1: Prediction without nlp feature
    print("Without NLP features:")
    # Feature selection
    features = [
        "Additional_Number_of_Scoring", "Average_Score",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given"
    ]
    X_train, X_test, y_train, y_test = train_test_split(reviews_df[features],
                                                        reviews_df["Label"],
                                                        test_size=0.30,
                                                        random_state=20)

    # Logistic Regression
    logistic_regression(X_train, y_train, X_test, y_test)

    # Random Forest
    random_forest(X_train, y_train, X_test, y_test)

    # XGBoost
    xgboost(X_train, y_train, X_test, y_test)

    # PART 2: Prediction with adding nlp features
    print("With NLP features:")
    # Append the positive and negative text reviews
    reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df[
        "Positive_Review"]
    # Remove 'No Negative' or 'No Positive' from text
    reviews_df["review"] = reviews_df["review"].apply(
        lambda x: x.replace("No Negative", "").replace("No Positive", ""))

    # Clean text data
    print("Start preprocessing textual columns...")
    reviews_df["review_clean"] = reviews_df["review"].apply(
        lambda x: preprocess_text(x))

    # Train a Doc2Vec model with text data
    print("Adding Doc2Vec...")
    reviews_df = doc2vec(reviews_df)

    # Add tf-idf columns
    print("Adding TF-IDF...")
    reviews_df = tf_idf(reviews_df)

    # Feature selection
    label = "Label"
    ignore_cols = [
        label, "review", "review_clean", "Negative_Review", "Positive_Review"
    ]
    features_2 = [c for c in reviews_df.columns if c not in ignore_cols]
    X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
        reviews_df[features_2],
        reviews_df["Label"],
        test_size=0.30,
        random_state=20)

    # Logistic Regression
    logistic_regression(X_train_2, y_train_2, X_test_2, y_test_2)

    # Random Forest
    random_forest(X_train_2, y_train_2, X_test_2, y_test_2)

    # XGBoost with nlp features
    xgboost(X_train_2, y_train_2, X_test_2, y_test_2)
Example No. 31
def map_function(dynamicRecord):
    tweet = dynamicRecord["tweet"]
    features = preprocess.preprocess_text(tweet, 140)
    dynamicRecord["features"] = features
    return dynamicRecord
Example No. 32
def _text_to_preprocessed_text(text):
    """Convert text to preprocessed text"""
    prep = preprocess.preprocess_text(text)
    return ' '.join(prep)
Example No. 33
 def execute_this_fn(self, TOKEN, min_donation, channel, se_opts, use_cuda,
                     model, waveglow, offset, prev_time, startup_time,
                     progress_callback, elapsed_callback, text_ready,
                     fn_callback):
     # TODO: refactor this messy block
     fn_callback.emit(('GUI: start of polling loop', None))
     text_ready.emit("Sta2:Connecting to StreamElements")
     url = "https://api.streamelements.com/kappa/v2/tips/" + self.channel_id
     headers = {
         'accept': 'application/json',
         "Authorization": "Bearer " + TOKEN
     }
     text_ready.emit('Log2:Initializing')
     text_ready.emit('Log2:Minimum amount for TTS: ' + str(min_donation))
     while True:
         _mutex2.lock()
         if _running2 == False:
             _mutex2.unlock()
             break
         else:
             _mutex2.unlock()
         if not channel.get_busy():
             #print('Polling', datetime.datetime.utcnow().isoformat())
             text_ready.emit("Sta2:Waiting for incoming donations . . .")
             current_time = datetime.datetime.utcnow().isoformat()
             # TODO: possible bug: donations may be missed once the time passes midnight
             querystring = {
                 "offset": offset,
                 "limit": "1",
                 "sort": "createdAt",
                 "after": startup_time,
                 "before": current_time
             }
             response = requests.request("GET",
                                         url,
                                         headers=headers,
                                         params=querystring)
             data = json.loads(response.text)
             for dono in data['docs']:
                 text_ready.emit("Sta2:Processing donations")
                 dono_time = dono['createdAt']
                 offset += 1
                 if dono_time > prev_time:  # Str comparison
                     amount = dono['donation']['amount']  # Int
                     if float(amount) >= min_donation and dono[
                             'approved'] == 'allowed':
                         name = dono['donation']['user']['username']
                         msg = dono['donation']['message']
                         if msg.isspace(): break  # Check for empty line
                         ## TODO Allow multiple speaker in msg
                         currency = dono['donation']['currency']
                         dono_id = dono['_id']
                         text_ready.emit(
                             "Log2:\n###########################")
                         text_ready.emit("Log2:" + name + ' donated ' +
                                         currency + str(amount))
                         text_ready.emit("Log2:" + msg)
                         lines = preprocess_text(msg)
                         if se_opts[
                                 'read dono amount'] == 1:  # reads dono name and amount
                             msg = '{} donated {} {}.'.format(
                                 name, str(amount),
                                 cleaners.expand_currency(currency))
                             lines.insert(0, msg)  # Add to head to list
                         output = []
                         for count, line in enumerate(lines):
                             fn_callback.emit(
                                 ('GUI: progress bar 2 text', (count,
                                                               len(lines))))
                             sequence = np.array(
                                 text_to_sequence(
                                     line, ['english_cleaners']))[None, :]
                             # Inference
                             device = torch.device(
                                 'cuda' if use_cuda else 'cpu')
                             sequence = torch.autograd.Variable(
                                 torch.from_numpy(sequence)).to(
                                     device).long()
                             # Decode text input
                             mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                                 sequence)
                             with torch.no_grad():
                                 audio = waveglow.infer(
                                     mel_outputs_postnet,
                                     sigma=0.666,
                                     progress_callback=progress_callback,
                                     elapsed_callback=None,
                                     get_interruptflag=self.
                                     get_interruptflag2)
                                 if type(audio) != torch.Tensor:
                                     # Catches when waveglow is interrupted and returns none
                                     break
                                 fn_callback.emit(
                                     ('GUI: progress bar 2 text',
                                      (count + 1, len(lines))))
                                 wav = audio[0].data.cpu().numpy()
                             output.append(wav)
                         _mutex3.lock()
                         if _running3 == True:
                             _mutex3.unlock()
                             outwav = np.concatenate(output)
                             # Playback
                             fn_callback.emit(('Wav: playback', outwav))
                         else:
                             _mutex3.unlock()
                         prev_time = dono_time  # Increment time
         time.sleep(0.5)
     fn_callback.emit(('GUI: end of polling loop', None))
     text_ready.emit('Log2:\nDisconnected')
     text_ready.emit('Sta2:Ready')
     fn_callback.emit(('Var: offset', offset))
     fn_callback.emit(('Var: prev_time', prev_time))
     return  #'Return value of execute_this_fn'
Example No. 34
from preprocess import preprocess_text

book_paths = [
    './data/fitzgerald/beautiful-and-damned.txt',
    './data/fitzgerald/flappers-and-philosophers.txt',
    './data/fitzgerald/tales-of-the-jazz-age.txt',
    './data/fitzgerald/this-side-of-paradise.txt',
]

# Combine books into 1 big book
combined_text = ''
combined_len = 0
for path in book_paths:
    txt = open(path, 'r').read()
    combined_len = combined_len + len(txt)
    combined_text = combined_text + ' ' + txt

preprocess_text(combined_text, './data/fitzgerald/processed-all-books.pickle')

Example No. 35
    ents = nlp(text).ents
    glose_ents = []
    for ent in ents:
        if ent.label_ == "PERSON":
            cat = "PER"
        elif ent.label_ in ["ORG", "LOC"]:
            cat = ent.label_
        else:
            cat = "MISC"
        glose_ent = GloseEntity(ent.text, ent.start_char, ent.end_char, cat)
        glose_ents.append(glose_ent)
elif args.model is not None:
    from keras.models import load_model

    lword_id_sents, casing_id_sents, pos_sents = preprocess_text(text)

    model_path = get_model_path(args.model)
    model = load_model(model_path)
    pred_label_id_sents = model.predict([lword_id_sents,
                                         casing_id_sents]).argmax(axis=2)

    # Create entities from model predictions

    glose_ents = []

    for pred_label_id_sent, pos_sent in zip(pred_label_id_sents, pos_sents):
        # Remove padding
        pred_label_id_sent = pred_label_id_sent[-len(pos_sent):]

        ent = None