Example #1
class MyCorpus(corpora.TextCorpus):
    def __init__(self, input=None, path='../processed_papers'):
        self.path = path
        # Note: passing corpora.TextCorpus (not MyCorpus) to super() skips
        # TextCorpus.__init__, which would otherwise build the dictionary itself.
        super(corpora.TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        self.dictionary.add_documents(self.get_texts())

    def __len__(self):
        files = os.listdir(self.path)
        return len(files)

    def get_texts(self):
        files = os.listdir(self.path)
        counteR = 0
        json_data = {}
        for fl in files:
            #print(counteR, ': ', fl)
            if (counteR % 1000 == 0):
                print(counteR)
            counteR += 1
            text = ''
            json_data = json.load(open(self.path + '/' + fl))
            if json_data["title"] is not None:
                text += (json_data["title"] + " ")
            for val in json_data["abstract_sentences"].values():
                if val is not None:
                    text += val + " "
            for val in json_data['body_sentences'].values():
                if val is not None:
                    text += val + " "
            yield ie_preprocess(text)
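The ie_preprocess helper is not included in the snippet above; a minimal, purely hypothetical stand-in (assuming it lowercases and tokenizes the text) could be:

from gensim.utils import simple_preprocess

def ie_preprocess(text):
    # Hypothetical stand-in for the project's own preprocessing:
    # lowercase, strip accents/punctuation and tokenize with gensim's helper.
    return simple_preprocess(text, deacc=True)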
Example #2
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
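Once built, the dictionary can be reused to turn new token lists into bag-of-words vectors. A minimal sketch (wiki_index.ARTICLES_FILE is project-specific, so this uses an in-memory toy corpus):

from gensim.corpora import Dictionary

dictionary = Dictionary()
dictionary.add_documents([["hello", "world"], ["hello", "gensim"]])

# Convert an unseen document into (token_id, count) pairs; unknown words are dropped.
print(dictionary.doc2bow(["hello", "unseen", "world"]))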
Example #3
class CorpusOfMethodContents(TextCorpus):
    
    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)
        
    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1  # index of the FQN just appended
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update = True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())
    
    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex.keys():
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None
    
    def get_texts(self):
        for content in self.methodContents:
            yield content
Example #4
class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'),
                                   add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'),
                                   add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        all_words = list(
            chain.from_iterable([
                sent.split() + ['<eos>']
                for sent in open(path).read().split('\n')
            ]))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
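Note that Dictionary.doc2idx maps out-of-vocabulary tokens to -1 by default, which is why the tensor above only lines up cleanly when the dictionary was built from the same files. A small illustration (toy data, no torch required):

from gensim.corpora import Dictionary

d = Dictionary([["the", "cat", "sat"]])
print(d.doc2idx(["the", "dog", "sat"]))                        # "dog" is unknown, e.g. [2, -1, 1]
print(d.doc2idx(["the", "dog", "sat"], unknown_word_index=0))  # or map unknowns to a chosen id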
Example #5
def download_dictionary(corpus_name: str, target_path: str) -> Dictionary:
    """
    Download dictionary only for a corpus from UCI website

    :param corpus_name: name of UCI corpus
    :param target_path: output directory for dictionary file
    :return: gensim Dictionary
    """

    url_root = "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/"
    target_path = os.path.join(target_path, "uci", "raw")
    if not os.path.exists(target_path):
        print("creating target path: {}".format(target_path))
        os.makedirs(target_path)

    vocab_file = os.path.join(target_path, "vocab.{}.txt".format(corpus_name))
    print("downloading {} vocab file to: {}".format(corpus_name, vocab_file))
    urllib.request.urlretrieve(url_root + "vocab.{}.txt".format(corpus_name),
                               filename=vocab_file)

    dictionary = Dictionary()
    with open(vocab_file) as f:
        for line in f:
            dictionary.add_documents([[line.strip()]])

    dictionary.compactify()

    return dictionary
Example #6
def topic_model(df_train, df_test, topic_count=10):
    ## general remove text
    df_train['tweet'] = df_train['tweet'].map(general_text_processing)
    df_test['tweet'] = df_test['tweet'].map(general_text_processing)

    ## remove stop words
    df_train['tweet'] = df_train['tweet'].map(remove_stop_words)
    df_test['tweet'] = df_test['tweet'].map(remove_stop_words)

    ## gensim lda
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel
    dictionary = Dictionary()
    for t in df_train.tweet.values.tolist():
        #print(t)
        dictionary.add_documents([t.split()])
    #for  t in df_test['tweet'].values.tolist() :
    #print(t)
    # print(t[0].split())
    #print(dictionary.doc2bow(t.split()))
    train_doc2_corpus = [
        dictionary.doc2bow(text.split())
        for text in df_train['tweet'].values.tolist()
    ]
    #print(train_doc2_corpus)
    lda_model = LdaModel(train_doc2_corpus, num_topics=topic_count)
    """
    fill topics
    """
    df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count)
    """
    return 
    """
    return df_train, df_test
Example #7
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)
    
    return unfiltered_dict
Example #8
class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order

        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.clock()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.clock()

                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count*1. / (cur - start)))

                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.clock()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count*1. / (end - start)))

        self.length = t_count
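The process worker and mahoney_clean function are not shown above; presumably process turns each cleaned text into character n-grams of length order. A hypothetical sketch of such a worker (the exact behavior is an assumption):

def process(args):
    # args is one (text, clean_func, order) tuple produced in get_texts()
    text, clean_func, order = args
    cleaned = clean_func(" ".join(text) if isinstance(text, list) else text)
    # overlapping character n-grams ("sublexical" tokens)
    return [cleaned[i:i + order] for i in range(len(cleaned) - order + 1)]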
Example #9
    def __init__(self, fname, dictionary=None):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
        """
        self.fname = fname
        self.metadata = False

        if dictionary is None:
            dictionary = Dictionary()
            for text in self.get_texts():
                dictionary.add_documents([text])
        self.dictionary = dictionary
Example #10
    def __init__(self, fname, dictionary=None):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
        """
        self.fname = fname
        self.metadata = False

        if dictionary is None:
            dictionary = Dictionary()
            for text in self.get_texts():
                dictionary.add_documents([text])
        self.dictionary = dictionary
Example #11
def initialize_lda():
    path = os.path.join("../data", "train.csv")
    dct = Dictionary(common_texts)
    corpus = [dct.doc2bow(text) for text in common_texts]

    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            row = dict(row)
            new_texts = [row['story'].split()]
            dct.add_documents(new_texts)
            corpus += [dct.doc2bow(text) for text in new_texts]
    lda = models.ldamodel.LdaModel(corpus, num_topics=50)
    lda.save(os.path.join("lda_model", "model"))
    dct.save_as_text(os.path.join("lda_model", "dictionary"))
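The saved artifacts can later be reloaded for inference; a minimal sketch assuming the same lda_model/ output paths used above:

import os
from gensim import models
from gensim.corpora.dictionary import Dictionary

lda = models.ldamodel.LdaModel.load(os.path.join("lda_model", "model"))
dct = Dictionary.load_from_text(os.path.join("lda_model", "dictionary"))

bow = dct.doc2bow("a new story to score".split())
print(lda.get_document_topics(bow))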
Example #12
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        super(gensim.corpora.TextCorpus, self).__init__()

        self.input = input
        self.metadata = False

        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()

            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0

        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1

            if self.pre_tokenized:
                if not isinstance(line, unicode):
                    line = unicode(line, encoding='utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)

        self.length = length
Example #13
def topic_model(df_train, df_test, topic_count=10):
    ## general remove text
    df_train['tweet'] = df_train['tweet'].fillna("")
    df_test['tweet'] = df_test['tweet'].fillna("")

    df_train['tweet'] = df_train['tweet'].map(general_text_processing)
    df_test['tweet'] = df_test['tweet'].map(general_text_processing)

    ## remove stop words
    df_train['tweet'] = df_train['tweet'].map(remove_stop_words)
    df_test['tweet'] = df_test['tweet'].map(remove_stop_words)

    ## gensim lda
    dictionary = Dictionary()
    for t in df_train.tweet.values.tolist():
        #print(t)
        dictionary.add_documents([t.split()])
    #for  t in df_test['tweet'].values.tolist() :
    #print(t)
    # print(t[0].split())
    #print(dictionary.doc2bow(t.split()))
    train_doc2_corpus = [
        dictionary.doc2bow(text.split())
        for text in df_train['tweet'].values.tolist()
    ]
    #print(train_doc2_corpus)
    print("Started LDA")
    lda_model = LdaModel(train_doc2_corpus,
                         num_topics=topic_count,
                         iterations=30)
    print("Completed LDA")
    """
    fill topics
    """
    df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count)
    """
    return 
    """
    print('LDA Completed')
    return df_train, df_test
Example #14
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()

        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
Example #15
class ArchiveCorpus(corpora.TextCorpus):

    def __init__(self, datafile, preprocess=[], dictionary=None):
        self.datafile = datafile
        self.preprocess = preprocess
        self.metadata = None

        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if datafile is not None:
                self.dictionary.add_documents(self.get_texts())
                self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)

    def get_texts(self):
        with utils.smart_open(self.datafile) as inputfile:
            for line in inputfile:
                for f in self.preprocess:
                    line = f(line)
                text = list(utils.tokenize(line, deacc=True, lowercase=True))
                yield text
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--save_interval", type=int, default=100)
    args = parser.parse_args()

    if args.update:
        common_dict = Dictionary.load_from_text("./common_dict.txt")
    else:
        common_dict = Dictionary()
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue

        word_list = doc2word_list(text)
        common_dict.add_documents([word_list])

        if i % args.save_interval == args.save_interval - 1:
            common_dict.save_as_text("./common_dict.txt")

    common_dict.save_as_text("./common_dict.txt")
Example #17
def construct_test(tagger):
    f = open(cfg.PATH_TO_VGR_domain_text2)
    g = open(cfg.PATH_TO_X_TEST, 'w')
    line = f.readline()
    word_dic = Dictionary()
    char_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    while line:
        sentence = _tokenize(line, tagger)
        g.write(" ".join(sentence) + "\n")
        word_dic.add_documents([sentence])
        char_dic.add_documents([list(line)])
        line = f.readline()
    f.close()
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues())
Example #18
class Vocab():
    def __init__(self):
        self.dic = Dictionary()
        self.dic.add_documents([[u'<UNK>']])

    def construct(self, input_file):
        f = codecs.open(input_file, 'r', 'utf-8')
        sentences = []
        for line in f:
            line = line.strip().split()
            sentences.append(line)
        self.dic.add_documents(sentences)
        f.close()
        self.dic.id2token = {v: k for k, v in self.dic.token2id.items()}

    def word2id(self, input_file, output_file):
        f = codecs.open(input_file, 'r', 'utf-8')
        g = open(output_file, 'w')
        for line in f:
            line = line.strip().split()
            line = map(lambda x: str(self.dic.token2id[x]), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()

    def id2word(self, input_file, output_file):
        f = open(input_file, 'r')
        g = codecs.open(output_file, 'w', 'utf-8')
        for line in f:
            line = line.strip().split()
            line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()
Example #19
    def buildDict(self):
        batchiter = BatchIterBert(self.trainDataIter,
                                  filling_last_batch=False,
                                  postProcessor=xonlyBatchProcessor,
                                  batch_size=1)
        common_dictionary = Dictionary(batchiter)
        print(len(common_dictionary))
        if self.testReaderargs:
            print('update vocab from test set')
            batchiter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            common_dictionary.add_documents(batchiter)
            print(len(common_dictionary))

        common_dictionary.filter_extremes(no_below=self.dict_no_below,
                                          no_above=self.dict_no_above,
                                          keep_n=self.dict_keep_n)
        self.dictProcess = DictionaryProcess(common_dictionary)
        self.postProcessor.dictProcess = self.dictProcess
        self.vocab_dim = len(self.dictProcess)
        self.have_dict = True

        if 1:
            count_list = []
            self.trainDataIter._reset_iter()
            batchiter = BatchIterBert(self.trainDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            for item in batchiter:
                current_count = sum(item)
                count_list.append(current_count)
                #print(current_count)
            print(sum(count_list) / len(count_list))
Example #20
def create_vocab(tweets):
    print("Building vocabulary...")
    vocab = Dictionary()
    vocab.add_documents(tweets)
    vocab.save('vocab_sentiment')
    return vocab
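The saved vocabulary can be reloaded and applied to new tweets; a short sketch assuming the 'vocab_sentiment' file produced above:

from gensim.corpora import Dictionary

vocab = Dictionary.load('vocab_sentiment')
tweet = ["great", "movie", "tonight"]
print(vocab.doc2bow(tweet))  # tokens missing from the vocabulary are silently dropped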
Example #21
                    help='File name to give the dictionary upon saving')

args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)
# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[
        tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha
    ] for doc in tokenizer.pipe(
        [item for item in chunk if language(item) == 'en'],
        batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
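The chunks helper used above is not shown; one possible implementation that lazily slices an iterator into fixed-size lists (purely an assumption about its behavior):

from itertools import islice

def chunks(iterable, size):
    it = iter(iterable)
    while True:
        block = list(islice(it, size))
        if not block:
            return
        yield block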
Example #22
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)

# Also we added the START and the END symbol to the sentences.
english_sents = [START] + df['English'].apply(
    str.lower).apply(word_tokenize) + [END]
indo_sents = [START] + df['Indonesian'].apply(
    str.lower).apply(word_tokenize) + [END]

# We're sort of getting the data into the shape we want.
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First English sentence:', english_sents[0])
print('First Indo sentence:', indo_sents[0])

english_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
english_vocab.add_documents(english_sents)

indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
indo_vocab.add_documents(indo_sents)

# First ten words in the vocabulary.
print('First 10 Indonesian words in Dictionary:\n',
      sorted(indo_vocab.items())[:10])
print()
print('First 10 English words in Dictionary:\n',
      sorted(english_vocab.items())[:10])

english_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
english_vocab.add_documents(english_sents)

indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
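With the vocabularies built, sentences can be encoded as id sequences and decoded back. A short sketch using doc2idx and mapping unknown words to the id of 'UNK' (this step is not part of the original snippet):

unk_id = english_vocab.token2id['UNK']
sentence = ['<s>', 'i', 'like', 'nasi', 'goreng', '</s>']
encoded = english_vocab.doc2idx(sentence, unknown_word_index=unk_id)
decoded = [english_vocab[i] for i in encoded]
print(encoded)
print(decoded)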
Example #23
class MultiVectorizer():

    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, glove_path=None, tokenizer=None, embedding_size=300):

        self.mi_occur = min_occur
        self.embedding_size = embedding_size

        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer

        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True

        if reserved is not None:
            # list.extend() returns None, so extend in place first, then build the Dictionary
            self.reserved.extend(reserved)
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        return len(self.vocabulary.token2id.items())

    def load_glove(self, glove_file_path):
        f = open(glove_file_path, encoding="utf-8")
        for line in tqdm(f):
            value = line.split(" ")
            word = value[0]
            coef = np.array(value[1:], dtype='float32')
            self.embedding_word_vector[word] = coef
        f.close()

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        if self.embedding_word_vector.get(string_value):
            return True

    def get_vocabulary(self):
        return self.vocabulary

    def get_word_id(self, word):
        return self.vocabulary.token2id[word]

    def get_word_from_id(self, index):
        return self.vocabulary.id2token[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                    self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_samples_with_sentences(self, samples):
        output_tokens = []
        for sample in samples:
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
                self.vocabulary.add_documents(sentence_tokens)
            output_tokens.append(sentence_tokens)
        return output_tokens

    def fit(self, X):
        if type(X[0]) == list:
            x_tokens = self.fit_samples_with_sentences(X) #self.fit_document(X)
        else:
            x_tokens = self.fit_text(X)

        self.vocabulary.filter_extremes(no_below=self.mi_occur, no_above=1.0, keep_tokens=self.reserved)

        if self.glove:
            print("Vocabulary Size:",self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd =  1/np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get("<UNK>")
                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
        return  self.transform(x_tokens)

    def fit_text(self, X):
        x_tokens = []
        for x in X:
            if x is not None:
                # x_tokens.append(word_tokenize(x.lower()))
                tokens = self.tokenizer(x.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                x_tokens.append(word_str_tokens)
                self.vocabulary.add_documents(x_tokens)
        return x_tokens

    def transform(self, X):
        return self.transform_list_of_list(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_section(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_section(section_tokens)
                else:
                    encoded_tokens = self.transform_section(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_section(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X]

    def inverse_transform(self, X):
        return [[ self.vocabulary[i] for i in x ] for x in X]

    def save(self, file_path="./vecorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
Example #24
def construct_vocab():
    f = open(cfg.PATH_TO_ENG_Y_TRAIN)
    g = open(cfg.PATH_TO_ENG_Y_TEST)
    word_dic = Dictionary()
    char_dic = Dictionary()
    target_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    target_dic.add_documents([["UNK", "BOW"]])

    line = f.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        target_dic.add_documents([sentence])
        target_dic.add_documents([get_chars(line)])
        line = f.readline()
    f.close()

    line = g.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        line = g.readline()
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues()), list(
        target_dic.itervalues())
Example #25
class MultiVectorizer():

    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, use_bert=False, glove_path=None, tokenizer=None, embedding_size=300):

        self.mi_occur = min_occur
        self.embedding_size = embedding_size
        self.use_bert = use_bert

        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer

        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True

        if reserved is not None:
            # list.extend() returns None, so extend in place first, then build the Dictionary
            self.reserved.extend(reserved)
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        if not self.use_bert:
            return len(self.vocabulary.token2id.items())
        else:
            return len(self.tokenizer.vocab.keys())

    def load_glove(self, glove_file_path):
        f = open(glove_file_path, encoding="utf-8")
        for line in tqdm(f):
            value = line.split(" ")
            word = value[0]
            coef = np.array(value[1:], dtype='float32')
            self.embedding_word_vector[word] = coef
        f.close()

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        if self.embedding_word_vector.get(string_value):
            return True

    def get_vocabulary(self):
        if not self.use_bert:
            return self.vocabulary
        else:
            return self.tokenizer.vocab

    def get_word_id(self, word):
        if not self.use_bert:
            return self.vocabulary.token2id[word]
        else:
            return self.tokenizer.vocab[word]


    def get_word_from_id(self, index):
        if not self.use_bert:
            return self.vocabulary.id2token[index]
        else:
            return self.tokenizer.inv_vocab[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                    self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_bert_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        stop_words = set(stopwords.words('english'))
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer.tokenize(sentence.lower())
                tokens = [w for w in tokens if not w in stop_words]
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sentence_tokens.append(tokens)
                vocab.append(tokens)
            output_tokens.append(sentence_tokens)
        #self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit_samples_with_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                if remove_stop_words:
                    tokens = [token for token in tokens if not token.is_stop]
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
                vocab.append(word_str_tokens)
            output_tokens.append(sentence_tokens)
        self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit(self, X, remove_stop_words=True, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                x_tokens = self.fit_samples_with_sentences(X,remove_stop_words=remove_stop_words) #self.fit_document(X)
            else:
                x_tokens = self.fit_bert_sentences(X, remove_stop_words=remove_stop_words)
        else:
            x_tokens = self.fit_text(X)

        self.vocabulary.filter_extremes(no_below=self.mi_occur, no_above=1.0, keep_tokens=self.reserved)
        unknown_words = []
        if self.glove:
            #spell = Spellchecker()
            print("Vocabulary Size:",self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd =  1/np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get(self.correct_word(word))
                        if embedding_value is None:
                            unknown_words.append(word)
                            embedding_value = self.embedding_word_vector.get("<UNK>")

                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
        print("Number of unknown words:",len(unknown_words))
        unknown_words_df = pd.DataFrame()
        unknown_words_df["Unknown Words"] = unknown_words
        encoded_tokens = self.transform(x_tokens, list_of_lists=list_of_lists)
        return  encoded_tokens

    def fit_text(self, X, remove_stop_words=True):
        output_tokens = []
        for sample in tqdm(X):
            tokens = self.tokenizer(sample.lower())
            if remove_stop_words:
                tokens = [token for token in tokens if not token.is_stop]
            word_str_tokens = list(map(convert_to_string, tokens))
            output_tokens.append(word_str_tokens)
        self.vocabulary.add_documents(output_tokens)
        return output_tokens

    def correct_word(self, word):
        return word

    def transform(self, X, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                return self.transform_list_of_list(X)
            else:
                return self.transform_bert(X)
        else:
            return self.transform_text(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_text(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_text(section_tokens)
                else:
                    encoded_tokens = self.transform_text(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_bert(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_sentences = []
            for sentence_tokens in sample:
                encoded_tokens = self.tokenizer.convert_tokens_to_ids(sentence_tokens)
                encoded_sentences.append(encoded_tokens)
            samples_tokens.append(encoded_sentences)
        return samples_tokens

    def transform_text(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X]

    def inverse_transform(self, X):
        return [[ self.vocabulary[i] for i in x ] for x in X]

    def save(self, file_path="./vecorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
Example #26
class Vocab():
    def __init__(self):
        self.dic = Dictionary()
        self.dic.add_documents([[u'<UNK>', u',']])

    def construct(self, input_file):
        f = codecs.open(input_file, 'r', 'utf-8')
        sentences = []
        for line in f:
            line = line.strip().split()
            sentences.append(line)
        self.dic.add_documents(sentences)
        f.close()
        self.dic.id2token = {v: k for k, v in self.dic.token2id.items()}

    def load_cond(self, input_file, cond_length, unk):
        """Get a list of unique conditions"""
        f = codecs.open(input_file, 'r', 'utf-8')
        conditions = []
        lines = f.readlines()
        if lines[-1].strip() == '':
            print("deleted the last element:", lines[-1])
            lines = lines[:-1]
        lines = list(set(lines))
        for line in lines:
            line = map(int, line.strip().split())
            line = padding(line, cond_length, unk)
            if not line in conditions:
                conditions.append(line)
        self.cond = np.array(conditions)
        self.n_cond = len(conditions)

    def choice_cond(self, num):
        return self.cond[np.random.choice(len(self.cond), num)]

    def word2id(self, input_file, output_file):
        def get_id(dic, key):
            if key in dic:
                return str(dic[key])
            else:
                ret = []
                key = list(key)
                for k in key:
                    ret.append(str(dic.get(k, 0)))
                return u" ".join(ret)

        f = codecs.open(input_file, 'r', 'utf-8')
        g = open(output_file, 'w')
        for line in f:
            line = line.strip().split()
            line = map(lambda x: get_id(self.dic.token2id, x), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()

    def id2word(self, input_file, output_file):
        f = open(input_file, 'r')
        g = codecs.open(output_file, 'w', 'utf-8')
        for line in f:
            line = line.strip().split()
            line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()
Example #27
def train_LDA(base_path,
              table_paths,
              batch_size,
              limit,
              use_dictionary=False,
              **kwargs):

    model_name = dic2name(kwargs)
    print("Model: ", model_name)
    topic_num = kwargs['tn']

    # Pass 1 get the dictionary
    if use_dictionary == 'True':
        dic = Dictionary.load(
            join(LDA_CACHE, 'dictionary_{}'.format(model_name)))
    else:

        dic = Dictionary([])
        b = 0
        for corpus in corpus_iter(base_path, table_paths, batch_size, limit,
                                  **kwargs):
            dic.add_documents(corpus)
            print('Dictionary batch {}: current dic size {}'.format(
                b, len(dic)))
            b += 1

        # save dictionary
        dic.save(join(LDA_CACHE, 'dictionary_{}'.format(model_name)))

    print("Dictionary size", len(dic))

    # Pass 2 train LDA
    whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit,
                               **kwargs)
    first_batch = next(whole_corpus)
    first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch]
    #print(first_bow)

    lda = LdaModel(first_bow,
                   id2word=dic,
                   num_topics=topic_num,
                   minimum_probability=0.0)
    batch_no = 0
    print('LDA update batch {}'.format(batch_no))

    for batch in whole_corpus:
        batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch]
        #print(corpus_bow)
        lda.update(batch_bow)
        batch_no += 1
        print('LDA update batch {}'.format(batch_no))

    # Save model to disk.
    temp_file = join(LDA_CACHE, "model_{}".format(model_name))
    lda.save(temp_file)

    print(
        "Training from {} done. Batch_size: {}, long str tokenization threshold: {}, numerical representations: {}.\
          \nTotal size of dictionary: {}".format(table_paths, batch_size,
                                                 kwargs['thr'], kwargs['num'],
                                                 len(dic)))
    return
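A matching inference step would reload the cached dictionary and model; a minimal sketch reusing the LDA_CACHE/model_name conventions from the function above:

from os.path import join
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def load_LDA(model_name):
    dic = Dictionary.load(join(LDA_CACHE, 'dictionary_{}'.format(model_name)))
    lda = LdaModel.load(join(LDA_CACHE, 'model_{}'.format(model_name)))
    return dic, lda

# Usage (assuming the same kwargs passed to train_LDA):
# dic, lda = load_LDA(dic2name(kwargs))
# topics = lda.get_document_topics(dic.doc2bow(tokenized_text))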
Example #28
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You must only provide a correct `get_texts`
    implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")


    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)


    def getstream(self):
        return getstream(self.input)


    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length


    def __len__(self):
        return self.length # will throw if corpus not initialized
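In practice this base class is meant to be subclassed with a custom get_texts; a minimal sketch (assuming one whitespace-tokenized document per line in self.input):

class LineCorpus(TextCorpus):
    def get_texts(self):
        length = 0
        with open(self.input) as f:
            for line in f:
                length += 1
                yield line.lower().split()
        self.length = length

# corpus = LineCorpus('docs.txt')   # builds corpus.dictionary on construction
# bow_vectors = list(corpus)        # sparse (token_id, count) vectors, one per document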
Example #29
File: lda.py  Project: freygit/36
class LDA(object):

    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        lda模型训练初始化。
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数,一般为core数量减一
            pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        保存训练的模型,同时保存对应的词典
        Args:
            model_file -- 模型文件
            dictionary_file -- 词典文件
        Returns:
            无
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus = [[]]):
        """
        在线更新,在已有模型的基础上在线更新
        Args:
            corpus -- 用于更新的文档列表
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document = []):
        """
        对新文档推断其话题分布
        Args:
            document -- 文档,其实是词列表
        Returns:
            话题分布列表        
        """
        if self._model:
            doc =  [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
Example #30
    else:
        logging.info('no calculated files found, recomputing...')
        logging.info('loading files...')
        with open(data_dark_file, 'r') as f1, open(data_clean_file, 'r') as f2:
            logging.info('loading dark text...')
            dark_text = [line.split() for line in f1.readlines()]
            logging.info('loading clean text...')
            clean_text = [line.split() for line in f2.readlines()]
            logging.info('load file done')

        if os.path.exists(dict_file):
            dictionary = Dictionary.load(dict_file)
        else:
            logging.info('creating the dictionary...')
            dictionary = Dictionary(dark_text)
            dictionary.add_documents(clean_text)
            dictionary.save(dict_file)

        dictionary = filter_dict(args.vocab_size, dictionary,
                                 get_keep_tokens(dictionary))
        logging.info('dictionary created')

        logging.info('building neighbor unigrams...')

        if os.path.exists(file_unigram_dark) and os.path.exists(
                file_unigram_dark_all):
            unigram_dark = np.load(file_unigram_dark)
            unigram_dark_all = np.load(file_unigram_dark_all)
        else:
            unigram_dark, unigram_dark_all = get_neighbor_unigram(
                dictionary, dark_text, args.num_neighbors)
Example #31
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim_package corpus) from plain text.

    This is an abstract base class: override the `get_texts()` and `__len__()`
    methods to match your particular input.

    Given a filename (or a file-like object) in constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You must only provide a correct `get_texts`
    implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield self.dictionary.doc2bow(text[0],
                                              allow_update=False), text[1]
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno, )
                else:
                    yield utils.tokenize(line, lowercase=True)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length
Example #32
class TrainModel(BaseModel):
    _TRAIN_EXTENSION = 'csv'
    _PKL_EXT = 'pkl'

    def __init__(self, train_path, pkl_path):
        super(TrainModel, self).__init__()

        self._validate_path(train_path, self._TRAIN_EXTENSION)
        self._validate_path(pkl_path, self._PKL_EXT)

        self._train_path = train_path
        self._pkl_path = pkl_path

        self.dictionary = Dictionary()
        self.weights = None

    def __call__(self, *args, **kwargs):
        if self._pkl_path is not None:
            pass
        pass

    def train(self):
        """Pkl path returned empty, so we need to bring in the contents from the train path, and create a new model
        to be saved off

        :return:
        """
        print("Reading in training set")
        train_set = pd.read_csv(self._train_path)

        print("Tokenizing training set")
        train_tokenized = self.get_tokens(train_set)

        tokens_weights = defaultdict(dict)

        # add the tokens to the dictionary to keep track if what words we have found
        for cat, tokens in train_tokenized:
            print(f"\nBeginning training process for {cat}")
            print("Adding tokens to dictionary")
            self.dictionary.add_documents(tokens.values())

            print("Determining weights.")
            tokens_weights[cat]['weights'] = self._create_training_weights(tokens=tokens.values())

            print("Finding intersections.")
            tokens_weights[cat]['intersections'] = self._get_intersections(tokens.values())

            print(f"Intersections for {cat}: {tokens_weights[cat]['intersections']}")

        self.weights = tokens_weights

        self._write(self)

        print("Successfully trained model.")

    def get_tokens(self, train_set):
        return (
            (cat, self._tokenize_doc(self._get_training_paragraphs(train_set, cat)))
            for cat in self._get_train_categories(train_set)
        )

    def read(self):
        """Read from the pkl_path. If there are contents, return it. No training needed.

        :return:
        """
        try:
            with open(self._pkl_path, "rb") as f:
                pkl = pickle.load(f)
        except IOError:  # File doesn't exist
            return None
        else:
            return pkl

    def _validate_path(self, path, ext):
        if not path.split('.')[-1] == ext:
            raise IncorrectExtensionError(f"{self._pkl_path} does not have correct extension. Expected {ext}")

    def _write(self, train):
        with open(self._pkl_path, "wb") as f:
            pickle.dump(train, f)

    @staticmethod
    def _get_training_paragraphs(train_set, category):
        """Iterates over df, and returns all paragraphs where data_key_friendly_name matches self._category

        :param train_set: pd.DataFrame()
        :return: list()
        """
        return (
            row.paragraph_text for _, row in train_set[train_set.data_key_friendly_name == category].iterrows()
        )

    @staticmethod
    def _get_train_categories(train_set):
        if 'data_key_friendly_name' not in train_set.columns:
            raise UnexpectedColumnError(
                f"'data_key_friendly_name' not found in columns, got {train_set.columns} instead")

        return (
            cat for cat in train_set.data_key_friendly_name.unique().tolist()
            if cat != 'Unknown Share Repurchase Data'
        )

    @staticmethod
    def _get_intersections(tokens):
        """Find all unique processed words from the training paragraphs.

        :param tokens: list of lists
        :return:
        """
        intersections = set()
        for token in tokens:
            if not intersections:
                intersections = set(token)
            else:
                intersections = intersections.intersection(set(token))

        return list(intersections)

    @staticmethod
    def _create_training_weights(tokens):
        """Creates a list of normalized weights to apply to the tf-idf model once it has been determined

        :param tokens: list of lists
        :return: dict() {word: weight}
            0 <= word <= 1
        """
        # create count of all words in training set
        counts = Counter()
        for doc in tokens:
            for word in doc:
                counts[word] += 1

        # Grab the min and max counts in the set
        min_counts = counts.most_common()[-1]
        max_counts = counts.most_common(1)[0]

        min_max = min_counts[1], max_counts[1]
        diff = min_max[1] - min_max[0]

        return {word: ((count - min_max[0]) / diff) for word, count in counts.items()}
Example #33
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input.
    The methods provided by gensim TextCorpus are needed for the GenSim training.
    Any corpus provided to DocumentSimilarity should provide the methods given in this class.
    """
    def __init__(self, input=None,create_dictionary=True):
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())


    def __iter__(self):
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        if isinstance(self.input,list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    line = line.rstrip()
                    j = json.loads(line)
                    yield j

    def get_texts(self, raw=False):
        """
        yield raw text or tokenized text
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        yield a JSON object with metadata for each document. It must contain:
        id - id for this document
        optional title and tags. Tags will be used as ground truth when scoring document similarity results.
        """
        doc_id = 0
        for j in self.get_json():
            m = copy.deepcopy(j)
            m['id'] = int(m['id'])  # 'long' existed only in Python 2
            m['corpus_seq_id'] = doc_id
            doc_id += 1
            yield m

    def get_dictionary(self):
        return self.dictionary
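A minimal usage sketch for the corpus above, assuming gensim is installed; the JSON documents and the choice of TfidfModel are illustrative only:

from gensim.models import TfidfModel

docs = [{"id": 1, "text": "stock buyback announced", "tags": ["finance"]},
        {"id": 2, "text": "new board members elected", "tags": ["governance"]}]

corpus = DefaultJsonCorpus(docs)      # builds the dictionary from the tokenized texts
bow_stream = list(corpus)             # __iter__ yields bag-of-words vectors
tfidf = TfidfModel(bow_stream, id2word=corpus.get_dictionary())
print(tfidf[bow_stream[0]])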
class SublexicalizedCorpus(TextCorpus):
    def __init__(self,
                 base_corpus,
                 order=3,
                 word_limit=None,
                 clean_func=mahoney_clean,
                 create_dictionary=True,
                 n_proc=1):
        self.order = order

        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order)
                 for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.perf_counter()

                if cur - prev > 60:
                    logging.info("Sublexicalized %d tokens in %d seconds, %.0f t/s" %
                                 (t_count, cur - start, t_count * 1. /
                                  (cur - start)))

                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.perf_counter()
        logging.info("Sublexicalizing finished: %d tokens in %d seconds, %.0f t/s" %
                     (t_count, end - start, t_count * 1. / (end - start)))

        self.length = t_count
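The process and mahoney_clean helpers are not shown in this example. A plausible sketch, assuming process maps the (text, clean_func, order) tuples built in get_texts() to character n-gram tokens; the padding scheme and cleaning are guesses:

def char_ngrams(token, order=3):
    # pad the token so word boundaries show up in the n-grams (assumed convention)
    padded = "^" + token + "$"
    return [padded[i:i + order] for i in range(len(padded) - order + 1)]

def process(args):
    text, clean_func, order = args
    tokens = clean_func(text).split()
    return [ngram for token in tokens for ngram in char_ngrams(token, order)]

print(char_ngrams("model"))  # ['^mo', 'mod', 'ode', 'del', 'el$']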
Пример #35
0
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input.
    The methods provided by gensim TextCorpus are needed for the GenSim training.
    Any corpus provided to DocumentSimilarity should provide the methods given in this class.
    """
    def __init__(self, input=None):
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        if isinstance(self.input, list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    line = line.rstrip()
                    j = json.loads(line)
                    yield j

    def get_texts(self, raw=False):
        """
        yield raw text or tokenized text
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        yield a JSON object with metadata for each document. It must contain:
        id - id for this document
        optional title and tags. Tags will be used as ground truth when scoring document similarity results.
        """
        doc_id = 0
        for j in self.get_json():
            m = copy.deepcopy(j)
            m['id'] = int(m['id'])  # 'long' existed only in Python 2
            m['corpus_seq_id'] = doc_id
            doc_id += 1
            yield m

    def get_dictionary(self):
        return self.dictionary
Пример #36
0
class LDA(object):
    def __init__(self,
                 topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        lda模型训练初始化。
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数,一般为core数量减一
            pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        保存训练的模型,同时保存对应的词典
        Args:
            model_file -- 模型文件
            dictionary_file -- 词典文件
        Returns:
            无
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=[[]]):
        """
        在线更新,在已有模型的基础上在线更新
        Args:
            corpus -- 用于更新的文档列表
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model.update(new_corpus_data)

    def inference(self, document=[]):
        """
        对新文档推断其话题分布
        Args:
            document -- 文档,其实是词列表
        Returns:
            话题分布列表        
        """
        if self._model:
            # pass a single bag-of-words (not a one-element corpus) so a plain topic list is returned
            doc = self._common_dictionary.doc2bow(document)
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
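A self-contained sketch of the update/inference cycle above using an in-memory toy corpus; the file names are hypothetical and no pretrained model is involved:

lda = LDA(topics=2, worker=1)
corpus = [['stock', 'buyback', 'shares'],
          ['rain', 'weather', 'forecast'],
          ['shares', 'dividend', 'stock']]
lda.update(corpus)                          # first call trains a fresh model and dictionary
print(lda.inference(['stock', 'shares']))   # [(topic_id, probability), ...]
lda.save('lda.model', 'lda.dict')           # hypothetical output paths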
Пример #37
0
class Similarities(object):
    """
    Class for text similarities stuff
    """

    def __init__(self, mongo_conn_rec, stopwords=None):
        self._stopwords = set(stopwords) if stopwords is not None else set()
        self._mongo_connection_record = mongo_conn_rec
        self._lsi_mapping = dict()
        self._sim_index = None
        self._dictionary = None
        self._lsimodel = None

        self._run_transformers()

    @staticmethod
    def logger():
        """
        Language module's specific logger instance. Use this to log inside the similarity code.
        :return: Returns a logging.Logger('openews.language') instance.
        """
        return logging.getLogger('openews.language')

    @property
    def considerable_doc_property(self):
        """
        The document property to use for training. This is the actual data we take from the MongoDB documents to
        parse and train on.
        :return: str
        """
        return 'title'

    @property
    def dictionary_file(self):
        """
        The filename to use when serializing gensim.corpora.dictionary.Dictionary to disk.
        :return: str
        """
        return "openews.processors.dict"

    @property
    def dictionary(self):
        """
        The used Dictionary.
        :return: gensim.corpora.dictionary.Dictionary
        """
        return self._dictionary

    @property
    def lsi_model(self):
        """
        The used LSI model.
        :return: gensim.models.lsimodel.LsiModel
        """
        return self._lsimodel

    @property
    def similarity_index(self):
        """
        The similarity index instance
        :return: gensim.similarities.docsim.MatrixSimilarity
        """
        return self._sim_index

    @property
    def similarity_threshold(self):
        """
        The similarity threshold.
        Anything at or above this value is considered a similar document.
        :return: float
        """
        return server_app.config['SIMILARITY_THRESHOLD']

    @property
    def lsi_index_mapping(self):
        """
        A mapping between the LSI model index (key) and a (collection the document belongs to, document) tuple (value).
        :return: dict
        """
        return self._lsi_mapping

    @staticmethod
    def _create_resource_path(resource_file):
        """
        Creates an absolute path to resource_file inside the system's temp directory.
        :param resource_file: str
        :return: str
        """
        return os.path.join(tempfile.gettempdir(), resource_file)

    def _resource_exists(self, resource_file):
        """
        Checks if resource_file exists in the given system's temp directory.
        :param resource_file: str
        :return: bool
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()

        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and
        similarity_index object properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))

        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict keyed by LSI index; each value is a list of
        (LSI model index, similarity score - numpy.float32) tuples.
        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return similarities

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
            sentence = tp[1][self.considerable_doc_property]
            bow = self.sentence_to_bow(sentence)
            latent_space_vector = self.lsi_model[bow]
            sim_vector = self.similarity_index[latent_space_vector]
            sorted_mapped_vector = list(sorted(enumerate(sim_vector), key=itemgetter(1)))
            for sit in [v for v in sorted_mapped_vector if
                        v[0] != idx and v[1] >= self.similarity_threshold and tp[0].name !=
                                self.lsi_index_mapping[v[0]][0].name]:
                if sit[0] not in similarities:
                    similarities[idx].append(sit)

        for s in similarities.items():
            main_sentence = self.lsi_index_mapping[s[0]][1][self.considerable_doc_property]
            print("[%s] %s:" % (self.lsi_index_mapping[s[0]][0].name, main_sentence))
            for sm in s[1]:
                print("\t[%f][%s]: %s" % (sm[1], self._lsi_mapping[sm[0]][0].name,
                                          self.lsi_index_mapping[sm[0]][1][self.considerable_doc_property]))
        return similarities

    def store_similarities(self, update=False):
        """
        Stores the similarities to the database
        :param update: True to update existing, False to delete and add new items
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            pass

    def tokenize_sentence(self, sentence):
        """
        Tokenize a sentence: lowercase the words and drop stopwords and punctuation.
        :param sentence: str
        :return: a list
        """
        excluded = set(chain(self._stopwords, string.punctuation))
        return [w.lower() for w in word_tokenize(sentence) if w.lower() not in excluded]

    def sentence_to_bow(self, sentence):
        """
        Transforms a string sentence to a VSM bag-of-words representation.
        :param sentence: str
        :return: list of tuples
        """
        return self.dictionary.doc2bow(self.tokenize_sentence(sentence))
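A minimal, Mongo-free sketch of the dictionary / LSI / similarity-index flow used by the class above; the sentences, tokenization and query are illustrative:

from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

sentences = ["markets rally on buyback news",
             "central bank raises interest rates",
             "share buyback lifts the markets"]
tokenized = [s.lower().split() for s in sentences]

dictionary = Dictionary(tokenized)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

lsi = LsiModel(bow_corpus, id2word=dictionary)
index = MatrixSimilarity(lsi[bow_corpus])

query_vec = lsi[dictionary.doc2bow(tokenized[0])]
for doc_idx, score in enumerate(index[query_vec]):
    print(doc_idx, float(score))  # higher score = more similar in latent space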
Пример #38
0
class Corpus(object):
    def __init__(self, path, save_data, max_len=16):
        self.train = os.path.join(path, "train")
        self.valid = os.path.join(path, "valid")
        self._save_data = save_data
        self.train_sents = []
        self.train_labels = []
        self.valid_sents = []
        self.valid_labels = []
        self.max_len = max_len
        self.dict = Dictionary()
        self.l = Labels()

    def parse_data_from_file(self, _file, is_train=True):

        _sents, _labels = [], []
        for sentence in open(_file):
            label, _, _words = sentence.replace('\xf0', ' ').partition(
                ' ')  # expected line format: "label document"; adapt this block for other formats
            label = label.split(":")[0]

            words = _words.lower().strip().split()

            if len(words) > self.max_len:
                words = words[:self.max_len]

            _sents += [words]
            _labels += [label]
        if is_train:
            self.train_sents.extend(_sents)
            self.train_labels.extend(_labels)
            self.l(self.train_labels)
            self.build_dict(self.train_sents)
        else:
            self.valid_sents.extend(_sents)
            self.valid_labels.extend(_labels)

    def parse_data_from_dir(self,
                            dirs,
                            is_train=True,
                            lines_are_documents=True):
        _sents, _labels = [], []
        dirs = os.path.expanduser(dirs)
        for label in sorted(os.listdir(dirs)):
            d = os.path.join(dirs, label)
            if not os.path.isdir(d):
                continue

            for root, _, fnames in sorted(os.walk(d)):
                for fname in sorted(fnames):
                    path = os.path.join(root, fname)
                    with open(path, 'rt') as f:
                        if lines_are_documents:
                            for line in f:
                                _sents += [line.lower().strip().split()]
                                _labels += [label]
                        else:
                            _sents += [f.read().strip().split()]
                            _labels += [label]
        if is_train:
            self.train_sents.extend(_sents)
            self.train_labels.extend(_labels)
            self.l(self.train_labels)
            self.build_dict(self.train_sents)
        else:
            self.valid_sents.extend(_sents)
            self.valid_labels.extend(_labels)

    def build_dict(self, _sents):
        self.dict.add_documents(_sents)

    def build_vocab(self):
        # shift every token id up by one so index 0 stays free (e.g. for padding)
        for key in self.dict.token2id.keys():
            self.dict.token2id[key] += 1

    def save(self):
        self.parse_data_from_file(self.train, is_train=True)
        self.parse_data_from_file(self.valid, is_train=False)
        #self.parse_data_from_dir("./data/corpus/data",lines_are_documents=False)
        self.build_vocab()
        data = {
            'max_len': self.max_len,
            'dict': {
                'train': self.dict.token2id,
                'vocab_size': len(self.dict),
                'label': self.l.word2idx,
                'label_size': len(self.l),
            },
            'train': {
                'doc': word2idx(self.train_sents, self.dict.token2id),
                'label': [self.l.word2idx[l] for l in self.train_labels]
            },
            'valid': {
                'doc': word2idx(self.valid_sents, self.dict.token2id),
                'label': [self.l.word2idx[l] for l in self.valid_labels]
            }
        }

        torch.save(data, self._save_data)
        print('Finish dumping the data to file - [{}]'.format(self._save_data))
        print('words length - [{}]'.format(len(self.dict)))
        print('label size - [{}]'.format(len(self.l)))
        print('train_src length - [{}]'.format(len(data['train']['doc'])))
        print('valid_src length - [{}]'.format(len(data['valid']['doc'])))
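A minimal usage sketch, assuming "train" and "valid" files in the expected "label document" line format exist under ./data; all paths are illustrative:

import torch

corpus = Corpus(path="./data", save_data="./data/corpus.pt", max_len=16)
corpus.save()  # parses both splits, builds the vocabulary and dumps a torch archive

data = torch.load("./data/corpus.pt")
print(data['dict']['vocab_size'], data['dict']['label_size'])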
Пример #39
0
def topic_model(df_train, df_test, topic_count=10, cached=True):

    lda_train_save_file = '../data/lsa_train.csv'
    lda_test_save_file = '../data/lsa_test.csv'

    if os.path.exists(lda_train_save_file) and cached:
        return pd.read_csv(lda_train_save_file), pd.read_csv(lda_test_save_file)

    ### cleanup
    #parallel_proces(test_src,'../data/training_user_tweet_processed.csv')

    ## general remove text
    #df_train['tweet'] = df_train['tweet'].fillna("")
    #df_test['tweet'] = df_test['tweet'].fillna("")

    # df_train['tweet'] = df_train['tweet'].map(general_text_processing)
    # df_test['tweet'] = df_test['tweet'].map(general_text_processing)
    """
        Parallel tweet.
    """
    # df_test['tweet'] = parallelize(df_test, clean_tweet)
    # df_train['tweet'] = parallelize(df_train, clean_tweet)

    #df_train['tweet'] = df_train['tweet'].map(clean_tweet)
    #df_test['tweet'] = df_test['tweet'].map(clean_tweet)

    ## remove stop words
    # df_train['tweet'] = df_train['tweet'].map(remove_stop_words)
    # df_test['tweet'] = df_test['tweet'].map(remove_stop_words)

    ## gensim lda
    # dictionary = Dictionary()
    # for t in df_train.tweet.values.tolist():
    #     #print(t)
    #     dictionary.add_documents([t.split()])

    dictionary = Dictionary()
    for t in df_train.tweet.values.tolist():
        # print(t)
        dictionary.add_documents([t])
        # for  t in df_test['tweet'].values.tolist() :
        # print(t)
        # print(t[0].split())
        # print(dictionary.doc2bow(t.split()))

    train_doc2_corupus = [
        dictionary.doc2bow(text) for text in df_train['tweet'].values.tolist()
    ]

    # train_doc2_corupus = [dictionary.doc2bow(text.split()) for
    # text in df_train['tweet'].values.tolist()]
    # print(train_doc2_corupus)
    print("Started LDA")
    lda_model = LdaModel(train_doc2_corupus,
                         num_topics=topic_count,
                         iterations=30)
    print("Completed LDA")
    """
    fill topics
    """
    df_test = fill_lda_result_2(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result_2(df_train, lda_model, dictionary, topic_count)
    """ 
        Save the file
    """

    df_train.to_csv(lda_train_save_file, index=False)
    df_test.to_csv(lda_test_save_file, index=False)
    """
    return 
    """
    print('LDA Completed')
    return df_train, df_test
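fill_lda_result_2 is not defined in this example. A plausible sketch of what it might do, i.e. append one topic-probability column per topic; the column names and the assumption that df['tweet'] already holds token lists are guesses:

import numpy as np

def fill_lda_result_2(df, lda_model, dictionary, topic_count):
    topic_probs = np.zeros((len(df), topic_count))
    for i, tokens in enumerate(df['tweet'].values.tolist()):
        bow = dictionary.doc2bow(tokens)
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            topic_probs[i, topic_id] = prob
    for k in range(topic_count):
        df['topic_{}'.format(k)] = topic_probs[:, k]
    return df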