Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_loc = opts.wiki_dump
    limit = opts.limit
    n_procs = opts.num_procs
    out_fn = opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)

    nlp = spacy.en.English()
    vocab = Dictionary(([
        token.text.lower().strip() for token in doc if token.text.strip() != ""
    ] for doc in nlp.pipe((art['article.text'] for art in dump_gen),
                          n_threads=n_procs,
                          parse=False,
                          tag=False,
                          entity=False)))

    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
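
The two files written above can be reloaded with gensim's Dictionary.load (for the binary file written by save()) and Dictionary.load_from_text (for the text dump). A minimal sketch, not part of the original example, assuming the default --out value of 'vocab':

from gensim.corpora import Dictionary

vocab = Dictionary.load('vocab.vocab')               # binary file written by save()
vocab_txt = Dictionary.load_from_text('vocab.txt')   # plain-text dump written by save_as_text()
print('%d tokens in the reloaded vocabulary' % len(vocab))
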
Example #2

def initialize_lda():
    path = os.path.join("../data", "train.csv")
    dct = Dictionary(common_texts)
    corpus = [dct.doc2bow(text) for text in common_texts]

    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            row = dict(row)
            new_texts = [row['story'].split()]
            dct.add_documents(new_texts)
            corpus += [dct.doc2bow(text) for text in new_texts]
    lda = models.ldamodel.LdaModel(corpus, num_topics=50)
    lda.save(os.path.join("lda_model", "model"))
    dct.save_as_text(os.path.join("lda_model", "dictionary"))
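
A minimal sketch, not part of the original, of how the artifacts saved by initialize_lda() could be reloaded and used to score a new document; new_doc is a hypothetical input:

import os
from gensim import models
from gensim.corpora import Dictionary

lda = models.ldamodel.LdaModel.load(os.path.join("lda_model", "model"))
dct = Dictionary.load_from_text(os.path.join("lda_model", "dictionary"))

new_doc = "a new story to score".split()   # hypothetical input document
bow = dct.doc2bow(new_doc)
print(lda.get_document_topics(bow, minimum_probability=0.0))
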
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        default='./data/test_arxiv_plain.txt',
                        help='Path to directory where the data is stored')
    parser.add_argument('--model-dir',
                        default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train',
                        default=True,
                        type=lambda s: s.lower() == 'true',
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', default=20, type=int, help='Number of topics')
    args = parser.parse_args()
    # NOTE: the model and dictionary paths are hard-coded; --model-dir is unused
    model_dir = './model/model'
    dict_dir = './model/dict.txt'

    if args.train:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()

        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)

        print('Loading model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text() is a static method; it returns a new Dictionary
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            # print_topics() formats terms as quoted token ids (the model was
            # trained without id2word), so pull out every quoted field
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)
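
The report loop above recovers token ids by string-splitting the print_topics() output; LdaModel.get_topic_terms() returns (word_id, probability) pairs directly, so the same report could be written without parsing. A minimal alternative sketch, not part of the original:

# Alternative report loop using get_topic_terms(), which yields
# (word_id, probability) pairs instead of formatted strings.
with open('./report.txt', 'w') as f_out:
    for topic_id in range(lda.num_topics):
        term_ids = [word_id for word_id, _ in lda.get_topic_terms(topic_id, topn=10)]
        print(topic_id, [dictionary.get(word_id) for word_id in term_ids], file=f_out)
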
Example #4
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except FileNotFoundError:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except FileNotFoundError:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
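
get_lsi_features() returns a sparse mapping of topic id to weight; if the downstream classifier expects a fixed-length vector instead, gensim's matutils.sparse2full can densify the LSI output. A minimal sketch of a hypothetical variant that could sit alongside get_lsi_features():

from gensim import matutils

def get_dense_lsi_features(text):
    # Hypothetical dense variant of get_lsi_features: returns a numpy vector
    # of length num_topics instead of a sparse dict. Assumes `dictionary` and
    # `vector_model` from the example above are in scope.
    bow = dictionary.doc2bow(text)
    return matutils.sparse2full(vector_model[bow], vector_model.num_topics)
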
Example #6

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--save_interval", type=int, default=100)
    args = parser.parse_args()

    if args.update:
        common_dict = Dictionary.load_from_text("./common_dict.txt")
    else:
        common_dict = Dictionary()
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue

        word_list = doc2word_list(text)
        common_dict.add_documents([word_list])

        # checkpoint the dictionary every save_interval URLs
        if i % args.save_interval == args.save_interval - 1:
            common_dict.save_as_text("./common_dict.txt")

    common_dict.save_as_text("./common_dict.txt")
Example #7
def build_lda_model(stem):
    corpus = []
    ps = PorterStemmer()
    number_of_topics = 100

    # read in data from publications
    with open(get_lda_base(), 'r') as f:
        for line in f:
            if stem:
                stemmed = []

                for w in line.split():
                    s = ps.stem(w)
                    if len(s) > 1:
                        stemmed.append(s)

                corpus.append(stemmed)
            else:
                corpus.append(line.split())

    # build vocabulary and transform texts in vocab format
    dictionary = Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # do lda
    lda = ldamodel.LdaModel(corpus=corpus, num_topics=number_of_topics, passes=20, id2word=dictionary,
                            minimum_probability=0)

    if stem:
        temp_file = datapath('lda_model_stemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_stemmed')
    else:
        temp_file = datapath('lda_model_unstemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_unstemmed')

    lda.save(temp_file)
Example #8
class text_corpus(object):
    def __init__(self, tsv_path, n_examples=100000):
        print("Getting %s iterator..." % tsv_path)
        self.n_examples = n_examples
        self.document_path = tsv_path
        self.fin = open(self.document_path, 'rb')
        with open(tsv_path, 'rb') as f:
            self.instances = sum(1 for _ in f)
        # placeholder (empty) phrase detectors until get_phraser()/load_phraser() is called
        self.bigram = Phraser(Phrases())
        self.trigram = Phraser(Phrases())

    def __iter__(self):
        for i, doc in self.indexed_docs(self.n_examples):
            yield TaggedDocument(self.process(doc), [i])

    def process(self, text):
        return self.trigram[self.bigram[tokenize(text)]]

    def docs(self, n_examples=None):
        if n_examples is None:
            n_examples = self.n_examples
        for _, doc in self.indexed_docs(n_examples):
            yield self.process(doc)

    def reset_docs(self):
        self.fin.close()
        self.fin = open(self.document_path, 'rb')

    def indexed_docs(self, n_examples=-1):
        if n_examples == -1:
            with open(self.document_path, 'rb') as fin:
                for line in fin:
                    try:
                        i, doc = line.decode(
                            'utf-8', errors='replace').strip().split('\t')
                        yield i, doc
                    except ValueError:
                        # skip lines that do not split into an "id<TAB>text" pair
                        pass
        else:
            current_example = 0
            for line in self.fin:
                if (current_example < n_examples):
                    try:
                        i, doc = line.decode(
                            'utf-8', errors='replace').strip().split('\t')
                        current_example += 1
                        yield i, doc
                    except ValueError:
                        # skip malformed lines
                        pass
                else:
                    # PEP 479: raising StopIteration inside a generator is a
                    # RuntimeError on Python 3.7+, so end the generator with return
                    return

    def get_phraser(self, directory, sensitivity=3):

        if not os.path.isdir(directory):
            os.makedirs(directory)

        print("\t\tGetting bigram detector...")
        if not os.path.isfile(directory + '/bigrams.pkl'):
            self.bigram = Phraser(
                Phrases(self.docs(n_examples=-1),
                        min_count=2,
                        threshold=sensitivity,
                        max_vocab_size=2000000))
            self.bigram.save(directory + '/bigrams.pkl')
        else:
            self.bigram = Phraser.load(directory + '/bigrams.pkl')

        print("\t\tGetting trigram detector...")
        if not os.path.isfile(directory + '/trigrams.pkl'):
            self.trigram = Phraser(
                Phrases(self.bigram[self.docs(n_examples=-1)],
                        min_count=2,
                        threshold=sensitivity + 1,
                        max_vocab_size=2000000))
            self.trigram.save(directory + '/trigrams.pkl')
        else:
            self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def load_phraser(self, directory):
        print("\tLoading gram detector...")
        self.bigram = Phraser.load(directory + '/bigrams.pkl')
        self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def get_dictionary(self, directory, keep=100000):
        if not os.path.isdir(directory):
            os.makedirs(directory)
        if not os.path.isfile(directory + '/dictionary.dict'):
            print("\tBuilding dictionary...")
            self.dictionary = Dictionary(self.docs(n_examples=-1),
                                         prune_at=2000000)
            print("\tFiltering dictionary extremes...")
            self.dictionary.filter_extremes(no_below=3,
                                            no_above=0.5,
                                            keep_n=keep)
            print("\tSaving dictionary...")
            self.dictionary.save(directory + '/dictionary.dict')
            self.dictionary.save_as_text(directory + '/word_list.tsv')
        else:
            self.load_dictionary(directory)

    def get_word_ids(self):
        word_list = set()
        for doc in self.docs(n_examples=-1):
            word_list.update(doc)
        return dict(zip(range(len(word_list)), word_list))

    def load_dictionary(self, directory):
        print("\tLoading dictionary...")
        self.dictionary = Dictionary.load(directory + '/dictionary.dict')
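
A minimal usage sketch for the class above, not part of the original; it assumes a tab-separated file with one "id<TAB>text" line per document and the tokenize() helper used by process():

corpus = text_corpus('docs.tsv', n_examples=50000)   # hypothetical input file
corpus.get_phraser('phrase_models')                  # build or load bigram/trigram detectors
corpus.get_dictionary('dict_models', keep=100000)    # build or load the pruned Dictionary

# Iterating yields TaggedDocument objects, e.g. as input for Doc2Vec training
for tagged_doc in corpus:
    pass
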
Example #9
class LDAModel(object):
    """

    """

    def __init__(self, path, model_file, dictionary_file, corpus_file, num_topics=21):
            """
            Preprocess the data and obtain the training and test sets.

            Sample of the per-file label mapping printed while loading:
            class biological分子与细胞_cleaned.csv : 12
            class biological现代生物技术专题_cleaned.csv : 14
            class biological生物技术实践_cleaned.csv : 16
            class biological生物科学与社会_cleaned.csv : 18
            class biological稳态与环境_cleaned.csv : 110
            class biological遗传与进化_cleaned.csv : 112
            class geography人口与城市_cleaned.csv : 42
            class geography区域可持续发展_cleaned.csv : 44
            class geography地球与地图_cleaned.csv : 46
            class geography宇宙中的地球_cleaned.csv : 48
            class geography生产活动与地域联系_cleaned.csv : 410
            class history古代史_cleaned.csv : 52
            class history现代史_cleaned.csv : 54
            class history近代史_cleaned.csv : 56
            class political公民道德与伦理常识_cleaned.csv : 102
            class political时事政治_cleaned.csv : 104
            class political生活中的法律常识_cleaned.csv : 106
            class political科学思维常识_cleaned.csv : 108
            class political科学社会主义常识_cleaned.csv : 1010
            class political经济学常识_cleaned.csv : 1012

            :param path: directory containing one folder of *_cleaned.csv files per subject
            :param model_file: path under which the trained LDA model is saved
            :param dictionary_file: path under which the dictionary is saved (text format)
            :param corpus_file: path under which the merged corpus CSV is saved
            :param num_topics: number of LDA topics
            :return lda: the trained LDA model
            """



            dirs = os.listdir(path)
            x_list = []
            item_x = []
            labels = []
            multiLabels = []
            label11 = 0

            for file in dirs:
                #print(os.path.join(path, file))
                path2 = os.path.join(path, file)
                if os.path.isdir(path2):
                    category = file
                    dirs2 = os.listdir(path2)
                    label12 = 0
                    for file2 in dirs2:
                        file3 = os.path.join(path2, file2)
                        if os.path.isfile(file3) and file2.endswith('_cleaned.csv'):
                            print('class {}{} : {}{}'.format(file, file2, label11, label12))
                            src_df = pd.read_csv(file3)
                            src_df = parallelize(src_df, data_fram_proc)  # upsampling

                            #merged_df = pd.concat([src_df['items'], src_df['knowledge']], axis=1)
                            src_df['item'] = src_df['items'] + src_df['knowledge']
                            x = np.array(src_df['item']).tolist()
                            item_x += x
                            x = [[word for word in doc.split(' ') if word != "" ] for  doc in x]
                            x_list+= x # list
                            #labels += ['__label__'+str(label11)+''+str(label12) for i in range(len(x))]
                            fn = str(file2).replace('_cleaned.csv','').replace('\t','').replace('\n','')
                            labels += ['__label__' + str(file) + '_' + fn  for i in range(len(x))]
                            bug = 0
                            mls = np.array(src_df['label']).tolist()
                            multiLabels += [ str(file).replace('_',' ') +' '+fn+' '+  str(ml).replace('\t','').replace('\n','') for ml in mls ]
                            bug = 1
                        label12 += 1
                label11 += 1

            c = {'label': labels, 'item': item_x, 'multiLabels': multiLabels}  # merge into a new dict c
            df = pd.DataFrame(c)  # build a DataFrame from c
            df.to_csv(corpus_file, index=None,  header=True)


            # Build the token2id mapping from the tokenized documents
            self.dictionary = Dictionary(x_list)
            # Convert each text into bag-of-words form (id : freq)
            self.corpus = [self.dictionary.doc2bow(text) for text in x_list]

            # Train the LDA model with the requested number of topics
            self.lda = LdaModel(self.corpus, id2word=self.dictionary, num_topics=num_topics)
            # Inspect the results
            results = self.lda.print_topics(num_topics, num_words=50)
            for result in results:
                print(result)

            # Save model to disk.
            self.lda.save(model_file)

            self.dictionary.save_as_text(dictionary_file)


    def __retrain(self, model_file,other_texts):
           """
           lda = LdaModel.load(model_file)
           other_corpus = [self.dictionary.doc2bow(text) for text in other_texts]
           lda.update(other_corpus)
           """

    def getDocSVector(self):
        self.docSVector = []
        for d in self.corpus:
            self.docSVector.append(self.lda.get_document_topics(d,minimum_probability = 0))
        return self.docSVector
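
A minimal usage sketch for the wrapper above, with hypothetical paths; it assumes the directory layout described in the constructor (per-subject folders of *_cleaned.csv files) and the helper functions the constructor relies on (parallelize, data_fram_proc):

model = LDAModel(path='data/subjects',                     # hypothetical directory layout
                 model_file='lda_model/model',
                 dictionary_file='lda_model/dictionary.txt',
                 corpus_file='lda_model/corpus.csv',
                 num_topics=21)
doc_vectors = model.getDocSVector()   # one (topic, probability) list per training document
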