Example #1
    def test_saveAsText(self):
        """`Dictionary` can be saved as textfile. """
        tmpf = get_tmpfile('save_dict_test.txt')
        small_text = [
            ["prvé", "slovo"],
            ["slovo", "druhé"],
            ["druhé", "slovo"]]

        d = Dictionary(small_text)

        d.save_as_text(tmpf)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            # We do not know which word will get which index
            self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
            self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

        d.save_as_text(tmpf, sort_by_word=False)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
            self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")
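
For reference, the on-disk layout exercised by these assertions is simple: `save_as_text` writes the number of documents on the first line, then one `id<TAB>token<TAB>document frequency` line per token (sorted by token, or by decreasing document frequency when `sort_by_word=False`, as the second block above checks). A minimal round-trip sketch (the temp path is illustrative):

from gensim.corpora import Dictionary

d = Dictionary([["prvé", "slovo"], ["slovo", "druhé"], ["druhé", "slovo"]])
d.save_as_text('/tmp/save_dict_test.txt')                 # illustrative path
d2 = Dictionary.load_from_text('/tmp/save_dict_test.txt')
assert d2.token2id == d.token2id                          # ids and tokens survive the round trip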
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc) for doc
                  in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
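
The resulting plain-text file can later be consumed by other scripts; a minimal sketch of the reading side, assuming a hypothetical --output value of corpus_dictionary.txt:

from gensim.corpora import Dictionary

d = Dictionary.load_from_text('corpus_dictionary.txt')     # hypothetical --output value
bow = d.doc2bow(['some', 'preprocessed', 'tokens'])        # [(token_id, count), ...] for in-vocabulary tokens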
    def produce(self):        

        print('Getting src docs')
        docs = []
        doctokens = [] # aka Gensim's "text"
        stopwords = nltk.corpus.stopwords.words('english')
        for doc in self.src_doc_generator():
            (doc_id,doc_label,doc_str) = doc
            docs.append(doc)
            doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
            if len(docs) % 1000 == 0: print(len(docs))
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        #dictionary.compactify()
        #dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating WORD') # aka Gensim's "dictionary"
            db.create_table('word')
            for word_id, word_str in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',(word_id,word_str))
            
            print('Creating DOC and DOCWORD')
            db.create_table('doc')
            db.create_table('docword')
            for doc_idx, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc (doc_index,doc_id,doc_label,doc_str ) VALUES (?,?,?,?)',(doc_idx,doc[0],doc[1],doc[2]))
                doc_id = doc[0]
                for word_id, word_count in (dictionary.doc2bow(doctokens[doc_idx])):
                    word_str = dictionary.get(word_id) # Is this valid? I believe it is.
                    db.cur.execute('INSERT INTO docword (doc_index,doc_id,word_id,word_str,word_count) VALUES (?,?,?,?,?)',(doc_idx,doc_id,word_id,word_str,word_count))
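
Regarding the "Is this valid?" comment above: it is — `Dictionary.get(word_id)` (and `dictionary[word_id]`) maps an id produced by `doc2bow` back to its token string. A tiny self-contained check:

from gensim.corpora import Dictionary

dictionary = Dictionary([['red', 'green'], ['green', 'blue']])
word_id, word_count = dictionary.doc2bow(['green', 'green'])[0]
assert dictionary.get(word_id) == dictionary[word_id] == 'green'
assert word_count == 2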
Example #4
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
def build_dictionary():
    corpus = CorpusIterator(dir_list=dir_list)

    dictionary = Dictionary(corpus)

    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids.txt.bz2')

    dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=500000)

    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids_filtered.txt.bz2')
Example #8
def create_dictionary(doc_iterator, dict_file, as_text=False):
    """
    Creates a gensim.corpora.Dictionary object from given document iterator 
    and serializes it to given dict_file (filename) in a memory efficient way.
    @Params:
      as_text   - flag: dictionary saved as text (default: binary)
    """    
    d = Dictionary(doc.strip().lower().split() for doc in doc_iterator)
    if as_text:
        d.save_as_text(dict_file)
    else:
        d.save(dict_file)
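
A possible calling sketch, with illustrative file names: documents are read one per line, and the dictionary is reloaded with `load_from_text` (or `Dictionary.load` for the binary form):

from gensim.corpora import Dictionary

with open('docs.txt', encoding='utf-8') as f:              # one raw document per line (illustrative)
    create_dictionary(f, 'docs.dict.txt', as_text=True)

d = Dictionary.load_from_text('docs.dict.txt')             # Dictionary.load(...) for the binary default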
Example #9
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)

    a = []
    for w in stopwords:
        if w in dct.token2id.keys():
            a.append(dct.token2id[w])

    dct.filter_extremes(no_below=10)

    dct.filter_tokens(bad_ids=a)
    dct.compactify()
    dct.save_as_text(tmp_fname)
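
One caveat, assuming current gensim behaviour: `filter_extremes` shrinks id gaps and can reassign token ids, so ids collected from `token2id` before that call may point at the wrong tokens when later passed to `filter_tokens`. A safer ordering might look like this (same `stopwords` iterable assumed):

def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)
    dct.filter_extremes(no_below=10)          # may reassign token ids internally

    # look up stopword ids only after any id-remapping step
    bad_ids = [dct.token2id[w] for w in stopwords if w in dct.token2id]
    dct.filter_tokens(bad_ids=bad_ids)
    dct.compactify()
    dct.save_as_text(tmp_fname)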
Example #10
    def get_dictionary(self):
        tmp_fname = self.path + "lda.dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #11
    def get_dictionary(self):
        tmp_fname = self.path + self.model_type + "_dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #12
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text(
            "persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except Exception:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document[
                    "topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example #14
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_comments.txt')

    if config.PROCESSED_PATH not in os.listdir(config.DATA_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}

    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts =  np.concatenate([train_df.comment_text.fillna('N/A').values,
                             test_df.comment_text.fillna('N/A').values])


    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()
    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
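
Note that '<pad>' is added to bigram_dict only after `save_as_text`, so it counts towards VOCAB_SIZE but is absent from the file at config.VOCAB_PATH; a reload therefore has to re-add it. A sketch using the same config names:

from gensim.corpora import Dictionary

bigram_dict = Dictionary.load_from_text(config.VOCAB_PATH)
bigram_dict.add_documents([['<pad>']])        # re-add the padding token, mirroring build_vocab()
pad_id = bigram_dict.token2id['<pad>']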
Example #15
def get_data_tokenizer(fromdate, todate):
    print 'Starting get and save data from mysql-server into local folder....'

    fromdate = fromdate + ' 00:00:00'
    todate = todate + ' 23:59:59'

    connection = my_connection.getConnection()
    cursor = connection.cursor()

    query = 'SELECT id, vntokenizer, catid FROM news WHERE create_time BETWEEN ' + '\'' + fromdate + '\' AND \'' + todate + '\';'
    print query

    cursor.execute(query)
    rows = cursor.fetchall()
    count = 0

    token_dictionary = Dictionary()
    data = dict()

    for row in rows:
        id = row[0]
        tokenizer = row[1]
        catid = row[2]
        if tokenizer != None:
            tokenizer = tokenizer.lower()
            count += 1
            print count
            print tokenizer
            token_list = tokenizer.split(' ')
            valid_token_list = list()
            for token in token_list:
                if my_util.check_valid_token(token):
                    valid_token_list.append(token)
            token_dictionary.add_documents([valid_token_list])
            if catid == my_catid:
                data[id] = valid_token_list

    my_connection.closeConnection(connection)

    # save dictionary and data into text file
    token_dictionary.save_as_text('..' + parameter.FILE_DICTIONARY)
    fb = open('..' + parameter.FILE_DATA, 'wb')
    pickle.dump(data, fb)
    fb.close()

    print 'Done get and save data from mysql-server!'
def prepare_word_embedding():
    """Construct vocabulary file and word embedding file.
    """
    df = pd.read_csv(
        "data/raw/train.csv", usecols=["original_phrase1", "original_phrase2", "ytrue"]
    )

    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz", binary=True
    )

    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]

    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]

    dct = Dictionary(doc)

    bad_ids = []
    for k, v in dct.iteritems():
        if v not in model:
            bad_ids.append(k)
    dct.filter_tokens(bad_ids)

    dct.compactify()

    for k, v in dct.iteritems():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")

    word_emb = np.ones((len(dct), 300))

    for k, v in dct.iteritems():
        word_emb[k] = model[v]

    np.save("data/processed/word2vec", word_emb)
Example #17
    def create_LDA_model(self):
        trigram_articles = LineSentence(self.trigram_articles_filepath)
        trigram_dictionary = Dictionary(trigram_articles)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()
        trigram_dictionary.save_as_text(self.trigram_dictionary_filepath)
        # trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
        MmCorpus.serialize(self.trigram_bow_filepath,
                           self.trigram_bow_generator(self.trigram_articles_filepath,
                                                      trigram_dictionary))
        trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
        print(trigram_bow_corpus)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=20,
                               id2word=trigram_dictionary,
                               workers=3)
        lda.save(self.lda_model_filepath)
Example #18
    def produce(self):
        doc_n = 0
        docs = []
        doctokens = [] # AKA gensim "text"
        stopwords = nltk.corpus.stopwords.words('english')

        NOALPHA = re.compile('[^a-z]+')
        def prep_string(my_string,pattern = NOALPHA):
            return re.sub(pattern, ' ', my_string.strip().lower())

        print('Getting src docs')
        for doc in self.src_doc_generator():
            content = re.sub(NOALPHA, ' ', doc) # Do this in the corpus generator?
            docs.append(content)
            doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
            doc_n += 1
            if doc_n % 1000 == 0: print(doc_n)
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        dictionary.compactify()
        dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating DOC')
            db.create_table('doc')
            for i, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc VALUES (?,?)',(i,doc))

            print('Creating WORD')
            db.create_table('word')
            for item in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',item)

            print('Creating DOCWORD')
            db.create_table('docword')
            for i, tokens in enumerate(doctokens):
                for item in (dictionary.doc2bow(tokens)):
                    db.cur.execute('INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',[i,item[0],item[1]])
Example #19
def main():
    parser = argparse.ArgumentParser(description='creates an id2author mapping gensim dictionary, a document->authorid contributions MatrixMarket file, and a binary article title file from a given WikiMedia *-pages-meta-history dump (considering only articles in mainspace!)')
    parser.add_argument('--history-dump', type=argparse.FileType('r'), help='path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)', required=True)
    parser.add_argument('--id2author', type=argparse.FileType('w'), help='path to output text id2author dictionary (.txt/.txt.bz2)', required=True)
    parser.add_argument('--contribs', type=argparse.FileType('w'), help='path to output MatrixMarket contributions .mm file; also creates a binary article title file CONTRIBS.metadata.cpickle', required=True)
    parser.add_argument('--contribution-value', choices=CONTRIBUTION_VALUE_FUNCTIONS, help='calculated per-contribution value; choices: {}'.format(CONTRIBUTION_VALUE_FUNCTIONS.keys()), required=True)
    parser.add_argument("--namespace-prefixes", type=argparse.FileType('r'), help='file of namespace prefixes to ignore')    
        
    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    output_contribs_path = args.contribs.name
    contribution_value = args.contribution_value
    namespace_prefixes = read_lines(args.namespace_prefixes.name) if args.namespace_prefixes else ()
        
    logger.info('running with:\n{}'.format(pformat({'input_history_dump_path':input_history_dump_path, 'output_id2author_path':output_id2author_path, 'output_contribs_path':output_contribs_path, 'contribution_value':contribution_value, 'namespace_prefixes':namespace_prefixes})))        
            
    # build the id2author dictionary: maps author names of registered, non-bot authors to IDs and vice versa
    with smart_open(input_history_dump_path) as history_dump_file:    
        logger.info('generating author->id mappings')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        # use gensim's id2word Dictionary as the id2author dictionary: authors correspond to terms
        id2author = Dictionary(get_revision_authors_of_pages(history_dump, namespace_prefixes))
        logger.info('found {} different authors'.format(len(id2author)))
        logger.info('removing non-registered authors')
        remove_from_dictionary(id2author, is_registered_user)
        logger.info('reduced to {} registered authors'.format(len(id2author)))
        logger.info('removing bots')
        remove_from_dictionary(id2author, is_not_bot_user)
        logger.info('reduced to {} registered non-bot authors'.format(len(id2author)))
        id2author.compactify()
        id2author.save_as_text(output_id2author_path)
        
    # compute & store (author ID, revision value) entries for revisions by valid authors across all articles
    with smart_open(input_history_dump_path) as history_dump_file: 
        logger.info('generating MatrixMarket representation per revision: (docid, authorid, value of revision)')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        revision_value_fun = CONTRIBUTION_VALUE_FUNCTIONS[contribution_value]
        doc_auth_contribs = MetadataCorpus(get_revision_values(get_revisions_of_pages(history_dump, namespace_prefixes), id2author, revision_value_fun))
        MmWriter.write_corpus(output_contribs_path, corpus=doc_auth_contribs, num_terms=len(id2author), index=False, progress_cnt=10000, metadata=True)   
Example #20
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on the TF-IDF corpus
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #21
class Index(object):
    """define an index instance along with its associated methods"""
    def __init__(self, stops, minsize=3):
        """initialize index variables"""
        self.ix = None
        self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize)
        self.umls = umls.UMLSLookup()
        self.term_dict = {}
        self.token2cuis = {}
        self.concept_dict = {"__NULL__": 0}
        self.synsets = {}

    def get_doc_ids(self, corpus_path, corpus_name):
        """get doc ids from corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        return [docno for docno, doc in docs]

    def only_digits(self, token):
        """check whether input token contains only digits and/or punctuation"""
        return all(char.isdigit() or char in string.punctuation
                   for char in token)

    def preprocess_text(self, text, tags=False, remove_digits=True):
        """preprocess text: tokenize docs, lowerize text, remove words with length < min_size, remove tags, remove only-digits tokens and remove stopwords"""
        if tags:  # remove tags
            text = strip_tags(text)
        if remove_digits:  # tokenize and remove digits-only tokens
            text = [
                token.text for token in self.tokenizer(text)
                if not self.only_digits(token.text)
            ]
        else:  # tokenize and keep digits-only tokens
            text = [token.text for token in self.tokenizer(text)]
        # return preprocessed doc
        return text

    def preprocess_corpus(self, corpus_path, corpus_name, out_corpus, out_ids):
        """preprocess corpus: apply preprocess_text to each doc within corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        # tokenize docs
        print("pre processing docs...")
        #pproc_corpus = [self.preprocess_text(doc) for docno, doc in docs]
        pproc_corpus = []
        doc_ids = []
        # iterate over docs and store pre processed docs and docnos
        for docno, doc in docs:
            pproc_corpus.append(self.preprocess_text(doc))
            doc_ids.append(docno)
        print("pre processing finished!")
        # store pproc_corpus
        print("store pre processed corpus in {}".format(out_corpus))
        with open(out_corpus, 'w') as outf:
            json.dump(pproc_corpus, outf)
        # store docnos
        print("store doc_ids in {}".format(out_ids))
        with open(out_ids, 'w') as outf:
            json.dump(doc_ids, outf)
        # return pproc_corpus and doc_ids
        return pproc_corpus, doc_ids

    def load_pproc_corpus(self, fname):
        """load stored pre processed corpus"""
        with open(fname, 'r') as inf:
            pproc_corpus = json.load(inf)
        return pproc_corpus

    def load_doc_ids(self, fname):
        """load stored doc ids"""
        with open(fname, 'r') as inf:
            doc_ids = json.load(inf)
        return doc_ids

    def index_corpus(self, pproc_corpus, fname):
        """index pre processed corpus using gensim dictionary - fast doc2bow, doc2idx conversion"""
        self.ix = Dictionary(pproc_corpus)
        self.ix.save_as_text(fname)
        return True

    def load_index(self, fname):
        """load stored index"""
        self.ix = Dictionary.load_from_text(fname)
        return True

    def build_term_dict(self,
                        pproc_corpus,
                        fname,
                        dict_size=131072,
                        remove_digits=True,
                        min_df=2,
                        max_df=0.5):
        """create term dictionary"""
        ttf = {}
        # drop terms with df below min_df or relative df above max_df, and accumulate ttf for the rest
        for doc in tqdm(pproc_corpus):
            # get doc in bow format
            bow = self.ix.doc2bow(doc)
            for idx, tf in bow:
                if self.ix.dfs[idx] >= min_df and self.ix.dfs[idx] / self.ix.num_docs <= max_df:
                    if idx in ttf:
                        ttf[idx] += tf
                    else:
                        ttf[idx] = tf
        # convert ttf dict into counter and keep dict_size most frequent terms
        count = Counter(ttf).most_common(dict_size)
        # create term dict - two-levels encoding (i.e. self.term_dict[self.ix.token2id[token]])
        for idx, ttf in count:
            self.term_dict[idx] = len(self.term_dict)
        # store term dictionary
        with open(fname, 'w') as outf:
            json.dump(self.term_dict, outf)
        return True

    def load_term_dict(self, fname):
        """load term dictionary"""
        with open(fname, 'r') as inf:
            self.term_dict = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.term_dict = {
            int(ix_term): dict_term
            for ix_term, dict_term in self.term_dict.items()
        }
        return True

    def get_pos2token(self, text):
        """split text into tokens and return {pos: [token, ["__NULL__"]]}"""
        pos2token = {}
        tokens = text.split()  # split on whitespaces as text has been already pre processed
        # set text index
        index = text.index
        running_offset = 0
        # loop over tokens
        for token in tokens:
            token_offset = index(token, running_offset)
            token_len = len(token)
            # update running offset
            running_offset = token_offset + token_len
            pos2token[token_offset] = [self.ix.token2id[token], ["__NULL__"]]  # note: ["__NULL__"] is for later use
        return pos2token

    def associate_token2cuis(self, pos2token, terms_candidate_cuis):
        """return list of (token, [cui1, cui2, ...]) pairs given token position and candidate concepts"""
        for term_cuis in terms_candidate_cuis:
            # get positional information
            start = term_cuis[0]['start']
            # check whether 'start' matches with any pos2token key
            if start in pos2token:
                # update ["__NULL__"] with candidate cuis
                pos2token[start][1] = [concept['cui'] for concept in term_cuis]
        # return pos2token values only - i.e. (term, [cui1, cui2, ...]) pairs
        return list(pos2token.values())

    def map_token2cuis(self, fname, threshold=1.0, stypes_fname=None):
        """map candidate cuis to each token in the index"""
        terms_str = ' '.join(list(self.ix.token2id.keys()))
        # split term_str into substrings of length <= 999999 - max length allowed by scipy parser
        substrs = wrap(terms_str,
                       width=999999,
                       break_long_words=False,
                       break_on_hyphens=False)
        if stypes_fname is not None:  # load user-specified UMLS semantic types
            print("user-specified UMLS semantic types for QuickUMLS enabled")
            semtypes = ','.join(safir_utils.load_semtypes(stypes_fname))
        else:  # keep default QuickUMLS semantic types
            semtypes = None
        # initialize QuickUMLS server
        server = QuickUMLS(window=1, threshold=threshold, semtypes=semtypes)
        server.launch_quickumls()
        # initialize concept matcher
        matcher = get_quickumls_client()
        token2cuis = []
        # extract concepts from substrs
        for substr in substrs:
            terms_candidate_cuis = matcher.match(substr)
            # get position dict: {pos: [token, ["__NULL__"]]} given substr
            pos2token = self.get_pos2token(substr)
            # associate each token with its candidate concepts
            token2cuis += self.associate_token2cuis(pos2token,
                                                    terms_candidate_cuis)
        # close connection with QuickUMLS server
        server.close_quickumls()
        # store token2cuis as dict
        self.token2cuis = dict(token2cuis)
        # store token2cuis
        with open(fname, 'w') as outf:
            json.dump(self.token2cuis, outf)
        return True

    def load_token2cuis(self, fname):
        """load token2cuis"""
        with open(fname, 'r') as inf:
            self.token2cuis = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.token2cuis = {
            int(token): cuis
            for token, cuis in self.token2cuis.items()
        }
        return True

    def update_concept_dict(self, cui):
        """update concept dictionary"""
        if cui in self.concept_dict:
            return True
        else:
            self.concept_dict[cui] = len(self.concept_dict)
            return True

    def load_concept_dict(self, fname):
        """load concept dictionary"""
        with open(fname, 'r') as inf:
            self.concept_dict = json.load(inf)
        return True

    def update_synsets(self, cui, idx):
        """update synonyms set"""
        if self.concept_dict[cui] in self.synsets:  # add term to set of synonyms for the given cui
            self.synsets[self.concept_dict[cui]].add(self.term_dict[idx])
            return True
        elif self.concept_dict[cui] != self.concept_dict["__NULL__"]:  # initialize set of synsets for given cui
            self.synsets[self.concept_dict[cui]] = {self.term_dict[idx]}
            return True
        else:  # do not update synsets
            return False

    def load_synsets(self, fname):
        """load synsets"""
        with open(fname, 'r') as inf:
            self.synsets = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.synsets = {int(cui): syns for cui, syns in self.synsets.items()}
        return True

    def get_sense_pairs(self):
        """return senses as (term, cui) 2-dim np array"""
        syns = [
            list(itertools.product(self.synsets[cui], [cui]))
            for cui in self.synsets
        ]
        synp = [list(itertools.combinations(syn, 2)) for syn in syns]
        return np.array(list(itertools.chain.from_iterable(synp)))

    def s_wsd(self, doc, table_name, query=False):
        """shallow word-sense disambiguation: disambiguate polysemous terms based on shallow word-concept connectivity within UMLS"""
        doc_cuis = {}
        # convert doc into doc2idx format
        doc2idx = self.ix.doc2idx(doc)
        # get cuis from doc tokens
        for idx in doc2idx:
            if idx in self.token2cuis and self.token2cuis[idx] != ["__NULL__"]:
                for cui in self.token2cuis[idx]:
                    if cui in doc_cuis:  # increase cui count
                        doc_cuis[cui] += 1
                    else:  # initialize cui count
                        doc_cuis[cui] = 1
        # perform shallow word-sense disambiguation
        enc_doc = []
        for idx in doc2idx:
            if idx in self.term_dict:  # disambiguate only for terms contained within self.term_dict
                max_edges = 0  # relative maximum connections (edges)
                if len(self.token2cuis[idx]) == 1:  # monosemous term
                    ref_cui = self.token2cuis[idx][0]
                    if not query:  # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append(
                        [self.term_dict[idx], self.concept_dict[ref_cui]])
                else:  # polysemous term
                    candidates = []
                    # loop over cadidate concepts
                    for subj_cui in self.token2cuis[idx]:
                        num_edges = 0  # number of edges
                        if doc_cuis[subj_cui] == 1:  # subj_cui is only associated with current term (idx)
                            obj_cuis = list(
                                set(doc_cuis.keys()).difference({subj_cui}))
                        else:  # subj_cui is associated with other terms in the doc too
                            obj_cuis = list(doc_cuis.keys())
                        num_edges += self.umls.compute_num_edges(
                            subj_cui, obj_cuis, table_name)
                        # verify connectivity
                        if num_edges > max_edges:
                            # set candidates to subj_cui
                            candidates = [subj_cui]
                            # update max_edges
                            max_edges = num_edges
                        else:
                            # append subj_cui to candidates
                            candidates.append(subj_cui)
                    # keep head candidate - when disambiguation is not complete, it allows to get the most likely concept based on QuickUMLS ordering
                    ref_cui = candidates[0]
                    if not query:  # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append(
                        [self.term_dict[idx], self.concept_dict[ref_cui]])
            else:  # term oov
                continue
        return enc_doc

    def encode_corpus(self,
                      pproc_corpus,
                      corpus_name,
                      ecorpus_fname,
                      t2c_fname,
                      cdict_fname,
                      syn_fname,
                      threshold=0.7,
                      stypes_fname=None):
        """perform semantic indexing and encode corpus"""
        print("map UMLS concepts to (indexed) tokens")
        self.map_token2cuis(t2c_fname,
                            threshold=threshold,
                            stypes_fname=stypes_fname)
        # get UMLS concepts mapped to (indexed) tokens
        ix_concepts = {
            cui
            for cuis in self.token2cuis.values() for cui in cuis
            if cui != "__NULL__"
        }
        # create sql table to store relations between concepts associated to indexed tokens - allows for fast accessing compared to MRREL table
        print(
            "create table to store UMLS relations between concepts associated to (indexed) tokens - fast access is enabled by indexes"
        )
        self.umls.restrict_to_ix_concepts(ix_concepts, corpus_name)
        # create indexes to speed up requests
        self.umls.create_index("CUI1_" + corpus_name, ["CUI1"],
                               corpus_name)  # create index for subject column
        self.umls.create_index("CUI2_" + corpus_name, ["CUI2"],
                               corpus_name)  # create index for object column
        self.umls.create_index(
            "CUI1_CUI2_" + corpus_name, ["CUI1", "CUI2"],
            corpus_name)  # create multicolumn index (subj, obj)
        # encode corpus
        print("disambiguate polysemous tokens and encode corpus")
        enc_corpus = [
            self.s_wsd(doc, corpus_name, query=False)
            for doc in tqdm(pproc_corpus)
        ]
        # store synsets as dict of lists - enables json encoding
        self.synsets = {cui: list(syns) for cui, syns in self.synsets.items()}
        # store semantic data and encoded corpus
        with open(ecorpus_fname, 'w') as outf:
            json.dump(enc_corpus, outf)
        with open(cdict_fname, 'w') as outf:
            json.dump(self.concept_dict, outf)
        with open(syn_fname, 'w') as outf:
            json.dump(self.synsets, outf)
        # return encoded corpus
        return enc_corpus

    def load_enc_corpus(self, fname):
        """load encoded corpus"""
        with open(fname, 'r') as inf:
            enc_corpus = json.load(inf)
        return enc_corpus

    def preprocess_query(self, query):
        """pre process query"""
        pproc_query = self.preprocess_text(query)
        return pproc_query

    def encode_query(self, pproc_query, corpus_name):
        """disambiguate polysemous terms and encode query"""
        enc_query = self.s_wsd(pproc_query, corpus_name, query=True)
        if not enc_query:
            print("query does not contain known terms")
            return None
        else:
            return np.array(enc_query)

    def project_query(self,
                      query,
                      corpus_name,
                      word_embs,
                      proj_weights,
                      concept_embs=None):
        """project encoded query into dense vector of size [1, doc_embs]"""
        enc_query = self.encode_query(self.preprocess_query(query),
                                      corpus_name)
        if enc_query is None:
            return None
        else:
            if concept_embs is None:  # only terms are considered
                return np.matmul(proj_weights,
                                 np.mean(word_embs[enc_query[:, 0]], axis=0))
            else:  # terms + concepts are considered (i.e. senses)
                return np.matmul(
                    proj_weights,
                    np.mean(np.add(word_embs[enc_query[:, 0]],
                                   concept_embs[enc_query[:, 1]]),
                            axis=0))

    def semantic_search(self, doc_ids, docs, query_ids, queries,
                        ranking_folder, ranking_name):
        """perform search over queries using neural semantic models and return ranking"""
        doc_ids = np.array(doc_ids)
        print("compute similarities between docs and queries")
        similarities = cosine_similarity(docs, queries)
        out = open(ranking_folder + '/' + ranking_name + '.txt', 'w')
        for i in tqdm(range(similarities.shape[1])):
            rank = np.argsort(-similarities[:, i])[:1000]
            docs_rank = doc_ids[rank]
            qid = query_ids[i]
            if qid.isdigit():  # cast to integer - this operation avoids storing topic ids as '0##' instead of '##'
                qid = str(int(qid))  # convert to int and then back to str
            for j in range(len(docs_rank)):
                out.write('%s %s %s %d %f %s\n' %
                          (qid, 'Q0', docs_rank[j], j,
                           similarities[rank[j]][i], ranking_name))
        out.close()
        return True
    def create_dictionary(self):
        YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
        SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
            "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
        SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
        SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
        SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(
            config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))

        if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY
                and SAVE_DICTIONARY_DIR and SAVE_BAG_OF_WORDS_DIR
                and SAVE_DICTIONARY_DIR):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                      "Unfiltered")

        if not os.path.exists(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY) and not os.path.isdir(
                    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            raise ("Directory {d} does not exist".format(
                d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY))

        if not (os.path.exists(SAVE_BAG_OF_WORDS_DIR)
                and os.path.isdir(SAVE_BAG_OF_WORDS_DIR)):
            os.makedirs(SAVE_BAG_OF_WORDS_DIR)

        if not (os.path.exists(SAVE_UNFILTERED_DICTIONARY_DIR)
                and os.path.isdir(SAVE_UNFILTERED_DICTIONARY_DIR)):
            os.makedirs(SAVE_UNFILTERED_DICTIONARY_DIR)

        for pardir, sub_dirs, files in os.walk(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):

            if len(files) > 0:
                error_count = 0
                review_docs = []
                negative_docs = []
                positive_docs = []

                doc_count = 0
                docs_per_file = SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE
                file_num = str((doc_count // docs_per_file) + 1)
                for file in files:
                    if "yelp_reviews_" in file and "category" in pardir:
                        reviews = get_reviews_iterable(
                            os.path.join(pardir, file))
                        yelp_category = pardir.split('/')[-1]

                        CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR = os.path.join(
                            SAVE_BAG_OF_WORDS_DIR, yelp_category)
                        if not (os.path.exists(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)
                                and os.path.isdir(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)):
                            os.makedirs(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)

                        fname = os.path.join(
                            SAVE_BAG_OF_WORDS_DIR, yelp_category,
                            "{cat}_file_{file_num}.txt".format(
                                cat=yelp_category, file_num=file_num))
                        bow_file = open(fname, 'w')
                        print(
                            "Writing docs (in bag of words form) for {cat} to directory: {d}"
                            .format(cat=yelp_category,
                                    d=os.path.join(SAVE_BAG_OF_WORDS_DIR,
                                                   yelp_category)))
                        for review in reviews:
                            try:
                                review_dict = ujson.loads(review)
                            except:
                                error_count += 1
                                continue  # skip reviews that fail to parse
                            adjs = review_dict.get("adjectives", None)
                            rating = int(review_dict.get("rating", -1))
                            if adjs:
                                doc_count += 1
                                bow_file.write(
                                    ujson.dumps(adjs.encode("utf-8")) + "\n")
                                review_docs.append(adjs.strip().split())
                                if (doc_count % docs_per_file) == 0:
                                    if bow_file:
                                        bow_file.close()
                                    file_num = str((doc_count // docs_per_file) + 1)
                                    fname = os.path.join(
                                        SAVE_BAG_OF_WORDS_DIR, yelp_category,
                                        "{cat}_file_{file_num}.txt".format(
                                            cat=yelp_category,
                                            file_num=file_num))
                                    bow_file = open(fname, 'w')
                            if rating:
                                if rating > 3:
                                    positive_docs.append(adjs.strip().split())
                                elif rating < 3:
                                    negative_docs.append(adjs.strip().split())
                                else:
                                    pass
                print("Wrote {total} docs in {cat} category".format(
                    total=str(doc_count), cat=yelp_category))

                dictionary = Dictionary(review_docs)

                CATEGORY_SPECIFIC_DICT_DIR = os.path.join(
                    SAVE_UNFILTERED_DICTIONARY_DIR, yelp_category)
                POSITIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                                "positive")
                NEGATIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                                "negative")
                if not (os.path.exists(CATEGORY_SPECIFIC_DICT_DIR)
                        and os.path.isdir(CATEGORY_SPECIFIC_DICT_DIR)):
                    os.makedirs(CATEGORY_SPECIFIC_DICT_DIR)
                    os.makedirs(POSITIVE_SUB_DIR)
                    os.makedirs(NEGATIVE_SUB_DIR)

                dictionary.save(
                    os.path.join(
                        CATEGORY_SPECIFIC_DICT_DIR,
                        "{yelp_category}_dict.dict".format(
                            yelp_category=yelp_category)))
                dictionary.save_as_text(
                    os.path.join(
                        CATEGORY_SPECIFIC_DICT_DIR,
                        "{yelp_category}_dict.txt".format(
                            yelp_category=yelp_category)))
                sorted_doc_freqs = sorted(dictionary.dfs.items(),
                                          key=lambda x: x[1],
                                          reverse=True)

                # print("Will save file in:\n " + os.path.join(CATEGORY_SPECIFIC_DICT_DIR,"{yelp_category}_dict.txt".format(yelp_category=yelp_category)))
                with open(
                        os.path.join(
                            CATEGORY_SPECIFIC_DICT_DIR,
                            "{yelp_category}_words_doc_frequencies.txt".format(
                                yelp_category=yelp_category)), 'w') as df_file:
                    for (token_id, doc_freq) in sorted_doc_freqs:
                        df_file.write(
                            str(
                                dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del dictionary
                del review_docs
                del sorted_doc_freqs

                pos_dictionary = Dictionary(positive_docs)
                del positive_docs

                neg_dictionary = Dictionary(negative_docs)
                del negative_docs

                pos_dictionary.save(
                    os.path.join(
                        POSITIVE_SUB_DIR,
                        "{yelp_category}_pos_dict.dict".format(
                            yelp_category=yelp_category)))
                pos_dictionary.save_as_text(
                    os.path.join(
                        POSITIVE_SUB_DIR,
                        "{yelp_category}_pos_dict.txt".format(
                            yelp_category=yelp_category)))

                sorted_pos_doc_freqs = sorted(pos_dictionary.dfs.items(),
                                              key=lambda x: x[1],
                                              reverse=True)
                with open(
                        os.path.join(
                            POSITIVE_SUB_DIR,
                            "{yelp_category}_pos_words_doc_frequencies.txt".
                            format(yelp_category=yelp_category)),
                        'w') as df_file:
                    for (token_id, doc_freq) in sorted_pos_doc_freqs:
                        df_file.write(
                            str(
                                pos_dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del pos_dictionary
                del sorted_pos_doc_freqs

                neg_dictionary.save(
                    os.path.join(
                        NEGATIVE_SUB_DIR,
                        "{yelp_category}_neg_dict.dict".format(
                            yelp_category=yelp_category)))
                neg_dictionary.save_as_text(
                    os.path.join(
                        NEGATIVE_SUB_DIR,
                        "{yelp_category}_neg_dict.txt".format(
                            yelp_category=yelp_category)))
                sorted_neg_doc_freqs = sorted(neg_dictionary.dfs.items(),
                                              key=lambda x: x[1],
                                              reverse=True)
                with open(
                        os.path.join(
                            NEGATIVE_SUB_DIR,
                            "{yelp_category}_neg_words_doc_frequencies.txt".
                            format(yelp_category=yelp_category)),
                        'w') as df_file:
                    for (token_id, doc_freq) in sorted_neg_doc_freqs:
                        df_file.write(
                            str(
                                neg_dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del neg_dictionary
                del sorted_neg_doc_freqs

                print(
                    "{count} {cat} reviews were discarded because of parsing errors"
                    .format(count=error_count, cat=yelp_category))
                print("Created dictionary for {cat} tokens".format(
                    cat=yelp_category))
Example #23
if len(sys.argv) < 3:
    print 'Usage: \n python train.py wiki.zh.chs.seg.utf.stop lda.model'
    sys.exit(1)

inp, outp = sys.argv[1:3]

logging.basicConfig(format = '%(asctime)s: %(levelname)s: %(message)s', level = logging.INFO)
logging.info('Loading training set...')
fp = codecs.open(inp, 'r', encoding='utf8')
train = []
for line in fp:
    train.append(line.split())
fp.close()

logging.info('Preparing corpus...')
dictionary = Dictionary(train)
dictionary.save_as_text('wiki.dictionary.bz2')
corpus = [ dictionary.doc2bow(text) for text in train ]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

del train, tfidf

logging.info('Training...')
lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=200)
#lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=100, workers=2)

logging.info('Saving LDA model...')
lda.save(outp)
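
A matching inference-side sketch, assuming the files produced above (the second CLI argument named the model file); note that because training ran on the tf-idf weighted corpus, strictly the same weighting would be applied to queries as well (omitted here for brevity):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

dictionary = Dictionary.load_from_text('wiki.dictionary.bz2')
lda = LdaModel.load('lda.model')                        # path given as the second CLI argument

doc_bow = dictionary.doc2bow(['example', 'tokens'])     # in practice: a whitespace-segmented document
print(lda.get_document_topics(doc_bow))                 # [(topic_id, probability), ...]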
Example #24
class TextClassifier(object):
    LABEL_TO_INDEX = {'auto':0, 'business':1, 'sports':2}
    INDEX_TO_LABEL = {0:'auto', 1:'business', 2:'sports'}

    def __init__(self, dict_file=None, model_file=None):
        if dict_file:
            self.dictionary = Dictionary.load_from_text(dict_file)
        else:
            self.dictionary = Dictionary()

        if model_file:
            self.model = joblib.load(model_file)
        else:
            self.model = None

    def expand_sent_terms(self, sent, ngrams=[2]):
        expd_sent = list(sent)
        ngram_terms = self._get_ngram_terms(sent, ngrams)
        expd_sent.extend(ngram_terms)

        return expd_sent

    def sentence_to_bow(self, sent):
        if self.dictionary:
            return self.dictionary.doc2bow(sent)
        else:
            return None

    def bow_to_feature_vec(self, bow_corpus):
        data = []
        rows = []
        cols = []
        line_count = 0
        for bow_sent in bow_corpus:
            for elem in bow_sent:
                rows.append(line_count)
                cols.append(elem[0])
                data.append(elem[1])
            line_count += 1

        return csr_matrix(
            (data, (rows,cols)), shape=(line_count, len(self.dictionary)))

    def load_text(self, data_file, train=False):
        term_corpus = []
        labels = []
        with open(data_file) as fin:
            for line in fin:
                parts = line.strip().decode('utf8').split('\t')
                if len(parts) < 2:
                    continue

                label = parts[0]
                sent = parts[1].split()

                # Expand sentence with more features.
                sent = self.expand_sent_terms(sent, [2])

                # Save sentences and labels.
                term_corpus.append(sent)
                labels.append(self.LABEL_TO_INDEX[label])

                # Update dictionary.
                if train:
                    self.dictionary.add_documents([sent])

        if train:
            # Compactify the dictionary.
            self.dictionary.filter_extremes(no_below=5,
                                            no_above=0.6,
                                            keep_n=None)
            self.dictionary.compactify()

        # Change text format corpus to bow format.
        bow_corpus = []
        for sent in term_corpus:
            sent_bow = self.dictionary.doc2bow(sent)
            bow_corpus.append(sent_bow)

        return bow_corpus, labels

    def _get_ngram_terms(self, words, ngrams):
        terms = []
        for i in range(1, len(words)):
            # Bigram terms.
            if 2 in ngrams and (i - 1) >= 0:
                terms.append('%s_%s' % (words[i - 1], words[i]))
            # Trigram terms.
            if 3 in ngrams and (i - 2) >= 0:
                terms.append(
                    '%s_%s_%s' % (words[i - 2], words[i - 1], words[i]))

        return terms

    def dump_dict(self, dict_file):
        self.dictionary.save_as_text(dict_file)

    def dump_model(self, model_file):
        if self.model:
            joblib.dump(self.model, model_file)

    def train(self, x_list, y_list, model='lr'):
        X_train, X_test, y_train, y_test = train_test_split(x_list,
                                                            y_list,
                                                            test_size=0.3)
        if model == 'lr':
            self.model = LogisticRegression(C=1.0,
                                            multi_class='multinomial',
                                            penalty='l2',
                                            solver='sag',
                                            tol=0.1)
        else:
            logging.error('Unknown model name!')
            return

        self.model.fit(X_train, y_train)
        score = self.model.score(X_train, y_train)
        print("Evaluation on train set : %.4f" % score)
        score = self.model.score(X_test, y_test)
        print("Evaluation on test set : %.4f" % score)

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def eval(self, X, y):
        score = self.model.score(X, y)
        print("Evaluation on validation set : %.4f" % score)
Exemplo n.º 25
0
topics = 20
dictionary.filter_extremes(no_below=no_below, no_above=no_above)
logger.info("Making Corpus...")
corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]

#========================================================================
# LDA Calculate
#========================================================================
logger.info("LDA Calculation...")
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topics)

#========================================================================
# Model Save
#========================================================================
logger.info("Dictionary & LDA Model Save...")
dictionary.save_as_text('../model/1111_gensim_dict_below10_above08')
with open('../model/1111_LDA_20topics_gensim__below10_above08',
          mode='wb') as f:
    pkl.dump(lda, f)
for topic in lda.show_topics(num_topics=-1):
    print(f'topics: {topic}\n')

# Write LDA topic values out for the train & test sets
mx = np.zeros((len(texts), topics))

# Get LDA Topic Value from corpus
logger.info("Get LDA Value from corpus...")
arg_list = []
for i, bow in tqdm(enumerate(corpus)):

    # Parallel ===
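    # Hedged sketch of the truncated loop body (the original listing is cut off
    # here and apparently parallelized this step): one plausible way to fill the
    # per-document topic matrix `mx`. get_document_topics is standard gensim API;
    # everything else is an assumption about what the original intended.
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        mx[i, topic_id] = prob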
Exemplo n.º 26
0
epoch = int(sys.argv[3])
batch = int(sys.argv[4])
n_hidden = 128

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max()

rpad_blank = lambda size: (lambda s: s.ljust(size, ' '))

que = df['question'].map(rpad_blank(q_maxlen))
ans = df['answer'].map(rpad_blank(a_maxlen))

dic = Dictionary([list(' '.join(df.values.flatten()))])
dic.save_as_text(f'{data_file}.dic')

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]

x = np.array([one_hot(q) for q in que])
y = np.array([one_hot(a) for a in ans])

model = Sequential()

# encoder
model.add(LSTM(n_hidden, input_shape=(q_maxlen, len(dic))))

# decoder
model.add(RepeatVector(a_maxlen))
model.add(LSTM(n_hidden, return_sequences=True))
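
# Hedged continuation sketch (the original listing stops after the decoder LSTM):
# a typical way to finish this char-level seq2seq model is a per-timestep softmax
# over the dictionary, trained on the one-hot arrays built above. The keras import
# path and the layer/optimizer choices are assumptions, not the original code.
from keras.layers import Dense, TimeDistributed

model.add(TimeDistributed(Dense(len(dic), activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=epoch, batch_size=batch)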
Exemplo n.º 27
0
 segmentor.load_with_lexicon(cws_model_path, LTP_DATA_DIR+'/user_dict.txt') # load the model; the second argument is the path to your external user dictionary
 postagger = Postagger() # initialize the instance
 postagger.load(pos_model_path)  # load the model
 
 combain_comtent = []
 for file in file_list:
     combain_comtent.append(get_content(file))
     
 segmentor.release()  # release the model
 
 dictionary = Dictionary(combain_comtent)
 corpus = [ dictionary.doc2bow(text) for text in combain_comtent]
 lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=31)
 
 # save the dictionary
 dictionary.save_as_text(write_path+"dictionary.txt")
 # save the LDA model
 lda.save(write_path+"model")
 
 for file in lda.print_topics(31):
     print(file[0])
 
 
 topic_list = []
 for i in lda.get_document_topics(corpus):
     listj=[]
     for j in i:
         listj.append(j[1])
     topic_list.append(listj.index(max(listj)))
     
 file_dict = {}
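 # Hedged sketch (the original listing is truncated here): one plausible way to
 # map each source file to its dominant topic using the lists built above.
 for file, topic_id in zip(file_list, topic_list):
     file_dict[file] = topic_id
 print(file_dict)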
Exemplo n.º 28
0
class Dataset(object):
    '''
    Create dataset for training supervised model
    '''
    def __init__(self, config):
        self.config = config
        self.train_data = None
        self.test_data = None
        self.val_data = None
        self.vocab = None
        self.word_embeddings = None

    def get_pandas_df(self, filename):
        '''
        Load the data into Pandas.DataFrame object
        This will be used to convert data to torchtext object
        '''
        with open(filename, 'r', encoding='utf-8') as datafile:
            data = [line.strip().split(' ', maxsplit=1) for line in datafile]
            data_text = list(map(lambda x: x[1], data))
            data_label = list(map(lambda x: x[0], data))

        full_df = pd.DataFrame({"text": data_text, "label": data_label})
        return full_df

    def load_data(self,
                  train_file,
                  test_file,
                  dataname,
                  embed_file=None,
                  val_file=None):
        '''
        Loads the data from the given files.
        Sets up iterators for training, validation and test data.
        Also creates the vocabulary and word embeddings based on the data.

        Inputs:
            embed_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''
        # load embeddings
        voc_file = dataname + '_vocab.txt'
        new_embed = dataname + '_embed.pkl'
        train_X, train_Y = read_labeled(train_file)
        test_X, test_Y = read_labeled(test_file)
        val_X = None
        val_Y = None
        if val_file:
            val_X, val_Y = read_labeled(val_file)
        else:
            sp = int(len(train_X) * 0.8)
            train_X, val_X = (train_X[:sp], train_X[sp:])
            train_Y, val_Y = (train_Y[:sp], train_Y[sp:])
        train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X]
        test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X]
        val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X]

        if os.path.isfile(voc_file):
            self.vocab = Dictionary.load_from_text(voc_file)
        else:
            self.vocab = Dictionary(train_X)
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
            self.vocab.save_as_text(voc_file)
        # build vocab
        train_X = [self.vocab.doc2idx(x, 1) for x in train_X]
        test_X = [self.vocab.doc2idx(x, 1) for x in test_X]
        val_X = [self.vocab.doc2idx(x, 1) for x in val_X]
        # transform words to index
        if os.path.isfile(new_embed):
            self.word_embeddings = torch.load(new_embed)
        else:
            embeds = Vectors(embed_file,
                             unk_init=lambda x: torch.Tensor(
                                 np.random.normal(scale=0.6, size=(x.size()))))
            self.word_embeddings = weight_matrix(self.vocab, embeds)
            torch.save(self.word_embeddings, new_embed)
        self.train_data = (train_X, train_Y)
        self.test_data = (test_X, test_Y)
        self.val_data = (val_X, val_Y)

        print("Loaded {} training examples".format(len(train_X)))
        print("Loaded {} test examples".format(len(test_X)))
        print("Loaded {} validation examples".format(len(val_X)))

    def train_iterator(self):
        return batch_iter(*self.train_data, self.config.batch_size)

    def test_iterator(self):
        return batch_iter(*self.test_data, self.config.batch_size, False)

    def val_iterator(self):
        return batch_iter(*self.val_data, self.config.batch_size, False)
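
# Hedged usage sketch (not part of the original class). The config fields, file
# names and embedding file below are assumptions about what the surrounding
# project provides.
from types import SimpleNamespace

config = SimpleNamespace(max_sen_len=100, batch_size=32)
dataset = Dataset(config)
dataset.load_data('train.txt', 'test.txt', 'mydata', embed_file='glove.6B.100d.txt')
for batch in dataset.train_iterator():
    pass  # feed each batch to a model here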
Exemplo n.º 29
0
class ArticlesCollection:
    """Class which holds all articles (perhaps over several years)
       -- with ability to perform LDA on it."""
    
    def __init__(self, year_range, text_output_dirpath, lang=DE_LANG):
        self.year_range = year_range
        self.text_output_dirpath = text_output_dirpath
        self.lang = lang
        self.articles = []
        self.bow_corpus = None
        self.identifier = ''
        self.wordsids_filepath = ''
        self.bowmm_filepath = ''
        self.tfidf_filepath = ''
        self.number_of_docs = 0
        self.number_of_tokens = 0
        self.number_of_types = 0
        
        # gensim data structures
        self.dictionary = None
        
        # Read in collection & clean it & start LDA process
        self._read_collection()
        self._collection_identifier()
        self._set_filepaths()
        self._create_dictionary()
        self._create_bow_representation()
        self._set_number_of_docs()
        self._set_number_of_tokens()
        self._set_number_of_types()
        
        # Create tf*idf matrix if requested.
        if USE_TFIDF:
            self._create_tfidf_matrix()
    
    def show_lda(self):
        """Show latent topics found."""
        
        model = None
        
        # Only use tf*idf input if requested.
        corpus = self.bow_corpus
        if USE_TFIDF:
            corpus = MmCorpus(self.tfidf_filepath)
        
        # k = number of documents = number of topics (for now)
        num_topics = self.number_of_docs
        if NUM_TOPICS != -1:
            num_topics = NUM_TOPICS
        
        print('Number of docs presented: ' + str(self.number_of_docs))
        print('Number of origin. tokens: ' + str(self.number_of_tokens))
        print('Number of original types: ' + str(self.number_of_types))
        print('Number of types at usage: ' + str(len(self.dictionary.\
                                                     keys())))
        print('Number of topics to find: ' + str(num_topics))
        print('Number of topics to show: ' + str(TOPICS_DISPLAY))
        
        if MODEL == 'LdaMallet':
            model = LdaMallet(PATH_TO_MALLET_BIN,
                            corpus=corpus,
                            num_topics=num_topics,
                            id2word=self.dictionary,
                            iterations=ITERATIONS)
                            
        elif MODEL == 'HdpModel':
            model = HdpModel(corpus, self.dictionary)
        else:
            model = LdaModel(corpus=corpus,
                           id2word=self.dictionary,
                           num_topics=num_topics,
                           iterations=ITERATIONS,
                           update_every=1,
                           chunksize=10,
                           passes=1,
                           distributed=False)
                           
            '''
            More possible options above:
                           chunksize=1,
                           update_every=1,
                           decay=0.5,
            '''
        
        if MODEL == 'LdaModel' or MODEL == 'LdaMallet':               
            topic_number = 0
            for topic in model.show_topics(topics=TOPICS_DISPLAY, 
                                         topn=WORDS_DISPLAY,
                                         formatted=True):
                topic_number += 1
                print('Topic#' + str(topic_number) + ': ', topic)
        else: # For MODEL 'HdpModel'
            for topic in model.print_topics(topics=TOPICS_DISPLAY, \
                               topn=WORDS_DISPLAY):
                print(topic)

    def _set_number_of_types(self):
        """Set number of types (from tokens)."""
        self.number_of_types = len(set(list(itertools.\
                                    chain(*self.articles))))
        
    def _set_number_of_tokens(self):
        """Set number of tokens gotten in all documents."""
        self.number_of_tokens = sum(len(article) \
                                    for article in self.articles)
        
    def _set_number_of_docs(self):
        """Set number of docs found in collection read in."""
        self.number_of_docs = len(self.articles)
        
    def _set_filepaths(self):
        """Sets filepaths for intermediate data."""

        # Filepaths necessary for topic modeling
        self.wordsids_filepath = WORDSIDS_DIR + self.identifier + \
                                 '_' + 'wordsids.txt'
        self.bowmm_filepath = BOWMM_DIR + self.identifier + '_' + \
                              'bow.mm'
        self.tfidf_filepath = TFIDF_DIR + self.identifier + '_' + \
                              'tfidf.mm'

    def _create_dictionary(self):
        """Create a mapping of ids and surface froms (=words)."""
        
        print('Create dictionary of collection.')
        self.dictionary = Dictionary(self.articles)
        self.dictionary.filter_extremes(no_below=NO_BELOW,
                                        no_above=NO_ABOVE)
        self.dictionary.save_as_text(self.wordsids_filepath)
        self.dictionary.compactify()
        print(self.dictionary)
    
    def _create_bow_representation(self):
        """Create bag-of-words representation of collection, and save it 
           in Matrix Matrix format to disk."""
        
        print('Create bag-of-words matrix representation.')
        self.bow_corpus = [self.dictionary.doc2bow(article) 
                           for article in self.articles]
        MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)

    def _create_tfidf_matrix(self):
        """Create TF-IDF matrix and save it in Matrix Matrix format to 
           disk"""
        
        print('Create TF-IDF matrix of collection.')
        tfidf = TfidfModel(self.bow_corpus, 
                           id2word=self.dictionary, 
                           normalize=True)
        MmCorpus.serialize(self.tfidf_filepath, 
                           tfidf[self.bow_corpus])
        print('Number of documents:', tfidf.num_docs)

    def _collection_identifier(self):
        """Collection id is important for the caching files and the
           file naming of the corresponding files."""
           
        start_year = self.year_range[0]
        end_year = self.year_range[-1]
        
        if start_year == end_year:
            self.identifier = str(start_year) + '_' + self.lang
        else:
            self.identifier = str(start_year) + '-' + str(end_year) + \
                              '_' + self.lang 
        
    def _read_collection(self):
        """Iterate through all years in order to get all articles read
           in."""
        for year in self.year_range:
            # Not every single yearbook is available.
            try:
                self._read_book(year)
            except:
                print('Skip (inexistent) yearbook ' + str(year) + '.')
        
    def _read_book(self, year):
        """Read in a a single book and save its articles."""
        filepath = sac_filepath(year, lang=self.lang)
        
        print('Read in yearbook ' + str(year) + '.')
        sac_xml = etree.parse(SAC_XML_DIR + filepath)
        sac_xml_articles_list = sac_xml.xpath('.//article')
        
        # For each article
        for sac_xml_article in sac_xml_articles_list:
            
            # Prepare file to write out words
            sac_xml_article_no = sac_xml_article.attrib['n']
            out_filename = str(year) + '-' + str(self.lang) + '-' \
                           + sac_xml_article_no + '.txt'
            out_filepath = self.text_output_dirpath + sep + out_filename
            print(out_filepath)
            out_filehdl = open(out_filepath, 'w')
                               
            article_word_list = []
            sac_xml_sentences_list = \
                sac_xml_article.xpath('.//s[@lang=\'' + \
                                      self.lang + '\']')
            # For each sentence (in the article)
            for sac_xml_sentence in sac_xml_sentences_list:
                sac_xml_words_list = sac_xml_sentence.xpath('.//w')
                # For each word (in the sentence of the article)
                for sac_xml_word in sac_xml_words_list:
                    word = None
                    try:
                        if WITH_POS_FILTER is False:
                            if WITH_LEMMATA:
                                word = sac_xml_word.attrib['lemma'].lower()
                                if self._is_lemma_bogus(word):
                                    word = sac_xml_word.text.lower()
                            if WITH_LEMMATA is False:
                                word = sac_xml_word.text.lower()
                        elif WITH_POS_FILTER:
                            word = self._get_pos_filtered_word(sac_xml_word)
                    except:
                        pass
                        
                    # Don't add stop words, in any case
                    if not word in STOPWORDS[self.lang] \
                    and word is not None and len(word) >= MIN_WORDLEN:
                        article_word_list.append(self.\
                                                 _normalize_word(word).\
                                                 encode(ENCODING))
            # Save article as bag-of-words (of the sentences)
            self.articles.append(article_word_list)
            out_filehdl.write(' '.join(article_word_list))
            out_filehdl.close()
    
    def _get_pos_filtered_word(self, sac_xml_word):
        """ Get word by PoS filter
        """
        # Some words have no PoS tags, hence the try/except.
        try:
            if sac_xml_word.attrib['pos'] \
            in POS_FILTER[self.lang]:
                if WITH_LEMMATA:
                    word = sac_xml_word.attrib['lemma'].lower()
                    if self._is_lemma_bogus(word):
                        return sac_xml_word.text.lower()
                    else:
                        return sac_xml_word.attrib['lemma'].lower()
                else:
                    return sac_xml_word.text.lower()
            else:
                return None
        except:
            return None
    
    def _is_lemma_bogus(self, lemma):
        """ Return true if the lemma is not useful for LDA, otherwise
            false.
        """
        
        for bogus_symbol in SURFACE_TRIGGERS:
            if bogus_symbol in lemma:
                return True
        
        # That's the last resort
        return False
    
    def _normalize_word(self, word_to_normalize):
        """
        This function helps to normalize words, because of encoding
        issues of some LDA tools ...
        @return: Normalized word as str type
        """
        
        # Transform umlauts to ASCII friendly form
        word = word_to_normalize.replace(u"ä","ae").replace(u"ö","oe"). \
            replace(u"ü","ue").replace(u"ß","ss")
        return word
                
    def __str__(self):
        """ Return a string which shows document number, number of
            words and number of types.
        """
        ret_string = ''
        art_number = 0
        
        for article in self.articles:
            art_number += 1
            ret_string += 'Doc#' + str(art_number) + ': '
            ret_string += str(len(article)) + ' [' + \
                          str(len(set((article)))) + ']'
            ret_string += '\n'
            
        return ret_string
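
# Hedged usage sketch (not part of the original class): build the collection for
# a range of yearbooks and print the discovered topics. The year range and the
# text output directory are placeholders.
collection = ArticlesCollection(range(1990, 1996), '/tmp/sac_articles')
print(collection)
collection.show_lda()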
Exemplo n.º 30
0
class Classifier(object):
    """
    新建对象或者调用trainModel方法,可以生成Classifier模型
    调用predict方法,可以预测新的日志文件类型及其置信度
    $DATA/models/l1file_info.csv:记录原始样本文件信息(暂时不要?)
    $DATA/l1cache/: 存储各样本文件。目录结构就是被管服务器原始结构
    """
    __corpusCacheFile = os.path.join(G.projectModelPath, 'corpuscache.1')
    l1_dbf = os.path.join(G.projectModelPath, 'metadata.1')
    __MaxLines = G.cfg.getint('Classifier', 'MaxLines')

    def __init__(self, model_file=''):
        self.model_file = model_file
        self.model_id = 0 if model_file == G.productFileClassifierModel else 1
        self.common_filenames, self.l1_structure = ([], [])
        self.ruleSet = None  # regular-expression rule set used to preprocess files
        self.statsScope = None  # min-max range of per-file char/word statistics (mean, std, median)
        self.dictionary = None  # dictionary object (gensim Dictionary)
        self.model = None  # clustering model (KMeans)
        self.categories = None  # cluster categories (name, proportion, quantile distance, boundary distance)

        if os.path.exists(model_file):  # load the model from the model file
            self.ruleSet, self.dictionary, self.statsScope, self.model, self.categories = joblib.load(
                model_file)
        else:
            G.log.warning('No model loaded!')

    # Retrain the model from scratch
    def reCluster(self):
        for folder in [G.l1_cache, G.l2_cache, G.outputs]:
            if os.path.exists(folder):
                shutil.rmtree(folder)
        time.sleep(3)
        for folder in [G.l1_cache, G.l2_cache, G.outputs]:
            os.mkdir(folder)

        common_files, file2merged = Util.mergeFilesByName(
            G.l0_inputs, G.l1_cache)
        self.__dbFilesSampled(file2merged)
        results = self.trainModel(k_=35)
        self.__saveModel()  # model saved to file

        Util.clearModel(self.model_id)
        self.dbUpdCategories()

        db = Util.dbConnect()
        if not db:
            return
        cursor = db.cursor()
        classified_common_files, unclassified_common_files = self.splitResults(
            results)
        Util.dbFilesMerged(cursor, file2merged, classified_common_files,
                           unclassified_common_files)
        Util.mergeFilesByClass(cursor, G.l2_cache)  # merge files of the same class into the to_ directory
        wildcard_log_files = Util.genGatherList(cursor)  # build the list of log files to gather
        Util.dbWildcardLogFiles(cursor, wildcard_log_files)
        db.commit()
        db.close()

    def __dbFilesSampled(self, file2merged):
        db = Util.dbConnect()
        if not db:
            return
        cursor = db.cursor()
        for file_fullname, anchor_name, anchor_colRange, common_file_fullname in file2merged:
            file_fullname = file_fullname.replace('\\', '/')
            host = file_fullname[len(G.l0_inputs):].strip('/')
            host, filename = host.split('/', 1)
            archive_path, filename = os.path.split(filename)
            remote_path = '/' + archive_path if archive_path[
                1] != '_' else archive_path[0] + ':' + archive_path[2:]
            host = '"%s"' % host.strip('/')
            archive_path = '"%s"' % archive_path
            remote_path = '"%s"' % remote_path
            file_fullname = '"%s"' % file_fullname
            filename = '"%s"' % filename
            sql = 'INSERT INTO files_sampled (file_fullname,host,archive_path,filename,remote_path) VALUES(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE file_fullname=%s' % (
                file_fullname, host, archive_path, filename, remote_path,
                file_fullname)
            cursor.execute(sql)
        db.commit()
        db.close()

    def __iter__(self):
        self.category_id = 0
        return self

    # Return each category's (name, proportion, quantile-to-center distance, boundary-to-quantile distance)
    def __next__(self):
        i = self.category_id
        if i >= len(self.categories[0]):
            raise StopIteration
        self.category_id += 1
        return self.categories[0][i], self.categories[1][i], self.categories[
            2][i], self.categories[3][i]

    def __len__(self):
        return len(self.categories[0])

    def __getitem__(self, item):
        if item < -len(self.categories[0]) or item >= len(self.categories[0]):
            raise IndexError
        return self.categories[0][item], self.categories[1][
            item], self.categories[2][item], self.categories[3][item]

    def __setitem__(self, key, value):
        if key < -len(self.categories[0]) or key >= len(self.categories[0]):
            raise IndexError
        name = str(value)
        if name in self.categories[0]:
            raise ValueError
        self.categories[0][key] = name
        self.dbUpdCategories()

    def dbUpdCategories(self):
        db = Util.dbConnect()
        if db:  # can't connect to the db: wait and retry forever
            cursor = db.cursor()
            c = self.categories
            for category_id, (name, percent, boundary, quantile) in enumerate(
                    zip(c[0], c[1], c[2], c[3])):
                name = '"%s"' % name
                sql = 'INSERT INTO file_class (model_id, category_id, name, quantile, boundary, percent) VALUES(%d, %d, %s, %e,%e, %f) ON DUPLICATE KEY UPDATE name=%s,quantile=%e,boundary=%e,percent=%f' % (
                    self.model_id, category_id, name, quantile, boundary,
                    percent, name, quantile, boundary, percent)
                cursor.execute(sql)
            db.commit()
        db.close()

    # Train the model and save it to $models/xxx.mdl. dataset: absolute/relative path to sample files, or an iterable character stream
    def trainModel(self, dataset_path=G.l1_cache, k_=0):
        """
        Train and generate K-Means Model
        :param dataset_path: source path contains merged log files, or iterable char stream
        :param k_: K-means parameter, 0 means auto detect
        """
        rule_sets = []  # replacement, stop-word and k-shingle rules for text preprocessing
        for ruleset_name in sorted([
                section for section in G.cfg.sections()
                if section.split('-')[0] == 'RuleSet'
        ]):
            replace_rules, stop_words, k_list = [], [], []
            for key, value in G.cfg.items(ruleset_name):
                if key == 'stopwords':
                    stop_words = value.split(',')
                elif key == 'k-shingles':
                    k_list = eval(value)
                else:
                    replace_from, replace_to = value.split('TO')
                    replace_rules.append(
                        (re.compile(replace_from.strip(),
                                    re.I), replace_to.strip()))
            rule_sets.append((ruleset_name, replace_rules, stop_words, k_list))
        # Try different vectorization rule sets to settle on the number of clusters K
        for self.ruleSet in rule_sets:
            corpus_fp = self.__buildDictionary(dataset_path)  # build the dictionary and get document-structure info
            if len(self.dictionary) < G.cfg.getint(
                    'Classifier', 'LeastTokens'):  # dictionary too small: re-sample
                corpus_fp.close()
                self.__clearCache()
                G.log.info('Too few tokens[%d], Re-sample with next RuleSet.',
                           len(self.dictionary))
                continue
            corpus_fp.seek(0)
            vectors = self.__buildVectors(
                corpus_fp, self.dictionary.num_docs)  # build the sparse doc * (dict + stats) matrix
            corpus_fp.close()  # close the cache file

            #            start_k = self.__findStartK(vectors)  # quickly locate a starting K with a reasonably balanced distribution
            #            if start_k is None:  # clustering unbalanced: switch rule set and re-sample
            #                continue

            start_k = min(50, int(vectors.shape[0] / 100))
            k_ = k_ if k_ else self.__pilotClustering(vectors,
                                                      start_k)  # pilot-cluster over several K values, return the best K
            if k_ != 0:  # a suitable K was found: leave the loop
                break
            self.__clearCache()  # clear the cached rule set
        else:
            raise UserWarning(
                'Cannot generate qualified corpus by all RuleSets')

        # Re-cluster to get the model (vector count, centers, distances) and the assignments (vector -> cluster)
        self.model, percents, boundaries, quantiles = self.__buildModel(
            k_, vectors)
        names = ['fc%d' % i for i in range(len(percents))]
        self.categories = [names, percents, boundaries, quantiles]
        results = self.__getResult(vectors)
        return results

    # Build the dictionary and cache the token lists to a file
    def __buildDictionary(self, new_dataset_path):
        self.dictionary = Dictionary()

        # Load the previously processed, cached corpus
        cache_fp = open(self.__corpusCacheFile, mode='a+t',
                        encoding='utf-8')  # create or open the corpus cache file
        if cache_fp.tell() != 0:
            if os.path.exists(self.l1_dbf):
                self.ruleSet, self.common_filenames, self.l1_structure, self.statsScope = joblib.load(
                    self.l1_dbf)
            cache_fp.seek(0)
            cached_documents = len(self.common_filenames)
            for lines, line_ in enumerate(cache_fp):
                if lines < cached_documents:
                    self.dictionary.add_documents([line_.split()])
            G.log.info('%d cached documents loaded.', lines)

        # Continue with the newly added corpus
        for document in self.__buildDocument(new_dataset_path):
            self.dictionary.add_documents([document])
            cache_fp.write(' '.join([word for word in document]) + '\n')

        if self.dictionary.num_docs < G.cfg.getint(
                'Classifier', 'LeastFiles'):  # too few tokens or documents: clustering is pointless
            cache_fp.close()
            self.__clearCache()
            raise UserWarning('Too few documents[%d] to clustering' %
                              self.dictionary.num_docs)

        # Drop low-frequency tokens and compact the dictionary
        num_token = len(self.dictionary)
        no_below = int(
            min(G.cfg.getfloat('Classifier', 'NoBelow'),
                int(self.dictionary.num_docs / 50)))
        self.dictionary.filter_extremes(no_below=no_below,
                                        no_above=0.999,
                                        keep_n=G.cfg.getint(
                                            'Classifier', 'KeepN'))
        self.dictionary.compactify()
        G.log.info(
            'Dictionary built with [%s](%d tokens, reduced from %d), from %d files( %d words)',
            self.ruleSet[0], len(self.dictionary), num_token,
            self.dictionary.num_docs, self.dictionary.num_pos)

        statistics = np.array(self.l1_structure)[:, 1:7]
        statistics[statistics > 500] = 500  # cap outliers so abnormally large values do not distort the result
        self.statsScope = np.min(statistics, axis=0), np.max(statistics,
                                                             axis=0)
        joblib.dump((self.ruleSet, self.common_filenames, self.l1_structure,
                     self.statsScope), self.l1_dbf)  # persist the metadata for later use

        return cache_fp

    # Preprocess: walk the dataset and yield each file's token list.
    def __buildDocument(self, dataset_path):
        amount_files, failed_files, file_fullname = 0, 0, ''
        G.log.info('Start Converting documents from ' + dataset_path)

        processed_files = os.path.join(G.projectModelPath, 'buildDocument.dbf')
        processed = [] if not os.path.exists(processed_files) else joblib.load(
            processed_files)

        for dir_path, dir_names, file_names in os.walk(dataset_path):
            for file_name in file_names:
                try:
                    file_fullname = os.path.join(dir_path, file_name)
                    if file_fullname in processed:
                        continue
                    amount_files += 1
                    if amount_files % 50 == 0:
                        G.log.info('Converted %d[%d failed] files:\t%s',
                                   amount_files, failed_files, file_fullname)
                    processed.append(file_fullname)
                    yield self.__file2doc(file_fullname)
                except Exception as err:
                    failed_files += 1
                    G.log.warning('Failed to convert\t%s, ignored.\t%s',
                                  file_fullname, str(err))
                    continue
        joblib.dump(processed, processed_files)
        G.log.info('Converted %d files,%d failed', amount_files, failed_files)
        return  # end of generator (raising StopIteration here would break under PEP 479)

    # Apply the rule set (match and transform), then turn the file into a token list
    def __file2doc(self, file_fullname, encoding='utf-8'):
        document = []
        line_idx, lc, lw = 0, [], []

        G.log.debug('Converting ' + file_fullname)
        for line_idx, line in enumerate(
                open(file_fullname, 'r', encoding=encoding)):
            words = G.getWords(line, rule_set=self.ruleSet)
            document += words  # accumulate the token list
            lc.append(len(line))
            lw.append(len(words))
            if line_idx > self.__MaxLines:
                break
        line_idx += 1

        # Compute statistics
        subtotal_chars = list(
            np.histogram(np.array(lc), bins=[0, 40, 80, 120, 160, 200, 1000
                                             ])[0] / line_idx)
        subtotal_words = list(
            np.histogram(np.array(lw), bins=[0, 4, 8, 12, 16, 20, 100])[0] /
            line_idx)
        stats = [
            np.mean(lc),
            np.mean(lw),
            np.std(lc),
            np.std(lw),
            np.median(lc),
            np.median(lw)
        ]
        doc_structure = [line_idx] + stats + subtotal_chars + subtotal_words
        # Aggregate and keep the metadata
        self.common_filenames.append(file_fullname)
        self.l1_structure.append(doc_structure)
        return document

    # Build clustering vectors from the token lists and the document-structure stats
    def __buildVectors(self, corpus, rows):
        cols = len(self.dictionary)

        # Build the tf-idf bag-of-words and document vectors
        tfidf_model = TfidfModel(dictionary=self.dictionary, normalize=True)
        vectors = np.zeros((rows, cols))
        for doc_idx, document in enumerate(corpus):
            if type(document) == str:
                document = document.split()
            for (word_idx, tf_idf_value
                 ) in tfidf_model[self.dictionary.doc2bow(document)]:
                vectors[doc_idx, word_idx] = tf_idf_value  # put the tf-idf weight into the vector

        # Normalize the tf-idf vectors by each document's line count so documents stay comparable
        l1_fd = np.array(
            self.l1_structure)[-rows:, :]  # [[line count, mean/std/median, 12 bucket ratios of chars and words]]

        lines = l1_fd[:, 0:1]
        vectors /= lines

        # Normalize the document-structure stats and build that part of the vector
        min_, max_ = self.statsScope
        statistics = l1_fd[:, 1:7]
        statistics[statistics > 500] = 500  # cap outliers so abnormally large values do not distort the result
        statistics = (statistics - min_) / (max_ -
                                            min_) * 0.01  # the 6 statistic columns carry roughly 1% weight each
        subtotal = l1_fd[:, 7:] * 0.005  # the 12 subtotal columns carry roughly 0.5% weight each

        cols += len(self.l1_structure[0])
        if rows > 300:
            G.log.info('[%d*%d]Vectors built' % (rows, cols))

        return np.hstack((statistics, subtotal, vectors))

    # Starting from k=64, bisect to find a K whose top-5 clusters hold less than the configured share of samples
    @staticmethod
    def __findStartK(vectors):
        k_from, k_, k_to = 5, 64, 0
        while k_ < min(G.cfg.getint('Classifier', 'MaxCategory'),
                       len(vectors)):
            kmeans = KMeans(n_clusters=k_).fit(vectors)  # cluster
            n = min(5, int(k_ * 0.1) + 1)
            top5_ratio = sum([
                v for (k, v) in Counter(kmeans.labels_).most_common(n)
            ]) / vectors.shape[0]
            G.log.debug(
                'locating the starter. k=%d, SSE= %e, Top%d labels=%d%%', k_,
                kmeans.inertia_, n, top5_ratio * 100)

            if top5_ratio < G.cfg.getfloat('Classifier', 'Top5Ratio'):  # search downward
                if k_ - k_from < 4:  # close to the lower bound: rough starting point found
                    G.log.info('start k=%d', k_from)
                    return k_from
                k_to = k_ - 1
                k_ = k_from + int((k_ - k_from) / 2)
            else:  # search upward
                if k_ < k_to < k_ + 4:  # close to the upper bound: rough starting point found
                    G.log.info('start k=%d', k_)
                    return k_
                k_from = k_ + 1
                if k_to > 0:  # an upper bound exists
                    k_ = k_to - int((k_to - k_) / 2)
                else:  # no upper bound yet
                    k_ *= 2

            if kmeans.inertia_ < 1e-5:  # fully separated already, yet still unbalanced
                break

        G.log.info('No starter found')
        return None  # No found,re-samples

    # Cluster and record each run's SSE (sum of squared errors) as the basis for picking k with the elbow method
    @staticmethod
    def __pilotClustering(vectors,
                          k_from=1,
                          k_to=G.cfg.getint('Classifier', 'MaxCategory')):
        norm_factor = vectors.shape[0] * vectors.shape[1]  # normalize by rows/samples and columns/dictionary width so different runs stay comparable
        termination_inertia = G.cfg.getfloat(
            'Classifier', 'NormalizedTerminationInertia') * norm_factor
        cfg_q = G.cfg.getfloat('Classifier', 'Quantile')
        # pilot_list holds (k_, inertia, criterion, top5_percent, bad_percent);
        # the criterion takes local maxima of the first derivative of the inertia's rate of change
        k_, pilot_list = 0, []
        # Cluster from k_from to k_to to build pilot_list
        for k_ in range(k_from, k_to):
            kmeans = KMeans(n_clusters=k_, tol=1e-5).fit(vectors)  # pilot clustering
            if k_ < k_from + 2:
                pilot_list.append([k_, kmeans.inertia_, 0, 0, 0])
                continue

            retry = 0  # cluster a few extra times so inertia keeps decreasing
            for retry in range(5):  # if inertia grows due to noise, re-cluster a few times
                inertia = kmeans.inertia_
                if inertia <= pilot_list[-1][1]:
                    break
                G.log.debug('retries=%d, inertia=%e', retry + 1, inertia)
                kmeans = KMeans(n_clusters=k_).fit(vectors)
            else:
                inertia = pilot_list[-1][1]

            pilot_list[-1][2] = pilot_list[-2][1] / pilot_list[-1][
                1] - pilot_list[-1][1] / inertia
            a = pilot_list[-1]
            G.log.info(
                'pilot clustering. (k,inertia,criteria,top5,bad)=\t%d\t%e\t%.3f\t%.3f\t%.3f',
                pilot_list[-1][0], pilot_list[-1][1], pilot_list[-1][2],
                pilot_list[-1][3], pilot_list[-1][4])

            top5_percent = sum([
                v for (k, v) in Counter(kmeans.labels_).most_common(5)
            ]) / len(kmeans.labels_)
            # Compute the share of bad points (farther than 2x the 0.8-quantile distance)
            v_scores = -np.array([kmeans.score([v]) for v in vectors])
            groups = pd.DataFrame({
                'C': kmeans.labels_,
                'S': v_scores
            }).groupby('C')
            c_quantiles_double = 2 * np.array(
                [groups.get_group(i)['S'].quantile(cfg_q) for i in range(k_)])
            bad_samples = 0
            for idx, score in enumerate(v_scores):
                if score > c_quantiles_double[kmeans.labels_[idx]]:
                    bad_samples += 1
            bad_percent = bad_samples / len(v_scores)

            pilot_list.append([k_, inertia, None, top5_percent, bad_percent])
            if inertia < termination_inertia:  # converged to a very small value with a candidate found: stop increasing K
                break
        # Keep the top local maxima from the pilot list
        pilot_list = np.array(pilot_list)[1:-1, :]  # drop the first and last rows, whose criterion cannot be computed
        pilot_list = pilot_list[pilot_list[:, 3] < G.cfg.getfloat(
            'Classifier', 'Top5Ratio')]  # drop rows whose top-5 share exceeds the limit
        pilot_list = pilot_list[argrelextrema(pilot_list[:, 2],
                                              np.greater)]  # keep the local maxima
        criteria = pilot_list[:, 2].tolist()
        if not criteria:  # no local maxima
            return None

        max_top_n, idx_ = [], 0
        while criteria[idx_:]:
            idx_ = criteria.index(max(criteria[idx_:]))
            max_top_n.append(pilot_list[idx_])
            idx_ += 1
        G.log.debug(
            'topN k=\n%s', '\n'.join([
                '%d\t%e\t%.3f\t%.3f\t%.3f' % (k, i, c, t, b)
                for k, i, c, t, b in max_top_n
            ]))
        products = [k * c for k, i, c, t, b in max_top_n]
        idx_ = products.index(max(products))
        preferred = max_top_n[idx_][0]
        G.log.info('pilot-clustering[k:%d] finished. preferred k=(%d)', k_,
                   preferred)
        return preferred

    # Re-cluster to get each cluster's center, quantile distance, boundary distance and size proportion
    @staticmethod
    def __buildModel(k_, vectors):
        # Cluster again and group the results (KMeans does not support cosine distance)
        kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors)
        norm_factor = -vectors.shape[1]  # normalize by dictionary width
        groups = pd.DataFrame({
            'C':
            kmeans.labels_,
            'S': [kmeans.score([v]) / norm_factor for v in vectors]
        }).groupby('C')
        percents = groups.size() / len(vectors)  # each cluster's share of all clustered vectors
        cfg_q = G.cfg.getfloat('Classifier', 'Quantile')
        quantiles = np.array([
            groups.get_group(i)['S'].quantile(cfg_q, interpolation='higher')
            for i in range(k_)
        ])
        boundaries = groups['S'].agg('max').values  # distance of the farthest point in each cluster

        quantiles2 = quantiles * 2
        boundaries[boundaries > quantiles2] = quantiles2[
            boundaries > quantiles2]  # clip boundaries that are too far out
        boundaries[boundaries < 1e-100] = 1e-100  # clip zero boundaries
        quantiles = boundaries - quantiles
        quantiles[quantiles < 1e-100] = 1e-100  # avoid 0/0

        G.log.info(
            'Model(k=%d) built. inertia=%e, max proportion=%.2f%%, max quantile=%e, max border=%e',
            k_, kmeans.inertia_,
            max(percents) * 100, max(quantiles), max(boundaries))
        return kmeans, percents, boundaries, quantiles

    # Save the model, plus text-format summaries
    def __saveModel(self):
        joblib.dump((self.ruleSet, self.dictionary, self.statsScope,
                     self.model, self.categories),
                    G.projectFileClassifierModel)
        self.dictionary.save_as_text(
            os.path.join(G.logsPath, 'FileDictionary.csv'))
        category_names, percents, boundaries, quantiles = self.categories
        l2_fd = pd.DataFrame({
            '类名': category_names,
            '占比': percents,
            '分位点到边界': quantiles,
            '边界点': boundaries
        })
        l2_fd.to_csv(os.path.join(G.logsPath, 'FileCategories.csv'),
                     sep='\t',
                     encoding='GBK')
        G.log.info(
            'Model is built and saved to %s, %s and Database: FileDictionary.csv, FileCategories.csv successful.',
            G.projectFileClassifierModel, G.logsPath)

    def splitResults(self, results):
        classified_files, unclassified_files = [], []
        for common_name, category, category_name, confidence, distance in zip(
                self.common_filenames, results[0], results[1], results[2],
                results[3]):
            if confidence < G.minConfidence:  # confidence too low: left unclassified
                unclassified_files.append(common_name)
            else:
                classified_files.append([
                    self.model_id, common_name, category, category_name,
                    confidence, distance
                ])
        return classified_files, unclassified_files

    # Classify a single sample file; return its category and confidence
    def predictFile(self, file_fullname, encoding='utf-8'):
        """
        :param file_fullname: log file to be predicted:
        :param encoding: encoding of the file
        :return: None if file __process errors, tuple of filename, number-of-lines, timestamp-cols, predict category
                 index, name, confidence and distance-to-center.
                 confidence > 1 means nearer than 0.8-quantile to the center, < 0 means out of boundary
        """
        if self.model is None:
            raise UserWarning('Failed to predict: Model is not exist!')

        try:
            document = self.__file2doc(file_fullname,
                                       encoding=encoding)  # turn the file into a token list
            vectors = self.__buildVectors([document], 1)
            categories, names, confidences, distances = self.__getResult(
                vectors)  # predict the category and compute confidence
            return categories[0], names[0], confidences[0], distances[0]

        except Exception as err:
            G.log.warning('Failed to predict\t%s, ignored.\t%s', file_fullname,
                          str(err))
            return None

    # Classify every sample file under a directory; return their categories and confidences
    def predictFiles(self, dataset_path, encoding='utf-8'):
        """
        :param dataset_path: path which contains filed to be predicted
        :param encoding: encoding of the file
        :return: list of file-names, number-of-line, timestamp-col, predict category index, name, confidence and
                 distance-center. confidence > 1 means nearer than 0.8-quantile to the center, < 0 means out of boundary
        """
        if self.model is None:
            raise UserWarning('Failed to predict: Model is not exist!')

        corpus = []
        start_ = len(self.common_filenames)
        amount_files, failed_files, file_fullname = 0, 0, ''
        G.log.info('Start __process documents from ' + dataset_path)
        for dir_path, dir_names, file_names in os.walk(dataset_path):
            try:
                for file_name in file_names:
                    file_fullname = os.path.join(dir_path, file_name)
                    amount_files += 1
                    if amount_files % 50 == 0:
                        G.log.info('Processed %d files, failed %d',
                                   amount_files, failed_files)
                    corpus.append(
                        self.__file2doc(file_fullname,
                                        encoding=encoding))  # turn the file into a token list
            except Exception as err:
                failed_files += 1
                G.log.warning('Failed to __process\t%s, ignored.\t%s',
                              file_fullname, str(err))
                continue
        G.log.info('Converted %d files,%d(%d%%) failed', amount_files,
                   failed_files, failed_files / amount_files * 100)

        vectors = self.__buildVectors(corpus, len(corpus))
        categories, category_names, confidences, distances = self.__getResult(
            vectors)  # predict categories and compute confidence
        files = self.common_filenames[start_:]
        return files, list(categories), category_names, list(
            confidences), distances

    # Predict categories and compute confidence. < 0 means outside the boundary (clearly wrong); > 1 means closer than the quantile point (highly credible)
    def __getResult(self, vectors):
        c_names, c_percents, c_boundaries, c_quantiles = self.categories
        norm_factor = -vectors.shape[1]  # normalize by dictionary width

        predicted_labels = self.model.predict(vectors)  # predict each record's cluster with the clustering model
        predicted_names = [c_names[label] for label in predicted_labels]
        confidences = []
        distances = []
        for i, v in enumerate(vectors):
            distance = self.model.score([v]) / norm_factor
            distances.append(distance)
            category = predicted_labels[i]
            confidences.append(
                (c_boundaries[category] - distance) / c_quantiles[category])
        confidences = np.array(confidences, copy=False)
        confidences[confidences > 99.9] = 99.9
        confidences[confidences < -99.9] = -99.9

        return predicted_labels, predicted_names, confidences, distances

    # Delete the cache files
    def __clearCache(self):
        for f in [self.__corpusCacheFile, self.l1_dbf]:
            try:
                if os.path.exists(f):
                    os.remove(f)
            except Exception as err:
                G.log.warning('Failed to clear %s. %s' % (f, str(err)))
                continue
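
# Hedged usage sketch (not part of the original class): load an existing model if
# present, otherwise retrain from the sample cache, then classify one log file.
# The G.* paths come from the surrounding project; the log file name is a placeholder.
classifier = Classifier(model_file=G.projectFileClassifierModel)
if classifier.model is None:
    classifier.trainModel(dataset_path=G.l1_cache)
result = classifier.predictFile('/var/log/app/sample.log')
if result:
    category, name, confidence, distance = result
    print(category, name, confidence, distance)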
Exemplo n.º 31
0
class DocDataset(Dataset):
    def __init__(self,
                 taskname,
                 txtPath=None,
                 lang="zh",
                 tokenizer=None,
                 stopwords=None,
                 no_below=5,
                 no_above=0.1,
                 hasLable=False,
                 rebuild=False,
                 use_tfidf=False):
        cwd = os.getcwd()
        txtPath = os.path.join(
            cwd, 'data',
            f'{taskname}_lines.txt') if txtPath is None else txtPath
        tmpDir = os.path.join(cwd, 'data', taskname)
        self.txtLines = [
            line.strip('\n') for line in open(txtPath, 'r', encoding='utf-8')
        ]
        self.dictionary = None
        self.bows, self.docs = None, None
        self.use_tfidf = use_tfidf
        self.tfidf, self.tfidf_model = None, None
        if not os.path.exists(tmpDir):
            os.mkdir(tmpDir)
        if not rebuild and os.path.exists(os.path.join(tmpDir, 'corpus.mm')):
            self.bows = gensim.corpora.MmCorpus(
                os.path.join(tmpDir, 'corpus.mm'))
            if self.use_tfidf:
                self.tfidf = gensim.corpora.MmCorpus(
                    os.path.join(tmpDir, 'tfidf.mm'))
            self.dictionary = Dictionary.load_from_text(
                os.path.join(tmpDir, 'dict.txt'))
            self.docs = pickle.load(
                open(os.path.join(tmpDir, 'docs.pkl'), 'rb'))
            self.dictionary.id2token = {
                v: k
                for k, v in self.dictionary.token2id.items()
            }  # id2token is empty by default (a gensim quirk), so rebuild it here
        else:
            if stopwords is None:
                stopwords = set([
                    l.strip('\n').strip()
                    for l in open(os.path.join(cwd, 'data', 'stopwords.txt'),
                                  'r',
                                  encoding='utf-8')
                ])
            # self.txtLines is the list of string, without any preprocessing.
            # self.texts is the list of list of tokens.
            print('Tokenizing ...')
            if tokenizer is None:
                tokenizer = globals()[LANG_CLS[lang]](stopwords=stopwords)
            self.docs = tokenizer.tokenize(self.txtLines)
            self.docs = [line for line in self.docs if line != []]
            # build dictionary
            self.dictionary = Dictionary(self.docs)
            #self.dictionary.filter_n_most_frequent(remove_n=20)
            # self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=1994)  # use Dictionary to remove un-relevant tokens
            self.dictionary.compactify()
            self.dictionary.id2token = {
                v: k
                for k, v in self.dictionary.token2id.items()
            }  # id2token is empty by default (a gensim quirk), so rebuild it here
            # convert to BOW representation
            self.bows, _docs = [], []
            for doc in self.docs:
                _bow = self.dictionary.doc2bow(doc)
                if _bow != []:
                    _docs.append(list(doc))
                    self.bows.append(_bow)
            self.docs = _docs
            if self.use_tfidf:
                self.tfidf_model = TfidfModel(self.bows)
                self.tfidf = [self.tfidf_model[bow] for bow in self.bows]
            # serialize the dictionary
            gensim.corpora.MmCorpus.serialize(
                os.path.join(tmpDir, 'corpus.mm'), self.bows)
            self.dictionary.save_as_text(os.path.join(tmpDir, 'dict.txt'))
            pickle.dump(self.docs, open(os.path.join(tmpDir, 'docs.pkl'),
                                        'wb'))
            if self.use_tfidf:
                gensim.corpora.MmCorpus.serialize(
                    os.path.join(tmpDir, 'tfidf.mm'), self.tfidf)
        self.vocabsize = len(self.dictionary)
        self.numDocs = len(self.bows)
        print(f'Processed {len(self.bows)} documents.')

    def __getitem__(self, idx):
        bow = torch.zeros(self.vocabsize)
        if self.use_tfidf:
            item = list(zip(*self.tfidf[idx]))
        else:
            item = list(
                zip(*self.bows[idx]
                    ))  # bow = [[token_id1,token_id2,...],[freq1,freq2,...]]
        bow[list(item[0])] = torch.tensor(list(item[1])).float()
        txt = self.docs[idx]
        return txt, bow

    def __len__(self):
        return self.numDocs

    def collate_fn(self, batch_data):
        texts, bows = list(zip(*batch_data))
        return texts, torch.stack(bows, dim=0)

    def __iter__(self):
        for doc in self.docs:
            yield doc

    def show_dfs_topk(self, topk=20):
        ndoc = len(self.docs)
        dfs_topk = sorted([(self.dictionary.id2token[k], fq)
                           for k, fq in self.dictionary.dfs.items()],
                          key=lambda x: x[1],
                          reverse=True)[:topk]
        for i, (word, freq) in enumerate(dfs_topk):
            print(f'{i+1}:{word} --> {freq}/{ndoc} = {(1.0*freq/ndoc):>.13f}')
        return dfs_topk

    def show_cfs_topk(self, topk=20):
        ntokens = sum([v for k, v in self.dictionary.cfs.items()])
        cfs_topk = sorted([(self.dictionary.id2token[k], fq)
                           for k, fq in self.dictionary.cfs.items()],
                          key=lambda x: x[1],
                          reverse=True)[:topk]
        for i, (word, freq) in enumerate(cfs_topk):
            print(
                f'{i+1}:{word} --> {freq}/{ntokens} = {(1.0*freq/ntokens):>.13f}'
            )

    def topk_dfs(self, topk=20):
        ndoc = len(self.docs)
        dfs_topk = self.show_dfs_topk(topk=topk)
        return 1.0 * dfs_topk[-1][-1] / ndoc
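
# Hedged usage sketch (not part of the original class): wrap the dataset in a
# PyTorch DataLoader using the provided collate_fn. The task name is a placeholder
# and assumes data/<taskname>_lines.txt plus data/stopwords.txt exist.
from torch.utils.data import DataLoader

docSet = DocDataset('mytask', use_tfidf=False)
loader = DataLoader(docSet, batch_size=64, shuffle=True, collate_fn=docSet.collate_fn)
for texts, bows in loader:
    print(bows.shape)  # (batch_size, vocabsize)
    break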
Exemplo n.º 32
0
class SenseClassifier(object):
    DATE_PTN = re.compile(
        u'(((19)*9\d|(20)*[01]\d)\-?)?((0[1-9]|1[012])\-?)([012]\d|3[01])')
    LABEL_TO_INDEX = {
        'movie': 0,
        'episode': 1,
        'enter': 2,
        'cartoon': 3,
        'game': 4
    }
    INDEX_TO_LABEL = {
        0: 'movie',
        1: 'episode',
        2: 'enter',
        3: 'cartoon',
        4: 'game'
    }

    def __init__(self, dict_file=None, model_file=None):
        if dict_file:
            self.dictionary = Dictionary.load_from_text(dict_file)
        else:
            self.dictionary = Dictionary()

        if model_file:
            self.model = joblib.load(model_file)
        else:
            self.model = None

    def dictionary_size(self):
        return len(self.dictionary)

    def expand_sent_terms(self, sent, center, rm_kw=False):
        expd_sent = list(sent)

        # Expand with ngram and position_term features.
        if center >= 0:
            ngram_terms = self._get_ngram_terms(sent, center)
            expd_sent.extend(ngram_terms)
            posi_terms = self._get_posi_terms(sent, center)
            expd_sent.extend(posi_terms)

        # Remove the keyword itself.
        if rm_kw and center >= 0:
            del expd_sent[center]

        return expd_sent

    def sentence_to_bow(self, sent):
        if self.dictionary:
            return self.dictionary.doc2bow(sent)
        else:
            return None

    def bow_to_feature_vec(self, bow_corpus):
        data = []
        rows = []
        cols = []
        line_count = 0
        for bow_sent in bow_corpus:
            for elem in bow_sent:
                rows.append(line_count)
                cols.append(elem[0])
                data.append(elem[1])
            line_count += 1

        return csr_matrix((data, (rows, cols)),
                          shape=(line_count, len(self.dictionary)))

    def load_text(self, data_file, train=False):
        term_corpus = []
        labels = []
        with open(data_file) as fin:
            for line in fin:
                parts = line.strip().decode('utf8').split('\t')
                if len(parts) < 3:
                    continue

                label, keyword = parts[0:2]
                orig_sent = parts[2:]
                if train:
                    keyword_count = sum(
                        [1 if x == keyword else 0 for x in orig_sent])
                    if keyword_count != 1:
                        continue

                # Normalize special terms.
                sent = [
                    '@date@' if self.DATE_PTN.match(term) else term
                    for term in orig_sent
                ]

                # Expand sentence with more features.
                center = sent.index(keyword)
                sent = self.expand_sent_terms(sent, center, True)

                # Save sentences and labels.
                term_corpus.append(sent)
                labels.append(self.LABEL_TO_INDEX[label])

                # Update dictionary.
                if train:
                    self.dictionary.add_documents([sent])

        if train:
            # Compactify the dictionary.
            self.dictionary.filter_extremes(no_below=5,
                                            no_above=0.5,
                                            keep_n=None)
            self.dictionary.compactify()

        # Change text format corpus to bow format.
        bow_corpus = []
        for sent in term_corpus:
            sent_bow = self.dictionary.doc2bow(sent)
            bow_corpus.append(sent_bow)

        return bow_corpus, labels

    WINDOW_SIZE = 3

    def _get_posi_terms(self, words, center):
        terms = []
        for i in range(self.WINDOW_SIZE):
            offset = (i + 1)
            left_posi = center - offset
            if left_posi >= 0:
                terms.append('%s-%d' % (words[left_posi], offset))

            right_posi = center + offset
            if right_posi < len(words):
                terms.append('%s+%d' % (words[right_posi], offset))

        return terms

    NGRAM_WINDOW_SIZE = 10

    def _get_ngram_terms(self, words, center):
        terms = []
        for i in range(1, self.NGRAM_WINDOW_SIZE):
            offset = (i + 1)
            left_posi = center - offset
            if left_posi >= 0:
                terms.append('%s_%s' %
                             (words[left_posi], words[left_posi + 1]))

            right_posi = center + offset
            if right_posi < len(words):
                terms.append('%s_%s' %
                             (words[right_posi - 1], words[right_posi]))

        return terms
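
    # Illustrative example (assumed toy inputs): with words = ['a', 'b', 'c', 'd', 'e']
    # and center = 2 (i.e. 'c' is the keyword),
    #   _get_posi_terms(words, 2)  -> ['b-1', 'd+1', 'a-2', 'e+2']
    #   _get_ngram_terms(words, 2) -> ['a_b', 'd_e']
    # Positional terms encode token/offset pairs around the keyword, while
    # n-gram terms are context bigrams taken from the wider window.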

    def dump_dict(self, dict_file):
        self.dictionary.save_as_text(dict_file)

    def dump_model(self, model_file):
        if self.model:
            joblib.dump(self.model, model_file)

    def train(self, x_list, y_list, model='lr'):
        X_train, X_test, y_train, y_test = train_test_split(x_list,
                                                            y_list,
                                                            test_size=0.3)
        if model == 'lr':
            self.model = LogisticRegression(C=1.0,
                                            multi_class='multinomial',
                                            penalty='l2',
                                            solver='sag',
                                            tol=0.1)
        else:
            logging.error('Unknown model name!')
            return

        self.model.fit(X_train, y_train)
        score = self.model.score(X_train, y_train)
        print("Evaluation on train set : %.4f" % score)
        score = self.model.score(X_test, y_test)
        print("Evaluation on test set : %.4f" % score)

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def eval(self, X, y):
        score = self.model.score(X, y)
        print("Evaluation on validation set : %.4f" % score)
Exemplo n.º 33
0
def phrases():
    unigram_sentences = LineSentence(unigram_sentences_filepath)
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
    bigram_model = Phrases.load(bigram_model_filepath)
    bigram_sentences_filepath = intermediate_directory + 'bigram_sentences_all.txt'
    with open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')

    bigram_sentences = LineSentence(bigram_sentences_filepath)

    trigram_model_filepath = intermediate_directory + 'trigram_model_all'

    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
    trigram_model = Phrases.load(trigram_model_filepath)
    trigram_sentences_filepath = intermediate_directory + 'trigram_sentences_all.txt'

    with open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = ' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

    trigram_sentences = LineSentence(trigram_sentences_filepath)

    ### STOP WORDS REMOVAL ###

    trigram_reviews_filepath = intermediate_directory + 'trigram_transformed_reviews_all.txt'
    with open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        for parsed_review in nlp.pipe(line_review('data/'),
                                      batch_size=10000, n_threads=4):

            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]

            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]


            # drop stopwords, pronoun placeholders, and stray quote/possessive tokens
            trigram_review = [term for term in trigram_review
                              if term not in STOP_WORDS
                              and term not in ('-PRON-', '‘', '’', "'s", "’s")]

            # write the transformed review as a line in the new file
            trigram_review = ' '.join(trigram_review)

            # the output file is opened once, outside the loop, so that all
            # transformed reviews end up in a single corpus file
            # print(trigram_review)
            f.write(trigram_review + '\n')

    ### BAG-OF-WORDS CREATION ###
    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary()
    trigram_dictionary.add_documents(trigram_reviews)

    # add keep_n=10000
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save_as_text(trigram_dictionary_filepath)
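
A brief follow-up sketch (not part of the original function) showing how the saved phrase models and dictionary could be reused on new text; it assumes the same module-level path variables (bigram_model_filepath, trigram_model_filepath, trigram_dictionary_filepath) used above:

bigram_model = Phrases.load(bigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
trigram_dictionary = Dictionary.load_from_text(trigram_dictionary_filepath)

tokens = ['the', 'new', 'york', 'pizza', 'was', 'great']   # hypothetical tokenized review
trigram_tokens = trigram_model[bigram_model[tokens]]
bow = trigram_dictionary.doc2bow(trigram_tokens)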
Exemplo n.º 34
0
"""
make_dic.py
1. Read csv file
2. Make dictionary
3. Update dictionary
4. Save dictionary into a txt file
"""
import pandas as pd
from gensim.corpora import Dictionary

# Read csv file
df = pd.read_csv("livedoor_news.csv")

# Build the dictionary
dct = Dictionary()
for news in df["news"]:
    # Update dictionary with new documents
    dct.add_documents([news.split()])

dct.save_as_text("vocab.txt")
Exemplo n.º 35
0
class Vocab:
    def __init__(self):
        self.dictionary = Dictionary()
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'
        self.dictionary.dfs[-1] = 0

    def set(self, corpus, prune_at=2000000):
        self.dictionary.add_documents(corpus, prune_at)

    def prune(self, **kwargs):
        # Pruning is best applied after all updates; otherwise tokens dropped
        # during pruning but seen again in later update docs will end up with
        # wrong counts.
        if self.dictionary.dfs == {}:
            raise ValueError('no vocab to filter; build vocab first')
        no_below = kwargs.get('no_below', 5)
        no_above = kwargs.get('no_above', 0.7)
        keep_n = kwargs.get('keep_n', 100000)
        keep_tokens = kwargs.get('keep_tokens', None)
        if keep_tokens:
            keep_tokens.append('<UNK>')
        else:
            keep_tokens = ['<UNK>']
        preprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.filter_extremes(no_below, no_above, keep_n,
                                        keep_tokens)
        postprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.dfs[-1] = preprune_count - postprune_count
        # add UNK back (gets pruned due to 0 initial val)
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'

    def update(self, docs, prune_at=2000000):
        self.dictionary.add_documents(docs, prune_at)

    def transform(self, docs, transform_to='ids', with_unk=True):
        if transform_to == 'ids':
            for doc in docs:
                yield self.dictionary.doc2idx(doc)
        elif transform_to == 'bow':
            for doc in docs:
                if with_unk:
                    yield self.doc2bow(doc)
                else:
                    yield self.dictionary.doc2bow(doc)
        else:
            raise ValueError('unknown transformation format')

    def fit_transform(self,
                      docs,
                      transform_to='ids',
                      prune_at=2000000,
                      filter_vocab=False,
                      **kwargs):
        self.set(docs, prune_at)
        if filter_vocab:
            self.prune(**kwargs)
        yield from self.transform(docs, transform_to)

    def merge(self, other):
        self.dictionary.merge_with(other)

    def save(self, fname, as_text=False, sort_by_word=False):
        if as_text:
            self.dictionary.save_as_text(fname, sort_by_word)
        else:
            self.dictionary.save(fname)

    def load(self, fname, from_text=False):
        if from_text:
            self.dictionary = Dictionary.load_from_text(fname)
        else:
            self.dictionary = Dictionary.load(fname)

    def __len__(self):
        return len(self.dictionary)

    def __iter__(self):
        return iter(self.dictionary)

    def keys(self):
        return list(self.dictionary.token2id.values())

    def __str__(self):
        return str(self.dictionary)

    def __getitem__(self, tokenid):
        return self.dictionary[tokenid]

    def doc2bow(self, document):
        # note: slight variation to BoW format conversion from gensim
        # to allow '<UNK>' tokens
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of unicode tokens on input, not a single string"
            )

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            if w in self.dictionary.token2id:
                counter[self.dictionary.token2id[w]] += 1
            else:
                counter[-1] += 1

        # return tokenids, in ascending id order
        counter = sorted(iteritems(counter))
        return counter
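
A minimal usage sketch for the Vocab wrapper above, assuming its module-level imports (Dictionary from gensim.corpora, defaultdict, and six's string_types / iteritems) are in place:

docs = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]   # toy corpus

vocab = Vocab()
vocab.set(docs)

# out-of-vocabulary tokens are counted under the <UNK> id (-1)
for bow in vocab.transform([['the', 'cat', 'meowed']], transform_to='bow'):
    print(bow)

vocab.save('vocab.txt', as_text=True, sort_by_word=True)
vocab.load('vocab.txt', from_text=True)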
Exemplo n.º 36
0
        dictionary = Dictionary()
        dictionary.add_documents(wiki.get_texts(), prune_at=None)

        print('    Building dictionary took %s' % formatTime(time.time() - t0))
        print('    %d unique tokens before pruning.' % len(dictionary))
        sys.stdout.flush()

        # keep_words = 100000

        # The initial dictionary is huge (~8.75M words in my Wikipedia dump),
        # so let's filter it down. We want to keep the words that are neither
        # very rare nor overly common. To do this, we will keep only words that
        # exist within at least 50 articles, but in no more than 15% of all
        # documents. No hard cap is placed on the dictionary size here
        # (keep_n=None).
        dictionary.save_as_text('./data/dictionary_full.txt.bz2')
        dictionary.filter_extremes(no_below=50, no_above=0.15, keep_n=None)
        print('    %d unique tokens after pruning.' % len(dictionary))

        # Write out the dictionary to disk.
        # For my run, this file is 769KB when compressed.
        # TODO -- This text format lets you peruse it, but you can
        # compress it better as binary...
        dictionary.save_as_text('./data/dictionary.txt.bz2')
    else:
        # Nothing to do here.
        print('')

    # ======== STEP 2: Convert Articles To Bag-of-words ========    
    # Now that we have our finalized dictionary, we can create bag-of-words
    # representations for the Wikipedia articles. This means taking another
Exemplo n.º 37
0
def store_contents(data_path,
                   save_path,
                   datasource,
                   processOnlyFilesinOriginalQrels,
                   num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        datasource: Name of the dataset; used to select topic ranges, qrels,
          and output file names.
        processOnlyFilesinOriginalQrels: If True, only documents listed in the
          original qrels for `datasource` are processed.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    print(save_path)
    print(data_path)
    docIds = []  # list of TREC DocID
    docIdToDocIndex = {}  # key is DocID, value is docIndex
    docIndex = 0

    workers = ProcessPool(num_workers)
    files = []
    if processOnlyFilesinOriginalQrels:
        topicData = TRECTopics(datasource, start_topic[datasource],
                               end_topic[datasource])
        qrelDocList = topicData.qrelDocIdLister(
            qrelAddress[datasource], save_path,
            topic_original_qrels_doc_list_file_name)
        for docId in qrelDocList:
            fileid = docId + '.txt'
            files.append(os.path.join(data_path, fileid))
        #files = [f for f in iter_files(data_path) if os.path.splitext(os.path.basename(f))[0] in qrelDocList]
        print "Number of unique documents in the qrels", len(files)

    else:
        files = [f for f in iter_files(data_path)]

    dictionary = Dictionary()
    count = 0

    with tqdm(total=len(files)) as pbar:
        for pairs in workers.imap_unordered(get_contents, files):
            count += len(pairs)
            dictionary.add_documents([
                pairs[0][1].split()
            ])  # pairs[0][0]-->docId, pairs[0][1]-->documentContent
            docIdToDocIndex[pairs[0][0]] = docIndex
            docIds.append(pairs[0][0])
            docIndex = docIndex + 1
            pbar.update()

    print("Number of documents:", docIndex, len(docIds), len(docIdToDocIndex))
    total_documents = len(docIds)
    metadata = {}
    metadata['docIdToDocIndex'] = docIdToDocIndex
    metadata['docIndexToDocId'] = docIds
    # protocol 2 for version compatibility
    pickle.dump(metadata,
                open(save_path + meta_data_file_name[datasource], 'wb'),
                protocol=2)

    # Keep only words that appear in at least 20 documents, and cap the
    # vocabulary at the `dictionary_features_number` (15000 here) most
    # frequent tokens.
    dictionary.filter_extremes(no_below=20, keep_n=dictionary_features_number)
    dictionary.compactify()
    dictionary.save_as_text(save_path + dictionary_name)

    dictionary = Dictionary.load_from_text(save_path + dictionary_name)
    start_time = time.time()
    corpus_bow_stream = stream_corpus(data_path, dictionary, files)
    MmCorpus.serialize(save_path + corpus_bow_file_name,
                       corpus_bow_stream,
                       progress_cnt=10000)
    corpus_bow = MmCorpus(save_path + corpus_bow_file_name)
    model_tfidf = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    model_tfidf.save(save_path + corpus_tfidf_model_file_name)
    corpus_tfidf = model_tfidf[corpus_bow]  # apply model
    MmCorpus.serialize(save_path + corpus_tfidf_file_name,
                       corpus_tfidf,
                       progress_cnt=1000)

    # Load the tf-idf corpus back from disk.
    corpus_tfidf = MmCorpus(save_path + corpus_tfidf_file_name)
    #n_items = len(dictionary)
    #print corpus_tfidf

    # CSR matrix construction phase
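    # The (data, indices, indptr) triplets follow the standard CSR layout:
    # indptr[i]:indptr[i+1] delimits the column ids / tf-idf values of document i.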
    indptr = [0]
    indices = []
    data = []
    # processing took 9:26s
    with tqdm(total=total_documents) as pbar:
        for doc in corpus_tfidf:
            for (index, values) in doc:
                indices.append(index)
                data.append(values)
            indptr.append(len(indices))
            pbar.update()

    start = time.time()
    sparse_matrix = sp.csr_matrix((data, indices, indptr), dtype=float)
    # saving took 01:21s
    sp.save_npz(save_path + csr_matrix_file_name[datasource], sparse_matrix)
    print "Finished in:", (time.time() - start)