Python Phraser примеры, gensim.models.phrases.Phraser Python примеры использования

Пример #1

0

Показать файл

def train(args):
    """Train bigram-aware word2vec vectors on the text8 corpus and save them.

    Reads ``args.embedding_dim`` (vector size) and ``args.model_path``
    (output directory). Saves the bigram phraser as ``bigram.model`` and the
    keyed vectors as ``word2vec.model`` inside that directory.
    """
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use text8 corpus as training data, haikus dont provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
    bigrams = Phraser(bigram_model)

    # create and train model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    # Look vectors up through model.wv; indexing the model object itself is deprecated.
    vector_list = [model.wv[word] for word in word_list]

    # the basic model doesnt seem to be supporting item assignment
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)

    # three extra special tokens get random vectors
    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors
    # as i had a similar issue as https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")

Пример #2

0

Показать файл

def extract_corpus(column):
    """Tokenize the documents in ``column`` and merge detected bigrams/trigrams.

    :param column: iterable of strings, one document per item
    :return: list of token lists in which detected phrases are joined with a
        single space, or ``None`` (after printing the error) when any step fails
    """
    try:
        # whitespace tokenization: one list of unigrams per document
        lst_corpus = [string.split() for string in column]
        ## detect bigrams and trigrams
        bigrams_detector = Phrases(lst_corpus,
                                   delimiter=" ".encode(),
                                   min_count=10,
                                   threshold=10)
        bigrams_detector = Phraser(bigrams_detector)
        trigrams_detector = Phrases(bigrams_detector[lst_corpus],
                                    delimiter=" ".encode(),
                                    min_count=15,
                                    threshold=10)
        trigrams_detector = Phraser(trigrams_detector)
        ## detect common bigrams and trigrams using the fitted detectors
        lst_corpus = list(bigrams_detector[lst_corpus])
        lst_corpus = list(trigrams_detector[lst_corpus])
        return lst_corpus
    except Exception as e:
        # best-effort contract kept for existing callers: report and return None
        print(e)
        return None

Пример #3

0

Показать файл

    def trigramGenerator(self):
        """Train bigram and trigram phrasers and append the detected phrases to a file.

        Every token containing ``"_"`` after both phrasers were applied is
        collected, and the sorted unique tokens are appended, one per line, to
        ``TC-phrases-bi-tri.txt`` inside ``self.trainingLocation``.
        """
        # sentenceStream() is called anew for every pass over the corpus
        # (presumably it returns a one-shot iterator -- confirm).
        corpusStream = self.sentenceStream()
        bigram = Phraser(Phrases(corpusStream,
                                 min_count=self.bigramMinCount,
                                 threshold=self.thresholdBigram))

        bigramSentences = (bigram[sentence] for sentence in self.sentenceStream())
        trigram = Phraser(Phrases(bigramSentences,
                                  min_count=self.trigramMinCount,
                                  threshold=self.thresholdTrigram))

        trigramList = set()
        for sentence in self.sentenceStream():
            for item in trigram[bigram[sentence]]:
                if "_" in item:
                    trigramList.add(item)

        print("Number of Unique Trigrams = ", len(trigramList))
        if not trigramList:
            # nothing to write; as before, no directory or file is created
            return

        # Create the directory and open the file once instead of once per item.
        if not os.path.exists(self.trainingLocation):
            os.makedirs(self.trainingLocation)
        outPath = os.path.join(self.trainingLocation, "TC-phrases-bi-tri.txt")
        with open(outPath, "a") as outFile:
            outFile.writelines(item + "\n" for item in sorted(trigramList))

Пример #4

0

Показать файл

Файл: PhraseDetection.py Проект: yaldahashemi/DomainSpecificThesaurus

 def fit(self, sentencesPath):
     """
     Load or train one phraser per entry of ``self.savePhraserPaths``.

     If every save path already exists and ``self.file_overwrite`` is False,
     the saved phrasers are loaded. Otherwise the phrasers are trained level
     by level (starting with 2-grams), each one saved to its path.

     :param sentencesPath: the path of text file, the text file should be the format: one line one sentence
     :return: True when saved models were loaded; None after training
     :raises FileNotFoundError: if the directory of a save path does not exist
     """
     self.phrasers = []
     # every target directory must exist before anything is loaded or trained
     for path in self.savePhraserPaths:
         if not os.path.exists(os.path.dirname(path)):
             raise FileNotFoundError(os.path.dirname(path) + " not exist")
     all_saved = all(os.path.exists(path) for path in self.savePhraserPaths)
     # explicit comparison kept on purpose: a value of None still means "retrain"
     if all_saved and self.file_overwrite == False:
         logging.info("models are already exist, will read it")
         for path in self.savePhraserPaths:
             self.phrasers.append(Phraser.load(path))
         return True
     self.phrasers = []
     for c, path in enumerate(self.savePhraserPaths, start=2):
         logging.info("getting %d-gram phrase......" % c)
         # Close the corpus file as soon as this level is trained; the handle
         # used to be left open for every level.
         with codecs.open(sentencesPath, mode="r",
                          encoding="utf-8") as sentences_file:
             phraser = Phraser(
                 Phrases(sentences=TxtIter(sentences=sentences_file,
                                           ngrams=self.phrasers),
                         min_count=self.min_count,
                         threshold=self.threshold,
                         max_vocab_size=self.max_vocab_size,
                         delimiter=self.delimiter,
                         scoring=self.scoring))
         phraser.save(path)
         self.phrasers.append(phraser)

Пример #5

0

Показать файл

def get_text_search_terms(keywords, synonyms_threshold, fasttext_model):
    """Turn ``keywords`` into phrase-merged search terms plus similar tokens.

    The keywords are cleaned, stemmed and passed through the saved bi-gram and
    tri-gram phrasers. Each resulting token is then expanded with the entries
    of ``fasttext_model.wv.most_similar`` whose score is at least
    ``synonyms_threshold``.

    :return: the phrase-merged terms followed by the retained similar tokens
    """
    bi_gram_model = Phraser.load('src/models/bi_gram_model.pkl')
    tri_gram_model = Phraser.load('src/models/tri_gram_model.pkl')

    # clean, drop empty terms and stem in a single pass
    stemmed_terms = [
        ps.stem(term)
        for term in clean_tokenized_sentence(keywords.split(' ')).split()
        if term
    ]
    # re-split on single spaces, then merge bi-grams and tri-grams
    unigram_tokens = ' '.join(stemmed_terms).split(' ')
    phrase_tokens = tri_gram_model[bi_gram_model[unigram_tokens]]

    # look up similar tokens for every term first ...
    candidates_per_term = [
        fasttext_model.wv.most_similar(token) for token in phrase_tokens
    ]
    # ... then keep those at or above the threshold, flattened in order
    synonyms = []
    for candidates in candidates_per_term:
        for candidate in candidates:
            if candidate[1] >= synonyms_threshold:
                synonyms.append(candidate[0])

    return list(phrase_tokens) + synonyms

Пример #6

0

Показать файл

def getTrigramList(g_DataQueue, g_FinishRead, savePath, bigramPath,
                   trigramPath):
    """
    Count phrase occurrences in queued batches and dump the counts as JSON.

    :param g_DataQueue: queue whose items are batches handed to the phrasers
    :param g_FinishRead: shared flag; the loop keeps running while ``.value`` is 0
    :param savePath: path where the vocabulary dictionary is saved
    :param bigramPath: path of a saved ``Phrases`` model used for bigrams
    :param trigramPath: path of a saved ``Phrases`` model used for trigrams
    :return: None
    """
    count = 0
    vocabulary_dic = {}
    bigram = Phraser(Phrases.load(bigramPath))
    trigram = Phraser(Phrases.load(trigramPath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        # NOTE(review): get() blocks; if the flag flips while the queue is
        # empty this call may wait forever -- confirm the producer protocol.
        words = g_DataQueue.get()
        count += len(words)
        print("have processed sentences:", count)
        # extract the phrases
        trigram_list = trigram[bigram[words]]
        del words
        gc.collect()
        # add them to the dictionary
        for phrase_list in trigram_list:
            for phrase in phrase_list:
                vocabulary_dic[phrase] = vocabulary_dic.get(phrase, 0) + 1
    # save to disk; the context manager closes the file even if the dump fails
    with codecs.open(savePath, "w", encoding="utf-8") as fw:
        fw.write(json.dumps(vocabulary_dic))
    del vocabulary_dic
    gc.collect()

Пример #7

0

Показать файл

def main():
    """Train bigram and trigram phrasers on the corpus and print trigram statistics.

    Prints the ten best-scoring three-word phrases, followed by the number of
    unique trigrams and their mean, maximum and minimum score.
    """
    get_args()

    def sentences():
        # a fresh flattened stream of sentences for every pass over the corpus
        return chain.from_iterable(
            read_slice(data) for data in read_corpus())

    bigram_phraser = Phraser(
        Phrases(sentences(), min_count=1, threshold=1, delimiter=b' '))

    bigrammed = (bigram_phraser[sentence] for sentence in sentences())
    trigram_phraser = Phraser(
        Phrases(bigrammed, min_count=1, threshold=1, delimiter=b' '))

    # keep only phrases made of exactly three words (two separating spaces)
    only_trigrams = {}
    for trigram_tuple, score in trigram_phraser.phrasegrams.items():
        joined = b' '.join(trigram_tuple)
        if joined.count(b' ') == 2:
            only_trigrams[joined] = score

    best_first = sorted(only_trigrams.items(),
                        key=lambda item: item[1],
                        reverse=True)
    for key, value in best_first[:10]:
        print(key, value)

    scores = list(only_trigrams.values())
    if scores:
        mean_score, max_score, min_score = mean(scores), max(scores), min(scores)
    else:
        mean_score = max_score = min_score = 0
    print("""
    Unique trigrams: {unique}
    Mean score:{mean}
    Max score:{max}
    Min score:{min}
    """.format(unique=len(only_trigrams),
               mean=mean_score,
               max=max_score,
               min=min_score))

Пример #8

0

Показать файл

Файл: train.py Проект: bwalkowi/pjn

def get_bigram_phraser(directory):
    """Return the bigram phraser stored at ``BIGRAM``.

    When that file does not exist, the phraser is trained on
    ``corpus(directory)`` and saved to ``BIGRAM`` before being returned.
    """
    if not os.path.isfile(BIGRAM):
        trained = Phraser(Phrases(corpus(directory)))
        trained.save(BIGRAM)
        return trained
    return Phraser.load(BIGRAM)

Пример #9

0

Показать файл

def phrasing_sentences(sentences):
    """Merge frequent bigrams, then trigrams, in ``sentences``.

    :param sentences: iterable of token lists; it is traversed several times,
        so it must be restartable (e.g. a list), not a one-shot generator
    :return: iterator over the sentences with detected phrases merged
    """
    bigram = Phraser(Phrases(sentences, min_count=5, threshold=1))
    # Keep the transformed corpus itself. Wrapping it in map() produced a
    # one-shot iterator that trigram training exhausted, so nothing was left
    # for the final transformation.
    bigram_sentences = bigram[sentences]
    trigram = Phraser(Phrases(bigram_sentences, min_count=5, threshold=1))
    return iter(trigram[bigram_sentences])

Пример #10

0

Показать файл

Файл: predict.py Проект: MaxBamberger/DataScienceProjects

    def _preprocess(self, text, min_tok_len=1):
        """Tokenize ``text``, drop noise and merge detected phrases.

        Tokens tagged ``NNP`` are removed, the rest is passed through
        ``simple_preprocess``, English stop words and tokens of length
        ``min_tok_len`` or shorter are dropped, and the remainder is lemmatized
        as verbs. Bigram and trigram phrasers are then fitted on the result and
        applied to it.

        :param text: raw input string
        :param min_tok_len: tokens must be strictly longer than this to be kept
        :return: a one-element list holding the processed token list
        """
        stop_words = set(nltk.corpus.stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()  # built once instead of once per token

        # remove proper nouns
        tagged_sent = pos_tag(text.split())
        noProper = ' '.join(word for word, pos in tagged_sent if pos != 'NNP')

        result = [
            lemmatizer.lemmatize(token, pos='v')
            for token in simple_preprocess(noProper)
            if len(token) > min_tok_len and token not in stop_words
        ]

        # Phrases expects an iterable of sentences. Wrap the single token list
        # so that its tokens are not read as sentences made of characters.
        corpus = [result]

        # Build the bigram and trigram models
        bigram = Phrases(corpus, min_count=5,
                         threshold=10)  # higher threshold fewer phrases.
        trigram = Phrases(bigram[corpus], threshold=10)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)

        result = trigram_mod[bigram_mod[result]]

        return [result]

Пример #11

0

Показать файл

def tokeniseAll(posts, stopWords, urduNames, n_grams=3):
    '''Function to tokenise all posts, including ngrams

    Parameters
    ---------------------------------------
    posts: iterable of documents; each one is tokenised with createToken

    stopWords: A list of stopwords

    urduNames: A list of common Urdu names

    n_grams: highest phrase order to merge: 1 keeps plain tokens, 2 merges
        bigrams, 3 or more (the default, the previously hard-coded value)
        also merges trigrams

    Returns
    ---------------------------------------
    A list with one token list per post'''

    tokenized_corp = [createToken(doc, stopWords, urduNames) for doc in posts]
    if n_grams <= 1:
        return tokenized_corp

    # Add n_grams
    bigram = Phrases(tokenized_corp, min_count=5, threshold=10)
    bigram_mod = Phraser(bigram)
    if n_grams == 2:
        return [bigram_mod[doc] for doc in tokenized_corp]

    trigram = Phrases(bigram[tokenized_corp], threshold=10)
    trigram_mod = Phraser(trigram)
    return [trigram_mod[bigram_mod[doc]] for doc in tokenized_corp]

Пример #12

0

Показать файл

def visulaizer_of_gensim(content_list):
    """Fit a 20-topic LDA model on ``content_list`` and return its pyLDAvis data.

    The documents are tokenized, stripped of English stop words, merged into
    bigrams and trigrams and lemmatized before the dictionary and the
    bag-of-words corpus are built.
    """
    english_stops = stopwords.words('english')
    tokenized_docs = list(sent_to_words(content_list))

    # phrase models are trained on the tokens before stop-word removal
    bigram_phrases = Phrases(tokenized_docs, min_count=5, threshold=100)
    trigram_phrases = Phrases(bigram_phrases[tokenized_docs], threshold=100)
    bigram_mod = Phraser(bigram_phrases)
    trigram_mod = Phraser(trigram_phrases)

    docs = remove_stopwords(tokenized_docs, english_stops)
    docs = make_bigrams(docs, bigram_mod)
    docs = make_trigrams(docs, bigram_mod, trigram_mod)
    docs = lemmatization(
        docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    id2word = corpora.Dictionary(docs)
    corpus = [id2word.doc2bow(tokens) for tokens in docs]

    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=20,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    return pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

Пример #13

0

Показать файл

Файл: searchengine.py Проект: sarahJune1/covid19

    def __init__(self,
                 sentences_file: str,
                 bigram_model_path: str,
                 trigram_model_path: str,
                 fasttext_model_path: str):
        """Load the sentence CSV, both phraser models and the synonyms model.

        :param sentences_file: CSV holding the metadata columns and ``cleaned_sentence``
        :param bigram_model_path: path of the saved bi-gram ``Phraser``
        :param trigram_model_path: path of the saved tri-gram ``Phraser``
        :param fasttext_model_path: path handed to ``Synonyms``
        """
        print(f'Loading CSV: {sentences_file} and building mapping dictionary...')
        sentences_df = pd.read_csv(sentences_file)
        metadata_columns = ('paper_id', 'cord_uid', 'source', 'publish_time',
                            'authors', 'section', 'sentence')
        # row index -> metadata of that sentence
        self.sentence_id_to_metadata = {
            row_count: {column: row[column] for column in metadata_columns}
            for row_count, row in sentences_df.iterrows()
        }
        print(f'Finished loading CSV: {sentences_file} and building mapping dictionary')
        self.cleaned_sentences = sentences_df['cleaned_sentence'].tolist()
        print(f'Loaded {len(self.cleaned_sentences)} sentences')

        print(f'Loading bi-gram model: {bigram_model_path}')
        self.bigram_model = Phraser.load(bigram_model_path)
        print(f'Finished loading bi-gram model: {bigram_model_path}')

        print(f'Loading tri-gram model: {trigram_model_path}')
        self.trigram_model = Phraser.load(trigram_model_path)
        print(f'Finished loading tri-gram model: {trigram_model_path}')

        self.synonyms_model = Synonyms(fasttext_model_path)

Пример #14

0

Показать файл

Файл: LDA.py Проект: sc17hs/RELSIM

    def split(self):
        """Tokenize ``self.series`` and store the result in ``self.prepared``.

        Each entry is cleaned with ``self.clean`` and split on whitespace.
        ``self.grams`` selects the phrase level: 1 keeps plain tokens, 2 merges
        bigrams, any other value merges bigrams and then trigrams.
        """
        start = time()
        level = self.grams

        tokenized = [text.split() for text in self.series.apply(self.clean)]
        print(len(tokenized))
        print('used: {:.2f}s'.format(time() - start))

        if level == 1:
            self.prepared = tokenized
            return

        # both remaining levels start from the bigram-merged documents
        bi_gram = Phraser(Phrases(tokenized))
        bigram_docs = [bi_gram[doc] for doc in tokenized]
        if level == 2:
            self.prepared = bigram_docs
            return

        tri_gram = Phraser(Phrases(bigram_docs))
        self.prepared = [tri_gram[doc] for doc in bigram_docs]

Пример #15

0

Показать файл

Файл: TextPreprocessing.py Проект: GuyPozner/KCPM

def create_bigram_and_trigram(sentences):
    """Return ``sentences`` with detected bigrams and trigrams merged.

    Both phrase models use ``min_count=10``, ``threshold=10`` and a single
    space as the delimiter between the words of a phrase.
    """
    phrase_options = dict(min_count=10, threshold=10, delimiter=b' ')

    bigram_phraser = Phraser(Phrases(sentences, **phrase_options))
    bigram_sentences = bigram_phraser[sentences]

    trigram_phraser = Phraser(
        Phrases(bigram_phraser[sentences], **phrase_options))

    # the result contains the bigrams as well as the trigrams
    return trigram_phraser[bigram_sentences]

Пример #16

0

Показать файл

Файл: main.py Проект: bfaure/WikiClassify2.0

 def __init__(self, tsv_path, n_examples=100000):
     """Open ``tsv_path`` for reading and count its lines.

     :param tsv_path: path of the TSV document file
     :param n_examples: stored unchanged as ``self.n_examples``
     """
     print("Getting %s iterator..." % tsv_path)
     self.n_examples = n_examples
     self.document_path = tsv_path
     self.fin = open(self.document_path, 'rb')
     # Count the lines with a separate handle that is closed right away;
     # it used to be left open.
     with open(tsv_path) as line_source:
         self.instances = sum(1 for _ in line_source)
     # phrasers built from untrained Phrases models
     self.bigram = Phraser(Phrases())
     self.trigram = Phraser(Phrases())

Пример #17

0

Показать файл

Файл: topic_modeling.py Проект: PejicM/Legal-Case-Reports-Topic-Modeling

def create_ngram_models(documents):
    """Train bigram and trigram phrase models on ``documents``.

    :return: tuple ``(bigram Phraser, trigram Phraser)``
    """
    bigram_phrases = Phrases(documents, min_count=5, threshold=100)
    # the trigram model is trained on the bigram-merged documents
    trigram_phrases = Phrases(bigram_phrases[documents], threshold=100)
    return Phraser(bigram_phrases), Phraser(trigram_phrases)

Пример #18

0

Показать файл

Файл: text_preprocessing.py Проект: elainespak/glassdoor_aspect_based_sentiment_analysis

def make_ngrams_model(tokenized_sentences, set_min_count=30, set_threshold=80):
    """Train bigram and trigram phrase models on ``tokenized_sentences``.

    :param set_min_count: ``min_count`` of the bigram model only
    :param set_threshold: ``threshold`` of both models
    :return: tuple ``(bigram Phraser, trigram Phraser)``
    """
    bigram_phrases = Phrases(
        tokenized_sentences,
        min_count=set_min_count,
        threshold=set_threshold,
    )
    trigram_phrases = Phrases(
        bigram_phrases[tokenized_sentences],
        threshold=set_threshold,
    )
    return Phraser(bigram_phrases), Phraser(trigram_phrases)

Пример #19

0

Показать файл

Файл: util.py Проект: bobflagg/Topic-Modeling-Without-Tears

def load_phrasers(directory=MODEL_DIRECTORY):
    """Load ``bigram-phraser.pkl`` and ``trigram-phraser.pkl`` from ``directory``.

    :return: tuple ``(bigram phraser, trigram phraser)``
    """
    bigram_phraser, trigram_phraser = (
        Phraser.load(os.path.join(directory, filename))
        for filename in ("bigram-phraser.pkl", "trigram-phraser.pkl")
    )
    return bigram_phraser, trigram_phraser

Пример #20

0

Показать файл

Файл: text2phraser.py Проект: X5GON/lamapi

def train_model(texts: List[Text], savedir: PathType) -> None:
    """Train and save phrasers for 2-grams up to 5-grams.

    Each text is split on whitespace. Level ``n`` is trained on the corpus
    transformed by level ``n - 1`` and saved as ``<n>gramsphraser`` in
    ``savedir``.
    """
    print("Trainning of phraser model")
    corpus = [t.split() for t in texts]
    phraser = Phraser(Phrases(corpus, min_count=100, delimiter=b'_'))
    last_n = 5
    for n in range(2, last_n + 1):
        save_phraser(phraser, os.path.join(savedir, f"{n}gramsphraser"))
        # Stop after the last saved level. One further model used to be
        # trained and then thrown away without being saved or returned.
        if n < last_n:
            corpus = phraser[corpus]
            phraser = Phraser(Phrases(corpus, delimiter=b'_'))

Пример #21

0

Показать файл

Файл: tokenizer.py Проект: shubhampachori12110095/flyvec

    def from_file(cls, dict_fname, phraser_fname=None):
        """Build a tokenizer from a saved gensim dictionary and an optional phraser file.

        :param dict_fname: file written by gensim ``Dictionary.save``
        :param phraser_fname: saved ``Phraser``; when ``None`` a phraser built
            from an empty corpus is used instead
        """
        dictionary = Dictionary.load(str(dict_fname))
        phraser = (Phraser(Phrases([[]])) if phraser_fname is None
                   else Phraser.load(phraser_fname))
        return cls(dictionary, phraser)

Пример #22

0

Показать файл

Файл: train.py Проект: bwalkowi/pjn

def get_trigram_phraser(directory):
    """Return the trigram phraser stored at ``TRIGRAM``.

    When that file does not exist, the phraser is trained on the
    bigram-merged sentences of ``corpus(directory)`` and saved to ``TRIGRAM``
    before being returned.
    """
    if not os.path.isfile(TRIGRAM):
        bigram = get_bigram_phraser(directory)
        bigram_sentences = (bigram[sentence] for sentence in corpus(directory))
        trained = Phraser(Phrases(bigram_sentences))
        trained.save(TRIGRAM)
        return trained
    return Phraser.load(TRIGRAM)

Пример #23

0

Показать файл

 def testEmptyPhrasifiedSentencesIterator(self):
     """Iterating a phrasified corpus twice yields the same non-empty result."""
     bigram_phraser = Phraser(Phrases(self.sentences))
     trigram_phraser = Phraser(Phrases(bigram_phraser[self.sentences]))
     trigrams = trigram_phraser[bigram_phraser[self.sentences]]
     first_pass = list(trigrams)
     second_pass = list(trigrams)
     self.assertEqual(first_pass, second_pass)
     self.assertNotEqual(second_pass, [])

Пример #24

0

Показать файл

Файл: test_phrases.py Проект: lopusz/gensim

 def testSaveLoad(self):
     """A Phraser restored from disk merges phrases as expected."""
     tokens = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
     expected = ['graph_minors', 'survey', 'human_interface', 'system']
     with temporary_file("test.pkl") as fpath:
         Phraser(Phrases(self.sentences, min_count=1, threshold=1)).save(fpath)
         restored = Phraser.load(fpath)
         self.assertEqual(restored[tokens], expected)

Пример #25

0

Показать файл

    def __init__(self):
        """Load the pickled bigram and trigram transformers and wrap them in Phrasers."""
        # NOTE(review): pickle.load can run arbitrary code from the file --
        # confirm these model files only ever come from a trusted source.
        transformers = []
        for model_file in (r"../model/bigram_transformer.pickle",
                           r"../model/trigram_transformer.pickle"):
            with open(model_file, "rb") as input_file:
                transformers.append(pickle.load(input_file))

        bigram_transformer, trigram_transformer = transformers
        self.bigram_phraser = Phraser(bigram_transformer)
        self.trigram_phraser = Phraser(trigram_transformer)

Пример #26

0

Показать файл

Файл: test_phrases.py Проект: lopusz/gensim

    def testSaveLoadCustomScorer(self):
        """A Phraser with a custom scorer keeps that scorer across save and load."""
        with temporary_file("test.pkl") as fpath:
            phrases = Phrases(self.sentences, min_count=1, threshold=.001,
                              scoring=dumb_scorer)
            Phraser(phrases).save(fpath)
            restored = Phraser.load(fpath)
            # only the identity of the scorer is verified here
            self.assertEqual(restored.scoring, dumb_scorer)

Пример #27

0

Показать файл

 def testSaveLoad(self):
     """Save a Phraser, load it back and check the phrases it produces."""
     with temporary_file("test.pkl") as fpath:
         saved = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
         saved.save(fpath)
         loaded = Phraser.load(fpath)
         merged = loaded[
             ['graph', 'minors', 'survey', 'human', 'interface', 'system']]
         self.assertEqual(
             merged, ['graph_minors', 'survey', 'human_interface', 'system'])

Пример #28

0

Показать файл

def make_trigrams(
    sentences: Iterable, save_model_path: Path, **phrases_kw
):
    """Train a gensim trigram phraser on ``sentences`` and save it.

    ``phrases_kw`` is forwarded to the bigram ``Phrases`` model only. The
    trigram model is trained on the bigram-merged sentences with a single
    space as delimiter and written to ``save_model_path``.
    """
    bigram_phraser = Phraser(Phrases(sentences, **phrases_kw))
    trigram_phraser = Phraser(
        Phrases(bigram_phraser[sentences], delimiter=b" "))
    trigram_phraser.save(str(save_model_path))

Пример #29

0

Показать файл

Файл: topic_utils.py Проект: aashishkhadka1992/SentimentAnalysis

    def make_trigrams(self):
        """Return the stop-word-free documents with bigrams and trigrams merged.

        Both phrase models are trained on ``self.sent_to_words()``; they are
        applied to the documents returned by ``self.remove_stopwords()``.
        """
        bigram_phrases = Phrases(self.sent_to_words(), min_count=5, threshold=100)
        bigram_mod = Phraser(bigram_phrases)
        bigram_docs = [bigram_mod[tokens] for tokens in self.remove_stopwords()]

        trigram_mod = Phraser(
            Phrases(bigram_phrases[self.sent_to_words()], threshold=100))

        merged_docs = []
        for tokens in bigram_docs:
            merged_docs.append(trigram_mod[bigram_mod[tokens]])
        return merged_docs

Пример #30

0

Показать файл

    def __init__(self,
                 source,
                 max_sentence_length=MAX_WORDS_IN_BATCH,
                 limit=None):
        """Store the iteration settings and load the saved bigram/trigram phrasers."""
        self.source, self.max_sentence_length, self.limit = (
            source, max_sentence_length, limit)

        # phraser files are read from the current working directory
        self.bigram = Phraser.load('./preprocessed_big_phrases')
        self.trigram = Phraser.load('./preprocessed_trigram_phrases')

Пример #31

0

Показать файл

Файл: ex9.py Проект: mat-hek/pjn

def mk_bigrams():
    """Train a bigram phraser on the dumped judgments, save it and apply it.

    :return: list of tokenized sentences with detected bigrams merged
    """
    with open(dump_base + "judgments", 'r', encoding="utf-8") as source:
        raw_text = source.read()

    tokenized = [
        list(gensim.utils.simple_tokenize(sentence))
        for sentence in textcleaner.split_sentences(raw_text)
    ]

    phraser = Phraser(Phrases(tokenized))
    phraser.save(dump_base + "bigramer")

    return [phraser[tokens] for tokens in tokenized]

Пример #32

0

Показать файл

def train_phraser(sentence_stream, stopword_list, threshold, model_path,
                  save_prefix):
    """Train a phrase model, save it and its exported Phraser, return the Phraser.

    Writes ``<save_prefix>_phrases.bin`` and ``<save_prefix>_phraser.bin``
    into ``model_path``.
    """
    phrases_file = os.path.join(model_path,
                                '{}_phrases.bin'.format(save_prefix))
    phraser_file = os.path.join(model_path,
                                '{}_phraser.bin'.format(save_prefix))

    phrases_model = Phrases(sentence_stream,
                            common_terms=stopword_list,
                            threshold=threshold)
    phrases_model.save(phrases_file)

    phraser_model = Phraser(phrases_model)
    phraser_model.save(phraser_file)
    return phraser_model

Пример #33

0

Показать файл

Файл: test_phrases.py Проект: RaRe-Technologies/gensim

    def testCompatibilty(self):
        """Models saved with gensim 3.6.0 still load and merge phrases as expected."""
        tokens = ['trees', 'graph', 'minors']
        expected = ['trees', 'graph_minors']

        phraser = Phraser.load(datapath("phraser-3.6.0.model"))
        phrases = Phrases.load(datapath("phrases-3.6.0.model"))

        phraser_result = phraser[tokens]
        phrases_result = phrases[tokens]

        self.assertEqual(phraser_result, expected)
        self.assertEqual(phrases_result, expected)

Пример #34

0

Показать файл

Файл: build_model.py Проект: ffaristocrat/ml-sandbox

    def build_phrases(self):
        """Write every thread as one phrase-merged, filtered line.

        Each output line is ``<num>`` + tab + the thread tokens after the
        trigram phraser, without stop words and without tokens shorter than
        three characters. Output goes to ``<board>.phrases`` in
        ``self.input_dir``.
        """
        threads = ReadThreads(
            self.board, self.input_dir,
            return_func=lambda x, y: (x, y.split()))
        trigram_mod = Phraser.load(
            op.join(self.input_dir, f'{self.board}.trigrams'))

        def keep(word):
            return word not in STOPWORDS and len(word) >= 3

        target = op.join(self.input_dir, f'{self.board}.phrases')
        with open(target, 'wt') as f:
            for num, thread in threads:
                line = ' '.join(filter(keep, trigram_mod[thread]))
                f.write(f'{num}\t{line}\n')

Пример #35

0

Показать файл

Файл: build_model.py Проект: ffaristocrat/ml-sandbox

    def build_doc2vec_model(self, vectors: int=200):
        """Train a Doc2Vec model on the phrase files of this board and save it.

        :param vectors: dimensionality of the document vectors
        :return: the trained model, also saved as ``<board>.doc2vec`` in
            ``self.input_dir``
        """
        filename = op.join(self.input_dir, f'{self.board}.phraser')
        phraser = Phraser.load(filename)
        # every thread becomes a TaggedDocument tagged with its first field
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: TaggedDocument(phraser[y.split()], [x]))
        model = Doc2Vec(vector_size=vectors, window=2, min_count=5, workers=3)
        model.build_vocab(documents=documents)

        model.train(
            documents=documents,
            total_examples=model.corpus_count,
            # `epochs` is the current attribute name; `iter` is its deprecated alias
            epochs=model.epochs,
        )

        filename = op.join(self.input_dir, f'{self.board}.doc2vec')
        model.save(filename)

        return model

Пример #36

0

Показать файл

Файл: build_model.py Проект: ffaristocrat/ml-sandbox

    def build_phraser(self, threshold: int=None):
        """Train and save the bigram and trigram phrasers of this board.

        :param threshold: phrase score threshold for both models; when ``None``
            the gensim default is used
        :return: the trigram phraser (saved as ``<board>.trigrams``; the bigram
            phraser is saved as ``<board>.bigrams``)
        """
        tokens = ReadThreads(
            self.board, self.input_dir, return_func=lambda x, y: y.split())

        # Phrases compares the threshold numerically, so the default None
        # cannot be passed on; leave the argument out in that case.
        threshold_kw = {} if threshold is None else {'threshold': threshold}
        bigram = Phrases(tokens, min_count=5, **threshold_kw)
        trigram = Phrases(bigram[tokens], **threshold_kw)

        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)

        filename = op.join(self.input_dir, f'{self.board}.bigrams')
        bigram_mod.save(filename)
        filename = op.join(self.input_dir, f'{self.board}.trigrams')
        trigram_mod.save(filename)

        return trigram_mod

Пример #37

0

Показать файл

Файл: test_phrases.py Проект: lopusz/gensim

 def testSaveLoadNoCommonTerms(self):
     """A Phraser saved before ``common_terms`` existed loads with an empty frozenset."""
     legacy_phraser = Phraser.load(datapath("phraser-no-common-terms.pkl"))
     self.assertEqual(legacy_phraser.common_terms, frozenset())

Пример #38

0

Показать файл

Файл: test_phrases.py Проект: lopusz/gensim

 def testSaveLoadNoScoring(self):
     """A Phraser saved without a scoring parameter loads with the original scorer.

     This guards backwards compatibility with old versions of Phraser.
     """
     legacy_phraser = Phraser.load(datapath("phraser-no-scoring.pkl"))
     # only the identity of the scorer is verified here
     self.assertEqual(legacy_phraser.scoring, original_scorer)

Python Phraser примеры использования