Example #1
def get_text_search_terms(keywords, synonyms_threshold, fasttext_model):
    bi_gram_model = Phraser.load('src/models/bi_gram_model.pkl')
    tri_gram_model = Phraser.load('src/models/tri_gram_model.pkl')

    # clean tokens
    cleaned_terms = clean_tokenized_sentence(keywords.split(' ')).split()
    # remove empty terms
    cleaned_terms = [term for term in cleaned_terms if term]
    # stem terms
    cleaned_terms = [ps.stem(term) for term in cleaned_terms]
    # create bi-grams (cleaned_terms is already a token list)
    terms_with_bigrams = bi_gram_model[cleaned_terms]
    # create tri-grams
    terms_with_trigrams = tri_gram_model[terms_with_bigrams]
    # expand query with synonyms
    search_terms = [
        fasttext_model.wv.most_similar(token) for token in terms_with_trigrams
    ]
    # filter synonyms above threshold (and flatten the list of lists)
    search_terms = [
        synonym[0] for synonyms in search_terms for synonym in synonyms
        if synonym[1] >= synonyms_threshold
    ]
    # expand keywords with synonyms
    search_terms = list(terms_with_trigrams) + search_terms
    return search_terms
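
For context, the two phrase models loaded above can be produced with gensim's Phrases/Phraser pair. A minimal training sketch (the inline corpus and the min_count/threshold values are assumptions, not taken from the original project):

from gensim.models.phrases import Phrases, Phraser

# tiny stand-in corpus: a list of token lists
sentences = [['new', 'york', 'city'], ['new', 'york', 'times'],
             ['new', 'york', 'city', 'hall']]

# train a bigram detector, then freeze it into a lightweight Phraser
bi_gram_model = Phraser(Phrases(sentences, min_count=2, threshold=0.1))
bi_gram_model.save('src/models/bi_gram_model.pkl')

# train the trigram detector on the bigrammed corpus
tri_gram_model = Phraser(
    Phrases(bi_gram_model[sentences], min_count=2, threshold=0.1))
tri_gram_model.save('src/models/tri_gram_model.pkl')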
Example #2
    def get_phraser(self, directory, sensitivity=3):

        if not os.path.isdir(directory):
            os.makedirs(directory)

        print("\t\tGetting bigram detector...")
        if not os.path.isfile(directory + '/bigrams.pkl'):
            self.bigram = Phraser(
                Phrases(self.docs(n_examples=-1),
                        min_count=2,
                        threshold=sensitivity,
                        max_vocab_size=2000000))
            self.bigram.save(directory + '/bigrams.pkl')
        else:
            self.bigram = Phraser.load(directory + '/bigrams.pkl')

        print("\t\tGetting trigram detector...")
        if not os.path.isfile(directory + '/trigrams.pkl'):
            self.trigram = Phraser(
                Phrases(self.bigram[self.docs(n_examples=-1)],
                        min_count=2,
                        threshold=sensitivity + 1,
                        max_vocab_size=2000000))
            self.trigram.save(directory + '/trigrams.pkl')
        else:
            self.trigram = Phraser.load(directory + '/trigrams.pkl')
Example #3
    def __init__(self,
                 sentences_file: str,
                 bigram_model_path: str,
                 trigram_model_path: str,
                 fasttext_model_path: str):
        print(f'Loading CSV: {sentences_file} and building mapping dictionary...')
        sentences_df = pd.read_csv(sentences_file)
        self.sentence_id_to_metadata = {}
        for row_count, row in sentences_df.iterrows():
            self.sentence_id_to_metadata[row_count] = dict(
                paper_id=row['paper_id'],
                cord_uid=row['cord_uid'],
                source=row['source'],
                publish_time=row['publish_time'],
                authors=row['authors'],
                section=row['section'],
                sentence=row['sentence'],
            )
        print(f'Finished loading CSV: {sentences_file} and building mapping dictionary')
        self.cleaned_sentences = sentences_df['cleaned_sentence'].tolist()
        print(f'Loaded {len(self.cleaned_sentences)} sentences')

        print(f'Loading bi-gram model: {bigram_model_path}')
        self.bigram_model = Phraser.load(bigram_model_path)
        print(f'Finished loading bi-gram model: {bigram_model_path}')

        print(f'Loading tri-gram model: {trigram_model_path}')
        self.trigram_model = Phraser.load(trigram_model_path)
        print(f'Finished loading tri-gram model: {trigram_model_path}')

        self.synonyms_model = Synonyms(fasttext_model_path)
Example #4
def load_phrasers(directory=MODEL_DIRECTORY):
    path = os.path.join(directory, "bigram-phraser.pkl")
    bigram_phraser = Phraser.load(path)

    path = os.path.join(directory, "trigram-phraser.pkl")
    trigram_phraser = Phraser.load(path)

    return bigram_phraser, trigram_phraser
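
A usage sketch for the loader above (assumes both pickles already exist under MODEL_DIRECTORY):

bigram_phraser, trigram_phraser = load_phrasers()
tokens = 'new york stock exchange'.split()
# bigrams are applied first; the trigram model can then merge a bigram token with a third word
print(trigram_phraser[bigram_phraser[tokens]])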
Example #5
    def __init__(self,
                 source,
                 max_sentence_length=MAX_WORDS_IN_BATCH,
                 limit=None):

        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.bigram = Phraser.load('./preprocessed_big_phrases')
        self.trigram = Phraser.load('./preprocessed_trigram_phrases')
Example #6
    def __init__(self):
        #model/xgb_pipeline.pkl
        with open('model/fake_tokenizer.pickle', 'rb') as handle:
            self.tokenzs = pickle.load(handle)
        with open('model/model_in_json2.json', 'r') as f:
            self.model_json = json.load(f)
            #print(self.model_json)

        self.l_model = model_from_json(self.model_json)
        self.l_model.load_weights('model/model_weights2.h5')
        self.bigrams = Phraser.load("model/bigrams")
        self.trigrams = Phraser.load("model/trigrams")
        self.fake_score = 0
Example #7
def get_params(mode):
    if mode == 'all':
        bad_words = pd.read_csv('data/bad_words.csv').values
        n_grams = Phraser.load("data/ngrams_model.pkl")
        dict_tfidf = np.load('data/my_dict.npy', allow_pickle='TRUE').item()
        word2vec = Word2Vec.load('data/word2vec.model')
        descriptors = pd.read_csv('data/descriptors.csv').set_index(
            'raw descriptor')
        return bad_words, n_grams, dict_tfidf, word2vec, descriptors
    if mode == 'half':
        bad_words = pd.read_csv('data/bad_words.csv').values
        n_grams = Phraser.load("data/ngrams_model.pkl")
        descriptors = pd.read_csv('data/descriptors.csv').set_index(
            'raw descriptor')
        return bad_words, n_grams, descriptors
Example #8
def load_phrases_model(path) -> Phraser:
    """
    Load a saved phraser for computing statistics
    :param path: path to a serialized Phraser model
    :return: the loaded Phraser
    """
    return Phraser.load(path)
Example #9
def scan_processed_with_phraser_(path_journal_processed, journ, path_phraser_models):
    print ("Launching scan of "+journ)
    # load the phraser model of the given journal name from given path
    fpath = path_phraser_models + '/' + journ + '_00_bigramphraser'
    bigram = Phraser.load(fpath)
    
    # dict of dicts; filenames as key, dict of frequency {word:count} as value
    texts = {}
    
    # iterate over all data from that journal
    fnames = os.listdir(path_journal_processed)
    ln_fnames = len(fnames)
    with open('logfile.txt', 'w') as lfh:
        for i, fname in enumerate(fnames):
            text = []
            # read each file with a context manager so the handle is closed
            with open(os.path.join(path_journal_processed, fname)) as fh:
                for line in fh:
                    text = text + bigram[line.split()]
            frequency = defaultdict(float)
            for word in text:
                frequency[word] += 1
            
            # add the frequency counts of this text file to master dict
            texts[fname] = frequency
            if i % 150 == 0:
                lfh.write("Done scanning {} ; {} out of {}\n".format(fname, i + 1, ln_fnames))
                lfh.flush()
    
    return texts
Example #10
def load_vector_data(dataset_name, bgr=False, split_factor=0.2):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv",
                            delimiter=",",
                            dtype=str).astype(str).values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv",
                          dtype=types)["a"]
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name +
                                 "_fasttext")
    # replace placeholders (" "), make one-string-sentences
    print("... replacing placeholders")
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]
    tokenized = sentences
    if bgr:
        bigram = Phraser.load("../models/bigrams/bigram_" + dataset_name +
                              ".pkl")
        bigrammed = [bigram[sentence] for sentence in sentences]
        tokenized = bigrammed
    inputs = [
        np.sum(vector_model.wv[sent], 0).tolist() if sent else np.zeros(32)
        for sent in tokenized
    ]
    inputs = np.array(inputs)
    train_loader, val_loader, test_loader = make_loader(
        inputs, targets, split_factor)
    return len(inputs[0]), train_loader, val_loader, test_loader
Example #11
    def fit(self, sentencesPath):
        """
        Train the phrase models.
        :param sentencesPath: path to a text file with one sentence per line
        """
        self.phrasers = []
        # make sure every target directory exists
        for path in self.savePhraserPaths:
            if not os.path.exists(os.path.dirname(path)):
                raise FileNotFoundError(os.path.dirname(path) + " does not exist")
        for path in self.savePhraserPaths:
            if not os.path.exists(path):  # at least one model needs training
                self.phrasers = None
                break
        if self.phrasers is not None and not self.file_overwrite:
            logging.info("models already exist, loading them instead")
            for path in self.savePhraserPaths:
                self.phrasers.append(Phraser.load(path))
            return True
        self.phrasers = []
        c = 2
        for path in self.savePhraserPaths:
            logging.info("building %d-gram phraser..." % c)
            c += 1
            phraser = Phraser(
                Phrases(sentences=TxtIter(sentences=codecs.open(
                    sentencesPath, mode="r", encoding="utf-8"),
                                          ngrams=self.phrasers),
                        min_count=self.min_count,
                        threshold=self.threshold,
                        max_vocab_size=self.max_vocab_size,
                        delimiter=self.delimiter,
                        scoring=self.scoring))
            phraser.save(path)
            self.phrasers.append(phraser)
Example #12
def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
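
Example #12 relies on a module-level BIGRAM cache path and a corpus(directory) token stream that are not shown; a plausible sketch of both (the names and file layout are assumptions):

import os

BIGRAM = 'bigram_phraser.pkl'  # hypothetical cache location

def corpus(directory):
    # stream whitespace-tokenized sentences, one file line at a time
    for fname in os.listdir(directory):
        with open(os.path.join(directory, fname), encoding='utf8') as fh:
            for line in fh:
                yield line.split()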
Example #13
def main(coursesList):
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    corpus = [dictionary.doc2bow(text) for text in text_clean]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()

    #lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    #courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    id_groups = cursor.fetchall()
    for i in id_groups:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': i[0]})
        studyplan_id = cursor.fetchall()
        for j in studyplan_id:
            subject_list = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(j[0], subject_list))
            #for k in subject_list:
            token_stud_prog = [
                program.split(' ') for program in subject_list['description']
            ]
            #token_stud_prog = add_n_grams(token_stud_prog, bigrams, trigrams)
            prog_corp = [
                dictionary.doc2bow(program) for program in token_stud_prog
            ]
            topic_prog = lda.get_document_topics(prog_corp)
            for l in range(0, len(topic_prog)):
                profile_student = np.zeros(config.num_lda_topic)
                dense_topic_prog = np.zeros(config.num_lda_topic)
                for m in topic_prog[l]:
                    dense_topic_prog[m[0]] += m[1]
                #mask = np.argsort(dense_topic_prog)[::-1][:1]
                #profile_student[mask] += 1
                profile_student = dense_topic_prog
                cosine_similarities = linear_kernel(
                    profile_student.reshape(1, -1), courses_topic).flatten()
                top_courses = np.where(cosine_similarities >= 0.2)[0]
                print(subject_list.loc[l, 'id_subject'])
                #print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
Example #14
def ngrams_preprocess(corpus, ngrams=1, grams_join=" "):
    lst_corpus = []
    for string in corpus:
        lst_words = string.split()
        lst_grams = [
            grams_join.join(lst_words[i:i + ngrams])
            for i in range(0, len(lst_words), ngrams)
        ]
        lst_corpus.append(lst_grams)

    bigram = Phraser.load('\\Users\\Zeden\\Desktop\\bigram')
    trigram = Phraser.load('\\Users\\Zeden\\Desktop\\trigram')
    lst_ngrams_detectors = [bigram, trigram]
    if len(lst_ngrams_detectors) != 0:
        for detector in lst_ngrams_detectors:
            lst_corpus = list(detector[lst_corpus])
    return lst_corpus
Example #15
    def __init__(self):
        assert spacy.en.STOP_WORDS

        self.STOP_WORDS = spacy.en.STOP_WORDS
        nlp = spacy.load('en')
        nlp.pipeline = [nlp.tagger, nlp.parser]
        self.nlp = nlp
        self.bigram_model = Phraser.load('bigram_model')
Example #16
    def __init__(self):
        assert spacy.en.STOP_WORDS

        self.STOP_WORDS = spacy.en.STOP_WORDS
        nlp = spacy.load('en')
        nlp.pipeline = [nlp.tagger, nlp.parser]
        self.nlp = nlp
        self.bigram_model = Phraser.load('C:/Users/raula/dissertation_demo/python_scripts/bigram_model')
Example #17
    def loadNgrams(self, N=2):
        phrasesdir = environment.MODELS_DIR + "phrases/"
        if N == 2:
            phr_path = "bigrams.phr"
        elif N == 3:
            phr_path = "trigrams.phr"
        else:
            phr_path = "bigrams.phr"
        return Phraser.load(phrasesdir + phr_path)
Example #18
    def __init__(self, args):
        # load all the haikus from a file
        with open(args.data_path, "r", encoding="utf8",
                  errors="ignore") as infile:
            self.data = infile.read().splitlines()

        self.word2vec = gensim.models.KeyedVectors.load(
            f"{args.model_path}/word2vec.model")
        self.bigrams = Phraser.load(f"{args.model_path}/bigram.model")
Example #19
    def from_file(cls, dict_fname, phraser_fname=None):
        """Load tokenizer information from a dictionary file (generated by gensim dictionary.save) and a phraser file."""
        d = Dictionary.load(str(dict_fname))
        if phraser_fname is not None:
            p = Phraser.load(phraser_fname)
        else:
            p = Phraser(Phrases([[]]))

        return cls(d, p)
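
When no phraser file is given, the fallback above builds a pass-through model: Phrases([[]]) is trained on a single empty sentence, so the resulting Phraser knows no phrases and returns token lists unchanged. A quick illustration (gensim 3.x API assumed):

from gensim.models.phrases import Phrases, Phraser

noop = Phraser(Phrases([[]]))
assert noop[['new', 'york']] == ['new', 'york']  # tokens pass through untouched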
Example #20
def get_trigram_phraser(directory):
    if os.path.isfile(TRIGRAM):
        return Phraser.load(TRIGRAM)
    else:
        bigram = get_bigram_phraser(directory)
        sentence_stream = (bigram[sentence] for sentence in corpus(directory))
        trigram = Phraser(Phrases(sentence_stream))
        trigram.save(TRIGRAM)
        return trigram
Example #21
    def testSaveLoad(self):
        """Saving and loading a Phraser object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            self.assertEqual(
                bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
                ['graph_minors', 'survey', 'human_interface', 'system'])
Example #22
def build_csv(metadata: str, dirs: List[str], output: str,
              bigram_model_path: str, trigram_model_path: str,
              filter_covid_19: bool, sentences: bool):
    print(f'Building metadata dictionary from file: {metadata} ...')
    sha_to_properties = _build_metadata_dict(metadata)
    print(f'Finished building metadata dictionary from file: {metadata}')

    print(f'Filtering only COVID-19 articles: {filter_covid_19}')

    bigram_model = None
    trigram_model = None
    if bigram_model_path and trigram_model_path:
        bigram_model = Phraser.load(bigram_model_path)
        trigram_model = Phraser.load(trigram_model_path)

    all_df = None
    for dir_name in dirs:
        print(f'Loading files from directory: {dir_name} ...')
        dir_files = load_files(dir_name)
        print(f'Finished loading files from directory: {dir_name}')
        if sentences:
            clean_df = generate_df_sentence_level(dir_files, sha_to_properties,
                                                  filter_covid_19,
                                                  bigram_model, trigram_model)
        else:
            clean_df = generate_df(dir_files, sha_to_properties,
                                   filter_covid_19)

        if all_df is None:  # first call
            all_df = clean_df
        else:
            all_df = all_df.append(clean_df)

    all_df.fillna("", inplace=True)

    # filter out short sentences
    all_df = all_df[all_df['cleaned_sentence'].apply(
        lambda sent: len(sent.split()) >= 5)]

    print(f'All files DataFrame shape: {all_df.shape}')

    print(f'Writing CSV file to: {output}')
    with open(output, 'w+') as out_fp:
        all_df.to_csv(out_fp, index=True)
Example #23
    def load_bigram(self):
        '''
        Search for and load a pre-existing bigrams file

        :update: self.bigram
        '''
        self.bigram = Phraser.load(
            f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")

        print("Bigram loaded.")
Example #24
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring here, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #25
    def testSaveLoad(self):
        """Saving and loading a Phraser object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            self.assertEqual(
                bigram_loaded[[
                    'graph', 'minors', 'survey', 'human', 'interface', 'system'
                ]], ['graph_minors', 'survey', 'human_interface', 'system'])
Example #26
def build_ngram_model(docs):
    bigram_model_path = Path('bigram_phraser.pkl')
    trigram_model_path = Path('trigram_phraser.pkl')
    if not bigram_model_path.exists() or not trigram_model_path.exists():
        print('Building n-gram models')
        bigram = Phrases(docs, min_count=3, threshold=6)
        trigram = Phrases(bigram[docs], min_count=3, threshold=6)

        bigram_model = Phraser(bigram)
        trigram_model = Phraser(trigram)

        bigram_model.save('bigram_phraser.pkl')
        trigram_model.save('trigram_phraser.pkl')
    else:
        print('Loading saved n-gram models')
        bigram_model = Phraser.load('bigram_phraser.pkl')
        trigram_model = Phraser.load('trigram_phraser.pkl')

    return (bigram_model, trigram_model)
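
The Phrases-then-Phraser pattern above trades retrainability for footprint: a Phraser keeps only the scored phrase table and drops the full vocabulary counts, so it loads faster and uses far less memory. A usage sketch (docs is assumed to be a list of token lists, as the training code implies):

bigram_model, trigram_model = build_ngram_model(docs)
# stack the two models: bigrams first, then trigrams over the bigrammed tokens
trigrams = [trigram_model[bigram_model[doc]] for doc in docs]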
Example #27
    def testCompatibilty(self):
        phr = Phraser.load(datapath("phraser-3.6.0.model"))
        model = Phrases.load(datapath("phrases-3.6.0.model"))

        test_sentences = ['trees', 'graph', 'minors']
        expected_res = ['trees', 'graph_minors']

        phr_out = phr[test_sentences]
        model_out = model[test_sentences]

        self.assertEqual(phr_out, expected_res)
        self.assertEqual(model_out, expected_res)
Example #28
    def testCompatibilty(self):
        phr = Phraser.load(datapath("phraser-3.6.0.model"))
        model = Phrases.load(datapath("phrases-3.6.0.model"))

        test_sentences = ['trees', 'graph', 'minors']
        expected_res = ['trees', 'graph_minors']

        phr_out = phr[test_sentences]
        model_out = model[test_sentences]

        self.assertEqual(phr_out, expected_res)
        self.assertEqual(model_out, expected_res)
Example #29
def build_corpus(dirs: List[str], output: str, bigram_model_path: str,
                 trigram_model_path: str, filter_covid19: bool):
    bigram_model = None
    trigram_model = None
    if bigram_model_path and trigram_model_path:
        bigram_model = Phraser.load(bigram_model_path)
        trigram_model = Phraser.load(trigram_model_path)

    all_sentences = []
    for dir_name in dirs:
        print(f'Loading files from directory: {dir_name} ...')
        dir_files = _load_files(dir_name)
        print(f'Finished loading files from directory: {dir_name}')
        all_sentences.extend(
            _generate_sentences(dir_files, bigram_model, trigram_model,
                                filter_covid19))

    print(f'No. of lines: {len(all_sentences)}')

    print(f'Writing TXT file to: {output}')
    with open(output, 'w+') as out_fp:
        out_fp.write("\n".join(all_sentences))
Example #30
    def load_phraser(self, phraser_name=None):
        """
        Loads phraser from phrasers folder

        :param phraser_name: defaults to tag, the name of phraser file to load
        """
        # initializes optional arguments to tag
        if phraser_name is None:
            phraser_name = self.tag

        # loads phraser
        filename = os.path.join(PHRASERS_PATH, f'{phraser_name}.pkl')
        self._phraser = Phraser.load(filename)
Example #31
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences,
                        min_count=1,
                        threshold=.001,
                        scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring here, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #32
    def __init__(self,
                 embeddings_source=EMBEDDINGS,
                 out_embeddings_source=OUT_EMBEDDINGS,
                 formulas_source=FORMULAS,
                 phraser_source=PHRASER):
        """

        :param embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param out_embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param formulas_source: can be url or path to a JSON-serialized dict
        of formulae, if not supplied a default file is loaded
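        :param phraser_source: path to a serialized gensim Phraser model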
        """

        # hidden layer embeddings (W)
        self.embeddings = Magnitude(embeddings_source, eager=False)

        # output layer embeddings (O)
        self.out_embeddings = Magnitude(out_embeddings_source)

        # load pre-trained formulas from embeddings
        with open(formulas_source, 'r') as f:
            self.formulas_with_abbreviations = load(f)

        self.dp = DataPreparation(local=False)

        self.es = ElasticConnection()

        self.formulas = {
            k: v
            for k, v in self.formulas_with_abbreviations.items()
            if k not in self.ABBR_LIST
        }

        self.formula_counts = {
            root_formula: sum(formulas.values())
            for root_formula, formulas in self.formulas.items()
        }

        self.most_common_forms = {
            formula_group_name:
            (formula_group_name if formula_group_name in self.dp.ELEMENTS else
             max(formulae.items(), key=operator.itemgetter(1))[0])
            for formula_group_name, formulae in
            self.formulas_with_abbreviations.items()
        }

        self.phraser = Phraser.load(phraser_source)
Example #33
def phrase_corpus(infile, outfile, phraserfile):
    """
    Load a trained phraser object and apply it to the extracted
    wikipedia corpus text.

    :param infile: wikipedia xml file
    :param outfile: .bz2 archive file to save phrased text to
    :param phraserfile: gensim phraser file
    :return:
    """
    p = Phraser.load(phraserfile)
    with bz2.open(outfile, "wt", encoding="utf8") as F:
        for i in tqdm(file_yielder(infile), desc="Phrasing"):
            F.write(" ".join(p[i.split()]) + "\n")

    return 0
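
A call sketch for the function above (the file names are placeholders; file_yielder and the trained phraser file come from the surrounding project):

phrase_corpus('enwiki-latest.xml', 'enwiki-phrased.txt.bz2', 'models/bigrams.phr')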
Example #34
    def build_phrases(self):
        threads = ReadThreads(
            self.board, self.input_dir,
            return_func=lambda x, y: (x, y.split()))
        filename = op.join(self.input_dir, f'{self.board}.trigrams')
        trigram_mod = Phraser.load(filename)

        filename = op.join(self.input_dir, f'{self.board}.phrases')
        with open(filename, 'wt') as f:
            for num, thread in threads:
                line = ' '.join([
                    word for word in trigram_mod[thread]
                    if word not in STOPWORDS and
                    len(word) >= 3
                ])
                print(f'{num}\t{line}', file=f)
Example #35
    def build_doc2vec_model(self, vectors: int = 200):
        filename = op.join(self.input_dir, f'{self.board}.phraser')
        phraser = Phraser.load(filename)
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: TaggedDocument(phraser[y.split()], [x]))
        model = Doc2Vec(vector_size=vectors, window=2, min_count=5, workers=3)
        model.build_vocab(documents=documents)

        model.train(
            documents=documents,
            total_examples=model.corpus_count,
            epochs=model.iter,
        )
        
        filename = op.join(self.input_dir, f'{self.board}.doc2vec')
        model.save(filename)

        return model
Example #36
    def testSaveLoadNoCommonTerms(self):
        """Ensure backwards compatibility with old versions of Phraser, before common_terms."""
        bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl"))
        self.assertEqual(bigram_loaded.common_terms, frozenset())
Example #37
    def testSaveLoadNoScoring(self):
        """Saving and loading a Phraser object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phraser."""
        bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl"))
        # we don't do much with scoring here, just verify it's the one expected
        self.assertEqual(bigram_loaded.scoring, original_scorer)