def get_text_search_terms(keywords, synonyms_threshold, fasttext_model):
    bi_gram_model = Phraser.load('src/models/bi_gram_model.pkl')
    tri_gram_model = Phraser.load('src/models/tri_gram_model.pkl')

    # clean tokens
    cleaned_terms = clean_tokenized_sentence(keywords.split(' ')).split()

    # remove empty terms
    cleaned_terms = [term for term in cleaned_terms if term]

    # stem terms
    cleaned_terms = [ps.stem(term) for term in cleaned_terms]

    # create bi-grams
    terms_with_bigrams = bi_gram_model[' '.join(cleaned_terms).split(' ')]

    # create tri-grams
    terms_with_trigrams = tri_gram_model[terms_with_bigrams]

    # expand query with synonyms
    search_terms = [
        fasttext_model.wv.most_similar(token) for token in terms_with_trigrams
    ]

    # filter synonyms above threshold (and flatten the list of lists)
    search_terms = [
        synonym[0]
        for synonyms in search_terms
        for synonym in synonyms
        if synonym[1] >= synonyms_threshold
    ]

    # expand keywords with synonyms
    search_terms = list(terms_with_trigrams) + search_terms

    return search_terms
def get_phraser(self, directory, sensitivity=3):
    if not os.path.isdir(directory):
        os.makedirs(directory)

    print("\t\tGetting bigram detector...")
    if not os.path.isfile(directory + '/bigrams.pkl'):
        self.bigram = Phraser(
            Phrases(self.docs(n_examples=-1), min_count=2,
                    threshold=sensitivity, max_vocab_size=2000000))
        self.bigram.save(directory + '/bigrams.pkl')
    else:
        self.bigram = Phraser.load(directory + '/bigrams.pkl')

    print("\t\tGetting trigram detector...")
    if not os.path.isfile(directory + '/trigrams.pkl'):
        self.trigram = Phraser(
            Phrases(self.bigram[self.docs(n_examples=-1)], min_count=2,
                    threshold=sensitivity + 1, max_vocab_size=2000000))
        self.trigram.save(directory + '/trigrams.pkl')
    else:
        self.trigram = Phraser.load(directory + '/trigrams.pkl')
def __init__(self, sentences_file: str, bigram_model_path: str,
             trigram_model_path: str, fasttext_model_path: str):
    print(f'Loading CSV: {sentences_file} and building mapping dictionary...')
    sentences_df = pd.read_csv(sentences_file)
    self.sentence_id_to_metadata = {}
    for row_count, row in sentences_df.iterrows():
        self.sentence_id_to_metadata[row_count] = dict(
            paper_id=row['paper_id'],
            cord_uid=row['cord_uid'],
            source=row['source'],
            publish_time=row['publish_time'],
            authors=row['authors'],
            section=row['section'],
            sentence=row['sentence'],
        )
    print(f'Finished loading CSV: {sentences_file} and building mapping dictionary')

    self.cleaned_sentences = sentences_df['cleaned_sentence'].tolist()
    print(f'Loaded {len(self.cleaned_sentences)} sentences')

    print(f'Loading bi-gram model: {bigram_model_path}')
    self.bigram_model = Phraser.load(bigram_model_path)
    print(f'Finished loading bi-gram model: {bigram_model_path}')

    print(f'Loading tri-gram model: {trigram_model_path}')
    self.trigram_model = Phraser.load(trigram_model_path)
    print(f'Finished loading tri-gram model: {trigram_model_path}')

    self.synonyms_model = Synonyms(fasttext_model_path)
def load_phrasers(directory=MODEL_DIRECTORY):
    path = os.path.join(directory, "bigram-phraser.pkl")
    bigram_phraser = Phraser.load(path)
    path = os.path.join(directory, "trigram-phraser.pkl")
    trigram_phraser = Phraser.load(path)
    return bigram_phraser, trigram_phraser
def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
    self.source = source
    self.max_sentence_length = max_sentence_length
    self.limit = limit
    self.bigram = Phraser.load('./preprocessed_big_phrases')
    self.trigram = Phraser.load('./preprocessed_trigram_phrases')
def __init__(self):
    # model/xgb_pipeline.pkl
    with open('model/fake_tokenizer.pickle', 'rb') as handle:
        self.tokenzs = pickle.load(handle)
    with open('model/model_in_json2.json', 'r') as f:
        self.model_json = json.load(f)
    # print(self.model_json)
    self.l_model = model_from_json(self.model_json)
    self.l_model.load_weights('model/model_weights2.h5')
    self.bigrams = Phraser.load("model/bigrams")
    self.trigrams = Phraser.load("model/trigrams")
    self.fake_score = 0
def get_params(mode):
    if mode == 'all':
        bad_words = pd.read_csv('data/bad_words.csv').values
        n_grams = Phraser.load("data/ngrams_model.pkl")
        dict_tfidf = np.load('data/my_dict.npy', allow_pickle='TRUE').item()
        word2vec = Word2Vec.load('data/word2vec.model')
        descriptors = pd.read_csv('data/descriptors.csv').set_index(
            'raw descriptor')
        return bad_words, n_grams, dict_tfidf, word2vec, descriptors
    if mode == 'half':
        bad_words = pd.read_csv('data/bad_words.csv').values
        n_grams = Phraser.load("data/ngrams_model.pkl")
        descriptors = pd.read_csv('data/descriptors.csv').set_index(
            'raw descriptor')
        return bad_words, n_grams, descriptors
def load_phrases_model(path) -> Phraser:
    """
    Load a trained Phraser model for collecting phrase statistics.

    :param path: path to the saved Phraser model
    :return: the loaded Phraser
    """
    return Phraser.load(path)
def scan_processed_with_phraser_(path_journal_processed, journ, path_phraser_models):
    print("Launching scan of " + journ)

    # load the phraser model of the given journal name from given path
    fpath = path_phraser_models + '/' + journ + '_00_bigramphraser'
    bigram = Phraser.load(fpath)

    # dict of dicts; filenames as key, dict of frequency {word:count} as value
    texts = {}

    # iterate over all data from that journal
    fnames = os.listdir(path_journal_processed)
    ln_fnames = len(fnames)
    with open('logfile.txt', 'w') as lfh:
        for i, fname in enumerate(fnames):
            text = []
            for line in open(os.path.join(path_journal_processed, fname)):
                text = text + bigram[line.split()]
            frequency = defaultdict(float)
            for word in text:
                frequency[word] += 1
            # add the frequency counts of this text file to master dict
            texts[fname] = frequency
            if i % 150 == 0:
                lfh.write("Done scanning " + fname +
                          " ; {} out of {}".format(i + 1, ln_fnames) + '\n')
                lfh.flush()
    return texts
def load_vector_data(dataset_name, bgr=False, split_factor=0.2):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv",
                            delimiter=",", dtype=str).astype(str).values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv",
                          dtype=types)["a"]
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name +
                                 "_fasttext")

    # replace placeholders (" "), make one-string-sentences
    print("... replacing placeholders")
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]

    tokenized = sentences
    if bgr:
        bigram = Phraser.load("../models/bigrams/bigram_" + dataset_name + ".pkl")
        bigrammed = [bigram[sentence] for sentence in sentences]
        tokenized = bigrammed

    inputs = [
        np.sum(vector_model.wv[sent], 0).tolist() if sent else np.zeros(32)
        for sent in tokenized
    ]
    inputs = np.array(inputs)

    train_loader, val_loader, test_loader = make_loader(
        inputs, targets, split_factor)
    return len(inputs[0]), train_loader, val_loader, test_loader
def fit(self, sentencesPath):
    """
    Train the phrase models.

    :param sentencesPath: path to a text file containing one sentence per line
    """
    self.phrasers = []
    # check that all target directories exist
    for path in self.savePhraserPaths:
        if not os.path.exists(os.path.dirname(path)):
            raise FileNotFoundError(os.path.dirname(path) + " does not exist")
    for path in self.savePhraserPaths:
        if not os.path.exists(path):
            # at least one model is missing, so training is needed
            self.phrasers = None
            break

    if self.phrasers is not None and self.file_overwrite == False:
        logging.info("models already exist, will read them")
        for path in self.savePhraserPaths:
            self.phrasers.append(Phraser.load(path))
        return True

    self.phrasers = []
    c = 2
    for path in self.savePhraserPaths:
        logging.info("getting %d-gram phrase......" % c)
        c += 1
        phraser = Phraser(
            Phrases(sentences=TxtIter(sentences=codecs.open(
                        sentencesPath, mode="r", encoding="utf-8"),
                        ngrams=self.phrasers),
                    min_count=self.min_count,
                    threshold=self.threshold,
                    max_vocab_size=self.max_vocab_size,
                    delimiter=self.delimiter,
                    scoring=self.scoring))
        phraser.save(path)
        self.phrasers.append(phraser)
def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
def main(coursesList):
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    corpus = [dictionary.doc2bow(text) for text in text_clean]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()
    #lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    #courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    id_groups = cursor.fetchall()
    for i in id_groups:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': i[0]})
        studyplan_id = cursor.fetchall()
        for j in studyplan_id:
            subject_list = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(j[0], subject_list))
            #for k in subject_list:
            token_stud_prog = [
                program.split(' ') for program in subject_list['description']
            ]
            #token_stud_prog = add_n_grams(token_stud_prog, bigrams, trigrams)
            prog_corp = [
                dictionary.doc2bow(program) for program in token_stud_prog
            ]
            topic_prog = lda.get_document_topics(prog_corp)
            for l in range(0, len(topic_prog)):
                profile_student = np.zeros(config.num_lda_topic)
                dense_topic_prog = np.zeros(config.num_lda_topic)
                for m in topic_prog[l]:
                    dense_topic_prog[m[0]] += m[1]
                #mask = np.argsort(dense_topic_prog)[::-1][:1]
                #profile_student[mask] += 1
                profile_student = dense_topic_prog
                cosine_similarities = linear_kernel(
                    profile_student.reshape(1, -1), courses_topic).flatten()
                top_courses = np.where(cosine_similarities >= 0.2)[0]
                print(subject_list.loc[l, 'id_subject'])
                #print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
def ngrams_preprocess(corpus, ngrams=1, grams_join=" "):
    lst_corpus = []
    for string in corpus:
        lst_words = string.split()
        lst_grams = [
            grams_join.join(lst_words[i:i + ngrams])
            for i in range(0, len(lst_words), ngrams)
        ]
        lst_corpus.append(lst_grams)

    bigram = Phraser.load('\\Users\\Zeden\\Desktop\\bigram')
    trigram = Phraser.load('\\Users\\Zeden\\Desktop\\trigram')
    lst_ngrams_detectors = [bigram, trigram]
    if len(lst_ngrams_detectors) != 0:
        for detector in lst_ngrams_detectors:
            lst_corpus = list(detector[lst_corpus])
    return lst_corpus
def __init__(self):
    assert spacy.en.STOP_WORDS
    self.STOP_WORDS = spacy.en.STOP_WORDS
    nlp = spacy.load('en')
    nlp.pipeline = [nlp.tagger, nlp.parser]
    self.nlp = nlp
    self.bigram_model = Phraser.load('bigram_model')
def __init__(self):
    assert spacy.en.STOP_WORDS
    self.STOP_WORDS = spacy.en.STOP_WORDS
    nlp = spacy.load('en')
    nlp.pipeline = [nlp.tagger, nlp.parser]
    self.nlp = nlp
    self.bigram_model = Phraser.load(
        'C:/Users/raula/dissertation_demo/python_scripts/bigram_model')
def loadNgrams(self, N=2):
    phrasesdir = environment.MODELS_DIR + "phrases/"
    if N == 2:
        phr_path = "bigrams.phr"
    elif N == 3:
        phr_path = "trigrams.phr"
    else:
        phr_path = "bigrams.phr"
    return Phraser.load(phrasesdir + phr_path)
def __init__(self, args):
    # load all the haikus from a file
    with open(args.data_path, "r", encoding="utf8", errors="ignore") as infile:
        self.data = infile.read().splitlines()

    self.word2vec = gensim.models.KeyedVectors.load(
        f"{args.model_path}/word2vec.model")
    self.bigrams = Phraser.load(f"{args.model_path}/bigram.model")
def from_file(cls, dict_fname, phraser_fname=None):
    """Load tokenizer information from a dictionary file (generated by
    gensim dictionary.save) and a phraser file."""
    d = Dictionary.load(str(dict_fname))
    if phraser_fname is not None:
        p = Phraser.load(phraser_fname)
    else:
        p = Phraser(Phrases([[]]))
    return cls(d, p)
def get_trigram_phraser(directory):
    if os.path.isfile(TRIGRAM):
        return Phraser.load(TRIGRAM)
    else:
        bigram = get_bigram_phraser(directory)
        sentence_stream = (bigram[sentence] for sentence in corpus(directory))
        trigram = Phraser(Phrases(sentence_stream))
        trigram.save(TRIGRAM)
        return trigram
def testSaveLoad(self):
    """Saving and loading a Phraser object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        self.assertEqual(
            bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
            ['graph_minors', 'survey', 'human_interface', 'system'])
def build_csv(metadata: str, dirs: List[str], output: str,
              bigram_model_path: str, trigram_model_path: str,
              filter_covid_19: bool, sentences: bool):
    print(f'Building metadata dictionary from file: {metadata} ...')
    sha_to_properties = _build_metadata_dict(metadata)
    print(f'Finished building metadata dictionary from file: {metadata}')
    print(f'Filtering only COVID-19 articles: {filter_covid_19}')

    bigram_model = None
    trigram_model = None
    if bigram_model_path and trigram_model_path:
        bigram_model = Phraser.load(bigram_model_path)
        trigram_model = Phraser.load(trigram_model_path)

    all_df = None
    for dir_name in dirs:
        print(f'Loading files from directory: {dir_name} ...')
        dir_files = load_files(dir_name)
        print(f'Finished loading files from directory: {dir_name}')
        if sentences:
            clean_df = generate_df_sentence_level(dir_files, sha_to_properties,
                                                  filter_covid_19, bigram_model,
                                                  trigram_model)
        else:
            clean_df = generate_df(dir_files, sha_to_properties, filter_covid_19)
        if all_df is None:  # first call
            all_df = clean_df
        else:
            all_df = all_df.append(clean_df)

    all_df.fillna("", inplace=True)

    # filter out short sentences
    all_df = all_df[all_df['cleaned_sentence'].apply(
        lambda sent: len(sent.split()) >= 5)]

    print(f'All files DataFrame shape: {all_df.shape}')
    print(f'Writing CSV file to: {output}')
    with open(output, 'w+') as out_fp:
        all_df.to_csv(out_fp, index=True)
def load_bigram(self):
    '''
    Search for and load a pre-existing bigrams file

    :update: self.bigram
    '''
    self.bigram = Phraser.load(
        f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
    print("Bigram loaded.")
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phraser object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(
            Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        # we do not do much with scoring, just verify it is the one expected
        self.assertEqual(bigram_loaded.scoring, dumb_scorer)
def build_ngram_model(docs):
    bigram_model_path = Path('bigram_phraser.pkl')
    trigram_model_path = Path('trigram_phraser.pkl')

    if not bigram_model_path.exists() or not trigram_model_path.exists():
        print('Building n-gram models')
        bigram = Phrases(docs, min_count=3, threshold=6)
        trigram = Phrases(bigram[docs], min_count=3, threshold=6)
        bigram_model = Phraser(bigram)
        trigram_model = Phraser(trigram)
        bigram_model.save('bigram_phraser.pkl')
        trigram_model.save('trigram_phraser.pkl')
    else:
        print('Loading saved n-gram models')
        bigram_model = Phraser.load('bigram_phraser.pkl')
        trigram_model = Phraser.load('trigram_phraser.pkl')

    return (bigram_model, trigram_model)
def testCompatibilty(self):
    phr = Phraser.load(datapath("phraser-3.6.0.model"))
    model = Phrases.load(datapath("phrases-3.6.0.model"))

    test_sentences = ['trees', 'graph', 'minors']
    expected_res = ['trees', 'graph_minors']

    phr_out = phr[test_sentences]
    model_out = model[test_sentences]

    self.assertEqual(phr_out, expected_res)
    self.assertEqual(model_out, expected_res)
def build_corpus(dirs: List[str], output: str, bigram_model_path: str,
                 trigram_model_path: str, filter_covid19: bool):
    bigram_model = None
    trigram_model = None
    if bigram_model_path and trigram_model_path:
        bigram_model = Phraser.load(bigram_model_path)
        trigram_model = Phraser.load(trigram_model_path)

    all_sentences = []
    for dir_name in dirs:
        print(f'Loading files from directory: {dir_name} ...')
        dir_files = _load_files(dir_name)
        print(f'Finished loading files from directory: {dir_name}')
        all_sentences.extend(
            _generate_sentences(dir_files, bigram_model, trigram_model,
                                filter_covid19))

    print(f'No. of lines: {len(all_sentences)}')
    print(f'Writing TXT file to: {output}')
    with open(output, 'w+') as out_fp:
        out_fp.write("\n".join(all_sentences))
def load_phraser(self, phraser_name=None):
    """
    Loads phraser from phrasers folder

    :param phraser_name: defaults to tag, the name of phraser file to load
    """
    # initializes optional arguments to tag
    if phraser_name is None:
        phraser_name = self.tag

    # loads phraser
    filename = os.path.join(PHRASERS_PATH, f'{phraser_name}.pkl')
    self._phraser = Phraser.load(filename)
def __init__(self, embeddings_source=EMBEDDINGS,
             out_embeddings_source=OUT_EMBEDDINGS,
             formulas_source=FORMULAS,
             phraser_source=PHRASER):
    """
    :param embeddings_source: an instance of a Magnitude object, or a url or
        path to a serialized Magnitude object
    :param out_embeddings_source: an instance of a Magnitude object, or a url
        or path to a serialized Magnitude object
    :param formulas_source: a url or path to a JSON-serialized dict of
        formulae; if not supplied, a default file is loaded
    """
    # hidden layer embeddings (W)
    self.embeddings = Magnitude(embeddings_source, eager=False)
    # output layer embeddings (O)
    self.out_embeddings = Magnitude(out_embeddings_source)

    # load pre-trained formulas from embeddings
    with open(formulas_source, 'r') as f:
        self.formulas_with_abbreviations = load(f)

    self.dp = DataPreparation(local=False)
    self.es = ElasticConnection()

    self.formulas = {
        k: v
        for k, v in self.formulas_with_abbreviations.items()
        if k not in self.ABBR_LIST
    }
    self.formula_counts = {
        root_formula: sum(formulas.values())
        for root_formula, formulas in self.formulas.items()
    }
    self.most_common_forms = {
        formula_group_name:
        (formula_group_name if formula_group_name in self.dp.ELEMENTS else
         max(formulae.items(), key=operator.itemgetter(1))[0])
        for formula_group_name, formulae in self.formulas_with_abbreviations.items()
    }
    self.phraser = Phraser.load(phraser_source)
def phrase_corpus(infile, outfile, phraserfile):
    """
    Load a trained phraser object and apply it to the extracted wikipedia corpus text.

    :param infile: wikipedia xml file
    :param outfile: .bz2 archive file to save phrased text to
    :param phraserfile: gensim phraser file
    :return:
    """
    p = Phraser.load(phraserfile)
    with bz2.open(outfile, "wt", encoding="utf8") as F:
        for i in tqdm(file_yielder(infile), desc="Phrasing"):
            F.write(" ".join(p[i.split()]) + "\n")
    return 0
def build_phrases(self):
    threads = ReadThreads(
        self.board, self.input_dir, return_func=lambda x, y: (x, y.split()))
    filename = op.join(self.input_dir, f'{self.board}.trigrams')
    trigram_mod = Phraser.load(filename)

    filename = op.join(self.input_dir, f'{self.board}.phrases')
    with open(filename, 'wt') as f:
        for num, thread in threads:
            line = ' '.join([
                word for word in trigram_mod[thread]
                if word not in STOPWORDS and len(word) >= 3
            ])
            print(f'{num}\t{line}', file=f)
def build_doc2vec_model(self, vectors: int = 200):
    filename = op.join(self.input_dir, f'{self.board}.phraser')
    phraser = Phraser.load(filename)
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: TaggedDocument(phraser[y.split()], [x]))

    model = Doc2Vec(vector_size=vectors, window=2, min_count=5, workers=3)
    model.build_vocab(documents=documents)
    model.train(
        documents=documents,
        total_examples=model.corpus_count,
        epochs=model.iter,
    )

    filename = op.join(self.input_dir, f'{self.board}.doc2vec')
    model.save(filename)

    return model
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phraser, before common_terms."""
    bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl"))
    self.assertEqual(bigram_loaded.common_terms, frozenset())
def testSaveLoadNoScoring(self):
    """Saving and loading a Phraser object with no scoring parameter.
    This should ensure backwards compatibility with old versions of Phraser."""
    bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl"))
    # we do not do much with scoring, just verify it is the one expected
    self.assertEqual(bigram_loaded.scoring, original_scorer)
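All of the examples above assume a Phraser that was previously trained and persisted with `.save()`. As a minimal, self-contained sketch of that round-trip (assuming gensim 3.x, where `Phraser` is importable from `gensim.models.phrases`; the toy sentences and the file name are illustrative, not taken from any example above):

# Minimal save/load round-trip for a gensim Phraser (gensim 3.x assumed).
from gensim.models.phrases import Phrases, Phraser

sentences = [['machine', 'learning', 'is', 'fun'],
             ['machine', 'learning', 'models', 'need', 'data']]

# Train a Phrases model, then freeze it into a lightweight Phraser for fast lookup.
bigram = Phraser(Phrases(sentences, min_count=1, threshold=1))
bigram.save('bigram_phraser.pkl')  # persist to disk

# Later (or in another process), reload the model and apply it to a tokenized sentence.
bigram_loaded = Phraser.load('bigram_phraser.pkl')
print(bigram_loaded[['machine', 'learning', 'is', 'fun']])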