def extract_feature():
    """Extract features from the corpus."""
    corpus = get_corpus()
    corpus.initialize()
    corpus.cal_feature()
def train():
    """Process the corpus and compute the HMM model probabilities."""
    corpus = get_corpus()
    corpus.initialize()
    corpus.cal_state()
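# Hypothetical sketch of the kind of computation cal_state() suggests:
# maximum-likelihood estimates of HMM initial, transition, and emission
# probabilities from a tagged corpus. The data layout (a list of sentences,
# each a list of (word, state) pairs) is an assumption, not the actual
# corpus format used above.
from collections import Counter, defaultdict

def estimate_hmm(tagged_sentences):
    init_c, trans_c, emit_c = Counter(), defaultdict(Counter), defaultdict(Counter)
    for sent in tagged_sentences:
        init_c[sent[0][1]] += 1                       # state starting the sentence
        for word, state in sent:
            emit_c[state][word] += 1                  # state -> word emissions
        for (_, s1), (_, s2) in zip(sent, sent[1:]):
            trans_c[s1][s2] += 1                      # state -> state transitions
    init_p = {s: n / sum(init_c.values()) for s, n in init_c.items()}
    trans_p = {s: {t: n / sum(c.values()) for t, n in c.items()}
               for s, c in trans_c.items()}
    emit_p = {s: {w: n / sum(c.values()) for w, n in c.items()}
              for s, c in emit_c.items()}
    return init_p, trans_p, emit_p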
async def publications(first: str, second: str) -> List[Dict]:
    """
    - **first**: first element or measure
    - **second**: second element or one label
    """
    index = INDEX
    return get_corpus(**locals())
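# Hedged sketch of how an async endpoint with this signature is typically
# registered. FastAPI, the route path, and the placeholder body are all
# assumptions; the original snippet shows only the handler.
from typing import Dict, List
from fastapi import FastAPI

app = FastAPI()

@app.get("/publications", response_model=List[Dict])
async def publications(first: str, second: str) -> List[Dict]:
    # In the original, this delegates to get_corpus(**locals()).
    return []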
def main():
    """Set up the application and send love."""
    api = get_handle()
    markov = Markov(get_corpus())
    people = People(api)
    love = markov.generate_markov_text(random.randrange(15, 20))
    to_email = people.get_random()
    print(to_email)
    print(love)
    if api.send_love(to_email, love):
        return 0
    else:
        return -1
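# Hypothetical sketch of the kind of bigram chain that
# Markov.generate_markov_text() suggests; the real class's internals are not
# shown, so this illustrates the technique rather than the actual code.
import random
from collections import defaultdict

class BigramMarkov:
    def __init__(self, words):
        # Map each word to the list of words observed immediately after it.
        self.chain = defaultdict(list)
        for a, b in zip(words, words[1:]):
            self.chain[a].append(b)

    def generate(self, length):
        word = random.choice(list(self.chain))
        out = [word]
        for _ in range(length - 1):
            successors = self.chain.get(word)
            if not successors:
                break
            word = random.choice(successors)
            out.append(word)
        return ' '.join(out)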
def compute_vectors(filename, data_pos):
    data = corpus.get_corpus(filename, data_pos)
    # Materialize the sentences: gensim iterates over the corpus once per
    # epoch, so a one-shot chained generator would be exhausted after the
    # first pass and the remaining epochs would see no data.
    sentences = list(data.loc[:]['sentence'])
    print('Computing word vectors for {}...'.format(filename))
    # Note: size= and iter= are the gensim < 4.0 keyword names.
    wvmodel = w2v.Word2Vec(
        sentences=sentences,
        sg=1,
        size=6,
        hs=1,
        min_count=1,
        workers=1,
        iter=50000,
        compute_loss=True,
        window=6,
        seed=123,
    )
    print('Saving Model...')
    wvmodel.save(MODELPATH + '/' + filename + '_w2v_6_1')
    return 0
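# A small usage sketch for a model saved by compute_vectors(), assuming the
# same gensim version that trained it; the filename and the query word are
# illustrative assumptions.
loaded = w2v.Word2Vec.load(MODELPATH + '/' + 'dataset7' + '_w2v_6_1')
vec = loaded.wv['example']                      # the 6-dimensional word vector
print(loaded.wv.most_similar('example', topn=3))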
def main():
    model = w2v.Word2Vec.load(MODELPATH + '/dataset7_w2v_6')
    data_corpus = corpus.get_corpus('dataset7', 0)
    # data_orig = corpus.get_datalist('jeu_hota1')
    senvectors = []
    for i in data_corpus.index:
        # Look up each word's vector via model.wv (indexing the model
        # directly with model[x] is deprecated in gensim).
        senvecs = [model.wv[x] for x in data_corpus.loc[i]['sentence']]
        senvecs = np.vstack(senvecs)
        # Average over axis 0 to pool the word vectors into a single
        # fixed-length sentence vector (axis 1 would instead average the
        # dimensions of each word, giving a length that varies per sentence).
        senvecs = np.mean(senvecs, axis=0)
        senvectors.append(senvecs)
    senvectors = np.vstack(senvectors)
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(senvectors)
    cl_kmeans = kmeans.predict(senvectors)
    with open(RESULTPATH + '/kmeans4_dataset7', 'w') as f:
        for l in cl_kmeans:
            f.write(str(l) + '\n')
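# Hedged follow-up: one way to sanity-check the fixed choice of n_clusters=4
# is the silhouette score. This sketch assumes senvectors as computed above.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k).fit_predict(senvectors)
    print(k, silhouette_score(senvectors, labels))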
def pre_process():
    """Pre-process the corpus."""
    corpus = get_corpus()
    corpus.pre_process()
def __init__(self):
    self.corpus = get_corpus()
    self.states, self.init_p = self.get_init_state()
    self.trans_p = self.get_trans_state()
    self.vocabs, self.emit_p = self.get_emit_state()
    self.model = self.get_model()
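# Hypothetical sketch of how init_p/trans_p/emit_p tables like those built
# above are typically consumed: Viterbi decoding of the most likely state
# sequence. The dict-of-dicts probability layout is an assumption.
def viterbi(obs, states, init_p, trans_p, emit_p):
    V = [{s: init_p.get(s, 0.0) * emit_p[s].get(obs[0], 0.0) for s in states}]
    path = {s: [s] for s in states}
    for o in obs[1:]:
        V.append({})
        new_path = {}
        for s in states:
            # Best predecessor state for s at this position.
            prob, prev = max(
                (V[-2][p] * trans_p[p].get(s, 0.0) * emit_p[s].get(o, 0.0), p)
                for p in states
            )
            V[-1][s] = prob
            new_path[s] = path[prev] + [s]
        path = new_path
    best = max(states, key=lambda s: V[-1][s])
    return V[-1][best], path[best]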
def __init__(self):
    self.corpus = get_corpus()
    self.corpus.initialize()
    self.classes = len(self.corpus._states)
    self.config = get_config()
    self.model = None
        pos_tags.add(l)
        pos_tags.update(rl)
        Rule(l, rl)
    for tag in pos_tags:
        Pattern(tag)


def align_corpus(sens):
    for i, sn in enumerate(sens.sens):
        print('---------------------')
        al = sn.sl.align_tree_to_flat(sn.tl.children)
        sn.sl.filter_align(al)
        sn.sl.suggest_rules([i], sn.tl.children)
        print(al)
        print(sn.sl)
        print(sn.tl)


if __name__ == '__main__':
    import corpus
    parser = corpus.make_corpus_argparse('generate rtx rules from skeleton CFG file')
    parser.add_argument('cfg_rules', help='new-line separated CFG rules, such as generated by rtx-comp -s')
    parser.add_argument('rtx_file', help='file to write generated rules to')
    args = parser.parse_args()
    read_rules(args.cfg_rules)
    generate_rule_file(args.rtx_file)
    sens = corpus.get_corpus(args)
    for sen in sens.sens:
        print(sen.sl)
        print('+++++++++++++++++++++++++++++')
    sens.compile_and_retree(args.rtx_file)
    align_corpus(sens)
def __init__(self):
    self.corpus = get_corpus()
    self.config = get_config()
    self.model = None
    self.vectorizer = None
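# Hedged sketch of how a vectorizer/model pair like the attributes above is
# often populated with scikit-learn; TfidfVectorizer, LogisticRegression, and
# the texts/labels inputs are illustrative assumptions, not the project's code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def train_classifier(clf, texts, labels):
    # clf is an instance of the class whose __init__ appears above.
    clf.vectorizer = TfidfVectorizer()
    X = clf.vectorizer.fit_transform(texts)
    clf.model = LogisticRegression(max_iter=1000)
    clf.model.fit(X, labels)
    return clf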
def test_with_frequency():
    c = corpus.get_corpus("big-shark-little-shark")
    assert "the" in c
    assert c.get_count("the") > 0
def __init__(self):
    self.corpus = get_corpus()
    self.corpus.initialize()
    self.config = get_config()
    self.model = None
def main():
    parser = argparse.ArgumentParser(
        description='Run pairwise coreference resolution system')
    parser.add_argument('task', help='choose one task to go on: train/dev/test')
    parser.add_argument('-g', '--gold', default='gold', help='gold file name')
    parser.add_argument('-o', '--output', default='response', help='response file name')
    parser.add_argument('-m', '--model', default='model', help='model name')
    parser.add_argument('-v', '--vec', default='vec', help='vector name')
    args = parser.parse_args()

    task = args.task
    gold = args.gold + ".txt"
    output = args.output + ".txt"
    model_name = args.model + ".pkl"
    vec_name = args.vec + ".pkl"
    path = {
        "dev": 'conll-2012/dev/english/annotations',
        "train": 'conll-2012/train/english/annotations',
        "test": 'conll-2012/test/english/annotations',
    }

    start = time.time()
    print("Parsing", task, "files...")
    X, y, corpus_data, corpus_pairs = get_corpus(path[task], train=(task == "train"))

    if task == "train":
        vec = DictVectorizer()
        vec.fit(X)
        print("Saving vec...")
        joblib.dump(vec, vec_name)
        print("Transforming data...")
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                            hidden_layer_sizes=(5, 2), random_state=1)
        X = vec.transform(X)
        print("Training model...")
        model = clf.fit(X, y)
        print("Saving model...")
        joblib.dump(model, model_name)
    else:
        print("Transforming data...")
        vec = joblib.load(vec_name)
        model = joblib.load(model_name)
        X = vec.transform(X)

    print("Predicting...")
    pred = model.predict(X)
    print("sk-learn Pairwise classification F1:",
          f1_score(y, pred, average='macro'))
    pred = iter(pred.tolist())
    generate_files(corpus_data, corpus_pairs, pred, gold, output)
    end = time.time()
    print("Total time used: ", (end - start))
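# A small illustration of what DictVectorizer does with feature dicts like
# those get_corpus() presumably returns; the example feature names are made up.
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform([
    {'same_head': 1, 'distance': 3},
    {'same_head': 0, 'distance': 7},
])
print(vec.get_feature_names_out())   # ['distance', 'same_head']
print(X.toarray())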
    rule_scores = [evaluate_rule(corp, r, constituents) for r in rules]
    rule_ls = list(zip(rules, rule_scores))
    rule_ls.sort(reverse=True, key=lambda x: x[1][0] + x[1][1])
    for r, s in rule_ls:
        print('%s\t%s' % (str(r), s))
    ret = []
    # Greedy selection: repeatedly keep the best-scoring rule and discard any
    # lower-scoring rule that overlaps it or falls below the score threshold.
    while rule_ls:
        todo = []
        cur = rule_ls[0][0]
        for rl, score in rule_ls[1:]:
            if score[0] + score[1] < (len(corp.sens) / 100.0):
                break
            if not cur.overlap(rl):
                todo.append((rl, score))
        ret.append(cur)
        rule_ls = todo
    return ret


if __name__ == '__main__':
    parser = corpus.make_corpus_argparse(
        'build rules from corresponding word/phrase pairs')
    parser.add_argument('rtx_file', help='file to write generated rules to')
    args = parser.parse_args()
    corp = corpus.get_corpus(args)
    rls = []
    for i in range(10):
        rls += add_rules(corp, 'PHRASE_' + str(i))
    generate_rule_file(args.rtx_file, rls)
    corp.compile_and_retree(args.rtx_file)
import preprocessing
import testing
import corpus
import classifiers_ioana as c_i
import data_models

if __name__ == "__main__":
    # Gather news from websites and separate in json files
    # scraper.scrape_data()

    # Get the true-news and fake-news corpora
    true_news_corpus, fake_news_corpus = corpus.get_corpus()
    data_models.get_corpus_word_count(true_news_corpus, fake_news_corpus)

    # Preprocess data and add to json files
    # preprocessing.preprocess_data(true_news_corpus, fake_news_corpus)

    # Get preprocessed data
    true_pre_data, fake_pre_data = preprocessing.get_preprocessed_data()
    data_models.get_processed_data_word_count(true_pre_data, fake_pre_data)

    # Merge labeled data
    merged_labeled_data = preprocessing.merge_news(true_pre_data, fake_pre_data)

    # Get word_frequency
    word_frequency = preprocessing.get_word_frequency(merged_labeled_data)

    # Get vocabulary
    vocabulary = preprocessing.get_vocabulary(merged_labeled_data, word_frequency)
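# Hedged sketch of a word-frequency helper of the kind get_word_frequency()
# appears to be; it assumes merged_labeled_data is an iterable of
# (tokens, label) pairs, which is a guess about the data layout.
from collections import Counter

def word_frequency(merged_labeled_data):
    freq = Counter()
    for tokens, _label in merged_labeled_data:
        freq.update(tokens)
    return freq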
def _get_corpus() -> corpus.Corpus:
    selected_corpus_id = _get_settings().selected_corpus_id
    return corpus.get_corpus(selected_corpus_id)
def test_without_frequency():
    c = corpus.get_corpus("dolch")
    assert "the" in c
    assert c.get_count("the") is None
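# The two get_count tests above imply a minimal Corpus interface: membership
# testing plus an optional per-word count. This hypothetical sketch shows one
# shape such a class could take; the real corpus module may differ.
from typing import Dict, Optional

class Corpus:
    def __init__(self, words: Dict[str, Optional[int]]):
        self._words = words

    def __contains__(self, word: str) -> bool:
        return word in self._words

    def get_count(self, word: str) -> Optional[int]:
        # None for frequency-less word lists such as "dolch" above.
        return self._words.get(word)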