def __init__(self, encode_model_path, decode_model_path):
    self.epath = encode_model_path
    self.dpath = decode_model_path
    # select the device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # load the corpus
    self.cp = corpus.Corpus('valid.en-zh.zh.sgm', 'valid.en-zh.en.sgm',
                            'zh_dictionary.pth', 'en_dictionary.pth')
    # initialize hyperparameters; for now the encoder and decoder share the
    # same embedding and hidden-layer sizes
    self.embedding_size = 2000
    self.hidden_dim = 500
    self.learning_rate = 0.0002
    self.batch_size = 10
    # initialize the models
    self.encode_network = None
    self.decode_network = None
    self.model_init()
    # define the optimizers and the loss function
    self.encode_optim = torch.optim.SGD(self.encode_network.parameters(),
                                        lr=self.learning_rate)
    self.decode_optim = torch.optim.SGD(self.decode_network.parameters(),
                                        lr=self.learning_rate)
    self.criterion = torch.nn.CrossEntropyLoss()
    # get sentence iterators for both languages
    self.zh = self.cp.sentence_iterator('zh')
    self.en = self.cp.sentence_iterator('en')
    # define the encoder's initial ht and ct
    self.h0 = torch.zeros(1, 1, self.hidden_dim).to(self.device)
    self.c0 = torch.zeros(1, 1, self.hidden_dim).to(self.device)

def add_voice(self):
    # creates a new voice with no name and an empty tree of corpora
    new_voice = voice.Voice({})
    texts = os.listdir('texts')
    add_another_corpus = ''
    while add_another_corpus != 'n':
        for i in range(len(texts)):
            print("%s %s" % (i + 1, texts[i]))
        choice = input('Enter the number of the corpus you want to load:\n')
        corpus_name = texts[int(choice) - 1]
        path = 'texts/%s' % corpus_name
        f = open(path, 'r')
        text = f.read()
        corpus_weight_prompt = 'Enter the weight for %s:\n' % corpus_name
        corpus_weight = float(input(corpus_weight_prompt))
        new_voice.add_corpus(corpus.Corpus(text, corpus_name), corpus_weight)
        texts.remove(corpus_name)
        add_another_corpus = input('Add another corpus to this voice? y/n\n')
    voicename = input('Name this voice:\n')
    new_voice.name = voicename
    new_voice.normalize_weights()
    self.voices[voicename] = new_voice

def main():
    # doc = "/home/daniel/data/Ciclo6/Tesis2/stompol-tweets-train-tagged.xml"
    doc = "/home/daniel/data/Ciclo6/Tesis2/xmlSampleFile.xml"
    # doc = "/home/daniel/data/Ciclo6/Tesis2/xmlSampleFile2.xml"
    # doc = "/home/daniel/data/Ciclo6/Tesis2/xmlStandardFile.xml"
    xmlparser = XML.XmlParser(doc)
    tweets = xmlparser.root
    corpus = COR.Corpus()
    for tweet in tweets:
        tweetEntities = xmlparser.extractEntity(tweet)
        corpus.addNewEntities(tweetEntities)
        for tweetEntity in tweetEntities:
            entity = corpus.getEntity(tweetEntity)
            entity.addReview(tweet)
    lsa = LAT.LSA(tweets)
    lsa.singularValueDecomposition()
    lsa.reduceDimension()
    lsa.reconstructMatrix()
    corpus.assignSemanticSimilarity(lsa)
    sentiStrength = SENSTR.sentiStrength()
    corpus.assignPolaritySimilarity(sentiStrength)
    for entity in corpus.entities:
        entity.obtainLeaders()
        entity.obtainCommunities()
        entity.assignOrder()
        entity.fullParsing()
        print(entity.generateSummary())
        print()

def init():
    filepath = '/home/hr/Scripts/python/markov_chains/elliot/test_corpus.txt'
    while True:
        r = input('\n[ENTER]: Continue [Q]: Quit \n').lower()
        if r == 'q':
            break
        else:
            order = int(input('Enter n-gram order: '))
            num_sentences = 5
            c = corpus.Corpus(filepath, order)
            words = c.get_corpus()
            tokens = c.tokenize(words)
            counts = parse_corpus.probabilities(tokens[3])
            i = 0
            while i <= num_sentences:
                generate(tokens, counts, order)
                i += 1

def __init__(self, db_name, coll_name):
    self.base_url = 'http://stream.twitter.com/1/statuses/sample.json'
    self.config = Pit.get('twitter_api_gardenhose')
    self.user = self.config['user']
    self.passwd = self.config['passwd']
    self.db = corpus.Corpus(database=db_name, collection=coll_name)

def load_voices_from_transcript(self):
    transcripts = os.listdir('texts/transcripts')
    for i in range(len(transcripts)):
        print("%s %s" % (i + 1, transcripts[i]))
    choice = input('Enter the number of the transcript you want to load:\n')
    transcript_name = transcripts[int(choice) - 1]
    number = int(input('Enter the number of voices to load:\n'))
    for charname, size in self.biggest_characters(transcript_name, number):
        print(charname)
        path = 'texts/transcripts/%s/%s' % (transcript_name, charname)
        source_text = open(path).read()
        corpus_name = charname
        weighted_corpora = {}
        weighted_corpora[charname] = [corpus.Corpus(source_text, corpus_name), 1]
        self.voices[charname] = voice.Voice(weighted_corpora, charname)

def emailsByTimes():
    timeAndEmails = parseSQL.getEmailsAndTimes(HILLARY)
    timesSentence = {}
    for key in timeAndEmails.keys():
        print "HOUR", str(key) + ":00"
        words_dict = utils.basic_count(timeAndEmails[key])
        print "number of distinct words", len(words_dict)
        print "top 5 words", sorted(words_dict, key=words_dict.get, reverse=True)[:10]
        n = ngram.NGram(2, 'word', words_dict)
        c = corpus.Corpus('../output/hillary/times/' + str(key) + 'Hour.txt')
        timesSentence[key] = c.numtokens / c.numsents
        print c.display_stats()
        print n.display_stats()
    print timesSentence

def pickle_corpus():
    """
    Reads in the entire json Queen Victoria Correspondence Corpus, stores the
    data in Letter and Corpus objects, and pickles the objects. Only call this
    function if the corpus has changed.
    :return: None
    """
    letter_corp = corpus.Corpus([])
    with open('letters.json', encoding='utf8') as f:
        letter_list = json.load(f)
    for i, l in enumerate(letter_list):
        l_obj = letter.Letter(l['writer'], l['addressee'], l['year'],
                              l['language'], l['text'], i)
        letter_corp.add_letter(l_obj)
        letter_corp.add_writer(l['writer'].lower())
        letter_corp.add_addressee(l['addressee'].lower())
        letter_corp.add_year(l['year'])
    letter_corp.compute_total_word_count()
    letter_corp.sort_years()
    with open('corpus.pickle', 'wb') as f:
        pickle.dump(letter_corp, f)

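A minimal companion sketch for reading the pickled corpus back in later runs. The helper name load_corpus is hypothetical; it only assumes the corpus.pickle file written by pickle_corpus() above.

import pickle

def load_corpus():
    # Hypothetical helper: load the Corpus object pickled by pickle_corpus().
    # The corpus and letter modules must be importable for unpickling to resolve
    # the Corpus and Letter classes.
    with open('corpus.pickle', 'rb') as f:
        return pickle.load(f)
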
parser.add_argument('--gpu', type=int, default=0, help='gpu to use')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)

if not args.lm1b:
    with doing('Loading data'):
        corpus = corpus.Corpus(args.data, args.dic)
        ntokens = len(corpus.dictionary.idx2word)
        cutoffs = args.cutoffs + [ntokens]
else:
    ###########################################################################
    # Load data
    ###########################################################################
    # Torch
    word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy()
    mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
    print("load word frequency mapping - complete")
    ntokens = len(word_freq)
    nsampled = 8192

def __init__(self):
    self.window = tkinter.Tk()
    self.index = pickle.load(open('inverted_index.pkl', "rb"))
    self.corpus = corpus.Corpus()

import sys
sys.path.append('..')

import corpus as cp
import distributed_representation as dr
import utility

# data download
dl = utility.data_loader()
dl.dataload()

corpus = cp.Corpus(data='data/simple-examples/data/ptb.train.txt',
                   mode="l",
                   max_vocabulary_size=5000,
                   max_line=10,
                   minimum_freq=5)

window_size = 1
embedding_dims = 100
batch_size = 128

import time
start = time.time()

dr_sgns = dr.DistributedRepresentation(corpus, embedding_dims, window_size,
                                       batch_size, model_type="skip-gram",
                                       ns=0,

        cur_loss = total_loss / interval
        elapsed = time.time() - start_time
        print('| epoch {:3d} | {:5d}/{:5d} batches | l_rate {:02.2f} | ms/batch {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(
                  epoch, batch, len(train_data) // args.bptt, l_rate,
                  elapsed * 1000 / interval, cur_loss, math.exp(cur_loss)))
        total_loss = 0
        start_time = time.time()
    return loss


if __name__ == "__main__":
    args = args_parse()
    corpus = corpus.Corpus(args.data)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size)  # size (total_len // bsz, bsz)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    # Build the model
    interval = 200  # interval to report
    ntokens = len(corpus.dictionary)  # 10000
    model = model.RNNModel(ntokens, args.embed_size, args.n_hid, args.n_layers, args.dropout)
    print(model)
    criterion = nn.CrossEntropyLoss()

# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import yaml
import io
from googletrans import Translator
import corpus
import os

translator = Translator()
corpus = corpus.Corpus()

file_name = '/home/kjh948/chatterbot/chatterbot-corpus/chatterbot_corpus/data/english/ai.yml'
file_name2 = 'ai_ko.yml'
dotted_path = '/home/kjh948/workspace/chatterbot/chatterbot-corpus/chatterbot_corpus/data/english/'

list_corpus_files = corpus.list_corpus_files(dotted_path)

for yml_file in list_corpus_files:
    with io.open(yml_file, encoding='utf-8') as data_file:
        src = yaml.load(data_file)
    print('loading ' + yml_file)
    for x in range(0, len(src['conversations'])):
        print(str(x) + ' in ' + str(len(src['conversations'])))
        for t in range(0, len(src['conversations'][x])):
            try:

iterations_test = 100
cvgThreshold = 0.1
training_ratio = 0.9
eval_every = 100
no_below = 2000
no_above = 1.0

dataset_dir = 'datasets/'
dataset_file = 'abcnews-date-text.csv'
result_dir = 'results/'
filename_pattern = 'result_dataset={0}_k={1}_V={2}_iter={3}.csv'

start_time = time.time()

corpus = cor.Corpus(os.path.join(dirname, dataset_dir + dataset_file))
vocabulary = vb.Vocabulary(corpus.get_docs())
print("Number of documents {0}".format(vocabulary.docs_num))
print("Number of words {0}".format(len(vocabulary.word_id)))
vocabulary.filter(no_below=no_below, no_above=no_above)
print("Number of words after filtering {0}".format(len(vocabulary.word_id)))

docs = []
word_no = 0
for i, doc in enumerate(corpus.get_docs()):
    bow = vocabulary.doc_to_bow(doc)
    for tupel in bow:
        word_no += tupel[1]
    docs.append(bow)

def setUp(self):
    self.corpus = COR.Corpus()

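A minimal follow-up test sketch under the same setUp. It assumes, as the main() example above suggests, that COR.Corpus exposes an entities collection and that a freshly constructed corpus starts with no entities; both are assumptions, not confirmed by the source.

def test_new_corpus_has_no_entities(self):
    # Assumes Corpus.entities exists (see the iteration over corpus.entities in main())
    # and starts out empty for a freshly constructed corpus.
    self.assertEqual(len(self.corpus.entities), 0)
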
(opts, args) = getopt.getopt(sys.argv[1:], 'f:c:l:d:s:', options)

source_file = None
source_path = None
target_path = None
add_files = False
shuffle = False
language = config.LANGUAGE
datasource = config.DATASOURCE
pipeline_config = config.DEFAULT_PIPELINE

for opt, val in opts:
    if opt in ('-l', '--language'):
        language = val
    if opt in ('-d', '--data'):
        datasource = val
    if opt in ('-f', '--filelist'):
        source_file = val
    if opt in ('-s', '--source'):
        source_path = val
    if opt in ('-c', '--corpus'):
        target_path = val
    if opt == '--shuffle':
        shuffle = True
    if opt == '--add':
        add_files = True

if datasource == 'cnki':
    language = 'cn'
if language == 'cn':
    pipeline_config = config.DEFAULT_PIPELINE_CN

if add_files:
    add_files_to_corpus(target_path, source_file)
else:
    corpus.Corpus(language, datasource, source_file, source_path, target_path,
                  pipeline_config, shuffle)

            else:
                ending = '\n\t'
                paragraph_size = random.randint(min_paragraph_size,
                                                max_paragraph_size)
            yield self.gen_sentence() + ending
        except ValueError:
            print 'Bad Values of Arguments.'


if __name__ == "__main__":
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    template = '{:-^50}'
    REGEX = r"[mMdD][rs]s?\. ?[\w,]+|[\w]+'?[\w,]+|[\.!\?:]"
    END_CHARS = '.?!'
    corpus = corpus.Corpus(REGEX, END_CHARS)

    INPUT_FILE_NAME = sys.argv[1]
    log = 'Reading from {}'.format(INPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()
    corpus.load(INPUT_FILE_NAME)
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))

    OUTPUT_FILE_NAME = sys.argv[2]
    SENTENCES_COUNT = int(sys.argv[3])
    log = 'Writing to {}'.format(OUTPUT_FILE_NAME)
    logging.info(template.format(log))

            if line.strip() != '':
                yield line

    def call(self):
        while True:
            stream = self._request()
            while True:
                try:
                    streaming_response = json.loads(stream.next())
                    self.db.append(streaming_response)
                except StopIteration as e:
                    print(e.message)
                    time.sleep(10)
                    break
                except (KeyError, ValueError):
                    pass


if __name__ == '__main__':
    hankaku_all = re.compile(r"^[!-~]+$")
    import sys

    db = corpus.Corpus(database='corpus', collection=sys.argv[1])
    client = StreamingAPI(db_name='corpus', coll_name='twitter')