def __init__(self, encode_model_path, decode_model_path):
    self.epath = encode_model_path
    self.dpath = decode_model_path
    # select the device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # load the corpus
    self.cp = corpus.Corpus('valid.en-zh.zh.sgm', 'valid.en-zh.en.sgm',
                            'zh_dictionary.pth', 'en_dictionary.pth')
    # initialize hyperparameters; for now the encoder and decoder share the
    # same embedding and hidden-layer sizes
    self.embedding_size = 2000
    self.hidden_dim = 500
    self.learning_rate = 0.0002
    self.batch_size = 10
    # initialize the models
    self.encode_network = None
    self.decode_network = None
    self.model_init()
    # define the optimizers and the loss function
    self.encode_optim = torch.optim.SGD(self.encode_network.parameters(),
                                        lr=self.learning_rate)
    self.decode_optim = torch.optim.SGD(self.decode_network.parameters(),
                                        lr=self.learning_rate)
    self.criterion = torch.nn.CrossEntropyLoss()
    # get sentence iterators for both languages
    self.zh = self.cp.sentence_iterator('zh')
    self.en = self.cp.sentence_iterator('en')
    # define the encoder's initial ht and ct
    self.h0 = torch.zeros(1, 1, self.hidden_dim).to(self.device)
    self.c0 = torch.zeros(1, 1, self.hidden_dim).to(self.device)

def add_voice(self):
    # creates a new voice with no name and an empty tree of corpora
    new_voice = voice.Voice({})
    texts = os.listdir('texts')
    add_another_corpus = ''
    while add_another_corpus != 'n':
        for i in range(len(texts)):
            print("%s %s" % (i + 1, texts[i]))
        choice = input('Enter the number of the corpus you want to load:\n')
        corpus_name = texts[int(choice) - 1]
        path = 'texts/%s' % corpus_name
        f = open(path, 'r')
        text = f.read()
        corpus_weight_prompt = 'Enter the weight for %s:\n' % corpus_name
        corpus_weight = float(input(corpus_weight_prompt))
        new_voice.add_corpus(corpus.Corpus(text, corpus_name), corpus_weight)
        texts.remove(corpus_name)
        add_another_corpus = input('Add another corpus to this voice? y/n\n')
    voicename = input('Name this voice:\n')
    new_voice.name = voicename
    new_voice.normalize_weights()
    self.voices[voicename] = new_voice

def main():
    # doc = "/home/daniel/data/Ciclo6/Tesis2/stompol-tweets-train-tagged.xml"
    doc = "/home/daniel/data/Ciclo6/Tesis2/xmlSampleFile.xml"
    # doc = "/home/daniel/data/Ciclo6/Tesis2/xmlSampleFile2.xml"
    # doc = "/home/daniel/data/Ciclo6/Tesis2/xmlStandardFile.xml"
    xmlparser = XML.XmlParser(doc)
    tweets = xmlparser.root
    corpus = COR.Corpus()
    for tweet in tweets:
        tweetEntities = xmlparser.extractEntity(tweet)
        corpus.addNewEntities(tweetEntities)
        for tweetEntity in tweetEntities:
            entity = corpus.getEntity(tweetEntity)
            entity.addReview(tweet)
    lsa = LAT.LSA(tweets)
    lsa.singularValueDecomposition()
    lsa.reduceDimension()
    lsa.reconstructMatrix()
    corpus.assignSemanticSimilarity(lsa)
    sentiStrength = SENSTR.sentiStrength()
    corpus.assignPolaritySimilarity(sentiStrength)
    for entity in corpus.entities:
        entity.obtainLeaders()
        entity.obtainCommunities()
        entity.assignOrder()
        entity.fullParsing()
        print(entity.generateSummary())
        print()

def init():
    filepath = '/home/hr/Scripts/python/markov_chains/elliot/test_corpus.txt'
    while True:
        r = input('\n[ENTER]: Continue [Q]: Quit \n').lower()
        if r == 'q':
            break
        else:
            order = int(input('Enter n-gram order: '))
            num_sentences = 5
            c = corpus.Corpus(filepath, order)
            words = c.get_corpus()
            tokens = c.tokenize(words)
            counts = parse_corpus.probabilities(tokens[3])
            i = 0
            while i <= num_sentences:
                generate(tokens, counts, order)
                i += 1

def __init__(self, db_name, coll_name):
    self.base_url = 'http://stream.twitter.com/1/statuses/sample.json'
    self.config = Pit.get('twitter_api_gardenhose')
    self.user = self.config['user']
    self.passwd = self.config['passwd']
    self.db = corpus.Corpus(database=db_name, collection=coll_name)

def load_voices_from_transcript(self):
    transcripts = os.listdir('texts/transcripts')
    for i in range(len(transcripts)):
        print("%s %s" % (i + 1, transcripts[i]))
    choice = input('Enter the number of the transcript you want to load:\n')
    transcript_name = transcripts[int(choice) - 1]
    number = int(input('Enter the number of voices to load:\n'))
    for charname, size in self.biggest_characters(transcript_name, number):
        print(charname)
        path = 'texts/transcripts/%s/%s' % (transcript_name, charname)
        source_text = open(path).read()
        corpus_name = charname
        weighted_corpora = {}
        weighted_corpora[charname] = [corpus.Corpus(source_text, corpus_name), 1]
        self.voices[charname] = voice.Voice(weighted_corpora, charname)

def emailsByTimes():
    timeAndEmails = parseSQL.getEmailsAndTimes(HILLARY)
    timesSentence = {}
    for key in timeAndEmails.keys():
        print "HOUR", str(key) + ":00"
        words_dict = utils.basic_count(timeAndEmails[key])
        print "number of distinct words", len(words_dict)
        print "top 5 words", sorted(words_dict, key=words_dict.get, reverse=True)[:10]
        n = ngram.NGram(2, 'word', words_dict)
        c = corpus.Corpus('../output/hillary/times/' + str(key) + 'Hour.txt')
        timesSentence[key] = c.numtokens / c.numsents
        print c.display_stats()
        print n.display_stats()
    print timesSentence

def pickle_corpus():
    """
    Reads in the entire json Queen Victoria Correspondence Corpus, stores the
    data in Letter and Corpus objects, and pickles the objects. Only call this
    function if the corpus has changed.
    :return: None
    """
    letter_corp = corpus.Corpus([])
    with open('letters.json', encoding='utf8') as f:
        letter_list = json.load(f)
    for i, l in enumerate(letter_list):
        l_obj = letter.Letter(l['writer'], l['addressee'], l['year'],
                              l['language'], l['text'], i)
        letter_corp.add_letter(l_obj)
        letter_corp.add_writer(l['writer'].lower())
        letter_corp.add_addressee(l['addressee'].lower())
        letter_corp.add_year(l['year'])
    letter_corp.compute_total_word_count()
    letter_corp.sort_years()
    with open('corpus.pickle', 'wb') as f:
        pickle.dump(letter_corp, f)

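A minimal companion sketch for reading the pickled corpus back in later runs. The helper name load_corpus is hypothetical; it only assumes the corpus.pickle file written by pickle_corpus() above.

import pickle

def load_corpus():
    # Hypothetical helper: load the Corpus object pickled by pickle_corpus().
    # The corpus and letter modules must be importable for unpickling to resolve
    # the Corpus and Letter classes.
    with open('corpus.pickle', 'rb') as f:
        return pickle.load(f)
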
parser.add_argument('--gpu', type=int, default=0, help='gpu to use')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)

if not args.lm1b:
    with doing('Loading data'):
        corpus = corpus.Corpus(args.data, args.dic)
        ntokens = len(corpus.dictionary.idx2word)
        cutoffs = args.cutoffs + [ntokens]
else:
    ###########################################################################
    # Load data
    ###########################################################################
    # Torch
    word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy()
    mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
    print("load word frequency mapping - complete")
    ntokens = len(word_freq)
    nsampled = 8192

def __init__(self):
    self.window = tkinter.Tk()
    self.index = pickle.load(open('inverted_index.pkl', "rb"))
    self.corpus = corpus.Corpus()

import sys
sys.path.append('..')

import corpus as cp
import distributed_representation as dr
import utility

# data download
dl = utility.data_loader()
dl.dataload()

corpus = cp.Corpus(data='data/simple-examples/data/ptb.train.txt',
                   mode="l",
                   max_vocabulary_size=5000,
                   max_line=10,
                   minimum_freq=5)

window_size = 1
embedding_dims = 100
batch_size = 128

import time
start = time.time()

dr_sgns = dr.DistributedRepresentation(corpus, embedding_dims, window_size,
                                       batch_size, model_type="skip-gram",
                                       ns=0,

        cur_loss = total_loss / interval
        elapsed = time.time() - start_time
        print('| epoch {:3d} | {:5d}/{:5d} batches | l_rate {:02.2f} | ms/batch {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(
                  epoch, batch, len(train_data) // args.bptt, l_rate,
                  elapsed * 1000 / interval, cur_loss, math.exp(cur_loss)))
        total_loss = 0
        start_time = time.time()
    return loss


if __name__ == "__main__":
    args = args_parse()
    corpus = corpus.Corpus(args.data)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size)  # size (total_len // bsz, bsz)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    # Build the model
    interval = 200  # interval to report
    ntokens = len(corpus.dictionary)  # 10000
    model = model.RNNModel(ntokens, args.embed_size, args.n_hid, args.n_layers, args.dropout)
    print(model)
    criterion = nn.CrossEntropyLoss()

# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import yaml
import io
from googletrans import Translator
import corpus
import os

translator = Translator()
corpus = corpus.Corpus()

file_name = '/home/kjh948/chatterbot/chatterbot-corpus/chatterbot_corpus/data/english/ai.yml'
file_name2 = 'ai_ko.yml'
dotted_path = '/home/kjh948/workspace/chatterbot/chatterbot-corpus/chatterbot_corpus/data/english/'

list_corpus_files = corpus.list_corpus_files(dotted_path)

for yml_file in list_corpus_files:
    with io.open(yml_file, encoding='utf-8') as data_file:
        src = yaml.load(data_file)
    print('loading ' + yml_file)
    for x in range(0, len(src['conversations'])):
        print(str(x) + ' in ' + str(len(src['conversations'])))
        for t in range(0, len(src['conversations'][x])):
            try:

iterations_test = 100
cvgThreshold = 0.1
training_ratio = 0.9
eval_every = 100
no_below = 2000
no_above = 1.0

dataset_dir = 'datasets/'
dataset_file = 'abcnews-date-text.csv'
result_dir = 'results/'
filename_pattern = 'result_dataset={0}_k={1}_V={2}_iter={3}.csv'

start_time = time.time()

corpus = cor.Corpus(os.path.join(dirname, dataset_dir + dataset_file))
vocabulary = vb.Vocabulary(corpus.get_docs())
print("Number of documents {0}".format(vocabulary.docs_num))
print("Number of words {0}".format(len(vocabulary.word_id)))
vocabulary.filter(no_below=no_below, no_above=no_above)
print("Number of words after filtering {0}".format(len(vocabulary.word_id)))

docs = []
word_no = 0
for i, doc in enumerate(corpus.get_docs()):
    bow = vocabulary.doc_to_bow(doc)
    for tupel in bow:
        word_no += tupel[1]
    docs.append(bow)

def setUp(self):
    self.corpus = COR.Corpus()

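A minimal follow-up test sketch under the same setUp. It assumes, as the main() example above suggests, that COR.Corpus exposes an entities collection and that a freshly constructed corpus starts with no entities; both are assumptions, not confirmed by the source.

def test_new_corpus_has_no_entities(self):
    # Assumes Corpus.entities exists (see the iteration over corpus.entities in main())
    # and starts out empty for a freshly constructed corpus.
    self.assertEqual(len(self.corpus.entities), 0)
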
(opts, args) = getopt.getopt(sys.argv[1:], 'f:c:l:d:s:', options)

source_file = None
source_path = None
target_path = None
add_files = False
shuffle = False
language = config.LANGUAGE
datasource = config.DATASOURCE
pipeline_config = config.DEFAULT_PIPELINE

for opt, val in opts:
    if opt in ('-l', '--language'):
        language = val
    if opt in ('-d', '--data'):
        datasource = val
    if opt in ('-f', '--filelist'):
        source_file = val
    if opt in ('-s', '--source'):
        source_path = val
    if opt in ('-c', '--corpus'):
        target_path = val
    if opt == '--shuffle':
        shuffle = True
    if opt == '--add':
        add_files = True

if datasource == 'cnki':
    language = 'cn'
if language == 'cn':
    pipeline_config = config.DEFAULT_PIPELINE_CN

if add_files:
    add_files_to_corpus(target_path, source_file)
else:
    corpus.Corpus(language, datasource, source_file, source_path, target_path,
                  pipeline_config, shuffle)

            else:
                ending = '\n\t'
                paragraph_size = random.randint(min_paragraph_size,
                                                max_paragraph_size)
            yield self.gen_sentence() + ending
        except ValueError:
            print 'Bad Values of Arguments.'


if __name__ == "__main__":
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    template = '{:-^50}'
    REGEX = r"[mMdD][rs]s?\. ?[\w,]+|[\w]+'?[\w,]+|[\.!\?:]"
    END_CHARS = '.?!'
    corpus = corpus.Corpus(REGEX, END_CHARS)

    INPUT_FILE_NAME = sys.argv[1]
    log = 'Reading from {}'.format(INPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()
    corpus.load(INPUT_FILE_NAME)
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))

    OUTPUT_FILE_NAME = sys.argv[2]
    SENTENCES_COUNT = int(sys.argv[3])
    log = 'Writing to {}'.format(OUTPUT_FILE_NAME)
    logging.info(template.format(log))

            if line.strip() != '':
                yield line

    def call(self):
        while True:
            stream = self._request()
            while True:
                try:
                    streaming_response = json.loads(stream.next())
                    self.db.append(streaming_response)
                except StopIteration as e:
                    print(e.message)
                    time.sleep(10)
                    break
                except (KeyError, ValueError):
                    pass


if __name__ == '__main__':
    hankaku_all = re.compile(r"^[!-~]+$")
    import sys

    db = corpus.Corpus(database='corpus', collection=sys.argv[1])
    client = StreamingAPI(db_name='corpus', coll_name='twitter')