Example #1
def main(args):

    data = json.load(open(args.input_refexps_json, 'r'))
    max_length = 0
    all_refexps = []
    for keys in data:
        for ref_id in data[keys]:
            all_refexps.append(data[keys][ref_id])

    for r in all_refexps:
        t = tokenize(
            r,
            punct_to_keep=[',', ';'],
            punct_to_remove=['?', '.']
        )
        if len(t) > max_length:
            max_length = len(t)

    refexp_token_to_idx = build_vocab(
        all_refexps,
        punct_to_keep=[',', ';'],
        punct_to_remove=['?', '.']
    )

    with open(args.output_vocab_json, 'w') as f:
        json.dump(refexp_token_to_idx, f)

    with h5py.File(args.output_refexps_h5df, 'w') as f:
        for keys in data:
            one_image_refexps = []
            # img_name = keys.split('.')[0]
            one_image_refexps_to_idx = []
            img_all_refexps = data[keys]

            for ref_id in img_all_refexps:
                refexp = img_all_refexps[ref_id]
                one_image_refexps.append(refexp)

            for refexps in one_image_refexps:
                tokens = tokenize(refexps, punct_to_remove=['?', '.'], punct_to_keep=[';', ','])
                refexps_idx = encode(tokens, refexp_token_to_idx)
                one_image_refexps_to_idx.append(refexps_idx)

            for refexp_ in one_image_refexps_to_idx:
                num_null = max_length - len(refexp_)
                if num_null > 0:
                    refexp_ += [refexp_token_to_idx['<NULL>']]*num_null

            one_image_refexps_to_idx_numpy = np.asarray(one_image_refexps_to_idx, dtype=np.int32)

            f.create_dataset(keys, data=one_image_refexps_to_idx_numpy)
Example #2
def dataset(input_directory, batch_size, emb_dict, max_time):
    file_gen = os.listdir(input_directory)
    random.shuffle(file_gen)
    batch_num = 0
    inputs = np.zeros((batch_size, max_time))
    targets = np.zeros((batch_size, 2))
    sequence_length = np.zeros((batch_size))
    for name in file_gen:
        file_path = input_directory + '/' + name
        f = open(file_path)
        try:
            w = preprocess.tokenize(f.read())
        except UnicodeDecodeError:
            print('Encountered unicode error, continuing')
            continue
        rating = int(re.sub(r'_|\.txt', ' ', name).split()[1])
        targets[batch_num][0:2] = [0, 1] if rating < 5 else [1, 0]
        sequence_length[batch_num] = len(w)
        for time_num in range(min(max_time, len(w))):
            inputs[batch_num][time_num] = emb_dict.get(w[time_num], 0)

        batch_num += 1
        if batch_num == batch_size:
            yield inputs, targets, sequence_length
            batch_num = 0
            inputs = np.zeros((batch_size, max_time))
            targets = np.zeros((batch_size, 2))
            sequence_length = np.zeros((batch_size))
Example #3
def titile_tokens():
    cnt = 0
    try:
        sw_path = os.path.join(cwd, "text/news_stopwords.txt")
        sw_list = prep.get_stopwords(sw_path)
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.token_war.find({})

        for i in target:
            if "title_keywords" not in i.keys() or not i['title_keywords']:
                cnt += 1
                title_keywords = []
                if len(i['keywords']) > 1:  # the article has keywords
                    title_keywords = i['title'] + '\t\t' + ' '.join(
                        i['keywords'])
                else:
                    title_keywords = i['title']
                tokens_string = ' '.join(
                    prep.tokenize(title_keywords,
                                  sw_list=sw_list,
                                  language='CN'))
                db.token_war.update_one(
                    {'number': i['number']},
                    {'$set': {
                        'title_keywords': tokens_string
                    }})
        print "%d title tokenize completed!" % cnt

        conn.close()
    except Exception as e:
        print "From:art_tokenize:\t\nUnexpect Error: {}".format(e)
Example #4
def main(args):
    '''Example script to run prediction on a pre-trained model with sample lyrics dataset'''
    c = Configuration()

    if args.artist:
        c.set_artist(args.artist)

    print("Artist:", c.artist.replace("_", " ").title())

    lyrics_dataset = pp.read_lyrics_files(c.path)

    dictionary = torch.load(open(c.dictionary_path, 'rb'))

    print("Vocabulary size: ", len(dictionary))
    print("----------------------------")

    tokenized = pp.tokenize(lyrics_dataset)

    seed_lyrics = generate_seed_lyrics(tokenized, c.window_size, args.censored)

    model = torch.load(open(c.model_path, 'rb'))

    predicted_lyrics = predict(model,
                               seed_lyrics,
                               dictionary,
                               num_words=args.words,
                               topk=c.predict_topk)

    predicted_lyrics = postprocess(predicted_lyrics, args.censored)

    print(predicted_lyrics)
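
The script reads args.artist, args.words and args.censored, but the argument parsing itself is not shown. A minimal sketch of a compatible entry point follows; the flag names and defaults are assumptions inferred from the attributes used in main(), not the project's actual CLI.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate lyrics with a pre-trained model')
    # All flag names and defaults below are assumptions based on the attributes read in main().
    parser.add_argument('--artist', default=None,
                        help='artist whose configuration to load')
    parser.add_argument('--words', type=int, default=100,
                        help='number of words to generate')
    parser.add_argument('--censored', action='store_true',
                        help='filter explicit words from the output')
    main(parser.parse_args())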
Example #5
def art_tokenize():
    cnt = 0
    new = []
    point = datetime(2018, 1, 1)
    try:
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.war.find({'date': {'$gt': point}})
        for i in target:
            if db.token_war.find({'number': i['number']}).count() < 1:
                tokens_list = prep.tokenize(i['content'],
                                            sw_list=[],
                                            language='CN')
                del i['content'], i['comments'], i['commenturl'], i[
                    'tie_count']
                i['tokens'] = ' '.join(tokens_list)
                new.append(i)
                cnt += 1
        if new:
            # insert in bulk
            db.token_war.insert_many(new)
            conn.close()
            print "%d articles tokenized!" % cnt
            return "%d articles tokenized!" % cnt
        else:
            conn.close()
            print "No new articles require tokenizing!"
            return "No new articles require tokenizing!"
    except Exception as e:
        print "From:art_tokenize:\t\nUnexpected Error: {}".format(e)
Example #6
def prepare_inputs(token_mapping, w2v_W, w2v_U, sentences):
    """
    Converts a 2-D list of sentences (list of list of words)
    to one-hot encoded tokens of shape [n_sentences, n_words, len(token_mapping), 1].
    """
    tokens = [tokenize(token_mapping, sentence) for sentence in sentences]

    depth = len(token_mapping)
    one_hot_tokens = []
    for sentence in tokens:
        one_hot_sentence = []
        for i, token in enumerate(sentence):
            if token != token_mapping['#UNK#']:
                one_hot_sentence.append(one_hot_encode(token, depth))
            else:
                if i <= 2:
                    context_tokens = sentence[:i] + sentence[i + 1:i + 3]
                else:
                    context_tokens = sentence[i - 2:i] + sentence[i + 1:i + 3]
                context_one_hot = [
                    one_hot_encode(token, depth) for token in context_tokens
                ]
                context_mean = np.mean(np.asarray(context_one_hot), axis=0)
                one_hot_sentence.append(context_mean)
        one_hot_tokens.append(one_hot_sentence)

    one_hot_tokens = [np.asarray(ls) for ls in one_hot_tokens]
    vec_tokens = [
        word2vec(w2v_W, w2v_U, sentence)
        for sentence in tqdm(one_hot_tokens, desc='Vectorizing tokens')
    ]
    return vec_tokens
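
A minimal, self-contained sketch of the unknown-token fallback used above: an out-of-vocabulary token is replaced by the mean of the one-hot vectors of up to two neighbouring tokens on each side. The toy one_hot_encode and the unk_context_mean helper below are assumptions standing in for the project's own code, which is not shown in this example.

import numpy as np

def one_hot_encode(token, depth):
    # Toy stand-in: column vector of shape [depth, 1] with a 1 at the token index.
    vec = np.zeros((depth, 1), dtype=np.float32)
    vec[token, 0] = 1.0
    return vec

def unk_context_mean(sentence, i, depth):
    # Average the one-hot vectors of up to two neighbours on each side of position i,
    # mirroring the slicing logic in prepare_inputs above.
    context_tokens = sentence[max(0, i - 2):i] + sentence[i + 1:i + 3]
    context_one_hot = [one_hot_encode(t, depth) for t in context_tokens]
    return np.mean(np.asarray(context_one_hot), axis=0)

# Token index 3 plays the role of '#UNK#' here, at position 2 of a 5-token sentence, vocabulary size 6.
print(unk_context_mean([0, 1, 3, 2, 4], 2, depth=6).shape)  # (6, 1)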
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', default='data')
    args = parser.parse_args()

    model_file = os.path.join(args.dir, 'model.pickle')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    label_encoder_file = os.path.join(args.dir, 'label_encoder.pickle')
    with open(label_encoder_file, 'rb') as f:
        label_encoder = pickle.load(f)

    vectorizer_file = os.path.join(args.dir, 'vectorizer.pickle')
    with open(vectorizer_file, 'rb') as f:
        vectorizer = pickle.load(f)

    tagger = MeCab.Tagger('-Owakati')

    while True:
        text = input()
        tokenized = preprocess.tokenize(tagger, text)

        x = vectorizer.transform([tokenized])
        y = model.predict(x)
        label = label_encoder.inverse_transform(y)[0]
        print('Tokenized:', tokenized)
        print('Label:', label)
Example #8
def categorize_this_happy_moment(happy_moment):
    input = preprocess.tokenize(happy_moment)
    input = lstm_convert_data([input], vocab)
    input = np.array(input)
    input = sequence.pad_sequences(input, maxlen=max_review_length)
    prediction = model.predict(input)
    max_index = np.argmax(np.array(prediction))
    print(predict_category.get(max_index))
Example #9
 def __init__(self, review_filename):
     self.review_rating = int(
         re.search(r'_\d+', review_filename).group()[1:])
     target_list = [0, 1] if self.review_rating > 5 else [1, 0]
     self.sentiment = 'pos' if self.review_rating > 5 else 'neg'
     review_file = open(review_filename).read()
     self.tokens = preprocess.tokenize(review_file)
     self.targets = np.array([target_list])
     self.length = len(self.tokens)
Example #10
def clean(s):
    s = preprocess(s)
    tokenize_s = tokenize(s)
    # Filter in a single pass; removing items from a list while iterating over it skips elements.
    tokenize_s = [item for item in tokenize_s if item.isalpha() and len(item) > 1]
    return ' '.join(tokenize_s)
Example #11
def converse():
    while True:
        # Read a line from the user and print the model's generated response.
        user_input = input('>> ')
        sentence_array = preprocess.tokenize(user_input)

        print('Response:')
        print(get_response(sentence_array, 50))
        print()
Example #12
def send():
    userID = None
    if request.method == 'POST':
        userID = request.form['userID']
        keyword = request.form['keyword']
        print(type(userID))
        print(type(int(userID)))
        result = sameModel.recommendProducts(int(userID), 100)
        tmp = {}  # key = business_num, value = business_rating
        lda_list = []
        for line in result:
            tmp[line[1]] = line[2]
            lda_list.append(line[1])
        [bn, br] = openfile()
        name_list = []
        for key in lda_list:
            name_list.append(bn[key])
        Recommendation = name_list[:10]
        #LDA
        if len(request.form['keyword']) >= 1:
            rawList = tokenize(br, lda_list)

            stopped_result = stop_words(rawList)
            [corpus, dictionary] = doc_term_matrix(stopped_result)
            lsiList = lsi(corpus, dictionary)

            #word matching
            document = list()
            for i in range(0, 99):
                document.append(list())
            for i in range(0, 99):
                item = lsiList[i]
                for word in item:
                    document[i].append(','.join([str(word[0])]))
            model = gensim.models.Word2Vec(document, min_count=1)

            a = [0] * 100
            dic = {}
            for i in range(0, 99):
                business = document[i]
                for item in business:
                    a[i] = a[i] + model.similarity(keyword, item)
                dic[a[i]] = lda_list[i]
            #a = np.sort(a,axis=0)
            a = sorted(a, reverse=True)
            Recommendation = []
            #a = np.ndarray.tolist(a)
            for key2 in a[0:10]:
                print(key2)
                Recommendation.append(bn[dic[key2]])

        #Recommendation = json.dumps(result)
        return render_template('index.html', Recommendation=Recommendation)
    return render_template('index.html')
Example #13
def process(smi):
    smis = smi.strip()

    # Only include compounds that exclusively
    # use tokens the model can generate
    if any(tok not in model.vocab2id for tok in tokenize(smis)):
        return None

    # Standardize SMILES
    return [molvs.standardize_smiles(part) for part in smis.split('.')]
Example #14
def get_new_vec(model, docs, sw_path=default_sw_path, language="CN"):
    sw_list = get_stopwords(sw_path)
    new_doc_vec = []
    try:
        for doc in docs:
            tokens = tokenize(doc, sw_list, language=language)
            doc_vec = model.infer_vector(tokens)
            new_doc_vec.append(doc_vec)
        return new_doc_vec
    except Exception as e:
        print "From get_new_vec:\n\tUnexpect Error:{}".format(e)
Example #15
 def __iter__(self):
     sw_list = get_stopwords(self.sw_path)
     f = open(self.file_path)
     csv_reader = csv.reader(f, delimiter='\t')
     for i, line in enumerate(csv_reader):
         if i + 1 > self.lines:
             self.lines = i + 1
             self.label_list.append(line[0])  # get the doc label
         tag = "%s_%s" % (self.file_name, str(i))
         # print '---1'
         yield doc2vec.TaggedDocument(tokenize(line[1], sw_list, self.t),
                                      tags=[tag])
Example #16
def main(argv):
    emot_dic = {"anger":0, "joy":1, "sadness":2, "fear":3}

    x = tokenize(argv[0])

    emot = np.array([0, 0, 0, 0])
    emot[emot_dic[argv[1]]] = 1

    model = get_model("model.h5")

    out = model.predict([x.reshape((1, 50, 1)), emot.reshape((1, 4))])[0]

    print(argv[1], ":",  np.argmax(out))
Example #17
def predict(doc):
    predictions = {}
    predictions['input'] = doc
    doc = tokenize(doc).rstrip()
    predictions['tokenized'] = doc
    doc_vector = vectorize(doc, word_vec, tfidf)
    for method in METHODS:
        predictions[method] = {
            'rule': METHODS[method](doc),
            'model': models[method].predict([doc_vector])[0]
        }

    return predictions
Example #18
def BM25Similarity(Query, Passage, k1=1.5, b=0.75, delimiter=' '):
    
    global docIDFDict,avgDocLength

    # query_words= Query.strip().lower().split(delimiter)
    # passage_words = Passage.strip().lower().split(delimiter)
    query_words=tokenize(Query.strip().lower())
    passage_words = tokenize(Passage.strip().lower())
    passageLen = len(passage_words)
    docTF = {}
    for word in set(query_words):   #Find Term Frequency of all query unique words
        docTF[word] = passage_words.count(word)
    commonWords = set(query_words) & set(passage_words)
    tmp_score = []
    for word in commonWords:
        numer = docTF[word] * (k1 + 1)  # numerator of the BM25 term score
        denom = docTF[word] + k1 * (1 - b + b * passageLen / avgDocLength)  # denominator of the BM25 term score
        if word in docIDFDict:
            tmp_score.append(docIDFDict[word] * numer / denom)

    score = sum(tmp_score)
    return score
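
For reference, the per-term contribution computed in the loop above corresponds to the standard BM25 score, restricted (as in the code) to words shared by the query and the passage:

\[
\mathrm{score}(Q,P) \;=\; \sum_{w \in Q \cap P} \mathrm{IDF}(w)\,
\frac{\mathit{tf}_{w,P}\,(k_1 + 1)}{\mathit{tf}_{w,P} + k_1\!\left(1 - b + b\,\frac{|P|}{\mathit{avgdl}}\right)}
\]

where tf_{w,P} is docTF[word], |P| is passageLen, avgdl is the global avgDocLength, and words missing from docIDFDict contribute nothing.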
Example #19
def main(args):
    '''Example script to train a model on the sample lyrics dataset'''
    c = Configuration()

    if args.artist:
        c.set_artist(args.artist)

    print("Hyperparameters: ", c)
    print("Loading data from path: ", c.path)

    lyrics_dataset = pp.read_lyrics_files(c.path)

    tokenized = pp.tokenize(lyrics_dataset)

    x, y, dictionary = pp.preprocess(tokenized, c.window_size)

    training_data = DataLoader(list(zip(x, y)),
                               batch_size=c.train_batch_size,
                               shuffle=True)

    model = LyricPredictor(len(dictionary), c.output_size)

    print("Training model...")

    model, _, _ = train(model=model,
                        training_data=training_data,
                        num_epochs=c.num_epochs,
                        lr=c.lr,
                        grad_norm=c.grad_max_norm)

    print("Saving model: ", c.model_path)

    torch.save(model, c.model_path)

    print("Saving dictionary: ", c.dictionary_path)

    torch.save(dictionary, c.dictionary_path)

    print("Generating lyrics...")

    seed_lyrics = generate_seed_lyrics(tokenized, c.window_size, args.censored)

    predicted_lyrics = predict(model,
                               seed_lyrics,
                               dictionary,
                               num_words=args.words,
                               topk=c.predict_topk)

    predicted_lyrics = postprocess(predicted_lyrics, args.censored)

    print(predicted_lyrics)
Example #20
def prepare_inputs(token_mapping, sentences):
    """
    Converts a 2-D list of sentences (list of list of words) to 
        one-hot encoded tokens of shape 
        [n_sentences, n_words, len(token_mapping), 1].
    """
    tokens = [tokenize(token_mapping, sentence) for sentence in sentences]

    depth = len(token_mapping)
    one_hot_tokens = [[one_hot_encode(token, depth) for token in sentence]
                      for sentence in tokens]
    one_hot_tokens = [np.asarray(ls) for ls in one_hot_tokens
                      ]  # list of [n_words, len(token_mapping), 1]
    return one_hot_tokens
Example #21
def process_text_from_string(text):
	"""
		Process 1 text given as a string
		
		Args:
			text:	string containing all the text
			
		Returns:
			A row-like array of all the metrics extracted from the text
	"""
	from preprocess import tokenize
	sentences, words = tokenize(text)
	
	return get_info(text, sentences, words)
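
A hypothetical usage sketch for the helper above; the sample text is an assumption, and get_info together with the exact layout of the returned row are defined elsewhere in the project.

text = "This is a short sample. It has two sentences."
row = process_text_from_string(text)  # row-like array of metrics extracted from the text
print(row)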
Example #22
def get_predictions(query, model_name, magic_string):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)

    print('\n'.join([f"{map_[str(r[0])]}: {r[1]:.2f}" for r in weighted[:10]]))
    print()
Example #23
def build_corpus_dictionary():

    input_path = 'dataset/comments_array.json'
    json_array = preprocess.load_json_file(input_path)
    
    field_array = ['content']
    str_list = preprocess.extract_from_json(json_array, field_array)

    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    
    dictionary = corpora.Dictionary(texts)
    corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)

    import ipdb; ipdb.set_trace()
Example #24
def transform(text):
    text = text.lower()
    text = normalize_thai_number(text)
    text = unescape_html(text)
    text = remove_markup_tag(text)
    text = normalize_link(text)
    text = normalize_mention(text)
    text = normalize_email(text)
    text = normalize_laugh(text)
    text = normalize_number(text, place_holder='')
    text = normalize_emoji(text)
    hashtags = extract_hashtag(text)
    text = normalize_hashtag(text, place_holder='')
    tokens = tokenize(text, stopwords=None, punctuation=punctuation)
    tokens = replace_with_actual_hashtag(tokens, hashtags)
    
    return tokens
Example #25
def main():

	input_path = 'dataset/taipei_city.json'
	json_array = preprocess.load_json_file(input_path)

	field_array = ['content']
	str_list, answer = preprocess.extract_from_json_with_answer(json_array['data'], field_array)

	texts = preprocess.tokenize(str_list)
	removed_texts = preprocess.remove_stop_words(texts)
    
	#dictionary = pickle.load(open('dictionary.obj', 'rb'))
	dictionary = corpora.Dictionary(removed_texts)
	data_corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)

	#corpus = pickle.load(open('corpus.obj', 'rb'))
	
	result_table = pd.DataFrame()

	# preprocess with Tfidf model
	params = {"corpus": data_corpus}
	X, y = convert_to_X_y(TfidfModel, params, data_corpus, answer)
	result_table = train_with_dummy(result_table, X, y, 'tfidf')
	result_table = train_with_random_forest(result_table, X, y, 'tfidf')
	result_table = train_with_logistic_regression(result_table, X, y, 'tfidf')

	'''
	# preprocess with lda model
	for num_topics in [10, 50, 100, 150, 200]:
		params = {"corpus": data_corpus, "num_topics": num_topics}
		X, y = convert_to_X_y(LdaModel, params, data_corpus, answer)
		result_table = train_with_dummy(result_table, X, y, 'lda_'+str(params['num_topics']))
		result_table = train_with_random_forest(result_table, X, y, 'lda_'+str(params['num_topics']))
		result_table = train_with_logistic_regression(result_table, X, y, 'lda_'+str(params['num_topics']))
	
	# preprocess with lsi model
	for num_topics in [10, 50, 100, 150, 200]:
		params = {"corpus": data_corpus, "num_topics": num_topics}
		X, y = convert_to_X_y(LsiModel, params, data_corpus, answer)
		result_table = train_with_dummy(result_table, X, y, 'lsi_'+str(params['num_topics']))
		result_table = train_with_random_forest(result_table, X, y, 'lsi_'+str(params['num_topics']))
		result_table = train_with_logistic_regression(result_table, X, y, 'lsi_'+str(params['num_topics']))

	'''
	output_file = sys.argv[1]
	result_table.to_csv(output_file, sep='\t')
Example #26
def main(argv):
    emot_dic = {"anger": 0, "joy": 1, "sadness": 2, "fear": 3}

    model = get_model("model.h5")

    for s in ["anger", "joy", "sadness", "fear"]:

        df = pd.read_csv(argv[0],
                         sep='\t',
                         header=None,
                         encoding='utf-8',
                         quoting=3)
        df.columns = ['id', 'text', 'polarity', 'class']

        df = df[df["polarity"] == s]

        test = np.array(df['text'])
        test_type = np.array(df["polarity"])

        X = []
        for x in test:
            X.append(tokenize(x))
        X = np.array(X)

        emot = np.zeros((len(test), 4))
        for x in range(len(test_type)):
            emot[x, emot_dic[test_type[x]]] = 1

        out = model.predict(
            [X.reshape((len(test), 50, 1)),
             emot.reshape((len(test), 4))])

        y_ = np.array(df["class"])
        y = np.array([int(x[0]) for x in y_])

        acc = np.count_nonzero(y == out.argmax(axis=1)) / float(
            out.argmax(axis=1).shape[0])

        print(s, acc)

        df["class"] = out.argmax(axis=1)

        df.to_csv("EI-oc_en_" + s + "_pred.txt",
                  sep='\t',
                  header=None,
                  index=None)
Example #27
    def setup(self):
        corpus = self._create_corpus()
        self.tokens = pp.tokenize(corpus)

        sequences = []
        for line in corpus:
            token_list = self.tokens.texts_to_sequences([line])[0]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i+1]
                sequences.append(n_gram_sequence)

        padder = tf.keras.preprocessing.sequence.pad_sequences
        sequences = np.array(padder(sequences, padding='pre'))
        input_sequences, labels = sequences[:,:-1], sequences[:,-1]
        total_words = len(self.tokens.word_index) + 1
        one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

        return input_sequences, one_hot_labels        
Example #28
def pure_model(choices,
               query,
               model,
               magic_string,
               model_name,
               return_weights=False):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    if not return_weights:
        return [choices[r[0]]['name'] for r in weighted[:10]]
    return [(choices[r[0]]['name'], r[1]) for r in weighted[:10]]
Example #29
def build_model(lines_num = -1):
	dataset = open('rawdata.csv', 'r')

	[bidList,rawList]=tokenize(dataset,lines_num)

	stopped_result = stop_words(rawList,lines_num)

	stem_result = stem(stopped_result,lines_num)

	[corpus,dictionary] = doc_term_matrix(stem_result)
	# it seems like the words make more sense without stemming; still working on it
	
	################
	####  LDA  #####
	################
	ldaList = lda(corpus, dictionary,lines_num)
	
	print("load data...")
	
	with open('outfile','wb') as fp:
		pickle.dump(ldaList,fp)
Example #30
def tokenize():
    content = request.args.get('content', None, type=str)

    print "From view.py tokenize:%s" % content
    sw_list = prep.get_stopwords(
        "/home/skipper/study/python/project/text/news_stopwords.txt")
    tokens = prep.tokenize(content, sw_list, language="CN")
    for i in tokens:
        print i
    tokens_string = '/'.join(tokens)
    print tokens_string
    pos_tag, pos_string = prep.pos_test(tokens)

    print "From view.py tokenize: %s\n%s\n%s\n" % (tokens_string, pos_tag,
                                                   pos_string)
    detail = {}
    detail['tokens_string'] = tokens_string
    detail['pos_tag'] = pos_tag
    detail['pos_string'] = pos_string

    return jsonify(detail)
Example #31
def mixed_model(choices,
                query,
                model,
                magic_string,
                model_name,
                return_weights=False):
    names = [s['name'] for s in choices]
    fuzzy_results = process.extract(query, names, scorer=fuzz.ratio)
    fuzzy_sum = max(sum(r[1] for r in fuzzy_results), 0.001)
    fuzzy_matches_and_confidences = [(r[0], r[1] / fuzzy_sum)
                                     for r in fuzzy_results]

    # net
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    net_weighted = [(choices[r[0]]['name'], r[1]) for r in weighted]

    sorted_weighted = sorted(fuzzy_matches_and_confidences + net_weighted,
                             key=lambda e: e[1],
                             reverse=True)

    # build results list, unique
    results = []
    weights = []
    for r in sorted_weighted:
        if r[0] not in results:
            results.append(r[0])
            weights.append(r[1])
    if not return_weights:
        return results
    return list(zip(results, weights))
Example #32
def load_train_messages():
    filename = '.data/messages.txt'
    lines = []
    vocab = {}
    dics = []

    org_lines = None
    with open(filename) as f:
        itr = f.read().split('\n')
        itr = filter(lambda l: l, itr)
        org_lines = list(itr)

    for l in org_lines:
        text, d = preprocess.preprocess(l)
        dics.append(d)
        ws = preprocess.tokenize(text)
        lines.append(' '.join(ws))
        for w in ws:
            if w not in vocab:
                vocab[w] = len(vocab)

    return (lines, vocab, dics)
Example #33
def build_model(lines_num = -1):
	#dataset = open('rawdata.csv', 'r')
	


	rawList=tokenize(lines_num)

	print(rawList[0])
	stopped_result = stop_words(rawList,lines_num)

	#stem_result = stem(stopped_result,lines_num)

	[corpus,dictionary] = doc_term_matrix(stopped_result)
	# it seems like the words make more sense without stemming; still working on it
	
	################
	####  LDA  #####
	################
	#ldaList = lda(corpus, dictionary,lines_num)
	lsiList = lsi(corpus, dictionary,lines_num)


	save_file(lsiList)
Example #34
        h2 = m.output_step(x, gh, volatile='on')
        wid = np.argmax(F.softmax(m.W(h2)).data[0])
        result_words.append(id2wd.get(wid, UNKNOWN_WORD))
        loop += 1

    return ' '.join(result_words)


msg_lines, msg_vocab, msg_dics = helpers.load_train_messages()
msg_vocab[EOS] = len(msg_vocab)

cmd_lines, cmd_vocab, cmd_id2wd = helpers.load_train_commands()
id = len(cmd_vocab)
cmd_vocab[EOS] = id
cmd_id2wd[id] = EOS


print('> ', end='')
test_msg = input()  # e.g. 'ミーティングは来週の月曜日14時にやる' ("the meeting will be held next Monday at 14:00")
text, dic = preprocess.preprocess(test_msg)
print('dic = {0}'.format(dic))
ws = preprocess.tokenize(text)

demb = 100
for epoch in range(20):
    m = model.Attention(msg_vocab, cmd_vocab, demb)
    filename = ".dest/m2c-{}.model".format(epoch)
    serializers.load_npz(filename, m)

    print(epoch, ': ', translate(m, cmd_id2wd, ws))
Example #35
import math
import preprocess
from datetime import datetime

startTime = datetime.now()

# Parameter for tuning
k = 2

#Tokenize the queries and documents:
queryList = preprocess.tokenize('qrys.txt')
docList = preprocess.tokenize('quotes.txt')
 
# Compute average document length (used to compute the tf portion of the score further down)
sum = 0
for doc in docList:
	sum += len(doc)
meanDocLen = float(sum)/len(docList)

output = file('tfidf.top','w')

for query in queryList:
	queryNumber = query.pop(0)
	queryUnique = list(set(query))  # This removes duplicate words in the query
	
	# Get term frequencies for the query
	queryFreqs = {}
	for word in queryUnique:
		queryFreqs[word] = query.count(word)

	# Compute document frequency for the words in the query
Example #36
         header_index = int(line.split('\t')[0][2:])
         ##print header, header_index, text
     elif line.startswith("#P"):
         idx, text = line.split('\t')
         thisparaid, sentid = idx.split()
         thisparaid = int(thisparaid.split('\t')[0][2:])+1
         sentid = int(sentid.split('\t')[0][2:])+1
         #text = tokenize(text, lang)
         
         ##print text
         ##print text2naf(text, sentid, thisparaid)
         
         if lang == 'cmn':
             lines.append(text)
         else:
             tagged_text = pos_tag(tokenize(text,lang), lang)
             tokens, tags = zip(*tagged_text)
             tl, newwordid = text2naf(" ".join(tokens), sentid, 
                                      thisparaid,wordid)
             textlayer.append(tl)
             termlayer.append(term2naf(tokens, tags, wordid))
             wordid = newwordid
 
 if lang == 'cmn':
     tagged_texts = pos_tag(tokenize(lines,lang, batch=True), 
                            lang, batch=True)
     
     for tagged_text in tagged_texts:
         tokens, tags = zip(*tagged_text)
         tl, newwordid = text2naf(" ".join(tokens), sentid, 
                                  thisparaid,wordid)