示例#1
0
# Preprocessing for MCTest QA data: builds per-token word2vec features
# and pickles them for an LSTM model (fragment; continues beyond this chunk).
txtFilename = '../Data/mc500.train.txt'
txtFilename2 = '../Data/mc160.train.txt'
stopWordFile = '../Data/stopwords.txt'
# Alternative input splits kept for reference:
#'../Data/mc160.train.txt'
#"../Data/mc500.test.txt"
#"../Data/mc160.test.txt"
#"../Data/mc500.val.txt"
#"../Data/mc160.val.txt"
# Output pickle path for the preprocessed features.
dataPickle_name = "../Pickle/" + "mc500+mc160.train" + ".lstm.noStopWord.x24.pickle"
print dataPickle_name
data = []

# One-hot answer labels; parseAnsOneHot and ansFilename are defined elsewhere in the file.
ans = parseAnsOneHot(ansFilename)

print "Loading", txtFilename.split('/')[-1], "..."
txtList = readTXT(txtFilename)
# Stop-word list: first whitespace-separated token of each line.
# NOTE(review): file handle is never closed in this fragment.
stopWord = []
f = open(stopWordFile, 'r')
for line in f:
    stopWord.append(line.split()[0])
idxCounter = 0
print "Loading word2vec..."
# Pre-trained embeddings in text (non-binary) word2vec format; presumably
# 300-dimensional to match temp_vector below — confirm against wordvec_file.
word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False)
# Accumulate a 300-dim float32 vector per token sequence (loop truncated here).
for one in txtList:
    #print 'The shape of one before is '+str(np.shape(one))
    oneQ = []
    for entry in one:
        tempList = []
        temp_vector = np.zeros(300, dtype='float32')
        for word in entry:
            word = word.lower()
示例#2
0
# Preprocessing variant "mod1": averages word vectors with stop-word filtering
# (fragment; truncated mid loop). Expects txtFilename to be set earlier in the file.
stopWordFile = '../Data/stopwords.txt'
# Alternative input splits kept for reference:
#'../Data/mc160.train.txt'
#"../Data/mc500.test.txt"
#"../Data/mc160.test.txt"
#"../Data/mc500.val.txt"
#"../Data/mc160.val.txt"
# Derive the pickle name from the first two dot-separated parts of the input
# file name, e.g. "mc500.train" -> "../Pickle/mc500.train.mod1.pickle".
dataPickle_name = "../Pickle/"+txtFilename.split('/')[-1].split('.')[0]+"."+txtFilename.split('/')[-1].split('.')[1]+".mod1.pickle"
print dataPickle_name
print "Loading word2vec..."
# Pre-trained embeddings in text (non-binary) word2vec format; presumably
# 300-dimensional to match temp_vector below — confirm against wordvec_file.
word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False)
data = []

# One-hot answer labels; parseAnsOneHot and ansFilename are defined elsewhere in the file.
ans = parseAnsOneHot(ansFilename)

print "Loading",txtFilename.split('/')[-1],"..."
txtList = readTXT(txtFilename)
# Stop-word list: first whitespace-separated token of each line.
# NOTE(review): file handle is never closed in this fragment.
stopWord = []
f = open(stopWordFile, 'r')
for line in f:
    stopWord.append(line.split()[0])
# Debugger breakpoint left in — remove before production use.
pdb.set_trace()
idxCounter = 0
# Per-entry running sum (count + temp_vector) over non-stop-word tokens
# (loop truncated here; inner body uses tab indentation).
for one in txtList:
    #print 'The shape of one before is '+str(np.shape(one))
    oneQ = []
    for entry in one:
	count = 0.
	temp_vector = np.zeros(300,dtype='float32')
	for word in entry:
	    word = word.lower()
	    if word in stopWord:
示例#3
0
# Preprocessing variant "mod1" (reformatted copy of the fragment above;
# truncated mid loop). Expects txtFilename to be set earlier in the file.
# Alternative input splits kept for reference:
#'../Data/mc160.train.txt'
#"../Data/mc500.test.txt"
#"../Data/mc160.test.txt"
#"../Data/mc500.val.txt"
#"../Data/mc160.val.txt"
# Derive the pickle name from the first two dot-separated parts of the input
# file name, e.g. "mc500.train" -> "../Pickle/mc500.train.mod1.pickle".
dataPickle_name = "../Pickle/" + txtFilename.split('/')[-1].split(
    '.')[0] + "." + txtFilename.split('/')[-1].split('.')[1] + ".mod1.pickle"
print dataPickle_name
print "Loading word2vec..."
# Pre-trained embeddings in text (non-binary) word2vec format; presumably
# 300-dimensional to match temp_vector below — confirm against wordvec_file.
word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False)
data = []

# One-hot answer labels; parseAnsOneHot and ansFilename are defined elsewhere in the file.
ans = parseAnsOneHot(ansFilename)

print "Loading", txtFilename.split('/')[-1], "..."
txtList = readTXT(txtFilename)
# Stop-word list: first whitespace-separated token of each line.
# NOTE(review): file handle is never closed in this fragment.
stopWord = []
f = open(stopWordFile, 'r')
for line in f:
    stopWord.append(line.split()[0])
# Debugger breakpoint left in — remove before production use.
pdb.set_trace()
idxCounter = 0
# Per-entry running sum (count + temp_vector) over non-stop-word tokens
# (loop truncated here).
for one in txtList:
    #print 'The shape of one before is '+str(np.shape(one))
    oneQ = []
    for entry in one:
        count = 0.
        temp_vector = np.zeros(300, dtype='float32')
        for word in entry:
            word = word.lower()
            if word in stopWord:
示例#4
0
from cut1 import readTXT
import sys
import numpy as np
import collections

a = readTXT('../Data/mc500.train.txt')

# Tally word frequencies over the whole corpus and record the longest
# first-field length among every 4th record.
count = collections.Counter()
maxlen = 0
for record_idx, record in enumerate(a):
    # Every 4th record: count its first field and track its length.
    if record_idx % 4 == 0:
        count.update(record[0])
        maxlen = max(maxlen, len(record[0]))
    # The remaining fields of every record always contribute to the tally.
    for field in record[1:]:
        count.update(field)
示例#5
0
    exit(1)

# Command-line arguments: input text, output path, and a dictionary file
# (argv is expected to be bound earlier in the file, e.g. from sys.argv).
input_ = argv[1]
output = argv[2]
dictionary = argv[3]

# Reserved id for out-of-vocabulary words.
OOV = 1
# Map word -> integer id, ids assigned in file order starting at 1.
d_arr = {}
with open(dictionary, 'r') as d:
    word_id = 1
    for line in d:
        word = line.rstrip().lower()
        d_arr[word] = word_id
        word_id += 1

txt = readTXT(input_)
#with open(output, 'w') as fout:
for q_id in range(len(txt)):
    print q_id
    # Disabled token-lookup code below, kept as a (truncated) string literal;
    # it tried progressively stripped forms of each word against d_arr.
    '''
    for oneQ in txt:
	fout = open(output+str(q_id),"w")
        for word in oneQ[0]:
            word = word.lower()
            if word in d_arr:
                token = d_arr[word]
	    elif word.split(".")[0] in d_arr:
		token = d_arr[word.split(".")[0]]
	    elif word.split("'s")[0] in d_arr:
		token = d_arr[word.split("'s")[0]]
	    elif word.split(":")[0] in d_arr:
# Preprocessing for MCTest QA data (tab-indented copy of the first fragment;
# truncated mid loop).
txtFilename='../Data/mc500.train.txt'
txtFilename2='../Data/mc160.train.txt'
stopWordFile = '../Data/stopwords.txt'
# Alternative input splits kept for reference:
#'../Data/mc160.train.txt'
#"../Data/mc500.test.txt"
#"../Data/mc160.test.txt"
#"../Data/mc500.val.txt"
#"../Data/mc160.val.txt"
# Output pickle path for the preprocessed features.
dataPickle_name = "../Pickle/"+"mc500+mc160.train"+".lstm.noStopWord.x24.pickle"
print dataPickle_name
data = []

# One-hot answer labels; parseAnsOneHot and ansFilename are defined elsewhere in the file.
ans = parseAnsOneHot(ansFilename)

print "Loading",txtFilename.split('/')[-1],"..."
txtList = readTXT(txtFilename)
# Stop-word list: first whitespace-separated token of each line.
# NOTE(review): file handle is never closed in this fragment.
stopWord = []
f = open(stopWordFile, 'r')
for line in f:
    stopWord.append(line.split()[0])
idxCounter = 0
print "Loading word2vec..."
# Pre-trained embeddings in text (non-binary) word2vec format; presumably
# 300-dimensional to match temp_vector below — confirm against wordvec_file.
word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False)
# Build a 300-dim vector per token sequence (loop truncated here;
# inner body uses tab indentation).
for one in txtList:
    #print 'The shape of one before is '+str(np.shape(one))
    oneQ = []
    for entry in one:
	tempList = []
	temp_vector = np.zeros(300,dtype='float32')
	for word in entry:
	    word = word.lower()
示例#7
0
from cut1 import readTXT
import sys
import numpy as np
import collections

a = readTXT('../Data/mc500.train.txt')

# Corpus-wide word-frequency tally; also tracks the longest first field
# seen among every 4th record.
count = collections.Counter()
maxlen = 0
for idx, item in enumerate(a):
    if idx % 4 == 0:
        # Every 4th record: count the first field and keep the max length.
        count.update(item[0])
        if len(item[0]) > maxlen:
            maxlen = len(item[0])
    # All fields past the first are counted for every record.
    for part in item[1:]:
        count.update(part)