Example #1
    def __init__(self,
                 fName,
                 vocabF,
                 batchSize,
                 seqLen,
                 minContext,
                 rand=False):
        # Work with sequences one token longer than requested.
        self.seqLen = seqLen + 1
        self.minContext = minContext
        self.batchSize = batchSize
        self.rand = rand
        self.pos = 0

        self.vocab = utils.loadDict(vocabF)
        self.invVocab = utils.invertDict(self.vocab)

        # Each row contributes realLen new tokens; the remaining minContext
        # tokens are borrowed from the start of the following row.
        realLen = self.seqLen - minContext
        dat = open(fName).read()

        # Truncate to a whole number of rows, map the text through the
        # vocabulary, and reshape into rows of realLen tokens.
        dat = dat[:int(len(dat) / realLen) * realLen]
        dat = nlp.applyVocab(dat, self.vocab)
        dat = np.asarray(list(dat))
        dat = dat.reshape(-1, realLen)
        # Append the first minContext tokens of the next row to each row, so
        # consecutive sequences overlap by minContext tokens.
        dat = np.hstack((dat[:-1], dat[1:, :minContext]))

        # Keep only as many rows as fill complete batches.
        self.batches = int(np.floor(dat.shape[0] / batchSize))
        self.dat = dat[:(self.batches * batchSize)]
        self.m = len(self.dat.flat)
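The reshape/hstack pair above is what turns a flat token stream into overlapping training rows: each row holds realLen fresh tokens plus the first minContext tokens of the following row. A minimal NumPy sketch of just that windowing step, with made-up sizes (the variable names mirror the loader; nothing else is taken from it):

import numpy as np

seqLen, minContext = 6, 2            # after the +1 adjustment in the loader
realLen = seqLen - minContext        # fresh tokens contributed by each row

tokens = np.arange(24)               # stand-in for the vocabulary-mapped text
tokens = tokens[:len(tokens) // realLen * realLen]   # drop the ragged tail
rows = tokens.reshape(-1, realLen)

# Borrow the first minContext tokens of the next row, so consecutive
# sequences overlap and every kept row is seqLen tokens long.
windows = np.hstack((rows[:-1], rows[1:, :minContext]))
print(windows.shape)                 # (5, 6)
print(windows[0])                    # [0 1 2 3 4 5]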
Example #2

    def __init__(self, fName, vocabF, batchSize,
                 seqLen, minContext, rand=False, mode='ptb'):
        self.seqLen = seqLen + 1
        self.minContext = minContext
        self.batchSize = batchSize
        self.rand = rand
        self.pos = 0

        self.vocab = utils.loadDict(vocabF)
        self.invVocab = utils.invertDict(self.vocab)

        realLen = self.seqLen - minContext
        if mode == 'french':
            # Lowercase and strip ASCII punctuation, keeping apostrophes.
            dat = open(fName, encoding="ISO-8859-1").read()
            dat = dat.lower()
            remove = string.punctuation
            remove = remove.replace("'", "")
            pattern = r"[{}]".format(remove)
            dat = re.sub(pattern, "", dat)
            dat = dat[:int(len(dat) / realLen) * realLen]

        elif mode == 'english_words':
            dat = open(fName).read()
            dat = dat.lower()

        elif mode == 'ptb':
            dat = open(fName).read()
            dat = dat[:int(len(dat) / realLen) * realLen]

        dat = nlp.applyVocab(dat, self.vocab, mode)
        dat = np.asarray(list(dat))
        if mode == 'english_words':
            # Truncate to a whole number of rows after the vocabulary mapping.
            dat = dat[:int(len(dat) / realLen) * realLen]

        dat = dat.reshape(-1, realLen)
        dat = np.hstack((dat[:-1], dat[1:, :minContext]))

        self.batches = int(np.floor(dat.shape[0] / batchSize))
        self.dat = dat[:(self.batches * batchSize)]
        self.m = len(self.dat.flat)
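The 'french' branch's cleanup (lowercase, then strip punctuation while keeping apostrophes so contractions survive) is self-contained enough to try on its own. A small sketch of the same idea; re.escape is added here as a safety net and is not in the original:

import re
import string

text = "L'homme, dit-il: bonjour!"
text = text.lower()
remove = string.punctuation.replace("'", "")   # every ASCII punctuation mark except '
pattern = r"[{}]".format(re.escape(remove))
print(re.sub(pattern, "", text))               # l'homme ditil bonjour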
Example #3
def countsToFile():
    # One slot per possible histogram name, as indexed by name_hash().
    array1 = np.zeros(2 * (2 * ut.N)**2)
    array2 = np.zeros(2 * (2 * ut.N)**2)
    array = [""] * 2 * (2 * ut.N)**2
    data, tot_num = ut.loadDict(file)
    hists = ut.histFromDict(data)

    # Record, per histogram: its name, its number of entries, and the sum
    # of 2 * fun(...) over its entries.
    for h in hists:
        array[name_hash(h)] = h
        array1[name_hash(h)] = len(hists[h])
        array2[name_hash(h)] = sum(2 * fun(np.asarray(hists[h])))

    with open("hash.txt", "w") as f:
        for i in range(len(array)):
            f.write(str(i) + ":  " + str(array[i]) + "\n")

    with open("counts1.txt", "w") as f:
        for i in range(len(array1)):
            f.write(str(i) + ":  " + str(array1[i]) + "\n")

    with open("counts2.txt", "w") as f:
        for i in range(len(array2)):
            f.write(str(i) + ":  " + str(array2[i]) + "\n")
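The two counts files written above share one plain-text layout: an index, a colon, and a value per line. A small reader for that layout (the helper name countsFromFile is made up here and is not part of the original code):

import numpy as np

def countsFromFile(path):
    # Parse "index:  value" lines back into a NumPy array, assuming the
    # indices run 0..len-1 in order, as countsToFile writes them.
    values = []
    with open(path) as f:
        for line in f:
            _, value = line.split(":", 1)
            values.append(float(value))
    return np.asarray(values)

# counts1 = countsFromFile("counts1.txt")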
Example #4
def runAllHists():
    # data_list, tot_num = ut.loadAllFiles()
    # hists = ut.histFromDictList(data_list)

    data, tot_num = ut.loadDict(file)
    hists = ut.histFromDict(data)

    # saveToCache(data_list, "data")
    saveToCache(hists, "hists")
    saveToCache(tot_num, "tot_num")

    # for h in hists:
    #    w = np.ones_like(hists[h])/len(hists[h])
    #    ut.plotHist((h, hists[h]), bin_num=100, toFile=True, weights=w, show=False)

    # plt.figure()
    # for h in hists:
    #    ut.plotToOneHist((h, hists[h]), bin_num=100)
    # plt.close()

    # write_log(tot_num)
    print("Total num:  ", tot_num)
    return hists, tot_num
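saveToCache itself is not shown in this example; a minimal pickle-based stand-in (name, signature, and cache directory are all assumptions) would be enough to make the snippet run:

import os
import pickle

def saveToCache(obj, name, cache_dir="cache"):
    # Pickle obj under cache/<name>.pkl; the directory and extension are assumptions.
    os.makedirs(cache_dir, exist_ok=True)
    with open(os.path.join(cache_dir, name + ".pkl"), "wb") as f:
        pickle.dump(obj, f)

def loadFromCache(name, cache_dir="cache"):
    with open(os.path.join(cache_dir, name + ".pkl"), "rb") as f:
        return pickle.load(f)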
Example #5
import json

from mysql.connector import connection  # assuming mysql-connector-python here

# config.json is expected to provide data['mysql']['user' | 'passwd' | 'host' | 'db'].
with open('./preprocessing/config.json') as config_file:
    data = json.load(config_file)

cnx = connection.MySQLConnection(user=data['mysql']['user'],
                                 password=data['mysql']['passwd'],
                                 host=data['mysql']['host'],
                                 database=data['mysql']['db'])
cursor = cnx.cursor()

query = (
    "SELECT id, description FROM cs_entry_comment WHERE entry_id IS NOT NULL AND status IN (1,2)"
)

cursor.execute(query)

dictionary = loadDict()
review_views = {}
with open('preprocessing/normalize_like.txt', 'r') as f:
    for line in f:
        tmp = line.split(",")
        review_views[tmp[0]] = tmp[1].strip()  # drop the trailing newline

convertedData = open('converted_data.txt', 'w')

for (id, description) in cursor:
    print(id)
    num_words = 0

    # Start every dictionary word at a zero count for this description.
    doc = {}
    for w in dictionary:
        doc[w] = 0
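The snippet ends right after zero-initializing doc, a word-count dictionary keyed by the vocabulary. The original continuation is not shown; a generic sketch of the kind of counting such a dict is usually filled with (purely illustrative, not taken from the original script):

# Hypothetical continuation sketch; not taken from the original script.
dictionary = ["great", "phone", "battery", "screen"]
doc = {w: 0 for w in dictionary}

description = "Great phone, great battery"
for w in description.lower().replace(",", "").split():
    if w in doc:            # only count words present in the dictionary
        doc[w] += 1

print(doc)                  # {'great': 2, 'phone': 1, 'battery': 1, 'screen': 0}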