### Clean the segmented corpus: strip doc IDs and stop-tagged tokens,
### keep one space-separated document per line.
cleanFileName = './%s/%s_%sData_clean.txt' % (person, person, dataType)
f = open(dataSegFileName, 'r')
l = open(cleanFileName, 'a')
count = 0
while True:
    line = f.readline()
    count += 1
    if line == '':
        break
    if count <= 194833:  # resume: skip lines already cleaned in an earlier run
        continue
    line = line.split()[1:]  # drop the leading document ID
    for term in line:
        try:
            if stop.stop3(term.split('/')[1]):  # skip stop-word POS tags
                continue
        except:
            print count
            print 'ERROR'
            print term
            #quit()
        term = term.split('/')[0]  # keep the word, drop its POS tag
        l.write(term + ' ')
    l.write('\n')
f.close()
l.close()
quit()  # halt here; the training calls below are run in a separate pass

word2vec.doc2vec(cleanFileName, fileName, cbow=0, size=50, window=10,
                 negative=5, hs=0, sample='1e-4', threads=12, iter_=20,
                 min_count=1, verbose=True)
word2vec.word2phrase(cleanFileName, fileName, verbose=True)
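# Hedged sketch (not part of the original script): once the doc2vec call
# above has written vectors to `fileName`, they can be loaded back for a
# quick sanity check. This assumes the `word2vec` module is danielfrg's
# Python wrapper, whose load()/cosine() API is used below; the helper name
# inspect_vectors is hypothetical.
def inspect_vectors():
    model = word2vec.load(fileName)              # load the trained .bin vectors
    for w in model.vocab[:5]:                    # a few vocabulary entries
        indexes, metrics = model.cosine(w, n=5)  # nearest neighbours of w
        print w, zip(model.vocab[indexes], metrics)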
def LDA_INIT():
    global vocab, word_dict
    global model
    global id_list
    global topic_num
    global idToTitle_dict
    global docTermMatrix

    ### Construct Vocabulary List
    # vocabFileName is sorted by descending frequency, so the first count
    # below termFreqLowBound ends the scan.
    vocab = []
    w = open(vocabFileName, 'r')
    while True:
        line = w.readline()
        if line == '':
            break
        try:
            line.split()[1]
        except:
            continue  # malformed line without a count column
        if int(line.split()[1]) < termFreqLowBound:
            break
        line = line.split()[0]
        word_dict[line] = 0
    w.close()
    vocab = word_dict.keys()
    #############################

    ### Construct Title List
    w = open(cleanFileName, 'r')
    while True:
        line = w.readline()
        if line == '':
            break
        try:
            idToTitle_dict.append(line)
        except:
            continue
    w.close()
    #############################

    ### Construct Doc-Term Matrix
    f = open(dataSegFileName, 'r')
    count = 0
    while not loadJson:  # skip the scan entirely when loading from JSON
        line = f.readline()
        count += 1
        print count
        if line == '':
            break
        ID = line.split()[0]
        id_list.append(ID)
        line = line.split()[1:]
        reset_word_dict(word_dict)  # zero all counts for this document
        for term in line:
            termAttribute = term.split('/')[-1]
            if stop.stop3(termAttribute):
                continue
            try:
                #print term
                term = term.split('/')[0]
                if term in word_dict:
                    word_dict[term] += 1
                #else:
                    #print 'ERROR: word_dict does not have term'
            except:
                print 'ERROR: term error'
                print term
        vec = word_dict.values()  # counts in the same key order as vocab
        docTermMatrix.append(vec)
    f.close()
    if loadJson:
        jsonFileName = './%s/%s_LDA_Doc_Term_Matrix_%s.json' % (person, person, dataType)
        docTermMatrix = json.load(open(jsonFileName, 'r'))
    if writeJson:
        jsonFileName = './%s/%s_LDA_Doc_Term_Matrix_%s.json' % (person, person, dataType)
        json.dump(docTermMatrix, open(jsonFileName, 'w'))
    #############################
    print 'LDA_INIT DONE!!!'
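# Hedged sketch (not in the original): LDA_INIT leaves docTermMatrix as a
# list of per-document count vectors aligned with vocab, which is the
# doc-term input a topic model expects. One way to fit it is ariddell's
# `lda` package (collapsed Gibbs sampling); the package choice, n_iter, and
# the helper name LDA_FIT are assumptions, not the script's own training code.
def LDA_FIT():
    import numpy as np
    import lda
    X = np.array(docTermMatrix, dtype=np.int64)   # docs x vocab count matrix
    model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
    model.fit(X)
    words = np.array(vocab)
    for k, dist in enumerate(model.topic_word_):  # top 5 words per topic
        print 'Topic %d: %s' % (k, ' '.join(words[np.argsort(dist)][:-6:-1]))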
### Count unigram (and adjacent-bigram) frequencies per sentence.
while True:
    line = f.readline()
    count += 1
    if line == '':
        break
    line = line.split('\t')
    try:
        sentence = line[1].split()
    except:
        print count, line
        quit()
    for i in range(len(sentence)):
        termAttr = sentence[i].split('/')[-1]
        if stop.stop3(termAttr):
            continue
        word = sentence[i].split('/')[0]
        addVocab(word)
        vocab[word] += 1
        if i != len(sentence) - 1:
            # also count the bigram when the next token is not a stop word
            try:
                if not stop.stop3(sentence[i+1].split('/')[1]):
                    bigram = word + sentence[i+1].split('/')[0]
                    addVocab(bigram)
                    vocab[bigram] += 1
            except:
                pass  # next token has no POS tag; skip the bigram
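# Hedged sketch (not shown in this section): addVocab is called above but
# defined elsewhere. Since every call is followed by vocab[key] += 1, it
# plausibly just registers unseen keys with a zero count; the body below is
# an assumption consistent with that usage, where vocab is a dict mapping
# term -> frequency.
def addVocab(term):
    if term not in vocab:
        vocab[term] = 0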