import os
import re

import jieba

import dictionary
import getFileList


def createIDFFile():
    """Write the document frequency of every dictionary term to idfResult.txt."""
    fname = "./initialResult/idfResult.txt"
    if os.path.isfile(fname):
        return  # already built
    allDictionary = dictionary.getDictionary()
    # Load the custom dictionary for jieba segmentation (used elsewhere).
    jieba.load_userdict("./data/dictionary_utf8.txt")
    fileList = getFileList.getFilesListFromFile()
    with open(fname, 'w') as f:
        for inde, sub in enumerate(allDictionary, start=1):
            # re.escape() keeps terms containing regex metacharacters from breaking the pattern.
            pattern = re.compile(re.escape(sub))
            # Count the documents in which the term occurs at least once.
            dIDFsub = sum(1 for fv in fileList if pattern.search(fv))
            # Note: this version writes the raw document frequency, not an IDF weight.
            f.write(str(dIDFsub) + "\n")
            print(inde)  # progress indicator
import os

import dictionary
import getFileList


def createIDFFile():
    """Write N/df for every dictionary term, where N is the corpus size."""
    allDocumentNumber = 2265.0
    fname = "./initialResult/idfResult.txt"
    if os.path.isfile(fname):
        return  # already built
    fileList = getFileList.getIntsFromFile()
    allDictionary = dictionary.getDictionary()
    with open(fname, 'w') as f:
        for sub in allDictionary:
            # Document frequency: the number of documents containing the term.
            dIDFsub = sum(1 for fv in fileList if sub in fv)
            # Guard against terms that never occur, which would divide by zero.
            idfNum = allDocumentNumber / dIDFsub if dIDFsub else 0.0
            f.write(str(idfNum) + "\n")
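# Both versions above write one value per line in dictionary order, and the
# second stores the raw ratio N/df rather than the usual log-scaled IDF. A
# minimal sketch of turning that file into log-IDF weights downstream
# (load_idf is a hypothetical helper, not part of this project):
import math


def load_idf(path="./initialResult/idfResult.txt"):
    """Read one N/df value per line and return log-scaled IDF weights."""
    weights = []
    with open(path) as f:
        for line in f.read().splitlines():
            ratio = float(line)
            # log(N/df) is the conventional IDF; 0.0 marks unseen terms.
            weights.append(math.log(ratio) if ratio > 0 else 0.0)
    return weights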
import os

import dictionary
import getFileList


def createDocumentTF():
    """Write one line per dictionary word: the word, then its count in each document."""
    allDictionary = dictionary.getDictionary()
    fileList = getFileList.getFilesListFromFile()
    fname = './initialResult/documentTFResult.txt'
    if os.path.isfile(fname):
        return  # already built
    with open(fname, 'w') as f:
        for sub in allDictionary:
            strWrite = sub
            for fv in fileList:
                # Whitespace-tokenize the document and count exact matches.
                strWrite += " " + str(fv.split().count(sub))
            f.write(strWrite + "\n")
import os

import dictionary
import getQuerysList


def createQueryTFFile():
    """Write one line per dictionary word: the word, then its count in each query."""
    allDictionary = dictionary.getDictionary()
    fname = './initialResult/queryTF.txt'
    if os.path.isfile(fname):
        return  # already built
    tempQueryList = getQuerysList.getIntsFromFile()
    with open(fname, 'w') as f:
        for sub in allDictionary:
            strWrite = str(sub)
            for v in tempQueryList:
                # Substring count within the raw query string.
                strWrite += " " + str(v.count(sub))
            f.write(strWrite + "\n")
import os

import dictionary
import queryList


def createQueryTFFile():
    allDictionary = dictionary.getDictionary()
    fname = './initialResult/queryTF.txt'
    if os.path.isfile(fname):
        return
    tempQueryList = queryList.getQueryFilesList()
    with open(fname, 'w') as f:
        for sub in allDictionary:
            strWrite = sub
            for v in tempQueryList:
                strWrite += " " + str(v["content"].split().count(sub))
            f.write(strWrite + "\n")
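# Both query-TF variants produce the same line format: a dictionary word
# followed by one count per query. A minimal sketch of reading the file back
# (load_query_tf is a hypothetical helper, assuming the format written above):
def load_query_tf(path='./initialResult/queryTF.txt'):
    """Return {word: [count_in_query_0, count_in_query_1, ...]}."""
    table = {}
    with open(path) as f:
        for line in f.read().splitlines():
            parts = line.split()
            if parts:
                table[parts[0]] = list(map(int, parts[1:]))
    return table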
import os

import dictionary
import getFileList


def createDocumentTF():
    """Variant that writes counts only (no word prefix), one dictionary word per line."""
    allDictionary = dictionary.getDictionary()
    fileList = getFileList.getIntsFromFile()
    fname = './initialResult/documentTFResult.txt'
    if os.path.isfile(fname):
        return  # already built
    with open(fname, 'w') as f:
        for sub in allDictionary:
            strWrite = ""
            for docIndex, fv in enumerate(fileList):
                c_wd = fv.count(sub)
                strWrite += " " + str(c_wd)
                # A sparse alternative kept from the original:
                # if c_wd > 0:
                #     strWrite += " " + str(docIndex) + ":" + str(c_wd)
            f.write(strWrite + "\n")
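# This variant's rows carry no word prefix, so they must be read back in the
# same dictionary order they were written. A minimal sketch (load_document_tf
# is a hypothetical helper, assuming the dense format above):
def load_document_tf(path='./initialResult/documentTFResult.txt'):
    """Return a list of count rows; row i belongs to the i-th dictionary word."""
    with open(path) as f:
        return [list(map(int, line.split())) for line in f.read().splitlines()]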
import os

import dictionary


def getDict_DocCountWord():
    """Return, for each dictionary word, a list of [doc_index, count] pairs."""
    fname = "./initialResult/dict_DocCountWord.txt"
    result = []
    if os.path.isfile(fname):
        with open(fname) as f:
            lines = f.read().splitlines()
        for line in lines:
            tempList = []
            for item in line.split(" "):
                if item:  # skip the empty field left by a trailing space
                    tempList.append(list(map(int, item.split(":"))))
            result.append(tempList)
        return result
    # Build the file from the per-document word counts, then reload it.
    with open('./initialResult/wordNumber.txt') as f:
        lines = f.read().splitlines()
    createWordDocCount()
    allDictionary = dictionary.getDictionary()
    with open(fname, 'w') as f:
        for word in allDictionary:
            strWrite = ""
            for doc_index, line in enumerate(lines):
                aDocWordCountList = line.strip().split(" ")
                for value in aDocWordCountList:
                    parts = value.split(":")
                    if parts[0] == word:
                        strWrite += str(doc_index) + ":" + parts[1] + " "
            f.write(strWrite + "\n")
    # The original returned an empty list on the first call; re-read instead.
    return getDict_DocCountWord()
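# Example of the sparse format this builds: a line such as "0:3 7:1" means the
# word occurs 3 times in document 0 and once in document 7. A usage sketch:
if __name__ == '__main__':
    rows = getDict_DocCountWord()
    for doc_index, count in rows[0]:
        # Postings list for the first dictionary word.
        print(doc_index, count)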
from keras.models import Model
from keras.layers import Input, Dense
from datetime import datetime
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from matplotlib import pyplot

import dictionary
import getFileList
import getDocL

k = 1  # context window size
wordsNum = 13290
docsL = getFileList.getIntsFromFile()
dic = dictionary.getDictionary()


def getAllWordsNum():
    """Total number of tokens across all documents."""
    all_words_num = 0
    for doc in docsL:
        all_words_num += len(doc)
    return all_words_num


all_words_num = getAllWordsNum()  # 393021
# One training position per token, minus the positions lost at the window edges.
input_size = all_words_num - 2 * k - 1

# Initial input buffers for the left and right context.
left_Input = np.zeros(shape=(input_size, 1))
right_Input = np.zeros(shape=(input_size, 1))
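# The file breaks off before the model is defined. For context, a minimal
# sketch of a two-input Keras model that left/right context vectors could
# feed; the merge strategy, hidden size, and loss are assumptions, not the
# author's actual architecture:
from keras.layers import concatenate

left_in = Input(shape=(wordsNum,))
right_in = Input(shape=(wordsNum,))
merged = concatenate([left_in, right_in])
hidden = Dense(128, activation='relu')(merged)          # hidden size is a guess
output = Dense(wordsNum, activation='softmax')(hidden)  # predict the center word
model = Model(inputs=[left_in, right_in], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy')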
# coding: utf-8
import os
import math

import numpy as np

import queryList
import dictionary
import documentTF
import queryTF
import idfResult
import getFileList

allDictionary = dictionary.getDictionary()
wordNumber = len(allDictionary)
idfList = idfResult.getIDF()
numberOfDoc = 2265
docTF = documentTF.getDocumentTF()
querTF = queryTF.getQueryTF()
fileNameList = getFileList.getFileNameList()


def computeAquery(queryIndex):
    """Collect the IDF and query-TF columns for one query."""
    lineNum1 = 0
    qtfL = []
    idfL = []
    while lineNum1 < wordNumber:
        # map() returns an iterator in Python 3; materialize it before indexing.
        idfL.append(list(map(float, idfList[lineNum1].split()))[1])
        qtfL.append(list(map(float, querTF[lineNum1].split()))[queryIndex])
        lineNum1 += 1
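# computeAquery() is truncated above. In a TF-IDF vector space model the usual
# next step is to rank documents by cosine similarity against the query
# vector; a minimal sketch of that scoring (not the author's exact code):
def cosine_similarity(u, v):
    """Cosine of the angle between two equal-length weight vectors."""
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0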
#!/usr/bin/python3
# coding: utf-8
import math
from datetime import datetime

import numpy as np

import dictionary
import document
import wordDocsCount
import wordDocsNoCount

startInitial = datetime.now()
docList = document.getFilesName()
wordList = dictionary.getDictionary()
wNumber = len(wordList)
dNumber = len(docList)  # 2265
topicNum = 10

# Random PLSA initialization: each Dirichlet draw is a proper distribution
# (non-negative, sums to 1).
P_d = np.random.dirichlet(np.ones(dNumber), size=1).tolist()[0]  # P(d)
P_w_T = np.random.dirichlet(np.ones(topicNum), size=wNumber)     # one topic distribution per word
P_T_d = np.random.dirichlet(np.ones(dNumber), size=topicNum)     # one document distribution per topic
P_T_wd = np.zeros(shape=(topicNum * dNumber, wNumber))           # P(T|w,d), filled in the E-step

count_w_d = wordDocsCount.getWordDocCount()
noCount_w_d = wordDocsNoCount.getWordDocNoCount()
dict_DocCountWord = wordDocsCount.getDict_DocCountWord()


def get_BG():
    fname = "./BGLM.txt"
    with open(fname) as f:
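# get_BG() is cut off above. For context, a minimal self-contained sketch of
# the PLSA E-step that initializations like these feed: P(T|w,d) is
# proportional to P(w|T) * P(T|d). Shapes follow textbook PLSA, not
# necessarily this file's layout (P_T_wd above is flattened differently):
def plsa_e_step(p_w_given_t, p_t_given_d):
    """p_w_given_t: (W, T); p_t_given_d: (T, D); returns P(T|w,d) of shape (W, D, T)."""
    joint = p_w_given_t[:, None, :] * p_t_given_d.T[None, :, :]  # (W, D, T)
    denom = joint.sum(axis=2, keepdims=True)
    denom[denom == 0] = 1.0  # avoid dividing by zero for impossible (w, d) pairs
    return joint / denom     # dense for clarity; a real run would loop over documents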
# NOTE: the import paths below assume the pre-4.0 tweepy API and the Initial
# State ISStreamer package; the original file's imports are not shown.
# getDictionary() is provided elsewhere in this project (the emoji table).
import json

import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from ISStreamer.Streamer import Streamer

import keys  # local module wrapping the API credentials

# TWITTER API BUILDING
auth = OAuthHandler(keys.getCK(), keys.getCS())
auth.set_access_token(keys.getAT(), keys.getAS())
api = tweepy.API(auth)

# INITIALSTATE API BUILDING
streamer = Streamer(bucket_name=keys.getBN(), bucket_key=keys.getBK(),
                    access_key=keys.getAK())

# EMOJIS
emCodes = []
emNames = []
e = 'emojis'
emDict = getDictionary()
for key, value in emDict.items():  # iteritems() is Python 2 only
    emCodes.append(value['code'])
    emNames.append(value['id'])


class MyListener(StreamListener):
    def on_data(self, data):
        d = json.loads(data)
        if 'text' in d:
            txt = d['text']
            # Log every tracked emoji that appears in the tweet text.
            for emoj in emCodes:
                if emoj in txt:
                    streamer.log(e, emNames[emCodes.index(emoj)])
            # HANDLES LOCATION (IF ANY)
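# A typical way to start this listener with the pre-4.0 tweepy Stream API.
# Tracking the emoji codes themselves is an assumption; the original file is
# cut off before the stream is started:
twitter_stream = tweepy.Stream(auth, MyListener())
twitter_stream.filter(track=emCodes)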