def itemizedBatchDictionary(batch, betas):
    """Build a per-document TF-IDF matrix and vocabulary sub-dictionary.

    Parameters
    ----------
    batch : sequence of documents; each document is a sequence whose last
        element (``doc[-1]``) is its raw text.
    betas : unused; kept so the caller-facing signature is unchanged.

    Returns
    -------
    tuple ``(batch_dict, batch_mat)`` where ``batch_dict`` maps every
    in-vocabulary word seen in the batch to its global ``dictionary``
    entry, and ``batch_mat`` is a docs-by-words DataFrame holding
    ``(1 + log tf) * log(n_docs / df)`` weights.

    NOTE(review): relies on module-level ``dictionary``, ``n_docs`` and the
    ``f10X`` helper — confirm they are in scope at call time. Removed dead
    code from the original: three DataFrames (betas/Xs/X*beta) that were
    never read, and a per-item loop whose result (``itemList``) was
    discarded.
    """
    full_text = ''.join(doc[-1] for doc in batch)
    vocab = list(set(f10X.cleanText(full_text)).intersection(dictionary.keys()))
    row_names = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(row_names), len(vocab))), index=row_names, columns=vocab)
    batch_dict = {}
    for i, doc in enumerate(batch, start=1):
        row = "doc" + str(i)
        doc_words = list(set(f10X.cleanText(doc[-1])).intersection(vocab))
        freq = collections.Counter(f10X.cleanText(doc[-1]))
        for w in doc_words:
            df = dictionary[w]['ndocs']
            # (1 + log tf) * idf; tf >= 1 here since w appears in this doc.
            batch_mat.loc[row, w] = (1 + np.log(freq[w])) * np.log(n_docs / df)
        batch_dict.update({w: dictionary[w] for w in doc_words})
    return batch_dict, batch_mat
def forwardPropagation(batch, batch_dict, batch_mat, W):
    """Single-linear-layer + softmax forward pass over a batch.

    For each document: gather (pos, neg) word contributions weighted by
    ``batch_mat``, sum them into a 2-vector, apply ``W`` and softmax.

    Returns
    -------
    tuple ``(y, y_hat, X)`` — labels column-stacked, softmax outputs
    column-stacked, and the summed 2-vectors row-stacked.
    """
    labels, predictions, features = [], [], []
    for idx, item in enumerate(batch, start=1):
        raw_text = item[-1]
        label = item[4]
        _price = item[5]  # read but unused; kept for parity with the data layout
        labels.append(label)
        words = list(set(f10X.cleanText(raw_text)))
        row = "doc" + str(idx)
        # Per-word (pos, neg) contributions, weighted by the batch matrix.
        contribs = np.array([
            [batch_mat.at[row, w] * batch_dict[w]['pos'],
             batch_mat.at[row, w] * batch_dict[w]['neg']]
            for w in words if w in batch_dict
        ])
        summed = contribs.sum(axis=0)  # 2-vector summary of the document
        features.append(summed)
        predictions.append(fNN.softmax(W.dot(summed)))
    return (np.column_stack(labels),
            np.column_stack(predictions),
            np.row_stack(features))
def batchDictionary(batch):
    """Build a per-document term-frequency matrix and vocabulary sub-dict.

    Parameters
    ----------
    batch : sequence of documents; each document's last element
        (``doc[-1]``) is its raw text.

    Returns
    -------
    tuple ``(batch_dict, batch_mat)`` — ``batch_dict`` maps each
    in-vocabulary word of the batch to its global ``dictionary`` entry;
    ``batch_mat`` is a docs-by-words DataFrame of raw counts.

    NOTE(review): relies on module-level ``dictionary`` and ``f10X``.
    Removed a dead ``d = {}`` assignment and hoisted the loop-invariant
    row label out of the inner loop.
    """
    full_text = ''.join(doc[-1] for doc in batch)
    vocab = list(set(f10X.cleanText(full_text)).intersection(dictionary.keys()))
    row_names = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(row_names), len(vocab))), index=row_names, columns=vocab)
    batch_dict = {}
    for i, doc in enumerate(batch, start=1):
        row = "doc" + str(i)
        doc_words = list(set(f10X.cleanText(doc[-1])).intersection(vocab))
        freq = collections.Counter(f10X.cleanText(doc[-1]))
        for w in doc_words:
            batch_mat.loc[row, w] = freq[w]
        batch_dict.update({w: dictionary[w] for w in doc_words})
    return batch_dict, batch_mat
def getOmega(text, dictionary):
    """Compute simple sentiment features for *text*.

    Returns ``[]`` when no token of *text* appears in *dictionary*;
    otherwise a 5-element list:
    ``[token_count, n_sentiment_words, n_pos, n_neg, total_score]``.
    """
    tokens = f10X.cleanText(text)
    scores = [dictionary[w]['Score'] for w in tokens if w in dictionary]
    if not scores:
        return []
    score_arr = np.array(scores)
    return [
        len(tokens),           # total (cleaned) token count
        len(scores),           # tokens carrying a sentiment score
        nPos(score_arr),
        nNeg(score_arr),
        int(sum(score_arr)),   # aggregate sentiment score
    ]
def forwardPropagation(batch, batch_dict, batch_mat, coefficients):
    """Two-layer (linear -> tanh -> linear) forward pass with softmax output.

    Parameters
    ----------
    batch : sequence of items; ``item[-1]`` is raw text, ``item[4]`` the
        price-direction label, ``item[5]`` the price (read, unused).
    batch_dict, batch_mat : word dictionary / weight matrix as produced by
        the batch-dictionary builders in this file.
    coefficients : 4-sequence ``(betas, D, W, C)`` — ``betas`` is unused
        here; ``D`` hidden-layer weights, ``W`` output weights, ``C`` bias.

    Returns
    -------
    tuple ``(y, y_hat, X)`` of column-stacked labels, column-stacked
    softmax outputs, and row-stacked summed feature 2-vectors.

    NOTE(review): this redefines ``forwardPropagation`` (the single-layer
    variant earlier in the file); whichever definition comes last wins —
    confirm that is intended.
    """
    betas = coefficients[0]  # unused; kept for interface parity
    D = coefficients[1]
    W = coefficients[2]
    C = coefficients[3]
    y, y_hat, X = [], [], []
    for i, item in enumerate(batch, start=1):
        text = item[-1]
        price_boolean = item[4]
        price = item[5]  # read but unused; kept for parity with data layout
        y.append(price_boolean)
        words = list(set(f10X.cleanText(text)))
        doc = "doc" + str(i)
        values = np.array([
            [batch_mat.at[doc, w] * batch_dict[w]['pos'],
             batch_mat.at[doc, w] * batch_dict[w]['neg']]
            for w in words if w in batch_dict
        ])
        # Summation layer — 2-vector per document.
        sumValues = values.sum(axis=0)
        X.append(sumValues)
        linlayer1 = D.dot(sumValues)
        # Fix: np.tanh instead of (e^{2a}-1)/(e^{2a}+1). The explicit
        # exponential overflows to inf for large activations, yielding
        # inf/inf = nan; np.tanh is mathematically identical and stable.
        tanh = np.tanh(linlayer1)
        linlayer2 = W.dot(tanh) + C
        y_hat.append(fNN.softmax(linlayer2.T))
    return np.column_stack(y), np.column_stack(y_hat), np.row_stack(X)
def getOmega2(text, dictionary):
    """TF-IDF-weighted sentiment features for *text*.

    Like ``getOmega`` but each word's score is scaled by a
    frequency/IDF weight before aggregation. Returns ``[]`` when no
    sentiment-bearing word is found (or the text has no tokens);
    otherwise ``[len_text, n_sentWords, n_pos, n_neg, score1]``.

    Bug fixed: the original built the weighted ``values`` list and then
    immediately overwrote it with the unweighted comprehension from
    ``getOmega``, silently discarding the entire weighting loop. The
    weighted values are now actually used. Also guards against an empty
    token stream, which previously divided by zero computing the average
    count.
    """
    N = 276880  # corpus size used in the IDF term
    freq = collections.Counter(f10X.cleanText(text))
    if not freq:
        return []
    avg_count = sum(freq.values()) / len(freq)
    values = []
    for word, count in freq.items():
        if word in dictionary:
            score = dictionary[word]['Score']
            # NOTE(review): log(ndocs / N) is negative whenever
            # ndocs < N; a conventional IDF is log(N / ndocs) — confirm
            # the sign is intended before relying on these features.
            weight = ((1 + np.log(count)) / (1 + np.log(avg_count))
                      * np.log(dictionary[word]['ndocs'] / N))
            values.append(score * weight)
    if not values:
        return []
    len_text = len(text)  # NOTE(review): raw string length here, unlike
                          # getOmega which counts cleaned tokens — confirm.
    n_sentWords = len(values)
    score_arr = np.array(values)
    return [len_text, n_sentWords, nPos(score_arr), nNeg(score_arr),
            int(sum(score_arr))]