from typing import Dict, List, Optional

import lawa
import nltk
import pandas as pd


def load_file_to_dict(self, filename: str, cols: Optional[List[int]] = None) -> Dict[str, int]:
    """Load columns of a csv file into word_dict.

    Args:
        filename: a csv file with ',' as the separator
        cols: column indexes to add to the vocab; defaults to all columns

    Returns:
        word_dict: {<word>: frequency}
    """
    data_frame = pd.read_csv(filename)
    if not cols:
        cols = list(range(data_frame.shape[1]))
    for row in data_frame.itertuples(index=False):
        for i in cols:
            sentence = str(row[i])
            if self.language == 'zh':
                words = lawa.lcut(sentence)
            else:  # 'en'
                words = nltk.word_tokenize(sentence)
            # count word frequencies across all selected columns
            for word in words:
                self.word_dict[word] = self.word_dict.get(word, 0) + 1
    return self.word_dict
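
# Usage sketch for load_file_to_dict (hedged: `Vocab` and the csv path are
# hypothetical; the method only assumes an object with `language` and
# `word_dict` attributes).
class Vocab:
    def __init__(self, language: str = 'zh'):
        self.language = language
        self.word_dict: Dict[str, int] = {}

    load_file_to_dict = load_file_to_dict  # bind the function above as a method


vocab = Vocab(language='zh')
word_dict = vocab.load_file_to_dict('data/train.csv', cols=[0, 1])
print(sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)[:10])  # ten most frequent words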

def tokenize(sentence: str) -> List[str]:
    """Cut a sentence into words.

    Args:
        sentence: the sentence to tokenize

    Returns:
        list of words
    """
    return lawa.lcut(sentence)

def tokenize(self, sentence: str) -> List[str]:
    """Cut a sentence into words.

    Args:
        sentence: the sentence to tokenize

    Returns:
        list of words
    """
    if self.language == 'zh':
        words = lawa.lcut(sentence)
    else:  # 'en'
        words = nltk.word_tokenize(sentence)
    return words
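
# Behavior sketch for the zh/en switch (hedged: `SimpleTokenizer` is a
# hypothetical stand-in for the class the method above belongs to; the exact
# Chinese segmentation depends on lawa's dictionary, and nltk.word_tokenize
# needs the punkt models, e.g. nltk.download('punkt')).
class SimpleTokenizer:
    def __init__(self, language: str = 'zh'):
        self.language = language

    def tokenize(self, sentence: str) -> List[str]:
        if self.language == 'zh':
            return lawa.lcut(sentence)
        return nltk.word_tokenize(sentence)


print(SimpleTokenizer('en').tokenize('A quick test.'))  # ['A', 'quick', 'test', '.']
print(SimpleTokenizer('zh').tokenize('今天天气不错'))  # e.g. ['今天', '天气', '不错']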

def get_test(positive):
    """Tokenize the first 10 rows of the `content` column for a quick test."""
    positive = positive.head(10)
    # Tokenize the text
    mycut = lambda s: ' '.join(lawa.lcut(s))  # custom tokenizer: join tokens with spaces
    po = positive.content.apply(mycut)
    # ne = negtive.comment.apply(mycut)
    # Stop-word filtering (you can write your own stop-word file, one word per
    # line, or use someone else's ready-made list; a ready-made one is used here)
    # with open(r'C:\Users\Administrator\Desktop\python\项目\电商评论情感分析\stoplist.txt', encoding='utf-8') as f:
    #     stop = f.read()
    # stop = [' ', ''] + list(stop[0:])  # the loaded list lacks the space entry, so we add it ourselves
    po = po.apply(lambda s: s.split(' '))  # split the tokenized text back on spaces
    return po
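
# Quick check of get_test on a toy DataFrame (hedged: the sample rows are
# hypothetical; get_test expects a `content` column of raw review text).
sample = pd.DataFrame({'content': ['东西很好,物流也快', '质量一般']})
print(get_test(sample).tolist())  # e.g. [['东西', '很好', ',', '物流', '也', '快'], ['质量', '一般']]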

import pandas
import torch
from torch.utils.data import TensorDataset


def load_user_log(self, filename="data/user_search.csv"):
    """Build a TensorDataset of (LDA topic vector, user property embedding) pairs from the search log."""
    df = pandas.read_csv(filename)
    topics, properties = [], []
    for row in df.itertuples(index=False):
        # tokenize the search query and map it into the LDA topic space
        words = list(lawa.lcut(row[1]))
        pos_corpus = self.pos_dic.doc2bow(words)
        list_topic = self.pos_lda.get_document_topics(pos_corpus)
        topic = torch.zeros((1, self.pos_lda.num_topics), device='cuda')
        for topic_id, weight in list_topic:  # renamed to avoid shadowing the builtin `id`
            topic[0, topic_id] += weight
        # look up the property embedding of the user who issued the query
        user_id = self.username_embedding[row[0]]
        user_property = self.userproperty_embedding[user_id]  # renamed to avoid shadowing `property`
        topics.append(topic)
        properties.append(user_property)
    topics = torch.cat(topics)
    properties = torch.cat(properties)
    return TensorDataset(topics, properties)
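
# Usage sketch (hedged: `builder` stands for an instance of the class that
# owns load_user_log; the batch size is arbitrary).
from torch.utils.data import DataLoader

dataset = builder.load_user_log('data/user_search.csv')
loader = DataLoader(dataset, batch_size=64, shuffle=True)
for topic_batch, property_batch in loader:
    # topic_batch: LDA topic mixtures; property_batch: user-property embeddings
    pass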

from gensim.models import KeyedVectors
import numpy
from tqdm import tqdm

model = KeyedVectors.load_word2vec_format('model/word2vec.txt', binary=False)
df = pandas.read_csv("data/summary.csv", sep=',', names=['gid', 'summary', 'sort'])
adf = df[df.summary.notnull()]
total = len(adf)
print(total)
dim = model.wv.vector_size  # gensim 3.x API; in gensim 4 use model.vector_size
vocab = set(model.wv.vocab.keys())  # gensim 3.x; in gensim 4 use set(model.key_to_index)
with open("modelsummary/summary-512.txt", 'w', encoding='utf-8') as file:
    # word2vec text-format header: "<count> <dim>"
    file.write(str(total) + " " + str(dim) + '\n')
    for row in tqdm(adf.itertuples(index=False)):
        gid = str(row[0])
        summary = str(row[1])
        words = lawa.lcut(summary)
        vecs = numpy.array([model[w] for w in words if w in vocab])
        if len(vecs):
            # document vector = mean of its in-vocabulary word vectors
            docvec = numpy.mean(vecs, axis=0)
        else:
            # no known word at all: fall back to the vector of the full stop
            docvec = model['。']
        vecline = (gid + ' '
                   + numpy.array2string(docvec, separator=' ', max_line_width=10**10,
                                        precision=8, floatmode='fixed',
                                        suppress_small=True).strip('[]')
                   + '\n').replace('  ', ' ', 10**10)  # collapse the padding spaces array2string inserts
        file.write(vecline)
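
# The file written above follows the word2vec text format (a "<count> <dim>"
# header, then one "gid v1 v2 ..." row per document), so it can be read back
# the same way the word vectors were loaded (hedged: most_similar output
# depends on the trained vectors; index2word is the gensim 3.x name,
# index_to_key in gensim 4).
doc_vectors = KeyedVectors.load_word2vec_format('modelsummary/summary-512.txt', binary=False)
print(doc_vectors.most_similar(doc_vectors.index2word[0], topn=5))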

                    help='model config file')
args = parser.parse_args()

positive = pd.read_csv(args.data_file, encoding='utf-8')
# negtive = pd.read_excel(r'C:\Users\Administrator\Desktop\python\项目\爬虫\京东评论\com_neg.xls', encoding='utf-8')
# Text deduplication (mainly removes the reviews that the system auto-filled as default positive reviews)
# positive = positive['content'].drop_duplicates()
# positive = positive['content']
# negtive = negtive['comment'].drop_duplicates()
# negtive = negtive['comment']
type1 = positive['type1'].drop_duplicates()
print('types:', len(type1), type1.tolist())
# positive = positive.head(10)
# Tokenize the text
mycut = lambda s: ' '.join(lawa.lcut(str(s)))  # custom tokenizer: join tokens with spaces
po = positive.content.apply(mycut)
# ne = negtive.comment.apply(mycut)
# Stop-word filtering (you can write your own stop-word file, one word per
# line, or use someone else's ready-made list; a ready-made one is used here)
# with open(r'C:\Users\Administrator\Desktop\python\项目\电商评论情感分析\stoplist.txt', encoding='utf-8') as f:
#     stop = f.read()
# stop = [' ', ''] + list(stop[0:])  # the loaded list lacks the space entry, so we add it ourselves
po = po.apply(lambda s: s.split(' '))  # split the tokenized text back on spaces
# po['2'] = po['1'].apply(lambda x: [i for i in x if i not in stop])  # filter stop words
# The earlier word-cloud analysis could also be applied here
# post = []
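
# The fragment above begins mid-way through an argparse setup. A hedged sketch
# of the head it implies (only --data_file is confirmed, by the args.data_file
# read in the fragment; the flag name for the truncated 'model config file'
# argument is an assumption):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_file', help='csv file with the review data')
parser.add_argument('--config',
                    help='model config file')  # assumed flag name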