def readlistWholeJsonDataSet(datasetName, isStopWord=True):
    """Load a one-record-per-line dataset into ``[-1, true, words, index]`` rows.

    Every non-blank line of ``datasetName`` is evaluated with ``eval`` and the
    result is indexed with the keys ``'clusterNo'`` and ``'textCleaned'``.
    The text is split on single spaces and each token is Porter-stemmed.

    Args:
        datasetName: Path of the file to read.
        isStopWord: When truthy, stemmed tokens found in the stop-word
            collection loaded from ``'stopWords.txt'`` are dropped. If that
            leaves no tokens, the raw (unstemmed) tokens are used instead.

    Returns:
        A list of ``[-1, true, words, i]`` lists, where ``true`` is the
        stripped string form of ``clusterNo``, ``words`` is the token list and
        ``i`` is a 0-based counter over the records that were kept. The
        leading ``-1`` is presumably a placeholder for a label assigned
        later -- TODO confirm against callers.
    """
    ps = PorterStemmer()
    stopWs = loadStopWords('stopWords.txt')

    # The context manager closes the file even if reading fails.
    with open(datasetName, "r") as file1:
        lines = file1.readlines()

    list_pred_true_words_index = []
    i = -1
    for line in lines:
        line = line.strip()
        if not line:
            # eval('') raises SyntaxError; a blank line holds no record.
            continue

        # SECURITY NOTE(review): eval() executes arbitrary expressions found in
        # the data file. Only use this loader on trusted files; consider
        # json.loads / ast.literal_eval once the on-disk format is confirmed.
        n = eval(line)

        true = str(n['clusterNo']).strip()
        raw_words = str(n['textCleaned']).strip().split(' ')

        words = [ps.stem(w) for w in raw_words]
        if isStopWord:
            words = [w for w in words if w not in stopWs]
        if len(words) == 0:
            # Everything was a stop word: fall back to the unstemmed tokens.
            words = raw_words

        if len(true) == 0 or len(words) == 0:
            continue

        i += 1
        list_pred_true_words_index.append([-1, true, words, i])

    return list_pred_true_words_index
def __init__(self, dataDir, wordToIdMap, wordList):
    """Read a tab-separated post file and build one ``Document`` per row.

    Each line is stripped, lower-cased and split on tabs. Field 0 is used as
    the true label, field 1 as the integer post id and field 3 as the tag
    string (``<a><b>`` form). Only the tags are used as document text; fields
    2, 4 and 5 (title, body and create time, going by the original variable
    names) are not used.

    Lines with fewer than six tab-separated fields -- including blank
    lines -- are skipped rather than raising ``IndexError``, in line with the
    other loaders in this file.

    Args:
        dataDir: NOTE(review): currently ignored -- it is overwritten by a
            hard-coded absolute path below. Confirm what callers pass before
            removing the override.
        wordToIdMap: Forwarded unchanged to ``Document``.
        wordList: Forwarded unchanged to ``Document``.

    Raises:
        ValueError: If field 1 of a row is not an integer.
    """
    stopWs = loadStopWords('stopWords.txt')
    self.D = 0  # The number of documents
    self.documents = []

    # NOTE(review): machine-specific path (with a trailing space) that
    # replaces the caller-supplied argument; kept as-is to preserve behaviour.
    dataDir = r'D:\githubprojects\PyMigrationRecommendation\src\notebooks' \
              r'\train_stackoverflow_r_true_id_title_tags_body_createtime '

    with open(dataDir) as data_file:
        for line in data_file:
            arr = line.strip().lower().split('\t')
            if len(arr) < 6:
                # Blank or malformed row: nothing usable to index.
                continue

            trueLabel = arr[0]
            postId = int(arr[1])

            # "<a><b>" -> "a b", then drop quotes and backslashes.
            text = ' '.join(arr[3].strip('<').strip('>').split('><')).strip()
            text = text.strip().replace('"', '').replace("\\", '').strip()

            ws_org = text.strip().split(' ')
            ws = [w for w in ws_org if w not in stopWs]
            if len(ws) == 0:
                # Every token was a stop word: keep the unfiltered tokens.
                ws = ws_org

            if len(ws) > 0:
                self.D += 1
                document = Document(ws, wordToIdMap, wordList, postId,
                                    trueLabel)
                self.documents.append(document)

    print("number of documents is ", self.D)
def readStackOverflowDataSet(datasetName, isStopWord=True):
    """Load a 4-column tab-separated file into per-post token rows.

    Each non-blank line must split into exactly four tab-separated fields;
    other lines are skipped. Field 0 is the true label, field 1 the post id
    and field 3 the tag string in ``<a><b>`` form (field 2 is not used). The
    tags become space-separated text, quotes and backslashes are removed, and
    every token is Porter-stemmed.

    Args:
        datasetName: Path of the file to read.
        isStopWord: When truthy, stemmed tokens found in the stop-word
            collection loaded from ``'stopWords.txt'`` are dropped. If that
            leaves no tokens, the raw (unstemmed) tokens are used instead.

    Returns:
        A list of ``[-1, true, words, i, postId]`` lists, where ``i`` is a
        0-based counter over the rows that were kept and ``true`` / ``postId``
        are stripped strings.
    """
    stopWs = loadStopWords('stopWords.txt')
    ps = PorterStemmer()

    # The context manager closes the file even if reading fails.
    with open(datasetName, "r") as file1:
        lines = file1.readlines()

    list_pred_true_words_index_postid = []
    i = -1
    for line in lines:
        line = line.strip()
        if not line:
            continue

        arr = line.split("\t")
        if len(arr) != 4:
            continue

        true = arr[0].strip()
        postId = arr[1].strip()

        # "<a><b>" -> "a b", then drop quotes and backslashes.
        text = ' '.join(arr[3].strip('<').strip('>').split('><'))
        text = text.replace('"', '').replace("\\", '').strip()

        if len(true) == 0 or len(text) == 0 or len(postId) == 0:
            continue

        raw_words = text.split(' ')
        # Stemming happens regardless of the stop-word setting.
        words = [ps.stem(w) for w in raw_words]
        if isStopWord:
            words = [w for w in words if w not in stopWs]
        if len(words) == 0:
            # Every token was a stop word: fall back to the unstemmed tokens.
            words = raw_words

        i += 1
        list_pred_true_words_index_postid.append(
            [-1, true, words, i, postId])

    return list_pred_true_words_index_postid
def readStackOverflowDataSetTagTitleBody(inputfile,
                                         isStopWord=True,
                                         columnsInFile=6,
                                         tagIgnore='<r>',
                                         randMax=0):
    """Load a tab-separated post file into a list of ``CPost`` objects.

    NOTE(review): a second function with this same name is defined later in
    this file; at import time the later definition replaces this one.

    Each line is stripped and lower-cased; lines that are blank or do not
    split into exactly ``columnsInFile`` tab-separated fields are skipped.
    Fields 0..5 are read as true label, post id, title, tags, body and create
    time, so ``columnsInFile`` must be at least 6 for a row to be parsed.
    Quotes and backslashes are removed from title, tags and body.

    Args:
        inputfile: Path of the file to read.
        isStopWord: When truthy, tokens found in the stop-word collection
            loaded from ``'stopWords.txt'`` are removed from title and body
            (falling back to the unfiltered tokens if nothing is left). When
            falsy, the unfiltered tokens are used. Tag tokens are never
            filtered.
        columnsInFile: Required number of tab-separated fields per line.
        tagIgnore: Substring removed from the tag field before tokenising. If
            removing it leaves the tag field empty, the original tags are
            restored.
        randMax: When greater than 0, a row whose label is ``'-1'`` is kept
            only if ``random.randint(0, randMax) == randMax``.

    Returns:
        A list of ``CPost(-1, int(true), tagWords, titleWords, bodyWords, i,
        int(postId), postCreatetime)`` where ``i`` is a 0-based counter over
        kept rows and ``postCreatetime`` is parsed with ``"%Y-%m-%d"`` from
        the part of the create-time field before the first ``"t"``.

    Raises:
        ValueError: If the label or post id is not an integer, or the date
            part of the create time does not match ``"%Y-%m-%d"``.
    """
    stopWs = loadStopWords('stopWords.txt')

    # The context manager closes the file even if reading fails.
    with open(inputfile, "r") as file1:
        lines = file1.readlines()

    list_CPost = []
    i = -1
    for line in lines:
        line = line.strip().lower()
        if not line:
            continue

        arr = line.split("\t")
        if len(arr) != columnsInFile:
            continue

        true = arr[0].strip()
        # Randomly sub-sample rows labelled '-1' when randMax is set.
        if (true == '-1' and randMax > 0
                and random.randint(0, randMax) != randMax):
            continue

        postId = arr[1].strip()
        title = arr[2].strip().replace('"', '').replace("\\", '').strip()
        cleaned_tags = arr[3].strip().replace('"', '').replace("\\",
                                                               '').strip()
        body = arr[4].strip().replace('"', '').replace("\\", '').strip()
        createtime = arr[5].strip()

        tags = cleaned_tags
        if tagIgnore:
            tags = tags.replace(tagIgnore, '').strip()
            if len(tags) == 0:
                # The ignored tag was the only one: keep the original tags.
                tags = cleaned_tags

        # "<a><b>" -> ["a", "b"]
        tagWords = ' '.join(
            tags.strip('<').strip('>').split('><')).strip().split(' ')
        org_titleWords = title.split(' ')
        org_bodyWords = body.split(' ')

        # Default to the unfiltered tokens so both names are always bound
        # (previously isStopWord=False raised UnboundLocalError below).
        titleWords = org_titleWords
        bodyWords = org_bodyWords
        if isStopWord:
            filtered_title = [w for w in org_titleWords if w not in stopWs]
            filtered_body = [w for w in org_bodyWords if w not in stopWs]
            if len(filtered_title) > 0:
                titleWords = filtered_title
            if len(filtered_body) > 0:
                bodyWords = filtered_body

        if (len(true) == 0 or len(tagWords) == 0 or len(titleWords) == 0
                or len(bodyWords) == 0 or len(postId) == 0
                or len(createtime) == 0):
            continue

        i += 1
        # The line was lower-cased above, so an ISO "T" separator is "t" here.
        postCreatetime = datetime.strptime(
            createtime.split("t")[0], "%Y-%m-%d")
        list_CPost.append(
            CPost(-1, int(true), tagWords, titleWords, bodyWords, i,
                  int(postId), postCreatetime))

    return list_CPost
def readStackOverflowDataSetBody(datasetName,
                                 isStopWord=True,
                                 columnsInFile=6,
                                 texttype='tag',
                                 tagIgnore='<c++>'):
    """Load a tab-separated post file, tokenising one chosen text field.

    Each line is stripped and lower-cased; lines that are blank or do not
    split into exactly ``columnsInFile`` tab-separated fields are skipped.
    Fields 0..5 are read as true label, post id, title, tags, body and create
    time, so ``columnsInFile`` must be at least 6 for a row to be parsed.
    Quotes and backslashes are removed from title, tags and body.

    Args:
        datasetName: Path of the file to read.
        isStopWord: When truthy and ``texttype`` is not ``'tag'``, stemmed
            tokens found in the stop-word collection loaded from
            ``'stopWords.txt'`` are dropped (falling back to the raw tokens
            if nothing is left).
        columnsInFile: Required number of tab-separated fields per line.
        texttype: ``'title'`` or ``'body'`` selects that field; ``'tag'`` and
            any other value select the tags (``<a><b>`` becomes ``a b``).
            Tokens are Porter-stemmed unless ``texttype`` is exactly
            ``'tag'``.
        tagIgnore: Substring removed from the tag field. If removing it
            leaves the tag field empty, the original tags are restored.

    Returns:
        A list of ``[-1, true, words, i, postId, createtime]`` lists, where
        ``i`` is a 0-based counter over kept rows and the other scalar
        entries are stripped strings.
    """
    stopWs = loadStopWords('stopWords.txt')
    ps = PorterStemmer()

    # The context manager closes the file even if reading fails.
    with open(datasetName, "r") as file1:
        lines = file1.readlines()

    list_pred_true_words_index_postid_createtime = []
    i = -1
    for line in lines:
        line = line.strip().lower()
        if not line:
            continue

        arr = line.split("\t")
        if len(arr) != columnsInFile:
            continue

        true = arr[0].strip()
        postId = arr[1].strip()
        title = arr[2].strip().replace('"', '').replace("\\", '').strip()
        cleaned_tags = arr[3].strip().replace('"', '').replace("\\",
                                                               '').strip()
        body = arr[4].strip().replace('"', '').replace("\\", '').strip()
        createtime = arr[5].strip()

        tags = cleaned_tags
        if tagIgnore:
            tags = tags.replace(tagIgnore, '').strip()
            if len(tags) == 0:
                # The ignored tag was the only one: keep the original tags.
                tags = cleaned_tags

        if texttype == 'title':
            text = title
        elif texttype == 'body':
            text = body
        else:
            # 'tag' and any unrecognised texttype both use the tags.
            text = ' '.join(tags.strip('<').strip('>').split('><'))
        text = text.replace('"', '').replace("\\", '').strip()

        if (len(true) == 0 or len(text) == 0 or len(postId) == 0
                or len(createtime) == 0):
            continue

        raw_words = text.split(' ')
        words = raw_words
        if texttype != 'tag':
            # Tags are kept verbatim; all other text is stemmed.
            words = [ps.stem(w) for w in raw_words]
            if isStopWord:
                words = [w for w in words if w not in stopWs]
        if len(words) == 0:
            # Every token was a stop word: fall back to the raw tokens.
            words = raw_words

        i += 1
        list_pred_true_words_index_postid_createtime.append(
            [-1, true, words, i, postId, createtime])

    return list_pred_true_words_index_postid_createtime
def readStackOverflowDataSetTagTitleBody(datasetName,
                                         isStopWord=True,
                                         columns=6,
                                         texttype='tag',
                                         tagIgnore='<c++>'):
    """Load a tab-separated post file into rows of tag/title/body tokens.

    NOTE(review): an earlier function in this file has the same name; this
    later definition is the one bound at import time.

    Each line is stripped and lower-cased; lines that are blank or do not
    split into exactly ``columns`` tab-separated fields are skipped. Fields
    0..5 are read as true label, post id, title, tags, body and create time,
    so ``columns`` must be at least 6 for a row to be parsed. Quotes and
    backslashes are removed from title, tags and body. Title and body tokens
    are Porter-stemmed; tag tokens are not.

    Args:
        datasetName: Path of the file to read.
        isStopWord: When truthy, stemmed title/body tokens found in the
            stop-word collection loaded from ``'stopWords.txt'`` are dropped.
            If that leaves a field with no tokens, its raw (unstemmed)
            tokens are used instead.
        columns: Required number of tab-separated fields per line.
        texttype: Unused; kept so existing callers keep working.
        tagIgnore: Unused; kept so existing callers keep working.

    Returns:
        A list of ``[-1, true, tag_words, i, postId, createtime, tag_words,
        title_words, body_words]`` lists. ``i`` is a 0-based counter over
        kept rows, and the same ``tag_words`` list object appears at
        positions 2 and 6.
    """
    stopWs = loadStopWords('stopWords.txt')
    ps = PorterStemmer()

    # The context manager closes the file even if reading fails.
    with open(datasetName, "r") as file1:
        lines = file1.readlines()

    list_pred_true_words_index_postid_createtime_tag_title_body = []
    i = -1
    for line in lines:
        line = line.strip().lower()
        if not line:
            continue

        arr = line.split("\t")
        if len(arr) != columns:
            continue

        true = arr[0].strip()
        postId = arr[1].strip()
        title = arr[2].strip().replace('"', '').replace("\\", '').strip()
        tags = arr[3].strip().replace('"', '').replace("\\", '').strip()
        body = arr[4].strip().replace('"', '').replace("\\", '').strip()
        createtime = arr[5].strip()

        if len(true) == 0 or len(postId) == 0 or len(createtime) == 0:
            continue

        i += 1

        # "<a><b>" -> ["a", "b"]
        tag_words = ' '.join(
            tags.strip('<').strip('>').split('><')).strip().split(' ')

        raw_title_words = title.split(' ')
        raw_body_words = body.split(' ')
        title_words = [ps.stem(w) for w in raw_title_words]
        body_words = [ps.stem(w) for w in raw_body_words]

        if isStopWord:
            title_words = [w for w in title_words if w not in stopWs]
            body_words = [w for w in body_words if w not in stopWs]

        # If filtering removed everything, fall back to the raw tokens.
        if len(title_words) == 0:
            title_words = raw_title_words
        if len(body_words) == 0:
            body_words = raw_body_words

        list_pred_true_words_index_postid_createtime_tag_title_body.append([
            -1, true, tag_words, i, postId, createtime, tag_words,
            title_words, body_words
        ])

    return list_pred_true_words_index_postid_createtime_tag_title_body