def original_generate_token_graph():
    corp = []
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            else:
                continue
            if doc != "":
                doc = doc.decode("utf8")
                #doc = words_to_phrases(doc)
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp = it.chain(corp, doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    corp = list(corp)
    graph = nx.Graph()
    weights = Counter()
    edges = set()

    # Fully connect the first five-token window.  permutations() already
    # yields both orderings of each pair, so one increment per tuple is
    # enough.  Bug fix: these edges were counted but never added to the
    # edge set, so they were silently dropped from the graph.
    window = corp[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        edges.add(tup)

    # Slide the window: the token entering at position i+2 is linked to the
    # four tokens before it.  Starting at i = 3 avoids re-counting corp[4],
    # which the initial window already covered.
    for i in range(3, len(corp) - 2):
        for j in range(i - 2, i + 2):
            weights[(corp[j], corp[i + 2])] += 1
            weights[(corp[i + 2], corp[j])] += 1
            edges.add((corp[i + 2], corp[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    print nx.info(graph)
    nx.write_weighted_edgelist(graph, "graph.g")
    print nx.to_numpy_matrix(graph)
    np.savetxt("graph.adj", nx.to_numpy_matrix(graph))
    print "finished"
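# A minimal usage sketch (not part of the original script): it assumes
# `dataset_folder` points at a directory of per-author subfolders holding
# .pdf/.docx files, and that the conversion helpers above are available.
# The weighted edge list written to "graph.g" can be reloaded with the
# standard networkx reader:
#
#   original_generate_token_graph()
#   G = nx.read_weighted_edgelist("graph.g")
#   print len(G.nodes()), len(G.edges())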
doc = "" user = "" if file_extension == ".pdf": user = file.split("_")[0] doc = convert_pdf_to_txt(file_path) elif file_extension == ".docx": user = file.split("_")[0] doc = convert_docx_to_txt(file_path) else: continue if doc != "": doc = doc.decode("utf8") doc = doc_to_wordlist(doc) if users.has_key(user): users[user] += doc else: users[user] = doc print users.keys() print "Number of users = " + str(len(users)) # Initial a vector of syn0 and syn1 for a vector of a label new_syn0 = empty((1, model.layer1_size), dtype=REAL) new_syn1 = empty((1, model.layer1_size), dtype=REAL)
        files = [f for f in listdir(subsub_dir_path)
                 if isfile(join(subsub_dir_path, f)) and ".pdf" in f]
        for file_name in files:
            file_paths.append(subsub_dir_path + file_name)

    #print file_paths
    #print len(file_paths)

    label_i = 1
    for file_path in file_paths:
        doc = convert_pdf_to_txt(file_path)
        if doc != "":
            doc = doc.decode("utf8")
            doc = doc_to_wordlist(doc, remove_stopwords=True)
            label = str(folder) + "_" + str(label_i)
            label_i = label_i + 1
            doc_labels[label] = doc

    # Initialize a vector of syn0 and syn1 for a vector of a label
    new_syn0 = empty((1, model.layer1_size), dtype=REAL)
    new_syn1 = empty((1, model.layer1_size), dtype=REAL)
    is_first = True

    # Initialize and add a vector of syn0 and syn1 for a vector of a label
    for doc_label in doc_labels:
def generate_token_group_graph(group="author", output="graph"):
    corp = defaultdict(list)
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                # Derive the grouping key for this file.  Bug fixes: compare
                # strings with == (not "is"), keep the `group` mode parameter
                # intact instead of overwriting it on the first file, and
                # split the bare filename rather than the full path.
                base_name = splitext(file)[0]
                if group == "author":
                    key = base_name.split("_")[0]
                elif group == "document":
                    key = base_name
                else:
                    key = ""
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp[key] = it.chain(corp[key], doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    if group == "":
        coll = list(corp[""])
        # permutations() already yields both orderings, so the second
        # increment double-counted every pair and is removed.
        window = coll[0:5]
        for tup in it.permutations(window, 2):
            weights[tup] += 1
            edges.add(tup)
        # Start at i = 3 so coll[4], already covered by the initial window,
        # is not counted twice.
        for i in range(3, len(coll) - 2):
            for j in range(i - 2, i + 2):
                weights[(coll[j], coll[i + 2])] += 1
                weights[(coll[i + 2], coll[j])] += 1
                edges.add((coll[i + 2], coll[j]))
    else:
        # Bug fix: iterating a dict yields keys only; iteritems() gives the
        # (group, tokens) pairs this loop expects.
        for (g, coll) in corp.iteritems():
            coll = list(coll)
            window = coll[0:5]
            for tup in it.permutations(window, 2):
                weights[tup] += 1
                edges.add(tup)
            for t in window:
                if t != g:
                    weights[(g, t)] += 1
                    weights[(t, g)] += 1
                    edges.add((g, t))
            for i in range(3, len(coll) - 2):
                # Hoisted out of the j loop: the group edge does not depend
                # on j and was being incremented four times per token.
                if coll[i + 2] != g:
                    weights[(g, coll[i + 2])] += 1
                    weights[(coll[i + 2], g)] += 1
                    edges.add((g, coll[i + 2]))
                for j in range(i - 2, i + 2):
                    weights[(coll[j], coll[i + 2])] += 1
                    weights[(coll[i + 2], coll[j])] += 1
                    edges.add((coll[i + 2], coll[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    nx.write_weighted_edgelist(graph, output + ".g")
    print nx.to_numpy_matrix(graph)
    np.savetxt(output + ".adj", nx.to_numpy_matrix(graph))
    generate_graph_statistics(graph, output)
    print "finished"
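# Hedged usage sketch (not in the original): build one graph in which each
# author identifier becomes a node linked to the tokens of that author's
# documents, then reload the weighted edge list.  The "agraph" prefix is
# purely illustrative.
#
#   generate_token_group_graph(group="author", output="agraph")
#   G = nx.read_weighted_edgelist("agraph.g")
#   print nx.info(G)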
def generate_token_group_bigraph(group="author", output="graph", threshold=5):
    corp = defaultdict(list)
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                # Derive the grouping key from the filename.  Before, the
                # author was just another token; the "a_" prefix now
                # distinguishes author identifiers from author tokens.
                # Same bug fixes as generate_token_group_graph: compare with
                # ==, do not overwrite the `group` parameter, and split the
                # bare filename rather than the full path.
                base_name = splitext(file)[0]
                if group == "author":
                    key = "a_" + base_name.split("_")[0]
                elif group == "document":
                    key = base_name
                else:
                    key = ""
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp[key] = it.chain(corp[key], doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    if group == "":
        coll = list(corp[""])
        window = coll[0:5]
        for tup in it.permutations(window, 2):
            weights[tup] += 1
            edges.add(tup)
        for i in range(3, len(coll) - 2):
            for j in range(i - 2, i + 2):
                weights[(coll[j], coll[i + 2])] += 1
                weights[(coll[i + 2], coll[j])] += 1
                edges.add((coll[i + 2], coll[j]))
    else:
        for (g, coll) in corp.iteritems():
            coll = list(coll)
            window = coll[0:5]
            """ This code adds edges from one token to every other token.
            Need to remove.
            for tup in it.permutations(window, 2):
                weights[tup] += 1
                edges.add(tup)
            """
            for t in window:
                if t != g:
                    weights[(g, t)] += 1
                    weights[(t, g)] += 1
                    edges.add((g, t))
            # The inner j loop only served the (removed) token-to-token
            # edges; keeping it would have counted each group edge four
            # times, so it is dropped here.
            for i in range(3, len(coll) - 2):
                if coll[i + 2] != g:
                    weights[(g, coll[i + 2])] += 1
                    weights[(coll[i + 2], g)] += 1
                    edges.add((g, coll[i + 2]))
                """ This code also adds token to token edges
                for j in range(i - 2, i + 2):
                    weights[(coll[j], coll[i + 2])] += 1
                    weights[(coll[i + 2], coll[j])] += 1
                    edges.add((coll[i + 2], coll[j]))
                """

    for e in edges:
        # Bimax and biclique detection techniques do not use weights, so
        # just create thresholded edges.
        if weights[e] > threshold:
            graph.add_edge(e[0], e[1])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    generate_graph_statistics(graph, output + "_b")
    print "finished"
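# Hedged usage sketch (not in the original): build the thresholded
# author-to-token graph and emit its statistics; the "bigraph" output
# prefix is illustrative.
#
#   generate_token_group_bigraph(group="author", output="bigraph", threshold=5)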
def generate_token_group_bigraph_from_amazon_imdb_dataset(output="graph", threshold=5):
    # This function creates a graph from the imdb/amazon dataset and stores
    # it in a file called edgelist.graph in the folder in which the code is
    # being run.
    corp = []

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp = it.chain(corp, doc)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    coll = list(corp)
    # permutations() yields both orderings, so one increment per tuple is
    # enough; the second increment double-counted every pair.
    window = coll[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        edges.add(tup)
    # Start at i = 3 so coll[4], already covered above, is not re-counted.
    for i in range(3, len(coll) - 2):
        for j in range(i - 2, i + 2):
            weights[(coll[j], coll[i + 2])] += 1
            weights[(coll[i + 2], coll[j])] += 1
            edges.add((coll[i + 2], coll[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    nx.write_edgelist(graph, "edgelist.graph", data=['weight'])
    print "finished"

#generate_token_group_bigraph_from_amazon_imdb_dataset(output="agraph")
#generate_graph_statistics_from_file("edgelist.graph")
#parsereviewfile()
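# Hedged example (not in the original): the weighted edge list written by
# the function above can be read back with networkx, parsing the weight
# column as an int:
#
#   G = nx.read_edgelist("edgelist.graph", data=(("weight", int),))
#   print nx.info(G)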
def generate_token_group_bigraph_from_amazon_imdb_dataset(output="graph", threshold=5):
    # Note: this redefinition shadows the version above.  It additionally
    # anonymises tokens as "textN" ids, pickles the two index mappings, and
    # stores the graph in corpusEdgelist.graph in the folder in which the
    # code is being run.
    corp = []

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        print "Looking inside: " + dir_path
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            print 'Scanning file: ', file_path
            doc = ""
            if file_extension == ".txt":
                with open(file_path) as f:
                    print 'Copying data from: ', file_path
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                #doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                print 'Words retrieved: ', doc
                corp = it.chain(corp, doc)

    graph = nx.Graph()
    weights = Counter()  # collected but unused: the written graph is unweighted
    edges = list()

    coll = list(corp)
    # Map every distinct token to an anonymous "textN" id, keeping both
    # directions of the mapping.
    wordList = list(set(coll))
    wordDict = {}
    revWordDict = {}
    for i in range(len(wordList)):
        wordDict['text' + str(i)] = wordList[i]
        revWordDict[wordList[i]] = 'text' + str(i)

    window = coll[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        print 'Trying to add edge: ', [revWordDict[tup[0]], revWordDict[tup[1]]]
        edges.append([revWordDict[tup[0]], revWordDict[tup[1]]])
    # Start at i = 3 so coll[4], already covered above, is not re-counted.
    for i in range(3, len(coll) - 2):
        for j in range(i - 2, i + 2):
            edges.append([revWordDict[coll[i + 2]], revWordDict[coll[j]]])

    for e in edges:
        print 'Trying to add edge: ', e[0], e[1]
        graph.add_edge(e[0], e[1])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    nx.write_edgelist(graph, "corpusEdgelist.graph")

    # Open in binary mode: cPickle protocol -1 writes a binary stream.
    g = open('index2word.pkl', 'wb')
    cPickle.dump(wordDict, g, -1)
    g.close()
    g = open('word2index.pkl', 'wb')
    cPickle.dump(revWordDict, g, -1)
    g.close()
    print "finished"
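# Hedged example (not in the original): recover tokens from the anonymised
# node ids written by the function above.
#
#   import cPickle
#   with open('index2word.pkl', 'rb') as f:
#       index2word = cPickle.load(f)
#   G = nx.read_edgelist("corpusEdgelist.graph")
#   some_node = list(G.nodes())[0]
#   print some_node, '->', index2word[some_node]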