def original_generate_token_graph():
    """Build a token co-occurrence graph from the documents in ``dataset_folder``.

    Every ``.pdf`` and ``.docx`` file found in the immediate sub-folders of
    ``dataset_folder`` is converted to text, decoded as UTF-8, lower-cased
    and tokenised with ``doc_to_wordlist``.  The tokens of all documents are
    concatenated into one stream, so co-occurrence windows may span document
    boundaries.

    Two tokens co-occur when they are at most four positions apart in the
    stream (a sliding window of five tokens).  Each co-occurrence adds 1 to
    the weight of the undirected edge between the two tokens.

    Side effects: prints the graph and its adjacency matrix, writes the
    weighted edge list to ``graph.g`` and the adjacency matrix to
    ``graph.adj`` in the current working directory, then prints
    ``finished``.
    """
    corp = []
    for folder in listdir(dataset_folder):
        dir_path = join(dataset_folder, folder)
        if not isdir(dir_path):
            continue
        for fname in listdir(dir_path):
            file_path = join(dir_path, fname)
            if not isfile(file_path):
                continue
            extension = splitext(file_path)[1]
            if extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            else:
                continue
            if doc == "":
                continue
            doc = doc.decode("utf8").lower()
            # Extend a plain list instead of nesting itertools.chain objects
            # one level deeper for every document.
            corp.extend(doc_to_wordlist(doc, True))

    # Pair every token with the (up to) four tokens preceding it.  Both
    # orientations are stored so a lookup works regardless of tuple order.
    weights = Counter()
    for k, token in enumerate(corp):
        for prev in corp[max(0, k - 4):k]:
            weights[(prev, token)] += 1
            weights[(token, prev)] += 1

    # The edges come straight from the weight table; the counts are
    # symmetric, so adding both orientations to an undirected graph simply
    # rewrites the same weight.
    graph = nx.Graph()
    for (u, v), w in weights.items():
        graph.add_edge(u, v, weight=w)

    print(graph)
    nx.write_weighted_edgelist(graph, "graph.g")
    adjacency = nx.to_numpy_matrix(graph)
    print(adjacency)
    np.savetxt("graph.adj", adjacency)
    print("finished")
def generate_token_group_graph(group="author",output="graph"):
    """Build a weighted token co-occurrence graph, optionally with group nodes.

    Every ``.pdf``, ``.docx`` and ``.txt`` file found in the immediate
    sub-folders of ``dataset_folder`` is read, decoded as UTF-8, lower-cased
    and tokenised with ``doc_to_wordlist``.  Tokens are collected per group.

    Parameters:
        group: how documents are grouped.
            ``"author"``   - the group id is the part of the file's base
                             name before the first underscore.
            ``"document"`` - the group id is the file path without its
                             extension, i.e. one group per document.
            anything else  - no grouping; all tokens form a single stream
                             and no group node is added.
        output: path prefix of the files that are written.

    Within each group's token stream, two tokens co-occur when they are at
    most four positions apart (a sliding window of five tokens); each
    co-occurrence adds 1 to the weight of the undirected edge between them.
    In the grouped modes every token occurrence that differs from the group
    id additionally adds 1 to the edge between the group id and that token.

    Side effects: writes the weighted edge list to ``output + ".g"`` and the
    adjacency matrix to ``output + ".adj"``, prints the adjacency matrix and
    then ``finished``.
    """
    # Resolve the mode once.  The `group` argument itself is never
    # reassigned, so every file is classified with the same rule.
    by_author = group == "author"
    by_document = group == "document"
    grouped = by_author or by_document

    corp = defaultdict(list)
    for folder in listdir(dataset_folder):
        dir_path = join(dataset_folder, folder)
        if not isdir(dir_path):
            continue
        for fname in listdir(dir_path):
            file_path = join(dir_path, fname)
            if not isfile(file_path):
                continue
            extension = splitext(file_path)[1]
            if extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif extension == ".txt":
                with open(file_path) as f:
                    # Each line is prefixed with a single space.
                    doc = "".join(" " + line for line in f)
            else:
                continue
            if doc == "":
                continue
            if by_author:
                # Use the bare file name so underscores in the directory
                # path cannot truncate the author name.
                key = splitext(fname)[0].split("_")[0]
            elif by_document:
                key = splitext(file_path)[0]
            else:
                key = ""
            doc = doc.decode("utf8").lower()
            corp[key].extend(doc_to_wordlist(doc, True))

    weights = Counter()
    for key, tokens in corp.items():
        for k, token in enumerate(tokens):
            # Pair the token with the (up to) four tokens preceding it.
            for prev in tokens[max(0, k - 4):k]:
                weights[(prev, token)] += 1
                weights[(token, prev)] += 1
            # Link the group node to the token, once per occurrence; a token
            # equal to the group id is skipped to avoid a self-loop.
            if grouped and token != key:
                weights[(key, token)] += 1
                weights[(token, key)] += 1

    # The counts are symmetric, so adding both orientations to an
    # undirected graph simply rewrites the same weight.
    graph = nx.Graph()
    for (u, v), w in weights.items():
        graph.add_edge(u, v, weight=w)

    nx.write_weighted_edgelist(graph, output + ".g")
    adjacency = nx.to_numpy_matrix(graph)
    print(adjacency)
    np.savetxt(output + ".adj", adjacency)
    print("finished")
def generate_token_group_bigraph(group="author",output="graph", threshold=5):
    """Build a token/group co-occurrence graph with thresholded edges.

    Every ``.pdf``, ``.docx`` and ``.txt`` file found in the immediate
    sub-folders of ``dataset_folder`` is read, decoded as UTF-8, lower-cased
    and tokenised with ``doc_to_wordlist``.  Tokens are collected per group.

    Parameters:
        group: how documents are grouped.
            ``"author"``   - the group id is ``"a_"`` followed by the part
                             of the file's base name before the first
                             underscore; the prefix keeps the author
                             identifier distinct from ordinary tokens.
            ``"document"`` - the group id is the file path without its
                             extension, i.e. one group per document.
            anything else  - no grouping; all tokens form a single stream
                             and no group node is added.
        output: currently unused, because writing the graph to disk is
            disabled (see the end of the function).
        threshold: in the grouped modes an edge is created only when its
            co-occurrence count is strictly greater than this value.

    Within each group's token stream, two tokens co-occur when they are at
    most four positions apart (a sliding window of five tokens); each
    co-occurrence adds 1 to the count of that token pair.  In the grouped
    modes every token occurrence that differs from the group id
    additionally adds 1 to the count between the group id and that token.

    In the grouped modes the resulting edges are unweighted and thresholded.
    In the ungrouped mode every counted pair becomes an edge carrying its
    count as ``weight``, and ``threshold`` is not applied.

    Side effects: prints ``finished``.  The graph is neither returned nor
    written.
    """
    # Resolve the mode once.  The `group` argument itself is never
    # reassigned, so every file is classified with the same rule.
    by_author = group == "author"
    by_document = group == "document"
    grouped = by_author or by_document

    corp = defaultdict(list)
    for folder in listdir(dataset_folder):
        dir_path = join(dataset_folder, folder)
        if not isdir(dir_path):
            continue
        for fname in listdir(dir_path):
            file_path = join(dir_path, fname)
            if not isfile(file_path):
                continue
            extension = splitext(file_path)[1]
            if extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif extension == ".txt":
                with open(file_path) as f:
                    # Each line is prefixed with a single space.
                    doc = "".join(" " + line for line in f)
            else:
                continue
            if doc == "":
                continue
            if by_author:
                # Use the bare file name so underscores in the directory
                # path cannot truncate the author name.
                key = "a_" + splitext(fname)[0].split("_")[0]
            elif by_document:
                key = splitext(file_path)[0]
            else:
                key = ""
            doc = doc.decode("utf8").lower()
            corp[key].extend(doc_to_wordlist(doc, True))

    weights = Counter()
    for key, tokens in corp.items():
        for k, token in enumerate(tokens):
            # Pair the token with the (up to) four tokens preceding it.
            for prev in tokens[max(0, k - 4):k]:
                weights[(prev, token)] += 1
                weights[(token, prev)] += 1
            # Link the group node to the token, once per occurrence; a token
            # equal to the group id is skipped to avoid a self-loop.
            if grouped and token != key:
                weights[(key, token)] += 1
                weights[(token, key)] += 1

    graph = nx.Graph()
    for (u, v), w in weights.items():
        if not grouped:
            graph.add_edge(u, v, weight=w)
        elif w > threshold:
            # Bimax and biclique detection techniques do not use weights, so
            # only thresholded, unweighted edges are created.
            graph.add_edge(u, v)

    # NOTE: output is currently disabled.
    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    print("finished")
def generate_token_group_bigraph_from_amazon_imdb_dataset(output="graph", threshold=5):
    """Build a token co-occurrence graph from the imdb/amazon dataset.

    Every ``.txt`` file found in the immediate sub-folders of
    ``dataset_folder`` is read, decoded as UTF-8, lower-cased and tokenised
    with ``doc_to_wordlist``.  The tokens of all files are concatenated into
    one stream, so co-occurrence windows may span file boundaries.

    Two tokens co-occur when they are at most four positions apart in the
    stream (a sliding window of five tokens).  Each co-occurrence adds 1 to
    the weight of the undirected edge between the two tokens.

    Parameters:
        output: currently unused; kept so existing callers keep working.
        threshold: currently unused; kept so existing callers keep working.

    Side effects: writes the edge list, with the ``weight`` attribute, to
    ``edgelist.graph`` in the folder in which the code is being run, then
    prints ``finished``.
    """
    corp = []
    for folder in listdir(dataset_folder):
        dir_path = join(dataset_folder, folder)
        if not isdir(dir_path):
            continue
        for fname in listdir(dir_path):
            file_path = join(dir_path, fname)
            if not isfile(file_path):
                continue
            if splitext(file_path)[1] != ".txt":
                continue
            with open(file_path) as f:
                # Each line is prefixed with a single space.
                doc = "".join(" " + line for line in f)
            if doc == "":
                continue
            doc = doc.decode("utf8").lower()
            corp.extend(doc_to_wordlist(doc, True))

    # Pair every token with the (up to) four tokens preceding it.  Both
    # orientations are stored so a lookup works regardless of tuple order.
    weights = Counter()
    for k, token in enumerate(corp):
        for prev in corp[max(0, k - 4):k]:
            weights[(prev, token)] += 1
            weights[(token, prev)] += 1

    # The edges come straight from the weight table; the counts are
    # symmetric, so adding both orientations to an undirected graph simply
    # rewrites the same weight.
    graph = nx.Graph()
    for (u, v), w in weights.items():
        graph.add_edge(u, v, weight=w)

    nx.write_edgelist(graph, "edgelist.graph", data=['weight'])
    print("finished")


